# Copyright (C) 2018 Mikel Artetxe <artetxem@gmail.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

import argparse
import glob
import os
import shutil
import subprocess
import tempfile
from shlex import quote

ROOT = os.path.dirname(os.path.abspath(__file__))
FAST_ALIGN = ROOT + '/third-party/fast_align/build'
MOSES = ROOT + '/third-party/moses'
VECMAP = ROOT + '/third-party/vecmap'
PHRASE2VEC = ROOT + '/third-party/phrase2vec/word2vec'
TRAINING = ROOT + '/training'


def bash(command):
    subprocess.run(['bash', '-c', command])


def binarize(output_config, output_pt, lm_path, lm_order, phrase_table,
             reordering=None, prune=100):
    output_pt = os.path.abspath(output_pt)
    lm_path = os.path.abspath(lm_path)

    # Binarize
    reord_args = (' --lex-ro ' + quote(reordering) + ' --num-lex-scores 6'
                  if reordering is not None else '')
    bash(quote(MOSES + '/scripts/generic/binarize4moses2.perl') +
         ' --phrase-table ' + quote(phrase_table) +
         ' --output-dir ' + quote(output_pt) +
         ' --num-scores 4' +
         ' --prune ' + str(prune) +
         reord_args)

    # Clean temporary files created by the binarization script
    for tmp in glob.glob(output_pt + '/../tmp.*'):
        shutil.rmtree(tmp)

    # Build configuration file
    with open(output_config, 'w') as f:
        print('[input-factors]', file=f)
        print('0', file=f)
        print('', file=f)
        print('[mapping]', file=f)
        print('0 T 0', file=f)
        print('', file=f)
        print('[distortion-limit]', file=f)
        print('6', file=f)
        print('', file=f)
        print('[feature]', file=f)
        print('UnknownWordPenalty', file=f)
        print('WordPenalty', file=f)
        print('PhrasePenalty', file=f)
        print('ProbingPT name=TranslationModel0 num-features=4' +
              ' path=' + output_pt + ' input-factor=0 output-factor=0', file=f)
        if reordering is not None:
            print('LexicalReordering name=LexicalReordering0' +
                  ' num-features=6 type=wbe-msd-bidirectional-fe-allff' +
                  ' input-factor=0 output-factor=0 property-index=0', file=f)
            print('Distortion', file=f)
        print('KENLM name=LM0 factor=0 path=' + lm_path +
              ' order=' + str(lm_order), file=f)
        print('', file=f)
        print('[weight]', file=f)
        print('UnknownWordPenalty0= 1', file=f)
        print('WordPenalty0= -1', file=f)
        print('PhrasePenalty0= 0.2', file=f)
        print('TranslationModel0= 0.2 0.2 0.2 0.2', file=f)
        if reordering is not None:
            print('LexicalReordering0= 0.3 0.3 0.3 0.3 0.3 0.3', file=f)
            print('Distortion0= 0.3', file=f)
        print('LM0= 0.5', file=f)


def train_supervised(args, train_src, train_trg, dev_src, dev_trg,
                     lm_path, lm_order, output_dir):
    lm_path = os.path.abspath(lm_path)
    output_dir = os.path.abspath(output_dir)
    tmp = args.tmp + '/train-supervised'
    os.mkdir(tmp)

    # Copy the corpus
    shutil.copy(train_src, tmp + '/corpus.src')
    shutil.copy(train_trg, tmp + '/corpus.trg')

    # Corpus cleaning
    bash(quote(MOSES + '/scripts/training/clean-corpus-n.perl') +
         ' ' + quote(tmp + '/corpus') + ' src trg' +
         ' ' + quote(tmp + '/clean') +
         ' ' + str(args.min_tokens) +
         ' ' + str(args.max_tokens))  # TODO Reusing min/max from step 1
    os.remove(tmp + '/corpus.src')
    os.remove(tmp + '/corpus.trg')
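
    # fast_align expects one sentence pair per line in "source ||| target"
    # format. The paste trick below achieves this: with the delimiter list
    # " ||| ", paste cycles through ' ', '|', '|', '|', ' ' between its six
    # input files, so the four /dev/null fillers contribute nothing and the
    # two corpora end up separated by exactly " ||| ".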
    # Merge both languages into a single file
    bash('paste -d " ||| " ' + quote(tmp + '/clean.src') +
         ' /dev/null /dev/null /dev/null /dev/null ' + quote(tmp + '/clean.trg') +
         ' > ' + quote(tmp + '/clean.txt'))

    # Align
    bash(quote(FAST_ALIGN + '/fast_align') +
         ' -i ' + quote(tmp + '/clean.txt') + ' -d -o -v' +
         ' > ' + quote(tmp + '/forward.align'))
    bash(quote(FAST_ALIGN + '/fast_align') +
         ' -i ' + quote(tmp + '/clean.txt') + ' -d -o -v -r' +
         ' > ' + quote(tmp + '/reverse.align'))
    os.remove(tmp + '/clean.txt')

    # Symmetrization
    bash(quote(FAST_ALIGN + '/atools') +
         ' -i ' + quote(tmp + '/forward.align') +
         ' -j ' + quote(tmp + '/reverse.align') +
         ' -c grow-diag-final-and' +
         ' > ' + quote(tmp + '/aligned.grow-diag-final-and'))
    os.remove(tmp + '/forward.align')
    os.remove(tmp + '/reverse.align')

    # Build model
    bash(quote(MOSES + '/scripts/training/train-model.perl') +
         ' -model-dir ' + quote(tmp) +
         ' -corpus ' + quote(tmp) + '/clean' +
         ' -f src -e trg' +
         ' -alignment grow-diag-final-and' +
         ' -reordering msd-bidirectional-fe' +
         ' -max-phrase-length 5' +
         ' -temp-dir ' + quote(tmp + '/tmp') +
         ' -lm "0:{}:{}:8"'.format(lm_order, lm_path) +
         ' -first-step 4' +
         ' -score-options="-MinScore 2:0.0001"' +
         ' -cores ' + str(args.threads) +
         ' -parallel -sort-buffer-size 10G -sort-batch-size 253 -sort-compress gzip' +
         ' -sort-parallel ' + str(args.threads))
    shutil.move(tmp + '/phrase-table.gz', args.tmp)
    shutil.move(tmp + '/reordering-table.wbe-msd-bidirectional-fe.gz', args.tmp)
    shutil.rmtree(tmp)

    # Binarize model
    binarize(args.tmp + '/moses.ini', output_dir + '/probing-table',
             lm_path, lm_order,
             args.tmp + '/phrase-table.gz',
             args.tmp + '/reordering-table.wbe-msd-bidirectional-fe.gz',
             prune=args.pt_prune)
    os.remove(args.tmp + '/phrase-table.gz')
    os.remove(args.tmp + '/reordering-table.wbe-msd-bidirectional-fe.gz')

    # MERT
    bash(quote(MOSES + '/scripts/training/mert-moses.pl') +
         ' ' + quote(dev_src) + ' ' + quote(dev_trg) +
         ' ' + quote(MOSES + '/bin/moses2') +
         ' ' + quote(args.tmp + '/moses.ini') +
         ' --no-filter-phrase-table' +
         ' --mertdir ' + quote(MOSES + '/bin/') +
         ' --threads ' + str(args.threads) +
         ' --decoder-flags="-threads ' + str(args.threads) + '"' +
         ' --working-dir ' + quote(os.path.abspath(args.tmp + '/mert')))
    shutil.move(args.tmp + '/mert/moses.ini', output_dir + '/moses.ini')
    shutil.rmtree(args.tmp + '/mert')
    os.remove(args.tmp + '/moses.ini')


# Step 1: Corpus preprocessing
def preprocess(args):
    root = args.working + '/step1'
    os.mkdir(root)
    for part, corpus, lang in (('src', args.src, args.src_lang),
                               ('trg', args.trg, args.trg_lang)):
        # Tokenize, deduplicate, clean by length, and shuffle
        bash('export LC_ALL=C;' +
             quote(MOSES + '/scripts/tokenizer/tokenizer.perl') +
             ' -l ' + quote(lang) +
             ' -threads ' + str(args.threads) +
             ' < ' + quote(corpus) +
             ' | sort -S 10G --batch-size 253 --compress-program gzip' +
             ' --parallel ' + str(args.threads) +
             ' -T ' + quote(args.tmp) +
             ' | uniq' +
             ' | python3 ' + quote(TRAINING + '/clean-corpus.py') +
             ' --min ' + str(args.min_tokens) +
             ' --max ' + str(args.max_tokens) +
             ' | shuf' +
             ' > ' + quote(args.tmp + '/full.tok'))

        # Train truecaser
        bash(quote(MOSES + '/scripts/recaser/train-truecaser.perl') +
             ' --model ' + quote(root + '/truecase-model.' + part) +
             ' --corpus ' + quote(args.tmp + '/full.tok'))
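
        # Truecasing normalizes casing from corpus statistics: the model
        # trained above records the most frequent form of each word, and
        # sentence-initial words are recased accordingly, so that e.g. "The"
        # and "the" share their counts in the models trained later.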
        # Truecase
        bash(quote(MOSES + '/scripts/recaser/truecase.perl') +
             ' --model ' + quote(root + '/truecase-model.' + part) +
             ' < ' + quote(args.tmp + '/full.tok') +
             ' > ' + quote(args.tmp + '/full.true'))

        # Split train/dev
        bash('head -' + str(args.dev_size) +
             ' < ' + quote(args.tmp + '/full.true') +
             ' > ' + quote(root + '/dev.true.' + part))
        bash('tail -n +' + str(args.dev_size + 1) +
             ' < ' + quote(args.tmp + '/full.true') +
             ' > ' + quote(root + '/train.true.' + part))

        # Remove temporary files
        os.remove(args.tmp + '/full.tok')
        os.remove(args.tmp + '/full.true')


# Step 2: Language model training
def train_lm(args):
    root = args.working + '/step2'
    os.mkdir(root)
    for part in ('src', 'trg'):
        bash(quote(MOSES + '/bin/lmplz') +
             ' -T ' + quote(args.tmp + '/lm') +
             ' -o ' + str(args.lm_order) +
             ' --prune ' + ' '.join(map(str, args.lm_prune)) +
             ' < ' + quote(args.working + '/step1/train.true.' + part) +
             ' > ' + quote(args.tmp + '/model.arpa'))
        bash(quote(MOSES + '/bin/build_binary') +
             ' ' + quote(args.tmp + '/model.arpa') +
             ' ' + quote(root + '/' + part + '.blm'))
        os.remove(args.tmp + '/model.arpa')


# Step 3: Train embeddings
def train_embeddings(args):
    root = args.working + '/step3'
    os.mkdir(root)
    for part in ('src', 'trg'):
        corpus = args.working + '/step1/train.true.' + part

        # Extract n-grams
        counts = []
        for i, cutoff in enumerate(args.vocab_cutoff):
            counts.append(quote(args.tmp + '/ngrams.' + str(i+1)))
            bash('python3 ' + quote(TRAINING + '/extract-ngrams.py') +
                 ' -i ' + quote(corpus) +
                 ' --min-order ' + str(i+1) +
                 ' --max-order ' + str(i+1) +
                 ' --min-count ' + str(args.vocab_min_count) +
                 ' | sort -nr' +
                 ' | head -' + str(cutoff) +
                 ' > ' + counts[-1])
        bash('cat ' + ' '.join(counts) +
             ' | cut -f2 > ' + quote(args.tmp + '/phrases.txt'))

        # Build standard word2vec vocabulary
        bash(quote(PHRASE2VEC) +
             ' -train ' + quote(corpus) +
             ' -min-count ' + str(args.vocab_min_count) +
             ' -save-vocab ' + quote(args.tmp + '/vocab-full.txt'))
        bash('head -' + str(args.vocab_cutoff[0]) +
             ' ' + quote(args.tmp + '/vocab-full.txt') +
             ' > ' + quote(args.tmp + '/vocab.txt'))

        # Train embeddings
        bash(quote(PHRASE2VEC) +
             ' -train ' + quote(corpus) +
             ' -read-vocab ' + quote(args.tmp + '/vocab.txt') +
             ' -phrases ' + quote(args.tmp + '/phrases.txt') +
             ' -cbow 0 -hs 0 -sample 0' +  # Fixed params
             ' -size ' + str(args.emb_size) +
             ' -window ' + str(args.emb_window) +
             ' -negative ' + str(args.emb_negative) +
             ' -iter ' + str(args.emb_iter) +
             ' -threads ' + str(args.threads) +
             ' -output ' + quote(root + '/emb.' + part))
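
        # Note that phrase2vec treats every n-gram listed in phrases.txt as a
        # single token, so emb.src / emb.trg hold embeddings for phrases of up
        # to len(args.vocab_cutoff) words, not only for unigrams.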
        # Clean temporary files
        for f in os.listdir(args.tmp):
            os.remove(os.path.join(args.tmp, f))


# Step 4: Map embeddings
# TODO Add CUDA support
def map_embeddings(args):
    root = args.working + '/step4'
    os.mkdir(root)
    bash('export OMP_NUM_THREADS=' + str(args.threads) + ';' +
         ' python3 ' + quote(VECMAP + '/map_embeddings.py') +
         ' --unsupervised -v' +
         ' ' + quote(args.working + '/step3/emb.src') +
         ' ' + quote(args.working + '/step3/emb.trg') +
         ' ' + quote(root + '/emb.src') +
         ' ' + quote(root + '/emb.trg'))


# Step 5: Induce phrase-table
# TODO Add CUDA support
# TODO Add additional options
def induce_phrase_table(args):
    root = args.working + '/step5'
    os.mkdir(root)
    bash('export OMP_NUM_THREADS=' + str(args.threads) + ';' +
         ' python3 ' + quote(TRAINING + '/induce-phrase-table.py') +
         ' --src ' + quote(args.working + '/step4/emb.src') +
         ' --trg ' + quote(args.working + '/step4/emb.trg') +
         ' --src2trg ' + quote(args.tmp + '/src2trg.phrase-table') +
         ' --trg2src ' + quote(args.tmp + '/trg2src.phrase-table'))
    for part in 'src2trg', 'trg2src':
        bash('export LC_ALL=C;' +
             ' sort -S 10G --batch-size 253 --compress-program gzip' +
             ' --parallel ' + str(args.threads) +
             ' -T ' + quote(args.tmp) +
             ' ' + quote(args.tmp + '/' + part + '.phrase-table') +
             ' | gzip > ' + quote(root + '/' + part + '.phrase-table.gz'))
        os.remove(args.tmp + '/' + part + '.phrase-table')


# Step 6: Build initial model
def build_initial_model(args):
    root = args.working + '/step6'
    os.mkdir(root)
    for src, trg in ('src', 'trg'), ('trg', 'src'):
        part = src + '2' + trg
        binarize(root + '/' + part + '.moses.ini',
                 root + '/probing-table-' + part,
                 args.working + '/step2/' + trg + '.blm', args.lm_order,
                 args.working + '/step5/' + part + '.phrase-table.gz',
                 prune=args.pt_prune)


# Step 7: Unsupervised tuning
def unsupervised_tuning(args):
    root = args.working + '/step7'
    os.mkdir(root)
    config = {('src', 'trg'): args.working + '/step6/src2trg.moses.ini',
              ('trg', 'src'): args.working + '/step6/trg2src.moses.ini'}
    for it in range(1, args.tuning_iter + 1):
        for src, trg in ('src', 'trg'), ('trg', 'src'):
            # Translate backwards
            bash(quote(MOSES + '/bin/moses2') +
                 ' -f ' + quote(config[(trg, src)]) +
                 ' --threads ' + str(args.threads) +
                 ' < ' + quote(args.working + '/step1/dev.true.' + trg) +
                 ' > ' + quote(args.tmp + '/output.txt') +
                 ' 2> /dev/null')

            # MERT
            # TODO Should we start optimization from previous weights?
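            # output.txt now holds the dev set translated into the source
            # side of the (src, trg) pair, so MERT below effectively tunes
            # the forward model on a synthetic parallel corpus whose
            # reference side is the original dev.true file.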
            bash(quote(MOSES + '/scripts/training/mert-moses.pl') +
                 ' ' + quote(args.tmp + '/output.txt') +
                 ' ' + quote(args.working + '/step1/dev.true.' + trg) +
                 ' ' + quote(MOSES + '/bin/moses2') +
                 ' ' + quote(config[(src, trg)]) +
                 ' --no-filter-phrase-table' +
                 ' --mertdir ' + quote(MOSES + '/bin/') +
                 ' --threads ' + str(args.threads) +
                 ' --decoder-flags="-threads ' + str(args.threads) + '"' +
                 ' --working-dir ' + quote(os.path.abspath(args.tmp + '/mert')))

            # Move tuned configuration file
            config[(src, trg)] = root + '/' + src + '2' + trg + '.it' + str(it) + '.moses.ini'
            shutil.move(args.tmp + '/mert/moses.ini', config[(src, trg)])

            # Remove temporary files
            shutil.rmtree(args.tmp + '/mert')
            os.remove(args.tmp + '/output.txt')
    shutil.copy(root + '/src2trg.it{0}.moses.ini'.format(args.tuning_iter),
                root + '/src2trg.moses.ini')
    shutil.copy(root + '/trg2src.it{0}.moses.ini'.format(args.tuning_iter),
                root + '/trg2src.moses.ini')


# Step 8: Iterative backtranslation
def iterative_backtranslation(args):
    root = args.working + '/step8'
    os.mkdir(root)
    config = {('src', 'trg'): args.working + '/step7/src2trg.moses.ini',
              ('trg', 'src'): args.working + '/step7/trg2src.moses.ini'}
    for part in 'src', 'trg':
        bash('head -' + str(args.backtranslation_sentences) +
             ' ' + quote(args.working + '/step1/train.true.' + part) +
             ' > ' + quote(args.tmp + '/train.' + part))
    for it in range(1, args.backtranslation_iter + 1):
        for src, trg in ('src', 'trg'), ('trg', 'src'):
            # TODO Use cube pruning?
            bash(quote(MOSES + '/bin/moses2') +
                 ' -f ' + quote(config[(trg, src)]) +
                 ' --threads ' + str(args.threads) +
                 ' < ' + quote(args.tmp + '/train.' + trg) +
                 ' > ' + quote(args.tmp + '/train.bt') +
                 ' 2> /dev/null')
            bash(quote(MOSES + '/bin/moses2') +
                 ' -f ' + quote(config[(trg, src)]) +
                 ' --threads ' + str(args.threads) +
                 ' < ' + quote(args.working + '/step1/dev.true.' + trg) +
                 ' > ' + quote(args.tmp + '/dev.bt') +
                 ' 2> /dev/null')
            train_supervised(args,
                             args.tmp + '/train.bt', args.tmp + '/train.' + trg,
                             args.tmp + '/dev.bt',
                             args.working + '/step1/dev.true.' + trg,
                             args.working + '/step2/' + trg + '.blm', args.lm_order,
                             root + '/' + src + '2' + trg + '-it' + str(it))
            os.remove(args.tmp + '/train.bt')
            os.remove(args.tmp + '/dev.bt')
            config[(src, trg)] = root + '/' + src + '2' + trg + '-it' + str(it) + '/moses.ini'
    shutil.copy(config[('src', 'trg')], args.working + '/src2trg.moses.ini')
    shutil.copy(config[('trg', 'src')], args.working + '/trg2src.moses.ini')
    os.remove(args.tmp + '/train.src')
    os.remove(args.tmp + '/train.trg')


def main():
    parser = argparse.ArgumentParser(description='Train an unsupervised SMT model')
    parser.add_argument('--src', metavar='PATH', required=True, help='Source language corpus')
    parser.add_argument('--trg', metavar='PATH', required=True, help='Target language corpus')
    parser.add_argument('--src-lang', metavar='STR', required=True, help='Source language code')
    parser.add_argument('--trg-lang', metavar='STR', required=True, help='Target language code')
    parser.add_argument('--from-step', metavar='N', type=int, default=1, help='Start at step N')
    parser.add_argument('--to-step', metavar='N', type=int, default=8, help='End at step N')
    parser.add_argument('--working', metavar='PATH', required=True, help='Working directory')
    parser.add_argument('--tmp', metavar='PATH', help='Temporary directory')
    parser.add_argument('--threads', metavar='N', type=int, default=20,
                        help='Number of threads (defaults to 20)')
    parser.add_argument('--pt-prune', metavar='N', type=int, default=100,
                        help='Phrase-table pruning (defaults to 100)')  # TODO Which group?
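
    # Each argument group below mirrors one pipeline step; steps 4-6 expose
    # no step-specific parameters and therefore have no group of their own.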
    preprocessing_group = parser.add_argument_group('Step 1', 'Corpus preprocessing')
    preprocessing_group.add_argument('--min-tokens', metavar='N', type=int, default=3,
                                     help='Remove sentences with fewer than N tokens (defaults to 3)')
    preprocessing_group.add_argument('--max-tokens', metavar='N', type=int, default=80,
                                     help='Remove sentences with more than N tokens (defaults to 80)')
    preprocessing_group.add_argument('--dev-size', metavar='N', type=int, default=10000,
                                     help='Number of sentences for tuning (defaults to 10000)')

    lm_group = parser.add_argument_group('Step 2', 'Language model training')
    lm_group.add_argument('--lm-order', metavar='N', type=int, default=5,
                          help='Language model order (defaults to 5)')
    lm_group.add_argument('--lm-prune', metavar='N', type=int, nargs='+', default=[0, 0, 1],
                          help='Language model pruning (defaults to 0 0 1)')

    phrase2vec_group = parser.add_argument_group('Step 3', 'Phrase embedding training')
    phrase2vec_group.add_argument('--vocab-cutoff', metavar='N', type=int, nargs='+',
                                  default=[200000, 400000, 400000],
                                  help='Vocabulary cut-off (defaults to 200000 400000 400000)')
    phrase2vec_group.add_argument('--vocab-min-count', metavar='N', type=int, default=10,
                                  help='Discard words with fewer than N occurrences (defaults to 10)')
    phrase2vec_group.add_argument('--emb-size', metavar='N', type=int, default=300,
                                  help='Dimensionality of the phrase embeddings (defaults to 300)')
    phrase2vec_group.add_argument('--emb-window', metavar='N', type=int, default=5,
                                  help='Max skip length between words (defaults to 5)')
    phrase2vec_group.add_argument('--emb-negative', metavar='N', type=int, default=10,
                                  help='Number of negative examples (defaults to 10)')
    phrase2vec_group.add_argument('--emb-iter', metavar='N', type=int, default=5,
                                  help='Number of training epochs (defaults to 5)')

    tuning_group = parser.add_argument_group('Step 7', 'Unsupervised tuning')
    tuning_group.add_argument('--tuning-iter', metavar='N', type=int, default=10,
                              help='Number of unsupervised tuning iterations (defaults to 10)')

    backtranslation_group = parser.add_argument_group('Step 8', 'Iterative backtranslation')
    backtranslation_group.add_argument('--backtranslation-iter', metavar='N', type=int, default=3,
                                       help='Number of backtranslation iterations (defaults to 3)')
    backtranslation_group.add_argument('--backtranslation-sentences', metavar='N', type=int,
                                       default=2000000,
                                       help='Number of sentences for training backtranslation'
                                            ' (defaults to 2000000)')

    args = parser.parse_args()

    if args.tmp is None:
        args.tmp = args.working
    os.makedirs(args.working, exist_ok=True)
    os.makedirs(args.tmp, exist_ok=True)
    with tempfile.TemporaryDirectory(dir=args.tmp) as args.tmp:
        if args.from_step <= 1 <= args.to_step:
            preprocess(args)
        if args.from_step <= 2 <= args.to_step:
            train_lm(args)
        if args.from_step <= 3 <= args.to_step:
            train_embeddings(args)
        if args.from_step <= 4 <= args.to_step:
            map_embeddings(args)
        if args.from_step <= 5 <= args.to_step:
            induce_phrase_table(args)
        if args.from_step <= 6 <= args.to_step:
            build_initial_model(args)
        if args.from_step <= 7 <= args.to_step:
            unsupervised_tuning(args)
        if args.from_step <= 8 <= args.to_step:
            iterative_backtranslation(args)


if __name__ == '__main__':
    main()
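

# Example invocation (corpus paths and language codes are illustrative, and
# the script is assumed to be saved as train.py):
#
#   python3 train.py --src mono.en --trg mono.fr --src-lang en --trg-lang fr \
#       --working ./en-fr --threads 16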