python source code of time_train

from time import time
import random
import string
import os, shutil
import numpy as np
import matplotlib.pylab as plt

from rasa_nlu.training_data import load_data
from rasa_nlu.model import Trainer
from rasa_nlu import config, evaluate

"""
Trains a bunch of models with differently sized lookup tables
Times the results of training and evaluation
Creates pretty plots.
"""


def clear_model_dir():
    # clears old models
    folder = "tmp/models"
    for the_file in os.listdir(folder):
        file_path = os.path.join(folder, the_file)
        try:
            if os.path.isfile(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except Exception as e:
            print(e)


def construct_lookup(n_words, w_len):
    # makes a random lookup table
    lookup_fname = "tmp/lookup.txt"
    open(lookup_fname, "w").close()
    with open(lookup_fname, "w") as f:
        word = "".join(random.choices(string.ascii_letters + string.digits, k=w_len))
        words = "\n".join(n_words * [word])
        f.write(words)


def train_model():
    # trains a model and times it
    t = time()
    # training_data = load_data('demo_train.md')
    training_data = load_data("data/company_train_lookup.json")
    td_load_time = time() - t
    trainer = Trainer(config.load("config.yaml"))
    t = time()
    trainer.train(training_data)
    train_time = time() - t
    clear_model_dir()
    t = time()
    model_directory = trainer.persist(
        "./tmp/models"
    )  # Returns the directory the model is stored in
    persist_time = time() - t
    return td_load_time, train_time, persist_time


def evaluate_model():
    # evaluates a model and times it
    model_name = os.listdir("./tmp/models/default")[0]  # get first (and only) model
    t = time()
    evaluate.run_evaluation("data/demo_test.md", "./tmp/models/default/" + model_name)
    eval_time = time() - t
    return eval_time


def plot_results(
    words,
    lookup_construct_time,
    load_time,
    train_time,
    persist_time,
    eval_time,
    total_time,
    plt_times=True,
):
    f, ax = plt.subplots()
    ax.plot(words, load_time)
    ax.plot(words, train_time)
    ax.plot(words, persist_time)
    ax.plot(words, eval_time)
    ax.plot(words, total_time)
    if plt_times:
        a = 0.6
        ax.plot([words[0], words[-1]], [1, 1], "k:", alpha=a)
        ax.plot([words[0], words[-1]], [60, 60], "k-.", alpha=a)
        ax.plot([words[0], words[-1]], [600, 600], "k--", alpha=a)

    ax.set_xlabel("number of words")
    ax.set_ylabel("time to evaluate (sec)")
    ax.set_xscale("log")
    ax.set_yscale("log")
    if plt_times:
        ax.legend(
            (
                "load data",
                "train",
                "persist",
                "evaluate",
                "total",
                "1 sec",
                "1 min",
                "10 min",
            ),
            loc="upper left",
        )
    else:
        ax.legend(
            ("load data", "train", "persist", "evaluate", "total"), loc="upper left"
        )
    return f, ax


def print_stats(
    count,
    num_words_list,
    num_words,
    lookup_construct_time,
    td_load_time,
    train_time,
    persist_time,
    eval_time,
    total_time,
):

    print("{0:.2f} % done".format((count + 1) / len(num_words_list) * 100))
    print("with {} words in lookup table:".format(num_words))
    print("    took {} sec. to construct lookup table".format(lookup_construct_time))
    print("    took {} sec. to load training data".format(td_load_time))
    print("    took {} sec. to train model".format(train_time))
    print("    took {} sec. to perist model".format(persist_time))
    print("    took {} sec. to evaluate on test set".format(eval_time))
    print("    took {} sec. total".format(total_time))


if __name__ == "__main__":

    N = 1
    word_length = 10
    verbose = True

    num_word_distribution = np.logspace(0, 7, num=N)

    # integerize and uniquify this
    num_words_list = [int(num_word_distribution[i]) for i in range(N)]
    num_words_list = list(set(num_words_list))
    num_words_list.sort()

    t_train = []
    t_eval = []
    t_total = []

    for count, num_words in enumerate(num_words_list):

        t_tot = time()
        construct_lookup(num_words, word_length)
        train_time = train_model()
        eval_time = evaluate_model()
        total_time = time() - t_tot

        if verbose:
            print_stats(
                count, num_words_list, num_words, train_time, eval_time, total_time
            )

        t_train.append(train_time)
        t_eval.append(eval_time)
        t_total.append(total_time)

    f, ax = plot_results(num_words_list, t_train, t_eval, t_total, plt_times=True)
    f.savefig("img/results_terminal.png", dpi=400)
    plt.show()