"""Utilities to generate synthetic data"""
import random
import string

import pandas as pd


def generate_tokens(mean, std_dev, num_tokens):
    """Generate distinct random lowercase tokens.

    Token lengths are drawn from a normal distribution and rounded to the
    nearest integer; draws shorter than 2 characters are rejected and
    redrawn. Each token is built from characters picked uniformly from
    ``string.ascii_lowercase`` and is regenerated until it differs from
    every token produced so far.

    Args:
        mean: Mean of the normal distribution of token lengths.
        std_dev: Standard deviation of the normal distribution of token
            lengths.
        num_tokens: Number of distinct tokens to generate.

    Returns:
        A list of ``num_tokens`` distinct strings, in the order they were
        generated. The list is empty when ``num_tokens`` is not positive.

    Raises:
        ValueError: If ``std_dev`` is 0 and the only possible length is
            below 2, or that length does not admit ``num_tokens`` distinct
            tokens. Without this check the generation loop would never
            terminate.
    """
    alphabet = string.ascii_lowercase
    alphabet_size = len(alphabet)

    # With a zero standard deviation every draw yields the same length, so
    # an unusable length can be detected up front instead of looping forever.
    if num_tokens > 0 and std_dev == 0:
        fixed_length = int(round(mean))
        if fixed_length < 2:
            raise ValueError(
                'token length is fixed at %d, but tokens need at least '
                '2 characters' % fixed_length)
        if alphabet_size ** fixed_length < num_tokens:
            raise ValueError(
                'cannot generate %d distinct tokens of fixed length %d'
                % (num_tokens, fixed_length))

    seen = set()
    tokens = []
    generated_per_length = {}
    while len(tokens) < num_tokens:
        length = int(round(random.normalvariate(mean, std_dev)))
        if length < 2:
            continue

        # Once every string of this length has been produced, retrying the
        # same length can never succeed; draw a new length instead.
        used = generated_per_length.get(length, 0)
        if used >= alphabet_size ** length:
            continue

        while True:
            candidate = ''.join(random.choice(alphabet)
                                for _ in range(length))
            if candidate not in seen:
                break

        seen.add(candidate)
        tokens.append(candidate)
        generated_per_length[length] = used + 1
    return tokens


def generate_table(mean, std_dev, tokens, num_records,
                   id_col_name, attr_col_name):
    """Build a table of random space-separated token strings.

    For each record, the number of tokens is drawn from a normal
    distribution and rounded to the nearest integer. That many tokens are
    then picked uniformly at random (with repetition) from ``tokens`` and
    joined with single spaces. A non-positive size yields an empty string.

    Args:
        mean: Mean of the normal distribution of tokens per record.
        std_dev: Standard deviation of the normal distribution of tokens
            per record.
        tokens: Indexable sequence of strings to sample from.
        num_records: Number of records (rows) to generate.
        id_col_name: Name of the column holding the record ids.
        attr_col_name: Name of the column holding the generated strings.

    Returns:
        A pandas DataFrame with columns ``id_col_name`` and
        ``attr_col_name``. Record ids are consecutive integers starting
        at 0.

    Raises:
        ValueError: If a record needs at least one token but ``tokens``
            is empty.
    """
    num_tokens = len(tokens)
    records = []
    while len(records) < num_records:
        size = int(round(random.normalvariate(mean, std_dev)))
        if size > 0 and num_tokens == 0:
            raise ValueError('cannot sample tokens from an empty '
                             'token collection')
        chosen = [tokens[random.randint(0, num_tokens - 1)]
                  for _ in range(size)]
        # The id of a record is its position in the table.
        records.append([len(records), ' '.join(chosen)])
    return pd.DataFrame(records, columns=[id_col_name, attr_col_name])