import random import string import os random.seed(42) OUTPUT = "vectors.txt" def random_str(n): return ''.join(random.choice(string.ascii_lowercase) for _ in range(n)) def random_substr(string, length): # [0, len(string)-length] s = random.randint(0, len(string) - 1 - length) # Note: we cannot just return index 's' here, as this might not be the # first occurence of the substring in string. return string[s:s+length] if __name__ == '__main__': if os.path.exists(OUTPUT): raise Exception("File already exists: " + OUTPUT) # String lengths to generate. nlens = [100, 1000, 10000, 100000, 1000000] # Strings that will be searched. search_strings = [] # Test strings that will be tested with search strings. test_strings = [] results = [] for n in nlens: print("Generating cases for length n:", n) # Search string. ss = random_str(n) s_test_strings = [] res = [] # Generate search strings of lengths 1 to len(ss). for x in range(1, min(len(ss), 300)): # Generate 50 random search strings of length x. random_cases = [random_str(x) for _ in range(50)] # Compute correct results for random strings. random_results = [ss.find(s) for s in random_cases] # For positive cases select 50 random substrings of length x. pos_cases = [random_substr(ss, x) for _ in range(50)] pos_results = [ss.find(s) for s in pos_cases] s_test_strings.append(random_cases + pos_cases) res.append(random_results + pos_results) search_strings.append(ss) test_strings.append(s_test_strings) results.append(res) print("Writing test vectors to file:") # vectors.txt is in: <search_string>,[<test_string>:<expected>] format. with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), OUTPUT), 'w') as f: for i in range(len(search_strings)): f.write(search_strings[i] + ",") for j in range(len(test_strings[i])): for k in range(len(test_strings[i][j])): f.write(test_strings[i][j][k] + ":" + str(results[i][j][k])+",") f.write("\n")