# Python source code of aggregate

"""
    Use this script to aggregate and analyze the data
"""

import argparse
import json
import re
from pprint import pprint
from typing import Any, List, Optional, Tuple

import numpy as np
from scipy.stats import gmean


def get_float(measure_string: str) -> float:
    """Return the first unsigned decimal number found in *measure_string*.

    Only a run of digits with an optional fractional part is recognised, so
    a leading sign, an exponent and any unit suffix are ignored (``"-4.2ms"``
    yields ``4.2``).

    Args:
        measure_string: Text that may contain a number, e.g. ``"12.5ms"``.

    Returns:
        The number as a float, or ``0.0`` when the text contains no digits.
    """
    match = re.search(r"\d+(\.\d*)?", measure_string)
    return float(match.group(0)) if match else 0.0


def parse(file: Any) -> Tuple[List[str], List[float]]:
    """Extract benchmark names and their measured values from one result file.

    The layout is positional (0-based line index):

    * lines 0-2 are skipped;
    * lines 3-4 are whitespace-separated: the name is the first field and the
      value is the first number that ``get_float`` finds in the second field;
    * line 5 is whitespace-separated with the numeric value first and the
      name second;
    * every later line has the form ``name: value``.

    Blank lines after line 5 (for example a trailing empty line) are ignored
    instead of aborting the parse.

    NOTE(review): ``get_float`` drops any unit suffix, so values written with
    different units (e.g. ``us`` vs ``ms``) are not normalised here.

    Args:
        file: An iterable of text lines, such as an open text file.

    Returns:
        A ``(benchmarks, throughputs)`` pair of equal-length lists, in file
        order.

    Raises:
        IndexError: If a non-blank line has fewer fields than its position
            requires.
        ValueError: If the first field of line 5 is not a valid float.
    """
    benchmarks: List[str] = []
    throughputs: List[float] = []
    # Iterate lazily; there is no need to hold the whole file in memory.
    for i, line in enumerate(file):
        if i <= 2:
            continue
        stripped = line.strip()
        if i <= 4:
            fields = stripped.split()
            benchmarks.append(fields[0])
            throughputs.append(get_float(fields[1]))
        elif i == 5:
            fields = stripped.split()
            benchmarks.append(fields[1])
            throughputs.append(float(fields[0]))
        else:
            if not stripped:
                # A blank line carries no measurement; skip it rather than
                # fail on the missing "name: value" fields.
                continue
            fields = stripped.split(": ")
            benchmarks.append(fields[0])
            throughputs.append(get_float(fields[1]))

    return benchmarks, throughputs


def aggregate(results: Any, method: Optional[str] = None) -> Any:
    """Collapse the results of several runs into one value per benchmark.

    Args:
        results: Array whose first axis enumerates the runs; the reduction is
            applied along axis 0.
        method: One of ``"worst"``, ``"mean"``, ``"median"`` or ``"auto"``.
            When omitted, the command-line choice ``args.aggregation`` is
            used, so existing single-argument calls behave as before.

    Returns:
        The reduction of *results* along axis 0:

        * ``"worst"``  -- the maximum over the runs;
        * ``"mean"``   -- the arithmetic mean over the runs;
        * ``"median"`` -- the median over the runs;
        * ``"auto"``   -- the maximum when there are at most two runs,
          otherwise the median.

    Raises:
        ValueError: If *method* is not one of the names listed above.

    NOTE(review): "worst" always takes the maximum; whether the maximum is
    really the worst value depends on the metric -- confirm with the caller.
    """
    if method is None:
        # Fall back to the global set up by the command-line entry point.
        method = args.aggregation

    if method == "worst":
        return results.max(axis=0)
    if method == "mean":
        return results.mean(axis=0)
    if method == "median":
        return np.median(results, axis=0)
    if method == "auto":
        # A median of one or two runs is not meaningful, so take the maximum.
        if results.shape[0] <= 2:
            return results.max(axis=0)
        return np.median(results, axis=0)
    raise ValueError(f"Unknown aggregation method: {method!r}")


def main() -> None:
    """Print the hase/original ratio of the aggregated results per benchmark.

    For every run index ``i`` in ``range(args.n)`` two files are read from
    the current working directory: ``{args.name}_{i}.out`` (original) and
    ``{args.name}_hase_{i}.out`` (hase). The runs of each variant are
    aggregated with ``aggregate`` and one ``name<TAB>ratio`` line is printed
    per benchmark, with the ratio shown to four decimal places.

    The benchmark names are taken from the last hase file that was parsed.

    Raises:
        ValueError: If ``args.n`` is smaller than 1, since there would be
            nothing to aggregate.
    """
    if args.n < 1:
        raise ValueError(f"n must be at least 1, got {args.n}")

    hase_runs = []
    original_runs = []
    benchmarks: List[str] = []
    for i in range(args.n):
        with open(f"{args.name}_{i}.out") as file:
            _, throughput = parse(file)
        original_runs.append(throughput)
        with open(f"{args.name}_hase_{i}.out") as file:
            benchmarks, throughput = parse(file)
        hase_runs.append(throughput)

    # One row per run, one column per benchmark.
    hase = np.array(hase_runs)
    original = np.array(original_runs)

    ratios = aggregate(hase) / aggregate(original)

    for name, ratio in zip(benchmarks, ratios):
        print(f"{name}\t{ratio:.4f}")

    # The geometric mean over all ratios is currently disabled:
    # print("GeoMean\t" + str(gmean(ratios)))


if __name__ == "__main__":
    # Command-line entry point. ``args`` is intentionally left as a
    # module-level name because aggregate() and main() read it as a global.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "n",
        type=int,
        help="Number of runs to aggregate; the files <name>_<i>.out and "
        "<name>_hase_<i>.out are read for i = 0 .. n-1",
    )
    parser.add_argument(
        "--name", type=str, default="nginx", help="The name of the benchmark"
    )
    # NOTE(review): --outdir is parsed but not used by any code visible in
    # this file; it is kept so existing command lines continue to work.
    parser.add_argument("--outdir", type=str, default=".", help="The output directory")
    parser.add_argument(
        "-a",
        "--aggregation",
        type=str,
        default="auto",
        choices=["auto", "median", "worst", "mean"],
        help="Choose a way to aggregate the data from different runs",
    )
    args = parser.parse_args()
    main()