from argparse import ArgumentParser, ArgumentTypeError, ArgumentDefaultsHelpFormatter
from collections import defaultdict
import h5py
import pandas as pd
from whale_watch import fuse_reads
import sys
from pathlib import Path
import subprocess
from tqdm import tqdm


def full_path(file):
    return str(Path(file).expanduser().resolve())


def validate_file(file):
    f = Path(full_path(file))
    valid = ['.eps', '.ps', '.tex', '.pdf', '.jpeg', '.tiff', '.png', '.bmp', '.svg', '.wmf']
    if f.suffix not in valid:
        msg = "File doesn't end with one of {v}".format(v=valid)
        raise ArgumentTypeError(msg)
    return str(f)


def trim_series(keep_series, trim_df):
    """
    Given a DataFrame and a list of column names, drop any columns that are
    not in the list

    :param keep_series: list of column names to keep
    :param trim_df: pandas DataFrame
    :return: pandas DataFrame with the other columns removed
    """
    drop_cols = [col for col in trim_df.columns if col not in keep_series]
    return trim_df.drop(columns=drop_cols)


def get_annotations(path, fields, enum_field):
    data_labels = {}
    for field in fields:
        data_labels[field] = path[field]
    data_dtypes = {}
    if h5py.check_dtype(enum=path.dtype[enum_field]):
        dataset_dtype = h5py.check_dtype(enum=path.dtype[enum_field])
        # invert {label: code} to {code: label}; note labels that share a
        # code collapse to a single entry
        data_dtypes = {v: k for k, v in dataset_dtype.items()}
    labels_df = pd.DataFrame(data=data_labels)
    return labels_df, data_dtypes
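# A minimal sketch of what get_annotations() returns, assuming a typical
# MinKNOW classification enum on 'modal_classification' (the codes and labels
# below are illustrative, not taken from any particular bulk file):
#
#   int_fields = ['read_id', 'read_start', 'modal_classification']
#   labels_df, labels_dt = get_annotations(int_path, int_fields, 'modal_classification')
#   labels_dt  ->  {0: 'pore', 1: 'strand', 2: 'adapter', ...}   # code -> state name
#
# 'read_start' is still in raw samples at this point; prepare_data() below
# divides by the sample frequency to convert it to seconds.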
def prepare_data(seq_sum_df, interval_time, bulkfile, v, sf, run_id):
    seq_sum_df['end_time'] = seq_sum_df['start_time'] + seq_sum_df['duration']
    seq_sum_df = seq_sum_df[seq_sum_df['run_id'] == run_id]
    channels = seq_sum_df['channel'].drop_duplicates(keep='first').values
    dist_dict = defaultdict(list)
    sdist_dict = defaultdict(list)
    for ch in tqdm(channels):
        # format channel
        ch = int(ch)
        ch_str = "Channel_{ch}".format(ch=ch)
        # if v:
        #     print(ch_str)
        # set the paths; skip channels that are missing from the bulk file
        try:
            int_path = bulkfile["IntermediateData"][ch_str]["Reads"]
        except KeyError:
            print("KeyError: " + ch_str)
            continue
        except Exception as e:
            print(type(e).__name__)
            print(e)
            sys.exit()
        try:
            state_path = bulkfile["StateData"][ch_str]["States"]
        except KeyError:
            print("KeyError: " + ch_str)
            continue
        except Exception as e:
            print(type(e).__name__)
            print(e)
            sys.exit()
        # get annotations from intermediate data
        int_fields = ['read_id', 'read_start', 'modal_classification']
        labels, labels_dt = get_annotations(int_path, int_fields, 'modal_classification')
        labels = labels.drop_duplicates(subset="read_id", keep="first")
        labels['read_start'] = labels['read_start'] / sf
        labels['read_id'] = labels['read_id'].str.decode('utf8')
        # get annotations from state data
        state_fields = ['acquisition_raw_index', 'summary_state']
        state_label_df, state_label_dtypes = get_annotations(state_path, state_fields, 'summary_state')
        state_label_df['acquisition_raw_index'] = state_label_df['acquisition_raw_index'] / sf
        state_label_df = state_label_df.rename(
            columns={'acquisition_raw_index': 'read_start', 'summary_state': 'modal_classification'}
        )
        # merge intermediate and state data
        labels = pd.concat([labels, state_label_df], ignore_index=True)
        labels.sort_values(by='read_start', ascending=True, inplace=True)
        labels_dt.update(state_label_dtypes)
        labels = labels.ffill()
        # match annotations against this channel's reads
        seq_sum_match = seq_sum_df[seq_sum_df['channel'] == ch].dropna()
        # ends: events within +/- interval_time of each read end
        for index, row in seq_sum_match.iterrows():
            b = labels[labels['read_start'].between(
                row['end_time'] - interval_time, row['end_time'], inclusive='neither')]
            a = labels[labels['read_start'].between(
                row['end_time'], row['end_time'] + interval_time, inclusive='both')]
            for index2, row2 in b.iterrows():
                dist_dict[labels_dt[row2['modal_classification']]].append(row2['read_start'] - row['end_time'])
            for index3, row3 in a.iterrows():
                dist_dict[labels_dt[row3['modal_classification']]].append(row3['read_start'] - row['end_time'])
        # starts: events within +/- interval_time of each read start
        for index, row in seq_sum_match.iterrows():
            sb = labels[labels['read_start'].between(
                row['start_time'] - interval_time, row['start_time'], inclusive='neither')]
            sa = labels[labels['read_start'].between(
                row['start_time'], row['start_time'] + interval_time, inclusive='both')]
            for index2, row2 in sb.iterrows():
                sdist_dict[labels_dt[row2['modal_classification']]].append(row2['read_start'] - row['start_time'])
            for index3, row3 in sa.iterrows():
                sdist_dict[labels_dt[row3['modal_classification']]].append(row3['read_start'] - row['start_time'])
    dist_df = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in dist_dict.items()]))
    sdist_df = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in sdist_dict.items()]))
    return dist_df, sdist_df
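# For illustration, dist_df/sdist_df end up with one column per event class
# and one row per observed offset in seconds relative to a read boundary
# (negative = before, positive = after). Column names depend on the bulk
# file's enums; the values below are assumed examples only:
#
#          pore   adapter
#     0   -4.21      0.03
#     1    1.87      2.54
#
# The per-class lists are ragged, so building the frame from pd.Series pads
# the shorter columns with NaN.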
def main():
    args = get_args()
    # Use whale_watch to get fused-read DataFrames:
    #   df2: just fused reads
    #   ss: sequencing summary
    #   chained_read_ids: read_ids of just fused reads
    df2, ss, chained_read_ids = fuse_reads(args.summary, args.paf, args.distance, 0, False, False)
    keep_cols = ['channel', 'read_id', 'run_id', 'start_time', 'duration']
    # ss2 is un-fused reads
    ss2 = ss[~ss['read_id'].isin(chained_read_ids)]
    ss2 = ss2.reset_index()
    # ss3 is all to-be-fused reads
    ss3 = ss[ss['read_id'].isin(chained_read_ids)]
    # ss4 is 2nd chained read to last chained read
    ss4 = ss3[~ss3['read_id'].isin(df2['read_id'])]
    # ss5 is all chained reads excluding the last in the chain
    ss5 = ss3[~ss3['read_id'].isin(df2['last_read_id'])]
    ss5 = ss5.drop_duplicates(subset=keep_cols, keep='first')
    # ss6 is the last chained read in each chain
    ss6 = ss3[ss3['read_id'].isin(df2['last_read_id'])]
    # Open bulkfile
    bulkfile = h5py.File(args.bulk_file, "r")
    sf = int(bulkfile["UniqueGlobalKey"]["context_tags"].attrs["sample_frequency"].decode('utf8'))
    run_id = bulkfile["UniqueGlobalKey"]["tracking_id"].attrs["run_id"].decode('utf8')
    ss4 = trim_series(keep_cols, ss4)
    ss2 = trim_series(keep_cols, ss2)
    ss5 = trim_series(keep_cols, ss5)
    ss6 = trim_series(keep_cols, ss6)
    df2 = trim_series(keep_cols, df2)
    df2['duration'] = pd.to_numeric(df2['duration'], errors='coerce')
    df2['channel'] = pd.to_numeric(df2['channel'], errors='coerce').astype('int64')
    i_dist_df, i_sdist_df = prepare_data(ss2, args.time, bulkfile, args.verbose, sf, run_id)
    k_dist_df, k_sdist_df = prepare_data(ss4, args.time, bulkfile, args.verbose, sf, run_id)
    j_dist_df, j_sdist_df = prepare_data(df2, args.time, bulkfile, args.verbose, sf, run_id)
    l_dist_df, l_sdist_df = prepare_data(ss6, args.time, bulkfile, args.verbose, sf, run_id)
    m_dist_df, m_sdist_df = prepare_data(ss5, args.time, bulkfile, args.verbose, sf, run_id)
    # starts of un-fused reads ("unique read starts")
    i_sdist_df.to_csv(args.A, sep=",", header=True)
    # ends of un-fused reads ("unique read ends")
    i_dist_df.to_csv(args.B, sep=",", header=True)
    # starts of just fused reads
    j_sdist_df.to_csv(args.C, sep=",", header=True)
    # starts of reads to be fused ("internal read starts")
    k_sdist_df.to_csv(args.D, sep=",", header=True)
    # internal read ends
    m_dist_df.to_csv(args.E, sep=",", header=True)
    # last fused read end
    l_dist_df.to_csv(args.F, sep=",", header=True)
    if not args.no_generate_plot:
        # subprocess to R, generate plot
        R_script = str(Path(Path(__file__).resolve().parent / 'whale.R'))
        cmd = "Rscript {r} {a} {b} {c} {d} {e} {f} {o} {run}".format(
            r=R_script, a=args.A, b=args.B, c=args.C, d=args.D,
            e=args.E, f=args.F, o=args.out, run=run_id
        )
        proc = subprocess.Popen(cmd, shell=True)
        proc.communicate()
        if proc.returncode == 0:
            print("plot saved as {f}".format(f=Path(args.out).name))
        else:
            print(proc.returncode)
    return
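# With the default output filenames, the generated R call looks roughly like
# the line below (illustrative; full_path() resolves the CSV arguments to
# absolute paths, whale.R is resolved next to this script, and run_id comes
# from the bulk file):
#
#   Rscript /path/to/whale.R unique_read_start.csv unique_read_end.csv \
#       split_read_start.csv internal_read_start.csv internal_read_end.csv \
#       split_read_end.csv classification_count.pdf <run_id>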
def get_args():
    parser = ArgumentParser(
        description="""Parse sequencing_summary.txt, .paf, and bulk fast5 files to
                    generate CSV files containing the distributions of MinKNOW events
                    around read starts and ends. These are divided into unique reads,
                    split reads and internal reads. The R script, whale.R, is called
                    to generate the plot; this requires the packages: ggplot2, tidyr,
                    and dplyr. Note: of the MinKNOW classifications only above, adapter,
                    pore, transition, unblocking, and unclassified are included.""",
        formatter_class=ArgumentDefaultsHelpFormatter,
        add_help=False)
    general = parser.add_argument_group(title='General options')
    general.add_argument("-h", "--help",
                         action="help",
                         help="Show this help and exit"
                         )
    general.add_argument("-d", "--distance",
                         help='''Specify the maximum distance, in bases, between consecutive
                         mappings. This is the difference between 'Target Start' and
                         'Target End' in a paf file''',
                         type=int,
                         default=10000,
                         )
    general.add_argument("-V", "--verbose",
                         help="Print verbose output to terminal",
                         action="store_true",
                         default=False,
                         )
    in_args = parser.add_argument_group(title='Input sources')
    in_args.add_argument("-b", "--bulk-file",
                         help="An ONT bulk fast5 file containing raw signal",
                         type=str,
                         required=True,
                         )
    in_args.add_argument("-s", "--summary",
                         help="A sequencing summary file generated by albacore",
                         type=str,
                         required=True,
                         )
    in_args.add_argument("-p", "--paf",
                         help="A paf file generated by minimap2",
                         type=str,
                         required=True,
                         )
    in_args.add_argument("-t", "--time",
                         help="+/- time around a strand event in seconds",
                         type=int,
                         default=10,
                         )
    out_args = parser.add_argument_group(title='Output files')
    out_args.add_argument("--no-generate-plot",
                          help="If set, do not generate density plot",
                          action="store_true",
                          default=False,
                          )
    out_args.add_argument("-A",
                          help="CSV of MinKNOW events occurring before and after correctly called read starts",
                          type=full_path,
                          default='unique_read_start.csv',
                          required=False,
                          )
    out_args.add_argument("-B",
                          help="CSV of MinKNOW events occurring before and after correctly called read ends",
                          type=full_path,
                          default='unique_read_end.csv',
                          required=False,
                          )
    out_args.add_argument("-C",
                          help="""CSV of MinKNOW events occurring before and after the start of the
                          first incorrectly split read in a group""",
                          type=full_path,
                          default='split_read_start.csv',
                          required=False,
                          )
    out_args.add_argument("-D",
                          help="""CSV of MinKNOW events occurring before and after incorrectly called
                          read starts, within a group of incorrectly split reads""",
                          type=full_path,
                          default='internal_read_start.csv',
                          required=False,
                          )
    out_args.add_argument("-E",
                          help="""CSV of MinKNOW events occurring before and after incorrectly called
                          read ends, within a group of incorrectly split reads""",
                          type=full_path,
                          default="internal_read_end.csv",
                          required=False,
                          )
    out_args.add_argument("-F",
                          help="""CSV of MinKNOW events occurring before and after the end of the
                          last incorrectly split read in a group""",
                          type=full_path,
                          default='split_read_end.csv',
                          required=False,
                          )
    out_args.add_argument("--out",
                          help='''Specify the output filename for the plot. File extension must be
                          one of [.eps, .ps, .tex, .pdf, .jpeg, .tiff, .png, .bmp, .svg, .wmf]''',
                          type=validate_file,
                          default='classification_count.pdf',
                          required=False,
                          )
    return parser.parse_args()


if __name__ == '__main__':
    main()
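# Example invocation (all file names below are placeholders; substitute the
# name this script is saved under):
#
#   python whale_events.py -b run.fast5 -s sequencing_summary.txt \
#       -p alignments.paf -t 10 --out classification_count.pdf
#
# This writes the six event-distribution CSVs and then calls whale.R to draw
# the density plot, unless --no-generate-plot is given.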