python source code of atlas3

import json
import os.path as op
import pandas as pd
import ssbio.utils
import pickle
from collections import defaultdict
from tqdm import tqdm_notebook as tqdm

filter_proteome = [False, ]

filter_observations = ['A22.CCCP', 'A22.PYOCYANIN', 'CCCP.2', 'CHIR090.CCCP', 'DOXYCYCLINE.PARAQUAT', 'DOXYCYCLINE.PMS',
                       'DOXYCYCLINE.PYOCYANIN', 'FUMARATE.40C', 'FUMARATE.40MM', 'FUMARATE.A22', 'FUMARATE.CEFSULODIN',
                       'FUMARATE.PARAQUAT', 'FUMARATE.TOBRAMYCIN', 'NACL.PARAQUAT', 'NOSALT.FUMARATE', 'NOSALT.PARAQUAT',
                       'PARAQUAT.10UM', 'PH5.FUMARATE', 'PMS.PROCAINE', 'PYOCYANIN.0P2UM', 'PYOCYANIN.10', 'PYOCYANIN.1UM',
                       'TRIMETHOPRIM.PYOCYANIN', 'PYOCYANIN.10UM', 'pathotype_simple',
                       'ros_simulated', 'pathotype_simple2', 'pathotype_simple3'] #'isolation_source', 'pathotype', ]

filter_ismem = [('vizrecon','membrane'),
                ('vizrecon','inner_membrane'),
                ('vizrecon','outer_membrane'),]
                #('tmhmm','membrane')]

filter_notmem = [#('vizrecon','non_membrane'),
                 ('vizrecon','periplasm'),
                 ('vizrecon','cytosol'), ]
                 #('vizrecon','extracellular'),]
                 #('tmhmm','non_membrane')]

filter_ismem_subseq = [['all', 'acc_3D', 'metal_2_5D', 'metal_3D', 'tm_2D', 'tm_3D',
                        'csa_2_5D', 'sites_2_5D'], ]  # need the comma to treat it like one set
filter_notmem_subseq = [['all', 'disorder_2D', 'ss_disorder_2D', 'disorder_3D', 'ss_disorder_3D', 'acc_2D',
                         'acc_3D', 'surface_3D', 'metal_2_5D', 'metal_3D', 'dna_2_5D', 'csa_2_5D', 'sites_2_5D'], ]
filter_all_subseq = [['all', 'disorder_2D', 'ss_disorder_2D', 'disorder_3D', 'ss_disorder_3D', 'acc_2D',
                      'tm_2D', 'tm_3D',
                      'acc_3D', 'surface_3D', 'metal_2_5D', 'metal_3D', 'dna_2_5D', 'csa_2_5D', 'sites_2_5D'], ]

filter_subseq_suffixes = {'all': ['aa_%_ord', 'aa_%_dis', 'aa_%_M', 'aa_%_C', 'aa_%_chrg', 'aa_%_carb', 'aa_%_poschrg', 'aa_%_negchrg', 'aa_%_Y'],
                          'disorder_2D': ['aa_%_ord', 'aa_%_dis'],
                          'disorder_3D': ['aa_%_ord', 'aa_%_dis'],
                          'ss_disorder_2D': ['aa_%_ord', 'aa_%_dis'],
                          'ss_disorder_3D': ['aa_%_ord', 'aa_%_dis'],
                          'acc_2D': ['aa_%_M', 'aa_%_C', 'aa_%_chrg', 'aa_%_carb', 'aa_%_poschrg', 'aa_%_negchrg', 'aa_%_Y'],
                          'acc_3D': ['aa_%_M', 'aa_%_C', 'aa_%_chrg', 'aa_%_carb', 'aa_%_poschrg', 'aa_%_negchrg', 'aa_%_Y'],
                          'surface_3D': ['aa_%_M', 'aa_%_C', 'aa_%_chrg', 'aa_%_carb', 'aa_%_poschrg', 'aa_%_negchrg', 'aa_%_Y'],
                          'metal_2_5D': ['aa_%_M', 'aa_%_bulk', 'aa_%_C', 'aa_%_chrg', 'aa_%_carb', 'aa_%_poschrg', 'aa_%_negchrg'],
                          'metal_3D': ['aa_%_M', 'aa_%_bulk', 'aa_%_C', 'aa_%_chrg', 'aa_%_carb', 'aa_%_poschrg', 'aa_%_negchrg'],
                          'tm_2D': ['aa_%_M', 'aa_%_tmstab', 'aa_%_tmunstab'],
                          'tm_3D': ['aa_%_M', 'aa_%_tmstab', 'aa_%_tmunstab'],
                          'dna_2_5D': ['aa_%_dis', 'aa_%_ord'],
                          'csa_2_5D': ['aa_%_chrg', 'aa_%_M', 'aa_%_C'],
                          'sites_2_5D': ['aa_%_chrg', 'aa_%_M', 'aa_%_C']}

# filter_subseq_3D_suffixes = {'disorder_3D': ['aa_count_ord', 'aa_count_dis'],
#                           'ss_disorder_3D': ['aa_count_ord', 'aa_count_dis'],
#                           'acc_3D': ['aa_count_M', 'aa_count_C', 'aa_count_chrg', 'aa_count_carb', 'aa_count_poschrg', 'aa_count_negchrg'],
#                           'surface_3D': ['aa_count_M', 'aa_count_C', 'aa_count_chrg', 'aa_count_carb', 'aa_count_poschrg', 'aa_count_negchrg'],
#                           'metal_3D': ['aa_count_M', 'aa_count_bulk', 'aa_count_C', 'aa_count_chrg', 'aa_count_carb', 'aa_count_poschrg', 'aa_count_negchrg'],
#                           'tm_3D': ['aa_count_M', 'aa_count_tmstab', 'aa_count_tmunstab'],
#                           'dna_2_5D': ['aa_count_dis', 'aa_count_ord'],
#                           'csa_2_5D': ['aa_count_chrg', 'aa_count_M', 'aa_count_C'],
#                           'sites_2_5D': ['aa_count_chrg', 'aa_count_M', 'aa_count_C']}
#
# filter_subseq_2D_suffixes = {'disorder_2D': ['aa_count_ord', 'aa_count_dis'],
#                           'ss_disorder_2D': ['aa_count_ord', 'aa_count_dis'],
#                           'acc_2D': ['aa_count_M', 'aa_count_C', 'aa_count_chrg', 'aa_count_carb', 'aa_count_poschrg', 'aa_count_negchrg'],
#                           'metal_2_5D': ['aa_count_M', 'aa_count_bulk', 'aa_count_C', 'aa_count_chrg', 'aa_count_carb', 'aa_count_poschrg', 'aa_count_negchrg'],
#                           'tm_2D': ['aa_count_M', 'aa_count_tmstab', 'aa_count_tmunstab'],
#                           'dna_2_5D': ['aa_count_dis', 'aa_count_ord'],
#                           'csa_2_5D': ['aa_count_chrg', 'aa_count_M', 'aa_count_C'],
#                           'sites_2_5D': ['aa_count_chrg', 'aa_count_M', 'aa_count_C']}


def get_protein_feather_paths(protgroup, memornot, protgroup_dict, protein_feathers_dir, core_only_genes=None):
    """
    protgroup example: ('subsystem', 'cog_primary', 'H')
    memornot example: ('vizrecon', 'membrane')
    protgroup_dict example: {'databases': {'redoxdb': {'experimental_sensitive_cys': ['b2518','b3352','b2195','b4016'], ...}}}
    """
    prots_memornot = protgroup_dict['localization'][memornot[0]][memornot[1]]

    if protgroup[0] == 'localization':
        if protgroup[2] != 'all':
            if memornot[1] in ['membrane', 'inner_membrane', 'outer_membrane'] and protgroup[2] not in ['membrane', 'inner_membrane', 'outer_membrane']:
                return []
            if memornot[1] not in ['membrane', 'inner_membrane', 'outer_membrane'] and protgroup[2] in ['membrane', 'inner_membrane', 'outer_membrane']:
                return []

    prots_group = protgroup_dict[protgroup[0]][protgroup[1]][protgroup[2]]
    prots_filtered = list(set(prots_group).intersection(prots_memornot))
    if core_only_genes:
        prots_filtered = list(set(prots_filtered).intersection(core_only_genes))
    return [op.join(protein_feathers_dir, '{}_protein_strain_properties.fthr'.format(x)) for x in prots_filtered if op.exists(op.join(protein_feathers_dir, '{}_protein_strain_properties.fthr'.format(x)))]


def get_observed_strains_and_df(observation, observation_dict):
    """
    observation example: 'ros_simulated'
    observation_dict example: {'ros_simulated': [['NT12204_755', 'wt'], ['NT12120_270', 'wt'], ...] ...}
    """
    observed_df = pd.DataFrame.from_records(observation_dict[observation], columns=['strain','phenotype']).set_index('strain')
    observed_strains = observed_df.index.tolist()
    return observed_strains, observed_df


def get_interested_subsequences(subsequences):#, subseqdim):
    """
    subsequences example: ['disorder_2D', 'ss_disorder_2D', ...]
    filter_subseq_suffixes example: {'disorder_2D': ['aa_count_ord', 'aa_count_dis'], ... } -- defined above
    """
    keep_indices = []
    # if subseqdim == 'ALLD':
    #     filter_subseq_suffixes = filter_subseq_suffixes
    # elif subseqdim == '3D':
    #     filter_subseq_suffixes = filter_subseq_3D_suffixes
    # elif subseqdim == '2D':
    #     filter_subseq_suffixes = filter_subseq_2D_suffixes
    # else:
    #     raise ValueError('ALLD, 3D, or 2D only!')

    for subseq in subsequences:
        if subseq == 'all':
            keep_indices.extend([x for x in filter_subseq_suffixes[subseq]])
        else:
            keep_indices.extend([subseq + '_' + x for x in filter_subseq_suffixes[subseq]])
    return keep_indices


def load_feather(protein_feather, length_filter_pid=None, copynum_scale=False, copynum_df=None):
    """Load a feather of amino acid counts for a protein.

    Args:
        protein_feather (str): path to feather file
        copynum_scale (bool): if counts should be multiplied by protein copy number
        copynum_df (DataFrame): DataFrame of copy numbers

    Returns:
        DataFrame: of counts with some aggregated together

    """
    protein_df = pd.read_feather(protein_feather).set_index('index')

    # Combine counts for residue groups
    from ssbio.protein.sequence.properties.residues import _aa_property_dict_one, EXTENDED_AA_PROPERTY_DICT_ONE
    aggregators = {
        'aa_count_bulk'    : {'residues': EXTENDED_AA_PROPERTY_DICT_ONE['Bulky'],
                              'subseqs' : ['metal_2_5D', 'metal_3D']},
        'aa_count_carb'    : {'residues': EXTENDED_AA_PROPERTY_DICT_ONE['Carbonylation susceptible'],
                              'subseqs' : ['metal_2_5D', 'metal_3D', 'acc_2D', 'acc_3D', 'surface_3D']},
        'aa_count_chrg'    : {'residues': _aa_property_dict_one['Charged'],
                              'subseqs' : ['metal_2_5D', 'metal_3D', 'csa_2_5D', 'sites_2_5D', 'acc_2D', 'acc_3D',
                                           'surface_3D']},
        'aa_count_poschrg' : {'residues': _aa_property_dict_one['Basic'],
                              'subseqs' : ['metal_2_5D', 'metal_3D', 'acc_2D', 'acc_3D', 'surface_3D']},
        'aa_count_negchrg' : {'residues': _aa_property_dict_one['Acidic'],
                              'subseqs' : ['metal_2_5D', 'metal_3D', 'acc_2D', 'acc_3D', 'surface_3D']},
        'aa_count_tmstab'  : {'residues': EXTENDED_AA_PROPERTY_DICT_ONE['TM stabilizing'],
                              'subseqs' : ['tm_2D', 'tm_3D']},
        'aa_count_tmunstab': {'residues': EXTENDED_AA_PROPERTY_DICT_ONE['TM to Thr stabilizing'],
                              'subseqs' : ['tm_2D', 'tm_3D']},
        'aa_count_dis'     : {'residues': EXTENDED_AA_PROPERTY_DICT_ONE['Disorder promoting'],
                              'subseqs' : ['disorder_2D', 'ss_disorder_2D', 'disorder_3D', 'ss_disorder_3D',
                                           'dna_2_5D']},
        'aa_count_ord'     : {'residues': EXTENDED_AA_PROPERTY_DICT_ONE['Order promoting'],
                              'subseqs' : ['disorder_2D', 'ss_disorder_2D', 'disorder_3D', 'ss_disorder_3D',
                                           'dna_2_5D']}}

    # Do combination counts for all types of subsequences
    for suffix, info in aggregators.items():
        agg_residues = info['residues']
        for prefix in info['subseqs']:
            to_add_idxes = []
            for agg_res in agg_residues:
                to_add_idx = prefix + '_aa_count_' + agg_res
                if to_add_idx in protein_df.index:
                    to_add_idxes.append(to_add_idx)
            subseq_agged_col = protein_df.loc[to_add_idxes, :].sum()  # Add each residue series
            protein_df.loc[prefix + '_' + suffix] = subseq_agged_col  # Append to df

    ## REMOVE OTHER STRAINS WITH DELETIONS (use float -- length_filter_pid=0.8 to get only strains with >80% length
    ## alternative to atlas2.calculate_residue_counts_perstrain wt_pid_cutoff param -- works a little differently just considering length
    if length_filter_pid:
        keep_cols = protein_df.loc['aa_count_total'][protein_df.loc['aa_count_total'] > protein_df.at['aa_count_total', 'K12'] * length_filter_pid].index
        protein_df = protein_df[keep_cols]

    # Multiply by proteomics copy number?
    if copynum_scale:
        if not isinstance(copynum_df, pd.DataFrame):
            raise ValueError('Please supply copy numbers')
        protein_id = op.basename(protein_feather).split('_protein')[0]
        if protein_id in copynum_df.index:
            copynum = copynum_df.at[protein_id, 'copynum']
            if copynum > 0:  # TODO: currently keeping one copy of proteins with 0, is that ok?
                protein_df = protein_df * copynum

    return protein_df


def get_proteome_counts_simple(prots_filtered_feathers, outpath, length_filter_pid=None,
                               copynum_scale=False, copynum_df=None,
                               force_rerun=False):
    if ssbio.utils.force_rerun(flag=force_rerun, outfile=outpath):
        big_strain_counts_df = pd.DataFrame()
        first = True
        for feather in prots_filtered_feathers:
            loaded = load_feather(protein_feather=feather, length_filter_pid=length_filter_pid,
                                  copynum_scale=copynum_scale,
                                  copynum_df=copynum_df)
            if first:
                big_strain_counts_df = pd.DataFrame(index=loaded.index, columns=loaded.columns)
                first = False
            big_strain_counts_df = big_strain_counts_df.add(loaded, fill_value=0)
        if len(big_strain_counts_df) > 0:
            big_strain_counts_df.astype(float).reset_index().to_feather(outpath)
        return big_strain_counts_df
    else:
        return pd.read_feather(outpath).set_index('index')


# def get_proteome_counts_simple_sc(sc, prots_filtered_feathers, outpath, length_filter_pid=None,
#                                     copynum_scale=False, copynum_df=None,
#                                     force_rerun=False):
#     import ssbio.utils
#     if ssbio.utils.force_rerun(flag=force_rerun, outfile=outpath):
#         protein_feathers_final_rdd = sc.parallelize(prots_filtered_feathers)
#         mapper = protein_feathers_final_rdd.map(lambda x: load_feather(protein_feather=x, length_filter_pid=None,
#                                                                        copynum_scale=copynum_scale,
#                                                                        copynum_df=copynum_df))
#         big_strain_counts_df = mapper.reduce(lambda df1, df2: df1.add(df2, fill_value=0))
#         big_strain_counts_df.astype(float).reset_index().to_feather(outpath)
#         return big_strain_counts_df
#     else:
#         return pd.read_feather(outpath).set_index('index')


def get_proteome_counts_impute_missing(prots_filtered_feathers, outpath, length_filter_pid=None,
                                       copynum_scale=False, copynum_df=None,
                                       force_rerun=False):
    """Get counts, uses the mean feature vector to fill in missing proteins for a strain"""
    if ssbio.utils.force_rerun(flag=force_rerun, outfile=outpath):
        big_strain_counts_df = pd.DataFrame()
        first = True
        for feather in prots_filtered_feathers:
            loaded = load_feather(protein_feather=feather, length_filter_pid=length_filter_pid,
                                  copynum_scale=copynum_scale,
                                  copynum_df=copynum_df)
            if first:
                big_strain_counts_df = pd.DataFrame(index=_all_counts, columns=loaded.columns)
                first = False

            new_columns = list(set(loaded.columns.tolist()).difference(big_strain_counts_df.columns))
            if new_columns:
                for col in new_columns:
                    big_strain_counts_df[col] = big_strain_counts_df.mean(axis=1)

            not_in_loaded = list(set(big_strain_counts_df.columns).difference(loaded.columns.tolist()))
            if not_in_loaded:
                for col in not_in_loaded:
                    big_strain_counts_df[col] = big_strain_counts_df[col] + loaded.mean(axis=1)

            big_strain_counts_df = big_strain_counts_df.add(loaded, fill_value=0)

        if len(big_strain_counts_df) > 0:
            big_strain_counts_df.astype(float).reset_index().to_feather(outpath)
        return big_strain_counts_df
    else:
        return pd.read_feather(outpath).set_index('index')


def get_proteome_percentages(counts_df, outpath, force_rerun=False):
    if ssbio.utils.force_rerun(flag=force_rerun, outfile=outpath):
        big_strain_percents_df = pd.DataFrame(columns=counts_df.columns)
        for strain in counts_df.columns:
            totals = list(filter(lambda x: x.endswith('total'), counts_df[strain].index))
            for t in totals:
                counts = t.rsplit('_', 1)[0]
                aa_counts = list(filter(lambda x: (x.startswith(counts) and x not in totals), counts_df[strain].index))
                for aa_count in aa_counts:
                    big_strain_percents_df.at[aa_count.replace('count', '%'), strain] = counts_df[strain][aa_count]/counts_df[strain][t]

        big_strain_percents_df.astype(float).reset_index().to_feather(outpath)
    else:
        big_strain_percents_df = pd.read_feather(outpath).set_index('index')

    big_strain_percents_df.index.name = None
    return big_strain_percents_df


def get_proteome_correct_percentages(prots_filtered_feathers, outpath, length_filter_pid=None,
                                     copynum_scale=False, copynum_df=None,
                                     force_rerun=False):
    """Get counts and normalize by number of proteins, providing percentages"""
    if ssbio.utils.force_rerun(flag=force_rerun, outfile=outpath):
        prot_tracker = defaultdict(int)
        big_strain_counts_df = pd.DataFrame()
        first = True
        for feather in prots_filtered_feathers:
            loaded = load_feather(protein_feather=feather, length_filter_pid=length_filter_pid,
                                  copynum_scale=copynum_scale,
                                  copynum_df=copynum_df)

            if first:
                big_strain_counts_df = pd.DataFrame(columns=loaded.columns)
                first = False
            tmp_df = pd.DataFrame(columns=loaded.columns)
            for strain in loaded.columns:
                prot_tracker[strain] += 1
                totals = list(filter(lambda x: x.endswith('total'), loaded[strain].index))
                for t in totals:
                    counts = t.rsplit('_', 1)[0]
                    aa_counts = list(
                        filter(lambda x: (x.startswith(counts) and x not in totals), loaded[strain].index))
                    for aa_count in aa_counts:
                        tmp_df.at[aa_count.replace('count', '%'), strain] = loaded[strain][aa_count] / \
                                                                            loaded[strain][t]
            big_strain_counts_df = big_strain_counts_df.add(tmp_df, fill_value=0)

        for c, total in prot_tracker.items():
            big_strain_counts_df.loc[:, c] /= total

        if len(big_strain_counts_df) > 0:
            big_strain_counts_df.astype(float).reset_index().to_feather(outpath)
        return big_strain_counts_df
    else:
        return pd.read_feather(outpath).set_index('index')


def remove_correlated_feats(df):
    tmp = df.T
    # Remove columns with no variation
    nunique = tmp.apply(pd.Series.nunique)
    cols_to_drop = nunique[nunique == 1].index
    tmp.drop(cols_to_drop, axis=1, inplace=True)

    perc_spearman = scipy.stats.spearmanr(tmp)
    abs_corr = np.subtract(np.ones(shape=perc_spearman.correlation.shape),
                           np.absolute(perc_spearman.correlation))
    np.fill_diagonal(abs_corr, 0)
    abs_corr_clean = np.maximum(abs_corr,
                                abs_corr.transpose())  # some floating point mismatches, just make symmetric
    clustering = linkage(squareform(abs_corr_clean), method='average')
    clusters = fcluster(clustering, .1, criterion='distance')
    names = tmp.columns.tolist()
    names_to_cluster = list(zip(names, clusters))
    indices_to_keep = []
    ### Extract models closest to cluster centroids
    for x in range(1, len(set(clusters)) + 1):
        # Create mask from the list of assignments for extracting submatrix of the cluster
        mask = np.array([1 if i == x else 0 for i in clusters], dtype=bool)

        # Take the index of the column with the smallest sum of distances from the submatrix
        idx = np.argmin(sum(abs_corr_clean[:, mask][mask, :]))

        # Extract names of cluster elements from names_to_cluster
        sublist = [name for (name, cluster) in names_to_cluster if cluster == x]

        # Element closest to centroid
        centroid = sublist[idx]
        indices_to_keep.append(centroid)

    return df.loc[df.index.isin(indices_to_keep)]


def get_simple_sigdict(prots_filtered_feathers, subsequences, observation, observation_dict,
                       remove_corr=True, force_rerun=False):
    sigdict = {'less': defaultdict(list),
               'greater': defaultdict(list)}

    for p in prots_filtered_feathers:

        p_id = op.basename(p).split('_')[0]
        outpath = op.join(op.dirname(p), '{}_protein_strain_properties_percfilt.fthr'.format(p_id))

        if ssbio.utils.force_rerun(flag=force_rerun, outfile=outpath):
            p_df = load_feather(protein_feather=p, length_filter_pid=0.8)
            p_perc_df = get_proteome_percentages(counts_df=p_df,
                                                 outpath=outpath,
                                                 force_rerun=force_rerun)
        else:
            p_perc_df = pd.read_feather(outpath).set_index('index')

        # Clean data first
        keep_subsequences = get_interested_subsequences(subsequences=subsequences)
        p_perc_df2 = p_perc_df.loc[p_perc_df.index.isin(keep_subsequences)]
        p_perc_df2 = p_perc_df2.astype(float).fillna(0)
        p_perc_df2 = p_perc_df2.loc[(p_perc_df2 > 0).any(axis=1)]

        if remove_corr:
            try:
                p_perc_df2 = remove_correlated_feats(p_perc_df2)
            except:
                continue
        all_features = p_perc_df2.index.tolist()

        # Add observations
        keep_strains, observations_df = get_observed_strains_and_df(observation=observation,
                                                                    observation_dict=observation_dict)
        feat_obs_df = p_perc_df2.T.join(observations_df, how='inner')

        # Split into 2 groups
        if observation == 'isolation_source':
            continue
        elif observation == 'ros_simulated':
            wt_df = feat_obs_df[feat_obs_df.phenotype == 'wt']
            mut_df = feat_obs_df[feat_obs_df.phenotype != 'wt']
        elif 'pathotype' in observation:
            wt_df = feat_obs_df[feat_obs_df.phenotype == 'Other']
            mut_df = feat_obs_df[feat_obs_df.phenotype != 'Other']
        else:
            wt_df = feat_obs_df[feat_obs_df.phenotype == 'No growth']
            mut_df = feat_obs_df[feat_obs_df.phenotype == 'Growth']

        if len(wt_df) == 0 or len(mut_df) == 0:
            continue

        # Mann-whitney test of each feature
        for alt in ['less', 'greater']:
            for feat in all_features:
                try:
                    z_stat, p_val = mannwhitneyu(wt_df[feat], mut_df[feat], alternative=alt)
                except ValueError:
                    continue
                if p_val < 0.01:
                    # Store differentiating features for this protein in a dictionary
                    sigdict[alt][p_id].append((feat, p_val))

    return sigdict


from scipy.cluster.hierarchy import linkage, fcluster
import numpy as np
import scipy.stats
from scipy.spatial.distance import squareform
from sklearn import preprocessing
from sklearn.decomposition import PCA
from scipy.stats import ks_2samp
import itertools
import seaborn as sns
from matplotlib import pyplot as plt
import numpy as np
from scipy.spatial.distance import pdist  # https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html
from itertools import combinations
from sklearn.metrics.pairwise import euclidean_distances  # http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.euclidean_distances.html
import scipy.stats.stats as st
from scipy.stats import mannwhitneyu
from numpy.random import choice

plt.ioff()  # Turn interactive plotting off
sns.set(rc={'figure.figsize': (16, 8.5)})
sns.set_context('talk')
sns.set_style('ticks')
sns.set_palette('Set2')


class PCAMultiROS():
    def __init__(self, features_df, observations_df, plot_title, observation_colname='phenotype'):
        self.features_df = features_df
        self.observations_df = observations_df
        self.observation_colname = observation_colname
        self.plot_title = plot_title

        self.num_components = None
        self.pca = None
        self.pc_names_list = None
        self.pc_names_dict = None
        self.principal_df = None
        self.principal_observations_df = None
        self.markers = None

    def clean_data(self, keep_features=None, remove_correlated_feats=True):
        self.features_df = self.features_df.astype(float).fillna(0)
        self.features_df = self.features_df.loc[(self.features_df > 0).any(axis=1)]

        if keep_features:
            self.features_df = self.features_df.loc[self.features_df.index.isin(keep_features)]

        if remove_correlated_feats:
            tmp = self.features_df.T

            # Remove columns with no variation
            nunique = tmp.apply(pd.Series.nunique)
            cols_to_drop = nunique[nunique == 1].index
            tmp.drop(cols_to_drop, axis=1, inplace=True)

            perc_spearman = scipy.stats.spearmanr(tmp)
            abs_corr = np.subtract(np.ones(shape=perc_spearman.correlation.shape),
                                   np.absolute(perc_spearman.correlation))
            np.fill_diagonal(abs_corr, 0)
            abs_corr_clean = np.maximum(abs_corr,
                                        abs_corr.transpose())  # some floating point mismatches, just make symmetric
            clustering = linkage(squareform(abs_corr_clean), method='average')
            clusters = fcluster(clustering, .1, criterion='distance')
            names = tmp.columns.tolist()
            names_to_cluster = list(zip(names, clusters))
            indices_to_keep = []
            ### Extract models closest to cluster centroids
            for x in range(1, len(set(clusters)) + 1):
                # Create mask from the list of assignments for extracting submatrix of the cluster
                mask = np.array([1 if i == x else 0 for i in clusters], dtype=bool)

                # Take the index of the column with the smallest sum of distances from the submatrix
                idx = np.argmin(sum(abs_corr_clean[:, mask][mask, :]))

                # Extract names of cluster elements from names_to_cluster
                sublist = [name for (name, cluster) in names_to_cluster if cluster == x]

                # Element closest to centroid
                centroid = sublist[idx]
                indices_to_keep.append(centroid)

            self.features_df = self.features_df.loc[self.features_df.index.isin(indices_to_keep)]

    def run_pca(self, whiten=True):
        # Normalize
        for_pca_df = self.features_df.T
        for_pca_df_scaled = pd.DataFrame(preprocessing.scale(for_pca_df), columns=for_pca_df.columns)

        # Run PCA
        self.num_components = min(len(for_pca_df.T.columns), len(for_pca_df.T.index))
        pca = PCA(n_components=self.num_components, whiten=whiten)
        pca_fit = pca.fit_transform(for_pca_df_scaled)
        self.pc_names_list = ['PC{} ({:.0%})'.format(x + 1, pca.explained_variance_ratio_[x]) for x in
                                  range(self.num_components)]
        self.pc_names_dict = {k.split(' ')[0]: k for k in self.pc_names_list}
        principal_df = pd.DataFrame(data=pca_fit, columns=self.pc_names_list, index=for_pca_df.index)
        principal_df.index.name = 'strain'

        self.principal_df = principal_df
        self.pca = pca
        # self.principal_observations_df = self.principal_df.join(self.observations_df, how='inner')
        #
        # # Make iterable list of markers
        # mks = itertools.cycle(["<", "+", "o", 'D', 'x', '^', '*', '8', 's', 'p', 'v', 'X', '_', 'h'])
        # self.markers = [next(mks) for i in range(len(self.principal_observations_df[self.observation_colname].unique()))]

    def make_biplot(self, pc_x=1, pc_y=2, outpath=None, dpi=150, custom_markers=None, custom_order=None):
        if not custom_order:
            custom_order = sorted(self.observations_df[self.observation_colname].unique().tolist())
        if not custom_markers:
            custom_markers = self.markers
        plot = sns.lmplot(data=self.principal_observations_df,
                              x=self.principal_observations_df.columns[pc_x - 1],
                              y=self.principal_observations_df.columns[pc_y - 1],
                              hue=self.observation_colname,
                              hue_order=custom_order,
                              fit_reg=False,
                              size=6,
                              markers=custom_markers,
                            scatter_kws={'alpha': 0.5})
        plot = (plot.set(title='PC{} vs. PC{}'.format(pc_x, pc_y)))
        if outpath:
            plot.savefig(outpath, dpi=dpi)
        else:
            plt.show()
        plt.close()

    def make_pairplot(self, num_components_to_plot=4, outpath=None, dpi=150):
        # Get columns
        components_to_plot = [self.principal_observations_df.columns[x] for x in range(num_components_to_plot)]

        # Plot
        plot = sns.pairplot(data=self.principal_observations_df, hue=self.observation_colname,
                                vars=components_to_plot, markers=self.markers, size=4)
        plt.subplots_adjust(top=.95)
        plt.suptitle(self.plot_title)

        if outpath:
            plot.fig.savefig(outpath, dpi=dpi)
        else:
            plt.show()
        plt.close()

    def make_3Dplot(self, outpath=None, dpi=150):
        import mpl_toolkits.mplot3d
        fig = plt.figure(1, figsize=(8, 6))
        ax = mpl_toolkits.mplot3d.Axes3D(fig)
        for key, group in self.principal_observations_df.groupby(self.observation_colname):
            ax.plot(group[self.principal_observations_df.columns[0]],
                    group[self.principal_observations_df.columns[1]],
                    group[self.principal_observations_df.columns[2]],
                    'o', alpha=0.5, label=key)

        # Make simple, bare axis lines through space:
        xAxisLine = ((min(self.principal_observations_df[self.principal_observations_df.columns[0]]),
                      max(self.principal_observations_df[self.principal_observations_df.columns[0]])), (0, 0), (0, 0))
        ax.plot(xAxisLine[0], xAxisLine[1], xAxisLine[2], 'r', alpha=0.4)
        yAxisLine = ((0, 0), (min(self.principal_observations_df[self.principal_observations_df.columns[1]]),
                              max(self.principal_observations_df[self.principal_observations_df.columns[1]])), (0, 0))
        ax.plot(yAxisLine[0], yAxisLine[1], yAxisLine[2], 'r', alpha=0.4)
        zAxisLine = ((0, 0), (0, 0), (min(self.principal_observations_df[self.principal_observations_df.columns[2]]),
                                      max(self.principal_observations_df[self.principal_observations_df.columns[2]])))
        ax.plot(zAxisLine[0], zAxisLine[1], zAxisLine[2], 'r', alpha=0.4)

        ax.set_title("PC1 vs. PC2 vs. PC3")
        ax.set_xlabel("PC1")
        ax.set_ylabel("PC2")
        ax.set_zlabel("PC3")
        ax.legend(loc='upper left', fontsize=15)

        if outpath:
            fig.savefig(outpath, dpi=dpi)
        else:
            plt.show()
        plt.close()

    def make_contribplot(self, pc_to_look_at=1, sigadder=0.01, outpath=None, dpi=150, return_top_contribs=False):
        """Make a plot showing contributions of properties to a PC"""
        cont = pd.DataFrame(self.pca.components_, columns=self.features_df.index, index=self.pc_names_list)
        tmp_df = pd.DataFrame(cont.iloc[pc_to_look_at - 1]).reset_index().rename(columns={'index': 'Property'})
        tmp_df['Contribution'] = tmp_df.iloc[:, 1] ** 2
        tmp_df = tmp_df[tmp_df['Contribution'] > 1 / len(
                cont.iloc[0]) + sigadder]  # Alter sigadder to just plot more/less significant contributors
        tmp_df['Sign'] = np.where(tmp_df.iloc[:, 1] >= 0, 'Positive', 'Negative')
        tmp_df = tmp_df.sort_values(by='Contribution', ascending=False)

        fig, ax = plt.subplots(figsize=(30, 10))
        sns.barplot(data=tmp_df, y='Property', x='Contribution', hue='Sign', dodge=False, ax=ax, hue_order=['Positive', 'Negative'],
                        palette=sns.color_palette("coolwarm", 2))

        # Random formatting crap
        self._change_height(ax, .6)  # Make bars thinner
        ax.set_title('{} contributors'.format(self.pc_names_list[pc_to_look_at - 1]))
        legend = plt.legend(loc=8, bbox_to_anchor=(1.2, .8), ncol=1, title='Sign', fontsize=10)
        plt.setp(legend.get_title(), fontsize=12)
        plt.gcf().subplots_adjust(left=.5, right=.65)
        if outpath:
            fig.savefig(outpath, dpi=dpi)
        else:
            plt.show()
        plt.close()

        if return_top_contribs:
            return tmp_df.Property.values.tolist()

    def _change_height(self, ax, new_value):
        """Make bars in horizontal bar chart thinner"""
        for patch in ax.patches:
            current_height = patch.get_height()
            diff = current_height - new_value

            # we change the bar height
            patch.set_height(new_value)

            # we recenter the bar
            patch.set_y(patch.get_y() + diff * .5)

    def get_pca_ks_stats(self, maxrange=5):
        """Get a dictionary of PC#: K-S test stat for each """
        pc_to_phenotype_pairs = {}
        num_components = self.principal_observations_df.shape[1]
        if num_components < maxrange:
            maxrange = num_components

        phenotypes = self.principal_observations_df.phenotype.unique().tolist()
        for i in range(0, maxrange):
            phenotype_pair_to_ks = {}
            for p1, p2 in combinations(phenotypes, 2):
                p1_pc = self.principal_observations_df[self.principal_observations_df.phenotype == p1].iloc[:,i].as_matrix()
                p2_pc = self.principal_observations_df[self.principal_observations_df.phenotype == p2].iloc[:,i].as_matrix()
                phenotype_pair_to_ks[(p1, p2)] = ks_2samp(p1_pc, p2_pc)
            pc_to_phenotype_pairs[i + 1] = phenotype_pair_to_ks

        return pc_to_phenotype_pairs


def get_intra_inter_distances(feat_df, obs_df, normalize=False, plot=False):
    # Drop zero rows, transpose, set type to float, and fill any missing values
    feat_df = feat_df.loc[(feat_df != 0).any(axis=1)].T.astype(float).fillna(0)
    if normalize:
        feat_df = pd.DataFrame(preprocessing.scale(feat_df), columns=feat_df.columns, index=feat_df.index)
    feat_obs_df = feat_df.join(obs_df, how='inner')

    obs_intra_distances = {}
    obs_inter_distances = {}

    obs_feat_vectors = {}

    for phenotype in feat_obs_df.phenotype.unique():
        feat_vectors = feat_obs_df[feat_obs_df.phenotype == phenotype].drop(columns=feat_obs_df.columns[-1]).as_matrix()
        obs_feat_vectors[phenotype] = feat_vectors

        # Intra-distances
        intra_distances_calc = pdist(feat_vectors)
        obs_intra_distances[phenotype] = intra_distances_calc

    # Inter-distances
    # Randomly sample from 1...?  uncomment next 2 lines and use obs_feat_vector instead of obs_feat_vector_choice
    # if you want to try that
    #     obs_shortest = min([x.shape[0] for x in obs_feat_vectors.values()])
    #     obs_feat_vector_choice = {k:(v[choice(v.shape[0], obs_shortest, replace=False), :] if v.shape[0]!=obs_shortest else v) for k,v in obs_feat_vectors.items()}
    for inter1, inter2 in combinations(obs_feat_vectors, 2):
        obs_inter_distances[(inter1, inter2)] = euclidean_distances(obs_feat_vectors[inter1],
                                                                    obs_feat_vectors[inter2])

    if plot:
        df = pd.DataFrame()
        for k, v in obs_intra_distances.items():
            ser = pd.Series(v)
            df[k] = ser

        for k, v in obs_inter_distances.items():
            ser = pd.Series(v.flatten())
            df[str(k)] = ser

        plotter = pd.melt(df, value_vars=df.columns.tolist())
        plotter = plotter[pd.notnull(plotter.value)]

        sns.violinplot(data=plotter, x='variable', y='value')

    return obs_intra_distances, obs_inter_distances


def compute_skew_stats(intra, inter):
    """Returns two dictionaries reporting (skew, skew_pval) for all groups"""
    # Intra (within a group) stats
    intra_skew = {}
    for k, v in intra.items():
        skew = st.skew(v)
        try:
            skew_zstat, skew_pval = st.skewtest(v)
        except ValueError:  # if sample size too small
            skew_zstat, skew_pval = (0, 1)
        intra_skew[k] = (skew, skew_zstat, skew_pval)

    # Inter (between groups) stats
    inter_skew = {}
    for k, v in inter.items():
        # Inter skew stats
        skew_sep = st.skew(v.flatten())
        try:
            skew_sep_zstat, skew_sep_pval = st.skewtest(v.flatten())
        except ValueError:
            skew_sep_zstat, skew_sep_pval = (0, 1)
        inter_skew['-'.join(k)] = (skew_sep, skew_sep_zstat, skew_sep_pval)

        # Significance of difference between intra and inter distributions
        for intra_key in k:
            try:
                separation_zstat, separation_pval = mannwhitneyu(intra[intra_key],
                                                                 v.flatten(),
                                                                 alternative='less')
            except ValueError:  # All numbers are identical in mannwhitneyu
                separation_zstat, separation_pval = (0, 1)
            inter_skew['{}<{}'.format(intra_key, '-'.join(k))] = (separation_zstat, separation_pval)

    return intra_skew, inter_skew


def run_all(protgroup, memornot, subsequences, observation, proteomescale, base_outdir,
            protgroup_dict, protein_feathers_dir, observation_dict, copynum_df, date,
            errfile, subseqdim, statfile_duo, statfile_trio, cutoff_num_proteins=0, core_only_genes=None,
            impute_counts=True, length_filter_pid=.8,
            force_rerun_counts=False,
            sc=None, run_percentages=True, force_rerun_percentages=False,
            run_pca_and_stats=True, remove_correlated_feats=True, save_plots=False, force_rerun_pca=False):
    import ssbio.utils

    # Need to set multiprocessing limit for scipy/numpy stuff if parallelizing anything
    import os
    os.environ['OMP_NUM_THREADS'] = '1'

    # First, filter down the protein group to the membrane/nonmembrane definition
    prots_filtered_feathers = get_protein_feather_paths(protgroup=protgroup, memornot=memornot,
                                                        protgroup_dict=protgroup_dict,
                                                        protein_feathers_dir=protein_feathers_dir,
                                                        core_only_genes=core_only_genes)
    num_proteins = len(prots_filtered_feathers)
    if num_proteins <= cutoff_num_proteins:
        return

    # Make output directories
    if proteomescale:
        protscale = 'proteome_scaled'
    else:
        protscale = 'proteome_unscaled'
    outdir_d0 = ssbio.utils.make_dir(op.join(base_outdir, protscale))
    outdir_d1 = ssbio.utils.make_dir(op.join(outdir_d0, '-'.join(memornot)))
    outdir_final = ssbio.utils.make_dir(op.join(outdir_d1, '-'.join(protgroup)))
    outdir_observ = ssbio.utils.make_dir(op.join(outdir_final, observation))
    outdir_observ_subseqdim = ssbio.utils.make_dir(op.join(outdir_observ, subseqdim))

    # Then load the protein feathers and add them all together to represent a "proteome"
    # if sc:
    #     big_strain_counts_df = get_proteome_counts_sc(sc=sc, prots_filtered_feathers=prots_filtered_feathers,
    #                                                   outpath=op.join(outdir_final,
    #                                                                   '{}-subsequence_proteome.fthr'.format(date)),
    #                                                   copynum_scale=proteomescale, copynum_df=copynum_df,
    #                                                   force_rerun=force_rerun_counts)
    # else:
    #     big_strain_counts_df = get_proteome_counts(prots_filtered_feathers=prots_filtered_feathers,
    #                                                outpath=op.join(outdir_final,
    #                                                                '{}-subsequence_proteome.fthr'.format(date)),
    #                                                copynum_scale=proteomescale, copynum_df=copynum_df,
    #                                                force_rerun=force_rerun_counts)
    # if len(big_strain_counts_df) == 0:
    #     with open(errfile, "a") as myfile:
    #         myfile.write('COUNT ERR: ' + '-'.join(memornot) + '\t' + '-'.join(protgroup) + '\t' + str(observation) + '\t' + str(proteomescale) + '\t' + subseqdim + "\n")
    #     return

    # Impute averages for missing counts
    if impute_counts:
        big_strain_counts_df = get_proteome_counts_impute_missing(prots_filtered_feathers=prots_filtered_feathers,
                                                                  outpath=op.join(outdir_final,
                                                                                  '{}-subsequence_proteome_IMP.fthr'.format(
                                                                                      date)),
                                                                  length_filter_pid=length_filter_pid,
                                                                  copynum_scale=proteomescale, copynum_df=copynum_df,
                                                                  force_rerun=force_rerun_counts)

        big_strain_percents_df = get_proteome_percentages(counts_df=big_strain_counts_df,
                                                          outpath=op.join(outdir_final,
                                                                          '{}-subsequence_proteome_perc_IMP.fthr'.format(
                                                                                  date)),
                                                          force_rerun=force_rerun_percentages)

    # Divide by totals to get percentages in a new dataframe
    else:
        try:
            big_strain_percents_df = get_proteome_correct_percentages(prots_filtered_feathers=prots_filtered_feathers,
                                                                      outpath=op.join(outdir_final,
                                                                                      '{}-subsequence_proteome_perc.fthr'.format(
                                                                                          date)),
                                                                      length_filter_pid=length_filter_pid,
                                                                      force_rerun=force_rerun_percentages)
        except:
            with open(errfile, "a") as myfile:
                myfile.write('PERCENTAGES ERR: ' + '-'.join(memornot) + '\t' + '-'.join(protgroup) + '\t' + str(
                    observation) + '\t' + str(proteomescale) + '\t' + subseqdim + "\n")
            return

    # Stop here if only percentages desired
    if not run_pca_and_stats:
        return

    pca_pickle = op.join(outdir_observ_subseqdim, '{}-subsequence_pca.pckl'.format(date))
    if ssbio.utils.force_rerun(flag=force_rerun_pca, outfile=pca_pickle):

        # Then, get filters for the columns to available strains in an observation for percentage df
        keep_strains, observations_df = get_observed_strains_and_df(observation=observation,
                                                                    observation_dict=observation_dict)
        if keep_strains:
            big_strain_percents_df = big_strain_percents_df[big_strain_percents_df.columns[big_strain_percents_df.columns.isin(keep_strains)]]

        # Then, get filters for rows of the loaded feathers for interested subsequences
        keep_subsequences = get_interested_subsequences(subsequences=subsequences)

        # Some numbers: number of observations
        num_obs = observations_df.phenotype.value_counts().to_dict()
        if len(num_obs) < 2:  # If only one observation, what are we trying to compare?? nothing really
            return
        observations_string = ';'.join('{}:{}'.format(key, val) for key, val in num_obs.items())

        # Some numbers: number of features
        num_feats = len(big_strain_percents_df)

        # Make an unwieldy title
        big_title = 'LOC={0}; PROTGROUP={1}; PHENOTYPE={2}; PROTSCALE={3}; SUBSEQDIM={4};\n' \
                    'NUMPROTS={5}; NUMFEATS={6}; NUMSTRAINS={7}'.format('-'.join(memornot),
                                                                        '-'.join(protgroup),
                                                                        str(observation),
                                                                        str(proteomescale),
                                                                        subseqdim,
                                                                        num_proteins,
                                                                        num_feats,
                                                                        observations_string)

        # Run PCA and make plots
        runner = PCAMultiROS(features_df=big_strain_percents_df, observations_df=observations_df, plot_title=big_title)
        try:
            runner.clean_data(keep_features=keep_subsequences, remove_correlated_feats=remove_correlated_feats)
        except:
            with open(errfile, "a") as myfile:
                myfile.write(
                    'CLEAN ERR: ' + '-'.join(memornot) + '\t' + '-'.join(protgroup) + '\t' + str(observation) + '\t' + str(proteomescale) + '\t' + subseqdim + "\n")
            return
        try:
            runner.run_pca()
        except:
            with open(errfile, "a") as myfile:
                myfile.write(
                    'PCA ERR: ' + '-'.join(memornot) + '\t' + '-'.join(protgroup) + '\t' + str(observation) + '\t' + str(proteomescale) + '\t' + subseqdim + "\n")
            return

        if save_plots:
            try:
                runner.make_biplot(pc_x=1, pc_y=2,
                                   outpath=op.join(outdir_observ_subseqdim, '{}-subsequence_biplot_1_2.png'.format(date)))
            except:
                with open(errfile, "a") as myfile:
                    myfile.write(
                        'PCA BIPLOT ERR: ' + '-'.join(memornot) + '\t' + '-'.join(protgroup) + '\t' + str(observation) + '\t' + str(proteomescale) + '\t' + subseqdim + "\n")
            try:
                runner.make_pairplot(num_components_to_plot=4,
                                     outpath=op.join(outdir_observ_subseqdim, '{}-subsequence_pairplot.png'.format(date)))
            except:
                with open(errfile, "a") as myfile:
                    myfile.write(
                        'PCA PAIRPLOT ERR: ' + '-'.join(memornot) + '\t' + '-'.join(protgroup) + '\t' + str(observation) + '\t' + str(proteomescale) + '\t' + subseqdim + "\n")
            try:
                runner.make_3Dplot(outpath=op.join(outdir_observ_subseqdim, '{}-subsequence_3Dplot.png'.format(date)))
            except:
                with open(errfile, "a") as myfile:
                    myfile.write(
                        'PCA 3DPLOT ERR: ' + '-'.join(memornot) + '\t' + '-'.join(protgroup) + '\t' + str(observation) + '\t' + str(proteomescale) + '\t' + subseqdim + "\n")
            try:
                runner.make_contribplot(pc_to_look_at=1, sigadder=0.01,
                                        outpath=op.join(outdir_observ_subseqdim, '{}-subsequence_contribplot.png'.format(date)))
            except:
                with open(errfile, "a") as myfile:
                    myfile.write(
                            'PCA CONTRIBPLOT ERR: ' + '-'.join(memornot) + '\t' + '-'.join(protgroup) + '\t' + str(
                                observation) + '\t' + str(proteomescale) + '\t' + subseqdim + "\n")
        with open(pca_pickle, 'wb') as f:
            pickle.dump(runner, f)
    else:
        with open(pca_pickle, 'rb') as f:
            runner = pickle.load(f)

    # Get stats
    try:
        ks_stats = runner.get_pca_ks_stats()
    except:
        with open(errfile, "a") as myfile:
            myfile.write(
                    'STAT K-S ERR: ' + '-'.join(memornot) + '\t' + '-'.join(protgroup) + '\t' + str(
                            observation) + '\t' + str(proteomescale) + '\t' + subseqdim + "\n")
        return

    try:
        # perc_intra, perc_inter = get_intra_inter_distances(feat_df=runner.features_df,
        #                                                    obs_df=runner.observations_df,
        #                                                    normalize=True,
        #                                                    plot=False)
        # perc_stats = compute_skew_stats(intra=perc_intra, inter=perc_inter)

        pca_intra, pca_inter = get_intra_inter_distances(feat_df=runner.principal_df.T,
                                                         obs_df=runner.observations_df,
                                                         normalize=False,
                                                         plot=False)
        skew_stats = compute_skew_stats(intra=pca_intra, inter=pca_inter)

    except:
        with open(errfile, "a") as myfile:
            myfile.write('STAT DIST ERR: ' + '-'.join(memornot) + '\t' + '-'.join(protgroup) + '\t' + str(
                observation) + '\t' + str(proteomescale) + '\t' + subseqdim + "\n")
        return

    # with open(statfile, 'a') as statf:
    #     statf.write('-'.join(memornot) + '\t' + '-'.join(protgroup) + '\t' + str(observation) + '\t' + str(
    #         proteomescale) + '\t' + subseqdim + '\t' + str(num_proteins) + '\t' + str(num_feats) + '\t' + str(
    #         num_obs) + '\t' + str(perc_stats) + '\t' + str(skew_stats) + '\t' + str(ks_stats) + '\n')

    skew_pval_significant = True
    skew_worst_pval = -1
    ks_pval_significant = True
    ks_worst_pval = -1

    # Search for clusters not close to WT
    if observation == 'ros_simulated':
        for k, v in skew_stats[1].items():
            if 'wt' in k and '<' in k:
                pval = v[1]
                if pval > skew_worst_pval:
                    skew_worst_pval = pval
                if pval > 0.05:
                    skew_pval_significant = False
        for k, v in ks_stats[1].items():
            if 'wt' in k:
                if v.pvalue > ks_worst_pval:
                    ks_worst_pval = v.pvalue
                if v.pvalue > 0.05:
                    ks_pval_significant = False
        # Combination stats
        if ks_pval_significant and skew_pval_significant:
            if ks_worst_pval != -1 and skew_worst_pval != -1:
                with open(statfile_trio, 'a') as f:
                    f.write(str((protgroup, memornot, subsequences, observation, proteomescale)) + '\t' + str(ks_worst_pval) + '\t' + str(skew_worst_pval) + '\n')

    # Search for G/NG clusters and pathotype_simple ones
    else:
        for k, v in skew_stats[1].items():
            if '<' in k:
                pval = v[1]
                if pval > skew_worst_pval:
                    skew_worst_pval = pval
                if pval > 0.05:
                    skew_pval_significant = False
        for k, v in ks_stats[1].items():
            if v.pvalue > ks_worst_pval:
                ks_worst_pval = v.pvalue
            if v.pvalue > 0.05:
                ks_pval_significant = False

        # Combination stats
        if ks_pval_significant and skew_pval_significant:
            if ks_worst_pval != -1 and skew_worst_pval != -1:
                with open(statfile_duo, 'a') as f:
                    f.write(str((protgroup, memornot, subsequences, observation, proteomescale)) + '\t' + str(ks_worst_pval) + '\t' + str(skew_worst_pval) + '\n')


def run_all2(protgroup, memornot, subsequences, base_outdir,
            protgroup_dict, protein_feathers_dir, date, errfile, impute_counts=True,
            cutoff_num_proteins=0, core_only_genes=None,
            length_filter_pid=.8, remove_correlated_feats=True,
            force_rerun_counts=False, force_rerun_percentages=False, force_rerun_pca=False):
    """run_all but ignoring observations before pca"""
    import ssbio.utils

    # Need to set multiprocessing limit for scipy/numpy stuff if parallelizing anything
    import os
    os.environ['OMP_NUM_THREADS'] = '1'

    # First, filter down the protein group to the membrane/nonmembrane definition
    prots_filtered_feathers = get_protein_feather_paths(protgroup=protgroup, memornot=memornot,
                                                        protgroup_dict=protgroup_dict,
                                                        protein_feathers_dir=protein_feathers_dir,
                                                        core_only_genes=core_only_genes)
    num_proteins = len(prots_filtered_feathers)
    if num_proteins <= cutoff_num_proteins:
        return

    # Make output directories
    protscale = 'proteome_unscaled'
    outdir_d0 = ssbio.utils.make_dir(op.join(base_outdir, protscale))
    outdir_d1 = ssbio.utils.make_dir(op.join(outdir_d0, '-'.join(memornot)))
    outdir_final = ssbio.utils.make_dir(op.join(outdir_d1, '-'.join(protgroup)))

    if impute_counts:
        big_strain_counts_df = get_proteome_counts_impute_missing(prots_filtered_feathers=prots_filtered_feathers,
                                                                  outpath=op.join(outdir_final,
                                                                                  '{}-subsequence_proteome_IMP.fthr'.format(
                                                                                      date)),
                                                                  length_filter_pid=length_filter_pid,
                                                                  force_rerun=force_rerun_counts)

        big_strain_percents_df = get_proteome_percentages(counts_df=big_strain_counts_df,
                                                          outpath=op.join(outdir_final,
                                                                          '{}-subsequence_proteome_perc_IMP.fthr'.format(
                                                                                  date)),
                                                          force_rerun=force_rerun_percentages)
        pca_pickle = op.join(outdir_final, '{}-subsequence_pca.pckl'.format(date))
    # Divide by totals to get percentages in a new dataframe
    else:
        try:
            big_strain_percents_df = get_proteome_correct_percentages(prots_filtered_feathers=prots_filtered_feathers,
                                                                      outpath=op.join(outdir_final,
                                                                                      '{}-subsequence_proteome_perc_AVG.fthr'.format(
                                                                                              date)),
                                                                      length_filter_pid=length_filter_pid,
                                                                      force_rerun=force_rerun_percentages)
            pca_pickle = op.join(outdir_final, '{}-subsequence_pca_AVG.pckl'.format(date))
        except:
            with open(errfile, "a") as myfile:
                myfile.write('PERCENTAGES ERR: ' + '-'.join(memornot) + '\t' + '-'.join(protgroup) + "\n")
            return



    if ssbio.utils.force_rerun(flag=force_rerun_pca, outfile=pca_pickle):

        # Then, get filters for rows of the loaded feathers for interested subsequences
        keep_subsequences = get_interested_subsequences(subsequences=subsequences)

        # Some numbers: number of features
        num_feats = len(big_strain_percents_df)

        # Make an unwieldy title
        big_title = 'LOC={0}; PROTGROUP={1};\n' \
                    'NUMPROTS={2}; NUMFEATS={3}'.format('-'.join(memornot),
                                                                        '-'.join(protgroup),
                                                                        num_proteins,
                                                                        num_feats)

        # Run PCA and make plots
        runner = PCAMultiROS(features_df=big_strain_percents_df, observations_df=pd.DataFrame(), plot_title=big_title)
        try:
            runner.clean_data(keep_features=keep_subsequences, remove_correlated_feats=remove_correlated_feats)
        except:
            with open(errfile, "a") as myfile:
                myfile.write(
                    'CLEAN ERR: ' + '-'.join(memornot) + '\t' + '-'.join(protgroup) + "\n")
            return
        # try:
        runner.run_pca()
        # except:
        #     with open(errfile, "a") as myfile:
        #         myfile.write(
        #             'PCA ERR: ' + '-'.join(memornot) + '\t' + '-'.join(protgroup) + "\n")
        #     return
        with open(pca_pickle, 'wb') as f:
            pickle.dump(runner, f)
    else:
        with open(pca_pickle, 'rb') as f:
            runner = pickle.load(f)


def run_all_simple(protgroup, memornot, subsequences, observation, protgroup_dict, protein_feathers_dir, observation_dict, statfile, cutoff_num_proteins=0):

    # Need to set multiprocessing limit for scipy/numpy stuff if parallelizing anything
    import os
    os.environ['OMP_NUM_THREADS'] = '1'

    # First, filter down the protein group to the membrane/nonmembrane definition
    prots_filtered_feathers = get_protein_feather_paths(protgroup=protgroup, memornot=memornot,
                                                        protgroup_dict=protgroup_dict,
                                                        protein_feathers_dir=protein_feathers_dir)
    num_proteins = len(prots_filtered_feathers)
    if num_proteins <= cutoff_num_proteins:
        return

    # WT HAVE MORE OF THESE...
    sigdict = get_simple_sigdict(prots_filtered_feathers=prots_filtered_feathers,
                                        subsequences=subsequences,
                                        observation=observation,
                                        observation_dict=observation_dict)

    sig_predf = []
    for protein, feat_and_pvals in sigdict['greater'].items():
        for feat, pval in feat_and_pvals:
            sig_predf.append((protein, feat, pval))

    sig_df1 = pd.DataFrame(sig_predf, columns=['protein', 'feature', 'pval']).sort_values(by='pval', ascending=True)

    if len(sig_df1) > 0:
        num_prots1 = len(sig_df1.protein.unique())
        num_feats1 = len(sig_df1.feature.unique())

        # Very signficant features
        sig_df2 = sig_df1[sig_df1.pval <= sig_df1.pval.quantile(.25)]
        num_prots2 = len(sig_df2.protein.unique())
        num_feats2 = len(sig_df2.feature.unique())

        # Very significant features that are explainable
        sig_df3 = sig_df2[sig_df2.feature.isin(expected_in_wt)]
        num_prots3 = len(sig_df3.protein.unique())
        num_feats3 = len(sig_df3.feature.unique())

        wt_perc_sig_feats = num_feats3 / num_feats2
        wt_perc_sig_prots = num_prots3 / num_proteins
    else:
        wt_perc_sig_feats = 0
        wt_perc_sig_prots = 0


    # MUT HAVE MORE OF THESE...
    sig_predf = []
    for protein, feat_and_pvals in sigdict['less'].items():
        for feat, pval in feat_and_pvals:
            sig_predf.append((protein, feat, pval))

    sig_df1 = pd.DataFrame(sig_predf, columns=['protein', 'feature', 'pval']).sort_values(by='pval', ascending=True)

    if len(sig_df1) > 0:
        num_prots1 = len(sig_df1.protein.unique())
        num_feats1 = len(sig_df1.feature.unique())

        # Very signficant features
        sig_df2 = sig_df1[sig_df1.pval <= sig_df1.pval.quantile(.25)]
        num_prots2 = len(sig_df2.protein.unique())
        num_feats2 = len(sig_df2.feature.unique())

        # Very significant features that are explainable
        sig_df3 = sig_df2[sig_df2.feature.isin(expected_in_mut)]
        num_prots3 = len(sig_df3.protein.unique())
        num_feats3 = len(sig_df3.feature.unique())

        mut_perc_sig_feats = num_feats3 / num_feats2
        mut_perc_sig_prots = num_prots3 / num_proteins
    else:
        mut_perc_sig_feats = 0
        mut_perc_sig_prots = 0

    with open(statfile, 'a') as f:
        f.write(str((protgroup, memornot, observation)) + '\t' + str(wt_perc_sig_feats) + '\t' + str(wt_perc_sig_prots) + '\t' + str(mut_perc_sig_feats) + '\t' + str(mut_perc_sig_prots) + '\t' + str(num_proteins) + '\n')


expected_in_wt = ['aa_%_dis', 'disorder_2D_aa_%_dis', 'ss_disorder_2D_aa_%_dis', 'disorder_3D_aa_%_dis', 'ss_disorder_3D_aa_%_dis', 'dna_2_5D_aa_%_ord', 'aa_%_C', 'acc_2D_aa_%_C', 'acc_3D_aa_%_C', 'surface_3D_aa_%_C', 'metal_2_5D_aa_%_C', 'metal_3D_aa_%_C', 'csa_2_5D_aa_%_C', 'sites_2_5D_aa_%_C', 'aa_%_carb', 'acc_2D_aa_%_carb', 'acc_3D_aa_%_carb', 'surface_3D_aa_%_carb', 'metal_2_5D_aa_%_carb', 'metal_3D_aa_%_carb', 'aa_%_chrg', 'acc_2D_aa_%_chrg', 'acc_3D_aa_%_chrg', 'surface_3D_aa_%_chrg', 'metal_2_5D_aa_%_chrg', 'metal_3D_aa_%_chrg', 'csa_2_5D_aa_%_chrg', 'sites_2_5D_aa_%_chrg', 'acc_2D_aa_%_poschrg', 'acc_3D_aa_%_poschrg', 'surface_3D_aa_%_poschrg', 'metal_2_5D_aa_%_poschrg', 'metal_3D_aa_%_poschrg', 'acc_2D_aa_%_negchrg', 'acc_3D_aa_%_negchrg', 'surface_3D_aa_%_negchrg', 'tm_2D_aa_%_tmunstab', 'tm_3D_aa_%_tmunstab', 'surface_3D_aa_%_Y','aa_%_Y','acc_3D_aa_%_Y','acc_2D_aa_%_Y']
expected_in_mut = ['aa_%_ord', 'disorder_2D_aa_%_ord', 'ss_disorder_2D_aa_%_ord', 'disorder_3D_aa_%_ord', 'ss_disorder_3D_aa_%_ord', 'dna_2_5D_aa_%_dis', 'aa_%_M', 'acc_2D_aa_%_M', 'acc_3D_aa_%_M', 'surface_3D_aa_%_M', 'metal_2_5D_aa_%_M', 'metal_3D_aa_%_M', 'csa_2_5D_aa_%_M', 'sites_2_5D_aa_%_M', 'metal_2_5D_aa_%_negchrg', 'metal_3D_aa_%_negchrg', 'metal_2_5D_aa_%_bulk', 'metal_3D_aa_%_bulk', 'tm_2D_aa_%_M', 'tm_3D_aa_%_M', 'tm_2D_aa_%_tmstab', 'tm_3D_aa_%_tmstab']


_all_counts = ['aa_count_A', 'aa_count_C', 'aa_count_D', 'aa_count_E', 'aa_count_F', 'aa_count_G', 'aa_count_H',
               'aa_count_I', 'aa_count_K', 'aa_count_L', 'aa_count_M', 'aa_count_N', 'aa_count_P', 'aa_count_Q',
               'aa_count_R', 'aa_count_S', 'aa_count_T', 'aa_count_V', 'aa_count_W', 'aa_count_Y', 'aa_count_total',
               'acc_2D_aa_count_A', 'acc_2D_aa_count_C', 'acc_2D_aa_count_D', 'acc_2D_aa_count_E', 'acc_2D_aa_count_F',
               'acc_2D_aa_count_G', 'acc_2D_aa_count_H', 'acc_2D_aa_count_I', 'acc_2D_aa_count_K', 'acc_2D_aa_count_L',
               'acc_2D_aa_count_M', 'acc_2D_aa_count_N', 'acc_2D_aa_count_P', 'acc_2D_aa_count_Q', 'acc_2D_aa_count_R',
               'acc_2D_aa_count_S', 'acc_2D_aa_count_T', 'acc_2D_aa_count_V', 'acc_2D_aa_count_W', 'acc_2D_aa_count_Y',
               'acc_2D_aa_count_carb', 'acc_2D_aa_count_chrg', 'acc_2D_aa_count_negchrg', 'acc_2D_aa_count_poschrg',
               'acc_2D_aa_count_total', 'acc_3D_aa_count_A', 'acc_3D_aa_count_C', 'acc_3D_aa_count_D',
               'acc_3D_aa_count_E', 'acc_3D_aa_count_F', 'acc_3D_aa_count_G', 'acc_3D_aa_count_H', 'acc_3D_aa_count_I',
               'acc_3D_aa_count_K', 'acc_3D_aa_count_L', 'acc_3D_aa_count_M', 'acc_3D_aa_count_N', 'acc_3D_aa_count_P',
               'acc_3D_aa_count_Q', 'acc_3D_aa_count_R', 'acc_3D_aa_count_S', 'acc_3D_aa_count_T', 'acc_3D_aa_count_V',
               'acc_3D_aa_count_W', 'acc_3D_aa_count_Y', 'acc_3D_aa_count_carb', 'acc_3D_aa_count_chrg',
               'acc_3D_aa_count_negchrg', 'acc_3D_aa_count_poschrg', 'acc_3D_aa_count_total', 'csa_2_5D_aa_count_A',
               'csa_2_5D_aa_count_C', 'csa_2_5D_aa_count_D', 'csa_2_5D_aa_count_E', 'csa_2_5D_aa_count_F',
               'csa_2_5D_aa_count_G', 'csa_2_5D_aa_count_H', 'csa_2_5D_aa_count_I', 'csa_2_5D_aa_count_K',
               'csa_2_5D_aa_count_L', 'csa_2_5D_aa_count_M', 'csa_2_5D_aa_count_N', 'csa_2_5D_aa_count_P',
               'csa_2_5D_aa_count_Q', 'csa_2_5D_aa_count_R', 'csa_2_5D_aa_count_S', 'csa_2_5D_aa_count_T',
               'csa_2_5D_aa_count_V', 'csa_2_5D_aa_count_W', 'csa_2_5D_aa_count_Y', 'csa_2_5D_aa_count_chrg',
               'csa_2_5D_aa_count_total', 'disorder_2D_aa_count_A', 'disorder_2D_aa_count_C', 'disorder_2D_aa_count_D',
               'disorder_2D_aa_count_E', 'disorder_2D_aa_count_F', 'disorder_2D_aa_count_G', 'disorder_2D_aa_count_H',
               'disorder_2D_aa_count_I', 'disorder_2D_aa_count_K', 'disorder_2D_aa_count_L', 'disorder_2D_aa_count_M',
               'disorder_2D_aa_count_N', 'disorder_2D_aa_count_P', 'disorder_2D_aa_count_Q', 'disorder_2D_aa_count_R',
               'disorder_2D_aa_count_S', 'disorder_2D_aa_count_T', 'disorder_2D_aa_count_V', 'disorder_2D_aa_count_W',
               'disorder_2D_aa_count_Y', 'disorder_2D_aa_count_dis', 'disorder_2D_aa_count_ord',
               'disorder_2D_aa_count_total', 'disorder_3D_aa_count_A', 'disorder_3D_aa_count_C',
               'disorder_3D_aa_count_D', 'disorder_3D_aa_count_E', 'disorder_3D_aa_count_F', 'disorder_3D_aa_count_G',
               'disorder_3D_aa_count_H', 'disorder_3D_aa_count_I', 'disorder_3D_aa_count_K', 'disorder_3D_aa_count_L',
               'disorder_3D_aa_count_M', 'disorder_3D_aa_count_N', 'disorder_3D_aa_count_P', 'disorder_3D_aa_count_Q',
               'disorder_3D_aa_count_R', 'disorder_3D_aa_count_S', 'disorder_3D_aa_count_T', 'disorder_3D_aa_count_V',
               'disorder_3D_aa_count_W', 'disorder_3D_aa_count_Y', 'disorder_3D_aa_count_dis',
               'disorder_3D_aa_count_ord', 'disorder_3D_aa_count_total', 'dna_2_5D_aa_count_A', 'dna_2_5D_aa_count_C',
               'dna_2_5D_aa_count_D', 'dna_2_5D_aa_count_E', 'dna_2_5D_aa_count_F', 'dna_2_5D_aa_count_G',
               'dna_2_5D_aa_count_H', 'dna_2_5D_aa_count_I', 'dna_2_5D_aa_count_K', 'dna_2_5D_aa_count_L',
               'dna_2_5D_aa_count_M', 'dna_2_5D_aa_count_N', 'dna_2_5D_aa_count_P', 'dna_2_5D_aa_count_Q',
               'dna_2_5D_aa_count_R', 'dna_2_5D_aa_count_S', 'dna_2_5D_aa_count_T', 'dna_2_5D_aa_count_V',
               'dna_2_5D_aa_count_W', 'dna_2_5D_aa_count_Y', 'dna_2_5D_aa_count_dis', 'dna_2_5D_aa_count_ord',
               'dna_2_5D_aa_count_total', 'metal_2_5D_aa_count_A', 'metal_2_5D_aa_count_C', 'metal_2_5D_aa_count_D',
               'metal_2_5D_aa_count_E', 'metal_2_5D_aa_count_F', 'metal_2_5D_aa_count_G', 'metal_2_5D_aa_count_H',
               'metal_2_5D_aa_count_I', 'metal_2_5D_aa_count_K', 'metal_2_5D_aa_count_L', 'metal_2_5D_aa_count_M',
               'metal_2_5D_aa_count_N', 'metal_2_5D_aa_count_P', 'metal_2_5D_aa_count_Q', 'metal_2_5D_aa_count_R',
               'metal_2_5D_aa_count_S', 'metal_2_5D_aa_count_T', 'metal_2_5D_aa_count_V', 'metal_2_5D_aa_count_W',
               'metal_2_5D_aa_count_Y', 'metal_2_5D_aa_count_bulk', 'metal_2_5D_aa_count_carb',
               'metal_2_5D_aa_count_chrg', 'metal_2_5D_aa_count_negchrg', 'metal_2_5D_aa_count_poschrg',
               'metal_2_5D_aa_count_total', 'metal_3D_aa_count_bulk', 'metal_3D_aa_count_carb',
               'metal_3D_aa_count_chrg', 'metal_3D_aa_count_negchrg', 'metal_3D_aa_count_poschrg',
               'notdeep_3D_aa_count_A', 'notdeep_3D_aa_count_C', 'notdeep_3D_aa_count_D', 'notdeep_3D_aa_count_E',
               'notdeep_3D_aa_count_F', 'notdeep_3D_aa_count_G', 'notdeep_3D_aa_count_H', 'notdeep_3D_aa_count_I',
               'notdeep_3D_aa_count_K', 'notdeep_3D_aa_count_L', 'notdeep_3D_aa_count_M', 'notdeep_3D_aa_count_N',
               'notdeep_3D_aa_count_P', 'notdeep_3D_aa_count_Q', 'notdeep_3D_aa_count_R', 'notdeep_3D_aa_count_S',
               'notdeep_3D_aa_count_T', 'notdeep_3D_aa_count_V', 'notdeep_3D_aa_count_W', 'notdeep_3D_aa_count_Y',
               'notdeep_3D_aa_count_total', 'sites_2_5D_aa_count_A', 'sites_2_5D_aa_count_C', 'sites_2_5D_aa_count_D',
               'sites_2_5D_aa_count_E', 'sites_2_5D_aa_count_F', 'sites_2_5D_aa_count_G', 'sites_2_5D_aa_count_H',
               'sites_2_5D_aa_count_I', 'sites_2_5D_aa_count_K', 'sites_2_5D_aa_count_L', 'sites_2_5D_aa_count_M',
               'sites_2_5D_aa_count_N', 'sites_2_5D_aa_count_P', 'sites_2_5D_aa_count_Q', 'sites_2_5D_aa_count_R',
               'sites_2_5D_aa_count_S', 'sites_2_5D_aa_count_T', 'sites_2_5D_aa_count_V', 'sites_2_5D_aa_count_W',
               'sites_2_5D_aa_count_Y', 'sites_2_5D_aa_count_chrg', 'sites_2_5D_aa_count_total',
               'ss_disorder_2D_aa_count_A', 'ss_disorder_2D_aa_count_C', 'ss_disorder_2D_aa_count_D',
               'ss_disorder_2D_aa_count_E', 'ss_disorder_2D_aa_count_F', 'ss_disorder_2D_aa_count_G',
               'ss_disorder_2D_aa_count_H', 'ss_disorder_2D_aa_count_I', 'ss_disorder_2D_aa_count_K',
               'ss_disorder_2D_aa_count_L', 'ss_disorder_2D_aa_count_M', 'ss_disorder_2D_aa_count_N',
               'ss_disorder_2D_aa_count_P', 'ss_disorder_2D_aa_count_Q', 'ss_disorder_2D_aa_count_R',
               'ss_disorder_2D_aa_count_S', 'ss_disorder_2D_aa_count_T', 'ss_disorder_2D_aa_count_V',
               'ss_disorder_2D_aa_count_W', 'ss_disorder_2D_aa_count_Y', 'ss_disorder_2D_aa_count_dis',
               'ss_disorder_2D_aa_count_ord', 'ss_disorder_2D_aa_count_total', 'ss_disorder_3D_aa_count_A',
               'ss_disorder_3D_aa_count_C', 'ss_disorder_3D_aa_count_D', 'ss_disorder_3D_aa_count_E',
               'ss_disorder_3D_aa_count_F', 'ss_disorder_3D_aa_count_G', 'ss_disorder_3D_aa_count_H',
               'ss_disorder_3D_aa_count_I', 'ss_disorder_3D_aa_count_K', 'ss_disorder_3D_aa_count_L',
               'ss_disorder_3D_aa_count_M', 'ss_disorder_3D_aa_count_N', 'ss_disorder_3D_aa_count_P',
               'ss_disorder_3D_aa_count_Q', 'ss_disorder_3D_aa_count_R', 'ss_disorder_3D_aa_count_S',
               'ss_disorder_3D_aa_count_T', 'ss_disorder_3D_aa_count_V', 'ss_disorder_3D_aa_count_W',
               'ss_disorder_3D_aa_count_Y', 'ss_disorder_3D_aa_count_dis', 'ss_disorder_3D_aa_count_ord',
               'ss_disorder_3D_aa_count_total', 'surface_3D_aa_count_A', 'surface_3D_aa_count_C',
               'surface_3D_aa_count_D', 'surface_3D_aa_count_E', 'surface_3D_aa_count_F', 'surface_3D_aa_count_G',
               'surface_3D_aa_count_H', 'surface_3D_aa_count_I', 'surface_3D_aa_count_K', 'surface_3D_aa_count_L',
               'surface_3D_aa_count_M', 'surface_3D_aa_count_N', 'surface_3D_aa_count_P', 'surface_3D_aa_count_Q',
               'surface_3D_aa_count_R', 'surface_3D_aa_count_S', 'surface_3D_aa_count_T', 'surface_3D_aa_count_V',
               'surface_3D_aa_count_W', 'surface_3D_aa_count_Y', 'surface_3D_aa_count_carb', 'surface_3D_aa_count_chrg',
               'surface_3D_aa_count_negchrg', 'surface_3D_aa_count_poschrg', 'surface_3D_aa_count_total',
               'tm_2D_aa_count_tmstab', 'tm_2D_aa_count_tmunstab', 'tm_3D_aa_count_tmstab', 'tm_3D_aa_count_tmunstab']