python source code of mouse_brain

import numpy as np
import os
from scanorama import *
from scipy.sparse import vstack
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import normalize, LabelEncoder

from experiments import *
from mouse_brain import keep_valid, data_names
from process import load_names
from utils import *

np.random.seed(0)

NAMESPACE = 'mouse_brain'
METHOD = 'svd'
DIMRED = 100

def auroc(X, genes, labels, focus, background=None):
    assert(len(genes) == X.shape[1])

    focus_idx = focus == labels
    if background is None:
        background_idx = range(X.shape[0])
    else:
        background_idx = background == labels
        
    for g, gene in enumerate(genes):
        x_gene = X[:, g]
        x_focus = x_gene[focus_idx]
        x_background = x_gene[background_idx]
        y_score = np.concatenate([ x_focus, x_background ])
        
        y_true = np.zeros(len(x_focus) + len(x_background))
        y_true[:len(x_focus)] = 1

        auroc = roc_auc_score(y_true, y_score)

        print('{}\t{}'.format(gene, auroc))

def astro_oligo_violin(X, genes, gene, labels, name):
    X = X.toarray()

    gidx = list(genes).index(gene)

    astro = X[labels == 'astro', gidx]
    oligo = X[labels == 'oligo', gidx]
    both = X[labels == 'both', gidx]

    plt.figure()
    sns.violinplot(data=[ astro, oligo, both ], scale='width', cut=0)
    sns.stripplot(data=[ astro, oligo, both ], jitter=True, color='black', size=1)
    plt.xticks([0, 1, 2], ['Astrocytes', 'Oligodendrocytes', 'Both'])
    plt.savefig('{}_violin_{}.svg'.format(name, gene))
        
def astro_oligo_joint(X, genes, gene1, gene2, labels, focus, name):
    X = X.toarray()

    gidx1 = list(genes).index(gene1)
    gidx2 = list(genes).index(gene2)

    idx = labels == focus

    x1 = X[(idx, gidx1)]
    x2 = X[(idx, gidx2)]

    plt.figure()
    sns.jointplot(
        x1, x2, kind='scatter', space=0, alpha=0.3
    ).plot_joint(sns.kdeplot, zorder=0, n_levels=10)
    plt.savefig('{}_joint_{}_{}_{}.png'.format(name, focus, gene1, gene2))
    
if __name__ == '__main__':
    datasets, genes_list, n_cells = load_names(data_names, norm=False)
    keep_valid(datasets)
    datasets, genes = merge_datasets(datasets, genes_list)
    X = vstack(datasets)

    if not os.path.isfile('data/dimred/{}_{}.txt'.format(METHOD, NAMESPACE)):
        log('Dimension reduction with {}...'.format(METHOD))
        X_dimred = reduce_dimensionality(
            normalize(X), method=METHOD, dimred=DIMRED
        )
        log('Dimensionality = {}'.format(X_dimred.shape[1]))
        np.savetxt('data/dimred/{}_{}.txt'.format(METHOD, NAMESPACE), X_dimred)
    else:
        X_dimred = np.loadtxt('data/dimred/{}_{}.txt'.format(METHOD, NAMESPACE))
        
    from geosketch import gs, uniform
    samp_idx = gs(X_dimred, 20000, replace=False)
    #samp_idx = uniform(X_dimred, 20000, replace=False)
    
    #from anndata import AnnData
    #import scanpy.api as sc
    #adata = AnnData(X=X_dimred[samp_idx, :])
    #sc.pp.neighbors(adata, use_rep='X')
    #sc.tl.louvain(adata, resolution=1.5, key_added='louvain')
    #
    #louv_labels = np.array(adata.obs['louvain'].tolist())
    #le = LabelEncoder().fit(louv_labels)
    #cell_labels = le.transform(louv_labels)
    #
    #np.savetxt('data/cell_labels/mouse_brain_louvain.txt', cell_labels)
    
    cell_labels = (
        open('data/cell_labels/mouse_brain_louvain.txt')
        .read().rstrip().split('\n')
    )
    le = LabelEncoder().fit(cell_labels)
    cell_labels = le.transform(cell_labels)

    astro = set([ 1, 4, 5, 7 ])
    oligo = set([ 8, 21, 23, 26, 27 ])
    focus = set([ 2 ])

    labels = []
    aob_labels = []
    for cl in cell_labels:
        if cl in focus:
            labels.append(2)
            aob_labels.append('both')
        elif cl in astro or cl in oligo:
            if cl in astro:
                if cl == 5:
                    aob_labels.append('bergman')
                    labels.append(3)
                else:
                    aob_labels.append('astro')
                    labels.append(0)
            else:
                aob_labels.append('oligo')
                labels.append(1)
        else:
            labels.append(4)
            aob_labels.append('none')
    labels = np.array(labels)
    aob_labels = np.array(aob_labels)

    X = np.log1p(normalize(X[samp_idx, :]))
    
    #auroc(X.toarray(), genes, labels, 0, background=1)
    #astro_oligo_joint(X, genes, 'GJA1', 'MBP', aob_labels, 'astro', NAMESPACE)
    #astro_oligo_joint(X, genes, 'GJA1', 'MBP', aob_labels, 'oligo', NAMESPACE)
    #astro_oligo_joint(X, genes, 'GJA1', 'MBP', aob_labels, 'both', NAMESPACE)
    #astro_oligo_joint(X, genes, 'GJA1', 'PLP1', aob_labels, 'astro', NAMESPACE)
    #astro_oligo_joint(X, genes, 'GJA1', 'PLP1', aob_labels, 'oligo', NAMESPACE)
    #astro_oligo_joint(X, genes, 'GJA1', 'PLP1', aob_labels, 'both', NAMESPACE)
    
    astro_oligo_violin(X, genes, 'GJA1', aob_labels, NAMESPACE)
    astro_oligo_violin(X, genes, 'MBP', aob_labels, NAMESPACE)
    astro_oligo_violin(X, genes, 'PLP1', aob_labels, NAMESPACE)

    viz_genes = [
        'SLC1A3', 'GJA1', 'MBP', 'PLP1', 'TRF',
        #'CST3', 'CPE', 'FTH1', 'APOE', 'MT1', 'NDRG2', 'TSPAN7',
        #'PLP1', 'MAL', 'PTGDS', 'CLDN11', 'APOD', 'QDPR', 'MAG', 'ERMN',
        #'PLP1', 'MAL', 'PTGDS', 'MAG', 'CLDN11', 'APOD', 'FTH1',
        #'ERMN', 'MBP', 'ENPP2', 'QDPR', 'MOBP', 'TRF',
        #'CST3', 'SPARCL1', 'PTN', 'CD81', 'APOE', 'ATP1A2', 'ITM2B'
    ]
    
    cell_labels = (
        open('data/cell_labels/mouse_brain_cluster.txt')
        .read().rstrip().split('\n')
    )
    le = LabelEncoder().fit(cell_labels)
    cell_labels = le.transform(cell_labels)
    
    embedding = visualize(
        [ X_dimred[samp_idx, :] ], labels,
        NAMESPACE + '_astro{}'.format(len(samp_idx)),
        [ str(ct) for ct in sorted(set(labels)) ],
        gene_names=viz_genes, gene_expr=X, genes=genes,
        perplexity=100, n_iter=500, image_suffix='.png',
        viz_cluster=True
    )

    #visualize_dropout(X, embedding, image_suffix='.png',
    #                  viz_prefix=NAMESPACE + '_dropout')