python source code of plinkfiles

"""
Code for handling plink files.

Uses plinkio.
"""
import scipy as sp
from plinkio import plinkfile

def get_chrom_dict(loci, chromosomes, debug=False):
    chr_dict = {}
    for chrom in chromosomes:
        chr_str = 'chrom_%s' % chrom
        chr_dict[chr_str] = {'sids':[], 'snp_indices':[], 'positions':[], 'nts':[]}
      
    for i, l in enumerate(loci):
        chrom = l.chromosome
        pos = l.bp_position
        chr_str = 'chrom_%d' % chrom
        chr_dict[chr_str]['sids'].append(l.name)
        chr_dict[chr_str]['snp_indices'].append(i)
        chr_dict[chr_str]['positions'].append(pos)
        chr_dict[chr_str]['nts'].append([l.allele1, l.allele2])
      
    if debug:
        print('Genotype dictionary filled')
    return chr_dict


def parse_plink_snps(genotype_file, snp_indices):
    plinkf = plinkfile.PlinkFile(genotype_file)
    samples = plinkf.get_samples()
    num_individs = len(samples)
    num_snps = len(snp_indices)
    raw_snps = sp.empty((num_snps, num_individs), dtype='int8')
    # If these indices are not in order then we place them in the right place while parsing SNPs.
    snp_order = sp.argsort(snp_indices)
    ordered_snp_indices = list(snp_indices[snp_order])
    ordered_snp_indices.reverse()
    # Iterating over file to load SNPs
    snp_i = 0
    next_i = ordered_snp_indices.pop()
    line_i = 0
    max_i = ordered_snp_indices[0]
    while line_i <= max_i: 
        if line_i < next_i:
            next(plinkf)
        elif line_i == next_i:
            line = next(plinkf)
            snp = sp.array(line, dtype='int8')
            bin_counts = line.allele_counts()
            if bin_counts[-1] > 0:
                mode_v = sp.argmax(bin_counts[:2])
                snp[snp == 3] = mode_v
            s_i = snp_order[snp_i]
            raw_snps[s_i] = snp
            if line_i < max_i:
                next_i = ordered_snp_indices.pop()
            snp_i += 1
        line_i += 1
    plinkf.close()
    assert snp_i == len(raw_snps), 'Parsing SNPs from plink file failed.'
    num_indivs = len(raw_snps[0])
    freqs = sp.sum(raw_snps, 1, dtype='float32') / (2 * float(num_indivs))
    return raw_snps, freqs

def get_num_indivs(genotype_file):
    plinkf = plinkfile.PlinkFile(genotype_file)
    samples = plinkf.get_samples()
    plinkf.close()
    return len(samples)

def get_phenotypes(plinkf, debug=False):
    samples = plinkf.get_samples()
    num_individs = len(samples)
    Y = [s.phenotype for s in samples]
    fids = [s.fid for s in samples]
    iids = [s.iid for s in samples]
    unique_phens = sp.unique(Y)
    if len(unique_phens) == 1:
        print('Unable to find phenotype values.')
        has_phenotype = False
    elif len(unique_phens) == 2:
        cc_bins = sp.bincount(Y)
        assert len(cc_bins) == 2, 'Problems with loading phenotype'
        if debug:
            print('Loaded %d controls and %d cases' % (cc_bins[0], cc_bins[1]))
        has_phenotype = True
    else:
        if debug:
            print('Found quantitative phenotype values')
        has_phenotype = True
    return {'has_phenotype':has_phenotype, 'fids':fids, 'iids':iids, 'phenotypes':Y, 'num_individs':num_individs}