/*
 * To change this template, choose Tools | Templates
 * and open the template in the editor.
 */
package umcg.genetica.io.trityper;

import gnu.trove.map.hash.THashMap;
import gnu.trove.set.hash.THashSet;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map.Entry;
import java.util.Set;
import java.util.HashSet;
import java.util.List;
import java.util.Random;

import umcg.genetica.containers.Pair;
import umcg.genetica.io.Gpio;
import umcg.genetica.io.text.TextFile;
import umcg.genetica.math.matrix.DoubleMatrixDataset;
import umcg.genetica.math.stats.Log2Transform;
import umcg.genetica.math.stats.QuantileNormalization;

/**
 * @author harmjan
 */
public final class TriTyperGeneticalGenomicsDataset implements Comparable<TriTyperGeneticalGenomicsDataset> {

	// Genotype side of the dataset; created and loaded by the main constructor.
	private TriTyperGenotypeData genotypeData;
	// Expression side of the dataset; created and loaded by the main constructor.
	private TriTyperExpressionData expressionData;
	// Genotype sample name -> coupled expression sample name. Rebuilt by loadCouplings()
	// and narrowed to usable pairs by pruneGenotypeToExpressionCouplings().
	private THashMap<String, String> genotypeToExpressionCouplings;
	// Settings object passed to the constructor (the constructor also writes to some of its fields).
	private TriTyperGeneticalGenomicsDatasetSettings settings;
	// Indexed by expression individual id; value is the coupled genotype individual id.
	// permuteSampleLables() treats a value of -1 as "no genotype linked".
	private short[] expressionToGenotypeIdArray;
	// Number of genotype individuals that are coupled to a loaded expression sample.
	private short totalGGSamples;
	// Result of expressionData.load(...); stays true until the constructor overwrites it.
	private boolean expressionDataLoadedCorrectly = true;
	// Indexed by genotype individual id; only (re)built inside permuteSampleLables().
	private short[] genotypeToExpressionIdArray;
	// Covariate matrix (covariates on rows, samples on columns); null unless a covariate file was loaded.
	DoubleMatrixDataset<String, String> covariates = null;

	/**
	 * Loads a combined genotype / gene expression dataset.
	 * <p>
	 * Steps, in order: load the genotype metadata, load the genotype-to-expression
	 * sample couplings, load the expression data restricted to the expression samples
	 * coupled to an included genotype individual, prune the couplings to pairs that
	 * exist on both sides, and, if a covariate file is configured and exists, load the
	 * covariates and prune/reorder the expression samples to those having covariates.
	 * <p>
	 * Side effects: {@code settings.genotypeLocation} is rewritten through
	 * {@link Gpio#formatAsDirectory}, and {@code settings.expressionLocation} is
	 * defaulted to {@code genotypeLocation + "ExpressionData.txt"} when it is null.
	 * The JVM is terminated with {@code System.exit(-1)} when no expression sample
	 * would be included, or when no usable covariate samples can be found.
	 *
	 * @param settings           dataset settings (locations, confinements, analysis flags)
	 * @param pathwayDefinitions passed through to the expression data loader; may be null
	 * @param displayWarnings    forwarded to both the genotype and expression data objects
	 * @throws IOException if the coupling file is configured but does not exist, or on read errors
	 * @throws Exception   propagated from the underlying loaders
	 */
	public TriTyperGeneticalGenomicsDataset(TriTyperGeneticalGenomicsDatasetSettings settings, Pair<List<String>, List<List<String>>> pathwayDefinitions, boolean displayWarnings) throws IOException, Exception {

		this.settings = settings;

		settings.genotypeLocation = Gpio.formatAsDirectory(settings.genotypeLocation);

		if (settings.expressionLocation == null) {
			settings.expressionLocation = settings.genotypeLocation + "ExpressionData.txt";
		}

		// load the genotype metadata
		genotypeData = new TriTyperGenotypeData();
		genotypeData.displayWarnings = displayWarnings;
		genotypeData.load(settings.genotypeLocation, settings.snpmapFileLocation, settings.snpFileLocation);
		THashSet<String> includedExpressionIndividuals = new THashSet<String>();
		Boolean[] isIncluded = genotypeData.getIsIncluded();

		// preload the sample coupling file
		loadCouplings();

		// determine which expression samples to include:
		// only those coupled to a known (-9 means "not found") and included genotype individual
		Set<Entry<String, String>> entries = genotypeToExpressionCouplings.entrySet();
		for (Entry<String, String> entry : entries) {
			String genotypeIndividual = entry.getKey();
			Integer genotypeIndividualId = genotypeData.getIndividualId(genotypeIndividual);

			if (genotypeIndividualId != -9 && isIncluded[genotypeIndividualId] != null && isIncluded[genotypeIndividualId]) {
				includedExpressionIndividuals.add(entry.getValue());
			}
		}

		if (includedExpressionIndividuals.isEmpty()) {
			System.err.println("ERROR: none of the expression samples will be included with your current settings.\n" +
					"Please check the links between genotype and gene expression samples and/or your PhenotypeInformation.txt");
			System.exit(-1);
		}

		// load the expression data
		expressionData = new TriTyperExpressionData();
		expressionData.displayWarnings = displayWarnings;
		expressionData.confineToProbes(settings.tsProbesConfine);
		expressionData.setConfineToProbesThatMapToAnyChromosome(settings.confineProbesToProbesMappingToAnyChromosome);
		expressionData.setConfineToProbesThatMapToChromosome(settings.confineProbesToProbesThatMapToChromosome);
		expressionData.setIncludeIndividuals(includedExpressionIndividuals);
		expressionData.setPathwayDefinitions(pathwayDefinitions);
		expressionDataLoadedCorrectly = expressionData.load(settings.expressionLocation, settings.probeannotation, settings.expressionplatform, (settings.cisAnalysis && settings.transAnalysis));
		pruneGenotypeToExpressionCouplings();

		if (settings.covariateFile != null && Gpio.exists(settings.covariateFile)) {
			// load covariates..
			System.out.println("Loading covariates: " + settings.covariateFile);
			HashSet<String> individualSet = new HashSet<String>();
			individualSet.addAll(Arrays.asList(expressionData.getIndividuals()));
			// NOTE(review): presumably this overload confines the columns to individualSet - confirm
			covariates = new DoubleMatrixDataset<String, String>(settings.covariateFile, null, individualSet);

			if (covariates.colObjects.isEmpty()) {
				// try the transpose
				// NOTE(review): presumably this overload confines the rows to individualSet - confirm
				System.out.println("Could not find matching sample identifiers between covariate file and expression file.\nTransposing your covariate file.");
				covariates = new DoubleMatrixDataset<String, String>(settings.covariateFile, individualSet);
				if (covariates.rowObjects.isEmpty()) {
					System.err.println("Could not find matching samples between expression data and covariate data.");
					System.exit(-1);
				} else {
					covariates.transposeDataset(); // put the covariates on the rows, samples on the columns
					covariates.recalculateHashMaps();
				}
			}

			covariates.removeColumnsWithNaNs();
			covariates.recalculateHashMaps();
			if (covariates.colObjects.isEmpty()) {
				System.err.println("ERROR: after removing samples with NaN values, no covariates remain");
				System.exit(-1);
			}

			System.out.println(covariates.rowObjects.size() + " covariates loaded for " + covariates.colObjects.size() + " samples");

			// remove expression samples without covariates, and reorder expression data
			expressionData.pruneAndReorderSamples(covariates.colObjects);

			// the expression sample set (and its ids) just changed, so rebuild the
			// couplings from scratch and prune them against the reduced expression data
			loadCouplings();
			pruneGenotypeToExpressionCouplings();
		}

	}

	/**
	 * Convenience constructor: loads the dataset without pathway definitions and with
	 * warnings enabled (delegates with {@code null} and {@code true}).
	 *
	 * @param settings dataset settings
	 */
	public TriTyperGeneticalGenomicsDataset(TriTyperGeneticalGenomicsDatasetSettings settings) throws IOException, Exception {
		this(settings, null, true);
	}

	/**
	 * Convenience constructor: loads the dataset with the given pathway definitions
	 * and with warnings enabled (delegates with {@code displayWarnings = true}).
	 *
	 * @param triTyperGeneticalGenomicsDatasetSettings dataset settings
	 * @param pathwayDefinitions                       forwarded to the main constructor
	 */
	public TriTyperGeneticalGenomicsDataset(TriTyperGeneticalGenomicsDatasetSettings triTyperGeneticalGenomicsDatasetSettings, Pair<List<String>, List<List<String>>> pathwayDefinitions) throws Exception {
		this(triTyperGeneticalGenomicsDatasetSettings, pathwayDefinitions, true);
	}

	/**
	 * Returns the genotype data object held by this dataset.
	 *
	 * @return the internal genotypeData instance (not a copy)
	 */
	public TriTyperGenotypeData getGenotypeData() {
		return genotypeData;
	}

	/**
	 * Replaces the genotype data object. Only the reference is swapped; couplings and
	 * id arrays are not recomputed here.
	 *
	 * @param genotypeData the genotypeData to set
	 */
	public void setGenotypeData(TriTyperGenotypeData genotypeData) {
		this.genotypeData = genotypeData;
	}

	/**
	 * Returns the expression data object held by this dataset.
	 *
	 * @return the internal expressionData instance (not a copy)
	 */
	public TriTyperExpressionData getExpressionData() {
		return expressionData;
	}

	/**
	 * Replaces the expression data object. Only the reference is swapped; couplings and
	 * id arrays are not recomputed here.
	 *
	 * @param expressionData the expressionData to set
	 */
	public void setExpressionData(TriTyperExpressionData expressionData) {
		this.expressionData = expressionData;
	}

	/**
	 * Returns the live map from genotype sample name to coupled expression sample name.
	 *
	 * @return the internal genotypeToExpressionCouplings map (not a copy)
	 */
	public THashMap<String, String> getGenotypeToExpressionCouplings() {
		return genotypeToExpressionCouplings;
	}

	/**
	 * Replaces the genotype-to-expression coupling map. The id arrays derived from the
	 * couplings are not updated until pruneGenotypeToExpressionCouplings() is called.
	 *
	 * @param genotypeToExpressionCouplings the genotypeToExpressionCouplings to
	 *                                      set
	 */
	public void setGenotypeToExpressionCouplings(THashMap<String, String> genotypeToExpressionCouplings) {
		this.genotypeToExpressionCouplings = genotypeToExpressionCouplings;
	}

	/**
	 * Returns the settings object this dataset was constructed with (or last set).
	 *
	 * @return the settings
	 */
	public TriTyperGeneticalGenomicsDatasetSettings getSettings() {
		return settings;
	}

	/**
	 * Replaces the settings object. Nothing is reloaded as a result of this call.
	 *
	 * @param settings the settings to set
	 */
	public void setSettings(TriTyperGeneticalGenomicsDatasetSettings settings) {
		this.settings = settings;
	}

	/**
	 * Returns the number of genotype individuals coupled to a loaded expression sample,
	 * as counted by the last call to pruneGenotypeToExpressionCouplings().
	 *
	 * @return the sample count (stored internally as a short, widened to int)
	 */
	public int getTotalGGSamples() {
		return totalGGSamples;
	}

	/**
	 * Rebuilds {@code genotypeToExpressionCouplings} from scratch.
	 * <p>
	 * If {@code settings.genotypeToExpressionCoupling} names a file, each tab-separated
	 * line with at least two columns contributes a pair (column 0 = genotype sample,
	 * column 1 = expression sample); shorter lines are skipped. Otherwise every included
	 * genotype individual is coupled to an expression sample of the same name.
	 * <p>
	 * A duplicate genotype sample is treated as fatal: the error is printed to stderr
	 * and the JVM exits with status -1.
	 *
	 * @throws IOException if the configured coupling file does not exist or cannot be read
	 */
	private void loadCouplings() throws IOException {
		genotypeToExpressionCouplings = new THashMap<String, String>();
		String genotypeToExpressionCoupling = settings.genotypeToExpressionCoupling;
		if (genotypeToExpressionCoupling != null && genotypeToExpressionCoupling.trim().length() > 0) {
			if (!Gpio.exists(genotypeToExpressionCoupling)) {
				throw new IOException("Error: genotype to expression coupling file: " + genotypeToExpressionCoupling + " does not exist.");
			}
			TextFile in = new TextFile(genotypeToExpressionCoupling, TextFile.R);
			try {
				String[] elems = in.readLineElemsReturnReference(TextFile.tab);

				while (elems != null) {
					if (elems.length > 1) {
						// Copy the strings without a byte round trip: encoding to UTF-8 and decoding
						// with the platform default charset corrupts non-ASCII sample names whenever
						// the default charset is not UTF-8.
						// NOTE(review): the copy itself is presumably meant to detach the value from
						// the reader's line buffer - confirm whether it is still needed.
						String key = new String(elems[0]);
						String value = new String(elems[1]);
						if (genotypeToExpressionCouplings.get(key) != null) {
							System.err.println("ERROR: your genotype to expression coupling file contains duplicate entries for individual: " + key);
							// non-zero status: this is an error exit, like the fatal exits in the constructor
							System.exit(-1);
						} else {
							genotypeToExpressionCouplings.put(key, value);
						}

					}
					elems = in.readLineElemsReturnReference(TextFile.tab);
				}
			} finally {
				// release the file handle even when reading fails
				in.close();
			}
		} else {
			// no coupling file: couple every included genotype individual to itself
			Boolean[] isIncluded = genotypeData.getIsIncluded();
			int i = 0;
			String[] individuals = genotypeData.getIndividuals();
			for (String ind : individuals) {
				if (isIncluded[i] != null && isIncluded[i]) {
					if (genotypeToExpressionCouplings.get(ind) != null) {
						System.err.println("ERROR: your genotype data contains duplicate individuals: " + ind);
						System.exit(-1);
					} else {
						genotypeToExpressionCouplings.put(ind, ind);
					}
				}
				i++;
			}
		}
	}

	/**
	 * Returns a freshly allocated int copy of the expression-to-genotype id array.
	 *
	 * @return a new int[] whose element i is the genotype id stored for expression id i
	 */
	public int[] getExpressionToGenotypeIdArray() {
		final short[] source = expressionToGenotypeIdArray;
		final int[] widened = new int[source.length];
		int position = 0;
		for (short genotypeId : source) {
			widened[position] = genotypeId;
			position++;
		}
		return widened;
	}

	/**
	 * Returns the internal expression-to-genotype id array itself, without copying;
	 * changes made by the caller are visible to this dataset.
	 *
	 * @return the internal short[] indexed by expression individual id
	 */
	public short[] getExpressionToGenotypeIdArrayShort() {
		return expressionToGenotypeIdArray;
	}

	/**
	 * Orders datasets by their number of genotype individuals (ascending).
	 *
	 * @param o the dataset to compare against
	 * @return this dataset's individual count minus the other's
	 */
	@Override
	public int compareTo(TriTyperGeneticalGenomicsDataset o) {
		final int otherCount = o.getGenotypeData().getIndividuals().length;
		final int ownCount = genotypeData.getIndividuals().length;
		return ownCount - otherCount;
	}

	/**
	 * Reports whether both datasets have the same number of genotype individuals.
	 * <p>
	 * Note that this is an overload taking a dataset, not an override of
	 * {@code Object.equals(Object)}; it is only selected when the argument's static
	 * type is {@code TriTyperGeneticalGenomicsDataset}.
	 *
	 * @param o the dataset to compare against
	 * @return true when the individual counts match
	 */
	public boolean equals(TriTyperGeneticalGenomicsDataset o) {
		final int otherCount = o.getGenotypeData().getIndividuals().length;
		final int ownCount = genotypeData.getIndividuals().length;
		return ownCount == otherCount;
	}

	/**
	 * Randomly reassigns the linked genotype ids over the expression samples.
	 * <p>
	 * Expression slots holding -1 keep -1; every other slot receives a genotype id
	 * drawn without replacement from the ids that were linked before the call.
	 * {@code genotypeToExpressionIdArray} is rebuilt to reflect the new assignment.
	 *
	 * @param r source of randomness; one {@code nextDouble()} draw is used per linked slot
	 */
	public void permuteSampleLables(Random r) {
		final short[] current = expressionToGenotypeIdArray;
		final int slotCount = current.length;

		// pool of genotype ids that are currently linked to an expression sample
		ArrayList<Short> pool = new ArrayList<Short>();
		for (int slot = 0; slot < slotCount; slot++) {
			short linkedGenotype = current[slot];
			if (linkedGenotype != -1) {
				pool.add(linkedGenotype);
			}
		}

		short[] permuted = new short[slotCount];
		genotypeToExpressionIdArray = new short[genotypeData.getIndividuals().length];

		for (int slot = 0; slot < slotCount; slot++) {
			if (current[slot] == -1) {
				permuted[slot] = -1;
				continue;
			}
			// draw a random position in the remaining pool and take that id out of it
			int pick = (int) (r.nextDouble() * (double) pool.size());
			short drawnGenotype = pool.remove(pick);
			permuted[slot] = drawnGenotype;
			genotypeToExpressionIdArray[drawnGenotype] = (short) slot;
		}

		expressionToGenotypeIdArray = permuted;
	}

	/**
	 * Shuffles the values of every covariate across samples, each covariate (row)
	 * independently. Does nothing when no covariates are loaded.
	 * <p>
	 * The sample loops run over the full length of each covariate row. They were
	 * previously bounded by the number of covariates ({@code nrRows}), which shuffled
	 * only part of each row when there were more samples than covariates and threw an
	 * {@code ArrayIndexOutOfBoundsException} when there were fewer.
	 *
	 * @param r source of randomness handed to {@link Collections#shuffle(List, Random)}
	 */
	public void permuteCovariates(Random r) {
		// shuffle covariate, if any
		if (covariates == null) {
			return;
		}
		System.out.println("Randomizing covariates");
		for (int covariate = 0; covariate < covariates.nrRows; covariate++) {
			// number of sample values stored for this covariate
			int nrSamples = covariates.rawData[covariate].length;
			ArrayList<Double> covariateData = new ArrayList<Double>(nrSamples);
			for (int sample = 0; sample < nrSamples; sample++) {
				covariateData.add(covariates.rawData[covariate][sample]);
			}
			Collections.shuffle(covariateData, r);
			for (int sample = 0; sample < nrSamples; sample++) {
				covariates.rawData[covariate][sample] = covariateData.get(sample);
			}
		}
	}

	/**
	 * Discards the current couplings and reloads them via loadCouplings(). The derived
	 * id arrays are not rebuilt by this call.
	 *
	 * @throws IOException if the configured coupling file is missing or unreadable
	 */
	public void resetGenotypeToExpressionCouplings() throws IOException {
		loadCouplings();
	}

	/**
	 * Narrows the couplings to genotype individuals whose coupled expression sample is
	 * actually present in the expression data, and rebuilds the derived state:
	 * the genotype include mask, {@code totalGGSamples}, the coupling map, and
	 * {@code expressionToGenotypeIdArray}.
	 * <p>
	 * An individual id of -9 returned by either data object means "not found".
	 *
	 * @throws IllegalStateException if the number of linked samples, or a linked
	 *                               genotype id, does not fit in a short (the ids are
	 *                               stored as shorts and would otherwise be silently
	 *                               truncated)
	 */
	public void pruneGenotypeToExpressionCouplings() {
		// now check whether each genotype is actually linked to an expression individual...
		String[] individuals = genotypeData.getIndividuals();

		Boolean[] isReallyIncluded = new Boolean[individuals.length];
		THashMap<String, String> realGenotypeToExpressionCouplings = new THashMap<String, String>();
		// counted as int so that an overflow of the short field can be detected
		int nrLinkedSamples = 0;
		for (int i = 0; i < individuals.length; i++) {
			String genotypeInd = individuals[i];
			// null covers both "no coupling" and "coupled to null"; either way the
			// individual is excluded (never left as a null entry in the mask)
			String coupledExpressionSample = genotypeToExpressionCouplings.get(genotypeInd);
			boolean linked = false;
			if (coupledExpressionSample != null) {
				Integer expressionSampleId = expressionData.getIndividualId(coupledExpressionSample);
				linked = expressionSampleId != -9;
			}
			isReallyIncluded[i] = linked;
			if (linked) {
				realGenotypeToExpressionCouplings.put(genotypeInd, coupledExpressionSample);
				nrLinkedSamples++;
			}
		}

		if (nrLinkedSamples > Short.MAX_VALUE) {
			throw new IllegalStateException("Number of linked genotype/expression samples (" + nrLinkedSamples
					+ ") exceeds the supported maximum of " + Short.MAX_VALUE);
		}
		totalGGSamples = (short) nrLinkedSamples;

		// exclude genotypes for which no expression data is available.
		genotypeData.setIsIncluded(isReallyIncluded);
		genotypeToExpressionCouplings = realGenotypeToExpressionCouplings;

		// couple expression IDs to genotype IDs for quick reference
		Set<Entry<String, String>> entries = realGenotypeToExpressionCouplings.entrySet();
		expressionToGenotypeIdArray = new short[totalGGSamples];
		HashSet<Integer> visitedNumbers = new HashSet<Integer>();
		for (Entry<String, String> entry : entries) {
			Integer expressionIndId = expressionData.getIndividualId(entry.getValue());
			Integer genotypeIndId = genotypeData.getIndividualId(entry.getKey());
			if (expressionIndId != -9 && genotypeIndId != -9) {
				if (visitedNumbers.contains(expressionIndId)) {
					// two genotype individuals point at the same expression sample; the first one wins
					System.out.println("ERROR: your dataset contains duplicate samples!");
				} else {
					if (genotypeIndId > Short.MAX_VALUE) {
						throw new IllegalStateException("Genotype individual id " + genotypeIndId + " for sample "
								+ entry.getKey() + " exceeds the supported maximum of " + Short.MAX_VALUE);
					}
					expressionToGenotypeIdArray[expressionIndId] = genotypeIndId.shortValue();
					visitedNumbers.add(expressionIndId);
				}
			}
		}
	}

	/**
	 * Builds a new map from genotype individual id to expression individual id, derived
	 * from the expression-to-genotype id array. When a genotype id occurs more than
	 * once, the highest expression id wins.
	 *
	 * @return a freshly created map; modifying it does not affect this dataset
	 */
	public HashMap<Integer, Integer> getGenotypeToExpressionIdHash() {
		final short[] linkedGenotypes = expressionToGenotypeIdArray;
		final HashMap<Integer, Integer> genotypeToExpression = new HashMap<Integer, Integer>();
		for (int expressionId = 0; expressionId < linkedGenotypes.length; expressionId++) {
			int genotypeId = linkedGenotypes[expressionId];
			genotypeToExpression.put(genotypeId, expressionId);
		}
		return genotypeToExpression;
	}

	/**
	 * Builds a new map from expression individual id to genotype individual id, with
	 * one entry per element of the expression-to-genotype id array.
	 *
	 * @return a freshly created map; modifying it does not affect this dataset
	 */
	public HashMap<Integer, Integer> getExpressionToGenotypeIdHash() {
		final short[] linkedGenotypes = expressionToGenotypeIdArray;
		final HashMap<Integer, Integer> expressionToGenotype = new HashMap<Integer, Integer>();
		for (int expressionId = 0; expressionId < linkedGenotypes.length; expressionId++) {
			int genotypeId = linkedGenotypes[expressionId];
			expressionToGenotype.put(expressionId, genotypeId);
		}
		return expressionToGenotype;
	}

	/**
	 * Reports the value returned by the expression data loader during construction
	 * (or the value last set through the setter).
	 *
	 * @return the expressionDataLoadedCorrectly
	 */
	public boolean isExpressionDataLoadedCorrectly() {
		return expressionDataLoadedCorrectly;
	}

	/**
	 * Overrides the "expression data loaded correctly" flag.
	 *
	 * @param expressionDataLoadedCorrectly the expressionDataLoadedCorrectly to
	 *                                      set
	 */
	public void setExpressionDataLoadedCorrectly(boolean expressionDataLoadedCorrectly) {
		this.expressionDataLoadedCorrectly = expressionDataLoadedCorrectly;
	}

	/**
	 * Returns the covariate matrix held by this dataset.
	 *
	 * @return the internal covariates instance (not a copy), or null when no covariate
	 *         file was loaded
	 */
	public DoubleMatrixDataset<String, String> getCovariateData() {
		return covariates;
	}
}