org.apache.commons.math3.random.RandomGeneratorFactory Java Examples

The following examples show how to use org.apache.commons.math3.random.RandomGeneratorFactory. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: BM.java    From pyramid with Apache License 2.0 6 votes vote down vote up
public BM(int numClusters, int dimension, long randomSeed) {
    this.numClusters = numClusters;
    this.dimension = dimension;
    this.distributions = new BernoulliDistribution[numClusters][dimension];
    this.mixtureCoefficients = new double[numClusters];
    Arrays.fill(mixtureCoefficients,1.0/numClusters);
    this.logMixtureCoefficients = new double[numClusters];
    Arrays.fill(logMixtureCoefficients,Math.log(1.0/numClusters));
    Random random = new Random(randomSeed);
    RandomGenerator randomGenerator = RandomGeneratorFactory.createRandomGenerator(random);
    UniformRealDistribution uniform = new UniformRealDistribution(randomGenerator, 0.25,0.75);
    for (int k=0;k<numClusters;k++){
        for (int d=0;d<dimension;d++){
            double p = uniform.sample();
            distributions[k][d] = new BernoulliDistribution(p);
        }
    }
    this.logClusterConditioinalForEmpty = new double[numClusters];
    updateLogClusterConditioinalForEmpty();

    this.names = new ArrayList<>(dimension);
    for (int d=0;d<dimension;d++){
        names.add(""+d);
    }
}
 
Example #2
Source File: AdaptiveMetropolisSamplerUnitTest.java    From gatk-protected with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
@Test
public void testBeta() {
    final RandomGenerator rng = RandomGeneratorFactory.createRandomGenerator(new Random(RANDOM_SEED));
    for (final double a : Arrays.asList(10, 20, 30)) {
        for (final double b : Arrays.asList(10, 20, 30)) {

            final double theoreticalMean = a / (a + b);
            final double theoreticalVariance = a*b/((a+b)*(a+b)*(a+b+1));

            //Note: this is the theoretical standard deviation of the sample mean given uncorrelated
            //samples.  The sample mean will have a greater variance here because samples are correlated.
            final double standardDeviationOfMean = Math.sqrt(theoreticalVariance / NUM_SAMPLES);

            final Function<Double, Double> logPDF = x -> (a - 1) * Math.log(x) + (b - 1) * Math.log(1 - x);
            final AdaptiveMetropolisSampler sampler = new AdaptiveMetropolisSampler(INITIAL_BETA_GUESS, INITIAL_STEP_SIZE, 0, 1);
            final List<Double> samples = sampler.sample(rng, logPDF, NUM_SAMPLES, NUM_BURN_IN_STEPS);

            final double sampleMean = samples.stream().mapToDouble(x -> x).average().getAsDouble();
            final double sampleMeanSquare = samples.stream().mapToDouble(x -> x*x).average().getAsDouble();
            final double sampleVariance = (sampleMeanSquare - sampleMean * sampleMean)*NUM_SAMPLES/(NUM_SAMPLES-1);

            Assert.assertEquals(sampleMean, theoreticalMean, 10 * standardDeviationOfMean);
            Assert.assertEquals(sampleVariance, theoreticalVariance, 10e-4);
        }
    }
}
 
Example #3
Source File: AdaptiveMetropolisSamplerUnitTest.java    From gatk-protected with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
@Test
public void testGaussian() {
    final RandomGenerator rng = RandomGeneratorFactory.createRandomGenerator(new Random(RANDOM_SEED));
    for (final double theoreticalMean : Arrays.asList(0)) {
        for (final double precision : Arrays.asList(1.0)) {
            final double variance = 1/precision;

            //Note: this is the theoretical standard deviation of the sample mean given uncorrelated
            //samples.  The sample mean will have a greater variance here because samples are correlated.
            final double standardDeviationOfMean = Math.sqrt(variance / NUM_SAMPLES);

            final Function<Double, Double> logPDF = x -> -(precision/2)*(x-theoreticalMean)*(x-theoreticalMean);
            final AdaptiveMetropolisSampler sampler = new AdaptiveMetropolisSampler(INITIAL_GAUSSIAN_GUESS, INITIAL_STEP_SIZE,
                    Double.NEGATIVE_INFINITY, Double.POSITIVE_INFINITY);
            final List<Double> samples = sampler.sample(rng, logPDF, NUM_SAMPLES, NUM_BURN_IN_STEPS);

            final double sampleMean = samples.stream().mapToDouble(x -> x).average().getAsDouble();
            final double sampleMeanSquare = samples.stream().mapToDouble(x -> x*x).average().getAsDouble();
            final double sampleVariance = (sampleMeanSquare - sampleMean * sampleMean)*NUM_SAMPLES/(NUM_SAMPLES-1);

            Assert.assertEquals(sampleMean, theoreticalMean, 6 * standardDeviationOfMean);
            Assert.assertEquals(sampleVariance, variance, variance/10);
        }
    }
}
 
Example #4
Source File: CopyRatioModellerUnitTest.java    From gatk with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
@Test
public void testMCMC() {
    final double variance = 0.01;
    final double outlierProbability = 0.05;
    final int numSegments = 100;
    final double averageIntervalsPerSegment = 100.;
    final int numSamples = 150;
    final int numBurnIn = 50;
    final RandomGenerator rng = RandomGeneratorFactory.createRandomGenerator(new Random(RANDOM_SEED));

    final SampleLocatableMetadata metadata = new SimpleSampleLocatableMetadata(
            "test-sample",
            new SAMSequenceDictionary(IntStream.range(0, numSegments)
                    .mapToObj(i -> new SAMSequenceRecord("chr" + i + 1, 10000))
                    .collect(Collectors.toList())));
    final CopyRatioSimulatedData simulatedData = new CopyRatioSimulatedData(
            metadata, variance, outlierProbability, numSegments, averageIntervalsPerSegment, rng);

    final CopyRatioModeller modeller = new CopyRatioModeller(simulatedData.getData().getCopyRatios(), simulatedData.getData().getSegments());
    modeller.fitMCMC(numSamples, numBurnIn);

    assertCopyRatioPosteriorCenters(modeller, simulatedData);
}
 
Example #5
Source File: AdaptiveMetropolisSamplerUnitTest.java    From gatk with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
@Test
public void testGaussian() {
    final RandomGenerator rng = RandomGeneratorFactory.createRandomGenerator(new Random(RANDOM_SEED));
    for (final double theoreticalMean : Arrays.asList(0)) {
        for (final double precision : Arrays.asList(1.0)) {
            final double variance = 1/precision;

            //Note: this is the theoretical standard deviation of the sample mean given uncorrelated
            //samples.  The sample mean will have a greater variance here because samples are correlated.
            final double standardDeviationOfMean = Math.sqrt(variance / NUM_SAMPLES);

            final Function<Double, Double> logPDF = x -> -(precision/2)*(x-theoreticalMean)*(x-theoreticalMean);
            final AdaptiveMetropolisSampler sampler = new AdaptiveMetropolisSampler(INITIAL_GAUSSIAN_GUESS, INITIAL_STEP_SIZE,
                    Double.NEGATIVE_INFINITY, Double.POSITIVE_INFINITY);
            final List<Double> samples = sampler.sample(rng, logPDF, NUM_SAMPLES, NUM_BURN_IN_STEPS);

            final double sampleMean = samples.stream().mapToDouble(x -> x).average().getAsDouble();
            final double sampleMeanSquare = samples.stream().mapToDouble(x -> x*x).average().getAsDouble();
            final double sampleVariance = (sampleMeanSquare - sampleMean * sampleMean)*NUM_SAMPLES/(NUM_SAMPLES-1);

            Assert.assertEquals(sampleMean, theoreticalMean, 6 * standardDeviationOfMean);
            Assert.assertEquals(sampleVariance, variance, variance/10);
        }
    }
}
 
Example #6
Source File: AlleleFractionSegmenterUnitTest.java    From gatk-protected with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
@Test
public void testChromosomesOnDifferentSegments() {
    final RandomGenerator rng = RandomGeneratorFactory.createRandomGenerator(new Random(563));

    final double[] trueMinorAlleleFractions = new double[] {0.12, 0.32, 0.5};
    final double trueMemoryLength = 1e5;
    final AlleleFractionGlobalParameters trueParams = new AlleleFractionGlobalParameters(1.0, 0.01, 0.01);

    // randomly set positions
    final int chainLength = 100;
    final List<SimpleInterval> positions = CopyRatioSegmenterUnitTest.randomPositions("chr1", chainLength, rng, trueMemoryLength/4);
    positions.addAll(CopyRatioSegmenterUnitTest.randomPositions("chr2", chainLength, rng, trueMemoryLength/4));
    positions.addAll(CopyRatioSegmenterUnitTest.randomPositions("chr3", chainLength, rng, trueMemoryLength/4));

    final int trueState = 2;    //fix everything to the same state 2
    final List<Double> minorAlleleFractionSequence = Collections.nCopies(positions.size(), trueMinorAlleleFractions[trueState]);

    final AllelicCountCollection counts = generateCounts(minorAlleleFractionSequence, positions, rng, trueParams);

    final AlleleFractionSegmenter segmenter = new AlleleFractionSegmenter(10, counts, AllelicPanelOfNormals.EMPTY_PON);
    final List<ModeledSegment> segments = segmenter.getModeledSegments();

    //check that each chromosome has at least one segment
    final int numDifferentContigsInSegments = (int) segments.stream().map(ModeledSegment::getContig).distinct().count();
    Assert.assertEquals(numDifferentContigsInSegments, 3);
}
 
Example #7
Source File: CoverageDropoutDetectorTest.java    From gatk-protected with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
private Object[][] getUnivariateGaussianTargets(final double sigma) {
    Random rng = new Random(337);
    final RandomGenerator randomGenerator = RandomGeneratorFactory.createRandomGenerator(rng);
    NormalDistribution n = new NormalDistribution(randomGenerator, 1, sigma);
    final int numDataPoints = 10000;
    final int numEventPoints = 2000;

    final List<ReadCountRecord.SingleSampleRecord> targetList = new ArrayList<>();
    for (int i = 0; i < (numDataPoints - numEventPoints); i++){
        targetList.add(new ReadCountRecord.SingleSampleRecord(new Target("arbitrary_name", new SimpleInterval("chr1", 100 + 2*i, 101 + 2 * i)), n.sample()));
    }
    for (int i = (numDataPoints - numEventPoints); i < numDataPoints; i++){
        targetList.add(new ReadCountRecord.SingleSampleRecord(new Target("arbitrary_name", new SimpleInterval("chr1", 100 + 2 * i, 101 + 2 * i)), 0.5 + n.sample()));
    }

    HashedListTargetCollection<ReadCountRecord.SingleSampleRecord> targets = new HashedListTargetCollection<>(targetList);

    List<ModeledSegment> segments = new ArrayList<>();
    segments.add(new ModeledSegment(new SimpleInterval("chr1", 100, 16050), 8000, 1));
    segments.add(new ModeledSegment(new SimpleInterval("chr1", 16100, 20200), 2000, 1.5));

    return new Object [] []{ {targets, segments}};
}
 
Example #8
Source File: AdaptiveMetropolisSamplerUnitTest.java    From gatk with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
@Test
public void testBeta() {
    final RandomGenerator rng = RandomGeneratorFactory.createRandomGenerator(new Random(RANDOM_SEED));
    for (final double a : Arrays.asList(10, 20, 30)) {
        for (final double b : Arrays.asList(10, 20, 30)) {

            final double theoreticalMean = a / (a + b);
            final double theoreticalVariance = a*b/((a+b)*(a+b)*(a+b+1));

            //Note: this is the theoretical standard deviation of the sample mean given uncorrelated
            //samples.  The sample mean will have a greater variance here because samples are correlated.
            final double standardDeviationOfMean = Math.sqrt(theoreticalVariance / NUM_SAMPLES);

            final Function<Double, Double> logPDF = x -> (a - 1) * Math.log(x) + (b - 1) * Math.log(1 - x);
            final AdaptiveMetropolisSampler sampler = new AdaptiveMetropolisSampler(INITIAL_BETA_GUESS, INITIAL_STEP_SIZE, 0, 1);
            final List<Double> samples = sampler.sample(rng, logPDF, NUM_SAMPLES, NUM_BURN_IN_STEPS);

            final double sampleMean = samples.stream().mapToDouble(x -> x).average().getAsDouble();
            final double sampleMeanSquare = samples.stream().mapToDouble(x -> x*x).average().getAsDouble();
            final double sampleVariance = (sampleMeanSquare - sampleMean * sampleMean)*NUM_SAMPLES/(NUM_SAMPLES-1);

            Assert.assertEquals(sampleMean, theoreticalMean, 10 * standardDeviationOfMean);
            Assert.assertEquals(sampleVariance, theoreticalVariance, 10e-4);
        }
    }
}
 
Example #9
Source File: XHMMModel.java    From gatk-protected with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/**
 * Creates a new model instance.
 *
 * @param eventStartProbability the probability per base pair of a transition from neutral to a CNV
 * @param meanEventSize the expectation of the distance between consecutive targets in an event
 * @param deletionMeanShift the deletion depth of coverage negative shift.
 * @param duplicationMeaShift the duplication depth of coverage positive shift.
 * @throws IllegalArgumentException if any of the model parameters has an invalid value.
 */
public XHMMModel(final double eventStartProbability,
                 final double meanEventSize,
                 final double deletionMeanShift,
                 final double duplicationMeaShift) {
    super(new XHMMEmissionProbabilityCalculator(
            deletionMeanShift,
            duplicationMeaShift,
            EMISSION_SD, RandomGeneratorFactory.createRandomGenerator(new Random(RANDOM_SEED))),
            eventStartProbability,
            meanEventSize);
}
 
Example #10
Source File: ChainPrunerUnitTest.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
@DataProvider(name = "chainPrunerData")
public Object[][] getChainPrunerData() {
    final RandomGenerator rng = RandomGeneratorFactory.createRandomGenerator(new Random(9));
    final int refLength = 100;
    final int leftSNVPosition = 15;
    final int middleSNVPosition = refLength / 2;
    final int rightSNVPosition = refLength - leftSNVPosition;

    final byte[] ref = new byte[refLength];
    IntStream.range(0, refLength).forEach(n -> ref[n] = BaseUtils.baseIndexToSimpleBase(rng.nextInt(4)));
    ref[leftSNVPosition] = 'A';
    ref[middleSNVPosition] = 'G';
    ref[rightSNVPosition] = 'T';

    final byte[] leftSNV = Arrays.copyOf(ref, refLength);
    leftSNV[leftSNVPosition] = 'G';

    final byte[] middleSNV = Arrays.copyOf(ref, refLength);
    middleSNV[middleSNVPosition] = 'T';

    final byte[] rightSNV = Arrays.copyOf(ref, refLength);
    rightSNV[rightSNVPosition] = 'A';

    // kmer size, ref bases, alt bases, alt fraction, base error rate, depth per start, log odds threshold, max unpruned variants
    return new Object[][] {
            { 10, ref, leftSNV, 0.5, 0.001, 20, 1.0},
            { 10, ref, middleSNV, 0.1, 0.001, 5, 1.0},
            { 25, ref, middleSNV, 0.1, 0.001, 5, 1.0},
            { 25, ref, middleSNV, 0.01, 0.001, 1000, 1.0},   // note the extreme depth -- this would confuse non-adaptive pruning
            { 10, ref, rightSNV, 0.1, 0.001, 2, 1.0}
    };
}
 
Example #11
Source File: AlleleFractionModellerUnitTest.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
@Test
public void testMCMC() {
    final double meanBias = 1.2;
    final double biasVariance = 0.04;
    final double outlierProbability = 0.02;
    final AlleleFractionGlobalParameters globalParameters = new AlleleFractionGlobalParameters(meanBias, biasVariance, outlierProbability);
    final double minorAlleleFractionPriorAlpha = 1.;
    final AlleleFractionPrior prior = new AlleleFractionPrior(minorAlleleFractionPriorAlpha);
    final int numSegments = 50;
    final double averageHetsPerSegment = 50.;
    final double averageDepth = 50.;
    final int numSamples = 150;
    final int numBurnIn = 50;
    final RandomGenerator rng = RandomGeneratorFactory.createRandomGenerator(new Random(RANDOM_SEED));

    final SampleLocatableMetadata metadata = new SimpleSampleLocatableMetadata(
            "test-sample",
            new SAMSequenceDictionary(IntStream.range(0, numSegments)
                    .mapToObj(i -> new SAMSequenceRecord("chr" + i + 1, 10000))
                    .collect(Collectors.toList())));
    final AlleleFractionSimulatedData simulatedData = new AlleleFractionSimulatedData(
            metadata, globalParameters, numSegments, averageHetsPerSegment, averageDepth, rng);

    final AlleleFractionModeller modeller = new AlleleFractionModeller(simulatedData.getData().getAllelicCounts(), simulatedData.getData().getSegments(), prior);
    modeller.fitMCMC(numSamples, numBurnIn);

    assertAlleleFractionPosteriorCenters(modeller, simulatedData);
}
 
Example #12
Source File: AlleleFractionInitializerUnitTest.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
@Test
public void testInitialization() {
    final double meanBias = 1.2;
    final double biasVariance = 0.04;
    final double outlierProbability = 0.02;
    final AlleleFractionGlobalParameters globalParameters = new AlleleFractionGlobalParameters(meanBias, biasVariance, outlierProbability);
    final int numSegments = 100;
    final double averageHetsPerSegment = 50.;
    final double averageDepth = 50.;
    final RandomGenerator rng = RandomGeneratorFactory.createRandomGenerator(new Random(RANDOM_SEED));

    final SampleLocatableMetadata metadata = new SimpleSampleLocatableMetadata(
            "test-sample",
            new SAMSequenceDictionary(IntStream.range(0, numSegments)
                    .mapToObj(i -> new SAMSequenceRecord("chr" + i + 1, 1000))
                    .collect(Collectors.toList())));
    final AlleleFractionSimulatedData simulatedData = new AlleleFractionSimulatedData(
            metadata, globalParameters, numSegments, averageHetsPerSegment, averageDepth, rng);

    final AlleleFractionSegmentedData data = simulatedData.getData();
    final AlleleFractionState initializedState = new AlleleFractionInitializer(data).getInitializedState();

    Assert.assertEquals(initializedState.meanBias(), meanBias, ABSOLUTE_TOLERANCE);
    Assert.assertEquals(initializedState.biasVariance(), biasVariance, ABSOLUTE_TOLERANCE);
    Assert.assertEquals(initializedState.outlierProbability(), outlierProbability, ABSOLUTE_TOLERANCE);

    final double averageMinorFractionError = IntStream.range(0, numSegments)
            .mapToDouble(s -> Math.abs(initializedState.segmentMinorFraction(s) - simulatedData.getTrueState().segmentMinorFraction(s)))
            .average().getAsDouble();
    Assert.assertEquals(averageMinorFractionError, 0, ABSOLUTE_TOLERANCE);
}
 
Example #13
Source File: MathUtilsUnitTest.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
@Test
public void testRandomSelect() {
    final RandomGenerator rg = RandomGeneratorFactory.createRandomGenerator(new Random(13));
    final int NUM_SAMPLES = 1000;
    final List<Integer> choices = Arrays.asList(-1,0,1);
    final List<Integer> result = IntStream.range(0, NUM_SAMPLES)
            .map(n -> MathUtils.randomSelect(choices, j -> j*j/2.0, rg))
            .boxed()
            .collect(Collectors.toList());
    Assert.assertEquals(result.stream().filter(n -> n==0).count(), 0);
    Assert.assertEquals(result.stream().filter(n -> n == 1).count(), NUM_SAMPLES / 2, 50);
}
 
Example #14
Source File: MathUtilsUnitTest.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
@Test
public void testRandomSelectFlatProbability() {
    final RandomGenerator rg = RandomGeneratorFactory.createRandomGenerator(new Random(13));
    final int NUM_SAMPLES = 1000;
    final List<Integer> choices = Arrays.asList(0,1,2);
    final List<Integer> result = IntStream.range(0, NUM_SAMPLES)
            .map(n -> MathUtils.randomSelect(choices, j -> 1.0 / choices.size(), rg))
            .boxed()
            .collect(Collectors.toList());
    Assert.assertEquals(result.stream().filter(n -> n == 0).count(), NUM_SAMPLES / choices.size(), 50);
}
 
Example #15
Source File: CoverageDropoutDetectorTest.java    From gatk-protected with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
private Object[][] getUnivariateGaussianTargetsWithDropout(final double sigma, final double dropoutRate) {
    Random rng = new Random(337);
    final RandomGenerator randomGenerator = RandomGeneratorFactory.createRandomGenerator(rng);
    NormalDistribution n = new NormalDistribution(randomGenerator, 1, sigma);
    final int numDataPoints = 10000;
    final int numEventPoints = 2000;

    // Randomly select dropoutRate of targets and reduce by 25%-75% (uniformly distributed)
    UniformRealDistribution uniformRealDistribution = new UniformRealDistribution(randomGenerator, 0, 1.0);
    final List<ReadCountRecord.SingleSampleRecord> targetList = new ArrayList<>();
    for (int i = 0; i < numDataPoints; i++){
        double coverage = n.sample() + (i < (numDataPoints - numEventPoints) ? 0.0 : 0.5);
        if (uniformRealDistribution.sample() < dropoutRate) {
            double multiplier = .25 + uniformRealDistribution.sample()/2;
            coverage = coverage * multiplier;
        }
        targetList.add(new ReadCountRecord.SingleSampleRecord(new Target("arbitrary_name", new SimpleInterval("chr1", 100 + 2*i, 101 + 2 * i)), coverage));
    }

    HashedListTargetCollection<ReadCountRecord.SingleSampleRecord> targets = new HashedListTargetCollection<>(targetList);

    List<ModeledSegment> segments = new ArrayList<>();
    segments.add(new ModeledSegment(new SimpleInterval("chr1", 100, 16050), 8000, 1));
    segments.add(new ModeledSegment(new SimpleInterval("chr1", 16100, 20200), 2000, 1.5));

    return new Object [] []{ {targets, segments}};
}
 
Example #16
Source File: AlleleFractionSegmenterUnitTest.java    From gatk-protected with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
@Test
public void testSegmentation() {
    final RandomGenerator rng = RandomGeneratorFactory.createRandomGenerator(new Random(563));

    final List<Double> trueWeights = Arrays.asList(0.2, 0.5, 0.3);
    final List<Double> trueMinorAlleleFractions = Arrays.asList(0.12, 0.32, 0.5);
    final double trueMemoryLength = 1e5;

    final AlleleFractionGlobalParameters trueParams = new AlleleFractionGlobalParameters(1.0, 0.01, 0.01);

    final AlleleFractionHMM trueModel = new AlleleFractionHMM(trueMinorAlleleFractions, trueWeights,
            trueMemoryLength, AllelicPanelOfNormals.EMPTY_PON, trueParams);

    // randomly set positions
    final int chainLength = 10000;
    final List<SimpleInterval> positions = CopyRatioSegmenterUnitTest.randomPositions("chr1", chainLength, rng, trueMemoryLength/4);
    final List<Integer> trueStates = trueModel.generateHiddenStateChain(positions);
    final List<Double> truthMinorFractions = trueStates.stream().map(trueModel::getMinorAlleleFraction).collect(Collectors.toList());
    final AllelicCountCollection counts = generateCounts(truthMinorFractions, positions, rng, trueParams);

    final AlleleFractionSegmenter segmenter = new AlleleFractionSegmenter(10, counts, AllelicPanelOfNormals.EMPTY_PON);
    final List<ModeledSegment> segments = segmenter.getModeledSegments();
    final double[] segmentMinorFractions = segments.stream()
            .flatMap(s -> Collections.nCopies((int) s.getTargetCount(), s.getSegmentMean()).stream())
            .mapToDouble(x->x).toArray();

    final double averageMinorFractionError = IntStream.range(0, truthMinorFractions.size())
            .mapToDouble(n -> Math.abs(segmentMinorFractions[n] - truthMinorFractions.get(n)))
            .average().getAsDouble();

    Assert.assertEquals(averageMinorFractionError, 0, 0.01);
}
 
Example #17
Source File: CopyRatioSegmenterUnitTest.java    From gatk-protected with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
@Test
public void testChromosomesOnDifferentSegments() {
    final RandomGenerator rng = RandomGeneratorFactory.createRandomGenerator(new Random(563));
    final double[] trueLog2CopyRatios = new double[] {-2.0, 0.0, 1.7};
    final double trueMemoryLength = 1e5;

    final double trueStandardDeviation = 0.2;

    // randomly set positions
    final int chainLength = 100;
    final List<SimpleInterval> positions = randomPositions("chr1", chainLength, rng, trueMemoryLength/4);
    positions.addAll(randomPositions("chr2", chainLength, rng, trueMemoryLength/4));
    positions.addAll(randomPositions("chr3", chainLength, rng, trueMemoryLength/4));

    final int trueState = 2;    //fix everything to the same state 2

    final List<Double> data = new ArrayList<>();
    for (int n = 0; n < positions.size(); n++) {
        final double copyRatio = trueLog2CopyRatios[trueState];
        final double observed = generateData(trueStandardDeviation, copyRatio, rng);
        data.add(observed);
    }

    final List<Target> targets = positions.stream().map(Target::new).collect(Collectors.toList());
    final ReadCountCollection rcc = new ReadCountCollection(targets, Arrays.asList("SAMPLE"), new Array2DRowRealMatrix(data.stream().mapToDouble(x->x).toArray()));
    final CopyRatioSegmenter segmenter = new CopyRatioSegmenter(10, rcc);
    final List<ModeledSegment> segments = segmenter.getModeledSegments();

    //check that each chromosome has at least one segment
    final int numDifferentContigsInSegments = (int) segments.stream().map(ModeledSegment::getContig).distinct().count();
    Assert.assertEquals(numDifferentContigsInSegments, 3);
}
 
Example #18
Source File: CopyRatioSegmenterUnitTest.java    From gatk-protected with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
@Test
public void testSegmentation() {
    final RandomGenerator rng = RandomGeneratorFactory.createRandomGenerator(new Random(563));

    final List<Double> trueWeights = Arrays.asList(0.2, 0.5, 0.3);
    final List<Double> trueLog2CopyRatios = Arrays.asList(-2.0, 0.0, 1.4);
    final double trueMemoryLength = 1e5;
    final double trueStandardDeviation = 0.2;

    final CopyRatioHMM trueModel = new CopyRatioHMM(trueLog2CopyRatios, trueWeights,
            trueMemoryLength, trueStandardDeviation);

    final int chainLength = 10000;
    final List<SimpleInterval> positions = randomPositions("chr1", chainLength, rng, trueMemoryLength/4);
    final List<Integer> trueStates = trueModel.generateHiddenStateChain(positions);
    final List<Double> trueLog2CopyRatioSequence = trueStates.stream().map(n -> trueLog2CopyRatios.get(n)).collect(Collectors.toList());

    final List<Double> data = trueLog2CopyRatioSequence.stream()
            .map(cr -> generateData(trueStandardDeviation, cr, rng)).collect(Collectors.toList());

    final List<Target> targets = positions.stream().map(Target::new).collect(Collectors.toList());
    final ReadCountCollection rcc = new ReadCountCollection(targets, Arrays.asList("SAMPLE"), new Array2DRowRealMatrix(data.stream().mapToDouble(x->x).toArray()));
    final CopyRatioSegmenter segmenter = new CopyRatioSegmenter(10, rcc);
    final List<ModeledSegment> segments = segmenter.getModeledSegments();

    final double[] segmentCopyRatios = segments.stream()
            .flatMap(s -> Collections.nCopies((int) s.getTargetCount(), s.getSegmentMeanInLog2CRSpace()).stream())
            .mapToDouble(x -> x).toArray();

    final double averageCopyRatioError = IntStream.range(0, trueLog2CopyRatioSequence.size())
            .mapToDouble(n -> Math.abs(segmentCopyRatios[n] - trueLog2CopyRatioSequence.get(n)))
            .average().getAsDouble();

    Assert.assertEquals(averageCopyRatioError, 0, 0.025);
}
 
Example #19
Source File: GATKProtectedMathUtilsTest.java    From gatk-protected with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
@Test
public void testRandomSelect() {
    final RandomGenerator rg = RandomGeneratorFactory.createRandomGenerator(new Random(13));
    final int NUM_SAMPLES = 1000;
    final List<Integer> choices = Arrays.asList(-1,0,1);
    final List<Integer> result = IntStream.range(0, NUM_SAMPLES)
            .map(n -> GATKProtectedMathUtils.randomSelect(choices, j -> j*j/2.0, rg))
            .boxed()
            .collect(Collectors.toList());
    Assert.assertEquals(result.stream().filter(n -> n==0).count(), 0);
    Assert.assertEquals(result.stream().filter(n -> n == 1).count(), NUM_SAMPLES / 2, 50);
}
 
Example #20
Source File: GATKProtectedMathUtilsTest.java    From gatk-protected with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
@Test
public void testRandomSelectFlatProbability() {
    final RandomGenerator rg = RandomGeneratorFactory.createRandomGenerator(new Random(13));
    final int NUM_SAMPLES = 1000;
    final List<Integer> choices = Arrays.asList(0,1,2);
    final List<Integer> result = IntStream.range(0, NUM_SAMPLES)
            .map(n -> GATKProtectedMathUtils.randomSelect(choices, j -> 1.0 / choices.size(), rg))
            .boxed()
            .collect(Collectors.toList());
    Assert.assertEquals(result.stream().filter(n -> n == 0).count(), NUM_SAMPLES / choices.size(), 50);
}
 
Example #21
Source File: CoverageDropoutDetector.java    From gatk-protected with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/** <p>Produces a Gaussian mixture model based on the difference between targets and segment means.</p>
 * <p>Filters targets to populations where more than the minProportion lie in a single segment.</p>
 * <p>Returns null if no pass filtering.  Please note that in these cases,
 * in the rest of this class, we use this to assume that a GMM is not a good model.</p>
 *
 * @param segments  -- segments with segment mean in log2 copy ratio space
 * @param targets -- targets with a log2 copy ratio estimate
 * @param minProportion -- minimum proportion of all targets that a given segment must have in order to be used
 *                      in the evaluation
 * @param numComponents -- number of components to use in the GMM.  Usually, this is 2.
 * @return  never {@code null}.  Fitting result with indications whether it converged or was even attempted.
 */
private MixtureMultivariateNormalFitResult retrieveGaussianMixtureModelForFilteredTargets(final List<ModeledSegment> segments,
                                                                                          final TargetCollection<ReadCountRecord.SingleSampleRecord> targets, double minProportion, int numComponents){

    // For each target in a segment that contains enough targets, normalize the difference against the segment mean
    //  and collapse the filtered targets into the copy ratio estimates.
    final List<Double> filteredTargetsSegDiff = getNumProbeFilteredTargetList(segments, targets, minProportion);

    if (filteredTargetsSegDiff.size() < numComponents) {
        return new MixtureMultivariateNormalFitResult(null, false, false);
    }

    // Assume that Apache Commons wants data points in the first dimension.
    // Note that second dimension of length 2 (instead of 1) is to wrok around funny Apache commons API.
    final double[][] filteredTargetsSegDiff2d = new double[filteredTargetsSegDiff.size()][2];

    // Convert the filtered targets into 2d array (even if second dimension is length 1).  The second dimension is
    //  uncorrelated Gaussian.  This is only to get around funny API in Apache Commons, which will throw an
    //  exception if the length of the second dimension is < 2
    final RandomGenerator rng = RandomGeneratorFactory.createRandomGenerator(new Random(RANDOM_SEED));
    final NormalDistribution nd = new NormalDistribution(rng, 0, .1);
    for (int i = 0; i < filteredTargetsSegDiff.size(); i++) {
        filteredTargetsSegDiff2d[i][0] = filteredTargetsSegDiff.get(i);
        filteredTargetsSegDiff2d[i][1] = nd.sample();
    }

    final MixtureMultivariateNormalDistribution estimateEM0 = MultivariateNormalMixtureExpectationMaximization.estimate(filteredTargetsSegDiff2d, numComponents);
    final MultivariateNormalMixtureExpectationMaximization multivariateNormalMixtureExpectationMaximization = new MultivariateNormalMixtureExpectationMaximization(filteredTargetsSegDiff2d);

    try {
        multivariateNormalMixtureExpectationMaximization.fit(estimateEM0);
    } catch (final MaxCountExceededException | ConvergenceException | SingularMatrixException e) {
        // We are done, we cannot make a fitting.  We should return a result that we attempted a fitting, but it
        //  did not converge.  Include the model as it was when the exception was thrown.
        return new MixtureMultivariateNormalFitResult(multivariateNormalMixtureExpectationMaximization.getFittedModel(), false, true);
    }
    return new MixtureMultivariateNormalFitResult(multivariateNormalMixtureExpectationMaximization.getFittedModel(), true, true);
}
 
Example #22
Source File: HMM.java    From gatk-protected with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
default List<S> generateHiddenStateChain(final List<T> positions) {
    final RandomGenerator rg = RandomGeneratorFactory.createRandomGenerator(new Random(RANDOM_SEED_FOR_CHAIN_GENERATION));
    final List<S> hiddenStates = hiddenStates();
    final List<S> result = new ArrayList<>(positions.size());

    final S initialState = GATKProtectedMathUtils.randomSelect(hiddenStates, s -> Math.exp(logPriorProbability(s, positions.get(0))), rg);
    result.add(initialState);

    IntStream.range(1, positions.size()).forEach(n ->
        result.add(GATKProtectedMathUtils.randomSelect(hiddenStates,
                s -> Math.exp(logTransitionProbability(result.get(n-1), positions.get(n - 1), s, positions.get(n))), rg)));
    return result;
}
 
Example #23
Source File: GamaRNG.java    From gama with GNU General Public License v3.0 4 votes vote down vote up
@Override
public void setSeed(int[] seed) {
	 setSeed(RandomGeneratorFactory.convertToLong(seed));
}
 
Example #24
Source File: KernelSegmenter.java    From gatk with BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
/**
 * Returns a list of the indices of the changepoints, either sorted by decreasing change to the global segmentation cost
 * or by increasing index order.
 * @param maxNumChangepoints                    maximum number of changepoints to return (first and last points do not count towards this number)
 * @param kernel                                kernel function used to calculate segment costs
 * @param kernelApproximationDimension          dimension of low-rank approximation to the kernel
 * @param windowSizes                           list of sizes to use for the flanking segments used to calculate local changepoint costs
 * @param numChangepointsPenaltyLinearFactor    factor A for penalty of the form A * C, where C is the number of changepoints
 * @param numChangepointsPenaltyLogLinearFactor factor B for penalty of the form B * C * log (N / C),
 *                                              where C is the number of changepoints and N is the number of data points
 * @param changepointSortOrder                  sort by decreasing change to the global segmentation cost or by increasing index order
 */
public List<Integer> findChangepoints(final int maxNumChangepoints,
                                      final BiFunction<DATA, DATA, Double> kernel,
                                      final int kernelApproximationDimension,
                                      final List<Integer> windowSizes,
                                      final double numChangepointsPenaltyLinearFactor,
                                      final double numChangepointsPenaltyLogLinearFactor,
                                      final ChangepointSortOrder changepointSortOrder) {
    ParamUtils.isPositiveOrZero(maxNumChangepoints, "Maximum number of changepoints must be non-negative.");
    ParamUtils.isPositive(kernelApproximationDimension, "Dimension of kernel approximation must be positive.");
    Utils.validateArg(!windowSizes.isEmpty(), "At least one window size must be provided.");
    Utils.validateArg(windowSizes.stream().allMatch(ws -> ws > 0), "Window sizes must all be positive.");
    Utils.validateArg(windowSizes.stream().distinct().count() == windowSizes.size(), "Window sizes must all be unique.");
    ParamUtils.isPositiveOrZero(numChangepointsPenaltyLinearFactor,
            "Linear factor for the penalty on the number of changepoints per chromosome must be non-negative.");
    ParamUtils.isPositiveOrZero(numChangepointsPenaltyLogLinearFactor,
            "Log-linear factor for the penalty on the number of changepoints per chromosome must be non-negative.");

    if (maxNumChangepoints == 0) {
        logger.warn("No changepoints were requested, returning an empty list...");
        return Collections.emptyList();
    }
    if (data.isEmpty()) {
        logger.warn("No data points were provided, returning an empty list...");
        return Collections.emptyList();
    }

    logger.debug(String.format("Finding up to %d changepoints in %d data points...", maxNumChangepoints, data.size()));
    final RandomGenerator rng = RandomGeneratorFactory.createRandomGenerator(new Random(RANDOM_SEED));

    logger.debug("Calculating low-rank approximation to kernel matrix...");
    final RealMatrix reducedObservationMatrix = calculateReducedObservationMatrix(rng, data, kernel, kernelApproximationDimension);
    final double[] kernelApproximationDiagonal = calculateKernelApproximationDiagonal(reducedObservationMatrix);

    logger.debug(String.format("Finding changepoint candidates for all window sizes %s...", windowSizes.toString()));
    final List<Integer> changepointCandidates = findChangepointCandidates(
            data, reducedObservationMatrix, kernelApproximationDiagonal, maxNumChangepoints, windowSizes);

    logger.debug("Performing backward model selection on changepoint candidates...");
    return selectChangepoints(
            changepointCandidates, maxNumChangepoints, numChangepointsPenaltyLinearFactor, numChangepointsPenaltyLogLinearFactor,
            reducedObservationMatrix, kernelApproximationDiagonal).stream()
            .sorted((a, b) -> changepointSortOrder.equals(ChangepointSortOrder.INDEX) ? Integer.compare(a, b) : 0)    //if BACKWARD_SELECTION, simply retain original order from backward model selection
            .collect(Collectors.toList());
}
 
Example #25
Source File: CoverageModelParameters.java    From gatk-protected with BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
/**
 * Generates random coverage model parameters.
 *
 * @param targetList list of targets
 * @param numLatents number of latent variables
 * @param seed random seed
 * @param randomMeanLogBiasStandardDeviation std of mean log bias (mean is set to 0)
 * @param randomBiasCovariatesStandardDeviation std of bias covariates (mean is set to 0)
 * @param randomMaxUnexplainedVariance max value of unexplained variance (samples are taken from a uniform
 *                                     distribution [0, {@code randomMaxUnexplainedVariance}])
 * @param initialBiasCovariatesARDCoefficients initial row vector of ARD coefficients
 * @return an instance of {@link CoverageModelParameters}
 */
public static CoverageModelParameters generateRandomModel(final List<Target> targetList,
                                                          final int numLatents,
                                                          final long seed,
                                                          final double randomMeanLogBiasStandardDeviation,
                                                          final double randomBiasCovariatesStandardDeviation,
                                                          final double randomMaxUnexplainedVariance,
                                                          final INDArray initialBiasCovariatesARDCoefficients) {
    Utils.validateArg(numLatents >= 0, "Dimension of the bias space must be non-negative");
    Utils.validateArg(randomBiasCovariatesStandardDeviation >= 0, "Standard deviation of random bias covariates" +
            " must be non-negative");
    Utils.validateArg(randomMeanLogBiasStandardDeviation >= 0, "Standard deviation of random mean log bias" +
            " must be non-negative");
    Utils.validateArg(randomMaxUnexplainedVariance >= 0, "Max random unexplained variance must be non-negative");
    Utils.validateArg(initialBiasCovariatesARDCoefficients == null || numLatents > 0 &&
            initialBiasCovariatesARDCoefficients.length() == numLatents, "If ARD is enabled, the dimension" +
            " of the bias latent space must be positive and match the length of ARD coeffecient vector");
    final boolean biasCovariatesEnabled = numLatents > 0;

    final int numTargets = targetList.size();
    final RandomGenerator rng = RandomGeneratorFactory.createRandomGenerator(new Random(seed));

    /* Gaussian random for mean log bias */
    final INDArray initialMeanLogBias = Nd4j.create(getNormalRandomNumbers(
            numTargets, 0, randomMeanLogBiasStandardDeviation, rng), new int[] {1, numTargets});

    /* Uniform random for unexplained variance */
    final INDArray initialUnexplainedVariance = Nd4j.create(getUniformRandomNumbers(
            numTargets, 0, randomMaxUnexplainedVariance, rng), new int[] {1, numTargets});

    final INDArray initialMeanBiasCovariates;

    if (biasCovariatesEnabled) {
        /* Gaussian random for bias covariates */
        initialMeanBiasCovariates = Nd4j.create(getNormalRandomNumbers(numTargets * numLatents, 0,
                randomBiasCovariatesStandardDeviation, rng), new int[]{numTargets, numLatents});
    } else {
        initialMeanBiasCovariates = null;
    }

    return new CoverageModelParameters(targetList, initialMeanLogBias, initialUnexplainedVariance,
            initialMeanBiasCovariates, initialBiasCovariatesARDCoefficients);
}
 
Example #26
Source File: MultidimensionalModellerUnitTest.java    From gatk with BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
@Test
public void testMCMC() {
    final int numSegments = 25;
    final int numSamples = 150;
    final int numBurnIn = 50;
    final RandomGenerator rng = RandomGeneratorFactory.createRandomGenerator(new Random(RANDOM_SEED));

    //copy-ratio model parameters
    final double varianceCR = 0.01;
    final double outlierProbabilityCR = 0.05;
    final double averageIntervalsPerSegment = 100.;

    //allele-fraction model parameters
    final double meanBiasAF = 1.2;
    final double biasVarianceAF = 0.04;
    final double outlierProbabilityAF = 0.02;
    final AlleleFractionGlobalParameters globalParametersAF = new AlleleFractionGlobalParameters(meanBiasAF, biasVarianceAF, outlierProbabilityAF);
    final double minorAlleleFractionPriorAlpha = 1.;
    final AlleleFractionPrior priorAF = new AlleleFractionPrior(minorAlleleFractionPriorAlpha);
    final double averageHetsPerSegment = 50.;
    final double averageDepthAF = 50.;

    //similar-segment merging parameters
    final int maxNumSmoothingIterations = 10;
    final int numSmoothingIterationsPerFit = 0;
    final double smoothingCredibleIntervalThresholdCopyRatio = 2.;
    final double smoothingCredibleIntervalThresholdAlleleFraction = 2.;

    //recall that both CR and AF data points are at loci 1, 2, 3, etc. and that each segment is on a different contig
    final SampleLocatableMetadata metadata = new SimpleSampleLocatableMetadata(
            "test-sample",
            new SAMSequenceDictionary(IntStream.range(0, numSegments)
                    .mapToObj(i -> new SAMSequenceRecord("chr" + i + 1, 10000))
                    .collect(Collectors.toList())));
    final CopyRatioSimulatedData simulatedDataCR = new CopyRatioSimulatedData(
            metadata, varianceCR, outlierProbabilityCR, numSegments, averageIntervalsPerSegment, rng);
    final AlleleFractionSimulatedData simulatedDataAF = new AlleleFractionSimulatedData(
            metadata, globalParametersAF, numSegments, averageHetsPerSegment, averageDepthAF, rng);

    //we introduce extra segments, which we will later merge to test similar-segment merging
    final MultidimensionalSegmentCollection oversegmentedSegments = new MultidimensionalSegmentCollection(
            metadata,
            constructOversegmentedSegments(simulatedDataCR, simulatedDataAF));

    final MultidimensionalModeller modeller = new MultidimensionalModeller(
            oversegmentedSegments,
            simulatedDataCR.getCopyRatios(),
            simulatedDataAF.getAllelicCounts(), priorAF,
            numSamples, numBurnIn, numSamples, numBurnIn);
    modeller.smoothSegments(maxNumSmoothingIterations, numSmoothingIterationsPerFit, smoothingCredibleIntervalThresholdCopyRatio, smoothingCredibleIntervalThresholdAlleleFraction);

    CopyRatioModellerUnitTest.assertCopyRatioPosteriorCenters(modeller.getCopyRatioModeller(), simulatedDataCR);
    AlleleFractionModellerUnitTest.assertAlleleFractionPosteriorCenters(modeller.getAlleleFractionModeller(), simulatedDataAF);
}
 
Example #27
Source File: SimpleCopyRatioCallerUnitTest.java    From gatk with BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
@Test
public void testMakeCalls() {
    final RandomGenerator rng = RandomGeneratorFactory.createRandomGenerator(new Random(RANDOM_SEED));
    final double segmentNoise = 0.05;
    final double intervalLog2Noise = 0.2;
    final List<Double> segmentCopyRatios = Arrays.asList(2., 3., 1., 1., 0.25, 1., 5., 1., 0., 0.5);
    final List<Integer> numIntervalsPerSegment = Arrays.asList(10, 5, 5, 100, 10, 10, 20, 10, 10, 5);
    final SampleLocatableMetadata metadata = new SimpleSampleLocatableMetadata(
            "test-sample",
            new SAMSequenceDictionary(IntStream.range(0, segmentCopyRatios.size())
                    .mapToObj(i -> new SAMSequenceRecord("chr" + i + 1, 1000))
                    .collect(Collectors.toList())));
    final List<CalledCopyRatioSegment.Call> expectedCalls = Arrays.asList(
            AMPLIFICATION, AMPLIFICATION, NEUTRAL, NEUTRAL, DELETION, NEUTRAL, AMPLIFICATION, NEUTRAL, DELETION, DELETION);

    final List<CopyRatioSegment> segments = new ArrayList<>();
    for (int segmentIndex = 0; segmentIndex < numIntervalsPerSegment.size(); segmentIndex++) {
        final String contig = "chr" + segmentIndex + 1;
        final List<CopyRatio> intervalLog2CopyRatiosInSegment = new ArrayList<>(numIntervalsPerSegment.size());
        for (int intervalIndex = 0; intervalIndex < numIntervalsPerSegment.get(segmentIndex); intervalIndex++) {
            final double log2CopyRatioValue = ParamUtils.log2(Math.max(EPSILON,
                    segmentCopyRatios.get(segmentIndex) + rng.nextGaussian() * segmentNoise)) + intervalLog2Noise * rng.nextGaussian();
            intervalLog2CopyRatiosInSegment.add(new CopyRatio(
                    new SimpleInterval(contig, intervalIndex + 1, intervalIndex + 1), log2CopyRatioValue));
        }
        segments.add(new CopyRatioSegment(
                new SimpleInterval(contig, 1, numIntervalsPerSegment.get(segmentIndex)),
                intervalLog2CopyRatiosInSegment));
    }
    final CopyRatioSegmentCollection copyRatioSegments = new CopyRatioSegmentCollection(metadata, segments);

    final CalledCopyRatioSegmentCollection calledCopyRatioSegments =
            new SimpleCopyRatioCaller(copyRatioSegments,
                    NEUTRAL_SEGMENT_COPY_RATIO_LOWER_BOUND, NEUTRAL_SEGMENT_COPY_RATIO_UPPER_BOUND,
                    OUTLIER_NEUTRAL_SEGMENT_COPY_RATIO_Z_SCORE_THRESHOLD, CALLING_COPY_RATIO_Z_SCORE_THRESHOLD)
                    .makeCalls();

    Assert.assertEquals(copyRatioSegments.getMetadata(), calledCopyRatioSegments.getMetadata());
    Assert.assertEquals(
            copyRatioSegments.getIntervals(), calledCopyRatioSegments.getIntervals());
    Assert.assertEquals(
            copyRatioSegments.getRecords().stream().map(CopyRatioSegment::getMeanLog2CopyRatio).collect(Collectors.toList()),
            calledCopyRatioSegments.getRecords().stream().map(CopyRatioSegment::getMeanLog2CopyRatio).collect(Collectors.toList()));
    Assert.assertEquals(
            calledCopyRatioSegments.getRecords().stream().map(CalledCopyRatioSegment::getCall).collect(Collectors.toList()),
            expectedCalls);
}
 
Example #28
Source File: ChainPrunerUnitTest.java    From gatk with BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
/**
 * Comprehensive test of adaptive pruning -- given an alt haplotype and a ref haplotype
 * @param kmerSize
 * @param ref           reference haplotype
 * @param alt           alt haplotype
 * @param altFraction   alt allele fraction to simulate somatic, mitochondrial etc variants
 * @param errorRate     substitution error rate of simulated reads
 * @param depthPerAlignmentStart    number of reads starting at each position.  Note that holding this constant yields
 *                                  low coverage at the beginning of the graph and high in the middle and end, simulating
 *                                  the leading edge of an exome target, for example
 * @param logOddsThreshold  log-10 odds threshold for pruning chains
 */
@Test(dataProvider = "chainPrunerData")
public void testAdaptivePruning(final int kmerSize, final byte[] ref, final byte[] alt, final double altFraction, final double errorRate, final int depthPerAlignmentStart, final double logOddsThreshold) {
    final RandomGenerator rng = RandomGeneratorFactory.createRandomGenerator(new Random(kmerSize + ref.hashCode() + alt.hashCode()));
    final ReadThreadingGraph graph = new ReadThreadingGraph(kmerSize);
    graph.addSequence(ref, true);
    final List<byte[]> reads = IntStream.range(0, ref.length)
            .mapToObj(start -> IntStream.range(0, depthPerAlignmentStart).mapToObj(n -> generateReadWithErrors(rng.nextDouble() < altFraction ? alt : ref, start, errorRate, rng)))
            .flatMap(s -> s).collect(Collectors.toList());

    reads.forEach(read -> graph.addSequence(read, false));


    // note: these are the steps in ReadThreadingAssembler::createGraph
    graph.buildGraphIfNecessary();

    final ChainPruner<MultiDeBruijnVertex, MultiSampleEdge> pruner = new AdaptiveChainPruner<>(0.001, logOddsThreshold, 50);
    pruner.pruneLowWeightChains(graph);

    final SmithWatermanAligner aligner = SmithWatermanJavaAligner.getInstance();
    graph.recoverDanglingTails(1, 3, false, aligner);
    graph.recoverDanglingHeads(1, 3, false, aligner);
    graph.removePathsNotConnectedToRef();

    final SeqGraph seqGraph = graph.toSequenceGraph();
    seqGraph.zipLinearChains();
    seqGraph.removeSingletonOrphanVertices();
    seqGraph.removeVerticesNotConnectedToRefRegardlessOfEdgeDirection();
    seqGraph.simplifyGraph();
    seqGraph.removePathsNotConnectedToRef();
    seqGraph.simplifyGraph();

    final List<KBestHaplotype<SeqVertex, BaseEdge>> bestPaths = new GraphBasedKBestHaplotypeFinder<>(seqGraph).findBestHaplotypes(10);

    final OptionalInt altIndex = IntStream.range(0, bestPaths.size()).filter(n -> bestPaths.get(n).haplotype().basesMatch(alt)).findFirst();
    Assert.assertTrue(altIndex.isPresent());

    // the haplotype score is the sum of the log-10 of all branching fractions, so the alt haplotype score should come out to
    // around the log-10 of the allele fraction up to some fudge factor, assumign we didn't do any dumb pruning
    Assert.assertEquals(bestPaths.get(altIndex.getAsInt()).score(), Math.log10(altFraction), 0.5);
    Assert.assertTrue(bestPaths.size() < 15);
}