htsjdk.samtools.reference.IndexedFastaSequenceFile Java Examples

The following examples show how to use htsjdk.samtools.reference.IndexedFastaSequenceFile. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: BwaMemIntegrationTest.java    From gatk with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
@Test
public void testPerfectUnpairedMapping() throws Exception {
    final Random rdn = new Random(13);
    final IndexedFastaSequenceFile fasta = new IndexedFastaSequenceFile(fastaFile, fastaIndex);
    final List<SAMRecord> inputReads = ReadTestUtils.randomErrorFreeUnpairedReads(rdn, TEST_DICTIONARY, fasta,
            "read-", 0, 1000, 50, 200);
    final BwaMemAligner aligner = new BwaMemAligner(index);
    final List<List<BwaMemAlignment>> alignments = aligner.alignSeqs(inputReads, SAMRecord::getReadBases);
    Assert.assertEquals(alignments.size(), inputReads.size());
    for (int i = 0; i < alignments.size(); i++) {
        final List<BwaMemAlignment> blocks = alignments.get(i);
        final SAMRecord inputRead  = inputReads.get(i);
        Assert.assertNotNull(blocks);
        Assert.assertEquals(blocks.size(), 1);
        assertPerfectAlignmentMatch(inputRead, blocks.get(0));
    }
}
 
Example #2
Source File: SageApplication.java    From hmftools with GNU General Public License v3.0 6 votes vote down vote up
private SageApplication(final Options options, final String... args) throws IOException, ParseException {
    final VersionInfo version = new VersionInfo("sage.version");
    LOGGER.info("SAGE version: {}", version.version());

    final CommandLine cmd = createCommandLine(args, options);
    this.config = SageConfig.createConfig(version.version(), cmd);

    hotspots = readHotspots();
    panel = panelWithHotspots(hotspots);
    highConfidence = readPanel(config.highConfidenceBed());

    final ThreadFactory namedThreadFactory = new ThreadFactoryBuilder().setNameFormat("SAGE-%d").build();
    executorService = Executors.newFixedThreadPool(config.threads(), namedThreadFactory);
    refGenome = new IndexedFastaSequenceFile(new File(config.refGenome()));

    vcf = new SageVCF(refGenome, config);
    LOGGER.info("Writing to file: {}", config.outputFile());
}
 
Example #3
Source File: VariantContextEnrichmentPurple.java    From hmftools with GNU General Public License v3.0 6 votes vote down vote up
public VariantContextEnrichmentPurple(boolean hotspotEnabled, double clonalityMaxPloidy, double clonalityBinWidth,
        @NotNull final String purpleVersion, @NotNull final String tumorSample, @NotNull final IndexedFastaSequenceFile reference,
        @NotNull final PurityAdjuster purityAdjuster, @NotNull final List<PurpleCopyNumber> copyNumbers,
        @NotNull final List<FittedRegion> fittedRegions, @NotNull final List<PeakModel> peakModel, @NotNull final String hotspots,
        @NotNull final Consumer<VariantContext> consumer) throws IOException {
    subclonalLikelihoodEnrichment = new SubclonalLikelihoodEnrichment(clonalityMaxPloidy, clonalityBinWidth, peakModel, consumer);
    purityEnrichment =
            new PurityEnrichment(purpleVersion, tumorSample, purityAdjuster, copyNumbers, fittedRegions, subclonalLikelihoodEnrichment);
    kataegisEnrichment = new KataegisEnrichment(purityEnrichment);
    somaticRefContextEnrichment = new SomaticRefContextEnrichment(reference, kataegisEnrichment);
    if (hotspotEnabled) {
        hotspotEnrichment = new VariantHotspotEnrichment(readFromVCF(hotspots), somaticRefContextEnrichment);
    } else {
        hotspotEnrichment = VariantContextEnrichmentFactory.noEnrichment().create(somaticRefContextEnrichment);
    }
}
 
Example #4
Source File: KmerGenerator.java    From hmftools with GNU General Public License v3.0 6 votes vote down vote up
public KmerGenerator(final String refGenomeFile, final String kmerInputFile, final String outputDir)
{
    try
    {
        IndexedFastaSequenceFile refGenome =
                new IndexedFastaSequenceFile(new File(refGenomeFile));
        mRefGenome = new RefGenomeSource(refGenome);
    }
    catch(IOException e)
    {
        LOGGER.error("failed to load ref genome: {}", e.toString());
    }

    mKmerInputFile = kmerInputFile;
    mOutputDir = outputDir;
}
 
Example #5
Source File: SomaticRefContextEnrichment.java    From hmftools with GNU General Public License v3.0 6 votes vote down vote up
@NotNull
static Pair<Integer, String> relativePositionAndRef(@NotNull final IndexedFastaSequenceFile reference, @NotNull final VariantContext variant) {
    final int refLength = variant.getReference().getBaseString().length();
    @Nullable
    final SAMSequenceRecord samSequenceRecord = reference.getSequenceDictionary().getSequence(variant.getContig());
    if (samSequenceRecord == null) {
        LOGGER.warn("Unable to locate contig {} in ref genome", variant.getContig());
        return new Pair<>(-1, Strings.EMPTY);
    }

    final int chromosomeLength = samSequenceRecord.getSequenceLength();
    long positionBeforeEvent = variant.getStart();

    long start = Math.max(positionBeforeEvent - 100, 1);
    long end = Math.min(positionBeforeEvent + refLength + 100 - 1, chromosomeLength - 1);
    int relativePosition = (int) (positionBeforeEvent - start);
    final String sequence;
    if (start < chromosomeLength && end < chromosomeLength) {
        sequence = reference.getSubsequenceAt(variant.getContig(), start, end).getBaseString();
    } else {
        sequence = Strings.EMPTY;
        LOGGER.warn("Requested base sequence outside of chromosome region!");
    }
    return new Pair<>(relativePosition, sequence);
}
 
Example #6
Source File: ReferenceResource.java    From VarDictJava with MIT License 6 votes vote down vote up
/**
 * Method convert fasta data to array contains header and nucleotide bases.
 * @param fasta path to fasta file
 * @param chr chromosome name of region
 * @param start start position of region
 * @param end end position of region
 * @return array of nucleotide bases in the region of fasta
 */
public String[] retrieveSubSeq(String fasta, String chr, int start, int end) {
    try {
        IndexedFastaSequenceFile idx = fetchFasta(fasta);
        ReferenceSequence seq = idx.getSubsequenceAt(chr, start, end);
        byte[] bases = seq.getBases();
        return new String[]{">" + chr + ":" + start + "-" + end, bases != null ? new String(bases) : ""};
    } catch (SAMException e){
        if (UNABLE_FIND_CONTIG.matcher(e.getMessage()).find()){
            throw new WrongFastaOrBamException(chr, e);
        } else if (WRONG_START_OR_END.matcher(e.getMessage()).find()){
            throw new RegionBoundariesException(chr, start, end, e);
        } else {
            throw e;
        }
    }
}
 
Example #7
Source File: GATKToolUnitTest.java    From gatk with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
@Test
public void testBestSequenceDictionary_fromReadsAndReference() throws Exception {
    final GATKTool tool = new TestGATKToolWithReads();
    final CommandLineParser clp = new CommandLineArgumentParser(tool);
    final File bamFile = new File(publicTestDir + "org/broadinstitute/hellbender/engine/reads_data_source_test1.bam");
    final String fastaFile =   hg19MiniReference;
    final String[] args = {"-I", bamFile.getCanonicalPath(),
                           "-R", fastaFile};
    clp.parseArguments(System.out, args);
    tool.onStartup();
    //read the dict back in and compare to reference dict
    final SAMSequenceDictionary toolDict = tool.getBestAvailableSequenceDictionary();
    final SAMSequenceDictionary fastaDict = new IndexedFastaSequenceFile(new File(fastaFile)).getSequenceDictionary();
    toolDict.assertSameDictionary(fastaDict);
    fastaDict.assertSameDictionary(toolDict);

    Assert.assertEquals(toolDict, fastaDict);
}
 
Example #8
Source File: RefGenomeEnrichedSomaticVariantFactory.java    From hmftools with GNU General Public License v3.0 6 votes vote down vote up
@NotNull
private static Pair<Integer, String> relativePositionAndRef(@NotNull final SomaticVariant variant,
        @NotNull final IndexedFastaSequenceFile reference) {
    final int chromosomeLength = reference.getSequenceDictionary().getSequence(variant.chromosome()).getSequenceLength();
    long positionBeforeEvent = variant.position();

    long start = Math.max(positionBeforeEvent - 100, 1);
    long end = Math.min(positionBeforeEvent + variant.ref().length() + 100 - 1, chromosomeLength - 1);
    int relativePosition = (int) (positionBeforeEvent - start);
    final String sequence;
    if (start < chromosomeLength && end < chromosomeLength) {
        sequence = reference.getSubsequenceAt(variant.chromosome(), start, end).getBaseString();
    } else {
        sequence = Strings.EMPTY;
        LOGGER.warn("Requested base sequence outside of chromosome region!");
    }
    return new Pair<>(relativePosition, sequence);
}
 
Example #9
Source File: CachingIndexedFastaSequenceFileUnitTest.java    From gatk with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
@Test
public void testMixedCasesInExample() throws IOException {
    try(final IndexedFastaSequenceFile original = new IndexedFastaSequenceFile(new File(exampleFASTA));
        final CachingIndexedFastaSequenceFile casePreserving = new CachingIndexedFastaSequenceFile(IOUtils.getPath(exampleFASTA), true);
        final CachingIndexedFastaSequenceFile allUpper = new CachingIndexedFastaSequenceFile(IOUtils.getPath(exampleFASTA), CachingIndexedFastaSequenceFile.DEFAULT_CACHE_SIZE, false, true);
    ) {

        int nMixedCase = 0;
        for (SAMSequenceRecord contig : original.getSequenceDictionary().getSequences()) {
            nMixedCase += mixedCasesTestHelper(original, casePreserving, allUpper, contig.getSequenceName(), -1, -1);

            final int step = 100;
            for (int lastPos = step; lastPos < contig.getSequenceLength(); lastPos += step) {
                mixedCasesTestHelper(original, casePreserving, allUpper, contig.getSequenceName(), lastPos - step, lastPos);
            }
        }


        Assert.assertTrue(nMixedCase > 0, "No mixed cases sequences found in file.  Unexpected test state");
    }
}
 
Example #10
Source File: PurpleStructuralVariantSupplier.java    From hmftools with GNU General Public License v3.0 6 votes vote down vote up
void write(@NotNull final PurityAdjuster purityAdjuster, @NotNull final List<PurpleCopyNumber> copyNumbers) throws IOException {
    if (header.isPresent()) {
        try (final IndexedFastaSequenceFile indexedFastaSequenceFile = new IndexedFastaSequenceFile(new File(refGenomePath));
                final VariantContextWriter writer = new VariantContextWriterBuilder().setOutputFile(outputVCF)
                        .setReferenceDictionary(header.get().getSequenceDictionary())
                        .setIndexCreator(new TabixIndexCreator(header.get().getSequenceDictionary(), new TabixFormat()))
                        .setOutputFileType(VariantContextWriterBuilder.OutputType.BLOCK_COMPRESSED_VCF)
                        .setOption(Options.ALLOW_MISSING_FIELDS_IN_HEADER)
                        .build()) {

            final StructuralRefContextEnrichment refEnricher =
                    new StructuralRefContextEnrichment(indexedFastaSequenceFile, writer::add);
            writer.writeHeader(refEnricher.enrichHeader(header.get()));

            enriched(purityAdjuster, copyNumbers).forEach(refEnricher);
            refEnricher.flush();
        }
    }
}
 
Example #11
Source File: SageVCF.java    From hmftools with GNU General Public License v3.0 5 votes vote down vote up
public SageVCF(@NotNull final IndexedFastaSequenceFile reference, @NotNull final SageConfig config) {
    writer = new VariantContextWriterBuilder().setOutputFile(config.outputFile())
            .modifyOption(Options.INDEX_ON_THE_FLY, true)
            .modifyOption(Options.USE_ASYNC_IO, false)
            .setReferenceDictionary(reference.getSequenceDictionary())
            .build();
    refContextEnrichment = new SomaticRefContextEnrichment(reference, this::writeToFile);

    final VCFHeader header = refContextEnrichment.enrichHeader(header(config));
    header.setSequenceDictionary(reference.getSequenceDictionary());
    writer.writeHeader(header);
}
 
Example #12
Source File: HaplotypeMap.java    From picard with MIT License 5 votes vote down vote up
public void writeAsVcf(final File output, final File refFile) throws FileNotFoundException {
    ReferenceSequenceFile ref = new IndexedFastaSequenceFile(refFile);
    try (VariantContextWriter writer = new VariantContextWriterBuilder()
            .setOutputFile(output)
            .setReferenceDictionary(ref.getSequenceDictionary())
            .build()) {

        final VCFHeader vcfHeader = new VCFHeader(
                VCFUtils.withUpdatedContigsAsLines(Collections.emptySet(), refFile, header.getSequenceDictionary(), false),
                Collections.singleton(HET_GENOTYPE_FOR_PHASING));

        VCFUtils.withUpdatedContigsAsLines(Collections.emptySet(), refFile, header.getSequenceDictionary(), false);

        vcfHeader.addMetaDataLine(new VCFHeaderLine(VCFHeaderVersion.VCF4_2.getFormatString(), VCFHeaderVersion.VCF4_2.getVersionString()));
        vcfHeader.addMetaDataLine(new VCFInfoHeaderLine(VCFConstants.ALLELE_FREQUENCY_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Allele Frequency, for each ALT allele, in the same order as listed"));
        vcfHeader.addMetaDataLine(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype"));
        vcfHeader.addMetaDataLine(new VCFFormatHeaderLine(VCFConstants.PHASE_SET_KEY, 1, VCFHeaderLineType.String, "Phase-set identifier for phased genotypes."));
        vcfHeader.addMetaDataLine(new VCFHeaderLine(VCFHeader.SOURCE_KEY,"HaplotypeMap::writeAsVcf"));
        vcfHeader.addMetaDataLine(new VCFHeaderLine("reference","HaplotypeMap::writeAsVcf"));


      //  vcfHeader.addMetaDataLine(new VCFHeaderLine());
        writer.writeHeader(vcfHeader);
        final LinkedList<VariantContext> variants = new LinkedList<>(this.asVcf(ref));
        variants.sort(vcfHeader.getVCFRecordComparator());
        variants.forEach(writer::add);
    }
}
 
Example #13
Source File: ReferenceResource.java    From VarDictJava with MIT License 5 votes vote down vote up
synchronized private IndexedFastaSequenceFile fetchFasta(String file) {
    return threadLocalFastaFiles.get().computeIfAbsent(
            file,
            (f) -> {
                try {
                    return new IndexedFastaSequenceFile(new File((f)));
                } catch (FileNotFoundException e) {
                    throw new IllegalArgumentException("Couldn't open reference file: " + f, e);
                }
            }
    );
}
 
Example #14
Source File: IntervalUtilsUnitTest.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
@DataProvider(name="invalidIntervalTestData")
public Object[][] invalidIntervalDataProvider() throws Exception {
    File fastaFile = new File(publicTestDir + "exampleFASTA.fasta");
    GenomeLocParser genomeLocParser = new GenomeLocParser(new IndexedFastaSequenceFile(fastaFile));

    return new Object[][] {
            new Object[] {genomeLocParser, "1", 10000000, 20000000},
            new Object[] {genomeLocParser, "2", 1, 2},
            new Object[] {genomeLocParser, "1", -1, 50}
    };
}
 
Example #15
Source File: IntervalUtilsUnitTest.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
@DataProvider(name="negativeOneLengthIntervalTestData")
public Object[][] negativeOneLengthIntervalDataProvider() throws Exception {
    File fastaFile = new File(publicTestDir + "exampleFASTA.fasta");
    GenomeLocParser genomeLocParser = new GenomeLocParser(new IndexedFastaSequenceFile(fastaFile));

    return new Object[][] {
            new Object[] {genomeLocParser, "1", 2, 1},
            new Object[] {genomeLocParser, "1", 500, 499},
            new Object[] {genomeLocParser, "1", 1000, 999}
    };
}
 
Example #16
Source File: Utils.java    From cramtools with Apache License 2.0 5 votes vote down vote up
public static IndexedFastaSequenceFile createIndexedFastaSequenceFile(File file) throws RuntimeException,
		FileNotFoundException {
	if (IndexedFastaSequenceFile.canCreateIndexedFastaReader(file)) {
		IndexedFastaSequenceFile ifsFile = new IndexedFastaSequenceFile(file);

		return ifsFile;
	} else
		throw new RuntimeException(
				"Reference fasta file is not indexed or index file not found. Try executing 'samtools faidx "
						+ file.getAbsolutePath() + "'");
}
 
Example #17
Source File: ChromosomePipeline.java    From hmftools with GNU General Public License v3.0 5 votes vote down vote up
public ChromosomePipeline(@NotNull final String chromosome, @NotNull final SageConfig config, @NotNull final Executor executor,
        @NotNull final List<VariantHotspot> hotspots, @NotNull final List<GenomeRegion> panelRegions,
        @NotNull final List<GenomeRegion> highConfidenceRegions, final Map<String, QualityRecalibrationMap> qualityRecalibrationMap,
        final Consumer<VariantContext> consumer)
        throws IOException {
    this.chromosome = chromosome;
    this.config = config;
    this.refGenome = new IndexedFastaSequenceFile(new File(config.refGenome()));
    this.consumer = consumer;
    this.sageVariantPipeline =
            new SomaticPipeline(config, executor, refGenome, hotspots, panelRegions, highConfidenceRegions, qualityRecalibrationMap);
}
 
Example #18
Source File: RefGenomeEnrichedSomaticVariantFactory.java    From hmftools with GNU General Public License v3.0 5 votes vote down vote up
private static void addTrinucleotideContext(@NotNull final ImmutableEnrichedSomaticVariant.Builder builder,
        @NotNull final SomaticVariant variant, @NotNull IndexedFastaSequenceFile reference) {
    final int chromosomeLength = reference.getSequenceDictionary().getSequence(variant.chromosome()).getSequenceLength();
    if (variant.position() < chromosomeLength) {
        final ReferenceSequence sequence =
                reference.getSubsequenceAt(variant.chromosome(), Math.max(1, variant.position() - 1), variant.position() + 1);
        builder.trinucleotideContext(sequence.getBaseString());
    } else {
        LOGGER.warn("Requested ref sequence beyond contig length! variant = {}", variant);
    }
}
 
Example #19
Source File: ArtificialBAMBuilder.java    From gatk with BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
public IndexedFastaSequenceFile getReference() {
    return reference;
}
 
Example #20
Source File: TestRandomAccess.java    From cramtools with Apache License 2.0 4 votes vote down vote up
public TestRandomAccess(File file1, File file2, String seq) throws FileNotFoundException {
	this.seq = seq;
	f1 = new BGZF_ReferenceSequenceFile(file1);
	f2 = new IndexedFastaSequenceFile(file2);
}
 
Example #21
Source File: SomaticStream.java    From hmftools with GNU General Public License v3.0 4 votes vote down vote up
public void processAndWrite(@NotNull final PurityAdjuster purityAdjuster, @NotNull final List<PurpleCopyNumber> copyNumbers,
        @NotNull final List<FittedRegion> fittedRegions, @NotNull final List<PeakModel> somaticPeaks) throws IOException {
    final Consumer<VariantContext> driverConsumer =
            x -> somaticVariantFactory.createVariant(commonConfig.tumorSample(), x).ifPresent(somatic -> {
                tumorMutationalLoad.accept(somatic);
                drivers.add(somatic);
            });

    if (enabled) {
        try (IndexedFastaSequenceFile indexedFastaSequenceFile = new IndexedFastaSequenceFile(new File(refGenomeData.refGenome()));
                VCFFileReader vcfReader = new VCFFileReader(new File(inputVCF), false);
                VariantContextWriter writer = new VariantContextWriterBuilder().setOutputFile(outputVCF)
                        .setOption(htsjdk.variant.variantcontext.writer.Options.ALLOW_MISSING_FIELDS_IN_HEADER)
                        .build()) {

            final Consumer<VariantContext> consumer = microsatelliteIndels.andThen(writer::add).andThen(driverConsumer);

            final VariantContextEnrichmentPurple enricher = new VariantContextEnrichmentPurple(driverCatalogConfig.enabled(),
                    somaticConfig.clonalityMaxPloidy(),
                    somaticConfig.clonalityBinWidth(),
                    commonConfig.version(),
                    commonConfig.tumorSample(),
                    indexedFastaSequenceFile,
                    purityAdjuster,
                    copyNumbers,
                    fittedRegions,
                    somaticPeaks,
                    driverCatalogConfig.hotspots(),
                    consumer);

            final VCFHeader header = enricher.enrichHeader(vcfReader.getFileHeader());
            writer.writeHeader(header);

            for (VariantContext context : vcfReader) {
                enricher.accept(context);
            }

            enricher.flush();

        }
    }
}
 
Example #22
Source File: ReadTestUtils.java    From gatk with BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
/**
 * Returns a list of unpaired reads perfectly matched against a contiguous random location in the
 * reference (non-chimeric reads).
 * <p>
 *     Output reads might map to the forward or reverse strand with equal provability and
 *     they are totally contain in a reference contig (no overhangs).
 * </p>
 * <p>
 *     The resulting reads won't have read-group.
 * </p>
 * <p>
 *     Their names will be result of applying a serial number after the prefix provided.
 * </p>
 * <p>
 *     The resulting read record will be mapped to their original location with the corresponding
 *     single match operation cigar, and their flag values will be those of an unpaired, primary alignment record.
 * </p>
 * <p>
 *     The mapping quality is set to the unknown value {@link SAMRecord#UNKNOWN_MAPPING_QUALITY},
 *     as the reads actually have not been aligned. Also the output reads will lack base qualities.
 * </p>
 * @param rdn random number generator to be used, must not be {@code null}.
 * @param dict reference sequence dictionary, it must match the reference contents in {@code reference}.
 * @param reference reference where the read sequences and locations are based on, must not be {@code null}.
 * @param numberOfReads number of reads to produce, must be 0 or greater.
 * @param namePrefix name prefix for all reads, must not be null.
 * @param serialStart name serial start value, must be 0 or greater.
 * @param minLength minimum read length, must be 1 or greater.
 * @param maxLength maximum read length, must be the same as {@code minLength} or greater.
 * @throws IllegalArgumentException if any of the input parameters do not comply with its requirements.
 * @return never {@code null}, and contains no {@code null} values. It can be further modified by the calling code.
 */
public static List<SAMRecord> randomErrorFreeUnpairedReads(final Random rdn, final SAMSequenceDictionary dict, final IndexedFastaSequenceFile reference,
                                                         final String namePrefix, final int serialStart,
                                                         final int numberOfReads, final int minLength, final int maxLength) {
    Utils.nonNull(rdn);
    Utils.nonNull(reference);
    Utils.nonNull(namePrefix);
    Utils.nonNull(dict);
    ParamUtils.isPositiveOrZero(serialStart, "serial start");
    ParamUtils.isPositiveOrZero(numberOfReads, "number of reads");
    ParamUtils.isPositive(minLength, "min read length");
    ParamUtils.isPositive(maxLength, "max read length");
    ParamUtils.inRange(maxLength, minLength, Integer.MAX_VALUE, "max read length must be greater or equal to min length");
    final List<SAMRecord> result = new ArrayList<>();
    final SAMFileHeader header = new SAMFileHeader(dict);
    final int numberOfDigitsInSerial = (int) Math.ceil(Math.log10(numberOfReads + serialStart - 1)) + 1;
    final String nameFormat = namePrefix + "%0" + numberOfDigitsInSerial + 'd';
    for (int i = 0; i < numberOfReads; i++) {
        final int contigIndex = rdn.nextInt(dict.size());
        final int seqLength = rdn.nextInt(maxLength - minLength) + minLength;
        final boolean reverse = rdn.nextBoolean();
        final int offset = rdn.nextInt(dict.getSequence(contigIndex).getSequenceLength() - seqLength);
        final int start = offset + 1;
        final byte[] bases = reference.getSubsequenceAt(dict.getSequence(contigIndex).getSequenceName(), start, start + seqLength - 1).getBases();
        if (reverse) {
            SequenceUtil.reverseComplement(bases);
        }
        final SAMRecord record = new SAMRecord(header);
        record.setReadName(String.format(nameFormat, serialStart + i));
        record.setReadNegativeStrandFlag(reverse);
        record.setReadBases(bases);
        record.setReadUnmappedFlag(false);
        record.setReferenceIndex(contigIndex);
        record.setAlignmentStart(start);
        record.setReadPairedFlag(false);
        record.setCigar(new Cigar(Collections.singletonList(new CigarElement(seqLength, CigarOperator.M))));
        record.setMappingQuality(SAMRecord.UNKNOWN_MAPPING_QUALITY);
        result.add(record);
    }
    return result;
}
 
Example #23
Source File: RefGenomeData.java    From hmftools with GNU General Public License v3.0 4 votes vote down vote up
@NotNull
static RefGenomeData createRefGenomeConfig(@NotNull CommandLine cmd) throws ParseException, IOException {

    if (!cmd.hasOption(REF_GENOME)) {
        throw new ParseException(REF_GENOME + " is a mandatory argument");
    }

    final String refGenomePath = cmd.getOptionValue(REF_GENOME);
    final Map<Chromosome, GenomePosition> lengthPositions;
    try (final IndexedFastaSequenceFile indexedFastaSequenceFile = new IndexedFastaSequenceFile(new File(refGenomePath))) {
        SAMSequenceDictionary sequenceDictionary = indexedFastaSequenceFile.getSequenceDictionary();
        if (sequenceDictionary == null) {
            throw new ParseException("Supplied ref genome must have associated sequence dictionary");
        }

        lengthPositions = fromLengths(ChromosomeLengthFactory.create(indexedFastaSequenceFile.getSequenceDictionary()));
    }

    final GenomePosition chr1Length = lengthPositions.get(HumanChromosome._1);
    final RefGenome refGenome;
    if (chr1Length != null && chr1Length.position() == RefGenome.HG38.lengths().get(HumanChromosome._1)) {
        refGenome = RefGenome.HG38;
    } else {
        refGenome = RefGenome.HG19;
    }
    LOGGER.info("Using ref genome: {}", refGenome);

    final Map<Chromosome, String> contigMap =
            lengthPositions.entrySet().stream().collect(Collectors.toMap(Map.Entry::getKey, x -> x.getValue().chromosome()));

    final List<HmfTranscriptRegion> rawGenePanel =
            refGenome == RefGenome.HG38 ? HmfGenePanelSupplier.allGeneList38() : HmfGenePanelSupplier.allGeneList37();

    final Function<String, String> chromosomeToContig = s -> contigMap.get(HumanChromosome.fromString(s));

    final List<HmfTranscriptRegion> genePanel = rawGenePanel.stream()
            .filter(x -> HumanChromosome.contains(x.chromosome()) && contigMap.containsKey(HumanChromosome.fromString(x.chromosome())))
            .map(x -> updateContig(x, chromosomeToContig))
            .collect(Collectors.toList());

    return ImmutableRefGenomeData.builder()
            .length(toPosition(refGenome.lengths(), contigMap))
            .centromere(toPosition(refGenome.centromeres(), contigMap))
            .refGenome(refGenomePath)
            .isHg38(refGenome.equals(RefGenome.HG38))
            .genePanel(genePanel)
            .build();
}
 
Example #24
Source File: EnrichedSomaticVariantFactory.java    From hmftools with GNU General Public License v3.0 4 votes vote down vote up
public EnrichedSomaticVariantFactory(@NotNull final IndexedFastaSequenceFile reference) {
    super(reference);
}
 
Example #25
Source File: RefGenomeEnrichedSomaticVariantFactory.java    From hmftools with GNU General Public License v3.0 4 votes vote down vote up
RefGenomeEnrichedSomaticVariantFactory(@NotNull final IndexedFastaSequenceFile reference) {
    this.reference = reference;
}
 
Example #26
Source File: BwaMemIntegrationTest.java    From gatk with BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
@Test
public void testChimericUnpairedMapping() throws Exception {
    final Random rdn = new Random(13);
    final IndexedFastaSequenceFile fasta = new IndexedFastaSequenceFile(fastaFile, fastaIndex);
    final List<SAMRecord> inputRecords = ReadTestUtils.randomErrorFreeUnpairedReads(rdn, TEST_DICTIONARY, fasta,
            "read-", 0, 1000 * 2, 100, 200);
    final List<List<SAMRecord>> chimerics = new ArrayList<>();
    for (int i = 0; i < inputRecords.size(); i += 2) {
        chimerics.add(Arrays.asList(inputRecords.get(i), inputRecords.get(i + 1)));
    }
    final BwaMemAligner aligner = new BwaMemAligner(index);
    final List<List<BwaMemAlignment>> alignments = aligner.alignSeqs(chimerics,
            ll -> Utils.concat(ll.get(0).getReadBases(), ll.get(1).getReadBases()));
    Assert.assertEquals(alignments.size(), chimerics.size());
    for (int i = 0; i < alignments.size(); i++) {
        final List<BwaMemAlignment> blocks = alignments.get(i);
        final List<SAMRecord> chimeric = chimerics.get(i);
        Assert.assertNotNull(blocks);
        Assert.assertTrue(blocks.size() <= 2);
        if (blocks.size() == 1) { // in occasion they record pair may be close enough and have the same orientation
                                  // resulting in a single record... we only perform simple checks in those few cases:
            Assert.assertEquals(chimeric.get(0).getReferenceIndex(), chimeric.get(1).getReferenceIndex());
            Assert.assertEquals(chimeric.get(0).getReadNegativeStrandFlag(), chimeric.get(1).getReadNegativeStrandFlag());
            Assert.assertEquals(blocks.get(0).getRefId(), chimeric.get(0).getReferenceIndex().intValue());
            final int maxDistance = chimeric.get(0).getReadLength() + chimeric.get(1).getReadLength() + 100;
            Assert.assertTrue(Math.abs(blocks.get(0).getRefStart() - chimeric.get(0).getAlignmentStart()) < maxDistance);
            Assert.assertTrue(Math.abs(blocks.get(0).getRefStart() - chimeric.get(0).getAlignmentStart()) < maxDistance);
        } else {
            BwaMemAlignment first  = blocks.get(0);
            BwaMemAlignment second = blocks.get(1);
            // We need to match the output alignments pair elements with the input records pair elements.
            // The test bellow is not bullet proof but is quite unlikely that it will fail to spot
            // a case in where the switch is needed:
            if ((first.getRefId() == second.getRefId()
                        && Math.abs(first.getRefStart() - chimeric.get(0).getStart()) > Math.abs(first.getRefStart() - chimeric.get(1).getStart()))
                    ||  (first.getRefId() != second.getRefId()
                        && second.getRefId() == TEST_DICTIONARY.getSequenceIndex(chimeric.get(0).getContig()))) {
               first = second;
               second = blocks.get(0);
            }
            assertChimericAlignmentMatch(chimeric.get(0), first);
            assertChimericAlignmentMatch(chimeric.get(1), second);
            // one and only one is a supplementary alignment:
            Assert.assertNotEquals(SAMFlag.SUPPLEMENTARY_ALIGNMENT.isSet(first.getSamFlag()),
                    SAMFlag.SUPPLEMENTARY_ALIGNMENT.isSet(second.getSamFlag()));
        }
    }
}
 
Example #27
Source File: BamCountReader.java    From hmftools with GNU General Public License v3.0 4 votes vote down vote up
void initialise(final String refGenomeFile, IndexedFastaSequenceFile ifSeqFile)
{
    mIndexedFastaSequenceFile = ifSeqFile;

    mRefGenomeFile = new File(refGenomeFile);
}
 
Example #28
Source File: RefGenomeSource.java    From hmftools with GNU General Public License v3.0 4 votes vote down vote up
public RefGenomeSource(final IndexedFastaSequenceFile refGenome)
{
    mRefGenome = refGenome;
}
 
Example #29
Source File: InframeIndelHotspots.java    From hmftools with GNU General Public License v3.0 4 votes vote down vote up
public InframeIndelHotspots(final int minMappingQuality, @NotNull final Collection<GenomeRegion> regions,
        @NotNull final IndexedFastaSequenceFile sequenceFile) {
    this.sequenceFile = sequenceFile;
    this.codingRegions = Multimaps.fromRegions(regions);
    this.samSlicer = new SAMSlicer(minMappingQuality, regions);
}
 
Example #30
Source File: VariantHotspotEvidenceFactory.java    From hmftools with GNU General Public License v3.0 4 votes vote down vote up
@NotNull
private String refSequence(@NotNull final IndexedFastaSequenceFile sequenceFile, int start, int end, String contig) {
    return sequenceFile.getSubsequenceAt(contig, start, end).getBaseString();
}