/******************************************************************************* * Copyright 2013 EMBL-EBI * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ package net.sf.cram; import htsjdk.samtools.SAMFileReader; import htsjdk.samtools.SAMRecord; import htsjdk.samtools.SAMRecordIterator; import htsjdk.samtools.SAMSequenceRecord; import htsjdk.samtools.ValidationStringency; import htsjdk.samtools.cram.build.CramIO; import htsjdk.samtools.cram.encoding.reader.AbstractFastqReader; import htsjdk.samtools.cram.encoding.reader.DataReaderFactory; import htsjdk.samtools.cram.encoding.reader.MultiFastqOutputter; import htsjdk.samtools.cram.encoding.reader.ReaderToFastq; import htsjdk.samtools.cram.io.DefaultBitInputStream; import htsjdk.samtools.cram.structure.Container; import htsjdk.samtools.cram.structure.ContainerIO; import htsjdk.samtools.cram.structure.CramHeader; import htsjdk.samtools.cram.structure.Slice; import htsjdk.samtools.seekablestream.SeekableFileStream; import htsjdk.samtools.util.Log; import java.io.BufferedOutputStream; import java.io.ByteArrayInputStream; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.math.BigInteger; import java.nio.ByteBuffer; import java.util.HashMap; import java.util.Map; import java.util.concurrent.atomic.AtomicBoolean; import java.util.zip.GZIPOutputStream; import net.sf.cram.ref.ReferenceSource; import com.beust.jcommander.JCommander; import com.beust.jcommander.Parameter; import com.beust.jcommander.Parameters; import com.beust.jcommander.converters.FileConverter; public class Cram2Fastq { private static Log log = Log.getInstance(Cram2Fastq.class); public static final String COMMAND = "fastq"; private static void printUsage(JCommander jc) { StringBuilder sb = new StringBuilder(); sb.append("\n"); jc.usage(sb); System.out.println("Version " + Cram2Fastq.class.getPackage().getImplementationVersion()); System.out.println(sb.toString()); } @SuppressWarnings("restriction") public static void main(String[] args) throws Exception { Params params = new Params(); JCommander jc = new JCommander(params); try { jc.parse(args); } catch (Exception e) { System.out.println("Failed to parse parameteres, detailed message below: "); System.out.println(e.getMessage()); System.out.println(); System.out.println("See usage: -h"); System.exit(1); } if (args.length == 0 || params.help) { printUsage(jc); System.exit(1); } Log.setGlobalLogLevel(params.logLevel); SeekableFileStream sfs = new SeekableFileStream(params.cramFile); CramHeader cramHeader = CramIO.readCramHeader(sfs); ReferenceSource referenceSource = new ReferenceSource(params.reference); FixBAMFileHeader fix = new FixBAMFileHeader(referenceSource); fix.setConfirmMD5(!params.skipMD5Checks); fix.setIgnoreMD5Mismatch(params.ignoreMD5Mismatch); fix.fixSequences(cramHeader.getSamFileHeader().getSequenceDictionary().getSequences()); sfs.seek(0); if (params.reference == null) log.warn("No reference file specified, remote access over internet may be used to download public sequences. "); final AtomicBoolean brokenPipe = new AtomicBoolean(false); try { sun.misc.Signal.handle(new sun.misc.Signal("PIPE"), new sun.misc.SignalHandler() { @Override public void handle(sun.misc.Signal sig) { brokenPipe.set(true); } }); } catch (IllegalArgumentException e) { e.printStackTrace(); } CollatingDumper d = new CollatingDumper(sfs, referenceSource, 3, params.fastqBaseName, params.gzip, params.maxRecords, params.reverse, params.defaultQS, brokenPipe); d.prefix = params.prefix; d.run(); if (d.exception != null) throw d.exception; } private static abstract class Dumper implements Runnable { protected InputStream cramIS; protected byte[] ref = null; protected ReferenceSource referenceSource; protected FileOutput[] outputs; protected long maxRecords = -1; protected CramHeader cramHeader; protected Container container; protected AbstractFastqReader reader; protected Exception exception; private boolean reverse = false; protected AtomicBoolean brokenPipe; public Dumper(InputStream cramIS, ReferenceSource referenceSource, int nofStreams, String fastqBaseName, boolean gzip, long maxRecords, boolean reverse, int defaultQS, AtomicBoolean brokenPipe) throws IOException { this.cramIS = cramIS; this.referenceSource = referenceSource; this.maxRecords = maxRecords; this.reverse = reverse; this.brokenPipe = brokenPipe; outputs = new FileOutput[nofStreams]; for (int index = 0; index < outputs.length; index++) outputs[index] = new FileOutput(); if (fastqBaseName == null) { OutputStream joinedOS = System.out; if (gzip) joinedOS = (new GZIPOutputStream(joinedOS)); for (int index = 0; index < outputs.length; index++) outputs[index].outputStream = joinedOS; } else { String extension = ".fastq" + (gzip ? ".gz" : ""); String path; for (int index = 0; index < outputs.length; index++) { if (index == 0) path = fastqBaseName + extension; else path = fastqBaseName + "_" + index + extension; outputs[index].file = new File(path); OutputStream os = new BufferedOutputStream(new FileOutputStream(outputs[index].file)); if (gzip) os = new GZIPOutputStream(os); outputs[index].outputStream = os; } } } protected abstract AbstractFastqReader newReader(); protected abstract void containerHasBeenRead() throws IOException; protected void doRun() throws IOException { cramHeader = CramIO.readCramHeader(cramIS); FixBAMFileHeader fix = new FixBAMFileHeader(referenceSource); reader = newReader(); reader.reverseNegativeReads = reverse; MAIN_LOOP: while (!brokenPipe.get() && (container = ContainerIO.readContainer(cramHeader.getVersion(), cramIS)) != null) { if (container.isEOF()) break; DataReaderFactory f = new DataReaderFactory(); for (Slice s : container.slices) { if (s.sequenceId != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX && s.sequenceId != -2) { SAMSequenceRecord sequence = cramHeader.getSamFileHeader().getSequence(s.sequenceId); if (sequence == null) throw new RuntimeException("Null sequence for id: " + s.sequenceId); ref = referenceSource.getReferenceBases(sequence, true); if (!s.validateRefMD5(ref)) { log.error(String .format("Reference sequence MD5 mismatch for slice: seq id %d, start %d, span %d, expected MD5 %s", s.sequenceId, s.alignmentStart, s.alignmentSpan, String.format("%032x", new BigInteger(1, s.refMD5)))); throw new RuntimeException("Reference checksum mismatch."); } } else ref = new byte[0]; Map<Integer, InputStream> inputMap = new HashMap<Integer, InputStream>(); for (Integer exId : s.external.keySet()) { inputMap.put(exId, new ByteArrayInputStream(s.external.get(exId).getRawContent())); } reader.referenceSequence = ref; reader.prevAlStart = s.alignmentStart; reader.substitutionMatrix = container.header.substitutionMatrix; reader.recordCounter = 0; try { f.buildReader(reader, new DefaultBitInputStream(new ByteArrayInputStream(s.coreBlock.getRawContent())), inputMap, container.header, s.sequenceId); } catch (IllegalArgumentException e) { throw new RuntimeException(e); } for (int i = 0; i < s.nofRecords; i++) { reader.read(); if (maxRecords > -1) { if (maxRecords == 0) break MAIN_LOOP; maxRecords--; } } containerHasBeenRead(); } } if (!brokenPipe.get()) reader.finish(); } @Override public void run() { try { doRun(); if (outputs != null) { for (FileOutput os : outputs) os.close(); } } catch (Exception e) { this.exception = e; } } } private static class SimpleDumper extends Dumper { public SimpleDumper(InputStream cramIS, ReferenceSource referenceSource, int nofStreams, String fastqBaseName, boolean gzip, int maxRecords, boolean reverse, int defaultQS, AtomicBoolean brokenPipe) throws IOException { super(cramIS, referenceSource, nofStreams, fastqBaseName, gzip, maxRecords, reverse, defaultQS, brokenPipe); } @Override protected AbstractFastqReader newReader() { return new ReaderToFastq(); } @Override protected void containerHasBeenRead() throws IOException { ReaderToFastq reader = (ReaderToFastq) super.reader; for (int i = 0; i < outputs.length; i++) { ByteBuffer buf = reader.bufs[i]; OutputStream os = outputs[i].outputStream; buf.flip(); os.write(buf.array(), 0, buf.limit()); if (buf.limit() > 0) outputs[i].empty = false; buf.clear(); } } } private static class CollatingDumper extends Dumper { private FileOutput fo = new FileOutput(); private String prefix; private long counter = 1; private MultiFastqOutputter multiFastqOutputter; private int defaultQS; public CollatingDumper(InputStream cramIS, ReferenceSource referenceSource, int nofStreams, String fastqBaseName, boolean gzip, long maxRecords, boolean reverse, int defaultQS, AtomicBoolean brokenPipe) throws IOException { super(cramIS, referenceSource, nofStreams, fastqBaseName, gzip, maxRecords, reverse, defaultQS, brokenPipe); this.defaultQS = defaultQS; this.brokenPipe = brokenPipe; fo.file = File.createTempFile(fastqBaseName == null ? "overflow.bam" : fastqBaseName + ".overflow.bam", ".tmp"); fo.file.deleteOnExit(); fo.outputStream = new BufferedOutputStream(new FileOutputStream(fo.file)); } @Override protected AbstractFastqReader newReader() { if (multiFastqOutputter != null) { counter = multiFastqOutputter.getCounter(); } multiFastqOutputter = new MultiFastqOutputter(outputs, fo, referenceSource, cramHeader.getSamFileHeader(), counter); if (prefix != null) { multiFastqOutputter.setPrefix(prefix.getBytes()); // multiFastqOutputter.setCounter(counter); } multiFastqOutputter.defaultQS = this.defaultQS; return multiFastqOutputter; } @Override protected void containerHasBeenRead() throws IOException { } @Override public void doRun() throws IOException { super.doRun(); fo.close(); if (fo.empty) return; log.info("Sorting overflow BAM: ", fo.file.length()); SAMFileReader.setDefaultValidationStringency(ValidationStringency.SILENT); SAMFileReader r = new SAMFileReader(fo.file); SAMRecordIterator iterator = r.iterator(); if (!iterator.hasNext()) { r.close(); fo.file.delete(); return; } SAMRecord r1 = iterator.next(); SAMRecord r2 = null; counter = multiFastqOutputter.getCounter(); log.info("Counter=" + counter); while (!brokenPipe.get() && iterator.hasNext()) { r2 = iterator.next(); if (r1.getReadName().equals(r2.getReadName())) { print(r1, r2); counter++; r1 = null; if (!iterator.hasNext()) break; r1 = iterator.next(); r2 = null; } else { print(r1, 0); r1 = r2; r2 = null; counter++; } } if (r1 != null) print(r1, 0); r.close(); fo.file.delete(); } private void print(SAMRecord r1, SAMRecord r2) throws IOException { if (r1.getFirstOfPairFlag()) { print(r1, 1); print(r2, 2); } else { print(r1, 2); print(r2, 1); } } private void print(SAMRecord r, int index) throws IOException { OutputStream os = outputs[index]; os.write('@'); if (prefix != null) { os.write(prefix.getBytes()); os.write('.'); os.write(String.valueOf(counter).getBytes()); os.write(' '); } os.write(r.getReadName().getBytes()); if (index > 0) { os.write('/'); os.write(48 + index); } os.write('\n'); os.write(r.getReadBases()); os.write("\n+\n".getBytes()); os.write(r.getBaseQualityString().getBytes()); os.write('\n'); } } private static class FileOutput extends OutputStream { File file; OutputStream outputStream; boolean empty = true; @Override public void write(int b) throws IOException { outputStream.write(b); empty = false; } @Override public void write(byte[] b) throws IOException { outputStream.write(b); empty = false; } @Override public void write(byte[] b, int off, int len) throws IOException { outputStream.write(b, off, len); empty = false; } @Override public void flush() throws IOException { outputStream.flush(); } @Override public void close() throws IOException { if (outputStream != null && outputStream != System.out && outputStream != System.err) { outputStream.close(); outputStream = null; } if (empty && file != null && file.exists()) file.delete(); } } @Parameters(commandDescription = "CRAM to FastQ dump conversion. ") static class Params { @Parameter(names = { "-l", "--log-level" }, description = "Change log level: DEBUG, INFO, WARNING, ERROR.", converter = CramTools.LevelConverter.class) Log.LogLevel logLevel = Log.LogLevel.ERROR; @Parameter(names = { "-h", "--help" }, description = "Print help and quit") boolean help = false; @Parameter(names = { "--input-cram-file", "-I" }, converter = FileConverter.class, description = "The path to the CRAM file to uncompress. Omit if standard input (pipe).") File cramFile; @Parameter(names = { "--reference-fasta-file", "-R" }, converter = FileConverter.class, description = "Path to the reference fasta file, it must be uncompressed and indexed (use 'samtools faidx' for example). ") File reference; @Parameter(names = { "--fastq-base-name", "-F" }, description = "'_number.fastq[.gz] will be appended to this string to obtain output fastq file name. If this parameter is omitted then all reads are printed with no garanteed order.") String fastqBaseName; @Parameter(names = { "--gzip", "-z" }, description = "Compress fastq files with gzip.") boolean gzip; @Parameter(names = { "--reverse" }, description = "Re-reverse reads mapped to negative strand.") boolean reverse = false; @Parameter(names = { "--enumerate" }, description = "Append read names with read index (/1 for first in pair, /2 for second in pair).") boolean appendSegmentIndexToReadNames; @Parameter(names = { "--max-records" }, description = "Stop after reading this many records.") long maxRecords = -1; @Parameter(names = { "--read-name-prefix" }, description = "Replace read names with this prefix and a sequential integer.") String prefix = null; @Parameter(names = { "--default-quality-score" }, description = "Use this quality score (decimal representation of ASCII symbol) as a default value when the original quality score was lost due to compression. Minimum is 33.") int defaultQS = '?'; @Parameter(names = { "--ignore-md5-mismatch" }, description = "Issue a warning on sequence MD5 mismatch and continue. This does not garantee the data will be read succesfully. ") public boolean ignoreMD5Mismatch = false; @Parameter(names = { "--skip-md5-check" }, description = "Skip MD5 checks when reading the header.") public boolean skipMD5Checks = false; } }