/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package be.ugent.intec.halvade.hadoop.mapreduce;

import be.ugent.intec.halvade.hadoop.datatypes.ChromosomeRegion;
import be.ugent.intec.halvade.utils.HalvadeConf;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMReadGroupRecord;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMTag;
import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.Reducer;
import org.seqdoop.hadoop_bam.KeyIgnoringBAMOutputFormat;
import org.seqdoop.hadoop_bam.SAMRecordWritable;
import org.seqdoop.hadoop_bam.util.SAMHeaderReader;

/**
 * Reducer that merges all incoming SAM records into a single BAM file.
 *
 * <p>Records are not emitted through the reducer {@code Context}; instead they
 * are written with a dedicated {@link RecordWriter} obtained from a
 * {@link KeyIgnoringBAMOutputFormat}, targeting {@code mergedBam.bam} under the
 * configured output directory. Every record is tagged with the read group id
 * held in {@link #RGID} before it is written.
 *
 * @author ddecap
 */
public class BamMergeReducer extends Reducer<ChromosomeRegion, SAMRecordWritable, LongWritable, SAMRecordWritable> {

    /** Header set on the output format before the record writer is created. */
    protected SAMFileHeader header;
    /** Sequence dictionary read from the job configuration. */
    protected SAMSequenceDictionary dict;
    protected KeyIgnoringBAMOutputFormat outpFormat;
    // Read group defaults; overridden by getReadGroupData() when the input is not BAM.
    protected String RGID = "GROUP1";
    protected String RGLB = "LIB1";
    protected String RGPL = "ILLUMINA";
    protected String RGPU = "UNIT1";
    protected String RGSM = "SAMPLE1";
    /** Read group record added to the generated header (non-BAM input only). */
    protected SAMReadGroupRecord bamrg;
    protected boolean inputIsBam = false;
    protected RecordWriter<LongWritable,SAMRecordWritable> recordWriter;
    /** Reused wrapper for every record handed to the writer. */
    protected SAMRecordWritable samWritable = new SAMRecordWritable();
    /** Constant output key (always 0). */
    protected LongWritable outKey;
    boolean reportBest = false;

    /**
     * Prepares the BAM header and opens the record writer for the merged file.
     *
     * <p>When the input is BAM the header is read from the configured header
     * file. Otherwise a new header is built from the configured sequence
     * dictionary plus a single read group described by the configured
     * read-group string (see {@link #getReadGroupData(Configuration)}).
     *
     * @param context reducer context supplying the job configuration
     * @throws IOException if the header cannot be read or the writer cannot be opened
     * @throws InterruptedException if opening the writer is interrupted
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        Configuration conf = context.getConfiguration();
        outpFormat = new KeyIgnoringBAMOutputFormat();
        String output = HalvadeConf.getOutDir(conf);
        inputIsBam = HalvadeConf.inputIsBam(conf);
        dict = HalvadeConf.getSequenceDictionary(conf);
        if (inputIsBam) {
            header = SAMHeaderReader.readSAMHeaderFrom(new Path(HalvadeConf.getHeaderFile(conf)), conf);
        } else {
            getReadGroupData(conf);
            header = new SAMFileHeader();
            header.setSequenceDictionary(dict);
            bamrg = new SAMReadGroupRecord(RGID);
            bamrg.setLibrary(RGLB);
            bamrg.setPlatform(RGPL);
            bamrg.setPlatformUnit(RGPU);
            bamrg.setSample(RGSM);
            header.addReadGroup(bamrg);
        }
        outpFormat.setSAMHeader(header);
        // NOTE(review): plain concatenation, so this relies on the configured
        // output directory ending with a path separator - confirm in HalvadeConf.
        recordWriter = outpFormat.getRecordWriter(context, new Path(output + "mergedBam.bam"));
        outKey = new LongWritable();
        outKey.set(0);
    }

    /**
     * Tags every record of the group with the read group id and appends it to
     * the merged BAM file.
     *
     * @param key     region key of the current group (not used)
     * @param values  records belonging to the group
     * @param context reducer context (not used; output goes through the record writer)
     * @throws IOException if writing a record fails
     * @throws InterruptedException if writing a record is interrupted
     */
    @Override
    protected void reduce(ChromosomeRegion key, Iterable<SAMRecordWritable> values, Context context) throws IOException, InterruptedException {
        Iterator<SAMRecordWritable> it = values.iterator();
        while (it.hasNext()) {
            SAMRecord sam = it.next().get();
            // NOTE(review): RGID is applied even when inputIsBam is true, in which
            // case it still holds its default value because getReadGroupData() is
            // never called - confirm that overriding the record's RG is intended.
            sam.setAttribute(SAMTag.RG.name(), RGID);
            samWritable.set(sam);
            recordWriter.write(outKey, samWritable);
        }
    }

    /**
     * Closes the record writer opened in {@link #setup(Context)}.
     *
     * @param context reducer context passed on to the writer
     * @throws IOException if closing the writer fails
     * @throws InterruptedException if closing the writer is interrupted
     */
    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        super.cleanup(context);
        recordWriter.close(context);
    }

    /**
     * Overrides the read group defaults with values from the configured
     * read-group string.
     *
     * <p>The string is split on single spaces into {@code KEY:VALUE} tokens.
     * Recognised keys (case-insensitive) are {@code ID}, {@code LB}, {@code PL},
     * {@code PU} and {@code SM}; other keys are ignored. Only the first colon
     * separates key from value, so values may themselves contain colons.
     * Tokens without a colon or with an empty value are skipped and the
     * corresponding default is kept.
     *
     * @param conf job configuration holding the read-group string
     */
    protected void getReadGroupData(Configuration conf) {
        String readGroup = HalvadeConf.getReadGroup(conf);
        if (readGroup == null) {
            // Nothing configured: keep the defaults.
            return;
        }
        for (String element : readGroup.split(" ")) {
            // Limit 2 keeps colons inside the value and yields a trailing empty
            // string for "KEY:", so the length check below is reliable.
            String[] keyValue = element.split(":", 2);
            if (keyValue.length < 2 || keyValue[1].isEmpty()) {
                continue;
            }
            String tag = keyValue[0];
            String value = keyValue[1];
            if (tag.equalsIgnoreCase("id")) {
                RGID = value;
            } else if (tag.equalsIgnoreCase("lb")) {
                RGLB = value;
            } else if (tag.equalsIgnoreCase("pl")) {
                RGPL = value;
            } else if (tag.equalsIgnoreCase("pu")) {
                RGPU = value;
            } else if (tag.equalsIgnoreCase("sm")) {
                RGSM = value;
            }
        }
    }
}