java source code of ReadBAMTransform

/*
 * Copyright (C) 2015 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package com.google.cloud.genomics.dataflow.readers.bam;

import com.google.api.services.storage.Storage;
import com.google.cloud.genomics.dataflow.functions.BreakFusionTransform;
import com.google.common.base.CharMatcher;
import com.google.common.base.Splitter;
import com.google.common.collect.ImmutableSet;
import com.google.common.io.Files;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.extensions.gcp.options.GcsOptions;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.metrics.Metrics;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.GroupByKey;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.PTransform;
import org.apache.beam.sdk.util.GcsUtil;
import org.apache.beam.sdk.util.Transport;
import org.apache.beam.sdk.util.gcsfs.GcsPath;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import com.google.cloud.genomics.dataflow.utils.GCSOptions;
import com.google.cloud.genomics.utils.Contig;
import com.google.cloud.genomics.utils.OfflineAuth;
import com.google.genomics.v1.Read;

import java.io.File;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.charset.Charset;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.logging.Logger;

/**
 * Transforms a collection of {@link BAMShard}s into a collection of {@link Read}s by running a
 * {@link Reader} over every shard, so that BAM files are read in a sharded manner.
 *
 * <p>The static factory methods build the shard collection from one or more BAM files plus a
 * list of contigs and then apply this transform to it.
 */
public class ReadBAMTransform extends PTransform<PCollection<BAMShard>, PCollection<Read>> {
  private static final Logger LOG = Logger.getLogger(ReadBAMTransform.class.getName());
  OfflineAuth auth;
  ReaderOptions options;

  /** {@link DoFn} that runs a {@link Reader} over each incoming {@link BAMShard}. */
  public static class ReadFn extends DoFn<BAMShard, Read> {
    OfflineAuth auth;
    // Rebuilt at the start of every bundle, so it is per-worker state that must never travel
    // with the serialized DoFn.
    transient Storage.Objects storage;
    ReaderOptions options;

    /**
     * @param auth credentials stored on this function (not read by the code in this class)
     * @param options options handed to every {@link Reader} this function creates
     */
    public ReadFn(OfflineAuth auth, ReaderOptions options) {
      this.auth = auth;
      this.options = options;
    }

    /**
     * Builds the storage client used by all elements of the bundle.
     *
     * @param c bundle context supplying the pipeline options
     * @throws IOException if the storage client cannot be created
     */
    @StartBundle
    public void startBundle(DoFn<BAMShard, Read>.StartBundleContext c) throws IOException {
      storage = Transport.newStorageClient(c.getPipelineOptions().as(GCSOptions.class)).build().objects();
    }

    /**
     * Processes one shard with a {@link Reader} and publishes the reader's statistics as
     * metrics counters.
     *
     * @param c context carrying the shard; it is also handed to the reader
     * @throws Exception if the reader fails
     */
    @ProcessElement
    public void processElement(ProcessContext c) throws Exception {
      final Reader reader = new Reader(storage, options, c.element(), c);
      reader.process();
      Metrics.counter(ReadBAMTransform.class, "Processed records").inc(reader.recordsProcessed);
      Metrics.counter(ReadBAMTransform.class, "Reads generated").inc(reader.readsGenerated);
      Metrics.counter(ReadBAMTransform.class, "Skipped start").inc(reader.recordsBeforeStart);
      Metrics.counter(ReadBAMTransform.class, "Skipped end").inc(reader.recordsAfterEnd);
      Metrics.counter(ReadBAMTransform.class, "Ref mismatch").inc(reader.mismatchedSequence);
    }
  }

  /**
   * Get reads from a single BAM file by serially reading one shard at a time.
   *
   * <p>This is useful when reads from a subset of genomic regions is desired. The BAM file is
   * sharded eagerly, while the pipeline is being constructed.
   *
   * <p>This method should be removed when
   * https://github.com/googlegenomics/dataflow-java/issues/214 is fixed.
   *
   * @param p pipeline the shards and the read transform are added to
   * @param pipelineOptions options used to build the storage client (viewed as
   *     {@link GCSOptions})
   * @param auth credentials set on the created transform
   * @param contigs genomic regions passed to the sharder
   * @param options reader options passed to the created transform
   * @param BAMFile BAM file passed to the sharder
   * @param shardingPolicy policy passed to the sharder
   * @return the reads produced from all shards of the BAM file
   * @throws IOException if the storage client cannot be created or sharding fails
   * @deprecated {@link #getReadsFromBAMFilesSharded} offers the same functionality but shard
   *     reading occurs in parallel.
   */
  @Deprecated
  public static PCollection<Read> getReadsFromBAMFileSharded(
      Pipeline p,
      PipelineOptions pipelineOptions,
      OfflineAuth auth,
      List<Contig> contigs,
      ReaderOptions options,
      String BAMFile,
      ShardingPolicy shardingPolicy) throws IOException {
    ReadBAMTransform readBAMSTransform = new ReadBAMTransform(options);
    readBAMSTransform.setAuth(auth);
    final Storage.Objects storage = Transport
        .newStorageClient(pipelineOptions.as(GCSOptions.class)).build().objects();
    final List<BAMShard> shardsList = Sharder.shardBAMFile(storage, BAMFile, contigs,
        shardingPolicy);
    PCollection<BAMShard> shards = p.apply(Create.of(shardsList));
    // Apply the transform through the pipeline (as getReadsFromBAMFilesSharded does) instead of
    // calling expand() directly, so it is registered as a composite step.
    return shards.apply(readBAMSTransform);
  }

  /**
   * Get reads from one or more BAM files by reading shards in parallel.
   *
   * @param p pipeline the transforms are added to
   * @param pipelineOptions options used to obtain the {@link GcsUtil} that expands the globs
   * @param auth credentials set on the created transform
   * @param contigs genomic regions passed to the sharder
   * @param options reader options passed to the created transform
   * @param bamFileListOrGlob either the path of an existing local (non-directory) file whose
   *     whitespace-separated entries are used as prefixes, or a single prefix; every prefix has
   *     {@code *} appended to its path and is expanded as a GCS glob
   * @param shardingPolicy policy passed to the sharder
   * @return the reads produced from all shards of all matching BAM files
   * @throws IOException if the list file cannot be read or a glob cannot be expanded
   * @throws URISyntaxException if a prefix is not a valid URI
   */
  public static PCollection<Read> getReadsFromBAMFilesSharded(
      Pipeline p,
      PipelineOptions pipelineOptions,
      OfflineAuth auth,
      final List<Contig> contigs,
      ReaderOptions options,
      String bamFileListOrGlob,
      final ShardingPolicy shardingPolicy) throws IOException, URISyntaxException {
    ReadBAMTransform readBAMSTransform = new ReadBAMTransform(options);
    readBAMSTransform.setAuth(auth);

    List<String> prefixes = readPrefixes(bamFileListOrGlob);
    GcsUtil gcsUtil = pipelineOptions.as(GcsOptions.class).getGcsUtil();
    Set<String> uris = expandBamUris(gcsUtil, prefixes);

    // Perform all sharding and reading in a distributed fashion, using the BreakFusion
    // transforms to ensure that work is auto-scalable based on the number of shards.
    return p
        .apply(Create.of(uris))
        .apply("Break BAM file fusion", new BreakFusionTransform<String>())
        .apply("BamsToShards", ParDo.of(new DoFn<String, BAMShard>() {
          // Rebuilt at the start of every bundle; never meant to be serialized with the DoFn.
          transient Storage.Objects storage;

          @StartBundle
          public void startBundle(DoFn<String, BAMShard>.StartBundleContext c) throws IOException {
            storage = Transport.newStorageClient(c.getPipelineOptions().as(GCSOptions.class)).build().objects();
          }

          @DoFn.ProcessElement
          public void processElement(DoFn<String, BAMShard>.ProcessContext c) {
            final String bamPath = c.element();
            // Log before sharding so the message is present even when sharding fails.
            LOG.info("Sharding BAM " + bamPath);
            final List<BAMShard> shardsList;
            try {
              shardsList = Sharder.shardBAMFile(storage, bamPath, contigs, shardingPolicy);
            } catch (IOException e) {
              throw new RuntimeException("Failed to shard BAM file " + bamPath, e);
            }
            Metrics.counter(ReadBAMTransform.class, "BAM files").inc();
            Metrics.counter(ReadBAMTransform.class, "BAM file shards").inc(shardsList.size());
            for (BAMShard shard : shardsList) {
              c.output(shard);
            }
          }
        }))
        // We need a BreakFusionTransform here but BAMShard does not have a deterministic coder, a
        // requirement for values that are keys.  Send it as a value instead.
        .apply("Break BAMShard fusion - group", ParDo.of(new DoFn<BAMShard, KV<String, BAMShard>>() {
          @DoFn.ProcessElement
          public void processElement(DoFn<BAMShard, KV<String, BAMShard>>.ProcessContext c) throws Exception {
            c.output(KV.of(c.element().toString(), c.element()));
          }
        }))
        .apply("Break BAMShard fusion - shuffle", GroupByKey.<String, BAMShard>create())
        .apply("Break BAMShard fusion - ungroup", ParDo.of(new DoFn<KV<String, Iterable<BAMShard>>, BAMShard>() {
          @DoFn.ProcessElement
          public void processElement(DoFn<KV<String, Iterable<BAMShard>>, BAMShard>.ProcessContext c) {
            for (BAMShard shard : c.element().getValue()) {
              c.output(shard);
            }
          }
        }))
        .apply(readBAMSTransform);
  }

  /**
   * Resolves the user-supplied argument into a duplicate-free list of prefixes: the
   * whitespace-separated entries of the file when the argument names an existing non-directory
   * file (decoded with the platform default charset), otherwise the argument itself.
   */
  private static List<String> readPrefixes(String bamFileListOrGlob) throws IOException {
    File f = new File(bamFileListOrGlob);
    if (f.exists() && !f.isDirectory()) {
      String fileContents = Files.toString(f, Charset.defaultCharset());
      return ImmutableSet
          .copyOf(
              Splitter.on(CharMatcher.breakingWhitespace()).omitEmptyStrings().trimResults()
                  .split(fileContents))
          .asList();
    }
    return ImmutableSet.of(bamFileListOrGlob).asList();
  }

  /**
   * Expands every prefix as a {@code prefix*} GCS glob and collects the matches that end with
   * the BAM file suffix.
   */
  private static Set<String> expandBamUris(GcsUtil gcsUtil, List<String> prefixes)
      throws IOException, URISyntaxException {
    Set<String> uris = new HashSet<>();
    for (String prefix : prefixes) {
      URI absoluteUri = new URI(prefix);
      URI gcsUriGlob = new URI(
          absoluteUri.getScheme(),
          absoluteUri.getAuthority(),
          absoluteUri.getPath() + "*",
          absoluteUri.getQuery(),
          absoluteUri.getFragment());
      for (GcsPath entry : gcsUtil.expand(GcsPath.fromUri(gcsUriGlob))) {
        // Even if provided with an exact match to a particular BAM file, the glob operation will
        // still look for any files with that prefix, therefore also finding the corresponding
        // .bai file. Ensure only BAMs are added to the list.
        String entryName = entry.toString();
        if (entryName.endsWith(BAMIO.BAM_FILE_SUFFIX)) {
          uris.add(entryName);
        }
      }
    }
    return uris;
  }

  /**
   * Reads every shard of the input collection with a {@link ReadFn}.
   *
   * @param shards the BAM shards to read
   * @return the reads emitted for the shards
   */
  @Override
  public PCollection<Read> expand(PCollection<BAMShard> shards) {
    return shards.apply("Read reads from BAMShards", ParDo.of(new ReadFn(auth, options)));
  }

  /** @return the credentials currently set on this transform */
  public OfflineAuth getAuth() {
    return auth;
  }

  /** @param auth the credentials handed to the {@link ReadFn} created by {@link #expand} */
  public void setAuth(OfflineAuth auth) {
    this.auth = auth;
  }

  // non-public methods

  /**
   * Creates a transform with the given reader options.
   *
   * @param options options handed to the {@link ReadFn} created by {@link #expand}
   */
  protected ReadBAMTransform(ReaderOptions options) {
    super();
    this.options = options;
  }
}