/*
 * Copyright 2019 The Glow Authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.projectglow.sql

import java.net.URI
import java.util.ServiceLoader

import scala.collection.JavaConverters._

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.sources._
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode}

import io.projectglow.common.{GlowLogging, WithUtils}

/**
 * Base class for big file datasources. Handles plumbing that's necessary for all such sources:
 * - Checking the save mode
 * - Uploading an RDD of byte arrays
 */
abstract class BigFileDatasource extends CreatableRelationProvider {

  /**
   * Implemented by subclasses. Must return an RDD where each partition is exactly 1 byte array.
   */
  protected def serializeDataFrame(options: Map[String, String], df: DataFrame): RDD[Array[Byte]]

  override def createRelation(
      sqlContext: SQLContext,
      mode: SaveMode,
      options: Map[String, String],
      data: DataFrame): BaseRelation = {

    val path = BigFileDatasource.checkPath(options)
    val filesystemPath = new Path(path)
    val fs = filesystemPath.getFileSystem(sqlContext.sparkContext.hadoopConfiguration)
    val doSave = if (fs.exists(filesystemPath)) {
      mode match {
        case SaveMode.Append =>
          sys.error(s"Append mode is not supported by ${this.getClass.getCanonicalName}")
        case SaveMode.Overwrite =>
          fs.delete(filesystemPath, true)
          true
        case SaveMode.ErrorIfExists =>
          sys.error(s"Path $path already exists.")
        case SaveMode.Ignore =>
          false
      }
    } else {
      true
    }

    if (doSave) {
      WithUtils.withCachedDataset(data) { cachedDs =>
        val byteRdd = serializeDataFrame(options, cachedDs)
        SingleFileWriter.write(byteRdd, path)
      }
    }

    SingleFileRelation(sqlContext, data.schema)
  }
}

object BigFileDatasource {
  def checkPath(parameters: Map[String, String]): String = {
    parameters.getOrElse("path", sys.error("'path' must be specified"))
  }
}

case class SingleFileRelation(sqlContext: SQLContext, schema: StructType) extends BaseRelation

trait BigFileUploader {
  def canUpload(conf: Configuration, path: String): Boolean
  def upload(bytes: RDD[Array[Byte]], path: String): Unit
}
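/**
 * A minimal sketch (hypothetical, not part of this file) of a [[BigFileUploader]]
 * that handles local filesystem paths by streaming partitions through the driver.
 * A real implementation for a cloud store would upload partitions in parallel
 * (e.g. as a multipart upload) and would also need to be registered for
 * ServiceLoader discovery in META-INF/services/io.projectglow.sql.BigFileUploader.
 */
class ExampleLocalFsUploader extends BigFileUploader {
  override def canUpload(conf: Configuration, path: String): Boolean =
    Option(new URI(path).getScheme).forall(_ == "file")

  override def upload(bytes: RDD[Array[Byte]], path: String): Unit = {
    val hadoopPath = new Path(path)
    val fs = hadoopPath.getFileSystem(bytes.sparkContext.hadoopConfiguration)
    // Stream one partition at a time to the destination file; each partition
    // is a single byte array per the serializeDataFrame contract.
    WithUtils.withCloseable(fs.create(hadoopPath)) { stream =>
      bytes.toLocalIterator.foreach(chunk => stream.write(chunk))
    }
  }
}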
object SingleFileWriter extends GlowLogging {

  lazy val uploaders: Seq[BigFileUploader] = ServiceLoader
    .load(classOf[BigFileUploader])
    .iterator()
    .asScala
    .toSeq

  /**
   * Writes an RDD to a single file: in parallel when an uploader is registered
   * for the destination storage system (inferred from the provided path), and
   * from the driver otherwise.
   *
   * @param rdd The RDD to write.
   * @param path The path to write the RDD to.
   */
  def write(rdd: RDD[Array[Byte]], path: String): Unit = {
    val uri = new URI(path)
    uploaders.find(_.canUpload(rdd.sparkContext.hadoopConfiguration, path)) match {
      case Some(uploader) => uploader.upload(rdd, path)
      case None =>
        logger.info(s"Could not find a parallel uploader for $path, uploading from the driver")
        writeFileFromDriver(new Path(uri), rdd)
    }
  }

  private def writeFileFromDriver(path: Path, byteRdd: RDD[Array[Byte]]): Unit = {
    val sc = byteRdd.sparkContext
    val fs = path.getFileSystem(sc.hadoopConfiguration)
    WithUtils.withCloseable(fs.create(path)) { stream =>
      WithUtils.withCachedRDD(byteRdd) { cachedRdd =>
        // Materialize and cache the RDD on the cluster first so the serialization
        // runs in parallel; toLocalIterator then pulls one partition at a time
        // to the driver without recomputing upstream stages.
        cachedRdd.count()
        cachedRdd.toLocalIterator.foreach { chunk =>
          stream.write(chunk)
        }
      }
    }
  }
}
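/**
 * A minimal sketch (hypothetical, not part of Glow) of a concrete subclass,
 * assuming JSON serialization purely for illustration. It satisfies the
 * contract documented on serializeDataFrame: each partition of the returned
 * RDD is exactly one byte array.
 */
class ExampleJsonBigFileDatasource extends BigFileDatasource {
  override protected def serializeDataFrame(
      options: Map[String, String],
      df: DataFrame): RDD[Array[Byte]] = {
    // Collapse each partition's rows into a single newline-delimited blob.
    df.toJSON.rdd.mapPartitions { rows =>
      Iterator(rows.mkString("", "\n", "\n").getBytes("UTF-8"))
    }
  }
}

// Usage sketch (driver side), referencing the hypothetical class above:
//   df.write
//     .format("io.projectglow.sql.ExampleJsonBigFileDatasource")
//     .mode(SaveMode.Overwrite)
//     .save("/tmp/example.json")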