package org.biodatageeks.sequila.datasources.VCF

import io.projectglow.Glow
import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row, SQLContext, SparkSession}
import org.apache.spark.sql.sources._
import org.biodatageeks.sequila.utils.{Columns, DataQualityFuncs}
import org.apache.spark.sql.functions._




class VCFRelation(path: String,
                  normalization_mode: Option[String] = None,
                  ref_genome_path : Option[String] = None )(@transient val sqlContext: SQLContext) extends BaseRelation
  with PrunedScan
  with Serializable
  with Logging {

  val spark: SparkSession = sqlContext.sparkSession

  val cleanContigUDF = udf[String, String](DataQualityFuncs.cleanContig)

  lazy val inputDf: DataFrame = spark
    .read
    .format("vcf")
    .option("splitToBiallelic", "true")
    .load(path)
  lazy val dfNormalized = {
    normalization_mode match {
    case Some(m) => {
      if (m.equalsIgnoreCase("normalize") || m.equalsIgnoreCase("split_and_normalize")
        && ref_genome_path == None) throw new Exception(s"Variant normalization mode specified but ref_genome_path is empty ")
      Glow.transform(m.toLowerCase(), inputDf, Map("reference_genome_path" -> ref_genome_path.get))
    }
    case _ => inputDf
    }
  }.withColumnRenamed("contigName", Columns.CONTIG)
    .withColumnRenamed("start", Columns.START)
    .withColumnRenamed("end", Columns.END)
    .withColumnRenamed("referenceAllele", Columns.REF)
    .withColumnRenamed("alternateAlleles", Columns.ALT)

  lazy val df = dfNormalized
    .withColumn(Columns.CONTIG, cleanContigUDF(dfNormalized(Columns.CONTIG)))

  override def schema: org.apache.spark.sql.types.StructType = {
   df.schema
  }

  override def buildScan(requiredColumns: Array[String] ): RDD[Row] = {

    {
      if (requiredColumns.length > 0)
        df.select(requiredColumns.head, requiredColumns.tail: _*)
      else
        df
    }.rdd


  }

}