/*
 * Copyright 2014 Databricks
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.databricks.spark.xml

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Dataset, SQLContext, SparkSession}
import org.apache.spark.sql.types.StructType

import com.databricks.spark.xml.util.FailFastMode
import com.databricks.spark.xml.util.XmlFile

/**
 * A fluent builder for parsing XML into Spark SQL [[DataFrame]]s.
 *
 * Each `withX` method records an option and returns `this`, so calls can be
 * chained before invoking one of the terminal `xmlFile` / `xmlDataset` /
 * `xmlRdd` methods. Option keys mirror those understood by `XmlOptions`.
 */
class XmlReader extends Serializable {

  // Accumulated reader options, handed to XmlOptions/XmlRelation as an
  // immutable snapshot via `parameters.toMap`. The map is mutated in place,
  // so the reference itself is a val (no need for var + mutable map).
  private val parameters = collection.mutable.Map.empty[String, String]

  // User-supplied schema; null means "not set" and tells XmlRelation to
  // infer the schema from the data. (null rather than Option is the contract
  // XmlRelation's constructor expects.)
  private var schema: StructType = null

  /** Sets the character set used to decode the XML files (e.g. "UTF-8"). */
  def withCharset(charset: String): XmlReader = {
    parameters += ("charset" -> charset)
    this
  }

  /** Sets the compression codec used when reading the files. */
  def withCompression(codec: String): XmlReader = {
    parameters += ("codec" -> codec)
    this
  }

  /** Sets the XML element name that delimits one row of the result. */
  def withRowTag(rowTag: String): XmlReader = {
    parameters += ("rowTag" -> rowTag)
    this
  }

  /** Sets the fraction of input rows sampled during schema inference. */
  def withSamplingRatio(samplingRatio: Double): XmlReader = {
    parameters += ("samplingRatio" -> samplingRatio.toString)
    this
  }

  /** If true, XML attributes are excluded from the inferred schema. */
  def withExcludeAttribute(exclude: Boolean): XmlReader = {
    parameters += ("excludeAttribute" -> exclude.toString)
    this
  }

  /** If true, empty string values are read as nulls. */
  def withTreatEmptyValuesAsNulls(treatAsNull: Boolean): XmlReader = {
    parameters += ("treatEmptyValuesAsNulls" -> treatAsNull.toString)
    this
  }

  /**
   * Enables or disables fail-fast parsing. Passing `false` removes any
   * previously configured parse mode rather than setting a permissive one.
   */
  @deprecated("Use withParseMode(\"FAILFAST\") instead", "0.10.0")
  def withFailFast(failFast: Boolean): XmlReader = {
    if (failFast) {
      parameters += ("mode" -> FailFastMode.name)
    } else {
      parameters -= "mode"
    }
    this
  }

  /** Sets the parse mode (e.g. PERMISSIVE, DROPMALFORMED, FAILFAST). */
  def withParseMode(mode: String): XmlReader = {
    parameters += ("mode" -> mode)
    this
  }

  /** Sets the prefix used for attribute-derived column names. */
  def withAttributePrefix(attributePrefix: String): XmlReader = {
    parameters += ("attributePrefix" -> attributePrefix)
    this
  }

  /** Sets the column name used for element text when attributes coexist. */
  def withValueTag(valueTag: String): XmlReader = {
    parameters += ("valueTag" -> valueTag)
    this
  }

  /** Sets the column name that receives malformed-record contents. */
  def withColumnNameOfCorruptRecord(name: String): XmlReader = {
    parameters += ("columnNameOfCorruptRecord" -> name)
    this
  }

  /** If true, whitespace surrounding values is trimmed. */
  def withIgnoreSurroundingSpaces(ignore: Boolean): XmlReader = {
    parameters += ("ignoreSurroundingSpaces" -> ignore.toString)
    this
  }

  /** Supplies an explicit schema, skipping schema inference. */
  def withSchema(schema: StructType): XmlReader = {
    this.schema = schema
    this
  }

  /** Sets the path of an XSD file used to validate each row. */
  def withRowValidationXSDPath(path: String): XmlReader = {
    parameters += ("rowValidationXSDPath" -> path)
    this
  }

  /**
   * Parses the XML file(s) at the given path using the configured options.
   *
   * @param spark current SparkSession
   * @param path path to XML files to parse
   * @return XML parsed as a DataFrame
   */
  def xmlFile(spark: SparkSession, path: String): DataFrame = {
    // We need the `charset` and `rowTag` before creating the relation.
    val (charset, rowTag) = {
      val options = XmlOptions(parameters.toMap)
      (options.charset, options.rowTag)
    }
    val relation = XmlRelation(
      () => XmlFile.withCharset(spark.sparkContext, path, charset, rowTag),
      Some(path),
      parameters.toMap,
      schema)(spark.sqlContext)
    spark.baseRelationToDataFrame(relation)
  }

  /**
   * Parses a Dataset of XML strings, one 'row' per element.
   *
   * @param spark current SparkSession
   * @param ds XML for individual 'rows' as Strings
   * @return XML parsed as a DataFrame
   */
  def xmlDataset(spark: SparkSession, ds: Dataset[String]): DataFrame = {
    xmlRdd(spark, ds.rdd)
  }

  /**
   * Parses an RDD of XML strings, one 'row' per element.
   *
   * @param spark current SparkSession
   * @param xmlRDD XML for individual 'rows' as Strings
   * @return XML parsed as a DataFrame
   */
  def xmlRdd(spark: SparkSession, xmlRDD: RDD[String]): DataFrame = {
    val relation = XmlRelation(
      () => xmlRDD,
      None,
      parameters.toMap,
      schema)(spark.sqlContext)
    spark.baseRelationToDataFrame(relation)
  }

  /** Returns a Schema RDD for the given XML path. */
  @deprecated("Use xmlFile(SparkSession, ...)", "0.5.0")
  def xmlFile(sqlContext: SQLContext, path: String): DataFrame = {
    // We need the `charset` and `rowTag` before creating the relation.
    val (charset, rowTag) = {
      val options = XmlOptions(parameters.toMap)
      (options.charset, options.rowTag)
    }
    val relation = XmlRelation(
      () => XmlFile.withCharset(sqlContext.sparkContext, path, charset, rowTag),
      Some(path),
      parameters.toMap,
      schema)(sqlContext)
    sqlContext.baseRelationToDataFrame(relation)
  }

  /** Parses an RDD of XML strings against the deprecated SQLContext API. */
  @deprecated("Use xmlRdd(SparkSession, ...)", "0.5.0")
  def xmlRdd(sqlContext: SQLContext, xmlRDD: RDD[String]): DataFrame = {
    val relation = XmlRelation(
      () => xmlRDD,
      None,
      parameters.toMap,
      schema)(sqlContext)
    sqlContext.baseRelationToDataFrame(relation)
  }
}