/**
 * Copyright 2015 Ram Sriharsha
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package magellan

import java.util.Objects

import magellan.io._
import magellan.mapreduce._
import org.apache.hadoop.io.{ArrayWritable, LongWritable, MapWritable, Text}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SQLContext

import scala.collection.JavaConversions._
import scala.util.Try

/**
 * A Shapefile relation is the entry point for working with the Shapefile format.
 */
case class ShapeFileRelation(
    path: String,
    parameters: Map[String, String])
    (@transient val sqlContext: SQLContext)
  extends SpatialRelation {

  protected override def _buildScan(): RDD[Array[Any]] = {
    // Read the .shx index files, if they exist: each maps a file name prefix
    // to the byte offsets of the records in the corresponding .shp file,
    // which drives input split placement.
    val fileNameToFileSplits = Try(sc.newAPIHadoopFile(
        path + "/*.shx",
        classOf[ShxInputFormat],
        classOf[Text],
        classOf[ArrayWritable]
      ).map { case (txt: Text, splits: ArrayWritable) =>
        val fileName = txt.toString
        val s = splits.get()
        val size = s.length
        var i = 0
        val v = Array.fill(size)(0L)
        while (i < size) {
          v.update(i, s(i).asInstanceOf[LongWritable].get())
          i += 1
        }
        (fileName, v)
      }.collectAsMap())

    // If the index was read successfully, expose the split info to the
    // ShapeInputFormat via the shared holder; if no .shx files exist, the
    // Try fails and the input format falls back to its default splitting.
    fileNameToFileSplits.map(SplitInfos.SPLIT_INFO_MAP.set(_))

    // Read the geometries from the .shp files.
    val shapefileRdd = sqlContext.sparkContext.newAPIHadoopFile(
      path + "/*.shp",
      classOf[ShapeInputFormat],
      classOf[ShapeKey],
      classOf[ShapeWritable]
    )

    // Read the attribute records from the .dbf (dBase) files.
    val dbaseRdd = sqlContext.sparkContext.newAPIHadoopFile(
      path + "/*.dbf",
      classOf[DBInputFormat],
      classOf[ShapeKey],
      classOf[MapWritable]
    )

    // Key each geometry by (file name prefix, record index).
    val dataRdd = shapefileRdd.map { case (k, v) =>
      ((k.getFileNamePrefix(), k.getRecordIndex()), v.shape)
    }

    // Key each attribute map by the same (file name prefix, record index)
    // so it can be joined against the geometries.
    val metadataRdd = dbaseRdd.map { case (k, v) =>
      val meta = v.entrySet().map { kv =>
        val key = kv.getKey.asInstanceOf[Text].toString
        val value = kv.getValue.asInstanceOf[Text].toString
        (key, value)
      }.toMap
      ((k.getFileNamePrefix(), k.getRecordIndex()), meta)
    }

    // Left outer join so geometries without matching .dbf attributes are kept.
    dataRdd.leftOuterJoin(metadataRdd).map(f => Array(f._2._1, f._2._2))
  }

  override def hashCode(): Int = Objects.hash(path, schema)
}
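
// Usage sketch: magellan registers itself as a Spark SQL data source, so a
// relation like the one above is typically reached through the DataFrame
// reader rather than constructed directly. A minimal example, assuming the
// data source is registered under the "magellan" format name as in the
// project README; the path below is illustrative:
//
//   import org.apache.spark.sql.SQLContext
//
//   val sqlContext: SQLContext = ...  // an existing SQLContext
//   val df = sqlContext.read
//     .format("magellan")
//     .load("/data/shapefiles")       // directory holding .shp/.shx/.dbf files
//   df.show()
//
// Each resulting row pairs a geometry parsed from a .shp record with the
// optional attribute map parsed from the matching .dbf record.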