scala source code of CRFFromParsedFile

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.intel.ssg.bdt.nlp

import java.io.{DataOutputStream, DataInputStream, FileInputStream, FileOutputStream}

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object CRFFromParsedFile {

  def main(args: Array[String]) {
    val templateFile = "src/test/resources/chunking/template"
    val trainFile = "src/test/resources/chunking/serialized/train.data"
    val testFile = "src/test/resources/chunking/serialized/test.data"

    val conf = new SparkConf().setAppName(s"${this.getClass.getSimpleName}")
    val sc = new SparkContext(conf)

    val templates: Array[String] = scala.io.Source.fromFile(templateFile).getLines().filter(_.nonEmpty).toArray
    val trainRDD: RDD[Sequence] = sc.textFile(trainFile).filter(_.nonEmpty).map(Sequence.deSerializer)

    val model: CRFModel = CRF.train(templates, trainRDD, 0.25, 1, 100, 1E-3, "L1")

    val testRDD: RDD[Sequence] = sc.textFile(testFile).filter(_.nonEmpty).map(Sequence.deSerializer)

    /**
      * an example of model saving and loading
      */
    new java.io.File("target/model").mkdir()
    //model save as String
    new java.io.PrintWriter("target/model/model1") { write(CRFModel.save(model)); close() }
    val modelFromFile1 = CRFModel.load(scala.io.Source.fromFile("target/model/model1").getLines().toArray.head)
    // model save as RDD
    sc.parallelize(CRFModel.saveArray(model)).saveAsTextFile("target/model/model2")
    val modelFromFile2 = CRFModel.loadArray(sc.textFile("target/model/model2").collect())
    // model save as BinaryFile
    val path = "target/model/model3"
    new java.io.File(path).mkdir()
    CRFModel.saveBinaryFile(model, path)
    val modelFromFile3 = CRFModel.loadBinaryFile(path)

    /**
      * still use the model in memory to predict
      */
    val results: RDD[Sequence] = model.setNBest(10)
      .setVerboseMode(VerboseLevel1)
      .predict(testRDD)

    val score = results
      .zipWithIndex()
      .map(_.swap)
      .join(testRDD.zipWithIndex().map(_.swap))
      .map(_._2)
      .map(x => x._1.compare(x._2))
      .reduce(_ + _)
    val total = testRDD.map(_.toArray.length).reduce(_ + _)
    println(s"Prediction Accuracy: $score / $total")

    sc.stop()
  }
}