/* SimpleApp.scala */ import com.micvog.ml.{AnomalyDetection, FeaturesParser} import org.apache.spark.SparkContext import org.apache.spark.SparkConf import org.apache.spark.mllib.linalg._ import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD object MainRun { val rawFilePath = "./src/test/resources/training.csv" val cvFilePath = "./src/test/resources/cross_val.csv" def main(args: Array[String]) { val conf = new SparkConf().setAppName("Anomaly Detection Spark") val sc = new SparkContext(conf) val rawdata = sc.textFile(rawFilePath, 2).cache() val cvData = sc.textFile(cvFilePath, 2).cache() //convert raw data to vectors val trainingVec: RDD[Vector] = FeaturesParser.parseFeatures(rawdata) val cvLabeledVec: RDD[LabeledPoint] = FeaturesParser.parseFeaturesWithLabel(cvData) val data = trainingVec.cache() val anDet: AnomalyDetection = new AnomalyDetection() //derive model val model = anDet.run(data) val dataCvVec = cvLabeledVec.cache() val optimalModel = anDet.optimize(dataCvVec, model) //find outliers in CV val cvVec = cvLabeledVec.map(_.features) val results = optimalModel.predict(cvVec) val outliers = results.filter(_._2).collect() outliers.foreach(v => println(v._1)) println("\nFound %s outliers\n".format(outliers.length)) } }