org.apache.spark.mllib.fpm.FPGrowth Scala Examples

The following examples show how to use org.apache.spark.mllib.fpm.FPGrowth. Each listing notes the source file, project, and license it comes from.
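For quick orientation, here is a minimal, self-contained sketch of the core API that the examples below exercise. It is not taken from any of the listed projects; the object name, the local master setting, the toy transactions, and the thresholds (minSupport = 0.5, minConfidence = 0.8) are illustrative assumptions only.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.fpm.FPGrowth

object MinimalFPGrowthSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("MinimalFPGrowthSketch").setMaster("local[2]"))

    // Each transaction is an Array of distinct String items.
    val transactions = sc.parallelize(Seq(
      Array("a", "b", "c"),
      Array("a", "b"),
      Array("b", "c", "d")))

    // Mine itemsets that occur in at least half of the transactions.
    val model = new FPGrowth()
      .setMinSupport(0.5)
      .setNumPartitions(2)
      .run(transactions)

    model.freqItemsets.collect().foreach { itemset =>
      println(itemset.items.mkString("[", ",", "]") + ", " + itemset.freq)
    }

    // Derive association rules with confidence of at least 0.8 from the mined itemsets.
    model.generateAssociationRules(0.8).collect().foreach { rule =>
      println(rule.antecedent.mkString("[", ",", "]") + " => " +
        rule.consequent.mkString("[", ",", "]") + ", " + rule.confidence)
    }

    sc.stop()
  }
}

The same calls (setMinSupport, setNumPartitions, run, freqItemsets, generateAssociationRules) recur throughout the examples below.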
Example 1
Source File: FPGrowthExample.scala    From sparkoscope   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.fpm.FPGrowth


object FPGrowthExample {

  case class Params(
    input: String = null,
    minSupport: Double = 0.3,
    numPartition: Int = -1) extends AbstractParams[Params]

  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("FPGrowthExample") {
      head("FPGrowth: an example FP-growth app.")
      opt[Double]("minSupport")
        .text(s"minimal support level, default: ${defaultParams.minSupport}")
        .action((x, c) => c.copy(minSupport = x))
      opt[Int]("numPartition")
        .text(s"number of partition, default: ${defaultParams.numPartition}")
        .action((x, c) => c.copy(numPartition = x))
      arg[String]("<input>")
        .text("input paths to input data set, whose file format is that each line " +
          "contains a transaction with each item in String and separated by a space")
        .required()
        .action((x, c) => c.copy(input = x))
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName(s"FPGrowthExample with $params")
    val sc = new SparkContext(conf)
    val transactions = sc.textFile(params.input).map(_.split(" ")).cache()

    println(s"Number of transactions: ${transactions.count()}")

    val model = new FPGrowth()
      .setMinSupport(params.minSupport)
      .setNumPartitions(params.numPartition)
      .run(transactions)

    println(s"Number of frequent itemsets: ${model.freqItemsets.count()}")

    model.freqItemsets.collect().foreach { itemset =>
      println(itemset.items.mkString("[", ",", "]") + ", " + itemset.freq)
    }

    sc.stop()
  }
}
// scalastyle:on println 
Example 2
Source File: SimpleFPGrowth.scala    From BigDatalog   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

// $example on$
import org.apache.spark.mllib.fpm.FPGrowth
import org.apache.spark.rdd.RDD
// $example off$

import org.apache.spark.{SparkContext, SparkConf}

object SimpleFPGrowth {

  def main(args: Array[String]) {

    val conf = new SparkConf().setAppName("SimpleFPGrowth")
    val sc = new SparkContext(conf)

    // $example on$
    val data = sc.textFile("data/mllib/sample_fpgrowth.txt")

    val transactions: RDD[Array[String]] = data.map(s => s.trim.split(' '))

    val fpg = new FPGrowth()
      .setMinSupport(0.2)
      .setNumPartitions(10)
    val model = fpg.run(transactions)

    model.freqItemsets.collect().foreach { itemset =>
      println(itemset.items.mkString("[", ",", "]") + ", " + itemset.freq)
    }

    val minConfidence = 0.8
    model.generateAssociationRules(minConfidence).collect().foreach { rule =>
      println(
        rule.antecedent.mkString("[", ",", "]")
          + " => " + rule.consequent .mkString("[", ",", "]")
          + ", " + rule.confidence)
    }
    // $example off$
  }
}
// scalastyle:on println 
Example 3
Source File: FPGrowthExample.scala    From BigDatalog   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

import scopt.OptionParser

import org.apache.spark.mllib.fpm.FPGrowth
import org.apache.spark.{SparkConf, SparkContext}


object FPGrowthExample {

  case class Params(
    input: String = null,
    minSupport: Double = 0.3,
    numPartition: Int = -1) extends AbstractParams[Params]

  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("FPGrowthExample") {
      head("FPGrowth: an example FP-growth app.")
      opt[Double]("minSupport")
        .text(s"minimal support level, default: ${defaultParams.minSupport}")
        .action((x, c) => c.copy(minSupport = x))
      opt[Int]("numPartition")
        .text(s"number of partition, default: ${defaultParams.numPartition}")
        .action((x, c) => c.copy(numPartition = x))
      arg[String]("<input>")
        .text("input paths to input data set, whose file format is that each line " +
          "contains a transaction with each item in String and separated by a space")
        .required()
        .action((x, c) => c.copy(input = x))
    }

    parser.parse(args, defaultParams).map { params =>
      run(params)
    }.getOrElse {
      sys.exit(1)
    }
  }

  def run(params: Params) {
    val conf = new SparkConf().setAppName(s"FPGrowthExample with $params")
    val sc = new SparkContext(conf)
    val transactions = sc.textFile(params.input).map(_.split(" ")).cache()

    println(s"Number of transactions: ${transactions.count()}")

    val model = new FPGrowth()
      .setMinSupport(params.minSupport)
      .setNumPartitions(params.numPartition)
      .run(transactions)

    println(s"Number of frequent itemsets: ${model.freqItemsets.count()}")

    model.freqItemsets.collect().foreach { itemset =>
      println(itemset.items.mkString("[", ",", "]") + ", " + itemset.freq)
    }

    sc.stop()
  }
}
// scalastyle:on println 
Example 4
Source File: SimpleFPGrowth.scala    From Spark-2.3.1   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.fpm.FPGrowth
import org.apache.spark.rdd.RDD
// $example off$

object SimpleFPGrowth {

  def main(args: Array[String]) {

    val conf = new SparkConf().setAppName("SimpleFPGrowth")
    val sc = new SparkContext(conf)

    // $example on$
    val data = sc.textFile("data/mllib/sample_fpgrowth.txt")

    val transactions: RDD[Array[String]] = data.map(s => s.trim.split(' '))

    val fpg = new FPGrowth()
      .setMinSupport(0.2)
      .setNumPartitions(10)
    val model = fpg.run(transactions)

    model.freqItemsets.collect().foreach { itemset =>
      println(s"${itemset.items.mkString("[", ",", "]")},${itemset.freq}")
    }

    val minConfidence = 0.8
    model.generateAssociationRules(minConfidence).collect().foreach { rule =>
      println(s"${rule.antecedent.mkString("[", ",", "]")}=> " +
        s"${rule.consequent .mkString("[", ",", "]")},${rule.confidence}")
    }
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println 
Example 5
Source File: FPGrowthExample.scala    From Spark-2.3.1   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.fpm.FPGrowth


object FPGrowthExample {

  case class Params(
    input: String = null,
    minSupport: Double = 0.3,
    numPartition: Int = -1) extends AbstractParams[Params]

  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("FPGrowthExample") {
      head("FPGrowth: an example FP-growth app.")
      opt[Double]("minSupport")
        .text(s"minimal support level, default: ${defaultParams.minSupport}")
        .action((x, c) => c.copy(minSupport = x))
      opt[Int]("numPartition")
        .text(s"number of partition, default: ${defaultParams.numPartition}")
        .action((x, c) => c.copy(numPartition = x))
      arg[String]("<input>")
        .text("input paths to input data set, whose file format is that each line " +
          "contains a transaction with each item in String and separated by a space")
        .required()
        .action((x, c) => c.copy(input = x))
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName(s"FPGrowthExample with $params")
    val sc = new SparkContext(conf)
    val transactions = sc.textFile(params.input).map(_.split(" ")).cache()

    println(s"Number of transactions: ${transactions.count()}")

    val model = new FPGrowth()
      .setMinSupport(params.minSupport)
      .setNumPartitions(params.numPartition)
      .run(transactions)

    println(s"Number of frequent itemsets: ${model.freqItemsets.count()}")

    model.freqItemsets.collect().foreach { itemset =>
      println(s"${itemset.items.mkString("[", ",", "]")}, ${itemset.freq}")
    }

    sc.stop()
  }
}
// scalastyle:on println 
Example 6
Source File: FPGrowthExample.scala    From iolap   with Apache License 2.0
package org.apache.spark.examples.mllib

import scopt.OptionParser

import org.apache.spark.mllib.fpm.FPGrowth
import org.apache.spark.{SparkConf, SparkContext}


object FPGrowthExample {

  case class Params(
    input: String = null,
    minSupport: Double = 0.3,
    numPartition: Int = -1) extends AbstractParams[Params]

  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("FPGrowthExample") {
      head("FPGrowth: an example FP-growth app.")
      opt[Double]("minSupport")
        .text(s"minimal support level, default: ${defaultParams.minSupport}")
        .action((x, c) => c.copy(minSupport = x))
      opt[Int]("numPartition")
        .text(s"number of partition, default: ${defaultParams.numPartition}")
        .action((x, c) => c.copy(numPartition = x))
      arg[String]("<input>")
        .text("input paths to input data set, whose file format is that each line " +
          "contains a transaction with each item in String and separated by a space")
        .required()
        .action((x, c) => c.copy(input = x))
    }

    parser.parse(args, defaultParams).map { params =>
      run(params)
    }.getOrElse {
      sys.exit(1)
    }
  }

  def run(params: Params) {
    val conf = new SparkConf().setAppName(s"FPGrowthExample with $params")
    val sc = new SparkContext(conf)
    val transactions = sc.textFile(params.input).map(_.split(" ")).cache()

    println(s"Number of transactions: ${transactions.count()}")

    val model = new FPGrowth()
      .setMinSupport(params.minSupport)
      .setNumPartitions(params.numPartition)
      .run(transactions)

    println(s"Number of frequent itemsets: ${model.freqItemsets.count()}")

    model.freqItemsets.collect().foreach { itemset =>
      println(itemset.items.mkString("[", ",", "]") + ", " + itemset.freq)
    }

    sc.stop()
  }
} 
Example 7
Source File: SimpleFPGrowth.scala    From multi-tenancy-spark   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.fpm.FPGrowth
import org.apache.spark.rdd.RDD
// $example off$

object SimpleFPGrowth {

  def main(args: Array[String]) {

    val conf = new SparkConf().setAppName("SimpleFPGrowth")
    val sc = new SparkContext(conf)

    // $example on$
    val data = sc.textFile("data/mllib/sample_fpgrowth.txt")

    val transactions: RDD[Array[String]] = data.map(s => s.trim.split(' '))

    val fpg = new FPGrowth()
      .setMinSupport(0.2)
      .setNumPartitions(10)
    val model = fpg.run(transactions)

    model.freqItemsets.collect().foreach { itemset =>
      println(itemset.items.mkString("[", ",", "]") + ", " + itemset.freq)
    }

    val minConfidence = 0.8
    model.generateAssociationRules(minConfidence).collect().foreach { rule =>
      println(
        rule.antecedent.mkString("[", ",", "]")
          + " => " + rule.consequent .mkString("[", ",", "]")
          + ", " + rule.confidence)
    }
    // $example off$
  }
}
// scalastyle:on println 
Example 8
Source File: FPGrowthExample.scala    From multi-tenancy-spark   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.fpm.FPGrowth


object FPGrowthExample {

  case class Params(
    input: String = null,
    minSupport: Double = 0.3,
    numPartition: Int = -1) extends AbstractParams[Params]

  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("FPGrowthExample") {
      head("FPGrowth: an example FP-growth app.")
      opt[Double]("minSupport")
        .text(s"minimal support level, default: ${defaultParams.minSupport}")
        .action((x, c) => c.copy(minSupport = x))
      opt[Int]("numPartition")
        .text(s"number of partition, default: ${defaultParams.numPartition}")
        .action((x, c) => c.copy(numPartition = x))
      arg[String]("<input>")
        .text("input paths to input data set, whose file format is that each line " +
          "contains a transaction with each item in String and separated by a space")
        .required()
        .action((x, c) => c.copy(input = x))
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName(s"FPGrowthExample with $params")
    val sc = new SparkContext(conf)
    val transactions = sc.textFile(params.input).map(_.split(" ")).cache()

    println(s"Number of transactions: ${transactions.count()}")

    val model = new FPGrowth()
      .setMinSupport(params.minSupport)
      .setNumPartitions(params.numPartition)
      .run(transactions)

    println(s"Number of frequent itemsets: ${model.freqItemsets.count()}")

    model.freqItemsets.collect().foreach { itemset =>
      println(itemset.items.mkString("[", ",", "]") + ", " + itemset.freq)
    }

    sc.stop()
  }
}
// scalastyle:on println 
Example 9
Source File: FPGAlgorithm.scala    From pio-template-fpm   with Apache License 2.0
package org.template.fpm



import org.apache.predictionio.controller.P2LAlgorithm
import org.apache.predictionio.controller.Params
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.rdd.RDD
import grizzled.slf4j.Logger
import org.apache.spark.mllib.fpm.{FPGrowth,FPGrowthModel}
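
// NOTE: PreparedData, Query, PredictedResult, and ConsequentItem are defined in the other
// source files of the pio-template-fpm engine template; only the algorithm component is shown here.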

case class AlgorithmParams(
  val minSupport: Double,
  val minConfidence: Double,
  val numPartitions: Int
) extends Params

class FPGModel(
  val resultList: List[(String,Array[String],Double)]
) extends Serializable {}

class FPGAlgorithm(val ap: AlgorithmParams) extends P2LAlgorithm[PreparedData, FPGModel, Query, PredictedResult] {

  @transient lazy val logger = Logger[this.type]

  def train(sc: SparkContext, data: PreparedData): FPGModel = {
    println("Training FPM model.")
    val fpg = new FPGrowth().setMinSupport(ap.minSupport).setNumPartitions(ap.numPartitions)
    val model = fpg.run(data.transactions.cache)
    val res = model.generateAssociationRules(ap.minConfidence).map(x=>(x.antecedent.mkString(" "),x.consequent,x.confidence)).collect.toList

    new FPGModel(resultList=res)
  }

  def predict(model: FPGModel, query: Query): PredictedResult = {
    val qArr = query.items.toList.sorted.mkString(" ")
    val result = model.resultList.filter(x=>{x._1==qArr}).sortBy(_._3).map(x=>{new ConsequentItem(x._2,x._3)})

    PredictedResult(consequentItems=result.toArray)
  }
} 
Example 10
Source File: SimpleFPGrowth.scala    From sparkoscope   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.fpm.FPGrowth
import org.apache.spark.rdd.RDD
// $example off$

object SimpleFPGrowth {

  def main(args: Array[String]) {

    val conf = new SparkConf().setAppName("SimpleFPGrowth")
    val sc = new SparkContext(conf)

    // $example on$
    val data = sc.textFile("data/mllib/sample_fpgrowth.txt")

    val transactions: RDD[Array[String]] = data.map(s => s.trim.split(' '))

    val fpg = new FPGrowth()
      .setMinSupport(0.2)
      .setNumPartitions(10)
    val model = fpg.run(transactions)

    model.freqItemsets.collect().foreach { itemset =>
      println(itemset.items.mkString("[", ",", "]") + ", " + itemset.freq)
    }

    val minConfidence = 0.8
    model.generateAssociationRules(minConfidence).collect().foreach { rule =>
      println(
        rule.antecedent.mkString("[", ",", "]")
          + " => " + rule.consequent .mkString("[", ",", "]")
          + ", " + rule.confidence)
    }
    // $example off$
  }
}
// scalastyle:on println 
Example 11
Source File: FPGrowthExample.scala    From drizzle-spark   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.fpm.FPGrowth


object FPGrowthExample {

  case class Params(
    input: String = null,
    minSupport: Double = 0.3,
    numPartition: Int = -1) extends AbstractParams[Params]

  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("FPGrowthExample") {
      head("FPGrowth: an example FP-growth app.")
      opt[Double]("minSupport")
        .text(s"minimal support level, default: ${defaultParams.minSupport}")
        .action((x, c) => c.copy(minSupport = x))
      opt[Int]("numPartition")
        .text(s"number of partition, default: ${defaultParams.numPartition}")
        .action((x, c) => c.copy(numPartition = x))
      arg[String]("<input>")
        .text("input paths to input data set, whose file format is that each line " +
          "contains a transaction with each item in String and separated by a space")
        .required()
        .action((x, c) => c.copy(input = x))
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName(s"FPGrowthExample with $params")
    val sc = new SparkContext(conf)
    val transactions = sc.textFile(params.input).map(_.split(" ")).cache()

    println(s"Number of transactions: ${transactions.count()}")

    val model = new FPGrowth()
      .setMinSupport(params.minSupport)
      .setNumPartitions(params.numPartition)
      .run(transactions)

    println(s"Number of frequent itemsets: ${model.freqItemsets.count()}")

    model.freqItemsets.collect().foreach { itemset =>
      println(itemset.items.mkString("[", ",", "]") + ", " + itemset.freq)
    }

    sc.stop()
  }
}
// scalastyle:on println 
Example 12
import org.apache.spark.SparkContext
import org.apache.spark.mllib.fpm.FPGrowth


object SampleFPGrowthApp {
  def main(args: Array[String]) {
    val transactions = Seq(
      "r z h k p",
      "z y x w v u t s",
      "s x o n r",
      "x z y m t s q e",
      "z",
      "x z y r q t p")
      .map(_.split(" "))
    val sc = new SparkContext("local[2]", "Chapter 5 App")
    val rdd = sc.parallelize(transactions, 2).cache()

    val fpg = new FPGrowth()

    val model = fpg
      .setMinSupport(0.2)
      .setNumPartitions(1)
      .run(rdd)

    model.freqItemsets.collect().foreach { itemset =>
        println(itemset.items.mkString("[", ",", "]") + ", " + itemset.freq)
    }
  }
} 
Example 13
package com.sparksample

import org.apache.spark.mllib.fpm.FPGrowth


object SampleFPGrowthApp {
  def main(args: Array[String]) {
    val transactions = Seq(
      "r z h k p",
      "z y x w v u t s",
      "s x o n r",
      "x z y m t s q e",
      "z",
      "x z y r q t p")
      .map(_.split(" "))
    val sc = Util.sc
    val rdd = sc.parallelize(transactions, 2).cache()

    val fpg = new FPGrowth()

    val model6 = fpg
      .setMinSupport(0.2)
      .setNumPartitions(1)
      .run(rdd)

    model6.freqItemsets.collect().foreach { itemset =>
        println(itemset.items.mkString("[", ",", "]") + ", " + itemset.freq)
    }
  }
} 
Example 14
package com.sparksample

import org.apache.spark.mllib.fpm.FPGrowth
import org.apache.spark.mllib.recommendation.Rating

import scala.collection.mutable.ListBuffer


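    // NOTE: this listing is an excerpt. The enclosing object and main method are omitted, and
    // `sc` and `rawData` (the raw MovieLens ratings) are assumed to be provided by the
    // surrounding chapter code, e.g. via the Util helper used elsewhere in this package.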
    val rawRatings = rawData.map(_.split("\t").take(3))
    rawRatings.first()
    val ratings = rawRatings.map { case Array(user, movie, rating) => Rating(user.toInt, movie.toInt, rating.toDouble) }
    val ratingsFirst = ratings.first()
    println(ratingsFirst)

    val movies = Util.getMovieData()
    val titles = movies.map(line => line.split("\\|").take(2)).map(array => (array(0).toInt, array(1))).collectAsMap()
    titles(123)

    var eRDD = sc.emptyRDD
    var z = Seq[String]()

    val l = ListBuffer()
    val aj = new Array[String](400)
    var i = 0
    for( a <- 501 to 900) {
      val moviesForUserX = ratings.keyBy(_.user).lookup(a)
      val moviesForUserX_10 = moviesForUserX.sortBy(-_.rating).take(10)
      val moviesForUserX_10_1 = moviesForUserX_10.map(r => r.product)
      var temp = ""
      for( x <- moviesForUserX_10_1){
        if(temp.equals(""))
          temp = x.toString
        else {
          temp =  temp + " " + x
        }
      }

      aj(i) = temp
      i += 1
    }
    z = aj

    val transaction = z.map(_.split(" "))
    val rddx = sc.parallelize(transaction, 2).cache()

    val fpg = new FPGrowth()
    val model = fpg
      .setMinSupport(0.1)
      .setNumPartitions(1)
      .run(rddx)

    model.freqItemsets.collect().foreach { itemset =>
      println(itemset.items.mkString("[", ",", "]") + ", " + itemset.freq)
    }
    sc.stop()
  }

} 
Example 15
import org.apache.spark.SparkContext
import org.apache.spark.mllib.fpm.FPGrowth
import org.apache.spark.mllib.recommendation.Rating

import scala.collection.mutable.ListBuffer


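    // NOTE: this listing is an excerpt. The enclosing object and main method are omitted, and
    // `sc`, `PATH`, and `rawData` (the raw MovieLens ratings) are assumed to be defined by the
    // surrounding chapter code.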
    val rawRatings = rawData.map(_.split("\t").take(3))
    rawRatings.first()
    // 14/03/30 13:22:44 INFO SparkContext: Job finished: first at <console>:21, took 0.003703 s
    // res25: Array[String] = Array(196, 242, 3)

    val ratings = rawRatings.map { case Array(user, movie, rating) => Rating(user.toInt, movie.toInt, rating.toDouble) }
    val ratingsFirst = ratings.first()
    println(ratingsFirst)

    val userId = 789
    val K = 10

    val movies = sc.textFile(PATH + "/ml-100k/u.item")
    val titles = movies.map(line => line.split("\\|").take(2)).map(array => (array(0).toInt, array(1))).collectAsMap()
    titles(123)

    var eRDD = sc.emptyRDD
    var z = Seq[String]()

    val l = ListBuffer()
    val aj = new Array[String](100)
    var i = 0
    for( a <- 801 to 900) {
      val moviesForUserX = ratings.keyBy(_.user).lookup(a)
      val moviesForUserX_10 = moviesForUserX.sortBy(-_.rating).take(10)
      val moviesForUserX_10_1 = moviesForUserX_10.map(r => r.product)
      var temp = ""
      for( x <- moviesForUserX_10_1){
        temp = temp + " " + x
        println(temp)

      }

      aj(i) = temp
      i += 1
    }
    z = aj
    // Trim the leading space added above so split(" ") does not yield an empty-string item.
    val transaction2 = z.map(_.trim.split(" "))

    val rddx = sc.parallelize(transaction2, 2).cache()

    val fpg = new FPGrowth()
    val model6 = fpg
      .setMinSupport(0.1)
      .setNumPartitions(1)
      .run(rddx)

    model6.freqItemsets.collect().foreach { itemset =>
      println(itemset.items.mkString("[", ",", "]") + ", " + itemset.freq)
    }
    sc.stop()
  }

} 
Example 16
Source File: FPGrowthTestv6.scala    From Machine-Learning-with-Spark-Second-Edition   with MIT License
package org.apache.spark.examples.mllib

import org.apache.spark.mllib.fpm.FPGrowth
import org.apache.spark.{SparkConf, SparkContext}
import scopt.OptionParser

import scala.reflect.runtime.universe._

object FPGrowthTestv6 {

  case class Params(
    input: String = null,
    minSupport: Double = 0.3,
    numPartition: Int = -1) extends AbstractParams[Params]

  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("FPGrowthTestv6") {
      head("FPGrowth: an example FP-growth app.")
      opt[Double]("minSupport")
        .text(s"minimal support level, default: ${defaultParams.minSupport}")
        .action((x, c) => c.copy(minSupport = x))
      opt[Int]("numPartition")
        .text(s"number of partition, default: ${defaultParams.numPartition}")
        .action((x, c) => c.copy(numPartition = x))
      arg[String]("<input>")
        .text("input paths to input data set, whose file format is that each line " +
          "contains a transaction with each item in String and separated by a space")
        .required()
        .action((x, c) => c.copy(input = x))
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName(s"FPGrowthTestv6 with $params")
    val sc = new SparkContext(conf)
    val transactions = sc.textFile(params.input).map(_.split(" ")).cache()

    println(s"Number of transactions: ${transactions.count()}")

    val model = new FPGrowth()
      .setMinSupport(params.minSupport)
      .setNumPartitions(params.numPartition)
      .run(transactions)

    println(s"Number of frequent itemsets: ${model.freqItemsets.count()}")

    model.freqItemsets.collect().foreach { itemset =>
      println(itemset.items.mkString("[", ",", "]") + ", " + itemset.freq)
    }

    sc.stop()
  }
  abstract class AbstractParams[T: TypeTag] {
    private def tag: TypeTag[T] = typeTag[T]
    
    override def toString: String = {
      val tpe = tag.tpe
      val allAccessors = tpe.declarations.collect {
        case m: MethodSymbol if m.isCaseAccessor => m
      }
      val mirror = runtimeMirror(getClass.getClassLoader)
      val instanceMirror = mirror.reflect(this)
      allAccessors.map { f =>
        val paramName = f.name.toString
        val fieldMirror = instanceMirror.reflectField(f)
        val paramValue = fieldMirror.get
        s" $paramName:\t$paramValue"
      }.mkString("{\n", ",\n", "\n}")
    }
  }
} 
Example 17
Source File: L9-14FPMining.scala    From prosparkstreaming   with Apache License 2.0
package org.apress.prospark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.fpm.FPGrowth
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext

object FPMiningApp {

  def main(args: Array[String]) {
    if (args.length != 3) {
      System.err.println(
        "Usage: FPMiningApp <appname> <batchInterval> <iPath>")
      System.exit(1)
    }
    val Seq(appName, batchInterval, iPath) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt))

    val minSupport = 0.4

    ssc.textFileStream(iPath)
      .map(r => r.split(" "))
      .foreachRDD(transactionRDD => {
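        // A new FP-growth model is fit independently on the transactions of each micro-batch.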
        val fpg = new FPGrowth()
          .setMinSupport(minSupport)
        val model = fpg.run(transactionRDD)

        model.freqItemsets
          .collect()
          .foreach(itemset => println("Items: %s, Frequency: %s".format(itemset.items.mkString(" "), itemset.freq)))
      })

    ssc.start()
    ssc.awaitTermination()
  }

} 
Example 18
package com.github.maxpumperla.ml_spark.streaming

import org.apache.spark.mllib.fpm.{FPGrowth, PrefixSpan}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}


object MSNBCPatternMining extends App {

    val conf = new SparkConf()
      .setAppName("MSNBC.com data pattern mining")
      .setMaster("local[4]")
    val sc = new SparkContext(conf)

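    // Small smoke test: fit FP-growth on a toy two-transaction data set (the resulting model is discarded).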
    val transactionTest = sc.parallelize(Array(Array("A", "B", "C"), Array("B", "C", "A")))
    val fp = new FPGrowth().setMinSupport(0.8).setNumPartitions(5)
    fp.run(transactionTest)

    val transactions: RDD[Array[Int]] = sc.textFile("./msnbc990928.seq") map { line =>
      line.split(" ").map(_.toInt)
    }

    // NOTE: Caching data is recommended
    val uniqueTransactions: RDD[Array[Int]] = transactions.map(_.distinct).cache()


    val fpGrowth = new FPGrowth().setMinSupport(0.01)
    val model = fpGrowth.run(uniqueTransactions)
    val count = uniqueTransactions.count()

    model.freqItemsets.collect().foreach { itemset =>
      if (itemset.items.length >= 3)
        println(itemset.items.mkString("[", ",", "]") + ", " + itemset.freq / count.toDouble )
    }

    val rules = model.generateAssociationRules(confidence = 0.4)
    rules.collect().foreach { rule =>
      println("[" + rule.antecedent.mkString(",") + "=>"
        + rule.consequent.mkString(",") + "]," + (100 * rule.confidence).round / 100.0)
    }

    val frontPageConseqRules = rules.filter(_.consequent.head == 1)
    frontPageConseqRules.count
    frontPageConseqRules.filter(_.antecedent.contains(2)).count
    rules.filter(_.antecedent.contains(7)).count


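    // PrefixSpan works on sequences of itemsets, so each item is wrapped into a one-element itemset.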
    val sequences: RDD[Array[Array[Int]]] = transactions.map(_.map(Array(_))).cache()

    val prefixSpan = new PrefixSpan().setMinSupport(0.005).setMaxPatternLength(15)
    val psModel = prefixSpan.run(sequences)

    psModel.freqSequences.map(fs => (fs.sequence.length, 1))
      .reduceByKey(_ + _)
      .sortByKey()
      .collect()
      .foreach(fs => println(s"${fs._1}: ${fs._2}"))

    psModel.freqSequences
      .map(fs => (fs.sequence.length, fs))
      .groupByKey()
      .map(group => group._2.reduce((f1, f2) => if (f1.freq > f2.freq) f1 else f2))
      .map(_.sequence.map(_.mkString("[", ", ", "]")).mkString("[", ", ", "]"))
      .collect.foreach(println)


    psModel.freqSequences
      .map(fs => (fs.sequence.map(_.mkString("[", ", ", "]")).mkString("[", ", ", "]"), 1))
      .reduceByKey(_ + _)
      .reduce( (f1, f2) => if (f1._2 > f2._2) f1 else f2 )


    psModel.freqSequences.reduce( (f1, f2) => if (f1.freq > f2.freq) f1 else f2 )
    psModel.freqSequences.filter(_.sequence.length == 1).map(_.sequence.toString).collect.foreach(println)

    psModel.freqSequences.collect().foreach {
      freqSequence =>
        println(
          freqSequence.sequence.map(_.mkString("[", ", ", "]")).mkString("[", ", ", "]") + ", " + freqSequence.freq
        )
    }
} 
Example 19
Source File: SimpleFPGrowth.scala    From drizzle-spark   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.fpm.FPGrowth
import org.apache.spark.rdd.RDD
// $example off$

object SimpleFPGrowth {

  def main(args: Array[String]) {

    val conf = new SparkConf().setAppName("SimpleFPGrowth")
    val sc = new SparkContext(conf)

    // $example on$
    val data = sc.textFile("data/mllib/sample_fpgrowth.txt")

    val transactions: RDD[Array[String]] = data.map(s => s.trim.split(' '))

    val fpg = new FPGrowth()
      .setMinSupport(0.2)
      .setNumPartitions(10)
    val model = fpg.run(transactions)

    model.freqItemsets.collect().foreach { itemset =>
      println(itemset.items.mkString("[", ",", "]") + ", " + itemset.freq)
    }

    val minConfidence = 0.8
    model.generateAssociationRules(minConfidence).collect().foreach { rule =>
      println(
        rule.antecedent.mkString("[", ",", "]")
          + " => " + rule.consequent .mkString("[", ",", "]")
          + ", " + rule.confidence)
    }
    // $example off$
  }
}
// scalastyle:on println