Scala source code of CountVectorizerDemo

Project: Scala-and-Spark-for-Big-Data-Analytics (GitHub Link)

Scala-and-Spark-for-Big-Data-Analytics-master
- Chapter08
  - chapter8
    - UDFanduserdefinedfunctioncreated_13.txt
    - executionoftheplangroupbyandaggapi_9.txt
    - multipleaggregationsatsametime_10and11.txt
    - innerjoin_40.txt
    - Last_22.txt
    - dataframeastable_7.txt
    - Implicitschema_14.txt
    - variance_27.txt
    - aggoperationondataframecolumns_8.txt
    - averageormean_24.txt
    - sortoperationontable_7and8.txt
    - sum_25.txt
    - Filtersandthenexecutionoffilters_12.txt
    - First_21.txt
    - cube_30.txt
    - skewness_26.txt
    - groupingdatausinggroupby_8.txt
    - encoders_16.txt
    - pivots_11.txt
    - oncedataframeloaded_6.txt
    - outerjoin_44.txt
    - windowfunction_32.txt
    - leftouterjoin_42.txt
    - kurtosis_25.txt
    - crossjoin_47.txt
    - count_20.txt
    - approximatecountdistinct_22and23.txt
    - leftsemijoin_46.txt
    - ntiles_33.txt
    - encoderscontinued_17.txt
    - leftantijoin_45.txt
    - explicitschema_15.txt
    - standarddeviation_27.txt
    - min_23.txt
    - loadingdatasets_18.txt
    - loadingcsvwithheaderintodataframe_6.txt
    - max_24.txt
    - chainingoperationstogether_9and10.txt
    - jointtypesworkingwithadataset_39.txt
    - savingdatasets_18.txt
    - rightouterjoin_43.txt
    - covariance_28.txt
    - rollup_29.txt
    - groupby_29.txt
- Chapter13
  - DecisionTreeClassificationExample.scala
  - pom.xml
  - OneVsRestExample.scala
  - NaiveBayesCrossValidation.scala
  - NaiveBayes.scala
- Chapter18
  - AssertResult.scala
  - TransformationTestWithSparkTestingBase.scala
  - SimpleScalaTest.scala
  - wordCount.scala
  - pom.xml
  - CompileOrNot.scala
  - wordCounterTestDemo.scala
  - MakingTaskSerilazible.scala
  - wordCountRDD.scala
  - wordCountTest.scala
  - myCustomLog.scala
  - myCustomLogwithClosureSerializable.scala
  - myCustomLogwithClosure.scala
  - ExpectedException.scala
  - VectorSpace.scala
  - DebugTestSBT.scala
  - DistanceFromCentroid.scala
- Chapter16
  - KyroRegistrationDemo.scala
  - MyLog.scala
  - MyLogCompleteDemo.scala
- LICENSE
- Chapter12
  - topicModellingwithLDA.scala
  - MovieRecommendation.scala
  - LogisticRegressionExample.scala
  - CreditRiskAnalysisTuning.scala
- .gitattributes
- Chapter19
  - SparkR_Codes
    - MLP_SparkR.R
    - Visualization_SparkR.R
    - Data_manipulations_SparkR.R
    - SQL_with_SparkR.R
    - SparkR_configuration.R
    - UsingExternalDataSource_SparkR.R
  - PySpark_Codes
    - LIVSVM-with_PySpark.py
    - KMeans.py
    - CSV_with_PySpark.py
    - HelloWorld.py
    - UDF_with_PySpark.py
    - pca_example.py
    - sample_raw_file.txt
    - Text_file_with_PySpark.py
- Chapter10
  - chapter10
    - connectedcomponents_19and20.txt
    - usingcollectfunctionongraph_13and14.txt
    - shortestpathcontinued_23.txt
    - EdgeRDD_13.txt
    - filter_15.txt
    - aggregatemessages_17.txt
    - friendstructure_2and3.txt
    - vertexIDNameOperation_12.txt
    - pagerank_27.txt
    - shortestpath_21.txt
    - graphoperators_14.txt
    - trianglecounting_17.txt
    - mapvalues_15.txt
- AppendixA
  - chapter20
    - alluxiowithspark_pg17.txt
- Chapter15
  - chapter15
    - IDF_pg16and17.txt
    - Tokenizer_pg9.txt
    - Stopwordsremoval_pg11.txt
    - textclassification_pg27to30.txt
    - ngrams_pg13.txt
    - ldatopicmonitor_pg23.txt
    - createdataframe_pg8.txt
    - countvectorizer_pg21.txt
    - hashingTF_pg15and16.txt
    - word2vec_pg19.txt
- Chapter14
  - B05877_13_Dataset
    - B05877_13_Dataset
      - Saratoga NY Homes.txt
  - B05877_13_Codes
    - B05877_13_Codes
      - BisectingKMeansDemo.scala
      - pom.xml
      - KMeansDemo.scala
      - GaussianMixtureModelDemo.scala
- data
  - data
    - users.txt
    - statesTaxRates.csv
    - streamfiles
      - new3.txt
      - new1.txt
    - wiki1.txt
    - friends.txt
    - statesPopulation.csv
- README.md
- Chapter02
  - ClassesinScalaTest.scala
  - ListOperationTest.scala
  - PublicAccessScala.scala
  - PatternMatchingDemo2.scala
  - DataTypes.scala
  - ScalaGenericsForStackt.scala
  - UsingExtends.scala
  - CaseClass.scala
  - AbstractClass.scala
  - Variables.scala
  - ListScalaDemo.scala
  - Constructors.scala
  - ListReverseTest.scala
  - ForClasesDemo.scala
  - UsingGenericsForLinkedList.scala
  - AccesibilityScala.scala
  - TestOperationMaxMin.scala
  - LazyVal.scala
  - Immutability.scala
  - ListAccessingwithOperator.scala
  - MapsTestDemo.scala
  - Dimension.scala
  - PatternMatchingDemo1.scala
- Chapter03
  - UsingTryCatchFinally2.scala
  - ScalaExceptionHandling.scala
  - TryCatchFinally.scala
  - UsingRange.scala
  - CurryingHOF.scala
  - RunOneTaskbutBlock.scala
  - filterExample.scala
  - mapExample.scala
  - MonadiacExample.scala
  - PureandNonPureFunction.scala
  - ListScala.scala
  - flatMapExample.scala
  - HigherOrderFunction.scala
  - FunctionAsReturnValue.scala
  - Either.scala
  - CollectionExample.scala
- Chapter07
  - chapter7
    - AccumulatorsExample.txt
    - Aggregations_17.txt
    - yearaskeypairRDD_17.txt
    - sequenceRDD_8.txt
    - Accumulator_37.txt
    - RDDsequenceofnumbersAdding2toeachelement_2.txt
    - CoGroupedRDD_9and10.txt
    - HadoopRDD_14and15.txt
    - aggregationsExample.txt
    - RDDsequenceofnumbers_2.txt
    - creatingbroadcastvariables_33and34.txt
    - unionRDD_12and13.txt
    - RDDpairs_5.txt
    - StateaskeypairRDD_17.txt
    - exampleofreusingadestroyingbroadcastvariables_35and36.txt
    - DoubleRDD_6and7.txt
    - transformationofintegerRDDmultiplyingbyanotherint_34.txt
    - shuffledRDD_11.txt
    - rangepartitioner_26.txt
    - harshpartitioner_25.txt
    - partitioningandshuffling_23.txt
    - RDDcombinationofStringandInt_3and4.txt
    - RDDsequenceofnumbersMultiplyingeachby3_2.txt
    - RDDeachuppercase_4.txt
    - cleaningbroadcastvariables_35.txt
    - newHadoopRDD_16.txt
    - destroyingbroadcastvariables_35and36.txt
    - RDDlineageusingtoDebugString_2and3.txt
- Chapter11
  - CancerAnalysisusingLogisticRegression.scala
  - StringIndexerDemo.scala
  - RandomForestDemo.scala
  - PCAExample.scala
  - CountVectorizerDemo.scala
  - OneHotEncoderDemo2.scala
  - topicModellingwithLDA.scala
  - TockenizerExample.scala
  - MovieRecommendation.scala
  - LogisticRegressionExample.scala
  - PCA_LinearRegression_Demo.scala
  - StopWordsRemoverExample.scala
  - CreditRiskAnalysisTuning.scala
- .gitignore
- Chapter04
  - MapExample.scala
  - OptionsExample.scala
  - ListExample.scala
  - UsingImplicitConversion.scala
  - UsingInfix.scala
  - ExistsExample.scala
  - ArrayExample.scala
  - ScalaOptions.scala
  - SetExample.scala
  - TupleExample.scala
  - UsingFatArrow.scala
- AppendixB
  - BankDatawithZepplein.scala
  - SpamFilteringNotebook.json
  - SpamFilteringDemo.scala
- Chapter06
  - chapter6
    - reduce_42.txt
    - repartition_39.txt
    - readingdatafromexternalsource_22.txt
    - coalesce_38.txt
    - caching_47.txt
    - transformationofanexistingRDD_22.txt
    - wholeTextFile_48and49.txt
    - loadingdataasRDD_26.txt
    - count_43and44.txt
    - filterfunction_37.txt
    - parallelizingconnection_21.txt
    - mapfunction_34.txt
    - flatmapfunction_35and36.txt
    - persist_47.txt
    - savingRDD_49.txt
    - collect_45.txt
    - textFile_48.txt
- Chapter09
  - chapter9
    - twitterstreamexample_18.txt
    - textfilestreamexample_14.txt
    - structuredStreaming_40.txt
    - startingstreamingcontext_11.txt
    - structuredStreaming_42.txt
    - creatingstreamingcontext_10and11.txt
    - streamingcontext from sparkcontext steps_17and18.txt
    - stoppingstreamingcontext_11and12.txt
    - twitterstreamexample_15.txt

package com.chapter11.SparkMachineLearning
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.feature.{ CountVectorizer, CountVectorizerModel }

/**
 * Demonstrates Spark ML's `CountVectorizer` on a small in-memory corpus.
 *
 * Builds a DataFrame with an `id` column and a `name` column holding an array
 * of tokens, fits a `CountVectorizerModel` on it, and prints both the input
 * rows and the transformed rows (which carry the added `features` column).
 */
object CountVectorizerDemo {

  /**
   * Runs the demo end to end on a local Spark session.
   *
   * The session is always stopped before returning, including when any of the
   * Spark steps throws.
   *
   * @param args command-line arguments (not read by this program)
   */
  def main(args: Array[String]): Unit = {
    // NOTE(review): "E:/Exp/" is a machine-specific warehouse path kept from the
    // original; confirm or make it configurable before running elsewhere.
    val spark = SparkSession
      .builder
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      .appName("CountVectorizerDemo")
      .getOrCreate()

    try {
      // Each row is one "document": an id plus the tokens it contains.
      val df = spark.createDataFrame(
        Seq((0, Array("Jason", "David")),
          (1, Array("David", "Martin")),
          (2, Array("Martin", "Jason")),
          (3, Array("Jason", "Daiel")),
          (4, Array("Daiel", "Martin")),
          (5, Array("Moahmed", "Jason")),
          (6, Array("David", "David")),
          (7, Array("Jason", "Martin")))).toDF("id", "name")

      df.show(false)

      // Fit a CountVectorizerModel from the corpus: the vocabulary is capped at
      // 3 terms, and a term must occur in at least 2 rows to be eligible.
      val cvModel: CountVectorizerModel = new CountVectorizer()
        .setInputCol("name")
        .setOutputCol("features")
        .setVocabSize(3)
        .setMinDF(2)
        .fit(df)

      // Append the term-count vectors as the "features" column and print them
      // untruncated.
      val feature = cvModel.transform(df)
      feature.show(false)
    } finally {
      // Release the local Spark session even if any step above failed.
      spark.stop()
    }
  }
}