Spark 分词（Tokenizer）与 TF-IDF 特征提取示例（spark-shell 交互记录）
scala> import org.apache.spark.ml.feature.{HashingTF,IDF,Tokenizer}
scala> import spark.implicits._
scala> val sentenceData = spark.createDataFrame(Seq(
| (0, "I heard about Spark and I love Spark"),
| (0, "I wish Java could use case classes"),
| (1, "Logistic regression models are neat")
| )).toDF("label", "sentence")
scala> val tokenizer=new Tokenizer().setInputCol("sentence").setOutputCol("words")
scala> val wordsData=tokenizer.transform(sentenceData)
scala> wordsData.show(false)
+-----+------------------------------------+---------------------------------------------+
|label|sentence |words |
+-----+------------------------------------+---------------------------------------------+
|0 |I heard about Spark and I love Spark|[i, heard, about, spark, and, i, love, spark]|
|0 |I wish Java could use case classes |[i, wish, java, could, use, case, classes] |
|1 |Logistic regression models are neat |[logistic, regression, models, are, neat] |
+-----+------------------------------------+---------------------------------------------+
scala> val hashingTF=new HashingTF().
| setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(2000)
scala> val featurizedData = hashingTF.transform(wordsData)
featurizedData: org.apache.spark.sql.DataFrame = [label: int, sentence: string ... 2 more fields]
scala> featurizedData.select("rawFeatures").show(false)
+---------------------------------------------------------------------+
|rawFeatures |
+---------------------------------------------------------------------+
|(2000,[240,333,1105,1329,1357,1777],[1.0,1.0,2.0,2.0,1.0,1.0]) |
|(2000,[213,342,489,495,1329,1809,1967],[1.0,1.0,1.0,1.0,1.0,1.0,1.0])|
|(2000,[286,695,1138,1193,1604],[1.0,1.0,1.0,1.0,1.0]) |
+---------------------------------------------------------------------+
scala> val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
idf: org.apache.spark.ml.feature.IDF = idf_dec3903d338a
scala> val idfModel = idf.fit(featurizedData)
idfModel: org.apache.spark.ml.feature.IDFModel = idf_dec3903d338a
scala> val rescaledData = idfModel.transform(featurizedData)
rescaledData: org.apache.spark.sql.DataFrame = [label: int, sentence: string ... 3 more fields]
scala> rescaledData.select("features", "label").take(3).foreach(println)
[(2000,[240,333,1105,1329,1357,1777],[0.6931471805599453,0.6931471805599453,1.3862943611198906,0.5753641449035617,0.6931471805599453,0.6931471805599453]),0]
[(2000,[213,342,489,495,1329,1809,1967],[0.6931471805599453,0.6931471805599453,0.6931471805599453,0.6931471805599453,0.28768207245178085,0.6931471805599453,0.6931471805599453]),0]
[(2000,[286,695,1138,1193,1604],[0.6931471805599453,0.6931471805599453,0.6931471805599453,0.6931471805599453,0.6931471805599453]),1]

可以看到，features 列中每个词的权重已由原始词频（TF）乘上逆文档频率（IDF）：在多篇文档中都出现的词（如哈希值 1329 对应的 "i"，在三篇文档中出现两篇）权重被降低为 0.2877/0.5754，而只出现在单篇文档中的词权重较高（0.6931），体现了 TF-IDF 对常见词的惩罚作用。