Spark tokenization and TF-IDF
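
This post walks through TF-IDF feature extraction with Spark ML in the spark-shell: a Tokenizer splits each sentence into lowercase words, HashingTF turns each word list into a sparse term-frequency vector via the hashing trick, and an IDF model rescales the raw counts by inverse document frequency.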

scala> import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer}

scala> import spark.implicits._

scala> val sentenceData = spark.createDataFrame(Seq(
     |   (0, "I heard about Spark and I love Spark"),
     |   (0, "I wish Java could use case classes"),
     |   (1, "Logistic regression models are neat")
     | )).toDF("label", "sentence")

scala> val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")

scala> val wordsData = tokenizer.transform(sentenceData)

scala> wordsData.show(false)
+-----+------------------------------------+---------------------------------------------+
|label|sentence                            |words                                        |
+-----+------------------------------------+---------------------------------------------+
|0    |I heard about Spark and I love Spark|[i, heard, about, spark, and, i, love, spark]|
|0    |I wish Java could use case classes  |[i, wish, java, could, use, case, classes]   |
|1    |Logistic regression models are neat |[logistic, regression, models, are, neat]    |
+-----+------------------------------------+---------------------------------------------+
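
As the output shows, Tokenizer lowercases the text and splits on whitespace only, so punctuation would stay attached to its word. When that matters, a RegexTokenizer can be swapped in. A minimal sketch, not part of the original session (the "\\W+" pattern, which splits on runs of non-word characters, is my choice):

import org.apache.spark.ml.feature.RegexTokenizer

// Split on runs of non-word characters instead of plain whitespace,
// so "Spark." and "Spark" become the same token.
val regexTokenizer = new RegexTokenizer().
  setInputCol("sentence").setOutputCol("words").setPattern("\\W+")
val regexWordsData = regexTokenizer.transform(sentenceData)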

scala> val hashingTF = new HashingTF().
     |   setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(2000)
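
setNumFeatures(2000) fixes the dimensionality of the output vectors: every word is hashed into one of 2000 buckets, and the per-bucket counts form the term-frequency vector. A small bucket count saves memory but lets distinct words collide into the same index (Spark's default is 2^18 = 262144).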

scala> val featurizedData = hashingTF.transform(wordsData)
featurizedData: org.apache.spark.sql.DataFrame = [label: int, sentence: string ... 2 more fields]

scala> featurizedData.select("rawFeatures").show(false)
+---------------------------------------------------------------------+
|rawFeatures                                                          |
+---------------------------------------------------------------------+
|(2000,[240,333,1105,1329,1357,1777],[1.0,1.0,2.0,2.0,1.0,1.0])       |
|(2000,[213,342,489,495,1329,1809,1967],[1.0,1.0,1.0,1.0,1.0,1.0,1.0])|
|(2000,[286,695,1138,1193,1604],[1.0,1.0,1.0,1.0,1.0])                |
+---------------------------------------------------------------------+
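
Each row is a SparseVector printed as (size, indices, values): 2000 buckets, the hashed index of each distinct word, and its count in that sentence. The two 2.0 values in the first row belong to "i" and "spark", which each occur twice. Hashing is fast and stateless but not invertible; when an explicit vocabulary is needed (for example, to map indices back to words), CountVectorizer is an alternative. A minimal sketch, not part of the original session (vocabSize 2000 mirrors the hashing setup):

import org.apache.spark.ml.feature.CountVectorizer

// Learn an explicit vocabulary from the corpus instead of hashing;
// cvModel.vocabulary maps each index back to its word.
val cvModel = new CountVectorizer().
  setInputCol("words").setOutputCol("rawFeatures").setVocabSize(2000).
  fit(wordsData)
val countsData = cvModel.transform(wordsData)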


scala> val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
idf: org.apache.spark.ml.feature.IDF = idf_dec3903d338a

scala> val idfModel = idf.fit(featurizedData)
idfModel: org.apache.spark.ml.feature.IDFModel = idf_dec3903d338a
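
IDF is an estimator rather than a plain transformer: fit makes a pass over featurizedData to count, for each bucket, how many documents contain it, and stores the resulting weights in the returned IDFModel; transform then multiplies each raw count by its weight.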

scala> val rescaledData = idfModel.transform(featurizedData)
rescaledData: org.apache.spark.sql.DataFrame = [label: int, sentence: string ... 3 more fields]

scala> rescaledData.select("features", "label").take(3).foreach(println)
[(2000,[240,333,1105,1329,1357,1777],[0.6931471805599453,0.6931471805599453,1.3862943611198906,0.5753641449035617,0.6931471805599453,0.6931471805599453]),0]
[(2000,[213,342,489,495,1329,1809,1967],[0.6931471805599453,0.6931471805599453,0.6931471805599453,0.6931471805599453,0.28768207245178085,0.6931471805599453,0.6931471805599453]),0]
[(2000,[286,695,1138,1193,1604],[0.6931471805599453,0.6931471805599453,0.6931471805599453,0.6931471805599453,0.6931471805599453]),1]
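
These weights can be checked by hand. Spark ML computes idf(t) = ln((m + 1) / (df(t) + 1)), where m is the number of documents and df(t) is the number of documents containing term t. Here m = 3, so a word that appears in a single document gets ln(4/2) ≈ 0.6931; "i" appears in two documents and gets ln(4/3) ≈ 0.2877, and since it occurs twice in the first sentence, its entry at index 1329 is 2 × 0.2877 ≈ 0.5754. A quick check of the arithmetic:

// Spark ML's IDF formula: ln((m + 1) / (df + 1)), with m = 3 documents here.
val m = 3.0
math.log((m + 1) / (1 + 1))     // df = 1         -> 0.6931471805599453
math.log((m + 1) / (2 + 1))     // df = 2 ("i")   -> 0.28768207245178085
2 * math.log((m + 1) / (2 + 1)) // tf = 2, df = 2 -> 0.5753641449035617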

 
