Spark中文文本分析建模

实用的朴素贝叶斯模型建模
建模过程主要是把文本转化成向量然后再作分析
数据格式:

0,善良 美丽
1,丑陋 阴险 卑鄙
0,温和
.......
注:前面是给文章贴的标签,后面是文章的分词,分词可以找关于分词的文章去查看,后面我也会写关于分词的文章
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.SparkContext
import org.apache.spark.ml.feature.Tokenizer
import org.apache.spark.ml.feature.HashingTF
import org.apache.spark.sql.Row
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.feature.IDF
import org.apache.spark.ml.classification.NaiveBayes
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator

class CreatModel {

}
object CreatModel{
  case class RawDataRecord(category: String, text: String)

  def main(args: Array[String]): Unit = {
    val config = new SparkConf().setAppName("createModel").setMaster("local[4]");
    val sc =new  SparkContext(config);
    val spark = SparkSession.builder().config(config).config("spark.sql.warehouse.dir", "warehouse/dir").getOrCreate();
    import spark.implicits._
    //分数据
    val Array(srcDF,testDF) = sc.textFile("D:\\decstop\\testFiles\\sougou").map {
      x =>
        val data = x.split(",")
        RawDataRecord(data(0),data(1))
    }.toDF().randomSplit(Array(0.7,0.3))

    //分词
    val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
    val wordsData = tokenizer.transform(srcDF)
    wordsData.show(false)
    val testtokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
    val testwordsData = testtokenizer.transform(testDF)

    //文档词频
    val hashingTF =
      new HashingTF().setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(100)
    val featurizedData = hashingTF.transform(wordsData)

    val testhashingTF =
      new HashingTF().setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(100)
    val testfeaturizedData = testhashingTF.transform(testwordsData)

    //逆文档词频
    val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
    val idfModel = idf.fit(featurizedData)
    val rescaledData = idfModel.transform(featurizedData)

    val testidf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
    val testidfModel = testidf.fit(testfeaturizedData)
    val testrescaledData = testidfModel.transform(testfeaturizedData)
    rescaledData.show(false) 
    //转换成贝叶斯的输入格式
    val trainDataRdd = rescaledData.select($"category",$"features").map {
      case Row(label: String, features:Vector) =>
        LabeledPoint(label.toDouble, Vectors.dense(features.toArray))
    }

    val testtrainDataRdd = testrescaledData.select($"category",$"features").map {
      case Row(label: String, features:Vector) =>
        LabeledPoint(label.toDouble, Vectors.dense(features.toArray))
    }

    val model =new NaiveBayes().fit(trainDataRdd)

    val predictions = model.transform(testtrainDataRdd)
    println("predictln out:");
    predictions.show();
    model.write.overwrite().save("resoult")

    //模型评估
    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    val accuracy = evaluator.evaluate(predictions)
    println("accuracy out :")
    println("Accuracy:"+accuracy)

  }
}

 


posted @ 2017-05-16 11:55  大葱拌豆腐  阅读(2049)  评论(0编辑  收藏  举报