import breeze.linalg
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.MultilayerPerceptronClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{IndexToString, StringIndexer, Word2Vec}
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
//http://qkxue.net/info/28517/SparkML
//spark-shell --driver-class-path /home/hadoop/test/mysqljdbc.jar
/**
 * Spark driver that trains and evaluates a multilayer-perceptron text classifier.
 *
 * Each input line is expected to be `label<TAB>space separated words`. The words are embedded
 * with Word2Vec, the averaged embedding is fed to a `MultilayerPerceptronClassifier`, and the
 * precision on a held-out 20% split is printed.
 *
 * Usage: an optional first argument overrides the input path; without it the original
 * hard-coded HDFS location is used.
 */
object WbClassifier {

  /** Dimensionality of the Word2Vec embedding; also the width of the network's input layer. */
  private val VectorSize = 500

  /** Output-layer width of the original topology; widened only when the data has more classes. */
  private val MinOutputUnits = 3

  /** Seed shared by the train/test split and the classifier so that runs are reproducible. */
  private val Seed = 1234L

  /** Input used when no path is passed on the command line. */
  private val DefaultInputPath =
    "hdfs://192.168.0.211:9000/user/hadoop/emotion/SMSSpamCollection.txt"

  /**
   * Entry point.
   *
   * @param args optional; `args(0)` is the path of the tab-separated training file
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("WEIBO MLPC Classification")
    val sc = new SparkContext(conf)
    // Always release the context, even when a stage of the job fails.
    try {
      run(sc, args.headOption.getOrElse(DefaultInputPath))
    } finally {
      sc.stop()
    }
  }

  /** Loads the data, fits the pipeline, and prints predictions and the evaluation metric. */
  private def run(sc: SparkContext, inputPath: String): Unit = {
    val sqlCtx = new SQLContext(sc)

    // NOTE(review): the MySQL table is registered and queried, but `value` is not consumed by
    // the pipeline below (only by the commented-out alternative loader). The connection string
    // embeds credentials; consider moving them to configuration or dropping this step.
    sqlCtx.read
      .jdbc(
        "jdbc:mysql://192.168.0.37:3306/emotional?user=root&password=123456",
        "mltest",
        new java.util.Properties)
      .registerTempTable("mltest")
    val value = sqlCtx.sql("SELECT mltest.svalue,mltest.words FROM mltest")

    // model
    // Lines that do not contain at least `label<TAB>text` are skipped instead of aborting the
    // whole job with an ArrayIndexOutOfBoundsException.
    val parsedRDD: RDD[(String, Array[String])] =
      sc.textFile(inputPath)
        .map(_.split("\t"))
        .collect { case Array(label, text, _*) => (label, text.split(" ")) }

    // Alternative loaders kept for reference:
    // val parsedRDD = value.map(p => {
    //   val v0 = p.get(0).toString
    //   val v1 = p.getString(1).split(",")
    //   (v0, v1)
    // })
    // val parsedRDD = sc.textFile("hdfs://192.168.0.211:9000/user/hadoop/emotion/20170725.txt").map(line=>(line.split(" ")(3),line.split(" ")(2).split(",")))
    // val parsedRDD = sc.textFile("hdfs://192.168.0.211:9000/user/hadoop/emotion/20170726.txt").map(_.split("\t")).map(eachRow => {
    //   (eachRow(0),eachRow(1).split(" "))
    // })
    // http://doc.okbase.net/u013719780/archive/239004.html
    val msgDF = sqlCtx.createDataFrame(parsedRDD).toDF("label", "message")

    // Principal component analysis (placeholder note from the original; no PCA stage is applied)

    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
      .fit(msgDF)

    val word2Vec = new Word2Vec()
      .setInputCol("message")
      .setOutputCol("features")
      .setVectorSize(VectorSize)
      .setMinCount(1)

    // The output layer must have at least one unit per class. Keep the original width of 3 for
    // small label sets and widen it when the data contains more distinct labels.
    val outputUnits = math.max(MinOutputUnits, labelIndexer.labels.length)
    val layers = Array[Int](VectorSize, 6, 5, outputUnits)

    val multilayerPerceptronClassifier = new MultilayerPerceptronClassifier()
      .setLayers(layers)
      .setBlockSize(1024)
      .setSeed(Seed)
      .setMaxIter(456)
      .setFeaturesCol("features")
      .setLabelCol("indexedLabel")
      .setPredictionCol("prediction")

    // Maps the numeric prediction back to the original string label.
    val labelConverter = new IndexToString()
      .setInputCol("prediction")
      .setOutputCol("predictedLabel")
      .setLabels(labelIndexer.labels)

    // Seeded so that the reported metric is comparable between runs.
    val Array(trainingData, testData) = msgDF.randomSplit(Array(0.8, 0.2), Seed)

    val pipeline = new Pipeline()
      .setStages(Array(labelIndexer, word2Vec, multilayerPerceptronClassifier, labelConverter))
    val model = pipeline.fit(trainingData)

    // Several actions follow; cache so the embedding and forward pass are not recomputed each time.
    val predictionResultDF = model.transform(testData).cache()
    predictionResultDF.printSchema()
    //predictionResultDF.select("message", "label","features", "predictedLabel").show(30)
    predictionResultDF.select("message", "features", "label", "predictedLabel").show(30)
    // predictionResultDF.select("message","features","label","predictedLabel").write.save("file:///logs")

    // Display the distinct predicted labels (the original computed them and discarded the result).
    predictionResultDF.select("predictedLabel").distinct().show(5)

    // NOTE(review): "precision" is the metric name used by the original; confirm it is still
    // accepted by the Spark version this job is deployed on.
    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("indexedLabel")
      .setPredictionCol("prediction")
      .setMetricName("precision")
    val predictionAccuracy = evaluator.evaluate(predictionResultDF)
    println(f"Testing Accuracy is ${predictionAccuracy * 100}%2.4f%%")
  }
}