Kaggle Telco Customer Churn Prediction: Fusing GBDT with LR

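The implementation below is three Scala files: a GBDTPreprocessor that trains a GBDT with Spark MLlib and maps every sample to the leaf it lands in per tree, a GBDTLRModelProcess that turns those leaf indices into one-hot features and trains a logistic-regression pipeline on them, and a GBDTLrTrain driver that ties the flow together on the Kaggle Telco churn data. First, the preprocessor: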
package com.fiveonevv.app.Model

import java.io.{FileInputStream, IOException, ObjectInputStream}

import org.apache.spark.mllib.linalg.{DenseVector, Vector, Vectors}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.GradientBoostedTrees
import org.apache.spark.mllib.tree.configuration.{BoostingStrategy, FeatureType}
import org.apache.spark.mllib.tree.model.{GradientBoostedTreesModel, Node}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

class GBDTPreprocessor extends Serializable {
    /**
      * Collect the ids of all leaf nodes under a tree node.
      * @param node root node of a (sub)tree
      * @return ids of the leaf nodes
      */
    def getLeafNodes(node: Node): Array[Int] = {
        if (node.isLeaf) {
            Array(node.id)
        } else {
            getLeafNodes(node.leftNode.get) ++ getLeafNodes(node.rightNode.get)
        }
    }

    /**
      * Walk the tree the way predict does, but return the leaf id instead of the prediction.
      * @param node tree node
      * @param features feature vector of one sample
      * @return id of the leaf node the sample falls into
      */
    def predictModify(node: Node, features: Vector): Int = {
        val split = node.split
        if (node.isLeaf) {
            node.id
        } else {
            // continuous vs. categorical split
            if (split.get.featureType == FeatureType.Continuous) {
                if (features(split.get.feature) <= split.get.threshold) {
                    predictModify(node.leftNode.get, features)
                } else {
                    predictModify(node.rightNode.get, features)
                }
            } else {
                if (split.get.categories.contains(features(split.get.feature))) {
                    predictModify(node.leftNode.get, features)
                } else {
                    predictModify(node.rightNode.get, features)
                }
            }
        }
    }

    /**
      * Train a GBDT classifier and collect the leaf-node ids of every tree.
      * @param gbtTrainData training data
      * @param numTrees number of boosting iterations (trees)
      * @return the trained model and, per tree, the array of its leaf-node ids
      */
    def gbtTrain(gbtTrainData: RDD[LabeledPoint], numTrees: Int): (GradientBoostedTreesModel, Array[Array[Int]]) = {
        val boostingStrategy = BoostingStrategy.defaultParams("Classification")
        boostingStrategy.setNumIterations(numTrees)
        val gbdtModel = GradientBoostedTrees.train(gbtTrainData, boostingStrategy)
        val treeLeafArray = new Array[Array[Int]](numTrees)

        for (i <- 0 until numTrees) {
            // collect the leaf nodes of every tree
            treeLeafArray(i) = getLeafNodes(gbdtModel.trees(i).topNode)
        }
        (gbdtModel, treeLeafArray)
    }


    /**
      * Generate GBDT leaf features for a dataset.
      * @param gbtTestData data to generate features for
      * @param gbtModel trained GBT model
      * @param treeLeafArray leaf-node ids of every tree in the model
      * @param numTrees number of trees
      * @return RDD of (id, LabeledPoint carrying the leaf-index features)
      */
    def gbtFeaturePredict(gbtTestData: RDD[(String, (Double, DenseVector))], gbtModel: GradientBoostedTreesModel, treeLeafArray: Array[Array[Int]], numTrees: Int): RDD[(String, LabeledPoint)] = {
        val newFeature = gbtTestData.map(line => {
            var gbtFeatures = new Array[Double](0)
            for (i <- 0 until numTrees) {
                val treePredict = predictModify(gbtModel.trees(i).topNode, line._2._2)
                val leafArray = new Array[Double]((gbtModel.trees(i).numNodes + 1) / 2)  // number of leaves in a full binary tree
                leafArray(treeLeafArray(i).indexOf(treePredict)) = 1  // mark the position of the leaf the sample falls into
                gbtFeatures = gbtFeatures ++ leafArray
            }
            (line._1, line._2._1, gbtFeatures)  // id, label, gbtFeatures
        })
        val gbtFeatureRDD = newFeature.map(
            x => (x._1, LabeledPoint(x._2, Vectors.dense(x._3)))
        )
        gbtFeatureRDD
    }

    /**
      * Construct new features from GBDT leaf indices.
      * @param data input samples
      * @param model trained GBDT model
      * @param isAppend whether to append the original features to the new leaf features
      * @return LabeledPoints whose features encode, per tree, the leaf each sample falls into
      */
    def getNodeListWithGBDT(data: RDD[LabeledPoint], model: GradientBoostedTreesModel, spark: SparkSession, isAppend: Boolean): Option[RDD[LabeledPoint]] = {
        val numTrees = model.numTrees
        // leaf-node ids of every tree
        val treeLeafArray = new Array[Array[Int]](numTrees)
        for (i <- 0 until numTrees) {
            treeLeafArray(i) = getLeafNodes(model.trees(i).topNode)
        }
        // build the new features
        val newData: RDD[LabeledPoint] = data.map(line => {
            var newFeatures = new Array[Double](0)
            for (i <- 0 until numTrees) {
                // id of the leaf node this sample falls into
                val treePredict = predictModify(model.trees(i).topNode, line.features)
                val treeArray = new Array[Double]((model.trees(i).numNodes + 1) / 2)
                treeArray(treeLeafArray(i).indexOf(treePredict)) = 1
                newFeatures = newFeatures ++ treeArray
            }
            if (isAppend) {
                new LabeledPoint(line.label, Vectors.dense(newFeatures ++ line.features.toArray))
            } else {
                new LabeledPoint(line.label, Vectors.dense(newFeatures))
            }
        })
        Option(newData)
    }

    def loadModel(path: String): Option[GradientBoostedTreesModel] = {
        try {
            val in = new ObjectInputStream(new FileInputStream(path))
            val model = Option(in.readObject().asInstanceOf[GradientBoostedTreesModel])
            in.close()
            model
        } catch {
            case ex: ClassNotFoundException =>
                ex.printStackTrace()
                None
            case ex: IOException =>
                ex.printStackTrace()
                None
            case ex: Throwable =>
                throw ex
        }
    }
}
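To make the leaf encoding concrete, here is a minimal, self-contained sketch (with made-up numbers, not taken from any trained model) of the one-hot blocks that gbtFeaturePredict concatenates: each tree contributes one block with a single 1 at the index of the leaf the sample falls into.

// Hypothetical illustration: 2 trees with 4 leaves each.
// Suppose predictModify routes a sample to leaf index 2 of tree 0
// and leaf index 0 of tree 1 (indices come from treeLeafArray(i).indexOf).
val leafIndexPerTree = Array(2, 0)
val leavesPerTree = 4
val gbtFeatures = leafIndexPerTree.flatMap { idx =>
    val block = new Array[Double](leavesPerTree)
    block(idx) = 1.0  // mark the leaf the sample landed in
    block
}
// gbtFeatures == Array(0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0)

The second file wraps data preparation, GBDT feature generation, and the LR pipeline: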
package com.fiveonevv.app.Model

import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.feature._
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.mllib.linalg.DenseVector
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession}

class GBDTLRModelProcess {
    /**
      * 本地读取数据预处理,处理成labeledPoint和DenseVector
      * @param rdd 本地读取txt数据 包含features,label
      * @return denseVectorRDD
      */
    def localDataProcess(rdd:RDD[String]): RDD[(String, LabeledPoint, LabeledPoint, (Double, DenseVector))] = {
        val denseVectorRDD = rdd.map{
            line =>{
                val arr = line.toString.split("\t")
                val userInfo = arr(0)
                val nonFeatures = arr(1).split("#").map(_.toDouble)
                val features = arr(2).split("#").map(_.toDouble)
                val label = arr(3).toDouble
                //创建一个稠密向量,labeledPoint格式GBT模型使用,后一组数据给特征离散化使用
                (userInfo,LabeledPoint(label, new DenseVector(features)), LabeledPoint(label, new DenseVector(nonFeatures)),
                  (label, new DenseVector(nonFeatures)))
            }
        }
        denseVectorRDD
    }

    /**
      * Preprocess Hive data (read on the YARN cluster) into LabeledPoints and a DenseVector.
      * @param rdd RDD converted from the Hive DataFrame
      * @return denseVectorRDD
      */
    def hiveDataProcess(rdd: RDD[(String, Array[Double], Array[Double], String)]): RDD[(String, LabeledPoint, LabeledPoint,
      (Double, DenseVector))] = {

        val denseVectorRDD = rdd.map {
            line => {
                val userInfo = line._1
                val numFeatures = line._2   // numeric features
                val cateFeatures = line._3  // categorical features
                val label = line._4.toDouble
                // Build dense vectors: the LabeledPoints feed the GBT model,
                // the trailing (label, vector) pair is used for feature discretization
                (userInfo,
                  LabeledPoint(label, new DenseVector(cateFeatures)),
                  LabeledPoint(label, new DenseVector(numFeatures)),
                  (label, new DenseVector(numFeatures)))
            }
        }
        denseVectorRDD
    }


    /**
      * Discretize the continuous features with a GBDT.
      * @param train training data
      * @param test  test data
      * @return train and test DataFrames with the discretized features joined in
      */
    def gbtFeatureProcess(train: RDD[(String, LabeledPoint, LabeledPoint, (Double, DenseVector))],
                          test: RDD[(String, LabeledPoint, LabeledPoint, (Double, DenseVector))],
                          spark: SparkSession): (DataFrame, DataFrame) = {
        // categorical features
        val trainRDD = train.map(x => (x._1, x._2)).map(x => ((x._1, x._2.label), x._2.features.asML))
        val testRDD = test.map(x => (x._1, x._2)).map(x => ((x._1, x._2.label), x._2.features.asML))
        // continuous features
        val gbtTrain = train.map(x => x._3)
        val gbtTrainData = train.map(x => (x._1, x._4))
        val gbtTestData = test.map(x => (x._1, x._4))
        // discretize the continuous features
        val gbdtPreprocessor = new GBDTPreprocessor
        val numTrees = 10
        // treeLeafArray holds the leaf nodes of every tree
        val (gbtModel, treeLeafArray) = gbdtPreprocessor.gbtTrain(gbtTrain, numTrees)
        val gbtTrainRDD = gbdtPreprocessor.gbtFeaturePredict(gbtTrainData,gbtModel,treeLeafArray,numTrees)
          .map(x => ((x._1,x._2.label),x._2.features.asML))
        val allTrainRDD = trainRDD.join(gbtTrainRDD)
        val trainDF = spark.createDataFrame(allTrainRDD.map(x => (
          x._1._1,
          x._1._2,
          x._2._1,
          x._2._2)))
          .toDF("userInfo","label","feature1","feature2")

        val gbtTestRDD = gbdtPreprocessor.gbtFeaturePredict(gbtTestData,gbtModel,treeLeafArray,numTrees)
          .map(x => ((x._1,x._2.label),x._2.features.asML))
        val allTestRDD = testRDD.join(gbtTestRDD)
        val testDF = spark.createDataFrame(allTestRDD.map(x => (
          x._1._1,
          x._1._2,
          x._2._1,
          x._2._2
        )))
          .toDF("userInfo","label","feature1","feature2")
        (trainDF,testDF)
    }


    /**
      * Build the pipeline training flow: scaling, feature selection, and grid search with cross-validation.
      * @param data training set
      * @return the best PipelineModel found by cross-validation
      */
    def pipelineTrain(data:DataFrame): PipelineModel = {
        data.persist()
        val featureScaler = new MinMaxScaler()
          .setInputCol("features")
          .setOutputCol("scaledFeatures")
        val featureSelector = new ChiSqSelector()
          .setFeaturesCol("scaledFeatures")
          .setLabelCol("label")
          .setNumTopFeatures(80)
          .setOutputCol("selectedFeatures")
        val lr = new LogisticRegression()
          .setMaxIter(200)
          .setElasticNetParam(1.0)
          .setRegParam(0.001)
          .setThreshold(0.5)
          .setLabelCol("label")
          .setFeaturesCol("selectedFeatures")
        // build pipeline
        val pipeline = new Pipeline()
          .setStages(Array(featureScaler,featureSelector,lr))
        // grid search over: number of selected features, regularization, elastic-net mixing, iteration count
        val paramGrid = new ParamGridBuilder()
          .addGrid(featureSelector.numTopFeatures,Array(70))
          .addGrid(lr.maxIter,Array(100))
          .addGrid(lr.elasticNetParam,Array(1.0,0.0))
          .addGrid(lr.regParam,Array(0.00075))
          .build()
        // cross-validation
        val cv = new CrossValidator()
          .setEstimator(pipeline)
          .setEvaluator(new BinaryClassificationEvaluator())
          .setEstimatorParamMaps(paramGrid)
          .setNumFolds(5)
        val cvModel = cv.fit(data)
        val pipelineModel = cvModel.bestModel.asInstanceOf[PipelineModel]
        data.unpersist()
        pipelineModel
    }


    /**
      * Intermediate results of the pipeline stages.
      * @return scaled data, selected features, and LR predictions
      */
    def pipelinePredict(data: DataFrame,pipelineModel: PipelineModel): (DataFrame, DataFrame, DataFrame) = {
        data.persist()
        val featureScaleModel = pipelineModel.stages(0).asInstanceOf[MinMaxScalerModel]
        val chiSqSelectorModel = pipelineModel.stages(1).asInstanceOf[ChiSqSelectorModel]
        val lrModel = pipelineModel.stages(2).asInstanceOf[LogisticRegressionModel]

        println(s"Selected feature count: ${chiSqSelectorModel.explainParam(chiSqSelectorModel.numTopFeatures)}")
        println(s"LR max iterations: ${lrModel.explainParam(lrModel.maxIter)}")
        println(s"LR regularization: ${lrModel.explainParam(lrModel.regParam)}")
        println(s"LR decision threshold: ${lrModel.explainParam(lrModel.threshold)}")
        println(s"Elastic-net mixing (L1/L2): ${lrModel.explainParam(lrModel.elasticNetParam)}")
        println(s"LR feature count: ${lrModel.numFeatures}")
        val scaledData = featureScaleModel.transform(data)          // min-max scaling
        val selectedData = chiSqSelectorModel.transform(scaledData) // chi-squared feature selection
        val predictions = lrModel.transform(selectedData)           // LR prediction

        data.unpersist()
        (scaledData,selectedData,predictions)
    }

    /**
      * Merge feature columns.
      * @param data DataFrame containing feature1 and feature2 columns
      * @return DataFrame with the merged features column
      */
    def featureAssembler(data:DataFrame):DataFrame ={
        val assembler = new VectorAssembler()
          .setInputCols(Array("feature1", "feature2"))
          .setOutputCol("features")
        val output = assembler.transform(data)
        output
    }


    /**
      * Evaluate model performance.
      * @param data RDD of (prediction, label) pairs
      * @return accuracy, weighted precision, weighted recall, weighted F1
      */
    def multiClassEvaluate(data: RDD[(Double,Double)]): (Double,Double,Double,Double) = {
        val metrics = new MulticlassMetrics(data)
        val accuracy = metrics.accuracy
        val weightedPrecision = metrics.weightedPrecision
        val weightedRecall = metrics.weightedRecall
        val f1 = metrics.weightedFMeasure
        (accuracy,weightedPrecision,weightedRecall,f1)
    }
}
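Note that multiClassEvaluate is defined but never called by the driver below. A minimal sketch of wiring it up, assuming a predictions DataFrame with double-typed prediction and label columns as produced by pipelinePredict:

// hypothetical usage, not part of the original driver
val predictionAndLabel = predictions.select("prediction", "label").rdd
    .map(row => (row.getDouble(0), row.getDouble(1)))
val (accuracy, weightedPrecision, weightedRecall, f1) = modelProcess.multiClassEvaluate(predictionAndLabel)
println(s"accuracy=$accuracy precision=$weightedPrecision recall=$weightedRecall f1=$f1")

The third file is the driver: it reads the Telco churn table from Hive, indexes the categorical columns, assembles feature vectors, and runs the full GBDT-to-LR flow: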
package com.fiveonevv.app.core

import com.fiveonevv.app.Model.GBDTLRModelProcess
import com.fiveonevv.app.util.SparkSqlUtil
import org.apache.log4j.{Level, Logger}
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.sql.functions.{udf, _}

import scala.collection.mutable.ListBuffer

object GBDTLrTrain {
    Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
    Logger.getLogger("org.apache.eclipse.jetty.server").setLevel(Level.OFF)
    def main(args: Array[String]): Unit = {
        val spark = SparkSqlUtil.initSparkSession(SparkSqlUtil.initSparkBuilder(),"GBDTLRTrainDemo")
        // read data from Hive; fill missing total_charges with 0.0
        // (column name normalized to the snake_case schema used below)
        val rawDF = spark
          .sql("""SELECT * FROM tmp.telco_churn""")
          .na.fill(0.0, Seq("total_charges"))
        // categorical and numeric columns
        val cateCols = Array("gender","partner","dependents","phone_service","multiple_lines","internet_service","online_security",
            "online_backup","device_protection","tech_support","streaming_tv","streaming_movies","paperless_billing","payment_method")
        val numCols = Array("senior_citizen","tenure","monthly_charges","total_charges")
        // index the categorical columns
        val indexer = cateCols.map(colName => new StringIndexer().setInputCol(colName).setOutputCol(s"${colName}Index"))
        //val encoder = new OneHotEncoderEstimator().setInputCols(indexCols).setOutputCols(cateCols map (name => s"${name}Vec"))
        // assemble the categorical features
        val cateAssembler = new VectorAssembler().setInputCols(cateCols.map(_ + "Index")).setOutputCol("cateFeatures")
        // assemble the numeric features
        val numAssembler = new VectorAssembler().setInputCols(numCols).setOutputCol("numFeatures").setHandleInvalid("skip")
        val stagesArray = new ListBuffer[PipelineStage]()
        for (stringIndexer <- indexer) {
            stagesArray.append(stringIndexer)
        }
        stagesArray.append(cateAssembler,numAssembler)
        val dataPrePipeline = new Pipeline().setStages(stagesArray.toArray)
        // the pipeline output mixes sparse and dense vectors; convert everything to dense
        val toDense = udf((v: org.apache.spark.ml.linalg.Vector) => v.toDense)
        val processedRDD = dataPrePipeline.fit(rawDF).transform(rawDF)
          .selectExpr("customerid","numFeatures","cateFeatures","case when churn = 'Yes' then 1.0 else 0.0 end as label")
          .withColumn("cateDenseFeatures",toDense(col("cateFeatures")))
          .selectExpr("customerid","numFeatures","cateDenseFeatures cateFeatures","label")
          .rdd.map(x => (
            x(0).toString,
            // an ml vector cannot be cast to an mllib vector directly; go through Array and rebuild a dense vector
            x(1).asInstanceOf[org.apache.spark.ml.linalg.Vector].toArray,
            x(2).asInstanceOf[org.apache.spark.ml.linalg.DenseVector].toArray,
            x(3).toString)
        )

        val Array(trainRDD, testRDD) = processedRDD.randomSplit(Array(0.7, 0.3), seed = 1234)
        val modelProcess = new GBDTLRModelProcess
        val denseVectorTrainRDD = modelProcess.hiveDataProcess(trainRDD)
        val denseVectorTestRDD = modelProcess.hiveDataProcess(testRDD)

        // Train the GBT: discretize the continuous features and merge them with the original categorical features
        val (gbtFeatureTrainDF, gbtFeatureTestDF) = modelProcess.gbtFeatureProcess(denseVectorTrainRDD, denseVectorTestRDD, spark)
        val unionTrainDF = modelProcess.featureAssembler(gbtFeatureTrainDF) // merge GBDT leaf features with the original features
        val unionTestDF = modelProcess.featureAssembler(gbtFeatureTestDF)

        // Upsample positives in the training data (the 2x duplication is currently disabled)
        val positiveDF = unionTrainDF.filter("label=1")
        val negativeDF = unionTrainDF.filter("label=0")
        val upPositiveDF = positiveDF //.union(positiveDF).union(positiveDF)
        val upSampleDF = negativeDF.union(upPositiveDF)

        // pipeline training and prediction
        val pipelineModel = modelProcess.pipelineTrain(upSampleDF)
        val (scaledDF, selectedDF, predictions) = modelProcess.pipelinePredict(unionTestDF, pipelineModel)

        // Evaluate the model (the id column is "userInfo" at this point; it was renamed from customerid during preprocessing)
        predictions.select("userInfo", "label", "rawPrediction", "probability", "prediction").show(50)
        val evaluator = new BinaryClassificationEvaluator().setLabelCol("label")
        val areaUnderROC = evaluator.setMetricName("areaUnderROC").evaluate(predictions)
        val areaUnderPR = evaluator.setMetricName("areaUnderPR").evaluate(predictions)

        // Inspect performance on the test set
        val lp = predictions.select("label", "prediction")
        val countTotal = predictions.count()
        val correct = lp.filter(lp("label") === lp("prediction")).count()  // correctly predicted samples
        lp.show(200)
        val ratioCorrect = correct.toDouble / countTotal.toDouble

        // label 1 = churned, 0 = retained
        val truePositive = lp.filter(lp("prediction") === 1.0).filter(lp("label") === lp("prediction")).count()   // true churners
        val falsePositive = lp.filter(lp("prediction") === 1.0).filter(lp("label") =!= lp("prediction")).count()  // falsely flagged churners
        val trueNegative = lp.filter(lp("prediction") === 0.0).filter(lp("label") === lp("prediction")).count()   // true retained users
        val falseNegative = lp.filter(lp("prediction") === 0.0).filter(lp("label") =!= lp("prediction")).count()  // missed churners

        // true positive rate, false positive rate
        val tpr = truePositive.toDouble / (truePositive + falseNegative)
        val fpr = falsePositive.toDouble / (falsePositive + trueNegative)
        // churn precision
        val positivePrecision = truePositive.toDouble / (truePositive + falsePositive)
        // churn recall
        val positiveRecall = truePositive.toDouble / (truePositive + falseNegative)
        // retention precision
        val negativePrecision = trueNegative.toDouble / (trueNegative + falseNegative)
        // retention recall
        val negativeRecall = trueNegative.toDouble / (trueNegative + falsePositive)
        println(s"Total test samples: $countTotal")
        println(s"Correctly predicted samples: $correct")
        println(s"Model accuracy: $ratioCorrect")
        println(s"Area under ROC: $areaUnderROC")
        println(s"Area under PR: $areaUnderPR")
        println(s"True positives (churned, predicted churned): $truePositive")
        println(s"False positives (retained, predicted churned): $falsePositive")
        println(s"True positive rate: $tpr")
        println(s"False positive rate: $fpr")
        println(s"Churn precision: $positivePrecision")
        println(s"Churn recall: $positiveRecall")
        println(s"Retention precision: $negativePrecision")
        println(s"Retention recall: $negativeRecall")
        spark.stop()
    }
}
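Sample spark-shell output from an evaluation run on the 30% test split: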
scala> val evaluator = new BinaryClassificationEvaluator().setLabelCol("label")
evaluator: org.apache.spark.ml.evaluation.BinaryClassificationEvaluator = binEval_f0b527f4e73d

scala> val areaUnderROC = evaluator.setMetricName("areaUnderROC").evaluate(predictions)
areaUnderROC: Double = 0.8306899086101781                                       

scala> val areaUnderPR = evaluator.setMetricName("areaUnderPR").evaluate(predictions)
areaUnderPR: Double = 0.6296575868466127                                        

scala> val lp = predictions.select( "label", "prediction")
lp: org.apache.spark.sql.DataFrame = [label: double, prediction: double]

scala> val countTotal = predictions.count()
countTotal: Long = 2095

scala> val truePositive = lp.filter(lp("prediction") === 1.0).filter(lp("label") === lp("prediction")).count()  // true churners
truePositive: Long = 270

scala> val falsePositive = lp.filter(lp("prediction") === 1.0).filter(lp("label") =!= lp("prediction")).count()  // falsely flagged churners
falsePositive: Long = 146

scala> val trueNegative = lp.filter(lp("prediction") === 0.0).filter(lp("label") === lp("prediction")).count()  // true retained users
trueNegative: Long = 1397

scala> val falseNegative = lp.filter(lp("prediction") === 0.0).filter(lp("label") =!= lp("prediction")).count()  // missed churners
falseNegative: Long = 282

scala> val tpr = truePositive.toDouble / (truePositive + falseNegative)
tpr: Double = 0.4891304347826087

scala> val fpr = falsePositive.toDouble / (falsePositive + trueNegative)
fpr: Double = 0.09462086843810759

scala> val positivePrecision = truePositive.toDouble / (truePositive + falsePositive)
positivePrecision: Double = 0.6490384615384616

scala> val positiveRecall = truePositive.toDouble / (truePositive + falseNegative)
positiveRecall: Double = 0.4891304347826087

scala> val negativePrecision = trueNegative.toDouble / (trueNegative + falseNegative)
negativePrecision: Double = 0.8320428826682549

scala> val negativeRecall = trueNegative.toDouble / (trueNegative + falsePositive)
negativeRecall: Double = 0.9053791315618924

scala> println(s"Total test samples: $countTotal")
Total test samples: 2095

scala> println(s"Correctly predicted samples: $correct")
Correctly predicted samples: 1667

scala> println(s"Model accuracy: $ratioCorrect")
Model accuracy: 0.7957040572792363

scala> println(s"Area under ROC: $areaUnderROC")
Area under ROC: 0.8306899086101781

scala> println(s"Area under PR: $areaUnderPR")
Area under PR: 0.6296575868466127

scala> println(s"True positives (churned, predicted churned): $truePositive")
True positives (churned, predicted churned): 270

scala> println(s"False positives (retained, predicted churned): $falsePositive")
False positives (retained, predicted churned): 146

scala> println(s"True positive rate: $tpr")
True positive rate: 0.4891304347826087

scala> println(s"False positive rate: $fpr")
False positive rate: 0.09462086843810759