Optimizing Spark RDD autoKMeans: unsupervised clustering of one-hot data

package scala.learningRDD

import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

import scala.collection.mutable.ArrayBuffer


object KmsModelDef{
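  // Despite the Euclidean-style name (the original formula is kept commented out below), this
  // computes the Jaccard similarity of two binary vectors: shared active positions divided by
  // positions active in either vector. Higher means more similar; it yields NaN when neither
  // vector has any active position.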
  def eudist(v1: Array[Double], v2: Array[Double]): Double = {
    require(v1.length == v2.length, "Vectors must be of the same length.")
    //    sqrt(v1.zip(v2).map { case (a, b) => math.pow(a - b, 2) }.sum)
    v1.zip(v2).map { case (a, b) => if (a==b && a>0 && b>0) 1 else 0}.sum.toDouble/
      v1.zip(v2).map { case (a, b) => if (a>0 || b>0) 1 else 0}.sum.toDouble
  }
}


class ManualKMeansModel(spark: SparkSession, similarity: Double, iter: Int, acc: Double) {

//  val spark: SparkSession = SparkSession.builder()
//    .master("local[*]").appName("autoKMeans").getOrCreate()

  import spark.implicits._
  val sc: SparkContext = spark.sparkContext

  var data: RDD[Vector] = _
  var centers: RDD[(Double, Vector)] = _
  var center_data: RDD[(Double, Double, Double)] = _

  val smllt: Double = similarity
  val iters: Long = iter
  val accs: Double = acc
//    var data_size:Long = _




  // Optimized version of the k_meas_jj method (initial center selection)
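  // Seeding strategy: start from the point with the largest one-hot tail sum, then repeatedly
  // broadcast the current centers and add the point whose minimum eudist (Jaccard similarity)
  // to the existing centers is the largest, stopping once that value exceeds the similarity
  // threshold smllt or the 100-center safety cap is hit.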
  def k_meas_jj_optimized(): Unit = {
    // Find the initial center: the point whose one-hot tail has the largest sum
    val maxtuple0: (Double, Double) = this.data.map(v => (v.toArray.head, v.toArray.tail.sum))
      .max()(Ordering.by(_._2)) // max with an explicit Ordering instead of a reduce

    val cts_0: Vector = data.filter(v => v(0) == maxtuple0._1).first()
    //this.centers = sc.parallelize(Seq((0, cts_0)))

    var iteval = 0.0
    var n = 1
    val centersList: ArrayBuffer[(Double, Vector)] = new ArrayBuffer[(Double, Vector)]()
    centersList += ((0.0, cts_0))

    while (iteval < this.smllt && n < 100) { // safety cap on the number of centers
      val centersBroadcast = sc.broadcast(centersList.toMap)

      // mapPartitions keeps the broadcast center lookup local to each partition and avoids a shuffle
      val farthestPoint: (Vector, Double) = this.data.mapPartitions { partition =>
        val localCenters: Map[Double, Vector] = centersBroadcast.value
        partition.map { vector =>
          val minDistance = localCenters.values.map { center =>
            KmsModelDef.eudist(center.toArray.tail, vector.toArray.tail)
          }.min
          (vector, minDistance)
        }
      }.max()(Ordering.by(_._2)) // the point whose minimum similarity to the current centers is largest

      iteval = farthestPoint._2

      if (iteval <= this.smllt) {
        val newCenter: (Double, Vector) = (n.toDouble, farthestPoint._1)
        centersList += newCenter
        n += 1
      }
    }

    // Materialize the collected centers once the similarity threshold or the safety cap stops the search
    this.centers = sc.parallelize(centersList)
  }





  def assign_clusters(): Unit ={
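    // Assign every data point to the most similar center, producing
    // (cluster id, point index, similarity) triples in center_data.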

    val center_data_item: RDD[((Double, Vector), Vector)] = this.centers.cartesian(this.data)

    val rsl_1: RDD[(Double, Double, Double)] = center_data_item.map(item =>
      (item._1._1, item._2.toArray.head, KmsModelDef.eudist(item._1._2.toArray.tail, item._2.toArray.tail)))

    // For each data point, keep the center with the largest similarity: (cluster id, point index, similarity)
    val maxByGroup: RDD[(Double, Double, Double)] = rsl_1
      .map(tuple => (tuple._2, tuple))
      .reduceByKey { (t1, t2) =>
        if (t1._3 > t2._3) t1 else t2
      }.map(_._2)

    this.center_data = maxByGroup

//    maxByGroup.toDF().show()

  }

  // Optimized version of the update_centers method
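  // Returns the absolute change in the summed point-to-center similarity; fit() uses it as the convergence measure.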
  def update_centers_optimized(): Double = {
    val old_centers: Double = this.center_data.map(v => v._3).sum()

    // Join on the point index instead of taking a cartesian product
    val centerDataKV: RDD[(Double, (Double, Double))] = this.center_data.map(tuple => (tuple._2, (tuple._1, tuple._3)))
    val dataKV = this.data.map(v => (v.toArray.head, v))

    val joined: RDD[(Double, ((Double, Double), Vector))] = centerDataKV.join(dataKV)

    // Aggregate per cluster with reduceByKey: keep one member vector and accumulate the member count.
    // Adjust the combiner if a true mean (or another representative) is needed for the data at hand.
    val newCentersInfo: RDD[(Double, (Vector, Int))] = joined.map { case (_, ((centerId, _), vector)) =>
      (centerId, (vector, 1))
    }.reduceByKey { case ((v1, c1), (v2, c2)) =>
      (v1, c1 + c2) // placeholder aggregation: retain one vector, count the members
    }

    // New centers: the retained member vector per cluster (the count is currently unused)
    val updatedCenters: RDD[(Double, Vector)] = newCentersInfo.map { case (centerId, (vector, count)) =>
      (centerId, vector)
    }

    this.centers = updatedCenters
    this.assign_clusters()

    val new_centers: Double = this.center_data.map(v => v._3).sum()
    Math.abs(new_centers - old_centers)
  }

//  def update_centers(): Double ={
//    val old_centers: Double = this.center_data.map(v=>v._3).sum()
//
//    val center_data_item: RDD[((Double, Double, Double), Vector)] = this.center_data.cartesian(this.data).filter(item=>item._1._2==item._2.toArray.head)
//
//    val rsl_1: RDD[(Double, Double, Double)] = center_data_item.flatMap(item=>{Some(item._1._1,item._1._2,item._2.toArray.tail.sum)})
//
//    val rsl_2: RDD[(Double, Double, Double)] = rsl_1
//      .map(tuple => (tuple._1, tuple))
//      .reduceByKey { (t1, t2) =>
//        if (t1._3 > t2._3) t1 else t2
//      }.map(_._2)
//
////    rsl_2.toDF().show()
//
//    // Update to the new centers
//    this.centers = rsl_2.cartesian(this.data).filter(item=>item._1._2==item._2.toArray.head).map(item=>(item._1._1,item._2))
//
//    // Refresh center_data, the center-to-point mapping
//    // assign_clusters()------------------------------------------------------------------------
//
//    val center_data_item_2: RDD[((Double, Vector), Vector)] = this.centers.cartesian(this.data)
//
//    val clus_2: RDD[(Double, Double, Double)] = center_data_item_2.flatMap(item=>
//    {Some(item._1._1,item._2.toArray.head,KmsModelDef.eudist(item._1._2.toArray.tail,item._2.toArray.tail))})
//
//    // For each data point, keep the center with the largest similarity: (cluster id, point index, similarity)
//    val maxByGroup_1: RDD[(Double, Double, Double)] = clus_2.map(tuple => (tuple._2, tuple))
//      .reduceByKey { (t1, t2) =>
//        if (t1._3 > t2._3) t1 else t2
//      }.map(_._2)
//
//    this.center_data = maxByGroup_1
//
//    //---------------------------------------------------------------------------------------------
//
//    val new_centers: Double = this.center_data.map(v=>v._3).sum()
//
//    return Math.abs(new_centers-old_centers)
//  }



  def fit(df:RDD[Vector]): Unit ={
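    // Training loop: seed the centers, then alternate assignment and center updates until the
    // change in total similarity drops below accs or the iteration cap iters is reached.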
    this.data = df
//    this.data_size = df.count()
    // Initialize the centers
    k_meas_jj_optimized()

    var n:Long = 0
    var accing = Double.MaxValue
    while (accing> this.accs && n<this.iters) {
      assign_clusters()
      accing = update_centers_optimized()

      println(s"iter=$n, delta=$accing")

      n = n+1
    }

    // Show the resulting centers and center_data
    this.centers.toDF().show()
    this.center_data.toDF().show()

  }
}



object demo{


  def main(args: Array[String]): Unit = {

    val spark: SparkSession = SparkSession.builder()
      .master("local[*]")
      .appName("autoKMeans")
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .config("spark.kryoserializer.buffer.max", "256m")
      .config("spark.sql.adaptive.enabled", "true")
      .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
      .config("spark.default.parallelism", "10") // 根据集群调整
      .config("spark.sql.shuffle.partitions", "10")
      .getOrCreate()
    val sc: SparkContext = spark.sparkContext


    val dataseq: Seq[Vector] = Seq(
      Vectors.dense(1, 0, 0, 1,1),
      Vectors.dense(2, 0, 0, 1,1),
      Vectors.dense(3, 1, 0, 0,1),
      Vectors.dense(4, 1, 0, 0,1),
      Vectors.dense(5, 1, 1, 1,1),
      Vectors.dense(6, 1, 0, 1,1),
      Vectors.dense(7, 0, 0, 1,1),
      Vectors.dense(8, 0, 0, 1,1),
      Vectors.dense(9, 1, 0, 0,1),
      Vectors.dense(10, 1, 0, 0,1),
      Vectors.dense(11, 1, 1, 1,1),
      Vectors.dense(12, 1, 1, 0,1),
      Vectors.dense(13, 1, 0, 1,0),
      Vectors.dense(14, 1, 0, 1,0)
    )

    val data: RDD[Vector] = sc.parallelize(dataseq)

    val model:ManualKMeansModel = new ManualKMeansModel(spark,0.65,300,0.00001)
    model.fit(data)


//    val r = KmsModelDef.eudist(Vectors.dense(6, 1, 1, 1).toArray.tail,Vectors.dense(4, 1, 0, 0).toArray.tail)
//    println(r)

    spark.stop()
  }

}
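
A quick sanity check of the similarity helper, mirroring the commented-out test in demo. This is a minimal sketch: the EudistCheck object name is made up for illustration, it assumes the same package as the listing above, and no SparkSession is needed because eudist works on plain Array[Double] values.

package scala.learningRDD

import org.apache.spark.mllib.linalg.Vectors

object EudistCheck {
  def main(args: Array[String]): Unit = {
    // Tails of Vectors.dense(6, 1, 1, 1) and Vectors.dense(4, 1, 0, 0): one shared active
    // position out of three active positions overall, so the Jaccard similarity is 1/3.
    val r = KmsModelDef.eudist(
      Vectors.dense(6, 1, 1, 1).toArray.tail,
      Vectors.dense(4, 1, 0, 0).toArray.tail)
    println(r) // prints 0.3333333333333333
  }
}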

  
