Spark

Student.py
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session for this demo.
spark = (
    SparkSession.builder
    .appName("student")
    .getOrCreate()
)

# Load the score CSV: comma separated, first row is the header, column types
# are inferred. Forward slashes keep the relative path usable on every OS
# (a backslash-separated path only resolves on Windows).
df = (
    spark.read
    .format("csv")
    .option("sep", ",")
    .option("inferSchema", "true")
    .option("header", "true")
    .load("../data/score.csv")
)

# createOrReplaceTempView does not raise when the view already exists, so the
# script can be re-run inside a session that getOrCreate() handed back.
df.createOrReplaceTempView("student")

# All three reports group by the same column, so build the grouping once.
by_name = df.groupBy("name")

# Number of rows per name.
by_name.count().show()

# Total score per name; the dict form of agg() names the column "sum(score)".
by_name.agg({"score": "sum"}).withColumnRenamed("sum(score)", "sum_score").show()

# Average score per name; the dict form of agg() names the column "avg(score)".
by_name.agg({"score": "avg"}).withColumnRenamed("avg(score)", "avg_score").show()
teacher.py
from pyspark.sql import SparkSession


# Obtain the Spark session and keep a handle on its SparkContext for RDD work.
spark = (
    SparkSession.builder
    .appName("test")
    .getOrCreate()
)
sc = spark.sparkContext


def cal_class_times(line):
    """Convert one comma-separated row into an ``(id, product)`` pair.

    The id is field 0 parsed as an int; the product is field 4 times field 7,
    both parsed as ints. Presumably these two columns are per-period counts
    whose product is the teacher's total class times -- confirm against the
    CSV header.

    Raises IndexError when the row has fewer than 8 fields and ValueError when
    one of the three fields is not an integer.
    """
    fields = line.split(",")
    total = int(fields[4]) * int(fields[7])
    return int(fields[0]), total

def map_name(line):
    """Convert one comma-separated row into an ``(id, name)`` pair.

    The id is field 0 parsed as an int; the name is field 1 left as a string.
    """
    parts = line.split(",")
    return int(parts[0]), parts[1]

# Python 2 only, kept for reference (do not enable on Python 3, where
# reload() is not a builtin and sys.setdefaultencoding does not exist):
#   import sys
#   reload(sys)
#   sys.setdefaultencoding("utf8")

from operator import add

rdd = sc.textFile("../data/teacher.CSV")

# The first line is the header; drop every line equal to it.
header = rdd.first()
rdd_2 = rdd.filter(lambda x: x != header)

# (id, total) -- per-row products summed per id.
rdd_3 = rdd_2.map(cal_class_times).reduceByKey(add)

# (id, name) -- de-duplicated so the join yields one record per id/name pair.
rdd_name = rdd_2.map(map_name).distinct()

# (id, (total, name)), ordered by total from largest to smallest.
rdd_join = rdd_3.join(rdd_name).sortBy(lambda x: x[1][0], ascending=False)

# The result is consumed twice (printed, then saved); cache it so the join and
# sort are not recomputed for the second action.
rdd_join.cache()

# Show the ranking. The collected list was previously discarded, which made
# the script produce no visible output.
for row in rdd_join.collect():
    print(row)

# NOTE: saveAsTextFile fails if the output directory already exists.
rdd_join.saveAsTextFile("../data/out")
score.py
from pyspark.sql import SparkSession


# Obtain the Spark session and keep a handle on its SparkContext for RDD work.
spark = (
    SparkSession.builder
    .appName("test")
    .getOrCreate()
)
sc = spark.sparkContext

# CSV column layout: name,subject,score
def one_subject_score(line):
    """Convert one CSV row into a ``(subject, score)`` pair.

    The subject is field 1 as a string; the score is field 2 parsed as an int.
    """
    columns = line.split(",")
    subject = columns[1]
    return subject, int(columns[2])

def ave_score(subject):
    """Average the scores of one grouped record.

    ``subject`` is a ``(key, scores)`` pair where ``scores`` supports ``len()``
    and iteration. Returns ``(key, mean)`` with the mean as a float. An empty
    ``scores`` raises ZeroDivisionError.
    """
    scores = subject[1]
    how_many = len(scores)
    mean = sum(scores) * 1.0 / how_many
    return subject[0], mean

rdd = sc.textFile("../data/score.CSV")

# The first line is the header; drop every line equal to it.
header = rdd.first()
rdd_2 = rdd.filter(lambda x: x != header)

# (subject, iterable of scores)
rdd_3 = rdd_2.map(one_subject_score).groupByKey()

# (subject, average score)
rdd_4 = rdd_3.map(ave_score)

# Print the averages. The collected list was previously discarded, which made
# the script produce no visible output.
for row in rdd_4.collect():
    print(row)

SimpleGraphX 代码
package org.training.spark.main

import org.apache.log4j.{Level, Logger}
import org.apache.spark.graphx.{Edge, Graph, VertexId, VertexRDD}
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Vertex attribute used once degree information has been joined onto the graph.
 *
 * @param name   display name of the user
 * @param age    age of the user
 * @param inDeg  number of incoming edges (0 when the vertex has none)
 * @param outDeg number of outgoing edges (0 when the vertex has none)
 */
final case class User(name: String, age: Int, inDeg: Int, outDeg: Int)

/**
 * Walk-through of basic GraphX operations on a small hand-built property graph:
 * inspecting vertices/edges/triplets, mapping attributes, taking a subgraph,
 * joining degree information, aggregating messages and running Pregel.
 */
object SimpleGraphX {

  /** Demo graph type: vertex attribute is (name, age), edge attribute is an Int. */
  private type PersonGraph = Graph[(String, Int), Int]

  /**
   * Entry point: runs every demo section in order and always stops the context.
   *
   * The master defaults to "local" but is only set when no `spark.master` has
   * been configured, so a master supplied externally is respected.
   */
  def main(args: Array[String]): Unit = {
    Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName("SimpleGraphX").setIfMissing("spark.master", "local")
    val sc = SparkContext.getOrCreate(conf)

    // Stop the context even when one of the demo sections throws.
    try {
      val graph = buildGraph(sc)
      showProperties(graph)
      showTransformations(graph)
      showStructuralOps(graph)
      val userGraph = showJoins(graph)
      showAggregation(userGraph)
      showShortestPaths(graph)
    } finally {
      sc.stop()
    }
  }

  /** Returns whichever of the two (vertexId, degree) pairs has the larger degree; `b` on a tie. */
  def max(a: (VertexId, Int), b: (VertexId, Int)): (VertexId, Int) =
    if (a._2 > b._2) a else b

  /** Builds the six-vertex, eight-edge demo graph from in-memory arrays. */
  private def buildGraph(sc: SparkContext): PersonGraph = {
    // Vertices are (id, (name, age)) tuples.
    val vertexArray = Array(
      (1L, ("Alice", 28)),
      (2L, ("Bob", 27)),
      (3L, ("Charlie", 65)),
      (4L, ("David", 42)),
      (5L, ("Ed", 55)),
      (6L, ("Fran", 50))
    )
    // Edges are Edge(srcId, dstId, attr) with an Int attribute.
    val edgeArray = Array(
      Edge(2L, 1L, 7),
      Edge(2L, 4L, 2),
      Edge(3L, 2L, 4),
      Edge(3L, 6L, 3),
      Edge(4L, 1L, 1),
      Edge(5L, 2L, 2),
      Edge(5L, 3L, 8),
      Edge(5L, 6L, 3)
    )
    val vertexRDD = sc.parallelize(vertexArray)
    val edgeRDD = sc.parallelize(edgeArray)
    Graph(vertexRDD, edgeRDD)
  }

  /** Prints vertices, edges, filtered vertices/edges/triplets and the maximum degrees. */
  private def showProperties(graph: PersonGraph): Unit = {
    println("***********************************************")
    println("属性演示")
    println("**********************************************************")
    println("---------------------------打印顶点:")
    graph.vertices.take(10).foreach { case (id, (name, age)) =>
      println(s"userID :${id} ,name :${name} ,age :${age}")
    }
    println()
    println("---------------------------打印边:")
    graph.edges.take(10).foreach { case Edge(src, dst, attr) =>
      println(s"原顶点: ${src}, 目标顶点: ${dst} ,关系: ${attr}")
    }
    println()
    println("找出图中年龄大于30的顶点:")
    graph.vertices.filter { case (_, (_, age)) => age > 30 }.collect().foreach {
      case (_, (name, age)) => println(s"${name} is ${age}")
    }

    println()
    // Edge operation: keep the edges whose attribute is greater than 5.
    println("找出图中属性大于5的边:")
    graph.edges.filter(_.attr > 5).collect().foreach {
      case Edge(srcId, dstId, attr) => println(s"${srcId} to ${dstId} attr ${attr}")
    }

    println()
    // Triplets expose the source attribute, the destination attribute and the edge attribute together.
    println("列出边属性>5的tripltes:")
    graph.triplets.filter(_.attr > 5).collect().foreach { triplet =>
      println(s"${triplet.srcAttr._1} likes ${triplet.dstAttr._1}")
    }

    println()
    // Degree operations: reduce each degree RDD to the pair with the largest degree.
    println("找出图中最大的出度、入度、度数:")
    val maxOut = graph.outDegrees.reduce(max)
    val maxIn = graph.inDegrees.reduce(max)
    val maxAll = graph.degrees.reduce(max)
    println(s"max of outDegrees: ${maxOut} max of inDegrees: ${maxIn} max of Degrees: ${maxAll}")
    println()
  }

  /** Prints the graph after adding 10 to every age and after doubling every edge attribute. */
  private def showTransformations(graph: PersonGraph): Unit = {
    println("**********************************************************")
    println("转换操作")

    // mapVertices must return only the NEW attribute. Returning (id, attr) would
    // nest the id inside the attribute and break the "name is age" output below.
    val olderVertices = graph.mapVertices { case (_, (name, age)) => (name, age + 10) }.vertices.collect()

    println("顶点的转换操作,顶点age + 10 println:")
    olderVertices.foreach(println)
    println("顶点的转换操作,顶点age + 10:")
    olderVertices.foreach { case (_, (name, age)) => println(s"${name} is ${age}") }

    println()
    println("边的转换操作,边的属性*2:")
    graph.mapEdges(e => e.attr * 2).edges.collect().foreach { e =>
      println(s"${e.srcId} to ${e.dstId} attr ${e.attr}")
    }
  }

  /** Prints the vertices and edges of the subgraph restricted to vertices older than 30. */
  private def showStructuralOps(graph: PersonGraph): Unit = {
    println("**********************************************************")
    println("结构操作")
    println("**********************************************************")
    println("顶点年纪>30的子图:")
    // Strictly greater than 30, matching the printed label and the vertex filter in showProperties.
    val subGraph = graph.subgraph(vpred = (_, vd) => vd._2 > 30)
    println("子图所有顶点:")
    subGraph.vertices.collect().foreach { case (_, (name, age)) => println(s"${name} is ${age}") }
    println()
    println("子图所有边:")
    subGraph.edges.collect().foreach(e => println(s"${e.srcId} to ${e.dstId} attr ${e.attr}"))
    println()
  }

  /**
   * Converts the vertex attribute to [[User]], joins in- and out-degrees onto it,
   * prints the result and returns the joined graph for the aggregation demo.
   */
  private def showJoins(graph: PersonGraph): Graph[User, Int] = {
    println("**********************************************************")
    println("连接操作")
    println("**********************************************************")
    // New graph whose vertex attribute is a User with both degrees initialised to 0.
    val initialUserGraph = graph.mapVertices { case (_, (name, age)) => User(name, age, 0, 0) }

    // Vertices absent from a degree RDD have no such edges; they keep degree 0.
    val userGraph = initialUserGraph.outerJoinVertices(initialUserGraph.inDegrees) {
      case (_, u, inDegOpt) => u.copy(inDeg = inDegOpt.getOrElse(0))
    }.outerJoinVertices(initialUserGraph.outDegrees) {
      case (_, u, outDegOpt) => u.copy(outDeg = outDegOpt.getOrElse(0))
    }

    println("连接图的属性:")
    userGraph.vertices.collect().foreach { case (_, u) =>
      println(s"${u.name} inDeg: ${u.inDeg} outDeg: ${u.outDeg}")
    }
    println()

    println("出度和入度相同的人员:")
    userGraph.vertices.filter { case (_, u) => u.inDeg == u.outDeg }.collect().foreach {
      case (_, u) => println(u.name)
    }
    println()

    userGraph
  }

  /** Prints, for every user, the oldest user that has an edge pointing at them. */
  private def showAggregation(userGraph: Graph[User, Int]): Unit = {
    println("**********************************************************")
    println("聚合操作")
    println("**********************************************************")
    println("找出年纪最大的追求者:")
    val oldestFollower: VertexRDD[(String, Int)] = userGraph.aggregateMessages[(String, Int)](
      // Send step: every edge forwards its source's (name, age) to its destination.
      ctx => ctx.sendToDst((ctx.srcAttr.name, ctx.srcAttr.age)),
      // Merge step: keep the message with the larger age.
      (a, b) => if (a._2 > b._2) a else b
    )

    userGraph.vertices.leftJoin(oldestFollower) { (_, user, optOldestFollower) =>
      optOldestFollower match {
        case None => s"${user.name} does not have any followers"
        case Some((name, _)) => s"${name} is the oldest follower of ${user.name}"
      }
    }.collect().foreach { case (_, str) => println(str) }
    println()
  }

  /** Prints the shortest-path distance from vertex 5 to every vertex, computed with Pregel. */
  private def showShortestPaths(graph: PersonGraph): Unit = {
    println("**********************************************************")
    println("实用操作")
    println("**********************************************************")
    println("找出5到各顶点的最短:")

    val sourceId: VertexId = 5L
    // Distance 0 at the source, infinity everywhere else.
    val initialGraph = graph.mapVertices((id, _) => if (id == sourceId) 0.0 else Double.PositiveInfinity)
    val sssp = initialGraph.pregel(Double.PositiveInfinity)(
      // Vertex program: keep the smaller of the current and the received distance.
      (_, dist, newDist) => math.min(dist, newDist),
      // Send step: only propagate when going through this edge shortens the destination's distance.
      triplet => {
        val candidate = triplet.srcAttr + triplet.attr
        if (candidate < triplet.dstAttr) Iterator((triplet.dstId, candidate)) else Iterator.empty
      },
      // Merge step: of several incoming distances keep the smallest.
      (a, b) => math.min(a, b)
    )
    println(sssp.vertices.collect().mkString("\n"))
  }
}


posted @ 2021-05-22 14:11  笔记_y  阅读(83)  评论(0)    收藏  举报