package com.day07

import org.apache.spark.{SparkConf, SparkContext}

/** Small demo: count the lines of a text file with a Spark accumulator. */
object Leijia {

  /**
   * Reads `aa.txt`, adds 1 to an accumulator for every line, prints each line
   * followed by the accumulator as seen at that point, and finally prints the
   * accumulator itself.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    // Configure Spark: single-threaded local master.
    val conf = new SparkConf().setAppName("wc").setMaster("local")

    // Obtain the Spark context.
    val sc = new SparkContext(conf)

    try {
      // Read the data file.
      val dataRdd = sc.textFile("D:\\IDEA_Maven\\day07\\src\\main\\resources\\aa.txt")

      // A plain `var i = 0` cannot be updated from inside the closure below,
      // which is why an accumulator is used instead.
      // NOTE(review): `SparkContext.accumulator` is deprecated since Spark 2.0 in
      // favour of `longAccumulator`; kept here because the project's Spark version
      // is not visible from this file -- confirm and migrate.
      val i = sc.accumulator(0)

      dataRdd.foreach { s =>
        i += 1
        println(s + i)
      }

      println(i)
    } finally {
      // Always release the context, even when the job fails.
      sc.stop()
    }
  }

}
package com.day07

import org.apache.spark.{SparkConf, SparkContext}

/** Small demo: filter the lines of a text file against a broadcast list. */
object TestGB {

  /**
   * Reads `aa.txt` and prints every line that is contained in a broadcast
   * list of allowed values.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("TestGB").setMaster("local")
    val sc = new SparkContext(sparkConf)

    try {
      val dataRdd = sc.textFile("D:\\IDEA_Maven\\day07\\src\\main\\resources\\aa.txt")

      // Broadcast the lookup list so the closure reads it through the broadcast handle.
      val list = sc.broadcast(List("hello world"))

      dataRdd.foreach { s =>
        // The broadcast payload is read through `.value`.
        if (list.value.contains(s)) {
          println(s)
        }
      }
    } finally {
      // Always release the context, even when the job fails.
      sc.stop()
    }
  }

}
package com.day07

import java.sql.{Connection, DriverManager, PreparedStatement}

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Maps the IP addresses found in an HTTP log to (longitude, latitude) pairs
 * using a table of IP ranges, counts the hits per location and stores the
 * counts in the MySQL table `iplocation`.
 */
object IPLocaltion_Test {

  /** One row of the IP-range table: (ipStart, ipEnd, location, longitude, latitude). */
  private type IpRange = (String, String, String, String, String)

  /**
   * Runs the whole job: load the range table, broadcast it, resolve every log
   * IP against it, aggregate per location, print and persist the result.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    // Master options: "local" (one thread), "local[n]" or "local[*]".
    // Besides local runs for development/testing the job may also target
    // YARN or a standalone cluster.
    val sparkConf: SparkConf = new SparkConf().setAppName("IPLocaltion_Test").setMaster("local")
    val sc = new SparkContext(sparkConf)

    try {
      // Load the IP-range table.
      val data: RDD[String] = sc.textFile("D:\\IDEA_Maven\\day07\\src\\main\\resources\\ip.txt")

      // Split each row on '|' and keep (ipStart, ipEnd, location, longitude, latitude).
      val jizhanRDD: RDD[IpRange] = data.map(_.split("\\|")).map(
        x => (x(2), x(3), x(4) + "-" + x(5) + "-" + x(6) + "-" + x(7) + "-" + x(8), x(13), x(14)))

      // Bring the table to the driver ...
      // NOTE(review): binarySearch needs the rows ordered by ipStart; this relies on
      // ip.txt already being sorted -- confirm.
      val jizhanData: Array[IpRange] = jizhanRDD.collect()

      // ... and broadcast it: a read-only value every task can access.
      val jizhanBroadcast: Broadcast[Array[IpRange]] = sc.broadcast(jizhanData)

      // Load the log to be analysed and extract its IP column (field 1).
      val destData: RDD[String] =
        sc.textFile("D:\\IDEA_Maven\\day07\\src\\main\\resources\\20090121000132.394251.http.format")
      val ipData: RDD[String] = destData.map(_.split("\\|")).map(x => x(1))

      // Convert each IP to a Long, binary-search it in the range table and emit
      // ((longitude, latitude), 1). mapPartitions lets us read the broadcast value
      // once per partition instead of once per record.
      val result: RDD[((String, String), Int)] = ipData.mapPartitions { iter =>
        val valueArr: Array[IpRange] = jizhanBroadcast.value

        iter
          .map(ip => binarySearch(ipToLong(ip), valueArr))
          // -1 means "no range contains this IP"; indexing the array with it would
          // throw, so unmatched addresses are skipped.
          .filter(_ >= 0)
          .map { index =>
            val range = valueArr(index)
            ((range._4, range._5), 1)
          }
      }

      // Count hits per location. The result is consumed twice below (print + DB
      // write), so cache it to avoid recomputing the whole lineage.
      val resultFinal: RDD[((String, String), Int)] = result.reduceByKey(_ + _).cache()

      // Print the counts.
      resultFinal.foreach(println)

      // Persist the counts to MySQL, one connection per partition.
      resultFinal.map(x => (x._1._1, x._1._2, x._2)).foreachPartition(data2Mysql)
    } finally {
      // Always release the context, even when the job fails.
      sc.stop()
    }
  }

  /**
   * Converts a dotted IP string into a number by folding its parts left to
   * right: each step shifts the accumulated value 8 bits left and ORs in the
   * next part.
   *
   * @param ip dotted address, e.g. "1.2.3.4"
   * @return the numeric form, e.g. 16909060 for "1.2.3.4"
   * @throws NumberFormatException if a part is not a number
   */
  def ipToLong(ip: String): Long =
    ip.split("\\.").foldLeft(0L)((acc, part) => part.toLong | (acc << 8))

  /**
   * Binary-searches the range table for the entry whose [ipStart, ipEnd]
   * interval (both inclusive, stored as decimal strings) contains `ipNum`.
   *
   * Precondition (not checked): `valueArr` is ordered by ascending ipStart and
   * its intervals do not overlap.
   *
   * @param ipNum    numeric IP to look up
   * @param valueArr range table
   * @return index of the matching entry, or -1 when no entry contains `ipNum`
   */
  def binarySearch(ipNum: Long, valueArr: Array[(String, String, String, String, String)]): Int = {
    var low = 0
    var high = valueArr.length - 1
    var found = -1

    while (found < 0 && low <= high) {
      // Overflow-safe midpoint.
      val middle = low + (high - low) / 2
      val rangeStart = valueArr(middle)._1.toLong
      val rangeEnd = valueArr(middle)._2.toLong

      if (ipNum < rangeStart) {
        // Exclude `middle` itself, otherwise the window never shrinks.
        high = middle - 1
      } else if (ipNum > rangeEnd) {
        low = middle + 1
      } else {
        found = middle
      }
    }

    found
  }

  /**
   * Inserts every (longitude, latitude, count) triple of one partition into
   * the `iplocation` table. Failures while preparing or executing the insert
   * are printed and not rethrown; a failure to open the connection propagates.
   *
   * @param iterator rows of one partition
   */
  def data2Mysql(iterator: Iterator[(String, String, Int)]): Unit = {
    // Placeholders are bound per row below; indices start at 1.
    val sql = "insert into iplocation(longitude,latitude,total_count) values(?,?,?)"
    println(sql)

    // NOTE(review): credentials are hard-coded; consider moving them to configuration.
    val conn: Connection =
      DriverManager.getConnection("jdbc:mysql://localhost:3306/spark?serverTimezone=UTC", "root", "123")

    try {
      // Prepare once and reuse for every row, so exactly one statement needs closing.
      val ps: PreparedStatement = conn.prepareStatement(sql)
      try {
        iterator.foreach { line =>
          println("---------------")
          ps.setString(1, line._1)
          ps.setString(2, line._2)
          ps.setLong(3, line._3)
          ps.execute()
        }
      } finally {
        ps.close()
      }
    } catch {
      case e: Exception => println(e)
    } finally {
      conn.close()
    }
  }

}