<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>hnkjzy.cn</groupId>
<artifactId>weather15</artifactId>
<version>1.0-SNAPSHOT</version>
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>2.4.5</version>
</dependency>
</dependencies>
</project>
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object Weather15 {
  /**
   * Counts occurrences of each weather type in a whitespace-delimited data set
   * (field 0 = date, field 1 = weather, e.g. "阴/小雨") and writes the
   * (weather, count) pairs as a single text file.
   *
   * @param args optional: args(0) = input path, args(1) = output path;
   *             defaults preserve the original hard-coded Windows paths.
   */
  def main(args: Array[String]): Unit = {
    // Generalized: allow paths from the command line, falling back to the originals.
    val inputPath  = if (args.length > 0) args(0) else "E:\\inputweather"
    val outputPath = if (args.length > 1) args(1) else "E:\\output15"

    // Local mode with a single core; change master for cluster runs.
    val sparkConf = new SparkConf().setAppName("Weather15").setMaster("local[1]")
    // SparkContext is the entry point for all job scheduling (DAGScheduler/TaskScheduler).
    val sparkContext = new SparkContext(sparkConf)

    // Each element is one raw line of the input file.
    val data: RDD[String] = sparkContext.textFile(inputPath)

    // Extract the weather field. The original indexed fields(1) unconditionally,
    // which throws ArrayIndexOutOfBoundsException on blank/malformed lines —
    // guard with a length filter. "阴/小雨" is normalized to its first part "阴".
    val weather: RDD[String] = data
      .map(_.split(" "))
      .filter(_.length > 1)
      .map { fields =>
        val w = fields(1)
        val slash = w.indexOf("/")
        if (slash > 0) w.substring(0, slash) else w
      }

    // Classic word-count: pair each weather type with 1, then sum per key.
    val result: RDD[(String, Int)] = weather.map((_, 1)).reduceByKey(_ + _)

    // coalesce(1, shuffle = true) forces a single output part file.
    result.coalesce(1, true).saveAsTextFile(outputPath)

    sparkContext.stop()
  }
}
# Visualize the Spark weather-count output ("(weather,count)" lines) as a pie chart.
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

# Use the SimHei font so Chinese labels render instead of showing tofu boxes.
matplotlib.rcParams['font.family'] = 'SimHei'
matplotlib.rcParams['font.sans-serif'] = ['SimHei']

# Spark prints each tuple as "(weather,count)"; with delimiter=',' column 0 is
# "(weather" and column 1 is "count)". Strip the surrounding parentheses.
data = pd.read_csv(r"E:\output15\part-00000", delimiter=',', header=None)
data.columns = ['天气情况', '天数']
data['天气情况'] = data['天气情况'].str.lstrip('(')
# Bug fix: the count column must be numeric — plt.pie on string values fails
# (or mis-sizes wedges) on modern matplotlib.
data['天数'] = data['天数'].astype(str).str.rstrip(')').astype(int)

plt.figure(figsize=(10, 8))
plt.pie(data['天数'], labels=data['天气情况'], autopct='%1.2f%%')
plt.title("2018年长沙全年各种类型天气占比数量图")
plt.legend(loc='upper right', bbox_to_anchor=(1.7, 1.05), fontsize=10, borderaxespad=0.3)
plt.show()