import java.io.IOException;

import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class Job52Mapper extends Mapper<LongWritable, Text, Text, DoubleWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Read one line at a time and convert it to a string.
        String line = value.toString();
        // Split the line on tabs; fields look like
        // [大数据开发工程师, 上海吉祥航空股份有限公司, 上海, ...] (title, company, location, salary).
        String[] arr = line.split("\t");
        if (arr.length > 1) {
            String post = arr[0];
            // Keep only postings whose title mentions "大数据" (big data).
            if (post.contains("大数据")) {
                if (arr.length > 2) {
                    String city = arr[2];
                    // Trim a location like "上海-浦东新区" down to the city name.
                    int index = city.indexOf("-");
                    if (index > 0) {
                        city = city.substring(0, index);
                    }
                    if (arr.length > 3) {
                        String salary = arr[3];
                        double avg = 0;
                        // Normalize every salary format to 千/月 (thousand yuan per month).
                        if (salary.contains("万/年") || salary.contains("万以上/年")) {
                            // e.g. "10-15万/年": average the bounds, 万 -> 千 (*10), year -> month (/12).
                            String[] arr1 = salary.split("万")[0].split("-");
                            avg = (Double.parseDouble(arr1[0]) + Double.parseDouble(arr1[arr1.length - 1])) * 10 / 12 / 2;
                        } else if (salary.contains("万/月")) {
                            // e.g. "1-1.5万/月": average the bounds, 万 -> 千 (*10).
                            String[] arr1 = salary.replace("万/月", "").split("-");
                            avg = (Double.parseDouble(arr1[0]) + Double.parseDouble(arr1[arr1.length - 1])) * 10 / 2;
                        } else if (salary.contains("千/月") || salary.contains("千以下/月")) {
                            // e.g. "8-9千/月": already in 千/月, just average the bounds.
                            String[] arr1 = salary.split("千")[0].split("-");
                            avg = (Double.parseDouble(arr1[0]) + Double.parseDouble(arr1[arr1.length - 1])) / 2;
                        }
                        // Skip records whose salary format was not recognized, so a
                        // leftover avg of 0 does not drag the per-city average down.
                        if (avg > 0) {
                            context.write(new Text(city), new DoubleWritable(avg));
                        }
                    }
                }
            }
        }
    }
}
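The unit conversions above are the easiest place to slip, so here is a minimal standalone sketch of the same normalization applied to a few sample salary strings. The class name and the sample values are hypothetical, chosen only to exercise each branch of the mapper's logic:

// Hypothetical demo: mirrors the three salary branches from Job52Mapper.
public class SalaryNormalizeDemo {
    // Normalize a 51job-style salary string to 千/月 (thousand yuan per month).
    static double normalize(String salary) {
        double avg = 0;
        if (salary.contains("万/年")) {
            String[] a = salary.split("万")[0].split("-");
            avg = (Double.parseDouble(a[0]) + Double.parseDouble(a[a.length - 1])) * 10 / 12 / 2;
        } else if (salary.contains("万/月")) {
            String[] a = salary.replace("万/月", "").split("-");
            avg = (Double.parseDouble(a[0]) + Double.parseDouble(a[a.length - 1])) * 10 / 2;
        } else if (salary.contains("千/月")) {
            String[] a = salary.split("千")[0].split("-");
            avg = (Double.parseDouble(a[0]) + Double.parseDouble(a[a.length - 1])) / 2;
        }
        return avg;
    }

    public static void main(String[] args) {
        System.out.println(normalize("10-15万/年")); // (10+15)/2 * 10 / 12 ≈ 10.42 千/月
        System.out.println(normalize("1-1.5万/月")); // (1+1.5)/2 * 10   = 12.5  千/月
        System.out.println(normalize("8-9千/月"));   // (8+9)/2          = 8.5   千/月
    }
}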
import java.io.IOException;

import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
// Input: Mapper output such as <上海, 12.5> <苏州, 8.5> <上海, 10.0>,
// grouped by key into e.g. <上海, <12.5, 10.0>>.
// Reducer output: the per-city mean, e.g. <上海, 11.25>.
public class Job52Reducer extends Reducer<Text, DoubleWritable, Text, DoubleWritable> {

    @Override
    protected void reduce(Text key, Iterable<DoubleWritable> values, Context context)
            throws IOException, InterruptedException {
        // Sum the normalized salaries for this city and count them.
        // sum must be a double: an int accumulator would truncate every value,
        // and sum/count would then be integer division.
        double sum = 0;
        int count = 0;
        for (DoubleWritable i : values) {
            sum += i.get();
            count++;
        }
        // Emit the per-city average salary (千/月).
        context.write(key, new DoubleWritable(sum / count));
    }
}
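A tempting optimization is job.setCombinerClass(Job52Reducer.class), but averaging is not associative: an average of partial averages generally differs from the true average, so this reducer must not be reused as a combiner. A quick numeric sketch with made-up values:

// Hypothetical values for one city, split across two map tasks.
double avgOfAvgs = ((10.0 + 20.0) / 2 + 30.0) / 2;  // 22.5 -- average of partial averages
double trueAvg   = (10.0 + 20.0 + 30.0) / 3;        // 20.0 -- what Job52Reducer computes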
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class Job52Runner {

    public static void main(String[] args)
            throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        // Create the job.
        Job job = Job.getInstance(conf, "job52");
        // Set the input and output paths from the command line.
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // Wire up the driver, mapper, and reducer classes.
        job.setJarByClass(Job52Runner.class);
        job.setMapperClass(Job52Mapper.class);
        job.setReducerClass(Job52Reducer.class);
        // Mapper and reducer emit the same key/value types, so one pair of
        // setOutput*Class calls covers both.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(DoubleWritable.class);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
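Assuming the three classes are packaged into a jar (the jar name and paths below are placeholders), the job would be submitted along the lines of hadoop jar job52.jar Job52Runner /input/51job /output2. The reducer's result then lands in files such as part-r-00000, which the visualization script below reads.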
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

# Use a Chinese-capable font so the city names render correctly.
matplotlib.rcParams['font.family'] = 'SimHei'
matplotlib.rcParams['font.sans-serif'] = ['SimHei']

# The reducer output has no header row, so name the columns explicitly;
# a raw string keeps the Windows path's backslashes from being treated as escapes.
data = pd.read_csv(r"E:\output2\part-r-00000", encoding="utf-8", sep="\t",
                   header=None, names=['地区名称', '平均薪资'])
print(data.head())

# Sort by average salary in descending order and keep the top five cities.
data = data.sort_values(by='平均薪资', ascending=False).head(5)
plt.figure(figsize=(8, 5))
x = data["地区名称"]
y = data["平均薪资"]

# Draw the bar chart; the label feeds the legend below.
plt.bar(x, y, width=0.5, color="g", label="千/月")
plt.title("大数据相关职位地区前五的平均薪资")
plt.xlabel("城市名称")
plt.ylabel("平均薪资")
plt.legend()
plt.show()
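Note that the y-axis values are in 千/月 (thousand yuan per month), because the mapper normalized every salary format to that unit; that is what the 千/月 legend entry records.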