51JOB网站大数据_岗位数据_分析与可视化
Job51Mapper
import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Mapper; import java.io.IOException; //keyin:行号,valuein:Text , //输出<上海,1> <苏州,1> public class Job51Mapper extends Mapper<LongWritable,Text,Text,IntWritable>{ @Override protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { // super.map(key, value, context); //一行一行的读,然后将每一行文本转成字符串 String line=value.toString(); //分割每一行 String[] arr=line.split("\\t"); //[大数据开发工程师 上海吉祥航空股份有限公司 上海] if(arr.length>2){ String city=arr[2]; //判断地区是否包含“-” int index=city.indexOf("-"); if(index>0){ //条件满足的话说明地区包含“-”,例如“深圳-龙华新区”,利用subString截取-前面的数据 city=city.substring(0,index); } context.write(new Text(city),new IntWritable(1)); } } }
import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Reducer; import java.io.IOException; //输入:Mapper的输出<上海,1> <苏州,1> <上海,1> //<上海,<1,1,1>> //Reducer的输出 <上海,2300> public class Job51Reducer extends Reducer<Text,IntWritable,Text,IntWritable> { @Override protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException { //super.reduce(key, values, context); int sum=0; //是每个地区的岗位数量和 for(IntWritable i :values){ sum+=i.get(); //i.get()是把IntWritable转成int } context.write(key,new IntWritable(sum)); //reducer的输出结果 } }
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import java.io.FileOutputStream; import java.io.IOException; public class Job51Runner { public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException { Configuration conf=new Configuration(); //创建job Job job= Job.getInstance(conf,"job51"); //设置输入输出路径 FileInputFormat.addInputPath(job,new Path(args[0])); FileOutputFormat.setOutputPath(job,new Path(args[1])); //设置运行类 job.setJarByClass(Job51Runner.class); job.setMapperClass(Job51Mapper.class); job.setReducerClass(Job51Reducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); System.exit(job.waitForCompletion(true)?0:1); } }
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

# Use a font with CJK glyphs so the Chinese labels render.
matplotlib.rcParams['font.family'] = 'SimHei'
matplotlib.rcParams['font.sans-serif'] = ['SimHei']

# The reducer output is tab-separated with no header row, so pass header=None;
# otherwise the first city would be consumed as the column header and lost.
# A raw string keeps the Windows backslashes from being read as escapes.
data = pd.read_csv(
    r"E:\output1\part-r-00000",
    encoding="utf-8",
    sep='\t',
    header=None,
    names=['地区名称', '岗位招聘数'],
)

# Preview the first rows (displayed when run in a notebook).
data.head()

# Sort by posting count in descending order before taking the top five;
# the file itself is not ordered by count.
data = data.sort_values('岗位招聘数', ascending=False).head(5)

plt.figure(figsize=(10, 6))
x = data['地区名称']
y = data['岗位招聘数']
plt.bar(x, y, color='r', width=.3, label='岗位数量')
plt.xlabel('城市名称')
plt.ylabel('岗位数量')
plt.title('岗位招聘数前五的城市')
plt.legend(fontsize=12)
plt.show()

浙公网安备 33010602011771号