import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
 * Mapper that emits {@code (type, 1)} for each input line having more than four fields.
 *
 * <p>Each line is split on {@link #FIELD_SEPARATOR}; the fifth field (index 4) is used as the
 * key. If that field contains a comma at a position greater than zero, only the text before the
 * first comma is kept. Lines with too few fields are silently skipped.
 *
 * <p>NOTE(review): the fifth field is presumably the movie's genre list — confirm against the
 * input data schema.
 */
public class Movie12Mapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    /**
     * Field separator passed to {@link String#split(String)}.
     *
     * <p>NOTE(review): {@code "\00"} is a Java octal escape, i.e. the single character U+0000
     * (NUL). Confirm the input really is NUL-delimited.
     */
    private static final String FIELD_SEPARATOR = "\00";

    /** Zero-based index of the field used as the output key. */
    private static final int TYPE_FIELD_INDEX = 4;

    /** Constant count emitted for every record; never mutated, so it is safe to share. */
    private static final IntWritable ONE = new IntWritable(1);

    /** Reused output key, avoiding one allocation per input record. */
    private final Text outKey = new Text();

    /**
     * Emits the leading type of one input line together with a count of one.
     *
     * @param key     input key supplied by the framework (unused)
     * @param value   one line of input text
     * @param context job context used to emit the output pair
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] fields = value.toString().split(FIELD_SEPARATOR);
        if (fields.length <= TYPE_FIELD_INDEX) {
            // Not enough fields: nothing to emit for this record.
            return;
        }

        String type = fields[TYPE_FIELD_INDEX];
        int commaAt = type.indexOf(',');
        // Only truncate when the comma is not the very first character.
        if (commaAt > 0) {
            type = type.substring(0, commaAt);
        }

        outKey.set(type);
        context.write(outKey, ONE);
    }
}
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/**
 * Reducer that sums all integer counts received for a key and emits {@code (key, sum)}.
 *
 * <p>Input and output types are identical ({@code Text} / {@code IntWritable}) and the
 * operation is a plain sum.
 */
public class Movie12Reducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    /** Reused output value, avoiding one allocation per key. */
    private final IntWritable total = new IntWritable();

    /**
     * Adds up every value for {@code key} and writes the total.
     *
     * @param key     the grouping key
     * @param values  the counts associated with {@code key}
     * @param context job context used to emit the output pair
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable count : values) {
            sum += count.get();
        }
        total.set(sum);
        context.write(key, total);
    }
}
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Driver class that configures and submits the "movie12" MapReduce job.
 */
public class Movie12Runner {

    /**
     * Configures the job with {@link Movie12Mapper} and {@link Movie12Reducer}, runs it, and
     * exits the JVM with status 0 on success, 1 on job failure, or 2 when the required
     * arguments are missing.
     *
     * @param args two elements: {@code args[0]} is the input directory and {@code args[1]} is
     *             the output directory. Note: the output directory must not already exist.
     * @throws Exception if job setup or execution fails
     */
    public static void main(String[] args) throws Exception {
        // Fail with a clear usage message instead of an ArrayIndexOutOfBoundsException.
        if (args.length < 2) {
            System.err.println("Usage: Movie12Runner <input path> <output path>");
            System.exit(2);
        }

        // Load the configuration.
        Configuration conf = new Configuration();
        // Create the job.
        Job job = Job.getInstance(conf, "movie12");

        // Set the input and output paths.
        FileInputFormat.addInputPath(job, new Path(args[0]));   // input
        FileOutputFormat.setOutputPath(job, new Path(args[1])); // output

        // Set the classes that make up the job.
        job.setJarByClass(Movie12Runner.class);
        job.setMapperClass(Movie12Mapper.class);
        // The reducer is a plain sum with identical input/output types, so it can also
        // pre-aggregate map output as a combiner without changing the final result.
        job.setCombinerClass(Movie12Reducer.class);
        job.setReducerClass(Movie12Reducer.class);
        job.setOutputKeyClass(Text.class);          // output key type
        job.setOutputValueClass(IntWritable.class); // output value type

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
# Render text with the SimHei font, both as the generic family and as the
# sans-serif fallback list, so the Chinese labels below can be displayed.
matplotlib.rcParams.update({
    'font.family': 'SimHei',
    'font.sans-serif': ['SimHei'],
})

# Load the tab-separated, header-less result file.
# NOTE(review): presumably the reducer output of the movie12 job - confirm.
result_path = r"E:\output12\part-r-00000"
type_col, count_col = '类型', '上映电影数量'
data = pd.read_csv(result_path, sep='\t', header=None)
data.columns = [type_col, count_col]
# Preview of the first rows; the result is discarded when run as a plain script.
data.head()

# Pie chart: one wedge per type, sized by its count, labelled with a
# two-decimal percentage.
plt.figure(figsize=(12, 8))
plt.pie(data[count_col], labels=data[type_col], autopct='%1.2f%%')
plt.title("上映电影类型数目占比图")
# Anchor the legend outside the axes, to the right of the pie.
plt.legend(
    loc='upper right',
    bbox_to_anchor=(1.7, 1.05),
    fontsize=10,
    borderaxespad=0.3,
)
plt.show()