import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class Movie10Mapper extends Mapper<LongWritable,Text,Text, DoubleWritable> {
@Override
protected void map(LongWritable key,Text value,Context context) throws IOException,InterruptedException{
//super.map(key,value,context);
String line=value.toString(); //[0 芳华 9.1 http://maoyan.com/films/1170264 剧情,爱情,战争 中国大陆 大陆上映 136 2017]
String[] arr=line.split("\00");
double mark=0;
String story=null;
if (arr.length>4) {
mark = Double.parseDouble(arr[2]);
story=arr[4];
if (story == null ||"".equals(story)){
return;
}else if (story.contains(",")){
story=story.split(",")[0];
}
context.write(new Text(story),new DoubleWritable(mark));
}
}
}
}
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/**
 * Averages all ratings received for a key and emits the mean rounded to one decimal place.
 */
public class Movie10Reducer extends Reducer<Text, DoubleWritable,Text,DoubleWritable> {
    /**
     * Sums the values for {@code key}, divides by their count, rounds the result to one
     * decimal place and writes {@code (key, rounded mean)}.
     *
     * @param key     the grouping key, written through unchanged
     * @param values  the ratings grouped under {@code key}
     * @param context sink for the emitted pair
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if writing to the context is interrupted
     */
    @Override
    protected void reduce(Text key, Iterable<DoubleWritable> values, Context context) throws IOException, InterruptedException {
        double sum = 0;
        long count = 0;
        for (DoubleWritable rating : values) {
            sum += rating.get();
            count++;
        }

        // Round to one decimal place by formatting and re-parsing. Locale.ROOT pins the
        // decimal separator to '.'; with the JVM default locale a comma separator
        // (e.g. "9,1") could be produced, which Double.parseDouble rejects.
        String rounded = String.format(java.util.Locale.ROOT, "%.1f", sum / count);
        double avgMark = Double.parseDouble(rounded);

        context.write(key, new DoubleWritable(avgMark));
    }
}
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.FileOutputStream;
import java.io.IOException;
/**
 * Command-line driver that configures and submits the "maoyan" job, wiring
 * {@link Movie10Mapper} and {@link Movie10Reducer} together.
 */
public class Movie10Runner {
    /**
     * Configures the job, runs it to completion and exits the JVM.
     *
     * <p>Exit status is 0 when the job succeeds, 1 when it fails, and 2 when the
     * required arguments are missing.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} is the output path
     * @throws IOException            if job setup or submission fails
     * @throws ClassNotFoundException if a job class cannot be loaded
     * @throws InterruptedException   if waiting for the job is interrupted
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Fail with a readable message instead of an ArrayIndexOutOfBoundsException.
        if (args.length < 2) {
            System.err.println("Usage: Movie10Runner <input path> <output path>");
            System.exit(2);
            return;
        }

        Configuration conf = new Configuration();
        // Create the job.
        Job job = Job.getInstance(conf, "maoyan");

        // Set the input and output paths.
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Set the jar, mapper and reducer classes.
        job.setJarByClass(Movie10Runner.class);
        job.setMapperClass(Movie10Mapper.class);
        job.setReducerClass(Movie10Reducer.class);

        // Output types; no separate map-output types are configured here.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(DoubleWritable.class);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
# Select the SimHei font for all text (presumably so the Chinese labels below render).
matplotlib.rcParams.update({
    'font.family': 'SimHei',
    'font.sans-serif': ['SimHei'],
})

# Load the tab-separated, header-less result file and name its two columns.
data = pd.read_csv(r"E:\output10\part-r-00000", sep='\t', header=None)
data.columns = ['类型', '上映电影评分均值']
# Preview of the first rows; the result is discarded when run as a plain script.
data.head()

# One bar per row: first column on the x axis, second column as bar height.
genres = data['类型']
mean_scores = data['上映电影评分均值']

plt.figure(figsize=(12, 6))
plt.bar(genres, mean_scores, width=0.5, label='分数')

# Title, axis labels, tilted tick labels and legend.
plt.title('上映电影类型的评分均值图表')
plt.xlabel('电影类型')
plt.ylabel('评分均值')
plt.xticks(rotation=30)
plt.legend(fontsize=12)

plt.show()