• 博客园logo
  • 会员
  • 众包
  • 新闻
  • 博问
  • 闪存
  • 赞助商
  • HarmonyOS
  • Chat2DB
    • 搜索
      所有博客
    • 搜索
      当前博客
  • 写随笔 我的博客 短消息 简洁模式
    用户头像
    我的博客 我的园子 账号设置 会员中心 简洁模式 ... 退出登录
    注册 登录

dmhn

  • 博客园
  • 联系
  • 管理

公告

View Post

MapReduce 提取一个网站的信息


package zidingyi;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Driver that configures and submits the MapReduce job built from
 * {@link YongMapper} and {@link YongReducer}.
 *
 * <p>Usage: {@code YongSubmitter <input path> <output path>}. Any existing
 * output path is deleted recursively before the job is submitted.
 */
public class YongSubmitter{
    /** Process exit status used when the command-line arguments are missing. */
    private static final int EXIT_USAGE = 2;

    /**
     * Entry point.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} is the output path
     * @throws IOException            if filesystem access or job submission fails
     * @throws ClassNotFoundException if a job class cannot be resolved while running the job
     * @throws InterruptedException   if the thread is interrupted while waiting for the job
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        if (args.length < 2) {
            System.err.println("Usage: YongSubmitter <input path> <output path>");
            System.exit(EXIT_USAGE);
        }

        Configuration conf = new Configuration();
        Path inputPath = new Path(args[0]);
        Path outputPath = new Path(args[1]);

        // Resolve the filesystem from the output path itself so a fully
        // qualified URI on a non-default filesystem is handled correctly.
        FileSystem fs = outputPath.getFileSystem(conf);
        if (fs.exists(outputPath)) {
            // The job refuses to start when the output directory already exists.
            fs.delete(outputPath, true);
        }

        // Pass conf explicitly; Job.getInstance() with no argument would build
        // its own Configuration and ignore the one created above.
        Job job = Job.getInstance(conf);
        job.setJarByClass(YongSubmitter.class);

        job.setMapperClass(YongMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);

        job.setReducerClass(YongReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        FileInputFormat.setInputPaths(job, inputPath);
        FileOutputFormat.setOutputPath(job, outputPath);

        // Propagate job success/failure to the caller through the exit status.
        boolean succeeded = job.waitForCompletion(true);
        System.exit(succeeded ? 0 : 1);
    }
}

 


package zidingyi;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.
Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Mapper that splits each input line on commas and emits fields 0, 3 and 4
 * joined by tab characters as the output key, with a {@link NullWritable} value.
 *
 * <p>Lines with fewer than five comma-separated fields are skipped and counted
 * under the {@code YongMapper / MALFORMED_LINES} counter instead of failing the
 * task with an {@link ArrayIndexOutOfBoundsException}.
 */
public class YongMapper extends Mapper<LongWritable,Text, Text,NullWritable>{

    /** Minimum number of fields a line needs so that index 4 is addressable. */
    private static final int MIN_FIELDS = 5;

    /** Reused output key; avoids allocating a new Text for every record. */
    private final Text outKey = new Text();

    /**
     * Emits the selected fields of one input line.
     *
     * @param key     input key supplied by the framework (not used here)
     * @param value   one line of comma-separated text
     * @param context sink for the output record and the malformed-line counter
     * @throws IOException          if writing the output record fails
     * @throws InterruptedException if the write is interrupted
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // A negative limit keeps trailing empty fields, so a row whose last
        // columns are empty still yields the expected number of entries.
        String[] fields = value.toString().split(",", -1);
        if (fields.length < MIN_FIELDS) {
            context.getCounter("YongMapper", "MALFORMED_LINES").increment(1);
            return;
        }

        outKey.set(fields[0] + "\t" + fields[3] + "\t" + fields[4]);
        context.write(outKey, NullWritable.get());
    }
}
package zidingyi;

import java.io.IOException;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * Reducer that forwards each key it receives, paired with a
 * {@link NullWritable} value, and ignores the grouped values.
 */
public class YongReducer extends Reducer<Text, NullWritable, Text, NullWritable> {

    /**
     * Writes the incoming key exactly once per invocation.
     *
     * @param key     the key for this reduce call
     * @param values  the values grouped under {@code key}; never read
     * @param context sink for the output record
     * @throws IOException          if writing the output record fails
     * @throws InterruptedException if the write is interrupted
     */
    @Override
    protected void reduce(Text key, Iterable<NullWritable> values, Context context)
            throws IOException, InterruptedException {
        final NullWritable empty = NullWritable.get();
        context.write(key, empty);
    }
}

 文本和文件在文本里

posted on 2020-05-24 21:32  我是一只魈魈魈鸟  阅读(33)  评论(0)    收藏  举报

刷新页面返回顶部
 
博客园  ©  2004-2025
浙公网安备 33010602011771号 浙ICP备2021040463号-3