Hadoop Lab: HDFS and MapReduce Operations

 

HDFS API Programming

Use the HDFS FileSystem API to access files in the distributed file system, e.g. to create, modify, and delete them.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Arrays;

public class HdfsClient {
    private FileSystem fs;
    @Before
    public void init() throws IOException, InterruptedException, URISyntaxException {

        URI uri=new URI("hdfs://hadoop102:8020");
        Configuration conf=new Configuration();

        String user="admin";
        // get the FileSystem client object

        fs= FileSystem.get(uri,conf,user);

    }
    @After
    public void close() throws IOException {
        fs.close();
    }
    @Test
    public void testMkdir() throws IOException{

        fs.mkdirs(new Path("/xiyou"));

    }
    // Configuration precedence (lowest to highest):
    // hdfs-default.xml < hdfs-site.xml < config file in the project's resources directory < values set in code
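
    // For illustration (a sketch, not part of the original experiment): a value set on a
    // Configuration object in code overrides the same key from an hdfs-site.xml on the
    // classpath, which in turn overrides the cluster defaults. "dfs.replication" = 2 is an
    // arbitrary example value; newInstance() is used so the cached fs from init() is not affected.
    @Test
    public void testConfigPriority() throws IOException, InterruptedException, URISyntaxException {
        Configuration conf = new Configuration();
        conf.set("dfs.replication", "2"); // highest priority: set in code
        FileSystem fs2 = FileSystem.newInstance(new URI("hdfs://hadoop102:8020"), conf, "admin");
        // the uploaded file should now be written with replication factor 2
        fs2.copyFromLocalFile(false, true, new Path("D:\\text.txt"), new Path("/xiyou"));
        fs2.close();
    }
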
    // Upload
    @Test
    public void testPut() throws IOException {
        // args: 1. delete the source file?  2. allow overwrite?  3. source path  4. destination path
        fs.copyFromLocalFile(true,true,new Path("D:\\text.txt"),new Path("/xiyou"));
        // if the destination already exists and overwrite (arg 2) is false, an exception is thrown
    }
    // Download
    @Test
    public void testGet() throws IOException {
        // args: 1. delete the source?  2. source path  3. destination path  4. useRawLocalFileSystem (true skips the local CRC checksum)
        fs.copyToLocalFile(false,new Path("/xiyou/text.txt"),new Path("D:\\"),false);
    }
    // Delete
    @Test
    public void testDelete() throws IOException {
        // args: 1. path to delete  2. delete recursively?
        fs.delete(new Path("/sanguo"),false);
        // a non-empty directory can only be deleted with recursive = true
    }
    // Rename and move files
    @Test
    public void testMove() throws IOException {
        // args: 1. source path  2. destination path
        // rename a file or directory
//        fs.rename(new Path("/xiyou"),new Path("/xiyou1"));
        // move a file
        fs.rename(new Path("/xiyou1/text.txt"),new Path("/textc.txt"));
    }

    // List file details
    @Test
    public void testFiles() throws IOException {
        RemoteIterator<LocatedFileStatus> list = fs.listFiles(new Path("/"), true);

        while (list.hasNext()) {
            LocatedFileStatus f = list.next();
            System.out.println("======" + f.getPath() + "=====");
            System.out.println(f.getOwner());
            System.out.println(f.getLen());
            System.out.println(f.getGroup());
            System.out.println(f.getPermission());
            System.out.println(f.getModificationTime());
            System.out.println(f.getReplication());
            System.out.println(f.getBlockSize());
            System.out.println(f.getPath().getName());
            BlockLocation[] blockLocations = f.getBlockLocations();
            System.out.println(Arrays.toString(blockLocations));
        }
    }
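
    // An extra sketch (not part of the original experiment): reading a file's contents back
    // from HDFS with fs.open() and IOUtils.copyBytes; "/xiyou/text.txt" matches the file
    // uploaded in testPut above.
    @Test
    public void testRead() throws IOException {
        try (FSDataInputStream in = fs.open(new Path("/xiyou/text.txt"))) {
            // stream the file to the console; 4096 is the buffer size, false leaves the
            // stream open so try-with-resources can close it
            org.apache.hadoop.io.IOUtils.copyBytes(in, System.out, 4096, false);
        }
    }
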
    // Distinguish files from directories
    @Test
    public void testFile() throws IOException {

        FileStatus[] listStatus = fs.listStatus(new Path("/"));

        for (FileStatus fileStatus : listStatus) {

            // file
            if (fileStatus.isFile()) {
                System.out.println("f:" + fileStatus.getPath().getName());
            } else {
                System.out.println("d:" + fileStatus.getPath().getName());
            }
        }
    }
}

MapReduce Parallel Program: Maximum Temperature per Year

 

package org.example.yunjisuan;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Temperature {
    static class TempMapper extends
            Mapper<LongWritable, Text, Text, IntWritable> {
        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // sample output: Before Mapper: 0, 2000010115
            System.out.print("Before Mapper: " + key + ", " + value);
            String line = value.toString();
            String year = line.substring(0, 4);
            int temperature = Integer.parseInt(line.substring(8));
            context.write(new Text(year), new IntWritable(temperature));
            // sample output: After Mapper: 2000, 15
            System.out.println(
                    "======" +
                            "After Mapper:" + new Text(year) + ", " + new IntWritable(temperature));
        }
    }

    static class TempReducer extends
            Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        public void reduce(Text key, Iterable<IntWritable> values,
                           Context context) throws IOException, InterruptedException {
            int maxValue = Integer.MIN_VALUE;
            StringBuffer sb = new StringBuffer();
            // take the maximum of the values
            for (IntWritable value : values) {
                maxValue = Math.max(maxValue, value.get());
                sb.append(value).append(", ");
            }
            // sample output: Before Reduce: 2000, 15, 23, 99, 12, 22,
            System.out.print("Before Reduce: " + key + ", " + sb.toString());
            context.write(key, new IntWritable(maxValue));
            // sample output: After Reduce: 2000, 99
            System.out.println(
                    "======" +
                            "After Reduce: " + key + ", " + maxValue);
        }

    }
    public static void main(String[] args) throws Exception {
        // input path
        String dst = "hdfs://hadoop102:8020/cloudcomputing/data.TXT";
        // output path; it must not already exist, not even as an empty directory
        String dstOut = "hdfs://hadoop102:8020/cloudcomputing/outputtem";
        Configuration hadoopConfig = new Configuration();

        hadoopConfig.set("fs.hdfs.impl",
                org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()
        );
        hadoopConfig.set("fs.file.impl",
                org.apache.hadoop.fs.LocalFileSystem.class.getName()
        );
        Job job = Job.getInstance(hadoopConfig);

        // to run from a packaged jar, the following line is needed
        //job.setJarByClass(Temperature.class);

        // input and output paths for the job
        FileInputFormat.addInputPath(job, new Path(dst));
        FileOutputFormat.setOutputPath(job, new Path(dstOut));

        // set the custom Mapper and Reducer classes for the two stages
        job.setMapperClass(TempMapper.class);
        job.setReducerClass(TempReducer.class);

        // set the key and value types of the final output
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // run the job and wait for completion
        job.waitForCompletion(true);
        System.out.println("Finished");

    }
}

 

Uploading the Experiment Data

hadoop fs -copyFromLocal mydata/data.TXT /cloudcomputing/data.TXT
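
For reference, each line of data.TXT encodes one record: the first four characters are the year and the characters from offset 8 onward are the temperature, matching the substring(0, 4) and substring(8) calls in TempMapper. A minimal sketch of such a file, built from the sample values printed by the Mapper and Reducer above (the month/day digits are assumed):

2000010115
2000010223
2000010399
2000010412
2000010522

Because the output path must not already exist, delete it before re-running the job, and inspect the result afterwards (assuming the default part-r-00000 output file name):

hadoop fs -rm -r /cloudcomputing/outputtem
hadoop fs -cat /cloudcomputing/outputtem/part-r-00000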

 

Word Count

Mapper
package org.example.wordcount;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/*
KEYIN:    Map-stage input key type:   LongWritable (byte offset of the line)
VALUEIN:  Map-stage input value type: Text (one line of text)
KEYOUT:   output key type:            Text (a word)
VALUEOUT: output value type:          IntWritable (count for the word)
*/
public class WordCountMapper extends Mapper<LongWritable, Text,Text, IntWritable>{
    Text k = new Text();
    IntWritable v = new IntWritable(1); // the count for each word; must be initialized to 1

    @Override
    protected void map(LongWritable key, Text value, Context context)  throws IOException, InterruptedException {

        // 1 get one line
        String line = value.toString();

        // 2 split into words
        String[] words = line.split(" ");

        // 3 write out each word with a count of 1
        for (String word : words) {

            k.set(word);
            context.write(k, v);
        }
    }
}
Reducer
package org.example.wordcount;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
/*
KEYIN:    Reduce-stage input key type:   Text
VALUEIN:  Reduce-stage input value type: IntWritable
KEYOUT:   output key type:               Text
VALUEOUT: output value type:             IntWritable
*/
public class WordCountReducer extends Reducer<Text, IntWritable,Text,IntWritable> {

    // e.g. the input for one key looks like: ("admin", (1, 1, ...))
    int sum;
    IntWritable v = new IntWritable();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {

        // 1 sum the counts
        sum = 0;
        for (IntWritable count : values) {
            sum = sum + count.get();
        }

        // 2 write out the total
        v.set(sum);
        context.write(key, v);
    }
}
WordCountDriver
package org.example.wordcount;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;


public class WordCountDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // 1 get the configuration and the Job object
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        // 2 set the jar that contains this Driver
        job.setJarByClass(WordCountDriver.class);

        // 3 set the Mapper and Reducer classes
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);

        // 4 set the Mapper output key/value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // 5 set the final output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // 6 set the input and output paths
        FileInputFormat.setInputPaths(job, new Path("hdfs://hadoop102:8020/cloudcomputing/word.TXT"));
        FileOutputFormat.setOutputPath(job, new Path("hdfs://hadoop102:8020/cloudcomputing/outputword"));

        // 7 submit the job and wait for completion
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);

    }
}
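
To run the word count the same way as the temperature job, upload an input file and inspect the output after the job finishes. A sketch using the paths hard-coded in the driver (the local mydata/word.TXT location is an assumption, mirroring the data.TXT upload above):

hadoop fs -copyFromLocal mydata/word.TXT /cloudcomputing/word.TXT
hadoop fs -cat /cloudcomputing/outputword/part-r-00000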

 
