Hadoop Lab: HDFS and MapReduce Operations
Programming with the HDFS API
Use the HDFS FileSystem API to access files in the distributed file system: creating, modifying, deleting, and so on.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Arrays;

public class HdfsClient {

    private FileSystem fs;

    @Before
    public void init() throws IOException, InterruptedException, URISyntaxException {
        URI uri = new URI("hdfs://hadoop102:8020");
        Configuration conf = new Configuration();
        String user = "admin";
        // obtain the client (FileSystem) object
        fs = FileSystem.get(uri, conf, user);
    }

    @After
    public void close() throws IOException {
        fs.close();
    }

    @Test
    public void testMkdir() throws IOException {
        fs.mkdirs(new Path("/xiyou"));
    }

    // Configuration priority (lowest to highest):
    // hdfs-default.xml < hdfs-site.xml < config files in the resources directory (classpath) < settings in code

    // Upload
    @Test
    public void testPut() throws IOException {
        // args: 1. delete the source file  2. allow overwrite  3. source path  4. destination path
        fs.copyFromLocalFile(true, true, new Path("D:\\text.txt"), new Path("/xiyou"));
        // if the destination file already exists and arg 2 is false, an exception is thrown
    }

    // Download
    @Test
    public void testGet() throws IOException {
        // args: 1. delete the source file  2. source (HDFS) path  3. destination (local) path
        //       4. use RawLocalFileSystem (true skips writing the local .crc checksum file)
        fs.copyToLocalFile(false, new Path("/xiyou/text.txt"), new Path("D:\\"), false);
    }

    // Delete
    @Test
    public void testDelete() throws IOException {
        // args: 1. path to delete  2. recursive
        fs.delete(new Path("/sanguo"), false);
        // deleting a non-empty directory requires recursive = true
    }

    // Rename and move files
    @Test
    public void testMove() throws IOException {
        // args: 1. source path  2. destination path
        // rename a file or directory
        // fs.rename(new Path("/xiyou"), new Path("/xiyou1"));
        // move a file
        fs.rename(new Path("/xiyou1/text.txt"), new Path("/textc.txt"));
    }

    // Get file details
    @Test
    public void testFiles() throws IOException {
        RemoteIterator<LocatedFileStatus> list = fs.listFiles(new Path("/"), true);
        while (list.hasNext()) {
            LocatedFileStatus f = list.next();
            System.out.println("======" + f.getPath() + "=====");
            System.out.println(f.getOwner());
            System.out.println(f.getLen());
            System.out.println(f.getGroup());
            System.out.println(f.getPermission());
            System.out.println(f.getModificationTime());
            System.out.println(f.getReplication());
            System.out.println(f.getBlockSize());
            System.out.println(f.getPath().getName());
            BlockLocation[] blockLocations = f.getBlockLocations();
            System.out.println(Arrays.toString(blockLocations));
        }
    }

    // Distinguish files from directories
    @Test
    public void testcFie() throws IOException {
        FileStatus[] listStatus = fs.listStatus(new Path("/"));
        for (FileStatus fileStatus : listStatus) {
            if (fileStatus.isFile()) {
                System.out.println("f:" + fileStatus.getPath().getName());
            } else {
                System.out.println("d:" + fileStatus.getPath().getName());
            }
        }
    }
}
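The configuration-priority rule noted in the comments above can be checked with a small standalone program. The sketch below is illustrative only (the class name ReplicationPriorityDemo and the replication value 2 are assumptions, not part of the lab code); it shows that a property set in code takes precedence over hdfs-site.xml, any hdfs-site.xml on the classpath, and hdfs-default.xml.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import java.net.URI;

// Illustrative sketch only: demonstrates the configuration priority described above.
public class ReplicationPriorityDemo {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // highest priority: a value set in code overrides the classpath hdfs-site.xml,
        // which overrides the cluster hdfs-site.xml, which overrides hdfs-default.xml
        conf.set("dfs.replication", "2");
        FileSystem fs = FileSystem.get(new URI("hdfs://hadoop102:8020"), conf, "admin");
        // files created through this client are written with replication factor 2
        fs.copyFromLocalFile(false, true, new Path("D:\\text.txt"), new Path("/xiyou"));
        System.out.println(fs.getFileStatus(new Path("/xiyou/text.txt")).getReplication());
        fs.close();
    }
}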
MapReduce Parallel Program Development: Maximum Temperature per Year
package org.example.yunjisuan;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Temperature {

    static class TempMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // sample output: Before Mapper: 0, 2000010115
            System.out.print("Before Mapper: " + key + ", " + value);
            String line = value.toString();
            String year = line.substring(0, 4);                      // first 4 chars: year
            int temperature = Integer.parseInt(line.substring(8));   // from char 8: temperature
            context.write(new Text(year), new IntWritable(temperature));
            // sample output: After Mapper:2000, 15
            System.out.println("======" + "After Mapper:" + new Text(year) + ", " + new IntWritable(temperature));
        }
    }

    static class TempReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int maxValue = Integer.MIN_VALUE;
            StringBuffer sb = new StringBuffer();
            // take the maximum of the values
            for (IntWritable value : values) {
                maxValue = Math.max(maxValue, value.get());
                sb.append(value).append(", ");
            }
            // sample output: Before Reduce: 2000, 15, 23, 99, 12, 22,
            System.out.print("Before Reduce: " + key + ", " + sb.toString());
            context.write(key, new IntWritable(maxValue));
            // sample output: After Reduce: 2000, 99
            System.out.println("======" + "After Reduce: " + key + ", " + maxValue);
        }
    }

    public static void main(String[] args) throws Exception {
        // input path
        String dst = "hdfs://hadoop102:8020/cloudcomputing/data.TXT";
        // output path: must not exist yet, not even as an empty directory
        String dstOut = "hdfs://hadoop102:8020/cloudcomputing/outputtem";

        Configuration hadoopConfig = new Configuration();
        hadoopConfig.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
        hadoopConfig.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());

        Job job = Job.getInstance(hadoopConfig);

        // needed if the program is packaged as a jar and run on the cluster
        // job.setJarByClass(Temperature.class);

        // input and output paths for the job
        FileInputFormat.addInputPath(job, new Path(dst));
        FileOutputFormat.setOutputPath(job, new Path(dstOut));

        // set the custom Mapper and Reducer for the two phases
        job.setMapperClass(TempMapper.class);
        job.setReducerClass(TempReducer.class);

        // set the key and value types of the final output
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // run the job and wait for completion
        job.waitForCompletion(true);
        System.out.println("Finished");
    }
}
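Because taking a maximum is associative and commutative, the reducer could also be registered as a combiner so that each map task pre-aggregates its own output before the shuffle. This is an optional addition, not part of the original driver; the line below would go in main() next to job.setReducerClass(...).

// Optional tweak (not in the original driver): reuse TempReducer as a combiner.
// A per-mapper partial maximum is computed before the shuffle, which reduces
// the data sent over the network without changing the final result.
job.setCombinerClass(TempReducer.class);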
Uploading the Experiment Data
hadoop fs -copyFromLocal mydata/data.TXT /cloudcomputing/data.TXT
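The same upload can also be done programmatically with the FileSystem API from the HDFS client section above. A minimal sketch, assuming the same NameNode address and user as earlier (the class name UploadData is only for illustration):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import java.net.URI;

// Illustrative alternative to the shell command above.
public class UploadData {
    public static void main(String[] args) throws Exception {
        // connect to the same NameNode used elsewhere in this lab
        FileSystem fs = FileSystem.get(new URI("hdfs://hadoop102:8020"), new Configuration(), "admin");
        // keep the local copy, overwrite the HDFS copy if it already exists
        fs.copyFromLocalFile(false, true, new Path("mydata/data.TXT"), new Path("/cloudcomputing/data.TXT"));
        // confirm the file landed where the MapReduce job expects it
        System.out.println(fs.exists(new Path("/cloudcomputing/data.TXT")));
        fs.close();
    }
}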
Word Count
Mapper

package org.example.wordcount;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/*
 KEYIN:    map-phase input key type:   LongWritable (byte offset of the line)
 VALUEIN:  map-phase input value type: Text (one line of text)
 KEYOUT:   output key type:            Text (a word)
 VALUEOUT: output value type:          IntWritable (count for that word)
*/
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    Text k = new Text();
    IntWritable v = new IntWritable(1); // must be initialized: each occurrence counts as 1

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // 1. read one line
        String line = value.toString();
        // 2. split into words
        String[] words = line.split(" ");
        // 3. emit (word, 1) for every word
        for (String word : words) {
            k.set(word);
            context.write(k, v);
        }
    }
}

Reducer

package org.example.wordcount;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/*
 KEYIN:    reduce-phase input key type:   Text
 VALUEIN:  reduce-phase input value type: IntWritable
 KEYOUT:   output key type:               Text
 VALUEOUT: output value type:             IntWritable
*/
public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    int sum;
    IntWritable v = new IntWritable();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        // 1. sum the counts, e.g. key "R" with values (1, 1)
        sum = 0;
        for (IntWritable count : values) {
            sum = sum + count.get();
        }
        // 2. emit (word, total)
        v.set(sum);
        context.write(key, v);
    }
}

WordCountDriver

package org.example.wordcount;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCountDriver {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // 1. get the configuration and the job object
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        // 2. associate this driver's jar
        job.setJarByClass(WordCountDriver.class);

        // 3. associate the Mapper and Reducer
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);

        // 4. set the Mapper output kv types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // 5. set the final output kv types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // 6. set the input and output paths
        // (for a local test these can instead point at local directories,
        //  e.g. F:\mrtest\inputword and F:\mrtest\output)
        FileInputFormat.setInputPaths(job, new Path("hdfs://hadoop102:8020/cloudcomputing/word.TXT"));
        FileOutputFormat.setOutputPath(job, new Path("hdfs://hadoop102:8020/cloudcomputing/outputword"));

        // 7. submit the job
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
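The driver above hard-codes the HDFS input and output paths. An optional variant of step 6 (not part of the original code) reads the paths from the command line, so the same jar can be reused for other datasets; the jar name in the usage comment is hypothetical.

// Optional variant of step 6: take the paths from the command line instead of hard-coding them.
// Usage (hypothetical jar name): hadoop jar wordcount.jar org.example.wordcount.WordCountDriver <input> <output>
if (args.length != 2) {
    System.err.println("Usage: WordCountDriver <input path> <output path>");
    System.exit(2);
}
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));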