hive课堂测试1
1、 数据清洗:按照以下要求进行数据清洗,并将清洗后的数据导入hive数据库中。
两阶段数据清洗:
(1)第一阶段:把需要的信息从原始日志中提取出来
ip: 199.30.25.88
time: 10/Nov/2016:00:01:03 +0800
traffic: 62
文章: article/11325
视频: video/3235
(2)第二阶段:根据提取出来的信息做精细化操作
ip--->城市 city(IP)
date--> time:2016-11-10 00:01:03
day: 10
traffic:62
type:article/video
id:11325
(3)hive数据库表结构:
create table data( ip string, time string , day string, traffic bigint,
type string, id string )
直接放源码:
package test;
import java.io.IOException;
import java.lang.String;
import java.util.*;
import java.text.SimpleDateFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
public class test3{
public static final SimpleDateFormat FORMAT = new SimpleDateFormat("d/MMM/yyyy:HH:mm:ss", Locale.ENGLISH); //原时间格式
public static final SimpleDateFormat dateformat1 = new SimpleDateFormat("yyyy-MM-dd-HH:mm:ss");//现时间格式
private static Date parseDateFormat(String string) { //转换时间格式
Date parse = null;
try {
parse = FORMAT.parse(string);
} catch (Exception e) {
e.printStackTrace();
}
return parse;
}
public static String[] parse(String line) {
String ip = parseIP(line); //ip
String time = parseTime(line); //时间
String day=parseDay(line);//天数
String type = parseType(line); //视频video或文章article
String id = parseId(line); //视频或者文章的id
String traffic = parseTraffic(line);//流量
return new String[] { ip, time,day,traffic,type,id};
}
private static String parseIP(String line) { //ip
String ip = line.split(",")[0].trim();//str.trim(); 去掉首尾空格
return ip;
}
private static String parseTime(String line) { //时间
final int first = line.indexOf(",");
final int last = line.indexOf(" +0800,");
String time = line.substring(first + 1, last).trim();
Date date = parseDateFormat(time);
return dateformat1.format(date);
}
private static String parseDay(String line) { //天数
String day = line.split(",")[2].trim();
return day;
}
private static String parseTraffic(String line) { //流量,转为int型
String traffic= line.split(",")[3].trim();
return traffic;
}
private static String parseType(String line) {
String day = line.split(",")[4].replace(" ", "");
return day;
}
private static String parseId(String line) {
String day = line.split(",")[5].replace(" ", "");//去掉所有空格
return day;
}
public static class Map extends Mapper<Object, Text, Text, NullWritable> {
public static Text word = new Text();
public void map(Object key, Text value, Context context)throws IOException, InterruptedException {
// 将输入的纯文本文件的数据转化成String
String line = value.toString();
String arr[] = parse(line);
word.set(arr[0]+"\t"+arr[1]+"\t"+arr[2]+"\t"+arr[3]+"\t"+arr[4]+"\t"+arr[5]+"\t");//一定用'\t',空格容易乱会有意想不到的问题
context.write(word,NullWritable.get());
}
}
public static class Reduce extends Reducer<Text, NullWritable, Text, NullWritable> {
// 实现reduce函数
public void reduce(Text key, Iterable<NullWritable> values,Context context) throws IOException, InterruptedException {
context.write(key, NullWritable.get());
}
}
public static void main(String[] args) throws Exception {
Configuration conf=new Configuration();
System.out.println("start");
Job job=Job.getInstance(conf);
job.setJarByClass(test3.class);
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);//设置map的输出格式
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
Path in = new Path("hdfs://localhost:8020/mapReduce/mymapreduce1/result.txt");
Path out = new Path("hdfs://localhost:8020/mapReduce/mymapreduce1/out");
FileInputFormat.addInputPath(job,in );
FileOutputFormat.setOutputPath(job,out);
boolean flag = job.waitForCompletion(true);
System.out.println(flag);
System.exit(flag? 0 : 1);
}
}
记得创建的是一个maven项目,然后在pom.xml里导入hadoop等的依赖
浙公网安备 33010602011771号