Inserting an HDFS file into an HBase table with Hadoop (original by xjl456852)
In this example, a text file on HDFS is parsed and its records are inserted into an HBase table.
Versions used in this example: Hadoop 2.7.2 and HBase 1.2.2.
The HBase table is created as follows:
create 'ns2:user', 'info'
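Note that the table lives in the ns2 namespace. If that namespace does not exist yet, create it first in the HBase shell:
hbase(main):001:0> create_namespace 'ns2'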
The text file on HDFS looks like this [data/hbase_input/hbase.txt]:
1,xiejl,20
2,haha,30
3,liudehua,40
4,daoming,41
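If the file is not on HDFS yet, it can be uploaded with commands like the following (the paths are assumptions that match the input path used when the job is submitted below):
hdfs dfs -mkdir -p data/hbase_input
hdfs dfs -put hbase.txt data/hbase_input/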
You can check which jars Hadoop's classpath currently contains with:
[hadoop@master ~]$ hdfs classpath
The main class:
package com.xjl456852.mapreduce;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import java.io.IOException;

/**
 * Writes a text file from HDFS into an HBase table.
 * To run this program, the Hadoop and HBase configuration files must be packaged into the jar.
 * The corresponding HBase table:
 * create 'ns2:user','info'
 *
 * Created by xiejl on 2016/8/10.
 */
public class HBaseApp {
    public static void main(String[] args) {
        try {
            Job job = Job.getInstance();
            job.setJobName("text into hbase table");
            job.setJarByClass(HBaseApp.class);
            FileInputFormat.addInputPath(job, new Path(args[0]));
            // set the target table name
            job.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE, args[1]);
            // use TableOutputFormat so the job writes to HBase
            job.setOutputFormatClass(TableOutputFormat.class);
            // the reduce output key type is ImmutableBytesWritable
            job.setOutputKeyClass(ImmutableBytesWritable.class);
            // the reduce output value type is Put
            job.setOutputValueClass(Put.class);
            // the map output key type differs from the reduce output key type, so set it explicitly to Text
            job.setMapOutputKeyClass(Text.class);
            // the map output value type differs from the reduce output value type, so set it explicitly to Text
            job.setMapOutputValueClass(Text.class);
            // Mapper
            job.setMapperClass(MyMapper.class);
            // Reducer
            job.setReducerClass(MyReducer.class);
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        } catch (InterruptedException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        }
    }
}
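As a side note, instead of packaging hbase-site.xml into the jar, the driver could load the HBase configuration explicitly. This is a minimal sketch, not the setup used in this article; HBaseConfiguration.create() merges hbase-site.xml found on the classpath into the returned Configuration:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;

// merge hbase-site.xml from the classpath into the job configuration
Configuration conf = HBaseConfiguration.create();
Job job = Job.getInstance(conf, "text into hbase table");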
The Mapper class:
package com.xjl456852.mapreduce;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Created by xiejl on 2016/8/10.
 */
public class MyMapper extends Mapper<LongWritable, Text, Text, Text> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        int index = line.indexOf(",");
        String rowKey = line.substring(0, index);
        // skip past the comma
        String valueLine = line.substring(index + 1);
        context.write(new Text(rowKey), new Text(valueLine));
    }
}
The Reducer class:
package com.xjl456852.mapreduce;

import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Created by xiejl on 2016/8/11.
 */
public class MyReducer extends Reducer<Text, Text, ImmutableBytesWritable, Put> {
    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        byte[] rowKey = Bytes.toBytes(key.toString());
        for (Text text : values) {
            // the Put is keyed by the row key
            Put put = new Put(rowKey);
            String line = text.toString();
            int index = line.indexOf(",");
            String name = line.substring(0, index);
            String age = line.substring(index + 1);
            // the column family is fixed at table-creation time; qualifiers and values are supplied per insert
            put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("name"), Bytes.toBytes(name));
            put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("age"), Bytes.toBytes(age));
            context.write(new ImmutableBytesWritable(rowKey), put);
        }
    }
}
Package the program as a jar; the jar must include the Hadoop configuration files and the HBase configuration files. (Some people build a fat jar instead, placing the dependency jars — the four libraries hbase-client, hbase-server, hbase-common, and hbase-protocol — into the jar's lib directory. In my tests this did not work: the map and reduce tasks both run to 100% and then hang; after about ten minutes you get FAILED AttemptID:attempt_xxx Timed out after 600 secs, the MapReduce task is re-executed, hangs again, and you have to kill the MapReduce job to stop it.)
The cluster configuration also needs to be changed so that Hadoop can find the HBase libraries when the job writes to the HBase table.
Add the HBase jars to Hadoop's classpath by editing ${HADOOP_HOME}/etc/hadoop/hadoop-env.sh. Once edited, distribute the file to every node; this change does not require restarting the cluster.
TEMP=`ls /opt/modules/hbase/lib/*.jar`
HBASE_JARS=`echo $TEMP | sed 's/ /:/g'`
HADOOP_CLASSPATH=$HBASE_JARS
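To verify that the HBase jars are now on the classpath, you can inspect it again (the grep filter is just an illustration):
[hadoop@master ~]$ hdfs classpath | tr ':' '\n' | grep hbase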
If the program still fails at this point, see my other article for details: "hadoop执行hbase插入表操作,出错:Stack trace: ExitCodeException exitCode=1" (original by xjl456852).
You also need to add the libraries that MapReduce needs at runtime to ${HADOOP_HOME}/etc/hadoop/yarn-site.xml by setting yarn.application.classpath.
So I added the following property to yarn-site.xml, including HBase's lib directory. Once edited, distribute the file to every node; this change does require restarting the cluster:
<property>
    <name>yarn.application.classpath</name>
    <value>/opt/modules/hadoop/etc/*,/opt/modules/hadoop/etc/hadoop/*,/opt/modules/hadoop/lib/*,/opt/modules/hadoop/share/hadoop/common/*,/opt/modules/hadoop/share/hadoop/common/lib/*,/opt/modules/hadoop/share/hadoop/mapreduce/*,/opt/modules/hadoop/share/hadoop/mapreduce/lib/*,/opt/modules/hadoop/share/hadoop/hdfs/*,/opt/modules/hadoop/share/hadoop/hdfs/lib/*,/opt/modules/hadoop/share/hadoop/yarn/*,/opt/modules/hadoop/share/hadoop/yarn/lib/*,/opt/modules/hbase/lib/*</value>
</property>
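After distributing yarn-site.xml, restart YARN so the ResourceManager and NodeManagers pick up the new classpath (assuming the standard Hadoop 2.x sbin scripts):
${HADOOP_HOME}/sbin/stop-yarn.sh
${HADOOP_HOME}/sbin/start-yarn.sh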
Then run:
hadoop jar hbase.jar com.xjl456852.mapreduce.HBaseApp data/hbase_input ns2:user
Check the table contents in HBase:
hbase(main):001:0> scan 'ns2:user'
ROW    COLUMN+CELL
 1     column=info:age, timestamp=1470966325326, value=20
 1     column=info:name, timestamp=1470966325326, value=xiejl
 2     column=info:age, timestamp=1470966325326, value=30
 2     column=info:name, timestamp=1470966325326, value=haha
 3     column=info:age, timestamp=1470966325326, value=40
 3     column=info:name, timestamp=1470966325326, value=liudehua
 4     column=info:age, timestamp=1470966325326, value=41
 4     column=info:name, timestamp=1470966325326, value=daoming
4 row(s) in 0.3100 seconds
You can see that the data has been inserted into the HBase table.
The Reducer can also be written by extending TableReducer; the code below produces the same result:
package com.xjl456852.mapreduce;

import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.Text;

import java.io.IOException;

/**
 * When extending TableReducer, the source shows that the output value type is Mutation,
 * so the emitted value can be a Put, a Delete, and the like.
 * Created by xiejl on 2016/8/11.
 */
public class MyReducer2 extends TableReducer<Text, Text, ImmutableBytesWritable> {
    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        byte[] rowKey = Bytes.toBytes(key.toString());
        for (Text text : values) {
            // the Put is keyed by the row key
            Put put = new Put(rowKey);
            String line = text.toString();
            int index = line.indexOf(",");
            String name = line.substring(0, index);
            String age = line.substring(index + 1);
            // the column family is fixed at table-creation time; qualifiers and values are supplied per insert
            put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("name"), Bytes.toBytes(name));
            put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("age"), Bytes.toBytes(age));
            context.write(new ImmutableBytesWritable(rowKey), put);
        }
        context.getCounter("reduce", "over").increment(1);
    }
}
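With the TableReducer variant, the driver can also delegate the output wiring to TableMapReduceUtil instead of setting TableOutputFormat and the table name by hand. A sketch, not the driver used above, assuming the same arguments (args[1] is the table name, e.g. ns2:user); initTableReducerJob sets the output format, the output table, and the output key/value classes, and also ships the HBase dependency jars with the job:

import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;

// replaces the manual TableOutputFormat / OUTPUT_TABLE / output-class setup in the driver
TableMapReduceUtil.initTableReducerJob(args[1], MyReducer2.class, job);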
