【原】Hadoop读取HDFS 文件简单示例
本文以文本分类中统计TF、IDF为例,简单介绍Java读HDFS文件操作,其中,文件URL已配置到conf中。
1 protected void setup(Context context) throws IOException, InterruptedException {
2
3 Configuration conf = context.getConfiguration();
4 String TfUrl=conf.get("TfUrl"); //tf中间文件路径
5 String IdfUrl=conf.get("IdfUrl");//idf中间文件路径
6 System.out.println("TfUrl:"+TfUrl);
7 System.out.println("IdfUrl:"+IdfUrl);
8
9 /*取TF*/
10 tfMap = new HashMap<String,String>();
11 tfs = FileSystem.get(URI.create(TfUrl),conf);
12
13 InputStream tin= tfs.open(new Path(TfUrl));
14 BufferedReader tread = new BufferedReader(new InputStreamReader(tin));
15 String tline=null;
16 while((tline=tread.readLine())!=null){
17 //System.out.println("result:"+line.trim());
18 tfMap.put(tline.split("\t")[0], tline.split("\t")[1]);
19 }
20 // System.out.println(tfMap);
21 System.out.println("tfMap size:"+tfMap.size());
22 tread.close();
23 tin.close();
24 // tfs.close(); //fs.close会报错。。。注释掉就OK,yyyyyyyyyyyyy??????
25
26
27
28 /*取IDF*/
29 idfMap = new HashMap<String,String>();
30 dfs = FileSystem.get(URI.create(IdfUrl),conf);
31
32 InputStream din= dfs.open(new Path(IdfUrl));
33 BufferedReader dread = new BufferedReader(new InputStreamReader(din));
34 String dline=null;
35 while((dline=dread.readLine())!=null){
36 //System.out.println("result:"+line.trim());
37 idfMap.put(dline.split("\t")[0], dline.split("\t")[1]); //IDF无重复,直接放入map即可,hashmap查询O(1)
38 }
39 // System.out.println(idfMap);
40 System.out.println("idfMap size:"+idfMap.size());
41 dread.close();
42 din.close();
43 //dfs.close(); //fs.close会报错。。。注释掉就OK,yyyyyyyyyyyyy??????
44
45 }

浙公网安备 33010602011771号