hadoop的c++版wordcount例子(streaming方式)

0.数据文件

我爱你
首都
北京
我爱你
北京
我爱你
伟大首都
北京
首都
我爱java
come
go



1.map
#include <iostream>
using namespace std;
// Mapper for Hadoop Streaming wordcount: reads records from stdin,
// one per line, and emits "<line>\t1" for each so the reducer can
// sum the counts per key after the shuffle/sort phase.
// Note: the whole line is used as the key (the sample data has one
// word per line); no per-word tokenization is performed.
void map(){
    std::string line;
    // Using getline as the loop condition also keeps the last line
    // when the input has no trailing newline, which the previous
    // !cin.eof() pattern silently dropped; it also terminates cleanly
    // if the stream enters a fail state.
    while (std::getline(std::cin, line)) {
        std::cout << line << "\t" << "1" << std::endl;
    }
}
// Entry point for the streaming mapper executable: Hadoop Streaming
// pipes each input split to this process's stdin and collects stdout
// as tab-separated (key, value) lines.
int main(int argc,char** argv){
    map();
}

2.reduce
#include <iostream>
#include <vector>
using namespace std;
// Hand-rolled string split (the C++ standard library offers no
// ready-made one).  `separator` is treated as a SET of delimiter
// characters; consecutive delimiters are collapsed, so no empty
// tokens appear between them.  An empty `src` yields {""}.
vector<string> split(const string& src, const string& separator)
{
    vector<string> parts;
    string::size_type pos = 0;
    while (true)
    {
        const string::size_type hit = src.find_first_of(separator, pos);
        if (hit == string::npos)
            break;                       // no more delimiters: tail handled below
        parts.push_back(src.substr(pos, hit - pos));
        // skip over the whole run of delimiters
        pos = src.find_first_not_of(separator, hit);
        if (pos == string::npos)
            return parts;                // string ended on delimiters: no tail token
    }
    parts.push_back(src.substr(pos));    // final token after the last delimiter
    return parts;
}

void reduce(){
    string last_word = ""; //用作reduce的sort后的分界
    string line;
    getline(cin,line);
    int word_num = 0;
    while(!cin.eof()){
        try{
            vector<string> all = split(line,"\t");
            string word = all[0];
            //first time last_word is ""
            if("" == last_word){
                last_word = word;
                word_num = 0;
            }
            //repeat word occurs
            if(word == last_word){
                word_num++;
            }
            //not equal current word,next word
            else{
                cout << last_word << "\t" << word_num <<endl;
                word_num = 1;
                last_word = word;
            }
            getline(cin,line);
        }
        catch(const exception& e){
            cerr << e.what() <<endl;
        }
    }
    cout << last_word << "\t" << word_num <<endl;
}

// Entry point for the streaming reducer executable: Hadoop Streaming
// feeds the sorted mapper output to stdin and writes this process's
// stdout to the job's output files.
int main(int argc,char** argv){
    reduce();
}

3.启动命令
hadoop fs -rmr /output-data

hadoop jar /home/machen/hadoop/hadoop-1.0.3/contrib/streaming/hadoop-streaming-1.0.3.jar -file /home/machen/hadoop/hadoop-1.0.3/WordCount/python/stream_map.out  -file /home/machen/hadoop/hadoop-1.0.3/WordCount/python/stream_reduce.out -mapper /home/machen/hadoop/hadoop-1.0.3/WordCount/python/stream_map.out -reducer /home/machen/hadoop/hadoop-1.0.3/WordCount/python/stream_reduce.out -input /input-data  -output /output-data

posted @ 2012-09-14 12:15  sharpstill  阅读(1312)  评论(0)    收藏  举报