声明:

 

按类别特征词选择算法声明
vector<pair<string,double> >LocalDFFeatureSelectionForPerclass(DICTIONARY& mymap,CONTINGENCY& contingencyTable,string classLabel);// Local DF method: ranks every dictionary term for the given class
        void DFFeatureSelection(vector<string> classLabels,DICTIONARY &mymap,CONTINGENCY& contingencyTable,int N,char *address);// Drives the local DF feature selection over all class labels

 

函数实现:

 

对词典中的每个词,统计其在某一个类别中出现的次数,并按词频从大到小排序
/************************************************************************/
/*  按类别的DF特征词选择法                                               */
/************************************************************************/
vector
<pair<string,double> > Preprocess::LocalDFFeatureSelectionForPerclass(DICTIONARY& mymap,CONTINGENCY& contingencyTable ,string classLabel)
{
    
//int finalKeyWordsCount=0;//计算共取了多少个关键词
    clock_t start,finish;
    
double totaltime;
    start
=clock();
    vector
<pair<string,double> >DFinfo;
    
for(map<string,vector<pair<int,int>>>::iterator it=mymap.begin();it!=mymap.end();++it)
    {
        
        pair
<string,string>compoundKey=make_pair(it->first,classLabel);
        
double classCount=(double)contingencyTable[compoundKey].first;
        DFinfo.push_back(make_pair(it
->first,classCount));
        
    }

    stable_sort(DFinfo.begin(),DFinfo.end(),isLarger);
    finish
=clock();
    totaltime
=(double)(finish-start)/CLOCKS_PER_SEC;
    cout
<<"为类别"<<classLabel<<"遴选特征词共用了"<<totaltime<<endl;

    
return DFinfo;


}

 

 

DF特征词选择法:

 

代码
/************************************************************************/
/* DF特征词选择法                                                                     */
/************************************************************************/
void Preprocess:: DFFeatureSelection(vector<string >classLabels,DICTIONARY &mymap,CONTINGENCY& contingencyTable,int N,char *address)
{
    clock_t start,finish;
    
double totaltime;
    
int totalTraingingCorpus=endIndex-beginIndex+1;//训练语料库总共的文章数目
    set<string>finalKeywords;//存放最终遴选出的特征词
    vector<pair<string,double>>DFInfo;
    start
=clock();
    
for(vector<string>::iterator it=classLabels.begin();it!=classLabels.end();it++)
    {
        
//训练语料库中某个类别的文章数目
        int N_subClassCnt=getCategorizationNum(*it,"TrainingCorpus");
        
//threshold决定每个类别遴选多少个特征词
        int threshold=N_subClassCnt*N/totalTraingingCorpus;
        DFInfo
=LocalDFFeatureSelectionForPerclass(mymap,contingencyTable,*it);
        
for(vector<pair<string,double> >::size_type j=0;j<threshold;j++)
        {
            finalKeywords.insert(DFInfo[j].first);

        }
        DFInfo.clear();




    }


    ofstream outfile(address);
    
int finalKeyWordsCount=finalKeywords.size();
    
for (set<string>::iterator it=finalKeywords.begin();it!=finalKeywords.end();it++)
    {
        outfile
<<*it<<endl;

    }
    outfile.close();
    cout
<<"最后共选择特征词"<<finalKeyWordsCount<<endl;
    finish
=clock();
    totaltime
=(double)(finish-start)/CLOCKS_PER_SEC;
    cout
<<"遴选特征词共有了"<<totaltime<<endl;

}

 主函数调用:

 

代码
// Call-site fragment: `p`, `mymap`, `contingenyTable` and `labels` are
// declared outside this excerpt.
// Presumably fills `mymap` from the dictionary file — confirm against LoadDictionary.
p.LoadDictionary(mymap,"F:\\finallyliuyu\\dict.dat");
    // Presumably fills `contingenyTable` from the contingency file — confirm against LoadContingencyTable.
    p.LoadContingencyTable(contingenyTable,
"F:\\finallyliuyu\\contingency.dat");
    // Run DF feature selection with a budget of N=2000 and write the
    // selected terms to keywords.dat.
    p.DFFeatureSelection(labels,mymap,contingenyTable,
2000,"F:\\finallyliuyu\\keywords.dat");

 

 

 

posted on 2010-10-04 16:28  finallyly  阅读(3486)  评论(0)  编辑  收藏  举报