目的:从数据库中抽取文章关键词,并统计这些关键词在哪些文章中出现、各出现多少次(算是词袋子模型吧),然后对每篇文章形成VSM模型,写成weka的数据格式,最后调用weka对文章聚类。

目前“形成词袋子模型”这一块已经完毕。

其中词袋子的数据结构如下:

map<string,vector<pair<int,int>>> mymap(键为关键词,值为若干(文章编号, 出现次数)对),

目前已经完成此部分的 serialize(save/load)以及 print 功能

#include "stdafx.h"
#include<iostream>
#include<map>
#include<vector>
#include<string>
#include<iomanip>
#include<fstream>
//#include<boost/tokenizer.hpp>
using namespace std;

 

形成词袋子模型
nt ConstructMap(map<string,vector<pair<int,int>>>&mymap)
{
    
    vector
<string> mySplit(string s);
    CoInitialize(NULL);
    _ConnectionPtr pConn(__uuidof(Connection));
    _RecordsetPtr pRst(__uuidof(Recordset));
    pConn
->ConnectionString="Provider=SQLOLEDB.1;Password=xxx;Persist Security Info=True; User ID=sa;Initial Catalog=ArticleCollection";
    pConn
->Open("","","",adConnectUnspecified);
    pRst
=pConn->Execute("select CKeyWord,ArticleId from Article order by ArticleId",NULL,adCmdText);
    
while(!pRst->rsEOF)
    {    vector
<string>wordcollection;
        
string keywordstr=(_bstr_t)pRst->GetCollect("CKeyWord");
        
if(keywordstr!="")
        {
                wordcollection
=mySplit(keywordstr);
                
string tempid=(_bstr_t)pRst->GetCollect("ArticleId");
                
int articleid=atoi(tempid.c_str());
                
for(vector<string>::iterator strit=wordcollection.begin();strit!=wordcollection.end();strit++)
                {
                    vector
<pair<int,int>>::iterator it;
                    
if(mymap[*strit].empty())
                    {
                        pair
<int,int>mytemppair=make_pair(articleid,1);
                        mymap[
*strit].push_back(mytemppair);

                    }
                    
else
                    {
                        
for(it=mymap[*strit].begin();it!=mymap[*strit].end();it++)
                        {  
                            
if(it->first==articleid)
                            {
                                it
->second=++(it->second);
                                
break;
                            }
                    
                        }
                        
if(it==mymap[*strit].end())
                        {
                            pair
<int,int>mytemppair=make_pair(articleid,1);
                            mymap[
*strit].push_back(mytemppair);
                        }

                    }

            }
            

        }
        
        
        pRst
->MoveNext();
        wordcollection.clear();
    }
    pRst
->Close();
    pConn
->Close();
    pRst.Release();
    pConn.Release();
    CoUninitialize();
    
return 0;

}

 

 

加载词袋子模型
/// Loads the bag-of-words map from the text file `c:\mydict.dat`.
///
/// Expected layout (whitespace-separated tokens):
///   <entry count>
///   <key> <pair count> { <article id> <count> <separator token> } ...
///
/// The leading entry count is read but does not bound the loop; records are
/// read until the data ends. Each record read replaces any existing entry for
/// the same key; other entries already in `mymap` are left alone.
///
/// If the file cannot be opened, a message is written to stderr and `mymap`
/// is left unchanged. Reading stops at the first truncated or malformed
/// record; that incomplete record is discarded and earlier records are kept.
///
/// @param mymap  Map to fill: keyword -> list of (article id, count) pairs.
void load(std::map<std::string, std::vector<std::pair<int, int>>>& mymap)
{
    std::ifstream infile("c:\\mydict.dat", std::ios::binary);
    if (!infile)
    {
        // Without this check a failed open never reaches EOF.
        std::cerr << "load: cannot open c:\\mydict.dat" << '\n';
        return;
    }

    int lenMyMap = 0;  // number of entries, as recorded in the file header
    if (!(infile >> lenMyMap))
    {
        return;
    }

    std::string key;    // current keyword
    int lenVector = 0;  // number of (article id, count) pairs for that keyword

    // Test the extraction itself rather than eof(): eof() only turns true
    // after a read has already failed, which would replay the last record
    // with stale values.
    while (infile >> key >> lenVector)
    {
        std::vector<std::pair<int, int>> postings;
        bool complete = true;

        for (int i = 0; i < lenVector; ++i)
        {
            int articleId = 0;      // article number
            int count = 0;          // occurrences within that article
            std::string semicolon;  // separator token following each pair

            if (!(infile >> articleId >> count >> semicolon))
            {
                complete = false;
                break;
            }
            postings.push_back(std::make_pair(articleId, count));
        }

        if (!complete)
        {
            break;
        }
        mymap[key].swap(postings);
    }
}

 

保存词袋子模型
/// Writes the bag-of-words map to `c:\mydict.dat` as text.
///
/// Layout written:
///   <entry count>\n
///   for each entry, in map order:
///     <key>\n
///     <pair count>\n
///     "<article id> <count> ; " repeated once per pair, then \n
///
/// Tokens are whitespace-separated, so a key containing whitespace would not
/// read back as a single token with `>>`.
///
/// If the file cannot be opened, a message is written to stderr and nothing
/// is saved.
///
/// @param mymap  Map to write: keyword -> list of (article id, count) pairs.
///               It is not modified.
void save(std::map<std::string, std::vector<std::pair<int, int>>>& mymap)
{
    std::ofstream outfile("c:\\mydict.dat", std::ios::binary);
    if (!outfile)
    {
        std::cerr << "save: cannot open c:\\mydict.dat" << '\n';
        return;
    }

    // '\n' rather than endl: no flush per line; the stream flushes on close.
    outfile << mymap.size() << '\n';

    for (const auto& entry : mymap)
    {
        outfile << entry.first << '\n';
        outfile << entry.second.size() << '\n';

        for (const auto& posting : entry.second)
        {
            outfile << posting.first << " " << posting.second << " " << ";" << " ";
        }
        outfile << '\n';
    }

    outfile.close();
}
打印词袋子模型
/// Dumps the bag-of-words map to standard output.
///
/// Prints the number of entries first. Then, for each entry in map order,
/// prints the key, the number of pairs, and a line of "<article id>,<count>;"
/// items. Every line is terminated with `endl`.
///
/// @param mymap  Map to print: keyword -> list of (article id, count) pairs.
void print(std::map<std::string, std::vector<std::pair<int, int>>>& mymap)
{
    std::cout << mymap.size() << std::endl;

    for (const auto& entry : mymap)
    {
        const std::vector<std::pair<int, int>>& postings = entry.second;

        std::cout << entry.first << std::endl;
        std::cout << postings.size() << std::endl;

        for (const auto& posting : postings)
        {
            std::cout << posting.first << ',' << posting.second << ";";
        }
        std::cout << std::endl;
    }
}

 

 

 

posted on 2010-08-25 16:47  finallyly  阅读(1057)  评论(7编辑  收藏  举报