菜鸟学习C++练笔之整理搜狗2008版语料库--获取分类语料库

注：此篇博客无法提供源代码下载，读者可自行从博文上拷贝代码。

作者：finallyliuyu(博文转载请注明作者和出处)

注：有网友提出来说直接把抽取之后的搜狗2008版新闻语料库发布，根据搜狗实验室的数据使用许可说明(http://www.sogou.com/labs/dl/license.html):
有如下条款

三. 受益者的义务
禁止将搜狗实验室数据提供给第三方使用，第三方如有需要可直接访问搜狗实验室主页下载或提出介质(硬盘、光盘)拷贝申请。

所以如果我处理语料库并且上传的话，会涉及到侵犯搜狗实验室知识产权的问题。有需要的网友请先去搜狗实验室下载数据，然后拷贝我的程序进行数据处理。

搜狗2008版分类语料库（http://www.sogou.com/labs/dl/cs.html）和2006版相比内容丰富了很多。这一版语料库中有标题，正文，还有类别。而2006版里面只有正文和类别。但是这一版资料的格式变化很大。资料格式为：

格式说明：
数据格式：
<doc>
<url>页面URL</url>
<docno>页面ID</docno>
<contenttitle>页面标题</Contenttitle>
<content>页面内容</content>
</doc>
注意：content字段去除了HTML标签，保存的是新闻正文文本

下图是搜狗开源语料库中的一个文件的部分内容的截图：

本博客提供的程序的目的在于将搜狗实验室开放的文本资料转存到数据库中。

数据库字段格式设置如下图所示：

关于宽窄字符串转换部分请见《从C++遍历路径中含有汉字的文件夹看宽窄字符转换的问题》

先给出处理结果，然后再给出程序代码：

头文件：

#include "stdafx.h"
#include<iostream>
#include <cwchar>
//#include<cstdio>
#include<map>
#include<set>
#include<vector>
#include<string>
#include<iomanip>
#include<fstream>
#include<algorithm>
#include<cmath>
#include<sstream>
#include<limits>
#include <xstring>
#include"ictclas30.h"
#include"boost\tr1\regex.hpp"
#include"boost/algorithm/string.hpp"
#pragma comment(lib, "ICTCLAS30.lib")
using namespace std;
#include"windows.h"

用到的结构体：

// One news item extracted from the corpus.
struct ARTICLE
{
	std::string ArticleTitle;   // headline of the news item
	std::string ArticleText;    // body text of the news item
	std::string Categorization; // category of the item, represented by its URL
};

string wstring之间互相转换的函数：

/************************************************************************/
/*  Converts a narrow string (system ANSI code page) to a wide string.  */
/*  string -> wstring                                                   */
/*  Returns an empty wstring for empty input or if conversion fails.    */
/************************************************************************/
std::wstring myMultibyteToWideChar(std::string sResult)
{
	// MultiByteToWideChar rejects a zero-length source, so handle it up front.
	if (sResult.empty())
	{
		return std::wstring();
	}

	const int srcLen = static_cast<int>(sResult.size());

	// First call: ask how many wide characters are required. Because an
	// explicit source length is passed, the count excludes any terminator.
	const int wideLen = MultiByteToWideChar(CP_ACP, 0, sResult.c_str(), srcLen, NULL, 0);
	if (wideLen <= 0)
	{
		return std::wstring();
	}

	// A vector owns the scratch buffer so it is released even if an
	// exception is thrown while building the result.
	std::vector<wchar_t> buffer(wideLen);
	const int written = MultiByteToWideChar(CP_ACP, 0, sResult.c_str(), srcLen, &buffer[0], wideLen);
	if (written <= 0)
	{
		return std::wstring();
	}

	// Build from an explicit length so embedded NUL characters do not
	// truncate the result.
	return std::wstring(&buffer[0], written);
}
/************************************************************************/
/* Converts a wide string to a narrow string (system ANSI code page).   */
/* wstring -> string                                                    */
/* Returns an empty string if the conversion fails.                     */
/************************************************************************/
std::string myWideCharToMultibyte(std::wstring wsResult)
{
	// Source length -1 means "NUL-terminated": the returned count then
	// includes the terminating NUL.
	const int iLen = WideCharToMultiByte(CP_ACP, 0, wsResult.c_str(), -1, NULL, 0, NULL, NULL);
	if (iLen <= 0)
	{
		return std::string();
	}

	std::vector<char> buffer(iLen);

	// The conversion must use the same code page as the sizing call above;
	// otherwise the required size can differ and the call can fail or
	// produce different bytes. CP_ACP also matches myMultibyteToWideChar,
	// so a round trip returns the original bytes.
	const int written = WideCharToMultiByte(CP_ACP, 0, wsResult.c_str(), -1, &buffer[0], iLen, NULL, NULL);
	if (written <= 0)
	{
		return std::string();
	}

	// Drop the terminating NUL that was counted in 'written'.
	return std::string(&buffer[0], written - 1);
}

利用正则表达式提取新闻实体的三要素，标题，正文，URL的函数

/************************************************************************/
 /* 从文本文件中提取出文章的标题，正文和类别                                                                     */
 /************************************************************************/
 vector<ARTICLE> FindArticles(string rawtext)
 {   vector<ARTICLE>articleCollection;
	boost::wregex wregdoc(L"<doc>.*?</doc>",boost::regbase::icase);//获得doc标签内的内容
	boost::wregex wregurl(L"<url>(.*?)</url>",boost::regbase::icase);//获得url标签内的内容
	boost::wregex wregtitle(L"<contenttitle>(.*?)</contenttitle>",boost::regbase::icase);//获得标题
	boost::wregex wregcontent(L"<content>(.*?)</content>",boost::regbase::icase);//获得内容
	wstring temprawtext=myMultibyteToWideChar(rawtext);
	ARTICLE article;
	boost::wsmatch mDOC;
	boost::wsmatch mURL;
	boost::wsmatch mTitle;
	boost::wsmatch mContent;
	wstring::const_iterator  it=temprawtext.begin();
	wstring::const_iterator  end=temprawtext.end();
	while(boost::regex_search(it,end,mDOC,wregdoc))
	{
		wstring wdoc=mDOC[0];
		
		wstring wurl=L"";
		wstring wtitle=L"";
		wstring wcontent=L"";
		if(boost ::regex_search(wdoc,mURL,wregurl))
		{
			wurl=mURL[1];

		}
		if(boost::regex_search(wdoc,mTitle,wregtitle))
		{
			wtitle=mTitle[1];
		}
		if(boost::regex_search(wdoc,mContent,wregcontent))
		{
			wcontent=mContent[1];
		}
		if(wcontent!=L""&&wtitle!=L""&&wurl!=L"")
		{
			article.ArticleText=myWideCharToMultibyte(wcontent);
			article.ArticleTitle=myWideCharToMultibyte(wtitle);
			article.Categorization=myWideCharToMultibyte(wurl);
			articleCollection.push_back(article);
		}


		it=mDOC[0].second;
		
	}
	
	return articleCollection;
	
 }

将文本中的单引号转义为两个连续单引号的函数（如果文本中存在未转义的单引号，则数据库插入操作会失败）

/************************************************************************/
/* Escapes a string for use inside a single-quoted SQL string literal   */
/* by doubling every single quote (' becomes '').                       */
/* Returns the escaped copy; the argument itself is taken by value.     */
/************************************************************************/
std::string ProcessforMSSQL(std::string src)
{
	std::string::size_type pos = src.find('\'');
	while (pos != std::string::npos)
	{
		// Replace the single quote with two quotes. A string literal is
		// required here: a char literal can only hold one character.
		src.replace(pos, 1, "''");
		// Continue behind the pair just written; searching from 'pos'
		// again would find the same quote forever.
		pos = src.find('\'', pos + 2);
	}
	return src;
}

向数据库中插入信息的函数：

/************************************************************************/
/* Extracts the articles contained in rawtext (via FindArticles) and    */
/* inserts each of them into the SoGouCorpus table through ADO.         */
/* A connection is opened and closed on every call.                     */
/************************************************************************/
void InsertArticlesToDataBase(std::string rawtext)
{
	std::vector<ARTICLE> articleCollection = FindArticles(rawtext);
	CoInitialize(NULL);
	_ConnectionPtr pConn(__uuidof(Connection));
	pConn->ConnectionString = "Provider=SQLOLEDB.1;Password=xxxx;Persist Security Info=True; User ID=sa;Initial Catalog=NewsInfo";
	pConn->Open("", "", "", adConnectUnspecified);

	// The statement is built in a std::string so that articles of any
	// length fit; a fixed-size character buffer would overflow on long
	// article bodies.
	std::string sqlInsert;
	for (std::vector<ARTICLE>::iterator it = articleCollection.begin(); it != articleCollection.end(); ++it)
	{
		_variant_t RecordsAffected;

		// Double the single quotes so the values can be embedded in
		// single-quoted SQL literals.
		// NOTE(review): the statement is still assembled from text; a
		// parameterized ADO command would be more robust than escaping.
		const std::string url = ProcessforMSSQL(it->Categorization);
		const std::string title = ProcessforMSSQL(it->ArticleTitle);
		const std::string text = ProcessforMSSQL(it->ArticleText);

		sqlInsert.clear(); // reuse the buffer's capacity across iterations
		sqlInsert += "insert into SoGouCorpus(ArticleTitle,ArticleText,Categorization) values('";
		sqlInsert += title;
		sqlInsert += "','";
		sqlInsert += text;
		sqlInsert += "','";
		sqlInsert += url;
		sqlInsert += "')";

		pConn->Execute(sqlInsert.c_str(), &RecordsAffected, -1);
		std::cout << title << "添加完毕" << std::endl;
	}

	pConn->Close();
	// Drop the COM reference before COM itself is shut down.
	pConn.Release();
	CoUninitialize();
}

遍历当前目录下的所有文件，并且提取每个文件中的所有新闻实体信息，并且存入数据库中。

/************************************************************************/
/* 遍历文件夹                                                                     */
/************************************************************************/
 void FindFile(wchar_t *pFilePath)
 {
	 WIN32_FIND_DATA FindFileData;
	 HANDLE hFind = INVALID_HANDLE_VALUE;
	 wchar_t  DirSpec[MAX_PATH + 1];// 指定路径 
	 DWORD dwError;
	 wcsncpy (DirSpec, pFilePath, wcslen(pFilePath) + 1);
	 wcsncat (DirSpec, L"\\\*", 3);
	 hFind=FindFirstFile(DirSpec,&FindFileData);
	 if (hFind == INVALID_HANDLE_VALUE) {
		 wprintf(L"Invalid file handle. Error is %u ", GetLastError());
		 return ;
	 } 
	 bool bFinish=false;
	 while(!bFinish)
	 {
		 if (FindFileData.dwFileAttributes != FILE_ATTRIBUTE_DIRECTORY )
		 {

			 wchar_t temp[3000];
			 memset(temp,0,3000*sizeof(wchar_t));
			 //wprintf_s(temp,L"%S\\%S\n",pFilePath,FindFileData.cFileName);
			 wcscpy(temp,pFilePath);
			 wcscat(temp,L"\\");
			 wcscat(temp,FindFileData.cFileName);
			 string rawtext="";
			 string line;
			 ifstream infile;
			 infile.open(temp);
			 if(infile)
				{
					while(getline(infile,line))
					{
						rawtext+=line;
					}
					

				}

			 infile.clear();
			 infile.close();
			InsertArticlesToDataBase(rawtext);


		 } 
		 bFinish = (FindNextFile(hFind, &FindFileData) == false);

	 }

	 
	 
	
				
			 
			 
		

	 
 }

主函数：

// Program entry point: processes every corpus file in the hard-coded
// directory, then waits for console input before exiting.
int _tmain(int argc, _TCHAR* argv[])
{
	FindFile(L"E:\\新闻语料\\SogouCS.reduced");
	cout << "finish" << endl;

	// Block on a read from standard input so the console stays open
	// until the user has seen the output.
	int keepOpen;
	cin >> keepOpen;

	return 0;
}

posted on 2010-09-18 22:20 finallyly 阅读(7095) 评论(17) 收藏举报

刷新页面返回顶部

菜鸟学习C++练笔之整理搜狗2008版语料库--获取分类语料库

公告