最近自己写了个文章分析程序,用的是C# 2.0,数据库是MySQL 5.0,自己弄了两个MySQL的Helper类,其中涉及到线程和委托的东西,我找高手指点了一下,结果弄出来了。分析1万篇文章大概需要1个小时左右,想要这个程序或者想交流的可以联系我。

贴一下自己的代码:

 

代码
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Text;
using System.Windows.Forms;
using System.IO;

using System.Data.OleDb;

using WoWExpress.Core;
using MySql.Data.MySqlClient;
using System.Text.RegularExpressions;
using System.Threading;

using Rainsoft.WordSeg;
namespace CSVProject
{
    
public partial class Form1 : Form
    {
        
public Form1()
        {
            // All setup is delegated to InitializeComponent(), which is not defined in this
            // part of the partial class (presumably the designer-generated half — confirm).
            InitializeComponent();
        }

 

        
/// <summary>
        /// Loads every row of the <c>stopwords</c> table.
        /// </summary>
        /// <returns>
        /// The <see cref="DataSet"/> returned by <c>MySqlHelper.ExecuteDataset</c> for
        /// <c>select * from stopwords</c>.
        /// </returns>
        public DataSet GetStopwords()
        {
            // NOTE(review): server, schema and credentials are hard-coded in every query method of this form.
            string connectionString = WoWExpress.Core.MySqlHelper.GetConnectionString("localhost", "hwyd", "root", "8152");
            string query = "select * from stopwords";

            return WoWExpress.Core.MySqlHelper.ExecuteDataset(connectionString, CommandType.Text, query);
        }

        
/// <summary>
        /// Loads every row of the <c>ccl_addonarticle</c> table.
        /// </summary>
        /// <returns>
        /// The <see cref="DataSet"/> returned by <c>MySqlHelper.ExecuteDataset</c> for
        /// <c>select * from ccl_addonarticle</c>.
        /// </returns>
        public DataSet GetArticles()
        {
            // NOTE(review): server, schema and credentials are hard-coded in every query method of this form.
            string connectionString = WoWExpress.Core.MySqlHelper.GetConnectionString("localhost", "hwyd", "root", "8152");
            string query = "select * from ccl_addonarticle";

            DataSet articlesDataSet = WoWExpress.Core.MySqlHelper.ExecuteDataset(connectionString, CommandType.Text, query);
            return articlesDataSet;
        }


        
/*一篇文章一个对象,对象包括文章主题,文章id等,现在就只要两个参数
         文章对象放入ArrayList,这样可以循环操作文章
         1.对文章使用停用词表,把文章隔开,如何隔开?利用停用词表集合循环的把文章中的停用词给用标识替换(如[%stopword%])
         2.直接使用split(articleBody,[%stopword%])来分隔文章,留下的词就全部分入数组,数组循环判断,从第一个开始,相同就数量加1
         * 插入新的对象关键词对象,对象包括关键词id,关键词,关键词在本篇文章数量,关键词在本篇文章的百分比(这个需要在本篇文章循环
         * 完才可以计算的出),关键词在本数据库中的数量,关键词在本数据库中的百分比(这个需要在所有文章循环
         * 完才可以计算的出)
         * 全部循环完之后,需要的数据就是关键词对象,这个也相应的显示出来,并且存到数据库静态化,但是当数据不断增加的时候,每次
         * 就需要重新计算一次,得出当前最真实的结果,这样也会导致速度越来越慢,不过这是将来需要处理的。
         
*/

        
/// <summary>
        /// Fetches a batch of articles that have not been processed yet (<c>isDo = 0</c>).
        /// </summary>
        /// <param name="pageLength">Maximum number of rows to fetch, as integer text (for example "10").</param>
        /// <returns>One <c>ArticleInfo</c> per row, built from the <c>aid</c> and <c>body</c> columns.</returns>
        /// <exception cref="FormatException"><paramref name="pageLength"/> is not an integer.</exception>
        public List<ArticleInfo> GetMyArticles(string pageLength)
        {
            // The LIMIT value is spliced into the SQL text, so only a parsed integer may reach the statement.
            int limit = int.Parse(pageLength, System.Globalization.CultureInfo.InvariantCulture);

            List<ArticleInfo> articles = new List<ArticleInfo>();

            string connectionString = WoWExpress.Core.MySqlHelper.GetConnectionString("localhost", "hwyd", "root", "8152");
            string query = " select aid,body,isDo from ccl_addonarticle where isDo = 0 limit "
                + limit.ToString(System.Globalization.CultureInfo.InvariantCulture);

            using (MySqlDataReader reader = WoWExpress.Core.MySqlHelper.ExecuteReader(connectionString, CommandType.Text, query))
            {
                while (reader.Read())
                {
                    string body = reader.GetString(1);

                    // Pre-segmentation pass. NOTE(review): the segmented text is discarded here, as in the
                    // original code; the unsegmented body is what gets stored. Confirm whether the result
                    // was meant to replace the body.
                    this.segment(body);

                    articles.Add(new ArticleInfo(Convert.ToInt32(reader.GetString(0)), body));
                }
            }

            return articles;
        }
        
/// <summary>
        /// Counts the articles that have not been processed yet (<c>isDo = 0</c>).
        /// </summary>
        /// <returns>The value of <c>count(*)</c>, or 0 when the query yields no row.</returns>
        public int GetArticlesCount()
        {
            string connectionString = WoWExpress.Core.MySqlHelper.GetConnectionString("localhost", "hwyd", "root", "8152");
            string query = " select count(*) from ccl_addonarticle where isDo = 0";

            using (MySqlDataReader reader = WoWExpress.Core.MySqlHelper.ExecuteReader(connectionString, CommandType.Text, query))
            {
                // The query produces at most one row with a single column.
                if (reader.Read())
                {
                    return reader.GetInt32(0);
                }
            }

            return 0;
        }

 

        
/// <summary>
        /// Loads the stop-word table as a typed list.
        /// </summary>
        /// <returns>
        /// One <c>StopwordsInfo</c> per row of <c>stopwords</c>; column 0 is converted to an integer
        /// and column 1 is trimmed.
        /// </returns>
        public List<StopwordsInfo> GetMyStopwords()
        {
            List<StopwordsInfo> stopwords = new List<StopwordsInfo>();

            string connectionString = WoWExpress.Core.MySqlHelper.GetConnectionString("localhost", "hwyd", "root", "8152");
            string query = "select * from stopwords";

            using (MySqlDataReader reader = WoWExpress.Core.MySqlHelper.ExecuteReader(connectionString, CommandType.Text, query))
            {
                while (reader.Read())
                {
                    int id = Convert.ToInt32(reader.GetString(0));
                    string word = reader.GetString(1).Trim();
                    stopwords.Add(new StopwordsInfo(id, word));
                }
            }

            return stopwords;
        }

        
/// <summary>
        /// Cleans each article body and replaces every stop word in it with a single space.
        /// </summary>
        /// <remarks>
        /// The passed-in <c>ArticleInfo</c> objects are modified in place (their <c>ArticleBody</c> is
        /// overwritten); the returned list holds those same objects in the same order.
        /// </remarks>
        /// <param name="articlesInfo">Articles to clean.</param>
        /// <param name="stopwords">Stop words to blank out.</param>
        /// <returns>A new list containing the cleaned articles.</returns>
        public List<ArticleInfo> UseStopwords(List<ArticleInfo> articlesInfo, List<StopwordsInfo> stopwords)
        {
            // Extract the stop-word strings once. Null or empty entries are skipped because
            // String.Replace throws ArgumentException when asked to replace an empty string.
            List<string> stopTerms = new List<string>(stopwords.Count);
            foreach (StopwordsInfo entry in stopwords)
            {
                string term = Convert.ToString(entry.Stopwords);
                if (!string.IsNullOrEmpty(term))
                {
                    stopTerms.Add(term);
                }
            }

            List<ArticleInfo> targetArticles = new List<ArticleInfo>(articlesInfo.Count);

            foreach (ArticleInfo articleInfo in articlesInfo)
            {
                string body = articleInfo.ArticleBody.ToString();

                // Strip markup before applying the stop words.
                body = this.stripHtml(body);
                body = this.StripHTML3(body);

                foreach (string term in stopTerms)
                {
                    body = body.Replace(term, " ");
                }

                // Second pass: collapses the runs of spaces the replacements above leave behind.
                body = this.stripHtml(body);

                articleInfo.ArticleBody = body;
                targetArticles.Add(articleInfo);
            }

            return targetArticles;
        }

        
/// <summary>
        /// Splits each article body on spaces, counts how often every token occurs within that
        /// article, and stores the result through <c>UpdateArticleAndInsertKeywords</c>.
        /// </summary>
        /// <remarks>
        /// Progress is reported through <c>OnRrogressBar2Set</c> / <c>OnRrogressBarAdd2</c>, which
        /// marshal to the UI thread when required.
        /// </remarks>
        /// <param name="articlesInfo">Articles whose bodies are already cleaned and space-separated.</param>
        /// <returns>
        /// One <c>SingleKeywords</c> per distinct (article, token) pair, in order of first appearance.
        /// </returns>
        public List<SingleKeywords> SplitArticle(List<ArticleInfo> articlesInfo)
        {
            List<SingleKeywords> singleKeywordsArray = new List<SingleKeywords>();

            OnRrogressBar2Set(articlesInfo.Count);

            foreach (ArticleInfo articleInfo in articlesInfo)
            {
                int articleId = articleInfo.ArticleId;
                string[] tokens = articleInfo.ArticleBody.ToString().Trim().Split(' ');

                // Per-article lookup replaces the previous linear scan over everything collected so far.
                Dictionary<string, SingleKeywords> seen = new Dictionary<string, SingleKeywords>();

                // Every token is visited, including the last one, which the previous
                // "i < length - 1" bound skipped. Split always yields at least one token, so even an
                // empty body produces one entry; the database step relies on that entry to flag the
                // article as processed.
                foreach (string rawToken in tokens)
                {
                    string keyword = rawToken.Trim();

                    SingleKeywords counted;
                    if (seen.TryGetValue(keyword, out counted))
                    {
                        counted.SingleCount += 1;
                    }
                    else
                    {
                        // First occurrence: count 1, percentage 0 (percentage calculation is still TODO).
                        // NOTE(review): the pasted source read "10" here, which looks like "1, 0" with the
                        // comma lost in publishing — confirm against the SingleKeywords constructor.
                        counted = new SingleKeywords(articleId, keyword, 1, 0);
                        seen.Add(keyword, counted);
                        singleKeywordsArray.Add(counted);
                    }
                }

                // One step of the per-batch progress bar per article.
                OnRrogressBarAdd2(progressBar2.Step);
            }

            // Persisting is part of the split step: flag the articles and store their keywords.
            this.UpdateArticleAndInsertKeywords(singleKeywordsArray);

            return singleKeywordsArray;
        }

        
/// <summary>
        /// Flags every article referenced by <paramref name="singleKeywordsArray"/> as processed
        /// (<c>isDo = 1</c>) and inserts one <c>articlekeywords</c> row per entry.
        /// </summary>
        /// <remarks>
        /// Statements are issued one by one without a transaction, so a failure part-way through
        /// leaves earlier rows in place.
        /// </remarks>
        /// <param name="singleKeywordsArray">Per-article keyword counts to store.</param>
        /// <returns>The literal string <c>"ok"</c>.</returns>
        public string UpdateArticleAndInsertKeywords(List<SingleKeywords> singleKeywordsArray)
        {
            // The connection must declare the character set; the original author found that inserts
            // only work correctly with it.
            string charset = "utf8";
            string connectionString = WoWExpress.Core.MySqlHelper.GetConnectionString("localhost", "hwyd", "root", "8152", charset);

            string updateSql = " Update ccl_addonarticle set isDo =1 where aid = ?aid";
            string insertSql = "Insert articlekeywords(articleId,keywords,singleCount,singlePercent) values(?articleId,?KeywordsStr,?singleCount,?singlePercent)";

            // Articles already flagged in this call; avoids repeating the same UPDATE for every keyword.
            Dictionary<int, bool> flaggedArticles = new Dictionary<int, bool>();

            foreach (SingleKeywords entry in singleKeywordsArray)
            {
                int articleId = entry.ArticleId;

                if (!flaggedArticles.ContainsKey(articleId))
                {
                    MySqlParameter aidParm = new MySqlParameter("?aid", MySqlDbType.Int32, 4);
                    aidParm.Value = articleId;
                    WoWExpress.Core.MySqlHelper.ExecuteNonQuery(connectionString, CommandType.Text, updateSql, new MySqlParameter[] { aidParm });
                    flaggedArticles.Add(articleId, true);
                }

                MySqlParameter[] keywordsParms = new MySqlParameter[] {
                    new MySqlParameter("?articleId", MySqlDbType.Int32, 4),
                    new MySqlParameter("?KeywordsStr", MySqlDbType.VarChar),
                    new MySqlParameter("?singleCount", MySqlDbType.Int32, 4),
                    new MySqlParameter("?singlePercent", MySqlDbType.Double, 4)
                };

                keywordsParms[0].Value = articleId;
                // Keywords are stored in Simplified Chinese.
                keywordsParms[1].Value = Traditional2Simplified(entry.KeywordsStr);
                keywordsParms[2].Value = entry.SingleCount;
                keywordsParms[3].Value = entry.SinglePercent;

                WoWExpress.Core.MySqlHelper.ExecuteNonQuery(connectionString, CommandType.Text, insertSql, keywordsParms);
            }

            return "ok";
        }

        
/// <summary>
        /// Re-decodes a string whose characters each hold one raw byte: every character is cast
        /// to a byte and the resulting bytes are decoded with <c>Encoding.Default</c>.
        /// </summary>
        /// <param name="dbStr">Text whose characters are to be treated as single bytes.</param>
        /// <returns>The decoded string.</returns>
        private string DBStringToNormal(string dbStr)
        {
            int length = dbStr.Length;
            byte[] raw = new byte[length];

            int position = 0;
            foreach (char ch in dbStr)
            {
                // The cast keeps only the low byte of each character.
                raw[position] = (byte)ch;
                position++;
            }

            return System.Text.Encoding.Default.GetString(raw, 0, length);
        }

        
/// <summary>
        /// Converts Traditional Chinese text to Simplified Chinese through the Visual Basic
        /// runtime's <c>StrConv</c>.
        /// </summary>
        /// <param name="str">Text to convert.</param>
        /// <returns>The converted text.</returns>
        public string Traditional2Simplified(string str)
        {
            // 0 is passed as the LocaleID argument.
            int localeId = 0;
            string simplified = Microsoft.VisualBasic.Strings.StrConv(
                str,
                Microsoft.VisualBasic.VbStrConv.SimplifiedChinese,
                localeId);

            return simplified;
        }
        
/// <summary>
        /// Extracts the plain text from an HTML fragment: removes script blocks and tags, decodes a
        /// handful of character entities, and finally drops any remaining angle brackets and CR/LF pairs.
        /// </summary>
        /// <param name="strHtml">HTML source.</param>
        /// <returns>The text with markup removed.</returns>
        public string StripHTML2(string strHtml)
        {
            // Patterns and replacements are applied pairwise, in order, case-insensitively.
            string[] aryReg = {
                @"<script[^>]*?>.*?</script>",
                // Any opening or closing tag, including quoted attribute values. The published listing
                // had this pattern mangled into a URL; this is the intended expression.
                @"<(\/\s*)?!?((\w+:)?\w+)(\w+(\s*=?\s*(([""'])(\\[""'tbnr]|[^\7])*?\7|\w+)|.{0})|\s)*?(\/\s*)?>",
                @"([\r\n])[\s]+",
                @"&(quot|#34);",
                @"&(amp|#38);",
                @"&(lt|#60);",
                @"&(gt|#62);",
                @"&(nbsp|#160);",
                @"&(iexcl|#161);",
                @"&(cent|#162);",
                @"&(pound|#163);",
                @"&(copy|#169);",
                @"&#(\d+);",
                @"-->",
                @"<!--.*\n"
            };
            string[] aryRep = {
                "",
                "",
                "",
                "\"",
                "&",
                "<",
                ">",
                "   ",
                "\xa1", // chr(161)
                "\xa2", // chr(162)
                "\xa3", // chr(163)
                "\xa9", // chr(169)
                "",
                "\r\n",
                ""
            };

            string strOutput = strHtml;
            for (int i = 0; i < aryReg.Length; i++)
            {
                Regex regex = new Regex(aryReg[i], RegexOptions.IgnoreCase);
                strOutput = regex.Replace(strOutput, aryRep[i]);
            }

            // Strings are immutable: the results must be assigned back, otherwise these calls do nothing.
            strOutput = strOutput.Replace("<", "");
            strOutput = strOutput.Replace(">", "");
            strOutput = strOutput.Replace("\r\n", "");

            return strOutput;
        }

        
/// <summary>
        /// Removes every occurrence of the literal text <c>[^A-Za-z0-9\u4E00-\u9FBB]</c> from the input.
        /// </summary>
        /// <remarks>
        /// NOTE(review): the argument looks like a regular expression meant to drop everything except
        /// ASCII letters, digits and CJK characters, but <c>String.Replace</c> treats it as plain text,
        /// so ordinary input comes back unchanged. That behavior is kept on purpose: switching to
        /// <c>Regex.Replace</c> would also delete the spaces the keyword splitter depends on.
        /// </remarks>
        /// <param name="strHtml">Text to filter.</param>
        /// <returns>The input with the literal pattern text removed.</returns>
        public string StripHTML3(string strHtml)
        {
            return strHtml.Replace(@"[^A-Za-z0-9\u4E00-\u9FBB]", "");
        }
        
/// <summary>
        /// Aggregates per-article keyword counts into totals per keyword across all articles.
        /// </summary>
        /// <remarks>
        /// Keywords are matched on their trimmed text. The percentage field is left at 0; its
        /// calculation is still TODO.
        /// </remarks>
        /// <param name="singleKeywords">Per-article keyword counts, as produced by <c>SplitArticle</c>.</param>
        /// <returns>
        /// One <c>AllKeywords</c> per distinct keyword, in order of first appearance; an empty list
        /// when there is no input.
        /// </returns>
        public List<AllKeywords> ComputeKeywords(List<SingleKeywords> singleKeywords)
        {
            List<AllKeywords> allKeywordsArray = new List<AllKeywords>();

            // Previously an empty list failed on singleKeywords[0].
            if (singleKeywords == null || singleKeywords.Count == 0)
            {
                return allKeywordsArray;
            }

            Dictionary<string, AllKeywords> totals = new Dictionary<string, AllKeywords>();

            // Every entry is visited, including the last one, which the previous
            // "i < count - 1" bound skipped.
            foreach (SingleKeywords entry in singleKeywords)
            {
                string key = entry.KeywordsStr.Trim();

                AllKeywords total;
                if (totals.TryGetValue(key, out total))
                {
                    // The total is seeded with the first article's occurrence count, so later
                    // articles contribute their occurrence counts as well (previously they added 1).
                    total.AllCount += entry.SingleCount;
                }
                else
                {
                    total = new AllKeywords(entry.KeywordsStr, entry.SingleCount, 0);
                    totals.Add(key, total);
                    allKeywordsArray.Add(total);
                }
            }

            return allKeywordsArray;
        }

 

        
/// <summary>
        /// Removes HTML tags, escapes any leftover angle brackets, collapses each run of
        /// whitespace into one space and trims the result.
        /// </summary>
        /// <param name="strHtml">Text to convert.</param>
        /// <returns>The converted text.</returns>
        private string stripHtml(string strHtml)
        {
            // Tags are deleted outright; "(.|\n)" lets a tag span line breaks.
            Regex tagPattern = new Regex("<(.|\n)+?>");
            string strOutput = tagPattern.Replace(strHtml, "");

            // Angle brackets that did not belong to a complete tag are escaped instead of removed.
            strOutput = strOutput.Replace("<", "&lt;");
            strOutput = strOutput.Replace(">", "&gt;");

            // Collapse each whitespace run into a single space.
            Regex whitespacePattern = new Regex(@"\s+");
            strOutput = whitespacePattern.Replace(strOutput, " ");

            // Trim returns a new string; the previous code discarded it.
            return strOutput.Trim();
        }


        
// Click handler: loads up to 10 unprocessed articles, applies the stop-word list
        // and shows the cleaned articles in dataGridView3.
        private void btnUseStopword_Click(object sender, EventArgs e)
        {
            List<ArticleInfo> pendingArticles = this.GetMyArticles("10");
            List<StopwordsInfo> stopwordList = this.GetMyStopwords();

            dataGridView3.DataSource = this.UseStopwords(pendingArticles, stopwordList);
        }

        
// Click handler: shows the first table of the article data set in dataGridView2.
        private void btnGetArticle_Click(object sender, EventArgs e)
        {
            dataGridView2.DataSource = this.GetArticles().Tables[0];
        }

        
// Click handler: shows the first table of the stop-word data set in dataGridView1.
        private void btnStopwords_Click(object sender, EventArgs e)
        {
            dataGridView1.DataSource = this.GetStopwords().Tables[0];
        }

        
// Click handler: processes all unprocessed articles on a worker thread in batches of 10.
        // The number of full batches comes from the current count of unprocessed articles;
        // a final, smaller batch handles the remainder. Each batch runs the full pipeline
        // (load -> stop words -> split/count/store) and its result is shown in the grid.
        private void btnGetKeywords_Click(object sender, EventArgs e)
        {
            int pageLength = 10;
            int articlesCount = this.GetArticlesCount();
            int doCount = articlesCount / pageLength;
            int lastLength = articlesCount % pageLength;

            // Overall progress: one step per full batch.
            progressBar1.Maximum = doCount;
            progressBar1.Value = 0;
            progressBar1.Step = 1;

            System.Threading.Thread thread = new System.Threading.Thread(delegate(object arg)
            {
                // Loaded once per run; previously the stop-word table was queried again for every batch.
                List<StopwordsInfo> stopwords = this.GetMyStopwords();

                for (int i = 0; i < doCount; i++)
                {
                    List<SingleKeywords> batchKeywords = this.SplitArticle(this.UseStopwords(this.GetMyArticles(pageLength.ToString()), stopwords));
                    OnGridViewDataBind(batchKeywords);
                    OnRrogressBarAdd(progressBar1.Step);
                }

                // Remainder batch. It also runs when there was no full batch at all (fewer articles
                // than one page), which keeps the earlier behavior of binding a result even for zero articles.
                if (lastLength != 0 || doCount == 0)
                {
                    List<SingleKeywords> lastKeywords = this.SplitArticle(this.UseStopwords(this.GetMyArticles(lastLength.ToString()), stopwords));
                    OnGridViewDataBind(lastKeywords);
                }
            });
            thread.Start();
        }

        
// Click handler: runs the pipeline on up to 10 unprocessed articles, aggregates the
        // keyword counts across them and shows the totals in dataGridView5.
        private void btnAllCompute_Click(object sender, EventArgs e)
        {
            List<ArticleInfo> pendingArticles = this.GetMyArticles("10");
            List<StopwordsInfo> stopwordList = this.GetMyStopwords();
            List<ArticleInfo> cleanedArticles = this.UseStopwords(pendingArticles, stopwordList);
            List<SingleKeywords> perArticleKeywords = this.SplitArticle(cleanedArticles);

            dataGridView5.DataSource = this.ComputeKeywords(perArticleKeywords);
        }


        
/* Cross-thread UI helpers */

        // Delegate used to assign the data source of dataGridView4 on the thread that owns the control.
        protected delegate void GridViewDataBind(object source);

        // Binds source to dataGridView4, going through Control.Invoke when the control reports
        // InvokeRequired. Does nothing when the grid does not exist.
        protected void OnGridViewDataBind(object source)
        {
            if (dataGridView4 == null)
            {
                return;
            }

            if (!dataGridView4.InvokeRequired)
            {
                dataGridView4.DataSource = source;
                return;
            }

            GridViewDataBind bind = delegate(object dataSource)
            {
                dataGridView4.DataSource = dataSource;
            };
            dataGridView4.Invoke(bind, source);
        }
        
        
// Overall progress bar (progressBar1): delegate used to advance it on the thread that owns the control.
        protected delegate void RrogressBarAdd(int step);

        // Adds step to progressBar1.Value, going through Control.Invoke when the control reports
        // InvokeRequired. Does nothing when the bar does not exist.
        protected void OnRrogressBarAdd(int step)
        {
            if (progressBar1 == null)
            {
                return;
            }

            if (!progressBar1.InvokeRequired)
            {
                progressBar1.Value += step;
                return;
            }

            RrogressBarAdd advance = delegate(int mystep)
            {
                progressBar1.Value += mystep;
            };
            progressBar1.Invoke(advance, step);
        }


        
// Per-batch progress bar (progressBar2): delegate used to advance it on the thread that owns the control.
        protected delegate void RrogressBarAdd2(int step);

        // Adds step to progressBar2.Value, going through Control.Invoke when the control reports
        // InvokeRequired. Does nothing when the bar does not exist.
        protected void OnRrogressBarAdd2(int step)
        {
            if (progressBar2 == null)
            {
                return;
            }

            if (!progressBar2.InvokeRequired)
            {
                progressBar2.Value += step;
                return;
            }

            RrogressBarAdd2 advance = delegate(int mystep)
            {
                progressBar2.Value += mystep;
            };
            progressBar2.Invoke(advance, step);
        }
        
// Delegate used to reset the per-batch progress bar (progressBar2) on the thread that owns the control.
        protected delegate void RrogressBar2Set(int maximum);

        // Resets progressBar2 for a new batch: Maximum = maximum, Value = 0, Step = 1.
        // Goes through Control.Invoke when the control reports InvokeRequired; does nothing when
        // the bar does not exist.
        protected void OnRrogressBar2Set(int maximum)
        {
            if (progressBar2 == null)
            {
                return;
            }

            RrogressBar2Set reset = delegate(int myMaximum)
            {
                progressBar2.Maximum = myMaximum;
                progressBar2.Value = 0;
                progressBar2.Step = 1;
            };

            if (progressBar2.InvokeRequired)
            {
                progressBar2.Invoke(reset, maximum);
            }
            else
            {
                // Same reset when already on the owning thread. The previous code added maximum to
                // Value here instead, which never reset the bar and could push Value past Maximum.
                reset(maximum);
            }
        }

 


        
// Runs articleStr through a new WordSegV1 instance with ' ' as the separator argument
        // and returns the segmenter's output.
        public string segment(string articleStr)
        {
            return new WordSegV1().Segment(articleStr, ' ');
        }

 


    }
}

 

 

 

程序开发完毕后,我突然发现分词不是那么容易的,找了一下,又发现好东西了:C#版本开源的中文分词ICTCLAS,和一个简单的C#版本的分词组件。中文分词组件   好慢,等申请首页发布后我再给出另外下载的代码吧,呵呵,看博客园园长的了。

posted on 2010-03-22 11:56  Mix  阅读(683)  评论(2)  编辑  收藏  举报