昨天有幸拜读了洞庭散人的《基于朴素贝叶斯分类器的文本分类算法》,我正在学习这个,我从内心感谢洞庭散人的分享!随即我把它移植到了C#平台上。
该程序用到了Lucene.Net,用到了基于词典的ICTCLAS中文分词1.0.
昨天有幸拜读了洞庭散人的《基于朴素贝叶斯分类器的文本分类算法》,我正在学习这个,我从内心感谢洞庭散人的分享!随即我把它移植到了C#平台上。
该程序用到了Lucene.Net,用到了基于词典的ICTCLAS中文分词1.0.
ICTCLAS中文分词for Lucene.Net接口代码(实现Analyzer):

Code
1
using System;
2
using System.Collections.Generic;
3
using System.Text;
4
using System.IO;
5
6
using Lucene.Net.Analysis;
7
using Lucene.Net.Analysis.Standard;
8
9
namespace AspxOn.Search.FenLei
10

{
11
12
/// <summary>
/// Lucene.Net analyzer that tokenizes text with <c>ICTCLASTokenizer</c> and then
/// applies <c>StandardFilter</c>, <c>LowerCaseFilter</c> and a stop-word filter.
/// </summary>
public class ICTCLASAnalyzer : Analyzer
{
    /// <summary>
    /// Fixed-size mirror of the stop words read by the most recently constructed
    /// analyzer. Kept for backward compatibility: it holds at most its declared
    /// length of words and unfilled slots stay <c>null</c>. Filtering itself uses
    /// the per-instance list, which has neither limitation.
    /// </summary>
    public static readonly System.String[] CHINESE_ENGLISH_STOP_WORDS = new string[428];

    /// <summary>Path of the stop-word file (one word per line).</summary>
    public string NoisePath = Environment.CurrentDirectory + "\\data\\stopwords.txt";

    // Exactly the words read from the file: no null padding and no upper bound.
    private readonly string[] loadedStopWords;

    /// <summary>
    /// Reads the stop-word file at <see cref="NoisePath"/>. Reading stops at the
    /// first empty line or at end of file, whichever comes first.
    /// </summary>
    public ICTCLASAnalyzer()
    {
        List<string> words = new List<string>();

        // 'using' guarantees the file handle is released even if reading throws.
        using (StreamReader reader = new StreamReader(NoisePath, System.Text.Encoding.Default))
        {
            string noise = reader.ReadLine();
            while (!string.IsNullOrEmpty(noise))
            {
                words.Add(noise);
                noise = reader.ReadLine();
            }
        }

        loadedStopWords = words.ToArray();

        // Mirror into the legacy static array without running past its end.
        int copyCount = Math.Min(loadedStopWords.Length, CHINESE_ENGLISH_STOP_WORDS.Length);
        Array.Copy(loadedStopWords, CHINESE_ENGLISH_STOP_WORDS, copyCount);
    }

    /// <summary>
    /// Builds the analysis chain for one field: ICTCLAS tokenizer, then standard
    /// filter, lower-case filter and stop-word filter.
    /// </summary>
    /// <param name="fieldName">Name of the field being analyzed (not used here).</param>
    /// <param name="reader">Source of the text to tokenize.</param>
    /// <returns>The filtered token stream.</returns>
    public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
    {
        TokenStream result = new ICTCLASTokenizer(reader);
        result = new StandardFilter(result);
        result = new LowerCaseFilter(result);
        // Pass only real words; the static array may contain null padding.
        result = new StopFilter(result, loadedStopWords);
        return result;
    }
}
50
}
ICTCLAS中文分词for Lucene.Net接口代码(实现Tokenizer):

Code
1
using System;
2
using System.Collections.Generic;
3
using System.Text;
4
5
using Lucene.Net.Analysis;
6
using SharpICTCLAS;
7
using System.IO;
8
9
namespace AspxOn.Search.FenLei
10

{
11
/// <summary>
/// Lucene.Net tokenizer that reads the whole input up front, segments it with
/// SharpICTCLAS and then returns the segmented words one token at a time.
/// </summary>
public class ICTCLASTokenizer : Tokenizer
{
    // Second argument passed to WordSegment.Segment.
    // NOTE(review): presumably the number of candidate segmentations requested - confirm.
    int nKind = 1;

    // Segmentation output; only the first entry (result[0]) is consumed.
    List<WordResult[]> result;

    // Offsets of the current token, measured in the text after "\r\n" removal.
    int startIndex = 0;
    int endIndex = 0;

    // Index of the next word to return. Iteration covers indices 1 .. Length - 2,
    // so the first and last entries of result[0] are never returned.
    // NOTE(review): presumably those are sentence begin/end markers - confirm.
    int i = 1;

    /// <summary>The sentence to segment (input text with "\r\n" removed).</summary>
    private string sentence;

    /// <summary>
    /// Constructs a tokenizer for this reader: reads it to the end, strips "\r\n"
    /// and runs the segmenter initialized from the "Data" directory under the
    /// current directory.
    /// </summary>
    /// <param name="reader">Source of the text to tokenize.</param>
    public ICTCLASTokenizer(System.IO.TextReader reader)
    {
        this.input = reader;
        sentence = input.ReadToEnd();
        sentence = sentence.Replace("\r\n", "");
        string DictPath = Path.Combine(Environment.CurrentDirectory, "Data") + Path.DirectorySeparatorChar;
        WordSegment wordSegment = new WordSegment();
        wordSegment.InitWordSegment(DictPath);
        result = wordSegment.Segment(sentence, nKind);
    }

    /// <summary>
    /// Returns the next token, or <c>null</c> when the words are exhausted or the
    /// segmenter produced no result.
    /// </summary>
    /// <returns>The next token, or <c>null</c> at end of stream.</returns>
    public override Token Next()
    {
        // Guard: nothing to iterate if segmentation yielded no result.
        if (result == null || result.Count == 0 || result[0] == null)
        {
            return null;
        }

        WordResult[] words = result[0];
        if (i >= words.Length - 1)
        {
            return null;
        }

        string word = words[i].sWord;
        i++;

        // Lucene end offsets are exclusive: one past the token's last character.
        endIndex = startIndex + word.Length;
        Token token = new Token(word, startIndex, endIndex);
        startIndex = endIndex;
        return token;
    }
}
58
}
中文分词器代码:

Code
1
using System;
2
using System.Collections.Generic;
3
using System.Text;
4
using System.IO;
5
6
using Lucene.Net.Analysis;
7
using Lucene.Net.Analysis.Standard;
8
using Lucene.Net.Documents;
9
10
using Lucene.Net.Analysis.Cn;
11
using Lucene.Net.Analysis.KTDictSeg;
12
13
namespace AspxOn.Search.FenLei
14

{
15
/// <summary>
/// Chinese word splitter: runs text through <c>ICTCLASAnalyzer</c> and joins the
/// resulting terms with a separator.
/// </summary>
public class ChineseSpliter
{
    /// <summary>
    /// Splits <paramref name="text"/> into terms and joins them with
    /// <paramref name="splitToken"/>.
    /// </summary>
    /// <param name="text">Text to segment.</param>
    /// <param name="splitToken">Separator placed between consecutive terms.</param>
    /// <returns>
    /// The joined terms, or an empty string when the analyzer yields no tokens.
    /// </returns>
    public static string Split(string text, string splitToken)
    {
        StringBuilder sb = new StringBuilder();
        Analyzer an = new ICTCLASAnalyzer();
        TokenStream ts = an.TokenStream("", new StringReader(text));

        try
        {
            bool first = true;
            Lucene.Net.Analysis.Token token;
            while ((token = ts.Next()) != null)
            {
                // Write the separator only between terms, so nothing has to be
                // trimmed afterwards and separators of any length work.
                if (!first)
                {
                    sb.Append(splitToken);
                }
                sb.Append(token.TermText());
                first = false;
            }
        }
        finally
        {
            ts.Close();
        }

        return sb.ToString();
    }
}
39
}
训练管理器代码:

Code
1
using System;
2
using System.Collections.Generic;
3
using System.Text;
4
using System.IO;
5
6
using System.Text.RegularExpressions;
7
8
namespace AspxOn.Search.FenLei
9

{
10
11
/// <summary>
/// Training data manager: gives access to a corpus laid out as one sub-directory
/// per category under a root directory, each containing the category's sample files.
/// </summary>
public class TrainingDataManager
{
    private string[] trainingFileClassicfications; // category names (sub-directory names)
    private DirectoryInfo trainingTextDir;         // root directory of the training corpus
    private string defaultDir = "D:\\SogouC.mini.20061127\\SogouC.mini\\Sample";
    //private string defaultDir = @"J:\SogouC.reduced.20061127\SogouC.reduced\Reduced";

    /// <summary>
    /// Verifies that the corpus directory exists and records the names of its
    /// top-level sub-directories as the category list.
    /// </summary>
    /// <exception cref="DirectoryNotFoundException">The corpus directory does not exist.</exception>
    public TrainingDataManager()
    {
        if (!Directory.Exists(defaultDir))
        {
            throw new DirectoryNotFoundException("当前语料目录不存在!");
        }
        trainingTextDir = new DirectoryInfo(defaultDir);

        trainingFileClassicfications = Directory.GetDirectories(defaultDir, "*", SearchOption.TopDirectoryOnly);

        // GetDirectories returns full paths; keep only the last path segment.
        for (int i = 0; i < trainingFileClassicfications.Length; i++)
        {
            trainingFileClassicfications[i] = Path.GetFileName(trainingFileClassicfications[i]);
        }
    }

    /// <summary>Gets the list of categories.</summary>
    /// <returns>The category names (the internal array, not a copy).</returns>
    public string[] GetTrainingClassifications()
    {
        return trainingFileClassicfications;
    }

    /// <summary>Gets the paths of the files under the given category.</summary>
    /// <param name="classification">Category name.</param>
    /// <returns>Paths of the files in that category's directory.</returns>
    public string[] GetFilesPath(string classification)
    {
        return Directory.GetFiles(defaultDir + "\\" + classification);
    }

    /// <summary>Gets the content of the file at the given path.</summary>
    /// <param name="filepath">Path of the file to read.</param>
    /// <returns>The file content decoded with <c>Encoding.Default</c>.</returns>
    public string GetFileText(string filepath)
    {
        // ReadAllBytes always reads the full file and disposes the stream,
        // unlike a single FileStream.Read call whose return value is ignored.
        byte[] bt = File.ReadAllBytes(filepath);
        return Encoding.Default.GetString(bt);
    }

    /// <summary>Gets the total number of files across all categories.</summary>
    /// <returns>Sum of the per-category file counts.</returns>
    public int GetTrainFileCount()
    {
        int ret = 0;
        for (int i = 0; i < trainingFileClassicfications.Length; i++)
        {
            ret += GetTrainFileCountOfCertainClassification(trainingFileClassicfications[i]);
        }
        return ret;
    }

    /// <summary>Gets the number of files under the given category.</summary>
    /// <param name="classification">Category name.</param>
    /// <returns>Number of files in that category's directory.</returns>
    public int GetTrainFileCountOfCertainClassification(string classification)
    {
        return Directory.GetFiles(defaultDir + "\\" + classification).Length;
    }

    /// <summary>
    /// Counts the samples of a category whose text contains the given key.
    /// Every file of the category is read from disk on each call.
    /// </summary>
    /// <param name="classification">Category name.</param>
    /// <param name="key">Keyword (or single character) to look for.</param>
    /// <returns>Number of files containing <paramref name="key"/>.</returns>
    /// <exception cref="Exception">
    /// Reading or searching a file failed; the original error is in <c>InnerException</c>.
    /// </exception>
    public int GetCountContainKeyOfClassification(string classification, string key)
    {
        int ret = 0;
        string[] filepaths = GetFilesPath(classification);
        try
        {
            for (int i = 0; i < filepaths.Length; i++)
            {
                string text = GetFileText(filepaths[i]);
                if (text.Contains(key))
                {
                    ret++;
                }
            }
        }
        catch (Exception ex)
        {
            // Same exception type and message as before, but keep the root cause.
            throw new Exception("error!", ex);
        }
        return ret;
    }
}
131
}
132
先验概率计算代码:

Code
1
using System;
2
using System.Collections.Generic;
3
using System.Text;
4
5
namespace AspxOn.Search.FenLei
6

{
7
/// <summary>
/// Prior probability P(c) of a category, estimated from training file counts.
/// </summary>
public class PriorProbability
{
    private static TrainingDataManager tdm = new TrainingDataManager();

    /// <summary>
    /// Computes the prior probability of a category as the number of training
    /// files in that category divided by the total number of training files.
    /// </summary>
    /// <param name="c">The category.</param>
    /// <returns>The prior probability of <paramref name="c"/>.</returns>
    public static float CaculatePc(string c)
    {
        float filesInCategory = tdm.GetTrainFileCountOfCertainClassification(c);
        float filesInCorpus = tdm.GetTrainFileCount();
        float prior = filesInCategory / filesInCorpus;
        return prior;
    }
}
28
}
条件概率计算代码:

Code
1
using System;
2
using System.Collections.Generic;
3
using System.Text;
4
5
namespace AspxOn.Search.FenLei
6

{
7
/// <summary>
/// Class-conditional probability P(x|c) of a keyword given a category.
/// </summary>
public class ClassConditionalProbability
{
    private static TrainingDataManager tdm = new TrainingDataManager();

    // Extra smoothing term added to the denominator; currently zero.
    private static float M = 0F;

    /// <summary>
    /// Computes the smoothed class-conditional probability
    /// (Nxc + 1) / (Nc + V + M), where Nxc is the number of files in the category
    /// that contain the keyword, Nc the number of files in the category and V the
    /// number of categories.
    /// </summary>
    /// <param name="x">The keyword.</param>
    /// <param name="c">The category.</param>
    /// <returns>The smoothed probability of <paramref name="x"/> given <paramref name="c"/>.</returns>
    public static float CaculatePxc(string x, string c)
    {
        float filesWithKey = tdm.GetCountContainKeyOfClassification(c, x);
        float filesInCategory = tdm.GetTrainFileCountOfCertainClassification(c);
        float categoryCount = tdm.GetTrainingClassifications().Length;

        // The +1 keeps the result above zero when no file contains the keyword.
        return (filesWithKey + 1) / (filesInCategory + categoryCount + M);
    }
}
34
}
用于保存分类结果的类:

Code
1
using System;
2
using System.Collections.Generic;
3
using System.Text;
4
5
namespace AspxOn.Search.FenLei
6

{
7
/// <summary>
/// Holds one classification outcome: a category and the score computed for it.
/// </summary>
public class ClassifyResult
{
    /// <summary>Score computed for the category; starts at 0.</summary>
    public double probability = 0;

    /// <summary>Category name; starts as the empty string.</summary>
    public string classification = string.Empty;

    /// <summary>Creates a result with the default field values.</summary>
    public ClassifyResult()
    {
    }
}
21
}
贝叶斯分类器代码:

Code
1
using System;
2
using System.Collections.Generic;
3
using System.Text;
4
5
namespace AspxOn.Search.FenLei
6

{
7
/// <summary>
/// Naive Bayes text classifier built on the training corpus exposed by
/// <c>TrainingDataManager</c>.
/// </summary>
public class BayesClassifier
{
    private TrainingDataManager tdm; // training set manager

    // Each class-conditional probability is multiplied by this factor because the
    // raw product becomes very small.
    private static float zoomFactor = 10.0F;

    /// <summary>Default constructor; initializes the training set manager.</summary>
    public BayesClassifier()
    {
        tdm = new TrainingDataManager();
    }

    /// <summary>
    /// Computes the score of category <paramref name="Cj"/> for the term vector
    /// <paramref name="X"/>: the product of the scaled class-conditional
    /// probabilities of all terms, multiplied by the prior of the category.
    /// </summary>
    /// <param name="X">Term vector of the text.</param>
    /// <param name="Cj">The category.</param>
    /// <returns>The scaled product of probabilities.</returns>
    protected float CaluProd(string[] X, string Cj)
    {
        // NOTE(review): a float product over many terms can still underflow to 0 or
        // overflow to Infinity; summing logarithms would avoid that but would change
        // the values returned to callers, so it is left as is.
        float ret = 1.0F;
        for (int i = 0; i < X.Length; i++)
        {
            string Xi = X[i];
            ret *= ClassConditionalProbability.CaculatePxc(Xi, Cj) * zoomFactor;
        }
        ret *= PriorProbability.CaculatePc(Cj); // finally multiply by the prior
        return ret;
    }

    /// <summary>Classifies the given text.</summary>
    /// <param name="text">Text to classify.</param>
    /// <returns>One result per category, in category-list order.</returns>
    public List<ClassifyResult> Classify(string text)
    {
        // Word segmentation; terms are joined with '|' and split again here.
        string[] terms = ChineseSpliter.Split(text, "|").Split('|');
        string[] classes = tdm.GetTrainingClassifications();
        List<ClassifyResult> crs = new List<ClassifyResult>();
        for (int i = 0; i < classes.Length; i++)
        {
            string Ci = classes[i];
            ClassifyResult cr = new ClassifyResult();
            cr.classification = Ci;
            cr.probability = CaluProd(terms, Ci);
            crs.Add(cr);
        }
        return crs;
    }

    /// <summary>
    /// Returns the category with the highest score. On ties the earliest entry wins.
    /// </summary>
    /// <param name="crs">Results as returned by <see cref="Classify"/>.</param>
    /// <returns>
    /// The best category, or an empty string when <paramref name="crs"/> is null or empty.
    /// </returns>
    public string GetMaxNum(List<ClassifyResult> crs)
    {
        if (crs == null || crs.Count == 0)
        {
            return string.Empty;
        }

        // Seed both the score and the name from the first entry; seeding only the
        // score would return an empty name whenever the first entry is the maximum.
        double best = crs[0].probability;
        string classification = crs[0].classification;
        for (int i = 1; i < crs.Count; i++)
        {
            if (crs[i].probability > best)
            {
                best = crs[i].probability;
                classification = crs[i].classification;
            }
        }
        return classification;
    }
}
82
}
代码太多,编辑的时候卡得很,于是剩下的部分放到(二)里。