基于朴素贝叶斯分类器的文本分类算法C#版(一)
昨天有幸拜读了洞庭散人的《基于朴素贝叶斯分类器的文本分类算法》,我正在学习这个,我从内心感谢洞庭散人的分享!随即我把它移植到了C#平台上。
该程序用到了Lucene.Net,用到了基于词典的ICTCLAS中文分词1.0.
昨天有幸拜读了洞庭散人的<基于朴素贝叶斯分类器的文本分类算法>,我正在学习这个,我从内心感谢洞庭散人的分享!随即我把它移植到了c#平台上。
该程序用到了Lucene.Net,用到了基于词典的ICTCLAS中文分词1.0.
ICTCLAS中文分词for Lucene.Net接口代码(实现Analyzer):
1
using System;2
using System.Collections.Generic;3
using System.Text;4
using System.IO;5

6
using Lucene.Net.Analysis;7
using Lucene.Net.Analysis.Standard;8

9
namespace AspxOn.Search.FenLei
{
    /// <summary>
    /// Lucene.Net <see cref="Analyzer"/> adapter for the ICTCLAS word segmenter.
    /// Stop words are loaded from <see cref="NoisePath"/> when the analyzer is constructed.
    /// </summary>
    public class ICTCLASAnalyzer : Analyzer
    {
        /// <summary>
        /// Legacy public stop-word table, kept for callers that read it directly.
        /// It has a fixed capacity of 428 entries; each constructor call copies at most
        /// that many loaded words into it, and unused slots are left untouched (possibly null).
        /// The analyzer itself filters with its own, exactly sized, per-instance list.
        /// </summary>
        public static readonly System.String[] CHINESE_ENGLISH_STOP_WORDS = new string[428];

        /// <summary>
        /// Stop-word file, one word per line, resolved against the current directory.
        /// </summary>
        public string NoisePath = Environment.CurrentDirectory + "\\data\\stopwords.txt";

        // Exactly the words read from NoisePath: no null padding and no 428-entry cap.
        private readonly string[] stopWords;

        /// <summary>
        /// Reads the stop-word file (system default ANSI encoding) up to the first empty
        /// line or end of file.
        /// </summary>
        public ICTCLASAnalyzer()
        {
            List<string> words = new List<string>();

            // "using" guarantees the file handle is released even if reading fails.
            using (StreamReader reader = new StreamReader(NoisePath, System.Text.Encoding.Default))
            {
                string noise = reader.ReadLine();
                // Reading stops at the first empty line, as it always has.
                while (!string.IsNullOrEmpty(noise))
                {
                    words.Add(noise);
                    noise = reader.ReadLine();
                }
            }

            stopWords = words.ToArray();

            // Mirror into the legacy static table without ever writing past its end.
            int mirrored = Math.Min(stopWords.Length, CHINESE_ENGLISH_STOP_WORDS.Length);
            Array.Copy(stopWords, CHINESE_ENGLISH_STOP_WORDS, mirrored);
        }

        /// <summary>
        /// Builds the analysis chain: <see cref="ICTCLASTokenizer"/>, then StandardFilter,
        /// then LowerCaseFilter, then StopFilter using the loaded stop words.
        /// </summary>
        /// <param name="fieldName">Field being analyzed; not used by this analyzer.</param>
        /// <param name="reader">Source of the text to tokenize.</param>
        /// <returns>The filtered token stream.</returns>
        public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
        {
            TokenStream result = new ICTCLASTokenizer(reader);
            result = new StandardFilter(result);
            result = new LowerCaseFilter(result);
            // Pass the exactly sized list: the static table may contain null slots,
            // which StopFilter is not guaranteed to tolerate.
            result = new StopFilter(result, stopWords);
            return result;
        }
    }
}
ICTCLAS中文分词for Lucene.Net接口代码(实现Tokenizer):
1
using System;2
using System.Collections.Generic;3
using System.Text;4

5
using Lucene.Net.Analysis;6
using SharpICTCLAS;7
using System.IO;8

9
namespace AspxOn.Search.FenLei
{
    /// <summary>
    /// Lucene.Net <see cref="Tokenizer"/> backed by SharpICTCLAS. The whole input is read
    /// and segmented in the constructor; <see cref="Next"/> then hands out the words one by one.
    /// </summary>
    public class ICTCLASTokenizer : Tokenizer
    {
        // Passed to WordSegment.Segment; only the first returned segmentation (result[0]) is consumed.
        int nKind = 1;
        List<WordResult[]> result;
        // Running offsets of the current token within the segmented sentence.
        int startIndex = 0;
        int endIndex = 0;
        // Cursor into result[0]. Starts at 1 and stops before the last entry;
        // presumably those two slots are begin/end sentence markers added by
        // SharpICTCLAS (TODO confirm against the library).
        int i = 1;

        /// <summary>
        /// The sentence to segment (input text with "\r\n" sequences removed).
        /// </summary>
        private string sentence;

        /// <summary>
        /// Constructs a tokenizer for <paramref name="reader"/>: reads it to the end,
        /// strips "\r\n", loads the dictionary from the "Data" folder under the current
        /// directory and segments the text.
        /// </summary>
        /// <param name="reader">Source of the text to tokenize.</param>
        public ICTCLASTokenizer(System.IO.TextReader reader)
        {
            this.input = reader;
            sentence = input.ReadToEnd();
            sentence = sentence.Replace("\r\n", "");
            string DictPath = Path.Combine(Environment.CurrentDirectory, "Data") + Path.DirectorySeparatorChar;
            // NOTE(review): the dictionary is re-initialized for every tokenizer instance.
            WordSegment wordSegment = new WordSegment();
            wordSegment.InitWordSegment(DictPath);
            result = wordSegment.Segment(sentence, nKind);
        }

        /// <summary>
        /// Returns the next token, or null when the segmentation is exhausted or empty.
        /// </summary>
        /// <remarks>
        /// Offsets are cumulative word lengths over the "\r\n"-stripped sentence.
        /// The end offset is exclusive (start + length), as Lucene's Token contract requires.
        /// </remarks>
        public override Token Next()
        {
            // An empty segmentation yields no tokens rather than an index error.
            if (result == null || result.Count == 0 || result[0] == null)
            {
                return null;
            }

            WordResult[] words = result[0];
            if (i >= words.Length - 1)
            {
                return null;
            }

            string word = words[i].sWord;
            endIndex = startIndex + word.Length;
            Token token = new Token(word, startIndex, endIndex);
            startIndex = endIndex;
            i++;
            return token;
        }
    }
}
中文分词器代码:
1
using System;2
using System.Collections.Generic;3
using System.Text;4
using System.IO;5

6
using Lucene.Net.Analysis;7
using Lucene.Net.Analysis.Standard;8
using Lucene.Net.Documents;9

10
using Lucene.Net.Analysis.Cn;11
using Lucene.Net.Analysis.KTDictSeg;12

13
namespace AspxOn.Search.FenLei
{
    /// <summary>
    /// Chinese word splitter: runs text through <see cref="ICTCLASAnalyzer"/> and joins
    /// the resulting terms with a caller-supplied separator.
    /// </summary>
    public class ChineseSpliter
    {
        /// <summary>
        /// Segments <paramref name="text"/> and joins the terms with <paramref name="splitToken"/>.
        /// </summary>
        /// <param name="text">Text to segment.</param>
        /// <param name="splitToken">Separator placed between consecutive terms; any length.</param>
        /// <returns>
        /// The joined terms with no leading or trailing separator, or an empty string when
        /// the analyzer produces no terms.
        /// </returns>
        public static string Split(string text, string splitToken)
        {
            StringBuilder sb = new StringBuilder();
            Analyzer an = new ICTCLASAnalyzer();
            TokenStream ts = an.TokenStream("", new StringReader(text));

            // Write the separator only between terms. The previous approach of prefixing
            // every term and then dropping one character failed on empty output and on
            // separators that are not exactly one character long.
            bool first = true;
            Lucene.Net.Analysis.Token token;
            while ((token = ts.Next()) != null)
            {
                if (!first)
                {
                    sb.Append(splitToken);
                }
                sb.Append(token.TermText());
                first = false;
            }

            return sb.ToString();
        }
    }
}
训练管理器代码:
1
using System;2
using System.Collections.Generic;3
using System.Text;4
using System.IO;5

6
using System.Text.RegularExpressions;7

8
namespace AspxOn.Search.FenLei
{
    /// <summary>
    /// Training data manager: exposes the training corpus stored on disk as one
    /// sub-directory per classification, each holding that classification's sample files.
    /// </summary>
    public class TrainingDataManager
    {
        private string[] trainingFileClassicfications; // classification names (sub-directory names)
        private DirectoryInfo trainingTextDir;          // corpus root directory
        private string defaultDir = "D:\\SogouC.mini.20061127\\SogouC.mini\\Sample";

        /// <summary>
        /// Enumerates the top-level sub-directories of the corpus root and records their
        /// names as the known classifications.
        /// </summary>
        /// <exception cref="DirectoryNotFoundException">The corpus root does not exist.</exception>
        public TrainingDataManager()
        {
            if (!Directory.Exists(defaultDir))
            {
                throw new DirectoryNotFoundException("当前语料目录不存在!");
            }
            trainingTextDir = new DirectoryInfo(defaultDir);

            trainingFileClassicfications = Directory.GetDirectories(defaultDir, "*", SearchOption.TopDirectoryOnly);

            for (int i = 0; i < trainingFileClassicfications.Length; i++)
            {
                // Keep only the last path segment, i.e. the directory's own name.
                trainingFileClassicfications[i] = Path.GetFileName(trainingFileClassicfications[i]);
            }
        }

        /// <summary>
        /// Gets the list of classifications.
        /// </summary>
        /// <returns>The internal array of classification names (not a copy).</returns>
        public string[] GetTrainingClassifications()
        {
            return trainingFileClassicfications;
        }

        /// <summary>
        /// Gets the paths of all files directly under the given classification.
        /// </summary>
        /// <param name="classification">Classification (sub-directory) name.</param>
        /// <returns>Full paths of the sample files.</returns>
        public string[] GetFilesPath(string classification)
        {
            return Directory.GetFiles(Path.Combine(defaultDir, classification));
        }

        /// <summary>
        /// Reads the whole file and decodes it with the system default encoding.
        /// </summary>
        /// <param name="filepath">Path of the file to read.</param>
        /// <returns>The decoded file content.</returns>
        public string GetFileText(string filepath)
        {
            // File.ReadAllBytes loops until the whole file is read and always closes the
            // handle; a single FileStream.Read call is allowed to return fewer bytes.
            byte[] bt = File.ReadAllBytes(filepath);
            return Encoding.Default.GetString(bt);
        }

        /// <summary>
        /// Gets the total number of sample files across all classifications.
        /// </summary>
        /// <returns>Sum of the per-classification file counts.</returns>
        public int GetTrainFileCount()
        {
            int ret = 0;
            for (int i = 0; i < trainingFileClassicfications.Length; i++)
            {
                ret += GetTrainFileCountOfCertainClassification(trainingFileClassicfications[i]);
            }
            return ret;
        }

        /// <summary>
        /// Gets the number of sample files under the given classification.
        /// </summary>
        /// <param name="classification">Classification (sub-directory) name.</param>
        /// <returns>Number of files directly under that classification.</returns>
        public int GetTrainFileCountOfCertainClassification(string classification)
        {
            return GetFilesPath(classification).Length;
        }

        /// <summary>
        /// Counts the samples of a classification whose text contains the given key.
        /// </summary>
        /// <param name="classification">Classification to scan.</param>
        /// <param name="key">Keyword searched with an ordinal substring match.</param>
        /// <returns>Number of matching sample files.</returns>
        /// <exception cref="Exception">
        /// Reading or searching a sample failed; the cause is in InnerException.
        /// </exception>
        public int GetCountContainKeyOfClassification(string classification, string key)
        {
            int ret = 0;
            string[] filepaths = GetFilesPath(classification);
            try
            {
                for (int i = 0; i < filepaths.Length; i++)
                {
                    string text = GetFileText(filepaths[i]);
                    if (text.Contains(key))
                    {
                        ret++;
                    }
                }
            }
            catch (Exception ex)
            {
                // Same exception type and message as before, but the root cause is preserved.
                throw new Exception("error!", ex);
            }
            return ret;
        }
    }
}

先验概率计算代码:
1
using System;2
using System.Collections.Generic;3
using System.Text;4

5
namespace AspxOn.Search.FenLei
{
    /// <summary>
    /// Prior probability of a classification, estimated from training file counts.
    /// </summary>
    public class PriorProbability
    {
        private static TrainingDataManager trainingData = new TrainingDataManager();

        /// <summary>
        /// Computes P(c) as the number of training files in <paramref name="c"/> divided
        /// by the number of training files in all classifications.
        /// </summary>
        /// <param name="c">The classification.</param>
        /// <returns>The ratio of the two counts, computed in single precision.</returns>
        public static float CaculatePc(string c)
        {
            float filesInClass = trainingData.GetTrainFileCountOfCertainClassification(c);
            float filesInCorpus = trainingData.GetTrainFileCount();
            return filesInClass / filesInCorpus;
        }
    }
}
条件概率计算代码:
1
using System;2
using System.Collections.Generic;3
using System.Text;4

5
namespace AspxOn.Search.FenLei
{
    /// <summary>
    /// Class-conditional probability of a keyword, estimated from training file counts.
    /// </summary>
    public class ClassConditionalProbability
    {
        private static TrainingDataManager trainingData = new TrainingDataManager();

        // Extra term added to the denominator; currently zero.
        private static float M = 0F;

        /// <summary>
        /// Computes the smoothed estimate (Nxc + 1) / (Nc + V + M), where Nxc is the number
        /// of files in <paramref name="c"/> containing <paramref name="x"/>, Nc the number
        /// of files in <paramref name="c"/>, and V the number of classifications.
        /// </summary>
        /// <param name="x">The keyword.</param>
        /// <param name="c">The classification.</param>
        /// <returns>The smoothed estimate, computed in single precision.</returns>
        public static float CaculatePxc(string x, string c)
        {
            float filesWithKey = trainingData.GetCountContainKeyOfClassification(c, x);
            float filesInClass = trainingData.GetTrainFileCountOfCertainClassification(c);
            float classCount = trainingData.GetTrainingClassifications().Length;

            // The +1 keeps an unseen keyword from producing a zero factor.
            float numerator = filesWithKey + 1;
            float denominator = filesInClass + classCount + M;
            return numerator / denominator;
        }
    }
}
用于保存分类结果的类:
1
using System;2
using System.Collections.Generic;3
using System.Text;4

5
namespace AspxOn.Search.FenLei
{
    /// <summary>
    /// Holds one classification outcome: a classification name and its score.
    /// </summary>
    public class ClassifyResult
    {
        /// <summary>Score assigned to the classification; starts at 0.</summary>
        public double probability = 0;

        /// <summary>Classification name; starts as the empty string.</summary>
        public string classification = string.Empty;

        /// <summary>
        /// Creates a result with a zero score and an empty classification name.
        /// </summary>
        public ClassifyResult()
        {
        }
    }
}
贝叶斯分类器代码:
1
using System;2
using System.Collections.Generic;3
using System.Text;4

5
namespace AspxOn.Search.FenLei
{
    /// <summary>
    /// Naive Bayes text classifier over the corpus exposed by <see cref="TrainingDataManager"/>.
    /// </summary>
    public class BayesClassifier
    {
        private TrainingDataManager tdm; // training set manager

        // Every per-term factor is multiplied by this to slow the product's decay toward zero.
        private static float zoomFactor = 10.0F;

        /// <summary>
        /// Default constructor; initializes the training set manager.
        /// </summary>
        public BayesClassifier()
        {
            tdm = new TrainingDataManager();
        }

        /// <summary>
        /// Computes the score of term vector <paramref name="X"/> for classification
        /// <paramref name="Cj"/>: the product of each term's class-conditional
        /// probability (scaled by zoomFactor), multiplied by the prior of <paramref name="Cj"/>.
        /// </summary>
        /// <param name="X">Terms of the text.</param>
        /// <param name="Cj">The classification.</param>
        /// <returns>The scaled product; comparable across classifications for the same X.</returns>
        protected float CaluProd(string[] X, string Cj)
        {
            // NOTE(review): a single-precision running product can still underflow or
            // overflow on long texts; scores are only meaningful relative to each other.
            float ret = 1.0F;
            for (int i = 0; i < X.Length; i++)
            {
                string Xi = X[i];
                ret *= ClassConditionalProbability.CaculatePxc(Xi, Cj) * zoomFactor;
            }
            ret *= PriorProbability.CaculatePc(Cj); // finally weight by the prior
            return ret;
        }

        /// <summary>
        /// Scores the given text against every known classification.
        /// </summary>
        /// <param name="text">Text to classify.</param>
        /// <returns>One <see cref="ClassifyResult"/> per classification, in classification order.</returns>
        public List<ClassifyResult> Classify(string text)
        {
            // Segment the text, then drop empty terms: an empty term matches every
            // document and would only distort the scores.
            string[] terms = ChineseSpliter.Split(text, "|")
                .Split(new char[] { '|' }, StringSplitOptions.RemoveEmptyEntries);
            string[] classes = tdm.GetTrainingClassifications();
            List<ClassifyResult> crs = new List<ClassifyResult>(classes.Length);
            for (int i = 0; i < classes.Length; i++)
            {
                string Ci = classes[i];
                ClassifyResult cr = new ClassifyResult();
                cr.classification = Ci;
                cr.probability = CaluProd(terms, Ci);
                crs.Add(cr);
            }
            return crs;
        }

        /// <summary>
        /// Returns the classification with the highest score.
        /// </summary>
        /// <param name="crs">Results produced by <see cref="Classify"/>.</param>
        /// <returns>
        /// The name of the best-scoring classification (the earliest one on ties), or an
        /// empty string when <paramref name="crs"/> is null or empty.
        /// </returns>
        public string GetMaxNum(List<ClassifyResult> crs)
        {
            if (crs == null || crs.Count == 0)
            {
                return string.Empty;
            }

            // Seed with the first entry's name as well as its score; otherwise a maximum
            // at index 0 would be reported as an empty classification.
            ClassifyResult best = crs[0];
            for (int i = 1; i < crs.Count; i++)
            {
                if (crs[i].probability > best.probability)
                {
                    best = crs[i];
                }
            }
            return best.classification;
        }
    }
}
代码太多,编辑的时候卡得很,于是剩下的部分另写一篇,见(二)。
浙公网安备 33010602011771号