我利用了吕震宇根据Free版ICTCLAS改编而成的.NET平台下的ICTCLAS,把ICTCLAS的分词功能为Lucene所用。以下是我写的程序,比较简单。大家看看评评,有什么要改进的地方,望大家指出。
Analyzer类:
Tokenizer类:
需分词句子:***,***,中华人民共和国在1949年建立,从此开始了新中国的伟大篇章.长春市长春节发表致词汉字abc iphone 1265325.98921 fee1212@tom.com http://news.qq.com 100%
分词结果:
(***,0,2)(***,4,6)(中华人民共和国,8,14)(1949年,16,20)(建立,21,22)(从此,24,25)(新,29,29)(中国,30,31)(伟大,33,34)(篇章,35,36)(长春市,38,40)(春节,42,43)(发表,44,45)(致词,46,47)(汉字,48,49)(abc,50,52)(iphone,53,58)(1265325.98921,59,71)(fee1212@tom,72,82)(com,84,86)(http://news,87,97)(qq,99,100)(com,102,104)(100%,105,108)
耗时00:00:00.0937500
Analyzer类:
1
using System;
2
using System.Collections.Generic;
3
using System.Text;
4
5
using Lucene.Net.Analysis;
6
using Lucene.Net.Analysis.Standard;
7
using System.IO;
8
9
namespace ICTCLASForLucene
{
    /// <summary>
    /// Lucene analyzer that tokenizes text with the SharpICTCLAS Chinese word
    /// segmenter and then applies the standard Lucene filter chain
    /// (StandardFilter, LowerCaseFilter, StopFilter).
    /// </summary>
    public class ICTCLASAnalyzer : Analyzer
    {
        // Stop words (Chinese and English) loaded from data\sNoise.txt,
        // one word per line. Size 368 matches the shipped noise file.
        public static readonly System.String[] CHINESE_ENGLISH_STOP_WORDS = new string[368];

        // Path of the stop-word file, relative to the working directory.
        public string NoisePath = Environment.CurrentDirectory + "\\data\\sNoise.txt";

        /// <summary>
        /// Loads the stop-word list from <see cref="NoisePath"/>.
        /// </summary>
        public ICTCLASAnalyzer()
        {
            // FIX: the original never disposed the StreamReader (file-handle
            // leak) and could overrun the fixed-size array if the noise file
            // contained more than 368 lines.
            using (StreamReader reader = new StreamReader(NoisePath, Encoding.UTF8))
            {
                int i = 0;
                string noise = reader.ReadLine();
                while (!string.IsNullOrEmpty(noise) && i < CHINESE_ENGLISH_STOP_WORDS.Length)
                {
                    CHINESE_ENGLISH_STOP_WORDS[i++] = noise;
                    noise = reader.ReadLine();
                }
            }
        }

        /// <summary>Constructs an <see cref="ICTCLASTokenizer"/> filtered by a
        /// StandardFilter, a LowerCaseFilter and a StopFilter.</summary>
        public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
        {
            TokenStream result = new ICTCLASTokenizer(reader);
            result = new StandardFilter(result);
            result = new LowerCaseFilter(result);
            result = new StopFilter(result, CHINESE_ENGLISH_STOP_WORDS);
            return result;
        }
    }
}
45
using System;2
using System.Collections.Generic;3
using System.Text;4

5
using Lucene.Net.Analysis;6
using Lucene.Net.Analysis.Standard;7
using System.IO;8

9
namespace ICTCLASForLucene
{
    /// <summary>
    /// Lucene analyzer that tokenizes text with the SharpICTCLAS Chinese word
    /// segmenter and then applies the standard Lucene filter chain
    /// (StandardFilter, LowerCaseFilter, StopFilter).
    /// </summary>
    public class ICTCLASAnalyzer : Analyzer
    {
        // Stop words (Chinese and English) loaded from data\sNoise.txt,
        // one word per line. Size 368 matches the shipped noise file.
        public static readonly System.String[] CHINESE_ENGLISH_STOP_WORDS = new string[368];

        // Path of the stop-word file, relative to the working directory.
        public string NoisePath = Environment.CurrentDirectory + "\\data\\sNoise.txt";

        /// <summary>
        /// Loads the stop-word list from <see cref="NoisePath"/>.
        /// </summary>
        public ICTCLASAnalyzer()
        {
            // FIX: the original never disposed the StreamReader (file-handle
            // leak) and could overrun the fixed-size array if the noise file
            // contained more than 368 lines.
            using (StreamReader reader = new StreamReader(NoisePath, Encoding.UTF8))
            {
                int i = 0;
                string noise = reader.ReadLine();
                while (!string.IsNullOrEmpty(noise) && i < CHINESE_ENGLISH_STOP_WORDS.Length)
                {
                    CHINESE_ENGLISH_STOP_WORDS[i++] = noise;
                    noise = reader.ReadLine();
                }
            }
        }

        /// <summary>Constructs an <see cref="ICTCLASTokenizer"/> filtered by a
        /// StandardFilter, a LowerCaseFilter and a StopFilter.</summary>
        public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
        {
            TokenStream result = new ICTCLASTokenizer(reader);
            result = new StandardFilter(result);
            result = new LowerCaseFilter(result);
            result = new StopFilter(result, CHINESE_ENGLISH_STOP_WORDS);
            return result;
        }
    }
}

Tokenizer类:
1
using System;
2
using System.Collections.Generic;
3
using System.Text;
4
5
using Lucene.Net.Analysis;
6
using SharpICTCLAS;
7
using System.IO;
8
9
namespace ICTCLASForLucene
{
    /// <summary>
    /// Lucene tokenizer that segments Chinese text with SharpICTCLAS.
    /// </summary>
    class ICTCLASTokenizer : Tokenizer
    {
        // FIX: dictionary initialization (InitWordSegment) is very expensive.
        // The original rebuilt it for every tokenizer instance, i.e. for every
        // field of every document; share one segmenter across all instances.
        private static WordSegment sharedSegmenter;
        private static readonly object initLock = new object();

        int nKind = 2;              // number of segmentation result paths requested
        List<WordResult[]> result;  // segmentation output; result[0] is the best path
        int startIndex = 0;         // running start offset of the next token
        int endIndex = 0;           // inclusive end offset of the last token
        int i = 1;                  // start at 1 to skip the sentence-begin sentinel

        /// <summary>
        /// The sentence to be segmented.
        /// </summary>
        private string sentence;

        /// <summary>Constructs a tokenizer for this Reader.</summary>
        public ICTCLASTokenizer(System.IO.TextReader reader)
        {
            this.input = reader;
            sentence = input.ReadToEnd();
            sentence = sentence.Replace("\r\n", "");

            // Initialize the shared segmenter exactly once (thread-safe).
            if (sharedSegmenter == null)
            {
                lock (initLock)
                {
                    if (sharedSegmenter == null)
                    {
                        string DictPath = Path.Combine(Environment.CurrentDirectory, "Data") + Path.DirectorySeparatorChar;
                        WordSegment ws = new WordSegment();
                        ws.InitWordSegment(DictPath);
                        sharedSegmenter = ws;
                    }
                }
            }
            result = sharedSegmenter.Segment(sentence, nKind);
        }

        /// <summary>
        /// Returns the next token in the stream, or null when the stream is
        /// exhausted. The trailing sentence-end sentinel is skipped
        /// (hence Length - 1), as is the leading one (i starts at 1).
        /// </summary>
        public override Token Next()
        {
            // FIX: guard against an empty segmentation result; the original
            // indexed result[0] unconditionally.
            if (result == null || result.Count == 0)
                return null;

            // FIX: the original used `while` with an unconditional return,
            // which is just an `if`.
            if (i < result[0].Length - 1)
            {
                string word = result[0][i].sWord;
                endIndex = startIndex + word.Length - 1;
                Token token = new Token(word, startIndex, endIndex);
                startIndex = endIndex + 1;
                i++;
                return token;
            }
            return null;
        }
    }
}
55
分词效果:
using System;2
using System.Collections.Generic;3
using System.Text;4

5
using Lucene.Net.Analysis;6
using SharpICTCLAS;7
using System.IO;8

9
namespace ICTCLASForLucene
{
    /// <summary>
    /// Lucene tokenizer that segments Chinese text with SharpICTCLAS.
    /// </summary>
    class ICTCLASTokenizer : Tokenizer
    {
        // FIX: dictionary initialization (InitWordSegment) is very expensive.
        // The original rebuilt it for every tokenizer instance, i.e. for every
        // field of every document; share one segmenter across all instances.
        private static WordSegment sharedSegmenter;
        private static readonly object initLock = new object();

        int nKind = 2;              // number of segmentation result paths requested
        List<WordResult[]> result;  // segmentation output; result[0] is the best path
        int startIndex = 0;         // running start offset of the next token
        int endIndex = 0;           // inclusive end offset of the last token
        int i = 1;                  // start at 1 to skip the sentence-begin sentinel

        /// <summary>
        /// The sentence to be segmented.
        /// </summary>
        private string sentence;

        /// <summary>Constructs a tokenizer for this Reader.</summary>
        public ICTCLASTokenizer(System.IO.TextReader reader)
        {
            this.input = reader;
            sentence = input.ReadToEnd();
            sentence = sentence.Replace("\r\n", "");

            // Initialize the shared segmenter exactly once (thread-safe).
            if (sharedSegmenter == null)
            {
                lock (initLock)
                {
                    if (sharedSegmenter == null)
                    {
                        string DictPath = Path.Combine(Environment.CurrentDirectory, "Data") + Path.DirectorySeparatorChar;
                        WordSegment ws = new WordSegment();
                        ws.InitWordSegment(DictPath);
                        sharedSegmenter = ws;
                    }
                }
            }
            result = sharedSegmenter.Segment(sentence, nKind);
        }

        /// <summary>
        /// Returns the next token in the stream, or null when the stream is
        /// exhausted. The trailing sentence-end sentinel is skipped
        /// (hence Length - 1), as is the leading one (i starts at 1).
        /// </summary>
        public override Token Next()
        {
            // FIX: guard against an empty segmentation result; the original
            // indexed result[0] unconditionally.
            if (result == null || result.Count == 0)
                return null;

            // FIX: the original used `while` with an unconditional return,
            // which is just an `if`.
            if (i < result[0].Length - 1)
            {
                string word = result[0][i].sWord;
                endIndex = startIndex + word.Length - 1;
                Token token = new Token(word, startIndex, endIndex);
                startIndex = endIndex + 1;
                i++;
                return token;
            }
            return null;
        }
    }
}

需分词句子:***,***,中华人民共和国在1949年建立,从此开始了新中国的伟大篇章.长春市长春节发表致词汉字abc iphone 1265325.98921 fee1212@tom.com http://news.qq.com 100%
分词结果:
(***,0,2)(***,4,6)(中华人民共和国,8,14)(1949年,16,20)(建立,21,22)(从此,24,25)(新,29,29)(中国,30,31)(伟大,33,34)(篇章,35,36)(长春市,38,40)(春节,42,43)(发表,44,45)(致词,46,47)(汉字,48,49)(abc,50,52)(iphone,53,58)(1265325.98921,59,71)(fee1212@tom,72,82)(com,84,86)(http://news,87,97)(qq,99,100)(com,102,104)(100%,105,108)
耗时00:00:00.0937500


浙公网安备 33010602011771号