我利用了吕震宇根据Free版ICTCLAS改编而成的.NET平台下的ICTCLAS,把ICTCLAS的分词功能为Lucene所用。以下是我写的程序,比较简单。大家看看评评,有什么要改进的地方,望大家指出。
Analyzer类:
Tokenizer类:
需分词句子:***,***,中华人民共和国在1949年建立,从此开始了新中国的伟大篇章.长春市长春节发表致词汉字abc iphone 1265325.98921 fee1212@tom.com http://news.qq.com 100%
分词结果:
(***,0,2)(***,4,6)(中华人民共和国,8,14)(1949年,16,20)(建立,21,22)(从此,24,25)(新,29,29)(中国,30,31)(伟大,33,34)(篇章,35,36)(长春市,38,40)(春节,42,43)(发表,44,45)(致词,46,47)(汉字,48,49)(abc,50,52)(iphone,53,58)(1265325.98921,59,71)(fee1212@tom,72,82)(com,84,86)(http://news,87,97)(qq,99,100)(com,102,104)(100%,105,108)
耗时00:00:00.0937500
Analyzer类:
1
using System;
2
using System.Collections.Generic;
3
using System.Text;
4
5
using Lucene.Net.Analysis;
6
using Lucene.Net.Analysis.Standard;
7
using System.IO;
8
9
namespace ICTCLASForLucene
{
    /// <summary>
    /// Lucene analyzer that tokenizes text with the SharpICTCLAS Chinese word
    /// segmenter and then applies the standard Lucene filter chain
    /// (StandardFilter, LowerCaseFilter, StopFilter).
    /// </summary>
    public class ICTCLASAnalyzer : Analyzer
    {
        // Stop words (Chinese and English) loaded from data\sNoise.txt,
        // one word per line. Size 368 matches the shipped noise file.
        public static readonly System.String[] CHINESE_ENGLISH_STOP_WORDS = new string[368];

        // Path of the stop-word file, relative to the working directory.
        public string NoisePath = Environment.CurrentDirectory + "\\data\\sNoise.txt";

        /// <summary>
        /// Loads the stop-word list from <see cref="NoisePath"/>.
        /// </summary>
        public ICTCLASAnalyzer()
        {
            // FIX: the original never disposed the StreamReader (file-handle
            // leak) and could overrun the fixed-size array if the noise file
            // contained more than 368 lines.
            using (StreamReader reader = new StreamReader(NoisePath, Encoding.UTF8))
            {
                int i = 0;
                string noise = reader.ReadLine();
                while (!string.IsNullOrEmpty(noise) && i < CHINESE_ENGLISH_STOP_WORDS.Length)
                {
                    CHINESE_ENGLISH_STOP_WORDS[i++] = noise;
                    noise = reader.ReadLine();
                }
            }
        }

        /// <summary>Constructs an <see cref="ICTCLASTokenizer"/> filtered by a
        /// StandardFilter, a LowerCaseFilter and a StopFilter.</summary>
        public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
        {
            TokenStream result = new ICTCLASTokenizer(reader);
            result = new StandardFilter(result);
            result = new LowerCaseFilter(result);
            result = new StopFilter(result, CHINESE_ENGLISH_STOP_WORDS);
            return result;
        }
    }
}
45
using System;2
using System.Collections.Generic;3
using System.Text;4

5
using Lucene.Net.Analysis;6
using Lucene.Net.Analysis.Standard;7
using System.IO;8

9
namespace ICTCLASForLucene
{
    /// <summary>
    /// Lucene analyzer that tokenizes text with the SharpICTCLAS Chinese word
    /// segmenter and then applies the standard Lucene filter chain
    /// (StandardFilter, LowerCaseFilter, StopFilter).
    /// </summary>
    public class ICTCLASAnalyzer : Analyzer
    {
        // Stop words (Chinese and English) loaded from data\sNoise.txt,
        // one word per line. Size 368 matches the shipped noise file.
        public static readonly System.String[] CHINESE_ENGLISH_STOP_WORDS = new string[368];

        // Path of the stop-word file, relative to the working directory.
        public string NoisePath = Environment.CurrentDirectory + "\\data\\sNoise.txt";

        /// <summary>
        /// Loads the stop-word list from <see cref="NoisePath"/>.
        /// </summary>
        public ICTCLASAnalyzer()
        {
            // FIX: the original never disposed the StreamReader (file-handle
            // leak) and could overrun the fixed-size array if the noise file
            // contained more than 368 lines.
            using (StreamReader reader = new StreamReader(NoisePath, Encoding.UTF8))
            {
                int i = 0;
                string noise = reader.ReadLine();
                while (!string.IsNullOrEmpty(noise) && i < CHINESE_ENGLISH_STOP_WORDS.Length)
                {
                    CHINESE_ENGLISH_STOP_WORDS[i++] = noise;
                    noise = reader.ReadLine();
                }
            }
        }

        /// <summary>Constructs an <see cref="ICTCLASTokenizer"/> filtered by a
        /// StandardFilter, a LowerCaseFilter and a StopFilter.</summary>
        public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
        {
            TokenStream result = new ICTCLASTokenizer(reader);
            result = new StandardFilter(result);
            result = new LowerCaseFilter(result);
            result = new StopFilter(result, CHINESE_ENGLISH_STOP_WORDS);
            return result;
        }
    }
}

Tokenizer类:
1
using System;
2
using System.Collections.Generic;
3
using System.Text;
4
5
using Lucene.Net.Analysis;
6
using SharpICTCLAS;
7
using System.IO;
8
9
namespace ICTCLASForLucene
{
    /// <summary>
    /// Lucene tokenizer that segments Chinese text with SharpICTCLAS.
    /// </summary>
    class ICTCLASTokenizer : Tokenizer
    {
        // FIX: dictionary initialization (InitWordSegment) is very expensive.
        // The original rebuilt it for every tokenizer instance, i.e. for every
        // field of every document; share one segmenter across all instances.
        private static WordSegment sharedSegmenter;
        private static readonly object initLock = new object();

        int nKind = 2;              // number of segmentation result paths requested
        List<WordResult[]> result;  // segmentation output; result[0] is the best path
        int startIndex = 0;         // running start offset of the next token
        int endIndex = 0;           // inclusive end offset of the last token
        int i = 1;                  // start at 1 to skip the sentence-begin sentinel

        /// <summary>
        /// The sentence to be segmented.
        /// </summary>
        private string sentence;

        /// <summary>Constructs a tokenizer for this Reader.</summary>
        public ICTCLASTokenizer(System.IO.TextReader reader)
        {
            this.input = reader;
            sentence = input.ReadToEnd();
            sentence = sentence.Replace("\r\n", "");

            // Initialize the shared segmenter exactly once (thread-safe).
            if (sharedSegmenter == null)
            {
                lock (initLock)
                {
                    if (sharedSegmenter == null)
                    {
                        string DictPath = Path.Combine(Environment.CurrentDirectory, "Data") + Path.DirectorySeparatorChar;
                        WordSegment ws = new WordSegment();
                        ws.InitWordSegment(DictPath);
                        sharedSegmenter = ws;
                    }
                }
            }
            result = sharedSegmenter.Segment(sentence, nKind);
        }

        /// <summary>
        /// Returns the next token in the stream, or null when the stream is
        /// exhausted. The trailing sentence-end sentinel is skipped
        /// (hence Length - 1), as is the leading one (i starts at 1).
        /// </summary>
        public override Token Next()
        {
            // FIX: guard against an empty segmentation result; the original
            // indexed result[0] unconditionally.
            if (result == null || result.Count == 0)
                return null;

            // FIX: the original used `while` with an unconditional return,
            // which is just an `if`.
            if (i < result[0].Length - 1)
            {
                string word = result[0][i].sWord;
                endIndex = startIndex + word.Length - 1;
                Token token = new Token(word, startIndex, endIndex);
                startIndex = endIndex + 1;
                i++;
                return token;
            }
            return null;
        }
    }
}
55
分词效果:
using System;2
using System.Collections.Generic;3
using System.Text;4

5
using Lucene.Net.Analysis;6
using SharpICTCLAS;7
using System.IO;8

9
namespace ICTCLASForLucene
{
    /// <summary>
    /// Lucene tokenizer that segments Chinese text with SharpICTCLAS.
    /// </summary>
    class ICTCLASTokenizer : Tokenizer
    {
        // FIX: dictionary initialization (InitWordSegment) is very expensive.
        // The original rebuilt it for every tokenizer instance, i.e. for every
        // field of every document; share one segmenter across all instances.
        private static WordSegment sharedSegmenter;
        private static readonly object initLock = new object();

        int nKind = 2;              // number of segmentation result paths requested
        List<WordResult[]> result;  // segmentation output; result[0] is the best path
        int startIndex = 0;         // running start offset of the next token
        int endIndex = 0;           // inclusive end offset of the last token
        int i = 1;                  // start at 1 to skip the sentence-begin sentinel

        /// <summary>
        /// The sentence to be segmented.
        /// </summary>
        private string sentence;

        /// <summary>Constructs a tokenizer for this Reader.</summary>
        public ICTCLASTokenizer(System.IO.TextReader reader)
        {
            this.input = reader;
            sentence = input.ReadToEnd();
            sentence = sentence.Replace("\r\n", "");

            // Initialize the shared segmenter exactly once (thread-safe).
            if (sharedSegmenter == null)
            {
                lock (initLock)
                {
                    if (sharedSegmenter == null)
                    {
                        string DictPath = Path.Combine(Environment.CurrentDirectory, "Data") + Path.DirectorySeparatorChar;
                        WordSegment ws = new WordSegment();
                        ws.InitWordSegment(DictPath);
                        sharedSegmenter = ws;
                    }
                }
            }
            result = sharedSegmenter.Segment(sentence, nKind);
        }

        /// <summary>
        /// Returns the next token in the stream, or null when the stream is
        /// exhausted. The trailing sentence-end sentinel is skipped
        /// (hence Length - 1), as is the leading one (i starts at 1).
        /// </summary>
        public override Token Next()
        {
            // FIX: guard against an empty segmentation result; the original
            // indexed result[0] unconditionally.
            if (result == null || result.Count == 0)
                return null;

            // FIX: the original used `while` with an unconditional return,
            // which is just an `if`.
            if (i < result[0].Length - 1)
            {
                string word = result[0][i].sWord;
                endIndex = startIndex + word.Length - 1;
                Token token = new Token(word, startIndex, endIndex);
                startIndex = endIndex + 1;
                i++;
                return token;
            }
            return null;
        }
    }
}

需分词句子:***,***,中华人民共和国在1949年建立,从此开始了新中国的伟大篇章.长春市长春节发表致词汉字abc iphone 1265325.98921 fee1212@tom.com http://news.qq.com 100%
分词结果:
(***,0,2)(***,4,6)(中华人民共和国,8,14)(1949年,16,20)(建立,21,22)(从此,24,25)(新,29,29)(中国,30,31)(伟大,33,34)(篇章,35,36)(长春市,38,40)(春节,42,43)(发表,44,45)(致词,46,47)(汉字,48,49)(abc,50,52)(iphone,53,58)(1265325.98921,59,71)(fee1212@tom,72,82)(com,84,86)(http://news,87,97)(qq,99,100)(com,102,104)(100%,105,108)
耗时00:00:00.0937500


浙公网安备 33010602011771号