RavenDB自定义analyzer

备注: 1 因为测试的时候我用的是本地一个书库内容, 数据写入的代码就省略了

        2 中科院分词部分可以到中科院网站下载 http://ictclas.org/

        3 上面利用中科院分词的ICTCLASAnalyzer, 已经在项目中应用,效率还可以

另: 本来打算用盘古分词来做测试的, 但是盘古分词对 Lucene 2.9 做了些修改, 所以 RavenDB 必须重新编译, 有点麻烦, 就跳过了; 操作方法应该基本一致

 

环境: RavenDB Build-121 (Lucene.Net 2.9.2.1)

自定义分词: 中科院分词

开始:

Ravendb.net上关于自定义分词有如下描述:

You can also create your own custom analyzer, compile it to a dll and drop it in in directory called "Analyzers" under the RavenDB base directory. Afterward, you can then use the fully qualified type name of your custom analyzer as the analyzer for a particular field.

但在实验过程中, 发现自定义分词应该与 Server/Lucene.Net.dll 在同一目录下(即根目录)

 

Analyzer:

代码
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;
using System.Collections;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;

namespace LuceneNetChinese
{
/// <summary>
/// Lucene.Net analyzer that tokenizes text with <c>ICTCLASTokenizer</c> and then
/// applies <c>StandardFilter</c>, <c>LowerCaseFilter</c> and a <c>StopFilter</c>
/// built from the stop words loaded in the constructor.
/// </summary>
public class ICTCLASAnalyzer : Analyzer
{
    /// <summary>
    /// Stop words to filter out of the token stream; populated by the constructor.
    /// </summary>
    public string[] CHINESE_ENGLISH_STOP_WORDS;

    /// <summary>
    /// Path of the UTF-8 stop-word file (one word per line). It is a relative path,
    /// so it is resolved against the process's current directory.
    /// </summary>
    public string StopPath = @"Stopwords.txt";

    /// <summary>
    /// Loads the stop-word list from <see cref="StopPath"/>.
    /// </summary>
    /// <remarks>
    /// Reading stops at the end of the file or at the first empty line, whichever
    /// comes first. Any exception raised while opening or reading the file
    /// propagates to the caller.
    /// </remarks>
    public ICTCLASAnalyzer()
    {
        List<string> stopWords = new List<string>();

        // The using block closes the file handle even if reading throws;
        // previously the reader was never closed.
        using (StreamReader reader = new StreamReader(StopPath, System.Text.Encoding.UTF8))
        {
            string noise = reader.ReadLine();
            while (!string.IsNullOrEmpty(noise))
            {
                stopWords.Add(noise);
                noise = reader.ReadLine();
            }
        }

        CHINESE_ENGLISH_STOP_WORDS = stopWords.ToArray();
    }

    /// <summary>
    /// Builds the analysis chain for a field: <c>ICTCLASTokenizer</c>, then
    /// <c>StandardFilter</c>, <c>LowerCaseFilter</c> and <c>StopFilter</c>.
    /// </summary>
    /// <param name="fieldName">Name of the field being analyzed (not used by this analyzer).</param>
    /// <param name="reader">Reader supplying the text to tokenize.</param>
    /// <returns>The filtered token stream.</returns>
    public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
    {
        TokenStream result = new ICTCLASTokenizer(reader);
        result = new StandardFilter(result);
        result = new LowerCaseFilter(result);
        result = new StopFilter(result, CHINESE_ENGLISH_STOP_WORDS);
        return result;
    }
}
}

 

Tokenizer:

 

代码
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using Lucene.Net.Analysis;
using System.Text.RegularExpressions;
using System.Windows.Forms;
using System.Runtime.InteropServices;

namespace LuceneNetChinese
{
/// <summary>
/// Lucene.Net tokenizer that segments text with the native ICTCLAS library
/// (ICTCLAS30.dll) through P/Invoke.
/// </summary>
/// <remarks>
/// The whole input is read and segmented in the constructor; <see cref="Next()"/>
/// then hands out the precomputed words one at a time.
/// NOTE(review): the native word-count / process-words calls are made as a pair
/// and look stateful; concurrent use from several threads has not been verified.
/// </remarks>
class ICTCLASTokenizer : Tokenizer
{
    // Guards the one-time native initialization.
    private static readonly object InitLock = new object();
    private static bool initialized;

    // Segmentation results for the whole input, filled in by the constructor.
    result_t[] result;

    // The input text encoded with Encoding.Default; result_t.start/length index into this array.
    byte[] bytes;

    // Index of the next entry of "result" to return from Next().
    int i = 0;

    /// <summary>
    /// Reads all text from <paramref name="reader"/> and segments it with ICTCLAS.
    /// </summary>
    /// <param name="reader">Reader supplying the text to tokenize.</param>
    /// <exception cref="ArgumentNullException"><paramref name="reader"/> is null.</exception>
    /// <exception cref="InvalidOperationException">The native library failed to initialize.</exception>
    public ICTCLASTokenizer(System.IO.TextReader reader)
    {
        if (reader == null)
        {
            throw new ArgumentNullException("reader");
        }

        this.input = reader;
        string sentence = input.ReadToEnd();

        EnsureInitialized();

        // CharSet.Ansi marshalling and Encoding.Default both use the system ANSI
        // code page, so these bytes should match what the native side sees.
        bytes = Encoding.Default.GetBytes(sentence);
        if (bytes.Length == 0)
        {
            // Nothing to segment; skip the native calls entirely.
            result = new result_t[0];
            return;
        }

        int count = ICTCLAS_GetParagraphProcessAWordCount(sentence);

        // Guard against a non-positive count so the array allocation cannot throw.
        result = new result_t[Math.Max(count, 0)];
        if (result.Length > 0)
        {
            ICTCLAS_ParagraphProcessAW(count, result);
        }
    }

    /// <summary>
    /// Initializes the native library once per process instead of once per tokenizer.
    /// A failed attempt is not remembered, so the next tokenizer retries.
    /// </summary>
    private static void EnsureInitialized()
    {
        lock (InitLock)
        {
            if (initialized)
            {
                return;
            }

            if (!ICTCLAS_Init(null))
            {
                throw new InvalidOperationException("Init ICTCLAS failed!");
            }

            initialized = true;
        }
    }

    /// <summary>
    /// Returns the next segmented word as a token, or null when all words have been returned.
    /// </summary>
    /// <remarks>
    /// NOTE(review): the token start/end offsets are the byte offsets reported by
    /// ICTCLAS (positions in the Encoding.Default byte array), not character
    /// offsets in the original string. For multi-byte text these differ — confirm
    /// whether any consumer relies on the offsets.
    /// </remarks>
    public override Token Next()
    {
        if (i >= result.Length)
        {
            return null;
        }

        result_t word = result[i];

        // Decode the word's byte range directly instead of copying it into a temporary array.
        string text = Encoding.Default.GetString(bytes, word.start, word.length);
        Token token = new Token(text, word.start, word.start + word.length);

        i++;
        return token;
    }


    const string path = @"ICTCLAS30.dll";

    // NOTE(review): the bool-returning imports below are marshalled as a 1-byte
    // value, presumably matching a C++ "bool" in the native header (confirm).
    // Reading only the low byte is also correct for a 4-byte BOOL that is 0 or 1.

    [DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_Init")]
    [return: MarshalAs(UnmanagedType.I1)]
    public static extern bool ICTCLAS_Init(String sInitDirPath);

    // NOTE(review): a String return value makes the marshaller free the returned
    // native pointer; verify the library actually hands over ownership before using this.
    [DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_ParagraphProcess")]
    public static extern String ICTCLAS_ParagraphProcess(String sParagraph, int bPOStagged);

    [DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_Exit")]
    [return: MarshalAs(UnmanagedType.I1)]
    public static extern bool ICTCLAS_Exit();

    [DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_ImportUserDict")]
    public static extern int ICTCLAS_ImportUserDict(String sFilename);

    [DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_FileProcess")]
    [return: MarshalAs(UnmanagedType.I1)]
    public static extern bool ICTCLAS_FileProcess(String sSrcFilename, String sDestFilename, int bPOStagged);

    [DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_FileProcessEx")]
    [return: MarshalAs(UnmanagedType.I1)]
    public static extern bool ICTCLAS_FileProcessEx(String sSrcFilename, String sDestFilename);

    // Called before ICTCLAS_ParagraphProcessAW to size the result array.
    [DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_GetParagraphProcessAWordCount")]
    static extern int ICTCLAS_GetParagraphProcessAWordCount(String sParagraph);

    // Fills "result" with nCount entries for the paragraph passed to the count call above.
    [DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_ParagraphProcessAW")]
    static extern void ICTCLAS_ParagraphProcessAW(int nCount, [Out, MarshalAs(UnmanagedType.LPArray)] result_t[] result);

    [DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_AddUserWord")]
    static extern int ICTCLAS_AddUserWord(String sWord);

    [DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_SaveTheUsrDic")]
    static extern int ICTCLAS_SaveTheUsrDic();

    [DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_DelUsrWord")]
    static extern int ICTCLAS_DelUsrWord(String sWord);
}

/// <summary>
/// Managed mirror of one segmentation result record filled in by
/// ICTCLAS_ParagraphProcessAW: eight 32-bit integers at fixed offsets (32 bytes).
/// </summary>
[StructLayout(LayoutKind.Explicit)]
public struct result_t
{
    /// <summary>Start of the word; used by the tokenizer as an index into the encoded byte array.</summary>
    [FieldOffset(0)] public int start;

    /// <summary>Length of the word; used by the tokenizer as a byte count.</summary>
    [FieldOffset(4)] public int length;

    // The remaining fields are populated by the native library but are not read
    // by the managed code in this file.
    [FieldOffset(8)] public int sPos;
    [FieldOffset(12)] public int sPosLow;
    [FieldOffset(16)] public int POS_id;
    [FieldOffset(20)] public int word_ID;
    [FieldOffset(24)] public int word_type;
    [FieldOffset(28)] public int weight;
}
}

 

 

dll放置如下:

测试:

 

实体
/// <summary>
/// Test document: a book with an identifier, a name, a description and its chapters.
/// </summary>
public class Book
{
    /// <summary>Numeric identifier of the book.</summary>
    public long BookID { get; set; }

    /// <summary>Title of the book.</summary>
    public string BookName { get; set; }

    /// <summary>Free-text description of the book.</summary>
    public string Description { get; set; }

    /// <summary>Chapters belonging to the book; null until assigned.</summary>
    public List<Chapter> Chapters { get; set; }
}

/// <summary>
/// Test document part: a single chapter of a book.
/// </summary>
public class Chapter
{
    /// <summary>Numeric identifier of the chapter.</summary>
    public long chapterID { get; set; }

    /// <summary>Title of the chapter.</summary>
    public string ChapterName { get; set; }
}

 

 

数据写入(略)

生成索引:

 

代码
// Registers the "idx_Description" index, which maps Book.Description and analyzes
// that field with ICTCLASAnalyzer. The document store is wrapped in a using block
// so it is disposed once the index has been put (it was previously never disposed).
using (var documentStore1 = new DocumentStore { Url = "http://localhost:8080" }.Initialize())
using (var session = documentStore1.OpenSession())
{
    session.DatabaseCommands.PutIndex(
        "idx_Description",
        new IndexDefinition
        {
            Map = @"from b in docs.Books
where b.BookID > 0
select new { b.Description }
",
            Analyzers =
            {
                // The server resolves the analyzer from its assembly-qualified type name.
                { "Description", typeof(ICTCLASAnalyzer).AssemblyQualifiedName }
            }
        });
}

 

查询测试:

 

 

            // Queries the "idx_Description" index for the term and prints how many of the
            // first five matches came back. The document store is disposed when the query
            // is done (it was previously never disposed).
            using (var documentStore1 = new DocumentStore { Url = "http://localhost:8080" }.Initialize())
            using (var session = documentStore1.OpenSession())
            {
                #region Search
                var books = session.LuceneQuery<Book>("idx_Description").Where("Description:病毒")
                    .Take(5).ToArray();

                // books is an array, so use Length rather than the LINQ Count() extension.
                Console.WriteLine(books.Length);
                #endregion
            }
posted @ 2010-09-26 13:41  wiseshrek  阅读(1096)  评论(0编辑  收藏  举报