Lucene.Net+KTDictSeg中文分词搭建全文检索引擎
What is Lucene——Apache Lucene
Apache Lucene(TM) is a high-performance, full-featured text search engine library written entirely in Java. It is a technology suitable for nearly any application that requires full-text search, especially cross-platform.
Apache Lucene is an open source project available for free download. Please use the links on the left to access Lucene.
使用Lucene的一些心得
最先用Lucene的时候不是使用Java下的开发包,而是用了.Net下的Lucene.Net.dll,用于搭建一个全文检索的引擎,用以全文检索用户上传的文档。效果图如下:
英文的全文检索相对于中文来说,简单一些,因为英文单词可以以空格为分割的依据,而中文的分割要根据语义,单词的解析模糊,很难界定。例如,最经典的:
长春市长春药店。→长春市|长春药店 →长春市长|春药店...
中文分词算法,分为几大类:基于字符串匹配的分词算法;基于理解的分词算法;基于统计的分词算法。
对于这些算法,网上有很多资料,感兴趣的话可以看看。
而我这个系统中,使用的是eaglet前辈的KTDictSeg分词组件,(eaglet前辈是著名的盘古分词的作者)。
OK,现在讲一讲我如何使用Lucene.Net.dll和KTDictSeg分词组件来实现全文检索:
1.为项目添加引用,如下:
Lucene.Net.dll
2.现在系统便有了Lucene和中文分词功能的支持。添加一个类,字段可以与你要索引的文件的字段一样。
这里我们假设我们添加了A类,字段有:DomID,Title,SynTime,HtmPath,KeyWords
3.添加完类,添加一个检索类,用于新建索引和搜索,代码如下:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.Text;
using System.Text.RegularExpressions;
using System.IO;
using Lucene.Net;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.QueryParsers;
using Lucene.Net.Search;
using Lucene.Net.Store;
using Lucene.Net.Util;
using Lucene.Net.Analysis.KTDictSeg;
using KTDictSeg.HighLight;

namespace LearnAndPracticeBLL
{
    /// <summary>
    /// Full-text indexing and search built on Lucene.Net, using the KTDictSeg
    /// Chinese word-segmentation analyzer and highlighter.
    /// </summary>
    public class DictLucenceSearch
    {
        // Total number of hits found by the most recent search.
        private int _count;
        public int Count
        {
            get { return _count; }
            set { _count = value; }
        }

        // Comma-separated list of the segmented search terms of the last query.
        private string _keyword;
        public string KeyWord
        {
            get { return _keyword; }
            set { _keyword = value; }
        }

        // Elapsed time of the last search, in seconds.
        private double _time;
        public double Time
        {
            get { return _time; }
            set { _time = value; }
        }

        // Physical path of the index directory. The setter maps a
        // site-relative folder name to an absolute server path.
        private string _myindex;

        public string Myindex
        {
            get { return _myindex; }
            set
            {
                _myindex = System.Web.HttpContext.Current.Server.MapPath("~/" + value);
            }
        }

        ///<summary>
        /// Adds one Lucene document per entry of <paramref name="mydocs"/> to the writer.
        ///</summary>
        ///<param name="writer">Open IndexWriter that receives the documents.</param>
        ///<param name="mydocs">Source documents to index.</param>
        public static void AddDocument(IndexWriter writer, List<LearnAndPracticeModel.Document> mydocs)
        {
            for (int i = 0; i < mydocs.Count; i++)
            {
                Lucene.Net.Documents.Document doc = new Lucene.Net.Documents.Document();
                Field DomID = new Field("DomID", mydocs[i].DomID.ToString(), Field.Store.YES, Field.Index.TOKENIZED);
                Field Title = new Field("Title", mydocs[i].Title, Field.Store.YES, Field.Index.TOKENIZED);
                Field HtmPath = new Field("HtmlPath", mydocs[i].HtmPath, Field.Store.YES, Field.Index.NO);
                Field KeyWords = new Field("KeyWords", mydocs[i].KeyWords, Field.Store.YES, Field.Index.TOKENIZED);
                // FIX: the original referenced an undeclared SynTime field. Create it here;
                // stored but not indexed, because Search() only reads it back for display.
                Field SynTime = new Field("SynTime", mydocs[i].SynTime.ToString(), Field.Store.YES, Field.Index.NO);
                Field Content = new Field("Content", GetHTML(mydocs[i].HtmPath), Field.Store.YES, Field.Index.TOKENIZED);

                // Boost title and keyword matches above body-content matches (default boost is 1).
                Title.SetBoost(1.5f);
                KeyWords.SetBoost(1.2f);

                doc.Add(DomID);
                doc.Add(Title);
                doc.Add(HtmPath);
                doc.Add(KeyWords);
                doc.Add(SynTime);
                doc.Add(Content);

                writer.AddDocument(doc);
            }
        }

        ///<summary>
        /// (Re)builds the index at <see cref="Myindex"/> for the given documents.
        ///</summary>
        ///<param name="doms">Documents to index.</param>
        public void Index(List<A> doms)
        {
            string INDEX_STORE_PATH = Myindex;

            // KTDictSeg analyzer for Chinese word segmentation.
            // (The Lucene default would be: new StandardAnalyzer().)
            Analyzer analyzer = new KTDictSegAnalyzer();

            // Ask FSDirectory to create the directory only when it does not exist yet.
            FSDirectory fsDir = FSDirectory.GetDirectory(INDEX_STORE_PATH, !System.IO.Directory.Exists(INDEX_STORE_PATH));

            // true = overwrite/recreate any existing index.
            IndexWriter writer = new IndexWriter(fsDir, analyzer, true);
            try
            {
                // NOTE(review): AddDocument is declared with List<LearnAndPracticeModel.Document>
                // but receives List<A> here — confirm A aliases that type; otherwise this
                // call will not compile (same as the original code).
                AddDocument(writer, doms);
                writer.Optimize();
            }
            finally
            {
                // FIX: always release the index write lock, even when indexing throws.
                writer.Close();
            }
        }

        ///<summary>
        /// Segments the search phrase and returns the terms joined for the query
        /// parser, each weighted as "word^3^rank". Also records the terms in
        /// <see cref="KeyWord"/> as a comma-separated list.
        ///</summary>
        ///<param name="keyWords">Raw phrase to segment.</param>
        ///<param name="ktTokenizer">Tokenizer performing the segmentation.</param>
        ///<returns>Space-separated weighted terms.</returns>
        public string GetKeyWordSplitBySpace(string keyWords, KTDictSegTokenizer ktTokenizer)
        {
            StringBuilder builder = new StringBuilder();
            StringBuilder keyList = new StringBuilder();
            List<FTAlgorithm.T_WordInfo> words = ktTokenizer.SegmentToWordInfos(keyWords);

            foreach (FTAlgorithm.T_WordInfo word in words)
            {
                if (word == null)
                {
                    continue;
                }

                // FIX: record the actual term (word.Word); the original concatenated the
                // T_WordInfo object itself, which stringifies via ToString().
                keyList.Append(word.Word).Append(',');
                // FIX: terms must be separated by a space (the method name promises it,
                // and the trailing Trim() expected it); the original fused all terms
                // into a single token.
                builder.AppendFormat("{0}^{1} ", word.Word, (int)Math.Pow(3, word.Rank));
            }

            // FIX: guard the empty-segmentation case; the original threw
            // NullReferenceException on KeyWord.Substring when no words were produced.
            if (keyList.Length > 0)
            {
                keyList.Length -= 1; // drop the trailing comma
            }
            KeyWord = keyList.ToString();
            return builder.ToString().Trim();
        }

        ///<summary>
        /// Searches the index and returns one page of highlighted results.
        ///</summary>
        ///<param name="keyWord">Raw (unsegmented) search phrase.</param>
        ///<param name="pageNumber">Zero-based index of the first record of the page.</param>
        ///<param name="pageSize">Number of records per page.</param>
        ///<returns>The matching page of results.</returns>
        public List<TheIndex> Search(string keyWord, int pageNumber, int pageSize)
        {
            string word = GetKeyWordSplitBySpace(keyWord, new KTDictSegTokenizer());

            IndexSearcher search = new IndexSearcher(Myindex);
            KTDictSegAnalyzer analyzer = new KTDictSegAnalyzer(true);

            // Query across the three tokenized fields.
            MultiFieldQueryParser parser = new MultiFieldQueryParser(new string[] { "Title", "Content", "KeyWords" }, analyzer);

            Query query = parser.Parse(word);
            Hits hits = search.Search(query);
            Count = hits.Length();

            // One past the last record of the requested page, clamped to the hit count.
            int num = Math.Min(Count, pageNumber + pageSize);

            DateTime begin = DateTime.Now;

            // Highlighting setup is loop-invariant; build it once. (The original
            // re-created the formatter and highlighter for every single hit.)
            KTDictSeg.HighLight.SimpleHTMLFormatter simpleHTMLFormatter =
                new KTDictSeg.HighLight.SimpleHTMLFormatter("<font color=\"red\">", "</font>");
            KTDictSeg.HighLight.Highlighter highlighter =
                new KTDictSeg.HighLight.Highlighter(simpleHTMLFormatter,
                    new Lucene.Net.Analysis.KTDictSeg.KTDictSegTokenizer());
            highlighter.FragmentSize = 300;

            List<TheIndex> theindexs = new List<TheIndex>();
            // FIX: start at pageNumber rather than 0 so only the requested page is
            // returned; the original always materialized records 0..num.
            for (int i = pageNumber; i < num; i++)
            {
                TheIndex theindex = new TheIndex();
                theindex.DomID = Convert.ToInt32(hits.Doc(i).Get("DomID"));
                theindex.Title = hits.Doc(i).Get("Title");

                // GetBestFragment returns null/empty when the keyword does not occur
                // in the field; keep the stored value in that case.
                string title = highlighter.GetBestFragment(keyWord, theindex.Title);
                if (!string.IsNullOrEmpty(title))
                {
                    theindex.Title = title;
                }

                theindex.HtmPath = hits.Doc(i).Get("HtmlPath");
                theindex.Content = hits.Doc(i).Get("Content");

                string content = highlighter.GetBestFragment(keyWord, theindex.Content);
                if (!string.IsNullOrEmpty(content))
                {
                    theindex.Content = content;
                }

                // Trim the content preview to at most 300 characters.
                theindex.Content = theindex.Content.Substring(0, Math.Min(theindex.Content.Length, 300));
                theindex.SynTime = Convert.ToDateTime(hits.Doc(i).Get("SynTime")).ToString("yyyy年MM月dd日");
                theindex.KeyWords = hits.Doc(i).Get("KeyWords");

                string keywords = highlighter.GetBestFragment(keyWord, theindex.KeyWords);
                if (!string.IsNullOrEmpty(keywords))
                {
                    theindex.KeyWords = keywords;
                }

                theindexs.Add(theindex);
            }

            DateTime end = DateTime.Now;
            Time = (end - begin).TotalMilliseconds / 1000.0;
            search.Close(); // release the searcher
            return theindexs;
        }

        ///<summary>
        /// Reads an HTML file (site-relative path, gb2312-encoded) and returns its
        /// plain-text body for indexing.
        ///</summary>
        ///<param name="Path">Site-relative path of the HTML file.</param>
        ///<returns>Stripped plain text of the page body.</returns>
        public static string GetHTML(string Path)
        {
            System.Text.Encoding encoding = System.Text.Encoding.GetEncoding("gb2312");
            StringBuilder text = new StringBuilder();

            // FIX: dispose the reader deterministically; the original leaked the
            // stream when an exception occurred mid-read.
            using (StreamReader sr = new StreamReader(HttpContext.Current.Server.MapPath("~/") + Path, encoding))
            {
                char[] buffer = new char[4096];
                int bufferFillSize;
                while ((bufferFillSize = sr.ReadBlock(buffer, 0, 4096)) > 0)
                {
                    // FIX: append only the characters actually read; the original appended
                    // the whole 4096-char buffer, padding the text with NUL characters
                    // whenever the final block was short.
                    text.Append(buffer, 0, bufferFillSize);
                }
            }

            return findUsedFromHtml(text.ToString());
        }

        ///<summary>
        /// Extracts the body section of an HTML page and strips its markup.
        /// Falls back to stripping the whole document when no body tags are found.
        ///</summary>
        ///<param name="strHtml">Full HTML source.</param>
        ///<returns>Plain text of the body.</returns>
        private static string findUsedFromHtml(string strHtml)
        {
            // FIX: case-insensitive tag search; the original missed <BODY>.
            int bodyStart = strHtml.IndexOf("<body", StringComparison.OrdinalIgnoreCase);
            int bodyEnd = strHtml.IndexOf("</body>", StringComparison.OrdinalIgnoreCase);

            // FIX: the original threw ArgumentOutOfRangeException when either tag
            // was missing; fall back to the entire document instead.
            if (bodyStart < 0 || bodyEnd < bodyStart)
            {
                return StripHTML(strHtml);
            }

            // "+ 7" includes the closing </body> tag; StripHTML removes it again.
            return StripHTML(strHtml.Substring(bodyStart, bodyEnd - bodyStart + 7));
        }

        ///<summary>
        /// Removes HTML tags, scripts and comments and decodes the most common
        /// character entities, leaving plain text.
        ///</summary>
        ///<param name="HTML">HTML source.</param>
        ///<returns>Plain-text result.</returns>
        public static string StripHTML(string HTML)
        {
            string[] Regexs =
            {
                @"<body[^>]*?>",
                @"<script[^>]*?>.*?</script>",
                @"<(\/\s*)?!?((\w+:)?\w+)(\w+(\s*=?\s*(([""'])(\\[""'tbnr]|[^\7])*?\7|\w+)|.{0})|\s)*?(\/\s*)?>",
                @"([\r\n])[\s]+",
                @"&(quot|#34);",
                @"&(amp|#38);",
                @"&(lt|#60);",
                @"&(gt|#62);",
                @"&(nbsp|#160);",
                @"&(iexcl|#161);",
                @"&(cent|#162);",
                @"&(pound|#163);",
                @"&(copy|#169);",
                @"&#(\d+);",
                @"-->",
                @"<!--.*\n"
            };

            string[] Replaces =
            {
                "",
                "",
                "",
                "",
                "\"",
                "&",
                "<",
                ">",
                " ",
                "\xa1", //chr(161),
                "\xa2", //chr(162),
                "\xa3", //chr(163),
                "\xa9", //chr(169),
                "",
                "\r\n",
                ""
            };

            string s = HTML;
            for (int i = 0; i < Regexs.Length; i++)
            {
                s = new Regex(Regexs[i], RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(s, Replaces[i]);
            }
            // FIX: string.Replace returns a new string; the original discarded the
            // result, so stray angle brackets were never actually removed.
            s = s.Replace("<", "").Replace(">", "");
            return s;
        }
    }
}
新建完这个类,我们就可以为文档添加索引并进行搜索了,呵呵。下次再写写在Java下的实现。
呵呵,博客新手+初级程序员,所以写出的文章难免干巴巴并且涉及比较少,请不要见笑。最近实习也比较忙,初稿就这样,呵呵,有什么问题,请发到我的邮箱:three_zone@163.com
多聚旅游 聚游宝 学友网