|
公告
-
残荷听雨,梨花飞雪,落英缤纷时节。晓来谁染枫林醉?点点都是离人泪
活着,就是快乐!自信,就是美丽!
有人爱,就是幸福。
春天来了
但愿野百合也有春天
第三季度的计划
木了
晚上一个人看会儿《读者乡土人文版》,听会儿广播挺不错的,想起了三年前在石家庄没电脑的日子,时光飞逝呀,现在笔记本都用上了,以前从没想过,确实得知足常乐。
日历
| | 日 | 一 | 二 | 三 | 四 | 五 | 六 |
|---|
| 27 | 28 | 29 | 30 | 1 | 2 | 3 | | 4 | 5 | 6 | 7 | 8 | 9 | 10 | | 11 | 12 | 13 | 14 | 15 | 16 | 17 | | 18 | 19 | 20 | 21 | 22 | 23 | 24 | | 25 | 26 | 27 | 28 | 29 | 30 | 31 | | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
|
统计
- 随笔 - 248
- 文章 - 2
- 评论 - 2319
- 引用 - 75
导航
常用链接
留言簿
我参与的团队
我的标签
随笔分类
随笔档案
相册
朋友
积分与排名
最新评论

阅读排行榜
评论排行榜
60天内阅读排行
|
蛙蛙推荐:蛙蛙教你索引邮件
困了,不多说了,就是教你怎样把邮件建立索引,再搜索出来。用MAPI把邮件读取到数据库里,用SharpICTCLAS做一个lucene的中文的语汇单元分析器,用lucene建立索引及查询索引。
把某目录邮件读取到数据库里的代码很简单
/// <summary>
/// Opens a MAPI session through Outlook, imports every mail found in the "chat"
/// sub-folder of the first top-level folder, and ends the session again.
/// </summary>
private static void getmails()
{
    _Application appOutlook = new Application();
    NameSpace outlookNS = appOutlook.GetNamespace("MAPI");
    outlookNS.Logon("", null, null, null);
    try
    {
        Console.WriteLine(outlookNS.Folders.Count);
        // NOTE(review): index 1 and the folder name "chat" are hard-coded; Outlook
        // collections are documented as 1-based, so this is the first top-level folder.
        MAPIFolder inboxFolder = outlookNS.Folders[1].Folders["chat"];
        HandlerFolder(inboxFolder);
    }
    finally
    {
        // End the session even when the folder lookup or the import throws.
        outlookNS.Logoff();
    }
}

/// <summary>
/// Walks the items of <paramref name="inboxFolder"/>, prints the received time and
/// subject of each mail item, and hands it to <c>MailDAO.Add_Mail_SSH</c>.
/// Items that are not mail items are skipped; a failure while saving one mail is
/// written to the console and does not stop the loop.
/// </summary>
/// <param name="inboxFolder">Folder whose items are imported.</param>
private static void HandlerFolder(MAPIFolder inboxFolder)
{
    foreach (object entry in inboxFolder.Items)
    {
        MailItem message = entry as MailItem;
        if (message == null)
        {
            // Not a mail item; nothing to import.
            continue;
        }

        Console.WriteLine("ReceivedTime:{0}\r\nSubject:{1}", message.ReceivedTime, message.Subject);
        try
        {
            MailDAO.Add_Mail_SSH(
                message.Subject,
                message.Body,
                message.ReceivedTime,
                message.To,
                message.SenderName,
                message.SenderEmailAddress,
                message.CC ?? "",
                message.BCC ?? "");
        }
        catch (Exception saveError)
        {
            Console.WriteLine(saveError);
        }
    }
}


为了方便测试,先弄一些假数据
/// <summary>
/// Test stub of the mail data-access class: serves a fixed set of sample mails
/// so that indexing and searching can be tried without a real mail store.
/// </summary>
class MailDAO
{
    // Values shared by every sample mail.
    private const string SampleMailTo = "onlytiancai@sohu.com;onlytiancai@163.com;onlytiancai@msn.com;";
    private const string SampleBcc = "onlytiancai@qq.com;onlytiancai@sina.com;onlytiancai@gmail.com";
    private const string SampleSenderEmailAddress = "onlytiancai@live.com";
    private const string SampleSenderName = "蛙蛙王子";

    // Exact layout of the receive-time literals passed to CreateSampleMail.
    private const string ReceiveTimeFormat = "yyyy-MM-dd HH:mm";

    /// <summary>
    /// Builds the list of sample mails.
    /// </summary>
    /// <returns>A new list containing four sample mails.</returns>
    public static IList<Email> GetMails()
    {
        List<Email> ret = new List<Email>();
        ret.Add(CreateSampleMail("倡议:大家做一个.net开源的灾难管理系统",
            "onlytiancai@yahoo.com.cn;onlytiancai@126.com", "2008-05-18 12:19"));
        ret.Add(CreateSampleMail("[置顶]蛙蛙推荐:蛙蛙教你文本聚类",
            "onlytiancai@yahoo.com.cn;onlytiancai@126.com", "2008-05-10 20:43"));
        ret.Add(CreateSampleMail("蛙蛙推荐:蛙蛙牌关键词提取算法",
            "onlytiancai@yahoo.com.cn;onlytiancai@126.com", "2008-05-11 23:34"));
        ret.Add(CreateSampleMail("蛙蛙推荐:蛙蛙牌软件注册码算法",
            "onlytiancai@yahoo.com.cn;onlytiancai@fetionmm.com", "2008-05-03 21:55"));
        return ret;
    }

    /// <summary>
    /// Creates one sample mail; the body is set to the same text as the subject.
    /// </summary>
    /// <param name="subject">Text used for both subject and body.</param>
    /// <param name="cc">Semicolon-separated carbon-copy addresses.</param>
    /// <param name="receiveTime">Receive time written as "yyyy-MM-dd HH:mm".</param>
    /// <returns>The populated mail.</returns>
    private static Email CreateSampleMail(string subject, string cc, string receiveTime)
    {
        Email mail = new Email();
        mail.Subject = subject;
        mail.Body = subject;
        mail.MailTo = SampleMailTo;
        mail.Cc = cc;
        mail.Bcc = SampleBcc;
        mail.SenderEmailAddress = SampleSenderEmailAddress;
        mail.SenderName = SampleSenderName;
        // Parse with a fixed format and the invariant culture so the sample dates do
        // not depend on the calendar or date layout of the machine running the test.
        mail.ReceiveTime = DateTime.ParseExact(receiveTime, ReceiveTimeFormat,
            System.Globalization.CultureInfo.InvariantCulture);
        return mail;
    }
}


像收件人,抄送地址等用一个简单的语汇单元分析器就可以,代码如下,因为token的位置信息只在分析的时候有用,只有位置增量才会写到索引里,所以位置信息都写了-1
 /// <summary>
/// Tokenizer for strings whose entries are separated by simple delimiter
/// characters, such as recipient address lists. The whole input is read in the
/// constructor, lower-cased, stripped of single quotes, and has double quotes
/// replaced by spaces; <see cref="Next"/> then hands out the non-blank pieces
/// one by one.
/// </summary>
public class WawaSimpleTokenizer : Tokenizer
{
    // Delimiters: space, tab, CR, LF, braces, parentheses, colon, semicolon, dot.
    // CR is included so that "\r\n" line endings do not leave a trailing '\r'
    // attached to the last token of a line.
    private static readonly char[] Separators =
        new char[] { ' ', '\t', '\r', '\n', '{', '}', '(', ')', ':', ';', '.' };

    private readonly string _txt;
    private List<string> _filter;   // pieces to emit; built on the first Next() call
    private int _current;           // index of the next piece to emit

    /// <summary>
    /// Reads <paramref name="reader"/> to the end and normalizes the text.
    /// </summary>
    /// <param name="reader">Source of the text to tokenize.</param>
    public WawaSimpleTokenizer(TextReader reader)
    {
        input = reader;
        string raw = input.ReadToEnd();
        // Culture-invariant lower-casing: the terms are addresses/identifiers, so
        // the result must not vary with the current UI culture.
        _txt = raw.ToLowerInvariant().Replace("'", "").Replace('"', ' ');
    }

    /// <summary>
    /// Returns the next token, or <c>null</c> once every piece has been returned.
    /// </summary>
    /// <returns>
    /// A token whose start and end offsets are both -1 (placeholders; this
    /// tokenizer does not track offsets), or <c>null</c> at the end of the input.
    /// </returns>
    public override Token Next()
    {
        if (_filter == null)
        {
            _filter = SplitIntoPieces(_txt);
            _current = 0;
        }

        if (_current >= _filter.Count)
        {
            return null;
        }

        string word = _filter[_current];
        _current++;
        return new Token(word, -1, -1);
    }

    // Splits the text on the delimiter set and drops empty or whitespace-only pieces.
    private static List<string> SplitIntoPieces(string text)
    {
        List<string> pieces = new List<string>();
        foreach (string piece in text.Split(Separators))
        {
            if (piece.Trim().Length > 0)
            {
                pieces.Add(piece);
            }
        }
        return pieces;
    }
}


把SharpICTCLAS适配成lucene的分析器,已经有人做了,直接拿过来用,链接如下
http://www.cnblogs.com/birdshover/archive/2008/03/26/1122305.html
建立索引的代码如下,写了些注释,不多说了就
/// <summary>
/// Builds the mail index: creates an <c>IndexWriter</c> with per-field analyzers,
/// adds one document per mail returned by <c>MailDAO.GetMails()</c>, then
/// optimizes and closes the index.
/// </summary>
public class MailIndexer
{
    protected string _indexDirectory;
    protected IndexWriter _writer = null;

    /// <summary>
    /// Creates the analyzers and the index writer and applies the writer settings.
    /// </summary>
    protected virtual void setUp()
    {
        // Dictionary directory handed to the Chinese analyzer.
        string dictPath = Path.Combine(Environment.CurrentDirectory, "Data") + Path.DirectorySeparatorChar;
        SharpICTCLASAnalyzer sharpICTCLASAnalyzer = new SharpICTCLASAnalyzer(dictPath);

        // Per-field wrapper: subject, body and any other field fall back to the
        // Chinese analyzer ...
        PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(sharpICTCLASAnalyzer);

        // ... while the address lists (To, Cc, Bcc) use the delimiter-based analyzer.
        WawaSimpleAnalyzer simpleAnalyzer = new WawaSimpleAnalyzer();
        wrapper.AddAnalyzer("MailTo", simpleAnalyzer);
        wrapper.AddAnalyzer("Cc", simpleAnalyzer);
        wrapper.AddAnalyzer("Bcc", simpleAnalyzer);

        // Index directory.
        _indexDirectory = Path.Combine(Environment.CurrentDirectory, "mailindex") + Path.DirectorySeparatorChar;

        _writer = new IndexWriter(_indexDirectory, wrapper, true); // third argument: create flag
        _writer.SetUseCompoundFile(true);        // explicitly use the compound index file format
        _writer.SetMaxFieldLength(int.MaxValue); // do not cap the field length
        _writer.SetMergeFactor(100);             // merge factor: 100 segments per merge
        _writer.SetMaxMergeDocs(10000);          // maximum number of documents in one segment
        _writer.SetMaxBufferedDocs(1000);        // documents buffered in memory before a flush to disk
    }

    /// <summary>
    /// Runs the whole indexing pass. A failure on one mail is logged and the loop
    /// continues; any other failure is logged. The writer is always closed.
    /// </summary>
    public void ExeCute()
    {
        try
        {
            setUp();
            IList<Email> mails = MailDAO.GetMails();
            foreach (Email mail in mails)
            {
                try
                {
                    Console.WriteLine("正在索引:{0}", mail.Subject);
                    _writer.AddDocument(BuildDocument(mail));
                }
                catch (Exception ex)
                {
                    Console.WriteLine("索引出错:{0},{1}", mail.Subject, ex);
                }
            }
        }
        catch (Exception ex)
        {
            Console.WriteLine("Run:{0}", ex);
        }
        finally
        {
            close();
        }
    }

    // Maps one mail to a Lucene document; null text values are indexed as "".
    private static Document BuildDocument(Email mail)
    {
        Document doc = new Document();
        // Subject: tokenized, indexed, stored.
        doc.Add(new Field("Subject", mail.Subject ?? "",
            Field.Store.YES, Field.Index.TOKENIZED));
        // Body: tokenized, indexed, not stored.
        doc.Add(new Field("Body", mail.Body ?? "",
            Field.Store.NO, Field.Index.TOKENIZED));
        // Receive time: indexed as a single untokenized yyyyMMdd term and stored.
        // Formatted with the invariant culture so the term is the same on every machine.
        doc.Add(new Field("ReceiveTime",
            mail.ReceiveTime.ToString("yyyyMMdd", System.Globalization.CultureInfo.InvariantCulture),
            Field.Store.YES, Field.Index.UN_TOKENIZED));
        // Recipients: tokenized, indexed, stored.
        doc.Add(new Field("MailTo", mail.MailTo ?? "",
            Field.Store.YES, Field.Index.TOKENIZED));
        // Sender name: untokenized, indexed, stored.
        doc.Add(new Field("SenderName", mail.SenderName ?? "",
            Field.Store.YES, Field.Index.UN_TOKENIZED));
        // Sender address: untokenized, indexed, stored.
        doc.Add(new Field("SenderEmailAddress", mail.SenderEmailAddress ?? "",
            Field.Store.YES, Field.Index.UN_TOKENIZED));
        // Cc / Bcc: tokenized, indexed, stored. These were previously added with
        // Field.Index.NO, which left them unsearchable even though analyzers are
        // registered for both fields.
        doc.Add(new Field("Cc", mail.Cc ?? "",
            Field.Store.YES, Field.Index.TOKENIZED));
        doc.Add(new Field("Bcc", mail.Bcc ?? "",
            Field.Store.YES, Field.Index.TOKENIZED));
        return doc;
    }

    // Optimizes and closes the writer; failures are logged, never thrown.
    private void close()
    {
        if (_writer == null)
        {
            // setUp failed before the writer was created; nothing to close.
            return;
        }

        try
        {
            try
            {
                _writer.Optimize();
            }
            finally
            {
                // Close even when Optimize throws.
                _writer.Close();
            }
        }
        catch (Exception ex)
        {
            Console.WriteLine("Close:{0}", ex);
        }
    }
}


搜索的代码如下
/// <summary>
/// Runs queries against the mail index. A query is parsed against the
/// Subject, Body, MailTo, Cc and Bcc fields with a per-field analyzer wrapper.
/// </summary>
/// <remarks>
/// NOTE(review): the <c>IndexSearcher</c> created in the constructor is never
/// closed by this class.
/// </remarks>
public class MailSearcher
{
    protected string _indexDirectory;
    protected IndexSearcher _searcher = null;
    private MultiFieldQueryParser _mfqp;

    /// <summary>
    /// Opens the index under the current directory and prepares the query parser.
    /// </summary>
    public MailSearcher()
    {
        // Index directory.
        _indexDirectory = Path.Combine(Environment.CurrentDirectory, "mailindex") + Path.DirectorySeparatorChar;
        _searcher = new IndexSearcher(_indexDirectory);

        PerFieldAnalyzerWrapper queryAnalyzer = CreateAnalyzer();
        string[] searchedFields = new string[] { "Subject", "Body", "MailTo", "Cc", "Bcc" };
        _mfqp = new MultiFieldQueryParser(searchedFields, queryAnalyzer);
    }

    // Chinese analyzer by default; delimiter-based analyzer for the address fields.
    private static PerFieldAnalyzerWrapper CreateAnalyzer()
    {
        // Dictionary directory handed to the Chinese analyzer.
        string dictionaryDirectory = Path.Combine(Environment.CurrentDirectory, "Data") + Path.DirectorySeparatorChar;
        PerFieldAnalyzerWrapper perField =
            new PerFieldAnalyzerWrapper(new SharpICTCLASAnalyzer(dictionaryDirectory));

        WawaSimpleAnalyzer addressAnalyzer = new WawaSimpleAnalyzer();
        perField.AddAnalyzer("MailTo", addressAnalyzer);
        perField.AddAnalyzer("Cc", addressAnalyzer);
        perField.AddAnalyzer("Bcc", addressAnalyzer);
        return perField;
    }

    /// <summary>
    /// Parses <paramref name="queryStr"/> and runs it against the index.
    /// </summary>
    /// <param name="queryStr">Query text in the query-parser syntax.</param>
    /// <returns>The hits produced by the searcher.</returns>
    public Hits Search(string queryStr)
    {
        return _searcher.Search(_mfqp.Parse(queryStr));
    }

    /// <summary>
    /// Prints the number of hits, then one "receive time-subject" line per hit.
    /// </summary>
    /// <param name="hits">Hits to print.</param>
    public static void ShowHits(Hits hits)
    {
        Console.WriteLine("共有{0}个结果", hits.Length());

        int position = 0;
        while (position < hits.Length())
        {
            string subject = hits.Doc(position).Get("Subject");
            string receiveTime = hits.Doc(position).Get("ReceiveTime");
            Console.WriteLine("{0}-{1}", receiveTime, subject);
            position++;
        }
    }
}


最后整个的测试代码如下
/// <summary>
/// Demo entry point: builds the index, runs three sample searches, prints "ok"
/// and waits for console input.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    Console.WriteLine("索引");
    new MailIndexer().ExeCute();

    Console.WriteLine("搜索");
    MailSearcher searcher = new MailSearcher();

    // Each entry: { caption printed before the search, query string }.
    string[][] demoSearches = new string[][]
    {
        new string[] { "搜索包含'蛙蛙'的邮件", "蛙蛙" },
        new string[] { "搜索包含'蛙蛙'且包含'聚类'的邮件", "蛙蛙 +聚类" },
        new string[] { "搜索包含接受时间从20080510到20080513的邮件", "ReceiveTime:[20080510 TO 20080513]" }
    };

    foreach (string[] demo in demoSearches)
    {
        Console.WriteLine(demo[0]);
        MailSearcher.ShowHits(searcher.Search(demo[1]));
    }

    Console.WriteLine("ok");
    Console.Read();
}


最后的运行结果应该如下
索引
正在索引:倡议:大家做一个.net开源的灾难管理系统
正在索引:[置顶]蛙蛙推荐:蛙蛙教你文本聚类
正在索引:蛙蛙推荐:蛙蛙牌关键词提取算法
正在索引:蛙蛙推荐:蛙蛙牌软件注册码算法
搜索
搜索包含'蛙蛙'的邮件
共有3个结果
20080511-蛙蛙推荐:蛙蛙牌关键词提取算法
20080510-[置顶]蛙蛙推荐:蛙蛙教你文本聚类
20080503-蛙蛙推荐:蛙蛙牌软件注册码算法
搜索包含'蛙蛙'且包含'聚类'的邮件
共有1个结果
20080510-[置顶]蛙蛙推荐:蛙蛙教你文本聚类
搜索包含接受时间从20080510到20080513的邮件
共有2个结果
20080510-[置顶]蛙蛙推荐:蛙蛙教你文本聚类
20080511-蛙蛙推荐:蛙蛙牌关键词提取算法
ok
完整源码下载如下
http://files.cnblogs.com/onlytiancai/MailIndexer.zip
其中词库,吕震宇的中文分词的程序集及lucene.net2.0的程序集请到网上搜索下载。
评论:
-
#1楼
Posted @ 2008-05-21 04:29
感谢
回复 引用
-
#2楼
Posted @ 2008-05-21 08:06
学习
回复 引用 查看
-
#3楼
Posted @ 2008-05-21 09:18
Mark
回复 引用 查看
-
#4楼
Posted @ 2008-05-21 09:32
蛙蛙牌大师
回复 引用 查看
-
#5楼
Posted @ 2008-05-21 09:43
好东西,我正要做这方面,谢谢了
回复 引用 查看
-
#6楼
Posted @ 2008-05-21 09:56
哇哇...
回复 引用 查看
-
#7楼
Posted @ 2008-05-21 11:01
你这个能不让outlook提示你读取它??
回复 引用 查看
-
#8楼[ 楼主]
Posted @ 2008-05-21 11:03
@Franz 弹出提示点允许就O了
回复 引用 查看
-
#9楼
Posted @ 2008-05-21 11:06
@蛙蛙池塘
不好,总感觉这样不人性化!
你如果想后台读取,突然出来个这个,wow,用户会骂人的!
回复 引用 查看
-
#10楼[ 楼主]
Posted @ 2008-05-21 11:17
@Franz 你自己google一下吧,呵呵。
回复 引用 查看
-
#11楼
Posted @ 2008-05-21 14:39
--引用-------------------------------------------------- 簡簡單單..: Mark --------------------------------------------------------
回复 引用 查看
-
#12楼
Posted @ 2008-05-22 02:34
感谢蛙蛙
回复 引用
|