# -*- coding: UTF-8 -*-
import io
import os

from whoosh.index import create_in
from whoosh.fields import *
from whoosh.analysis import RegexAnalyzer
from whoosh.qparser import QueryParser

# Whoosh demo script: index the text of one source file plus a literal
# mixed Chinese/English sentence, then search the "content" field and print
# the attributes of the best hit.

# Read the file to index. Opening with an explicit encoding returns unicode
# text directly, so no separate .decode('UTF-8') step is needed before the
# text is handed to the index writer.
with io.open("./Test/test.py", "r", encoding="utf-8") as filenm_src:
    str_all = filenm_src.read()

# Tokenizer pattern: a single character in U+4E00..U+9FA5 is matched on its
# own; otherwise a run of word characters, optionally joined by single dots
# (e.g. "os.path"), is matched as one token.
analyzer = RegexAnalyzer(u"([\u4e00-\u9fa5])|(\\w+(\\.?\\w+)*)")
schema = Schema(
    title=TEXT(stored=True),
    path=ID(stored=True),
    content=TEXT(stored=True, analyzer=analyzer),
)

# Create the index directory. Attempt the mkdir and tolerate "already
# exists" instead of checking first, which avoids a check-then-create race.
try:
    os.mkdir("indexdir")
except OSError:
    if not os.path.isdir("indexdir"):
        raise

ix = create_in("indexdir", schema)
writer = ix.writer()
writer.add_document(
    title=u"First document",
    path=u"/a",
    content=str_all,
)
writer.add_document(
    title=u"Second document",
    path=u"/b",
    content=u"The second one 你 中文测试中文 is even more interesting!",
)
writer.commit()

# Example: look up a single term in the "content" field.
qp = QueryParser("content", schema=ix.schema)
q = qp.parse(u"first")

# Use `with` so the searcher handle is released as early as possible; the
# results are read inside the block, while the searcher is still open.
with ix.searcher() as searcher:
    results = searcher.search(q)
    if results.is_empty():
        print("not found")
    else:
        # Show each attribute of the top-ranked hit.
        hit = results[0]
        print(hit)
        print(hit.rank)
        print(hit.docnum)
        print(hit.keys())
        print(hit.score)
        print(hit["title"])
        print(hit["path"])
        print(hit["content"])

# Posted on 2014-12-02 15:11 by 旅途 (blog footer: 112 views, 0 comments, bookmark, report)