# -*- coding: UTF-8 -*-
from whoosh.index import create_in
from whoosh.fields import *
from whoosh.analysis import RegexAnalyzer
from whoosh.qparser import QueryParser
from jieba.analyse import ChineseAnalyzer
import os
import chardet
#打开文件
with open("./Test/test.txt","r") as filenm_src:
str_all=filenm_src.read()
print str_all
fencoding=chardet.detect(str_all)
print fencoding
#print str_all.decode('GBK')
#analyzer = RegexAnalyzer(ur"([\u4e00-\u9fa5])|(\w+(\.?\w+)*)")
analyzer = ChineseAnalyzer()
schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT(stored=True, analyzer=analyzer))
if not os.path.exists("indexdir"):
os.mkdir("indexdir")
ix = create_in("indexdir", schema)
writer = ix.writer()
#必须使用UTF-8格式
writer.add_document(title=u"First document", path=u"/a",
content=str_all.decode('GBK'))
writer.commit()
#查找一个要素的例子
qp = QueryParser("content", schema=ix.schema)
q = qp.parse(u"我们是中国人")
#for t in q:
# print t.text
#使用with,尽早回句柄。
with ix.searcher() as searcher:
results = searcher.search(q)
if results.is_empty():
print "not found"
else:
#访问各个要素
#print results[0]
print results[0].rank
print results[0].docnum
print results[0].keys()
print results[0].score
print results[0]["title"]
print results[0]["path"]
print results[0]["content"]
#for t in analyzer(u"我的好朋友是李明;我爱北京天安门;IBM和Microsoft; I have a dream. this is intetesting and interested me a lot"):
# print t.text
# (scraped page-footer residue, kept as a comment so the script stays valid:
#  浙公网安备 33010602011771号)