搜索引擎爬虫使用的 Elasticsearch 由 5 升级到 7 时,代码中需要修改的地方

第一步:升级elasticsearch-dsl包 pip install elasticsearch-dsl==7.2.1

第二步:model代码,创建elasticsearch索引

# from elasticsearch_dsl import DocType, Date, Completion, Keyword, Text
from elasticsearch_dsl import Document, Date, Integer, Keyword, Text, connections

# from elasticsearch_dsl.analysis import CustomAnalyzer as _CustomAnalyzer
#
# from elasticsearch_dsl.connections import connections

# Register the default Elasticsearch connection (local node). The
# module-level helper is the same bound method as connections.connections.create_connection.
connections.create_connection(hosts=["localhost"])


# class CustomAnalyzer(_CustomAnalyzer):
#     def get_analysis_definition(self):
#         return {}


# ik_analyzer = CustomAnalyzer("ik_max_word", filter=["lowercase"])


class YunWei(Document):
    """Document mapping for crawled articles stored in the "yunwei" index.

    URL fields are mapped as keywords, timestamps as dates, and the free-text
    fields (title, content, tags) are analyzed with "ik_max_word".
    NOTE(review): "ik_max_word" presumably comes from the IK analysis plugin
    mentioned at the end of the post; confirm it is installed on the cluster.
    """
    # Dropped during the ES 5 -> 7 upgrade (kept for reference):
    # suggest = Completion(analyzer=ik_analyzer)
    url = Keyword()
    url_object_id = Keyword()
    title = Text(analyzer="ik_max_word")
    content = Text(analyzer="ik_max_word")
    create_time = Date()
    tags = Text(analyzer="ik_max_word")
    crawl_time = Date()

    class Index:
        # Index-level options: target index name and its settings.
        name = "yunwei"
        settings = {"number_of_shards": 5}


# When run as a script, initialize the YunWei document (per the post, this
# step creates the Elasticsearch index).
if __name__ == "__main__":
    YunWei.init()

被注释掉的代码就是升级后不再需要、已经去掉的部分。下面是 item 中调用的代码:

from ArticleSpider.models.es_types import YunWei

from elasticsearch_dsl.connections import connections

# Client used by gen_suggests for the analyze API. The first positional
# argument of create_connection is the connection alias, so the Document
# class must not be passed there; connect to the local node explicitly.
es = connections.create_connection(hosts=["localhost"])

def gen_suggests(index, info_tuple):
    """Build search-suggestion entries from weighted text fields.

    Each non-empty text is tokenized through the Elasticsearch analyze API
    using the "ik_max_word" analyzer on the "yunwei" index. Tokens longer
    than one character that no earlier entry has already produced are
    emitted together with that entry's weight.

    Args:
        index: Accepted for compatibility with existing callers; it is not
            used, and the analyze request always targets "yunwei".
        info_tuple: Iterable of ``(text, weight)`` pairs. Earlier pairs take
            precedence when the same token appears in several texts.

    Returns:
        A list of ``{"input": [tokens], "weight": weight}`` dicts, one per
        pair that contributed at least one new token.
    """
    used_words = set()
    suggests = []
    for text, weight in info_tuple:
        if not text:
            continue
        # Ask Elasticsearch to tokenize the text with the analyzer.
        words = es.indices.analyze(
            index="yunwei",
            body={"analyzer": "ik_max_word", "text": "{0}".format(text)},
        )
        analyzed_words = {r["token"] for r in words["tokens"] if len(r["token"]) > 1}
        new_words = analyzed_words - used_words
        if new_words:
            # Record emitted tokens so later pairs do not repeat them;
            # without this the set difference above never removes anything.
            used_words |= new_words
            suggests.append({"input": list(new_words), "weight": weight})
    return suggests

    
    ---后面的调用---
    # Build suggestions from title, tags and content with weights 10, 7 and 5.
    # NOTE(review): the YunWei class shown in this post has its `suggest`
    # field commented out, so confirm how this attribute is mapped in the index.
    article.suggest = gen_suggests(YunWei,
((article.title, 10), (article.tags, 7), (article.content, 5)))

这样,在 Elasticsearch 升级到 7.5.1(我用的是这个版本)之后,爬虫就可以正常使用,不会再出问题了。

 

下面补充一下ik分词器的安装方法

 

posted @ 2020-09-10 00:01  竹为君生  阅读(314)  评论(0)    收藏  举报