搜索引擎爬虫使用的 Elasticsearch 版本由 5 升级到 7 时需要修改的代码
第一步:升级 elasticsearch-dsl 包:
pip install elasticsearch-dsl==7.2.1
第二步:model代码,创建elasticsearch索引
# Elasticsearch 7 model for the crawler. With elasticsearch-dsl 7 the document
# base class is ``Document`` (formerly ``DocType``) and the index is declared
# in an inner ``Index`` class. The commented-out lines are the ES 5 code that
# is no longer needed after the upgrade.

# from elasticsearch_dsl import DocType, Date, Completion, Keyword, Text
from elasticsearch_dsl import Document, Date, Integer, Keyword, Text, connections

# from elasticsearch_dsl.analysis import CustomAnalyzer as _CustomAnalyzer
#
# from elasticsearch_dsl.connections import connections

# Register the default connection at import time so that YunWei.init() and
# every module importing this one talk to the same local node.
connections.connections.create_connection(hosts=["localhost"])


# class CustomAnalyzer(_CustomAnalyzer):
#     def get_analysis_definition(self):
#         return {}


# ik_analyzer = CustomAnalyzer("ik_max_word", filter=["lowercase"])


class YunWei(Document):
    """Crawled article stored in the ``yunwei`` index.

    Text fields are analyzed with the ``ik_max_word`` analyzer, so the ik
    analysis plugin must be installed on the Elasticsearch node.
    """

    # suggest = Completion(analyzer=ik_analyzer)
    url = Keyword()
    url_object_id = Keyword()
    title = Text(analyzer="ik_max_word")
    content = Text(analyzer="ik_max_word")
    create_time = Date()
    tags = Text(analyzer="ik_max_word")
    crawl_time = Date()

    class Index:
        # Index name and creation-time settings used by YunWei.init().
        name = "yunwei"
        settings = {"number_of_shards": 5}


if __name__ == "__main__":
    # Running this module directly creates the index and its mapping.
    YunWei.init()
注释的代码就是去掉的不需要的代码,下面是item中调用的代码
from ArticleSpider.models.es_types import YunWei
from elasticsearch_dsl.connections import connections

# Low-level Elasticsearch client used for the analyze API below.
# Importing es_types already registers the "default" connection, so reuse it.
# The first positional argument of create_connection() is the connection
# *alias*, so passing the YunWei class there registered a second client with
# default settings under a class-object key.
es = connections.get_connection()
def gen_suggests(index, info_tuple):
    """Build completion-suggester entries from weighted text fields.

    Each non-empty text is tokenized with the ``ik_max_word`` analyzer through
    the Elasticsearch analyze API. Tokens of length 1 are dropped. A token is
    credited only to the first field that yields it, so list the
    higher-weight fields first.

    Args:
        index: Name of the index whose analyzers are used. For backward
            compatibility, any non-string value (existing callers pass the
            document class) falls back to the ``"yunwei"`` index.
        info_tuple: Iterable of ``(text, weight)`` pairs.

    Returns:
        A list of ``{"input": [token, ...], "weight": weight}`` dicts, one per
        field that contributed at least one new token. Tokens are sorted so
        the output is deterministic.
    """
    index_name = index if isinstance(index, str) else "yunwei"
    used_words = set()
    suggests = []
    for text, weight in info_tuple:
        if not text:
            continue
        # Ask Elasticsearch to analyze the string with the ik analyzer.
        words = es.indices.analyze(
            index=index_name,
            body={"analyzer": "ik_max_word", "text": str(text)},
        )
        analyzed_words = {
            r["token"] for r in words["tokens"] if len(r["token"]) > 1
        }
        new_words = analyzed_words - used_words
        if new_words:
            # Remember the tokens so lower-weight fields do not repeat them.
            used_words.update(new_words)
            suggests.append({"input": sorted(new_words), "weight": weight})
    return suggests
---后面的调用---
# Build search suggestions from the article fields, weighted
# title (10) > tags (7) > content (5).
# NOTE(review): YunWei does not declare a `suggest` field (the Completion
# field is commented out in the model), so this value presumably relies on
# dynamic mapping; confirm this is intended.
weighted_fields = (
    (article.title, 10),
    (article.tags, 7),
    (article.content, 5),
)
article.suggest = gen_suggests(YunWei, weighted_fields)
这样,在 Elasticsearch 升级到 7.5.1(我使用的版本)之后,爬虫就可以正常使用,不会再出问题了。
下面补充一下ik分词器的安装方法
-
一:先下载包,地址为: https://github.com/medcl/elasticsearch-analysis-ik/releases
创建ik插件目录:
cd your-es-root/plugins/ && mkdir ik
将ik分词器压缩包解压到以下目录即安装完成:
your-es-root/plugins/ik
-
二:使用elasticsearch插件安装
-
./bin/elasticsearch-plugin install https://github.com/medcl/elasticsearch-analysis-ik/releases/download/v6.3.0/elasticsearch-analysis-ik-6.3.0.zip
注意:ik 分词器的版本必须与 Elasticsearch 的版本完全一致,请把上面命令中的 6.3.0 替换为你实际使用的 Elasticsearch 版本号(例如 7.5.1)。
附ik分词器的GitHub官方网址:https://github.com/medcl/elasticsearch-analysis-ik

浙公网安备 33010602011771号