python版elasticsearch入门笔记
Elasticsearch 是一个分布式、高扩展、高实时的搜索与数据分析引擎。其实现原理主要分为以下几个步骤:首先,用户将数据提交到 Elasticsearch 中;然后通过分词器对相应的语句进行分词,并将分词结果及其权重一并存入索引;当用户搜索数据时,再根据权重对结果打分、排名,最后将结果返回给用户。以下案例使用的版本为 7.16.2。注意:Python 环境中安装的 elasticsearch 客户端版本最好与 ES 服务端版本一致,避免出现兼容性问题。
目录:
1、测试样例说明
2、获取相似问embedding
3、索引表创建
4、数据更新导入案例
5、索引表信息查看
6、数据搜索查询
a、通过主键查询,get方法通过主键返回对应数据
b、完全匹配,term方法类似sql 的“=” terms类似sql的in
c、相似匹配,BM25
d、相似匹配加embedding余弦相似度排序
e、正则查询,该方法可用于搜索推荐
1、测试样例说明
测试样例是 Claude 2 生成的18个标准问,每个标准问有5个相似问,一共90条测试数据;标准问与相似问均有对应id,共四列数据
import pandas as pd
# Load the test set into a DataFrame; the CSV file is decoded as GB18030.
dt=pd.read_csv('./kn_test.csv',encoding='gb18030')
2、获取相似问embedding,这里以macbert为例子,需提前部署bert-serving,获取similar_question的向量表征后,数据格式如下所示
from bert_serving.client import BertClient

# Connection settings of the bert-serving server. IP, PORT and PORT_OUT are
# placeholders in these notes: define them with real values before running.
IP = IP
PORT = PORT
PORT_OUT = PORT_OUT
BC = BertClient(ip=IP, port=PORT, port_out=PORT_OUT)

# Encode every similar question and store one vector per row.
dt['vec'] = list(BC.encode(list(dt['similar_question'])))
# Copy the question column so that one copy can be used for exact matching.
dt['similar_content'] = dt['similar_question']
dt

3、索引表创建,需注意字段是否要设为索引或关键字,以及需要倒排索引的列名,向量维度等
from elasticsearch import Elasticsearch
import elasticsearch.helpers as es_helpers

# IP is a placeholder in these notes: define it with the real host first.
IP = IP
PORT = 9200
es = Elasticsearch([{'host': IP, 'port': PORT}])

# Drop a previous copy of the index so it is recreated with the mapping below.
if es.indices.exists(index="faq_test"):
    es.indices.delete(index='faq_test')

# Index settings and field mappings (inverted-index creation).
index_body = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 1
    },
    "mappings": {
        "properties": {
            "similar_content": {
                "type": "text",
                "analyzer": "ik_max_word",  # requires the IK analyzer plugin
                "search_analyzer": "ik_max_word"
            },
            "similar_question": {
                "type": "keyword",  # used for exact matching
                "index": True
            },
            "vec": {
                "type": "dense_vector",
                "dims": 768,  # vector dimension
                "index": False
            },
            "faq_id": {
                "type": "text",
                "index": True
            },
            "fsimilar_id": {
                "type": "text",
                "index": False
            },
            "faq_standard": {
                "type": "text",
                "index": True
            }
        }
    }
}
es.indices.create(index="faq_test", body=index_body)
#
{'acknowledged': True, 'shards_acknowledged': True, 'index': 'faq_test'}
4、数据更新导入案例,这里以dataframe为例,其他数据格式同理。该方法采用bulk批量写入的index操作(_op_type为index),文档存在则覆盖,不存在则新建,需注意主键字段(_id)
# Turn each DataFrame row into a dict and wrap it in a bulk "index" action
# whose document id is the row's fsimilar_id.
data = dt.to_dict(orient='records')
action = [
    {
        '_op_type': 'index',
        '_index': 'faq_test',
        '_id': record['fsimilar_id'],
        '_source': record,
    }
    for record in data
]
es_helpers.bulk(es, action)
5、索引表信息查看
#方法一
!curl '{IP}:9200/_cat/indices?v'
# Method 2: fetch the alias information for every index matching "*" and
# pretty-print it.
from prettyprinter import cpprint
all_indices = es.indices.get_alias("*")
cpprint(all_indices)
# Method 3: print the document count of faq_test, then pretty-print the
# index information returned by indices.get (cpprint comes from the
# prettyprinter import in the previous cell).
index_num = es.count(index='faq_test')['count']
print(index_num)
index_info = es.indices.get(index="faq_test")
cpprint(index_info)
6、数据搜索查询
a、通过主键查询,get方法通过主键返回对应数据
# Fetch the document with id '1002', returning only the listed _source fields.
es.get(index="faq_test",id='1002',_source=["similar_content", "faq_id", "fsimilar_id","faq_standard"])
#
{'_index': 'faq_test',
'_type': '_doc',
'_id': '1002',
'_version': 1,
'_seq_no': 1,
'_primary_term': 1,
'found': True,
'_source': {'fsimilar_id': 1002,
'faq_standard': ' 如何学习编程',
'similar_content': '学习编程的步骤是什么',
'faq_id': 8001}}
b、完全匹配,term方法类似sql 的“=” terms类似sql的in
# Exact match: a term query against the keyword field similar_question.
query = {
    "term": {
        "similar_question": "如何入门编程",
    }
}
res = es.search(
    index="faq_test",
    query=query,
    size=10,
    _source=["similar_content", "faq_id", "fsimilar_id", "faq_standard"],
)
res
#
{'took': 7,
'timed_out': False,
'_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
'hits': {'total': {'value': 1, 'relation': 'eq'},
'max_score': 4.1053944,
'hits': [{'_index': 'faq_test',
'_type': '_doc',
'_id': '1003',
'_score': 4.1053944,
'_source': {'fsimilar_id': 1003,
'faq_standard': ' 如何学习编程',
'similar_content': '如何入门编程',
'faq_id': 8001}}]}}
c、相似匹配,这里直接返回BM25得分最高的数据
# Full-text match on similar_content; show the three best-scoring hits.
query = {
    "match": {
        "similar_content": "商品怎么还没有发货",
    }
}
res = es.search(
    index="faq_test",
    query=query,
    size=10,
    _source=["similar_content", "faq_id", "fsimilar_id", "faq_standard"],
)
res['hits']['hits'][:3]
#
[{'_index': 'faq_test',
'_type': '_doc',
'_id': '1065',
'_score': 5.111511,
'_source': {'fsimilar_id': 1065,
'faq_standard': '商品几天后发货',
'similar_content': '商品何时开始处理发货',
'faq_id': 8013}},
{'_index': 'faq_test',
'_type': '_doc',
'_id': '1053',
'_score': 5.0239367,
'_source': {'fsimilar_id': 1053,
'faq_standard': '购买的商品质量有问题怎么办',
'similar_content': '买到的商品存在缺陷怎么申请退换',
'faq_id': 8011}},
{'_index': 'faq_test',
'_type': '_doc',
'_id': '1063',
'_score': 4.578239,
'_source': {'fsimilar_id': 1063,
'faq_standard': '商品几天后发货',
'similar_content': '购买商品多长时间安排发货',
'faq_id': 8013}}]
# Full-text match combined with a terms clause on faq_id: both clauses sit
# under "must", so a hit has to satisfy each of them.
query = {
    "bool": {
        "must": [
            {"match": {"similar_content": "怎么学习"}},
            {"terms": {"faq_id": ['8012', '8001']}},
        ]
    }
}
res = es.search(
    index="faq_test",
    query=query,
    size=10,
    _source=["similar_content", "faq_id", "fsimilar_id", "faq_standard"],
)
res['hits']['hits'][:3]
#
[{'_index': 'faq_test',
'_type': '_doc',
'_id': '1001',
'_score': 4.7467327,
'_source': {'fsimilar_id': 1001,
'faq_standard': ' 如何学习编程',
'similar_content': '怎样开始学习编程',
'faq_id': 8001}},
{'_index': 'faq_test',
'_type': '_doc',
'_id': '1002',
'_score': 4.310314,
'_source': {'fsimilar_id': 1002,
'faq_standard': ' 如何学习编程',
'similar_content': '学习编程的步骤是什么',
'faq_id': 8001}},
{'_index': 'faq_test',
'_type': '_doc',
'_id': '1004',
'_score': 4.128132,
'_source': {'fsimilar_id': 1004,
'faq_standard': ' 如何学习编程',
'similar_content': '编程学习的最佳途径是什么',
'faq_id': 8001}}]
d、相似匹配加向量余弦相似度排序,注意这里的embedding需和入库时调用的embedding服务为同一个模型,返回结果根据余弦相似度降序排列
# Encode the query text with the same BertClient that produced the stored
# vectors, then let a script_score query replace the score of every document
# matched by the inner bool query with its cosine similarity to that vector.
text = "商品怎么还没有发货"
vec = BC.encode([text])[0]
query = {
    "script_score": {
        "query": {
            "bool": {
                "must": [
                    {"match": {"similar_content": text}},
                    {"terms": {"faq_id": ['8018', '8013']}},
                ]
            }
        },
        "script": {
            # NOTE(review): Elasticsearch rejects negative script scores; the
            # official examples add "+ 1.0" to cosineSimilarity. Confirm
            # whether that is needed for these embeddings.
            "source": "cosineSimilarity(params.vec, \u0027vec\u0027)",
            "params": {"vec": vec},
        },
    }
}
res = es.search(
    index="faq_test",
    query=query,
    size=10,
    _source=["similar_content", "faq_id", "fsimilar_id", "faq_standard"],
)
# Alternative views of the result:
#[i['_source']['faq_standard'] for i in res['hits']['hits']]
#[i['_score'] for i in res['hits']['hits']]
res['hits']['hits'][:3]
#
[{'_index': 'faq_test',
'_type': '_doc',
'_id': '1065',
'_score': 0.8778344,
'_source': {'fsimilar_id': 1065,
'faq_standard': '商品几天后发货',
'similar_content': '商品何时开始处理发货',
'faq_id': 8013}},
{'_index': 'faq_test',
'_type': '_doc',
'_id': '1063',
'_score': 0.8663627,
'_source': {'fsimilar_id': 1063,
'faq_standard': '商品几天后发货',
'similar_content': '购买商品多长时间安排发货',
'faq_id': 8013}},
{'_index': 'faq_test',
'_type': '_doc',
'_id': '1062',
'_score': 0.8555895,
'_source': {'fsimilar_id': 1062,
'faq_standard': '商品几天后发货',
'similar_content': '付款成功后多少天发货',
'faq_id': 8013}}]
e、正则查询,该方法可用于搜索推荐
# Regexp query: the pattern is the input text followed by ".*". It is combined
# with a terms clause on faq_id under "must" and an exclusion list under
# "must_not". The faq_standard of each hit is printed.
text = '如何'
text_left = text + ".*"
query = {
    "bool": {
        "must": [
            {
                "regexp": {
                    "similar_content": {
                        "value": text_left,
                        "flags": "ALL",
                        "max_determinized_states": 10000,
                        "rewrite": "constant_score",
                    }
                }
            },
            {"terms": {"faq_id": ["8001", "8002"]}},
        ],
        "must_not": [
            {"terms": {"faq_id": ["8018", "8019"]}},
        ],
    }
}
res = es.search(
    index="faq_test",
    query=query,
    size=10,
    _source=["similar_content", "faq_id", "fsimilar_id", "faq_standard"],
)
print([hit['_source']['faq_standard'] for hit in res['hits']['hits']])
['如何学习编程', '如何提高英语口语能力', '如何提高英语口语能力']

浙公网安备 33010602011771号