10 基于dis_max实现best fields策略进行多字段搜索
TF (Term Frequency): 基于词项(term vector), 用来表示一个词项在某个文档中出现多少次。词频越高,文档得分越高
IDF (Inverse Document Frequency): 基于词项(term vector), 逆文档频率越高,词项就越罕见。评分公式利用该因子为包含罕见词项的文档加权。
构造数据
DELETE /forum PUT /forum {"settings":{"number_of_shards": 1}} POST /forum/_bulk { "index": { "_id": 1 }} { "articleID" : "XHDK-A-1293-#fJ3", "userID" : 1, "hidden": false, "postDate": "2017-01-01" } { "index": { "_id": 2 }} { "articleID" : "KDKE-B-9947-#kL5", "userID" : 1, "hidden": false, "postDate": "2017-01-02" } { "index": { "_id": 3 }} { "articleID" : "JODL-X-1937-#pV7", "userID" : 2, "hidden": false, "postDate": "2017-01-01" } { "index": { "_id": 4 }} { "articleID" : "QQPX-R-3956-#aD8", "userID" : 2, "hidden": true, "postDate": "2017-01-02" } POST /forum/_bulk {"update":{"_id":"1"}} {"doc":{"tag":["java","hadoop"]}} {"update":{"_id":"2"}} {"doc":{"tag":["java"]}} {"update":{"_id":"3"}} {"doc":{"tag":["hadoop"]}} {"update":{"_id":"4"}} {"doc":{"tag":["java","elasticsearch"]}} POST /forum/_bulk {"update":{"_id":"1"}} {"doc":{"view_cnt":30}} {"update":{"_id":"2"}} {"doc":{"view_cnt":50}} {"update":{"_id":"3"}} {"doc":{"view_cnt":100}} {"update":{"_id":"4"}} {"doc":{"view_cnt":80}} POST /forum/_bulk {"index":{"_id":5}} {"articleID":"DHJK-B-1395-#Ky5","userID":3,"hidden":false,"postDate":"2019-06-01","tag":["elasticsearch"],"tag_cnt":1,"view_cnt":10} POST /forum/_bulk {"update":{"_id":"5"}} {"doc":{"postDate":"2019-05-01"}} POST /forum/_bulk {"update":{"_id":"1"}} {"doc":{"title":"this is java and elasticsearch blog"}} {"update":{"_id":"2"}} {"doc":{"title":"this is java blog"}} {"update":{"_id":"3"}} {"doc":{"title":"this is elasticsearch blog"}} {"update":{"_id":"4"}} {"doc":{"title":"this is java, elasticsearch, hadoop blog"}} {"update":{"_id":"5"}} {"doc":{"title":"this is spark blog"}} POST /forum/_bulk {"update":{"_id":"1"}} {"doc":{"content":"i like to write best elasticsearch article"}} {"update":{"_id":"2"}} {"doc":{"content":"i think java is the best programming language"}} {"update":{"_id":"3"}} {"doc":{"content":"i am only an elasticsearch beginner"}} {"update":{"_id":"4"}} {"doc":{"content":"elasticsearch and hadoop are all very good solution, i am a beginner"}} {"update":{"_id":"5"}} {"doc":{"content":"spark is best big data solution based on scala ,an programming language similar to java"}}
普通查询
GET /forum/_search { "query":{ "bool":{ "should":[ { "match":{ "title":"java solution" } }, { "match":{ "content":"java solution" } } ], "minimum_should_match": 1 } } }
结果:
{ "took" : 11, "timed_out" : false, "_shards" : { "total" : 1, "successful" : 1, "skipped" : 0, "failed" : 0 }, "hits" : { "total" : { "value" : 4, "relation" : "eq" }, "max_score" : 1.5041151, "hits" : [ { "_index" : "forum", "_type" : "_doc", "_id" : "2", "_score" : 1.5041151, "_source" : { "articleID" : "KDKE-B-9947-#kL5", "userID" : 1, "hidden" : false, "postDate" : "2017-01-02", "tag" : [ "java" ], "view_cnt" : 50, "title" : "this is java blog", "content" : "i think java is the best programming language" } }, { "_index" : "forum", "_type" : "_doc", "_id" : "5", "_score" : 1.4233949, "_source" : { "articleID" : "DHJK-B-1395-#Ky5", "userID" : 3, "hidden" : false, "postDate" : "2019-05-01", "tag" : [ "elasticsearch" ], "tag_cnt" : 1, "view_cnt" : 10, "title" : "this is spark blog", "content" : "spark is best big data solution based on scala ,an programming language similar to java" } }, { "_index" : "forum", "_type" : "_doc", "_id" : "4", "_score" : 1.2715201, "_source" : { "articleID" : "QQPX-R-3956-#aD8", "userID" : 2, "hidden" : true, "postDate" : "2017-01-02", "tag" : [ "java", "elasticsearch" ], "view_cnt" : 80, "title" : "this is java, elasticsearch, hadoop blog", "content" : "elasticsearch and hadoop are all very good solution, i am a beginner" } }, { "_index" : "forum", "_type" : "_doc", "_id" : "1", "_score" : 0.47728038, "_source" : { "articleID" : "XHDK-A-1293-#fJ3", "userID" : 1, "hidden" : false, "postDate" : "2017-01-01", "tag" : [ "java", "hadoop" ], "view_cnt" : 30, "title" : "this is java and elasticsearch blog", "content" : "i like to write best elasticsearch article" } } ] } }
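结果中 id=2 的数据排在了最前面,但我们其实希望 id=5 排在前面,毕竟 id=5 的 content 字段同时包含 java 和 solution。原因在于 bool 的 should 会把各个子查询的得分累加:id=2 的 title 和 content 都匹配到了 java,两个字段的分数相加后反而超过了 id=5。下面看看 dis_max 的效果: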
GET /forum/_search { "query":{ "dis_max":{ "queries":[ { "match":{ "title":"java solution" } }, { "match":{ "content":"java solution" } } ] } } }
结果:
{ "took" : 22, "timed_out" : false, "_shards" : { "total" : 1, "successful" : 1, "skipped" : 0, "failed" : 0 }, "hits" : { "total" : { "value" : 4, "relation" : "eq" }, "max_score" : 1.4233949, "hits" : [ { "_index" : "forum", "_type" : "_doc", "_id" : "5", "_score" : 1.4233949, "_source" : { "articleID" : "DHJK-B-1395-#Ky5", "userID" : 3, "hidden" : false, "postDate" : "2019-05-01", "tag" : [ "elasticsearch" ], "tag_cnt" : 1, "view_cnt" : 10, "title" : "this is spark blog", "content" : "spark is best big data solution based on scala ,an programming language similar to java" } }, { "_index" : "forum", "_type" : "_doc", "_id" : "2", "_score" : 0.9395274, "_source" : { "articleID" : "KDKE-B-9947-#kL5", "userID" : 1, "hidden" : false, "postDate" : "2017-01-02", "tag" : [ "java" ], "view_cnt" : 50, "title" : "this is java blog", "content" : "i think java is the best programming language" } }, { "_index" : "forum", "_type" : "_doc", "_id" : "4", "_score" : 0.7942397, "_source" : { "articleID" : "QQPX-R-3956-#aD8", "userID" : 2, "hidden" : true, "postDate" : "2017-01-02", "tag" : [ "java", "elasticsearch" ], "view_cnt" : 80, "title" : "this is java, elasticsearch, hadoop blog", "content" : "elasticsearch and hadoop are all very good solution, i am a beginner" } }, { "_index" : "forum", "_type" : "_doc", "_id" : "1", "_score" : 0.48898652, "_source" : { "articleID" : "XHDK-A-1293-#fJ3", "userID" : 1, "hidden" : false, "postDate" : "2017-01-01", "tag" : [ "java", "hadoop" ], "view_cnt" : 30, "title" : "this is java and elasticsearch blog", "content" : "i like to write best elasticsearch article" } } ] } }
best fields策略 : 搜索到的结果,应该是某一个field中匹配到了尽可能多的关键词,被排在前面;而不是尽可能多的field匹配到了少数的关键词,排在了前面.
dis_max 语法:直接取多个 query 中分数最高的那一个 query 的分数作为文档的最终得分(默认 tie_breaker 为 0,其余 query 的分数不参与计算)。
立志如山 静心求实
浙公网安备 33010602011771号