1. Install Elasticsearch; default port 9200 (version 7.2 at the time of writing).
2. Install Kibana; default port 5601 (version 7.2 at the time of writing).
3. Install the IK Chinese analysis plugin for Elasticsearch: https://github.com/medcl/elasticsearch-analysis-ik
4. Unzip it into: <Elasticsearch install dir>/plugins/ik/
5. Restart the Elasticsearch service, then use Kibana to add test data.
Create an index that uses the ik_max_word tokenizer:
PUT /km
{
  "settings": {
    "analysis": {
      "analyzer": {
        "ik": {
          "tokenizer": "ik_max_word"
        }
      }
    }
  }
}
Specify the index-time analyzer and the search-time analyzer:
POST 1st_index/_mapping
{
  "properties": {
    "content": {
      "type": "text",
      "analyzer": "ik_max_word",
      "search_analyzer": "ik_smart"
    }
  }
}
PUT /1st_index/_doc/4?pretty
{
  "content": "中国驻洛杉矶领事馆遭亚裔男子枪击 嫌犯已自首"
}
Highlighted search
GET /km/_search?pretty
{
  "query": { "match": { "content": "中国" } },
  "highlight": {
    "pre_tags": ["<tag1>", "<tag2>"],
    "post_tags": ["</tag1>", "</tag2>"],
    "fields": { "content": {} }
  }
}
Result:
{
  "took" : 49,
  "timed_out" : false,
  "_shards" : { "total" : 1, "successful" : 1, "skipped" : 0, "failed" : 0 },
  "hits" : {
    "total" : { "value" : 2, "relation" : "eq" },
    "max_score" : 0.8007569,
    "hits" : [
      {
        "_index" : "km",
        "_type" : "_doc",
        "_id" : "4",
        "_score" : 0.8007569,
        "_source" : { "content" : "中国驻洛杉矶领事馆遭亚裔男子枪击 嫌犯已自首" },
        "highlight" : { "content" : [ "<tag1>中国</tag1>驻洛杉矶领事馆遭亚裔男子枪击 嫌犯已自首" ] }
      },
      {
        "_index" : "km",
        "_type" : "_doc",
        "_id" : "3",
        "_score" : 0.8007569,
        "_source" : { "content" : "中韩渔警冲突调查:韩警平均每天扣1艘中国渔船" },
        "highlight" : { "content" : [ "中韩渔警冲突调查:韩警平均每天扣1艘<tag1>中国</tag1>渔船" ] }
      }
    ]
  }
}
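On the application side, the highlighted fragments can be pulled out of such a response with a small helper. A minimal Python sketch, assuming the response has already been parsed into a dict (e.g. via `json.loads`); the `extract_highlights` name and the truncated sample response are illustrative:

```python
def extract_highlights(response, field="content"):
    """Collect highlighted fragments per document id from a search response."""
    results = {}
    for hit in response["hits"]["hits"]:
        # Hits without a highlight for this field yield an empty list.
        results[hit["_id"]] = hit.get("highlight", {}).get(field, [])
    return results

# Sample response in the shape shown above, truncated to one hit.
resp = {
    "hits": {
        "hits": [
            {
                "_id": "4",
                "_source": {"content": "中国驻洛杉矶领事馆遭亚裔男子枪击 嫌犯已自首"},
                "highlight": {"content": ["<tag1>中国</tag1>驻洛杉矶领事馆遭亚裔男子枪击 嫌犯已自首"]},
            }
        ]
    }
}
print(extract_highlights(resp))
# {'4': ['<tag1>中国</tag1>驻洛杉矶领事馆遭亚裔男子枪击 嫌犯已自首']}
```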
Highlighted search 2, using an inline-styled HTML tag as the highlight marker:
GET /_search?pretty
{
  "query": { "match": { "content": "中国" } },
  "highlight": {
    "pre_tags": ["<span style = 'color:red'>"],
    "post_tags": ["</span>"],
    "fields": { "content": {} }
  }
}
Result:
{
  "took" : 34,
  "timed_out" : false,
  "_shards" : { "total" : 6, "successful" : 6, "skipped" : 0, "failed" : 0 },
  "hits" : {
    "total" : { "value" : 6, "relation" : "eq" },
    "max_score" : 1.225708,
    "hits" : [
      {
        "_index" : "1st_index",
        "_type" : "docs",
        "_id" : "3",
        "_score" : 1.225708,
        "_source" : { "content" : "中韩渔警冲突调查:韩警平均每天扣1艘中国渔船" },
        "highlight" : { "content" : [ "<span style = 'color:red'>中</span>韩渔警冲突调查:韩警平均每天扣1艘<span style = 'color:red'>中</span><span style = 'color:red'>国</span>渔船" ] }
      },
      {
        "_index" : "1st_index",
        "_type" : "docs",
        "_id" : "4",
        "_score" : 0.96408343,
        "_source" : { "content" : "中国驻洛杉矶领事馆遭亚裔男子枪击 嫌犯已自首" },
        "highlight" : { "content" : [ "<span style = 'color:red'>中</span><span style = 'color:red'>国</span>驻洛杉矶领事馆遭亚裔男子枪击 嫌犯已自首" ] }
      },
      {
        "_index" : "km",
        "_type" : "_doc",
        "_id" : "4",
        "_score" : 0.50831574,
        "_source" : { "content" : "中国驻洛杉矶领事馆遭亚裔男子枪击 嫌犯已自首" },
        "highlight" : { "content" : [ "<span style = 'color:red'>中国</span>驻洛杉矶领事馆遭亚裔男子枪击 嫌犯已自首" ] }
      },
      {
        "_index" : "km",
        "_type" : "_doc",
        "_id" : "3",
        "_score" : 0.50831574,
        "_source" : { "content" : "中韩渔警冲突调查:韩警平均每天扣1艘中国渔船" },
        "highlight" : { "content" : [ "中韩渔警冲突调查:韩警平均每天扣1艘<span style = 'color:red'>中国</span>渔船" ] }
      },
      {
        "_index" : "1st_index",
        "_type" : "docs",
        "_id" : "1",
        "_score" : 0.3864615,
        "_source" : { "content" : "美国留给伊拉克的是个烂摊子吗" },
        "highlight" : { "content" : [ "美<span style = 'color:red'>国</span>留给伊拉克的是个烂摊子吗" ] }
      },
      {
        "_index" : "{index}",
        "_type" : "docs",
        "_id" : "1",
        "_score" : 0.2876821,
        "_source" : { "content" : "美国留给伊拉克的是个烂摊子吗" },
        "highlight" : { "content" : [ "美<span style = 'color:red'>国</span>留给伊拉克的是个烂摊子吗" ] }
      }
    ]
  }
}
Tokenization
ik_smart tokenization
GET /km/_analyze
{
  "text": "你今天开了多少张单子",
  "tokenizer": "ik_smart"
}
Result:
{
  "tokens" : [
    { "token" : "你", "start_offset" : 0, "end_offset" : 1, "type" : "CN_CHAR", "position" : 0 },
    { "token" : "今天", "start_offset" : 1, "end_offset" : 3, "type" : "CN_WORD", "position" : 1 },
    { "token" : "开了", "start_offset" : 3, "end_offset" : 5, "type" : "CN_WORD", "position" : 2 },
    { "token" : "多少", "start_offset" : 5, "end_offset" : 7, "type" : "CN_WORD", "position" : 3 },
    { "token" : "张", "start_offset" : 7, "end_offset" : 8, "type" : "CN_CHAR", "position" : 4 },
    { "token" : "单子", "start_offset" : 8, "end_offset" : 10, "type" : "CN_WORD", "position" : 5 }
  ]
}
ik_max_word tokenization
GET /km/_analyze
{
  "text": "你今天开了多少张单子",
  "tokenizer": "ik_max_word"
}
Result:
{
  "tokens" : [
    { "token" : "你", "start_offset" : 0, "end_offset" : 1, "type" : "CN_CHAR", "position" : 0 },
    { "token" : "今天", "start_offset" : 1, "end_offset" : 3, "type" : "CN_WORD", "position" : 1 },
    { "token" : "天开", "start_offset" : 2, "end_offset" : 4, "type" : "CN_WORD", "position" : 2 },
    { "token" : "开了", "start_offset" : 3, "end_offset" : 5, "type" : "CN_WORD", "position" : 3 },
    { "token" : "多少", "start_offset" : 5, "end_offset" : 7, "type" : "CN_WORD", "position" : 4 },
    { "token" : "张", "start_offset" : 7, "end_offset" : 8, "type" : "CN_CHAR", "position" : 5 },
    { "token" : "单子", "start_offset" : 8, "end_offset" : 10, "type" : "CN_WORD", "position" : 6 }
  ]
}
ik_max_word: performs the finest-grained segmentation, exhausting every possible combination. For example, it splits "中华人民共和国国歌" into 中华人民共和国, 中华人民, 中华, 华人, 人民共和国, 人民, 人, 民, 共和国, 共和, 和, 国国, 国歌. Suitable for term queries.
ik_smart: performs the coarsest-grained segmentation. For example, it splits "中华人民共和国国歌" into 中华人民共和国 and 国歌. Suitable for phrase queries.
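The relationship between the two tokenizers can be checked programmatically: ik_max_word keeps every token ik_smart produces and adds overlapping ones on top. A small Python sketch over the _analyze results shown above (token lists copied from those results rather than fetched from a live cluster):

```python
def tokens(analyze_response):
    """Flatten an _analyze response into its list of token strings."""
    return [t["token"] for t in analyze_response["tokens"]]

# Token lists copied from the ik_smart / ik_max_word results above.
smart = {"tokens": [{"token": w} for w in ["你", "今天", "开了", "多少", "张", "单子"]]}
max_word = {"tokens": [{"token": w} for w in ["你", "今天", "天开", "开了", "多少", "张", "单子"]]}

# ik_max_word contains every ik_smart token plus overlapping ones such as 天开.
extra = set(tokens(max_word)) - set(tokens(smart))
print(extra)  # {'天开'}
```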
Dictionary Configuration
IKAnalyzer.cfg.xml can be located at {conf}/analysis-ik/config/IKAnalyzer.cfg.xml or {plugins}/elasticsearch-analysis-ik-*/config/IKAnalyzer.cfg.xml
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<properties>
  <comment>IK Analyzer extension configuration</comment>
  <!-- Configure your own extension dictionaries here -->
  <entry key="ext_dict">custom/mydict.dic;custom/single_word_low_freq.dic</entry>
  <!-- Configure your own extension stopword dictionaries here -->
  <entry key="ext_stopwords">custom/ext_stopword.dic</entry>
  <!-- Configure a remote extension dictionary here -->
  <entry key="remote_ext_dict">location</entry>
  <!-- Configure a remote extension stopword dictionary here -->
  <entry key="remote_ext_stopwords">http://xxx.com/xxx.dic</entry>
</properties>
Here, location is a URL, e.g. http://yoursite.com/getCustomDict. The request only needs to satisfy the following two points to enable hot dictionary updates:
- The HTTP response must include two headers: Last-Modified and ETag, both strings. Whenever either of them changes, the plugin fetches the word list again and updates its dictionary.
- The HTTP response body must contain one word per line, with \n as the line separator.
Once these two requirements are met, hot dictionary updates work without restarting the ES instance.
A practical setup is to put the hot words in a UTF-8 encoded .txt file served by nginx or any other simple HTTP server: whenever the .txt file changes, the server automatically returns updated Last-Modified and ETag headers on the next request. A separate tool can then extract the relevant vocabulary from the business system and rewrite this .txt file.
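The header requirements above can be met by any HTTP server. A self-contained Python sketch of such a dictionary endpoint (the word list, URL path, and ETag scheme are illustrative; in production you would simply serve the .txt file from nginx as described):

```python
import hashlib
import threading
import urllib.request
from http.server import BaseHTTPRequestHandler, HTTPServer

# Hypothetical in-memory word list; in practice this is the UTF-8 .txt file.
WORDS = "区块链\n云原生\n"

class DictHandler(BaseHTTPRequestHandler):
    """Serve one word per line plus the Last-Modified/ETag headers IK polls."""

    def do_GET(self):
        body = WORDS.encode("utf-8")
        self.send_response(200)
        self.send_header("Content-Type", "text/plain; charset=utf-8")
        # Any stable values work; the plugin only checks whether they changed.
        self.send_header("ETag", hashlib.md5(body).hexdigest())
        self.send_header("Last-Modified", "Mon, 01 Jul 2019 00:00:00 GMT")
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def log_message(self, *args):  # silence per-request logging in the demo
        pass

server = HTTPServer(("127.0.0.1", 0), DictHandler)
threading.Thread(target=server.serve_forever, daemon=True).start()

with urllib.request.urlopen(f"http://127.0.0.1:{server.server_port}/getCustomDict") as resp:
    etag = resp.headers["ETag"]
    text = resp.read().decode("utf-8")
server.shutdown()
print(text)  # one word per line
```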
Synonyms
Create an index with a synonym filter:
PUT /km-synonym
{
  "settings": {
    "index": {
      "analysis": {
        "analyzer": {
          "by_smart": {
            "tokenizer": "ik_smart",
            "filter": ["synonym"]
          },
          "by_max_word": {
            "tokenizer": "ik_max_word",
            "filter": ["synonym"]
          }
        },
        "filter": {
          "synonym": {
            "type": "synonym",
            "synonyms_path": "synonyms.dic"
          }
        }
      }
    }
  }
}
Put the file synonyms.dic in the Elasticsearch config directory. Its format is as follows:
# Blank lines and lines starting with pound are comments.

# Explicit mappings match any token sequence on the LHS of "=>"
# and replace with all alternatives on the RHS. These types of mappings
# ignore the expand parameter in the schema.
# Examples:
i-pod, i pod => ipod,
sea biscuit, sea biscit => seabiscuit

# Equivalent synonyms may be separated with commas and give
# no explicit mapping. In this case the mapping behavior will
# be taken from the expand parameter in the schema. This allows
# the same synonym file to be used in different synonym handling strategies.
# Examples:
ipod, i-pod, i pod
foozball , foosball
universe , cosmos
lol, laughing out loud

# If expand==true, "ipod, i-pod, i pod" is equivalent
# to the explicit mapping:
ipod, i-pod, i pod => ipod, i-pod, i pod
# If expand==false, "ipod, i-pod, i pod" is equivalent
# to the explicit mapping:
ipod, i-pod, i pod => ipod

# Multiple synonym mapping entries are merged.
foo => foo bar
foo => baz
# is equivalent to
foo => foo bar, baz
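The rule format can be summarized with a small parser sketch in Python (purely illustrative; this is not the parser Elasticsearch uses, and it ignores the expand setting):

```python
def parse_synonyms(text):
    """Parse Solr-style synonym lines into (lhs_terms, rhs_terms) pairs."""
    split = lambda s: [term.strip() for term in s.split(",") if term.strip()]
    rules = []
    for line in text.splitlines():
        line = line.strip()
        if not line or line.startswith("#"):
            continue  # blank lines and comments are ignored
        if "=>" in line:
            lhs, rhs = line.split("=>", 1)  # explicit mapping
        else:
            lhs = rhs = line  # equivalence group: maps to itself
        rules.append((split(lhs), split(rhs)))
    return rules

sample = """\
# comments and blank lines are ignored
i-pod, i pod => ipod
绝地求生, 吃鸡
"""
for lhs, rhs in parse_synonyms(sample):
    print(lhs, "=>", rhs)
```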
Test:
POST km-synonym/_analyze
{
  "analyzer": "by_max_word",
  "text": "绝地求生是个垃圾游戏"
}
Result:
{
  "tokens" : [
    { "token" : "绝地", "start_offset" : 0, "end_offset" : 2, "type" : "CN_WORD", "position" : 0 },
    { "token" : "吃", "start_offset" : 0, "end_offset" : 2, "type" : "SYNONYM", "position" : 0 },
    { "token" : "求生", "start_offset" : 2, "end_offset" : 4, "type" : "CN_WORD", "position" : 1 },
    { "token" : "鸡", "start_offset" : 2, "end_offset" : 4, "type" : "SYNONYM", "position" : 1 },
    { "token" : "是", "start_offset" : 4, "end_offset" : 5, "type" : "CN_CHAR", "position" : 2 },
    { "token" : "个", "start_offset" : 5, "end_offset" : 6, "type" : "CN_CHAR", "position" : 3 },
    { "token" : "垃圾", "start_offset" : 6, "end_offset" : 8, "type" : "CN_WORD", "position" : 4 },
    { "token" : "游戏", "start_offset" : 8, "end_offset" : 10, "type" : "CN_WORD", "position" : 5 }
  ]
}
Implementing file search
Download and install the ingest-attachment plugin:
https://artifacts.elastic.co/downloads/elasticsearch-plugins/ingest-attachment/ingest-attachment-7.2.0.zip
or
elasticsearch/bin/elasticsearch-plugin.bat install ingest-attachment
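Once the plugin is installed, the usual pattern is to define an ingest pipeline with an attachment processor, then index documents whose data field holds the file content base64-encoded. A Python sketch that only builds the two request bodies (the pipeline name and field names are illustrative; actually sending them via PUT _ingest/pipeline/attachment and PUT /files/_doc/1?pipeline=attachment requires a running cluster):

```python
import base64
import json

# Ingest pipeline body: the attachment processor reads base64 data from the
# "data" field and extracts text/metadata into an "attachment" object.
pipeline = {
    "description": "Extract text and metadata from base64-encoded files",
    "processors": [
        {"attachment": {"field": "data"}}
    ],
}

# Document body: raw file bytes go into "data", base64-encoded.
file_bytes = b"hello attachment"  # stands in for a real PDF/DOC/etc.
doc = {"data": base64.b64encode(file_bytes).decode("ascii")}

print(json.dumps(pipeline, indent=2))
print(json.dumps(doc))
```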