Elasticsearch打造全文搜索引擎(二)

一、ES的文档、索引的CRUD操作

1. elasticsearch概念

  • 集群:一个或多个节点组织在一起
  • 节点:一个节点是集群中的一个服务器,有一个名字来标识,默认是一个随机的漫画角色的名字
  • 分片:将索引划分为多份的能力,允许水平分割和扩展容量,多个分片同时响应请求,提高性能和吞吐量。
  • 副本:创建分片的一份或多份拷贝的能力,在一个节点失败时其余节点可以顶上。
elasticsearch 与 mysql 的概念对照:
elasticsearch | mysql
index(索引) | 数据库
type(类型) | 表
document(文档) | 行
fields(字段) | 列

2.常用属性和类型

 

3.内置类型

4. CRUD操作

  • 索引的初始化操作
  • 指定分片和副本的数量
  • shards一旦设置不能修改
# 索引初始化
PUT lagou
{
  "settings": {
    "index": {
      "number_of_shards": 5,   # 分片
      "number_of_replicas": 1  # 备份
    }
  }
}

GET lagou/_settings
GET _all/_settings
GET .kibana,lagou/_settings
GET _settings

# 修改settings
PUT lagou/_settings
{
  "number_of_replicas": 2
}

# 获取索引信息
GET _all
GET lagou

# 新建/保存文档
# 方式一
PUT lagou/job/1
{
  "title": "python爬虫分布式开发",
  "salary_min":15000,
  "city":"北京",
  "company":{
    "name":"百度",
    "company_addr":"北京市软件园"
  },
  "publish_date":"2019-06-15",
  "comments":15
}

# 新建文档
# 方式二
POST lagou/job/
{
  "title": "python django 开发工程师",
  "salary_min":30000,
  "city":"上海",
  "company":{
    "name":"美团科技",
    "company_addr":"北京市软件园A区"
  },
  "publish_date":"2019-06-15",
  "comments":120
}

GET lagou/job/1
GET lagou/job/1?_source=title
GET lagou/job/1?_source=title,city
GET lagou/job/1?_source

# 修改文档
# 方式一
PUT lagou/job/1
{
  "title": "python爬虫分布式开发",
  "salary_min":18000,
  "city":"广州",
  "company":{
    "name":"百度",
    "company_addr":"北京市软件园"
  },
  "publish_date":"2019-06-15",
  "comments":15
}

# 方式二:修改某一字段
POST lagou/job/1/_update
{
  "doc": {
    "comments":20
  }
}

# 删除
DELETE lagou/job/1
DELETE lagou/job
DELETE lagou

二、mget和bulk操作

# 批量操作

数据准备
POST lagou/job1/1
{
  "title": "python django 开发工程师",
  "salary_min":30000,
  "city":"上海",
  "company":{
    "name":"美团科技",
    "company_addr":"北京市软件园A区"
  },
  "publish_date":"2019-06-15",
  "comments":120
}

POST lagou/job1/2
{
  "title": "python django 开发工程师",
  "salary_min":30000,
  "city":"上海",
  "company":{
    "name":"美团科技",
    "company_addr":"北京市软件园A区"
  },
  "publish_date":"2019-06-15",
  "comments":120
}

POST lagou/job2/1
{
  "title": "python django 开发工程师",
  "salary_min":30000,
  "city":"上海",
  "company":{
    "name":"美团科技",
    "company_addr":"北京市软件园A区"
  },
  "publish_date":"2019-06-15",
  "comments":120
}

POST lagou/job2/2
{
  "title": "python django 开发工程师",
  "salary_min":30000,
  "city":"上海",
  "company":{
    "name":"美团科技",
    "company_addr":"北京市软件园A区"
  },
  "publish_date":"2019-06-15",
  "comments":120
}

mget批量获取
GET _mget
{
  "docs":[
      {"_index":"lagou",
       "_type":"job1",
       "_id":1
      },
      {"_index":"lagou",
       "_type":"job2",
       "_id":2
      }
    ]
}

GET lagou/_mget
{
  "docs":[
      {
       "_type":"job1",
       "_id":1
      },
      {
       "_type":"job2",
       "_id":2
      }
    ]
}

GET lagou/job1/_mget
{
  "docs":[
      {
       "_id":1
      },
      {
       "_id":2
      }
    ]
}

GET lagou/job1/_mget
{
  "ids":[1,2]
}

bulk增删改查

POST _bulk
{"index":{"_index":"lagou","_type":"job1","_id":"3"}}
{"title": "python django 开发工程师","salary_min":30000,"city":"上海","company":{"name":"美团科技","company_addr":"北京市软件园A区"},"publish_date":"2019-06-15","comments":120}
{"index":{"_index":"lagou","_type":"job2","_id":"3"}}
{"title": "python django 开发工程师","salary_min":30000,"city":"上海","company":{"name":"美团科技","company_addr":"北京市软件园A区"},"publish_date":"2019-06-15","comments":120}

POST _bulk
{"create":{"_index":"lagou","_type":"job1","_id":"3"}}
{"title": "python django 开发工程师","salary_min":30000,"city":"上海","company":{"name":"美团科技","company_addr":"北京市软件园A区"},"publish_date":"2019-06-15","comments":120}

POST _bulk
{"delete":{"_index":"lagou","_type":"job1","_id":"3"}}

POST _bulk
{"update":{"_index":"lagou","_type":"job1","_id":"3"}}
{"doc":{"title": "python django 开发工程师","salary_min":30000,"city":"上海","company":{"name":"美团科技","company_addr":"北京市软件园A区"},"publish_date":"2019-06-15","comments":120}}

三、mapping映射和查询

1. mapping映射

2.倒排索引

3. 倒排索引待解决的问题

4. 查询

5. 操作

# mapping操作

PUT lagou1
{
  "mappings":{
    "job":{
      "properties":{
        "title":{
          "type":"text"
        },
        "salary_min":{
          "type":"integer"
        },
        "city":{
          "type":"keyword"
        },
        "company":{
          "properties":{
            "name":{
              "type":"text"
            },
            "company_addr":{
              "type":"text"
            },
            "employee_count":{
              "type":"integer"
            }
        }
      },
      "publish_date":{
        "type":"date",
        "format":"yyyy-MM-dd"
      },
      "comments":{
        "type":"integer"
      }
    }
  }
}
}

PUT lagou1/job/1
{
  "title": "python爬虫分布式开发",
  "salary_min":15000,
  "city":"北京",
  "company":{
    "name":"百度",
    "company_addr":"北京市软件园",
    "employee_count":50
  },
  "publish_date":"2019-06-15",
  "comments":15
}

# get index mapping

GET lagou1/_mapping
GET lagou1/_mapping/job
GET _all/_mapping/job

# 查询

PUT lagou2
{
  "mappings": {
    "job":{
      "properties":{
        "title":{
          "type": "text",
          "store":true,
          "analyzer": "ik_max_word"
        },
        "company_name": {
          "type": "keyword",
          "store":true
        },
        "desc":{
          "type":"text"
        }, 
        "add_time":{
          "type":"date",
          "format":"yyyy-MM-dd"
        },
        "comments":{
          "type": "integer"
        }
      }
    }
  }
}


POST lagou2/job
{
  "title":"python django 开发工程师" ,
  "company_name":"美国科技有限公司",
  "desc":"对django的概念熟悉,熟悉python基础知识", 
  "comments":20,
  "add_time":"2017-04-01"  
}

POST lagou2/job
{
  "title":"python scrapy redis 分布式爬虫基本" ,
  "company_name":"百度科技有限公司",
  "desc":"对scrapy的概念熟悉,熟悉redis的基本操作",
  "comments":5,
  "add_time":"2017-04-15"  
} 

POST lagou2/job
{
  "title":"Elasticsearch打造搜索引擎" ,
  "company_name":"阿里巴巴科技有限公司",
  "desc":"熟悉数据结构算法,熟悉python的基本开发",
  "comments":15,
  "add_time":"2017-06-20"  
} 

POST lagou2/job
{
  "title":"python打造推荐引擎系统" ,
  "company_name":"阿里巴巴科技有限公司",
  "desc":"熟悉推荐引擎的原理以及算法、掌握C语言",
  "comments":60,
  "add_time":"2016-10-20"  
} 

# 简单查询
#查看分析器解析的结果
GET _analyze
{
  "analyzer": "ik_smart",
  "text":"Python网络开发师"
}
GET _analyze
{
  "analyzer": "ik_max_word",
  "text":"Python网络开发师"
}

#match查询 (分词查询) python 和分布式
#查询第0-2条的title和company_name字段(desc字段的store属性不是true,因此不会返回),并按comments降序排序
GET lagou2/_search
{
 "stored_fields":["title","company_name","desc"], 
  "query":{
    "match":{
      "title":"python分布式"  
    }
  },
  "from": 0,
  "size": 2,
  "sort": [
    {
      "comments": {
        "order": "desc"
      }
    }
  ]
}

#查询comments在大于等于10、小于等于20、权重2.0的数据
GET lagou2/_search
{
  "query":{  
    "range": {
      "comments": {
        "gte": 10,
        "lte": 20,
        "boost":2.0
      }
    }
  }
}
GET lagou2/_search
{
  "query":{  
    "range": {
      "add_time": {
        "gte": "2017-04-01",
        "lte": "now"
      }
    }
  }
}

#term查询(不会对查询词做分词处理、直接精确匹配,类似于keyword属性)
GET lagou2/_search
{
  "query":{
    "term":{
      "title":"python"  
    }
  }
}
#terms 和用match查django分布工程  效果一样
GET lagou2/_search
{
  "query":{
    "terms":{
      "title":["django"  ,"分布"  ,"工程"  ]
    }
  }
}

#match_all
GET lagou2/_search
{
  "query":{
    "match_all":{}
  }
}
 
#match_phrase 
#短语查询
#满足所有词 既有python也有系统,两个词之间的最大间距为6位(slop)
GET lagou2/_search
{
  "query":{
    "match_phrase": {
      "title": {
        "query": "python系统",
        "slop":6
      }
    }
  }
}

#multi_match 多字段匹配,title的权重是desc的3倍
GET lagou2/_search
{
  "query":{
    "multi_match": { 
      "query": "python系统",
      "fields":["title^3","desc"]
    }
  }
}

# sort查询
GET lagou2/_search
{
  "query": {
    "match_all": {}
  },
  "sort": [
    {
      "comments": {
        "order": "asc"
      }
    }
  ]
}

# range范围查询
GET lagou2/_search
{
  "query": { 
      "range": {
      "comments": {
        "gte": 20,
        "lte": 60,
        "boost":2.0
      }
    }
  }
}

GET lagou2/_search
{
  "query": { 
      "range": {
      "add_time": {
        "gte": "2017-06-07",
        "lte": "now"
      }
  }
}
}

#wildcard 通配符查询
GET lagou2/_search
{
  "query":{  
    "wildcard": {
      "title": {
        "value": "pyth*n",
        "boost": 2
      }
    }
  }
}

# 组合查询
#bool 查询
#用 bool 包括 must should must_not filter来完成
#格式如下
#bool:{
#  "filter":[], #不参与打分
#  "must":[],  #相当于        (salary=20 and title=Python)
#  "should":[], #相当于       (salary=20 or title=Python)
#  "must_not":[], #相当于not
#}

#建立测试数据
POST lagou/testjob/_bulk
{"index":{"_id":1}}
{"salary":10,"title":"Python"}
{"index":{"_id":2}}
{"salary":20,"title":"Scrapy"}
{"index":{"_id":3}}
{"salary":30,"title":"Django"}
{"index":{"_id":4}}
{"salary":30,"title":"Elasticsearch"}

DELETE lagou/testjob

#简单的过滤查询
#最简单的filter查询
#select * from testjob where salary=20 and title="Scrapy"(filter部分不参与打分)
GET lagou/testjob/_search
{
  "query":{
    "bool": { 
      "must": {
        "match":{
          "salary":20
        }
      }, 
      "filter":{ 
        "match":{
          "title":"Scrapy"
        }
      }
    }
  }
}
#select * from testjob
#where (salary=20 or title=Python) and salary!=30 and salary!=10
GET lagou/testjob/_search
{
  "query":{
    "bool": { 
      "should":[
          {"term":{"salary":20}},
          {"term":{"title":"python"}}
        ],
      "must_not": [
        {"term": {"salary": "30"}},
        {"term": {"salary": "10"}}
      ] 
    }
  }
}

#where (salary=30 and title="django") or title="python"
GET lagou/testjob/_search
{
  "query":{
    "bool": { 
      "should":[
          {"term":{"title":"python"}},
          {"bool": { 
            "must":[
                {"term":{"salary":30}},
                {"term":{"title":"django"}}
              ] 
          }}
        ] 
    }
  }
}

#测试数据
POST lagou/testjob2/_bulk
{"index":{"_id":1}}
{"tags":["search"]}
{"index":{"_id":2}}
{"tags":["search","python"]}
{"index":{"_id":3}}
{"other_filed":["some data"]}
{"index":{"_id":4}}
{"tags":null}
{"index":{"_id":5}}
{"tags":["search",null]}

#处理null空值的方法
#select tags from testjob2 where tags is not null
GET lagou/testjob2/_search
{
  "query": {
    "bool": {
      "filter": {
        "exists": {
          "field": "tags"
        }
      }
    }
  }
}
#select tags from testjob2 where tags is null
GET lagou/testjob2/_search
{
  "query": {
    "bool": {
      "must_not": {
        "exists": {
          "field": "tags"
        }
      }
    }
  }
} 

 

gitee地址:https://gitee.com/zhangyafeii/ArticleSpider_LcvSearch

 

posted @ 2019-06-17 19:48  DreamBoy_张亚飞  阅读(1421)  评论(0)  编辑  收藏  举报