聚合分析

  ES作为搜索引擎兼数据库,同样提供强大的聚合分析能力。

  bucket:一个数据分组,类比数据库的话,相当于 group by

      metric:对一个数据分组执行的统计,常见的数据分析的 metric 操作有 count、avg、max、min、sum 等

  ES聚合分析查询的写法

"aggregations" : {
    "<aggregation_name>" : {                            <!--聚合的名字 -->
        "<aggregation_type>" : {                        <!--聚合的类型 -->
            <aggregation_body>                          <!--聚合体:对哪些字段进行聚合 -->
        }
        [,"meta" : {  [<meta_data_body>] } ]?           <!--元数据 -->
        [,"aggregations" : { [<sub_aggregation>]+ } ]?  <!--在聚合里面再定义子聚合 -->
    }
    [,"<aggregation_name_2>" : { ... } ]*                <!--其他聚合(可以定义多个) -->
}

  aggregations 也可简写为 aggs,聚合计算的值可以取字段的值,也可是脚本计算的结果

  示例1:查询所有客户中年龄的最大值

POST /user/_search
{
  "size":1,
  "aggs": {
    "maxAge": {
      "max": {
        "field":"age"
      }
    }
  }
}

  示例2:查询年龄为26岁的用户中的收入最大值

POST /user/_search
{
  "size":2,
  "query":{
    "match":{
      "age":26
    }
  },
  "sort":[
    {
      "salary":{
        "order":"desc"
      }
    }
  ],
  "aggs": {
    "max_salary": {
      "max": {
        "field":"salary"
      }
    }
  }
}

  补充:对用户的年龄进行分组,对年龄分组再根据userId进行分组,求平均收入

POST /user/_search?size=0
{
  "aggs": {
    "group_by_age": {
      "terms": {
        "field":"age"
      },
      "aggs":{
        "avg_salary":{
          "avg":{
            "field":"salary"
          }
        },
        "group_by_userId":{
          "terms": {
            "field":"userId"
           },
           "aggs":{
             "userId_avg_salary":{
                "avg":{
                  "field":"salary"
                }
             }
           }
        }
      }
    }
  }
}

      示例3:值来源于脚本,查询所有客户的平均年龄是多少,并对平均年龄加10

POST /user/_search
{
  "aggs": {
    "avg_age":{
      "avg":{
        "script":{
          "source":"doc.age.value"
        }
      }
    },
    "avg_age10":{
    "avg":{
      "script":{
        "source":"doc.age.value+10"
      }
    }
   }
  }
}

  示例4:指定field,在脚本中用_value 取字段的值

POST /user/_search?size=0
{
  "aggs":{
    "sum_age":{
      "sum":{
        "field":"age",
        "script":{
          "source":"_value"
        }
      }
    }
  }
}

  示例5:通过 missing 为缺失该字段值的文档指定默认值。如未指定,缺失该字段值的文档将被忽略

POST /bank/_search?size=0
{
  "aggs": {
    "avg_age": {
      "avg": {
        "field": "age",
        "missing": 18
      }
    }
  }
}

  文档计数 count

Value count 统计某字段有值的文档数

POST /user/_search?size=0
{
  "aggs":{
    "age_count":{
      "value_count":{
        "field":"age"
      }
    }
  }
}

  cardinality  值去重计数

POST /forum/_search?size=0
{
  "aggs":{
    "postDate_count":{
      "cardinality":{
        "field":"postDate"
      }
    }
  }
}

  stats 统计 count max min avg sum 5个值

POST /user/_search?size=0
{
  "aggs":{
    "age_stats":{
      "stats":{
        "field":"age"
      }
    }
  }
}

  Extended stats

       高级统计,比stats多4个统计结果: 平方和、方差、标准差、平均值加/减两个标准差的区间

POST /user/_search?size=0
{
  "aggs":{
    "age_stats":{
      "extended_stats":{
        "field":"age"
      }
    }
  }
}

Terms Aggregation  根据字段值项分组聚合

POST /user/_search?size=0
{
  "aggs":{
    "age_terms":{
      "terms":{
        "field":"age"
      }
    }
  }
}
{
  "took" : 13,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 3,
      "relation" : "eq"
    },
    "max_score" : null,
    "hits" : [ ]
  },
  "aggregations" : {
    "age_terms" : {
      "doc_count_error_upper_bound" : 0,
      "sum_other_doc_count" : 0,
      "buckets" : [
        {
          "key" : 20,
          "doc_count" : 1
        },
        {
          "key" : 25,
          "doc_count" : 1
        },
        {
          "key" : 28,
          "doc_count" : 1
        }
      ]
    }
  }
}

       "doc_count_error_upper_bound": 0:文档计数的最大偏差值

       "sum_other_doc_count": 0:未返回的其他项的文档数

  默认情况下返回按文档计数从高到低的前10个分组

  order  指定分组的排序

  根据文档计数排序

POST /user/_search?size=0
{
  "aggs":{
    "age_terms":{
      "terms":{
        "field":"age",
        "order":{"_count":"asc"}
      }
    }
  }
}
{
  "took" : 3,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 3,
      "relation" : "eq"
    },
    "max_score" : null,
    "hits" : [ ]
  },
  "aggregations" : {
    "age_terms" : {
      "doc_count_error_upper_bound" : 0,
      "sum_other_doc_count" : 0,
      "buckets" : [
        {
          "key" : 20,
          "doc_count" : 1
        },
        {
          "key" : 25,
          "doc_count" : 1
        },
        {
          "key" : 28,
          "doc_count" : 1
        }
      ]
    }
  }
}

   示例6:根据分组值排序

POST /user/_search?size=0
{
  "aggs":{
    "age_terms":{
      "terms":{
        "field":"age",
        "order":{"_key":"desc"}
      }
    }
  }
}

{
  "took" : 1,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 3,
      "relation" : "eq"
    },
    "max_score" : null,
    "hits" : [ ]
  },
  "aggregations" : {
    "age_terms" : {
      "doc_count_error_upper_bound" : 0,
      "sum_other_doc_count" : 0,
      "buckets" : [
        {
          "key" : 28,
          "doc_count" : 1
        },
        {
          "key" : 25,
          "doc_count" : 1
        },
        {
          "key" : 20,
          "doc_count" : 1
        }
      ]
    }
  }
}

  示例7:根据分组内的指标值(子聚合 max_balance)排序

POST /bank/_search?size=0
{
  "aggs": {
    "age_terms": {
      "terms": {
        "field": "age",
        "order": {
          "max_balance": "asc"
        }
      },
      "aggs": {
        "max_balance": {
          "max": {
            "field": "balance"
          }
        },
        "min_balance": {
          "min": {
            "field": "balance"
          }
        }
      }
    }
  }
}

   示例8:筛选分组-正则表达式匹配值

GET /_search
{
    "aggs" : {
        "tags" : {
            "terms" : {
                "field" : "tags",
                "include" : ".*sport.*",
                "exclude" : "water_.*"
            }
        }
    }
}

  缺失值处理

GET /_search
{
    "aggs" : {
        "tags" : {
             "terms" : {
                 "field" : "tags",
                 "missing": "N/A" 
             }
         }
    }
}

  filter Aggregation  对满足过滤查询的文档进行聚合计算

  在查询命中的文档中选取符合过滤条件的文档进行聚合,先过滤再聚合

POST /user/_search?size=0
{
  "aggs":{
    "age_terms":{
      "filter":{
        "match":{
          "job":"工程师"
        }
      },
      "aggs":{
        "avg_age":{
          "avg":{
            "field":"age"
          }
        }
      }
    }
  }
}

  Range Aggregation 范围分组聚合

POST /user/_search?size=0
{
  "aggs":{
    "age_range":{
      "range":{
        "field":"age",
        "ranges":[
          {
            "to":25
          },
          {
            "from":25,
            "to":35
          },
          {
            "from":35
          }
        ]
      }
    },
    "aggs":{
        "max":{
          "field":"age"
        }
    }
  }
}

  Date Range Aggregation  时间范围分组聚合

POST /bank/_search?size=0
{
  "aggs": {
    "range": {
      "date_range": {
        "field": "date",
        "format": "MM-yyy",
        "ranges": [
          {
            "to": "now-10M/M"
          },
          {
            "from": "now-10M/M"
          }
        ]
      }
    }
  }
}

  global 关键字:global 聚合不受 query 条件的影响,会对索引中的全部文档进行聚合(下例中 query 命中 11 条文档,而 all 分组统计的是全部 20 条文档)

POST employees/_search?size=0
{
  "query":{
    "match": {
      "job": "Java Programmer"
    }
  },
  "aggs":{
    "java_avg_salary":{
      "avg":{
        "field":"salary"
      }
    },
    "all":{
      "global":{},
      "aggs":{
        "avg_salary":{
          "avg":{
            "field":"salary"
          }
        }
      }
    }
  }
}

   结果

{
  "took" : 17,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 11,
      "relation" : "eq"
    },
    "max_score" : null,
    "hits" : [ ]
  },
  "aggregations" : {
    "all" : {
      "doc_count" : 20,
      "avg_salary" : {
        "value" : 24700.0
      }
    },
    "java_avg_salary" : {
      "value" : 23272.727272727272
    }
  }
}

   平均年龄降序排列的工资分布

  histogram:接收一个 field,按照这个 field 的值以固定间隔(interval)划分出各个范围区间,进行 bucket 分组操作。

POST employees/_search?size=0
{
  "aggs":{
    "salary_list":{
      "histogram":{
        "field":"salary",
        "interval":5000,
        "order":{
          "age>avg_age":"desc"
        }
      },
      "aggs":{
        "age":{
          "filter":{
            "range":{
                "age":{
                  "gte":10
                }
              }
            },
          "aggs":{
              "avg_age":{
                "avg":{
                  "field":"age"
                }
              }
            }
          }
        }
    }
  }
}

结果:
{
  "took" : 3,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 20,
      "relation" : "eq"
    },
    "max_score" : null,
    "hits" : [ ]
  },
  "aggregations" : {
    "salary_list" : {
      "buckets" : [
        {
          "key" : 50000.0,
          "doc_count" : 1,
          "age" : {
            "doc_count" : 1,
            "avg_age" : {
              "value" : 41.0
            }
          }
        },
        {
          "key" : 35000.0,
          "doc_count" : 2,
          "age" : {
            "doc_count" : 2,
            "avg_age" : {
              "value" : 34.0
            }
          }
        },
        {
          "key" : 25000.0,
          "doc_count" : 3,
          "age" : {
            "doc_count" : 3,
            "avg_age" : {
              "value" : 32.0
            }
          }
        },
        {
          "key" : 30000.0,
          "doc_count" : 3,
          "age" : {
            "doc_count" : 3,
            "avg_age" : {
              "value" : 30.333333333333332
            }
          }
        },
        {
          "key" : 20000.0,
          "doc_count" : 6,
          "age" : {
            "doc_count" : 6,
            "avg_age" : {
              "value" : 28.333333333333332
            }
          }
        },
        {
          "key" : 15000.0,
          "doc_count" : 4,
          "age" : {
            "doc_count" : 4,
            "avg_age" : {
              "value" : 24.0
            }
          }
        },
        {
          "key" : 5000.0,
          "doc_count" : 1,
          "age" : {
            "doc_count" : 1,
            "avg_age" : {
              "value" : 20.0
            }
          }
        },
        {
          "key" : 10000.0,
          "doc_count" : 0,
          "age" : {
            "doc_count" : 0,
            "avg_age" : {
              "value" : null
            }
          }
        },
        {
          "key" : 40000.0,
          "doc_count" : 0,
          "age" : {
            "doc_count" : 0,
            "avg_age" : {
              "value" : null
            }
          }
        },
        {
          "key" : 45000.0,
          "doc_count" : 0,
          "age" : {
            "doc_count" : 0,
            "avg_age" : {
              "value" : null
            }
          }
        }
      ]
    }
  }
}

 

 

  

 

 

 

 

 

  

posted on 2020-06-14 12:25  溪水静幽  阅读(250)  评论(0)    收藏  举报