范围字段分桶的微妙之处
范围字段分桶的微妙之处
文档在其落入的每个桶中都会被计数
由于范围表示多个值,因此在范围字段上运行桶聚合可能会导致同一文档落入多个桶中。这可能会导致令人惊讶的行为,例如桶计数的总和高于匹配的文档数。例如,考虑以下索引:
resp = client.indices.create( index="range_index", settings={ "number_of_shards": 2 }, mappings={ "properties": { "expected_attendees": { "type": "integer_range" }, "time_frame": { "type": "date_range", "format": "yyyy-MM-dd||epoch_millis" } } }, ) print(resp) resp1 = client.index( index="range_index", id="1", refresh=True, document={ "expected_attendees": { "gte": 10, "lte": 20 }, "time_frame": { "gte": "2019-10-28", "lte": "2019-11-04" } }, ) print(resp1)
response = client.indices.create( index: 'range_index', body: { settings: { number_of_shards: 2 }, mappings: { properties: { expected_attendees: { type: 'integer_range' }, time_frame: { type: 'date_range', format: 'yyyy-MM-dd||epoch_millis' } } } } ) puts response response = client.index( index: 'range_index', id: 1, refresh: true, body: { expected_attendees: { gte: 10, lte: 20 }, time_frame: { gte: '2019-10-28', lte: '2019-11-04' } } ) puts response
const response = await client.indices.create({ index: "range_index", settings: { number_of_shards: 2, }, mappings: { properties: { expected_attendees: { type: "integer_range", }, time_frame: { type: "date_range", format: "yyyy-MM-dd||epoch_millis", }, }, }, }); console.log(response); const response1 = await client.index({ index: "range_index", id: 1, refresh: "true", document: { expected_attendees: { gte: 10, lte: 20, }, time_frame: { gte: "2019-10-28", lte: "2019-11-04", }, }, }); console.log(response1);
PUT range_index { "settings": { "number_of_shards": 2 }, "mappings": { "properties": { "expected_attendees": { "type": "integer_range" }, "time_frame": { "type": "date_range", "format": "yyyy-MM-dd||epoch_millis" } } } } PUT range_index/_doc/1?refresh { "expected_attendees" : { "gte" : 10, "lte" : 20 }, "time_frame" : { "gte" : "2019-10-28", "lte" : "2019-11-04" } }
以下聚合中的范围比间隔宽,因此文档将落入多个桶中。
resp = client.search( index="range_index", size="0", aggs={ "range_histo": { "histogram": { "field": "expected_attendees", "interval": 5 } } }, ) print(resp)
response = client.search( index: 'range_index', size: 0, body: { aggregations: { range_histo: { histogram: { field: 'expected_attendees', interval: 5 } } } } ) puts response
const response = await client.search({ index: "range_index", size: 0, aggs: { range_histo: { histogram: { field: "expected_attendees", interval: 5, }, }, }, }); console.log(response);
POST /range_index/_search?size=0 { "aggs": { "range_histo": { "histogram": { "field": "expected_attendees", "interval": 5 } } } }
由于间隔为 5(并且默认情况下偏移量为 0),我们预期桶为 10、15 和 20。我们的范围文档将落入所有这三个桶中。
{ ... "aggregations" : { "range_histo" : { "buckets" : [ { "key" : 10.0, "doc_count" : 1 }, { "key" : 15.0, "doc_count" : 1 }, { "key" : 20.0, "doc_count" : 1 } ] } } }
文档不能部分存在于桶中;例如,上述文档不能在上述三个桶中的每一个中计为三分之一。在本例中,由于文档的范围落入多个桶中,因此该文档的完整值也会在每个桶的任何子聚合中被计数。
查询边界不是聚合过滤器
当使用查询来过滤正在聚合的字段时,可能会出现另一种意外行为。在这种情况下,文档可能与查询匹配,但范围的一个或两个端点仍在查询之外。考虑对上述文档的以下聚合:
resp = client.search( index="range_index", size="0", query={ "range": { "time_frame": { "gte": "2019-11-01", "format": "yyyy-MM-dd" } } }, aggs={ "november_data": { "date_histogram": { "field": "time_frame", "calendar_interval": "day", "format": "yyyy-MM-dd" } } }, ) print(resp)
response = client.search( index: 'range_index', size: 0, body: { query: { range: { time_frame: { gte: '2019-11-01', format: 'yyyy-MM-dd' } } }, aggregations: { november_data: { date_histogram: { field: 'time_frame', calendar_interval: 'day', format: 'yyyy-MM-dd' } } } } ) puts response
const response = await client.search({ index: "range_index", size: 0, query: { range: { time_frame: { gte: "2019-11-01", format: "yyyy-MM-dd", }, }, }, aggs: { november_data: { date_histogram: { field: "time_frame", calendar_interval: "day", format: "yyyy-MM-dd", }, }, }, }); console.log(response);
POST /range_index/_search?size=0 { "query": { "range": { "time_frame": { "gte": "2019-11-01", "format": "yyyy-MM-dd" } } }, "aggs": { "november_data": { "date_histogram": { "field": "time_frame", "calendar_interval": "day", "format": "yyyy-MM-dd" } } } }
即使查询只考虑 11 月份的日期,聚合也会生成 8 个桶(10 月份 4 个,11 月份 4 个),因为聚合是根据所有匹配文档的范围计算的。
{ ... "aggregations" : { "november_data" : { "buckets" : [ { "key_as_string" : "2019-10-28", "key" : 1572220800000, "doc_count" : 1 }, { "key_as_string" : "2019-10-29", "key" : 1572307200000, "doc_count" : 1 }, { "key_as_string" : "2019-10-30", "key" : 1572393600000, "doc_count" : 1 }, { "key_as_string" : "2019-10-31", "key" : 1572480000000, "doc_count" : 1 }, { "key_as_string" : "2019-11-01", "key" : 1572566400000, "doc_count" : 1 }, { "key_as_string" : "2019-11-02", "key" : 1572652800000, "doc_count" : 1 }, { "key_as_string" : "2019-11-03", "key" : 1572739200000, "doc_count" : 1 }, { "key_as_string" : "2019-11-04", "key" : 1572825600000, "doc_count" : 1 } ] } } }
根据用例,CONTAINS 查询可以将文档限制为仅那些完全在查询范围内的文档。在本例中,不会包含该文档,并且聚合将为空。在文档应该被计数但可以安全地忽略超出范围的数据的情况下,在聚合之后过滤桶也是一个选项。