范围字段分桶的细微之处
编辑范围字段分桶的细微之处
文档计入其所属的每个桶
由于范围表示多个值,因此对范围字段运行桶聚合可能会导致同一文档落入多个桶中。这可能导致意外行为,例如桶计数之和高于匹配文档的数量。例如,考虑以下索引:
resp = client.indices.create( index="range_index", settings={ "number_of_shards": 2 }, mappings={ "properties": { "expected_attendees": { "type": "integer_range" }, "time_frame": { "type": "date_range", "format": "yyyy-MM-dd||epoch_millis" } } }, ) print(resp) resp1 = client.index( index="range_index", id="1", refresh=True, document={ "expected_attendees": { "gte": 10, "lte": 20 }, "time_frame": { "gte": "2019-10-28", "lte": "2019-11-04" } }, ) print(resp1)
response = client.indices.create( index: 'range_index', body: { settings: { number_of_shards: 2 }, mappings: { properties: { expected_attendees: { type: 'integer_range' }, time_frame: { type: 'date_range', format: 'yyyy-MM-dd||epoch_millis' } } } } ) puts response response = client.index( index: 'range_index', id: 1, refresh: true, body: { expected_attendees: { gte: 10, lte: 20 }, time_frame: { gte: '2019-10-28', lte: '2019-11-04' } } ) puts response
const response = await client.indices.create({ index: "range_index", settings: { number_of_shards: 2, }, mappings: { properties: { expected_attendees: { type: "integer_range", }, time_frame: { type: "date_range", format: "yyyy-MM-dd||epoch_millis", }, }, }, }); console.log(response); const response1 = await client.index({ index: "range_index", id: 1, refresh: "true", document: { expected_attendees: { gte: 10, lte: 20, }, time_frame: { gte: "2019-10-28", lte: "2019-11-04", }, }, }); console.log(response1);
PUT range_index { "settings": { "number_of_shards": 2 }, "mappings": { "properties": { "expected_attendees": { "type": "integer_range" }, "time_frame": { "type": "date_range", "format": "yyyy-MM-dd||epoch_millis" } } } } PUT range_index/_doc/1?refresh { "expected_attendees" : { "gte" : 10, "lte" : 20 }, "time_frame" : { "gte" : "2019-10-28", "lte" : "2019-11-04" } }
在以下聚合中,范围大于区间,因此文档将落入多个桶中。
resp = client.search( index="range_index", size="0", aggs={ "range_histo": { "histogram": { "field": "expected_attendees", "interval": 5 } } }, ) print(resp)
response = client.search( index: 'range_index', size: 0, body: { aggregations: { range_histo: { histogram: { field: 'expected_attendees', interval: 5 } } } } ) puts response
const response = await client.search({ index: "range_index", size: 0, aggs: { range_histo: { histogram: { field: "expected_attendees", interval: 5, }, }, }, }); console.log(response);
POST /range_index/_search?size=0 { "aggs": { "range_histo": { "histogram": { "field": "expected_attendees", "interval": 5 } } } }
由于区间为 5(默认情况下偏移量为 0),我们预期桶为 10、15 和 20。我们的范围文档将落入所有这三个桶中。
{ ... "aggregations" : { "range_histo" : { "buckets" : [ { "key" : 10.0, "doc_count" : 1 }, { "key" : 15.0, "doc_count" : 1 }, { "key" : 20.0, "doc_count" : 1 } ] } } }
文档不能部分存在于桶中;例如,上述文档不能在上述三个桶中的每一个中各计数三分之一。在本例中,由于文档的范围落入多个桶中,因此该文档的完整值也将计入每个桶的任何子聚合中。
查询边界不是聚合过滤器
当使用查询过滤要聚合的字段时,可能会出现另一种意外行为。在这种情况下,文档可能匹配查询,但其范围的端点之一或两者都可能在查询之外。考虑对上述文档进行以下聚合:
resp = client.search( index="range_index", size="0", query={ "range": { "time_frame": { "gte": "2019-11-01", "format": "yyyy-MM-dd" } } }, aggs={ "november_data": { "date_histogram": { "field": "time_frame", "calendar_interval": "day", "format": "yyyy-MM-dd" } } }, ) print(resp)
response = client.search( index: 'range_index', size: 0, body: { query: { range: { time_frame: { gte: '2019-11-01', format: 'yyyy-MM-dd' } } }, aggregations: { november_data: { date_histogram: { field: 'time_frame', calendar_interval: 'day', format: 'yyyy-MM-dd' } } } } ) puts response
const response = await client.search({ index: "range_index", size: 0, query: { range: { time_frame: { gte: "2019-11-01", format: "yyyy-MM-dd", }, }, }, aggs: { november_data: { date_histogram: { field: "time_frame", calendar_interval: "day", format: "yyyy-MM-dd", }, }, }, }); console.log(response);
POST /range_index/_search?size=0 { "query": { "range": { "time_frame": { "gte": "2019-11-01", "format": "yyyy-MM-dd" } } }, "aggs": { "november_data": { "date_histogram": { "field": "time_frame", "calendar_interval": "day", "format": "yyyy-MM-dd" } } } }
即使查询只考虑11月份的天数,聚合也会生成8个桶(10月份4个,11月份4个),因为聚合是根据所有匹配文档的范围计算的。
{ ... "aggregations" : { "november_data" : { "buckets" : [ { "key_as_string" : "2019-10-28", "key" : 1572220800000, "doc_count" : 1 }, { "key_as_string" : "2019-10-29", "key" : 1572307200000, "doc_count" : 1 }, { "key_as_string" : "2019-10-30", "key" : 1572393600000, "doc_count" : 1 }, { "key_as_string" : "2019-10-31", "key" : 1572480000000, "doc_count" : 1 }, { "key_as_string" : "2019-11-01", "key" : 1572566400000, "doc_count" : 1 }, { "key_as_string" : "2019-11-02", "key" : 1572652800000, "doc_count" : 1 }, { "key_as_string" : "2019-11-03", "key" : 1572739200000, "doc_count" : 1 }, { "key_as_string" : "2019-11-04", "key" : 1572825600000, "doc_count" : 1 } ] } } }
根据用例,CONTAINS 查询可以将文档限制为仅完全落入查询范围内的那些文档。在本例中,该文档不会被包含,聚合将为空。聚合后过滤桶也是一种选择,用于文档应被计数但可以安全忽略超出范围的数据的用例。