范围字段分桶的微妙之处

编辑

范围字段分桶的微妙之处

编辑

文档在其落入的每个桶中都会被计数

编辑

由于范围表示多个值,因此在范围字段上运行桶聚合可能会导致同一文档落入多个桶中。这可能会导致令人惊讶的行为,例如桶计数的总和高于匹配的文档数。例如,考虑以下索引:

resp = client.indices.create(
    index="range_index",
    settings={
        "number_of_shards": 2
    },
    mappings={
        "properties": {
            "expected_attendees": {
                "type": "integer_range"
            },
            "time_frame": {
                "type": "date_range",
                "format": "yyyy-MM-dd||epoch_millis"
            }
        }
    },
)
print(resp)

resp1 = client.index(
    index="range_index",
    id="1",
    refresh=True,
    document={
        "expected_attendees": {
            "gte": 10,
            "lte": 20
        },
        "time_frame": {
            "gte": "2019-10-28",
            "lte": "2019-11-04"
        }
    },
)
print(resp1)
response = client.indices.create(
  index: 'range_index',
  body: {
    settings: {
      number_of_shards: 2
    },
    mappings: {
      properties: {
        expected_attendees: {
          type: 'integer_range'
        },
        time_frame: {
          type: 'date_range',
          format: 'yyyy-MM-dd||epoch_millis'
        }
      }
    }
  }
)
puts response

response = client.index(
  index: 'range_index',
  id: 1,
  refresh: true,
  body: {
    expected_attendees: {
      gte: 10,
      lte: 20
    },
    time_frame: {
      gte: '2019-10-28',
      lte: '2019-11-04'
    }
  }
)
puts response
const response = await client.indices.create({
  index: "range_index",
  settings: {
    number_of_shards: 2,
  },
  mappings: {
    properties: {
      expected_attendees: {
        type: "integer_range",
      },
      time_frame: {
        type: "date_range",
        format: "yyyy-MM-dd||epoch_millis",
      },
    },
  },
});
console.log(response);

const response1 = await client.index({
  index: "range_index",
  id: 1,
  refresh: "true",
  document: {
    expected_attendees: {
      gte: 10,
      lte: 20,
    },
    time_frame: {
      gte: "2019-10-28",
      lte: "2019-11-04",
    },
  },
});
console.log(response1);
PUT range_index
{
  "settings": {
    "number_of_shards": 2
  },
  "mappings": {
    "properties": {
      "expected_attendees": {
        "type": "integer_range"
      },
      "time_frame": {
        "type": "date_range",
        "format": "yyyy-MM-dd||epoch_millis"
      }
    }
  }
}

PUT range_index/_doc/1?refresh
{
  "expected_attendees" : {
    "gte" : 10,
    "lte" : 20
  },
  "time_frame" : {
    "gte" : "2019-10-28",
    "lte" : "2019-11-04"
  }
}

以下聚合中的范围比间隔宽,因此文档将落入多个桶中。

resp = client.search(
    index="range_index",
    size="0",
    aggs={
        "range_histo": {
            "histogram": {
                "field": "expected_attendees",
                "interval": 5
            }
        }
    },
)
print(resp)
response = client.search(
  index: 'range_index',
  size: 0,
  body: {
    aggregations: {
      range_histo: {
        histogram: {
          field: 'expected_attendees',
          interval: 5
        }
      }
    }
  }
)
puts response
const response = await client.search({
  index: "range_index",
  size: 0,
  aggs: {
    range_histo: {
      histogram: {
        field: "expected_attendees",
        interval: 5,
      },
    },
  },
});
console.log(response);
POST /range_index/_search?size=0
{
  "aggs": {
    "range_histo": {
      "histogram": {
        "field": "expected_attendees",
        "interval": 5
      }
    }
  }
}

由于间隔为 5 (并且默认情况下偏移量为 0),我们预期桶为 10、15 和 20。我们的范围文档将落入所有这三个桶中。

{
  ...
  "aggregations" : {
    "range_histo" : {
      "buckets" : [
        {
          "key" : 10.0,
          "doc_count" : 1
        },
        {
          "key" : 15.0,
          "doc_count" : 1
        },
        {
          "key" : 20.0,
          "doc_count" : 1
        }
      ]
    }
  }
}

文档不能部分存在于桶中;例如,上述文档不能在上述三个桶中的每一个中计为三分之一。在本例中,由于文档的范围落入多个桶中,因此该文档的完整值也会在每个桶的任何子聚合中被计数。

查询边界不是聚合过滤器

编辑

当使用查询来过滤正在聚合的字段时,可能会出现另一种意外行为。在这种情况下,文档可能与查询匹配,但范围的一个或两个端点仍在查询之外。考虑对上述文档的以下聚合:

resp = client.search(
    index="range_index",
    size="0",
    query={
        "range": {
            "time_frame": {
                "gte": "2019-11-01",
                "format": "yyyy-MM-dd"
            }
        }
    },
    aggs={
        "november_data": {
            "date_histogram": {
                "field": "time_frame",
                "calendar_interval": "day",
                "format": "yyyy-MM-dd"
            }
        }
    },
)
print(resp)
response = client.search(
  index: 'range_index',
  size: 0,
  body: {
    query: {
      range: {
        time_frame: {
          gte: '2019-11-01',
          format: 'yyyy-MM-dd'
        }
      }
    },
    aggregations: {
      november_data: {
        date_histogram: {
          field: 'time_frame',
          calendar_interval: 'day',
          format: 'yyyy-MM-dd'
        }
      }
    }
  }
)
puts response
const response = await client.search({
  index: "range_index",
  size: 0,
  query: {
    range: {
      time_frame: {
        gte: "2019-11-01",
        format: "yyyy-MM-dd",
      },
    },
  },
  aggs: {
    november_data: {
      date_histogram: {
        field: "time_frame",
        calendar_interval: "day",
        format: "yyyy-MM-dd",
      },
    },
  },
});
console.log(response);
POST /range_index/_search?size=0
{
  "query": {
    "range": {
      "time_frame": {
        "gte": "2019-11-01",
        "format": "yyyy-MM-dd"
      }
    }
  },
  "aggs": {
    "november_data": {
      "date_histogram": {
        "field": "time_frame",
        "calendar_interval": "day",
        "format": "yyyy-MM-dd"
      }
    }
  }
}

即使查询只考虑 11 月份的日期,聚合也会生成 8 个桶(10 月份 4 个,11 月份 4 个),因为聚合是根据所有匹配文档的范围计算的。

{
  ...
  "aggregations" : {
    "november_data" : {
      "buckets" : [
              {
          "key_as_string" : "2019-10-28",
          "key" : 1572220800000,
          "doc_count" : 1
        },
        {
          "key_as_string" : "2019-10-29",
          "key" : 1572307200000,
          "doc_count" : 1
        },
        {
          "key_as_string" : "2019-10-30",
          "key" : 1572393600000,
          "doc_count" : 1
        },
        {
          "key_as_string" : "2019-10-31",
          "key" : 1572480000000,
          "doc_count" : 1
        },
        {
          "key_as_string" : "2019-11-01",
          "key" : 1572566400000,
          "doc_count" : 1
        },
        {
          "key_as_string" : "2019-11-02",
          "key" : 1572652800000,
          "doc_count" : 1
        },
        {
          "key_as_string" : "2019-11-03",
          "key" : 1572739200000,
          "doc_count" : 1
        },
        {
          "key_as_string" : "2019-11-04",
          "key" : 1572825600000,
          "doc_count" : 1
        }
      ]
    }
  }
}

根据用例,CONTAINS 查询可以将文档限制为仅那些完全在查询范围内的文档。在本例中,不会包含该文档,并且聚合将为空。在文档应该被计数但可以安全地忽略超出范围的数据的情况下,在聚合之后过滤桶也是一个选项。