采样聚合
编辑采样聚合
一种过滤聚合,用于将任何子聚合的处理限制为得分最高的文档样本。
用例示例
- 将分析重点缩小到高度相关的匹配项,而不是可能非常长的低质量匹配项“长尾”。
- 降低那些仅使用样本即可产生有用结果的聚合(例如 significant_terms)的运行成本。
示例
在 StackOverflow 数据上对流行术语 javascript 或较罕见的术语 kibana 进行查询将匹配许多文档,其中大部分文档并不包含 Kibana 这个词。为了将 significant_terms 聚合的重点放在更可能匹配查询中最有趣部分的得分最高的文档上,我们使用 sampler 聚合对文档进行采样。
resp = client.search( index="stackoverflow", size="0", query={ "query_string": { "query": "tags:kibana OR tags:javascript" } }, aggs={ "sample": { "sampler": { "shard_size": 200 }, "aggs": { "keywords": { "significant_terms": { "field": "tags", "exclude": [ "kibana", "javascript" ] } } } } }, ) print(resp)
response = client.search( index: 'stackoverflow', size: 0, body: { query: { query_string: { query: 'tags:kibana OR tags:javascript' } }, aggregations: { sample: { sampler: { shard_size: 200 }, aggregations: { keywords: { significant_terms: { field: 'tags', exclude: [ 'kibana', 'javascript' ] } } } } } } ) puts response
const response = await client.search({ index: "stackoverflow", size: 0, query: { query_string: { query: "tags:kibana OR tags:javascript", }, }, aggs: { sample: { sampler: { shard_size: 200, }, aggs: { keywords: { significant_terms: { field: "tags", exclude: ["kibana", "javascript"], }, }, }, }, }, }); console.log(response);
POST /stackoverflow/_search?size=0 { "query": { "query_string": { "query": "tags:kibana OR tags:javascript" } }, "aggs": { "sample": { "sampler": { "shard_size": 200 }, "aggs": { "keywords": { "significant_terms": { "field": "tags", "exclude": [ "kibana", "javascript" ] } } } } } }
响应
{ ... "aggregations": { "sample": { "doc_count": 200, "keywords": { "doc_count": 200, "bg_count": 650, "buckets": [ { "key": "elasticsearch", "doc_count": 150, "score": 1.078125, "bg_count": 200 }, { "key": "logstash", "doc_count": 50, "score": 0.5625, "bg_count": 50 } ] } } } }
如果没有 sampler 聚合,请求查询将考虑所有低质量匹配项的完整“长尾”,因此会识别出不太重要的术语,例如 jquery 和 angular,而不是关注更具洞察力的 Kibana 相关术语。
resp = client.search( index="stackoverflow", size="0", query={ "query_string": { "query": "tags:kibana OR tags:javascript" } }, aggs={ "low_quality_keywords": { "significant_terms": { "field": "tags", "size": 3, "exclude": [ "kibana", "javascript" ] } } }, ) print(resp)
response = client.search( index: 'stackoverflow', size: 0, body: { query: { query_string: { query: 'tags:kibana OR tags:javascript' } }, aggregations: { low_quality_keywords: { significant_terms: { field: 'tags', size: 3, exclude: [ 'kibana', 'javascript' ] } } } } ) puts response
const response = await client.search({ index: "stackoverflow", size: 0, query: { query_string: { query: "tags:kibana OR tags:javascript", }, }, aggs: { low_quality_keywords: { significant_terms: { field: "tags", size: 3, exclude: ["kibana", "javascript"], }, }, }, }); console.log(response);
POST /stackoverflow/_search?size=0 { "query": { "query_string": { "query": "tags:kibana OR tags:javascript" } }, "aggs": { "low_quality_keywords": { "significant_terms": { "field": "tags", "size": 3, "exclude": [ "kibana", "javascript" ] } } } }
响应
{ ... "aggregations": { "low_quality_keywords": { "doc_count": 600, "bg_count": 650, "buckets": [ { "key": "angular", "doc_count": 200, "score": 0.02777, "bg_count": 200 }, { "key": "jquery", "doc_count": 200, "score": 0.02777, "bg_count": 200 }, { "key": "logstash", "doc_count": 50, "score": 0.0069, "bg_count": 50 } ] } } }
shard_size
shard_size 参数限制了在每个分片上处理的样本中收集多少个得分最高的文档。默认值为 100。