采样器聚合
一种过滤聚合,用于将所有子聚合的处理范围限制在得分最高的文档样本内。
示例用例
- 将分析的重点集中在高相关性的匹配上,而不是可能非常长的低质量匹配“长尾”。
- 降低那些仅凭样本即可产生有用结果的聚合(例如 significant_terms)的运行成本。
示例
在 StackOverflow 数据上查询常用术语 javascript 或较少见的术语 kibana 将匹配许多文档——其中大多数并不包含 Kibana 一词。为了让 significant_terms 聚合集中在更有可能匹配查询中最有价值部分的高分文档上,我们使用一个样本。
resp = client.search( index="stackoverflow", size="0", query={ "query_string": { "query": "tags:kibana OR tags:javascript" } }, aggs={ "sample": { "sampler": { "shard_size": 200 }, "aggs": { "keywords": { "significant_terms": { "field": "tags", "exclude": [ "kibana", "javascript" ] } } } } }, ) print(resp)
response = client.search( index: 'stackoverflow', size: 0, body: { query: { query_string: { query: 'tags:kibana OR tags:javascript' } }, aggregations: { sample: { sampler: { shard_size: 200 }, aggregations: { keywords: { significant_terms: { field: 'tags', exclude: [ 'kibana', 'javascript' ] } } } } } } ) puts response
const response = await client.search({ index: "stackoverflow", size: 0, query: { query_string: { query: "tags:kibana OR tags:javascript", }, }, aggs: { sample: { sampler: { shard_size: 200, }, aggs: { keywords: { significant_terms: { field: "tags", exclude: ["kibana", "javascript"], }, }, }, }, }, }); console.log(response);
POST /stackoverflow/_search?size=0 { "query": { "query_string": { "query": "tags:kibana OR tags:javascript" } }, "aggs": { "sample": { "sampler": { "shard_size": 200 }, "aggs": { "keywords": { "significant_terms": { "field": "tags", "exclude": [ "kibana", "javascript" ] } } } } } }
响应
{ ... "aggregations": { "sample": { "doc_count": 200, "keywords": { "doc_count": 200, "bg_count": 650, "buckets": [ { "key": "elasticsearch", "doc_count": 150, "score": 1.078125, "bg_count": 200 }, { "key": "logstash", "doc_count": 50, "score": 0.5625, "bg_count": 50 } ] } } } }
如果没有 sampler 聚合,请求查询会考虑低质量匹配的完整“长尾”,因此会识别出不太重要的术语(例如 jquery 和 angular),而不是关注更具洞察力的 Kibana 相关术语。
resp = client.search( index="stackoverflow", size="0", query={ "query_string": { "query": "tags:kibana OR tags:javascript" } }, aggs={ "low_quality_keywords": { "significant_terms": { "field": "tags", "size": 3, "exclude": [ "kibana", "javascript" ] } } }, ) print(resp)
response = client.search( index: 'stackoverflow', size: 0, body: { query: { query_string: { query: 'tags:kibana OR tags:javascript' } }, aggregations: { low_quality_keywords: { significant_terms: { field: 'tags', size: 3, exclude: [ 'kibana', 'javascript' ] } } } } ) puts response
const response = await client.search({ index: "stackoverflow", size: 0, query: { query_string: { query: "tags:kibana OR tags:javascript", }, }, aggs: { low_quality_keywords: { significant_terms: { field: "tags", size: 3, exclude: ["kibana", "javascript"], }, }, }, }); console.log(response);
POST /stackoverflow/_search?size=0 { "query": { "query_string": { "query": "tags:kibana OR tags:javascript" } }, "aggs": { "low_quality_keywords": { "significant_terms": { "field": "tags", "size": 3, "exclude": [ "kibana", "javascript" ] } } } }
响应
{ ... "aggregations": { "low_quality_keywords": { "doc_count": 600, "bg_count": 650, "buckets": [ { "key": "angular", "doc_count": 200, "score": 0.02777, "bg_count": 200 }, { "key": "jquery", "doc_count": 200, "score": 0.02777, "bg_count": 200 }, { "key": "logstash", "doc_count": 50, "score": 0.0069, "bg_count": 50 } ] } } }
shard_size
shard_size 参数限制在每个分片上处理的样本中收集多少个得分最高的文档。默认值为 100。