过滤搜索结果

编辑

你可以使用两种方法来过滤搜索结果

  • 使用带有 filter 子句的布尔查询。搜索请求将布尔过滤器同时应用于搜索命中和聚合。
  • 使用搜索 API 的 post_filter 参数。搜索请求仅将后置过滤器应用于搜索命中,而不应用于聚合。你可以使用后置过滤器来基于更广泛的结果集计算聚合,然后进一步缩小结果范围。

    你还可以在后置过滤器之后重计分命中结果,以提高相关性并重新排序结果。

后置过滤器

编辑

当你使用 post_filter 参数来过滤搜索结果时,搜索命中结果会在聚合计算之后被过滤。后置过滤器对聚合结果没有影响。

例如,你正在销售具有以下属性的衬衫

resp = client.indices.create(
    index="shirts",
    mappings={
        "properties": {
            "brand": {
                "type": "keyword"
            },
            "color": {
                "type": "keyword"
            },
            "model": {
                "type": "keyword"
            }
        }
    },
)
print(resp)

resp1 = client.index(
    index="shirts",
    id="1",
    refresh=True,
    document={
        "brand": "gucci",
        "color": "red",
        "model": "slim"
    },
)
print(resp1)
response = client.indices.create(
  index: 'shirts',
  body: {
    mappings: {
      properties: {
        brand: {
          type: 'keyword'
        },
        color: {
          type: 'keyword'
        },
        model: {
          type: 'keyword'
        }
      }
    }
  }
)
puts response

response = client.index(
  index: 'shirts',
  id: 1,
  refresh: true,
  body: {
    brand: 'gucci',
    color: 'red',
    model: 'slim'
  }
)
puts response
const response = await client.indices.create({
  index: "shirts",
  mappings: {
    properties: {
      brand: {
        type: "keyword",
      },
      color: {
        type: "keyword",
      },
      model: {
        type: "keyword",
      },
    },
  },
});
console.log(response);

const response1 = await client.index({
  index: "shirts",
  id: 1,
  refresh: "true",
  document: {
    brand: "gucci",
    color: "red",
    model: "slim",
  },
});
console.log(response1);
PUT /shirts
{
  "mappings": {
    "properties": {
      "brand": { "type": "keyword"},
      "color": { "type": "keyword"},
      "model": { "type": "keyword"}
    }
  }
}

PUT /shirts/_doc/1?refresh
{
  "brand": "gucci",
  "color": "red",
  "model": "slim"
}

假设用户指定了两个过滤器:color:red 和 brand:gucci。你只想在搜索结果中向他们展示 Gucci 制造的红色衬衫。通常,你可以使用 bool 查询来完成此操作:

resp = client.search(
    index="shirts",
    query={
        "bool": {
            "filter": [
                {
                    "term": {
                        "color": "red"
                    }
                },
                {
                    "term": {
                        "brand": "gucci"
                    }
                }
            ]
        }
    },
)
print(resp)
response = client.search(
  index: 'shirts',
  body: {
    query: {
      bool: {
        filter: [
          {
            term: {
              color: 'red'
            }
          },
          {
            term: {
              brand: 'gucci'
            }
          }
        ]
      }
    }
  }
)
puts response
const response = await client.search({
  index: "shirts",
  query: {
    bool: {
      filter: [
        {
          term: {
            color: "red",
          },
        },
        {
          term: {
            brand: "gucci",
          },
        },
      ],
    },
  },
});
console.log(response);
GET /shirts/_search
{
  "query": {
    "bool": {
      "filter": [
        { "term": { "color": "red"   }},
        { "term": { "brand": "gucci" }}
      ]
    }
  }
}

但是,你也希望使用分面导航来显示用户可以点击的其他选项的列表。也许你有一个 model 字段,可以让用户将他们的搜索结果限制为红色的 Gucci T 恤(t-shirts)或正装衬衫(dress-shirts)。

这可以通过terms 聚合来完成

resp = client.search(
    index="shirts",
    query={
        "bool": {
            "filter": [
                {
                    "term": {
                        "color": "red"
                    }
                },
                {
                    "term": {
                        "brand": "gucci"
                    }
                }
            ]
        }
    },
    aggs={
        "models": {
            "terms": {
                "field": "model"
            }
        }
    },
)
print(resp)
response = client.search(
  index: 'shirts',
  body: {
    query: {
      bool: {
        filter: [
          {
            term: {
              color: 'red'
            }
          },
          {
            term: {
              brand: 'gucci'
            }
          }
        ]
      }
    },
    aggregations: {
      models: {
        terms: {
          field: 'model'
        }
      }
    }
  }
)
puts response
const response = await client.search({
  index: "shirts",
  query: {
    bool: {
      filter: [
        {
          term: {
            color: "red",
          },
        },
        {
          term: {
            brand: "gucci",
          },
        },
      ],
    },
  },
  aggs: {
    models: {
      terms: {
        field: "model",
      },
    },
  },
});
console.log(response);
GET /shirts/_search
{
  "query": {
    "bool": {
      "filter": [
        { "term": { "color": "red"   }},
        { "term": { "brand": "gucci" }}
      ]
    }
  },
  "aggs": {
    "models": {
      "terms": { "field": "model" } 
    }
  }
}

返回 Gucci 红色衬衫最受欢迎的款式。

但是,你可能还想告诉用户,有多少 Gucci 衬衫在其他颜色中可用。如果你只是在 color 字段上添加一个 terms 聚合,你只会得到 red 颜色,因为你的查询只返回 Gucci 的红色衬衫。

相反,你希望在聚合期间包含所有颜色的衬衫,然后仅将 colors 过滤器应用于搜索结果。这就是 post_filter 的目的

resp = client.search(
    index="shirts",
    query={
        "bool": {
            "filter": {
                "term": {
                    "brand": "gucci"
                }
            }
        }
    },
    aggs={
        "colors": {
            "terms": {
                "field": "color"
            }
        },
        "color_red": {
            "filter": {
                "term": {
                    "color": "red"
                }
            },
            "aggs": {
                "models": {
                    "terms": {
                        "field": "model"
                    }
                }
            }
        }
    },
    post_filter={
        "term": {
            "color": "red"
        }
    },
)
print(resp)
response = client.search(
  index: 'shirts',
  body: {
    query: {
      bool: {
        filter: {
          term: {
            brand: 'gucci'
          }
        }
      }
    },
    aggregations: {
      colors: {
        terms: {
          field: 'color'
        }
      },
      color_red: {
        filter: {
          term: {
            color: 'red'
          }
        },
        aggregations: {
          models: {
            terms: {
              field: 'model'
            }
          }
        }
      }
    },
    post_filter: {
      term: {
        color: 'red'
      }
    }
  }
)
puts response
const response = await client.search({
  index: "shirts",
  query: {
    bool: {
      filter: {
        term: {
          brand: "gucci",
        },
      },
    },
  },
  aggs: {
    colors: {
      terms: {
        field: "color",
      },
    },
    color_red: {
      filter: {
        term: {
          color: "red",
        },
      },
      aggs: {
        models: {
          terms: {
            field: "model",
          },
        },
      },
    },
  },
  post_filter: {
    term: {
      color: "red",
    },
  },
});
console.log(response);
GET /shirts/_search
{
  "query": {
    "bool": {
      "filter": {
        "term": { "brand": "gucci" } 
      }
    }
  },
  "aggs": {
    "colors": {
      "terms": { "field": "color" } 
    },
    "color_red": {
      "filter": {
        "term": { "color": "red" } 
      },
      "aggs": {
        "models": {
          "terms": { "field": "model" } 
        }
      }
    }
  },
  "post_filter": { 
    "term": { "color": "red" }
  }
}

现在,主查询会查找所有 Gucci 衬衫,而不管颜色如何。

colors 聚合返回 Gucci 衬衫的流行颜色。

color_red 聚合将 models 子聚合限制为红色 Gucci 衬衫。

最后,post_filter 从搜索 hits 中删除红色以外的颜色。

重计分过滤后的搜索结果

编辑

重计分可以帮助提高精度:它只使用次要的(通常更昂贵的)算法对 query 和 post_filter 阶段返回的顶部文档(例如前 100 - 500 个)重新排序,而不是将该昂贵的算法应用于索引中的所有文档。

在每个分片将其结果返回以由处理整个搜索请求的节点进行排序之前,会在每个分片上执行 rescore 请求。

当前,重计分 API 只有一个实现:查询重计分器,它使用查询来调整评分。将来,可能会提供其他重计分器,例如,成对重计分器。

如果在 rescore 查询的同时提供了显式的 sort(按 _score 降序排列除外),则会引发错误。

在向用户公开分页时,当用户逐页浏览(即传递不同的 from 值)时,你不应该更改 window_size,因为这会改变顶部命中,导致结果在用户翻页过程中令人困惑地发生变化。

查询重计分器
编辑

查询重计分器仅在由 query 和 post_filter 阶段返回的 Top-K 结果上执行第二次查询。每个分片上将检查的文档数量可以通过 window_size 参数控制,该参数默认为 10。

默认情况下,原始查询和重计分查询的分数会线性组合,从而为每个文档生成最终的 _score。原始查询和重计分查询的相对重要性可以分别通过 query_weight 和 rescore_query_weight 来控制。两者都默认为 1。

例如

resp = client.search(
    query={
        "match": {
            "message": {
                "operator": "or",
                "query": "the quick brown"
            }
        }
    },
    rescore={
        "window_size": 50,
        "query": {
            "rescore_query": {
                "match_phrase": {
                    "message": {
                        "query": "the quick brown",
                        "slop": 2
                    }
                }
            },
            "query_weight": 0.7,
            "rescore_query_weight": 1.2
        }
    },
)
print(resp)
response = client.search(
  body: {
    query: {
      match: {
        message: {
          operator: 'or',
          query: 'the quick brown'
        }
      }
    },
    rescore: {
      window_size: 50,
      query: {
        rescore_query: {
          match_phrase: {
            message: {
              query: 'the quick brown',
              slop: 2
            }
          }
        },
        query_weight: 0.7,
        rescore_query_weight: 1.2
      }
    }
  }
)
puts response
const response = await client.search({
  query: {
    match: {
      message: {
        operator: "or",
        query: "the quick brown",
      },
    },
  },
  rescore: {
    window_size: 50,
    query: {
      rescore_query: {
        match_phrase: {
          message: {
            query: "the quick brown",
            slop: 2,
          },
        },
      },
      query_weight: 0.7,
      rescore_query_weight: 1.2,
    },
  },
});
console.log(response);
POST /_search
{
   "query" : {
      "match" : {
         "message" : {
            "operator" : "or",
            "query" : "the quick brown"
         }
      }
   },
   "rescore" : {
      "window_size" : 50,
      "query" : {
         "rescore_query" : {
            "match_phrase" : {
               "message" : {
                  "query" : "the quick brown",
                  "slop" : 2
               }
            }
         },
         "query_weight" : 0.7,
         "rescore_query_weight" : 1.2
      }
   }
}

分数的组合方式可以使用 score_mode 来控制

评分模式 描述

total

将原始分数与重计分查询分数相加。默认值。

multiply

将原始分数乘以重计分查询分数。对于函数查询重计分很有用。

avg

取原始分数和重计分查询分数的平均值。

max

取原始分数和重计分查询分数的最大值。

min

取原始分数和重计分查询分数的最小值。

多次重计分
编辑

也可以按顺序执行多次重计分

resp = client.search(
    query={
        "match": {
            "message": {
                "operator": "or",
                "query": "the quick brown"
            }
        }
    },
    rescore=[
        {
            "window_size": 100,
            "query": {
                "rescore_query": {
                    "match_phrase": {
                        "message": {
                            "query": "the quick brown",
                            "slop": 2
                        }
                    }
                },
                "query_weight": 0.7,
                "rescore_query_weight": 1.2
            }
        },
        {
            "window_size": 10,
            "query": {
                "score_mode": "multiply",
                "rescore_query": {
                    "function_score": {
                        "script_score": {
                            "script": {
                                "source": "Math.log10(doc.count.value + 2)"
                            }
                        }
                    }
                }
            }
        }
    ],
)
print(resp)
response = client.search(
  body: {
    query: {
      match: {
        message: {
          operator: 'or',
          query: 'the quick brown'
        }
      }
    },
    rescore: [
      {
        window_size: 100,
        query: {
          rescore_query: {
            match_phrase: {
              message: {
                query: 'the quick brown',
                slop: 2
              }
            }
          },
          query_weight: 0.7,
          rescore_query_weight: 1.2
        }
      },
      {
        window_size: 10,
        query: {
          score_mode: 'multiply',
          rescore_query: {
            function_score: {
              script_score: {
                script: {
                  source: 'Math.log10(doc.count.value + 2)'
                }
              }
            }
          }
        }
      }
    ]
  }
)
puts response
const response = await client.search({
  query: {
    match: {
      message: {
        operator: "or",
        query: "the quick brown",
      },
    },
  },
  rescore: [
    {
      window_size: 100,
      query: {
        rescore_query: {
          match_phrase: {
            message: {
              query: "the quick brown",
              slop: 2,
            },
          },
        },
        query_weight: 0.7,
        rescore_query_weight: 1.2,
      },
    },
    {
      window_size: 10,
      query: {
        score_mode: "multiply",
        rescore_query: {
          function_score: {
            script_score: {
              script: {
                source: "Math.log10(doc.count.value + 2)",
              },
            },
          },
        },
      },
    },
  ],
});
console.log(response);
POST /_search
{
   "query" : {
      "match" : {
         "message" : {
            "operator" : "or",
            "query" : "the quick brown"
         }
      }
   },
   "rescore" : [ {
      "window_size" : 100,
      "query" : {
         "rescore_query" : {
            "match_phrase" : {
               "message" : {
                  "query" : "the quick brown",
                  "slop" : 2
               }
            }
         },
         "query_weight" : 0.7,
         "rescore_query_weight" : 1.2
      }
   }, {
      "window_size" : 10,
      "query" : {
         "score_mode": "multiply",
         "rescore_query" : {
            "function_score" : {
               "script_score": {
                  "script": {
                    "source": "Math.log10(doc.count.value + 2)"
                  }
               }
            }
         }
      }
   } ]
}

第一个重计分器获取查询结果,然后第二个重计分器获取第一个重计分器的结果,依此类推。第二个重计分器会“看到”第一个重计分器完成的排序,因此可以在第一个重计分器上使用较大的窗口将文档拉入第二个重计分器的较小窗口中。