› › ›

混合精确搜索与词干提取

在构建搜索应用程序时，词干提取通常是必不可少的，因为我们希望对 skiing 的查询能够匹配包含 ski 或 skis 的文档。但是，如果用户想要专门搜索 skiing 呢？通常的做法是使用多字段（multi-field），以两种不同的方式索引相同的内容：

resp = client.indices.create(
    index="index",
    settings={
        "analysis": {
            "analyzer": {
                "english_exact": {
                    "tokenizer": "standard",
                    "filter": [
                        "lowercase"
                    ]
                }
            }
        }
    },
    mappings={
        "properties": {
            "body": {
                "type": "text",
                "analyzer": "english",
                "fields": {
                    "exact": {
                        "type": "text",
                        "analyzer": "english_exact"
                    }
                }
            }
        }
    },
)
print(resp)

resp1 = client.index(
    index="index",
    id="1",
    document={
        "body": "Ski resort"
    },
)
print(resp1)

resp2 = client.index(
    index="index",
    id="2",
    document={
        "body": "A pair of skis"
    },
)
print(resp2)

resp3 = client.indices.refresh(
    index="index",
)
print(resp3)

response = client.indices.create(
  index: 'index',
  body: {
    settings: {
      analysis: {
        analyzer: {
          english_exact: {
            tokenizer: 'standard',
            filter: [
              'lowercase'
            ]
          }
        }
      }
    },
    mappings: {
      properties: {
        body: {
          type: 'text',
          analyzer: 'english',
          fields: {
            exact: {
              type: 'text',
              analyzer: 'english_exact'
            }
          }
        }
      }
    }
  }
)
puts response

response = client.index(
  index: 'index',
  id: 1,
  body: {
    body: 'Ski resort'
  }
)
puts response

response = client.index(
  index: 'index',
  id: 2,
  body: {
    body: 'A pair of skis'
  }
)
puts response

response = client.indices.refresh(
  index: 'index'
)
puts response

const response = await client.indices.create({
  index: "index",
  settings: {
    analysis: {
      analyzer: {
        english_exact: {
          tokenizer: "standard",
          filter: ["lowercase"],
        },
      },
    },
  },
  mappings: {
    properties: {
      body: {
        type: "text",
        analyzer: "english",
        fields: {
          exact: {
            type: "text",
            analyzer: "english_exact",
          },
        },
      },
    },
  },
});
console.log(response);

const response1 = await client.index({
  index: "index",
  id: 1,
  document: {
    body: "Ski resort",
  },
});
console.log(response1);

const response2 = await client.index({
  index: "index",
  id: 2,
  document: {
    body: "A pair of skis",
  },
});
console.log(response2);

const response3 = await client.indices.refresh({
  index: "index",
});
console.log(response3);

PUT index
{
  "settings": {
    "analysis": {
      "analyzer": {
        "english_exact": {
          "tokenizer": "standard",
          "filter": [
            "lowercase"
          ]
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "body": {
        "type": "text",
        "analyzer": "english",
        "fields": {
          "exact": {
            "type": "text",
            "analyzer": "english_exact"
          }
        }
      }
    }
  }
}

PUT index/_doc/1
{
  "body": "Ski resort"
}

PUT index/_doc/2
{
  "body": "A pair of skis"
}

POST index/_refresh

Copy as curl Try in Elastic

通过这样的设置，在 body 上搜索 ski 会返回两个文档：

resp = client.search(
    index="index",
    query={
        "simple_query_string": {
            "fields": [
                "body"
            ],
            "query": "ski"
        }
    },
)
print(resp)

response = client.search(
  index: 'index',
  body: {
    query: {
      simple_query_string: {
        fields: [
          'body'
        ],
        query: 'ski'
      }
    }
  }
)
puts response

const response = await client.search({
  index: "index",
  query: {
    simple_query_string: {
      fields: ["body"],
      query: "ski",
    },
  },
});
console.log(response);

GET index/_search
{
  "query": {
    "simple_query_string": {
      "fields": [ "body" ],
      "query": "ski"
    }
  }
}

Copy as curl Try in Elastic

{
  "took": 2,
  "timed_out": false,
  "_shards": {
    "total": 1,
    "successful": 1,
    "skipped" : 0,
    "failed": 0
  },
  "hits": {
    "total" : {
        "value": 2,
        "relation": "eq"
    },
    "max_score": 0.18232156,
    "hits": [
      {
        "_index": "index",
        "_id": "1",
        "_score": 0.18232156,
        "_source": {
          "body": "Ski resort"
        }
      },
      {
        "_index": "index",
        "_id": "2",
        "_score": 0.18232156,
        "_source": {
          "body": "A pair of skis"
        }
      }
    ]
  }
}

另一方面，在 body.exact 上搜索 ski 只会返回文档 1，因为 body.exact 的分析链不执行词干提取。

resp = client.search(
    index="index",
    query={
        "simple_query_string": {
            "fields": [
                "body.exact"
            ],
            "query": "ski"
        }
    },
)
print(resp)

response = client.search(
  index: 'index',
  body: {
    query: {
      simple_query_string: {
        fields: [
          'body.exact'
        ],
        query: 'ski'
      }
    }
  }
)
puts response

const response = await client.search({
  index: "index",
  query: {
    simple_query_string: {
      fields: ["body.exact"],
      query: "ski",
    },
  },
});
console.log(response);

GET index/_search
{
  "query": {
    "simple_query_string": {
      "fields": [ "body.exact" ],
      "query": "ski"
    }
  }
}

Copy as curl Try in Elastic

{
  "took": 1,
  "timed_out": false,
  "_shards": {
    "total": 1,
    "successful": 1,
    "skipped" : 0,
    "failed": 0
  },
  "hits": {
    "total" : {
        "value": 1,
        "relation": "eq"
    },
    "max_score": 0.8025915,
    "hits": [
      {
        "_index": "index",
        "_id": "1",
        "_score": 0.8025915,
        "_source": {
          "body": "Ski resort"
        }
      }
    ]
  }
}

这种做法不容易直接提供给最终用户，因为我们需要想办法判断他们是否在寻找精确匹配，并据此将查询重定向到相应的字段。而且，如果查询中只有一部分需要精确匹配，而其余部分仍然应该考虑词干提取，该怎么办？

幸运的是，query_string 和 simple_query_string 查询具有一个恰好可以解决这个问题的特性：quote_field_suffix。它告诉 Elasticsearch，出现在引号之间的单词应被重定向到另一个字段，如下所示：

resp = client.search(
    index="index",
    query={
        "simple_query_string": {
            "fields": [
                "body"
            ],
            "quote_field_suffix": ".exact",
            "query": "\"ski\""
        }
    },
)
print(resp)

response = client.search(
  index: 'index',
  body: {
    query: {
      simple_query_string: {
        fields: [
          'body'
        ],
        quote_field_suffix: '.exact',
        query: '"ski"'
      }
    }
  }
)
puts response

const response = await client.search({
  index: "index",
  query: {
    simple_query_string: {
      fields: ["body"],
      quote_field_suffix: ".exact",
      query: '"ski"',
    },
  },
});
console.log(response);

GET index/_search
{
  "query": {
    "simple_query_string": {
      "fields": [ "body" ],
      "quote_field_suffix": ".exact",
      "query": "\"ski\""
    }
  }
}

Copy as curl Try in Elastic

{
  "took": 2,
  "timed_out": false,
  "_shards": {
    "total": 1,
    "successful": 1,
    "skipped" : 0,
    "failed": 0
  },
  "hits": {
    "total" : {
        "value": 1,
        "relation": "eq"
    },
    "max_score": 0.8025915,
    "hits": [
      {
        "_index": "index",
        "_id": "1",
        "_score": 0.8025915,
        "_source": {
          "body": "Ski resort"
        }
      }
    ]
  }
}

在上面的例子中，由于 ski 位于引号之间，quote_field_suffix 参数使其在 body.exact 字段上进行搜索，因此只有文档 1 匹配。这允许用户根据需要混合使用精确搜索和词干搜索。

如果 quote_field_suffix 所指定的字段不存在，则搜索将回退到使用查询字符串的默认字段。

« 搜索相关性优化 | 获取一致的评分 »

Was this helpful?

Feedback

The Search AI Company

ELK Stack

Elastic Cloud

Generative AI

Search

Security

Observability

By solution

Industries

Customer spotlight

Research

Build

Learn

Connect

混合精确搜索与词干提取

混合精确搜索与词干提取

Follow us

About us

Join us

Partners

Trust & Security

Investor relations

Excellence Awards

About us

Join us

Partners

Trust & Security

Investor relations

Excellence Awards