混合精确搜索与词干提取
在构建搜索应用程序时，词干提取通常是必不可少的，因为我们希望对 skiing
的查询能够匹配包含 ski
或 skis
的文档。但是，如果用户只想精确搜索 skiing
呢？通常的做法是使用多字段（multi-field），以两种不同的方式索引相同的内容：
resp = client.indices.create( index="index", settings={ "analysis": { "analyzer": { "english_exact": { "tokenizer": "standard", "filter": [ "lowercase" ] } } } }, mappings={ "properties": { "body": { "type": "text", "analyzer": "english", "fields": { "exact": { "type": "text", "analyzer": "english_exact" } } } } }, ) print(resp) resp1 = client.index( index="index", id="1", document={ "body": "Ski resort" }, ) print(resp1) resp2 = client.index( index="index", id="2", document={ "body": "A pair of skis" }, ) print(resp2) resp3 = client.indices.refresh( index="index", ) print(resp3)
response = client.indices.create( index: 'index', body: { settings: { analysis: { analyzer: { english_exact: { tokenizer: 'standard', filter: [ 'lowercase' ] } } } }, mappings: { properties: { body: { type: 'text', analyzer: 'english', fields: { exact: { type: 'text', analyzer: 'english_exact' } } } } } } ) puts response response = client.index( index: 'index', id: 1, body: { body: 'Ski resort' } ) puts response response = client.index( index: 'index', id: 2, body: { body: 'A pair of skis' } ) puts response response = client.indices.refresh( index: 'index' ) puts response
const response = await client.indices.create({ index: "index", settings: { analysis: { analyzer: { english_exact: { tokenizer: "standard", filter: ["lowercase"], }, }, }, }, mappings: { properties: { body: { type: "text", analyzer: "english", fields: { exact: { type: "text", analyzer: "english_exact", }, }, }, }, }, }); console.log(response); const response1 = await client.index({ index: "index", id: 1, document: { body: "Ski resort", }, }); console.log(response1); const response2 = await client.index({ index: "index", id: 2, document: { body: "A pair of skis", }, }); console.log(response2); const response3 = await client.indices.refresh({ index: "index", }); console.log(response3);
PUT index { "settings": { "analysis": { "analyzer": { "english_exact": { "tokenizer": "standard", "filter": [ "lowercase" ] } } } }, "mappings": { "properties": { "body": { "type": "text", "analyzer": "english", "fields": { "exact": { "type": "text", "analyzer": "english_exact" } } } } } } PUT index/_doc/1 { "body": "Ski resort" } PUT index/_doc/2 { "body": "A pair of skis" } POST index/_refresh
通过这样的设置，在 body
上搜索 ski
会返回两个文档：
resp = client.search( index="index", query={ "simple_query_string": { "fields": [ "body" ], "query": "ski" } }, ) print(resp)
response = client.search( index: 'index', body: { query: { simple_query_string: { fields: [ 'body' ], query: 'ski' } } } ) puts response
const response = await client.search({ index: "index", query: { simple_query_string: { fields: ["body"], query: "ski", }, }, }); console.log(response);
GET index/_search { "query": { "simple_query_string": { "fields": [ "body" ], "query": "ski" } } }
{ "took": 2, "timed_out": false, "_shards": { "total": 1, "successful": 1, "skipped" : 0, "failed": 0 }, "hits": { "total" : { "value": 2, "relation": "eq" }, "max_score": 0.18232156, "hits": [ { "_index": "index", "_id": "1", "_score": 0.18232156, "_source": { "body": "Ski resort" } }, { "_index": "index", "_id": "2", "_score": 0.18232156, "_source": { "body": "A pair of skis" } } ] } }
另一方面，在 body.exact
上搜索 ski
只会返回文档 1，
因为 body.exact
的分析链不执行词干提取。
resp = client.search( index="index", query={ "simple_query_string": { "fields": [ "body.exact" ], "query": "ski" } }, ) print(resp)
response = client.search( index: 'index', body: { query: { simple_query_string: { fields: [ 'body.exact' ], query: 'ski' } } } ) puts response
const response = await client.search({ index: "index", query: { simple_query_string: { fields: ["body.exact"], query: "ski", }, }, }); console.log(response);
GET index/_search { "query": { "simple_query_string": { "fields": [ "body.exact" ], "query": "ski" } } }
{ "took": 1, "timed_out": false, "_shards": { "total": 1, "successful": 1, "skipped" : 0, "failed": 0 }, "hits": { "total" : { "value": 1, "relation": "eq" }, "max_score": 0.8025915, "hits": [ { "_index": "index", "_id": "1", "_score": 0.8025915, "_source": { "body": "Ski resort" } } ] } }
但这种方式不便于直接提供给最终用户使用，因为我们需要设法判断他们是否在寻找精确匹配，并据此将查询重定向到相应的字段。此外，如果查询中只有一部分需要精确匹配，而其余部分仍应考虑词干提取，又该怎么办？
幸运的是，query_string
和 simple_query_string
查询提供了一个正好可以解决这个问题的参数：quote_field_suffix。
它告诉 Elasticsearch，出现在引号之间的词应被重定向到另一个字段，如下所示：
resp = client.search( index="index", query={ "simple_query_string": { "fields": [ "body" ], "quote_field_suffix": ".exact", "query": "\"ski\"" } }, ) print(resp)
response = client.search( index: 'index', body: { query: { simple_query_string: { fields: [ 'body' ], quote_field_suffix: '.exact', query: '"ski"' } } } ) puts response
const response = await client.search({ index: "index", query: { simple_query_string: { fields: ["body"], quote_field_suffix: ".exact", query: '"ski"', }, }, }); console.log(response);
GET index/_search { "query": { "simple_query_string": { "fields": [ "body" ], "quote_field_suffix": ".exact", "query": "\"ski\"" } } }
{ "took": 2, "timed_out": false, "_shards": { "total": 1, "successful": 1, "skipped" : 0, "failed": 0 }, "hits": { "total" : { "value": 1, "relation": "eq" }, "max_score": 0.8025915, "hits": [ { "_index": "index", "_id": "1", "_score": 0.8025915, "_source": { "body": "Ski resort" } } ] } }
在上面的例子中，由于 ski
位于引号之间，在 quote_field_suffix
参数的作用下，它会在 body.exact
字段上进行搜索，因此只有文档 1
匹配。这使用户可以根据需要混合使用精确搜索和词干搜索。
如果根据 quote_field_suffix
中传入的后缀所确定的字段不存在，则搜索将回退到使用查询字符串的默认字段。