混合精确搜索与词干提取

编辑

在构建搜索应用程序时,词干提取通常是必不可少的,因为我们希望对 skiing 的查询能够匹配包含 ski 或 skis 的文档。但是,如果用户想要专门搜索 skiing 呢?通常的做法是使用多字段(multi-field),以便以两种不同的方式索引相同的内容:

resp = client.indices.create(
    index="index",
    settings={
        "analysis": {
            "analyzer": {
                "english_exact": {
                    "tokenizer": "standard",
                    "filter": [
                        "lowercase"
                    ]
                }
            }
        }
    },
    mappings={
        "properties": {
            "body": {
                "type": "text",
                "analyzer": "english",
                "fields": {
                    "exact": {
                        "type": "text",
                        "analyzer": "english_exact"
                    }
                }
            }
        }
    },
)
print(resp)

resp1 = client.index(
    index="index",
    id="1",
    document={
        "body": "Ski resort"
    },
)
print(resp1)

resp2 = client.index(
    index="index",
    id="2",
    document={
        "body": "A pair of skis"
    },
)
print(resp2)

resp3 = client.indices.refresh(
    index="index",
)
print(resp3)
response = client.indices.create(
  index: 'index',
  body: {
    settings: {
      analysis: {
        analyzer: {
          english_exact: {
            tokenizer: 'standard',
            filter: [
              'lowercase'
            ]
          }
        }
      }
    },
    mappings: {
      properties: {
        body: {
          type: 'text',
          analyzer: 'english',
          fields: {
            exact: {
              type: 'text',
              analyzer: 'english_exact'
            }
          }
        }
      }
    }
  }
)
puts response

response = client.index(
  index: 'index',
  id: 1,
  body: {
    body: 'Ski resort'
  }
)
puts response

response = client.index(
  index: 'index',
  id: 2,
  body: {
    body: 'A pair of skis'
  }
)
puts response

response = client.indices.refresh(
  index: 'index'
)
puts response
const response = await client.indices.create({
  index: "index",
  settings: {
    analysis: {
      analyzer: {
        english_exact: {
          tokenizer: "standard",
          filter: ["lowercase"],
        },
      },
    },
  },
  mappings: {
    properties: {
      body: {
        type: "text",
        analyzer: "english",
        fields: {
          exact: {
            type: "text",
            analyzer: "english_exact",
          },
        },
      },
    },
  },
});
console.log(response);

const response1 = await client.index({
  index: "index",
  id: 1,
  document: {
    body: "Ski resort",
  },
});
console.log(response1);

const response2 = await client.index({
  index: "index",
  id: 2,
  document: {
    body: "A pair of skis",
  },
});
console.log(response2);

const response3 = await client.indices.refresh({
  index: "index",
});
console.log(response3);
PUT index
{
  "settings": {
    "analysis": {
      "analyzer": {
        "english_exact": {
          "tokenizer": "standard",
          "filter": [
            "lowercase"
          ]
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "body": {
        "type": "text",
        "analyzer": "english",
        "fields": {
          "exact": {
            "type": "text",
            "analyzer": "english_exact"
          }
        }
      }
    }
  }
}

PUT index/_doc/1
{
  "body": "Ski resort"
}

PUT index/_doc/2
{
  "body": "A pair of skis"
}

POST index/_refresh

通过这样的设置,在 body 上搜索 ski 会返回两个文档:

resp = client.search(
    index="index",
    query={
        "simple_query_string": {
            "fields": [
                "body"
            ],
            "query": "ski"
        }
    },
)
print(resp)
response = client.search(
  index: 'index',
  body: {
    query: {
      simple_query_string: {
        fields: [
          'body'
        ],
        query: 'ski'
      }
    }
  }
)
puts response
const response = await client.search({
  index: "index",
  query: {
    simple_query_string: {
      fields: ["body"],
      query: "ski",
    },
  },
});
console.log(response);
GET index/_search
{
  "query": {
    "simple_query_string": {
      "fields": [ "body" ],
      "query": "ski"
    }
  }
}
{
  "took": 2,
  "timed_out": false,
  "_shards": {
    "total": 1,
    "successful": 1,
    "skipped" : 0,
    "failed": 0
  },
  "hits": {
    "total" : {
        "value": 2,
        "relation": "eq"
    },
    "max_score": 0.18232156,
    "hits": [
      {
        "_index": "index",
        "_id": "1",
        "_score": 0.18232156,
        "_source": {
          "body": "Ski resort"
        }
      },
      {
        "_index": "index",
        "_id": "2",
        "_score": 0.18232156,
        "_source": {
          "body": "A pair of skis"
        }
      }
    ]
  }
}

另一方面,在 body.exact 上搜索 ski 只会返回文档 1,因为 body.exact 的分析链不执行词干提取。

resp = client.search(
    index="index",
    query={
        "simple_query_string": {
            "fields": [
                "body.exact"
            ],
            "query": "ski"
        }
    },
)
print(resp)
response = client.search(
  index: 'index',
  body: {
    query: {
      simple_query_string: {
        fields: [
          'body.exact'
        ],
        query: 'ski'
      }
    }
  }
)
puts response
const response = await client.search({
  index: "index",
  query: {
    simple_query_string: {
      fields: ["body.exact"],
      query: "ski",
    },
  },
});
console.log(response);
GET index/_search
{
  "query": {
    "simple_query_string": {
      "fields": [ "body.exact" ],
      "query": "ski"
    }
  }
}
{
  "took": 1,
  "timed_out": false,
  "_shards": {
    "total": 1,
    "successful": 1,
    "skipped" : 0,
    "failed": 0
  },
  "hits": {
    "total" : {
        "value": 1,
        "relation": "eq"
    },
    "max_score": 0.8025915,
    "hits": [
      {
        "_index": "index",
        "_id": "1",
        "_score": 0.8025915,
        "_source": {
          "body": "Ski resort"
        }
      }
    ]
  }
}

这不容易暴露给最终用户,因为我们需要找到一种方法来确定他们是否正在寻找精确匹配,并相应地重定向到适当的字段。而且,如果只有部分查询需要精确匹配,而其他部分仍然应该考虑词干提取,该怎么办?

幸运的是,query_string 和 simple_query_string 查询有一个恰好可以解决这个问题的特性:quote_field_suffix。它告诉 Elasticsearch,出现在引号之间的单词将被重定向到另一个字段,如下所示:

resp = client.search(
    index="index",
    query={
        "simple_query_string": {
            "fields": [
                "body"
            ],
            "quote_field_suffix": ".exact",
            "query": "\"ski\""
        }
    },
)
print(resp)
response = client.search(
  index: 'index',
  body: {
    query: {
      simple_query_string: {
        fields: [
          'body'
        ],
        quote_field_suffix: '.exact',
        query: '"ski"'
      }
    }
  }
)
puts response
const response = await client.search({
  index: "index",
  query: {
    simple_query_string: {
      fields: ["body"],
      quote_field_suffix: ".exact",
      query: '"ski"',
    },
  },
});
console.log(response);
GET index/_search
{
  "query": {
    "simple_query_string": {
      "fields": [ "body" ],
      "quote_field_suffix": ".exact",
      "query": "\"ski\""
    }
  }
}
{
  "took": 2,
  "timed_out": false,
  "_shards": {
    "total": 1,
    "successful": 1,
    "skipped" : 0,
    "failed": 0
  },
  "hits": {
    "total" : {
        "value": 1,
        "relation": "eq"
    },
    "max_score": 0.8025915,
    "hits": [
      {
        "_index": "index",
        "_id": "1",
        "_score": 0.8025915,
        "_source": {
          "body": "Ski resort"
        }
      }
    ]
  }
}

在上面的例子中,由于 ski 位于引号之间,quote_field_suffix 参数使其在 body.exact 字段上进行搜索,因此只有文档 1 匹配。这允许用户根据需要混合使用精确搜索和词干搜索。

如果 quote_field_suffix 所指向的字段不存在,则搜索将回退到使用查询字符串的默认字段。