将您自己的稠密向量嵌入引入 Elasticsearch
将您自己的稠密向量嵌入引入 Elasticsearch
本教程演示如何将已具有稠密向量嵌入的文档索引到 Elasticsearch 中。您还将学习使用 knn 搜索这些文档的基础知识。
在本教程末尾,您会找到有关在 Elasticsearch 中部署文本嵌入模型的更多信息链接,以便您可以动态生成查询的嵌入。
这是一个高级用例。有关使用 Elasticsearch 进行语义搜索的选项概述,请参阅 语义搜索。
步骤 1:使用 dense_vector 映射创建索引
- 评论:存储在 review_text 字段中
- 该评论的嵌入:存储在 review_vector 字段中
dense_vector 类型会自动使用 int8_hnsw 量化,以减少搜索浮点向量时所需的内存占用。在 稠密向量量化 中了解有关平衡性能和准确性的更多信息。
resp = client.indices.create( index="amazon-reviews", mappings={ "properties": { "review_vector": { "type": "dense_vector", "dims": 8, "index": True, "similarity": "cosine" }, "review_text": { "type": "text" } } }, ) print(resp)
const response = await client.indices.create({ index: "amazon-reviews", mappings: { properties: { review_vector: { type: "dense_vector", dims: 8, index: true, similarity: "cosine", }, review_text: { type: "text", }, }, }, }); console.log(response);
PUT /amazon-reviews { "mappings": { "properties": { "review_vector": { "type": "dense_vector", "dims": 8, "index": true, "similarity": "cosine" }, "review_text": { "type": "text" } } } }
步骤 2:使用嵌入索引文档
resp = client.index( index="amazon-reviews", id="1", document={ "review_text": "This product is lifechanging! I'm telling all my friends about it.", "review_vector": [ 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8 ] }, ) print(resp)
const response = await client.index({ index: "amazon-reviews", id: 1, document: { review_text: "This product is lifechanging! I'm telling all my friends about it.", review_vector: [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8], }, }); console.log(response);
PUT /amazon-reviews/_doc/1 { "review_text": "This product is lifechanging! I'm telling all my friends about it.", "review_vector": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8] }
在生产环境中,您需要使用 _bulk 端点一次索引多个文档。
以下是在单个 _bulk 请求中索引多个文档的示例:
resp = client.bulk( operations=[ { "index": { "_index": "amazon-reviews", "_id": "2" } }, { "review_text": "This product is amazing! I love it.", "review_vector": [ 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8 ] }, { "index": { "_index": "amazon-reviews", "_id": "3" } }, { "review_text": "This product is terrible. I hate it.", "review_vector": [ 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1 ] }, { "index": { "_index": "amazon-reviews", "_id": "4" } }, { "review_text": "This product is great. I can do anything with it.", "review_vector": [ 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8 ] }, { "index": { "_index": "amazon-reviews", "_id": "5" } }, { "review_text": "This product has ruined my life and the lives of my family and friends.", "review_vector": [ 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1 ] } ], ) print(resp)
const response = await client.bulk({ operations: [ { index: { _index: "amazon-reviews", _id: "2", }, }, { review_text: "This product is amazing! I love it.", review_vector: [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8], }, { index: { _index: "amazon-reviews", _id: "3", }, }, { review_text: "This product is terrible. I hate it.", review_vector: [0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1], }, { index: { _index: "amazon-reviews", _id: "4", }, }, { review_text: "This product is great. I can do anything with it.", review_vector: [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8], }, { index: { _index: "amazon-reviews", _id: "5", }, }, { review_text: "This product has ruined my life and the lives of my family and friends.", review_vector: [0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1], }, ], }); console.log(response);
POST /_bulk { "index": { "_index": "amazon-reviews", "_id": "2" } } { "review_text": "This product is amazing! I love it.", "review_vector": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8] } { "index": { "_index": "amazon-reviews", "_id": "3" } } { "review_text": "This product is terrible. I hate it.", "review_vector": [0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1] } { "index": { "_index": "amazon-reviews", "_id": "4" } } { "review_text": "This product is great. I can do anything with it.", "review_vector": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8] } { "index": { "_index": "amazon-reviews", "_id": "5" } } { "review_text": "This product has ruined my life and the lives of my family and friends.", "review_vector": [0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1] }
步骤 3:使用嵌入搜索文档
现在,您可以使用 knn 检索器查询这些文档向量。knn 是一种向量搜索类型,它查找与查询向量最相似的 k 个文档。
resp = client.search( index="amazon-reviews", retriever={ "knn": { "field": "review_vector", "query_vector": [ 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8 ], "k": 2, "num_candidates": 5 } }, ) print(resp)
const response = await client.search({ index: "amazon-reviews", retriever: { knn: { field: "review_vector", query_vector: [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8], k: 2, num_candidates: 5, }, }, }); console.log(response);
POST /amazon-reviews/_search { "retriever": { "knn": { "field": "review_vector", "query_vector": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8], "k": 2, "num_candidates": 5 } } }
在这个简单的示例中,我们发送一个原始向量作为查询文本。在实际场景中,您需要使用嵌入模型为查询生成向量。
为此,您需要在 Elasticsearch 中部署一个文本嵌入模型,并使用 query_vector_builder 参数。
了解如何 使用已部署的文本嵌入模型 进行语义搜索。
如果您刚开始使用 Elasticsearch 中的向量搜索,请参阅 语义搜索。