稀疏向量字段类型
稀疏向量字段类型
sparse_vector
字段可以索引特征和权重,以便后续通过 sparse_vector
查询来检索文档。此字段还可以与旧版的 text_expansion
查询一起使用。
sparse_vector
是应与 ELSER 映射一起使用的字段类型。
resp = client.indices.create( index="my-index", mappings={ "properties": { "text.tokens": { "type": "sparse_vector" } } }, ) print(resp)
response = client.indices.create( index: 'my-index', body: { mappings: { properties: { 'text.tokens' => { type: 'sparse_vector' } } } } ) puts response
const response = await client.indices.create({ index: "my-index", mappings: { properties: { "text.tokens": { type: "sparse_vector", }, }, }, }); console.log(response);
PUT my-index { "mappings": { "properties": { "text.tokens": { "type": "sparse_vector" } } } }
有关使用 ELSER 将文档添加到 sparse_vector
映射字段的完整示例,请参阅 使用 ELSER 进行语义搜索。
多值稀疏向量
当为稀疏向量传入值数组时,同名特征将取其中的最大值。
论文《Adapting Learned Sparse Retrieval for Long Documents》(https://arxiv.org/pdf/2305.18494.pdf)对此进行了更详细的讨论。总而言之,研究结果支持表示聚合通常优于分数聚合。
对于需要重叠特征名称的情况,应将它们单独存储或使用嵌套字段。
以下是一个传入具有重叠特征名称的文档的示例。在此示例中,存在两个分别表示正面情绪和负面情绪的类别。但是,出于检索的目的,我们还希望获得整体影响,而不是特定的情绪。在示例中,impact
存储为多值稀疏向量,并且对于重叠的名称仅存储其最大值。更具体地说,此处的最终 GET
查询返回的 _score
约为 1.2(即 max(impact.delicious[0], impact.delicious[1])
的值;由于存在下文所述的约 0.4% 的相对误差,该值为近似值)。
resp = client.indices.create( index="my-index-000001", mappings={ "properties": { "text": { "type": "text", "analyzer": "standard" }, "impact": { "type": "sparse_vector" }, "positive": { "type": "sparse_vector" }, "negative": { "type": "sparse_vector" } } }, ) print(resp) resp1 = client.index( index="my-index-000001", document={ "text": "I had some terribly delicious carrots.", "impact": [ { "I": 0.55, "had": 0.4, "some": 0.28, "terribly": 0.01, "delicious": 1.2, "carrots": 0.8 }, { "I": 0.54, "had": 0.4, "some": 0.28, "terribly": 2.01, "delicious": 0.02, "carrots": 0.4 } ], "positive": { "I": 0.55, "had": 0.4, "some": 0.28, "terribly": 0.01, "delicious": 1.2, "carrots": 0.8 }, "negative": { "I": 0.54, "had": 0.4, "some": 0.28, "terribly": 2.01, "delicious": 0.02, "carrots": 0.4 } }, ) print(resp1) resp2 = client.search( index="my-index-000001", query={ "term": { "impact": { "value": "delicious" } } }, ) print(resp2)
const response = await client.indices.create({ index: "my-index-000001", mappings: { properties: { text: { type: "text", analyzer: "standard", }, impact: { type: "sparse_vector", }, positive: { type: "sparse_vector", }, negative: { type: "sparse_vector", }, }, }, }); console.log(response); const response1 = await client.index({ index: "my-index-000001", document: { text: "I had some terribly delicious carrots.", impact: [ { I: 0.55, had: 0.4, some: 0.28, terribly: 0.01, delicious: 1.2, carrots: 0.8, }, { I: 0.54, had: 0.4, some: 0.28, terribly: 2.01, delicious: 0.02, carrots: 0.4, }, ], positive: { I: 0.55, had: 0.4, some: 0.28, terribly: 0.01, delicious: 1.2, carrots: 0.8, }, negative: { I: 0.54, had: 0.4, some: 0.28, terribly: 2.01, delicious: 0.02, carrots: 0.4, }, }, }); console.log(response1); const response2 = await client.search({ index: "my-index-000001", query: { term: { impact: { value: "delicious", }, }, }, }); console.log(response2);
PUT my-index-000001 { "mappings": { "properties": { "text": { "type": "text", "analyzer": "standard" }, "impact": { "type": "sparse_vector" }, "positive": { "type": "sparse_vector" }, "negative": { "type": "sparse_vector" } } } } POST my-index-000001/_doc { "text": "I had some terribly delicious carrots.", "impact": [{"I": 0.55, "had": 0.4, "some": 0.28, "terribly": 0.01, "delicious": 1.2, "carrots": 0.8}, {"I": 0.54, "had": 0.4, "some": 0.28, "terribly": 2.01, "delicious": 0.02, "carrots": 0.4}], "positive": {"I": 0.55, "had": 0.4, "some": 0.28, "terribly": 0.01, "delicious": 1.2, "carrots": 0.8}, "negative": {"I": 0.54, "had": 0.4, "some": 0.28, "terribly": 2.01, "delicious": 0.02, "carrots": 0.4} } GET my-index-000001/_search { "query": { "term": { "impact": { "value": "delicious" } } } }
sparse_vector
字段不能包含在由 8.0 到 8.10 之间的 Elasticsearch 版本创建的索引中。
sparse_vector
字段仅支持严格的正值。负值将被拒绝。
sparse_vector
字段不支持分析器、查询、排序或聚合。它们只能在专门的查询中使用。建议在此类字段上使用的查询是 sparse_vector
查询。它们也可以在旧版的 text_expansion
查询中使用。
sparse_vector
字段仅保留 9 个有效二进制位(bit)的精度,相当于约 0.4% 的相对误差。