稀疏向量字段类型

编辑

sparse_vector 字段可以索引特征及其权重,以便之后在 sparse_vector 查询中用于检索文档。此字段也可以与传统的 text_expansion 查询一起使用。

sparse_vector 是应与 ELSER 映射一起使用的字段类型。

resp = client.indices.create(
    index="my-index",
    mappings={
        "properties": {
            "text.tokens": {
                "type": "sparse_vector"
            }
        }
    },
)
print(resp)
response = client.indices.create(
  index: 'my-index',
  body: {
    mappings: {
      properties: {
        'text.tokens' => {
          type: 'sparse_vector'
        }
      }
    }
  }
)
puts response
const response = await client.indices.create({
  index: "my-index",
  mappings: {
    properties: {
      "text.tokens": {
        type: "sparse_vector",
      },
    },
  },
});
console.log(response);
PUT my-index
{
  "mappings": {
    "properties": {
      "text.tokens": {
        "type": "sparse_vector"
      }
    }
  }
}

有关使用 ELSER 将文档添加到 sparse_vector 映射字段的完整示例,请参阅《使用 ELSER 进行语义搜索》。

多值稀疏向量

编辑

当为稀疏向量字段传入值数组时,对于名称相同的特征,将选取其中的最大值。

论文《Adapting Learned Sparse Retrieval for Long Documents》(https://arxiv.org/pdf/2305.18494.pdf)对此进行了更详细的讨论。总而言之,研究结果表明,表示聚合通常优于分数聚合。

如果需要保留重叠的特征名称(而不是只保留其最大值),应将它们分别存储在不同的字段中,或使用嵌套字段。

以下示例传入了一个具有重叠特征名称的文档。在此示例中,存在分别表示正面情绪和负面情绪的两个类别。但是,出于检索的目的,我们还希望获得整体影响,而不是特定的情绪。在示例中,impact 存储为多值稀疏向量,并且对于重叠的名称仅存储最大值。更具体地说,此处最后的 GET 查询返回的 _score 约为 1.2(即 max(impact.delicious[0], impact.delicious[1]);之所以是近似值,是因为如下文所述存在约 0.4% 的相对误差)。

resp = client.indices.create(
    index="my-index-000001",
    mappings={
        "properties": {
            "text": {
                "type": "text",
                "analyzer": "standard"
            },
            "impact": {
                "type": "sparse_vector"
            },
            "positive": {
                "type": "sparse_vector"
            },
            "negative": {
                "type": "sparse_vector"
            }
        }
    },
)
print(resp)

resp1 = client.index(
    index="my-index-000001",
    document={
        "text": "I had some terribly delicious carrots.",
        "impact": [
            {
                "I": 0.55,
                "had": 0.4,
                "some": 0.28,
                "terribly": 0.01,
                "delicious": 1.2,
                "carrots": 0.8
            },
            {
                "I": 0.54,
                "had": 0.4,
                "some": 0.28,
                "terribly": 2.01,
                "delicious": 0.02,
                "carrots": 0.4
            }
        ],
        "positive": {
            "I": 0.55,
            "had": 0.4,
            "some": 0.28,
            "terribly": 0.01,
            "delicious": 1.2,
            "carrots": 0.8
        },
        "negative": {
            "I": 0.54,
            "had": 0.4,
            "some": 0.28,
            "terribly": 2.01,
            "delicious": 0.02,
            "carrots": 0.4
        }
    },
)
print(resp1)

resp2 = client.search(
    index="my-index-000001",
    query={
        "term": {
            "impact": {
                "value": "delicious"
            }
        }
    },
)
print(resp2)
const response = await client.indices.create({
  index: "my-index-000001",
  mappings: {
    properties: {
      text: {
        type: "text",
        analyzer: "standard",
      },
      impact: {
        type: "sparse_vector",
      },
      positive: {
        type: "sparse_vector",
      },
      negative: {
        type: "sparse_vector",
      },
    },
  },
});
console.log(response);

const response1 = await client.index({
  index: "my-index-000001",
  document: {
    text: "I had some terribly delicious carrots.",
    impact: [
      {
        I: 0.55,
        had: 0.4,
        some: 0.28,
        terribly: 0.01,
        delicious: 1.2,
        carrots: 0.8,
      },
      {
        I: 0.54,
        had: 0.4,
        some: 0.28,
        terribly: 2.01,
        delicious: 0.02,
        carrots: 0.4,
      },
    ],
    positive: {
      I: 0.55,
      had: 0.4,
      some: 0.28,
      terribly: 0.01,
      delicious: 1.2,
      carrots: 0.8,
    },
    negative: {
      I: 0.54,
      had: 0.4,
      some: 0.28,
      terribly: 2.01,
      delicious: 0.02,
      carrots: 0.4,
    },
  },
});
console.log(response1);

const response2 = await client.search({
  index: "my-index-000001",
  query: {
    term: {
      impact: {
        value: "delicious",
      },
    },
  },
});
console.log(response2);
PUT my-index-000001
{
  "mappings": {
    "properties": {
      "text": {
        "type": "text",
        "analyzer": "standard"
      },
      "impact": {
        "type": "sparse_vector"
      },
      "positive": {
        "type": "sparse_vector"
      },
      "negative": {
        "type": "sparse_vector"
      }
    }
  }
}

POST my-index-000001/_doc
{
    "text": "I had some terribly delicious carrots.",
    "impact": [{"I": 0.55, "had": 0.4, "some": 0.28, "terribly": 0.01, "delicious": 1.2, "carrots": 0.8},
               {"I": 0.54, "had": 0.4, "some": 0.28, "terribly": 2.01, "delicious": 0.02, "carrots": 0.4}],
    "positive": {"I": 0.55, "had": 0.4, "some": 0.28, "terribly": 0.01, "delicious": 1.2, "carrots": 0.8},
    "negative": {"I": 0.54, "had": 0.4, "some": 0.28, "terribly": 2.01, "delicious": 0.02, "carrots": 0.4}
}

GET my-index-000001/_search
{
  "query": {
    "term": {
      "impact": {
         "value": "delicious"
      }
    }
  }
}

sparse_vector 字段不能包含在使用 Elasticsearch 8.0 到 8.10 之间的版本创建的索引中。

sparse_vector 字段仅支持严格的正值。负值将被拒绝。

sparse_vector 字段不支持分析器、查询、排序或聚合,只能在专门的查询中使用。建议对此类字段使用 sparse_vector 查询;它们也可以在传统的 text_expansion 查询中使用。

sparse_vector 字段的精度仅保留 9 个有效二进制位(bit),这相当于大约 0.4% 的相对误差。