相似度模块

编辑

相似度(评分/排名模型)定义了如何对匹配的文档进行评分。相似度是每个字段的属性,这意味着可以通过映射为每个字段定义不同的相似度。

相似度仅适用于文本类型和关键字类型字段。

配置自定义相似度被认为是专家功能,内置相似度很可能已经足够,如 similarity 中所述。

配置相似度

编辑

大多数现有或自定义的相似度都有配置选项,可以通过索引设置进行配置,如下所示。索引选项可以在创建索引或更新索引设置时提供。

resp = client.indices.create(
    index="index",
    settings={
        "index": {
            "similarity": {
                "my_similarity": {
                    "type": "DFR",
                    "basic_model": "g",
                    "after_effect": "l",
                    "normalization": "h2",
                    "normalization.h2.c": "3.0"
                }
            }
        }
    },
)
print(resp)
response = client.indices.create(
  index: 'index',
  body: {
    settings: {
      index: {
        similarity: {
          my_similarity: {
            type: 'DFR',
            basic_model: 'g',
            after_effect: 'l',
            normalization: 'h2',
            "normalization.h2.c": '3.0'
          }
        }
      }
    }
  }
)
puts response
const response = await client.indices.create({
  index: "index",
  settings: {
    index: {
      similarity: {
        my_similarity: {
          type: "DFR",
          basic_model: "g",
          after_effect: "l",
          normalization: "h2",
          "normalization.h2.c": "3.0",
        },
      },
    },
  },
});
console.log(response);
PUT /index
{
  "settings": {
    "index": {
      "similarity": {
        "my_similarity": {
          "type": "DFR",
          "basic_model": "g",
          "after_effect": "l",
          "normalization": "h2",
          "normalization.h2.c": "3.0"
        }
      }
    }
  }
}

这里我们配置 DFR 相似度,以便可以在映射中将其引用为 my_similarity,如下面的示例所示

resp = client.indices.put_mapping(
    index="index",
    properties={
        "title": {
            "type": "text",
            "similarity": "my_similarity"
        }
    },
)
print(resp)
response = client.indices.put_mapping(
  index: 'index',
  body: {
    properties: {
      title: {
        type: 'text',
        similarity: 'my_similarity'
      }
    }
  }
)
puts response
const response = await client.indices.putMapping({
  index: "index",
  properties: {
    title: {
      type: "text",
      similarity: "my_similarity",
    },
  },
});
console.log(response);
PUT /index/_mapping
{
  "properties" : {
    "title" : { "type" : "text", "similarity" : "my_similarity" }
  }
}

可用的相似度

编辑

BM25 相似度(默认)

编辑

基于 TF/IDF 的相似度,具有内置的 tf 归一化,并且应该更适合短字段(如名称)。有关更多详细信息,请参阅 Okapi_BM25。此相似度具有以下选项

k1

控制非线性词频归一化(饱和)。默认值为 1.2

b

控制文档长度归一化 tf 值的程度。默认值为 0.75

discount_overlaps

确定在计算范数时是否忽略重叠标记(位置增量为 0 的标记)。默认情况下,此值为 true,这意味着在计算范数时不计算重叠标记。

类型名称:BM25

DFR 相似度

编辑

实现 与随机性的偏差 框架的相似度。此相似度具有以下选项

basic_model

可能的值:g、if、in 和 ine。

after_effect

可能的值:b 和 l。

normalization

可能的值:no、h1、h2、h3 和 z。

除第一个选项外的所有选项都需要一个归一化值。

类型名称:DFR

DFI 相似度

编辑

实现 与独立性的偏差 模型的相似度。此相似度具有以下选项

independence_measure

可能的值:standardized、saturated、chisquared。

使用此相似度时,强烈建议不要删除停用词以获得良好的相关性。还要注意,频率小于预期频率的术语将获得等于 0 的分数。

类型名称:DFI

IB 相似度。

编辑

基于信息的模型 。该算法基于这样一个概念,即任何符号分布序列中的信息内容主要由其基本元素的重复使用决定。对于书面文本,此挑战将对应于比较不同作者的写作风格。此相似度具有以下选项

distribution

可能的值:ll 和 spl。

lambda

可能的值:df 和 ttf。

normalization

与 DFR 相似度相同。

类型名称:IB

LM Dirichlet 相似度。

编辑

LM Dirichlet 相似度 。此相似度具有以下选项

mu

默认为 2000

论文中的评分公式为出现次数少于语言模型预测的术语分配负分,这对于 Lucene 来说是非法的,因此此类术语的分数为 0。

类型名称:LMDirichlet

LM Jelinek Mercer 相似度。

编辑

LM Jelinek Mercer 相似度 。该算法尝试捕获文本中的重要模式,同时排除噪声。此相似度具有以下选项

lambda

最佳值取决于集合和查询。对于标题查询,最佳值约为 0.1,对于长查询,最佳值约为 0.7。默认为 0.1。当值接近 0 时,匹配更多查询词的文档的排名将高于匹配较少词的文档。

类型名称:LMJelinekMercer

脚本相似度

编辑

一种相似度,允许您使用脚本来指定如何计算分数。例如,下面的示例显示了如何重新实现 TF-IDF

resp = client.indices.create(
    index="index",
    settings={
        "number_of_shards": 1,
        "similarity": {
            "scripted_tfidf": {
                "type": "scripted",
                "script": {
                    "source": "double tf = Math.sqrt(doc.freq); double idf = Math.log((field.docCount+1.0)/(term.docFreq+1.0)) + 1.0; double norm = 1/Math.sqrt(doc.length); return query.boost * tf * idf * norm;"
                }
            }
        }
    },
    mappings={
        "properties": {
            "field": {
                "type": "text",
                "similarity": "scripted_tfidf"
            }
        }
    },
)
print(resp)

resp1 = client.index(
    index="index",
    id="1",
    document={
        "field": "foo bar foo"
    },
)
print(resp1)

resp2 = client.index(
    index="index",
    id="2",
    document={
        "field": "bar baz"
    },
)
print(resp2)

resp3 = client.indices.refresh(
    index="index",
)
print(resp3)

resp4 = client.search(
    index="index",
    explain=True,
    query={
        "query_string": {
            "query": "foo^1.7",
            "default_field": "field"
        }
    },
)
print(resp4)
response = client.indices.create(
  index: 'index',
  body: {
    settings: {
      number_of_shards: 1,
      similarity: {
        scripted_tfidf: {
          type: 'scripted',
          script: {
            source: 'double tf = Math.sqrt(doc.freq); double idf = Math.log((field.docCount+1.0)/(term.docFreq+1.0)) + 1.0; double norm = 1/Math.sqrt(doc.length); return query.boost * tf * idf * norm;'
          }
        }
      }
    },
    mappings: {
      properties: {
        field: {
          type: 'text',
          similarity: 'scripted_tfidf'
        }
      }
    }
  }
)
puts response

response = client.index(
  index: 'index',
  id: 1,
  body: {
    field: 'foo bar foo'
  }
)
puts response

response = client.index(
  index: 'index',
  id: 2,
  body: {
    field: 'bar baz'
  }
)
puts response

response = client.indices.refresh(
  index: 'index'
)
puts response

response = client.search(
  index: 'index',
  explain: true,
  body: {
    query: {
      query_string: {
        query: 'foo^1.7',
        default_field: 'field'
      }
    }
  }
)
puts response
const response = await client.indices.create({
  index: "index",
  settings: {
    number_of_shards: 1,
    similarity: {
      scripted_tfidf: {
        type: "scripted",
        script: {
          source:
            "double tf = Math.sqrt(doc.freq); double idf = Math.log((field.docCount+1.0)/(term.docFreq+1.0)) + 1.0; double norm = 1/Math.sqrt(doc.length); return query.boost * tf * idf * norm;",
        },
      },
    },
  },
  mappings: {
    properties: {
      field: {
        type: "text",
        similarity: "scripted_tfidf",
      },
    },
  },
});
console.log(response);

const response1 = await client.index({
  index: "index",
  id: 1,
  document: {
    field: "foo bar foo",
  },
});
console.log(response1);

const response2 = await client.index({
  index: "index",
  id: 2,
  document: {
    field: "bar baz",
  },
});
console.log(response2);

const response3 = await client.indices.refresh({
  index: "index",
});
console.log(response3);

const response4 = await client.search({
  index: "index",
  explain: "true",
  query: {
    query_string: {
      query: "foo^1.7",
      default_field: "field",
    },
  },
});
console.log(response4);
PUT /index
{
  "settings": {
    "number_of_shards": 1,
    "similarity": {
      "scripted_tfidf": {
        "type": "scripted",
        "script": {
          "source": "double tf = Math.sqrt(doc.freq); double idf = Math.log((field.docCount+1.0)/(term.docFreq+1.0)) + 1.0; double norm = 1/Math.sqrt(doc.length); return query.boost * tf * idf * norm;"
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "field": {
        "type": "text",
        "similarity": "scripted_tfidf"
      }
    }
  }
}

PUT /index/_doc/1
{
  "field": "foo bar foo"
}

PUT /index/_doc/2
{
  "field": "bar baz"
}

POST /index/_refresh

GET /index/_search?explain=true
{
  "query": {
    "query_string": {
      "query": "foo^1.7",
      "default_field": "field"
    }
  }
}

产生

{
  "took": 12,
  "timed_out": false,
  "_shards": {
    "total": 1,
    "successful": 1,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": {
        "value": 1,
        "relation": "eq"
    },
    "max_score": 1.9508477,
    "hits": [
      {
        "_shard": "[index][0]",
        "_node": "OzrdjxNtQGaqs4DmioFw9A",
        "_index": "index",
        "_id": "1",
        "_score": 1.9508477,
        "_source": {
          "field": "foo bar foo"
        },
        "_explanation": {
          "value": 1.9508477,
          "description": "weight(field:foo in 0) [PerFieldSimilarity], result of:",
          "details": [
            {
              "value": 1.9508477,
              "description": "score from ScriptedSimilarity(weightScript=[null], script=[Script{type=inline, lang='painless', idOrCode='double tf = Math.sqrt(doc.freq); double idf = Math.log((field.docCount+1.0)/(term.docFreq+1.0)) + 1.0; double norm = 1/Math.sqrt(doc.length); return query.boost * tf * idf * norm;', options={}, params={}}]) computed from:",
              "details": [
                {
                  "value": 1.0,
                  "description": "weight",
                  "details": []
                },
                {
                  "value": 1.7,
                  "description": "query.boost",
                  "details": []
                },
                {
                  "value": 2,
                  "description": "field.docCount",
                  "details": []
                },
                {
                  "value": 4,
                  "description": "field.sumDocFreq",
                  "details": []
                },
                {
                  "value": 5,
                  "description": "field.sumTotalTermFreq",
                  "details": []
                },
                {
                  "value": 1,
                  "description": "term.docFreq",
                  "details": []
                },
                {
                  "value": 2,
                  "description": "term.totalTermFreq",
                  "details": []
                },
                {
                  "value": 2.0,
                  "description": "doc.freq",
                  "details": []
                },
                {
                  "value": 3,
                  "description": "doc.length",
                  "details": []
                }
              ]
            }
          ]
        }
      }
    ]
  }
}

虽然脚本相似度提供了很大的灵活性,但它们需要满足一组规则。如果未能这样做,可能会导致 Elasticsearch 在搜索时静默返回错误的匹配结果或因内部错误而失败:

  • 返回的分数必须为正数。
  • 在所有其他变量保持不变的情况下,当 doc.freq 增加时,分数不得减少。
  • 在所有其他变量保持不变的情况下,当 doc.length 增加时,分数不得增加。

您可能已经注意到,上述脚本的很大一部分依赖于对每个文档都相同的统计信息。通过提供一个 weight_script 可以使上述内容稍微更有效率,该脚本将计算独立于文档的分数部分,并且可在 weight 变量下使用。当未提供 weight_script 时,weight 等于 1。weight_script 可以访问与 script 相同的变量,除了 doc,因为它应该计算对分数的独立于文档的贡献。

以下配置将给出相同的 tf-idf 分数,但效率略高

resp = client.indices.create(
    index="index",
    settings={
        "number_of_shards": 1,
        "similarity": {
            "scripted_tfidf": {
                "type": "scripted",
                "weight_script": {
                    "source": "double idf = Math.log((field.docCount+1.0)/(term.docFreq+1.0)) + 1.0; return query.boost * idf;"
                },
                "script": {
                    "source": "double tf = Math.sqrt(doc.freq); double norm = 1/Math.sqrt(doc.length); return weight * tf * norm;"
                }
            }
        }
    },
    mappings={
        "properties": {
            "field": {
                "type": "text",
                "similarity": "scripted_tfidf"
            }
        }
    },
)
print(resp)
response = client.indices.create(
  index: 'index',
  body: {
    settings: {
      number_of_shards: 1,
      similarity: {
        scripted_tfidf: {
          type: 'scripted',
          weight_script: {
            source: 'double idf = Math.log((field.docCount+1.0)/(term.docFreq+1.0)) + 1.0; return query.boost * idf;'
          },
          script: {
            source: 'double tf = Math.sqrt(doc.freq); double norm = 1/Math.sqrt(doc.length); return weight * tf * norm;'
          }
        }
      }
    },
    mappings: {
      properties: {
        field: {
          type: 'text',
          similarity: 'scripted_tfidf'
        }
      }
    }
  }
)
puts response
const response = await client.indices.create({
  index: "index",
  settings: {
    number_of_shards: 1,
    similarity: {
      scripted_tfidf: {
        type: "scripted",
        weight_script: {
          source:
            "double idf = Math.log((field.docCount+1.0)/(term.docFreq+1.0)) + 1.0; return query.boost * idf;",
        },
        script: {
          source:
            "double tf = Math.sqrt(doc.freq); double norm = 1/Math.sqrt(doc.length); return weight * tf * norm;",
        },
      },
    },
  },
  mappings: {
    properties: {
      field: {
        type: "text",
        similarity: "scripted_tfidf",
      },
    },
  },
});
console.log(response);
PUT /index
{
  "settings": {
    "number_of_shards": 1,
    "similarity": {
      "scripted_tfidf": {
        "type": "scripted",
        "weight_script": {
          "source": "double idf = Math.log((field.docCount+1.0)/(term.docFreq+1.0)) + 1.0; return query.boost * idf;"
        },
        "script": {
          "source": "double tf = Math.sqrt(doc.freq); double norm = 1/Math.sqrt(doc.length); return weight * tf * norm;"
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "field": {
        "type": "text",
        "similarity": "scripted_tfidf"
      }
    }
  }
}

类型名称:scripted

默认相似度

编辑

默认情况下,Elasticsearch 将使用配置为 default 的任何相似度。

您可以在创建索引时更改索引中所有字段的默认相似度

resp = client.indices.create(
    index="index",
    settings={
        "index": {
            "similarity": {
                "default": {
                    "type": "boolean"
                }
            }
        }
    },
)
print(resp)
response = client.indices.create(
  index: 'index',
  body: {
    settings: {
      index: {
        similarity: {
          default: {
            type: 'boolean'
          }
        }
      }
    }
  }
)
puts response
const response = await client.indices.create({
  index: "index",
  settings: {
    index: {
      similarity: {
        default: {
          type: "boolean",
        },
      },
    },
  },
});
console.log(response);
PUT /index
{
  "settings": {
    "index": {
      "similarity": {
        "default": {
          "type": "boolean"
        }
      }
    }
  }
}

如果要在创建索引后更改默认相似度,则必须关闭索引,发送以下请求,然后再次打开索引

resp = client.indices.close(
    index="index",
)
print(resp)

resp1 = client.indices.put_settings(
    index="index",
    settings={
        "index": {
            "similarity": {
                "default": {
                    "type": "boolean"
                }
            }
        }
    },
)
print(resp1)

resp2 = client.indices.open(
    index="index",
)
print(resp2)
response = client.indices.close(
  index: 'index'
)
puts response

response = client.indices.put_settings(
  index: 'index',
  body: {
    index: {
      similarity: {
        default: {
          type: 'boolean'
        }
      }
    }
  }
)
puts response

response = client.indices.open(
  index: 'index'
)
puts response
const response = await client.indices.close({
  index: "index",
});
console.log(response);

const response1 = await client.indices.putSettings({
  index: "index",
  settings: {
    index: {
      similarity: {
        default: {
          type: "boolean",
        },
      },
    },
  },
});
console.log(response1);

const response2 = await client.indices.open({
  index: "index",
});
console.log(response2);
POST /index/_close

PUT /index/_settings
{
  "index": {
    "similarity": {
      "default": {
        "type": "boolean"
      }
    }
  }
}

POST /index/_open