相似度模块
相似度模块
相似度(评分/排名模型)定义了如何对匹配文档进行评分。相似度是针对每个字段的,这意味着可以通过映射为每个字段定义不同的相似度。
配置自定义相似度被认为是一项专家功能,内置相似度很可能已经足够,如 `similarity` 映射参数的文档中所述。
配置相似度
大多数现有或自定义相似度都有配置选项,可以通过索引设置进行配置,如下所示。索引选项可以在创建索引或更新索引设置时提供。
resp = client.indices.create( index="index", body={ "settings": { "index": { "similarity": { "my_similarity": { "type": "DFR", "basic_model": "g", "after_effect": "l", "normalization": "h2", "normalization.h2.c": "3.0", } } } } }, ) print(resp)
response = client.indices.create( index: 'index', body: { settings: { index: { similarity: { my_similarity: { type: 'DFR', basic_model: 'g', after_effect: 'l', normalization: 'h2', "normalization.h2.c": '3.0' } } } } } ) puts response
PUT /index { "settings": { "index": { "similarity": { "my_similarity": { "type": "DFR", "basic_model": "g", "after_effect": "l", "normalization": "h2", "normalization.h2.c": "3.0" } } } } }
这里我们配置了 DFR 相似度,以便可以在映射中以 `my_similarity` 的名称引用它,如下面的示例所示:
resp = client.indices.put_mapping( index="index", body={ "properties": { "title": {"type": "text", "similarity": "my_similarity"} } }, ) print(resp)
response = client.indices.put_mapping( index: 'index', body: { properties: { title: { type: 'text', similarity: 'my_similarity' } } } ) puts response
PUT /index/_mapping { "properties" : { "title" : { "type" : "text", "similarity" : "my_similarity" } } }
可用相似度
BM25 相似度(默认)
基于 TF/IDF 的相似度,具有内置的 tf 归一化,应该更适合短字段(如名称)。有关更多详细信息,请参阅Okapi_BM25。此相似度具有以下选项
- `k1`:控制非线性词频归一化(饱和度)。默认值为 `1.2`。
- `b`:控制文档长度对 tf 值进行归一化的程度。默认值为 `0.75`。
- `discount_overlaps`:确定在计算范数时是否忽略重叠标记(位置增量为 0 的标记)。默认情况下,这是 `true`,这意味着在计算范数时不计算重叠标记。
类型名称:BM25
DFR 相似度
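实现随机性偏差(divergence from randomness)框架的相似度。此相似度具有以下选项:
- `basic_model`:可能的值为 `g`、`if`、`in` 和 `ine`。
- `after_effect`:可能的值为 `b` 和 `l`。
- `normalization`:可能的值为 `no`、`h1`、`h2`、`h3` 和 `z`。
对于 `normalization`,除第一个取值(`no`)外,其余取值都需要额外指定一个归一化参数值(例如上文示例中的 `normalization.h2.c`)。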
类型名称:DFR
DFI 相似度
实现独立性偏差模型的相似度。此相似度具有以下选项
- `independence_measure`:可能的值为 `standardized`、`saturated` 和 `chisquared`。
使用此相似度时,强烈建议不要删除停用词以获得良好的相关性。另请注意,频率低于预期频率的词语得分将为 0。
类型名称:DFI
IB 相似度
基于信息的模型。该算法基于以下概念:任何符号分布序列中的信息内容主要由其基本元素的重复使用决定。对于书面文本,这一挑战相当于比较不同作者的写作风格。此相似度具有以下选项:
- `distribution`:可能的值为 `ll` 和 `spl`。
- `lambda`:可能的值为 `df` 和 `ttf`。
- `normalization`:与 DFR 相似度中的相同。
类型名称:IB
LM Dirichlet 相似度
LM Dirichlet 相似度。此相似度具有以下选项
- `mu`:默认为 `2000`。
论文中的评分公式对出现次数少于语言模型预测的词语赋予负分,这对 Lucene 来说是非法的,因此此类词语的得分将为 0。
类型名称:LMDirichlet
LM Jelinek Mercer 相似度
LM Jelinek Mercer 相似度。该算法试图捕捉文本中的重要模式,同时去除噪声。此相似度具有以下选项
- `lambda`:最佳值取决于集合和查询。对于标题查询,最佳值约为 `0.1`;对于长查询,最佳值约为 `0.7`。默认为 `0.1`。当该值接近 0 时,匹配更多查询词的文档将比匹配较少查询词的文档排名更高。
类型名称:LMJelinekMercer
脚本化相似度
一种相似度,允许您使用脚本来指定如何计算分数。例如,下面的示例展示了如何重新实现 TF-IDF
resp = client.indices.create( index="index", body={ "settings": { "number_of_shards": 1, "similarity": { "scripted_tfidf": { "type": "scripted", "script": { "source": "double tf = Math.sqrt(doc.freq); double idf = Math.log((field.docCount+1.0)/(term.docFreq+1.0)) + 1.0; double norm = 1/Math.sqrt(doc.length); return query.boost * tf * idf * norm;" }, } }, }, "mappings": { "properties": { "field": {"type": "text", "similarity": "scripted_tfidf"} } }, }, ) print(resp) resp = client.index( index="index", id="1", body={"field": "foo bar foo"}, ) print(resp) resp = client.index( index="index", id="2", body={"field": "bar baz"}, ) print(resp) resp = client.indices.refresh( index="index", ) print(resp) resp = client.search( index="index", explain="true", body={ "query": { "query_string": {"query": "foo^1.7", "default_field": "field"} } }, ) print(resp)
response = client.indices.create( index: 'index', body: { settings: { number_of_shards: 1, similarity: { scripted_tfidf: { type: 'scripted', script: { source: 'double tf = Math.sqrt(doc.freq); double idf = Math.log((field.docCount+1.0)/(term.docFreq+1.0)) + 1.0; double norm = 1/Math.sqrt(doc.length); return query.boost * tf * idf * norm;' } } } }, mappings: { properties: { field: { type: 'text', similarity: 'scripted_tfidf' } } } } ) puts response response = client.index( index: 'index', id: 1, body: { field: 'foo bar foo' } ) puts response response = client.index( index: 'index', id: 2, body: { field: 'bar baz' } ) puts response response = client.indices.refresh( index: 'index' ) puts response response = client.search( index: 'index', explain: true, body: { query: { query_string: { query: 'foo^1.7', default_field: 'field' } } } ) puts response
PUT /index { "settings": { "number_of_shards": 1, "similarity": { "scripted_tfidf": { "type": "scripted", "script": { "source": "double tf = Math.sqrt(doc.freq); double idf = Math.log((field.docCount+1.0)/(term.docFreq+1.0)) + 1.0; double norm = 1/Math.sqrt(doc.length); return query.boost * tf * idf * norm;" } } } }, "mappings": { "properties": { "field": { "type": "text", "similarity": "scripted_tfidf" } } } } PUT /index/_doc/1 { "field": "foo bar foo" } PUT /index/_doc/2 { "field": "bar baz" } POST /index/_refresh GET /index/_search?explain=true { "query": { "query_string": { "query": "foo^1.7", "default_field": "field" } } }
产生
{ "took": 12, "timed_out": false, "_shards": { "total": 1, "successful": 1, "skipped": 0, "failed": 0 }, "hits": { "total": { "value": 1, "relation": "eq" }, "max_score": 1.9508477, "hits": [ { "_shard": "[index][0]", "_node": "OzrdjxNtQGaqs4DmioFw9A", "_index": "index", "_id": "1", "_score": 1.9508477, "_source": { "field": "foo bar foo" }, "_explanation": { "value": 1.9508477, "description": "weight(field:foo in 0) [PerFieldSimilarity], result of:", "details": [ { "value": 1.9508477, "description": "score from ScriptedSimilarity(weightScript=[null], script=[Script{type=inline, lang='painless', idOrCode='double tf = Math.sqrt(doc.freq); double idf = Math.log((field.docCount+1.0)/(term.docFreq+1.0)) + 1.0; double norm = 1/Math.sqrt(doc.length); return query.boost * tf * idf * norm;', options={}, params={}}]) computed from:", "details": [ { "value": 1.0, "description": "weight", "details": [] }, { "value": 1.7, "description": "query.boost", "details": [] }, { "value": 2, "description": "field.docCount", "details": [] }, { "value": 4, "description": "field.sumDocFreq", "details": [] }, { "value": 5, "description": "field.sumTotalTermFreq", "details": [] }, { "value": 1, "description": "term.docFreq", "details": [] }, { "value": 2, "description": "term.totalTermFreq", "details": [] }, { "value": 2.0, "description": "doc.freq", "details": [] }, { "value": 3, "description": "doc.length", "details": [] } ] } ] } } ] } }
虽然脚本化相似度提供了很大的灵活性,但它们需要满足一组规则。如果不满足这些规则,可能会导致 Elasticsearch 在搜索时静默返回错误的排名靠前的命中结果(top hits),或因内部错误而失败:
- 返回的分数必须为正数。
- 在其他所有变量保持不变的情况下,当 `doc.freq` 增加时,分数不得减少。
- 在其他所有变量保持不变的情况下,当 `doc.length` 增加时,分数不得增加。
您可能已经注意到,上述脚本的很大一部分依赖于对每个文档都相同的统计数据。可以通过提供一个 `weight_script` 来稍微提高效率,该脚本将计算分数中与文档无关的部分,其结果可通过 `weight` 变量访问。如果没有提供 `weight_script`,则 `weight` 等于 `1`。`weight_script` 可以访问与 `script` 相同的变量,但 `doc` 除外,因为它应该计算对分数的与文档无关的贡献。
下面的配置将给出相同的 tf-idf 分数,但效率略高
resp = client.indices.create( index="index", body={ "settings": { "number_of_shards": 1, "similarity": { "scripted_tfidf": { "type": "scripted", "weight_script": { "source": "double idf = Math.log((field.docCount+1.0)/(term.docFreq+1.0)) + 1.0; return query.boost * idf;" }, "script": { "source": "double tf = Math.sqrt(doc.freq); double norm = 1/Math.sqrt(doc.length); return weight * tf * norm;" }, } }, }, "mappings": { "properties": { "field": {"type": "text", "similarity": "scripted_tfidf"} } }, }, ) print(resp)
response = client.indices.create( index: 'index', body: { settings: { number_of_shards: 1, similarity: { scripted_tfidf: { type: 'scripted', weight_script: { source: 'double idf = Math.log((field.docCount+1.0)/(term.docFreq+1.0)) + 1.0; return query.boost * idf;' }, script: { source: 'double tf = Math.sqrt(doc.freq); double norm = 1/Math.sqrt(doc.length); return weight * tf * norm;' } } } }, mappings: { properties: { field: { type: 'text', similarity: 'scripted_tfidf' } } } } ) puts response
PUT /index { "settings": { "number_of_shards": 1, "similarity": { "scripted_tfidf": { "type": "scripted", "weight_script": { "source": "double idf = Math.log((field.docCount+1.0)/(term.docFreq+1.0)) + 1.0; return query.boost * idf;" }, "script": { "source": "double tf = Math.sqrt(doc.freq); double norm = 1/Math.sqrt(doc.length); return weight * tf * norm;" } } } }, "mappings": { "properties": { "field": { "type": "text", "similarity": "scripted_tfidf" } } } }
类型名称:scripted
默认相似度
默认情况下,Elasticsearch 将使用配置为 `default` 的任何相似度。
您可以在创建索引时更改索引中所有字段的默认相似度
resp = client.indices.create( index="index", body={ "settings": { "index": {"similarity": {"default": {"type": "boolean"}}} } }, ) print(resp)
response = client.indices.create( index: 'index', body: { settings: { index: { similarity: { default: { type: 'boolean' } } } } } ) puts response
PUT /index { "settings": { "index": { "similarity": { "default": { "type": "boolean" } } } } }
如果要在创建索引后更改默认相似度,则必须关闭索引,发送以下请求,然后再次打开它
resp = client.indices.close( index="index", ) print(resp) resp = client.indices.put_settings( index="index", body={"index": {"similarity": {"default": {"type": "boolean"}}}}, ) print(resp) resp = client.indices.open( index="index", ) print(resp)
response = client.indices.close( index: 'index' ) puts response response = client.indices.put_settings( index: 'index', body: { index: { similarity: { default: { type: 'boolean' } } } } ) puts response response = client.indices.open( index: 'index' ) puts response
POST /index/_close PUT /index/_settings { "index": { "similarity": { "default": { "type": "boolean" } } } } POST /index/_open