关键词重复标记过滤器
关键词重复标记过滤器
输出流中每个标记的关键词版本。这些关键词标记不会进行词干提取。
keyword_repeat
过滤器为关键词标记分配 keyword
属性为 true
。词干提取标记过滤器,例如 stemmer
或 porter_stem
,会跳过 keyword
属性为 true
的标记。
您可以将 keyword_repeat
过滤器与词干提取标记过滤器一起使用,以输出流中每个标记的词干提取版本和未词干提取版本。
为了正常工作,keyword_repeat
过滤器必须列在 分析器配置 中任何词干提取过滤器之前。
词干提取不会影响所有标记。这意味着即使在词干提取之后,流中也可能在相同位置包含重复的标记。
要删除这些重复的标记,请在分析器配置中将 remove_duplicates
过滤器添加到词干提取过滤器之后。
keyword_repeat
过滤器使用 Lucene 的 KeywordRepeatFilter。
示例
下面的分析 API 请求使用 keyword_repeat
过滤器,输出 fox running and jumping
中每个标记的关键词版本和非关键词版本。
要返回这些标记的 keyword
属性,分析 API 请求还包括以下参数:

- explain: true
- attributes: keyword
# Analyze the sample text with the keyword_repeat filter; explain=True and
# attributes="keyword" expose each token's keyword flag in the response.
resp = client.indices.analyze(
    tokenizer="whitespace",
    filter=[
        "keyword_repeat"
    ],
    text="fox running and jumping",
    explain=True,
    attributes="keyword",
)
print(resp)
# Same request via the Ruby client: tokenize on whitespace, apply the
# keyword_repeat filter, and request each token's keyword attribute.
response = client.indices.analyze(
  body: {
    tokenizer: 'whitespace',
    filter: [
      'keyword_repeat'
    ],
    text: 'fox running and jumping',
    explain: true,
    attributes: 'keyword'
  }
)
puts response
// Analyze the sample text with the keyword_repeat filter; explain and
// attributes expose each token's keyword flag in the response.
const response = await client.indices.analyze({
  tokenizer: "whitespace",
  filter: ["keyword_repeat"],
  text: "fox running and jumping",
  explain: true,
  attributes: "keyword",
});
console.log(response);
GET /_analyze { "tokenizer": "whitespace", "filter": [ "keyword_repeat" ], "text": "fox running and jumping", "explain": true, "attributes": "keyword" }
API 返回以下响应。请注意,每个标记都有一个版本的 keyword
属性为 true
。
响应
{ "detail": { "custom_analyzer": true, "charfilters": [], "tokenizer": ..., "tokenfilters": [ { "name": "keyword_repeat", "tokens": [ { "token": "fox", "start_offset": 0, "end_offset": 3, "type": "word", "position": 0, "keyword": true }, { "token": "fox", "start_offset": 0, "end_offset": 3, "type": "word", "position": 0, "keyword": false }, { "token": "running", "start_offset": 4, "end_offset": 11, "type": "word", "position": 1, "keyword": true }, { "token": "running", "start_offset": 4, "end_offset": 11, "type": "word", "position": 1, "keyword": false }, { "token": "and", "start_offset": 12, "end_offset": 15, "type": "word", "position": 2, "keyword": true }, { "token": "and", "start_offset": 12, "end_offset": 15, "type": "word", "position": 2, "keyword": false }, { "token": "jumping", "start_offset": 16, "end_offset": 23, "type": "word", "position": 3, "keyword": true }, { "token": "jumping", "start_offset": 16, "end_offset": 23, "type": "word", "position": 3, "keyword": false } ] } ] } }
要对非关键词标记进行词干提取,请在之前的分析 API 请求中将 stemmer
过滤器添加到 keyword_repeat
过滤器之后。
# Repeat the analyze request with a stemmer filter after keyword_repeat:
# only tokens whose keyword attribute is false get stemmed.
resp = client.indices.analyze(
    tokenizer="whitespace",
    filter=[
        "keyword_repeat",
        "stemmer"
    ],
    text="fox running and jumping",
    explain=True,
    attributes="keyword",
)
print(resp)
# Ruby client version: keyword_repeat followed by stemmer, so the
# non-keyword copy of each token is stemmed.
response = client.indices.analyze(
  body: {
    tokenizer: 'whitespace',
    filter: [
      'keyword_repeat',
      'stemmer'
    ],
    text: 'fox running and jumping',
    explain: true,
    attributes: 'keyword'
  }
)
puts response
// JavaScript client version: keyword_repeat followed by stemmer, so the
// non-keyword copy of each token is stemmed.
const response = await client.indices.analyze({
  tokenizer: "whitespace",
  filter: ["keyword_repeat", "stemmer"],
  text: "fox running and jumping",
  explain: true,
  attributes: "keyword",
});
console.log(response);
GET /_analyze { "tokenizer": "whitespace", "filter": [ "keyword_repeat", "stemmer" ], "text": "fox running and jumping", "explain": true, "attributes": "keyword" }
API 返回以下响应。请注意以下更改:

- running 的非关键词版本已词干提取为 run。
- jumping 的非关键词版本已词干提取为 jump。
响应
{ "detail": { "custom_analyzer": true, "charfilters": [], "tokenizer": ..., "tokenfilters": [ { "name": "keyword_repeat", "tokens": ... }, { "name": "stemmer", "tokens": [ { "token": "fox", "start_offset": 0, "end_offset": 3, "type": "word", "position": 0, "keyword": true }, { "token": "fox", "start_offset": 0, "end_offset": 3, "type": "word", "position": 0, "keyword": false }, { "token": "running", "start_offset": 4, "end_offset": 11, "type": "word", "position": 1, "keyword": true }, { "token": "run", "start_offset": 4, "end_offset": 11, "type": "word", "position": 1, "keyword": false }, { "token": "and", "start_offset": 12, "end_offset": 15, "type": "word", "position": 2, "keyword": true }, { "token": "and", "start_offset": 12, "end_offset": 15, "type": "word", "position": 2, "keyword": false }, { "token": "jumping", "start_offset": 16, "end_offset": 23, "type": "word", "position": 3, "keyword": true }, { "token": "jump", "start_offset": 16, "end_offset": 23, "type": "word", "position": 3, "keyword": false } ] } ] } }
但是,fox
和 and
的关键词版本和非关键词版本相同,并且在相同的位置。
要删除这些重复的标记,请在分析 API 请求中将 remove_duplicates
过滤器添加到 stemmer
之后。
# Full chain: keyword_repeat duplicates each token, stemmer stems the
# non-keyword copies, remove_duplicates drops same-position repeats.
resp = client.indices.analyze(
    tokenizer="whitespace",
    filter=[
        "keyword_repeat",
        "stemmer",
        "remove_duplicates"
    ],
    text="fox running and jumping",
    explain=True,
    attributes="keyword",
)
print(resp)
# Ruby client version of the full filter chain; remove_duplicates prunes
# identical tokens that share a position after stemming.
response = client.indices.analyze(
  body: {
    tokenizer: 'whitespace',
    filter: [
      'keyword_repeat',
      'stemmer',
      'remove_duplicates'
    ],
    text: 'fox running and jumping',
    explain: true,
    attributes: 'keyword'
  }
)
puts response
// JavaScript client version of the full filter chain; remove_duplicates
// prunes identical tokens that share a position after stemming.
const response = await client.indices.analyze({
  tokenizer: "whitespace",
  filter: ["keyword_repeat", "stemmer", "remove_duplicates"],
  text: "fox running and jumping",
  explain: true,
  attributes: "keyword",
});
console.log(response);
GET /_analyze { "tokenizer": "whitespace", "filter": [ "keyword_repeat", "stemmer", "remove_duplicates" ], "text": "fox running and jumping", "explain": true, "attributes": "keyword" }
API 返回以下响应。请注意,fox
和 and
的重复标记已被删除。
响应
{ "detail": { "custom_analyzer": true, "charfilters": [], "tokenizer": ..., "tokenfilters": [ { "name": "keyword_repeat", "tokens": ... }, { "name": "stemmer", "tokens": ... }, { "name": "remove_duplicates", "tokens": [ { "token": "fox", "start_offset": 0, "end_offset": 3, "type": "word", "position": 0, "keyword": true }, { "token": "running", "start_offset": 4, "end_offset": 11, "type": "word", "position": 1, "keyword": true }, { "token": "run", "start_offset": 4, "end_offset": 11, "type": "word", "position": 1, "keyword": false }, { "token": "and", "start_offset": 12, "end_offset": 15, "type": "word", "position": 2, "keyword": true }, { "token": "jumping", "start_offset": 16, "end_offset": 23, "type": "word", "position": 3, "keyword": true }, { "token": "jump", "start_offset": 16, "end_offset": 23, "type": "word", "position": 3, "keyword": false } ] } ] } }
添加到分析器
以下创建索引 API 请求使用 keyword_repeat
过滤器来配置新的 自定义分析器。
此自定义分析器使用 keyword_repeat
和 porter_stem
过滤器来创建流中每个标记的词干提取版本和未词干提取版本。remove_duplicates
过滤器随后会删除流中的任何重复标记。
# Create an index whose custom analyzer keeps both the stemmed and the
# original form of each token, then drops duplicates at the same position.
resp = client.indices.create(
    index="my-index-000001",
    settings={
        "analysis": {
            "analyzer": {
                "my_custom_analyzer": {
                    "tokenizer": "standard",
                    "filter": [
                        "keyword_repeat",
                        "porter_stem",
                        "remove_duplicates"
                    ]
                }
            }
        }
    },
)
print(resp)
# Ruby client version: configure my_custom_analyzer with keyword_repeat,
# porter_stem and remove_duplicates, in that order.
response = client.indices.create(
  index: 'my-index-000001',
  body: {
    settings: {
      analysis: {
        analyzer: {
          my_custom_analyzer: {
            tokenizer: 'standard',
            filter: [
              'keyword_repeat',
              'porter_stem',
              'remove_duplicates'
            ]
          }
        }
      }
    }
  }
)
puts response
// JavaScript client version: configure my_custom_analyzer with
// keyword_repeat, porter_stem and remove_duplicates, in that order.
const response = await client.indices.create({
  index: "my-index-000001",
  settings: {
    analysis: {
      analyzer: {
        my_custom_analyzer: {
          tokenizer: "standard",
          filter: ["keyword_repeat", "porter_stem", "remove_duplicates"],
        },
      },
    },
  },
});
console.log(response);
PUT /my-index-000001 { "settings": { "analysis": { "analyzer": { "my_custom_analyzer": { "tokenizer": "standard", "filter": [ "keyword_repeat", "porter_stem", "remove_duplicates" ] } } } } }