移除重复词元过滤器
移除同一位置的重复词元。remove_duplicates 过滤器使用 Lucene 的 RemoveDuplicatesTokenFilter。
示例
要了解 remove_duplicates 过滤器的使用方法,您首先需要生成一个在同一位置包含重复词元的词元流。
以下分析 API 请求使用 keyword_repeat 和 stemmer 过滤器为 jumping dog 创建词干提取和未词干提取的词元。
# Produce stemmed and unstemmed tokens at the same position:
# keyword_repeat emits each token twice, stemmer then stems one copy.
resp = client.indices.analyze(
    tokenizer="whitespace",
    filter=[
        "keyword_repeat",
        "stemmer",
    ],
    text="jumping dog",
)
print(resp)
# Produce stemmed and unstemmed tokens at the same position:
# keyword_repeat emits each token twice, stemmer then stems one copy.
response = client.indices.analyze(
  body: {
    tokenizer: 'whitespace',
    filter: [
      'keyword_repeat',
      'stemmer'
    ],
    text: 'jumping dog'
  }
)
puts response
// Produce stemmed and unstemmed tokens at the same position:
// keyword_repeat emits each token twice, stemmer then stems one copy.
const response = await client.indices.analyze({
  tokenizer: "whitespace",
  filter: ["keyword_repeat", "stemmer"],
  text: "jumping dog",
});
console.log(response);
GET _analyze
{
  "tokenizer": "whitespace",
  "filter": [
    "keyword_repeat",
    "stemmer"
  ],
  "text": "jumping dog"
}
API 返回以下响应。请注意,位置 1 处的 dog 词元是重复的。
{
  "tokens": [
    {
      "token": "jumping",
      "start_offset": 0,
      "end_offset": 7,
      "type": "word",
      "position": 0
    },
    {
      "token": "jump",
      "start_offset": 0,
      "end_offset": 7,
      "type": "word",
      "position": 0
    },
    {
      "token": "dog",
      "start_offset": 8,
      "end_offset": 11,
      "type": "word",
      "position": 1
    },
    {
      "token": "dog",
      "start_offset": 8,
      "end_offset": 11,
      "type": "word",
      "position": 1
    }
  ]
}
要移除其中一个重复的 dog 词元,请将 remove_duplicates 过滤器添加到之前的分析 API 请求中。
# Same pipeline as before, with remove_duplicates appended so the
# duplicate "dog" token at position 1 is dropped.
resp = client.indices.analyze(
    tokenizer="whitespace",
    filter=[
        "keyword_repeat",
        "stemmer",
        "remove_duplicates",
    ],
    text="jumping dog",
)
print(resp)
# Same pipeline as before, with remove_duplicates appended so the
# duplicate "dog" token at position 1 is dropped.
response = client.indices.analyze(
  body: {
    tokenizer: 'whitespace',
    filter: [
      'keyword_repeat',
      'stemmer',
      'remove_duplicates'
    ],
    text: 'jumping dog'
  }
)
puts response
// Same pipeline as before, with remove_duplicates appended so the
// duplicate "dog" token at position 1 is dropped.
const response = await client.indices.analyze({
  tokenizer: "whitespace",
  filter: ["keyword_repeat", "stemmer", "remove_duplicates"],
  text: "jumping dog",
});
console.log(response);
GET _analyze
{
  "tokenizer": "whitespace",
  "filter": [
    "keyword_repeat",
    "stemmer",
    "remove_duplicates"
  ],
  "text": "jumping dog"
}
API 返回以下响应。现在位置 1 处只有一个 dog 词元。
{
  "tokens": [
    {
      "token": "jumping",
      "start_offset": 0,
      "end_offset": 7,
      "type": "word",
      "position": 0
    },
    {
      "token": "jump",
      "start_offset": 0,
      "end_offset": 7,
      "type": "word",
      "position": 0
    },
    {
      "token": "dog",
      "start_offset": 8,
      "end_offset": 11,
      "type": "word",
      "position": 1
    }
  ]
}
添加到分析器
以下创建索引 API 请求使用 remove_duplicates 过滤器配置一个新的自定义分析器。
此自定义分析器使用 keyword_repeat 和 stemmer 过滤器为流中的每个词元创建词干提取和未词干提取的版本。然后,remove_duplicates 过滤器会移除同一位置的任何重复词元。
# Create an index whose custom analyzer stems each token while keeping
# the original (keyword_repeat + stemmer), then de-duplicates tokens
# that end up identical at the same position.
resp = client.indices.create(
    index="my-index-000001",
    settings={
        "analysis": {
            "analyzer": {
                "my_custom_analyzer": {
                    "tokenizer": "standard",
                    "filter": [
                        "keyword_repeat",
                        "stemmer",
                        "remove_duplicates",
                    ],
                }
            }
        }
    },
)
print(resp)
# Create an index whose custom analyzer stems each token while keeping
# the original (keyword_repeat + stemmer), then de-duplicates tokens
# that end up identical at the same position.
response = client.indices.create(
  index: 'my-index-000001',
  body: {
    settings: {
      analysis: {
        analyzer: {
          my_custom_analyzer: {
            tokenizer: 'standard',
            filter: [
              'keyword_repeat',
              'stemmer',
              'remove_duplicates'
            ]
          }
        }
      }
    }
  }
)
puts response
// Create an index whose custom analyzer stems each token while keeping
// the original (keyword_repeat + stemmer), then de-duplicates tokens
// that end up identical at the same position.
const response = await client.indices.create({
  index: "my-index-000001",
  settings: {
    analysis: {
      analyzer: {
        my_custom_analyzer: {
          tokenizer: "standard",
          filter: ["keyword_repeat", "stemmer", "remove_duplicates"],
        },
      },
    },
  },
});
console.log(response);
PUT my-index-000001
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_custom_analyzer": {
          "tokenizer": "standard",
          "filter": [
            "keyword_repeat",
            "stemmer",
            "remove_duplicates"
          ]
        }
      }
    }
  }
}