关键词重复词元过滤器
关键词重复词元过滤器
输出流中每个词元的关键词版本。这些关键词词元不会进行词干提取。
keyword_repeat
过滤器为关键词词元分配一个 keyword
属性,其值为 true
。词干提取词元过滤器,例如 stemmer
或 porter_stem
,会跳过 keyword
属性为 true
的词元。
你可以将 keyword_repeat
过滤器与词干提取词元过滤器一起使用,以输出流中每个词元的词干提取版本和未提取词干版本。
为了正常工作,keyword_repeat
过滤器必须在分析器配置中的任何词干提取词元过滤器之前列出。
词干提取并非影响所有词元。这意味着流中可能包含位于相同位置的重复词元,即使在词干提取之后也是如此。
要删除这些重复词元,请在分析器配置中的词干提取过滤器之后添加 remove_duplicates
过滤器。
keyword_repeat
过滤器使用 Lucene 的 KeywordRepeatFilter。
示例
以下 analyze API 请求使用 keyword_repeat
过滤器输出 fox running and jumping
中每个词元的关键词和非关键词版本。
要返回这些词元的 keyword
属性,分析 API 请求还包括以下参数:
- explain: true
- attributes: keyword
resp = client.indices.analyze( tokenizer="whitespace", filter=[ "keyword_repeat" ], text="fox running and jumping", explain=True, attributes="keyword", ) print(resp)
response = client.indices.analyze( body: { tokenizer: 'whitespace', filter: [ 'keyword_repeat' ], text: 'fox running and jumping', explain: true, attributes: 'keyword' } ) puts response
const response = await client.indices.analyze({ tokenizer: "whitespace", filter: ["keyword_repeat"], text: "fox running and jumping", explain: true, attributes: "keyword", }); console.log(response);
GET /_analyze { "tokenizer": "whitespace", "filter": [ "keyword_repeat" ], "text": "fox running and jumping", "explain": true, "attributes": "keyword" }
API 返回以下响应。请注意,每个词元的一个版本都具有 keyword
属性,其值为 true
。
响应
{ "detail": { "custom_analyzer": true, "charfilters": [], "tokenizer": ..., "tokenfilters": [ { "name": "keyword_repeat", "tokens": [ { "token": "fox", "start_offset": 0, "end_offset": 3, "type": "word", "position": 0, "keyword": true }, { "token": "fox", "start_offset": 0, "end_offset": 3, "type": "word", "position": 0, "keyword": false }, { "token": "running", "start_offset": 4, "end_offset": 11, "type": "word", "position": 1, "keyword": true }, { "token": "running", "start_offset": 4, "end_offset": 11, "type": "word", "position": 1, "keyword": false }, { "token": "and", "start_offset": 12, "end_offset": 15, "type": "word", "position": 2, "keyword": true }, { "token": "and", "start_offset": 12, "end_offset": 15, "type": "word", "position": 2, "keyword": false }, { "token": "jumping", "start_offset": 16, "end_offset": 23, "type": "word", "position": 3, "keyword": true }, { "token": "jumping", "start_offset": 16, "end_offset": 23, "type": "word", "position": 3, "keyword": false } ] } ] } }
要提取非关键词词元的词干,请在之前的分析 API 请求中,在 keyword_repeat
过滤器之后添加 stemmer
过滤器。
resp = client.indices.analyze( tokenizer="whitespace", filter=[ "keyword_repeat", "stemmer" ], text="fox running and jumping", explain=True, attributes="keyword", ) print(resp)
response = client.indices.analyze( body: { tokenizer: 'whitespace', filter: [ 'keyword_repeat', 'stemmer' ], text: 'fox running and jumping', explain: true, attributes: 'keyword' } ) puts response
const response = await client.indices.analyze({ tokenizer: "whitespace", filter: ["keyword_repeat", "stemmer"], text: "fox running and jumping", explain: true, attributes: "keyword", }); console.log(response);
GET /_analyze { "tokenizer": "whitespace", "filter": [ "keyword_repeat", "stemmer" ], "text": "fox running and jumping", "explain": true, "attributes": "keyword" }
API 返回以下响应。请注意以下更改:
running 的非关键词版本被词干提取为 run。
jumping 的非关键词版本被词干提取为 jump。
响应
{ "detail": { "custom_analyzer": true, "charfilters": [], "tokenizer": ..., "tokenfilters": [ { "name": "keyword_repeat", "tokens": ... }, { "name": "stemmer", "tokens": [ { "token": "fox", "start_offset": 0, "end_offset": 3, "type": "word", "position": 0, "keyword": true }, { "token": "fox", "start_offset": 0, "end_offset": 3, "type": "word", "position": 0, "keyword": false }, { "token": "running", "start_offset": 4, "end_offset": 11, "type": "word", "position": 1, "keyword": true }, { "token": "run", "start_offset": 4, "end_offset": 11, "type": "word", "position": 1, "keyword": false }, { "token": "and", "start_offset": 12, "end_offset": 15, "type": "word", "position": 2, "keyword": true }, { "token": "and", "start_offset": 12, "end_offset": 15, "type": "word", "position": 2, "keyword": false }, { "token": "jumping", "start_offset": 16, "end_offset": 23, "type": "word", "position": 3, "keyword": true }, { "token": "jump", "start_offset": 16, "end_offset": 23, "type": "word", "position": 3, "keyword": false } ] } ] } }
但是,fox
和 and
的关键词和非关键词版本是相同的,并且位于各自相同的位置。
要删除这些重复的词元,请在分析 API 请求中,在 stemmer
之后添加 remove_duplicates
过滤器。
resp = client.indices.analyze( tokenizer="whitespace", filter=[ "keyword_repeat", "stemmer", "remove_duplicates" ], text="fox running and jumping", explain=True, attributes="keyword", ) print(resp)
response = client.indices.analyze( body: { tokenizer: 'whitespace', filter: [ 'keyword_repeat', 'stemmer', 'remove_duplicates' ], text: 'fox running and jumping', explain: true, attributes: 'keyword' } ) puts response
const response = await client.indices.analyze({ tokenizer: "whitespace", filter: ["keyword_repeat", "stemmer", "remove_duplicates"], text: "fox running and jumping", explain: true, attributes: "keyword", }); console.log(response);
GET /_analyze { "tokenizer": "whitespace", "filter": [ "keyword_repeat", "stemmer", "remove_duplicates" ], "text": "fox running and jumping", "explain": true, "attributes": "keyword" }
API 返回以下响应。请注意,已删除 fox
和 and
的重复词元。
响应
{ "detail": { "custom_analyzer": true, "charfilters": [], "tokenizer": ..., "tokenfilters": [ { "name": "keyword_repeat", "tokens": ... }, { "name": "stemmer", "tokens": ... }, { "name": "remove_duplicates", "tokens": [ { "token": "fox", "start_offset": 0, "end_offset": 3, "type": "word", "position": 0, "keyword": true }, { "token": "running", "start_offset": 4, "end_offset": 11, "type": "word", "position": 1, "keyword": true }, { "token": "run", "start_offset": 4, "end_offset": 11, "type": "word", "position": 1, "keyword": false }, { "token": "and", "start_offset": 12, "end_offset": 15, "type": "word", "position": 2, "keyword": true }, { "token": "jumping", "start_offset": 16, "end_offset": 23, "type": "word", "position": 3, "keyword": true }, { "token": "jump", "start_offset": 16, "end_offset": 23, "type": "word", "position": 3, "keyword": false } ] } ] } }
添加到分析器
以下创建索引 API 请求使用 keyword_repeat
过滤器配置一个新的 自定义分析器。
此自定义分析器使用 keyword_repeat
和 porter_stem
过滤器来创建流中每个词元的词干提取版本和未提取词干版本。remove_duplicates
过滤器随后会从流中删除任何重复的词元。
resp = client.indices.create( index="my-index-000001", settings={ "analysis": { "analyzer": { "my_custom_analyzer": { "tokenizer": "standard", "filter": [ "keyword_repeat", "porter_stem", "remove_duplicates" ] } } } }, ) print(resp)
response = client.indices.create( index: 'my-index-000001', body: { settings: { analysis: { analyzer: { my_custom_analyzer: { tokenizer: 'standard', filter: [ 'keyword_repeat', 'porter_stem', 'remove_duplicates' ] } } } } } ) puts response
const response = await client.indices.create({ index: "my-index-000001", settings: { analysis: { analyzer: { my_custom_analyzer: { tokenizer: "standard", filter: ["keyword_repeat", "porter_stem", "remove_duplicates"], }, }, }, }, }); console.log(response);
PUT /my-index-000001 { "settings": { "analysis": { "analyzer": { "my_custom_analyzer": { "tokenizer": "standard", "filter": [ "keyword_repeat", "porter_stem", "remove_duplicates" ] } } } } }