创建自定义分析器
创建自定义分析器

当内置分析器不能满足您的需求时,您可以创建一个 custom
分析器,它使用以下组件的合适组合:

- 零个或多个字符过滤器(character filter)
- 一个分词器(tokenizer)
- 零个或多个词元过滤器(token filter)
配置
custom
分析器接受以下参数:
| 参数 | 说明 |
| --- | --- |
| `type` | 分析器类型。接受内置分析器类型。对于自定义分析器,请使用 `custom`。 |
| `tokenizer` | 一个内置的或自定义的分词器。(必需) |
| `char_filter` | 一个可选的内置或自定义的字符过滤器数组。 |
| `filter` | 一个可选的内置或自定义的词元过滤器数组。 |
| `position_increment_gap` | 当索引文本值数组时,Elasticsearch 在一个值的最后一个词元和下一个值的第一个词元之间插入一个假的“间隔”,以确保短语查询不会匹配来自不同数组元素的两个词元。默认为 `100`。 |
配置示例
这是一个结合了以下内容的示例:
- 字符过滤器:html_strip(HTML 剥离字符过滤器)
- 分词器:standard(标准分词器)
- 词元过滤器:lowercase(小写词元过滤器)和 asciifolding(ASCII 折叠词元过滤器)
# Create an index whose custom analyzer strips HTML tags, tokenizes with the
# standard tokenizer, then lowercases and ASCII-folds each token.
resp = client.indices.create(
    index="my-index-000001",
    settings={
        "analysis": {
            "analyzer": {
                "my_custom_analyzer": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "char_filter": ["html_strip"],
                    "filter": ["lowercase", "asciifolding"],
                }
            }
        }
    },
)
print(resp)

# Exercise the analyzer. The opening "<b>" tag is restored (it was dropped in
# this variant) so the html_strip character filter has markup to remove,
# matching the console example.
resp1 = client.indices.analyze(
    index="my-index-000001",
    analyzer="my_custom_analyzer",
    text="Is this <b>déjà vu</b>?",
)
print(resp1)
# Create an index whose custom analyzer strips HTML tags, tokenizes with the
# standard tokenizer, then lowercases and ASCII-folds each token.
response = client.indices.create(
  index: 'my-index-000001',
  body: {
    settings: {
      analysis: {
        analyzer: {
          my_custom_analyzer: {
            type: 'custom',
            tokenizer: 'standard',
            char_filter: [
              'html_strip'
            ],
            filter: [
              'lowercase',
              'asciifolding'
            ]
          }
        }
      }
    }
  }
)
puts response

# Exercise the analyzer. The opening "<b>" tag is restored (it was dropped in
# this variant) so the html_strip character filter has markup to remove,
# matching the console example.
response = client.indices.analyze(
  index: 'my-index-000001',
  body: {
    analyzer: 'my_custom_analyzer',
    text: 'Is this <b>déjà vu</b>?'
  }
)
puts response
// Create an index whose custom analyzer strips HTML tags, tokenizes with the
// standard tokenizer, then lowercases and ASCII-folds each token.
const response = await client.indices.create({
  index: "my-index-000001",
  settings: {
    analysis: {
      analyzer: {
        my_custom_analyzer: {
          type: "custom",
          tokenizer: "standard",
          char_filter: ["html_strip"],
          filter: ["lowercase", "asciifolding"],
        },
      },
    },
  },
});
console.log(response);

// Exercise the analyzer. The opening "<b>" tag is restored (it was dropped in
// this variant) so the html_strip character filter has markup to remove,
// matching the console example.
const response1 = await client.indices.analyze({
  index: "my-index-000001",
  analyzer: "my_custom_analyzer",
  text: "Is this <b>déjà vu</b>?",
});
console.log(response1);
PUT my-index-000001
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_custom_analyzer": {
          "type": "custom",
          "tokenizer": "standard",
          "char_filter": [ "html_strip" ],
          "filter": [ "lowercase", "asciifolding" ]
        }
      }
    }
  }
}

POST my-index-000001/_analyze
{
  "analyzer": "my_custom_analyzer",
  "text": "Is this <b>déjà vu</b>?"
}
上面的示例生成以下词元:
[ is, this, deja, vu ]
前面的示例使用了默认配置的分词器、词元过滤器和字符过滤器,但是可以创建每个配置的版本,并在自定义分析器中使用它们。
这是一个更复杂的示例,它结合了以下内容:

- 自定义的 mapping 字符过滤器(emoticons)
- 自定义的 pattern 分词器(punctuation)
- 自定义的 stop 词元过滤器(english_stop)

这是一个示例:
# Build an index with fully custom analysis components: a "mapping" char
# filter that rewrites emoticons, a "pattern" tokenizer that splits on
# punctuation, and a "stop" token filter using the English stop-word list.
resp = client.indices.create(
    index="my-index-000001",
    settings={
        "analysis": {
            "analyzer": {
                "my_custom_analyzer": {
                    "char_filter": ["emoticons"],
                    "tokenizer": "punctuation",
                    "filter": ["lowercase", "english_stop"],
                }
            },
            "tokenizer": {
                "punctuation": {"type": "pattern", "pattern": "[ .,!?]"}
            },
            "char_filter": {
                "emoticons": {
                    "type": "mapping",
                    "mappings": [":) => _happy_", ":( => _sad_"],
                }
            },
            "filter": {
                "english_stop": {"type": "stop", "stopwords": "_english_"}
            },
        }
    },
)
print(resp)

# Run the analyzer against sample text containing an emoticon.
resp1 = client.indices.analyze(
    index="my-index-000001",
    analyzer="my_custom_analyzer",
    text="I'm a :) person, and you?",
)
print(resp1)
# Build an index with fully custom analysis components: a "mapping" char
# filter that rewrites emoticons, a "pattern" tokenizer that splits on
# punctuation, and a "stop" token filter using the English stop-word list.
resp = client.indices.create(
  index: 'my-index-000001',
  body: {
    settings: {
      analysis: {
        analyzer: {
          my_custom_analyzer: {
            char_filter: ['emoticons'],
            tokenizer: 'punctuation',
            filter: ['lowercase', 'english_stop']
          }
        },
        tokenizer: {
          punctuation: { type: 'pattern', pattern: '[ .,!?]' }
        },
        char_filter: {
          emoticons: {
            type: 'mapping',
            mappings: [':) => _happy_', ':( => _sad_']
          }
        },
        filter: {
          english_stop: { type: 'stop', stopwords: '_english_' }
        }
      }
    }
  }
)
puts resp

# Run the analyzer against sample text containing an emoticon.
resp = client.indices.analyze(
  index: 'my-index-000001',
  body: {
    analyzer: 'my_custom_analyzer',
    text: "I'm a :) person, and you?"
  }
)
puts resp
// Build an index with fully custom analysis components: a "mapping" char
// filter that rewrites emoticons, a "pattern" tokenizer that splits on
// punctuation, and a "stop" token filter using the English stop-word list.
const createResp = await client.indices.create({
  index: "my-index-000001",
  settings: {
    analysis: {
      analyzer: {
        my_custom_analyzer: {
          char_filter: ["emoticons"],
          tokenizer: "punctuation",
          filter: ["lowercase", "english_stop"],
        },
      },
      tokenizer: {
        punctuation: { type: "pattern", pattern: "[ .,!?]" },
      },
      char_filter: {
        emoticons: {
          type: "mapping",
          mappings: [":) => _happy_", ":( => _sad_"],
        },
      },
      filter: {
        english_stop: { type: "stop", stopwords: "_english_" },
      },
    },
  },
});
console.log(createResp);

// Run the analyzer against sample text containing an emoticon.
const analyzeResp = await client.indices.analyze({
  index: "my-index-000001",
  analyzer: "my_custom_analyzer",
  text: "I'm a :) person, and you?",
});
console.log(analyzeResp);
PUT my-index-000001
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_custom_analyzer": {
          "char_filter": [ "emoticons" ],
          "tokenizer": "punctuation",
          "filter": [ "lowercase", "english_stop" ]
        }
      },
      "tokenizer": {
        "punctuation": {
          "type": "pattern",
          "pattern": "[ .,!?]"
        }
      },
      "char_filter": {
        "emoticons": {
          "type": "mapping",
          "mappings": [ ":) => _happy_", ":( => _sad_" ]
        }
      },
      "filter": {
        "english_stop": {
          "type": "stop",
          "stopwords": "_english_"
        }
      }
    }
  }
}

POST my-index-000001/_analyze
{
  "analyzer": "my_custom_analyzer",
  "text": "I'm a :) person, and you?"
}
1. 为索引分配一个默认的自定义分析器 my_custom_analyzer。该分析器使用请求中稍后定义的自定义分词器、字符过滤器和词元过滤器。
2. 定义自定义的 punctuation 分词器。
3. 定义自定义的 emoticons 字符过滤器。
4. 定义自定义的 english_stop 词元过滤器。
上面的示例生成以下词元:
[ i'm, _happy_, person, you ]