模式替换字符过滤器
模式替换字符过滤器
pattern_replace 字符过滤器使用正则表达式来匹配需要被替换为指定替换字符串的字符。替换字符串可以引用正则表达式中的捕获组。
注意病态正则表达式
模式替换字符过滤器使用 Java 正则表达式。
编写不当的正则表达式可能会运行非常缓慢,甚至抛出 StackOverflowError 并导致其运行所在的节点突然退出。
阅读更多关于 病态正则表达式以及如何避免它们 的信息。
配置
pattern_replace 字符过滤器接受以下参数:
pattern
一个 Java 正则表达式。必填。
replacement
替换字符串,可以使用 $1..$9 语法引用捕获组。
flags
Java 正则表达式标志。标志应以管道符分隔,例如 "CASE_INSENSITIVE|COMMENTS"。
示例配置
在此示例中,我们配置 pattern_replace 字符过滤器,将数字中嵌入的任何短划线替换为下划线,即 123-456-789 → 123_456_789:
resp = client.indices.create( index="my-index-000001", settings={ "analysis": { "analyzer": { "my_analyzer": { "tokenizer": "standard", "char_filter": [ "my_char_filter" ] } }, "char_filter": { "my_char_filter": { "type": "pattern_replace", "pattern": "(\\d+)-(?=\\d)", "replacement": "$1_" } } } }, ) print(resp) resp1 = client.indices.analyze( index="my-index-000001", analyzer="my_analyzer", text="My credit card is 123-456-789", ) print(resp1)
response = client.indices.create( index: 'my-index-000001', body: { settings: { analysis: { analyzer: { my_analyzer: { tokenizer: 'standard', char_filter: [ 'my_char_filter' ] } }, char_filter: { my_char_filter: { type: 'pattern_replace', pattern: '(\\d+)-(?=\\d)', replacement: '$1_' } } } } } ) puts response response = client.indices.analyze( index: 'my-index-000001', body: { analyzer: 'my_analyzer', text: 'My credit card is 123-456-789' } ) puts response
const response = await client.indices.create({ index: "my-index-000001", settings: { analysis: { analyzer: { my_analyzer: { tokenizer: "standard", char_filter: ["my_char_filter"], }, }, char_filter: { my_char_filter: { type: "pattern_replace", pattern: "(\\d+)-(?=\\d)", replacement: "$1_", }, }, }, }, }); console.log(response); const response1 = await client.indices.analyze({ index: "my-index-000001", analyzer: "my_analyzer", text: "My credit card is 123-456-789", }); console.log(response1);
PUT my-index-000001 { "settings": { "analysis": { "analyzer": { "my_analyzer": { "tokenizer": "standard", "char_filter": [ "my_char_filter" ] } }, "char_filter": { "my_char_filter": { "type": "pattern_replace", "pattern": "(\\d+)-(?=\\d)", "replacement": "$1_" } } } } } POST my-index-000001/_analyze { "analyzer": "my_analyzer", "text": "My credit card is 123-456-789" }
上述示例生成以下词项:
[ My, credit, card, is, 123_456_789 ]
使用更改原始文本长度的替换字符串将适用于搜索目的,但会导致高亮显示不正确,如下例所示。
此示例在遇到小写字母后紧跟大写字母时插入一个空格(即 fooBarBaz → foo Bar Baz),从而可以单独查询 camelCase 形式的各个单词:
resp = client.indices.create( index="my-index-000001", settings={ "analysis": { "analyzer": { "my_analyzer": { "tokenizer": "standard", "char_filter": [ "my_char_filter" ], "filter": [ "lowercase" ] } }, "char_filter": { "my_char_filter": { "type": "pattern_replace", "pattern": "(?<=\\p{Lower})(?=\\p{Upper})", "replacement": " " } } } }, mappings={ "properties": { "text": { "type": "text", "analyzer": "my_analyzer" } } }, ) print(resp) resp1 = client.indices.analyze( index="my-index-000001", analyzer="my_analyzer", text="The fooBarBaz method", ) print(resp1)
response = client.indices.create( index: 'my-index-000001', body: { settings: { analysis: { analyzer: { my_analyzer: { tokenizer: 'standard', char_filter: [ 'my_char_filter' ], filter: [ 'lowercase' ] } }, char_filter: { my_char_filter: { type: 'pattern_replace', pattern: '(?<=\\p{Lower})(?=\\p{Upper})', replacement: ' ' } } } }, mappings: { properties: { text: { type: 'text', analyzer: 'my_analyzer' } } } } ) puts response response = client.indices.analyze( index: 'my-index-000001', body: { analyzer: 'my_analyzer', text: 'The fooBarBaz method' } ) puts response
const response = await client.indices.create({ index: "my-index-000001", settings: { analysis: { analyzer: { my_analyzer: { tokenizer: "standard", char_filter: ["my_char_filter"], filter: ["lowercase"], }, }, char_filter: { my_char_filter: { type: "pattern_replace", pattern: "(?<=\\p{Lower})(?=\\p{Upper})", replacement: " ", }, }, }, }, mappings: { properties: { text: { type: "text", analyzer: "my_analyzer", }, }, }, }); console.log(response); const response1 = await client.indices.analyze({ index: "my-index-000001", analyzer: "my_analyzer", text: "The fooBarBaz method", }); console.log(response1);
PUT my-index-000001 { "settings": { "analysis": { "analyzer": { "my_analyzer": { "tokenizer": "standard", "char_filter": [ "my_char_filter" ], "filter": [ "lowercase" ] } }, "char_filter": { "my_char_filter": { "type": "pattern_replace", "pattern": "(?<=\\p{Lower})(?=\\p{Upper})", "replacement": " " } } } }, "mappings": { "properties": { "text": { "type": "text", "analyzer": "my_analyzer" } } } } POST my-index-000001/_analyze { "analyzer": "my_analyzer", "text": "The fooBarBaz method" }
上述请求返回以下词项:
[ the, foo, bar, baz, method ]
查询 bar 能够正确找到该文档,但由于我们的字符过滤器更改了原始文本的长度,结果中的高亮位置将不正确:
resp = client.index( index="my-index-000001", id="1", refresh=True, document={ "text": "The fooBarBaz method" }, ) print(resp) resp1 = client.search( index="my-index-000001", query={ "match": { "text": "bar" } }, highlight={ "fields": { "text": {} } }, ) print(resp1)
response = client.index( index: 'my-index-000001', id: 1, refresh: true, body: { text: 'The fooBarBaz method' } ) puts response response = client.search( index: 'my-index-000001', body: { query: { match: { text: 'bar' } }, highlight: { fields: { text: {} } } } ) puts response
const response = await client.index({ index: "my-index-000001", id: 1, refresh: "true", document: { text: "The fooBarBaz method", }, }); console.log(response); const response1 = await client.search({ index: "my-index-000001", query: { match: { text: "bar", }, }, highlight: { fields: { text: {}, }, }, }); console.log(response1);
PUT my-index-000001/_doc/1?refresh { "text": "The fooBarBaz method" } GET my-index-000001/_search { "query": { "match": { "text": "bar" } }, "highlight": { "fields": { "text": {} } } }
上面的输出为
{ "timed_out": false, "took": $body.took, "_shards": { "total": 1, "successful": 1, "skipped" : 0, "failed": 0 }, "hits": { "total" : { "value": 1, "relation": "eq" }, "max_score": 0.2876821, "hits": [ { "_index": "my-index-000001", "_id": "1", "_score": 0.2876821, "_source": { "text": "The fooBarBaz method" }, "highlight": { "text": [ "The foo<em>Ba</em>rBaz method" ] } } ] } }