模式替换字符过滤器

编辑

pattern_replace 字符过滤器使用正则表达式匹配应替换为指定替换字符串的字符。替换字符串可以引用正则表达式中的捕获组。

注意病态正则表达式

模式替换字符过滤器使用 Java 正则表达式。

编写不当的正则表达式可能会运行非常缓慢,甚至抛出 StackOverflowError 并导致其运行所在的节点突然退出。

阅读更多关于 病态正则表达式以及如何避免它们 的信息。

配置

编辑

pattern_replace 字符过滤器接受以下参数：

pattern

一个 Java 正则表达式。必填。

replacement

替换字符串,可以使用 $1..$9 语法引用捕获组,如 此处 所述。

flags

Java 正则表达式标志。多个标志应以竖线(|)分隔,例如 "CASE_INSENSITIVE|COMMENTS"。

示例配置

编辑

在此示例中,我们配置 pattern_replace 字符过滤器,将数字中嵌入的短划线替换为下划线,即 123-456-789 → 123_456_789：

resp = client.indices.create(
    index="my-index-000001",
    settings={
        "analysis": {
            "analyzer": {
                "my_analyzer": {
                    "tokenizer": "standard",
                    "char_filter": [
                        "my_char_filter"
                    ]
                }
            },
            "char_filter": {
                "my_char_filter": {
                    "type": "pattern_replace",
                    "pattern": "(\\d+)-(?=\\d)",
                    "replacement": "$1_"
                }
            }
        }
    },
)
print(resp)

resp1 = client.indices.analyze(
    index="my-index-000001",
    analyzer="my_analyzer",
    text="My credit card is 123-456-789",
)
print(resp1)
response = client.indices.create(
  index: 'my-index-000001',
  body: {
    settings: {
      analysis: {
        analyzer: {
          my_analyzer: {
            tokenizer: 'standard',
            char_filter: [
              'my_char_filter'
            ]
          }
        },
        char_filter: {
          my_char_filter: {
            type: 'pattern_replace',
            pattern: '(\\d+)-(?=\\d)',
            replacement: '$1_'
          }
        }
      }
    }
  }
)
puts response

response = client.indices.analyze(
  index: 'my-index-000001',
  body: {
    analyzer: 'my_analyzer',
    text: 'My credit card is 123-456-789'
  }
)
puts response
const response = await client.indices.create({
  index: "my-index-000001",
  settings: {
    analysis: {
      analyzer: {
        my_analyzer: {
          tokenizer: "standard",
          char_filter: ["my_char_filter"],
        },
      },
      char_filter: {
        my_char_filter: {
          type: "pattern_replace",
          pattern: "(\\d+)-(?=\\d)",
          replacement: "$1_",
        },
      },
    },
  },
});
console.log(response);

const response1 = await client.indices.analyze({
  index: "my-index-000001",
  analyzer: "my_analyzer",
  text: "My credit card is 123-456-789",
});
console.log(response1);
PUT my-index-000001
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_analyzer": {
          "tokenizer": "standard",
          "char_filter": [
            "my_char_filter"
          ]
        }
      },
      "char_filter": {
        "my_char_filter": {
          "type": "pattern_replace",
          "pattern": "(\\d+)-(?=\\d)",
          "replacement": "$1_"
        }
      }
    }
  }
}

POST my-index-000001/_analyze
{
  "analyzer": "my_analyzer",
  "text": "My credit card is 123-456-789"
}

上述示例生成以下词项：

[ My, credit, card, is, 123_456_789 ]

使用更改原始文本长度的替换字符串将适用于搜索目的,但会导致高亮显示不正确,如下例所示。

此示例在小写字母后紧跟大写字母的位置插入空格(即 fooBarBaz → foo Bar Baz),从而允许单独查询 camelCase 单词的各个组成部分：

resp = client.indices.create(
    index="my-index-000001",
    settings={
        "analysis": {
            "analyzer": {
                "my_analyzer": {
                    "tokenizer": "standard",
                    "char_filter": [
                        "my_char_filter"
                    ],
                    "filter": [
                        "lowercase"
                    ]
                }
            },
            "char_filter": {
                "my_char_filter": {
                    "type": "pattern_replace",
                    "pattern": "(?<=\\p{Lower})(?=\\p{Upper})",
                    "replacement": " "
                }
            }
        }
    },
    mappings={
        "properties": {
            "text": {
                "type": "text",
                "analyzer": "my_analyzer"
            }
        }
    },
)
print(resp)

resp1 = client.indices.analyze(
    index="my-index-000001",
    analyzer="my_analyzer",
    text="The fooBarBaz method",
)
print(resp1)
response = client.indices.create(
  index: 'my-index-000001',
  body: {
    settings: {
      analysis: {
        analyzer: {
          my_analyzer: {
            tokenizer: 'standard',
            char_filter: [
              'my_char_filter'
            ],
            filter: [
              'lowercase'
            ]
          }
        },
        char_filter: {
          my_char_filter: {
            type: 'pattern_replace',
            pattern: '(?<=\\p{Lower})(?=\\p{Upper})',
            replacement: ' '
          }
        }
      }
    },
    mappings: {
      properties: {
        text: {
          type: 'text',
          analyzer: 'my_analyzer'
        }
      }
    }
  }
)
puts response

response = client.indices.analyze(
  index: 'my-index-000001',
  body: {
    analyzer: 'my_analyzer',
    text: 'The fooBarBaz method'
  }
)
puts response
const response = await client.indices.create({
  index: "my-index-000001",
  settings: {
    analysis: {
      analyzer: {
        my_analyzer: {
          tokenizer: "standard",
          char_filter: ["my_char_filter"],
          filter: ["lowercase"],
        },
      },
      char_filter: {
        my_char_filter: {
          type: "pattern_replace",
          pattern: "(?<=\\p{Lower})(?=\\p{Upper})",
          replacement: " ",
        },
      },
    },
  },
  mappings: {
    properties: {
      text: {
        type: "text",
        analyzer: "my_analyzer",
      },
    },
  },
});
console.log(response);

const response1 = await client.indices.analyze({
  index: "my-index-000001",
  analyzer: "my_analyzer",
  text: "The fooBarBaz method",
});
console.log(response1);
PUT my-index-000001
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_analyzer": {
          "tokenizer": "standard",
          "char_filter": [
            "my_char_filter"
          ],
          "filter": [
            "lowercase"
          ]
        }
      },
      "char_filter": {
        "my_char_filter": {
          "type": "pattern_replace",
          "pattern": "(?<=\\p{Lower})(?=\\p{Upper})",
          "replacement": " "
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "text": {
        "type": "text",
        "analyzer": "my_analyzer"
      }
    }
  }
}

POST my-index-000001/_analyze
{
  "analyzer": "my_analyzer",
  "text": "The fooBarBaz method"
}

上述示例返回以下词项：

[ the, foo, bar, baz, method ]

查询 bar 能正确找到该文档,但结果中的高亮显示位置会不正确,因为我们的字符过滤器更改了原始文本的长度：

resp = client.index(
    index="my-index-000001",
    id="1",
    refresh=True,
    document={
        "text": "The fooBarBaz method"
    },
)
print(resp)

resp1 = client.search(
    index="my-index-000001",
    query={
        "match": {
            "text": "bar"
        }
    },
    highlight={
        "fields": {
            "text": {}
        }
    },
)
print(resp1)
response = client.index(
  index: 'my-index-000001',
  id: 1,
  refresh: true,
  body: {
    text: 'The fooBarBaz method'
  }
)
puts response

response = client.search(
  index: 'my-index-000001',
  body: {
    query: {
      match: {
        text: 'bar'
      }
    },
    highlight: {
      fields: {
        text: {}
      }
    }
  }
)
puts response
const response = await client.index({
  index: "my-index-000001",
  id: 1,
  refresh: "true",
  document: {
    text: "The fooBarBaz method",
  },
});
console.log(response);

const response1 = await client.search({
  index: "my-index-000001",
  query: {
    match: {
      text: "bar",
    },
  },
  highlight: {
    fields: {
      text: {},
    },
  },
});
console.log(response1);
PUT my-index-000001/_doc/1?refresh
{
  "text": "The fooBarBaz method"
}

GET my-index-000001/_search
{
  "query": {
    "match": {
      "text": "bar"
    }
  },
  "highlight": {
    "fields": {
      "text": {}
    }
  }
}

上面的输出为

{
  "timed_out": false,
  "took": $body.took,
  "_shards": {
    "total": 1,
    "successful": 1,
    "skipped" : 0,
    "failed": 0
  },
  "hits": {
    "total" : {
        "value": 1,
        "relation": "eq"
    },
    "max_score": 0.2876821,
    "hits": [
      {
        "_index": "my-index-000001",
        "_id": "1",
        "_score": 0.2876821,
        "_source": {
          "text": "The fooBarBaz method"
        },
        "highlight": {
          "text": [
            "The foo<em>Ba</em>rBaz method" 
          ]
        }
      }
    ]
  }
}

注意不正确的高亮显示。