模式替换字符过滤器

编辑

pattern_replace 字符过滤器使用正则表达式来匹配应该被指定的替换字符串替换的字符。替换字符串可以引用正则表达式中的捕获组。

注意病态正则表达式

模式替换字符过滤器使用 Java 正则表达式

一个写得不好的正则表达式可能会运行非常缓慢,甚至抛出 StackOverflowError 并导致其运行所在的节点突然退出。

阅读更多关于病态正则表达式以及如何避免它们的信息。

配置

编辑

pattern_replace 字符过滤器接受以下参数

pattern

一个 Java 正则表达式。必需。

replacement

替换字符串,可以使用 $1..$9 语法引用捕获组,如此处所述。

flags

Java 正则表达式标志。多个标志应该用管道符分隔,例如 "CASE_INSENSITIVE|COMMENTS"。

配置示例

编辑

在此示例中,我们将 pattern_replace 字符过滤器配置为将数字中任何嵌入的破折号替换为下划线,即 123-456-789 → 123_456_789:

resp = client.indices.create(
    index="my-index-000001",
    settings={
        "analysis": {
            "analyzer": {
                "my_analyzer": {
                    "tokenizer": "standard",
                    "char_filter": [
                        "my_char_filter"
                    ]
                }
            },
            "char_filter": {
                "my_char_filter": {
                    "type": "pattern_replace",
                    "pattern": "(\\d+)-(?=\\d)",
                    "replacement": "$1_"
                }
            }
        }
    },
)
print(resp)

resp1 = client.indices.analyze(
    index="my-index-000001",
    analyzer="my_analyzer",
    text="My credit card is 123-456-789",
)
print(resp1)
response = client.indices.create(
  index: 'my-index-000001',
  body: {
    settings: {
      analysis: {
        analyzer: {
          my_analyzer: {
            tokenizer: 'standard',
            char_filter: [
              'my_char_filter'
            ]
          }
        },
        char_filter: {
          my_char_filter: {
            type: 'pattern_replace',
            pattern: '(\\d+)-(?=\\d)',
            replacement: '$1_'
          }
        }
      }
    }
  }
)
puts response

response = client.indices.analyze(
  index: 'my-index-000001',
  body: {
    analyzer: 'my_analyzer',
    text: 'My credit card is 123-456-789'
  }
)
puts response
const response = await client.indices.create({
  index: "my-index-000001",
  settings: {
    analysis: {
      analyzer: {
        my_analyzer: {
          tokenizer: "standard",
          char_filter: ["my_char_filter"],
        },
      },
      char_filter: {
        my_char_filter: {
          type: "pattern_replace",
          pattern: "(\\d+)-(?=\\d)",
          replacement: "$1_",
        },
      },
    },
  },
});
console.log(response);

const response1 = await client.indices.analyze({
  index: "my-index-000001",
  analyzer: "my_analyzer",
  text: "My credit card is 123-456-789",
});
console.log(response1);
PUT my-index-000001
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_analyzer": {
          "tokenizer": "standard",
          "char_filter": [
            "my_char_filter"
          ]
        }
      },
      "char_filter": {
        "my_char_filter": {
          "type": "pattern_replace",
          "pattern": "(\\d+)-(?=\\d)",
          "replacement": "$1_"
        }
      }
    }
  }
}

POST my-index-000001/_analyze
{
  "analyzer": "my_analyzer",
  "text": "My credit card is 123-456-789"
}

以上示例生成以下词条

[ My, credit, card, is, 123_456_789 ]

使用更改原始文本长度的替换字符串可以用于搜索目的,但会导致不正确的突出显示,如下面的示例所示。

此示例在遇到小写字母后跟大写字母时插入一个空格(即 fooBarBaz → foo Bar Baz),从而允许单独查询 camelCase 单词中的各个部分:

resp = client.indices.create(
    index="my-index-000001",
    settings={
        "analysis": {
            "analyzer": {
                "my_analyzer": {
                    "tokenizer": "standard",
                    "char_filter": [
                        "my_char_filter"
                    ],
                    "filter": [
                        "lowercase"
                    ]
                }
            },
            "char_filter": {
                "my_char_filter": {
                    "type": "pattern_replace",
                    "pattern": "(?<=\\p{Lower})(?=\\p{Upper})",
                    "replacement": " "
                }
            }
        }
    },
    mappings={
        "properties": {
            "text": {
                "type": "text",
                "analyzer": "my_analyzer"
            }
        }
    },
)
print(resp)

resp1 = client.indices.analyze(
    index="my-index-000001",
    analyzer="my_analyzer",
    text="The fooBarBaz method",
)
print(resp1)
response = client.indices.create(
  index: 'my-index-000001',
  body: {
    settings: {
      analysis: {
        analyzer: {
          my_analyzer: {
            tokenizer: 'standard',
            char_filter: [
              'my_char_filter'
            ],
            filter: [
              'lowercase'
            ]
          }
        },
        char_filter: {
          my_char_filter: {
            type: 'pattern_replace',
            pattern: '(?<=\\p{Lower})(?=\\p{Upper})',
            replacement: ' '
          }
        }
      }
    },
    mappings: {
      properties: {
        text: {
          type: 'text',
          analyzer: 'my_analyzer'
        }
      }
    }
  }
)
puts response

response = client.indices.analyze(
  index: 'my-index-000001',
  body: {
    analyzer: 'my_analyzer',
    text: 'The fooBarBaz method'
  }
)
puts response
const response = await client.indices.create({
  index: "my-index-000001",
  settings: {
    analysis: {
      analyzer: {
        my_analyzer: {
          tokenizer: "standard",
          char_filter: ["my_char_filter"],
          filter: ["lowercase"],
        },
      },
      char_filter: {
        my_char_filter: {
          type: "pattern_replace",
          pattern: "(?<=\\p{Lower})(?=\\p{Upper})",
          replacement: " ",
        },
      },
    },
  },
  mappings: {
    properties: {
      text: {
        type: "text",
        analyzer: "my_analyzer",
      },
    },
  },
});
console.log(response);

const response1 = await client.indices.analyze({
  index: "my-index-000001",
  analyzer: "my_analyzer",
  text: "The fooBarBaz method",
});
console.log(response1);
PUT my-index-000001
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_analyzer": {
          "tokenizer": "standard",
          "char_filter": [
            "my_char_filter"
          ],
          "filter": [
            "lowercase"
          ]
        }
      },
      "char_filter": {
        "my_char_filter": {
          "type": "pattern_replace",
          "pattern": "(?<=\\p{Lower})(?=\\p{Upper})",
          "replacement": " "
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "text": {
        "type": "text",
        "analyzer": "my_analyzer"
      }
    }
  }
}

POST my-index-000001/_analyze
{
  "analyzer": "my_analyzer",
  "text": "The fooBarBaz method"
}

以上返回以下词条

[ the, foo, bar, baz, method ]

查询 bar 可以正确找到该文档,但结果中的突出显示会不正确,因为我们的字符过滤器更改了原始文本的长度:

resp = client.index(
    index="my-index-000001",
    id="1",
    refresh=True,
    document={
        "text": "The fooBarBaz method"
    },
)
print(resp)

resp1 = client.search(
    index="my-index-000001",
    query={
        "match": {
            "text": "bar"
        }
    },
    highlight={
        "fields": {
            "text": {}
        }
    },
)
print(resp1)
response = client.index(
  index: 'my-index-000001',
  id: 1,
  refresh: true,
  body: {
    text: 'The fooBarBaz method'
  }
)
puts response

response = client.search(
  index: 'my-index-000001',
  body: {
    query: {
      match: {
        text: 'bar'
      }
    },
    highlight: {
      fields: {
        text: {}
      }
    }
  }
)
puts response
const response = await client.index({
  index: "my-index-000001",
  id: 1,
  refresh: "true",
  document: {
    text: "The fooBarBaz method",
  },
});
console.log(response);

const response1 = await client.search({
  index: "my-index-000001",
  query: {
    match: {
      text: "bar",
    },
  },
  highlight: {
    fields: {
      text: {},
    },
  },
});
console.log(response1);
PUT my-index-000001/_doc/1?refresh
{
  "text": "The fooBarBaz method"
}

GET my-index-000001/_search
{
  "query": {
    "match": {
      "text": "bar"
    }
  },
  "highlight": {
    "fields": {
      "text": {}
    }
  }
}

以上输出是

{
  "timed_out": false,
  "took": $body.took,
  "_shards": {
    "total": 1,
    "successful": 1,
    "skipped" : 0,
    "failed": 0
  },
  "hits": {
    "total" : {
        "value": 1,
        "relation": "eq"
    },
    "max_score": 0.2876821,
    "hits": [
      {
        "_index": "my-index-000001",
        "_id": "1",
        "_score": 0.2876821,
        "_source": {
          "text": "The fooBarBaz method"
        },
        "highlight": {
          "text": [
            "The foo<em>Ba</em>rBaz method" 
          ]
        }
      }
    ]
  }
}

注意上面 highlight 字段中不正确的突出显示:被标记的是 "Ba",而不是预期的 "Bar"。