› › ›

长度 Token 过滤器

编辑

长度 Token 过滤器

编辑

移除长度小于或大于指定字符长度的 Token。例如，您可以使用 length 过滤器排除长度小于 2 个字符和长度大于 5 个字符的 Token。

此过滤器使用 Lucene 的 LengthFilter。

length 过滤器会移除整个 Token。如果您希望将 Token 缩短到特定长度，请使用 truncate 过滤器。

示例

编辑

下面的分析 API 请求使用 length 过滤器来移除长度大于 4 个字符的 Token。

resp = client.indices.analyze(
    tokenizer="whitespace",
    filter=[
        {
            "type": "length",
            "min": 0,
            "max": 4
        }
    ],
    text="the quick brown fox jumps over the lazy dog",
)
print(resp)

response = client.indices.analyze(
  body: {
    tokenizer: 'whitespace',
    filter: [
      {
        type: 'length',
        min: 0,
        max: 4
      }
    ],
    text: 'the quick brown fox jumps over the lazy dog'
  }
)
puts response

const response = await client.indices.analyze({
  tokenizer: "whitespace",
  filter: [
    {
      type: "length",
      min: 0,
      max: 4,
    },
  ],
  text: "the quick brown fox jumps over the lazy dog",
});
console.log(response);

GET _analyze
{
  "tokenizer": "whitespace",
  "filter": [
    {
      "type": "length",
      "min": 0,
      "max": 4
    }
  ],
  "text": "the quick brown fox jumps over the lazy dog"
}

过滤器生成的 Token 如下所示：

[ the, fox, over, the, lazy, dog ]

添加到分析器

编辑

下面的创建索引 API 请求使用 length 过滤器来配置一个新的自定义分析器。

resp = client.indices.create(
    index="length_example",
    settings={
        "analysis": {
            "analyzer": {
                "standard_length": {
                    "tokenizer": "standard",
                    "filter": [
                        "length"
                    ]
                }
            }
        }
    },
)
print(resp)

response = client.indices.create(
  index: 'length_example',
  body: {
    settings: {
      analysis: {
        analyzer: {
          standard_length: {
            tokenizer: 'standard',
            filter: [
              'length'
            ]
          }
        }
      }
    }
  }
)
puts response

const response = await client.indices.create({
  index: "length_example",
  settings: {
    analysis: {
      analyzer: {
        standard_length: {
          tokenizer: "standard",
          filter: ["length"],
        },
      },
    },
  },
});
console.log(response);

PUT length_example
{
  "settings": {
    "analysis": {
      "analyzer": {
        "standard_length": {
          "tokenizer": "standard",
          "filter": [ "length" ]
        }
      }
    }
  }
}

可配置参数

编辑

min: (可选，整数) Token 的最小字符长度。长度小于此值的 Token 将从输出中排除。默认为 0。
max: (可选，整数) Token 的最大字符长度。长度大于此值的 Token 将从输出中排除。默认为 Integer.MAX_VALUE，即 2^31-1 或 2147483647。

自定义

编辑

要自定义 length 过滤器，请复制它，并以此作为新的自定义 Token 过滤器的基础。然后，您可以通过其可配置参数来修改该过滤器。

例如，以下请求创建了一个自定义 length 过滤器，该过滤器将移除长度小于 2 个字符和长度大于 10 个字符的 Token。

resp = client.indices.create(
    index="length_custom_example",
    settings={
        "analysis": {
            "analyzer": {
                "whitespace_length_2_to_10_char": {
                    "tokenizer": "whitespace",
                    "filter": [
                        "length_2_to_10_char"
                    ]
                }
            },
            "filter": {
                "length_2_to_10_char": {
                    "type": "length",
                    "min": 2,
                    "max": 10
                }
            }
        }
    },
)
print(resp)

response = client.indices.create(
  index: 'length_custom_example',
  body: {
    settings: {
      analysis: {
        analyzer: {
          "whitespace_length_2_to_10_char": {
            tokenizer: 'whitespace',
            filter: [
              'length_2_to_10_char'
            ]
          }
        },
        filter: {
          "length_2_to_10_char": {
            type: 'length',
            min: 2,
            max: 10
          }
        }
      }
    }
  }
)
puts response

const response = await client.indices.create({
  index: "length_custom_example",
  settings: {
    analysis: {
      analyzer: {
        whitespace_length_2_to_10_char: {
          tokenizer: "whitespace",
          filter: ["length_2_to_10_char"],
        },
      },
      filter: {
        length_2_to_10_char: {
          type: "length",
          min: 2,
          max: 10,
        },
      },
    },
  },
});
console.log(response);

PUT length_custom_example
{
  "settings": {
    "analysis": {
      "analyzer": {
        "whitespace_length_2_to_10_char": {
          "tokenizer": "whitespace",
          "filter": [ "length_2_to_10_char" ]
        }
      },
      "filter": {
        "length_2_to_10_char": {
          "type": "length",
          "min": 2,
          "max": 10
        }
      }
    }
  }
}

« KStem Token 过滤器 | 限制 Token 数量过滤器 »