smartcn_stop 令牌过滤器

编辑

smartcn_stop 令牌过滤器会过滤掉由 smartcn 分析器 (_smartcn_) 定义的停用词,以及用户指定的任何其他自定义停用词。此过滤器仅支持预定义的 _smartcn_ 停用词列表。如果要使用其他预定义列表,请改用 stop 令牌过滤器

PUT smartcn_example
{
  "settings": {
    "index": {
      "analysis": {
        "analyzer": {
          "smartcn_with_stop": {
            "tokenizer": "smartcn_tokenizer",
            "filter": [
              "porter_stem",
              "my_smartcn_stop"
            ]
          }
        },
        "filter": {
          "my_smartcn_stop": {
            "type": "smartcn_stop",
            "stopwords": [
              "_smartcn_",
              "stack",
              "的"
            ]
          }
        }
      }
    }
  }
}

GET smartcn_example/_analyze
{
  "analyzer": "smartcn_with_stop",
  "text": "哈喽,我们是 Elastic   我们是 Elastic Stack(Elasticsearch、Kibana、Beats 和 Logstash)的开发公司。从股票行情到 Twitter 消息流,从 Apache 日志到 WordPress 博文,我们可以帮助人们体验搜索的强大力量,帮助他们以截然不同的方式探索和分析数据"
}

以上请求返回

{
  "tokens": [
    {
      "token": "哈",
      "start_offset": 0,
      "end_offset": 1,
      "type": "word",
      "position": 0
    },
    {
      "token": "喽",
      "start_offset": 1,
      "end_offset": 2,
      "type": "word",
      "position": 1
    },
    {
      "token": "我们",
      "start_offset": 3,
      "end_offset": 5,
      "type": "word",
      "position": 3
    },
    {
      "token": "是",
      "start_offset": 5,
      "end_offset": 6,
      "type": "word",
      "position": 4
    },
    {
      "token": "elast",
      "start_offset": 7,
      "end_offset": 14,
      "type": "word",
      "position": 5
    },
    {
      "token": "我们",
      "start_offset": 17,
      "end_offset": 19,
      "type": "word",
      "position": 6
    },
    {
      "token": "是",
      "start_offset": 19,
      "end_offset": 20,
      "type": "word",
      "position": 7
    },
    {
      "token": "elast",
      "start_offset": 21,
      "end_offset": 28,
      "type": "word",
      "position": 8
    },
    {
      "token": "elasticsearch",
      "start_offset": 35,
      "end_offset": 48,
      "type": "word",
      "position": 11
    },
    {
      "token": "kibana",
      "start_offset": 49,
      "end_offset": 55,
      "type": "word",
      "position": 13
    },
    {
      "token": "beat",
      "start_offset": 56,
      "end_offset": 61,
      "type": "word",
      "position": 15
    },
    {
      "token": "和",
      "start_offset": 62,
      "end_offset": 63,
      "type": "word",
      "position": 16
    },
    {
      "token": "logstash",
      "start_offset": 64,
      "end_offset": 72,
      "type": "word",
      "position": 17
    },
    {
      "token": "开发",
      "start_offset": 74,
      "end_offset": 76,
      "type": "word",
      "position": 20
    },
    {
      "token": "公司",
      "start_offset": 76,
      "end_offset": 78,
      "type": "word",
      "position": 21
    },
    {
      "token": "从",
      "start_offset": 79,
      "end_offset": 80,
      "type": "word",
      "position": 23
    },
    {
      "token": "股票",
      "start_offset": 80,
      "end_offset": 82,
      "type": "word",
      "position": 24
    },
    {
      "token": "行情",
      "start_offset": 82,
      "end_offset": 84,
      "type": "word",
      "position": 25
    },
    {
      "token": "到",
      "start_offset": 84,
      "end_offset": 85,
      "type": "word",
      "position": 26
    },
    {
      "token": "twitter",
      "start_offset": 86,
      "end_offset": 93,
      "type": "word",
      "position": 27
    },
    {
      "token": "消息",
      "start_offset": 94,
      "end_offset": 96,
      "type": "word",
      "position": 28
    },
    {
      "token": "流",
      "start_offset": 96,
      "end_offset": 97,
      "type": "word",
      "position": 29
    },
    {
      "token": "从",
      "start_offset": 98,
      "end_offset": 99,
      "type": "word",
      "position": 31
    },
    {
      "token": "apach",
      "start_offset": 100,
      "end_offset": 106,
      "type": "word",
      "position": 32
    },
    {
      "token": "日志",
      "start_offset": 107,
      "end_offset": 109,
      "type": "word",
      "position": 33
    },
    {
      "token": "到",
      "start_offset": 109,
      "end_offset": 110,
      "type": "word",
      "position": 34
    },
    {
      "token": "wordpress",
      "start_offset": 111,
      "end_offset": 120,
      "type": "word",
      "position": 35
    },
    {
      "token": "博",
      "start_offset": 121,
      "end_offset": 122,
      "type": "word",
      "position": 36
    },
    {
      "token": "文",
      "start_offset": 122,
      "end_offset": 123,
      "type": "word",
      "position": 37
    },
    {
      "token": "我们",
      "start_offset": 124,
      "end_offset": 126,
      "type": "word",
      "position": 39
    },
    {
      "token": "可以",
      "start_offset": 126,
      "end_offset": 128,
      "type": "word",
      "position": 40
    },
    {
      "token": "帮助",
      "start_offset": 128,
      "end_offset": 130,
      "type": "word",
      "position": 41
    },
    {
      "token": "人们",
      "start_offset": 130,
      "end_offset": 132,
      "type": "word",
      "position": 42
    },
    {
      "token": "体验",
      "start_offset": 132,
      "end_offset": 134,
      "type": "word",
      "position": 43
    },
    {
      "token": "搜索",
      "start_offset": 134,
      "end_offset": 136,
      "type": "word",
      "position": 44
    },
    {
      "token": "强大",
      "start_offset": 137,
      "end_offset": 139,
      "type": "word",
      "position": 46
    },
    {
      "token": "力量",
      "start_offset": 139,
      "end_offset": 141,
      "type": "word",
      "position": 47
    },
    {
      "token": "帮助",
      "start_offset": 142,
      "end_offset": 144,
      "type": "word",
      "position": 49
    },
    {
      "token": "他们",
      "start_offset": 144,
      "end_offset": 146,
      "type": "word",
      "position": 50
    },
    {
      "token": "以",
      "start_offset": 146,
      "end_offset": 147,
      "type": "word",
      "position": 51
    },
    {
      "token": "截然不同",
      "start_offset": 147,
      "end_offset": 151,
      "type": "word",
      "position": 52
    },
    {
      "token": "方式",
      "start_offset": 152,
      "end_offset": 154,
      "type": "word",
      "position": 54
    },
    {
      "token": "探索",
      "start_offset": 154,
      "end_offset": 156,
      "type": "word",
      "position": 55
    },
    {
      "token": "和",
      "start_offset": 156,
      "end_offset": 157,
      "type": "word",
      "position": 56
    },
    {
      "token": "分析",
      "start_offset": 157,
      "end_offset": 159,
      "type": "word",
      "position": 57
    },
    {
      "token": "数据",
      "start_offset": 159,
      "end_offset": 161,
      "type": "word",
      "position": 58
    }
  ]
}