路径层次分词器

编辑

path_hierarchy 分词器接收类似文件系统路径的分层值,在路径分隔符处分割,并为树中的每个组件发出一个术语。path_hierarchy 分词器在底层使用 Lucene 的 PathHierarchyTokenizer。

示例输出

编辑
resp = client.indices.analyze(
    tokenizer="path_hierarchy",
    text="/one/two/three",
)
print(resp)
response = client.indices.analyze(
  body: {
    tokenizer: 'path_hierarchy',
    text: '/one/two/three'
  }
)
puts response
const response = await client.indices.analyze({
  tokenizer: "path_hierarchy",
  text: "/one/two/three",
});
console.log(response);
POST _analyze
{
  "tokenizer": "path_hierarchy",
  "text": "/one/two/three"
}

以上文本将生成以下术语

[ /one, /one/two, /one/two/three ]

配置

编辑

path_hierarchy 分词器接受以下参数

delimiter(分隔符)

用作路径分隔符的字符。默认为 /

replacement(替换)

用于替换分隔符的可选字符。默认为 delimiter 的值。

buffer_size(缓冲区大小)

一次读取到术语缓冲区的字符数。默认为 1024。术语缓冲区将按此大小增长,直到所有文本都被使用。建议不要更改此设置。

reverse(反转)

如果为 true,则使用 Lucene 的 ReversePathHierarchyTokenizer,它适用于类似域名的层次结构。默认为 false。

skip(跳过)

要跳过的初始标记数。默认为 0

示例配置

编辑

在此示例中,我们配置 path_hierarchy 分词器,使其在 - 字符处分割并将该字符替换为 /,同时跳过前两个标记。

resp = client.indices.create(
    index="my-index-000001",
    settings={
        "analysis": {
            "analyzer": {
                "my_analyzer": {
                    "tokenizer": "my_tokenizer"
                }
            },
            "tokenizer": {
                "my_tokenizer": {
                    "type": "path_hierarchy",
                    "delimiter": "-",
                    "replacement": "/",
                    "skip": 2
                }
            }
        }
    },
)
print(resp)

resp1 = client.indices.analyze(
    index="my-index-000001",
    analyzer="my_analyzer",
    text="one-two-three-four-five",
)
print(resp1)
response = client.indices.create(
  index: 'my-index-000001',
  body: {
    settings: {
      analysis: {
        analyzer: {
          my_analyzer: {
            tokenizer: 'my_tokenizer'
          }
        },
        tokenizer: {
          my_tokenizer: {
            type: 'path_hierarchy',
            delimiter: '-',
            replacement: '/',
            skip: 2
          }
        }
      }
    }
  }
)
puts response

response = client.indices.analyze(
  index: 'my-index-000001',
  body: {
    analyzer: 'my_analyzer',
    text: 'one-two-three-four-five'
  }
)
puts response
const response = await client.indices.create({
  index: "my-index-000001",
  settings: {
    analysis: {
      analyzer: {
        my_analyzer: {
          tokenizer: "my_tokenizer",
        },
      },
      tokenizer: {
        my_tokenizer: {
          type: "path_hierarchy",
          delimiter: "-",
          replacement: "/",
          skip: 2,
        },
      },
    },
  },
});
console.log(response);

const response1 = await client.indices.analyze({
  index: "my-index-000001",
  analyzer: "my_analyzer",
  text: "one-two-three-four-five",
});
console.log(response1);
PUT my-index-000001
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_analyzer": {
          "tokenizer": "my_tokenizer"
        }
      },
      "tokenizer": {
        "my_tokenizer": {
          "type": "path_hierarchy",
          "delimiter": "-",
          "replacement": "/",
          "skip": 2
        }
      }
    }
  }
}

POST my-index-000001/_analyze
{
  "analyzer": "my_analyzer",
  "text": "one-two-three-four-five"
}

以上示例生成以下术语

[ /three, /three/four, /three/four/five ]

如果我们将 reverse 设置为 true,它将生成以下内容

[ one/two/three/, two/three/, three/ ]

详细示例

编辑

path_hierarchy 分词器的常见用例是按文件路径过滤结果。如果在索引数据的同时也索引了文件路径,那么使用 path_hierarchy 分词器分析该路径后,就可以按文件路径字符串的不同部分过滤结果。

此示例为索引配置了两个自定义分析器,并将它们应用于 file_path 文本字段(用于存储文件名)的多字段。其中一个分析器使用反向分词。随后索引了一些示例文档,用来表示两个不同用户的照片文件夹中若干照片的文件路径。

resp = client.indices.create(
    index="file-path-test",
    settings={
        "analysis": {
            "analyzer": {
                "custom_path_tree": {
                    "tokenizer": "custom_hierarchy"
                },
                "custom_path_tree_reversed": {
                    "tokenizer": "custom_hierarchy_reversed"
                }
            },
            "tokenizer": {
                "custom_hierarchy": {
                    "type": "path_hierarchy",
                    "delimiter": "/"
                },
                "custom_hierarchy_reversed": {
                    "type": "path_hierarchy",
                    "delimiter": "/",
                    "reverse": "true"
                }
            }
        }
    },
    mappings={
        "properties": {
            "file_path": {
                "type": "text",
                "fields": {
                    "tree": {
                        "type": "text",
                        "analyzer": "custom_path_tree"
                    },
                    "tree_reversed": {
                        "type": "text",
                        "analyzer": "custom_path_tree_reversed"
                    }
                }
            }
        }
    },
)
print(resp)

resp1 = client.index(
    index="file-path-test",
    id="1",
    document={
        "file_path": "/User/alice/photos/2017/05/16/my_photo1.jpg"
    },
)
print(resp1)

resp2 = client.index(
    index="file-path-test",
    id="2",
    document={
        "file_path": "/User/alice/photos/2017/05/16/my_photo2.jpg"
    },
)
print(resp2)

resp3 = client.index(
    index="file-path-test",
    id="3",
    document={
        "file_path": "/User/alice/photos/2017/05/16/my_photo3.jpg"
    },
)
print(resp3)

resp4 = client.index(
    index="file-path-test",
    id="4",
    document={
        "file_path": "/User/alice/photos/2017/05/15/my_photo1.jpg"
    },
)
print(resp4)

resp5 = client.index(
    index="file-path-test",
    id="5",
    document={
        "file_path": "/User/bob/photos/2017/05/16/my_photo1.jpg"
    },
)
print(resp5)
response = client.indices.create(
  index: 'file-path-test',
  body: {
    settings: {
      analysis: {
        analyzer: {
          custom_path_tree: {
            tokenizer: 'custom_hierarchy'
          },
          custom_path_tree_reversed: {
            tokenizer: 'custom_hierarchy_reversed'
          }
        },
        tokenizer: {
          custom_hierarchy: {
            type: 'path_hierarchy',
            delimiter: '/'
          },
          custom_hierarchy_reversed: {
            type: 'path_hierarchy',
            delimiter: '/',
            reverse: 'true'
          }
        }
      }
    },
    mappings: {
      properties: {
        file_path: {
          type: 'text',
          fields: {
            tree: {
              type: 'text',
              analyzer: 'custom_path_tree'
            },
            tree_reversed: {
              type: 'text',
              analyzer: 'custom_path_tree_reversed'
            }
          }
        }
      }
    }
  }
)
puts response

response = client.index(
  index: 'file-path-test',
  id: 1,
  body: {
    file_path: '/User/alice/photos/2017/05/16/my_photo1.jpg'
  }
)
puts response

response = client.index(
  index: 'file-path-test',
  id: 2,
  body: {
    file_path: '/User/alice/photos/2017/05/16/my_photo2.jpg'
  }
)
puts response

response = client.index(
  index: 'file-path-test',
  id: 3,
  body: {
    file_path: '/User/alice/photos/2017/05/16/my_photo3.jpg'
  }
)
puts response

response = client.index(
  index: 'file-path-test',
  id: 4,
  body: {
    file_path: '/User/alice/photos/2017/05/15/my_photo1.jpg'
  }
)
puts response

response = client.index(
  index: 'file-path-test',
  id: 5,
  body: {
    file_path: '/User/bob/photos/2017/05/16/my_photo1.jpg'
  }
)
puts response
const response = await client.indices.create({
  index: "file-path-test",
  settings: {
    analysis: {
      analyzer: {
        custom_path_tree: {
          tokenizer: "custom_hierarchy",
        },
        custom_path_tree_reversed: {
          tokenizer: "custom_hierarchy_reversed",
        },
      },
      tokenizer: {
        custom_hierarchy: {
          type: "path_hierarchy",
          delimiter: "/",
        },
        custom_hierarchy_reversed: {
          type: "path_hierarchy",
          delimiter: "/",
          reverse: "true",
        },
      },
    },
  },
  mappings: {
    properties: {
      file_path: {
        type: "text",
        fields: {
          tree: {
            type: "text",
            analyzer: "custom_path_tree",
          },
          tree_reversed: {
            type: "text",
            analyzer: "custom_path_tree_reversed",
          },
        },
      },
    },
  },
});
console.log(response);

const response1 = await client.index({
  index: "file-path-test",
  id: 1,
  document: {
    file_path: "/User/alice/photos/2017/05/16/my_photo1.jpg",
  },
});
console.log(response1);

const response2 = await client.index({
  index: "file-path-test",
  id: 2,
  document: {
    file_path: "/User/alice/photos/2017/05/16/my_photo2.jpg",
  },
});
console.log(response2);

const response3 = await client.index({
  index: "file-path-test",
  id: 3,
  document: {
    file_path: "/User/alice/photos/2017/05/16/my_photo3.jpg",
  },
});
console.log(response3);

const response4 = await client.index({
  index: "file-path-test",
  id: 4,
  document: {
    file_path: "/User/alice/photos/2017/05/15/my_photo1.jpg",
  },
});
console.log(response4);

const response5 = await client.index({
  index: "file-path-test",
  id: 5,
  document: {
    file_path: "/User/bob/photos/2017/05/16/my_photo1.jpg",
  },
});
console.log(response5);
PUT file-path-test
{
  "settings": {
    "analysis": {
      "analyzer": {
        "custom_path_tree": {
          "tokenizer": "custom_hierarchy"
        },
        "custom_path_tree_reversed": {
          "tokenizer": "custom_hierarchy_reversed"
        }
      },
      "tokenizer": {
        "custom_hierarchy": {
          "type": "path_hierarchy",
          "delimiter": "/"
        },
        "custom_hierarchy_reversed": {
          "type": "path_hierarchy",
          "delimiter": "/",
          "reverse": "true"
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "file_path": {
        "type": "text",
        "fields": {
          "tree": {
            "type": "text",
            "analyzer": "custom_path_tree"
          },
          "tree_reversed": {
            "type": "text",
            "analyzer": "custom_path_tree_reversed"
          }
        }
      }
    }
  }
}

POST file-path-test/_doc/1
{
  "file_path": "/User/alice/photos/2017/05/16/my_photo1.jpg"
}

POST file-path-test/_doc/2
{
  "file_path": "/User/alice/photos/2017/05/16/my_photo2.jpg"
}

POST file-path-test/_doc/3
{
  "file_path": "/User/alice/photos/2017/05/16/my_photo3.jpg"
}

POST file-path-test/_doc/4
{
  "file_path": "/User/alice/photos/2017/05/15/my_photo1.jpg"
}

POST file-path-test/_doc/5
{
  "file_path": "/User/bob/photos/2017/05/16/my_photo1.jpg"
}

针对文本字段搜索特定的文件路径字符串会匹配所有示例文档,其中 Bob 的文档排名最高,因为 bob 也是标准分析器生成的术语之一,这提升了 Bob 的文档的相关性。

resp = client.search(
    index="file-path-test",
    query={
        "match": {
            "file_path": "/User/bob/photos/2017/05"
        }
    },
)
print(resp)
response = client.search(
  index: 'file-path-test',
  body: {
    query: {
      match: {
        file_path: '/User/bob/photos/2017/05'
      }
    }
  }
)
puts response
const response = await client.search({
  index: "file-path-test",
  query: {
    match: {
      file_path: "/User/bob/photos/2017/05",
    },
  },
});
console.log(response);
GET file-path-test/_search
{
  "query": {
    "match": {
      "file_path": "/User/bob/photos/2017/05"
    }
  }
}

使用 file_path.tree 字段,可以轻松匹配或过滤文件路径位于特定目录中的文档。

resp = client.search(
    index="file-path-test",
    query={
        "term": {
            "file_path.tree": "/User/alice/photos/2017/05/16"
        }
    },
)
print(resp)
response = client.search(
  index: 'file-path-test',
  body: {
    query: {
      term: {
        'file_path.tree' => '/User/alice/photos/2017/05/16'
      }
    }
  }
)
puts response
const response = await client.search({
  index: "file-path-test",
  query: {
    term: {
      "file_path.tree": "/User/alice/photos/2017/05/16",
    },
  },
});
console.log(response);
GET file-path-test/_search
{
  "query": {
    "term": {
      "file_path.tree": "/User/alice/photos/2017/05/16"
    }
  }
}

使用此分词器的 reverse 参数,还可以从文件路径的另一端进行匹配,例如单个文件名或深层子目录。以下示例通过 file_path.tree_reversed 字段搜索所有名为 my_photo1.jpg 的文件,该字段在映射中配置为使用 reverse 参数。

resp = client.search(
    index="file-path-test",
    query={
        "term": {
            "file_path.tree_reversed": {
                "value": "my_photo1.jpg"
            }
        }
    },
)
print(resp)
response = client.search(
  index: 'file-path-test',
  body: {
    query: {
      term: {
        'file_path.tree_reversed' => {
          value: 'my_photo1.jpg'
        }
      }
    }
  }
)
puts response
const response = await client.search({
  index: "file-path-test",
  query: {
    term: {
      "file_path.tree_reversed": {
        value: "my_photo1.jpg",
      },
    },
  },
});
console.log(response);
GET file-path-test/_search
{
  "query": {
    "term": {
      "file_path.tree_reversed": {
        "value": "my_photo1.jpg"
      }
    }
  }
}

分别查看正向和反向分词生成的标记,有助于了解同一文件路径值会创建哪些标记。

resp = client.indices.analyze(
    index="file-path-test",
    analyzer="custom_path_tree",
    text="/User/alice/photos/2017/05/16/my_photo1.jpg",
)
print(resp)

resp1 = client.indices.analyze(
    index="file-path-test",
    analyzer="custom_path_tree_reversed",
    text="/User/alice/photos/2017/05/16/my_photo1.jpg",
)
print(resp1)
response = client.indices.analyze(
  index: 'file-path-test',
  body: {
    analyzer: 'custom_path_tree',
    text: '/User/alice/photos/2017/05/16/my_photo1.jpg'
  }
)
puts response

response = client.indices.analyze(
  index: 'file-path-test',
  body: {
    analyzer: 'custom_path_tree_reversed',
    text: '/User/alice/photos/2017/05/16/my_photo1.jpg'
  }
)
puts response
const response = await client.indices.analyze({
  index: "file-path-test",
  analyzer: "custom_path_tree",
  text: "/User/alice/photos/2017/05/16/my_photo1.jpg",
});
console.log(response);

const response1 = await client.indices.analyze({
  index: "file-path-test",
  analyzer: "custom_path_tree_reversed",
  text: "/User/alice/photos/2017/05/16/my_photo1.jpg",
});
console.log(response1);
POST file-path-test/_analyze
{
  "analyzer": "custom_path_tree",
  "text": "/User/alice/photos/2017/05/16/my_photo1.jpg"
}

POST file-path-test/_analyze
{
  "analyzer": "custom_path_tree_reversed",
  "text": "/User/alice/photos/2017/05/16/my_photo1.jpg"
}

与其他类型的搜索结合使用时,按文件路径过滤的能力同样很有用。例如,以下示例查找所有包含 16 且必须位于 Alice 的照片目录中的文件路径。

resp = client.search(
    index="file-path-test",
    query={
        "bool": {
            "must": {
                "match": {
                    "file_path": "16"
                }
            },
            "filter": {
                "term": {
                    "file_path.tree": "/User/alice"
                }
            }
        }
    },
)
print(resp)
response = client.search(
  index: 'file-path-test',
  body: {
    query: {
      bool: {
        must: {
          match: {
            file_path: '16'
          }
        },
        filter: {
          term: {
            'file_path.tree' => '/User/alice'
          }
        }
      }
    }
  }
)
puts response
const response = await client.search({
  index: "file-path-test",
  query: {
    bool: {
      must: {
        match: {
          file_path: "16",
        },
      },
      filter: {
        term: {
          "file_path.tree": "/User/alice",
        },
      },
    },
  },
});
console.log(response);
GET file-path-test/_search
{
  "query": {
    "bool" : {
      "must" : {
        "match" : { "file_path" : "16" }
      },
      "filter": {
        "term" : { "file_path.tree" : "/User/alice" }
      }
    }
  }
}