路径层次分词器

编辑

path_hierarchy 分词器接受类似文件系统路径的层次结构值,根据路径分隔符进行分割,并为树中的每个组件生成一个词项。path_hierarchy 分词器在底层使用 Lucene 的 PathHierarchyTokenizer。

示例输出

编辑
resp = client.indices.analyze(
    tokenizer="path_hierarchy",
    text="/one/two/three",
)
print(resp)
response = client.indices.analyze(
  body: {
    tokenizer: 'path_hierarchy',
    text: '/one/two/three'
  }
)
puts response
const response = await client.indices.analyze({
  tokenizer: "path_hierarchy",
  text: "/one/two/three",
});
console.log(response);
POST _analyze
{
  "tokenizer": "path_hierarchy",
  "text": "/one/two/three"
}

以上文本将生成以下词项

[ /one, /one/two, /one/two/three ]

配置

编辑

path_hierarchy 分词器接受以下参数

delimiter

用作路径分隔符的字符。默认为 /

replacement

用于分隔符的可选替换字符。默认为 delimiter

buffer_size

单次读取到词项缓冲区中的字符数。默认为 1024。词项缓冲区将以该大小增长,直到所有文本都被消耗。建议不要更改此设置。

reverse

如果为 true,则使用 Lucene 的 ReversePathHierarchyTokenizer,它适用于类似域名的层次结构。默认为 false。

skip

要跳过的初始词项数。默认为 0

示例配置

编辑

在此示例中,我们将 path_hierarchy 分词器配置为按 - 字符分割,将分隔符替换为 /,并跳过前两个词项。

resp = client.indices.create(
    index="my-index-000001",
    settings={
        "analysis": {
            "analyzer": {
                "my_analyzer": {
                    "tokenizer": "my_tokenizer"
                }
            },
            "tokenizer": {
                "my_tokenizer": {
                    "type": "path_hierarchy",
                    "delimiter": "-",
                    "replacement": "/",
                    "skip": 2
                }
            }
        }
    },
)
print(resp)

resp1 = client.indices.analyze(
    index="my-index-000001",
    analyzer="my_analyzer",
    text="one-two-three-four-five",
)
print(resp1)
response = client.indices.create(
  index: 'my-index-000001',
  body: {
    settings: {
      analysis: {
        analyzer: {
          my_analyzer: {
            tokenizer: 'my_tokenizer'
          }
        },
        tokenizer: {
          my_tokenizer: {
            type: 'path_hierarchy',
            delimiter: '-',
            replacement: '/',
            skip: 2
          }
        }
      }
    }
  }
)
puts response

response = client.indices.analyze(
  index: 'my-index-000001',
  body: {
    analyzer: 'my_analyzer',
    text: 'one-two-three-four-five'
  }
)
puts response
const response = await client.indices.create({
  index: "my-index-000001",
  settings: {
    analysis: {
      analyzer: {
        my_analyzer: {
          tokenizer: "my_tokenizer",
        },
      },
      tokenizer: {
        my_tokenizer: {
          type: "path_hierarchy",
          delimiter: "-",
          replacement: "/",
          skip: 2,
        },
      },
    },
  },
});
console.log(response);

const response1 = await client.indices.analyze({
  index: "my-index-000001",
  analyzer: "my_analyzer",
  text: "one-two-three-four-five",
});
console.log(response1);
PUT my-index-000001
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_analyzer": {
          "tokenizer": "my_tokenizer"
        }
      },
      "tokenizer": {
        "my_tokenizer": {
          "type": "path_hierarchy",
          "delimiter": "-",
          "replacement": "/",
          "skip": 2
        }
      }
    }
  }
}

POST my-index-000001/_analyze
{
  "analyzer": "my_analyzer",
  "text": "one-two-three-four-five"
}

以上示例生成以下词项

[ /three, /three/four, /three/four/five ]

如果我们将 reverse 设置为 true,它将生成以下内容

[ one/two/three/, two/three/, three/ ]

详细示例

编辑

path_hierarchy 分词器的常见用例是按文件路径过滤结果。如果将文件路径与数据一起索引,那么使用 path_hierarchy 分词器分析该路径后,就可以按文件路径字符串的不同部分过滤结果。

此示例配置一个具有两个自定义分析器的索引,并将这两个分析器应用于 file_path 文本字段(用于存储文件名)的多字段。其中一个分析器使用反向分词。然后索引一些示例文档,这些文档表示两个不同用户的照片文件夹中若干照片的文件路径。

resp = client.indices.create(
    index="file-path-test",
    settings={
        "analysis": {
            "analyzer": {
                "custom_path_tree": {
                    "tokenizer": "custom_hierarchy"
                },
                "custom_path_tree_reversed": {
                    "tokenizer": "custom_hierarchy_reversed"
                }
            },
            "tokenizer": {
                "custom_hierarchy": {
                    "type": "path_hierarchy",
                    "delimiter": "/"
                },
                "custom_hierarchy_reversed": {
                    "type": "path_hierarchy",
                    "delimiter": "/",
                    "reverse": "true"
                }
            }
        }
    },
    mappings={
        "properties": {
            "file_path": {
                "type": "text",
                "fields": {
                    "tree": {
                        "type": "text",
                        "analyzer": "custom_path_tree"
                    },
                    "tree_reversed": {
                        "type": "text",
                        "analyzer": "custom_path_tree_reversed"
                    }
                }
            }
        }
    },
)
print(resp)

resp1 = client.index(
    index="file-path-test",
    id="1",
    document={
        "file_path": "/User/alice/photos/2017/05/16/my_photo1.jpg"
    },
)
print(resp1)

resp2 = client.index(
    index="file-path-test",
    id="2",
    document={
        "file_path": "/User/alice/photos/2017/05/16/my_photo2.jpg"
    },
)
print(resp2)

resp3 = client.index(
    index="file-path-test",
    id="3",
    document={
        "file_path": "/User/alice/photos/2017/05/16/my_photo3.jpg"
    },
)
print(resp3)

resp4 = client.index(
    index="file-path-test",
    id="4",
    document={
        "file_path": "/User/alice/photos/2017/05/15/my_photo1.jpg"
    },
)
print(resp4)

resp5 = client.index(
    index="file-path-test",
    id="5",
    document={
        "file_path": "/User/bob/photos/2017/05/16/my_photo1.jpg"
    },
)
print(resp5)
response = client.indices.create(
  index: 'file-path-test',
  body: {
    settings: {
      analysis: {
        analyzer: {
          custom_path_tree: {
            tokenizer: 'custom_hierarchy'
          },
          custom_path_tree_reversed: {
            tokenizer: 'custom_hierarchy_reversed'
          }
        },
        tokenizer: {
          custom_hierarchy: {
            type: 'path_hierarchy',
            delimiter: '/'
          },
          custom_hierarchy_reversed: {
            type: 'path_hierarchy',
            delimiter: '/',
            reverse: 'true'
          }
        }
      }
    },
    mappings: {
      properties: {
        file_path: {
          type: 'text',
          fields: {
            tree: {
              type: 'text',
              analyzer: 'custom_path_tree'
            },
            tree_reversed: {
              type: 'text',
              analyzer: 'custom_path_tree_reversed'
            }
          }
        }
      }
    }
  }
)
puts response

response = client.index(
  index: 'file-path-test',
  id: 1,
  body: {
    file_path: '/User/alice/photos/2017/05/16/my_photo1.jpg'
  }
)
puts response

response = client.index(
  index: 'file-path-test',
  id: 2,
  body: {
    file_path: '/User/alice/photos/2017/05/16/my_photo2.jpg'
  }
)
puts response

response = client.index(
  index: 'file-path-test',
  id: 3,
  body: {
    file_path: '/User/alice/photos/2017/05/16/my_photo3.jpg'
  }
)
puts response

response = client.index(
  index: 'file-path-test',
  id: 4,
  body: {
    file_path: '/User/alice/photos/2017/05/15/my_photo1.jpg'
  }
)
puts response

response = client.index(
  index: 'file-path-test',
  id: 5,
  body: {
    file_path: '/User/bob/photos/2017/05/16/my_photo1.jpg'
  }
)
puts response
const response = await client.indices.create({
  index: "file-path-test",
  settings: {
    analysis: {
      analyzer: {
        custom_path_tree: {
          tokenizer: "custom_hierarchy",
        },
        custom_path_tree_reversed: {
          tokenizer: "custom_hierarchy_reversed",
        },
      },
      tokenizer: {
        custom_hierarchy: {
          type: "path_hierarchy",
          delimiter: "/",
        },
        custom_hierarchy_reversed: {
          type: "path_hierarchy",
          delimiter: "/",
          reverse: "true",
        },
      },
    },
  },
  mappings: {
    properties: {
      file_path: {
        type: "text",
        fields: {
          tree: {
            type: "text",
            analyzer: "custom_path_tree",
          },
          tree_reversed: {
            type: "text",
            analyzer: "custom_path_tree_reversed",
          },
        },
      },
    },
  },
});
console.log(response);

const response1 = await client.index({
  index: "file-path-test",
  id: 1,
  document: {
    file_path: "/User/alice/photos/2017/05/16/my_photo1.jpg",
  },
});
console.log(response1);

const response2 = await client.index({
  index: "file-path-test",
  id: 2,
  document: {
    file_path: "/User/alice/photos/2017/05/16/my_photo2.jpg",
  },
});
console.log(response2);

const response3 = await client.index({
  index: "file-path-test",
  id: 3,
  document: {
    file_path: "/User/alice/photos/2017/05/16/my_photo3.jpg",
  },
});
console.log(response3);

const response4 = await client.index({
  index: "file-path-test",
  id: 4,
  document: {
    file_path: "/User/alice/photos/2017/05/15/my_photo1.jpg",
  },
});
console.log(response4);

const response5 = await client.index({
  index: "file-path-test",
  id: 5,
  document: {
    file_path: "/User/bob/photos/2017/05/16/my_photo1.jpg",
  },
});
console.log(response5);
PUT file-path-test
{
  "settings": {
    "analysis": {
      "analyzer": {
        "custom_path_tree": {
          "tokenizer": "custom_hierarchy"
        },
        "custom_path_tree_reversed": {
          "tokenizer": "custom_hierarchy_reversed"
        }
      },
      "tokenizer": {
        "custom_hierarchy": {
          "type": "path_hierarchy",
          "delimiter": "/"
        },
        "custom_hierarchy_reversed": {
          "type": "path_hierarchy",
          "delimiter": "/",
          "reverse": "true"
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "file_path": {
        "type": "text",
        "fields": {
          "tree": {
            "type": "text",
            "analyzer": "custom_path_tree"
          },
          "tree_reversed": {
            "type": "text",
            "analyzer": "custom_path_tree_reversed"
          }
        }
      }
    }
  }
}

POST file-path-test/_doc/1
{
  "file_path": "/User/alice/photos/2017/05/16/my_photo1.jpg"
}

POST file-path-test/_doc/2
{
  "file_path": "/User/alice/photos/2017/05/16/my_photo2.jpg"
}

POST file-path-test/_doc/3
{
  "file_path": "/User/alice/photos/2017/05/16/my_photo3.jpg"
}

POST file-path-test/_doc/4
{
  "file_path": "/User/alice/photos/2017/05/15/my_photo1.jpg"
}

POST file-path-test/_doc/5
{
  "file_path": "/User/bob/photos/2017/05/16/my_photo1.jpg"
}

针对 file_path 文本字段搜索特定的文件路径字符串会匹配所有示例文档。其中 Bob 的文档排名最高,因为 bob 也是标准分析器生成的词项之一,这提高了 Bob 文档的相关性得分。

resp = client.search(
    index="file-path-test",
    query={
        "match": {
            "file_path": "/User/bob/photos/2017/05"
        }
    },
)
print(resp)
response = client.search(
  index: 'file-path-test',
  body: {
    query: {
      match: {
        file_path: '/User/bob/photos/2017/05'
      }
    }
  }
)
puts response
const response = await client.search({
  index: "file-path-test",
  query: {
    match: {
      file_path: "/User/bob/photos/2017/05",
    },
  },
});
console.log(response);
GET file-path-test/_search
{
  "query": {
    "match": {
      "file_path": "/User/bob/photos/2017/05"
    }
  }
}

使用 file_path.tree 字段,可以轻松匹配或过滤文件路径位于特定目录中的文档。

resp = client.search(
    index="file-path-test",
    query={
        "term": {
            "file_path.tree": "/User/alice/photos/2017/05/16"
        }
    },
)
print(resp)
response = client.search(
  index: 'file-path-test',
  body: {
    query: {
      term: {
        'file_path.tree' => '/User/alice/photos/2017/05/16'
      }
    }
  }
)
puts response
const response = await client.search({
  index: "file-path-test",
  query: {
    term: {
      "file_path.tree": "/User/alice/photos/2017/05/16",
    },
  },
});
console.log(response);
GET file-path-test/_search
{
  "query": {
    "term": {
      "file_path.tree": "/User/alice/photos/2017/05/16"
    }
  }
}

使用此分词器的 reverse 参数,还可以从文件路径的另一端进行匹配,例如匹配单个文件名或深层子目录。以下示例通过 file_path.tree_reversed 字段(该字段在映射中配置为使用 reverse 参数)搜索任意目录中名为 my_photo1.jpg 的所有文件。

resp = client.search(
    index="file-path-test",
    query={
        "term": {
            "file_path.tree_reversed": {
                "value": "my_photo1.jpg"
            }
        }
    },
)
print(resp)
response = client.search(
  index: 'file-path-test',
  body: {
    query: {
      term: {
        'file_path.tree_reversed' => {
          value: 'my_photo1.jpg'
        }
      }
    }
  }
)
puts response
const response = await client.search({
  index: "file-path-test",
  query: {
    term: {
      "file_path.tree_reversed": {
        value: "my_photo1.jpg",
      },
    },
  },
});
console.log(response);
GET file-path-test/_search
{
  "query": {
    "term": {
      "file_path.tree_reversed": {
        "value": "my_photo1.jpg"
      }
    }
  }
}

同时查看正向和反向分词生成的词项,有助于了解同一文件路径值会产生哪些词项。

resp = client.indices.analyze(
    index="file-path-test",
    analyzer="custom_path_tree",
    text="/User/alice/photos/2017/05/16/my_photo1.jpg",
)
print(resp)

resp1 = client.indices.analyze(
    index="file-path-test",
    analyzer="custom_path_tree_reversed",
    text="/User/alice/photos/2017/05/16/my_photo1.jpg",
)
print(resp1)
response = client.indices.analyze(
  index: 'file-path-test',
  body: {
    analyzer: 'custom_path_tree',
    text: '/User/alice/photos/2017/05/16/my_photo1.jpg'
  }
)
puts response

response = client.indices.analyze(
  index: 'file-path-test',
  body: {
    analyzer: 'custom_path_tree_reversed',
    text: '/User/alice/photos/2017/05/16/my_photo1.jpg'
  }
)
puts response
const response = await client.indices.analyze({
  index: "file-path-test",
  analyzer: "custom_path_tree",
  text: "/User/alice/photos/2017/05/16/my_photo1.jpg",
});
console.log(response);

const response1 = await client.indices.analyze({
  index: "file-path-test",
  analyzer: "custom_path_tree_reversed",
  text: "/User/alice/photos/2017/05/16/my_photo1.jpg",
});
console.log(response1);
POST file-path-test/_analyze
{
  "analyzer": "custom_path_tree",
  "text": "/User/alice/photos/2017/05/16/my_photo1.jpg"
}

POST file-path-test/_analyze
{
  "analyzer": "custom_path_tree_reversed",
  "text": "/User/alice/photos/2017/05/16/my_photo1.jpg"
}

将文件路径过滤与其他类型的搜索结合使用时也很有用。例如,此示例查找包含 16 且必须位于 Alice 照片目录中的所有文件路径。

resp = client.search(
    index="file-path-test",
    query={
        "bool": {
            "must": {
                "match": {
                    "file_path": "16"
                }
            },
            "filter": {
                "term": {
                    "file_path.tree": "/User/alice"
                }
            }
        }
    },
)
print(resp)
response = client.search(
  index: 'file-path-test',
  body: {
    query: {
      bool: {
        must: {
          match: {
            file_path: '16'
          }
        },
        filter: {
          term: {
            'file_path.tree' => '/User/alice'
          }
        }
      }
    }
  }
)
puts response
const response = await client.search({
  index: "file-path-test",
  query: {
    bool: {
      must: {
        match: {
          file_path: "16",
        },
      },
      filter: {
        term: {
          "file_path.tree": "/User/alice",
        },
      },
    },
  },
});
console.log(response);
GET file-path-test/_search
{
  "query": {
    "bool" : {
      "must" : {
        "match" : { "file_path" : "16" }
      },
      "filter": {
        "term" : { "file_path.tree" : "/User/alice" }
      }
    }
  }
}