路径层次分词器
编辑路径层次分词器
path_hierarchy
分词器接受类似文件系统路径的层次结构值,根据路径分隔符进行分割,并为树中的每个组件生成一个词项。path_hierarchy
分词器在底层使用 Lucene 的 PathHierarchyTokenizer。
示例输出
resp = client.indices.analyze( tokenizer="path_hierarchy", text="/one/two/three", ) print(resp)
response = client.indices.analyze( body: { tokenizer: 'path_hierarchy', text: '/one/two/three' } ) puts response
const response = await client.indices.analyze({ tokenizer: "path_hierarchy", text: "/one/two/three", }); console.log(response);
POST _analyze { "tokenizer": "path_hierarchy", "text": "/one/two/three" }
以上文本将生成以下词项
[ /one, /one/two, /one/two/three ]
配置
path_hierarchy
分词器接受以下参数:
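delimiter
用作路径分隔符的字符。默认为 / 。
replacement
用于替换分隔符的可选字符。默认与 delimiter 的值相同。
buffer_size
单次读取到词项缓冲区中的字符数。默认为 1024 。
reverse
如果为 true ,则使用 Lucene 的 ReversePathHierarchyTokenizer,适用于类似域名的层次结构。默认为 false 。
skip
要跳过的初始词项数。默认为 0 。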
示例配置
在此示例中,我们将 path_hierarchy
分词器配置为按 -
字符分割,并将分隔符替换为 /
,同时跳过前两个词项。
resp = client.indices.create( index="my-index-000001", settings={ "analysis": { "analyzer": { "my_analyzer": { "tokenizer": "my_tokenizer" } }, "tokenizer": { "my_tokenizer": { "type": "path_hierarchy", "delimiter": "-", "replacement": "/", "skip": 2 } } } }, ) print(resp) resp1 = client.indices.analyze( index="my-index-000001", analyzer="my_analyzer", text="one-two-three-four-five", ) print(resp1)
response = client.indices.create( index: 'my-index-000001', body: { settings: { analysis: { analyzer: { my_analyzer: { tokenizer: 'my_tokenizer' } }, tokenizer: { my_tokenizer: { type: 'path_hierarchy', delimiter: '-', replacement: '/', skip: 2 } } } } } ) puts response response = client.indices.analyze( index: 'my-index-000001', body: { analyzer: 'my_analyzer', text: 'one-two-three-four-five' } ) puts response
const response = await client.indices.create({ index: "my-index-000001", settings: { analysis: { analyzer: { my_analyzer: { tokenizer: "my_tokenizer", }, }, tokenizer: { my_tokenizer: { type: "path_hierarchy", delimiter: "-", replacement: "/", skip: 2, }, }, }, }, }); console.log(response); const response1 = await client.indices.analyze({ index: "my-index-000001", analyzer: "my_analyzer", text: "one-two-three-four-five", }); console.log(response1);
PUT my-index-000001 { "settings": { "analysis": { "analyzer": { "my_analyzer": { "tokenizer": "my_tokenizer" } }, "tokenizer": { "my_tokenizer": { "type": "path_hierarchy", "delimiter": "-", "replacement": "/", "skip": 2 } } } } } POST my-index-000001/_analyze { "analyzer": "my_analyzer", "text": "one-two-three-four-five" }
以上示例生成以下词项
[ /three, /three/four, /three/four/five ]
如果我们将 reverse
设置为 true
,它将生成以下内容
[ one/two/three/, two/three/, three/ ]
详细示例
path_hierarchy
分词器的一个常见用例是按文件路径过滤结果。如果将文件路径与数据一起索引,那么使用 path_hierarchy
分词器分析该路径后,就可以按文件路径字符串的不同部分过滤结果。
此示例为索引配置两个自定义分析器,并将它们应用于 file_path
文本字段(用于存储文件名)的多字段(multi-fields)。其中一个分析器使用反向分词。然后索引一些示例文档,这些文档表示两个不同用户的照片文件夹中若干照片的文件路径。
resp = client.indices.create( index="file-path-test", settings={ "analysis": { "analyzer": { "custom_path_tree": { "tokenizer": "custom_hierarchy" }, "custom_path_tree_reversed": { "tokenizer": "custom_hierarchy_reversed" } }, "tokenizer": { "custom_hierarchy": { "type": "path_hierarchy", "delimiter": "/" }, "custom_hierarchy_reversed": { "type": "path_hierarchy", "delimiter": "/", "reverse": "true" } } } }, mappings={ "properties": { "file_path": { "type": "text", "fields": { "tree": { "type": "text", "analyzer": "custom_path_tree" }, "tree_reversed": { "type": "text", "analyzer": "custom_path_tree_reversed" } } } } }, ) print(resp) resp1 = client.index( index="file-path-test", id="1", document={ "file_path": "/User/alice/photos/2017/05/16/my_photo1.jpg" }, ) print(resp1) resp2 = client.index( index="file-path-test", id="2", document={ "file_path": "/User/alice/photos/2017/05/16/my_photo2.jpg" }, ) print(resp2) resp3 = client.index( index="file-path-test", id="3", document={ "file_path": "/User/alice/photos/2017/05/16/my_photo3.jpg" }, ) print(resp3) resp4 = client.index( index="file-path-test", id="4", document={ "file_path": "/User/alice/photos/2017/05/15/my_photo1.jpg" }, ) print(resp4) resp5 = client.index( index="file-path-test", id="5", document={ "file_path": "/User/bob/photos/2017/05/16/my_photo1.jpg" }, ) print(resp5)
response = client.indices.create( index: 'file-path-test', body: { settings: { analysis: { analyzer: { custom_path_tree: { tokenizer: 'custom_hierarchy' }, custom_path_tree_reversed: { tokenizer: 'custom_hierarchy_reversed' } }, tokenizer: { custom_hierarchy: { type: 'path_hierarchy', delimiter: '/' }, custom_hierarchy_reversed: { type: 'path_hierarchy', delimiter: '/', reverse: 'true' } } } }, mappings: { properties: { file_path: { type: 'text', fields: { tree: { type: 'text', analyzer: 'custom_path_tree' }, tree_reversed: { type: 'text', analyzer: 'custom_path_tree_reversed' } } } } } } ) puts response response = client.index( index: 'file-path-test', id: 1, body: { file_path: '/User/alice/photos/2017/05/16/my_photo1.jpg' } ) puts response response = client.index( index: 'file-path-test', id: 2, body: { file_path: '/User/alice/photos/2017/05/16/my_photo2.jpg' } ) puts response response = client.index( index: 'file-path-test', id: 3, body: { file_path: '/User/alice/photos/2017/05/16/my_photo3.jpg' } ) puts response response = client.index( index: 'file-path-test', id: 4, body: { file_path: '/User/alice/photos/2017/05/15/my_photo1.jpg' } ) puts response response = client.index( index: 'file-path-test', id: 5, body: { file_path: '/User/bob/photos/2017/05/16/my_photo1.jpg' } ) puts response
const response = await client.indices.create({ index: "file-path-test", settings: { analysis: { analyzer: { custom_path_tree: { tokenizer: "custom_hierarchy", }, custom_path_tree_reversed: { tokenizer: "custom_hierarchy_reversed", }, }, tokenizer: { custom_hierarchy: { type: "path_hierarchy", delimiter: "/", }, custom_hierarchy_reversed: { type: "path_hierarchy", delimiter: "/", reverse: "true", }, }, }, }, mappings: { properties: { file_path: { type: "text", fields: { tree: { type: "text", analyzer: "custom_path_tree", }, tree_reversed: { type: "text", analyzer: "custom_path_tree_reversed", }, }, }, }, }, }); console.log(response); const response1 = await client.index({ index: "file-path-test", id: 1, document: { file_path: "/User/alice/photos/2017/05/16/my_photo1.jpg", }, }); console.log(response1); const response2 = await client.index({ index: "file-path-test", id: 2, document: { file_path: "/User/alice/photos/2017/05/16/my_photo2.jpg", }, }); console.log(response2); const response3 = await client.index({ index: "file-path-test", id: 3, document: { file_path: "/User/alice/photos/2017/05/16/my_photo3.jpg", }, }); console.log(response3); const response4 = await client.index({ index: "file-path-test", id: 4, document: { file_path: "/User/alice/photos/2017/05/15/my_photo1.jpg", }, }); console.log(response4); const response5 = await client.index({ index: "file-path-test", id: 5, document: { file_path: "/User/bob/photos/2017/05/16/my_photo1.jpg", }, }); console.log(response5);
PUT file-path-test { "settings": { "analysis": { "analyzer": { "custom_path_tree": { "tokenizer": "custom_hierarchy" }, "custom_path_tree_reversed": { "tokenizer": "custom_hierarchy_reversed" } }, "tokenizer": { "custom_hierarchy": { "type": "path_hierarchy", "delimiter": "/" }, "custom_hierarchy_reversed": { "type": "path_hierarchy", "delimiter": "/", "reverse": "true" } } } }, "mappings": { "properties": { "file_path": { "type": "text", "fields": { "tree": { "type": "text", "analyzer": "custom_path_tree" }, "tree_reversed": { "type": "text", "analyzer": "custom_path_tree_reversed" } } } } } } POST file-path-test/_doc/1 { "file_path": "/User/alice/photos/2017/05/16/my_photo1.jpg" } POST file-path-test/_doc/2 { "file_path": "/User/alice/photos/2017/05/16/my_photo2.jpg" } POST file-path-test/_doc/3 { "file_path": "/User/alice/photos/2017/05/16/my_photo3.jpg" } POST file-path-test/_doc/4 { "file_path": "/User/alice/photos/2017/05/15/my_photo1.jpg" } POST file-path-test/_doc/5 { "file_path": "/User/bob/photos/2017/05/16/my_photo1.jpg" }
针对文本字段搜索特定的文件路径字符串会匹配所有示例文档;其中 Bob 的文档排名最高,因为 bob
也是标准分析器生成的词项之一,这提升了 Bob 文档的相关性得分。
resp = client.search( index="file-path-test", query={ "match": { "file_path": "/User/bob/photos/2017/05" } }, ) print(resp)
response = client.search( index: 'file-path-test', body: { query: { match: { file_path: '/User/bob/photos/2017/05' } } } ) puts response
const response = await client.search({ index: "file-path-test", query: { match: { file_path: "/User/bob/photos/2017/05", }, }, }); console.log(response);
GET file-path-test/_search { "query": { "match": { "file_path": "/User/bob/photos/2017/05" } } }
使用 file_path.tree
字段,可以轻松匹配或过滤文件路径位于特定目录下的文档。
resp = client.search( index="file-path-test", query={ "term": { "file_path.tree": "/User/alice/photos/2017/05/16" } }, ) print(resp)
response = client.search( index: 'file-path-test', body: { query: { term: { 'file_path.tree' => '/User/alice/photos/2017/05/16' } } } ) puts response
const response = await client.search({ index: "file-path-test", query: { term: { "file_path.tree": "/User/alice/photos/2017/05/16", }, }, }); console.log(response);
GET file-path-test/_search { "query": { "term": { "file_path.tree": "/User/alice/photos/2017/05/16" } } }
使用此分词器的反向参数,还可以从文件路径的另一端匹配,例如单个文件名或深层子目录。以下示例显示了通过配置为在映射中使用反向参数的 file_path.tree_reversed
字段搜索任何目录中名为 my_photo1.jpg
的所有文件。
resp = client.search( index="file-path-test", query={ "term": { "file_path.tree_reversed": { "value": "my_photo1.jpg" } } }, ) print(resp)
response = client.search( index: 'file-path-test', body: { query: { term: { 'file_path.tree_reversed' => { value: 'my_photo1.jpg' } } } } ) puts response
const response = await client.search({ index: "file-path-test", query: { term: { "file_path.tree_reversed": { value: "my_photo1.jpg", }, }, }, }); console.log(response);
GET file-path-test/_search { "query": { "term": { "file_path.tree_reversed": { "value": "my_photo1.jpg" } } } }
查看正向和反向生成的词项,有助于展示为相同文件路径值创建的词项。
resp = client.indices.analyze( index="file-path-test", analyzer="custom_path_tree", text="/User/alice/photos/2017/05/16/my_photo1.jpg", ) print(resp) resp1 = client.indices.analyze( index="file-path-test", analyzer="custom_path_tree_reversed", text="/User/alice/photos/2017/05/16/my_photo1.jpg", ) print(resp1)
response = client.indices.analyze( index: 'file-path-test', body: { analyzer: 'custom_path_tree', text: '/User/alice/photos/2017/05/16/my_photo1.jpg' } ) puts response response = client.indices.analyze( index: 'file-path-test', body: { analyzer: 'custom_path_tree_reversed', text: '/User/alice/photos/2017/05/16/my_photo1.jpg' } ) puts response
const response = await client.indices.analyze({ index: "file-path-test", analyzer: "custom_path_tree", text: "/User/alice/photos/2017/05/16/my_photo1.jpg", }); console.log(response); const response1 = await client.indices.analyze({ index: "file-path-test", analyzer: "custom_path_tree_reversed", text: "/User/alice/photos/2017/05/16/my_photo1.jpg", }); console.log(response1);
POST file-path-test/_analyze { "analyzer": "custom_path_tree", "text": "/User/alice/photos/2017/05/16/my_photo1.jpg" } POST file-path-test/_analyze { "analyzer": "custom_path_tree_reversed", "text": "/User/alice/photos/2017/05/16/my_photo1.jpg" }
按文件路径过滤在与其他类型的搜索结合使用时也很有用。例如,此示例查找包含 16
且必须位于 Alice 照片目录中的所有文件路径。
resp = client.search( index="file-path-test", query={ "bool": { "must": { "match": { "file_path": "16" } }, "filter": { "term": { "file_path.tree": "/User/alice" } } } }, ) print(resp)
response = client.search( index: 'file-path-test', body: { query: { bool: { must: { match: { file_path: '16' } }, filter: { term: { 'file_path.tree' => '/User/alice' } } } } } ) puts response
const response = await client.search({ index: "file-path-test", query: { bool: { must: { match: { file_path: "16", }, }, filter: { term: { "file_path.tree": "/User/alice", }, }, }, }, }); console.log(response);
GET file-path-test/_search { "query": { "bool" : { "must" : { "match" : { "file_path" : "16" } }, "filter": { "term" : { "file_path.tree" : "/User/alice" } } } } }