分隔符负载令牌过滤器
编辑分隔符负载令牌过滤器
旧名称 delimited_payload_filter 已弃用,不应在新索引中使用。请改用 delimited_payload。
根据指定的分隔符将令牌流分隔为令牌和负载。
例如,您可以将 delimited_payload
过滤器与 |
分隔符一起使用,将 the|1 quick|2 fox|3
分割为令牌 the
、quick
和 fox
,其各自的负载为 1
、2
和 3
。
此过滤器使用 Lucene 的 DelimitedPayloadTokenFilter。
负载
负载是与令牌位置关联的用户定义的二进制数据,并存储为 base64 编码的字节。
默认情况下,Elasticsearch 不存储令牌负载。要存储负载,您必须:
- 对于任何存储负载的字段,将 term_vector 映射参数设置为 with_positions_payloads 或 with_positions_offsets_payloads。
- 使用包含 delimited_payload 过滤器的索引分析器。
您可以使用 词项向量 API 查看存储的负载。
示例
以下 analyze API 请求使用带有默认 |
分隔符的 delimited_payload
过滤器,将 the|0 brown|10 fox|5 is|0 quick|10
分割为令牌和负载。
resp = client.indices.analyze( tokenizer="whitespace", filter=[ "delimited_payload" ], text="the|0 brown|10 fox|5 is|0 quick|10", ) print(resp)
response = client.indices.analyze( body: { tokenizer: 'whitespace', filter: [ 'delimited_payload' ], text: 'the|0 brown|10 fox|5 is|0 quick|10' } ) puts response
const response = await client.indices.analyze({ tokenizer: "whitespace", filter: ["delimited_payload"], text: "the|0 brown|10 fox|5 is|0 quick|10", }); console.log(response);
GET _analyze { "tokenizer": "whitespace", "filter": ["delimited_payload"], "text": "the|0 brown|10 fox|5 is|0 quick|10" }
过滤器生成以下令牌:
[ the, brown, fox, is, quick ]
请注意,analyze API 不会返回存储的负载。有关返回负载的示例,请参阅“返回存储的负载”。
添加到分析器
以下 创建索引 API 请求使用 delimited_payload
过滤器来配置新的 自定义分析器。
resp = client.indices.create( index="delimited_payload", settings={ "analysis": { "analyzer": { "whitespace_delimited_payload": { "tokenizer": "whitespace", "filter": [ "delimited_payload" ] } } } }, ) print(resp)
response = client.indices.create( index: 'delimited_payload', body: { settings: { analysis: { analyzer: { whitespace_delimited_payload: { tokenizer: 'whitespace', filter: [ 'delimited_payload' ] } } } } } ) puts response
const response = await client.indices.create({ index: "delimited_payload", settings: { analysis: { analyzer: { whitespace_delimited_payload: { tokenizer: "whitespace", filter: ["delimited_payload"], }, }, }, }, }); console.log(response);
PUT delimited_payload { "settings": { "analysis": { "analyzer": { "whitespace_delimited_payload": { "tokenizer": "whitespace", "filter": [ "delimited_payload" ] } } } } }
可配置参数
delimiter
- (可选,字符串)用于分隔令牌和负载的字符。默认为 | 。
encoding
- (可选,字符串)存储的负载的数据类型。有效值包括:
  - float:(默认)浮点数
  - identity:字符
  - int:整数
自定义并添加到分析器
要自定义 delimited_payload
过滤器,请复制它,并以此为基础创建新的自定义令牌过滤器。您可以使用其可配置参数修改过滤器。
例如,以下 创建索引 API 请求使用自定义 delimited_payload
过滤器来配置新的 自定义分析器。自定义 delimited_payload
过滤器使用 +
分隔符分隔令牌和负载。负载编码为整数。
resp = client.indices.create( index="delimited_payload_example", settings={ "analysis": { "analyzer": { "whitespace_plus_delimited": { "tokenizer": "whitespace", "filter": [ "plus_delimited" ] } }, "filter": { "plus_delimited": { "type": "delimited_payload", "delimiter": "+", "encoding": "int" } } } }, ) print(resp)
response = client.indices.create( index: 'delimited_payload_example', body: { settings: { analysis: { analyzer: { whitespace_plus_delimited: { tokenizer: 'whitespace', filter: [ 'plus_delimited' ] } }, filter: { plus_delimited: { type: 'delimited_payload', delimiter: '+', encoding: 'int' } } } } } ) puts response
const response = await client.indices.create({ index: "delimited_payload_example", settings: { analysis: { analyzer: { whitespace_plus_delimited: { tokenizer: "whitespace", filter: ["plus_delimited"], }, }, filter: { plus_delimited: { type: "delimited_payload", delimiter: "+", encoding: "int", }, }, }, }, }); console.log(response);
PUT delimited_payload_example { "settings": { "analysis": { "analyzer": { "whitespace_plus_delimited": { "tokenizer": "whitespace", "filter": [ "plus_delimited" ] } }, "filter": { "plus_delimited": { "type": "delimited_payload", "delimiter": "+", "encoding": "int" } } } } }
返回存储的负载
使用 创建索引 API 创建一个索引,该索引:
- 包含一个存储带有负载的词项向量的字段。
- 使用带有 delimited_payload 过滤器的 自定义索引分析器。
resp = client.indices.create( index="text_payloads", mappings={ "properties": { "text": { "type": "text", "term_vector": "with_positions_payloads", "analyzer": "payload_delimiter" } } }, settings={ "analysis": { "analyzer": { "payload_delimiter": { "tokenizer": "whitespace", "filter": [ "delimited_payload" ] } } } }, ) print(resp)
response = client.indices.create( index: 'text_payloads', body: { mappings: { properties: { text: { type: 'text', term_vector: 'with_positions_payloads', analyzer: 'payload_delimiter' } } }, settings: { analysis: { analyzer: { payload_delimiter: { tokenizer: 'whitespace', filter: [ 'delimited_payload' ] } } } } } ) puts response
const response = await client.indices.create({ index: "text_payloads", mappings: { properties: { text: { type: "text", term_vector: "with_positions_payloads", analyzer: "payload_delimiter", }, }, }, settings: { analysis: { analyzer: { payload_delimiter: { tokenizer: "whitespace", filter: ["delimited_payload"], }, }, }, }, }); console.log(response);
PUT text_payloads { "mappings": { "properties": { "text": { "type": "text", "term_vector": "with_positions_payloads", "analyzer": "payload_delimiter" } } }, "settings": { "analysis": { "analyzer": { "payload_delimiter": { "tokenizer": "whitespace", "filter": [ "delimited_payload" ] } } } } }
向索引添加包含负载的文档。
resp = client.index( index="text_payloads", id="1", document={ "text": "the|0 brown|3 fox|4 is|0 quick|10" }, ) print(resp)
response = client.index( index: 'text_payloads', id: 1, body: { text: 'the|0 brown|3 fox|4 is|0 quick|10' } ) puts response
const response = await client.index({ index: "text_payloads", id: 1, document: { text: "the|0 brown|3 fox|4 is|0 quick|10", }, }); console.log(response);
POST text_payloads/_doc/1 { "text": "the|0 brown|3 fox|4 is|0 quick|10" }
使用 词项向量 API 返回文档的令牌和 base64 编码的负载。
resp = client.termvectors( index="text_payloads", id="1", fields=[ "text" ], payloads=True, ) print(resp)
response = client.termvectors( index: 'text_payloads', id: 1, body: { fields: [ 'text' ], payloads: true } ) puts response
const response = await client.termvectors({ index: "text_payloads", id: 1, fields: ["text"], payloads: true, }); console.log(response);
GET text_payloads/_termvectors/1 { "fields": [ "text" ], "payloads": true }
API 返回以下响应:
{ "_index": "text_payloads", "_id": "1", "_version": 1, "found": true, "took": 8, "term_vectors": { "text": { "field_statistics": { "sum_doc_freq": 5, "doc_count": 1, "sum_ttf": 5 }, "terms": { "brown": { "term_freq": 1, "tokens": [ { "position": 1, "payload": "QEAAAA==" } ] }, "fox": { "term_freq": 1, "tokens": [ { "position": 2, "payload": "QIAAAA==" } ] }, "is": { "term_freq": 1, "tokens": [ { "position": 3, "payload": "AAAAAA==" } ] }, "quick": { "term_freq": 1, "tokens": [ { "position": 4, "payload": "QSAAAA==" } ] }, "the": { "term_freq": 1, "tokens": [ { "position": 0, "payload": "AAAAAA==" } ] } } } } }