前言
- 技术栈
Python 3.11
pyparsing 3.2.3
lark 1.2.2
loguru 0.7.2
案例
- 测试代码
# encoding: utf-8
# author: qbit
# date: 2024-04-23
# summary: 将与或非逻辑表达式转换为 ES 表达式
import json
import pyparsing as pp
from loguru import logger
# Sample input: terms of the form <field><operator><value>, combined with
# AND / OR / NOT and optional parentheses.
line = 'owner=x_111 AND doc_type=%x%_222 OR author=x_333 OR organ=x_444 AND (NOT pub_year=x_555)'

operator = (
    pp.Literal(r'=x_')        # exact match
    | pp.Literal(r'=%x%_')    # fuzzy match (wildcard before and after the value)
)
field = pp.Word(pp.alphanums + '_')
value = pp.Word(pp.alphanums)
exprGroup: pp.Group = pp.Group(field("field") + operator("operator") + value("value"))

# pp.Keyword matches the whole word only. pp.Word('AND') would accept any run
# of the characters A/N/D (e.g. "DNA"), and pp.Word('NOT') would split a field
# such as "NOTE" into NOT followed by "E".
logicAND = pp.Keyword('AND')('logic')
logicOR = pp.Keyword('OR')('logic')
logicNOT = pp.Keyword('NOT')('logic')

# infix_notation expects operators ordered from highest to lowest precedence.
# NOT must bind tightest so that "a AND NOT b" parses without parentheses.
exprForward = pp.infix_notation(
    exprGroup("Expr"),
    [
        (logicNOT, 1, pp.OpAssoc.RIGHT),
        (logicAND, 2, pp.OpAssoc.LEFT),
        (logicOR, 2, pp.OpAssoc.LEFT),
    ]
).set_results_name("Result", list_all_matches=True)

# parse_all=True rejects input with trailing text the grammar cannot consume.
result: pp.ParseResults = exprForward.parse_string(line, parse_all=True)
logger.debug(f"result list: \n{json.dumps(result.as_list(), indent=4)}")
def list2dsl(lst):
r''' 将 pyparsing 解析出来的列表递归转化为 Elasticsearch DSL '''
if (len(lst) == 1) and isinstance(lst[0], list): # 列表中只有一个列表元素
return list2dsl(lst[0])
if lst[0] == 'NOT':
return {
'bool': {
'must_not': list2dsl(lst[1])
}
}
match lst[1]:
case 'AND':
mustList = []
for item in lst:
if item != 'AND':
mustList.append(list2dsl(item))
return {
'bool': {
'must': mustList
}
}
case 'OR':
shouldList = []
for item in lst:
if item != 'OR':
shouldList.append(list2dsl(item))
return {
'bool': {
'should': shouldList
}
}
case r'=x_':
return {
'query_string': f"{lst[0]}:{lst[2]}"
}
case r'=%x%_':
return {
'query_string': f"{lst[0]}:*{lst[2]}*"
}
case _:
pass
# Render the parsed expression as pretty-printed DSL JSON, then log it
# together with the source expression it was built from.
parsed_list = result.as_list()
esdsl = json.dumps(list2dsl(parsed_list), indent=4)
logger.debug(f"es dsl: \n {esdsl}")
logger.debug(f"line: {line}")
- 输出列表
[
[
[
[
"owner",
"=x_",
"111"
],
"AND",
[
"doc_type",
"=%x%_",
"222"
]
],
"OR",
[
"author",
"=x_",
"333"
],
"OR",
[
[
"organ",
"=x_",
"444"
],
"AND",
[
"NOT",
[
"pub_year",
"=x_",
"555"
]
]
]
]
]
- 测试 DSL
{
"bool": {
"should": [
{
"bool": {
"must": [
{
"query_string": "owner:111"
},
{
"query_string": "doc_type:*222*"
}
]
}
},
{
"query_string": "author:333"
},
{
"bool": {
"must": [
{
"query_string": "organ:444"
},
{
"bool": {
"must_not": {
"query_string": "pub_year:555"
}
}
}
]
}
}
]
}
}
lark 实现版本
- 2025 年 4 月发现 pyparsing 在解析多层嵌套的复杂表达式时速度过慢,改用 lark 后速度快很多
多层嵌套表达式示例
(doc_type=x_pvls1o2r) AND ((((((((collection=x_pjywkacw) OR (collection=x_pogw2kli)) OR (collection=x_pgj4ygwz)) OR (collection=x_p0ly5jmy)) OR (collection=x_pedp4esz)) OR (collection=x_paghniy0)) OR (collection=x_pzckpz0n)))
lark
代码示例
# encoding: utf-8
# author: qbit
# date: 2025-04-14
# summary: convert an AND/OR/NOT logic expression into an ES expression (using lark)
import json
from lark import Lark, Transformer
from loguru import logger

line = "owner=x_111 AND doc_type=%x%_222 OR (author=x_333 OR author=x_331) OR organ=x_444 AND (NOT pub_year=x_555)"

# Rules run from the loosest-binding operator (OR) down to the tightest
# (NOT, then atoms): OR < AND < NOT.
grammar = r"""
    ?start: expr
    ?expr: or_expr
    ?or_expr: and_expr
        | or_expr "OR" and_expr -> or_op
    ?and_expr: not_expr
        | and_expr "AND" not_expr -> and_op
    ?not_expr: atom
        | "NOT" atom -> not_op
    ?atom: term
        | "(" expr ")"
    term: FIELD OPERATOR VALUE
    FIELD: /[a-zA-Z0-9_]+/
    OPERATOR: "=x_" | "=%x%_"
    VALUE: /[a-zA-Z0-9]+/
    %import common.WS
    %ignore WS
"""


class LogicExprTransformer(Transformer):
    """Build nested lists of the form [operand, "AND"/"OR", operand, ...] from the parse tree."""

    @staticmethod
    def _chain(logic, left, right):
        """Join ``left`` and ``right`` with ``logic``, flattening same-operator chains.

        An operand that is already a chain of the same operator (a list longer
        than two items whose second item is ``logic``) is spliced in, so that
        "a OR b OR c" yields one flat list instead of nested pairs.
        """
        def operands(side):
            same_chain = isinstance(side, list) and len(side) > 2 and side[1] == logic
            return side if same_chain else [side]

        return [*operands(left), logic, *operands(right)]

    def term(self, children):
        """Turn the FIELD, OPERATOR and VALUE tokens into a list of three strings."""
        field, op, val = children
        return [field.value, op.value, val.value]

    def and_op(self, children):
        """Combine the two operands of an AND, flattening nested AND chains."""
        left, right = children
        return self._chain("AND", left, right)

    def or_op(self, children):
        """Combine the two operands of an OR, flattening nested OR chains."""
        left, right = children
        return self._chain("OR", left, right)

    def not_op(self, children):
        """Wrap the single operand of a NOT."""
        return ["NOT", children[0]]


# Create the parser; the transformer is applied while parsing.
parser = Lark(grammar, parser="lalr", transformer=LogicExprTransformer())
# Parse the expression.
result = parser.parse(line)
logger.debug(f"result list: \n{json.dumps(result, indent=4)}")
相关资料
库
- https://pypi.org/project/pyparsing/
- https://pypi.org/project/boolean-parser
- https://pypi.org/project/parsimonious/
- https://pypi.org/project/sympy/
文章
- PyParsing 官方文档:https://pyparsing-docs.readthedocs.io/en/latest/
- pyparsing 学习(博客园)
- 取代正则-使用pyparsing来定制自己的解析器(知乎)
- Pyparsing快速构建解释器 | 实战搜索查询语法(知乎)
- parsing logical expression with pyparsing
- Pyparsing实战(知乎)
- pyparsing 2.x 案例
本文出自 qbit snap
**粗体** _斜体_ [链接](http://example.com) `代码` - 列表 > 引用
。你还可以使用@
来通知其他用户。