summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: ptmcg <ptmcg@austin.rr.com> 2023-03-25 03:34:35 -0500
committer: ptmcg <ptmcg@austin.rr.com> 2023-03-25 03:34:35 -0500
commit: 2e98055c8dab3e00fd20f39cd815b7e2773886e7 (patch)
tree: f74c703691f444a6bae8ac91831175364dbdc0d4
parent: 9576e2fc2f6ab014ee9484e66926d28555306bcf (diff)
download: pyparsing-git-2e98055c8dab3e00fd20f39cd815b7e2773886e7.tar.gz
Update lucene_grammar.py example, fix * and ? wildcards, and corrected some tests. Addresses #455
-rw-r--r-- CHANGES 3
-rw-r--r-- examples/lucene_grammar.py 82
-rw-r--r-- tests/test_examples.py 4
3 files changed, 57 insertions, 32 deletions
diff --git a/CHANGES b/CHANGES
index cea0baa..fd210a5 100644
--- a/CHANGES
+++ b/CHANGES
@@ -7,6 +7,9 @@ Version 3.1.0a2 - (in development)
Updated ci.yml permissions to limit default access to source - submitted by Joyce
Brum of Google. Thanks so much!
+Updated the lucene_grammar.py example (better support for '*' and '?' wildcards)
+and corrected the test cases - brought to my attention by Elijah Nicol, good catch!
+
Version 3.1.0a1 - March, 2023
-----------------------------
diff --git a/examples/lucene_grammar.py b/examples/lucene_grammar.py
index dba27df..437c5e3 100644
--- a/examples/lucene_grammar.py
+++ b/examples/lucene_grammar.py
@@ -2,9 +2,10 @@
# lucene_grammar.py
#
# Copyright 2011, Paul McGuire
+# Updated 2023
#
# implementation of Lucene grammar, as described
-# at http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/docs/queryparsersyntax.html
+# at https://lucene.apache.org/core/2_9_4/queryparsersyntax.html
#
import pyparsing as pp
@@ -12,17 +13,18 @@ from pyparsing import pyparsing_common as ppc
pp.ParserElement.enablePackrat()
-COLON, LBRACK, RBRACK, LBRACE, RBRACE, TILDE, CARAT = map(pp.Literal, ":[]{}~^")
-LPAR, RPAR = map(pp.Suppress, "()")
-and_, or_, not_, to_ = map(pp.CaselessKeyword, "AND OR NOT TO".split())
+COLON, LBRACK, RBRACK, LBRACE, RBRACE, TILDE, CARAT = pp.Literal.using_each(":[]{}~^")
+LPAR, RPAR = pp.Suppress.using_each("()")
+and_, or_, not_, to_ = pp.CaselessKeyword.using_each("AND OR NOT TO".split())
keyword = and_ | or_ | not_ | to_
expression = pp.Forward()
valid_word = pp.Regex(
- r'([a-zA-Z0-9*_+.-]|\\\\|\\([+\-!(){}\[\]^"~*?:]|\|\||&&))+'
+ r'([a-zA-Z0-9_.-]|\\\\|\\([+\-!(){}\[\]^"~*?:]|\|\||&&))'
+ r'([a-zA-Z0-9*_+.-]|\\\\|\\([+\-!(){}\[\]^"~*?:]|\|\||&&)|\*|\?)*'
).setName("word")
-valid_word.setParseAction(
+valid_word.set_parse_action(
lambda t: t[0].replace("\\\\", chr(127)).replace("\\", "").replace(chr(127), "\\")
)
@@ -35,8 +37,8 @@ proximity_modifier = pp.Group(TILDE + integer("proximity"))
number = ppc.fnumber()
fuzzy_modifier = TILDE + pp.Optional(number, default=0.5)("fuzzy")
-term = pp.Forward().setName("field")
-field_name = valid_word().setName("fieldname")
+term = pp.Forward().set_name("field")
+field_name = valid_word().set_name("fieldname")
incl_range_search = pp.Group(LBRACK - term("lower") + to_ + term("upper") + RBRACK)
excl_range_search = pp.Group(LBRACE - term("lower") + to_ + term("upper") + RBRACE)
range_search = incl_range_search("incl_range") | excl_range_search("excl_range")
@@ -44,27 +46,28 @@ boost = CARAT - number("boost")
string_expr = pp.Group(string + proximity_modifier) | string
word_expr = pp.Group(valid_word + fuzzy_modifier) | valid_word
-term << (
+term <<= (
~keyword
+ pp.Optional(field_name("field") + COLON)
+ (word_expr | string_expr | range_search | pp.Group(LPAR + expression + RPAR))
+ pp.Optional(boost)
)
-term.setParseAction(lambda t: [t] if "field" in t or "boost" in t else None)
+term.set_parse_action(lambda t: [t] if "field" in t or "boost" in t else None)
-expression << pp.infixNotation(
+expression <<= pp.infixNotation(
term,
[
(required_modifier | prohibit_modifier, 1, pp.opAssoc.RIGHT),
- ((not_ | "!").setParseAction(lambda: "NOT"), 1, pp.opAssoc.RIGHT),
- ((and_ | "&&").setParseAction(lambda: "AND"), 2, pp.opAssoc.LEFT),
+ ((not_ | "!").set_parse_action(lambda: "NOT"), 1, pp.opAssoc.RIGHT),
+ ((and_ | "&&").set_parse_action(lambda: "AND"), 2, pp.opAssoc.LEFT),
(
- pp.Optional(or_ | "||").setName("or").setParseAction(lambda: "OR"),
+ pp.Optional(or_ | "||").setName("or").set_parse_action(lambda: "OR"),
2,
pp.opAssoc.LEFT,
),
],
-)
+).set_name("query expression")
+
if __name__ == "__main__":
@@ -84,6 +87,9 @@ if __name__ == "__main__":
title:"The Right Way" AND text:go
title:"Do it right" AND right
title:Do it right
+ te?t
+ test*
+ te*t
roam~
roam~0.8
"jakarta apache"~10
@@ -99,6 +105,7 @@ if __name__ == "__main__":
"jakarta apache" NOT "Apache Lucene"
"jakarta apache" -"Apache Lucene"
(jakarta OR apache) AND website
+ title:(+return +"pink panther")
\(1+1\)\:2
c\:\\windows
(fieldX:xxxxx OR fieldy:xxxxxxxx)^2 AND (fieldx:the OR fieldy:foo)
@@ -163,7 +170,6 @@ if __name__ == "__main__":
term~1.1
[A TO C]
t*erm*
- *term*
term term^3.0 term
term stop^3.0 term
term +stop term
@@ -202,11 +208,6 @@ if __name__ == "__main__":
bar blar {a TO z}
gack ( bar blar { a TO z})
gack (bar blar {a TO z})
- [* TO Z]
- [* TO z]
- [A TO *]
- [a TO *]
- [* TO *]
[\* TO \*]
\!blah
\:blah
@@ -237,7 +238,8 @@ if __name__ == "__main__":
XYZ
(item:\\ item:ABCD\\)
\*
- *
+ blah*blah
+ blah?blah
\\
\||
\&&
@@ -270,15 +272,9 @@ if __name__ == "__main__":
foo:zoo*
foo:zoo*^2
zoo
- foo:*
- foo:*^2
- *:foo
a:the OR a:foo
a:woo OR a:the
- *:*
- (*:*)
- +*:* -*:*
- the wizard of ozzy
+ "the wizard of ozzy"
"""
failtests = r"""
@@ -289,10 +285,33 @@ if __name__ == "__main__":
# multiple '^'s in term
(sub query)^5.0^2.0 plus more
+
+ # cannot start with * or ?
+ *term1 AND term2
+ ?term3 OR term4
+ *
+
+ # unbounded '*' range terms
+ [* TO Z]
+ [* TO z]
+ [A TO *]
+ [a TO *]
+ [* TO *]
+
+ # unbounded field values
+ foo:*
+ foo:*^2
+ *:foo
+ *:*
+ (*:*)
+ +*:* -*:*
+
a:b:c
a:b:c~
a:b:c*
a:b:c~2.0
+ """
+ z = """
\+blah
\-blah
foo \|| bar
@@ -337,7 +356,10 @@ if __name__ == "__main__":
success1, _ = expression.runTests(tests)
success2, _ = expression.runTests(failtests, failureTests=True)
- print("All tests:", ("FAIL", "OK")[success1 and success2])
+ print("\n")
+ print(f"Success tests: {'OK' if success1 else 'FAIL'}")
+ print(f"Fail tests: {'OK' if success2 else 'FAIL'}")
+ print(f"All tests: {'OK' if (success1 and success2) else 'FAIL'}")
if not (success1 and success2):
import sys
diff --git a/tests/test_examples.py b/tests/test_examples.py
index 3b63a11..9414b09 100644
--- a/tests/test_examples.py
+++ b/tests/test_examples.py
@@ -43,5 +43,5 @@ class TestExamples(unittest.TestCase):
def test_excelExpr(self):
self._run("excelExpr")
- def test_delta_time(self):
- self._run("delta_time")
+ def test_lucene_grammar(self):
+ self._run("lucene_grammar")