summaryrefslogtreecommitdiff
path: root/examples/booleansearchparser.py
diff options
context:
space:
mode:
authorxecgr <francesc.garcia.robert@gmail.com>2019-08-14 05:29:28 +0200
committerPaul McGuire <ptmcg@users.noreply.github.com>2019-08-13 22:29:28 -0500
commit709030db87149be9a2d8c045b6e125ed068a00c2 (patch)
tree180f6f41bb74d0527d71763e0aa24fceb24d9857 /examples/booleansearchparser.py
parent4ff075c8fa7c81657b9bbec34041627c5704317f (diff)
downloadpyparsing-git-709030db87149be9a2d8c045b6e125ed068a00c2.tar.gz
Boolean Search query parser: allows to perform searches with the common boolean search syntax against a text (#21)
* Add files via upload Boolean Search query parser, based on searchparser, that allows to perform searches with the common boolean search syntax against a text (western + non-western alphabets) SAMPLE USAGE: from booleansearchparser import BooleanSearchParser bsp = BooleanSearchParser() text = u"wildcards at the begining of a search term " exprs= [ u"*cards and term", #True u"wild* and term", #True u"not terms", #True u"terms or begin", #False ] for expr in exprs: print bsp.match(text,expr) #non-western samples text = u"안녕하세요, 당신은 어떠세요?" exprs= [ u"*신은 and 어떠세요", #True u"not 당신은", #False u"당신 or 당", #False ] for expr in exprs: print bsp.match(text,expr) * from __future__ import print_function and changing this over to be Python 2/3 compatible * ptmcg conversation issues
Diffstat (limited to 'examples/booleansearchparser.py')
-rw-r--r--examples/booleansearchparser.py394
1 files changed, 394 insertions, 0 deletions
diff --git a/examples/booleansearchparser.py b/examples/booleansearchparser.py
new file mode 100644
index 0000000..79f9d29
--- /dev/null
+++ b/examples/booleansearchparser.py
@@ -0,0 +1,394 @@
+#-*- coding: utf-8 -*-
+# vim:fileencoding=utf-8
+"""
+Boolean Search query parser (Based on searchparser: https://github.com/pyparsing/pyparsing/blob/master/examples/searchparser.py)
+
+version 2018-07-22
+
+This search query parser uses the excellent Pyparsing module
+(http://pyparsing.sourceforge.net/) to parse search queries by users.
+It handles:
+
+* 'and', 'or' and implicit 'and' operators;
+* parentheses;
+* quoted strings;
+* wildcards at the end of a search term (help*);
+* wildcards at the begining of a search term (*lp);
+* non-western languages
+
+Requirements:
+* Python
+* Pyparsing
+
+
+SAMPLE USAGE:
+from booleansearchparser import BooleanSearchParser
+from __future__ import print_function
+bsp = BooleanSearchParser()
+text = u"wildcards at the begining of a search term "
+exprs= [
+ u"*cards and term", #True
+ u"wild* and term", #True
+ u"not terms", #True
+ u"terms or begin", #False
+]
+for expr in exprs:
+ print (bsp.match(text,expr))
+
+#non-western samples
+text = u"안녕하세요, 당신은 어떠세요?"
+exprs= [
+ u"*신은 and 어떠세요", #True
+ u"not 당신은", #False
+ u"당신 or 당", #False
+]
+for expr in exprs:
+ print (bsp.match(text,expr))
+-------------------------------------------------------------------------------
+Copyright (c) 2006, Estrate, the Netherlands
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+* Neither the name of Estrate nor the names of its contributors may be used
+ to endorse or promote products derived from this software without specific
+ prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+CONTRIBUTORS:
+- Steven Mooij
+- Rudolph Froger
+- Paul McGuire
+- Guiem Bosch
+- Francesc Garcia
+
+TODO:
+- add more docs
+- ask someone to check my English texts
+- add more kinds of wildcards ('*' at the beginning and '*' inside a word)?
+
+"""
+from __future__ import print_function
+from pyparsing import Word, alphanums, Keyword, Group, Combine, Forward, Suppress, Optional, OneOrMore, oneOf
+import re
+import string
+alphabet_ranges = [
+ ##CYRILIC: https://en.wikipedia.org/wiki/Cyrillic_(Unicode_block)
+ [int("0400",16), int("04FF",16)],
+ ##THAI: https://en.wikipedia.org/wiki/Thai_(Unicode_block)
+ [int("0E00",16), int("0E7F",16)],
+ ##ARABIC: https://en.wikipedia.org/wiki/Arabic_(Unicode_block) (Arabic (0600–06FF)+ Syriac (0700–074F)+ Arabic Supplement (0750–077F) )
+ [int("0600",16), int("07FF",16)],
+ ##CHINESE: https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
+ [int("0400",16), int("09FF",16)],
+ #JAPANESE : https://en.wikipedia.org/wiki/Japanese_writing_system
+ [int("3040",16), int("30FF",16)],
+ #KOREAN : https://en.wikipedia.org/wiki/Hangul
+ [int("AC00",16), int("D7AF",16)],
+ [int("1100",16), int("11FF",16)],
+ [int("3130",16), int("318F",16)],
+ [int("3200",16), int("32FF",16)],
+ [int("A960",16), int("A97F",16)],
+ [int("D7B0",16), int("D7FF",16)],
+ [int("FF00",16), int("FFEF",16)],
+]
+class BooleanSearchParser:
+
+ def __init__(self,only_parse=False):
+ self._methods = {
+ 'and': self.evaluateAnd,
+ 'or': self.evaluateOr,
+ 'not': self.evaluateNot,
+ 'parenthesis': self.evaluateParenthesis,
+ 'quotes': self.evaluateQuotes,
+ 'word': self.evaluateWord,
+ 'wordwildcardprefix': self.evaluateWordWildcardPrefix,
+ 'wordwildcardsufix': self.evaluateWordWildcardSufix,
+ }
+ self._parser = self.parser()
+ self.text = ''
+ self.words = []
+
+ def parser(self):
+ """
+ This function returns a parser.
+ The grammar should be like most full text search engines (Google, Tsearch, Lucene).
+
+ Grammar:
+ - a query consists of alphanumeric words, with an optional '*'
+ wildcard at the end or the begining of a word
+ - a sequence of words between quotes is a literal string
+ - words can be used together by using operators ('and' or 'or')
+ - words with operators can be grouped with parenthesis
+ - a word or group of words can be preceded by a 'not' operator
+ - the 'and' operator precedes an 'or' operator
+ - if an operator is missing, use an 'and' operator
+ """
+ operatorOr = Forward()
+
+ alphabet = (
+ u'*'+
+ alphanums
+ )
+ #suport for non-wester alphabets
+ for r in alphabet_ranges:
+ alphabet += u''.join(chr(c) for c in range(*r) if not chr(c).isspace())
+
+ operatorWord = Group(
+ Word(alphabet+'*')
+ ).setResultsName('word')
+
+
+ operatorQuotesContent = Forward()
+ operatorQuotesContent << (
+ (operatorWord + operatorQuotesContent) | operatorWord
+ )
+
+ operatorQuotes = Group(
+ Suppress('"') + operatorQuotesContent + Suppress('"')
+ ).setResultsName("quotes") | operatorWord
+
+ operatorParenthesis = Group(
+ (Suppress("(") + operatorOr + Suppress(")"))
+ ).setResultsName("parenthesis") | operatorQuotes
+
+ operatorNot = Forward()
+ operatorNot << (Group(
+ Suppress(Keyword("not", caseless=True)) + operatorNot
+ ).setResultsName("not") | operatorParenthesis)
+
+ operatorAnd = Forward()
+ operatorAnd << (
+ Group(
+ operatorNot + Suppress(Keyword("and", caseless=True)) + operatorAnd
+ ).setResultsName("and")|
+ Group(
+ operatorNot + OneOrMore(~oneOf("and or") + operatorAnd)
+ ).setResultsName("and") |
+ operatorNot
+ )
+
+ operatorOr << (Group(
+ operatorAnd + Suppress(Keyword("or", caseless=True)) + operatorOr
+ ).setResultsName("or") | operatorAnd)
+
+ return operatorOr.parseString
+ def evaluateAnd(self, argument):
+ return self.evaluate(argument[0]) and (self.evaluate(argument[1]))
+
+ def evaluateOr(self, argument):
+ return self.evaluate(argument[0]) or self.evaluate(argument[1])
+
+
+ def evaluateNot(self, argument):
+ return self.GetNot(self.evaluate(argument[0]))
+
+ def evaluateParenthesis(self, argument):
+ return self.evaluate(argument[0])
+
+ def evaluateQuotes(self, argument):
+ """Evaluate quoted strings
+
+ First is does an 'and' on the indidual search terms, then it asks the
+ function GetQuoted to only return the subset of ID's that contain the
+ literal string.
+ """
+ #r = set()
+ r = False
+ search_terms = []
+ for item in argument:
+ search_terms.append(item[0])
+ r = r and self.evaluate(item)
+ return self.GetQuotes(' '.join(search_terms), r)
+
+ def evaluateWord(self, argument):
+ wildcard_count = argument[0].count(u"*")
+ if wildcard_count > 0:
+ if wildcard_count == 1 and argument[0].startswith(u"*"):
+ return self.GetWordWildcard(argument[0][1:], method = "endswith")
+ if wildcard_count == 1 and argument[0].endswith(u"*"):
+ return self.GetWordWildcard(argument[0][:-1], method = "startswith")
+ else:
+ _regex = argument[0].replace(u"*",u".+")
+ matched = False
+ for w in self.words:
+ matched = bool(re.search(_regex,w))
+ if matched:
+ break
+ return matched
+
+ return self.GetWord(argument[0])
+
+ def evaluateWordWildcardPrefix(self, argument):
+ return self.GetWordWildcard(argument[0], method = "endswith")
+
+ def evaluateWordWildcardSufix(self, argument):
+ return self.GetWordWildcard(argument[0], method = "startswith")
+
+ def evaluate(self, argument):
+ return self._methods[argument.getName()](argument)
+
+ def Parse(self, query):
+ return self.evaluate(self._parser(query)[0])
+
+ def GetWord(self, word):
+ return word in self.words
+
+ def GetWordWildcard(self, word, method = "startswith"):
+ matched = False
+ for w in self.words:
+ matched = getattr(w,method)(word)
+ if matched:
+ break
+ return matched
+
+ """
+ def GetKeyword(self, name, value):
+ return set()
+
+ def GetBetween(self, min, max):
+ print (min,max)
+ return set()
+ """
+
+ def GetQuotes(self, search_string, tmp_result):
+ return search_string in self.text
+
+
+ def GetNot(self, not_set):
+ return not not_set
+
+
+ def _split_words(self,text):
+ words = []
+ """
+ >>> import string
+ >>> string.punctuation
+ '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
+ """
+ #it will keep @, # and
+ #usernames and hashtags can contain dots, so a double check is done
+ r = re.compile(r'[\s{}]+'.format(re.escape('!"$%&\'()*+,-/:;<=>?[\\]^`{|}~')))
+ _words = r.split(text)
+ for _w in _words:
+ if '.' in _w and not _w.startswith("#") and not _w.startswith("@"):
+ for __w in _w.split("."):
+ words.append(__w)
+ continue
+
+ words.append(_w)
+
+ return words
+
+ def match(self,text,expr):
+ self.text = text
+ self.words = self._split_words(text)
+
+ return self.Parse(expr)
+
+
+
+
+class ParserTest(BooleanSearchParser):
+ """Tests the parser with some search queries
+ tests containts a dictionary with tests and expected results.
+ """
+
+ def Test(self):
+ exprs = {
+ '0' : 'help',
+ '1' : 'help or hulp',
+ '2' : 'help and hulp',
+ '3' : 'help hulp',
+ '4' : 'help and hulp or hilp',
+ '5' : 'help or hulp and hilp',
+ '6' : 'help or hulp or hilp or halp',
+ '7' : '(help or hulp) and (hilp or halp)',
+ '8' : 'help and (hilp or halp)',
+ '9' : '(help and (hilp or halp)) or hulp',
+ '10': 'not help',
+ '11': 'not hulp and halp',
+ '12': 'not (help and halp)',
+ '13': '"help me please"',
+ '14': '"help me please" or hulp',
+ '15': '"help me please" or (hulp and halp)',
+ '16': 'help*',
+ '17': 'help or hulp*',
+ '18': 'help* and hulp',
+ '19': 'help and hulp* or hilp',
+ '20': 'help* or hulp or hilp or halp',
+ '21': '(help or hulp*) and (hilp* or halp)',
+ '22': 'help* and (hilp* or halp*)',
+ '23': '(help and (hilp* or halp)) or hulp*',
+ '24': 'not help* and halp',
+ '25': 'not (help* and helpe*)',
+ '26': '"help* me please"',
+ '27': '"help* me* please" or hulp*',
+ '28': '"help me please*" or (hulp and halp)',
+ '29': '"help me please" not (hulp and halp)',
+ '30': '"help me please" hulp',
+ '31': 'help and hilp and not holp',
+ '32': 'help hilp not holp',
+ '33': 'help hilp and not holp',
+ '34': '*lp and halp'
+ }
+
+ texts_matcheswith = {
+ "halp thinks he needs help": [
+ "25", "22", "20", "21", "11", "17", "16", "23", "34", "1", "0", "5", "7", "6", "9", "8"
+ ],
+ "he needs halp": [
+ "24", "25", "20", "11", "10", "12", "34", "6"
+ ],
+ "help": [
+ "25", "20", "12", "17", "16", "1", "0", "5", "6"
+ ],
+ "help hilp": [
+ "25", "22", "20", "32", "21", "12", "17", "16", "19", "31", "23", "1", "0", "5", "4", "7", "6", "9", "8", "33"
+ ],
+ "help me please hulp": [
+ "30", "25", "27", "20", "13", "12", "15", "14", "17", "16", "19", "18", "23", "29", "1", "0", "3", "2", "5", "4", "6", "9"
+ ],
+ "helper": [
+ "20", "10", "12", "16"
+ ],
+ "hulp hilp": [
+ "25", "27", "20", "21", "10", "12", "14", "17", "19", "23", "1", "5", "4", "7", "6", "9"
+ ],
+ "nothing": [
+ "25", "10", "12"
+ ]
+ }
+
+
+ all_ok = True
+ for text,matches in texts_matcheswith.items():
+ _matches = []
+ for _id,expr in exprs.items():
+ if self.match(text,expr):
+ _matches.append(_id)
+ all_ok = all_ok and sorted(texts_matcheswith[text])==sorted(_matches)
+
+ return all_ok
+
+if __name__=='__main__':
+ if ParserTest().Test():
+ print ('All tests OK')
+ else:
+ print ('One or more tests FAILED') \ No newline at end of file