diff options
author | xecgr <francesc.garcia.robert@gmail.com> | 2019-08-14 05:29:28 +0200 |
---|---|---|
committer | Paul McGuire <ptmcg@users.noreply.github.com> | 2019-08-13 22:29:28 -0500 |
commit | 709030db87149be9a2d8c045b6e125ed068a00c2 (patch) | |
tree | 180f6f41bb74d0527d71763e0aa24fceb24d9857 /examples/booleansearchparser.py | |
parent | 4ff075c8fa7c81657b9bbec34041627c5704317f (diff) | |
download | pyparsing-git-709030db87149be9a2d8c045b6e125ed068a00c2.tar.gz |
Boolean Search query parser: allows performing searches against a text using the common boolean search syntax (#21)
* Add files via upload
Boolean Search query parser, based on searchparser, that allows performing searches against a text using the common boolean search syntax (Western + non-Western alphabets)
SAMPLE USAGE:
from booleansearchparser import BooleanSearchParser
bsp = BooleanSearchParser()
text = u"wildcards at the begining of a search term "
exprs= [
u"*cards and term", #True
u"wild* and term", #True
u"not terms", #True
u"terms or begin", #False
]
for expr in exprs:
print bsp.match(text,expr)
#non-western samples
text = u"안녕하세요, 당신은 어떠세요?"
exprs= [
u"*신은 and 어떠세요", #True
u"not 당신은", #False
u"당신 or 당", #False
]
for expr in exprs:
print bsp.match(text,expr)
* from __future__ import print_function and changing this over to be Python 2/3 compatible
* ptmcg conversation issues
Diffstat (limited to 'examples/booleansearchparser.py')
-rw-r--r-- | examples/booleansearchparser.py | 394 |
1 files changed, 394 insertions, 0 deletions
diff --git a/examples/booleansearchparser.py b/examples/booleansearchparser.py new file mode 100644 index 0000000..79f9d29 --- /dev/null +++ b/examples/booleansearchparser.py @@ -0,0 +1,394 @@ +#-*- coding: utf-8 -*- +# vim:fileencoding=utf-8 +""" +Boolean Search query parser (Based on searchparser: https://github.com/pyparsing/pyparsing/blob/master/examples/searchparser.py) + +version 2018-07-22 + +This search query parser uses the excellent Pyparsing module +(http://pyparsing.sourceforge.net/) to parse search queries by users. +It handles: + +* 'and', 'or' and implicit 'and' operators; +* parentheses; +* quoted strings; +* wildcards at the end of a search term (help*); +* wildcards at the begining of a search term (*lp); +* non-western languages + +Requirements: +* Python +* Pyparsing + + +SAMPLE USAGE: +from booleansearchparser import BooleanSearchParser +from __future__ import print_function +bsp = BooleanSearchParser() +text = u"wildcards at the begining of a search term " +exprs= [ + u"*cards and term", #True + u"wild* and term", #True + u"not terms", #True + u"terms or begin", #False +] +for expr in exprs: + print (bsp.match(text,expr)) + +#non-western samples +text = u"안녕하세요, 당신은 어떠세요?" +exprs= [ + u"*신은 and 어떠세요", #True + u"not 당신은", #False + u"당신 or 당", #False +] +for expr in exprs: + print (bsp.match(text,expr)) +------------------------------------------------------------------------------- +Copyright (c) 2006, Estrate, the Netherlands +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. 
+* Neither the name of Estrate nor the names of its contributors may be used + to endorse or promote products derived from this software without specific + prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +CONTRIBUTORS: +- Steven Mooij +- Rudolph Froger +- Paul McGuire +- Guiem Bosch +- Francesc Garcia + +TODO: +- add more docs +- ask someone to check my English texts +- add more kinds of wildcards ('*' at the beginning and '*' inside a word)? 
+ +""" +from __future__ import print_function +from pyparsing import Word, alphanums, Keyword, Group, Combine, Forward, Suppress, Optional, OneOrMore, oneOf +import re +import string +alphabet_ranges = [ + ##CYRILIC: https://en.wikipedia.org/wiki/Cyrillic_(Unicode_block) + [int("0400",16), int("04FF",16)], + ##THAI: https://en.wikipedia.org/wiki/Thai_(Unicode_block) + [int("0E00",16), int("0E7F",16)], + ##ARABIC: https://en.wikipedia.org/wiki/Arabic_(Unicode_block) (Arabic (0600–06FF)+ Syriac (0700–074F)+ Arabic Supplement (0750–077F) ) + [int("0600",16), int("07FF",16)], + ##CHINESE: https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + [int("0400",16), int("09FF",16)], + #JAPANESE : https://en.wikipedia.org/wiki/Japanese_writing_system + [int("3040",16), int("30FF",16)], + #KOREAN : https://en.wikipedia.org/wiki/Hangul + [int("AC00",16), int("D7AF",16)], + [int("1100",16), int("11FF",16)], + [int("3130",16), int("318F",16)], + [int("3200",16), int("32FF",16)], + [int("A960",16), int("A97F",16)], + [int("D7B0",16), int("D7FF",16)], + [int("FF00",16), int("FFEF",16)], +] +class BooleanSearchParser: + + def __init__(self,only_parse=False): + self._methods = { + 'and': self.evaluateAnd, + 'or': self.evaluateOr, + 'not': self.evaluateNot, + 'parenthesis': self.evaluateParenthesis, + 'quotes': self.evaluateQuotes, + 'word': self.evaluateWord, + 'wordwildcardprefix': self.evaluateWordWildcardPrefix, + 'wordwildcardsufix': self.evaluateWordWildcardSufix, + } + self._parser = self.parser() + self.text = '' + self.words = [] + + def parser(self): + """ + This function returns a parser. + The grammar should be like most full text search engines (Google, Tsearch, Lucene). 
+ + Grammar: + - a query consists of alphanumeric words, with an optional '*' + wildcard at the end or the begining of a word + - a sequence of words between quotes is a literal string + - words can be used together by using operators ('and' or 'or') + - words with operators can be grouped with parenthesis + - a word or group of words can be preceded by a 'not' operator + - the 'and' operator precedes an 'or' operator + - if an operator is missing, use an 'and' operator + """ + operatorOr = Forward() + + alphabet = ( + u'*'+ + alphanums + ) + #suport for non-wester alphabets + for r in alphabet_ranges: + alphabet += u''.join(chr(c) for c in range(*r) if not chr(c).isspace()) + + operatorWord = Group( + Word(alphabet+'*') + ).setResultsName('word') + + + operatorQuotesContent = Forward() + operatorQuotesContent << ( + (operatorWord + operatorQuotesContent) | operatorWord + ) + + operatorQuotes = Group( + Suppress('"') + operatorQuotesContent + Suppress('"') + ).setResultsName("quotes") | operatorWord + + operatorParenthesis = Group( + (Suppress("(") + operatorOr + Suppress(")")) + ).setResultsName("parenthesis") | operatorQuotes + + operatorNot = Forward() + operatorNot << (Group( + Suppress(Keyword("not", caseless=True)) + operatorNot + ).setResultsName("not") | operatorParenthesis) + + operatorAnd = Forward() + operatorAnd << ( + Group( + operatorNot + Suppress(Keyword("and", caseless=True)) + operatorAnd + ).setResultsName("and")| + Group( + operatorNot + OneOrMore(~oneOf("and or") + operatorAnd) + ).setResultsName("and") | + operatorNot + ) + + operatorOr << (Group( + operatorAnd + Suppress(Keyword("or", caseless=True)) + operatorOr + ).setResultsName("or") | operatorAnd) + + return operatorOr.parseString + def evaluateAnd(self, argument): + return self.evaluate(argument[0]) and (self.evaluate(argument[1])) + + def evaluateOr(self, argument): + return self.evaluate(argument[0]) or self.evaluate(argument[1]) + + + def evaluateNot(self, argument): + return 
self.GetNot(self.evaluate(argument[0])) + + def evaluateParenthesis(self, argument): + return self.evaluate(argument[0]) + + def evaluateQuotes(self, argument): + """Evaluate quoted strings + + First is does an 'and' on the indidual search terms, then it asks the + function GetQuoted to only return the subset of ID's that contain the + literal string. + """ + #r = set() + r = False + search_terms = [] + for item in argument: + search_terms.append(item[0]) + r = r and self.evaluate(item) + return self.GetQuotes(' '.join(search_terms), r) + + def evaluateWord(self, argument): + wildcard_count = argument[0].count(u"*") + if wildcard_count > 0: + if wildcard_count == 1 and argument[0].startswith(u"*"): + return self.GetWordWildcard(argument[0][1:], method = "endswith") + if wildcard_count == 1 and argument[0].endswith(u"*"): + return self.GetWordWildcard(argument[0][:-1], method = "startswith") + else: + _regex = argument[0].replace(u"*",u".+") + matched = False + for w in self.words: + matched = bool(re.search(_regex,w)) + if matched: + break + return matched + + return self.GetWord(argument[0]) + + def evaluateWordWildcardPrefix(self, argument): + return self.GetWordWildcard(argument[0], method = "endswith") + + def evaluateWordWildcardSufix(self, argument): + return self.GetWordWildcard(argument[0], method = "startswith") + + def evaluate(self, argument): + return self._methods[argument.getName()](argument) + + def Parse(self, query): + return self.evaluate(self._parser(query)[0]) + + def GetWord(self, word): + return word in self.words + + def GetWordWildcard(self, word, method = "startswith"): + matched = False + for w in self.words: + matched = getattr(w,method)(word) + if matched: + break + return matched + + """ + def GetKeyword(self, name, value): + return set() + + def GetBetween(self, min, max): + print (min,max) + return set() + """ + + def GetQuotes(self, search_string, tmp_result): + return search_string in self.text + + + def GetNot(self, not_set): + 
return not not_set + + + def _split_words(self,text): + words = [] + """ + >>> import string + >>> string.punctuation + '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~' + """ + #it will keep @, # and + #usernames and hashtags can contain dots, so a double check is done + r = re.compile(r'[\s{}]+'.format(re.escape('!"$%&\'()*+,-/:;<=>?[\\]^`{|}~'))) + _words = r.split(text) + for _w in _words: + if '.' in _w and not _w.startswith("#") and not _w.startswith("@"): + for __w in _w.split("."): + words.append(__w) + continue + + words.append(_w) + + return words + + def match(self,text,expr): + self.text = text + self.words = self._split_words(text) + + return self.Parse(expr) + + + + +class ParserTest(BooleanSearchParser): + """Tests the parser with some search queries + tests containts a dictionary with tests and expected results. + """ + + def Test(self): + exprs = { + '0' : 'help', + '1' : 'help or hulp', + '2' : 'help and hulp', + '3' : 'help hulp', + '4' : 'help and hulp or hilp', + '5' : 'help or hulp and hilp', + '6' : 'help or hulp or hilp or halp', + '7' : '(help or hulp) and (hilp or halp)', + '8' : 'help and (hilp or halp)', + '9' : '(help and (hilp or halp)) or hulp', + '10': 'not help', + '11': 'not hulp and halp', + '12': 'not (help and halp)', + '13': '"help me please"', + '14': '"help me please" or hulp', + '15': '"help me please" or (hulp and halp)', + '16': 'help*', + '17': 'help or hulp*', + '18': 'help* and hulp', + '19': 'help and hulp* or hilp', + '20': 'help* or hulp or hilp or halp', + '21': '(help or hulp*) and (hilp* or halp)', + '22': 'help* and (hilp* or halp*)', + '23': '(help and (hilp* or halp)) or hulp*', + '24': 'not help* and halp', + '25': 'not (help* and helpe*)', + '26': '"help* me please"', + '27': '"help* me* please" or hulp*', + '28': '"help me please*" or (hulp and halp)', + '29': '"help me please" not (hulp and halp)', + '30': '"help me please" hulp', + '31': 'help and hilp and not holp', + '32': 'help hilp not holp', + '33': 'help hilp and 
not holp', + '34': '*lp and halp' + } + + texts_matcheswith = { + "halp thinks he needs help": [ + "25", "22", "20", "21", "11", "17", "16", "23", "34", "1", "0", "5", "7", "6", "9", "8" + ], + "he needs halp": [ + "24", "25", "20", "11", "10", "12", "34", "6" + ], + "help": [ + "25", "20", "12", "17", "16", "1", "0", "5", "6" + ], + "help hilp": [ + "25", "22", "20", "32", "21", "12", "17", "16", "19", "31", "23", "1", "0", "5", "4", "7", "6", "9", "8", "33" + ], + "help me please hulp": [ + "30", "25", "27", "20", "13", "12", "15", "14", "17", "16", "19", "18", "23", "29", "1", "0", "3", "2", "5", "4", "6", "9" + ], + "helper": [ + "20", "10", "12", "16" + ], + "hulp hilp": [ + "25", "27", "20", "21", "10", "12", "14", "17", "19", "23", "1", "5", "4", "7", "6", "9" + ], + "nothing": [ + "25", "10", "12" + ] + } + + + all_ok = True + for text,matches in texts_matcheswith.items(): + _matches = [] + for _id,expr in exprs.items(): + if self.match(text,expr): + _matches.append(_id) + all_ok = all_ok and sorted(texts_matcheswith[text])==sorted(_matches) + + return all_ok + +if __name__=='__main__': + if ParserTest().Test(): + print ('All tests OK') + else: + print ('One or more tests FAILED')
\ No newline at end of file |