# Provenance: trunk/src/examples/btpyparse.py as added in pyparsing commit
# b2c3ade75384efe76b8774b607e17fe98fab92ef (ptmcg, 2016-08-09, tag for the
# 2.1.6 release).
""" Pyparsing parser for BibTeX files

A standalone parser using pyparsing.

pyparsing has a simple and expressive syntax so the grammar is easy to read and
write.

Matthew Brett 2010
Simplified BSD license
"""

from pyparsing import (Regex, Suppress, ZeroOrMore, Group, Optional, Forward,
                       SkipTo, CaselessLiteral, Dict)


class Macro(object):
    """ Class to encapsulate undefined macro references

    Two macros compare equal when their names are equal.  Comparison with
    anything that is not a ``Macro`` is delegated back to Python (returns
    ``NotImplemented``) instead of failing on a missing ``name`` attribute.
    """
    def __init__(self, name):
        self.name = name

    def __repr__(self):
        return 'Macro("%s")' % self.name

    def __eq__(self, other):
        if not isinstance(other, Macro):
            return NotImplemented
        return self.name == other.name

    def __ne__(self, other):
        # Spelled out explicitly because Python 2 does not derive ``!=``
        # from ``__eq__``.
        equal = self.__eq__(other)
        if equal is NotImplemented:
            return equal
        return not equal

    def __hash__(self):
        # Defining __eq__ removes the default hash on Python 3; keep
        # instances usable in sets / as dict keys, consistent with __eq__.
        return hash(self.name)


# Character literals
LCURLY,RCURLY,LPAREN,RPAREN,QUOTE,COMMA,AT,EQUALS,HASH = map(Suppress,'{}()",@=#')


def bracketed(expr):
    """ Return matcher for `expr` between curly brackets or parentheses """
    return (LPAREN + expr + RPAREN) | (LCURLY + expr + RCURLY)


# Define parser components for strings (the hard bit)
chars_no_curly = Regex(r"[^{}]+")
chars_no_curly.leaveWhitespace()
chars_no_quotecurly = Regex(r'[^"{}]+')
chars_no_quotecurly.leaveWhitespace()
# Curly string is some stuff without curlies, or nested curly sequences
curly_string = Forward()
curly_item = Group(curly_string) | chars_no_curly
# `<<=` rather than `<<`: it binds looser than every operator on the right-hand
# side, so the whole expression is always what gets assigned to the Forward.
curly_string <<= LCURLY + ZeroOrMore(curly_item) + RCURLY
# quoted string is either just stuff within quotes, or stuff within quotes, within
# which there is nested curliness
quoted_item = Group(curly_string) | chars_no_quotecurly
quoted_string = QUOTE + ZeroOrMore(quoted_item) + QUOTE

# Numbers can just be numbers. Only integers though.
number = Regex('[0-9]+')

# Basis characters (by exclusion) for variable / field names.  The following
# list of characters is from the btparse documentation
_NAME_CHAR = r"""[^\s"#%'(),={}]"""
any_name = Regex(_NAME_CHAR + '+')

# btparse says, and the test bibs show by experiment, that macro and field names
# cannot start with a digit.  In fact entry type names cannot start with a digit
# either (see tests/bibs).  Cite keys can start with a digit
not_digname = Regex(r"""[^\d\s"#%'(),={}]""" + _NAME_CHAR + '*')

# Comment comments out to end of line
comment = (AT + CaselessLiteral('comment') +
           Regex(r"[\s{(].*").leaveWhitespace())

# The name types with their digiteyness
not_dig_lower = not_digname.copy().setParseAction(lambda t: t[0].lower())
macro_def = not_dig_lower.copy()
macro_ref = not_dig_lower.copy().setParseAction(lambda t : Macro(t[0].lower()))
field_name = not_dig_lower.copy()
# Spaces in names mean they cannot clash with field names
entry_type = not_dig_lower('entry_type')
cite_key = any_name('cite_key')
# Number has to be before macro name
string = (number | macro_ref | quoted_string | curly_string)

# There can be hash concatenation
field_value = string + ZeroOrMore(HASH + string)
field_def = Group(field_name + EQUALS + field_value)
entry_contents = Dict(ZeroOrMore(field_def + COMMA) + Optional(field_def))

# Entry is surrounded either by parentheses or curlies
entry = (AT + entry_type + bracketed(cite_key + COMMA + entry_contents))

# Preamble is a macro-like thing with no name
preamble = AT + CaselessLiteral('preamble') + bracketed(field_value)

# Macros (aka strings)
macro_contents = macro_def + EQUALS + field_value
macro = AT + CaselessLiteral('string') + bracketed(macro_contents)

# Implicit comments: any text up to the next '@'.  The parse action tags the
# tokens in place by inserting the marker string 'icomment' at the front.
icomment = SkipTo('@').setParseAction(lambda t : t.insert(0, 'icomment'))

# entries are last in the list (other than the fallback) because they have
# arbitrary start patterns that would match comments, preamble or macro
definitions = Group(comment |
                    preamble |
                    macro |
                    entry |
                    icomment)

# Start symbol
bibfile = ZeroOrMore(definitions)


def parse_str(str):
    """ Parse BibTeX source text `str` with the `bibfile` grammar

    Returns the pyparsing ``ParseResults`` produced by
    ``bibfile.parseString``; each element is one grouped definition (comment,
    preamble, macro, entry or implicit comment).

    The parameter name shadows the ``str`` builtin; it is kept unchanged so
    callers passing it by keyword continue to work.
    """
    return bibfile.parseString(str)


if __name__ == '__main__':
    # Run basic test
    txt = """
Some introductory text
(implicit comment)

@ARTICLE{Authors2011,
  author = {First Author and Second Author and Third Author},
  title = {An article about {S}omething},
  journal = "Journal of Articles",
  year = {2011},
  volume = {16},
  pages = {1140--1141},
  number = {2}
}
"""
    print('\n\n'.join(defn.dump() for defn in parse_str(txt)))