Re-calculate only if run as stand-alone module. Add verbose test output.

git-svn-id: http://svn.code.sf.net/p/docutils/code/trunk/docutils@7650 929543f6-e4f2-0310-98a6-ba3bd3dd1d04
author: milde <milde@929543f6-e4f2-0310-98a6-ba3bd3dd1d04> 2013-04-22 20:08:09 +0000
committer: milde <milde@929543f6-e4f2-0310-98a6-ba3bd3dd1d04> 2013-04-22 20:08:09 +0000
commit: 854b2e9f1bfc86797b51a671f1f0b70090d1c8f6 (patch)
tree: 1f89094b38468e67d7e235e9b7c84e51c06dfaa1
parent: bbe2301aff249a1091f2319cfe9eebbdbab37798 (diff)
download: docutils-854b2e9f1bfc86797b51a671f1f0b70090d1c8f6.tar.gz
1 files changed, 149 insertions, 130 deletions
diff --git a/docutils/utils/punctuation_chars.py b/docutils/utils/punctuation_chars.py
index b8dbe2b43..f30768197 100644
--- a/docutils/utils/punctuation_chars.py
+++ b/docutils/utils/punctuation_chars.py
@@ -29,126 +29,131 @@ import unicodedata
 # --------------------------------------------
 #
 # The sample strings are generated by punctuation_samples() and put here
-# literal to avoid the time-consuming generation with every Docutils
-# run. Running this file as a standalone module checks the definitions below
-# against a re-calculation.
+# literal to avoid the time-consuming generation with every Docutils run. As
+# the samples are used in regular expressions, special characters are escaped.
+# ::
 
 openers = ur"""\"\'\(\<\[\{༺༼᚛⁅⁽₍〈❨❪❬❮❰❲❴⟅⟦⟨⟪⟬⟮⦃⦅⦇⦉⦋⦍⦏⦑⦓⦕⦗⧘⧚⧼⸢⸤⸦⸨〈《「『【〔〖〘〚〝〝﴾︗︵︷︹︻︽︿﹁﹃﹇﹙﹛﹝（［｛｟｢«‘“‹⸂⸄⸉⸌⸜⸠‚„»’”›⸃⸅⸊⸍⸝⸡‛‟"""
 closers = ur"""\"\'\)\>\]\}༻༽᚜⁆⁾₎〉❩❫❭❯❱❳❵⟆⟧⟩⟫⟭⟯⦄⦆⦈⦊⦌⦎⦐⦒⦔⦖⦘⧙⧛⧽⸣⸥⸧⸩〉》」』】〕〗〙〛〞〟﴿︘︶︸︺︼︾﹀﹂﹄﹈﹚﹜﹞）］｝｠｣»’”›⸃⸅⸊⸍⸝⸡‛‟«‘“‹⸂⸄⸉⸌⸜⸠‚„"""
 delimiters = ur"\-\/\:֊־᐀᠆‐‑‒–—―⸗⸚〜〰゠︱︲﹘﹣－¡·¿;·՚՛՜՝՞՟։׀׃׆׳״؉؊،؍؛؞؟٪٫٬٭۔܀܁܂܃܄܅܆܇܈܉܊܋܌܍߷߸߹࠰࠱࠲࠳࠴࠵࠶࠷࠸࠹࠺࠻࠼࠽࠾।॥॰෴๏๚๛༄༅༆༇༈༉༊་༌།༎༏༐༑༒྅࿐࿑࿒࿓࿔၊။၌၍၎၏჻፡።፣፤፥፦፧፨᙭᙮᛫᛬᛭᜵᜶។៕៖៘៙៚᠀᠁᠂᠃᠄᠅᠇᠈᠉᠊᥄᥅᧞᧟᨞᨟᪠᪡᪢᪣᪤᪥᪦᪨᪩᪪᪫᪬᪭᭚᭛᭜᭝᭞᭟᭠᰻᰼᰽᰾᰿᱾᱿᳓‖‗†‡•‣․‥…‧‰‱′″‴‵‶‷‸※‼‽‾⁁⁂⁃⁇⁈⁉⁊⁋⁌⁍⁎⁏⁐⁑⁓⁕⁖⁗⁘⁙⁚⁛⁜⁝⁞⳹⳺⳻⳼⳾⳿⸀⸁⸆⸇⸈⸋⸎⸏⸐⸑⸒⸓⸔⸕⸖⸘⸙⸛⸞⸟⸪⸫⸬⸭⸮⸰⸱、。〃〽・꓾꓿꘍꘎꘏꙳꙾꛲꛳꛴꛵꛶꛷꡴꡵꡶꡷꣎꣏꣸꣹꣺꤮꤯꥟꧁꧂꧃꧄꧅꧆꧇꧈꧉꧊꧋꧌꧍꧞꧟꩜꩝꩞꩟꫞꫟꯫︐︑︒︓︔︕︖︙︰﹅﹆﹉﹊﹋﹌﹐﹑﹒﹔﹕﹖﹗﹟﹠﹡﹨﹪﹫！＂＃％＆＇＊，．／：；？＠＼｡､･𐄀𐄁𐎟𐏐𐡗𐤟𐤿𐩐𐩑𐩒𐩓𐩔𐩕𐩖𐩗𐩘𐩿𐬹𐬺𐬻𐬼𐬽𐬾𐬿𑂻𑂼𑂾𑂿𑃀𑃁𒑰𒑱𒑲𒑳"
 closing_delimiters = ur"\.\,\;\!\?"
 
+# Running this file as a standalone module checks the definitions against a
+# re-calculation::
+
+if __name__ == '__main__':
+
 
 # Unicode punctuation character categories
 # ----------------------------------------
 
-unicode_punctuation_categories = {
-    # 'Pc': 'Connector', # not used in Docutils inline markup recognition
-    'Pd': 'Dash',
-    'Ps': 'Open',
-    'Pe': 'Close',
-    'Pi': 'Initial quote', # may behave like Ps or Pe depending on usage
-    'Pf': 'Final quote', # may behave like Ps or Pe depending on usage
-    'Po': 'Other'
-    }
-"""Unicode character categories for punctuation"""
+    unicode_punctuation_categories = {
+        # 'Pc': 'Connector', # not used in Docutils inline markup recognition
+        'Pd': 'Dash',
+        'Ps': 'Open',
+        'Pe': 'Close',
+        'Pi': 'Initial quote', # may behave like Ps or Pe depending on usage
+        'Pf': 'Final quote', # may behave like Ps or Pe depending on usage
+        'Po': 'Other'
+        }
+    """Unicode character categories for punctuation"""
 
 
 # generate character pattern strings
 # ==================================
 
-def unicode_charlists(categories, cp_min=0, cp_max=None):
-    """Return dictionary of Unicode character lists.
-
-    For each of the `catagories`, an item contains a list with all Unicode
-    characters with `cp_min` <= code-point <= `cp_max` that belong to the
-    category. (The default values check every code-point supported by Python.)
-    """
-    # Determine highest code point with one of the given categories
-    # (may shorten the search time considerably if there are many
-    # categories with not too high characters):
-    if cp_max is None:
-        cp_max = max(x for x in xrange(sys.maxunicode + 1)
-                     if unicodedata.category(unichr(x)) in categories)
-        # print cp_max # => 74867 for unicode_punctuation_categories
-    charlists = {}
-    for cat in categories:
-        charlists[cat] = [unichr(x) for x in xrange(cp_min, cp_max+1)
-                          if unicodedata.category(unichr(x)) == cat]
-    return charlists
+    def unicode_charlists(categories, cp_min=0, cp_max=None):
+        """Return dictionary of Unicode character lists.
+
+        For each of the `categories`, an item contains a list with all Unicode
+        characters with `cp_min` <= code-point <= `cp_max` that belong to the
+        category. (The default values check every code-point supported by Python.)
+        """
+        # Determine highest code point with one of the given categories
+        # (may shorten the search time considerably if there are many
+        # categories with not too high characters):
+        if cp_max is None:
+            cp_max = max(x for x in xrange(sys.maxunicode + 1)
+                        if unicodedata.category(unichr(x)) in categories)
+            # print cp_max # => 74867 for unicode_punctuation_categories
+        charlists = {}
+        for cat in categories:
+            charlists[cat] = [unichr(x) for x in xrange(cp_min, cp_max+1)
+                            if unicodedata.category(unichr(x)) == cat]
+        return charlists
 
 
 # Character categories in Docutils
 # --------------------------------
 
-def punctuation_samples():
+    def punctuation_samples():
 
-    """Docutils punctuation category sample strings.
+        """Docutils punctuation category sample strings.
 
-    Return list of sample strings for the categories "Open", "Close",
-    "Delimiters" and "Closing-Delimiters" used in the `inline markup
-    recognition rules`_.
-    """
+        Return list of sample strings for the categories "Open", "Close",
+        "Delimiters" and "Closing-Delimiters" used in the `inline markup
+        recognition rules`_.
+        """
 
-    # Lists with characters in Unicode punctuation character categories
-    cp_min = 160 # ASCII chars have special rules for backwards compatibility
-    ucharlists = unicode_charlists(unicode_punctuation_categories, cp_min)
+        # Lists with characters in Unicode punctuation character categories
+        cp_min = 160 # ASCII chars have special rules for backwards compatibility
+        ucharlists = unicode_charlists(unicode_punctuation_categories, cp_min)
 
-    # match opening/closing characters
-    # --------------------------------
-    # Rearange the lists to ensure matching characters at the same
-    # index position.
+        # match opening/closing characters
+        # --------------------------------
+        # Rearrange the lists to ensure matching characters at the same
+        # index position.
 
-    # low quotation marks are also used as closers (e.g. in Greek)
-    # move them to category Pi:
-    ucharlists['Ps'].remove(u'‚') # 201A  SINGLE LOW-9 QUOTATION MARK
-    ucharlists['Ps'].remove(u'„') # 201E  DOUBLE LOW-9 QUOTATION MARK
-    ucharlists['Pi'] += [u'‚', u'„']
+        # low quotation marks are also used as closers (e.g. in Greek)
+        # move them to category Pi:
+        ucharlists['Ps'].remove(u'‚') # 201A  SINGLE LOW-9 QUOTATION MARK
+        ucharlists['Ps'].remove(u'„') # 201E  DOUBLE LOW-9 QUOTATION MARK
+        ucharlists['Pi'] += [u'‚', u'„']
 
-    ucharlists['Pi'].remove(u'‛') # 201B  SINGLE HIGH-REVERSED-9 QUOTATION MARK
-    ucharlists['Pi'].remove(u'‟') # 201F  DOUBLE HIGH-REVERSED-9 QUOTATION MARK
-    ucharlists['Pf'] += [u'‛', u'‟']
+        ucharlists['Pi'].remove(u'‛') # 201B  SINGLE HIGH-REVERSED-9 QUOTATION MARK
+        ucharlists['Pi'].remove(u'‟') # 201F  DOUBLE HIGH-REVERSED-9 QUOTATION MARK
+        ucharlists['Pf'] += [u'‛', u'‟']
 
-    # 301F  LOW DOUBLE PRIME QUOTATION MARK misses the opening pendant:
-    ucharlists['Ps'].insert(ucharlists['Pe'].index(u'\u301f'), u'\u301d')
+        # 301F  LOW DOUBLE PRIME QUOTATION MARK misses the opening pendant:
+        ucharlists['Ps'].insert(ucharlists['Pe'].index(u'\u301f'), u'\u301d')
 
-    # print u''.join(ucharlists['Ps']).encode('utf8')
-    # print u''.join(ucharlists['Pe']).encode('utf8')
-    # print u''.join(ucharlists['Pi']).encode('utf8')
-    # print u''.join(ucharlists['Pf']).encode('utf8')
+        # print u''.join(ucharlists['Ps']).encode('utf8')
+        # print u''.join(ucharlists['Pe']).encode('utf8')
+        # print u''.join(ucharlists['Pi']).encode('utf8')
+        # print u''.join(ucharlists['Pf']).encode('utf8')
 
-    # The Docutils character categories
-    # ---------------------------------
-    #
-    # The categorization of ASCII chars is non-standard to reduce both
-    # false positives and need for escaping. (see `inline markup recognition
-    # rules`_)
+        # The Docutils character categories
+        # ---------------------------------
+        #
+        # The categorization of ASCII chars is non-standard to reduce both
+        # false positives and need for escaping. (see `inline markup recognition
+        # rules`_)
 
-    # matching, allowed before markup
-    openers = [re.escape('"\'(<[{')]
-    for cat in ('Ps', 'Pi', 'Pf'):
-        openers.extend(ucharlists[cat])
+        # allowed before markup if there is a matching closer
+        openers = [re.escape('"\'(<[{')]
+        for cat in ('Ps', 'Pi', 'Pf'):
+            openers.extend(ucharlists[cat])
 
-    # matching, allowed after markup
-    closers = [re.escape('"\')>]}')]
-    for cat in ('Pe', 'Pf', 'Pi'):
-        closers.extend(ucharlists[cat])
+        # allowed after markup if there is a matching opener
+        closers = [re.escape('"\')>]}')]
+        for cat in ('Pe', 'Pf', 'Pi'):
+            closers.extend(ucharlists[cat])
 
-    # non-matching, allowed on both sides
-    delimiters = [re.escape('-/:')]
-    for cat in ('Pd', 'Po'):
-        delimiters.extend(ucharlists[cat])
+        # non-matching, allowed on both sides
+        delimiters = [re.escape('-/:')]
+        for cat in ('Pd', 'Po'):
+            delimiters.extend(ucharlists[cat])
 
-    # non-matching, after markup
-    closing_delimiters = [re.escape('.,;!?')]
+        # non-matching, after markup
+        closing_delimiters = [re.escape('.,;!?')]
 
-    # # Test open/close matching:
-    # for i in range(min(len(openers),len(closers))):
-    #     print '%4d    %s    %s' % (i, openers[i].encode('utf8'),
-    #                                closers[i].encode('utf8'))
+        # # Test open/close matching:
+        # for i in range(min(len(openers),len(closers))):
+        #     print '%4d    %s    %s' % (i, openers[i].encode('utf8'),
+        #                                closers[i].encode('utf8'))
 
-    return [u''.join(chars)
-            for chars in (openers, closers, delimiters, closing_delimiters)]
+        return [u''.join(chars)
+                for chars in (openers, closers, delimiters, closing_delimiters)]
 
 
 # Matching open/close quotes
@@ -158,54 +163,68 @@ def punctuation_samples():
 # the pairing of open/close quotes is ambigue due to  different typographic
 # conventions in different languages.
 
-quote_pairs = {u'\xbb': u'\xbb', # Swedish
-               u'\u2018': u'\u201a', # Greek
-               u'\u2019': u'\u2019', # Swedish
-               u'\u201a': u'\u2018\u2019', # German, Polish
-               u'\u201c': u'\u201e', # German
-               u'\u201e': u'\u201c\u201d',
-               u'\u201d': u'\u201d', # Swedish
-               u'\u203a': u'\u203a', # Swedish
-              }
-
-def match_chars(c1, c2):
-    try:
-        i = openers.index(c1)
-    except ValueError:  # c1 not in openers
-        return False
-    return c2 == closers[i] or c2 in quote_pairs.get(c1, '')
-
+    quote_pairs = {u'\xbb': u'\xbb', # Swedish
+                u'\u2018': u'\u201a', # Greek
+                u'\u2019': u'\u2019', # Swedish
+                u'\u201a': u'\u2018\u2019', # German, Polish
+                u'\u201c': u'\u201e', # German
+                u'\u201e': u'\u201c\u201d',
+                u'\u201d': u'\u201d', # Swedish
+                u'\u203a': u'\u203a', # Swedish
+                }
 
+    def match_chars(c1, c2):
+        try:
+            i = openers.index(c1)
+        except ValueError:  # c1 not in openers
+            return False
+        return c2 == closers[i] or c2 in quote_pairs.get(c1, '')
 
 
 # print results
 # =============
 
-if __name__ == '__main__':
-
-    # (re) create and compare the samples:
-    (o, c, d, cd) = punctuation_samples()
-    if o != openers:
-        print '- openers = ur"""%s"""' % openers.encode('utf8')
-        print '+ openers = ur"""%s"""' % o.encode('utf8')
-    if c != closers:
-        print '- closers = ur"""%s"""' % closers.encode('utf8')
-        print '+ closers = ur"""%s"""' % c.encode('utf8')
-    if d != delimiters:
-        print '- delimiters = ur"%s"' % delimiters.encode('utf8')
-        print '+ delimiters = ur"%s"' % d.encode('utf8')
-    if cd != closing_delimiters:
-        print '- closing_delimiters = ur"%s"' % closing_delimiters.encode('utf8')
-        print '+ closing_delimiters = ur"%s"' % cd.encode('utf8')
-
-    # # test prints
-    # print 'openers = ', repr(openers)
-    # print 'closers = ', repr(closers)
-    # print 'delimiters = ', repr(delimiters)
-    # print 'closing_delimiters = ', repr(closing_delimiters)
-
-    # ucharlists = unicode_charlists(unicode_punctuation_categories)
-    # for cat, chars in ucharlists.items():
-    #     # print cat, chars
-    #     # compact output (visible with a comprehensive font):
-    #     print (u":%s: %s" % (cat, u''.join(chars))).encode('utf8')
+# (re) create and compare the samples (top level of the __main__ block):
+
+    (o, c, d, cd) = punctuation_samples()
+    if o != openers:
+        print '- openers = ur"""%s"""' % openers.encode('utf8')
+        print '+ openers = ur"""%s"""' % o.encode('utf8')
+    if c != closers:
+        print '- closers = ur"""%s"""' % closers.encode('utf8')
+        print '+ closers = ur"""%s"""' % c.encode('utf8')
+    if d != delimiters:
+        print '- delimiters = ur"%s"' % delimiters.encode('utf8')
+        print '+ delimiters = ur"%s"' % d.encode('utf8')
+    if cd != closing_delimiters:
+        print '- closing_delimiters = ur"%s"' % closing_delimiters.encode('utf8')
+        print '+ closing_delimiters = ur"%s"' % cd.encode('utf8')
+
+# test prints
+
+    # print 'openers = ', repr(openers)
+    # print 'closers = ', repr(closers)
+    # print 'delimiters = ', repr(delimiters)
+    # print 'closing_delimiters = ', repr(closing_delimiters)
+
+
+    # ucharlists = unicode_charlists(unicode_punctuation_categories)
+    # for cat, chars in ucharlists.items():
+    #     # print cat, chars
+    #     # compact output (visible with a comprehensive font):
+    #     print (u":%s: %s" % (cat, u''.join(chars))).encode('utf8')
+
+# verbose print
+
+    print 'openers:'
+    for ch in openers:
+        print ch.encode('utf8'), unicodedata.name(ch)
+    print 'closers:'
+    for ch in closers:
+        print ch.encode('utf8'), unicodedata.name(ch)
+    print 'delimiters:'
+    for ch in delimiters:
+        print ch.encode('utf8'), unicodedata.name(ch)
+    print 'closing_delimiters:'
+    for ch in closing_delimiters:
+        print ch.encode('utf8'), unicodedata.name(ch)
author	milde <milde@929543f6-e4f2-0310-98a6-ba3bd3dd1d04>	2013-04-22 20:08:09 +0000
committer	milde <milde@929543f6-e4f2-0310-98a6-ba3bd3dd1d04>	2013-04-22 20:08:09 +0000
commit	854b2e9f1bfc86797b51a671f1f0b70090d1c8f6 (patch)
tree	1f89094b38468e67d7e235e9b7c84e51c06dfaa1
parent	bbe2301aff249a1091f2319cfe9eebbdbab37798 (diff)
download	docutils-854b2e9f1bfc86797b51a671f1f0b70090d1c8f6.tar.gz