summaryrefslogtreecommitdiff
path: root/handy.h
diff options
context:
space:
mode:
authorKarl Williamson <khw@cpan.org>2014-08-21 17:29:10 -0600
committerKarl Williamson <khw@cpan.org>2014-08-22 12:14:59 -0600
commit305b86516461e93877909338ac3642c6ac09b651 (patch)
treed5d7b3c47a3af1537ffb549ee3eacf937bcd9571 /handy.h
parentb51533f3c738c0d34d686dc15720c781f1043802 (diff)
downloadperl-305b86516461e93877909338ac3642c6ac09b651.tar.gz
Add and use macros for case-insensitive comparison
This adds to handy.h isALPHA_FOLD_EQ(c1,c2) which efficiently tests if c1 and c2 are the same character, case-insensitively. For example isALPHA_FOLD_EQ(c, 's') returns true if and only if <c> is 's' or 'S'. isALPHA_FOLD_NE() is also added by this commit. At least one of c1 and c2 must be known to be in [A-Za-z] or this macro doesn't work properly. (There is an assert for this in the macro in DEBUGGING builds). That is why the name includes "ALPHA", so you won't forget when using it. This functionality has been in regcomp.c for a while, under a different name. I had thought that the only reason to make it more generally available was potential speed gain, but recent gcc versions optimize to the same code, so I thought there wasn't any point to doing so. But I now think that using this makes things easier to read (and certainly shorter to type in). Once you grok what this macro does, it simplifies what you have to keep in your mind when reading logical expressions with multiple operands. That something can be either upper or lower case can be a distraction to understanding the larger point of the expression.
Diffstat (limited to 'handy.h')
-rw-r--r--handy.h18
1 files changed, 18 insertions, 0 deletions
diff --git a/handy.h b/handy.h
index c5c4d4b100..d4c15a54af 100644
--- a/handy.h
+++ b/handy.h
@@ -1706,6 +1706,24 @@ typedef U32 line_t;
* both ASCII and EBCDIC the last 3 bits of the octal digits range from 0-7. */
#define OCTAL_VALUE(c) (__ASSERT_(isOCTAL(c)) (7 & (c)))
+/* Efficiently returns a boolean as to if two native characters are equivalent
+ * case-insenstively. At least one of the characters must be one of [A-Za-z];
+ * the ALPHA in the name is to remind you of that. This is asserted() in
+ * DEBUGGING builds. Because [A-Za-z] are invariant under UTF-8, this macro
+ * works (on valid input) for both non- and UTF-8-encoded bytes.
+ *
+ * When one of the inputs is a compile-time constant and gets folded by the
+ * compiler, this reduces to an AND and a TEST. On both EBCDIC and ASCII
+ * machines, 'A' and 'a' differ by a single bit; the same with the upper and
+ * lower case of all other ASCII-range alphabetics. On ASCII platforms, they
+ * are 32 apart; on EBCDIC, they are 64. This uses an exclusive 'or' to find
+ * that bit and then inverts it to form a mask, with just a single 0, in the
+ * bit position where the upper- and lowercase differ. */
+#define isALPHA_FOLD_EQ(c1, c2) \
+ (__ASSERT_(isALPHA_A(c1) || isALPHA_A(c2)) \
+ ((c1) & ~('A' ^ 'a')) == ((c2) & ~('A' ^ 'a')))
+#define isALPHA_FOLD_NE(c1, c2) (! isALPHA_FOLD_EQ((c1), (c2)))
+
/*
=head1 Memory Management