summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Alexander Barkov <bar@mnogosearch.org> 2013-12-02 14:39:08 +0400
committer: Alexander Barkov <bar@mnogosearch.org> 2013-12-02 14:39:08 +0400
commit: 5bb01fa1ace1dcfe87c9c1eae3cd30a55c9de032 (patch)
tree: 0e88f2fe808ebab43f6f6c1c4bd314693525455c
parent: d25d7ec589cb83acd00ae2c7251dd851ff3cc1a7 (diff)
download: mariadb-git-5bb01fa1ace1dcfe87c9c1eae3cd30a55c9de032.tar.gz
MDEV-5357 REGEXP word boundaries don't work
Applied a patch from Philip Hazel that implements the non-standard word-boundary syntax ([[:<:]] and [[:>:]]) in PCRE, for compatibility with Henry Spencer's old regex library.
-rw-r--r-- mysql-test/include/ctype_regex_utf8.inc | 24
-rw-r--r-- mysql-test/include/ctype_utf8mb4.inc | 20
-rw-r--r-- mysql-test/r/ctype_utf8.result | 28
-rw-r--r-- mysql-test/r/ctype_utf8mb4.result | 28
-rw-r--r-- mysql-test/r/ctype_utf8mb4_heap.result | 28
-rw-r--r-- mysql-test/r/ctype_utf8mb4_innodb.result | 28
-rw-r--r-- mysql-test/r/ctype_utf8mb4_myisam.result | 28
-rw-r--r-- mysql-test/t/ctype_utf8.test | 20
-rw-r--r-- mysql-test/t/ctype_utf8mb4.test | 20
-rw-r--r-- pcre/pcre_compile.c | 35
-rw-r--r-- pcre/pcre_internal.h | 2
11 files changed, 192 insertions, 69 deletions
diff --git a/mysql-test/include/ctype_regex_utf8.inc b/mysql-test/include/ctype_regex_utf8.inc
new file mode 100644
index 00000000000..d389cb214f7
--- /dev/null
+++ b/mysql-test/include/ctype_regex_utf8.inc
@@ -0,0 +1,24 @@
+#
+# Bug #3928 regexp [[:>:]] and UTF-8
+#
+SELECT @@character_set_client, @@collation_connection;
+
+# This should return TRUE
+select 'вася' rlike '\\bвася\\b';
+select 'вася ' rlike '\\bвася\\b';
+select ' вася' rlike '\\bвася\\b';
+select ' вася ' rlike '\\bвася\\b';
+
+select 'вася' rlike '[[:<:]]вася[[:>:]]';
+select 'вася ' rlike '[[:<:]]вася[[:>:]]';
+select ' вася' rlike '[[:<:]]вася[[:>:]]';
+select ' вася ' rlike '[[:<:]]вася[[:>:]]';
+
+# This should return FALSE
+select 'васяz' rlike '\\bвася\\b';
+select 'zвася' rlike '\\bвася\\b';
+select 'zвасяz' rlike '\\bвася\\b';
+
+select 'васяz' rlike '[[:<:]]вася[[:>:]]';
+select 'zвася' rlike '[[:<:]]вася[[:>:]]';
+select 'zвасяz' rlike '[[:<:]]вася[[:>:]]';
diff --git a/mysql-test/include/ctype_utf8mb4.inc b/mysql-test/include/ctype_utf8mb4.inc
index af3a4564026..9ee2414e142 100644
--- a/mysql-test/include/ctype_utf8mb4.inc
+++ b/mysql-test/include/ctype_utf8mb4.inc
@@ -224,25 +224,9 @@ drop table t1;
#
# Testing regexp
#
-set collation_connection=utf8mb4_general_ci;
---source include/ctype_regex.inc
-set names utf8mb4;
-
-#
-# Bug #3928 regexp [[:>:]] and UTF-8
-#
set names utf8mb4;
-
-# This should return TRUE
-select 'вася' rlike '\\bвася\\b';
-select 'вася ' rlike '\\bвася\\b';
-select ' вася' rlike '\\bвася\\b';
-select ' вася ' rlike '\\bвася\\b';
-
-# This should return FALSE
-select 'васяz' rlike '\\bвася\\b';
-select 'zвася' rlike '\\bвася\\b';
-select 'zвасяz' rlike '\\bвася\\b';
+--source include/ctype_regex.inc
+--source include/ctype_regex_utf8.inc
#
# Bug #4555
diff --git a/mysql-test/r/ctype_utf8.result b/mysql-test/r/ctype_utf8.result
index 7342dffa1e0..f98fe649f80 100644
--- a/mysql-test/r/ctype_utf8.result
+++ b/mysql-test/r/ctype_utf8.result
@@ -270,7 +270,7 @@ b
select * from t1 where a = 'b' and a != 'b';
a
drop table t1;
-set collation_connection=utf8_general_ci;
+set names utf8;
drop table if exists t1;
create table t1 as
select repeat(' ', 64) as s1, repeat(' ',64) as s2
@@ -314,8 +314,9 @@ NULL
NULL
NULL
drop table t1;
-set names utf8;
-set names utf8;
+SELECT @@character_set_client, @@collation_connection;
+@@character_set_client @@collation_connection
+utf8 utf8_general_ci
select 'вася' rlike '\\bвася\\b';
'вася' rlike '\\bвася\\b'
1
@@ -328,6 +329,18 @@ select ' вася' rlike '\\bвася\\b';
select ' вася ' rlike '\\bвася\\b';
' вася ' rlike '\\bвася\\b'
1
+select 'вася' rlike '[[:<:]]вася[[:>:]]';
+'вася' rlike '[[:<:]]вася[[:>:]]'
+1
+select 'вася ' rlike '[[:<:]]вася[[:>:]]';
+'вася ' rlike '[[:<:]]вася[[:>:]]'
+1
+select ' вася' rlike '[[:<:]]вася[[:>:]]';
+' вася' rlike '[[:<:]]вася[[:>:]]'
+1
+select ' вася ' rlike '[[:<:]]вася[[:>:]]';
+' вася ' rlike '[[:<:]]вася[[:>:]]'
+1
select 'васяz' rlike '\\bвася\\b';
'васяz' rlike '\\bвася\\b'
0
@@ -337,6 +350,15 @@ select 'zвася' rlike '\\bвася\\b';
select 'zвасяz' rlike '\\bвася\\b';
'zвасяz' rlike '\\bвася\\b'
0
+select 'васяz' rlike '[[:<:]]вася[[:>:]]';
+'васяz' rlike '[[:<:]]вася[[:>:]]'
+0
+select 'zвася' rlike '[[:<:]]вася[[:>:]]';
+'zвася' rlike '[[:<:]]вася[[:>:]]'
+0
+select 'zвасяz' rlike '[[:<:]]вася[[:>:]]';
+'zвасяz' rlike '[[:<:]]вася[[:>:]]'
+0
CREATE TABLE t1 (a enum ('Y', 'N') DEFAULT 'N' COLLATE utf8_unicode_ci);
ALTER TABLE t1 ADD COLUMN b CHAR(20);
DROP TABLE t1;
diff --git a/mysql-test/r/ctype_utf8mb4.result b/mysql-test/r/ctype_utf8mb4.result
index e9608188e9f..4580d90c5bc 100644
--- a/mysql-test/r/ctype_utf8mb4.result
+++ b/mysql-test/r/ctype_utf8mb4.result
@@ -270,7 +270,7 @@ b
select * from t1 where a = 'b' and a != 'b';
a
drop table t1;
-set collation_connection=utf8mb4_general_ci;
+set names utf8mb4;
drop table if exists t1;
create table t1 as
select repeat(' ', 64) as s1, repeat(' ',64) as s2
@@ -314,8 +314,9 @@ NULL
NULL
NULL
drop table t1;
-set names utf8mb4;
-set names utf8mb4;
+SELECT @@character_set_client, @@collation_connection;
+@@character_set_client @@collation_connection
+utf8mb4 utf8mb4_general_ci
select 'вася' rlike '\\bвася\\b';
'вася' rlike '\\bвася\\b'
1
@@ -328,6 +329,18 @@ select ' вася' rlike '\\bвася\\b';
select ' вася ' rlike '\\bвася\\b';
' вася ' rlike '\\bвася\\b'
1
+select 'вася' rlike '[[:<:]]вася[[:>:]]';
+'вася' rlike '[[:<:]]вася[[:>:]]'
+1
+select 'вася ' rlike '[[:<:]]вася[[:>:]]';
+'вася ' rlike '[[:<:]]вася[[:>:]]'
+1
+select ' вася' rlike '[[:<:]]вася[[:>:]]';
+' вася' rlike '[[:<:]]вася[[:>:]]'
+1
+select ' вася ' rlike '[[:<:]]вася[[:>:]]';
+' вася ' rlike '[[:<:]]вася[[:>:]]'
+1
select 'васяz' rlike '\\bвася\\b';
'васяz' rlike '\\bвася\\b'
0
@@ -337,6 +350,15 @@ select 'zвася' rlike '\\bвася\\b';
select 'zвасяz' rlike '\\bвася\\b';
'zвасяz' rlike '\\bвася\\b'
0
+select 'васяz' rlike '[[:<:]]вася[[:>:]]';
+'васяz' rlike '[[:<:]]вася[[:>:]]'
+0
+select 'zвася' rlike '[[:<:]]вася[[:>:]]';
+'zвася' rlike '[[:<:]]вася[[:>:]]'
+0
+select 'zвасяz' rlike '[[:<:]]вася[[:>:]]';
+'zвасяz' rlike '[[:<:]]вася[[:>:]]'
+0
CREATE TABLE t1 (a enum ('Y', 'N') DEFAULT 'N' COLLATE utf8mb4_unicode_ci);
ALTER TABLE t1 ADD COLUMN b CHAR(20);
DROP TABLE t1;
diff --git a/mysql-test/r/ctype_utf8mb4_heap.result b/mysql-test/r/ctype_utf8mb4_heap.result
index 0ffe26b5f25..bcacdd4e16e 100644
--- a/mysql-test/r/ctype_utf8mb4_heap.result
+++ b/mysql-test/r/ctype_utf8mb4_heap.result
@@ -260,7 +260,7 @@ b
select * from t1 where a = 'b' and a != 'b';
a
drop table t1;
-set collation_connection=utf8mb4_general_ci;
+set names utf8mb4;
drop table if exists t1;
create table t1 as
select repeat(' ', 64) as s1, repeat(' ',64) as s2
@@ -304,8 +304,9 @@ NULL
NULL
NULL
drop table t1;
-set names utf8mb4;
-set names utf8mb4;
+SELECT @@character_set_client, @@collation_connection;
+@@character_set_client @@collation_connection
+utf8mb4 utf8mb4_general_ci
select 'вася' rlike '\\bвася\\b';
'вася' rlike '\\bвася\\b'
1
@@ -318,6 +319,18 @@ select ' вася' rlike '\\bвася\\b';
select ' вася ' rlike '\\bвася\\b';
' вася ' rlike '\\bвася\\b'
1
+select 'вася' rlike '[[:<:]]вася[[:>:]]';
+'вася' rlike '[[:<:]]вася[[:>:]]'
+1
+select 'вася ' rlike '[[:<:]]вася[[:>:]]';
+'вася ' rlike '[[:<:]]вася[[:>:]]'
+1
+select ' вася' rlike '[[:<:]]вася[[:>:]]';
+' вася' rlike '[[:<:]]вася[[:>:]]'
+1
+select ' вася ' rlike '[[:<:]]вася[[:>:]]';
+' вася ' rlike '[[:<:]]вася[[:>:]]'
+1
select 'васяz' rlike '\\bвася\\b';
'васяz' rlike '\\bвася\\b'
0
@@ -327,6 +340,15 @@ select 'zвася' rlike '\\bвася\\b';
select 'zвасяz' rlike '\\bвася\\b';
'zвасяz' rlike '\\bвася\\b'
0
+select 'васяz' rlike '[[:<:]]вася[[:>:]]';
+'васяz' rlike '[[:<:]]вася[[:>:]]'
+0
+select 'zвася' rlike '[[:<:]]вася[[:>:]]';
+'zвася' rlike '[[:<:]]вася[[:>:]]'
+0
+select 'zвасяz' rlike '[[:<:]]вася[[:>:]]';
+'zвасяz' rlike '[[:<:]]вася[[:>:]]'
+0
CREATE TABLE t1 (a enum ('Y', 'N') DEFAULT 'N' COLLATE utf8mb4_unicode_ci) ENGINE heap;
ALTER TABLE t1 ADD COLUMN b CHAR(20);
DROP TABLE t1;
diff --git a/mysql-test/r/ctype_utf8mb4_innodb.result b/mysql-test/r/ctype_utf8mb4_innodb.result
index 3e1554cd0ae..2375ca3bb92 100644
--- a/mysql-test/r/ctype_utf8mb4_innodb.result
+++ b/mysql-test/r/ctype_utf8mb4_innodb.result
@@ -270,7 +270,7 @@ b
select * from t1 where a = 'b' and a != 'b';
a
drop table t1;
-set collation_connection=utf8mb4_general_ci;
+set names utf8mb4;
drop table if exists t1;
create table t1 as
select repeat(' ', 64) as s1, repeat(' ',64) as s2
@@ -314,8 +314,9 @@ NULL
NULL
NULL
drop table t1;
-set names utf8mb4;
-set names utf8mb4;
+SELECT @@character_set_client, @@collation_connection;
+@@character_set_client @@collation_connection
+utf8mb4 utf8mb4_general_ci
select 'вася' rlike '\\bвася\\b';
'вася' rlike '\\bвася\\b'
1
@@ -328,6 +329,18 @@ select ' вася' rlike '\\bвася\\b';
select ' вася ' rlike '\\bвася\\b';
' вася ' rlike '\\bвася\\b'
1
+select 'вася' rlike '[[:<:]]вася[[:>:]]';
+'вася' rlike '[[:<:]]вася[[:>:]]'
+1
+select 'вася ' rlike '[[:<:]]вася[[:>:]]';
+'вася ' rlike '[[:<:]]вася[[:>:]]'
+1
+select ' вася' rlike '[[:<:]]вася[[:>:]]';
+' вася' rlike '[[:<:]]вася[[:>:]]'
+1
+select ' вася ' rlike '[[:<:]]вася[[:>:]]';
+' вася ' rlike '[[:<:]]вася[[:>:]]'
+1
select 'васяz' rlike '\\bвася\\b';
'васяz' rlike '\\bвася\\b'
0
@@ -337,6 +350,15 @@ select 'zвася' rlike '\\bвася\\b';
select 'zвасяz' rlike '\\bвася\\b';
'zвасяz' rlike '\\bвася\\b'
0
+select 'васяz' rlike '[[:<:]]вася[[:>:]]';
+'васяz' rlike '[[:<:]]вася[[:>:]]'
+0
+select 'zвася' rlike '[[:<:]]вася[[:>:]]';
+'zвася' rlike '[[:<:]]вася[[:>:]]'
+0
+select 'zвасяz' rlike '[[:<:]]вася[[:>:]]';
+'zвасяz' rlike '[[:<:]]вася[[:>:]]'
+0
CREATE TABLE t1 (a enum ('Y', 'N') DEFAULT 'N' COLLATE utf8mb4_unicode_ci) ENGINE InnoDB;
ALTER TABLE t1 ADD COLUMN b CHAR(20);
DROP TABLE t1;
diff --git a/mysql-test/r/ctype_utf8mb4_myisam.result b/mysql-test/r/ctype_utf8mb4_myisam.result
index 4efcedf708f..b467f07ece9 100644
--- a/mysql-test/r/ctype_utf8mb4_myisam.result
+++ b/mysql-test/r/ctype_utf8mb4_myisam.result
@@ -270,7 +270,7 @@ b
select * from t1 where a = 'b' and a != 'b';
a
drop table t1;
-set collation_connection=utf8mb4_general_ci;
+set names utf8mb4;
drop table if exists t1;
create table t1 as
select repeat(' ', 64) as s1, repeat(' ',64) as s2
@@ -314,8 +314,9 @@ NULL
NULL
NULL
drop table t1;
-set names utf8mb4;
-set names utf8mb4;
+SELECT @@character_set_client, @@collation_connection;
+@@character_set_client @@collation_connection
+utf8mb4 utf8mb4_general_ci
select 'вася' rlike '\\bвася\\b';
'вася' rlike '\\bвася\\b'
1
@@ -328,6 +329,18 @@ select ' вася' rlike '\\bвася\\b';
select ' вася ' rlike '\\bвася\\b';
' вася ' rlike '\\bвася\\b'
1
+select 'вася' rlike '[[:<:]]вася[[:>:]]';
+'вася' rlike '[[:<:]]вася[[:>:]]'
+1
+select 'вася ' rlike '[[:<:]]вася[[:>:]]';
+'вася ' rlike '[[:<:]]вася[[:>:]]'
+1
+select ' вася' rlike '[[:<:]]вася[[:>:]]';
+' вася' rlike '[[:<:]]вася[[:>:]]'
+1
+select ' вася ' rlike '[[:<:]]вася[[:>:]]';
+' вася ' rlike '[[:<:]]вася[[:>:]]'
+1
select 'васяz' rlike '\\bвася\\b';
'васяz' rlike '\\bвася\\b'
0
@@ -337,6 +350,15 @@ select 'zвася' rlike '\\bвася\\b';
select 'zвасяz' rlike '\\bвася\\b';
'zвасяz' rlike '\\bвася\\b'
0
+select 'васяz' rlike '[[:<:]]вася[[:>:]]';
+'васяz' rlike '[[:<:]]вася[[:>:]]'
+0
+select 'zвася' rlike '[[:<:]]вася[[:>:]]';
+'zвася' rlike '[[:<:]]вася[[:>:]]'
+0
+select 'zвасяz' rlike '[[:<:]]вася[[:>:]]';
+'zвасяz' rlike '[[:<:]]вася[[:>:]]'
+0
CREATE TABLE t1 (a enum ('Y', 'N') DEFAULT 'N' COLLATE utf8mb4_unicode_ci) ENGINE MyISAM;
ALTER TABLE t1 ADD COLUMN b CHAR(20);
DROP TABLE t1;
diff --git a/mysql-test/t/ctype_utf8.test b/mysql-test/t/ctype_utf8.test
index 6f2222b8e45..468804130f4 100644
--- a/mysql-test/t/ctype_utf8.test
+++ b/mysql-test/t/ctype_utf8.test
@@ -199,25 +199,9 @@ drop table t1;
#
# Testing regexp
#
-set collation_connection=utf8_general_ci;
---source include/ctype_regex.inc
-set names utf8;
-
-#
-# Bug #3928 regexp [[:>:]] and UTF-8
-#
set names utf8;
-
-# This should return TRUE
-select 'вася' rlike '\\bвася\\b';
-select 'вася ' rlike '\\bвася\\b';
-select ' вася' rlike '\\bвася\\b';
-select ' вася ' rlike '\\bвася\\b';
-
-# This should return FALSE
-select 'васяz' rlike '\\bвася\\b';
-select 'zвася' rlike '\\bвася\\b';
-select 'zвасяz' rlike '\\bвася\\b';
+--source include/ctype_regex.inc
+--source include/ctype_regex_utf8.inc
#
# Bug #4555
diff --git a/mysql-test/t/ctype_utf8mb4.test b/mysql-test/t/ctype_utf8mb4.test
index 934adb50cca..7a3c67bb417 100644
--- a/mysql-test/t/ctype_utf8mb4.test
+++ b/mysql-test/t/ctype_utf8mb4.test
@@ -197,25 +197,9 @@ drop table t1;
#
# Testing regexp
#
-set collation_connection=utf8mb4_general_ci;
---source include/ctype_regex.inc
-set names utf8mb4;
-
-#
-# Bug #3928 regexp [[:>:]] and UTF-8
-#
set names utf8mb4;
-
-# This should return TRUE
-select 'вася' rlike '\\bвася\\b';
-select 'вася ' rlike '\\bвася\\b';
-select ' вася' rlike '\\bвася\\b';
-select ' вася ' rlike '\\bвася\\b';
-
-# This should return FALSE
-select 'васяz' rlike '\\bвася\\b';
-select 'zвася' rlike '\\bвася\\b';
-select 'zвасяz' rlike '\\bвася\\b';
+--source include/ctype_regex.inc
+--source include/ctype_regex_utf8.inc
#
# Bug #4555
diff --git a/pcre/pcre_compile.c b/pcre/pcre_compile.c
index 0ebb3f168f1..a307372fbfe 100644
--- a/pcre/pcre_compile.c
+++ b/pcre/pcre_compile.c
@@ -253,6 +253,19 @@ static const verbitem verbs[] = {
static const int verbcount = sizeof(verbs)/sizeof(verbitem);
+/* Substitutes for [[:<:]] and [[:>:]], which mean start and end of word in
+another regex library. */
+
+static const pcre_uchar sub_start_of_word[] = {
+ CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
+ CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w, CHAR_RIGHT_PARENTHESIS, '\0' };
+
+static const pcre_uchar sub_end_of_word[] = {
+ CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
+ CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w,
+ CHAR_RIGHT_PARENTHESIS, '\0' };
+
+
/* Tables of names of POSIX character classes and their lengths. The names are
now all in a single string, to reduce the number of relocations when a shared
library is dynamically loaded. The list of lengths is terminated by a zero
@@ -4036,8 +4049,30 @@ for (;; ptr++)
goto FAILED;
}
goto NORMAL_CHAR;
+
+ /* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
+ used for "start of word" and "end of word". As these are otherwise illegal
+ sequences, we don't break anything by recognizing them. They are replaced
+ by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are
+ erroneous and are handled by the normal code below. */
case CHAR_LEFT_SQUARE_BRACKET:
+ if (STRNCMP_UC_C8(ptr+1, STRING_WEIRD_STARTWORD, 6) == 0)
+ {
+ nestptr = ptr + 7;
+ ptr = sub_start_of_word - 1;
+ continue;
+ }
+
+ if (STRNCMP_UC_C8(ptr+1, STRING_WEIRD_ENDWORD, 6) == 0)
+ {
+ nestptr = ptr + 7;
+ ptr = sub_end_of_word - 1;
+ continue;
+ }
+
+ /* Handle a real character class. */
+
previous = code;
/* PCRE supports POSIX class stuff inside a class. Perl gives an error if
diff --git a/pcre/pcre_internal.h b/pcre/pcre_internal.h
index 307069ca9d6..cd6ef3ed83e 100644
--- a/pcre/pcre_internal.h
+++ b/pcre/pcre_internal.h
@@ -1794,6 +1794,8 @@ only. */
#define STRING_xdigit STR_x STR_d STR_i STR_g STR_i STR_t
#define STRING_DEFINE STR_D STR_E STR_F STR_I STR_N STR_E
+#define STRING_WEIRD_STARTWORD STR_LEFT_SQUARE_BRACKET STR_COLON STR_LESS_THAN_SIGN STR_COLON STR_RIGHT_SQUARE_BRACKET STR_RIGHT_SQUARE_BRACKET
+#define STRING_WEIRD_ENDWORD STR_LEFT_SQUARE_BRACKET STR_COLON STR_GREATER_THAN_SIGN STR_COLON STR_RIGHT_SQUARE_BRACKET STR_RIGHT_SQUARE_BRACKET
#define STRING_CR_RIGHTPAR STR_C STR_R STR_RIGHT_PARENTHESIS
#define STRING_LF_RIGHTPAR STR_L STR_F STR_RIGHT_PARENTHESIS