From 6c0cfad17b54029974d4c40503e5146adc994ca9 Mon Sep 17 00:00:00 2001
From: unknown <bar@mysql.com>
Date: Tue, 11 Apr 2006 13:25:02 +0500
Subject: Bug#16233: XML: ExtractValue() fails with special characters

ExtractValue didn't understand tag and attribute names
consisting of "tricky" national letters (e.g. accented Latin letters).
This happened because the XPath lexer recognized only the basic
Latin letters a..z and A..Z as part of an identifier.

Fixed to recognize all letters by means of the new "full ctype"
support, which was added recently.


mysql-test/r/xml.result:
  Adding test case
mysql-test/t/xml.test:
  Adding test case
sql/item_xmlfunc.cc:
  Using the recently implemented "true" ctype functionality
      to treat all national letters as valid tag names.
      Only basic Latin letters worked so far.
strings/ctype-simple.c:
  A bug fix: ctype is an array of 257 elements,
      so add an offset of 1 to address the correct element.
---
 mysql-test/r/xml.result | 23 ++++++++++++++++++
 mysql-test/t/xml.test   | 20 ++++++++++++++++
 sql/item_xmlfunc.cc     | 64 ++++++++++++++++++++-----------------------------
 strings/ctype-simple.c  |  2 +-
 4 files changed, 70 insertions(+), 39 deletions(-)
diff --git a/mysql-test/r/xml.result b/mysql-test/r/xml.result
index 52f80000015..2946c56da6b 100644
--- a/mysql-test/r/xml.result
+++ b/mysql-test/r/xml.result
@@ -615,3 +615,26 @@ select extractValue('<e>1</e>','last()');
 ERROR HY000: XPATH syntax error: ''
 select extractValue('<e><a>1</a></e>','/e/');
 ERROR HY000: XPATH syntax error: ''
+set names utf8;
+select extractValue('<Ñ><r>r</r></Ñ>','/Ñ/r');
+extractValue('<Ñ><r>r</r></Ñ>','/Ñ/r')
+r
+select extractValue('<r><Ñ>Ñ</Ñ></r>','/r/Ñ');
+extractValue('<r><Ñ>Ñ</Ñ></r>','/r/Ñ')
+Ñ
+select extractValue('<Ñ r="r"/>','/Ñ/@r');
+extractValue('<Ñ r="r"/>','/Ñ/@r')
+r
+select extractValue('<r Ñ="Ñ"/>','/r/@Ñ');
+extractValue('<r Ñ="Ñ"/>','/r/@Ñ')
+Ñ
+DROP PROCEDURE IF EXISTS p2;
+CREATE PROCEDURE p2 ()
+BEGIN
+DECLARE p LONGTEXT CHARACTER SET UTF8 DEFAULT '<Ñ><r>A</r></Ñ>';
+SELECT EXTRACTVALUE(p,'/Ñ/r');
+END//
+CALL p2();
+EXTRACTVALUE(p,'/Ñ/r')
+A
+DROP PROCEDURE p2;
diff --git a/mysql-test/t/xml.test b/mysql-test/t/xml.test
index af3ec2d827e..4bc76287fe2 100644
--- a/mysql-test/t/xml.test
+++ b/mysql-test/t/xml.test
@@ -295,3 +295,23 @@ select extractValue('<e>1</e>','last()');
 --error 1105
 select extractValue('<e><a>1</a></e>','/e/');
 
+#
+# Bug#16233: XML: ExtractValue() fails with special characters
+#
+set names utf8;
+select extractValue('<Ñ><r>r</r></Ñ>','/Ñ/r');
+select extractValue('<r><Ñ>Ñ</Ñ></r>','/r/Ñ');
+select extractValue('<Ñ r="r"/>','/Ñ/@r');
+select extractValue('<r Ñ="Ñ"/>','/r/@Ñ');
+--disable_warnings
+DROP PROCEDURE IF EXISTS p2;
+--enable_warnings
+DELIMITER //;
+CREATE PROCEDURE p2 ()
+BEGIN
+ DECLARE p LONGTEXT CHARACTER SET UTF8 DEFAULT '<Ñ><r>A</r></Ñ>';
+ SELECT EXTRACTVALUE(p,'/Ñ/r');
+END//
+DELIMITER ;//
+CALL p2();
+DROP PROCEDURE p2;
diff --git a/sql/item_xmlfunc.cc b/sql/item_xmlfunc.cc
index 91f958d5b70..71900c26c2d 100644
--- a/sql/item_xmlfunc.cc
+++ b/sql/item_xmlfunc.cc
@@ -1304,30 +1304,6 @@ my_xpath_init(MY_XPATH *xpath)
 }
 
 
-/*
-  Some ctype-alike helper functions. Note, we cannot
-  reuse cs->ident_map[], because in Xpath, unlike in SQL, 
-  dash character is a valid identifier part.
-*/
-static int
-my_xident_beg(int c)
-{
-  return (((c) >= 'a' && (c) <= 'z') ||
-          ((c) >= 'A' && (c) <= 'Z') ||
-          ((c) == '_'));
-}
-
-
-static int
-my_xident_body(int c)
-{
-  return (((c) >= 'a' && (c) <= 'z') ||
-          ((c) >= 'A' && (c) <= 'Z') ||
-          ((c) >= '0' && (c) <= '9') ||
-          ((c)=='-') || ((c) == '_'));
-}
-
-
 static int
 my_xdigit(int c)
 {
@@ -1350,7 +1326,7 @@ static void
 my_xpath_lex_scan(MY_XPATH *xpath,
                   MY_XPATH_LEX *lex, const char *beg, const char *end)
 {
-  int ch;
+  int ch, ctype, length;
   for ( ; beg < end && *beg == ' ' ; beg++); // skip leading spaces
   lex->beg= beg;
   
@@ -1360,20 +1336,20 @@ my_xpath_lex_scan(MY_XPATH *xpath,
     lex->term= MY_XPATH_LEX_EOF; // end of line reached
     return;
   }
-  ch= *beg++;
-  
-  if (ch > 0 && ch < 128 && simpletok[ch])
-  {
-    // a token consisting of one character found
-    lex->end= beg;
-    lex->term= ch;
-    return;
-  }
-  
-  if (my_xident_beg(ch)) // ident, or a function call, or a keyword
+
+  // Check ident, or a function call, or a keyword
+  if ((length= xpath->cs->cset->ctype(xpath->cs, &ctype,
+                                      (const uchar*) beg,
+                                      (const uchar*) end)) > 0 &&
+      ((ctype & (_MY_L | _MY_U)) || *beg == '_'))
   {
-    // scan until the end of the identifier
-    for ( ; beg < end && my_xident_body(*beg); beg++);
+    // scan until the end of the identifier
+    for (beg+= length; 
+         (length= xpath->cs->cset->ctype(xpath->cs, &ctype,
+                                         (const uchar*) beg,
+                                         (const uchar*) end)) > 0 &&
+         ((ctype & (_MY_L | _MY_U | _MY_NMR)) || *beg == '_' || *beg == '-') ;
+         beg+= length) /* no op */;
     lex->end= beg;
 
     // check if a function call
@@ -1388,6 +1364,18 @@ my_xpath_lex_scan(MY_XPATH *xpath,
     return;
   }
 
+
+  ch= *beg++;
+  
+  if (ch > 0 && ch < 128 && simpletok[ch])
+  {
+    // a token consisting of one character found
+    lex->end= beg;
+    lex->term= ch;
+    return;
+  }
+
+
   if (my_xdigit(ch)) // a sequence of digits
   {
     for ( ; beg < end && my_xdigit(*beg) ; beg++);
diff --git a/strings/ctype-simple.c b/strings/ctype-simple.c
index 7dd3dfca29a..a9fd5b8852e 100644
--- a/strings/ctype-simple.c
+++ b/strings/ctype-simple.c
@@ -1362,7 +1362,7 @@ int my_mb_ctype_8bit(CHARSET_INFO *cs, int *ctype,
     *ctype= 0;
     return MY_CS_TOOSMALL;
   }
-  *ctype= cs->ctype[*s];
+  *ctype= cs->ctype[*s + 1];
   return 1;
 }
 
-- 
cgit v1.2.1