diff options
author | Daniel Veillard <veillard@redhat.com> | 2010-11-04 15:16:27 +0100 |
---|---|---|
committer | Daniel Veillard <veillard@redhat.com> | 2010-11-04 15:16:27 +0100 |
commit | 60587d6ebd0239c8433119cf4e6399e346009786 (patch) | |
tree | b1f3e7a5474016d14df6954834be07e9a9d93a93 /parser.c | |
parent | 91d239c5cf5acdc078e037689b4574688381f8b1 (diff) | |
download | libxml2-60587d6ebd0239c8433119cf4e6399e346009786.tar.gz |
606592 update language ID parser to RFC 5646
Mostly, except that we keep support for some older constructs and
don't implement extension or privateuse. It's messy because
it's used mostly by the XSD datatype, which itself references RFC 3066
and suggests a lexical space completely different from what
RFC 5646 defines.
Diffstat (limited to 'parser.c')
-rw-r--r-- | parser.c | 196 |
1 files changed, 159 insertions, 37 deletions
@@ -1297,60 +1297,182 @@ xmlCleanSpecialAttr(xmlParserCtxtPtr ctxt) * [37] UserCode ::= ('x' | 'X') '-' ([a-z] | [A-Z])+ * [38] Subcode ::= ([a-z] | [A-Z])+ * + * The current REC reference the sucessors of RFC 1766, currently 5646 + * + * http://www.rfc-editor.org/rfc/rfc5646.txt + * langtag = language + * ["-" script] + * ["-" region] + * *("-" variant) + * *("-" extension) + * ["-" privateuse] + * language = 2*3ALPHA ; shortest ISO 639 code + * ["-" extlang] ; sometimes followed by + * ; extended language subtags + * / 4ALPHA ; or reserved for future use + * / 5*8ALPHA ; or registered language subtag + * + * extlang = 3ALPHA ; selected ISO 639 codes + * *2("-" 3ALPHA) ; permanently reserved + * + * script = 4ALPHA ; ISO 15924 code + * + * region = 2ALPHA ; ISO 3166-1 code + * / 3DIGIT ; UN M.49 code + * + * variant = 5*8alphanum ; registered variants + * / (DIGIT 3alphanum) + * + * extension = singleton 1*("-" (2*8alphanum)) + * + * ; Single alphanumerics + * ; "x" reserved for private use + * singleton = DIGIT ; 0 - 9 + * / %x41-57 ; A - W + * / %x59-5A ; Y - Z + * / %x61-77 ; a - w + * / %x79-7A ; y - z + * + * it sounds right to still allow Irregular i-xxx IANA and user codes too + * The parser below doesn't try to cope with extension or privateuse + * that could be added but that's not interoperable anyway + * * Returns 1 if correct 0 otherwise **/ int xmlCheckLanguageID(const xmlChar * lang) { - const xmlChar *cur = lang; + const xmlChar *cur = lang, *nxt; if (cur == NULL) return (0); if (((cur[0] == 'i') && (cur[1] == '-')) || - ((cur[0] == 'I') && (cur[1] == '-'))) { - /* - * IANA code - */ - cur += 2; - while (((cur[0] >= 'A') && (cur[0] <= 'Z')) || /* non input consuming */ - ((cur[0] >= 'a') && (cur[0] <= 'z'))) - cur++; - } else if (((cur[0] == 'x') && (cur[1] == '-')) || - ((cur[0] == 'X') && (cur[1] == '-'))) { + ((cur[0] == 'I') && (cur[1] == '-')) || + ((cur[0] == 'x') && (cur[1] == '-')) || + ((cur[0] == 'X') && (cur[1] == '-'))) { /* - * User 
code + * Still allow IANA code and user code which were coming + * from the previous version of the XML-1.0 specification + * it's deprecated but we should not fail */ cur += 2; - while (((cur[0] >= 'A') && (cur[0] <= 'Z')) || /* non input consuming */ + while (((cur[0] >= 'A') && (cur[0] <= 'Z')) || ((cur[0] >= 'a') && (cur[0] <= 'z'))) cur++; - } else if (((cur[0] >= 'A') && (cur[0] <= 'Z')) || - ((cur[0] >= 'a') && (cur[0] <= 'z'))) { + return(cur[0] == 0); + } + nxt = cur; + while (((nxt[0] >= 'A') && (nxt[0] <= 'Z')) || + ((nxt[0] >= 'a') && (nxt[0] <= 'z'))) + nxt++; + if (nxt - cur >= 4) { /* - * ISO639 + * Reserved */ - cur++; - if (((cur[0] >= 'A') && (cur[0] <= 'Z')) || - ((cur[0] >= 'a') && (cur[0] <= 'z'))) - cur++; - else - return (0); - } else - return (0); - while (cur[0] != 0) { /* non input consuming */ - if (cur[0] != '-') - return (0); - cur++; - if (((cur[0] >= 'A') && (cur[0] <= 'Z')) || - ((cur[0] >= 'a') && (cur[0] <= 'z'))) - cur++; - else - return (0); - while (((cur[0] >= 'A') && (cur[0] <= 'Z')) || /* non input consuming */ - ((cur[0] >= 'a') && (cur[0] <= 'z'))) - cur++; + if ((nxt - cur > 8) || (nxt[0] != 0)) + return(0); + return(1); } + if (nxt - cur < 2) + return(0); + /* we got an ISO 639 code */ + if (nxt[0] == 0) + return(1); + if (nxt[0] != '-') + return(0); + + nxt++; + cur = nxt; + /* now we can have extlang or script or region or variant */ + if ((nxt[0] >= '0') && (nxt[0] <= '9')) + goto region_m49; + + while (((nxt[0] >= 'A') && (nxt[0] <= 'Z')) || + ((nxt[0] >= 'a') && (nxt[0] <= 'z'))) + nxt++; + if (nxt - cur == 4) + goto script; + if (nxt - cur == 2) + goto region; + if ((nxt - cur >= 5) && (nxt - cur <= 8)) + goto variant; + if (nxt - cur != 3) + return(0); + /* we parsed an extlang */ + if (nxt[0] == 0) + return(1); + if (nxt[0] != '-') + return(0); + + nxt++; + cur = nxt; + /* now we can have script or region or variant */ + if ((nxt[0] >= '0') && (nxt[0] <= '9')) + goto region_m49; + + while (((nxt[0] >= 'A') && 
(nxt[0] <= 'Z')) || + ((nxt[0] >= 'a') && (nxt[0] <= 'z'))) + nxt++; + if (nxt - cur == 2) + goto region; + if ((nxt - cur >= 5) && (nxt - cur <= 8)) + goto variant; + if (nxt - cur != 4) + return(0); + /* we parsed a script */ +script: + if (nxt[0] == 0) + return(1); + if (nxt[0] != '-') + return(0); + + nxt++; + cur = nxt; + /* now we can have region or variant */ + if ((nxt[0] >= '0') && (nxt[0] <= '9')) + goto region_m49; + + while (((nxt[0] >= 'A') && (nxt[0] <= 'Z')) || + ((nxt[0] >= 'a') && (nxt[0] <= 'z'))) + nxt++; + + if ((nxt - cur >= 5) && (nxt - cur <= 8)) + goto variant; + if (nxt - cur != 2) + return(0); + /* we parsed a region */ +region: + if (nxt[0] == 0) + return(1); + if (nxt[0] != '-') + return(0); + + nxt++; + cur = nxt; + /* now we can just have a variant */ + while (((nxt[0] >= 'A') && (nxt[0] <= 'Z')) || + ((nxt[0] >= 'a') && (nxt[0] <= 'z'))) + nxt++; + + if ((nxt - cur < 5) || (nxt - cur > 8)) + return(0); + + /* we parsed a variant */ +variant: + if (nxt[0] == 0) + return(1); + if (nxt[0] != '-') + return(0); + /* extensions and private use subtags not checked */ return (1); + +region_m49: + if (((nxt[1] >= '0') && (nxt[1] <= '9')) && + ((nxt[2] >= '0') && (nxt[2] <= '9'))) { + nxt += 3; + goto region; + } + return(0); } /************************************************************************ @@ -11567,7 +11689,7 @@ xmldecl_done: * if size is greater than len. Otherwise, memmove in xmlBufferAdd * will blindly copy extra bytes from memory. */ - if (size > len) { + if ((unsigned int) size > len) { remain = size - len; size = len; } else { |