/* Copyright (c) 2004, 2013, Oracle and/or its affiliates. Copyright (c) 2009, 2021, MariaDB This library is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public License as published by the Free Software Foundation; version 2 of the License. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License for more details. You should have received a copy of the GNU Library General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA */ #ifdef SCANNER_NEXT_NCHARS #define SCANNER_NEXT_RETURN(_w,_n) \ do { weight_and_nchars_t rc= {_w, _n}; return rc; } while(0) #define SCANNER_NEXT_RETURN_CONTRACTION(_cnt,_ignorable_nchars) \ do { \ weight_and_nchars_t rc= { _cnt->weight[0], \ _ignorable_nchars + \ my_contraction_char_length(_cnt) }; \ return rc; \ } while(0) #else #define SCANNER_NEXT_RETURN(_w,_n) do { return _w; } while (0) #define SCANNER_NEXT_RETURN_CONTRACTION(_cnt,_ignorable_nchars) \ do { return _cnt->weight[0]; } while(0) #endif static inline #ifdef SCANNER_NEXT_NCHARS weight_and_nchars_t MY_FUNCTION_NAME(scanner_next_with_nchars)(my_uca_scanner *scanner, const my_uca_scanner_param *param, size_t nchars) #else int MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner, const my_uca_scanner_param *param) #endif { #ifdef SCANNER_NEXT_NCHARS uint ignorable_nchars; #define LOCAL_MAX_CONTRACTION_LENGTH nchars #else #define LOCAL_MAX_CONTRACTION_LENGTH MY_UCA_MAX_CONTRACTION #endif uint16 weight= my_uca_scanner_next_expansion_weight(scanner); if (weight) { /* More weights left from the previous step. Return the next weight from the current expansion. Return "0" as "nchars". The real nchars was set on a previous iteration. */ SCANNER_NEXT_RETURN(weight, 0); } #ifdef SCANNER_NEXT_NCHARS for (ignorable_nchars= 0 ; ; ignorable_nchars++) #else for ( ; ; ) #endif { const uint16 *wpage; int mblen; my_wc_t currwc= 0; const uint16 *cweight; #if MY_UCA_ASCII_OPTIMIZE && !defined(SCANNER_NEXT_NCHARS) if (scanner->sbeg + 1 < scanner->send) { const MY_UCA_2BYTES_ITEM *ww; ww= my_uca_level_booster_2bytes_item_addr_const(param->level->booster, scanner->sbeg[0], scanner->sbeg[1]); if (my_uca_2bytes_item_is_applicable(ww)) { /* Byte pairs that make 2-byte head characters in previous context pairs are marked as not applicable for optimization during the collation initialization. So when we come here sbeg[0] and sbeg[1] are: - either two ASCII characters - or one 2-byte character which IS NOT a previous context head Just remember sbeg[1] as the previous character for simplicity. This may erroneously interpret bytes 0x80..0x9F as previous context head characters U+0080..U+009F. However, CLDR does not have any real collations that use these characters as previous context heads. */ scanner->page= 0; scanner->code= (int) scanner->sbeg[1]; scanner->sbeg+= 2; if ((weight= my_uca_scanner_set_weight(scanner, ww->weight))) { /* TODO: add support for scanner_next_with_nchars and do this: SCANNER_NEXT_RETURN(weight, ignorable_nchars + 1); */ return weight; } continue; /* Ignorable character */ } /* 2 byte optimization is not applicable, go the slow path */ } #endif /* Get next character */ #if MY_UCA_ASCII_OPTIMIZE /* Get next ASCII character */ if (scanner->sbeg < scanner->send && scanner->sbeg[0] < 0x80) { currwc= scanner->sbeg[0]; scanner->sbeg+= 1; #if MY_UCA_COMPILE_CONTRACTIONS if (my_uca_needs_context_handling(param->level, currwc)) { const MY_CONTRACTION *cnt= my_uca_context_weight_find(scanner, param, currwc, LOCAL_MAX_CONTRACTION_LENGTH); if (cnt) { if ((weight= my_uca_scanner_set_weight(scanner, cnt->weight))) SCANNER_NEXT_RETURN_CONTRACTION(cnt, ignorable_nchars); continue; /* Ignorable contraction */ } } #endif scanner->page= 0; scanner->code= (int) currwc; cweight= param->level->weights[0] + scanner->code * param->level->lengths[0]; if ((weight= my_uca_scanner_set_weight(scanner, cweight))) SCANNER_NEXT_RETURN(weight, ignorable_nchars + 1); continue; /* Ignorable character */ } else #endif /* Get next MB character */ if (((mblen= MY_MB_WC(scanner, param, &currwc, scanner->sbeg, scanner->send)) <= 0)) { if (scanner->sbeg >= scanner->send) { /* No more bytes, end of line reached */ SCANNER_NEXT_RETURN(-1, ignorable_nchars); } /* There are some more bytes left. Non-positive mb_len means that we got an incomplete or a bad byte sequence. Consume mbminlen bytes. */ if ((scanner->sbeg+= param->cs->mbminlen) > scanner->send) { /* For safety purposes don't go beyond the string range. */ scanner->sbeg= scanner->send; } /* Treat every complete or incomplete mbminlen unit as a weight which is greater than weight for any possible normal character. 0xFFFF is greater than any possible weight in the UCA weight table. */ SCANNER_NEXT_RETURN(0xFFFF, ignorable_nchars + 1); } scanner->sbeg+= mblen; if (currwc > param->level->maxchar) { SCANNER_NEXT_RETURN(my_uca_scanner_set_weight_outside_maxchar(scanner), ignorable_nchars + 1); } #if MY_UCA_COMPILE_CONTRACTIONS if (my_uca_needs_context_handling(param->level, currwc)) { const MY_CONTRACTION *cnt= my_uca_context_weight_find(scanner, param, currwc, LOCAL_MAX_CONTRACTION_LENGTH); if (cnt) { if ((weight= my_uca_scanner_set_weight(scanner, cnt->weight))) SCANNER_NEXT_RETURN_CONTRACTION(cnt, ignorable_nchars); continue; /* Ignorable contraction */ } } #endif /* Process single character */ scanner->page= currwc >> 8; scanner->code= currwc & 0xFF; /* If weight page for w[0] does not exist, then calculate algoritmically */ if (!(wpage= param->level->weights[scanner->page])) SCANNER_NEXT_RETURN(my_uca_scanner_next_implicit(scanner, param), ignorable_nchars + 1); /* Calculate pointer to w[0]'s weight, using page and offset */ cweight= wpage + scanner->code * param->level->lengths[scanner->page]; if ((weight= my_uca_scanner_set_weight(scanner, cweight))) SCANNER_NEXT_RETURN(weight, ignorable_nchars + 1); continue; /* Ignorable character */ } SCANNER_NEXT_RETURN(0, 0); /* Not reachable */ } #undef SCANNER_NEXT_NCHARS #undef SCANNER_NEXT_RETURN #undef SCANNER_NEXT_RETURN_CONTRACTION #undef LOCAL_MAX_CONTRACTION_LENGTH