summaryrefslogtreecommitdiff
path: root/REORG.TODO/string/strxfrm_l.c
diff options
context:
space:
mode:
Diffstat (limited to 'REORG.TODO/string/strxfrm_l.c')
-rw-r--r--REORG.TODO/string/strxfrm_l.c747
1 files changed, 747 insertions, 0 deletions
diff --git a/REORG.TODO/string/strxfrm_l.c b/REORG.TODO/string/strxfrm_l.c
new file mode 100644
index 0000000000..dd98a4caaf
--- /dev/null
+++ b/REORG.TODO/string/strxfrm_l.c
@@ -0,0 +1,747 @@
+/* Copyright (C) 1995-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Written by Ulrich Drepper <drepper@gnu.org>, 1995.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <assert.h>
+#include <langinfo.h>
+#include <locale.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/param.h>
+
+#ifndef STRING_TYPE
+# define STRING_TYPE char
+# define USTRING_TYPE unsigned char
+# define STRXFRM __strxfrm_l
+# define STRLEN strlen
+# define STPNCPY __stpncpy
+# define WEIGHT_H "../locale/weight.h"
+# define SUFFIX MB
+# define L(arg) arg
+#endif
+
+#define CONCAT(a,b) CONCAT1(a,b)
+#define CONCAT1(a,b) a##b
+
+/* Maximum string size that is calculated with cached indices. Right now this
+ is an arbitrary value open to optimizations. SMALL_STR_SIZE * 4 has to be
+ lower than __MAX_ALLOCA_CUTOFF. Keep localedata/xfrm-test.c in sync. */
+#define SMALL_STR_SIZE 4095
+
+#include "../locale/localeinfo.h"
+#include WEIGHT_H
+
+/* Group locale data for shorter parameter lists. */
+typedef struct
+{
+ uint_fast32_t nrules;
+ unsigned char *rulesets;
+ USTRING_TYPE *weights;
+ int32_t *table;
+ USTRING_TYPE *extra;
+ int32_t *indirect;
+} locale_data_t;
+
+#ifndef WIDE_CHAR_VERSION
+
+/* We need UTF-8 encoding of numbers. */
+static int
+utf8_encode (char *buf, int val)
+{
+ int retval;
+
+ if (val < 0x80)
+ {
+ *buf++ = (char) val;
+ retval = 1;
+ }
+ else
+ {
+ int step;
+
+ for (step = 2; step < 6; ++step)
+ if ((val & (~(uint32_t)0 << (5 * step + 1))) == 0)
+ break;
+ retval = step;
+
+ *buf = (unsigned char) (~0xff >> step);
+ --step;
+ do
+ {
+ buf[step] = 0x80 | (val & 0x3f);
+ val >>= 6;
+ }
+ while (--step > 0);
+ *buf |= val;
+ }
+
+ return retval;
+}
+#endif
+
+/* Find next weight and rule index. Inlined since called for every char. */
+static __always_inline size_t
+find_idx (const USTRING_TYPE **us, int32_t *weight_idx,
+ unsigned char *rule_idx, const locale_data_t *l_data, const int pass)
+{
+ int32_t tmp = findidx (l_data->table, l_data->indirect, l_data->extra, us,
+ -1);
+ *rule_idx = tmp >> 24;
+ int32_t idx = tmp & 0xffffff;
+ size_t len = l_data->weights[idx++];
+
+ /* Skip over indices of previous levels. */
+ for (int i = 0; i < pass; i++)
+ {
+ idx += len;
+ len = l_data->weights[idx++];
+ }
+
+ *weight_idx = idx;
+ return len;
+}
+
+static int
+find_position (const USTRING_TYPE *us, const locale_data_t *l_data,
+ const int pass)
+{
+ int32_t weight_idx;
+ unsigned char rule_idx;
+ const USTRING_TYPE *usrc = us;
+
+ find_idx (&usrc, &weight_idx, &rule_idx, l_data, pass);
+ return l_data->rulesets[rule_idx * l_data->nrules + pass] & sort_position;
+}
+
+/* Do the transformation. */
+static size_t
+do_xfrm (const USTRING_TYPE *usrc, STRING_TYPE *dest, size_t n,
+ const locale_data_t *l_data)
+{
+ int32_t weight_idx;
+ unsigned char rule_idx;
+ uint_fast32_t pass;
+ size_t needed = 0;
+ size_t last_needed;
+
+ /* Now the passes over the weights. */
+ for (pass = 0; pass < l_data->nrules; ++pass)
+ {
+ size_t backw_len = 0;
+ last_needed = needed;
+ const USTRING_TYPE *cur = usrc;
+ const USTRING_TYPE *backw_start = NULL;
+
+ /* We assume that if a rule has defined `position' in one section
+ this is true for all of them. */
+ int position = find_position (cur, l_data, pass);
+
+ if (position == 0)
+ {
+ while (*cur != L('\0'))
+ {
+ const USTRING_TYPE *pos = cur;
+ size_t len = find_idx (&cur, &weight_idx, &rule_idx, l_data,
+ pass);
+ int rule = l_data->rulesets[rule_idx * l_data->nrules + pass];
+
+ if ((rule & sort_forward) != 0)
+ {
+ /* Handle the pushed backward sequence. */
+ if (backw_start != NULL)
+ {
+ for (size_t i = backw_len; i > 0; )
+ {
+ int32_t weight_idx;
+ unsigned char rule_idx;
+ size_t len = find_idx (&backw_start, &weight_idx,
+ &rule_idx, l_data, pass);
+ if (needed + i < n)
+ for (size_t j = len; j > 0; j--)
+ dest[needed + i - j] =
+ l_data->weights[weight_idx++];
+
+ i -= len;
+ }
+
+ needed += backw_len;
+ backw_start = NULL;
+ backw_len = 0;
+ }
+
+ /* Now handle the forward element. */
+ if (needed + len < n)
+ while (len-- > 0)
+ dest[needed++] = l_data->weights[weight_idx++];
+ else
+ /* No more characters fit into the buffer. */
+ needed += len;
+ }
+ else
+ {
+ /* Remember start of the backward sequence & track length. */
+ if (backw_start == NULL)
+ backw_start = pos;
+ backw_len += len;
+ }
+ }
+
+
+ /* Handle the pushed backward sequence. */
+ if (backw_start != NULL)
+ {
+ for (size_t i = backw_len; i > 0; )
+ {
+ size_t len = find_idx (&backw_start, &weight_idx, &rule_idx,
+ l_data, pass);
+ if (needed + i < n)
+ for (size_t j = len; j > 0; j--)
+ dest[needed + i - j] =
+ l_data->weights[weight_idx++];
+
+ i -= len;
+ }
+
+ needed += backw_len;
+ }
+ }
+ else
+ {
+ int val = 1;
+#ifndef WIDE_CHAR_VERSION
+ char buf[7];
+ size_t buflen;
+#endif
+ size_t i;
+
+ while (*cur != L('\0'))
+ {
+ const USTRING_TYPE *pos = cur;
+ size_t len = find_idx (&cur, &weight_idx, &rule_idx, l_data,
+ pass);
+ int rule = l_data->rulesets[rule_idx * l_data->nrules + pass];
+
+ if ((rule & sort_forward) != 0)
+ {
+ /* Handle the pushed backward sequence. */
+ if (backw_start != NULL)
+ {
+ for (size_t p = backw_len; p > 0; p--)
+ {
+ size_t len;
+ int32_t weight_idx;
+ unsigned char rule_idx;
+ const USTRING_TYPE *backw_cur = backw_start;
+
+ /* To prevent a warning init the used vars. */
+ len = find_idx (&backw_cur, &weight_idx,
+ &rule_idx, l_data, pass);
+
+ for (i = 1; i < p; i++)
+ len = find_idx (&backw_cur, &weight_idx,
+ &rule_idx, l_data, pass);
+
+ if (len != 0)
+ {
+#ifdef WIDE_CHAR_VERSION
+ if (needed + 1 + len < n)
+ {
+ dest[needed] = val;
+ for (i = 0; i < len; ++i)
+ dest[needed + 1 + i] =
+ l_data->weights[weight_idx + i];
+ }
+ needed += 1 + len;
+#else
+ buflen = utf8_encode (buf, val);
+ if (needed + buflen + len < n)
+ {
+ for (i = 0; i < buflen; ++i)
+ dest[needed + i] = buf[i];
+ for (i = 0; i < len; ++i)
+ dest[needed + buflen + i] =
+ l_data->weights[weight_idx + i];
+ }
+ needed += buflen + len;
+#endif
+ val = 1;
+ }
+ else
+ ++val;
+ }
+
+ backw_start = NULL;
+ backw_len = 0;
+ }
+
+ /* Now handle the forward element. */
+ if (len != 0)
+ {
+#ifdef WIDE_CHAR_VERSION
+ if (needed + 1 + len < n)
+ {
+ dest[needed] = val;
+ for (i = 0; i < len; ++i)
+ dest[needed + 1 + i] =
+ l_data->weights[weight_idx + i];
+ }
+ needed += 1 + len;
+#else
+ buflen = utf8_encode (buf, val);
+ if (needed + buflen + len < n)
+ {
+ for (i = 0; i < buflen; ++i)
+ dest[needed + i] = buf[i];
+ for (i = 0; i < len; ++i)
+ dest[needed + buflen + i] =
+ l_data->weights[weight_idx + i];
+ }
+ needed += buflen + len;
+#endif
+ val = 1;
+ }
+ else
+ ++val;
+ }
+ else
+ {
+ /* Remember start of the backward sequence & track length. */
+ if (backw_start == NULL)
+ backw_start = pos;
+ backw_len++;
+ }
+ }
+
+ /* Handle the pushed backward sequence. */
+ if (backw_start != NULL)
+ {
+ for (size_t p = backw_len; p > 0; p--)
+ {
+ size_t len;
+ int32_t weight_idx;
+ unsigned char rule_idx;
+ const USTRING_TYPE *backw_cur = backw_start;
+
+ /* To prevent a warning init the used vars. */
+ len = find_idx (&backw_cur, &weight_idx,
+ &rule_idx, l_data, pass);
+
+ for (i = 1; i < p; i++)
+ len = find_idx (&backw_cur, &weight_idx,
+ &rule_idx, l_data, pass);
+
+ if (len != 0)
+ {
+#ifdef WIDE_CHAR_VERSION
+ if (needed + 1 + len < n)
+ {
+ dest[needed] = val;
+ for (i = 0; i < len; ++i)
+ dest[needed + 1 + i] =
+ l_data->weights[weight_idx + i];
+ }
+ needed += 1 + len;
+#else
+ buflen = utf8_encode (buf, val);
+ if (needed + buflen + len < n)
+ {
+ for (i = 0; i < buflen; ++i)
+ dest[needed + i] = buf[i];
+ for (i = 0; i < len; ++i)
+ dest[needed + buflen + i] =
+ l_data->weights[weight_idx + i];
+ }
+ needed += buflen + len;
+#endif
+ val = 1;
+ }
+ else
+ ++val;
+ }
+ }
+ }
+
+ /* Finally store the byte to separate the passes or terminate
+ the string. */
+ if (needed < n)
+ dest[needed] = pass + 1 < l_data->nrules ? L('\1') : L('\0');
+ ++needed;
+ }
+
+ /* This is a little optimization: many collation specifications have
+ a `position' rule at the end and if no non-ignored character
+ is found the last \1 byte is immediately followed by a \0 byte
+ signalling this. We can avoid the \1 byte(s). */
+ if (needed > 2 && needed == last_needed + 1)
+ {
+ /* Remove the \1 byte. */
+ if (--needed <= n)
+ dest[needed - 1] = L('\0');
+ }
+
+ /* Return the number of bytes/words we need, but don't count the NUL
+ byte/word at the end. */
+ return needed - 1;
+}
+
+/* Do the transformation using weight-index and rule cache. */
+static size_t
+do_xfrm_cached (STRING_TYPE *dest, size_t n, const locale_data_t *l_data,
+ size_t idxmax, int32_t *idxarr, const unsigned char *rulearr)
+{
+ uint_fast32_t nrules = l_data->nrules;
+ unsigned char *rulesets = l_data->rulesets;
+ USTRING_TYPE *weights = l_data->weights;
+ uint_fast32_t pass;
+ size_t needed = 0;
+ size_t last_needed;
+ size_t idxcnt;
+
+ /* Now the passes over the weights. */
+ for (pass = 0; pass < nrules; ++pass)
+ {
+ size_t backw_stop = ~0ul;
+ int rule = rulesets[rulearr[0] * nrules + pass];
+ /* We assume that if a rule has defined `position' in one section
+ this is true for all of them. */
+ int position = rule & sort_position;
+
+ last_needed = needed;
+ if (position == 0)
+ {
+ for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
+ {
+ if ((rule & sort_forward) != 0)
+ {
+ size_t len;
+
+ if (backw_stop != ~0ul)
+ {
+ /* Handle the pushed elements now. */
+ size_t backw;
+
+ for (backw = idxcnt; backw > backw_stop; )
+ {
+ --backw;
+ len = weights[idxarr[backw]++];
+
+ if (needed + len < n)
+ while (len-- > 0)
+ dest[needed++] = weights[idxarr[backw]++];
+ else
+ {
+ /* No more characters fit into the buffer. */
+ needed += len;
+ idxarr[backw] += len;
+ }
+ }
+
+ backw_stop = ~0ul;
+ }
+
+ /* Now handle the forward element. */
+ len = weights[idxarr[idxcnt]++];
+ if (needed + len < n)
+ while (len-- > 0)
+ dest[needed++] = weights[idxarr[idxcnt]++];
+ else
+ {
+ /* No more characters fit into the buffer. */
+ needed += len;
+ idxarr[idxcnt] += len;
+ }
+ }
+ else
+ {
+ /* Remember where the backwards series started. */
+ if (backw_stop == ~0ul)
+ backw_stop = idxcnt;
+ }
+
+ rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
+ }
+
+
+ if (backw_stop != ~0ul)
+ {
+ /* Handle the pushed elements now. */
+ size_t backw;
+
+ backw = idxcnt;
+ while (backw > backw_stop)
+ {
+ size_t len = weights[idxarr[--backw]++];
+
+ if (needed + len < n)
+ while (len-- > 0)
+ dest[needed++] = weights[idxarr[backw]++];
+ else
+ {
+ /* No more characters fit into the buffer. */
+ needed += len;
+ idxarr[backw] += len;
+ }
+ }
+ }
+ }
+ else
+ {
+ int val = 1;
+#ifndef WIDE_CHAR_VERSION
+ char buf[7];
+ size_t buflen;
+#endif
+ size_t i;
+
+ for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
+ {
+ if ((rule & sort_forward) != 0)
+ {
+ size_t len;
+
+ if (backw_stop != ~0ul)
+ {
+ /* Handle the pushed elements now. */
+ size_t backw;
+
+ for (backw = idxcnt; backw > backw_stop; )
+ {
+ --backw;
+ len = weights[idxarr[backw]++];
+ if (len != 0)
+ {
+#ifdef WIDE_CHAR_VERSION
+ if (needed + 1 + len < n)
+ {
+ dest[needed] = val;
+ for (i = 0; i < len; ++i)
+ dest[needed + 1 + i] =
+ weights[idxarr[backw] + i];
+ }
+ needed += 1 + len;
+#else
+ buflen = utf8_encode (buf, val);
+ if (needed + buflen + len < n)
+ {
+ for (i = 0; i < buflen; ++i)
+ dest[needed + i] = buf[i];
+ for (i = 0; i < len; ++i)
+ dest[needed + buflen + i] =
+ weights[idxarr[backw] + i];
+ }
+ needed += buflen + len;
+#endif
+ idxarr[backw] += len;
+ val = 1;
+ }
+ else
+ ++val;
+ }
+
+ backw_stop = ~0ul;
+ }
+
+ /* Now handle the forward element. */
+ len = weights[idxarr[idxcnt]++];
+ if (len != 0)
+ {
+#ifdef WIDE_CHAR_VERSION
+ if (needed + 1+ len < n)
+ {
+ dest[needed] = val;
+ for (i = 0; i < len; ++i)
+ dest[needed + 1 + i] =
+ weights[idxarr[idxcnt] + i];
+ }
+ needed += 1 + len;
+#else
+ buflen = utf8_encode (buf, val);
+ if (needed + buflen + len < n)
+ {
+ for (i = 0; i < buflen; ++i)
+ dest[needed + i] = buf[i];
+ for (i = 0; i < len; ++i)
+ dest[needed + buflen + i] =
+ weights[idxarr[idxcnt] + i];
+ }
+ needed += buflen + len;
+#endif
+ idxarr[idxcnt] += len;
+ val = 1;
+ }
+ else
+ /* Note that we don't have to increment `idxarr[idxcnt]'
+ since the length is zero. */
+ ++val;
+ }
+ else
+ {
+ /* Remember where the backwards series started. */
+ if (backw_stop == ~0ul)
+ backw_stop = idxcnt;
+ }
+
+ rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
+ }
+
+ if (backw_stop != ~0ul)
+ {
+ /* Handle the pushed elements now. */
+ size_t backw;
+
+ backw = idxmax - 1;
+ while (backw > backw_stop)
+ {
+ size_t len = weights[idxarr[--backw]++];
+ if (len != 0)
+ {
+#ifdef WIDE_CHAR_VERSION
+ if (needed + 1 + len < n)
+ {
+ dest[needed] = val;
+ for (i = 0; i < len; ++i)
+ dest[needed + 1 + i] =
+ weights[idxarr[backw] + i];
+ }
+ needed += 1 + len;
+#else
+ buflen = utf8_encode (buf, val);
+ if (needed + buflen + len < n)
+ {
+ for (i = 0; i < buflen; ++i)
+ dest[needed + i] = buf[i];
+ for (i = 0; i < len; ++i)
+ dest[needed + buflen + i] =
+ weights[idxarr[backw] + i];
+ }
+ needed += buflen + len;
+#endif
+ idxarr[backw] += len;
+ val = 1;
+ }
+ else
+ ++val;
+ }
+ }
+ }
+
+ /* Finally store the byte to separate the passes or terminate
+ the string. */
+ if (needed < n)
+ dest[needed] = pass + 1 < nrules ? L('\1') : L('\0');
+ ++needed;
+ }
+
+ /* This is a little optimization: many collation specifications have
+ a `position' rule at the end and if no non-ignored character
+ is found the last \1 byte is immediately followed by a \0 byte
+ signalling this. We can avoid the \1 byte(s). */
+ if (needed > 2 && needed == last_needed + 1)
+ {
+ /* Remove the \1 byte. */
+ if (--needed <= n)
+ dest[needed - 1] = L('\0');
+ }
+
+ /* Return the number of bytes/words we need, but don't count the NUL
+ byte/word at the end. */
+ return needed - 1;
+}
+
+size_t
+STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
+{
+ locale_data_t l_data;
+ struct __locale_data *current = l->__locales[LC_COLLATE];
+ l_data.nrules = current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].word;
+
+ /* Handle byte comparison case. */
+ if (l_data.nrules == 0)
+ {
+ size_t srclen = STRLEN (src);
+
+ if (n != 0)
+ STPNCPY (dest, src, MIN (srclen + 1, n));
+
+ return srclen;
+ }
+
+ /* Handle an empty string, code hereafter relies on strlen (src) > 0. */
+ if (*src == L('\0'))
+ {
+ if (n != 0)
+ *dest = L('\0');
+ return 0;
+ }
+
+ /* Get the locale data. */
+ l_data.rulesets = (unsigned char *)
+ current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string;
+ l_data.table = (int32_t *)
+ current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE,SUFFIX))].string;
+ l_data.weights = (USTRING_TYPE *)
+ current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT,SUFFIX))].string;
+ l_data.extra = (USTRING_TYPE *)
+ current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA,SUFFIX))].string;
+ l_data.indirect = (int32_t *)
+ current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT,SUFFIX))].string;
+
+ assert (((uintptr_t) l_data.table) % __alignof__ (l_data.table[0]) == 0);
+ assert (((uintptr_t) l_data.weights) % __alignof__ (l_data.weights[0]) == 0);
+ assert (((uintptr_t) l_data.extra) % __alignof__ (l_data.extra[0]) == 0);
+ assert (((uintptr_t) l_data.indirect) % __alignof__ (l_data.indirect[0]) == 0);
+
+ /* We need the elements of the string as unsigned values since they
+ are used as indeces. */
+ const USTRING_TYPE *usrc = (const USTRING_TYPE *) src;
+
+ /* Allocate cache for small strings on the stack and fill it with weight and
+ rule indices. If the cache size is not sufficient, continue with the
+ uncached xfrm version. */
+ size_t idxmax = 0;
+ const USTRING_TYPE *cur = usrc;
+ int32_t *idxarr = alloca (SMALL_STR_SIZE * sizeof (int32_t));
+ unsigned char *rulearr = alloca (SMALL_STR_SIZE + 1);
+
+ do
+ {
+ int32_t tmp = findidx (l_data.table, l_data.indirect, l_data.extra, &cur,
+ -1);
+ rulearr[idxmax] = tmp >> 24;
+ idxarr[idxmax] = tmp & 0xffffff;
+
+ ++idxmax;
+ }
+ while (*cur != L('\0') && idxmax < SMALL_STR_SIZE);
+
+ /* This element is only read, the value never used but to determine
+ another value which then is ignored. */
+ rulearr[idxmax] = '\0';
+
+ /* Do the transformation. */
+ if (*cur == L('\0'))
+ return do_xfrm_cached (dest, n, &l_data, idxmax, idxarr, rulearr);
+ else
+ return do_xfrm (usrc, dest, n, &l_data);
+}
+libc_hidden_def (STRXFRM)
+
+#ifndef WIDE_CHAR_VERSION
+weak_alias (__strxfrm_l, strxfrm_l)
+#endif