summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Ulrich Drepper <drepper@redhat.com> 2003-11-29 06:40:52 +0000
committer Ulrich Drepper <drepper@redhat.com> 2003-11-29 06:40:52 +0000
commit 37369d1cefec4d1990e59fbec613d2bc66e68eba (patch)
tree 86439ecf806f00f3350470f99fb8d700b1cbe2a8
parent bb3f4825c411e676c51479fea59643af540810b5 (diff)
download glibc-37369d1cefec4d1990e59fbec613d2bc66e68eba.tar.gz
Update.
* misc/mntent_r.c (decode_name): Fix decoding of tab, add decoding of newline.
* manual/sysinfo.texi (mtab): Adjust description accordingly.
Reported by Andries.Brouwer@cwi.nl.
-rw-r--r-- ChangeLog 5
-rw-r--r-- manual/sysinfo.texi 13
-rw-r--r-- misc/mntent_r.c 8
-rw-r--r-- posix/regex_internal.c 460
4 files changed, 360 insertions, 126 deletions
diff --git a/ChangeLog b/ChangeLog
index a04f6895e0..8d963e4618 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,10 @@
2003-11-28 Ulrich Drepper <drepper@redhat.com>
+ * misc/mntent_r.c (decode_name): Fix decoding of tab, add decoding
+ of newline.
+ * manual/sysinfo.texi (mtab): Adjust description accordingly.
+ Reported by Andries.Brouwer@cwi.nl.
+
* sysdeps/x86_64/fpu/libm-test-ulps: Add some more minor changes
to compensate other setup.
diff --git a/manual/sysinfo.texi b/manual/sysinfo.texi
index a310240801..0a44830359 100644
--- a/manual/sysinfo.texi
+++ b/manual/sysinfo.texi
@@ -673,12 +673,13 @@ filled with the information from the next entry from the file currently
read.
The file format used prescribes the use of spaces or tab characters to
-separate the fields. This makes it harder to use name containing one of
-these characters (e.g., mount points using spaces). Therefore these
-characters are encoded in the files and the @code{getmntent} function
-takes care of the decoding while reading the entries back in.
-@code{'\040'} is used to encode a space character, @code{'\012'} to
-encode a tab character and @code{'\\'} to encode a backslash.
+separate the fields. This makes it harder to use names containing one
+of these characters (e.g., mount points using spaces). Therefore
+these characters are encoded in the files and the @code{getmntent}
+function takes care of the decoding while reading the entries back in.
+@code{'\040'} is used to encode a space character, @code{'\011'} to
+encode a tab character, @code{'\012'} to encode a newline character,
+and @code{'\\'} to encode a backslash.
If there was an error or the end of the file is reached the return value
is @code{NULL}.
diff --git a/misc/mntent_r.c b/misc/mntent_r.c
index 9bfe8756f1..3a47f61f38 100644
--- a/misc/mntent_r.c
+++ b/misc/mntent_r.c
@@ -84,12 +84,18 @@ decode_name (char *buf)
*wp++ = ' ';
rp += 3;
}
- else if (rp[0] == '\\' && rp[1] == '0' && rp[2] == '1' && rp[3] == '2')
+ else if (rp[0] == '\\' && rp[1] == '0' && rp[2] == '1' && rp[3] == '1')
{
/* \011 is a TAB. */
*wp++ = '\t';
rp += 3;
}
+ else if (rp[0] == '\\' && rp[1] == '0' && rp[2] == '1' && rp[3] == '2')
+ {
+ /* \012 is a NEWLINE. */
+ *wp++ = '\n';
+ rp += 3;
+ }
else if (rp[0] == '\\' && rp[1] == '\\')
{
/* We have to escape \\ to be able to represent all characters. */
diff --git a/posix/regex_internal.c b/posix/regex_internal.c
index f78ec79e65..1fd3e164ef 100644
--- a/posix/regex_internal.c
+++ b/posix/regex_internal.c
@@ -62,17 +62,14 @@ re_string_allocate (pstr, str, len, init_len, trans, icase, dfa)
init_len = dfa->mb_cur_max;
init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
re_string_construct_common (str, len, pstr, trans, icase, dfa);
- pstr->stop = pstr->len;
ret = re_string_realloc_buffers (pstr, init_buf_len);
if (BE (ret != REG_NOERROR, 0))
return ret;
- pstr->mbs_case = (MBS_CASE_ALLOCATED (pstr) ? pstr->mbs_case
- : (unsigned char *) str);
- pstr->mbs = MBS_ALLOCATED (pstr) ? pstr->mbs : pstr->mbs_case;
- pstr->valid_len = (MBS_CASE_ALLOCATED (pstr) || MBS_ALLOCATED (pstr)
- || dfa->mb_cur_max > 1) ? pstr->valid_len : len;
+ pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
+ pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
+ pstr->valid_raw_len = pstr->valid_len;
return REG_NOERROR;
}
@@ -88,9 +85,6 @@ re_string_construct (pstr, str, len, trans, icase, dfa)
{
reg_errcode_t ret;
re_string_construct_common (str, len, pstr, trans, icase, dfa);
- pstr->stop = pstr->len;
- /* Set 0 so that this function can initialize whole buffers. */
- pstr->valid_len = 0;
if (len > 0)
{
@@ -98,15 +92,27 @@ re_string_construct (pstr, str, len, trans, icase, dfa)
if (BE (ret != REG_NOERROR, 0))
return ret;
}
- pstr->mbs_case = (MBS_CASE_ALLOCATED (pstr) ? pstr->mbs_case
- : (unsigned char *) str);
- pstr->mbs = MBS_ALLOCATED (pstr) ? pstr->mbs : pstr->mbs_case;
+ pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
if (icase)
{
#ifdef RE_ENABLE_I18N
if (dfa->mb_cur_max > 1)
- build_wcs_upper_buffer (pstr);
+ {
+ while (1)
+ {
+ ret = build_wcs_upper_buffer (pstr);
+ if (BE (ret != REG_NOERROR, 0))
+ return ret;
+ if (pstr->valid_raw_len >= len)
+ break;
+ if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
+ break;
+ ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
+ if (BE (ret != REG_NOERROR, 0))
+ return ret;
+ }
+ }
else
#endif /* RE_ENABLE_I18N */
build_upper_buffer (pstr);
@@ -122,12 +128,13 @@ re_string_construct (pstr, str, len, trans, icase, dfa)
if (trans != NULL)
re_string_translate_buffer (pstr);
else
- pstr->valid_len = len;
+ {
+ pstr->valid_len = pstr->bufs_len;
+ pstr->valid_raw_len = pstr->bufs_len;
+ }
}
}
- /* Initialized whole buffers, then valid_len == bufs_len. */
- pstr->valid_len = pstr->bufs_len;
return REG_NOERROR;
}
@@ -145,9 +152,16 @@ re_string_realloc_buffers (pstr, new_buf_len)
if (BE (new_array == NULL, 0))
return REG_ESPACE;
pstr->wcs = new_array;
+ if (pstr->offsets != NULL)
+ {
+ int *new_array = re_realloc (pstr->offsets, int, new_buf_len);
+ if (BE (new_array == NULL, 0))
+ return REG_ESPACE;
+ pstr->offsets = new_array;
+ }
}
#endif /* RE_ENABLE_I18N */
- if (MBS_ALLOCATED (pstr))
+ if (pstr->mbs_allocated)
{
unsigned char *new_array = re_realloc (pstr->mbs, unsigned char,
new_buf_len);
@@ -155,16 +169,6 @@ re_string_realloc_buffers (pstr, new_buf_len)
return REG_ESPACE;
pstr->mbs = new_array;
}
- if (MBS_CASE_ALLOCATED (pstr))
- {
- unsigned char *new_array = re_realloc (pstr->mbs_case, unsigned char,
- new_buf_len);
- if (BE (new_array == NULL, 0))
- return REG_ESPACE;
- pstr->mbs_case = new_array;
- if (!MBS_ALLOCATED (pstr))
- pstr->mbs = pstr->mbs_case;
- }
pstr->bufs_len = new_buf_len;
return REG_NOERROR;
}
@@ -182,11 +186,15 @@ re_string_construct_common (str, len, pstr, trans, icase, dfa)
memset (pstr, '\0', sizeof (re_string_t));
pstr->raw_mbs = (const unsigned char *) str;
pstr->len = len;
+ pstr->raw_len = len;
pstr->trans = trans;
pstr->icase = icase ? 1 : 0;
+ pstr->mbs_allocated = (trans != NULL || icase);
pstr->mb_cur_max = dfa->mb_cur_max;
pstr->is_utf8 = dfa->is_utf8;
pstr->map_notascii = dfa->map_notascii;
+ pstr->stop = pstr->len;
+ pstr->raw_stop = pstr->stop;
}
#ifdef RE_ENABLE_I18N
@@ -206,18 +214,39 @@ static void
build_wcs_buffer (pstr)
re_string_t *pstr;
{
+#ifdef _LIBC
+ unsigned char buf[pstr->mb_cur_max];
+#else
+ unsigned char buf[64];
+#endif
mbstate_t prev_st;
int byte_idx, end_idx, mbclen, remain_len;
+
/* Build the buffers from pstr->valid_len to either pstr->len or
pstr->bufs_len. */
- end_idx = (pstr->bufs_len > pstr->len)? pstr->len : pstr->bufs_len;
+ end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
{
wchar_t wc;
+ const char *p;
+
remain_len = end_idx - byte_idx;
prev_st = pstr->cur_state;
- mbclen = mbrtowc (&wc, ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
- + byte_idx), remain_len, &pstr->cur_state);
+ /* Apply the translation if we need. */
+ if (BE (pstr->trans != NULL, 0))
+ {
+ int i, ch;
+
+ for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
+ {
+ ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
+ buf[i] = pstr->trans[ch];
+ }
+ p = (const char *) buf;
+ }
+ else
+ p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
+ mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
if (BE (mbclen == (size_t) -2, 0))
{
/* The buffer doesn't have enough space, finish to build. */
@@ -229,15 +258,11 @@ build_wcs_buffer (pstr)
/* We treat these cases as a singlebyte character. */
mbclen = 1;
wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
+ if (BE (pstr->trans != NULL, 0))
+ wc = pstr->trans[wc];
pstr->cur_state = prev_st;
}
- /* Apply the translation if we need. */
- if (pstr->trans != NULL && mbclen == 1)
- {
- int ch = pstr->trans[pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]];
- pstr->mbs_case[byte_idx] = ch;
- }
/* Write wide character and padding. */
pstr->wcs[byte_idx++] = wc;
/* Write paddings. */
@@ -245,61 +270,83 @@ build_wcs_buffer (pstr)
pstr->wcs[byte_idx++] = WEOF;
}
pstr->valid_len = byte_idx;
+ pstr->valid_raw_len = byte_idx;
}
/* Build wide character buffer PSTR->WCS like build_wcs_buffer,
but for REG_ICASE. */
-static void
+static int
build_wcs_upper_buffer (pstr)
re_string_t *pstr;
{
mbstate_t prev_st;
- int byte_idx, end_idx, mbclen, remain_len;
- /* Build the buffers from pstr->valid_len to either pstr->len or
- pstr->bufs_len. */
- end_idx = (pstr->bufs_len > pstr->len)? pstr->len : pstr->bufs_len;
+ int src_idx, byte_idx, end_idx, mbclen, remain_len;
+#ifdef _LIBC
+ unsigned char buf[pstr->mb_cur_max];
+#else
+ unsigned char buf[64];
+#endif
+
+ byte_idx = pstr->valid_len;
+ end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
#ifdef _LIBC
/* The following optimization assumes that the wchar_t encoding is
always ISO 10646. */
- if (! pstr->map_notascii && pstr->trans == NULL)
- for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
- if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
- && mbsinit (&pstr->cur_state))
- {
- /* In case of a singlebyte character. */
- pstr->mbs[byte_idx]
- = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
- /* The next step uses the assumption that wchar_t is encoded
- with ISO 10646: all ASCII values can be converted like this. */
- pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
- ++byte_idx;
- }
- else
+ if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
+ {
+ while (byte_idx < end_idx)
{
wchar_t wc;
+
+ if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
+ && mbsinit (&pstr->cur_state))
+ {
+ /* In case of a singlebyte character. */
+ pstr->mbs[byte_idx]
+ = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
+ /* The next step uses the assumption that wchar_t is encoded
+ with ISO 10646: all ASCII values can be converted like
+ this. */
+ pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
+ ++byte_idx;
+ continue;
+ }
+
remain_len = end_idx - byte_idx;
prev_st = pstr->cur_state;
mbclen = mbrtowc (&wc,
((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
+ byte_idx), remain_len, &pstr->cur_state);
- if (BE (mbclen > 1, 1))
+ if (BE (mbclen > 0, 1))
{
+ wchar_t wcu = wc;
if (iswlower (wc))
- wcrtomb ((char *) pstr->mbs + byte_idx, towupper (wc),
- &prev_st);
+ {
+ int mbcdlen;
+
+ wcu = towupper (wc);
+ mbcdlen = wcrtomb (buf, wcu, &prev_st);
+ if (BE (mbclen == mbcdlen, 1))
+ memcpy (pstr->mbs + byte_idx, buf, mbclen);
+ else
+ {
+ src_idx = byte_idx;
+ goto offsets_needed;
+ }
+ }
else
memcpy (pstr->mbs + byte_idx,
pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
- pstr->wcs[byte_idx++] = towupper (wc);
+ pstr->wcs[byte_idx++] = wcu;
/* Write paddings. */
for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
pstr->wcs[byte_idx++] = WEOF;
}
else if (mbclen == (size_t) -1 || mbclen == 0)
{
- /* It is an invalid character. Just use the byte. */
+ /* It is an invalid character or '\0'. Just use the byte. */
int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
pstr->mbs[byte_idx] = ch;
/* And also cast it to wide char. */
@@ -314,48 +361,116 @@ build_wcs_upper_buffer (pstr)
break;
}
}
+ pstr->valid_len = byte_idx;
+ pstr->valid_raw_len = byte_idx;
+ return REG_NOERROR;
+ }
else
#endif
- for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
+ for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
{
wchar_t wc;
+ const char *p;
+offsets_needed:
remain_len = end_idx - byte_idx;
prev_st = pstr->cur_state;
- mbclen = mbrtowc (&wc,
- ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
- + byte_idx), remain_len, &pstr->cur_state);
- if (mbclen == 1)
+ if (BE (pstr->trans != NULL, 0))
{
- /* In case of a singlebyte character. */
- int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
- /* Apply the translation if we need. */
- if (BE (pstr->trans != NULL, 0) && mbclen == 1)
+ int i, ch;
+
+ for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
{
- ch = pstr->trans[ch];
- pstr->mbs_case[byte_idx] = ch;
+ ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
+ buf[i] = pstr->trans[ch];
}
- pstr->wcs[byte_idx] = towupper (wc);
- pstr->mbs[byte_idx++] = toupper (ch);
- if (BE (mbclen == (size_t) -1, 0))
- pstr->cur_state = prev_st;
+ p = (const char *) buf;
}
- else if (BE (mbclen != (size_t) -2, 1))
+ else
+ p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
+ mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
+ if (BE (mbclen > 0, 1))
{
+ wchar_t wcu = wc;
if (iswlower (wc))
- wcrtomb ((char *) pstr->mbs + byte_idx, towupper (wc), &prev_st);
+ {
+ int mbcdlen;
+
+ wcu = towupper (wc);
+ mbcdlen = wcrtomb (buf, wcu, &prev_st);
+ if (BE (mbclen == mbcdlen, 1))
+ memcpy (pstr->mbs + byte_idx, buf, mbclen);
+ else
+ {
+ int i;
+
+ if (byte_idx + mbcdlen > pstr->bufs_len)
+ {
+ pstr->cur_state = prev_st;
+ break;
+ }
+
+ if (pstr->offsets == NULL)
+ {
+ pstr->offsets = re_malloc (int, pstr->bufs_len);
+
+ if (pstr->offsets == NULL)
+ return REG_ESPACE;
+ }
+ if (!pstr->offsets_needed)
+ {
+ for (i = 0; i < byte_idx; ++i)
+ pstr->offsets[i] = i;
+ pstr->offsets_needed = 1;
+ }
+
+ memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
+ pstr->wcs[byte_idx] = wcu;
+ pstr->offsets[byte_idx] = src_idx;
+ for (i = 1; i < mbcdlen; ++i)
+ {
+ pstr->offsets[byte_idx + i]
+ = src_idx + (i < mbclen ? i : mbclen - 1);
+ pstr->wcs[byte_idx + i] = WEOF;
+ }
+ pstr->len += mbcdlen - mbclen;
+ if (pstr->raw_stop > src_idx)
+ pstr->stop += mbcdlen - mbclen;
+ end_idx = (pstr->bufs_len > pstr->len)
+ ? pstr->len : pstr->bufs_len;
+ byte_idx += mbcdlen;
+ src_idx += mbclen;
+ continue;
+ }
+ }
else
- memcpy (pstr->mbs + byte_idx,
- pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
- pstr->wcs[byte_idx++] = towupper (wc);
+ memcpy (pstr->mbs + byte_idx, p, mbclen);
+
+ if (BE (pstr->offsets_needed != 0, 0))
+ {
+ int i;
+ for (i = 0; i < mbclen; ++i)
+ pstr->offsets[byte_idx + i] = src_idx + i;
+ }
+ src_idx += mbclen;
+
+ pstr->wcs[byte_idx++] = wcu;
/* Write paddings. */
for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
pstr->wcs[byte_idx++] = WEOF;
}
else if (mbclen == (size_t) -1 || mbclen == 0)
{
- /* It is an invalid character. Just use the byte. */
- int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
+ /* It is an invalid character or '\0'. Just use the byte. */
+ int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
+
+ if (BE (pstr->trans != NULL, 0))
+ ch = pstr->trans [ch];
pstr->mbs[byte_idx] = ch;
+
+ if (BE (pstr->offsets_needed != 0, 0))
+ pstr->offsets[byte_idx] = src_idx;
+ ++src_idx;
+
/* And also cast it to wide char. */
pstr->wcs[byte_idx++] = (wchar_t) ch;
if (BE (mbclen == (size_t) -1, 0))
@@ -369,6 +484,8 @@ build_wcs_upper_buffer (pstr)
}
}
pstr->valid_len = byte_idx;
+ pstr->valid_raw_len = src_idx;
+ return REG_NOERROR;
}
/* Skip characters until the index becomes greater than NEW_RAW_IDX.
@@ -385,7 +502,7 @@ re_string_skip_chars (pstr, new_raw_idx, last_wc)
wchar_t wc = 0;
/* Skip the characters which are not necessary to check. */
- for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_len;
+ for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
rawbuf_idx < new_raw_idx;)
{
int remain_len;
@@ -420,17 +537,15 @@ build_upper_buffer (pstr)
for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
{
int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
- if (pstr->trans != NULL)
- {
- ch = pstr->trans[ch];
- pstr->mbs_case[char_idx] = ch;
- }
+ if (BE (pstr->trans != NULL, 0))
+ ch = pstr->trans[ch];
if (islower (ch))
pstr->mbs[char_idx] = toupper (ch);
else
pstr->mbs[char_idx] = ch;
}
pstr->valid_len = char_idx;
+ pstr->valid_raw_len = char_idx;
}
/* Apply TRANS to the buffer in PSTR. */
@@ -445,10 +560,11 @@ re_string_translate_buffer (pstr)
for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
{
int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
- pstr->mbs_case[buf_idx] = pstr->trans[ch];
+ pstr->mbs[buf_idx] = pstr->trans[ch];
}
pstr->valid_len = buf_idx;
+ pstr->valid_raw_len = buf_idx;
}
/* This function re-construct the buffers.
@@ -468,14 +584,15 @@ re_string_reconstruct (pstr, idx, eflags, newline)
if (pstr->mb_cur_max > 1)
memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
#endif /* RE_ENABLE_I18N */
- pstr->len += pstr->raw_mbs_idx;
- pstr->stop += pstr->raw_mbs_idx;
- pstr->valid_len = pstr->raw_mbs_idx = 0;
+ pstr->len = pstr->raw_len;
+ pstr->stop = pstr->raw_stop;
+ pstr->valid_len = 0;
+ pstr->raw_mbs_idx = 0;
+ pstr->valid_raw_len = 0;
+ pstr->offsets_needed = 0;
pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
: CONTEXT_NEWLINE | CONTEXT_BEGBUF);
- if (!MBS_CASE_ALLOCATED (pstr))
- pstr->mbs_case = (unsigned char *) pstr->raw_mbs;
- if (!MBS_ALLOCATED (pstr) && !MBS_CASE_ALLOCATED (pstr))
+ if (!pstr->mbs_allocated)
pstr->mbs = (unsigned char *) pstr->raw_mbs;
offset = idx;
}
@@ -483,7 +600,13 @@ re_string_reconstruct (pstr, idx, eflags, newline)
if (offset != 0)
{
/* Are the characters which are already checked remain? */
- if (offset < pstr->valid_len)
+ if (offset < pstr->valid_raw_len
+#ifdef RE_ENABLE_I18N
+ /* Handling this would enlarge the code too much.
+ Accept a slowdown in that case. */
+ && pstr->offsets_needed == 0
+#endif
+ )
{
/* Yes, move them to the front of the buffer. */
pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags,
@@ -493,13 +616,11 @@ re_string_reconstruct (pstr, idx, eflags, newline)
memmove (pstr->wcs, pstr->wcs + offset,
(pstr->valid_len - offset) * sizeof (wint_t));
#endif /* RE_ENABLE_I18N */
- if (MBS_ALLOCATED (pstr))
+ if (pstr->mbs_allocated)
memmove (pstr->mbs, pstr->mbs + offset,
pstr->valid_len - offset);
- if (MBS_CASE_ALLOCATED (pstr))
- memmove (pstr->mbs_case, pstr->mbs_case + offset,
- pstr->valid_len - offset);
pstr->valid_len -= offset;
+ pstr->valid_raw_len -= offset;
#if DEBUG
assert (pstr->valid_len > 0);
#endif
@@ -507,16 +628,26 @@ re_string_reconstruct (pstr, idx, eflags, newline)
else
{
/* No, skip all characters until IDX. */
+#ifdef RE_ENABLE_I18N
+ if (BE (pstr->offsets_needed, 0))
+ {
+ pstr->len = pstr->raw_len - idx + offset;
+ pstr->stop = pstr->raw_stop - idx + offset;
+ pstr->offsets_needed = 0;
+ }
+#endif
pstr->valid_len = 0;
+ pstr->valid_raw_len = 0;
#ifdef RE_ENABLE_I18N
if (pstr->mb_cur_max > 1)
{
int wcs_idx;
wint_t wc = WEOF;
+#ifdef _LIBC
if (pstr->is_utf8)
{
- const unsigned char *raw, *p, *end;
+ const unsigned char *raw, *p, *q, *end;
/* Special case UTF-8. Multi-byte chars start with any
byte other than 0x80 - 0xbf. */
@@ -527,13 +658,22 @@ re_string_reconstruct (pstr, idx, eflags, newline)
{
mbstate_t cur_state;
wchar_t wc2;
- int mlen;
+ int mlen = raw + pstr->len - p;
+ unsigned char buf[6];
+ q = p;
+ if (BE (pstr->trans != NULL, 0))
+ {
+ int i = mlen < 6 ? mlen : 6;
+ while (--i >= 0)
+ buf[i] = pstr->trans[p[i]];
+ q = buf;
+ }
/* XXX Don't use mbrtowc, we know which conversion
to use (UTF-8 -> UCS4). */
memset (&cur_state, 0, sizeof (cur_state));
- mlen = mbrtowc (&wc2, p, raw + pstr->len - p,
- &cur_state) - (raw + offset - p);
+ mlen = mbrtowc (&wc2, p, mlen, &cur_state)
+ - (raw + offset - p);
if (mlen >= 0)
{
memset (&pstr->cur_state, '\0',
@@ -544,12 +684,17 @@ re_string_reconstruct (pstr, idx, eflags, newline)
break;
}
}
+#endif
if (wc == WEOF)
pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
- for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
- pstr->wcs[wcs_idx] = WEOF;
- if (pstr->trans && wc <= 0xff)
- wc = pstr->trans[wc];
+ if (BE (pstr->valid_len, 0))
+ {
+ for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
+ pstr->wcs[wcs_idx] = WEOF;
+ if (pstr->mbs_allocated)
+ memset (pstr->mbs, 255, pstr->valid_len);
+ }
+ pstr->valid_raw_len = pstr->valid_len;
pstr->tip_context = (IS_WIDE_WORD_CHAR (wc) ? CONTEXT_WORD
: ((newline && IS_WIDE_NEWLINE (wc))
? CONTEXT_NEWLINE : 0));
@@ -565,13 +710,8 @@ re_string_reconstruct (pstr, idx, eflags, newline)
? CONTEXT_NEWLINE : 0));
}
}
- if (!MBS_CASE_ALLOCATED (pstr))
- {
- pstr->mbs_case += offset;
- /* In case of !MBS_ALLOCATED && !MBS_CASE_ALLOCATED. */
- if (!MBS_ALLOCATED (pstr))
- pstr->mbs += offset;
- }
+ if (!pstr->mbs_allocated)
+ pstr->mbs += offset;
}
pstr->raw_mbs_idx = idx;
pstr->len -= offset;
@@ -582,7 +722,11 @@ re_string_reconstruct (pstr, idx, eflags, newline)
if (pstr->mb_cur_max > 1)
{
if (pstr->icase)
- build_wcs_upper_buffer (pstr);
+ {
+ int ret = build_wcs_upper_buffer (pstr);
+ if (BE (ret != REG_NOERROR, 0))
+ return ret;
+ }
else
build_wcs_buffer (pstr);
}
@@ -601,17 +745,95 @@ re_string_reconstruct (pstr, idx, eflags, newline)
return REG_NOERROR;
}
+static unsigned char
+re_string_peek_byte_case (const re_string_t *pstr,
+ int idx)
+{
+ int ch, off;
+
+ /* Handle the common (easiest) cases first. */
+ if (BE (!pstr->icase, 1))
+ return re_string_peek_byte (pstr, idx);
+
+#ifdef RE_ENABLE_I18N
+ if (pstr->mb_cur_max > 1
+ && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
+ return re_string_peek_byte (pstr, idx);
+#endif
+
+ off = pstr->cur_idx + idx;
+#ifdef RE_ENABLE_I18N
+ if (pstr->offsets_needed)
+ off = pstr->offsets[off];
+#endif
+
+ ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
+ if (pstr->trans)
+ ch = pstr->trans[ch];
+
+#ifdef RE_ENABLE_I18N
+ /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
+ this function returns CAPITAL LETTER I instead of first byte of
+ DOTLESS SMALL LETTER I. The latter would confuse the parser,
+ since peek_byte_case doesn't advance cur_idx in any way. */
+ if (pstr->offsets_needed && !isascii (ch))
+ return re_string_peek_byte (pstr, idx);
+#endif
+
+ return ch;
+}
+
+static unsigned char
+re_string_fetch_byte_case (re_string_t *pstr)
+{
+ int ch, off;
+
+ if (BE (!pstr->icase, 1))
+ return re_string_fetch_byte (pstr);
+
+#ifdef RE_ENABLE_I18N
+ if (pstr->offsets_needed)
+ {
+ /* For tr_TR.UTF-8 [[:islower:]] there is
+ [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
+ in that case the whole multi-byte character and return
+ the original letter. On the other side, with
+ [[: DOTLESS SMALL LETTER I return [[:I, as doing
+ anything else would complicate things too much. */
+
+ if (!re_string_first_byte (pstr, pstr->cur_idx))
+ return re_string_fetch_byte (pstr);
+
+ off = pstr->offsets[pstr->cur_idx];
+ ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
+ if (pstr->trans)
+ ch = pstr->trans[ch];
+
+ if (! isascii (ch))
+ return re_string_fetch_byte (pstr);
+
+ re_string_skip_bytes (pstr,
+ re_string_char_size_at (pstr, pstr->cur_idx));
+ return ch;
+ }
+#endif
+
+ ch = pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
+ if (pstr->trans)
+ ch = pstr->trans[ch];
+ return ch;
+}
+
static void
re_string_destruct (pstr)
re_string_t *pstr;
{
#ifdef RE_ENABLE_I18N
re_free (pstr->wcs);
+ re_free (pstr->offsets);
#endif /* RE_ENABLE_I18N */
- if (MBS_ALLOCATED (pstr))
+ if (pstr->mbs_allocated)
re_free (pstr->mbs);
- if (MBS_CASE_ALLOCATED (pstr))
- re_free (pstr->mbs_case);
}
/* Return the context at IDX in INPUT. */