/* GNU SED, a batch stream editor.
Copyright (C) 2003-2023 Free Software Foundation, Inc.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; If not, see <https://www.gnu.org/licenses/>. */
#include "sed.h"
#include <stdlib.h>
#include <string.h>
#include "localcharset.h"
/* Value of MB_CUR_MAX as captured by initialize_mbcs ().  */
int mb_cur_max;
/* True when locale_charset () reported "UTF-8"; set by initialize_mbcs ().  */
bool is_utf8;
/* Classify the byte CH, given the multibyte conversion state CUR_STAT.

   Return non-zero if CH is part of a valid multibyte sequence:
     either an incomplete yet valid sequence (in case of a leading byte),
     or the last byte of a valid multibyte sequence.

   Return zero in all other cases:
     CH is a valid single-byte character (e.g. 0x01-0x7F in UTF-8 locales);
     CH is an invalid byte in a multibyte sequence for the current locale;
     CH is the NUL byte.

   CUR_STAT is updated by mbrtowc(3), and is reset to the initial
   conversion state when CH turns out to be an invalid byte.  */
int
is_mb_char (int ch, mbstate_t *cur_stat)
{
  const char c = ch;

  /* Sample the state *before* mbrtowc updates it: a non-initial state
     means earlier bytes of a multibyte sequence are still pending.  */
  const int mb_pending = !mbsinit (cur_stat);
  const int result = mbrtowc (NULL, &c, 1, cur_stat);

  switch (result)
    {
    case -2: /* Beginning or middle of valid multibyte sequence */
      return 1;

    case -1: /* Invalid sequence, byte treated like a single-byte character */
      /* Discard the broken partial sequence so the next byte is
         examined from the initial conversion state.  */
      memset (cur_stat, 0, sizeof (mbstate_t));
      return 0;

    case 1: /* A valid byte, check if part of on-going multibyte sequence */
      return mb_pending;

    case 0: /* Special case of mbrtowc(3): the NUL character */
      /* NUL is a complete character on its own, never a piece of a
         multibyte sequence, so report it like any other single-byte
         character, as the contract above documents.  */
      return 0;

    default: /* Should never happen, as per mbrtowc(3) documentation */
      panic ("is_mb_char: mbrtowc (0x%x) returned %d",
             (unsigned int) ch, result);
    }
}
/* Record the multibyte properties of the current locale in the
   globals MB_CUR_MAX and IS_UTF8.  */
void
initialize_mbcs (void)
{
  /* Character set name reported by locale_charset ().  */
  const char *const charset = locale_charset ();

  /* For UTF-8, we know that the encoding is stateless.  */
  is_utf8 = !strcmp (charset, "UTF-8");

  mb_cur_max = MB_CUR_MAX;
}