testsuite/test-mbrtowc.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170

/* Auxiliary program to test mbrtowc(3) behaviour.
   Copyright 2016-2023 Free Software Foundation, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; If not, see <https://www.gnu.org/licenses/>. */

/* Test the operating-system's native mbrtowc(3) function,
   by feeding it multibyte seqeunces one byte at a time,
   and reporting the result.

   The program prints the following values after each mbrtowc invocation,
   separated by commas:

   -2  the octet is contributes to a valid yet incomplete multibyte sequence
       in the current locale.

   -1  the octet causes an encoding error.

    0  the octet represents a NUL byte

    1  the octet is a valid single-byte character, OR
       completes a valid multibyte sequence.

  Because the program invokes mbrtowc(3) byte-by-byte, the reported
  result should never be larger than 1.

  Example of typical output with UTF-8 encoding
  ---------------------------------------------

  The unicode character 'N-ARY SUMMATION' (U+2211), encoded in UTF-8 as:
    hex: 0xE2 0x88 0x91
    oct:  342  210  211

  Decoding the valid sequence byte-by-byte gives:
    $ printf '\342\210\221' | LC_ALL=en_US.UTF-8 test-mbrtowc
    -2,-2,1

  '\210' is not a valid leading byte in UTF-8,
  thus the first byte gives -1, and the 'X' is treated
  as a valid single-byte character:

    $ printf '\210X' | LC_ALL=en_US.UTF-8 test-mbrtowc
    -1,1

  '\342' is a valid yet incomplete multibyte sequence.
  Passing it to mbrtowc results in value '-2'.
  The following value 'X' gives an encoding error '-1'
  (as 'X' is not a valid trailing byte in a multibyte UTF-8 sequence):

    $ printf '\342X' | LC_ALL=en_US.UTF-8 test-mbrtowc
    -2,-1


  Detecting implementation bugs in mbrtowc
  ----------------------------------------

  UTF-8 implementation is correct on most operating systems.
  Other multibyte locales might present more difficulties.
  An example is the Japanese SHIFT-JIS locale under Mac OS X.
  NOTE: The locale is 'ja_JP.SJIS' under Mac OS X, 'ja_JP.shiftjis'
  under Ubuntu. 'ja_JP.sjis' was also found on some systems.

  Using unicode character 'KATAKANA LETTER ZE' (U+30BC)
   UTF-8:    hex: 0xE3  0x82  0xBC
   Shift-jis hex: 0x83  0x5B
             oct:  203   133

  The following is a valid multibyte sequence in SHIFT-JIS,
  the first byte should result in '-2' (valid yet incomplete),
  and the second byte should result in '1' (a valid multibyte sequence
  completed):

    $ printf '\203\133' | LC_ALL=ja_JP.SJIS test-mbrtowc
    -2,1

  The follwing is an INVALID multibyte sequence in SHIFT-JIS
  (The byte ':' is not valid as a second octet).
  Buggy implementations will accept this as a valid multibyte sequence:

    # NOTE: this result indicates a buggy mbrtowc
    $ printf '\203:' | LC_ALL=ja_JP.SJIS test-mbrtowc
    -2,1

  A correct implementations should report '-1' for the second byte (i.e.
  an encoding error):

    $ printf '\203:' | LC_ALL=ja_JP.SJIS test-mbrtowc
    -2,-1


  Expected results with correct implementations
  ---------------------------------------------

  In GNU Sed some tests purposely use invalid multibyte sequences
  to test sed's behaviour. A buggy implemetation of mbrtowc
  would result in false-alarm failures.

  The following are expected results in correct implementations:
  (locale names are from Mac OS X):

    $ printf '\203\133' | LC_ALL=ja_JP.SJIS test-mbrtowc
    -2,1
    $ printf '\203:' | LC_ALL=ja_JP.SJIS test-mbrtowc
    -2,-1
    $ printf '\262C' | LC_ALL=ja_JP.eucJP test-mbrtowc
    -2,-1
*/

#include <config.h>
#include <locale.h>
#include <stdio.h>
#include <stdlib.h>
#include <wchar.h>

#include "closeout.h"
#include "error.h"
#include "progname.h"

/* stub replacement for non-standard err(3) */
static int
die (const char *msg)
{
  error (0, 0, "%s: error: %s\n", program_name, msg);
  exit (EXIT_FAILURE);
}

int
main (int argc, char **argv)
{
  int c;
  int first = 1;

  set_program_name (argv[0]);
  if (!setlocale (LC_ALL, ""))
    die ("failed to set locale");

  while ((c = getchar ()) != EOF)
    {
      wchar_t wc;
      char ch = (unsigned char) c;
      int i = (int) mbrtowc (&wc, &ch, 1, NULL);

      if (!first)
        putchar (',');
      first = 0;

      printf ("%d", i);
    }

  if (first)
    die ("empty input");

  putchar ('\n');

  if (ferror (stdin))
    die ("read error");
  close_stdout ();

  exit (EXIT_SUCCESS);
}