libc/src/stdio/scanf_core/int_converter.cpp


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215

//===-- Int type specifier converters for scanf -----------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "src/stdio/scanf_core/int_converter.h"

#include "src/__support/CPP/limits.h"
#include "src/__support/ctype_utils.h"
#include "src/stdio/scanf_core/converter_utils.h"
#include "src/stdio/scanf_core/core_structs.h"
#include "src/stdio/scanf_core/reader.h"

#include <stddef.h>

namespace __llvm_libc {
namespace scanf_core {

// This code is very similar to the code in __support/str_to_integer.h but is
// not quite the same. Here is the list of differences and why they exist:
//  1) This takes a reader and a format section instead of a char* and the base.
//      This should be fairly self-explanatory. While the char* could be adapted
//      to a reader and the base could be calculated ahead of time, the
//      semantics are slightly different: a char* can be indexed freely (one
//      can read str[2] and then str[0]), whereas a File (which the reader may
//      contain) cannot.
//  2) Because this uses a Reader, this function can only unget once.
//      This is relevant because scanf specifies it reads the "longest sequence
//      of input characters which does not exceed any specified field width and
//      which is, or is a prefix of, a matching input sequence." Whereas the
//      strtol function accepts "the longest initial subsequence of the input
//      string (...) that is of the expected form." This is demonstrated by the
//      differences in how they deal with the string "0xZZZ" when parsing as
//      hexadecimal. Scanf will read the "0x" as a valid prefix and return 0,
//      since it reads the first 'Z', sees that it's not a valid hex digit, and
//      reverses one character. The strtol function on the other hand only
//      accepts the "0" since that's the longest valid hexadecimal sequence. It
//      sees the 'Z' after the "0x" and determines that this is not the prefix
//      to a valid hex string.
//  3) This conversion may have a maximum width.
//      If a maximum width is specified, this conversion is only allowed to
//      accept a certain number of characters. Strtol doesn't have any such
//      limitation.
//
// Contract of convert_int:
//  - reader: the input source. Characters are consumed with getc() and at most
//      one character is handed back with ungetc() per call.
//  - to_conv: the conversion being performed. conv_name selects the base and
//      signedness, and a positive max_width caps the number of characters
//      that may be consumed.
//  - Returns READ_OK when at least one digit was accepted (the '0' of a "0x"
//      prefix counts), otherwise MATCHING_FAILURE.
//  - The converted value is passed to write_int_with_length only when READ_OK
//      is returned. On MATCHING_FAILURE nothing is passed to
//      write_int_with_length, because a directive that fails with a matching
//      failure must not perform an assignment.
int convert_int(Reader *reader, const FormatSection &to_conv) {
  // %d "Matches an optionally signed decimal integer [...] with the value 10
  // for the base argument. The corresponding argument shall be a pointer to
  // signed integer."

  // %i "Matches an optionally signed integer [...] with the value 0 for the
  // base argument. The corresponding argument shall be a pointer to signed
  // integer."

  // %u "Matches an optionally signed decimal integer [...] with the value 10
  // for the base argument. The corresponding argument shall be a pointer to
  // unsigned integer"

  // %o "Matches an optionally signed octal integer [...] with the value 8 for
  // the base argument. The corresponding argument shall be a pointer to
  // unsigned integer"

  // %x/X "Matches an optionally signed hexadecimal integer [...] with the value
  // 16 for the base argument. The corresponding argument shall be a pointer to
  // unsigned integer"

  // max_width counts the characters that may still be accepted, including the
  // one currently held in cur_char. A non-positive width in the format section
  // is treated as "no limit".
  size_t max_width = cpp::numeric_limits<size_t>::max();
  if (to_conv.max_width > 0) {
    max_width = to_conv.max_width;
  }

  uintmax_t result = 0;
  bool is_number = false;
  bool is_signed = false;
  int base = 0;
  if (to_conv.conv_name == 'i') {
    base = 0;
    is_signed = true;
  } else if (to_conv.conv_name == 'o') {
    base = 8;
  } else if (to_lower(to_conv.conv_name) == 'x' || to_conv.conv_name == 'p') {
    base = 16;
  } else if (to_conv.conv_name == 'd') {
    base = 10;
    is_signed = true;
  } else { // conv_name must be 'u'
    base = 10;
  }

  char cur_char = reader->getc();

  char result_sign = '+';
  if (cur_char == '+' || cur_char == '-') {
    result_sign = cur_char;
    if (max_width > 1) {
      --max_width;
      cur_char = reader->getc();
    } else {
      // The field width was used up by the sign alone, so no digit can follow.
      // A bare sign is only a prefix of a matching sequence: fail without
      // assigning anything.
      return MATCHING_FAILURE;
    }
  }
  const bool is_negative = result_sign == '-';

  // Base of 0 means automatically determine the base. Base of 16 may have a
  // prefix of "0x"
  if (base == 0 || base == 16) {
    // If the first character is 0, then it could be octal or hex.
    if (cur_char == '0') {
      is_number = true;

      // Read the next character to check.
      if (max_width > 1) {
        --max_width;
        cur_char = reader->getc();
      } else {
        // The width ends right after the leading '0', so the value is 0.
        write_int_with_length(0, to_conv);
        return READ_OK;
      }

      if (to_lower(cur_char) == 'x') {
        // This is a valid hex prefix.
        base = 16;
        if (max_width > 1) {
          --max_width;
          cur_char = reader->getc();
        } else {
          // The width ends right after the "0x" prefix. The leading '0' was
          // already accepted as a digit, so the value is 0.
          write_int_with_length(0, to_conv);
          return READ_OK;
        }

      } else {
        if (base == 0) {
          base = 8;
        }
      }
    } else if (base == 0) {
      if (internal::isdigit(cur_char)) {
        // If the first character is a different number, then it's 10.
        base = 10;
      } else {
        // If the first character isn't a valid digit, then there are no valid
        // digits at all. Put the character back and fail without assigning.
        reader->ungetc(cur_char);
        return MATCHING_FAILURE;
      }
    }
  }

  // The accumulator is unsigned, so the limit is expressed as a magnitude. For
  // a negative signed value the largest magnitude is one more than the largest
  // positive value.
  constexpr uintmax_t UNSIGNED_MAX = cpp::numeric_limits<uintmax_t>::max();
  constexpr uintmax_t SIGNED_MAX =
      static_cast<uintmax_t>(cpp::numeric_limits<intmax_t>::max());
  constexpr uintmax_t NEGATIVE_SIGNED_MAX =
      static_cast<uintmax_t>(cpp::numeric_limits<intmax_t>::max()) + 1;

  const uintmax_t MAX =
      (is_signed ? (is_negative ? NEGATIVE_SIGNED_MAX : SIGNED_MAX)
                 : UNSIGNED_MAX);

  // base is guaranteed to be 8, 10 or 16 here: a base of 0 was either resolved
  // above or caused an early return.
  const uintmax_t max_div_by_base = MAX / base;

  if (internal::isalnum(cur_char) && b36_char_to_int(cur_char) < base) {
    is_number = true;
  }

  bool has_overflow = false;
  size_t i = 0;
  for (; i < max_width && internal::isalnum(cur_char) &&
         b36_char_to_int(cur_char) < base;
       ++i, cur_char = reader->getc()) {

    uintmax_t cur_digit = b36_char_to_int(cur_char);

    // Once the value is pinned at MAX, any further digit is an overflow. The
    // digits are still consumed so they remain part of this input item.
    if (result == MAX) {
      has_overflow = true;
      continue;
    } else if (result > max_div_by_base) {
      result = MAX;
      has_overflow = true;
    } else {
      result = result * base;
    }

    if (result > MAX - cur_digit) {
      result = MAX;
      has_overflow = true;
    } else {
      result = result + cur_digit;
    }
  }

  // We always read one more character than will be used, so we have to put the
  // last one back.
  reader->ungetc(cur_char);

  // No digit was accepted: this is a matching failure, and a failed directive
  // must not assign, so bail out before writing anything.
  if (!is_number)
    return MATCHING_FAILURE;

  if (has_overflow) {
    write_int_with_length(MAX, to_conv);
  } else {
    // Unsigned negation wraps around, producing the two's complement bit
    // pattern of the negative value.
    if (is_negative)
      result = -result;

    write_int_with_length(result, to_conv);
  }

  return READ_OK;
}

} // namespace scanf_core
} // namespace __llvm_libc