summaryrefslogtreecommitdiff
path: root/libc/src/stdio/scanf_core/parser.cpp
blob: 44e853c8a8de8fea3cc5853a2b98f7662232133f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
//===-- Format string parser implementation for scanf ----------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

// #define LIBC_COPT_SCANF_DISABLE_INDEX_MODE 1 // This will be a compile flag.

#include "src/stdio/scanf_core/parser.h"

#include "src/__support/arg_list.h"

#include "src/__support/CPP/bit.h"
#include "src/__support/CPP/bitset.h"
#include "src/__support/CPP/string_view.h"
#include "src/__support/FPUtil/FPBits.h"
#include "src/__support/ctype_utils.h"
#include "src/__support/str_to_integer.h"

namespace __llvm_libc {
namespace scanf_core {

// GET_ARG_VAL_SIMPLEST(arg_type, index) expands to the call used to fetch an
// argument of type arg_type. When index mode is compiled in, the index is
// forwarded to get_arg_value<arg_type>(index). When
// LIBC_COPT_SCANF_DISABLE_INDEX_MODE is defined, the second macro argument is
// discarded and get_next_arg_value<arg_type>() is called instead.
#ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE
#define GET_ARG_VAL_SIMPLEST(arg_type, index) get_arg_value<arg_type>(index)
#else
#define GET_ARG_VAL_SIMPLEST(arg_type, _) get_next_arg_value<arg_type>()
#endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE

// Parses the section of the format string that begins at cur_pos and advances
// cur_pos past it. A section is either a conversion specification (it starts
// with '%') or a run of raw characters that ends at the next '%' or at the
// terminating '\0'. The returned section's raw_string always spans exactly the
// characters that were consumed, and has_conv tells the caller whether the
// section describes a valid conversion.
FormatSection Parser::get_next_section() {
  FormatSection section;
  size_t starting_pos = cur_pos;
  if (str[cur_pos] == '%') {
    // format section
    section.has_conv = true;

    ++cur_pos;
    [[maybe_unused]] size_t conv_index = 0;

#ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE
    conv_index = parse_index(&cur_pos);
#endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE

    if (str[cur_pos] == '*') {
      ++cur_pos;
      section.flags = FormatFlags::NO_WRITE;
    }

    // handle width
    section.max_width = -1;
    if (internal::isdigit(str[cur_pos])) {
      auto result = internal::strtointeger<int>(str + cur_pos, 10);
      section.max_width = result.value;
      cur_pos = cur_pos + result.parsed_len;
    }

    // TODO(michaelrj): add posix allocate flag support.
    // if (str[cur_pos] == 'm') {
    //   ++cur_pos;
    //   section.flags = FormatFlags::ALLOCATE;
    // }

    LengthModifier lm = parse_length_modifier(&cur_pos);
    section.length_modifier = lm;

    section.conv_name = str[cur_pos];

    // "%%" only matches a literal '%' and performs no assignment, and a format
    // string that ends right after the '%' has no conversion at all. Neither
    // has a corresponding argument, so reading one here would hand the wrong
    // pointer to every conversion that follows.
    const bool takes_arg =
        section.conv_name != '%' && section.conv_name != '\0';

    // If the conversion takes an argument and NO_WRITE is not set, then read
    // the next arg as the output pointer.
    if (takes_arg && (section.flags & FormatFlags::NO_WRITE) == 0) {
      // Since all outputs are pointers, there's no need to distinguish when
      // reading from va_args. They're all the same size and stored the same.
      section.output_ptr = GET_ARG_VAL_SIMPLEST(void *, conv_index);
    }

    // If the format section ends on the '\0', then cur_pos must not be
    // advanced past it and this section should not be counted as having a
    // conversion.
    if (str[cur_pos] != '\0') {
      ++cur_pos;
    } else {
      section.has_conv = false;
    }

    // If the format is a bracketed one, then we need to parse out the insides
    // of the brackets.
    if (section.conv_name == '[') {
      constexpr char CLOSING_BRACKET = ']';
      constexpr char INVERT_FLAG = '^';
      constexpr char RANGE_OPERATOR = '-';

      cpp::bitset<256> scan_set;
      bool invert = false;

      // The circumflex in the first position represents the inversion flag, but
      // it's easier to apply that at the end so we just store it for now.
      if (str[cur_pos] == INVERT_FLAG) {
        invert = true;
        ++cur_pos;
      }

      // This is used to determine if a hyphen is being used as a literal or as
      // a range operator.
      size_t set_start_pos = cur_pos;

      // Normally the right bracket closes the set, but if it's the first
      // character (possibly after the inversion flag) then it's instead
      // included as a character in the set and the second right bracket closes
      // the set.
      if (str[cur_pos] == CLOSING_BRACKET) {
        scan_set.set(static_cast<unsigned char>(CLOSING_BRACKET));
        ++cur_pos;
      }

      // Every character is converted to unsigned char before it is used as a
      // position in the set. Plain char may be signed, in which case a byte
      // with the high bit set would otherwise turn into an enormous index that
      // is far outside of the 256 entries of the set.
      while (str[cur_pos] != '\0' && str[cur_pos] != CLOSING_BRACKET) {
        // If a hyphen is being used as a range operator, since it's neither at
        // the beginning nor end of the set.
        if (str[cur_pos] == RANGE_OPERATOR && cur_pos != set_start_pos &&
            str[cur_pos + 1] != CLOSING_BRACKET && str[cur_pos + 1] != '\0') {
          // Technically there is no requirement to correct the ordering of the
          // range, but since the range operator is entirely implementation
          // defined it seems like a good convenience. The endpoints are
          // compared as unsigned values so that the ordering follows the
          // positions in the set.
          const unsigned char a = static_cast<unsigned char>(str[cur_pos - 1]);
          const unsigned char b = static_cast<unsigned char>(str[cur_pos + 1]);
          const unsigned char start = (a < b ? a : b);
          const unsigned char end = (a < b ? b : a);
          scan_set.set_range(start, end);
          cur_pos += 2;
        } else {
          scan_set.set(static_cast<unsigned char>(str[cur_pos]));
          ++cur_pos;
        }
      }
      if (invert)
        scan_set.flip();

      if (str[cur_pos] == CLOSING_BRACKET) {
        ++cur_pos;
        section.scan_set = scan_set;
      } else {
        // if the end of the string was encountered, this is not a valid set.
        section.has_conv = false;
      }
    }
  } else {
    // raw section
    section.has_conv = false;
    while (str[cur_pos] != '%' && str[cur_pos] != '\0')
      ++cur_pos;
  }
  section.raw_string = {str + starting_pos, cur_pos - starting_pos};
  return section;
}

// Reads the optional length modifier located at str[*local_pos]. When one is
// found, *local_pos is moved past it (one character, or two for "ll" and
// "hh") and the matching LengthModifier is returned. When the character is not
// a length modifier, *local_pos is left alone and LengthModifier::NONE is
// returned.
LengthModifier Parser::parse_length_modifier(size_t *local_pos) {
  const size_t pos = *local_pos;
  const char first = str[pos];

  // 'l' and 'h' are the only modifiers that may be doubled, so they share the
  // lookahead logic.
  if (first == 'l' || first == 'h') {
    const bool doubled = (str[pos + 1] == first);
    *local_pos = pos + (doubled ? 2 : 1);
    if (first == 'l')
      return doubled ? LengthModifier::ll : LengthModifier::l;
    return doubled ? LengthModifier::hh : LengthModifier::h;
  }

  // Everything else is a single character modifier, or not a modifier at all.
  LengthModifier single_char_modifier = LengthModifier::NONE;
  switch (first) {
  case 'L':
    single_char_modifier = LengthModifier::L;
    break;
  case 'j':
    single_char_modifier = LengthModifier::j;
    break;
  case 'z':
    single_char_modifier = LengthModifier::z;
    break;
  case 't':
    single_char_modifier = LengthModifier::t;
    break;
  default:
    // Not a length modifier, so nothing is consumed.
    return LengthModifier::NONE;
  }

  *local_pos = pos + 1;
  return single_char_modifier;
}

//----------------------------------------------------
// INDEX MODE ONLY FUNCTIONS AFTER HERE:
//----------------------------------------------------

#ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE

// Attempts to read an argument index of the form "<digits>$" starting at
// str[*local_pos]. On success *local_pos is moved past the '$' and the parsed
// number is returned. If there are no digits, or the digits are not followed
// by '$', *local_pos is left unchanged and 0 is returned.
size_t Parser::parse_index(size_t *local_pos) {
  const size_t begin = *local_pos;
  if (!internal::isdigit(str[begin]))
    return 0;

  const auto parsed = internal::strtointeger<int>(str + begin, 10);
  const size_t index = parsed.value;
  const size_t after_digits = begin + parsed.parsed_len;

  // Without the trailing '$' the digits are not an index (they may be a
  // width), so they must not be consumed here.
  if (str[after_digits] != '$')
    return 0;

  *local_pos = after_digits + 1;
  return index;
}

// Moves args_cur forward until args_index reaches the requested index. If the
// requested index is before the current position, the walk restarts from
// args_start with args_index reset to 1.
void Parser::args_to_index(size_t index) {
  if (index < args_index) {
    args_cur = args_start;
    args_index = 1;
  }

  // Since all arguments must be pointers, each skipped argument can be read as
  // a void * without worrying about its real type.
  for (; args_index < index; ++args_index)
    args_cur.next_var<void *>();
}

#endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE

} // namespace scanf_core
} // namespace __llvm_libc