summaryrefslogtreecommitdiff
path: root/sed/sed.h
blob: b904fca4380ff4505eb71f8ddec2cca6eb82cde5 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
/*  GNU SED, a batch stream editor.
    Copyright (C) 1989-2023 Free Software Foundation, Inc.

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 3, or (at your option)
    any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; If not, see <https://www.gnu.org/licenses/>. */

#include <config.h>
#include "basicdefs.h"
#include "dfa.h"
#include "localeinfo.h"
#include "regex.h"
#include <stdio.h>
#include "unlocked-io.h"

#include "utils.h"

/* Struct vector is used to describe a compiled sed program: an array of
   commands (struct sed_cmd) plus bookkeeping for how many slots the
   array has and how many of them are in use.  */
struct vector {
  struct sed_cmd *v;	/* a dynamically allocated array */
  idx_t v_allocated;	/* ... number of slots allocated */
  idx_t v_length;	/* ... number of slots in use */
};

/* This structure tracks files used by sed so that they may all be
   closed cleanly at normal program termination.  A flag is kept that tells
   if a missing newline was encountered, so that it is added on the
   next line and the two lines are not concatenated.  */
struct output {
  char *name;		/* presumably the file's name -- confirm at the users */
  bool missing_newline;	/* the "missing newline" flag described above */
  FILE *fp;		/* stdio stream for the file */
  struct output *link;	/* another struct output; presumably the next entry
			   of a linked list of tracked files -- confirm */
};

/* A block of text together with its length.  struct sed_cmd uses this
   as the argument of the a, i, and c commands.  */
struct text_buf {
  char *text;		/* the text itself */
  idx_t text_length;	/* length of TEXT (presumably in bytes -- confirm) */
};

/* A regular expression object: compile_regex returns a pointer to one
   and match_regex takes one as its first argument.
   NOTE(review): the meaning of most members is not visible in this
   header; the per-member notes are assumptions to be confirmed against
   the regex implementation.  */
struct regex {
  regex_t pattern;	/* compiled pattern, regex_t from regex.h */
  int flags;		/* presumably the FLAGS given to compile_regex */
  idx_t sz;		/* presumably the length of RE -- confirm */
  struct dfa *dfa;	/* matcher state from dfa.h */
  bool begline;		/* NOTE(review): looks like a line-start anchor flag */
  bool endline;		/* NOTE(review): looks like a line-end anchor flag */
  char re[1];		/* presumably the first byte of a variable-length
			   trailing copy of the pattern text.  Do not turn
			   this into a flexible array member without
			   auditing every use of sizeof (struct regex).  */
};

/* Argument of the r command (see the union inside struct sed_cmd).  */
struct readcmd {
  char *fname;	/* presumably the name of the file to read -- confirm */
  bool append; /* true: append (default); false: prepend (gnu extension) */
};

/* Case-handling flags stored in struct replacement.repl_type.
   The four basic flags each occupy one bit; REPL_MODIFIERS is the mask
   covering the two *_FIRST bits.  */
enum replacement_types {
  REPL_ASIS = 0,
  REPL_UPPERCASE = 1 << 0,
  REPL_LOWERCASE = 1 << 1,
  REPL_UPPERCASE_FIRST = 1 << 2,
  REPL_LOWERCASE_FIRST = 1 << 3,
  REPL_MODIFIERS = REPL_UPPERCASE_FIRST | REPL_LOWERCASE_FIRST,

  /* Named combinations of one *_FIRST bit with one whole-text bit;
     they exist so that a debugger can show a symbolic name.  */
  REPL_UPPERCASE_UPPERCASE = REPL_UPPERCASE_FIRST | REPL_UPPERCASE,
  REPL_UPPERCASE_LOWERCASE = REPL_UPPERCASE_FIRST | REPL_LOWERCASE,
  REPL_LOWERCASE_UPPERCASE = REPL_LOWERCASE_FIRST | REPL_UPPERCASE,
  REPL_LOWERCASE_LOWERCASE = REPL_LOWERCASE_FIRST | REPL_LOWERCASE
};

/* Kind of text handed to normalize_text through its BUFTYPE argument.  */
enum text_types {
  TEXT_BUFFER = 0,
  TEXT_REPLACEMENT = 1,
  TEXT_REGEX = 2
};

/* How strictly POSIX behaviour is followed; see the posixicity global.  */
enum posixicity_types {
  POSIXLY_EXTENDED = 0,	/* GNU extensions enabled */
  POSIXLY_CORRECT = 1,	/* only the POSIX-compatible GNU extensions */
  POSIXLY_BASIC = 2	/* pedantic POSIX */
};

/* State of a command's address range; kept in struct sed_cmd.range_state.  */
enum addr_state {
  RANGE_INACTIVE = 0,	/* the range has never been active */
  RANGE_ACTIVE = 1,	/* currently between the first and second address */
  RANGE_CLOSED = 2	/* as RANGE_INACTIVE, except the range ended once */
};

/* Discriminator for struct addr: tells which of its members are valid.  */
enum addr_types {
  ADDR_IS_NULL = 0,	/* no address */
  ADDR_IS_REGEX = 1,	/* addr_regex holds the address */
  ADDR_IS_NUM = 2,	/* addr_number holds the address */
  ADDR_IS_NUM_MOD = 3,	/* addr_number holds the address, addr_step the modulo */
  ADDR_IS_STEP = 4,	/* +N form, only valid for addr2 */
  ADDR_IS_STEP_MOD = 5,	/* ~N form, only valid for addr2 */
  ADDR_IS_LAST = 6	/* the $ address */
};

/* One address of a command.  ADDR_TYPE selects which of the other
   members are meaningful, as listed in enum addr_types.  */
struct addr {
  enum addr_types addr_type;
  intmax_t addr_number;		/* valid for ADDR_IS_NUM and ADDR_IS_NUM_MOD */
  intmax_t addr_step;		/* the modulo when ADDR_IS_NUM_MOD */
  struct regex *addr_regex;	/* valid for ADDR_IS_REGEX */
};


/* One node of a chain of replacement pieces; struct subst points at a
   chain through its REPLACEMENT member.
   NOTE(review): the member notes below are inferred from the names only;
   confirm against the code that builds and applies replacements.  */
struct replacement {
  char *prefix;			/* presumably literal text for this node */
  idx_t prefix_length;		/* presumably the length of PREFIX */
  int subst_id;			/* presumably a back-reference number */
  enum replacement_types repl_type;	/* REPL_* flags for this node */
  struct replacement *next;	/* the following node of the chain */
};

/* Data for the s command: the regex, the replacement chain and the
   option flags.  struct sed_cmd refers to it through x.cmd_subst.  */
struct subst {
  struct regex *regx;	/* the regex of the command */
  struct replacement *replacement;	/* head of the replacement chain */
  intmax_t numb;	/* if >0, only substitute for match number "numb" */
  struct output *outf;	/* 'w' option given */
  unsigned global : 1;	/* 'g' option given */
  unsigned print : 2;	/* 'p' option given (before/after eval) */
  unsigned eval : 1;	/* 'e' option given */
  unsigned max_id : 4;  /* maximum backreference on the RHS */
#ifdef lint
  /* This member exists only in builds that define lint.  */
  char* replacement_buffer;
#endif
};




/* One command of a sed program; struct vector holds an array of these.
   CMD is the command character and the union X carries whatever extra
   data that particular command needs.  */
struct sed_cmd {
  struct addr *a1;	/* save space: usually is NULL */
  struct addr *a2;

  /* See the description of enum addr_state, above.  */
  enum addr_state range_state;

  /* Non-zero if command is to be applied to non-matches. */
  char addr_bang;

  /* The actual command character. */
  char cmd;

  /* Auxiliary data for various commands.  Only the member that belongs
     to CMD is meaningful.  */
  union {
    /* This structure is used for a, i, and c commands. */
    struct text_buf cmd_txt;

    /* This is used for the l, q and Q commands. */
    intmax_t int_arg;

    /* This is used for the {}, b, and t commands. */
    idx_t jump_index;

    /* This is used for the r command. */
    struct readcmd readcmd;

    /* This is used for the hairy s command. */
    struct subst *cmd_subst;

    /* This is used for the w command. */
    struct output *outf;

    /* This is used for the R command.
       (despite the struct name, it is used for both in and out files). */
    struct output *inf;

    /* These are used for the y command.
       NOTE(review): translatemb is presumably the multibyte variant of
       translate -- confirm in the y command implementation.  */
    unsigned char *translate;
    char **translatemb;

    /* This is used for the ':' command (debug only).  */
    char* label_name;
  } x;
};


/* Error reporting.  Neither function returns.  WHY is a printf-style
   format string (argument 1) followed by its arguments (from argument 2),
   as declared by _GL_ATTRIBUTE_FORMAT_PRINTF_STANDARD.
   NOTE(review): judging by the name, bad_prog_notranslate skips message
   translation -- confirm in the implementation.  */
_Noreturn void bad_prog (char const *why, ...)
  _GL_ATTRIBUTE_FORMAT_PRINTF_STANDARD (1, 2);
_Noreturn void bad_prog_notranslate (char const *why, ...)
  _GL_ATTRIBUTE_FORMAT_PRINTF_STANDARD (1, 2);

/* Script handling.  The contracts live with the implementations; only
   what the signatures show is stated here.
   normalize_text takes LEN bytes at TEXT and a text kind, and returns an
   idx_t (presumably the resulting length -- confirm).
   compile_string and compile_file take a program vector plus a script
   (in memory, or named by CMDFILE) and return a program vector.  */
idx_t normalize_text (char *text, idx_t len, enum text_types buftype);
struct vector *compile_string (struct vector *, char *str, idx_t len);
struct vector *compile_file (struct vector *, const char *cmdfile);
void check_final_program (struct vector *);
void rewind_read_files (void);
void finish_program (struct vector *);

/* Regular-expression interface.  compile_regex returns a struct regex
   and match_regex applies one to BUFLEN bytes at BUF.
   NOTE(review): the meaning of NEEDED_SUB, BUF_START_OFFSET,
   REGARRAY/REGSIZE and of match_regex's int result is defined with the
   implementation -- confirm there before relying on it.  */
struct regex *compile_regex (struct buffer *b, int flags, int needed_sub);
int match_regex (struct regex *regex,
                 char *buf, idx_t buflen, idx_t buf_start_offset,
                 struct re_registers *regarray, int regsize);
#ifdef lint
/* Declared only in builds that define lint.  */
void release_regex (struct regex *);
#endif

/* Debug printing helpers.  None of them modifies the program or command
   passed in (the pointers are to const).  */
void
debug_print_command (const struct vector *program, const struct sed_cmd *sc);
void
debug_print_program (const struct vector *program);
void
debug_print_char (char c);

/* Entry points.  process_files takes a program vector and an argv-style
   array of strings.
   NOTE(review): presumably ARGV lists the input files and the result is
   an exit status -- confirm in the implementation.  */
int process_files (struct vector *, char **argv);

int main (int, char **);

/* Global state shared between translation units.  These are
   declarations only; each object is defined elsewhere.  */

/* Locale information (struct localeinfo, presumably from localeinfo.h).  */
extern struct localeinfo localeinfo;

/* NOTE(review): presumably extra regex flags applied when extended
   syntax is selected -- confirm at the definition.  */
extern int extended_regexp_flags;

/* one-byte buffer delimiter */
extern char buffer_delimiter;

/* If set, fflush(stdout) on every line output,
   and turn off stream buffering on inputs.  */
extern bool unbuffered;

/* If set, don't write out the line unless explicitly told to. */
extern bool no_default_output;

/* If set, reset line counts on every new file. */
extern bool separate_files;

/* If set, follow symlinks when invoked with -i option */
extern bool follow_symlinks;

/* Do we need to be pedantically POSIX compliant? */
extern enum posixicity_types posixicity;

/* How long should the 'l' command's output line be? */
extern idx_t lcmd_out_line_len;

/* How do we edit files in-place? (we don't if NULL) */
extern char *in_place_extension;

/* The mode to use to read and write files, either "rt"/"w" or "rb"/"wb".  */
extern char const *read_mode;
extern char const *write_mode;

/* Should we use EREs? */
extern bool use_extended_syntax_p;

/* Declarations for multibyte character sets.  The MBRTOWC-family macros
   below take their single-byte shortcut when mb_cur_max is 1.  */
extern int mb_cur_max;
extern bool is_utf8;

/* If set, operate in 'sandbox' mode - disable e/r/w commands */
extern bool sandbox;

/* If set, print debugging information.  */
extern bool debug;

/* Multibyte helpers.  Every macro tests the global mb_cur_max first.
   When it is 1 a single-byte shortcut is taken and the arguments that
   only the multibyte call needs are not evaluated at all; otherwise the
   corresponding function is called.  No argument is evaluated more
   than once.  All parameters are parenthesized in the expansions.  */

/* Store in *PWC the wide character for the character at S.
   Single-byte case: *PWC = btowc of the byte at S, result 1; N and PS
   are ignored.  S is only read, so it is accessed through a pointer to
   const.  Otherwise: mbrtowc (PWC, S, N, PS).  */
#define MBRTOWC(pwc, s, n, ps) \
  (mb_cur_max == 1 ? \
   (*(pwc) = btowc (*(unsigned char const *) (s)), 1) : \
   mbrtowc ((pwc), (s), (n), (ps)))

/* Store at S the multibyte form of WC.
   Single-byte case: *S = wctob (WC), result 1; PS is ignored.
   Otherwise: wcrtomb (S, WC, PS).  */
#define WCRTOMB(s, wc, ps) \
  (mb_cur_max == 1 ? \
   (*(s) = wctob ((wint_t) (wc)), 1) : \
   wcrtomb ((s), (wc), (ps)))

/* Single-byte case: always 1.  Otherwise: mbsinit (S).  */
#define MBSINIT(s) \
  (mb_cur_max == 1 ? 1 : mbsinit ((s)))

/* Length of the character at S.
   Single-byte case: always 1.  Otherwise: mbrtowc (NULL, S, N, PS).  */
#define MBRLEN(s, n, ps) \
  (mb_cur_max == 1 ? 1 : mbrtowc (NULL, (s), (n), (ps)))

/* Single-byte case: always 0.  Otherwise: is_mb_char (CH, PS).  */
#define IS_MB_CHAR(ch, ps) \
  (mb_cur_max == 1 ? 0 : is_mb_char ((ch), (ps)))

/* Multibyte support functions, defined elsewhere.  is_mb_char is the
   function the IS_MB_CHAR macro calls when mb_cur_max is not 1.
   NOTE(review): initialize_mbcs presumably sets up mb_cur_max and
   is_utf8 -- confirm at the definition.  */
extern int is_mb_char (int ch, mbstate_t *ps);
extern void initialize_mbcs (void);

/* IF_LINT (Code) expands to Code when the lint macro is defined and to
   nothing otherwise.  Use it to suppress gcc's
   '...may be used before initialized' warnings.  */
#ifndef lint
# define IF_LINT(Code) /* empty */
#else
# define IF_LINT(Code) Code
#endif

/* FALLTHROUGH marks an intentional fall through into the next case
   label.  Unless something has defined it already, it is the fallthrough
   statement attribute when __GNUC__ is at least 7, and a no-op
   expression otherwise.  */
#ifndef FALLTHROUGH
# if 7 <= __GNUC__
#  define FALLTHROUGH __attribute__ ((__fallthrough__))
# else
#  define FALLTHROUGH ((void) 0)
# endif
#endif