summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Arnold D. Robbins <arnold@skeeve.com> 2023-03-16 18:25:43 +0200
committer: Arnold D. Robbins <arnold@skeeve.com> 2023-03-16 18:25:43 +0200
commit: 2e18b77f5b6926e6616ce22d2d3e6d511de69c9b (patch)
tree: dfb889b9411a7f126facaeb465aebb2247961d72
parent: ed09e9c66665f98eb070cc28d87abb9cb0096c3f (diff)
download: gawk-2e18b77f5b6926e6616ce22d2d3e6d511de69c9b.tar.gz
Start revamp of CSV handling.
-rw-r--r-- ChangeLog 17
-rw-r--r-- awk.h 4
-rw-r--r-- field.c 30
-rw-r--r-- io.c 45
-rw-r--r-- main.c 18
-rw-r--r-- pc/ChangeLog 4
-rw-r--r-- pc/Makefile.tst 7
-rw-r--r-- test/ChangeLog 7
-rwxr-xr-x test/Gentests 11
-rw-r--r-- test/Makefile.am 5
-rw-r--r-- test/Makefile.in 7
-rw-r--r-- test/Maketests 2
-rw-r--r-- test/badargs.ok 1
-rw-r--r-- test/csv1.awk 6
14 files changed, 148 insertions, 16 deletions
diff --git a/ChangeLog b/ChangeLog
index 11c326b9..55d16d3c 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,20 @@
+2023-03-16 Arnold D. Robbins <arnold@skeeve.com>
+
+ * awk.h (enum do_flag_values): Add DO_CSV.
+ (do_csv): New macro.
+ (init_csv_fields, init_csv_records): Add declarations.
+ * field.c (init_csv_fields): New function.
+ (set_parser): Don't set the parser if doing CSV. Add warnings.
+ * io.c (csvscan): New function (placeholder for now).
+ (init_csv_records): New function.
+ (set_RS): Don't set the parser if doing CSV. Add warnings.
+ * main.c (optab): Add new options -k/--csv.
+ (main): Fatal out if --posix and --csv. Call init_csv_records()
+ and init_csv_fields().
+ (usage): Add a line for the new options.
+ (load_procinfo): Install PROCINFO["CSV"] if doing CSV.
+ (parse_args): Update for new options.
+
2023-03-09 Arnold D. Robbins <arnold@skeeve.com>
* gawkapi.h: Update copyright year. Small edit in leading comment.
diff --git a/awk.h b/awk.h
index 661eb637..c7b589a3 100644
--- a/awk.h
+++ b/awk.h
@@ -1173,6 +1173,7 @@ extern enum do_flag_values {
DO_PROFILE = 0x02000, /* profile the program */
DO_DEBUG = 0x04000, /* debug the program */
DO_MPFR = 0x08000, /* arbitrary-precision floating-point math */
+ DO_CSV = 0x10000, /* process comma-separated-value files */
} do_flags;
#define do_traditional (do_flags & DO_TRADITIONAL)
@@ -1187,6 +1188,7 @@ extern enum do_flag_values {
#define do_sandbox (do_flags & DO_SANDBOX)
#define do_debug (do_flags & DO_DEBUG)
#define do_mpfr (do_flags & DO_MPFR)
+#define do_csv (do_flags & DO_CSV)
extern bool do_optimize;
extern int use_lc_numeric;
@@ -1569,6 +1571,7 @@ extern NODE *get_actual_argument(NODE *, int, bool);
#endif
/* field.c */
extern void init_fields(void);
+extern void init_csv_fields(void);
extern void set_record(const char *buf, int cnt, const awk_fieldwidth_info_t *);
extern void reset_record(void);
extern void rebuild_record(void);
@@ -1629,6 +1632,7 @@ extern int isdirpunct(int c);
/* io.c */
extern void init_sockets(void);
extern void init_io(void);
+extern void init_csv_records(void);
extern void register_input_parser(awk_input_parser_t *input_parser);
extern void register_output_wrapper(awk_output_wrapper_t *wrapper);
extern void register_two_way_processor(awk_two_way_processor_t *processor);
diff --git a/field.c b/field.c
index 7f20b69c..44c153dc 100644
--- a/field.c
+++ b/field.c
@@ -114,6 +114,15 @@ init_fields()
field0_valid = true;
}
+/* init_csv_fields --- set up to handle --csv */
+
+void
+init_csv_fields(void)
+{
+ if (do_csv)
+ parse_field = comma_parse_field;
+}
+
/* grow_fields --- acquire new fields as needed */
static void
@@ -771,6 +780,7 @@ sc_parse_field(long up_to, /* parse only up to this field number */
* via (*parse_field)(). This variation is for when FS is a comma,
* we do very basic CSV parsing, the same as BWK awk.
*/
+
static long
comma_parse_field(long up_to, /* parse only up to this field number */
char **buf, /* on input: string to parse; on output: point to start next */
@@ -1285,11 +1295,29 @@ do_patsplit(int nargs)
static void
set_parser(parse_field_func_t func)
{
+ /*
+ * Setting FS does nothing if CSV mode, warn in that case,
+ * but don't warn on first call which happens at initialization.
+ */
+ static bool first_time = true;
+ static bool warned = false;
+
+ if (! first_time && do_csv) {
+ if (! warned) {
+ warned = true;
+ warning(_("assignment to FS/FIELDWIDTHS/FPAT has no effect when using --csv"));
+ }
+ return;
+ }
+
normal_parse_field = func;
if (! api_parser_override && parse_field != func) {
parse_field = func;
update_PROCINFO_str("FS", current_field_sep_str());
}
+
+ if (first_time)
+ first_time = false;
}
/* set_FIELDWIDTHS --- handle an assignment to FIELDWIDTHS */
@@ -1503,8 +1531,6 @@ choose_fs_function:
else if (fs->stptr[0] == '\\')
/* same special case */
strcpy(buf, "[\\\\]");
- else if (fs->stptr[0] == ',' && ! do_posix)
- set_parser(comma_parse_field);
else
set_parser(sc_parse_field);
}
diff --git a/io.c b/io.c
index 85f56447..41167b58 100644
--- a/io.c
+++ b/io.c
@@ -265,6 +265,7 @@ static bool avoid_flush(const char *name);
static RECVALUE rs1scan(IOBUF *iop, struct recmatch *recm, SCANSTATE *state);
static RECVALUE rsnullscan(IOBUF *iop, struct recmatch *recm, SCANSTATE *state);
static RECVALUE rsrescan(IOBUF *iop, struct recmatch *recm, SCANSTATE *state);
+static RECVALUE csvscan(IOBUF *iop, struct recmatch *recm, SCANSTATE *state);
static RECVALUE (*matchrec)(IOBUF *iop, struct recmatch *recm, SCANSTATE *state) = rs1scan;
@@ -341,6 +342,15 @@ init_io()
read_can_timeout = true;
}
+/* init_csv_records --- set up for CSV handling */
+
+void
+init_csv_records(void)
+{
+ if (do_csv)
+ matchrec = csvscan;
+}
+
#if defined(__MINGW32__) || defined(__CYGWIN__)
/* binmode --- convert BINMODE to string for fopen */
@@ -3820,6 +3830,14 @@ find_longest_terminator:
return REC_OK;
}
+/* csvscan --- handle --csv mode */
+
+static RECVALUE
+csvscan(IOBUF *iop, struct recmatch *recm, SCANSTATE *state)
+{
+ return rs1scan(iop, recm, state); // XXX so it'll compile and run
+}
+
/* retryable --- return true if PROCINFO[<filename>, "RETRY"] exists */
static inline int
@@ -4069,6 +4087,13 @@ get_a_record(char **out, /* pointer to pointer to data */
void
set_RS()
{
+ /*
+ * Setting RS does nothing if CSV mode, warn in that case,
+ * but don't warn on first call which happens at initialization.
+ */
+ static bool first_time = true;
+ static bool warned = false;
+
static NODE *save_rs = NULL;
/*
@@ -4099,9 +4124,15 @@ set_RS()
refree(RS_re[1]);
RS_re[0] = RS_re[1] = RS_regexp = NULL;
+ if (! first_time && ! warned && do_csv) {
+ warned = true;
+ warning(_("assignment to RS has no effect when using --csv"));
+ }
+
if (RS->stlen == 0) {
RS_is_null = true;
- matchrec = rsnullscan;
+ if (first_time || ! do_csv)
+ matchrec = rsnullscan;
} else if ((RS->stlen > 1 || (RS->flags & REGEX) != 0) && ! do_traditional) {
static bool warned = false;
@@ -4109,17 +4140,23 @@ set_RS()
RS_re[1] = make_regexp(RS->stptr, RS->stlen, true, true, true);
RS_regexp = RS_re[IGNORECASE];
- matchrec = rsrescan;
+ if (first_time || ! do_csv)
+ matchrec = rsrescan;
if (do_lint_extensions && ! warned) {
lintwarn(_("multicharacter value of `RS' is a gawk extension"));
warned = true;
}
- } else
- matchrec = rs1scan;
+ } else {
+ if (first_time || ! do_csv)
+ matchrec = rs1scan;
+ }
set_FS:
if (current_field_sep() == Using_FS)
set_FS();
+
+ if (first_time)
+ first_time = false;
}
diff --git a/main.c b/main.c
index 9fa67f83..c161e8ee 100644
--- a/main.c
+++ b/main.c
@@ -171,6 +171,7 @@ static const struct option optab[] = {
{ "bignum", no_argument, NULL, 'M' },
{ "characters-as-bytes", no_argument, & do_binary, 'b' },
{ "copyright", no_argument, NULL, 'C' },
+ { "csv", no_argument, NULL, 'k' },
{ "debug", optional_argument, NULL, 'D' },
{ "dump-variables", optional_argument, NULL, 'd' },
{ "exec", required_argument, NULL, 'E' },
@@ -375,6 +376,9 @@ main(int argc, char **argv)
}
}
+ if (do_csv && do_posix)
+ fatal(_("`--posix' and `--csv' conflict"));
+
if (do_lint) {
if (os_is_setuid())
lintwarn(_("running %s setuid root may be a security problem"), myname);
@@ -415,6 +419,10 @@ main(int argc, char **argv)
/* Set up the special variables */
init_vars();
+ /* set up CSV */
+ init_csv_records();
+ init_csv_fields();
+
/* Set up the field variables */
init_fields();
@@ -624,6 +632,7 @@ usage(int exitval, FILE *fp)
fputs(_("\t-h\t\t\t--help\n"), fp);
fputs(_("\t-i includefile\t\t--include=includefile\n"), fp);
fputs(_("\t-I\t\t\t--trace\n"), fp);
+ fputs(_("\t-k\t\t\t--csv\n"), fp);
fputs(_("\t-l library\t\t--load=library\n"), fp);
/*
* TRANSLATORS: the "fatal", "invalid" and "no-ext" here are literal
@@ -1105,6 +1114,9 @@ load_procinfo()
update_PROCINFO_str("pma", get_pma_version());
#endif /* USE_PERSISTENT_MALLOC */
+ if (do_csv)
+ update_PROCINFO_num("CSV", 1);
+
load_procinfo_argv();
return PROCINFO_node;
}
@@ -1569,7 +1581,7 @@ parse_args(int argc, char **argv)
/*
* The + on the front tells GNU getopt not to rearrange argv.
*/
- const char *optlist = "+F:f:v:W;bcCd::D::e:E:ghi:Il:L::nNo::Op::MPrSstVYZ:";
+ const char *optlist = "+F:f:v:W;bcCd::D::e:E:ghi:kIl:L::nNo::Op::MPrSstVYZ:";
int old_optind;
int c;
char *scan;
@@ -1668,6 +1680,10 @@ parse_args(int argc, char **argv)
do_itrace = true;
break;
+ case 'k': // k is for "comma". it's a stretch, I know
+ do_flags |= DO_CSV;
+ break;
+
case 'l':
(void) add_srcfile(SRC_EXTLIB, optarg, srcfiles, NULL, NULL);
break;
diff --git a/pc/ChangeLog b/pc/ChangeLog
index e809bfac..541be9c5 100644
--- a/pc/ChangeLog
+++ b/pc/ChangeLog
@@ -1,3 +1,7 @@
+2023-03-16 Arnold D. Robbins <arnold@skeeve.com>
+
+ * Makefile.tst: Regenerated.
+
2023-03-12 Eli Zaretskii <eliz@gnu.org>
* Makefile.ext (readdir_test.$(SOEXT)): Fix typo.
diff --git a/pc/Makefile.tst b/pc/Makefile.tst
index 316d778c..72f8a9cb 100644
--- a/pc/Makefile.tst
+++ b/pc/Makefile.tst
@@ -288,9 +288,12 @@ NEED_SANDBOX = sandbox1
# List of tests that need --traditional
NEED_TRADITIONAL = litoct tradanch rscompat
-# Lists of tests that need the PMA allocator and a backing file
+# List of tests that need the PMA allocator and a backing file
NEED_PMA = pma
+# List of tests that need --csv
+NEED_CSV = csv1
+
# Lists of tests that run a shell script
RUN_SHELL = exit fflush localenl modifiers next randtest rtlen rtlen01
@@ -2721,7 +2724,7 @@ crlf:
csv1:
@echo $@
- @-AWKPATH="$(srcdir)" $(AWK) -f $@.awk < "$(srcdir)"/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@
+ @-AWKPATH="$(srcdir)" $(AWK) -f $@.awk --csv < "$(srcdir)"/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@
@-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@
dbugeval2:
diff --git a/test/ChangeLog b/test/ChangeLog
index e206fa9b..f247c540 100644
--- a/test/ChangeLog
+++ b/test/ChangeLog
@@ -1,3 +1,10 @@
+2023-03-16 Arnold D. Robbins <arnold@skeeve.com>
+
+ * Gentests: Handle NEED_CSV.
+ * Makefile.am (NEED_CSV): New list of tests that need --csv.
+ * badargs.ok: Update after code changes.
+ * csv1.awk: Adjust after code changes.
+
2023-03-09 Arnold D. Robbins <arnold@skeeve.com>
* badargs.ok: Update after code changes.
diff --git a/test/Gentests b/test/Gentests
index 42a81cff..b3a8f787 100755
--- a/test/Gentests
+++ b/test/Gentests
@@ -108,6 +108,13 @@ BEGIN {
next
}
+/^NEED_CSV *=/,/[^\\]$/ {
+ gsub(/(^NEED_CSV *=|\\$)/,"")
+ for (i = 1; i <= NF; i++)
+ csv[$i]
+ next
+}
+
/^GENTESTS_UNUSED *=/,/[^\\]$/ {
gsub(/(^GENTESTS_UNUSED *=|\\$)/,"")
for (i = 1; i <= NF; i++)
@@ -229,6 +236,10 @@ function generate(x, s, i, locale_string)
s = s " --re-interval"
delete re_interval[x]
}
+ if (x in csv) {
+ s = s " --csv"
+ delete csv[x]
+ }
if (x".in" in files) {
s = s " < \"$(srcdir)\"/$@.in"
delete files[x".in"]
diff --git a/test/Makefile.am b/test/Makefile.am
index 6d000178..7bb4c983 100644
--- a/test/Makefile.am
+++ b/test/Makefile.am
@@ -1604,9 +1604,12 @@ NEED_SANDBOX = sandbox1
# List of tests that need --traditional
NEED_TRADITIONAL = litoct tradanch rscompat
-# Lists of tests that need the PMA allocator and a backing file
+# List of tests that need the PMA allocator and a backing file
NEED_PMA = pma
+# List of tests that need --csv
+NEED_CSV = csv1
+
# Lists of tests that run a shell script
RUN_SHELL = exit fflush localenl modifiers next randtest rtlen rtlen01
diff --git a/test/Makefile.in b/test/Makefile.in
index 1cd775d1..28c5ebc5 100644
--- a/test/Makefile.in
+++ b/test/Makefile.in
@@ -1868,9 +1868,12 @@ NEED_SANDBOX = sandbox1
# List of tests that need --traditional
NEED_TRADITIONAL = litoct tradanch rscompat
-# Lists of tests that need the PMA allocator and a backing file
+# List of tests that need the PMA allocator and a backing file
NEED_PMA = pma
+# List of tests that need --csv
+NEED_CSV = csv1
+
# Lists of tests that run a shell script
RUN_SHELL = exit fflush localenl modifiers next randtest rtlen rtlen01
@@ -4484,7 +4487,7 @@ crlf:
csv1:
@echo $@
- @-AWKPATH="$(srcdir)" $(AWK) -f $@.awk < "$(srcdir)"/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@
+ @-AWKPATH="$(srcdir)" $(AWK) -f $@.awk --csv < "$(srcdir)"/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@
@-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@
dbugeval2:
diff --git a/test/Maketests b/test/Maketests
index 628ff3fa..d284aab6 100644
--- a/test/Maketests
+++ b/test/Maketests
@@ -1414,7 +1414,7 @@ crlf:
csv1:
@echo $@
- @-AWKPATH="$(srcdir)" $(AWK) -f $@.awk < "$(srcdir)"/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@
+ @-AWKPATH="$(srcdir)" $(AWK) -f $@.awk --csv < "$(srcdir)"/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@
@-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@
dbugeval2:
diff --git a/test/badargs.ok b/test/badargs.ok
index d2c67cac..1d79bc78 100644
--- a/test/badargs.ok
+++ b/test/badargs.ok
@@ -17,6 +17,7 @@ Short options: GNU long options: (extensions)
-h --help
-i includefile --include=includefile
-I --trace
+ -k --csv
-l library --load=library
-L[fatal|invalid|no-ext] --lint[=fatal|invalid|no-ext]
-M --bignum
diff --git a/test/csv1.awk b/test/csv1.awk
index 12bbf1e5..4896ef7c 100644
--- a/test/csv1.awk
+++ b/test/csv1.awk
@@ -1,6 +1,6 @@
-BEGIN {
- FS = ","
-}
+# BEGIN {
+# FS = ","
+# }
{
printf(" \t%s\t", $0)