Start revamp of CSV handling.

author: Arnold D. Robbins <arnold@skeeve.com> 2023-03-16 18:25:43 +0200
committer: Arnold D. Robbins <arnold@skeeve.com> 2023-03-16 18:25:43 +0200
commit: 2e18b77f5b6926e6616ce22d2d3e6d511de69c9b (patch)
tree: dfb889b9411a7f126facaeb465aebb2247961d72
parent: ed09e9c66665f98eb070cc28d87abb9cb0096c3f (diff)
download: gawk-2e18b77f5b6926e6616ce22d2d3e6d511de69c9b.tar.gz
14 files changed, 148 insertions, 16 deletions
diff --git a/ChangeLog b/ChangeLog
index 11c326b9..55d16d3c 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,20 @@
+2023-03-16         Arnold D. Robbins     <arnold@skeeve.com>
+
+	* awk.h (enum do_flag_values): Add DO_CSV.
+	(do_csv): New macro.
+	(init_csv_fields, init_csv_records): Add declarations.
+	* field.c (init_csv_fields): New function.
+	(set_parser): Don't set the parser if doing CSV. Add warnings.
+	* io.c (csvscan): New function (placeholder for now).
+	(init_csv_records): New function.
+	(set_RS): Don't change the record scanner if doing CSV. Add warning.
+	* main.c (optab): Add new options -k/--csv.
+	(main): Fatal out if both --posix and --csv. Call init_csv_records()
+	and init_csv_fields().
+	(usage): Add a line for the new options.
+	(load_procinfo): Install PROCINFO["CSV"] if doing CSV.
+	(parse_args): Update for new options.
+
 2023-03-09         Arnold D. Robbins     <arnold@skeeve.com>
 
 	* gawkapi.h: Update copyright year. Small edit in leading comment.
diff --git a/awk.h b/awk.h
index 661eb637..c7b589a3 100644
--- a/awk.h
+++ b/awk.h
@@ -1173,6 +1173,7 @@ extern enum do_flag_values {
 	DO_PROFILE	   = 0x02000,	/* profile the program */
 	DO_DEBUG	   = 0x04000,	/* debug the program */
 	DO_MPFR		   = 0x08000,	/* arbitrary-precision floating-point math */
+	DO_CSV		   = 0x10000,	/* process comma-separated-value files */
 } do_flags;
 
 #define do_traditional      (do_flags & DO_TRADITIONAL)
@@ -1187,6 +1188,7 @@ extern enum do_flag_values {
 #define do_sandbox          (do_flags & DO_SANDBOX)
 #define do_debug            (do_flags & DO_DEBUG)
 #define do_mpfr             (do_flags & DO_MPFR)
+#define do_csv              (do_flags & DO_CSV)
 
 extern bool do_optimize;
 extern int use_lc_numeric;
@@ -1569,6 +1571,7 @@ extern NODE *get_actual_argument(NODE *, int, bool);
 #endif
 /* field.c */
 extern void init_fields(void);
+extern void init_csv_fields(void);
 extern void set_record(const char *buf, int cnt, const awk_fieldwidth_info_t *);
 extern void reset_record(void);
 extern void rebuild_record(void);
@@ -1629,6 +1632,7 @@ extern int isdirpunct(int c);
 /* io.c */
 extern void init_sockets(void);
 extern void init_io(void);
+extern void init_csv_records(void);
 extern void register_input_parser(awk_input_parser_t *input_parser);
 extern void register_output_wrapper(awk_output_wrapper_t *wrapper);
 extern void register_two_way_processor(awk_two_way_processor_t *processor);
diff --git a/field.c b/field.c
index 7f20b69c..44c153dc 100644
--- a/field.c
+++ b/field.c
@@ -114,6 +114,15 @@ init_fields()
 	field0_valid = true;
 }
 
+/* init_csv_fields --- set up to handle --csv */
+
+void
+init_csv_fields(void)
+{
+	if (do_csv)
+		parse_field = comma_parse_field;
+}
+
 /* grow_fields --- acquire new fields as needed */
 
 static void
@@ -771,6 +780,7 @@ sc_parse_field(long up_to,	/* parse only up to this field number */
  * via (*parse_field)().  This variation is for when FS is a comma,
  * we do very basic CSV parsing, the same as BWK awk.
  */
+
 static long
 comma_parse_field(long up_to,	/* parse only up to this field number */
 	char **buf,	/* on input: string to parse; on output: point to start next */
@@ -1285,11 +1295,29 @@ do_patsplit(int nargs)
 static void
 set_parser(parse_field_func_t func)
 {
+	/*
+	 * Setting FS, FIELDWIDTHS, or FPAT does nothing in CSV mode; warn once
+	 * in that case, but not on the first call, which happens at initialization.
+	 */
+	static bool first_time = true;
+	static bool warned = false;
+
+	if (! first_time && do_csv) {
+		if (! warned) {
+			warned = true;
+			warning(_("assignment to FS/FIELDWIDTHS/FPAT has no effect when using --csv"));
+		}
+		return;
+	}
+
 	normal_parse_field = func;
 	if (! api_parser_override && parse_field != func) {
 		parse_field = func;
 	        update_PROCINFO_str("FS", current_field_sep_str());
 	}
+
+	if (first_time)
+		first_time = false;
 }
 
 /* set_FIELDWIDTHS --- handle an assignment to FIELDWIDTHS */
@@ -1503,8 +1531,6 @@ choose_fs_function:
 			else if (fs->stptr[0] == '\\')
 				/* same special case */
 				strcpy(buf, "[\\\\]");
-			else if (fs->stptr[0] == ',' && ! do_posix)
-				set_parser(comma_parse_field);
 			else
 				set_parser(sc_parse_field);
 		}
diff --git a/io.c b/io.c
index 85f56447..41167b58 100644
--- a/io.c
+++ b/io.c
@@ -265,6 +265,7 @@ static bool avoid_flush(const char *name);
 static RECVALUE rs1scan(IOBUF *iop, struct recmatch *recm, SCANSTATE *state);
 static RECVALUE rsnullscan(IOBUF *iop, struct recmatch *recm, SCANSTATE *state);
 static RECVALUE rsrescan(IOBUF *iop, struct recmatch *recm, SCANSTATE *state);
+static RECVALUE csvscan(IOBUF *iop, struct recmatch *recm, SCANSTATE *state);
 
 static RECVALUE (*matchrec)(IOBUF *iop, struct recmatch *recm, SCANSTATE *state) = rs1scan;
 
@@ -341,6 +342,15 @@ init_io()
 		read_can_timeout = true;
 }
 
+/* init_csv_records --- set up for CSV handling */
+
+void
+init_csv_records(void)
+{
+	if (do_csv)
+		matchrec = csvscan;
+}
+
 
 #if defined(__MINGW32__) || defined(__CYGWIN__)
 /* binmode --- convert BINMODE to string for fopen */
@@ -3820,6 +3830,14 @@ find_longest_terminator:
 	return REC_OK;
 }
 
+/* csvscan --- handle --csv mode */
+
+static RECVALUE
+csvscan(IOBUF *iop, struct recmatch *recm, SCANSTATE *state)
+{
+	return rs1scan(iop, recm, state);	// XXX placeholder: defer to rs1scan so it'll compile and run
+}
+
 /* retryable --- return true if PROCINFO[<filename>, "RETRY"] exists */
 
 static inline int
@@ -4069,6 +4087,13 @@ get_a_record(char **out,        /* pointer to pointer to data */
 void
 set_RS()
 {
+	/*
+	 * Setting RS does nothing in CSV mode; warn once in that case,
+	 * but not on the first call, which happens at initialization.
+	 */
+	static bool first_time = true;
+	static bool warned = false;
+
 	static NODE *save_rs = NULL;
 
 	/*
@@ -4099,9 +4124,15 @@ set_RS()
 	refree(RS_re[1]);
 	RS_re[0] = RS_re[1] = RS_regexp = NULL;
 
+	if (! first_time && ! warned && do_csv) {
+		warned = true;
+		warning(_("assignment to RS has no effect when using --csv"));
+	}
+
 	if (RS->stlen == 0) {
 		RS_is_null = true;
-		matchrec = rsnullscan;
+		if (first_time || ! do_csv)
+			matchrec = rsnullscan;
 	} else if ((RS->stlen > 1 || (RS->flags & REGEX) != 0) && ! do_traditional) {
 		static bool warned = false;
 
@@ -4109,17 +4140,23 @@ set_RS()
 		RS_re[1] = make_regexp(RS->stptr, RS->stlen, true, true, true);
 		RS_regexp = RS_re[IGNORECASE];
 
-		matchrec = rsrescan;
+		if (first_time || ! do_csv)
+			matchrec = rsrescan;
 
 		if (do_lint_extensions && ! warned) {
 			lintwarn(_("multicharacter value of `RS' is a gawk extension"));
 			warned = true;
 		}
-	} else
-		matchrec = rs1scan;
+	} else {
+		if (first_time || ! do_csv)
+			matchrec = rs1scan;
+	}
 set_FS:
 	if (current_field_sep() == Using_FS)
 		set_FS();
+
+	if (first_time)
+		first_time = false;
 }
 
 
diff --git a/main.c b/main.c
index 9fa67f83..c161e8ee 100644
--- a/main.c
+++ b/main.c
@@ -171,6 +171,7 @@ static const struct option optab[] = {
 	{ "bignum",		no_argument,		NULL,	'M' },
 	{ "characters-as-bytes", no_argument,		& do_binary,	 'b' },
 	{ "copyright",		no_argument,		NULL,	'C' },
+	{ "csv",		no_argument,		NULL,	'k' },
 	{ "debug",		optional_argument,	NULL,	'D' },
 	{ "dump-variables",	optional_argument,	NULL,	'd' },
 	{ "exec",		required_argument,	NULL,	'E' },
@@ -375,6 +376,9 @@ main(int argc, char **argv)
 		}
 	}
 
+	if (do_csv && do_posix)
+		fatal(_("`--posix' and `--csv' conflict"));
+
 	if (do_lint) {
 		if (os_is_setuid())
 			lintwarn(_("running %s setuid root may be a security problem"), myname);
@@ -415,6 +419,10 @@ main(int argc, char **argv)
 	/* Set up the special variables */
 	init_vars();
 
+	/* set up CSV */
+	init_csv_records();
+	init_csv_fields();
+
 	/* Set up the field variables */
 	init_fields();
 
@@ -624,6 +632,7 @@ usage(int exitval, FILE *fp)
 	fputs(_("\t-h\t\t\t--help\n"), fp);
 	fputs(_("\t-i includefile\t\t--include=includefile\n"), fp);
 	fputs(_("\t-I\t\t\t--trace\n"), fp);
+	fputs(_("\t-k\t\t\t--csv\n"), fp);
 	fputs(_("\t-l library\t\t--load=library\n"), fp);
 	/*
 	 * TRANSLATORS: the "fatal", "invalid" and "no-ext" here are literal
@@ -1105,6 +1114,9 @@ load_procinfo()
 	update_PROCINFO_str("pma", get_pma_version());
 #endif /* USE_PERSISTENT_MALLOC */
 
+	if (do_csv)
+		update_PROCINFO_num("CSV", 1);
+
 	load_procinfo_argv();
 	return PROCINFO_node;
 }
@@ -1569,7 +1581,7 @@ parse_args(int argc, char **argv)
 	/*
 	 * The + on the front tells GNU getopt not to rearrange argv.
 	 */
-	const char *optlist = "+F:f:v:W;bcCd::D::e:E:ghi:Il:L::nNo::Op::MPrSstVYZ:";
+	const char *optlist = "+F:f:v:W;bcCd::D::e:E:ghi:kIl:L::nNo::Op::MPrSstVYZ:";
 	int old_optind;
 	int c;
 	char *scan;
@@ -1668,6 +1680,10 @@ parse_args(int argc, char **argv)
 			do_itrace = true;
 			break;
 
+		case 'k':	// k is for "comma". It's a stretch, I know.
+			do_flags |= DO_CSV;
+			break;
+
 		case 'l':
 			(void) add_srcfile(SRC_EXTLIB, optarg, srcfiles, NULL, NULL);
 			break;
diff --git a/pc/ChangeLog b/pc/ChangeLog
index e809bfac..541be9c5 100644
--- a/pc/ChangeLog
+++ b/pc/ChangeLog
@@ -1,3 +1,7 @@
+2023-03-16         Arnold D. Robbins     <arnold@skeeve.com>
+
+	* Makefile.tst: Regenerated.
+
 2023-03-12  Eli Zaretskii  <eliz@gnu.org>
 
 	* Makefile.ext (readdir_test.$(SOEXT)): Fix typo.
diff --git a/pc/Makefile.tst b/pc/Makefile.tst
index 316d778c..72f8a9cb 100644
--- a/pc/Makefile.tst
+++ b/pc/Makefile.tst
@@ -288,9 +288,12 @@ NEED_SANDBOX = sandbox1
 # List of tests that need --traditional
 NEED_TRADITIONAL = litoct tradanch rscompat
 
-# Lists of tests that need the PMA allocator and a backing file
+# List of tests that need the PMA allocator and a backing file
 NEED_PMA = pma
 
+# List of tests that need --csv
+NEED_CSV = csv1
+
 # Lists of tests that run a shell script
 RUN_SHELL = exit fflush localenl modifiers next randtest rtlen rtlen01
 
@@ -2721,7 +2724,7 @@ crlf:
 
 csv1:
 	@echo $@
-	@-AWKPATH="$(srcdir)" $(AWK) -f $@.awk  < "$(srcdir)"/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@
+	@-AWKPATH="$(srcdir)" $(AWK) -f $@.awk  --csv < "$(srcdir)"/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@
 	@-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@
 
 dbugeval2:
diff --git a/test/ChangeLog b/test/ChangeLog
index e206fa9b..f247c540 100644
--- a/test/ChangeLog
+++ b/test/ChangeLog
@@ -1,3 +1,10 @@
+2023-03-16         Arnold D. Robbins     <arnold@skeeve.com>
+
+	* Gentests: Handle NEED_CSV.
+	* Makefile.am (NEED_CSV): New list of tests that need --csv.
+	* badargs.ok: Update after code changes.
+	* csv1.awk: Adjust after code changes.
+
 2023-03-09         Arnold D. Robbins     <arnold@skeeve.com>
 
 	* badargs.ok: Update after code changes.
diff --git a/test/Gentests b/test/Gentests
index 42a81cff..b3a8f787 100755
--- a/test/Gentests
+++ b/test/Gentests
@@ -108,6 +108,13 @@ BEGIN {
 	next
 }
 
+/^NEED_CSV *=/,/[^\\]$/ {
+	gsub(/(^NEED_CSV *=|\\$)/,"")
+	for (i = 1; i <= NF; i++)
+		csv[$i]
+	next
+}
+
 /^GENTESTS_UNUSED *=/,/[^\\]$/ {
 	gsub(/(^GENTESTS_UNUSED *=|\\$)/,"")
 	for (i = 1; i <= NF; i++)
@@ -229,6 +236,10 @@ function generate(x,	s, i, locale_string)
 		s = s " --re-interval"
 		delete re_interval[x]
 	}
+	if (x in csv) {
+		s = s " --csv"
+		delete csv[x]
+	}
 	if (x".in" in files) {
 		s = s " < \"$(srcdir)\"/$@.in"
 		delete files[x".in"]
diff --git a/test/Makefile.am b/test/Makefile.am
index 6d000178..7bb4c983 100644
--- a/test/Makefile.am
+++ b/test/Makefile.am
@@ -1604,9 +1604,12 @@ NEED_SANDBOX = sandbox1
 # List of tests that need --traditional
 NEED_TRADITIONAL = litoct tradanch rscompat
 
-# Lists of tests that need the PMA allocator and a backing file
+# List of tests that need the PMA allocator and a backing file
 NEED_PMA = pma
 
+# List of tests that need --csv
+NEED_CSV = csv1
+
 # Lists of tests that run a shell script
 RUN_SHELL = exit fflush localenl modifiers next randtest rtlen rtlen01
 
diff --git a/test/Makefile.in b/test/Makefile.in
index 1cd775d1..28c5ebc5 100644
--- a/test/Makefile.in
+++ b/test/Makefile.in
@@ -1868,9 +1868,12 @@ NEED_SANDBOX = sandbox1
 # List of tests that need --traditional
 NEED_TRADITIONAL = litoct tradanch rscompat
 
-# Lists of tests that need the PMA allocator and a backing file
+# List of tests that need the PMA allocator and a backing file
 NEED_PMA = pma
 
+# List of tests that need --csv
+NEED_CSV = csv1
+
 # Lists of tests that run a shell script
 RUN_SHELL = exit fflush localenl modifiers next randtest rtlen rtlen01
 
@@ -4484,7 +4487,7 @@ crlf:
 
 csv1:
 	@echo $@
-	@-AWKPATH="$(srcdir)" $(AWK) -f $@.awk  < "$(srcdir)"/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@
+	@-AWKPATH="$(srcdir)" $(AWK) -f $@.awk  --csv < "$(srcdir)"/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@
 	@-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@
 
 dbugeval2:
diff --git a/test/Maketests b/test/Maketests
index 628ff3fa..d284aab6 100644
--- a/test/Maketests
+++ b/test/Maketests
@@ -1414,7 +1414,7 @@ crlf:
 
 csv1:
 	@echo $@
-	@-AWKPATH="$(srcdir)" $(AWK) -f $@.awk  < "$(srcdir)"/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@
+	@-AWKPATH="$(srcdir)" $(AWK) -f $@.awk  --csv < "$(srcdir)"/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@
 	@-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@
 
 dbugeval2:
diff --git a/test/badargs.ok b/test/badargs.ok
index d2c67cac..1d79bc78 100644
--- a/test/badargs.ok
+++ b/test/badargs.ok
@@ -17,6 +17,7 @@ Short options:		GNU long options: (extensions)
 	-h			--help
 	-i includefile		--include=includefile
 	-I			--trace
+	-k			--csv
 	-l library		--load=library
 	-L[fatal|invalid|no-ext]	--lint[=fatal|invalid|no-ext]
 	-M			--bignum
diff --git a/test/csv1.awk b/test/csv1.awk
index 12bbf1e5..4896ef7c 100644
--- a/test/csv1.awk
+++ b/test/csv1.awk
@@ -1,6 +1,6 @@
-BEGIN {
-	FS = ","
-}
+# BEGIN {
+# 	FS = ","
+# }
 
 {
 	printf(" \t%s\t", $0)
author	Arnold D. Robbins <arnold@skeeve.com>	2023-03-16 18:25:43 +0200
committer	Arnold D. Robbins <arnold@skeeve.com>	2023-03-16 18:25:43 +0200
commit	2e18b77f5b6926e6616ce22d2d3e6d511de69c9b (patch)
tree	dfb889b9411a7f126facaeb465aebb2247961d72
parent	ed09e9c66665f98eb070cc28d87abb9cb0096c3f (diff)
download	gawk-2e18b77f5b6926e6616ce22d2d3e6d511de69c9b.tar.gz