NetBSD-Bugs archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

bin/56585: quoted elements for rs(1) (new feature, patch)



>Number:         56585
>Category:       bin
>Synopsis:       support input with quoted separators
>Confidential:   no
>Severity:       non-critical
>Priority:       medium
>Responsible:    bin-bug-people
>State:          open
>Class:          change-request
>Submitter-Id:   net
>Arrival-Date:   Mon Dec 27 17:35:01 +0000 2021
>Originator:     Martin Neitzel
>Release:        NetBSD 8.2_STABLE, 9.x, 9.99.x as of 2021-12-27
>Organization:
	Marshlabs
>Environment:
	System: NetBSD hackett.marshlabs.gaertner.de 8.2_STABLE NetBSD 8.2_STABLE (GENERIC) #3: Sat May 2 15:05:24 CEST 2020 neitzel%hackett.marshlabs.gaertner.de@localhost:/scratch/obj/sys/arch/amd64/compile/GENERIC amd64
Architecture: any
Machine: any
>Description:
	The enclosed patch provides a new "-q" option for rs(1) to
	deal with quoted fields possibly containing delimiters.
	These are common with CSV data, in particular generated by
	certain spreadsheet programs.   The actual character marking
	a quoted element is selectable (defaulting to the double
	quote).  Empty fields and embedded quotes (when doubled)
	are properly dealt with.  Quoted and unquoted fields may be
	mixed freely within the same input.

	The patch covers both the code and the man page.

>How-To-Repeat:
	Apply the patch and run:
	printf '"foo bar" baz\none two\n' | rs -q -T
>Fix:

The patch below is relative to NetBSD-8.2 and applies with minimal
fuzz to NetBSD-9.x and -9.99.x, too.

cvs diff: Diffing .
Index: rs.1
===================================================================
RCS file: /cvsroot/src/usr.bin/rs/rs.1,v
retrieving revision 1.10
diff -u -r1.10 rs.1
--- rs.1	4 Jan 2016 23:55:36 -0000	1.10
+++ rs.1	27 Dec 2021 16:32:56 -0000
@@ -37,7 +37,7 @@
 .Nd reshape a data array
 .Sh SYNOPSIS
 .Nm
-.Op Fl CcSs Op Ar x
+.Op Fl CcSsq Op Ar x
 .Op Fl GgKkw Ar N
 .Op Fl EeHhjmnTty
 .Op Ar rows Op Ar cols
@@ -88,6 +88,21 @@
 .Ar x
 is taken to be
 .Sq \&^I .
+.It Fl q Op Ar x
+Recognize a starting quote character if present as
+.Em first
+character in a column.
+The column will then extend at least to the ending quote,
+ignoring any column separators in between.
+.Em Doubled
+occurrences of the quote character after the starting quote are taken as
+quoted quote characters
+and do not end the quoted string.
+Input fields may be quoted or unquoted in an arbitrary mix.
+A missing
+.Ar x
+is taken to be the double-quote character
+.Sq \(dq .
 .It Fl e
 Consider each line of input as an array entry.
 .It Fl G Ar N
Index: rs.c
===================================================================
RCS file: /cvsroot/src/usr.bin/rs/rs.c,v
retrieving revision 1.15
diff -u -r1.15 rs.c
--- rs.c	6 Sep 2011 18:28:58 -0000	1.15
+++ rs.c	27 Dec 2021 16:32:56 -0000
@@ -92,6 +92,7 @@
 static int	propgutter;
 static char	isep = ' ', osep = ' ';
 static int	owidth = 80, gutter = 2;
+static char	iquote = '\0';
 
 static void	  usage(const char *, ...) __dead __printflike(1, 2);
 static void	  getargs(int, char *[]);
@@ -149,6 +150,30 @@
 			if (*p == isep && multisep)
 				continue;
 			icols++;
+			/*
+			 * Parse an element, could be as complex as:
+			 *	"aaa""bbb"ccc
+			 *
+			 * The first while loop deals with quoted material,
+			 * the second while loop with unquoted material (ccc).
+			 * We treat either part as optional rather than
+			 * enforcing a strict quoted-vs-unquoted alternative.
+			 */
+			/* quoted stuff: */
+			while (iquote && *p == iquote) {
+				p++;	/* scan opening quote */
+				/* scan stuff inside quotes: */
+				while (*p && *p != iquote)
+					p++;
+				/* skip closing quote: */
+				if (*p)
+					p++;
+				/*
+				 * Another iteration through this loop realizes
+				 * doubled quotes.
+				 */
+			}
+			/* unquoted stuff: */
 			while (*p && *p != isep)
 				p++;
 		}
@@ -170,6 +195,14 @@
 				*ep = empty;
 			else			/* store column entry */
 				*ep = p;
+			/* for comments on quoted/unquoted parts see above. */
+			while (iquote && *p == iquote) {
+				p++;
+				while (p < endp &&  *p != iquote)
+					p++;
+				if (*p)
+					p++;
+			}
 			while (p < endp && *p != isep)
 				p++;		/* find end of entry */
 			*p = '\0';		/* mark end of entry */
@@ -242,7 +275,7 @@
 	vwarnx(msg, ap);
 	va_end(ap);
 	fprintf(stderr,
-"usage:  rs [ -[csCS][x][kKgGw][N]tTeEnyjhHm ] [ rows [ cols ] ]\n");
+"usage:  rs [ -[csCSq][x][kKgGw][N]tTeEnyjhHm ] [ rows [ cols ] ]\n");
 	exit(1);
 }
 
@@ -419,6 +452,12 @@
 				else
 					osep = '\t';	/* default is ^I */
 				break;
+			case 'q':
+				if (p[1])
+					iquote = *++p;
+				else
+					iquote = '"';	/* default is " */
+				break;
 			case 'w':		/* window width, default 80 */
 				p = getnum(&owidth, p, 0);
 				if (owidth <= 0)




Home | Main Index | Thread Index | Old Index