Subject: bin/27257: sort(1) not quite POSIX compliant, also lacks interesting feature
To: None <gnats-bugs@gnats.NetBSD.org>
From: seebs <seebs@vash.cel.plethora.net>
List: netbsd-bugs
Date: 10/13/2004 23:47:10
>Number:         27257
>Category:       bin
>Synopsis:       sort(1) doesn't handle blank fields correctly in some cases
>Confidential:   no
>Severity:       serious
>Priority:       medium
>Responsible:    bin-bug-people
>State:          open
>Class:          sw-bug
>Submitter-Id:   net
>Arrival-Date:   Thu Oct 14 04:48:00 UTC 2004
>Closed-Date:
>Last-Modified:
>Originator:     seebs
>Release:        NetBSD 2.99.9
>Organization:
	N/A
>Environment:
System: NetBSD vash.cel.plethora.net 2.99.9 NetBSD 2.99.9 (VASH) #1: Mon Oct 11 03:22:12 CDT 2004 seebs@vash.cel.plethora.net:/usr/src/sys/arch/i386/compile/VASH i386
Architecture: i386
Machine: i386
>Description:
	There are three issues here.

	1.  sort(1) doesn't handle blank fields correctly when performing a
	stable numeric sort.
	2.  the man page doesn't mention how it should work
	3.  I wanted an extra feature in sort.
>How-To-Repeat:
	$ sort -n <<!
	x
	-1
	0
	1
	!

	What's wrong?  What's wrong is that x should sort next to 0, but
	in fact, doesn't.  POSIX says that an empty digit string should sort
	as zero.

	In fact, though, for my purposes, I'd rather have blank strings sort
	as negative infinity.  This is non-standard, so I propose an extension
	to do it.

>Fix:

	This requires careful study.

	Sort works by building a key in front of each line.  Multiple
	keys are just appended.  After each key, a weight[0] (either
	nweights[0] for numbers, or lweights[0] for text keys) is appended.
	If the sort is stable, it's then whacked with a record separator.

	Except that, in the case where no digits were found, only ONE
	character was put in the key, and THAT character gets whacked.

	So, the simple solution to the POSIX problem is to add
		*pos++ = nweights[0];
	to the code for the !zeroskip case in number().

	That's a real live bug fix.

	However, in many cases, "no numeric data" makes more sense sorted
	separately out from all of the numeric data, and people might want
	this.  I propose an extension, the '-N' flag, which provides these
	semantics.  The following patch implements and documents that, as
	well as the actual bug fix.

	This version of sort passes all the regression tests we did before,
	so I don't think I broke anything important.

*** src/fields.c	Mon Mar 15 01:34:29 2004
--- fields.c	Wed Oct 13 12:31:01 2004
***************
*** 89,95 ****
  }
  		
  static u_char *enterfield __P((u_char *, u_char *, struct field *, int));
! static u_char *number __P((u_char *, u_char *, u_char *, u_char *, int));
  
  #define DECIMAL '.'
  #define OFFSET 128
--- 89,95 ----
  }
  		
  static u_char *enterfield __P((u_char *, u_char *, struct field *, int));
! static u_char *number __P((u_char *, u_char *, u_char *, u_char *, int, int));
  
  #define DECIMAL '.'
  #define OFFSET 128
***************
*** 184,189 ****
--- 184,190 ----
  	struct column icol, tcol;
  	u_int flags;
  	u_int Rflag;
+ 	u_int BNflag;
  
  	icol = cur_fld->icol;
  	tcol = cur_fld->tcol;
***************
*** 210,216 ****
  
  	if (flags & N) {
  		Rflag = (gflags & R ) ^ (flags & R) ? 1 : 0;
! 		return number(tablepos, endkey, start, end, Rflag);
  	}
  
  	mask = cur_fld->mask;
--- 211,218 ----
  
  	if (flags & N) {
  		Rflag = (gflags & R ) ^ (flags & R) ? 1 : 0;
! 		BNflag = (gflags & BN ) | (flags & BN) ? 1 : 0;
! 		return number(tablepos, endkey, start, end, Rflag, BNflag);
  	}
  
  	mask = cur_fld->mask;
***************
*** 246,254 ****
   */
  
  static u_char *
! number(pos, bufend, line, lineend, Rflag)
  	u_char *line, *pos, *bufend, *lineend;
! 	int Rflag;
  {
  	int or_sign, parity = 0;
  	int expincr = 1, exponent = -1;
--- 248,256 ----
   */
  
  static u_char *
! number(pos, bufend, line, lineend, Rflag, BNflag)
  	u_char *line, *pos, *bufend, *lineend;
! 	int Rflag, BNflag;
  {
  	int or_sign, parity = 0;
  	int expincr = 1, exponent = -1;
***************
*** 288,294 ****
  	if (*line < '1' || *line > '9' || line >= lineend) {
  		/* only exit if we didn't skip any zero number */
  		if (!zeroskip) {
! 			*pos++ = nweights[127];
  			return (pos);
  		}
  	}
--- 290,301 ----
  	if (*line < '1' || *line > '9' || line >= lineend) {
  		/* only exit if we didn't skip any zero number */
  		if (!zeroskip) {
! 			if (BNflag) {
! 				*pos++ = nweights[1];
! 			} else {
! 				*pos++ = nweights[127];
! 			}
! 			*pos++ = nweights[0];
  			return (pos);
  		}
  	}
*** src/init.c	Sun Feb 22 08:20:27 2004
--- init.c	Wed Oct 13 12:33:29 2004
***************
*** 260,265 ****
--- 260,266 ----
  		case 'f': return (F);
  		case 'i': return (I);
  		case 'n': return (N);
+ 		case 'N': return (BN) | (N);
  		case 'r': return (R);
  		default:  return (0);
  	}
*** src/sort.c	Wed Jul 28 01:48:44 2004
--- sort.c	Wed Oct 13 12:26:23 2004
***************
*** 158,165 ****
  	if (!(tmpdir = getenv("TMPDIR")))
  		tmpdir = _PATH_TMP;
  
! 	while ((ch = getopt(argc, argv, "bcdfik:mHno:rR:sSt:T:ux")) != -1) {
  		switch (ch) {
  		case 'b':
  			fldtab->flags |= BI | BT;
  			break;
--- 158,168 ----
  	if (!(tmpdir = getenv("TMPDIR")))
  		tmpdir = _PATH_TMP;
  
! 	while ((ch = getopt(argc, argv, "bcdfik:mHnNo:rR:sSt:T:ux")) != -1) {
  		switch (ch) {
+ 		case 'N':
+ 			fldtab->flags |= BN | N;
+ 			break;
  		case 'b':
  			fldtab->flags |= BI | BT;
  			break;
***************
*** 360,366 ****
  	if (msg != NULL)
  		(void)fprintf(stderr, "sort: %s\n", msg);
  	(void)fprintf(stderr,
! 	    "usage: %s [-bcdfHimnrSsu] [-k field1[,field2]] [-o output]"
  	    " [-R char] [-T dir]", getprogname());
  	(void)fprintf(stderr,
  	    "             [-t char] [file ...]\n");
--- 363,369 ----
  	if (msg != NULL)
  		(void)fprintf(stderr, "sort: %s\n", msg);
  	(void)fprintf(stderr,
! 	    "usage: %s [-bcdfHimnNrSsu] [-k field1[,field2]] [-o output]"
  	    " [-R char] [-T dir]", getprogname());
  	(void)fprintf(stderr,
  	    "             [-t char] [file ...]\n");
*** src/sort.h	Sun Feb 22 08:20:27 2004
--- sort.h	Tue Oct 12 19:02:01 2004
***************
*** 91,96 ****
--- 91,97 ----
  #define R 16		/* Field is reversed with respect to the global weight */
  #define BI 32		/* ignore blanks in icol */
  #define BT 64		/* ignore blanks in tcol */
+ #define BN 128          /* invalid numeric fields are -infinity */
  
  /* masks for delimiters: blanks, fields, and termination. */
  #define BLANK 1		/* ' ', '\t'; '\n' if -T is invoked */
>Release-Note:
>Audit-Trail:
>Unformatted: