Subject: bin/30504: sort(1) is chock full of boundary condition problems
To: None <gnats-admin@netbsd.org, netbsd-bugs@netbsd.org>
From: seebs <seebs@vash.cel.plethora.net>
List: netbsd-bugs
Date: 06/12/2005 06:41:01
>Number:         30504
>Category:       bin
>Synopsis:       sort mis-sorts many numbers
>Confidential:   no
>Severity:       serious
>Priority:       medium
>Responsible:    bin-bug-people
>State:          open
>Class:          sw-bug
>Submitter-Id:   net
>Arrival-Date:   Sun Jun 12 06:41:00 +0000 2005
>Originator:     seebs
>Release:        NetBSD 3.99.3
>Organization:
	N/A
>Environment:
System: NetBSD vash.cel.plethora.net 3.99.3 NetBSD 3.99.3 (VASH) #0: Wed Apr 20 01:40:35 CDT 2005 seebs@vash.cel.plethora.net:/usr/src/sys/arch/i386/compile/VASH i386
Architecture: i386
Machine: i386
>Description:
	There are a number of bugs in /bin/sort.  For instance:

	$ cat > test
	0.09
	0.1
	0.0
	^D
	$ sort -n test
	0.09
	0
	0.1

>How-To-Repeat:
	Start trying to debug a much more obscure case.

>Fix:
	This fix is largely the work of Alan Barrett, who helpfully responded
	to a mailing list query.  I added an N flag which sorts non-numeric
	values as negative infinity, because I frequently prefer this behavior
	to the POSIX-mandated behavior.  However, apart from that, this is
	straight up bug fixes.

*** /usr/src/usr.bin/sort/fields.c	Sun Mar 14 15:12:14 2004
--- fields.c	Sun Jun 12 01:33:54 2005
***************
*** 89,95 ****
  }
  		
  static u_char *enterfield __P((u_char *, u_char *, struct field *, int));
! static u_char *number __P((u_char *, u_char *, u_char *, u_char *, int));
  
  #define DECIMAL '.'
  #define OFFSET 128
--- 89,95 ----
  }
  		
  static u_char *enterfield __P((u_char *, u_char *, struct field *, int));
! static u_char *number __P((u_char *, u_char *, u_char *, u_char *, int, int));
  
  #define DECIMAL '.'
  #define OFFSET 128
***************
*** 151,172 ****
  		    fieldtable->flags)) == NULL)
  			return (1);
  
- 	keybuf->offset = keypos - keybuf->data;
- 	keybuf->length = keybuf->offset + line->size;
- 	if (keybuf->length + sizeof(TRECHEADER) > size) {
- 		/* line too long for buffer */
- 		return (1);
- 	}
- 
  	/*
  	 * Make [s]radixsort() only sort by relevant part of key if:
  	 * 1. we want to choose unique items by relevant field[s]
  	 * 2. we want stable sort and so the items should be sorted only by
  	 *    the relevant field[s]
  	 */
! 	if (UNIQUE || (stable_sort && keybuf->offset < line->size))
! 		keypos[-1] = REC_D;
  
  	memcpy(keybuf->data + keybuf->offset, line->data, line->size);
  	return (0);
  }
--- 151,175 ----
  		    fieldtable->flags)) == NULL)
  			return (1);
  
  	/*
  	 * Make [s]radixsort() only sort by relevant part of key if:
  	 * 1. we want to choose unique items by relevant field[s]
  	 * 2. we want stable sort and so the items should be sorted only by
  	 *    the relevant field[s]
  	 */
! 	if (UNIQUE || stable_sort)
! 		*keypos++ = REC_D;
  
+ 	/*
+ 	 * Append the entire line.  This will be used for final output
+ 	 * even if it is ignored by [s]radixsort().
+ 	 */
+ 	keybuf->offset = keypos - keybuf->data;
+ 	keybuf->length = keybuf->offset + line->size;
+ 	if (keybuf->length + sizeof(TRECHEADER) > size) {
+ 		/* line too long for buffer */
+ 		return (1);
+ 	}
  	memcpy(keybuf->data + keybuf->offset, line->data, line->size);
  	return (0);
  }
***************
*** 184,189 ****
--- 187,193 ----
  	struct column icol, tcol;
  	u_int flags;
  	u_int Rflag;
+ 	u_int BNflag;
  
  	icol = cur_fld->icol;
  	tcol = cur_fld->tcol;
***************
*** 210,216 ****
  
  	if (flags & N) {
  		Rflag = (gflags & R ) ^ (flags & R) ? 1 : 0;
! 		return number(tablepos, endkey, start, end, Rflag);
  	}
  
  	mask = cur_fld->mask;
--- 214,221 ----
  
  	if (flags & N) {
  		Rflag = (gflags & R ) ^ (flags & R) ? 1 : 0;
! 		BNflag = (gflags & BN ) | (flags & BN) ? 1 : 0;
! 		return number(tablepos, endkey, start, end, Rflag, BNflag);
  	}
  
  	mask = cur_fld->mask;
***************
*** 232,255 ****
  	return (tablepos == endkey ? NULL : tablepos);
  }
  
! /* Uses the first bin to assign sign, expsign, 0, and the first
   * 61 out of the exponent ( (254 - 3 origins - 4 over/underflows)/4 = 61 ).
!  *   When sorting in forward order:
!  * use (0-99) -> (130->240) for sorting the mantissa if REC_D <=128;
!  * else use (0-99)->(2-102).
!  * If the exponent is >=61, use another byte for each additional 253
   * in the exponent. Cutoff is at 567.
   * To avoid confusing the exponent and the mantissa, use a field delimiter
   * if the exponent is exactly 61, 61+252, etc--this is ok, since it's the
   * only time a field delimiter can come in that position.
   * Reverse order is done analagously.
   */
  
  static u_char *
! number(pos, bufend, line, lineend, Rflag)
  	u_char *line, *pos, *bufend, *lineend;
! 	int Rflag;
  {
  	int or_sign, parity = 0;
  	int expincr = 1, exponent = -1;
  	int bite, expsign = 1, sign = 1, zeroskip = 0;
--- 237,300 ----
  	return (tablepos == endkey ? NULL : tablepos);
  }
  
! /*
!  * number(): Convert a number into a format that can be sorted by
!  * radixsort().
!  *
!  * The first bin encodes assign sign, expsign, 0, and the first
   * 61 out of the exponent ( (254 - 3 origins - 4 over/underflows)/4 = 61 ).
!  * Subsequent bytes encode the remainder of the exponent (if the
!  * exponent was too large to be encoded entirely in the first bin),
!  * and the mantissa.
!  *
!  * The following values are used in the first bin (after being mapped
!  * via nweights[]/fnum[]/rnum[]):
!  *
!  * 255: cannot be used, because the nweights[]/fnum[]/rnum[] mapping
!  *	requires inputs in the range 0..254.
!  * 254: +overflow		(+sign, +exponent, exponent > 567)
!  * 251..253: unused
!  * 250: +sign, +exponent, exponent >= 61, exponent <= 567
!  * 190..249: +sign, +exponent, exponent > 0, exponent <= 60
!  * 189: +sign, exponent=0
!  * 129..188: +sign, -exponent, abs(exponent) > 0, abs(exponent) <= 60
!  * 128: +sign, -exponent, abs(exponent) >= 61, abs(exponent) <= 567
!  * ** 128 is ambiguous
!  * 128: +underflow		(+sign, -exponent, abs(exponent) > 567)
!  * 127: zero, or not a valid number
!  * 126: -underflow		(-sign, -exponent, abs(exponent) > 567)
!  * 125: -sign, -exponent, abs(exponent) >= 61, abs(exponent) <= 567
!  * 65..124: -sign, -exponent, abs(exponent) > 0, abs(exponent) <= 60
!  * 64: -sign, exponent=0
!  * 4..63: -sign, +exponent, exponent > 0, exponent <= 60
!  * 3: -sign, +exponent, exponent >= 61, exponent <= 567
!  * 1..2: unused
!  * 0: -overflow			(-sign, +exponent, exponent > 567)
!  *
!  * If abs(exponent) is >=61, use another bin for each additional 253
   * in the exponent. Cutoff is at 567.
   * To avoid confusing the exponent and the mantissa, use a field delimiter
   * if the exponent is exactly 61, 61+252, etc--this is ok, since it's the
   * only time a field delimiter can come in that position.
+  *
+  * After the exponent, append zero or more bins to represent the mantissa,
+  * with two mantissa digits per bin.
+  * When sorting in forward order:
+  * use (0-99) -> (130->240) for sorting the mantissa if REC_D <=128;
+  * else use (0-99)->(2-102).
   * Reverse order is done analagously.
+  *
+  * After the mantissa, append a final bin containing 0 or 254,
+  * depending on whether we want the value to sort before or after a
+  * similar value that has more digits in the mantissa.
   */
  
  static u_char *
! number(pos, bufend, line, lineend, Rflag, BNflag)
  	u_char *line, *pos, *bufend, *lineend;
! 	int Rflag, BNflag;
  {
+ 	u_char *startpos = pos;
  	int or_sign, parity = 0;
  	int expincr = 1, exponent = -1;
  	int bite, expsign = 1, sign = 1, zeroskip = 0;
***************
*** 288,294 ****
  	if (*line < '1' || *line > '9' || line >= lineend) {
  		/* only exit if we didn't skip any zero number */
  		if (!zeroskip) {
! 			*pos++ = nweights[127];
  			return (pos);
  		}
  	}
--- 333,339 ----
  	if (*line < '1' || *line > '9' || line >= lineend) {
  		/* only exit if we didn't skip any zero number */
  		if (!zeroskip) {
! 			*pos++ = BNflag ? nweights[0]: nweights[127];
  			return (pos);
  		}
  	}
***************
*** 305,311 ****
  	}
  	bite = min(exponent, 61);
  	*pos++ = nweights[(sign) ? (expsign ? 189+bite : 189-bite)
! 				: (expsign ? 64-bite : 64+bite)];
  	if (bite >= 61) {
  		do {
  			exponent -= bite;
--- 350,359 ----
  	}
  	bite = min(exponent, 61);
  	*pos++ = nweights[(sign) ? (expsign ? 189+bite : 189-bite)
! 					/* range 128 .. 250 */
! 				: (expsign ? 64-bite : 64+bite)
! 					/* range 3 .. 125 */
! 			];
  	if (bite >= 61) {
  		do {
  			exponent -= bite;
***************
*** 322,345 ****
--- 370,405 ----
  				if (pos == bufend)
  					return (NULL);
  				if (*line != '0' || lastvalue != '0')
+ 				{
  					nonzero = pos;	
+ 				}
  			} else
  				lastvalue = *line;
  			parity ^= 1;
  		} else if (*line == DECIMAL) {
  			if (!expincr)	/* a decimal already occurred once */
+ 			{
  				break;
+ 			}
  			expincr = 0;
  		} else
  			break;
  	}
+ 	if (!nonzero && lastvalue == '0')
+ 	{
+ 		pos = startpos;	
+ 	}
  	if ((parity && lastvalue != '0') || !nonzero) {
  		*pos++ = or_sign ? OFF_NTENS[lastvalue] - '0' :
  					OFF_TENS[lastvalue] + '0';
  	} else
+ 	{
  		pos = nonzero;	
+ 	}
  	if (pos > bufend-1)
+ 	{
  		return (NULL);
+ 	}
  	*pos++ = or_sign ? nweights[254] : nweights[0];
  	return (pos);
  }
***************
*** 347,352 ****
--- 407,414 ----
  /* This forces a gap around the record delimiter
   * Thus fnum has vaues over (0,254) -> ((0,REC_D-1),(REC_D+1,255));
   * rnum over (0,254) -> (255,REC_D+1),(REC_D-1,0))
+  *
+  * XXX: fnum[255] and rnum[255] are unused.
   */
  void
  num_init()
***************
*** 362,371 ****
  	}
  	for (i = 0; i < REC_D; i++) {
  		fnum[i] = i;
! 		rnum[255 - i] = i;
  	}
  	for (i = REC_D; i <255; i++) {
  		fnum[i] = i + 1;
! 		rnum[255 - i] = i - 1;
  	}
  }
--- 424,433 ----
  	}
  	for (i = 0; i < REC_D; i++) {
  		fnum[i] = i;
! 		rnum[254 - i] = i;
  	}
  	for (i = REC_D; i <255; i++) {
  		fnum[i] = i + 1;
! 		rnum[254 - i] = i - 1;
  	}
  }
*** /usr/src/usr.bin/sort/init.c	Wed Nov  3 14:14:36 2004
--- init.c	Sun Jun 12 01:30:42 2005
***************
*** 1,4 ****
! /*	$NetBSD: init.c,v 1.16 2004/11/03 20:14:36 dsl Exp $	*/
  
  /*-
   * Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
--- 1,4 ----
! /*	$NetBSD: init.c,v 1.15 2004/02/18 20:44:36 jdolecek Exp $	*/
  
  /*-
   * Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
***************
*** 71,77 ****
  #include "sort.h"
  
  #ifndef lint
! __RCSID("$NetBSD: init.c,v 1.16 2004/11/03 20:14:36 dsl Exp $");
  __SCCSID("@(#)init.c	8.1 (Berkeley) 6/6/93");
  #endif /* not lint */
  
--- 71,77 ----
  #include "sort.h"
  
  #ifndef lint
! __RCSID("$NetBSD: init.c,v 1.15 2004/02/18 20:44:36 jdolecek Exp $");
  __SCCSID("@(#)init.c	8.1 (Berkeley) 6/6/93");
  #endif /* not lint */
  
***************
*** 260,265 ****
--- 260,266 ----
  		case 'f': return (F);
  		case 'i': return (I);
  		case 'n': return (N);
+ 		case 'N': return (BN) | (N);
  		case 'r': return (R);
  		default:  return (0);
  	}
***************
*** 282,288 ****
  		if (argv[i][0] != '+' && !fplus)
  			continue;
  
! 		if (fplus && (argv[i][0] != '-' || !isdigit((unsigned char)argv[i][1]))) {
  			fplus = 0;
  			if (argv[i][0] != '+') {
  				/* not a -POS argument, skip */
--- 283,289 ----
  		if (argv[i][0] != '+' && !fplus)
  			continue;
  
! 		if (fplus && (argv[i][0] != '-' || !isdigit((int) argv[i][1]))) {
  			fplus = 0;
  			if (argv[i][0] != '+') {
  				/* not a -POS argument, skip */
*** /usr/src/usr.bin/sort/sort.1	Fri Jul 23 08:20:36 2004
--- sort.1	Sun Jun 12 01:35:02 2005
***************
*** 162,168 ****
  .Fl n
  option no longer implies the
  .Fl b
! option.)
  .It Fl r
  Reverse the sense of comparisons.
  .It Fl S
--- 162,174 ----
  .Fl n
  option no longer implies the
  .Fl b
! option.)  If there is no such numeric string, the field is treated as
! having the value zero.
! .It Fl N
! When sorting numerically, treat non-numeric fields as negative infinity,
! rather than as zero.  Implies the
! .Fl n
! option.
  .It Fl r
  Reverse the sense of comparisons.
  .It Fl S
*** /usr/src/usr.bin/sort/sort.c	Fri Jul 23 08:26:11 2004
--- sort.c	Sun Jun 12 01:30:42 2005
***************
*** 158,165 ****
  	if (!(tmpdir = getenv("TMPDIR")))
  		tmpdir = _PATH_TMP;
  
! 	while ((ch = getopt(argc, argv, "bcdfik:mHno:rR:sSt:T:ux")) != -1) {
  		switch (ch) {
  		case 'b':
  			fldtab->flags |= BI | BT;
  			break;
--- 158,168 ----
  	if (!(tmpdir = getenv("TMPDIR")))
  		tmpdir = _PATH_TMP;
  
! 	while ((ch = getopt(argc, argv, "bcdfik:mHnNo:rR:sSt:T:ux")) != -1) {
  		switch (ch) {
+ 		case 'N':
+ 			fldtab->flags |= BN | N;
+ 			break;
  		case 'b':
  			fldtab->flags |= BI | BT;
  			break;
***************
*** 360,366 ****
  	if (msg != NULL)
  		(void)fprintf(stderr, "sort: %s\n", msg);
  	(void)fprintf(stderr,
! 	    "usage: %s [-bcdfHimnrSsu] [-k field1[,field2]] [-o output]"
  	    " [-R char] [-T dir]", getprogname());
  	(void)fprintf(stderr,
  	    "             [-t char] [file ...]\n");
--- 363,369 ----
  	if (msg != NULL)
  		(void)fprintf(stderr, "sort: %s\n", msg);
  	(void)fprintf(stderr,
! 	    "usage: %s [-bcdfHimnNrSsu] [-k field1[,field2]] [-o output]"
  	    " [-R char] [-T dir]", getprogname());
  	(void)fprintf(stderr,
  	    "             [-t char] [file ...]\n");
*** /usr/src/usr.bin/sort/sort.h	Sun Feb 15 08:22:55 2004
--- sort.h	Sun Jun 12 01:32:08 2005
***************
*** 91,96 ****
--- 91,97 ----
  #define R 16		/* Field is reversed with respect to the global weight */
  #define BI 32		/* ignore blanks in icol */
  #define BT 64		/* ignore blanks in tcol */
+ #define BN 128          /* invalid numeric fields are -infinity */
  
  /* masks for delimiters: blanks, fields, and termination. */
  #define BLANK 1		/* ' ', '\t'; '\n' if -T is invoked */