Subject: bin/30195: sort mis-sorts "0", blank space, in numeric sorts
To: None <gnats-admin@netbsd.org, netbsd-bugs@netbsd.org>
From: Seebs <seebs@guild.plethora.net>
List: netbsd-bugs
Date: 05/11/2005 09:36:00
>Number:         30195
>Category:       bin
>Synopsis:       sort sorts blank spaces weirdly, and "0" between "0.09" and "0.1"
>Confidential:   no
>Severity:       serious
>Priority:       medium
>Responsible:    bin-bug-people
>State:          open
>Class:          sw-bug
>Submitter-Id:   net
>Arrival-Date:   Wed May 11 09:36:00 +0000 2005
>Originator:     Seebs
>Release:        NetBSD 3.99.3
>Organization:
Consulting, computers, web hosting, and shell access: http://www.plethora.net/
>Environment:
Tested on 2.99 and 3.99
System: NetBSD guild.plethora.net 2.99.15 NetBSD 2.99.15 (GUILD) #0: Tue Feb 8 00:56:13 CST 2005 seebs@guild.plethora.net:/usr/src/sys/arch/i386/compile/GUILD i386
Architecture: i386
Machine: i386
>Description:
	The deep magic used by sort(1) to handle numeric keys has two subtle
	bugs, both in the case where it runs out of digits.

	The first bug, which is more obvious, is that non-numeric values do
	not sort where POSIX says they will sort, and may not be sorted
	consistently.

	The second, which is more subtle, is that "0" sorts precisely between
	0.0999999 and 0.1.

	This is arguably a refinement of, or followup to, bin/27257.

>How-To-Repeat:
	Play around with sort.
>Fix:

This fix fixes the two bugs described above.  The "zeroskip" value turns out
not to matter; it is *always* good in POSIXy sort to have a value with no
non-zero digits sort precisely between other values.  The "nweights[0]" is
there because every other way of generating a key moves pos one step too far,
to be smashed over later.  Otherwise, our zeroes get smashed to look as though
there were no key; that produces surprising (and non-POSIX) results.

***************
*** 286,296 ****
  	}
  	/* next character better be a digit */
  	if (*line < '1' || *line > '9' || line >= lineend) {
! 		/* only exit if we didn't skip any zero number */
! 		if (!zeroskip) {
! 			*pos++ = nweights[127];
! 			return (pos);
! 		}
  	}
  	if (expincr) {
  		for (tline = line-1; *++tline >= '0' && 
--- 286,295 ----
  	}
  	/* next character better be a digit */
  	if (*line < '1' || *line > '9' || line >= lineend) {
! 		/* all zeroes sort between anything with a value */
! 		*pos++ = nweights[127];
! 		*pos++ = nweights[0];
! 		return (pos);
  	}
  	if (expincr) {
  		for (tline = line-1; *++tline >= '0' && 

However, in fact, it's occasionally much nicer to cause non-numeric values to
sort as -INFINITY.  The following patch provides an extension, the -N option,
to produce that result.


*** /usr/src/usr.bin/sort/fields.c	Sun Mar 14 15:12:14 2004
--- fields.c	Wed May 11 04:18:43 2005
***************
*** 89,95 ****
  }
  		
  static u_char *enterfield __P((u_char *, u_char *, struct field *, int));
! static u_char *number __P((u_char *, u_char *, u_char *, u_char *, int));
  
  #define DECIMAL '.'
  #define OFFSET 128
--- 89,95 ----
  }
  		
  static u_char *enterfield __P((u_char *, u_char *, struct field *, int));
! static u_char *number __P((u_char *, u_char *, u_char *, u_char *, int, int));
  
  #define DECIMAL '.'
  #define OFFSET 128
***************
*** 184,189 ****
--- 184,190 ----
  	struct column icol, tcol;
  	u_int flags;
  	u_int Rflag;
+ 	u_int BNflag;
  
  	icol = cur_fld->icol;
  	tcol = cur_fld->tcol;
***************
*** 210,216 ****
  
  	if (flags & N) {
  		Rflag = (gflags & R ) ^ (flags & R) ? 1 : 0;
! 		return number(tablepos, endkey, start, end, Rflag);
  	}
  
  	mask = cur_fld->mask;
--- 211,218 ----
  
  	if (flags & N) {
  		Rflag = (gflags & R ) ^ (flags & R) ? 1 : 0;
! 		BNflag = (gflags & BN ) | (flags & BN) ? 1 : 0;
! 		return number(tablepos, endkey, start, end, Rflag, BNflag);
  	}
  
  	mask = cur_fld->mask;
***************
*** 246,254 ****
   */
  
  static u_char *
! number(pos, bufend, line, lineend, Rflag)
  	u_char *line, *pos, *bufend, *lineend;
! 	int Rflag;
  {
  	int or_sign, parity = 0;
  	int expincr = 1, exponent = -1;
--- 248,256 ----
   */
  
  static u_char *
! number(pos, bufend, line, lineend, Rflag, BNflag)
  	u_char *line, *pos, *bufend, *lineend;
! 	int Rflag, BNflag;
  {
  	int or_sign, parity = 0;
  	int expincr = 1, exponent = -1;
***************
*** 288,294 ****
--- 290,305 ----
  	if (*line < '1' || *line > '9' || line >= lineend) {
  		/* only exit if we didn't skip any zero number */
  		if (!zeroskip) {
+ 			if (BNflag) {
+ 				*pos++ = nweights[1];
+ 			} else {
+ 				*pos++ = nweights[127];
+ 			}
+ 			*pos++ = nweights[0];
+ 			return (pos);
+ 		} else {
  			*pos++ = nweights[127];
+ 			*pos++ = nweights[0];
  			return (pos);
  		}
  	}
*** /usr/src/usr.bin/sort/files.c	Sun Feb 15 05:52:12 2004
--- files.c	Wed May 11 04:18:43 2005
***************
*** 371,377 ****
  	const RECHEADER *rec;
  	FILE *fp;
  {
! 	EWRITE(rec->data+rec->offset, 1, rec->length - rec->offset, fp);
  }
  
  /*
--- 371,386 ----
  	const RECHEADER *rec;
  	FILE *fp;
  {
! 	if (debug) {
! 		int i;
! 		for (i = 0; i < rec->offset; ++i) {
! 			fprintf(fp, "%02x", rec->data[i] & 0xff);
! 		}
! 		fprintf(fp, ":");
! 		EWRITE(rec->data+rec->offset, 1, rec->length - rec->offset, fp);
! 	} else {
! 		EWRITE(rec->data+rec->offset, 1, rec->length - rec->offset, fp);
! 	}
  }
  
  /*
*** /usr/src/usr.bin/sort/init.c	Wed Nov  3 14:14:36 2004
--- init.c	Wed May 11 04:18:43 2005
***************
*** 1,4 ****
! /*	$NetBSD: init.c,v 1.16 2004/11/03 20:14:36 dsl Exp $	*/
  
  /*-
   * Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
--- 1,4 ----
! /*	$NetBSD: init.c,v 1.15 2004/02/18 20:44:36 jdolecek Exp $	*/
  
  /*-
   * Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
***************
*** 71,77 ****
  #include "sort.h"
  
  #ifndef lint
! __RCSID("$NetBSD: init.c,v 1.16 2004/11/03 20:14:36 dsl Exp $");
  __SCCSID("@(#)init.c	8.1 (Berkeley) 6/6/93");
  #endif /* not lint */
  
--- 71,77 ----
  #include "sort.h"
  
  #ifndef lint
! __RCSID("$NetBSD: init.c,v 1.15 2004/02/18 20:44:36 jdolecek Exp $");
  __SCCSID("@(#)init.c	8.1 (Berkeley) 6/6/93");
  #endif /* not lint */
  
***************
*** 260,265 ****
--- 260,266 ----
  		case 'f': return (F);
  		case 'i': return (I);
  		case 'n': return (N);
+ 		case 'N': return (BN) | (N);
  		case 'r': return (R);
  		default:  return (0);
  	}
***************
*** 282,288 ****
  		if (argv[i][0] != '+' && !fplus)
  			continue;
  
! 		if (fplus && (argv[i][0] != '-' || !isdigit((unsigned char)argv[i][1]))) {
  			fplus = 0;
  			if (argv[i][0] != '+') {
  				/* not a -POS argument, skip */
--- 283,289 ----
  		if (argv[i][0] != '+' && !fplus)
  			continue;
  
! 		if (fplus && (argv[i][0] != '-' || !isdigit(argv[i][1]))) {
  			fplus = 0;
  			if (argv[i][0] != '+') {
  				/* not a -POS argument, skip */
*** /usr/src/usr.bin/sort/sort.c	Fri Jul 23 08:26:11 2004
--- sort.c	Wed May 11 04:18:43 2005
***************
*** 158,165 ****
  	if (!(tmpdir = getenv("TMPDIR")))
  		tmpdir = _PATH_TMP;
  
! 	while ((ch = getopt(argc, argv, "bcdfik:mHno:rR:sSt:T:ux")) != -1) {
  		switch (ch) {
  		case 'b':
  			fldtab->flags |= BI | BT;
  			break;
--- 158,168 ----
  	if (!(tmpdir = getenv("TMPDIR")))
  		tmpdir = _PATH_TMP;
  
! 	while ((ch = getopt(argc, argv, "bcdfik:mHnNo:rR:sSt:T:ux")) != -1) {
  		switch (ch) {
+ 		case 'N':
+ 			fldtab->flags |= BN | N;
+ 			break;
  		case 'b':
  			fldtab->flags |= BI | BT;
  			break;
***************
*** 360,366 ****
  	if (msg != NULL)
  		(void)fprintf(stderr, "sort: %s\n", msg);
  	(void)fprintf(stderr,
! 	    "usage: %s [-bcdfHimnrSsu] [-k field1[,field2]] [-o output]"
  	    " [-R char] [-T dir]", getprogname());
  	(void)fprintf(stderr,
  	    "             [-t char] [file ...]\n");
--- 363,369 ----
  	if (msg != NULL)
  		(void)fprintf(stderr, "sort: %s\n", msg);
  	(void)fprintf(stderr,
! 	    "usage: %s [-bcdfHimnNrSsu] [-k field1[,field2]] [-o output]"
  	    " [-R char] [-T dir]", getprogname());
  	(void)fprintf(stderr,
  	    "             [-t char] [file ...]\n");
*** /usr/src/usr.bin/sort/sort.h	Sun Feb 15 08:22:55 2004
--- sort.h	Wed May 11 04:18:43 2005
***************
*** 91,96 ****
--- 91,97 ----
  #define R 16		/* Field is reversed with respect to the global weight */
  #define BI 32		/* ignore blanks in icol */
  #define BT 64		/* ignore blanks in tcol */
+ #define BN 128          /* invalid numeric fields are -infinity */
  
  /* masks for delimiters: blanks, fields, and termination. */
  #define BLANK 1		/* ' ', '\t'; '\n' if -T is invoked */