Subject: Re: bin/30504
To: None <jdolecek@netbsd.org, gnats-admin@netbsd.org,>
From: Peter Seebach <seebs@plethora.net>
List: netbsd-bugs
Date: 06/14/2005 21:26:02
The following reply was made to PR bin/30504; it has been noted by GNATS.

From: seebs@plethora.net (Peter Seebach)
To: jdolecek@netbsd.org, gnats-bugs@netbsd.org
Cc: 
Subject: Re: bin/30504 
Date: Tue, 14 Jun 2005 16:25:44 -0500

 In message <20050612122725.0939363B11F@narn.netbsd.org>, wiz@netbsd.org writes:
 >Synopsis: sort mis-sorts many numbers
 >
 >Responsible-Changed-From-To: bin-bug-people->jdolecek
 >Responsible-Changed-By: wiz@netbsd.org
 >Responsible-Changed-When: Sun, 12 Jun 2005 12:27:24 +0000
 >Responsible-Changed-Why:
 >jdolecek supports sort(1).
 
 I found one more boundary condition.  On a reverse key sort, trailing zeroes
 sorted as >9 instead of <1.
 
 *** /usr/src/usr.bin/sort/fields.c	Sun Mar 14 15:12:14 2004
 --- fields.c	Tue Jun 14 16:01:58 2005
 ***************
 *** 89,95 ****
   }
   		
   static u_char *enterfield __P((u_char *, u_char *, struct field *, int));
 ! static u_char *number __P((u_char *, u_char *, u_char *, u_char *, int));
   
   #define DECIMAL '.'
   #define OFFSET 128
 --- 89,95 ----
   }
   		
   static u_char *enterfield __P((u_char *, u_char *, struct field *, int));
 ! static u_char *number __P((u_char *, u_char *, u_char *, u_char *, int, int));
   
   #define DECIMAL '.'
   #define OFFSET 128
 ***************
 *** 151,172 ****
   		    fieldtable->flags)) == NULL)
   			return (1);
   
 - 	keybuf->offset = keypos - keybuf->data;
 - 	keybuf->length = keybuf->offset + line->size;
 - 	if (keybuf->length + sizeof(TRECHEADER) > size) {
 - 		/* line too long for buffer */
 - 		return (1);
 - 	}
 - 
   	/*
   	 * Make [s]radixsort() only sort by relevant part of key if:
   	 * 1. we want to choose unique items by relevant field[s]
   	 * 2. we want stable sort and so the items should be sorted only by
   	 *    the relevant field[s]
   	 */
 ! 	if (UNIQUE || (stable_sort && keybuf->offset < line->size))
 ! 		keypos[-1] = REC_D;
   
   	memcpy(keybuf->data + keybuf->offset, line->data, line->size);
   	return (0);
   }
 --- 151,175 ----
   		    fieldtable->flags)) == NULL)
   			return (1);
   
   	/*
   	 * Make [s]radixsort() only sort by relevant part of key if:
   	 * 1. we want to choose unique items by relevant field[s]
   	 * 2. we want stable sort and so the items should be sorted only by
   	 *    the relevant field[s]
   	 */
 ! 	if (UNIQUE || stable_sort)
 ! 		*keypos++ = REC_D;
   
 + 	/*
 + 	 * Append the entire line.  This will be used for final output
 + 	 * even if it is ignored by [s]radixsort().
 + 	 */
 + 	keybuf->offset = keypos - keybuf->data;
 + 	keybuf->length = keybuf->offset + line->size;
 + 	if (keybuf->length + sizeof(TRECHEADER) > size) {
 + 		/* line too long for buffer */
 + 		return (1);
 + 	}
   	memcpy(keybuf->data + keybuf->offset, line->data, line->size);
   	return (0);
   }
 ***************
 *** 184,189 ****
 --- 187,193 ----
   	struct column icol, tcol;
   	u_int flags;
   	u_int Rflag;
 + 	u_int BNflag;
   
   	icol = cur_fld->icol;
   	tcol = cur_fld->tcol;
 ***************
 *** 210,216 ****
   
   	if (flags & N) {
   		Rflag = (gflags & R ) ^ (flags & R) ? 1 : 0;
 ! 		return number(tablepos, endkey, start, end, Rflag);
   	}
   
   	mask = cur_fld->mask;
 --- 214,221 ----
   
   	if (flags & N) {
   		Rflag = (gflags & R ) ^ (flags & R) ? 1 : 0;
 ! 		BNflag = (gflags & BN ) | (flags & BN) ? 1 : 0;
 ! 		return number(tablepos, endkey, start, end, Rflag, BNflag);
   	}
   
   	mask = cur_fld->mask;
 ***************
 *** 232,255 ****
   	return (tablepos == endkey ? NULL : tablepos);
   }
   
 ! /* Uses the first bin to assign sign, expsign, 0, and the first
    * 61 out of the exponent ( (254 - 3 origins - 4 over/underflows)/4 = 61 ).
 !  *   When sorting in forward order:
 !  * use (0-99) -> (130->240) for sorting the mantissa if REC_D <=128;
 !  * else use (0-99)->(2-102).
 !  * If the exponent is >=61, use another byte for each additional 253
    * in the exponent. Cutoff is at 567.
    * To avoid confusing the exponent and the mantissa, use a field delimiter
    * if the exponent is exactly 61, 61+252, etc--this is ok, since it's the
    * only time a field delimiter can come in that position.
    * Reverse order is done analagously.
    */
   
   static u_char *
 ! number(pos, bufend, line, lineend, Rflag)
   	u_char *line, *pos, *bufend, *lineend;
 ! 	int Rflag;
   {
   	int or_sign, parity = 0;
   	int expincr = 1, exponent = -1;
   	int bite, expsign = 1, sign = 1, zeroskip = 0;
 --- 237,300 ----
   	return (tablepos == endkey ? NULL : tablepos);
   }
   
 ! /*
 !  * number(): Convert a number into a format that can be sorted by
 !  * radixsort().
 !  *
 !  * The first bin encodes assign sign, expsign, 0, and the first
    * 61 out of the exponent ( (254 - 3 origins - 4 over/underflows)/4 = 61 ).
 !  * Subsequent bytes encode the remainder of the exponent (if the
 !  * exponent was too large to be encoded entirely in the first bin),
 !  * and the mantissa.
 !  *
 !  * The following values are used in the first bin (after being mapped
 !  * via nweights[]/fnum[]/rnum[]):
 !  *
 !  * 255: cannot be used, because the nweights[]/fnum[]/rnum[] mapping
 !  *	requires inputs in the range 0..254.
 !  * 254: +overflow		(+sign, +exponent, exponent > 567)
 !  * 251..253: unused
 !  * 250: +sign, +exponent, exponent >= 61, exponent <= 567
 !  * 190..249: +sign, +exponent, exponent > 0, exponent <= 60
 !  * 189: +sign, exponent=0
 !  * 129..188: +sign, -exponent, abs(exponent) > 0, abs(exponent) <= 60
 !  * 128: +sign, -exponent, abs(exponent) >= 61, abs(exponent) <= 567
 !  * ** 128 is ambiguous
 !  * 128: +underflow		(+sign, -exponent, abs(exponent) > 567)
 !  * 127: zero, or not a valid number
 !  * 126: -underflow		(-sign, -exponent, abs(exponent) > 567)
 !  * 125: -sign, -exponent, abs(exponent) >= 61, abs(exponent) <= 567
 !  * 65..124: -sign, -exponent, abs(exponent) > 0, abs(exponent) <= 60
 !  * 64: -sign, exponent=0
 !  * 4..63: -sign, +exponent, exponent > 0, exponent <= 60
 !  * 3: -sign, +exponent, exponent >= 61, exponent <= 567
 !  * 1..2: unused
 !  * 0: -overflow			(-sign, +exponent, exponent > 567)
 !  *
 !  * If abs(exponent) is >=61, use another bin for each additional 253
    * in the exponent. Cutoff is at 567.
    * To avoid confusing the exponent and the mantissa, use a field delimiter
    * if the exponent is exactly 61, 61+252, etc--this is ok, since it's the
    * only time a field delimiter can come in that position.
 +  *
 +  * After the exponent, append zero or more bins to represent the mantissa,
 +  * with two mantissa digits per bin.
 +  * When sorting in forward order:
 +  * use (0-99) -> (130->240) for sorting the mantissa if REC_D <=128;
 +  * else use (0-99)->(2-102).
    * Reverse order is done analagously.
 +  *
 +  * After the mantissa, append a final bin containing 0 or 254,
 +  * depending on whether we want the value to sort before or after a
 +  * similar value that has more digits in the mantissa.
    */
   
   static u_char *
 ! number(pos, bufend, line, lineend, Rflag, BNflag)
   	u_char *line, *pos, *bufend, *lineend;
 ! 	int Rflag, BNflag;
   {
 + 	u_char *startpos = pos;
   	int or_sign, parity = 0;
   	int expincr = 1, exponent = -1;
   	int bite, expsign = 1, sign = 1, zeroskip = 0;
 ***************
 *** 288,294 ****
   	if (*line < '1' || *line > '9' || line >= lineend) {
   		/* only exit if we didn't skip any zero number */
   		if (!zeroskip) {
 ! 			*pos++ = nweights[127];
   			return (pos);
   		}
   	}
 --- 333,339 ----
   	if (*line < '1' || *line > '9' || line >= lineend) {
   		/* only exit if we didn't skip any zero number */
   		if (!zeroskip) {
 ! 			*pos++ = BNflag ? nweights[0]: nweights[127];
   			return (pos);
   		}
   	}
 ***************
 *** 305,311 ****
   	}
   	bite = min(exponent, 61);
   	*pos++ = nweights[(sign) ? (expsign ? 189+bite : 189-bite)
 ! 				: (expsign ? 64-bite : 64+bite)];
   	if (bite >= 61) {
   		do {
   			exponent -= bite;
 --- 350,359 ----
   	}
   	bite = min(exponent, 61);
   	*pos++ = nweights[(sign) ? (expsign ? 189+bite : 189-bite)
 ! 					/* range 128 .. 250 */
 ! 				: (expsign ? 64-bite : 64+bite)
 ! 					/* range 3 .. 125 */
 ! 			];
   	if (bite >= 61) {
   		do {
   			exponent -= bite;
 ***************
 *** 333,338 ****
 --- 381,388 ----
   		} else
   			break;
   	}
 + 	if (!nonzero && lastvalue == '0')
 + 		pos = startpos;	
   	if ((parity && lastvalue != '0') || !nonzero) {
   		*pos++ = or_sign ? OFF_NTENS[lastvalue] - '0' :
   					OFF_TENS[lastvalue] + '0';
 ***************
 *** 340,352 ****
   		pos = nonzero;	
   	if (pos > bufend-1)
   		return (NULL);
 ! 	*pos++ = or_sign ? nweights[254] : nweights[0];
   	return (pos);
   }
   
   /* This forces a gap around the record delimiter
    * Thus fnum has vaues over (0,254) -> ((0,REC_D-1),(REC_D+1,255));
    * rnum over (0,254) -> (255,REC_D+1),(REC_D-1,0))
    */
   void
   num_init()
 --- 390,404 ----
   		pos = nonzero;	
   	if (pos > bufend-1)
   		return (NULL);
 ! 	*pos++ = (or_sign ^ Rflag) ? nweights[254] : nweights[0];
   	return (pos);
   }
   
   /* This forces a gap around the record delimiter
    * Thus fnum has vaues over (0,254) -> ((0,REC_D-1),(REC_D+1,255));
    * rnum over (0,254) -> (255,REC_D+1),(REC_D-1,0))
 +  *
 +  * XXX: fnum[255] and rnum[255] are unused.
    */
   void
   num_init()
 ***************
 *** 362,371 ****
   	}
   	for (i = 0; i < REC_D; i++) {
   		fnum[i] = i;
 ! 		rnum[255 - i] = i;
   	}
   	for (i = REC_D; i <255; i++) {
   		fnum[i] = i + 1;
 ! 		rnum[255 - i] = i - 1;
   	}
   }
 --- 414,423 ----
   	}
   	for (i = 0; i < REC_D; i++) {
   		fnum[i] = i;
 ! 		rnum[254 - i] = i;
   	}
   	for (i = REC_D; i <255; i++) {
   		fnum[i] = i + 1;
 ! 		rnum[254 - i] = i - 1;
   	}
   }
 *** /usr/src/usr.bin/sort/init.c	Wed Nov  3 14:14:36 2004
 --- init.c	Sun Jun 12 01:36:04 2005
 ***************
 *** 1,4 ****
 ! /*	$NetBSD: init.c,v 1.16 2004/11/03 20:14:36 dsl Exp $	*/
   
   /*-
    * Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
 --- 1,4 ----
 ! /*	$NetBSD: init.c,v 1.15 2004/02/18 20:44:36 jdolecek Exp $	*/
   
   /*-
    * Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
 ***************
 *** 71,77 ****
   #include "sort.h"
   
   #ifndef lint
 ! __RCSID("$NetBSD: init.c,v 1.16 2004/11/03 20:14:36 dsl Exp $");
   __SCCSID("@(#)init.c	8.1 (Berkeley) 6/6/93");
   #endif /* not lint */
   
 --- 71,77 ----
   #include "sort.h"
   
   #ifndef lint
 ! __RCSID("$NetBSD: init.c,v 1.15 2004/02/18 20:44:36 jdolecek Exp $");
   __SCCSID("@(#)init.c	8.1 (Berkeley) 6/6/93");
   #endif /* not lint */
   
 ***************
 *** 260,265 ****
 --- 260,266 ----
   		case 'f': return (F);
   		case 'i': return (I);
   		case 'n': return (N);
 + 		case 'N': return (BN) | (N);
   		case 'r': return (R);
   		default:  return (0);
   	}
 ***************
 *** 282,288 ****
   		if (argv[i][0] != '+' && !fplus)
   			continue;
   
 ! 		if (fplus && (argv[i][0] != '-' || !isdigit((unsigned char)argv[i][1]))) {
   			fplus = 0;
   			if (argv[i][0] != '+') {
   				/* not a -POS argument, skip */
 --- 283,289 ----
   		if (argv[i][0] != '+' && !fplus)
   			continue;
   
 ! 		if (fplus && (argv[i][0] != '-' || !isdigit((int) argv[i][1]))) {
   			fplus = 0;
   			if (argv[i][0] != '+') {
   				/* not a -POS argument, skip */
 *** /usr/src/usr.bin/sort/sort.1	Fri Jul 23 08:20:36 2004
 --- sort.1	Sun Jun 12 01:36:04 2005
 ***************
 *** 162,168 ****
   .Fl n
   option no longer implies the
   .Fl b
 ! option.)
   .It Fl r
   Reverse the sense of comparisons.
   .It Fl S
 --- 162,174 ----
   .Fl n
   option no longer implies the
   .Fl b
 ! option.)  If there is no such numeric string, the field is treated as
 ! having the value zero.
 ! .It Fl N
 ! When sorting numerically, treat non-numeric fields as negative infinity,
 ! rather than as zero.  Implies the
 ! .Fl n
 ! option.
   .It Fl r
   Reverse the sense of comparisons.
   .It Fl S
 *** /usr/src/usr.bin/sort/sort.c	Fri Jul 23 08:26:11 2004
 --- sort.c	Sun Jun 12 01:36:04 2005
 ***************
 *** 158,165 ****
   	if (!(tmpdir = getenv("TMPDIR")))
   		tmpdir = _PATH_TMP;
   
 ! 	while ((ch = getopt(argc, argv, "bcdfik:mHno:rR:sSt:T:ux")) != -1) {
   		switch (ch) {
   		case 'b':
   			fldtab->flags |= BI | BT;
   			break;
 --- 158,168 ----
   	if (!(tmpdir = getenv("TMPDIR")))
   		tmpdir = _PATH_TMP;
   
 ! 	while ((ch = getopt(argc, argv, "bcdfik:mHnNo:rR:sSt:T:ux")) != -1) {
   		switch (ch) {
 + 		case 'N':
 + 			fldtab->flags |= BN | N;
 + 			break;
   		case 'b':
   			fldtab->flags |= BI | BT;
   			break;
 ***************
 *** 360,366 ****
   	if (msg != NULL)
   		(void)fprintf(stderr, "sort: %s\n", msg);
   	(void)fprintf(stderr,
 ! 	    "usage: %s [-bcdfHimnrSsu] [-k field1[,field2]] [-o output]"
   	    " [-R char] [-T dir]", getprogname());
   	(void)fprintf(stderr,
   	    "             [-t char] [file ...]\n");
 --- 363,369 ----
   	if (msg != NULL)
   		(void)fprintf(stderr, "sort: %s\n", msg);
   	(void)fprintf(stderr,
 ! 	    "usage: %s [-bcdfHimnNrSsu] [-k field1[,field2]] [-o output]"
   	    " [-R char] [-T dir]", getprogname());
   	(void)fprintf(stderr,
   	    "             [-t char] [file ...]\n");
 *** /usr/src/usr.bin/sort/sort.h	Sun Feb 15 08:22:55 2004
 --- sort.h	Sun Jun 12 01:36:04 2005
 ***************
 *** 91,96 ****
 --- 91,97 ----
   #define R 16		/* Field is reversed with respect to the global weight */
   #define BI 32		/* ignore blanks in icol */
   #define BT 64		/* ignore blanks in tcol */
 + #define BN 128          /* invalid numeric fields are -infinity */
   
   /* masks for delimiters: blanks, fields, and termination. */
   #define BLANK 1		/* ' ', '\t'; '\n' if -T is invoked */