Subject: bin/30504: sort(1) is chock full of boundary condition problems
To: None <gnats-admin@netbsd.org, netbsd-bugs@netbsd.org>
From: seebs <seebs@vash.cel.plethora.net>
List: netbsd-bugs
Date: 06/12/2005 06:41:01
>Number: 30504
>Category: bin
>Synopsis: sort mis-sorts many numbers
>Confidential: no
>Severity: serious
>Priority: medium
>Responsible: bin-bug-people
>State: open
>Class: sw-bug
>Submitter-Id: net
>Arrival-Date: Sun Jun 12 06:41:00 +0000 2005
>Originator: seebs
>Release: NetBSD 3.99.3
>Organization:
N/A
>Environment:
System: NetBSD vash.cel.plethora.net 3.99.3 NetBSD 3.99.3 (VASH) #0: Wed Apr 20 01:40:35 CDT 2005 seebs@vash.cel.plethora.net:/usr/src/sys/arch/i386/compile/VASH i386
Architecture: i386
Machine: i386
>Description:
There are a number of bugs in /bin/sort. For instance:
$ cat > test
0.09
0.1
0.0
^D
$ sort -n test
0.09
0
0.1
>How-To-Repeat:
Start trying to debug a much more obscure case.
>Fix:
This fix is largely the work of Alan Barrett, who helpfully responded
to a mailing list query. I added an N flag which sorts non-numeric
values as negative infinity, because I frequently prefer this behavior
to the POSIX-mandated behavior. However, apart from that, this is
straight up bug fixes.
*** /usr/src/usr.bin/sort/fields.c Sun Mar 14 15:12:14 2004
--- fields.c Sun Jun 12 01:33:54 2005
***************
*** 89,95 ****
}
static u_char *enterfield __P((u_char *, u_char *, struct field *, int));
! static u_char *number __P((u_char *, u_char *, u_char *, u_char *, int));
#define DECIMAL '.'
#define OFFSET 128
--- 89,95 ----
}
static u_char *enterfield __P((u_char *, u_char *, struct field *, int));
! static u_char *number __P((u_char *, u_char *, u_char *, u_char *, int, int));
#define DECIMAL '.'
#define OFFSET 128
***************
*** 151,172 ****
fieldtable->flags)) == NULL)
return (1);
- keybuf->offset = keypos - keybuf->data;
- keybuf->length = keybuf->offset + line->size;
- if (keybuf->length + sizeof(TRECHEADER) > size) {
- /* line too long for buffer */
- return (1);
- }
-
/*
* Make [s]radixsort() only sort by relevant part of key if:
* 1. we want to choose unique items by relevant field[s]
* 2. we want stable sort and so the items should be sorted only by
* the relevant field[s]
*/
! if (UNIQUE || (stable_sort && keybuf->offset < line->size))
! keypos[-1] = REC_D;
memcpy(keybuf->data + keybuf->offset, line->data, line->size);
return (0);
}
--- 151,175 ----
fieldtable->flags)) == NULL)
return (1);
/*
* Make [s]radixsort() only sort by relevant part of key if:
* 1. we want to choose unique items by relevant field[s]
* 2. we want stable sort and so the items should be sorted only by
* the relevant field[s]
*/
! if (UNIQUE || stable_sort)
! *keypos++ = REC_D;
+ /*
+ * Append the entire line. This will be used for final output
+ * even if it is ignored by [s]radixsort().
+ */
+ keybuf->offset = keypos - keybuf->data;
+ keybuf->length = keybuf->offset + line->size;
+ if (keybuf->length + sizeof(TRECHEADER) > size) {
+ /* line too long for buffer */
+ return (1);
+ }
memcpy(keybuf->data + keybuf->offset, line->data, line->size);
return (0);
}
***************
*** 184,189 ****
--- 187,193 ----
struct column icol, tcol;
u_int flags;
u_int Rflag;
+ u_int BNflag;
icol = cur_fld->icol;
tcol = cur_fld->tcol;
***************
*** 210,216 ****
if (flags & N) {
Rflag = (gflags & R ) ^ (flags & R) ? 1 : 0;
! return number(tablepos, endkey, start, end, Rflag);
}
mask = cur_fld->mask;
--- 214,221 ----
if (flags & N) {
Rflag = (gflags & R ) ^ (flags & R) ? 1 : 0;
! BNflag = (gflags & BN ) | (flags & BN) ? 1 : 0;
! return number(tablepos, endkey, start, end, Rflag, BNflag);
}
mask = cur_fld->mask;
***************
*** 232,255 ****
return (tablepos == endkey ? NULL : tablepos);
}
! /* Uses the first bin to assign sign, expsign, 0, and the first
* 61 out of the exponent ( (254 - 3 origins - 4 over/underflows)/4 = 61 ).
! * When sorting in forward order:
! * use (0-99) -> (130->240) for sorting the mantissa if REC_D <=128;
! * else use (0-99)->(2-102).
! * If the exponent is >=61, use another byte for each additional 253
* in the exponent. Cutoff is at 567.
* To avoid confusing the exponent and the mantissa, use a field delimiter
* if the exponent is exactly 61, 61+252, etc--this is ok, since it's the
* only time a field delimiter can come in that position.
* Reverse order is done analagously.
*/
static u_char *
! number(pos, bufend, line, lineend, Rflag)
u_char *line, *pos, *bufend, *lineend;
! int Rflag;
{
int or_sign, parity = 0;
int expincr = 1, exponent = -1;
int bite, expsign = 1, sign = 1, zeroskip = 0;
--- 237,300 ----
return (tablepos == endkey ? NULL : tablepos);
}
! /*
! * number(): Convert a number into a format that can be sorted by
! * radixsort().
! *
! * The first bin encodes assign sign, expsign, 0, and the first
* 61 out of the exponent ( (254 - 3 origins - 4 over/underflows)/4 = 61 ).
! * Subsequent bytes encode the remainder of the exponent (if the
! * exponent was too large to be encoded entirely in the first bin),
! * and the mantissa.
! *
! * The following values are used in the first bin (after being mapped
! * via nweights[]/fnum[]/rnum[]):
! *
! * 255: cannot be used, because the nweights[]/fnum[]/rnum[] mapping
! * requires inputs in the range 0..254.
! * 254: +overflow (+sign, +exponent, exponent > 567)
! * 251..253: unused
! * 250: +sign, +exponent, exponent >= 61, exponent <= 567
! * 190..249: +sign, +exponent, exponent > 0, exponent <= 60
! * 189: +sign, exponent=0
! * 129..188: +sign, -exponent, abs(exponent) > 0, abs(exponent) <= 60
! * 128: +sign, -exponent, abs(exponent) >= 61, abs(exponent) <= 567
! * ** 128 is ambiguous
! * 128: +underflow (+sign, -exponent, abs(exponent) > 567)
! * 127: zero, or not a valid number
! * 126: -underflow (-sign, -exponent, abs(exponent) > 567)
! * 125: -sign, -exponent, abs(exponent) >= 61, abs(exponent) <= 567
! * 65..124: -sign, -exponent, abs(exponent) > 0, abs(exponent) <= 60
! * 64: -sign, exponent=0
! * 4..63: -sign, +exponent, exponent > 0, exponent <= 60
! * 3: -sign, +exponent, exponent >= 61, exponent <= 567
! * 1..2: unused
! * 0: -overflow (-sign, +exponent, exponent > 567)
! *
! * If abs(exponent) is >=61, use another bin for each additional 253
* in the exponent. Cutoff is at 567.
* To avoid confusing the exponent and the mantissa, use a field delimiter
* if the exponent is exactly 61, 61+252, etc--this is ok, since it's the
* only time a field delimiter can come in that position.
+ *
+ * After the exponent, append zero or more bins to represent the mantissa,
+ * with two mantissa digits per bin.
+ * When sorting in forward order:
+ * use (0-99) -> (130->240) for sorting the mantissa if REC_D <=128;
+ * else use (0-99)->(2-102).
* Reverse order is done analagously.
+ *
+ * After the mantissa, append a final bin containing 0 or 254,
+ * depending on whether we want the value to sort before or after a
+ * similar value that has more digits in the mantissa.
*/
static u_char *
! number(pos, bufend, line, lineend, Rflag, BNflag)
u_char *line, *pos, *bufend, *lineend;
! int Rflag, BNflag;
{
+ u_char *startpos = pos;
int or_sign, parity = 0;
int expincr = 1, exponent = -1;
int bite, expsign = 1, sign = 1, zeroskip = 0;
***************
*** 288,294 ****
if (*line < '1' || *line > '9' || line >= lineend) {
/* only exit if we didn't skip any zero number */
if (!zeroskip) {
! *pos++ = nweights[127];
return (pos);
}
}
--- 333,339 ----
if (*line < '1' || *line > '9' || line >= lineend) {
/* only exit if we didn't skip any zero number */
if (!zeroskip) {
! *pos++ = BNflag ? nweights[0]: nweights[127];
return (pos);
}
}
***************
*** 305,311 ****
}
bite = min(exponent, 61);
*pos++ = nweights[(sign) ? (expsign ? 189+bite : 189-bite)
! : (expsign ? 64-bite : 64+bite)];
if (bite >= 61) {
do {
exponent -= bite;
--- 350,359 ----
}
bite = min(exponent, 61);
*pos++ = nweights[(sign) ? (expsign ? 189+bite : 189-bite)
! /* range 128 .. 250 */
! : (expsign ? 64-bite : 64+bite)
! /* range 3 .. 125 */
! ];
if (bite >= 61) {
do {
exponent -= bite;
***************
*** 322,345 ****
--- 370,405 ----
if (pos == bufend)
return (NULL);
if (*line != '0' || lastvalue != '0')
+ {
nonzero = pos;
+ }
} else
lastvalue = *line;
parity ^= 1;
} else if (*line == DECIMAL) {
if (!expincr) /* a decimal already occurred once */
+ {
break;
+ }
expincr = 0;
} else
break;
}
+ if (!nonzero && lastvalue == '0')
+ {
+ pos = startpos;
+ }
if ((parity && lastvalue != '0') || !nonzero) {
*pos++ = or_sign ? OFF_NTENS[lastvalue] - '0' :
OFF_TENS[lastvalue] + '0';
} else
+ {
pos = nonzero;
+ }
if (pos > bufend-1)
+ {
return (NULL);
+ }
*pos++ = or_sign ? nweights[254] : nweights[0];
return (pos);
}
***************
*** 347,352 ****
--- 407,414 ----
/* This forces a gap around the record delimiter
* Thus fnum has vaues over (0,254) -> ((0,REC_D-1),(REC_D+1,255));
* rnum over (0,254) -> (255,REC_D+1),(REC_D-1,0))
+ *
+ * XXX: fnum[255] and rnum[255] are unused.
*/
void
num_init()
***************
*** 362,371 ****
}
for (i = 0; i < REC_D; i++) {
fnum[i] = i;
! rnum[255 - i] = i;
}
for (i = REC_D; i <255; i++) {
fnum[i] = i + 1;
! rnum[255 - i] = i - 1;
}
}
--- 424,433 ----
}
for (i = 0; i < REC_D; i++) {
fnum[i] = i;
! rnum[254 - i] = i;
}
for (i = REC_D; i <255; i++) {
fnum[i] = i + 1;
! rnum[254 - i] = i - 1;
}
}
*** /usr/src/usr.bin/sort/init.c Wed Nov 3 14:14:36 2004
--- init.c Sun Jun 12 01:30:42 2005
***************
*** 1,4 ****
! /* $NetBSD: init.c,v 1.16 2004/11/03 20:14:36 dsl Exp $ */
/*-
* Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
--- 1,4 ----
! /* $NetBSD: init.c,v 1.15 2004/02/18 20:44:36 jdolecek Exp $ */
/*-
* Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
***************
*** 71,77 ****
#include "sort.h"
#ifndef lint
! __RCSID("$NetBSD: init.c,v 1.16 2004/11/03 20:14:36 dsl Exp $");
__SCCSID("@(#)init.c 8.1 (Berkeley) 6/6/93");
#endif /* not lint */
--- 71,77 ----
#include "sort.h"
#ifndef lint
! __RCSID("$NetBSD: init.c,v 1.15 2004/02/18 20:44:36 jdolecek Exp $");
__SCCSID("@(#)init.c 8.1 (Berkeley) 6/6/93");
#endif /* not lint */
***************
*** 260,265 ****
--- 260,266 ----
case 'f': return (F);
case 'i': return (I);
case 'n': return (N);
+ case 'N': return (BN) | (N);
case 'r': return (R);
default: return (0);
}
***************
*** 282,288 ****
if (argv[i][0] != '+' && !fplus)
continue;
! if (fplus && (argv[i][0] != '-' || !isdigit((unsigned char)argv[i][1]))) {
fplus = 0;
if (argv[i][0] != '+') {
/* not a -POS argument, skip */
--- 283,289 ----
if (argv[i][0] != '+' && !fplus)
continue;
! if (fplus && (argv[i][0] != '-' || !isdigit((int) argv[i][1]))) {
fplus = 0;
if (argv[i][0] != '+') {
/* not a -POS argument, skip */
*** /usr/src/usr.bin/sort/sort.1 Fri Jul 23 08:20:36 2004
--- sort.1 Sun Jun 12 01:35:02 2005
***************
*** 162,168 ****
.Fl n
option no longer implies the
.Fl b
! option.)
.It Fl r
Reverse the sense of comparisons.
.It Fl S
--- 162,174 ----
.Fl n
option no longer implies the
.Fl b
! option.) If there is no such numeric string, the field is treated as
! having the value zero.
! .It Fl N
! When sorting numerically, treat non-numeric fields as negative infinity,
! rather than as zero. Implies the
! .Fl n
! option.
.It Fl r
Reverse the sense of comparisons.
.It Fl S
*** /usr/src/usr.bin/sort/sort.c Fri Jul 23 08:26:11 2004
--- sort.c Sun Jun 12 01:30:42 2005
***************
*** 158,165 ****
if (!(tmpdir = getenv("TMPDIR")))
tmpdir = _PATH_TMP;
! while ((ch = getopt(argc, argv, "bcdfik:mHno:rR:sSt:T:ux")) != -1) {
switch (ch) {
case 'b':
fldtab->flags |= BI | BT;
break;
--- 158,168 ----
if (!(tmpdir = getenv("TMPDIR")))
tmpdir = _PATH_TMP;
! while ((ch = getopt(argc, argv, "bcdfik:mHnNo:rR:sSt:T:ux")) != -1) {
switch (ch) {
+ case 'N':
+ fldtab->flags |= BN | N;
+ break;
case 'b':
fldtab->flags |= BI | BT;
break;
***************
*** 360,366 ****
if (msg != NULL)
(void)fprintf(stderr, "sort: %s\n", msg);
(void)fprintf(stderr,
! "usage: %s [-bcdfHimnrSsu] [-k field1[,field2]] [-o output]"
" [-R char] [-T dir]", getprogname());
(void)fprintf(stderr,
" [-t char] [file ...]\n");
--- 363,369 ----
if (msg != NULL)
(void)fprintf(stderr, "sort: %s\n", msg);
(void)fprintf(stderr,
! "usage: %s [-bcdfHimnNrSsu] [-k field1[,field2]] [-o output]"
" [-R char] [-T dir]", getprogname());
(void)fprintf(stderr,
" [-t char] [file ...]\n");
*** /usr/src/usr.bin/sort/sort.h Sun Feb 15 08:22:55 2004
--- sort.h Sun Jun 12 01:32:08 2005
***************
*** 91,96 ****
--- 91,97 ----
#define R 16 /* Field is reversed with respect to the global weight */
#define BI 32 /* ignore blanks in icol */
#define BT 64 /* ignore blanks in tcol */
+ #define BN 128 /* invalid numeric fields are -infinity */
/* masks for delimiters: blanks, fields, and termination. */
#define BLANK 1 /* ' ', '\t'; '\n' if -T is invoked */