Subject: sort(1)... Possible bug, and looking for suggestions on features.
To: None <current-users@netbsd.org>
From: Peter Seebach <seebs@plethora.net>
List: current-users
Date: 10/12/2004 19:17:14
According to POSIX, when sorting numerically, "An empty digit string shall be
treated as zero."

In fact, NetBSD's sort does not do this:

$ sort -t: -n t1
 :line 1 (space)
:line 2 (empty)
x:line 4 (x)
-1:line 3 (-1)
0:line 6 (zero)
1:line 5 (one)

But, if you ask for an unstable sort, it does:
$ sort -S -t: -n t1
-1:line 3 (-1)
 :line 1 (space)
:line 2 (empty)
x:line 4 (x)
0:line 6 (zero)
1:line 5 (one)

This behavior is inconsistent.

The issue appears to be that, in the case where there have been no digits at
all in a string, sort stashes a single 0x80 in the key.  However, when trying
to do a stable sort, sort then smashes the last character of the key... The
only character we stored!  (fields.c, line 168.)

The problem, I think, comes from enterfield (fields.c), which has:
	*tablepos++ = lweight[0];
as the last line...; in short, enterfield is trying to make sure that a
shorter record always sorts before a longer one, I think... There is similar
code at the end of number().  But, in the case of a blank, there's no such
character.  One simple solution is this:

*** src/fields.c        Mon Mar 15 01:34:29 2004
--- fields.c    Tue Oct 12 18:56:40 2004
***************
*** 289,294 ****
--- 289,295 ----
                /* only exit if we didn't skip any zero number */
                if (!zeroskip) {
                        *pos++ = nweights[127];
+                       *pos++ = nweights[0];
                        return (pos);
                }
        }

This doesn't break any regression tests, and appears to ensure that blank
fields are correctly sorted in with other 0 values.

I'm asking about this on the list because:
1.  I am not convinced that I fully understand sort(1).
2.  I am pretty sure I found a "simple" fix for this exact problem once
at BSDi, and that my obvious fix was incorrect, so I'd appreciate a second
opinion.


...

There is another thing I want a second opinion about.  While I'm happy to
patch sort(1) for compliance with POSIX, I actually dislike the POSIX
behavior; I'd rather have non-values sort as negative infinity.

This should obviously be optional.  I've written a patch to produce such
behavior, attaching it to the '-N' flag.  But...
1.  Anyone know of a sort(1) implementation already using that flag for
something else, or using some other flag for this behavior?
2.  Anyone else have opinions on such a patch?

Here's the patch, with both things done.  I'd like feedback on this in case
it's obviously stupid, before I send-pr it.

*** src/fields.c	Mon Mar 15 01:34:29 2004
--- fields.c	Tue Oct 12 19:09:37 2004
***************
*** 89,95 ****
  }
  		
  static u_char *enterfield __P((u_char *, u_char *, struct field *, int));
! static u_char *number __P((u_char *, u_char *, u_char *, u_char *, int));
  
  #define DECIMAL '.'
  #define OFFSET 128
--- 89,95 ----
  }
  		
  static u_char *enterfield __P((u_char *, u_char *, struct field *, int));
! static u_char *number __P((u_char *, u_char *, u_char *, u_char *, int, int));
  
  #define DECIMAL '.'
  #define OFFSET 128
***************
*** 184,189 ****
--- 184,190 ----
  	struct column icol, tcol;
  	u_int flags;
  	u_int Rflag;
+ 	u_int BNflag;
  
  	icol = cur_fld->icol;
  	tcol = cur_fld->tcol;
***************
*** 210,216 ****
  
  	if (flags & N) {
  		Rflag = (gflags & R ) ^ (flags & R) ? 1 : 0;
! 		return number(tablepos, endkey, start, end, Rflag);
  	}
  
  	mask = cur_fld->mask;
--- 211,218 ----
  
  	if (flags & N) {
  		Rflag = (gflags & R ) ^ (flags & R) ? 1 : 0;
! 		BNflag = (gflags & BN ) | (flags & BN) ? 1 : 0;
! 		return number(tablepos, endkey, start, end, Rflag, BNflag);
  	}
  
  	mask = cur_fld->mask;
***************
*** 246,254 ****
   */
  
  static u_char *
! number(pos, bufend, line, lineend, Rflag)
  	u_char *line, *pos, *bufend, *lineend;
! 	int Rflag;
  {
  	int or_sign, parity = 0;
  	int expincr = 1, exponent = -1;
--- 248,256 ----
   */
  
  static u_char *
! number(pos, bufend, line, lineend, Rflag, BNflag)
  	u_char *line, *pos, *bufend, *lineend;
! 	int Rflag, BNflag;
  {
  	int or_sign, parity = 0;
  	int expincr = 1, exponent = -1;
***************
*** 288,294 ****
  	if (*line < '1' || *line > '9' || line >= lineend) {
  		/* only exit if we didn't skip any zero number */
  		if (!zeroskip) {
! 			*pos++ = nweights[127];
  			return (pos);
  		}
  	}
--- 290,301 ----
  	if (*line < '1' || *line > '9' || line >= lineend) {
  		/* only exit if we didn't skip any zero number */
  		if (!zeroskip) {
! 			if (BNflag) {
! 				*pos++ = nweights[0];
! 			} else {
! 				*pos++ = nweights[127];
! 			}
! 			*pos++ = nweights[0];
  			return (pos);
  		}
  	}
*** src/init.c	Sun Feb 22 08:20:27 2004
--- init.c	Tue Oct 12 19:07:12 2004
***************
*** 256,261 ****
--- 256,262 ----
  				return (BI);
  			else
  				return (BT);
+ 		case 'B': return (BN);
  		case 'd': return (D);
  		case 'f': return (F);
  		case 'i': return (I);
*** src/sort.c	Wed Jul 28 01:48:44 2004
--- sort.c	Tue Oct 12 19:15:11 2004
***************
*** 158,165 ****
  	if (!(tmpdir = getenv("TMPDIR")))
  		tmpdir = _PATH_TMP;
  
! 	while ((ch = getopt(argc, argv, "bcdfik:mHno:rR:sSt:T:ux")) != -1) {
  		switch (ch) {
  		case 'b':
  			fldtab->flags |= BI | BT;
  			break;
--- 158,168 ----
  	if (!(tmpdir = getenv("TMPDIR")))
  		tmpdir = _PATH_TMP;
  
! 	while ((ch = getopt(argc, argv, "bcdfik:mHnNo:rR:sSt:T:ux")) != -1) {
  		switch (ch) {
+ 		case 'N':
+ 			fldtab->flags |= BN | N;
+ 			break;
  		case 'b':
  			fldtab->flags |= BI | BT;
  			break;
*** src/sort.h	Sun Feb 22 08:20:27 2004
--- sort.h	Tue Oct 12 19:02:01 2004
***************
*** 91,96 ****
--- 91,97 ----
  #define R 16		/* Field is reversed with respect to the global weight */
  #define BI 32		/* ignore blanks in icol */
  #define BT 64		/* ignore blanks in tcol */
+ #define BN 128          /* invalid numeric fields are -infinity */
  
  /* masks for delimiters: blanks, fields, and termination. */
  #define BLANK 1		/* ' ', '\t'; '\n' if -T is invoked */