Source-Changes-HG archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

[src/trunk]: src/usr.bin/sort Save length of key instead of relying of the we...



details:   https://anonhg.NetBSD.org/src/rev/58c5343e257f
branches:  trunk
changeset: 747318:58c5343e257f
user:      dsl <dsl%NetBSD.org@localhost>
date:      Thu Sep 10 22:02:40 2009 +0000

description:
Save length of key instead of relying of the weight of the record sep.
This frees a byte value to use for 'end of key' (to correctly sort
short keys) while still having a weight assigned to the field sep.
(Unless -t is given, the field sep is in the field data.)
Do reverse sorts by writing the output file in reverse order (rather
than reversing the sort - apart from merges).
All key compares are now unweighted.
For 'sort -u' mark duplicates keys during the sort and don't write
to the output.
Use -S to mean a posix sort - where equal keys are sorted using the
raw record (rather than being kept in the original order).
For 'sort -f' (no keys) generate a key of the folded data (as for -n
-i and -d), simplifies the code and allows a 'posix' sort.

diffstat:

 usr.bin/sort/Makefile     |    5 +-
 usr.bin/sort/append.c     |   82 ++++----------------
 usr.bin/sort/fields.c     |   40 +++++----
 usr.bin/sort/files.c      |    7 +-
 usr.bin/sort/fsort.c      |   20 +---
 usr.bin/sort/init.c       |   66 +++++++++-------
 usr.bin/sort/msort.c      |   43 +++-------
 usr.bin/sort/radix_sort.c |  179 +++++++++++++++++++++++++++------------------
 usr.bin/sort/sort.c       |   46 +++++++----
 usr.bin/sort/sort.h       |   25 +++--
 10 files changed, 256 insertions(+), 257 deletions(-)

diffs (truncated from 1026 to 300 lines):

diff -r 411a124296ad -r 58c5343e257f usr.bin/sort/Makefile
--- a/usr.bin/sort/Makefile     Thu Sep 10 21:36:39 2009 +0000
+++ b/usr.bin/sort/Makefile     Thu Sep 10 22:02:40 2009 +0000
@@ -1,8 +1,11 @@
-#      $NetBSD: Makefile,v 1.7 2009/09/05 09:16:18 dsl Exp $
+#      $NetBSD: Makefile,v 1.8 2009/09/10 22:02:40 dsl Exp $
 #      from: @(#)Makefile      8.1 (Berkeley) 6/6/93
 
 PROG=  sort
 SRCS=  append.c fields.c files.c fsort.c init.c msort.c sort.c tmp.c
 SRCS+= radix_sort.c
 
+LDADD+=-lutil
+DPADD+=${LIBUTIL}
+
 .include <bsd.prog.mk>
diff -r 411a124296ad -r 58c5343e257f usr.bin/sort/append.c
--- a/usr.bin/sort/append.c     Thu Sep 10 21:36:39 2009 +0000
+++ b/usr.bin/sort/append.c     Thu Sep 10 22:02:40 2009 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: append.c,v 1.21 2009/09/05 12:00:25 dsl Exp $  */
+/*     $NetBSD: append.c,v 1.22 2009/09/10 22:02:40 dsl Exp $  */
 
 /*-
  * Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
@@ -64,82 +64,34 @@
 #include "sort.h"
 
 #ifndef lint
-__RCSID("$NetBSD: append.c,v 1.21 2009/09/05 12:00:25 dsl Exp $");
+__RCSID("$NetBSD: append.c,v 1.22 2009/09/10 22:02:40 dsl Exp $");
 __SCCSID("@(#)append.c 8.1 (Berkeley) 6/6/93");
 #endif /* not lint */
 
 #include <stdlib.h>
-#include <string.h>
-
-static int
-wt_cmp(const u_char *a, const u_char *b, size_t len, u_char *wts)
-{
-    size_t i;
-
-    for (i = 0; i < len; i++) {
-           if (wts[*a++] != wts[*b++])
-               return 1;
-    }
-
-    return 0;
-}
 
 /*
- * copy sorted lines to output; check for uniqueness
+ * copy sorted lines to output
+ * Ignore duplicates (marked with -ve keylen)
  */
 void
-append(const RECHEADER **keylist, int nelem, FILE *fp, put_func_t put, u_char *wts)
+append(RECHEADER **keylist, int nelem, FILE *fp, put_func_t put)
 {
-       const RECHEADER **cpos, **lastkey;
-       const RECHEADER *crec, *prec;
-       size_t plen;
+       RECHEADER **cpos, **lastkey;
+       RECHEADER *crec;
 
        lastkey = keylist + nelem;
-       if (!UNIQUE || wts == NULL) {
-               for (cpos = keylist; cpos < lastkey; cpos++)
-                       put(*cpos, fp);
-               return;
-       }
-
-       if (nelem == 0)
-               return;
-
-       cpos = keylist;
-       prec = *cpos;
-
-       if (!SINGL_FLD) {
-               /* Key for each line is already in adjacent bytes */
-               plen = prec->offset;
-               for (cpos = &keylist[1]; cpos < lastkey; cpos++) {
+       if (REVERSE) {
+               for (cpos = lastkey; cpos-- > keylist;) {
                        crec = *cpos;
-                       if (crec->offset == plen
-                           && memcmp(crec->data, prec->data, plen) == 0) {
-                               /* Duplicate key */
-                               continue;
-                       }
-                       put(prec, fp);
-                       prec = crec;
-                       plen = prec->offset;
+                       if (crec->keylen >= 0)
+                               put(crec, fp);
                }
-               put(prec, fp);
-               return;
+       } else {
+               for (cpos = keylist; cpos < lastkey; cpos++) {
+                       crec = *cpos;
+                       if (crec->keylen >= 0)
+                               put(crec, fp);
+               }
        }
-
-       /* We have to compare the raw data - which means applying weight */
-
-       /* Key for each line is already in adjacent bytes */
-       plen = prec->length;
-       for (cpos = &keylist[1]; cpos < lastkey; cpos++) {
-               crec = *cpos;
-               if (crec->length == plen
-                   && wt_cmp(crec->data, prec->data, plen, wts) == 0) {
-                       /* Duplicate key */
-                       continue;
-               }
-               put(prec, fp);
-               prec = crec;
-               plen = prec->length;
-       }
-       put(prec, fp);
-       return;
 }
diff -r 411a124296ad -r 58c5343e257f usr.bin/sort/fields.c
--- a/usr.bin/sort/fields.c     Thu Sep 10 21:36:39 2009 +0000
+++ b/usr.bin/sort/fields.c     Thu Sep 10 22:02:40 2009 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: fields.c,v 1.27 2009/08/22 21:28:55 dsl Exp $  */
+/*     $NetBSD: fields.c,v 1.28 2009/09/10 22:02:40 dsl Exp $  */
 
 /*-
  * Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
@@ -66,7 +66,7 @@
 #include "sort.h"
 
 #ifndef lint
-__RCSID("$NetBSD: fields.c,v 1.27 2009/08/22 21:28:55 dsl Exp $");
+__RCSID("$NetBSD: fields.c,v 1.28 2009/09/10 22:02:40 dsl Exp $");
 __SCCSID("@(#)fields.c 8.1 (Berkeley) 6/6/93");
 #endif /* not lint */
 
@@ -152,11 +152,19 @@
                    fieldtable->flags)) == NULL)
                        return (1);
        }
-       *keypos++ = 0;
 
        keybuf->offset = keypos - keybuf->data;
        keybuf->length = keybuf->offset + line_size;
 
+       /*
+        * Posix requires that equal keys be further sorted by the
+        * entire original record.
+        * NetBSD has (at least for some time) kept equal keys in
+        * their original order.
+        * For 'sort -u' posix_sort is unset.
+        */
+       keybuf->keylen = posix_sort ? keybuf->length : keybuf->offset;
+
        memcpy(keypos, line_data, line_size);
        return (0);
 }
@@ -209,33 +217,31 @@
                        *tablepos++ = lweight[*start];
                }
        }
-       /* Add extra byte to sort short keys correctly */
-       *tablepos++ = flags & R ? 255 : 1;
+       /* Add extra byte (absent from lweight) to sort short keys correctly */
+       *tablepos++ = lweight[REC_D];
        return tablepos;
 }
 
 /*
  * Numbers are converted to a floating point format (exponent & mantissa)
  * so that they compare correctly as sequence of unsigned bytes.
- * The output cannot contain a 0x00 byte (the record separator).
- * Bytes 0x01 and 0xff are used to terminate positive and negative numbers
+ * Bytes 0x00 and 0xff are used to terminate positive and negative numbers
  * to ensure that 0.123 sorts after 0.12 and -0.123 sorts before -0.12.
  *
  * The first byte contain the overall sign, exponent sign and some of the
  * exponent. These have to be ordered (-ve value, decreasing exponent),
  * zero, (+ve value, increasing exponent).
- * After excluding 0, 1, 0xff and 0x80 (used for zero) there are 61
- * exponent values available, this isn't quite enough and the highest
- * values are used to encode large exponents in multiple bytes.
  *
- * An exponent of zero has value 0xc0 for +ve numbers and 0x40 for -ves.
+ * The first byte is 0x80 for zero, 0x40 for -ve with exponent 0 and
+ * 0xc0 for +ve with exponent zero.
+ * This only leaves 62 byte values for +ve exponents - which isn't enough.
+ * The largest 5 exponent values are used to hold a byte count of the
+ * number of following bytes that contain 7 exponent bits per byte,
  *
  * The mantissa is stored 2 digits per byte offset by 0x40, for negative
  * numbers the order must be reversed (they are subtracted from 0x100).
  *
  * Reverse sorts are done by inverting the sign of the number.
- *
- * We don't have to worry about REC_D, the key is terminated by 0x00.
  */
 
 #define SIGNED(reverse, value) ((reverse) ? 0x100 - (value) : (value))
@@ -298,16 +304,14 @@
 
        /* Maybe here we should allow for e+12 (etc) */
 
-       /* exponent 0 is 0xc0 for +ve numbers and 0x40 for -ve ones */
-       exponent += 0xc0;
-
-       if (exponent > 0x80 + MAX_EXP_ENC && exponent < 0x100 - MAX_EXP_ENC) {
+       if (exponent < 0x3f - MAX_EXP_ENC && -exponent < 0x3f - MAX_EXP_ENC) {
                /* Value ok for simple encoding */
+               /* exponent 0 is 0xc0 for +ve numbers and 0x40 for -ve ones */
+               exponent += 0xc0;
                *pos++ = SIGNED(reverse, exponent);
        } else {
                /* Out or range for a single byte */
                int c, t;
-               exponent -= 0xc0;
                t = exponent > 0 ? exponent : -exponent;
                /* Count how many 7-bit blocks are needed */
                for (c = 0; ; c++) {
diff -r 411a124296ad -r 58c5343e257f usr.bin/sort/files.c
--- a/usr.bin/sort/files.c      Thu Sep 10 21:36:39 2009 +0000
+++ b/usr.bin/sort/files.c      Thu Sep 10 22:02:40 2009 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: files.c,v 1.36 2009/09/05 12:00:25 dsl Exp $   */
+/*     $NetBSD: files.c,v 1.37 2009/09/10 22:02:40 dsl Exp $   */
 
 /*-
  * Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
@@ -65,7 +65,7 @@
 #include "fsort.h"
 
 #ifndef lint
-__RCSID("$NetBSD: files.c,v 1.36 2009/09/05 12:00:25 dsl Exp $");
+__RCSID("$NetBSD: files.c,v 1.37 2009/09/10 22:02:40 dsl Exp $");
 __SCCSID("@(#)files.c  8.1 (Berkeley) 6/6/93");
 #endif /* not lint */
 
@@ -123,6 +123,7 @@
                        if (c == REC_D) {
                                recbuf->offset = 0;
                                recbuf->length = pos - recbuf->data;
+                               recbuf->keylen = recbuf->length - 1;
                                return (0);
                        }
                }
@@ -138,6 +139,7 @@
                                *pos++ = REC_D;
                                recbuf->offset = 0;
                                recbuf->length = pos - recbuf->data;
+                               recbuf->keylen = recbuf->length - 1;
                                return (0);
                        }
                        FCLOSE(fp);
@@ -154,6 +156,7 @@
 
                        recbuf->offset = 0;
                        recbuf->length = 0;
+                       recbuf->keylen = 0;
 
                        return (BUFFEND);
                }
diff -r 411a124296ad -r 58c5343e257f usr.bin/sort/fsort.c
--- a/usr.bin/sort/fsort.c      Thu Sep 10 21:36:39 2009 +0000
+++ b/usr.bin/sort/fsort.c      Thu Sep 10 22:02:40 2009 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: fsort.c,v 1.40 2009/09/05 12:00:25 dsl Exp $   */
+/*     $NetBSD: fsort.c,v 1.41 2009/09/10 22:02:40 dsl Exp $   */
 
 /*-
  * Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
@@ -72,7 +72,7 @@
 #include "fsort.h"
 
 #ifndef lint
-__RCSID("$NetBSD: fsort.c,v 1.40 2009/09/05 12:00:25 dsl Exp $");
+__RCSID("$NetBSD: fsort.c,v 1.41 2009/09/10 22:02:40 dsl Exp $");
 __SCCSID("@(#)fsort.c  8.1 (Berkeley) 6/6/93");
 #endif /* not lint */
 
@@ -86,8 +86,8 @@
 void
 fsort(struct filelist *filelist, int nfiles, FILE *outfp, struct field *ftbl)
 {
-       const RECHEADER **keylist;
-       const RECHEADER **keypos, **keyp;
+       RECHEADER **keylist;
+       RECHEADER **keypos, **keyp;
        RECHEADER *buffer;
        size_t bufsize = DEFBUFSIZE;
        u_char *bufend;
@@ -158,25 +158,19 @@
                }
 
                /* Sort this set of records */



Home | Main Index | Thread Index | Old Index