Source-Changes-HG archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

[src/trunk]: src/usr.bin/sort Rework the way sort generates sort keys:



details:   https://anonhg.NetBSD.org/src/rev/57c4ba7608d3
branches:  trunk
changeset: 746885:57c4ba7608d3
user:      dsl <dsl%NetBSD.org@localhost>
date:      Sat Aug 22 10:53:28 2009 +0000

description:
Rework the way sort generates sort keys:
- If we generate a key, it is always sortable using memcmp()
- If we are sorting the whole record, then a weight-table must be used
  during compares.
- Major surgery to encoding of numbers to ensure unique keys for equal
  numeric values.  Reverse numerics are handled by inverting the sign.
- Case folding (-f) is handled when the sort keys are generated. No other
  code has to care at all.
- Key uniqueness (-u) is done during merge for large datasets. It only
  has to be done when writing the output file for small files.
  Since the file is in key order this is simple!
Probably fixes all of: PR/27257 PR/25551 PR/22182 PR/31095 PR/30504
PR/36816 PR/37860 PR/39308
Also PR/18614 should no longer die, but a little more work needs to be
done on the merging for very large files.

diffstat:

 usr.bin/sort/append.c |  144 +++++++++---------------
 usr.bin/sort/fields.c |  298 +++++++++++++++++++++++++------------------------
 usr.bin/sort/files.c  |   13 +-
 usr.bin/sort/fsort.c  |   28 +++-
 usr.bin/sort/init.c   |  128 +++++++++------------
 usr.bin/sort/msort.c  |  112 ++++++++----------
 usr.bin/sort/sort.c   |   32 ++--
 usr.bin/sort/sort.h   |   31 +++--
 8 files changed, 374 insertions(+), 412 deletions(-)

diffs (truncated from 1240 to 300 lines):

diff -r ac54b4ecbeae -r 57c4ba7608d3 usr.bin/sort/append.c
--- a/usr.bin/sort/append.c     Sat Aug 22 10:02:21 2009 +0000
+++ b/usr.bin/sort/append.c     Sat Aug 22 10:53:28 2009 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: append.c,v 1.19 2009/08/20 06:36:25 dsl Exp $  */
+/*     $NetBSD: append.c,v 1.20 2009/08/22 10:53:28 dsl Exp $  */
 
 /*-
  * Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
@@ -64,118 +64,82 @@
 #include "sort.h"
 
 #ifndef lint
-__RCSID("$NetBSD: append.c,v 1.19 2009/08/20 06:36:25 dsl Exp $");
+__RCSID("$NetBSD: append.c,v 1.20 2009/08/22 10:53:28 dsl Exp $");
 __SCCSID("@(#)append.c 8.1 (Berkeley) 6/6/93");
 #endif /* not lint */
 
 #include <stdlib.h>
 #include <string.h>
 
-#define OUTPUT {                                                       \
-       if ((n = cpos - ppos) > 1) {                                    \
-               ppos -= n;                                              \
-               radix_sort(ppos, n, wts1, REC_D);                       \
-               for (; ppos < cpos; ppos++) {                           \
-                       prec = (const RECHEADER *) (*ppos - REC_DATA_OFFSET);\
-                       put(prec, fp);                                  \
-               }                                                       \
-       } else put(prec, fp);                                           \
+static int
+wt_cmp(const u_char *a, const u_char *b, size_t len, u_char *wts)
+{
+    size_t i;
+
+    for (i = 0; i < len; i++) {
+           if (wts[*a++] != wts[*b++])
+               return 1;
+    }
+
+    return 0;
 }
 
 /*
  * copy sorted lines to output; check for uniqueness
  */
 void
-append(const u_char **keylist, int nelem, FILE *fp, put_func_t put,
-    struct field *ftbl)
+append(const u_char **keylist, int nelem, FILE *fp, put_func_t put, u_char *wts)
 {
-       u_char *wts, *wts1;
-       int n;
-       const u_char **cpos, **ppos, **lastkey;
-       const u_char *cend, *pend, *start;
+       const u_char **cpos, **lastkey;
        const struct recheader *crec, *prec;
+       size_t plen;
 
-       if (*keylist == '\0' && UNIQUE)
+       lastkey = keylist + nelem;
+       if (!UNIQUE || wts == NULL) {
+               for (cpos = keylist; cpos < lastkey; cpos++)
+                       put((const RECHEADER *)(*cpos - REC_DATA_OFFSET), fp);
+               return;
+       }
+
+       if (nelem == 0)
                return;
 
-       wts1 = wts = ftbl[0].weights;
-       if ((!UNIQUE) && SINGL_FLD && ftbl[0].flags & F) {
-               /* Folding case */
-               if (ftbl[0].flags & R)
-                       wts1 = Rascii;
-               else
-                       wts1 = ascii;
-       }
+       cpos = keylist;
+       prec = (const RECHEADER *) (*cpos - REC_DATA_OFFSET);
 
-       lastkey = keylist + nelem;
-       if (SINGL_FLD && (UNIQUE || wts1 != wts)) {
-               ppos = keylist;
-               prec = (const RECHEADER *) (*ppos - REC_DATA_OFFSET);
-               if (UNIQUE)
-                       put(prec, fp);
+       if (!SINGL_FLD) {
+               /* Key for each line is already in adjacent bytes */
+               plen = prec->offset;
                for (cpos = &keylist[1]; cpos < lastkey; cpos++) {
                        crec = (const RECHEADER *) (*cpos - REC_DATA_OFFSET);
-                       if (crec->length == prec->length) {
-                               /*
-                                * Set pend and cend so that trailing NUL and
-                                * record separator is ignored.
-                                */
-                               pend = (const u_char *) &prec->data + prec->length - 2;
-                               cend = (const u_char *) &crec->data + crec->length - 2;
-                               for (start = *cpos; cend >= start; cend--) {
-                                       if (wts[*cend] != wts[*pend])
-                                               break;
-                                       pend--;
-                               }
-                               if (pend + 1 != *ppos) {
-                                       if (!UNIQUE) {
-                                               OUTPUT;
-                                       } else
-                                               put(crec, fp);
-                                       ppos = cpos;
-                                       prec = crec;
-                               }
-                       } else {
-                               if (!UNIQUE) {
-                                       OUTPUT;
-                               } else
-                                       put(crec, fp);
-                               ppos = cpos;
-                               prec = crec;
+                       if (crec->offset == plen
+                           && memcmp(crec->data, prec->data, plen) == 0) {
+                               /* Duplicate key */
+                               continue;
                        }
+                       put(prec, fp);
+                       prec = crec;
+                       plen = prec->offset;
                }
-               if (!UNIQUE)  { OUTPUT; }
-       } else if (UNIQUE) {
-               ppos = keylist;
-               prec = (const RECHEADER *) (*ppos - REC_DATA_OFFSET);
                put(prec, fp);
-               for (cpos = &keylist[1]; cpos < lastkey; cpos++) {
-                       crec = (const RECHEADER *) (*cpos - REC_DATA_OFFSET);
-                       if (crec->offset == prec->offset) {
-                               /*
-                                * Set pend and cend so that trailing NUL and
-                                * record separator is ignored.
-                                */
-                               pend = (const u_char *) &prec->data + prec->offset - 2;
-                               cend = (const u_char *) &crec->data + crec->offset - 2;
-                               for (start = *cpos; cend >= start; cend--) {
-                                       if (wts[*cend] != wts[*pend])
-                                               break;
-                                       pend--;
-                               }
-                               if (pend + 1 != *ppos) {
-                                       ppos = cpos;
-                                       prec = crec;
-                                       put(prec, fp);
-                               }
-                       } else {
-                               ppos = cpos;
-                               prec = crec;
-                               put(prec, fp);
-                       }
+               return;
+       }
+
+       /* We have to compare the raw data - which means applying weight */
+
+       /* Key for each line is already in adjacent bytes */
+       plen = prec->length;
+       for (cpos = &keylist[1]; cpos < lastkey; cpos++) {
+               crec = (const RECHEADER *) (*cpos - REC_DATA_OFFSET);
+               if (crec->length == plen
+                   && wt_cmp(crec->data, prec->data, plen, wts) == 0) {
+                       /* Duplicate key */
+                       continue;
                }
-       } else for (cpos = keylist; cpos < lastkey; cpos++) {
-               crec = (const RECHEADER *) (*cpos - REC_DATA_OFFSET);
-               put(crec, fp);
+               put(prec, fp);
+               prec = crec;
+               plen = prec->length;
        }
+       put(prec, fp);
+       return;
 }
diff -r ac54b4ecbeae -r 57c4ba7608d3 usr.bin/sort/fields.c
--- a/usr.bin/sort/fields.c     Sat Aug 22 10:02:21 2009 +0000
+++ b/usr.bin/sort/fields.c     Sat Aug 22 10:53:28 2009 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: fields.c,v 1.24 2009/08/20 06:36:25 dsl Exp $  */
+/*     $NetBSD: fields.c,v 1.25 2009/08/22 10:53:28 dsl Exp $  */
 
 /*-
  * Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
@@ -66,7 +66,7 @@
 #include "sort.h"
 
 #ifndef lint
-__RCSID("$NetBSD: fields.c,v 1.24 2009/08/20 06:36:25 dsl Exp $");
+__RCSID("$NetBSD: fields.c,v 1.25 2009/08/22 10:53:28 dsl Exp $");
 __SCCSID("@(#)fields.c 8.1 (Berkeley) 6/6/93");
 #endif /* not lint */
 
@@ -84,13 +84,7 @@
 static u_char *enterfield(u_char *, const u_char *, struct field *, int);
 static u_char *number(u_char *, const u_char *, u_char *, u_char *, int);
 
-#define DECIMAL '.'
-#define OFFSET 128
-
-u_char TENS[10];       /* TENS[0] = REC_D <= 128 ? 130 - '0' : 2 -'0'... */
-u_char NEGTENS[10];    /* NEGTENS[0] = REC_D <= 128 ? 126 + '0' : 252 +'0' */
-u_char *OFF_TENS, *OFF_NTENS;  /* TENS - '0', NEGTENS - '0' */
-u_char fnum[NBINS], rnum[NBINS];
+#define DECIMAL_POINT '.'
 
 /*
  * constructs sort key with leading recheader, followed by the key,
@@ -142,9 +136,10 @@
         * original line data (for output) as the 'keybuf' data.
         * keybuf->length is the number of key bytes + data bytes.
         * keybuf->offset is the number of key bytes.
-        * We add a record separator (usually \n) after the key in case
+        * We add a record separator weight after the key in case
         * (as is usual) we need to preserve the order of equal lines,
         * and for 'sort -u'.
+        * The key itself will have had the correct weight applied.
         */
        keypos = keybuf->data;
        endkey = keybuf_end - line_size - 1;
@@ -157,7 +152,7 @@
                    fieldtable->flags)) == NULL)
                        return (1);
        }
-       *keypos++ = REC_D;
+       *keypos++ = 0;
 
        keybuf->offset = keypos - keybuf->data;
        keybuf->length = keybuf->offset + line_size;
@@ -176,7 +171,6 @@
        u_char *start, *end, *lineend, *mask, *lweight;
        struct column icol, tcol;
        u_int flags;
-       u_int Rflag;
 
        icol = cur_fld->icol;
        tcol = cur_fld->tcol;
@@ -201,162 +195,174 @@
                        end = tcol.p->end;
        }
 
-       if (flags & N) {
-               Rflag = (gflags & R ) ^ (flags & R) ? 1 : 0;
-               return number(tablepos, endkey, start, end, Rflag);
-       }
+       if (flags & N)
+               return number(tablepos, endkey, start, end, flags);
+
+       /* Bound check space - assuming nothing is skipped */
+       if (tablepos + (end - start) + 1 >= endkey)
+               return NULL;
 
        mask = cur_fld->mask;
        lweight = cur_fld->weights;     
-       for (; start < end; start++)
-               if (mask[*start]) {
-                       if (*start <= 1) {
-                               if (tablepos+2 >= endkey)
-                                       return (NULL);
-                               *tablepos++ = lweight[1];
-                               *tablepos++ = lweight[*start ? 2 : 1];
-                       } else {
-                               if (tablepos+1 >= endkey)
-                                       return (NULL);
-                               *tablepos++ = lweight[*start];
-                       }
+       for (; start < end; start++) {
+               if (mask && mask[*start]) {
+                       *tablepos++ = lweight[*start];
                }
-       *tablepos++ = lweight[0];
-       return (tablepos == endkey ? NULL : tablepos);
+       }
+       /* Add extra byte to sort short keys correctly */
+       *tablepos++ = flags & R ? 255 : 1;
+       return tablepos;
 }
 
-/* Uses the first bin to assign sign, expsign, 0, and the first
- * 61 out of the exponent ( (254 - 3 origins - 4 over/underflows)/4 = 61 ).
- *   When sorting in forward order:
- * use (0-99) -> (130->240) for sorting the mantissa if REC_D <=128;
- * else use (0-99)->(2-102).
- * If the exponent is >=61, use another byte for each additional 253
- * in the exponent. Cutoff is at 567.
- * To avoid confusing the exponent and the mantissa, use a field delimiter
- * if the exponent is exactly 61, 61+252, etc--this is ok, since it's the
- * only time a field delimiter can come in that position.
- * Reverse order is done analagously.
+/*
+ * Numbers are converted to a floating point format (exponent & mantissa)
+ * so that they compare correctly as sequence of unsigned bytes.
+ * The output cannot contain a 0x00 byte (the record separator).
+ * Bytes 0x01 and 0xff are used to terminate positive and negative numbers



Home | Main Index | Thread Index | Old Index