Source-Changes-HG archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

[src/trunk]: src/usr.bin/sort The code that attempted to sort large files by ...



details:   https://anonhg.NetBSD.org/src/rev/056aba84ec73
branches:  trunk
changeset: 746714:056aba84ec73
user:      dsl <dsl%NetBSD.org@localhost>
date:      Tue Aug 18 18:00:28 2009 +0000

description:
The code that attempted to sort large files by sorting each chunk by the
first key byte and writing to a temp file, then sorting the records from
each temp file that had the same first key byte (and repeating for upto
4 key bytes) was a nice idea, but completely doomed to failure.
Eg PR/9308 where a 70MB file has all but one record the same and short keys.
Not only does the code not work, it is rather guaranteed to be slow.
Instead always use a merge sort for fully sorted chunk of records (each
temporary file contains one lot of sorted records).
The -H option already did this, so just rip out all the code and variables
that can't be used when -H was specified.
Further cleanup to come ...

diffstat:

 usr.bin/sort/append.c |   39 +-----
 usr.bin/sort/files.c  |   71 +---------
 usr.bin/sort/fsort.c  |  361 ++++++++++++++-----------------------------------
 usr.bin/sort/sort.c   |   10 +-
 usr.bin/sort/sort.h   |    6 +-
 5 files changed, 120 insertions(+), 367 deletions(-)

diffs (truncated from 638 to 300 lines):

diff -r 7a356373cd8f -r 056aba84ec73 usr.bin/sort/append.c
--- a/usr.bin/sort/append.c     Tue Aug 18 17:52:30 2009 +0000
+++ b/usr.bin/sort/append.c     Tue Aug 18 18:00:28 2009 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: append.c,v 1.17 2009/08/16 20:02:04 dsl Exp $  */
+/*     $NetBSD: append.c,v 1.18 2009/08/18 18:00:28 dsl Exp $  */
 
 /*-
  * Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
@@ -64,7 +64,7 @@
 #include "sort.h"
 
 #ifndef lint
-__RCSID("$NetBSD: append.c,v 1.17 2009/08/16 20:02:04 dsl Exp $");
+__RCSID("$NetBSD: append.c,v 1.18 2009/08/18 18:00:28 dsl Exp $");
 __SCCSID("@(#)append.c 8.1 (Berkeley) 6/6/93");
 #endif /* not lint */
 
@@ -183,38 +183,3 @@
                put(crec, fp);
        }
 }
-
-/*
- * output the already sorted eol bin.
- */
-void
-rd_append(int binno, int infl0, int nfiles, FILE *outfp, u_char *buffer,
-    u_char *bufend)
-{
-       RECHEADER *rec;
-
-       rec = (RECHEADER *) buffer;
-       if (!getnext(binno, infl0, NULL, nfiles,
-                       (RECHEADER *) buffer, bufend, 0)) {
-               putline(rec, outfp);
-               while (getnext(binno, infl0, NULL, nfiles, (RECHEADER *) buffer,
-                       bufend, 0) == 0) {
-                       if (!UNIQUE)
-                               putline(rec, outfp);
-               }
-       }
-}
-
-/*
- * append plain text--used after sorting the biggest bin.
- */
-void
-concat(FILE *a, FILE *b)
-{
-        int nread;
-        char buffer[4096];
-
-       rewind(b);
-        while ((nread = fread(buffer, 1, 4096, b)) > 0)
-                EWRITE(buffer, 1, nread, a);
-}
diff -r 7a356373cd8f -r 056aba84ec73 usr.bin/sort/files.c
--- a/usr.bin/sort/files.c      Tue Aug 18 17:52:30 2009 +0000
+++ b/usr.bin/sort/files.c      Tue Aug 18 18:00:28 2009 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: files.c,v 1.33 2009/08/16 19:53:43 dsl Exp $   */
+/*     $NetBSD: files.c,v 1.34 2009/08/18 18:00:28 dsl Exp $   */
 
 /*-
  * Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
@@ -65,7 +65,7 @@
 #include "fsort.h"
 
 #ifndef lint
-__RCSID("$NetBSD: files.c,v 1.33 2009/08/16 19:53:43 dsl Exp $");
+__RCSID("$NetBSD: files.c,v 1.34 2009/08/18 18:00:28 dsl Exp $");
 __SCCSID("@(#)files.c  8.1 (Berkeley) 6/6/93");
 #endif /* not lint */
 
@@ -74,73 +74,6 @@
 static ssize_t seq(FILE *, u_char **);
 
 /*
- * this is the subroutine for file management for fsort().
- * It keeps the buffers for all temporary files.
- */
-int
-getnext(int binno, int infl0, struct filelist *filelist, int nfiles,
-    RECHEADER *pos, u_char *end, struct field *dummy)
-{
-       int i;
-       u_char *hp;
-       static size_t nleft = 0;
-       static int cnt = 0, flag = -1;
-       static u_char maxb = 0;
-       static FILE *fp;
-
-       if (nleft == 0) {
-               if (binno < 0)  /* reset files. */ {
-                       for (i = 0; i < nfiles; i++) {
-                               rewind(fstack[infl0 + i].fp);
-                               fstack[infl0 + i].max_o = 0;
-                       }
-                       flag = -1;
-                       nleft = cnt = 0;
-                       return (-1);
-               }
-               maxb = fstack[infl0].maxb;
-               for (; nleft == 0; cnt++) {
-                       if (cnt >= nfiles) {
-                               cnt = 0;
-                               return (EOF);
-                       }
-                       fp = fstack[infl0 + cnt].fp;
-                       fread(&nleft, sizeof(nleft), 1, fp);
-                       if (binno < maxb)
-                               fstack[infl0+cnt].max_o
-                                       += sizeof(nleft) + nleft;
-                       else if (binno == maxb) {
-                               if (binno != fstack[infl0].lastb) {
-                                       fseeko(fp, fstack[infl0+
-                                               cnt].max_o, SEEK_SET);
-                                       fread(&nleft, sizeof(nleft), 1, fp);
-                               }
-                               if (nleft == 0)
-                                       fclose(fp);
-                       } else if (binno == maxb + 1) {         /* skip a bin */
-                               fseek(fp, nleft, SEEK_CUR);
-                               fread(&nleft, sizeof(nleft), 1, fp);
-                               flag = cnt;
-                       }
-               }
-       }
-       if ((u_char *) pos > end - REC_DATA_OFFSET)
-               return (BUFFEND);
-       fread(pos, REC_DATA_OFFSET, 1, fp);
-       if (end - pos->data < (ptrdiff_t)pos->length) {
-               hp = ((u_char *)pos) + REC_DATA_OFFSET;
-               for (i = REC_DATA_OFFSET; i ;  i--)
-                       ungetc(*--hp, fp);
-               return (BUFFEND);
-       }
-       fread(pos->data, pos->length, 1, fp);
-       nleft -= pos->length + REC_DATA_OFFSET;
-       if (nleft == 0 && binno == fstack[infl0].maxb)
-               fclose(fp);
-       return (0);
-}
-
-/*
  * this is called when there is no special key. It's only called
  * in the first fsort pass.
  */
diff -r 7a356373cd8f -r 056aba84ec73 usr.bin/sort/fsort.c
--- a/usr.bin/sort/fsort.c      Tue Aug 18 17:52:30 2009 +0000
+++ b/usr.bin/sort/fsort.c      Tue Aug 18 18:00:28 2009 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: fsort.c,v 1.36 2009/08/16 20:02:04 dsl Exp $   */
+/*     $NetBSD: fsort.c,v 1.37 2009/08/18 18:00:28 dsl Exp $   */
 
 /*-
  * Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
@@ -62,17 +62,17 @@
  */
 
 /*
- * Read in the next bin.  If it fits in one segment sort it;
- * otherwise refine it by segment deeper by one character,
- * and try again on smaller bins.  Sort the final bin at this level
- * of recursion to keep the head of fstack at 0.
- * After PANIC passes, abort to merge sort.
+ * Read in a block of records (until 'enough').
+ * sort, write to temp file.
+ * Merge sort temp files into output file
+ * Small files miss out the temp file stage.
+ * Large files might get multiple merges.
  */
 #include "sort.h"
 #include "fsort.h"
 
 #ifndef lint
-__RCSID("$NetBSD: fsort.c,v 1.36 2009/08/16 20:02:04 dsl Exp $");
+__RCSID("$NetBSD: fsort.c,v 1.37 2009/08/18 18:00:28 dsl Exp $");
 __SCCSID("@(#)fsort.c  8.1 (Berkeley) 6/6/93");
 #endif /* not lint */
 
@@ -100,17 +100,13 @@
        const u_char **keypos;
        u_char *bufend;
        u_char *weights;
-       int ntfiles, mfct = 0, total, i, maxb, lastb, panic = 0;
-       int c, nelem, base;
-       long sizes [NBINS + 1];
+       int ntfiles, mfct = 0;
+       int c, nelem;
        get_func_t get;
        struct recheader *crec;
        struct field tfield[2];
-       FILE *prevfp, *tailfp[FSORTMAX + 1];
        u_char *nbuffer;
 
-       memset(tailfp, 0, sizeof(tailfp));
-       prevfp = outfp;
        memset(tfield, 0, sizeof(tfield));
        if (ftbl[0].flags & R)
                tfield[0].weights = Rascii;
@@ -124,256 +120,113 @@
                memset(keylist, 0, MAXNUM * sizeof(u_char *));
        }
        bufend = buffer + bufsize;
-       if (binno >= 0) {
-               base = top + nfiles;
-               get = getnext;
-       } else {
-               base = 0;
-               if (SINGL_FLD)
-                       get = makeline;
-               else
-                       get = makekey;
-       }                               
-       for (;;) {
-               memset(sizes, 0, sizeof(sizes));
-               c = nelem = ntfiles = 0; /* XXXGCC -Wuninitialized m68k */
-               if (binno == weights[REC_D] &&
-                   !(SINGL_FLD && ftbl[0].flags & F)) {        /* pop */
-                       rd_append(weights[REC_D], top,
-                           nfiles, prevfp, buffer, bufend);
-                       break;
-               } else if (binno == weights[REC_D]) {
-                       depth = 0;              /* start over on flat weights */
-                       ftbl = tfield;
-                       weights = ftbl[0].weights;
+       if (SINGL_FLD)
+               get = makeline;
+       else
+               get = makekey;
+
+       c = nelem = ntfiles = 0; /* XXXGCC -Wuninitialized m68k */
+       keypos = keylist;            /* XXXGCC -Wuninitialized m68k */
+       crec = (RECHEADER *) buffer; /* XXXGCC -Wuninitialized m68k */
+       while (c != EOF) {
+               keypos = keylist;
+               nelem = 0;
+               crec = (RECHEADER *) buffer;
+
+          do_read:
+               while ((c = get(-1, top, filelist, nfiles, crec,
+                   bufend, ftbl)) == 0) {
+                       *keypos++ = crec->data + depth;
+                       if (++nelem == MAXNUM) {
+                               c = BUFFEND;
+                               break;
+                       }
+                       crec =(RECHEADER *)((char *) crec +
+                           SALIGN(crec->length) + REC_DATA_OFFSET);
                }
-               keypos = keylist;            /* XXXGCC -Wuninitialized m68k */
-               crec = (RECHEADER *) buffer; /* XXXGCC -Wuninitialized m68k */
-               while (c != EOF) {
-                       keypos = keylist;
-                       nelem = 0;
-                       crec = (RECHEADER *) buffer;
+
+               if (c == BUFFEND && nelem < MAXNUM
+                   && bufsize < MAXBUFSIZE) {
+                       const u_char **keyp;
+                       u_char *oldb = buffer;
+
+                       /* buffer was too small for data, allocate
+                        * bigger buffer */
+                       nbuffer = realloc(buffer, bufsize * 2);
+                       if (!nbuffer) {
+                               err(2, "failed to realloc buffer to %lu bytes",
+                                       (unsigned long) bufsize * 2);
+                       }
+                       buffer = nbuffer;
+                       bufsize *= 2;
+                       bufend = buffer + bufsize;
+
+                       /* patch up keylist[] */
+                       for (keyp = &keypos[-1]; keyp >= keylist; keyp--)
+                               *keyp = buffer + (*keyp - oldb);
+
+                       crec = (RECHEADER *) (buffer + ((u_char *)crec - oldb));
+                       goto do_read;
+               }
 
-                  do_read:
-                       while ((c = get(binno, top, filelist, nfiles, crec,
-                           bufend, ftbl)) == 0) {
-                               *keypos++ = crec->data + depth;
-                               if (++nelem == MAXNUM) {
-                                       c = BUFFEND;
-                                       break;
-                               }
-                               crec =(RECHEADER *)((char *) crec +
-                                   SALIGN(crec->length) + REC_DATA_OFFSET);
+               if (c != BUFFEND && !ntfiles && !mfct) {
+                       /* do not push */
+                       continue;
+               }
+
+               /* push */
+               fstack[MSTART + mfct].fp = ftmp();



Home | Main Index | Thread Index | Old Index