Source-Changes-HG archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

[src/trunk]: src/usr.bin/sort When we need to merge more than 16 files, do th...



details:   https://anonhg.NetBSD.org/src/rev/10f10b814c2b
branches:  trunk
changeset: 748024:10f10b814c2b
user:      dsl <dsl%NetBSD.org@localhost>
date:      Fri Oct 09 20:29:43 2009 +0000

description:
When we need to merge more than 16 files, do them in a hierarchy.
Reduces the amount of data written to temporary files.
The 3-level stack has to do a simple reduce after 4352 input files, for
a normal file sort this is 35GB of data or about 500 million records.
This needs about 50 open fd's - which should be ok.
Clearly the merge sort could process more input files in one go - speeding
up the sort, but at some point the number of input files would exceed
whatever limit was applied.

diffstat:

 usr.bin/sort/msort.c |  83 +++++++++++++++++++++++++++++++++++++++++++++------
 1 files changed, 73 insertions(+), 10 deletions(-)

diffs (126 lines):

diff -r 0bb1b631e579 -r 10f10b814c2b usr.bin/sort/msort.c
--- a/usr.bin/sort/msort.c      Fri Oct 09 20:23:19 2009 +0000
+++ b/usr.bin/sort/msort.c      Fri Oct 09 20:29:43 2009 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: msort.c,v 1.27 2009/09/26 21:16:55 dsl Exp $   */
+/*     $NetBSD: msort.c,v 1.28 2009/10/09 20:29:43 dsl Exp $   */
 
 /*-
  * Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
@@ -65,7 +65,7 @@
 #include "fsort.h"
 
 #ifndef lint
-__RCSID("$NetBSD: msort.c,v 1.27 2009/09/26 21:16:55 dsl Exp $");
+__RCSID("$NetBSD: msort.c,v 1.28 2009/10/09 20:29:43 dsl Exp $");
 __SCCSID("@(#)msort.c  8.1 (Berkeley) 6/6/93");
 #endif /* not lint */
 
@@ -86,29 +86,49 @@
 
 static int cmp(RECHEADER *, RECHEADER *);
 static int insert(struct mfile **, struct mfile *, int, int);
+static void merge_sort_fstack(FILE *, put_func_t, struct field *);
 
 /*
  * Number of files merge() can merge in one pass.
- * This should be power of two so that it's possible to use this value
- * for rouding.
  */
 #define MERGE_FNUM      16
 
 static struct mfile fstack[MERGE_FNUM];
-static int fstack_count;
+static struct mfile fstack_1[MERGE_FNUM];
+static struct mfile fstack_2[MERGE_FNUM];
+static int fstack_count, fstack_1_count, fstack_2_count;
 
 void
 save_for_merge(FILE *fp, get_func_t get, struct field *ftbl)
 {
-       FILE *mfp;
+       FILE *mfp, *mfp1, *mfp2;
 
        if (fstack_count == MERGE_FNUM) {
                /* Must reduce the number of temporary files */
                mfp = ftmp();
-               merge_sort(mfp, putrec, ftbl);
-               fstack[0].fp = mfp;
-               fstack[0].get = geteasy;
-               fstack_count = 1;
+               merge_sort_fstack(mfp, putrec, ftbl);
+               /* Save output in next layer */
+               if (fstack_1_count == MERGE_FNUM) {
+                       mfp1 = ftmp();
+                       memcpy(fstack, fstack_1, sizeof fstack);
+                       merge_sort_fstack(mfp1, putrec, ftbl);
+                       if (fstack_2_count == MERGE_FNUM) {
+                               /* More than 4096 files! */
+                               mfp2 = ftmp();
+                               memcpy(fstack, fstack_2, sizeof fstack);
+                               merge_sort_fstack(mfp2, putrec, ftbl);
+                               fstack_2[0].fp = mfp2;
+                               fstack_2_count = 1;
+                       }
+                       fstack_2[fstack_2_count].fp = mfp1;
+                       fstack_2[fstack_2_count].get = geteasy;
+                       fstack_2_count++;
+                       fstack_1_count = 0;
+               }
+               fstack_1[fstack_1_count].fp = mfp;
+               fstack_1[fstack_1_count].get = geteasy;
+               fstack_1_count++;
+               fstack_count = 0;
        }
 
        fstack[fstack_count].fp = fp;
@@ -135,6 +155,49 @@
 void
 merge_sort(FILE *outfp, put_func_t put, struct field *ftbl)
 {
+       int count = fstack_1_count + fstack_2_count;
+       FILE *mfp;
+       int i;
+
+       if (count == 0) {
+               /* All files in initial array */
+               merge_sort_fstack(outfp, put, ftbl);
+               return;
+       }
+
+       count += fstack_count;
+
+       /* Too many files for one merge sort */
+       for (;;) {
+               /* Sort latest 16 files */
+               i = count;
+               if (i > MERGE_FNUM)
+                       i = MERGE_FNUM;
+               while (fstack_count > 0)
+                       fstack[--i] = fstack[--fstack_count];
+               while (i > 0 && fstack_1_count > 0)
+                       fstack[--i] = fstack_1[--fstack_1_count];
+               while (i > 0)
+                       fstack[--i] = fstack_2[--fstack_2_count];
+               if (count <= MERGE_FNUM) {
+                       /* Got all the data */
+                       fstack_count = count;
+                       merge_sort_fstack(outfp, put, ftbl);
+                       return;
+               }
+               mfp = ftmp();
+               fstack_count = count > MERGE_FNUM ? MERGE_FNUM : count;
+               merge_sort_fstack(mfp, putrec, ftbl);
+               fstack[0].fp = mfp;
+               fstack[0].get = geteasy;
+               fstack_count = 1;
+               count -= MERGE_FNUM - 1;
+       }
+}
+
+static void
+merge_sort_fstack(FILE *outfp, put_func_t put, struct field *ftbl)
+{
        struct mfile *flistb[MERGE_FNUM], **flist = flistb, *cfile;
        RECHEADER *new_rec;
        u_char *new_end;



Home | Main Index | Thread Index | Old Index