Source-Changes-HG archive
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]
[src/trunk]: src/usr.bin/sort The code that attempted to sort large files by ...
details: https://anonhg.NetBSD.org/src/rev/056aba84ec73
branches: trunk
changeset: 746714:056aba84ec73
user: dsl <dsl%NetBSD.org@localhost>
date: Tue Aug 18 18:00:28 2009 +0000
description:
The code that attempted to sort large files by sorting each chunk by the
first key byte and writing to a temp file, then sorting the records from
each temp file that had the same first key byte (and repeating for upto
4 key bytes) was a nice idea, but completely doomed to failure.
Eg PR/9308 where a 70MB file has all but one record the same and short keys.
Not only does the code not work, it is rather guaranteed to be slow.
Instead always use a merge sort for fully sorted chunk of records (each
temporary file contains one lot of sorted records).
The -H option already did this, so just rip out all the code and variables
that can't be used when -H was specified.
Further cleanup to come ...
diffstat:
usr.bin/sort/append.c | 39 +-----
usr.bin/sort/files.c | 71 +---------
usr.bin/sort/fsort.c | 361 ++++++++++++++-----------------------------------
usr.bin/sort/sort.c | 10 +-
usr.bin/sort/sort.h | 6 +-
5 files changed, 120 insertions(+), 367 deletions(-)
diffs (truncated from 638 to 300 lines):
diff -r 7a356373cd8f -r 056aba84ec73 usr.bin/sort/append.c
--- a/usr.bin/sort/append.c Tue Aug 18 17:52:30 2009 +0000
+++ b/usr.bin/sort/append.c Tue Aug 18 18:00:28 2009 +0000
@@ -1,4 +1,4 @@
-/* $NetBSD: append.c,v 1.17 2009/08/16 20:02:04 dsl Exp $ */
+/* $NetBSD: append.c,v 1.18 2009/08/18 18:00:28 dsl Exp $ */
/*-
* Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
@@ -64,7 +64,7 @@
#include "sort.h"
#ifndef lint
-__RCSID("$NetBSD: append.c,v 1.17 2009/08/16 20:02:04 dsl Exp $");
+__RCSID("$NetBSD: append.c,v 1.18 2009/08/18 18:00:28 dsl Exp $");
__SCCSID("@(#)append.c 8.1 (Berkeley) 6/6/93");
#endif /* not lint */
@@ -183,38 +183,3 @@
put(crec, fp);
}
}
-
-/*
- * output the already sorted eol bin.
- */
-void
-rd_append(int binno, int infl0, int nfiles, FILE *outfp, u_char *buffer,
- u_char *bufend)
-{
- RECHEADER *rec;
-
- rec = (RECHEADER *) buffer;
- if (!getnext(binno, infl0, NULL, nfiles,
- (RECHEADER *) buffer, bufend, 0)) {
- putline(rec, outfp);
- while (getnext(binno, infl0, NULL, nfiles, (RECHEADER *) buffer,
- bufend, 0) == 0) {
- if (!UNIQUE)
- putline(rec, outfp);
- }
- }
-}
-
-/*
- * append plain text--used after sorting the biggest bin.
- */
-void
-concat(FILE *a, FILE *b)
-{
- int nread;
- char buffer[4096];
-
- rewind(b);
- while ((nread = fread(buffer, 1, 4096, b)) > 0)
- EWRITE(buffer, 1, nread, a);
-}
diff -r 7a356373cd8f -r 056aba84ec73 usr.bin/sort/files.c
--- a/usr.bin/sort/files.c Tue Aug 18 17:52:30 2009 +0000
+++ b/usr.bin/sort/files.c Tue Aug 18 18:00:28 2009 +0000
@@ -1,4 +1,4 @@
-/* $NetBSD: files.c,v 1.33 2009/08/16 19:53:43 dsl Exp $ */
+/* $NetBSD: files.c,v 1.34 2009/08/18 18:00:28 dsl Exp $ */
/*-
* Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
@@ -65,7 +65,7 @@
#include "fsort.h"
#ifndef lint
-__RCSID("$NetBSD: files.c,v 1.33 2009/08/16 19:53:43 dsl Exp $");
+__RCSID("$NetBSD: files.c,v 1.34 2009/08/18 18:00:28 dsl Exp $");
__SCCSID("@(#)files.c 8.1 (Berkeley) 6/6/93");
#endif /* not lint */
@@ -74,73 +74,6 @@
static ssize_t seq(FILE *, u_char **);
/*
- * this is the subroutine for file management for fsort().
- * It keeps the buffers for all temporary files.
- */
-int
-getnext(int binno, int infl0, struct filelist *filelist, int nfiles,
- RECHEADER *pos, u_char *end, struct field *dummy)
-{
- int i;
- u_char *hp;
- static size_t nleft = 0;
- static int cnt = 0, flag = -1;
- static u_char maxb = 0;
- static FILE *fp;
-
- if (nleft == 0) {
- if (binno < 0) /* reset files. */ {
- for (i = 0; i < nfiles; i++) {
- rewind(fstack[infl0 + i].fp);
- fstack[infl0 + i].max_o = 0;
- }
- flag = -1;
- nleft = cnt = 0;
- return (-1);
- }
- maxb = fstack[infl0].maxb;
- for (; nleft == 0; cnt++) {
- if (cnt >= nfiles) {
- cnt = 0;
- return (EOF);
- }
- fp = fstack[infl0 + cnt].fp;
- fread(&nleft, sizeof(nleft), 1, fp);
- if (binno < maxb)
- fstack[infl0+cnt].max_o
- += sizeof(nleft) + nleft;
- else if (binno == maxb) {
- if (binno != fstack[infl0].lastb) {
- fseeko(fp, fstack[infl0+
- cnt].max_o, SEEK_SET);
- fread(&nleft, sizeof(nleft), 1, fp);
- }
- if (nleft == 0)
- fclose(fp);
- } else if (binno == maxb + 1) { /* skip a bin */
- fseek(fp, nleft, SEEK_CUR);
- fread(&nleft, sizeof(nleft), 1, fp);
- flag = cnt;
- }
- }
- }
- if ((u_char *) pos > end - REC_DATA_OFFSET)
- return (BUFFEND);
- fread(pos, REC_DATA_OFFSET, 1, fp);
- if (end - pos->data < (ptrdiff_t)pos->length) {
- hp = ((u_char *)pos) + REC_DATA_OFFSET;
- for (i = REC_DATA_OFFSET; i ; i--)
- ungetc(*--hp, fp);
- return (BUFFEND);
- }
- fread(pos->data, pos->length, 1, fp);
- nleft -= pos->length + REC_DATA_OFFSET;
- if (nleft == 0 && binno == fstack[infl0].maxb)
- fclose(fp);
- return (0);
-}
-
-/*
* this is called when there is no special key. It's only called
* in the first fsort pass.
*/
diff -r 7a356373cd8f -r 056aba84ec73 usr.bin/sort/fsort.c
--- a/usr.bin/sort/fsort.c Tue Aug 18 17:52:30 2009 +0000
+++ b/usr.bin/sort/fsort.c Tue Aug 18 18:00:28 2009 +0000
@@ -1,4 +1,4 @@
-/* $NetBSD: fsort.c,v 1.36 2009/08/16 20:02:04 dsl Exp $ */
+/* $NetBSD: fsort.c,v 1.37 2009/08/18 18:00:28 dsl Exp $ */
/*-
* Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
@@ -62,17 +62,17 @@
*/
/*
- * Read in the next bin. If it fits in one segment sort it;
- * otherwise refine it by segment deeper by one character,
- * and try again on smaller bins. Sort the final bin at this level
- * of recursion to keep the head of fstack at 0.
- * After PANIC passes, abort to merge sort.
+ * Read in a block of records (until 'enough').
+ * sort, write to temp file.
+ * Merge sort temp files into output file
+ * Small files miss out the temp file stage.
+ * Large files might get multiple merges.
*/
#include "sort.h"
#include "fsort.h"
#ifndef lint
-__RCSID("$NetBSD: fsort.c,v 1.36 2009/08/16 20:02:04 dsl Exp $");
+__RCSID("$NetBSD: fsort.c,v 1.37 2009/08/18 18:00:28 dsl Exp $");
__SCCSID("@(#)fsort.c 8.1 (Berkeley) 6/6/93");
#endif /* not lint */
@@ -100,17 +100,13 @@
const u_char **keypos;
u_char *bufend;
u_char *weights;
- int ntfiles, mfct = 0, total, i, maxb, lastb, panic = 0;
- int c, nelem, base;
- long sizes [NBINS + 1];
+ int ntfiles, mfct = 0;
+ int c, nelem;
get_func_t get;
struct recheader *crec;
struct field tfield[2];
- FILE *prevfp, *tailfp[FSORTMAX + 1];
u_char *nbuffer;
- memset(tailfp, 0, sizeof(tailfp));
- prevfp = outfp;
memset(tfield, 0, sizeof(tfield));
if (ftbl[0].flags & R)
tfield[0].weights = Rascii;
@@ -124,256 +120,113 @@
memset(keylist, 0, MAXNUM * sizeof(u_char *));
}
bufend = buffer + bufsize;
- if (binno >= 0) {
- base = top + nfiles;
- get = getnext;
- } else {
- base = 0;
- if (SINGL_FLD)
- get = makeline;
- else
- get = makekey;
- }
- for (;;) {
- memset(sizes, 0, sizeof(sizes));
- c = nelem = ntfiles = 0; /* XXXGCC -Wuninitialized m68k */
- if (binno == weights[REC_D] &&
- !(SINGL_FLD && ftbl[0].flags & F)) { /* pop */
- rd_append(weights[REC_D], top,
- nfiles, prevfp, buffer, bufend);
- break;
- } else if (binno == weights[REC_D]) {
- depth = 0; /* start over on flat weights */
- ftbl = tfield;
- weights = ftbl[0].weights;
+ if (SINGL_FLD)
+ get = makeline;
+ else
+ get = makekey;
+
+ c = nelem = ntfiles = 0; /* XXXGCC -Wuninitialized m68k */
+ keypos = keylist; /* XXXGCC -Wuninitialized m68k */
+ crec = (RECHEADER *) buffer; /* XXXGCC -Wuninitialized m68k */
+ while (c != EOF) {
+ keypos = keylist;
+ nelem = 0;
+ crec = (RECHEADER *) buffer;
+
+ do_read:
+ while ((c = get(-1, top, filelist, nfiles, crec,
+ bufend, ftbl)) == 0) {
+ *keypos++ = crec->data + depth;
+ if (++nelem == MAXNUM) {
+ c = BUFFEND;
+ break;
+ }
+ crec =(RECHEADER *)((char *) crec +
+ SALIGN(crec->length) + REC_DATA_OFFSET);
}
- keypos = keylist; /* XXXGCC -Wuninitialized m68k */
- crec = (RECHEADER *) buffer; /* XXXGCC -Wuninitialized m68k */
- while (c != EOF) {
- keypos = keylist;
- nelem = 0;
- crec = (RECHEADER *) buffer;
+
+ if (c == BUFFEND && nelem < MAXNUM
+ && bufsize < MAXBUFSIZE) {
+ const u_char **keyp;
+ u_char *oldb = buffer;
+
+ /* buffer was too small for data, allocate
+ * bigger buffer */
+ nbuffer = realloc(buffer, bufsize * 2);
+ if (!nbuffer) {
+ err(2, "failed to realloc buffer to %lu bytes",
+ (unsigned long) bufsize * 2);
+ }
+ buffer = nbuffer;
+ bufsize *= 2;
+ bufend = buffer + bufsize;
+
+ /* patch up keylist[] */
+ for (keyp = &keypos[-1]; keyp >= keylist; keyp--)
+ *keyp = buffer + (*keyp - oldb);
+
+ crec = (RECHEADER *) (buffer + ((u_char *)crec - oldb));
+ goto do_read;
+ }
- do_read:
- while ((c = get(binno, top, filelist, nfiles, crec,
- bufend, ftbl)) == 0) {
- *keypos++ = crec->data + depth;
- if (++nelem == MAXNUM) {
- c = BUFFEND;
- break;
- }
- crec =(RECHEADER *)((char *) crec +
- SALIGN(crec->length) + REC_DATA_OFFSET);
+ if (c != BUFFEND && !ntfiles && !mfct) {
+ /* do not push */
+ continue;
+ }
+
+ /* push */
+ fstack[MSTART + mfct].fp = ftmp();
Home |
Main Index |
Thread Index |
Old Index