Source-Changes-HG archive
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]
[src/trunk]: src/usr.bin/sort Save length of key instead of relying of the we...
details: https://anonhg.NetBSD.org/src/rev/58c5343e257f
branches: trunk
changeset: 747318:58c5343e257f
user: dsl <dsl%NetBSD.org@localhost>
date: Thu Sep 10 22:02:40 2009 +0000
description:
Save length of key instead of relying of the weight of the record sep.
This frees a byte value to use for 'end of key' (to correctly sort
short keys) while still having a weight assigned to the field sep.
(Unless -t is given, the field sep is in the field data.)
Do reverse sorts by writing the output file in reverse order (rather
than reversing the sort - apart from merges).
All key compares are now unweighted.
For 'sort -u' mark duplicates keys during the sort and don't write
to the output.
Use -S to mean a posix sort - where equal keys are sorted using the
raw record (rather than being kept in the original order).
For 'sort -f' (no keys) generate a key of the folded data (as for -n
-i and -d), simplifies the code and allows a 'posix' sort.
diffstat:
usr.bin/sort/Makefile | 5 +-
usr.bin/sort/append.c | 82 ++++----------------
usr.bin/sort/fields.c | 40 +++++----
usr.bin/sort/files.c | 7 +-
usr.bin/sort/fsort.c | 20 +---
usr.bin/sort/init.c | 66 +++++++++-------
usr.bin/sort/msort.c | 43 +++-------
usr.bin/sort/radix_sort.c | 179 +++++++++++++++++++++++++++------------------
usr.bin/sort/sort.c | 46 +++++++----
usr.bin/sort/sort.h | 25 +++--
10 files changed, 256 insertions(+), 257 deletions(-)
diffs (truncated from 1026 to 300 lines):
diff -r 411a124296ad -r 58c5343e257f usr.bin/sort/Makefile
--- a/usr.bin/sort/Makefile Thu Sep 10 21:36:39 2009 +0000
+++ b/usr.bin/sort/Makefile Thu Sep 10 22:02:40 2009 +0000
@@ -1,8 +1,11 @@
-# $NetBSD: Makefile,v 1.7 2009/09/05 09:16:18 dsl Exp $
+# $NetBSD: Makefile,v 1.8 2009/09/10 22:02:40 dsl Exp $
# from: @(#)Makefile 8.1 (Berkeley) 6/6/93
PROG= sort
SRCS= append.c fields.c files.c fsort.c init.c msort.c sort.c tmp.c
SRCS+= radix_sort.c
+LDADD+=-lutil
+DPADD+=${LIBUTIL}
+
.include <bsd.prog.mk>
diff -r 411a124296ad -r 58c5343e257f usr.bin/sort/append.c
--- a/usr.bin/sort/append.c Thu Sep 10 21:36:39 2009 +0000
+++ b/usr.bin/sort/append.c Thu Sep 10 22:02:40 2009 +0000
@@ -1,4 +1,4 @@
-/* $NetBSD: append.c,v 1.21 2009/09/05 12:00:25 dsl Exp $ */
+/* $NetBSD: append.c,v 1.22 2009/09/10 22:02:40 dsl Exp $ */
/*-
* Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
@@ -64,82 +64,34 @@
#include "sort.h"
#ifndef lint
-__RCSID("$NetBSD: append.c,v 1.21 2009/09/05 12:00:25 dsl Exp $");
+__RCSID("$NetBSD: append.c,v 1.22 2009/09/10 22:02:40 dsl Exp $");
__SCCSID("@(#)append.c 8.1 (Berkeley) 6/6/93");
#endif /* not lint */
#include <stdlib.h>
-#include <string.h>
-
-static int
-wt_cmp(const u_char *a, const u_char *b, size_t len, u_char *wts)
-{
- size_t i;
-
- for (i = 0; i < len; i++) {
- if (wts[*a++] != wts[*b++])
- return 1;
- }
-
- return 0;
-}
/*
- * copy sorted lines to output; check for uniqueness
+ * copy sorted lines to output
+ * Ignore duplicates (marked with -ve keylen)
*/
void
-append(const RECHEADER **keylist, int nelem, FILE *fp, put_func_t put, u_char *wts)
+append(RECHEADER **keylist, int nelem, FILE *fp, put_func_t put)
{
- const RECHEADER **cpos, **lastkey;
- const RECHEADER *crec, *prec;
- size_t plen;
+ RECHEADER **cpos, **lastkey;
+ RECHEADER *crec;
lastkey = keylist + nelem;
- if (!UNIQUE || wts == NULL) {
- for (cpos = keylist; cpos < lastkey; cpos++)
- put(*cpos, fp);
- return;
- }
-
- if (nelem == 0)
- return;
-
- cpos = keylist;
- prec = *cpos;
-
- if (!SINGL_FLD) {
- /* Key for each line is already in adjacent bytes */
- plen = prec->offset;
- for (cpos = &keylist[1]; cpos < lastkey; cpos++) {
+ if (REVERSE) {
+ for (cpos = lastkey; cpos-- > keylist;) {
crec = *cpos;
- if (crec->offset == plen
- && memcmp(crec->data, prec->data, plen) == 0) {
- /* Duplicate key */
- continue;
- }
- put(prec, fp);
- prec = crec;
- plen = prec->offset;
+ if (crec->keylen >= 0)
+ put(crec, fp);
}
- put(prec, fp);
- return;
+ } else {
+ for (cpos = keylist; cpos < lastkey; cpos++) {
+ crec = *cpos;
+ if (crec->keylen >= 0)
+ put(crec, fp);
+ }
}
-
- /* We have to compare the raw data - which means applying weight */
-
- /* Key for each line is already in adjacent bytes */
- plen = prec->length;
- for (cpos = &keylist[1]; cpos < lastkey; cpos++) {
- crec = *cpos;
- if (crec->length == plen
- && wt_cmp(crec->data, prec->data, plen, wts) == 0) {
- /* Duplicate key */
- continue;
- }
- put(prec, fp);
- prec = crec;
- plen = prec->length;
- }
- put(prec, fp);
- return;
}
diff -r 411a124296ad -r 58c5343e257f usr.bin/sort/fields.c
--- a/usr.bin/sort/fields.c Thu Sep 10 21:36:39 2009 +0000
+++ b/usr.bin/sort/fields.c Thu Sep 10 22:02:40 2009 +0000
@@ -1,4 +1,4 @@
-/* $NetBSD: fields.c,v 1.27 2009/08/22 21:28:55 dsl Exp $ */
+/* $NetBSD: fields.c,v 1.28 2009/09/10 22:02:40 dsl Exp $ */
/*-
* Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
@@ -66,7 +66,7 @@
#include "sort.h"
#ifndef lint
-__RCSID("$NetBSD: fields.c,v 1.27 2009/08/22 21:28:55 dsl Exp $");
+__RCSID("$NetBSD: fields.c,v 1.28 2009/09/10 22:02:40 dsl Exp $");
__SCCSID("@(#)fields.c 8.1 (Berkeley) 6/6/93");
#endif /* not lint */
@@ -152,11 +152,19 @@
fieldtable->flags)) == NULL)
return (1);
}
- *keypos++ = 0;
keybuf->offset = keypos - keybuf->data;
keybuf->length = keybuf->offset + line_size;
+ /*
+ * Posix requires that equal keys be further sorted by the
+ * entire original record.
+ * NetBSD has (at least for some time) kept equal keys in
+ * their original order.
+ * For 'sort -u' posix_sort is unset.
+ */
+ keybuf->keylen = posix_sort ? keybuf->length : keybuf->offset;
+
memcpy(keypos, line_data, line_size);
return (0);
}
@@ -209,33 +217,31 @@
*tablepos++ = lweight[*start];
}
}
- /* Add extra byte to sort short keys correctly */
- *tablepos++ = flags & R ? 255 : 1;
+ /* Add extra byte (absent from lweight) to sort short keys correctly */
+ *tablepos++ = lweight[REC_D];
return tablepos;
}
/*
* Numbers are converted to a floating point format (exponent & mantissa)
* so that they compare correctly as sequence of unsigned bytes.
- * The output cannot contain a 0x00 byte (the record separator).
- * Bytes 0x01 and 0xff are used to terminate positive and negative numbers
+ * Bytes 0x00 and 0xff are used to terminate positive and negative numbers
* to ensure that 0.123 sorts after 0.12 and -0.123 sorts before -0.12.
*
* The first byte contain the overall sign, exponent sign and some of the
* exponent. These have to be ordered (-ve value, decreasing exponent),
* zero, (+ve value, increasing exponent).
- * After excluding 0, 1, 0xff and 0x80 (used for zero) there are 61
- * exponent values available, this isn't quite enough and the highest
- * values are used to encode large exponents in multiple bytes.
*
- * An exponent of zero has value 0xc0 for +ve numbers and 0x40 for -ves.
+ * The first byte is 0x80 for zero, 0x40 for -ve with exponent 0 and
+ * 0xc0 for +ve with exponent zero.
+ * This only leaves 62 byte values for +ve exponents - which isn't enough.
+ * The largest 5 exponent values are used to hold a byte count of the
+ * number of following bytes that contain 7 exponent bits per byte,
*
* The mantissa is stored 2 digits per byte offset by 0x40, for negative
* numbers the order must be reversed (they are subtracted from 0x100).
*
* Reverse sorts are done by inverting the sign of the number.
- *
- * We don't have to worry about REC_D, the key is terminated by 0x00.
*/
#define SIGNED(reverse, value) ((reverse) ? 0x100 - (value) : (value))
@@ -298,16 +304,14 @@
/* Maybe here we should allow for e+12 (etc) */
- /* exponent 0 is 0xc0 for +ve numbers and 0x40 for -ve ones */
- exponent += 0xc0;
-
- if (exponent > 0x80 + MAX_EXP_ENC && exponent < 0x100 - MAX_EXP_ENC) {
+ if (exponent < 0x3f - MAX_EXP_ENC && -exponent < 0x3f - MAX_EXP_ENC) {
/* Value ok for simple encoding */
+ /* exponent 0 is 0xc0 for +ve numbers and 0x40 for -ve ones */
+ exponent += 0xc0;
*pos++ = SIGNED(reverse, exponent);
} else {
/* Out or range for a single byte */
int c, t;
- exponent -= 0xc0;
t = exponent > 0 ? exponent : -exponent;
/* Count how many 7-bit blocks are needed */
for (c = 0; ; c++) {
diff -r 411a124296ad -r 58c5343e257f usr.bin/sort/files.c
--- a/usr.bin/sort/files.c Thu Sep 10 21:36:39 2009 +0000
+++ b/usr.bin/sort/files.c Thu Sep 10 22:02:40 2009 +0000
@@ -1,4 +1,4 @@
-/* $NetBSD: files.c,v 1.36 2009/09/05 12:00:25 dsl Exp $ */
+/* $NetBSD: files.c,v 1.37 2009/09/10 22:02:40 dsl Exp $ */
/*-
* Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
@@ -65,7 +65,7 @@
#include "fsort.h"
#ifndef lint
-__RCSID("$NetBSD: files.c,v 1.36 2009/09/05 12:00:25 dsl Exp $");
+__RCSID("$NetBSD: files.c,v 1.37 2009/09/10 22:02:40 dsl Exp $");
__SCCSID("@(#)files.c 8.1 (Berkeley) 6/6/93");
#endif /* not lint */
@@ -123,6 +123,7 @@
if (c == REC_D) {
recbuf->offset = 0;
recbuf->length = pos - recbuf->data;
+ recbuf->keylen = recbuf->length - 1;
return (0);
}
}
@@ -138,6 +139,7 @@
*pos++ = REC_D;
recbuf->offset = 0;
recbuf->length = pos - recbuf->data;
+ recbuf->keylen = recbuf->length - 1;
return (0);
}
FCLOSE(fp);
@@ -154,6 +156,7 @@
recbuf->offset = 0;
recbuf->length = 0;
+ recbuf->keylen = 0;
return (BUFFEND);
}
diff -r 411a124296ad -r 58c5343e257f usr.bin/sort/fsort.c
--- a/usr.bin/sort/fsort.c Thu Sep 10 21:36:39 2009 +0000
+++ b/usr.bin/sort/fsort.c Thu Sep 10 22:02:40 2009 +0000
@@ -1,4 +1,4 @@
-/* $NetBSD: fsort.c,v 1.40 2009/09/05 12:00:25 dsl Exp $ */
+/* $NetBSD: fsort.c,v 1.41 2009/09/10 22:02:40 dsl Exp $ */
/*-
* Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
@@ -72,7 +72,7 @@
#include "fsort.h"
#ifndef lint
-__RCSID("$NetBSD: fsort.c,v 1.40 2009/09/05 12:00:25 dsl Exp $");
+__RCSID("$NetBSD: fsort.c,v 1.41 2009/09/10 22:02:40 dsl Exp $");
__SCCSID("@(#)fsort.c 8.1 (Berkeley) 6/6/93");
#endif /* not lint */
@@ -86,8 +86,8 @@
void
fsort(struct filelist *filelist, int nfiles, FILE *outfp, struct field *ftbl)
{
- const RECHEADER **keylist;
- const RECHEADER **keypos, **keyp;
+ RECHEADER **keylist;
+ RECHEADER **keypos, **keyp;
RECHEADER *buffer;
size_t bufsize = DEFBUFSIZE;
u_char *bufend;
@@ -158,25 +158,19 @@
}
/* Sort this set of records */
Home |
Main Index |
Thread Index |
Old Index