Subject: Re: bin/30504
To: None <jdolecek@netbsd.org, gnats-admin@netbsd.org,>
From: Peter Seebach <seebs@plethora.net>
List: netbsd-bugs
Date: 06/14/2005 21:26:02
The following reply was made to PR bin/30504; it has been noted by GNATS.
From: seebs@plethora.net (Peter Seebach)
To: jdolecek@netbsd.org, gnats-bugs@netbsd.org
Cc:
Subject: Re: bin/30504
Date: Tue, 14 Jun 2005 16:25:44 -0500
In message <20050612122725.0939363B11F@narn.netbsd.org>, wiz@netbsd.org writes:
>Synopsis: sort mis-sorts many numbers
>
>Responsible-Changed-From-To: bin-bug-people->jdolecek
>Responsible-Changed-By: wiz@netbsd.org
>Responsible-Changed-When: Sun, 12 Jun 2005 12:27:24 +0000
>Responsible-Changed-Why:
>jdolecek supports sort(1).
I found one more boundary condition. On a reverse key sort, trailing zeroes
sorted as >9 instead of <1.
*** /usr/src/usr.bin/sort/fields.c Sun Mar 14 15:12:14 2004
--- fields.c Tue Jun 14 16:01:58 2005
***************
*** 89,95 ****
}
static u_char *enterfield __P((u_char *, u_char *, struct field *, int));
! static u_char *number __P((u_char *, u_char *, u_char *, u_char *, int));
#define DECIMAL '.'
#define OFFSET 128
--- 89,95 ----
}
static u_char *enterfield __P((u_char *, u_char *, struct field *, int));
! static u_char *number __P((u_char *, u_char *, u_char *, u_char *, int, int));
#define DECIMAL '.'
#define OFFSET 128
***************
*** 151,172 ****
fieldtable->flags)) == NULL)
return (1);
- keybuf->offset = keypos - keybuf->data;
- keybuf->length = keybuf->offset + line->size;
- if (keybuf->length + sizeof(TRECHEADER) > size) {
- /* line too long for buffer */
- return (1);
- }
-
/*
* Make [s]radixsort() only sort by relevant part of key if:
* 1. we want to choose unique items by relevant field[s]
* 2. we want stable sort and so the items should be sorted only by
* the relevant field[s]
*/
! if (UNIQUE || (stable_sort && keybuf->offset < line->size))
! keypos[-1] = REC_D;
memcpy(keybuf->data + keybuf->offset, line->data, line->size);
return (0);
}
--- 151,175 ----
fieldtable->flags)) == NULL)
return (1);
/*
* Make [s]radixsort() only sort by relevant part of key if:
* 1. we want to choose unique items by relevant field[s]
* 2. we want stable sort and so the items should be sorted only by
* the relevant field[s]
*/
! if (UNIQUE || stable_sort)
! *keypos++ = REC_D;
+ /*
+ * Append the entire line. This will be used for final output
+ * even if it is ignored by [s]radixsort().
+ */
+ keybuf->offset = keypos - keybuf->data;
+ keybuf->length = keybuf->offset + line->size;
+ if (keybuf->length + sizeof(TRECHEADER) > size) {
+ /* line too long for buffer */
+ return (1);
+ }
memcpy(keybuf->data + keybuf->offset, line->data, line->size);
return (0);
}
***************
*** 184,189 ****
--- 187,193 ----
struct column icol, tcol;
u_int flags;
u_int Rflag;
+ u_int BNflag;
icol = cur_fld->icol;
tcol = cur_fld->tcol;
***************
*** 210,216 ****
if (flags & N) {
Rflag = (gflags & R ) ^ (flags & R) ? 1 : 0;
! return number(tablepos, endkey, start, end, Rflag);
}
mask = cur_fld->mask;
--- 214,221 ----
if (flags & N) {
Rflag = (gflags & R ) ^ (flags & R) ? 1 : 0;
! BNflag = (gflags & BN ) | (flags & BN) ? 1 : 0;
! return number(tablepos, endkey, start, end, Rflag, BNflag);
}
mask = cur_fld->mask;
***************
*** 232,255 ****
return (tablepos == endkey ? NULL : tablepos);
}
! /* Uses the first bin to assign sign, expsign, 0, and the first
* 61 out of the exponent ( (254 - 3 origins - 4 over/underflows)/4 = 61 ).
! * When sorting in forward order:
! * use (0-99) -> (130->240) for sorting the mantissa if REC_D <=128;
! * else use (0-99)->(2-102).
! * If the exponent is >=61, use another byte for each additional 253
* in the exponent. Cutoff is at 567.
* To avoid confusing the exponent and the mantissa, use a field delimiter
* if the exponent is exactly 61, 61+252, etc--this is ok, since it's the
* only time a field delimiter can come in that position.
* Reverse order is done analagously.
*/
static u_char *
! number(pos, bufend, line, lineend, Rflag)
u_char *line, *pos, *bufend, *lineend;
! int Rflag;
{
int or_sign, parity = 0;
int expincr = 1, exponent = -1;
int bite, expsign = 1, sign = 1, zeroskip = 0;
--- 237,300 ----
return (tablepos == endkey ? NULL : tablepos);
}
! /*
! * number(): Convert a number into a format that can be sorted by
! * radixsort().
! *
! * The first bin encodes assign sign, expsign, 0, and the first
* 61 out of the exponent ( (254 - 3 origins - 4 over/underflows)/4 = 61 ).
! * Subsequent bytes encode the remainder of the exponent (if the
! * exponent was too large to be encoded entirely in the first bin),
! * and the mantissa.
! *
! * The following values are used in the first bin (after being mapped
! * via nweights[]/fnum[]/rnum[]):
! *
! * 255: cannot be used, because the nweights[]/fnum[]/rnum[] mapping
! * requires inputs in the range 0..254.
! * 254: +overflow (+sign, +exponent, exponent > 567)
! * 251..253: unused
! * 250: +sign, +exponent, exponent >= 61, exponent <= 567
! * 190..249: +sign, +exponent, exponent > 0, exponent <= 60
! * 189: +sign, exponent=0
! * 129..188: +sign, -exponent, abs(exponent) > 0, abs(exponent) <= 60
! * 128: +sign, -exponent, abs(exponent) >= 61, abs(exponent) <= 567
! * ** 128 is ambiguous
! * 128: +underflow (+sign, -exponent, abs(exponent) > 567)
! * 127: zero, or not a valid number
! * 126: -underflow (-sign, -exponent, abs(exponent) > 567)
! * 125: -sign, -exponent, abs(exponent) >= 61, abs(exponent) <= 567
! * 65..124: -sign, -exponent, abs(exponent) > 0, abs(exponent) <= 60
! * 64: -sign, exponent=0
! * 4..63: -sign, +exponent, exponent > 0, exponent <= 60
! * 3: -sign, +exponent, exponent >= 61, exponent <= 567
! * 1..2: unused
! * 0: -overflow (-sign, +exponent, exponent > 567)
! *
! * If abs(exponent) is >=61, use another bin for each additional 253
* in the exponent. Cutoff is at 567.
* To avoid confusing the exponent and the mantissa, use a field delimiter
* if the exponent is exactly 61, 61+252, etc--this is ok, since it's the
* only time a field delimiter can come in that position.
+ *
+ * After the exponent, append zero or more bins to represent the mantissa,
+ * with two mantissa digits per bin.
+ * When sorting in forward order:
+ * use (0-99) -> (130->240) for sorting the mantissa if REC_D <=128;
+ * else use (0-99)->(2-102).
* Reverse order is done analagously.
+ *
+ * After the mantissa, append a final bin containing 0 or 254,
+ * depending on whether we want the value to sort before or after a
+ * similar value that has more digits in the mantissa.
*/
static u_char *
! number(pos, bufend, line, lineend, Rflag, BNflag)
u_char *line, *pos, *bufend, *lineend;
! int Rflag, BNflag;
{
+ u_char *startpos = pos;
int or_sign, parity = 0;
int expincr = 1, exponent = -1;
int bite, expsign = 1, sign = 1, zeroskip = 0;
***************
*** 288,294 ****
if (*line < '1' || *line > '9' || line >= lineend) {
/* only exit if we didn't skip any zero number */
if (!zeroskip) {
! *pos++ = nweights[127];
return (pos);
}
}
--- 333,339 ----
if (*line < '1' || *line > '9' || line >= lineend) {
/* only exit if we didn't skip any zero number */
if (!zeroskip) {
! *pos++ = BNflag ? nweights[0]: nweights[127];
return (pos);
}
}
***************
*** 305,311 ****
}
bite = min(exponent, 61);
*pos++ = nweights[(sign) ? (expsign ? 189+bite : 189-bite)
! : (expsign ? 64-bite : 64+bite)];
if (bite >= 61) {
do {
exponent -= bite;
--- 350,359 ----
}
bite = min(exponent, 61);
*pos++ = nweights[(sign) ? (expsign ? 189+bite : 189-bite)
! /* range 128 .. 250 */
! : (expsign ? 64-bite : 64+bite)
! /* range 3 .. 125 */
! ];
if (bite >= 61) {
do {
exponent -= bite;
***************
*** 333,338 ****
--- 381,388 ----
} else
break;
}
+ if (!nonzero && lastvalue == '0')
+ pos = startpos;
if ((parity && lastvalue != '0') || !nonzero) {
*pos++ = or_sign ? OFF_NTENS[lastvalue] - '0' :
OFF_TENS[lastvalue] + '0';
***************
*** 340,352 ****
pos = nonzero;
if (pos > bufend-1)
return (NULL);
! *pos++ = or_sign ? nweights[254] : nweights[0];
return (pos);
}
/* This forces a gap around the record delimiter
* Thus fnum has vaues over (0,254) -> ((0,REC_D-1),(REC_D+1,255));
* rnum over (0,254) -> (255,REC_D+1),(REC_D-1,0))
*/
void
num_init()
--- 390,404 ----
pos = nonzero;
if (pos > bufend-1)
return (NULL);
! *pos++ = (or_sign ^ Rflag) ? nweights[254] : nweights[0];
return (pos);
}
/* This forces a gap around the record delimiter
* Thus fnum has vaues over (0,254) -> ((0,REC_D-1),(REC_D+1,255));
* rnum over (0,254) -> (255,REC_D+1),(REC_D-1,0))
+ *
+ * XXX: fnum[255] and rnum[255] are unused.
*/
void
num_init()
***************
*** 362,371 ****
}
for (i = 0; i < REC_D; i++) {
fnum[i] = i;
! rnum[255 - i] = i;
}
for (i = REC_D; i <255; i++) {
fnum[i] = i + 1;
! rnum[255 - i] = i - 1;
}
}
--- 414,423 ----
}
for (i = 0; i < REC_D; i++) {
fnum[i] = i;
! rnum[254 - i] = i;
}
for (i = REC_D; i <255; i++) {
fnum[i] = i + 1;
! rnum[254 - i] = i - 1;
}
}
*** /usr/src/usr.bin/sort/init.c Wed Nov 3 14:14:36 2004
--- init.c Sun Jun 12 01:36:04 2005
***************
*** 1,4 ****
! /* $NetBSD: init.c,v 1.16 2004/11/03 20:14:36 dsl Exp $ */
/*-
* Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
--- 1,4 ----
! /* $NetBSD: init.c,v 1.15 2004/02/18 20:44:36 jdolecek Exp $ */
/*-
* Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
***************
*** 71,77 ****
#include "sort.h"
#ifndef lint
! __RCSID("$NetBSD: init.c,v 1.16 2004/11/03 20:14:36 dsl Exp $");
__SCCSID("@(#)init.c 8.1 (Berkeley) 6/6/93");
#endif /* not lint */
--- 71,77 ----
#include "sort.h"
#ifndef lint
! __RCSID("$NetBSD: init.c,v 1.15 2004/02/18 20:44:36 jdolecek Exp $");
__SCCSID("@(#)init.c 8.1 (Berkeley) 6/6/93");
#endif /* not lint */
***************
*** 260,265 ****
--- 260,266 ----
case 'f': return (F);
case 'i': return (I);
case 'n': return (N);
+ case 'N': return (BN) | (N);
case 'r': return (R);
default: return (0);
}
***************
*** 282,288 ****
if (argv[i][0] != '+' && !fplus)
continue;
! if (fplus && (argv[i][0] != '-' || !isdigit((unsigned char)argv[i][1]))) {
fplus = 0;
if (argv[i][0] != '+') {
/* not a -POS argument, skip */
--- 283,289 ----
if (argv[i][0] != '+' && !fplus)
continue;
! if (fplus && (argv[i][0] != '-' || !isdigit((int) argv[i][1]))) {
fplus = 0;
if (argv[i][0] != '+') {
/* not a -POS argument, skip */
*** /usr/src/usr.bin/sort/sort.1 Fri Jul 23 08:20:36 2004
--- sort.1 Sun Jun 12 01:36:04 2005
***************
*** 162,168 ****
.Fl n
option no longer implies the
.Fl b
! option.)
.It Fl r
Reverse the sense of comparisons.
.It Fl S
--- 162,174 ----
.Fl n
option no longer implies the
.Fl b
! option.) If there is no such numeric string, the field is treated as
! having the value zero.
! .It Fl N
! When sorting numerically, treat non-numeric fields as negative infinity,
! rather than as zero. Implies the
! .Fl n
! option.
.It Fl r
Reverse the sense of comparisons.
.It Fl S
*** /usr/src/usr.bin/sort/sort.c Fri Jul 23 08:26:11 2004
--- sort.c Sun Jun 12 01:36:04 2005
***************
*** 158,165 ****
if (!(tmpdir = getenv("TMPDIR")))
tmpdir = _PATH_TMP;
! while ((ch = getopt(argc, argv, "bcdfik:mHno:rR:sSt:T:ux")) != -1) {
switch (ch) {
case 'b':
fldtab->flags |= BI | BT;
break;
--- 158,168 ----
if (!(tmpdir = getenv("TMPDIR")))
tmpdir = _PATH_TMP;
! while ((ch = getopt(argc, argv, "bcdfik:mHnNo:rR:sSt:T:ux")) != -1) {
switch (ch) {
+ case 'N':
+ fldtab->flags |= BN | N;
+ break;
case 'b':
fldtab->flags |= BI | BT;
break;
***************
*** 360,366 ****
if (msg != NULL)
(void)fprintf(stderr, "sort: %s\n", msg);
(void)fprintf(stderr,
! "usage: %s [-bcdfHimnrSsu] [-k field1[,field2]] [-o output]"
" [-R char] [-T dir]", getprogname());
(void)fprintf(stderr,
" [-t char] [file ...]\n");
--- 363,369 ----
if (msg != NULL)
(void)fprintf(stderr, "sort: %s\n", msg);
(void)fprintf(stderr,
! "usage: %s [-bcdfHimnNrSsu] [-k field1[,field2]] [-o output]"
" [-R char] [-T dir]", getprogname());
(void)fprintf(stderr,
" [-t char] [file ...]\n");
*** /usr/src/usr.bin/sort/sort.h Sun Feb 15 08:22:55 2004
--- sort.h Sun Jun 12 01:36:04 2005
***************
*** 91,96 ****
--- 91,97 ----
#define R 16 /* Field is reversed with respect to the global weight */
#define BI 32 /* ignore blanks in icol */
#define BT 64 /* ignore blanks in tcol */
+ #define BN 128 /* invalid numeric fields are -infinity */
/* masks for delimiters: blanks, fields, and termination. */
#define BLANK 1 /* ' ', '\t'; '\n' if -T is invoked */