(This is on NetBSD/amd64 9.1) Some long double calculations lose precision, even when the values that are involved should easily fit in the 64 mantissa bits that long doubles are promised to have (according to LDBL_MANT_DIG). Below is a demonstration program that I extracted from a Macro-11 assembler, in particular the part where a 64 bit PDP-11 floating point number is constructed. These have 1 sign bit, 8 exponent bit, and 55 (56) mantissa bits (if you count the implict 1 at the start). See https://gitlab.com/Rhialto/macro11/-/blob/issue-5-floating-point-literals/parse.c#L380 I chopped out a bit that isn't relevant for this case (truncating/rounding to smaller output). I was testing some integral numbers that get near the end of the exactly representable range of whole numbers of these floats, and I discovered that the results did not always match the expected values. I didn't pull the "expected" values out if thin air, but those are the results from the RSX Macro11 assembler. That's why they are octal. The loss of precision probably occurs at the line // The following big literal is 2 to the 56th power: ufrac = (uint64_t) (frac * 72057594037927936.0); where frac is in the range [ 0.5 , 1.0 >, so I would expect that multiplying with 2**56 is perfectly feasable (just changes the exponent). Debugging output shows that differences in the lsbits that were detectable before (when printed with %La), were no longer after. In particular the last test case 1<<57 + 2 looks wrong to me; it should be exactly representable but gets the same result as 1<<57. 1<<57 - 1 should be rounded up but maybe this is slightly more debatable. # /* gcc demo.c -o demo -lm ./demo exit $? */ #include <stdlib.h> #include <stdint.h> #include <stdio.h> #include <string.h> #include <math.h> #include <float.h> #include <ieeefp.h> #define DEBUG_FLOAT 1 #if DEBUG_FLOAT void printflt(unsigned *flt, int size) { printf("%06o: ", flt[0]); printf("sign: %d ", (flt[0] & 0x8000) >> 15); printf("uexp: %x ", (flt[0] & 0x7F80) >> 7); printf("ufrac: %02x", flt[0] & 0x007F); for (int i = 1; i < size; i++) { printf(" %04x", flt[i]); } printf("\n"); } #define DF(x) printf x #else #define DF(x) #endif /* * We need 56 bits of mantissa. * * Try to detect if it is needed, possible and useful to use * long double instead of double, when parsing floating point numbers. */ #if DBL_MANT_DIG >= 56 /* plain double seems big enough */ # define USE_LONG_DOUBLE 0 /* long double exists and seems big enough */ #elif LDBL_MANT_DIG >= 56 # define USE_LONG_DOUBLE 1 #elif defined(LDBL_MANT_DIG) /* long double exists but is probably still too small */ # define USE_LONG_DOUBLE 1 #else /* long double does not exist and plain double is too small */ # define USE_LONG_DOUBLE 0 #endif #if USE_LONG_DOUBLE # define DOUBLE long double # define SCANF_FMT "%Lf" # define FREXP frexpl #else # define DOUBLE double # define SCANF_FMT "%lf" # define FREXP frexp #endif /* Parse PDP-11 64-bit floating point format. */ /* Give a pointer to "size" words to receive the result. */ /* Note: there are probably degenerate cases that store incorrect results. For example, I think rounding up a FLT2 might cause exponent overflow. Sorry. */ /* Note also that the full 56 bits of precision probably aren't always available on the source platform, given the widespread application of IEEE floating point formats, so expect some differences. Sorry again. */ int parse_float( char *cp, int size, unsigned *flt) { DOUBLE d; /* value */ DOUBLE frac; /* fractional value */ uint64_t ufrac; /* fraction converted to 56 bit unsigned integer */ int i; /* Number of fields converted by sscanf */ int n; /* Number of characters converted by sscanf */ int sexp; /* Signed exponent */ unsigned uexp; /* Unsigned excess-128 exponent */ unsigned sign = 0; /* Sign mask */ //fpsetprec(FP_PE); /* makes no difference... */ i = sscanf(cp, SCANF_FMT "%n", &d, &n); if (i == 0) return 0; /* Wasn't able to convert */ DF(("input: %s -> scanf: %Lf\n", cp, d)); cp += n; if (d == 0.0) { for (i = 0; i < size; i++) { flt[i] = 0; /* All-bits-zero equals zero */ } return 1; /* Good job. */ } frac = FREXP(d, &sexp); /* Separate into exponent and mantissa */ DF(("frac: %Lf %La sexp: %d\n", frac, frac, sexp)); if (sexp < -127 || sexp > 127) return 0; /* Exponent out of range. */ uexp = sexp + 128; /* Make excess-128 mode */ uexp &= 0xff; /* express in 8 bits */ DF(("uexp: %02x\n", uexp)); /* * frexp guarantees its fractional return value is * abs(frac) >= 0.5 and abs(frac) < 1.0 * Another way to think of this is that: * abs(frac) >= 2**-1 and abs(frac) < 2**0 */ if (frac < 0) { sign = (1 << 15); /* Negative sign */ frac = -frac; /* fix the mantissa */ } /* * For the PDP-11 floating point representation the * fractional part is 7 bits (for 16-bit floating point * literals), 23 bits (for 32-bit floating point values), * or 55 bits (for 64-bit floating point values). * However the bit immediately above the MSB is always 1 * because the value is normalized. So it's actually * 8 bits, 24 bits, or 56 bits. * We multiply the fractional part of our value by * 2**56 to fully expose all of those bits (including * the MSB which is 1). */ /* The following big literal is 2 to the 56th power: */ ufrac = (uint64_t) (frac * 72057594037927936.0); /* Align fraction bits */ DF(("ufrac: %016lx\n", ufrac)); DF(("56 : %016lx\n", (1LL<<56) - 1)); /* ufrac is now >= 2**55 and < 2**56 */ /* * +--+--------+-------+ +--------+--------+ * |15|14 7|6 0| |15 | 0| * +--+--------+-------+ +--------+--------+ * | S|EEEEEEEE|MMMMMMM| |MMMMMMMM|MMMMMMMM| ...2 more words... * +--+--------+-------+ +--------+--------+ * Sign (1 bit) * Exponent (8 bits) * Mantissa (7 bits) */ /* ... omitted truncation/rounding to 2 or 1 word results ... */ flt[0] = (unsigned) (sign | (uexp << 7) | ((ufrac >> 48) & 0x7F)); if (size > 1) { flt[1] = (unsigned) ((ufrac >> 32) & 0xffff); if (size > 2) { flt[2] = (unsigned) ((ufrac >> 16) & 0xffff); flt[3] = (unsigned) ((ufrac >> 0) & 0xffff); } } return 1; } void test_one(char *input, unsigned expected0, unsigned expected1, unsigned expected2, unsigned expected3) { unsigned result[4]; printf("------------------------\n"); parse_float(input, 4, result); if (result[0] != expected0 || result[1] != expected1 || result[2] != expected2 || result[3] != expected3) { printf("Unexpected result: %04x %04x %04x %04x\n", result[0], result[1], result[2], result[3]); printf(" expected : %04x %04x %04x %04x\n", expected0, expected1, expected2, expected3); printflt(result, 4); } } int main(int argc, char **argv) { /* Should print 64 */ DF(("LDBL_MANT_DIG: %d\n", LDBL_MANT_DIG)); /* First a number that should just fit in a 56 bit mantissa. */ /* 1 << 56 */ test_one("72057594037927936", 056200, 000000, 000000, 000000); /* One more and one less should also still fit */ test_one("72057594037927937", 056200, 000000, 000000, 000001); test_one("72057594037927935", 056177, 0177777, 0177777, 0177777); /* 1 << 57 should also be exactly representable */ test_one("144115188075855872", 056400, 000000, 000000, 000000); /* 1 less lacks one significant bit so will be rounded up */ test_one("144115188075855871", 056400, 000000, 000000, 000000); /* Same for 1 more, rounded down */ test_one("144115188075855873", 056400, 000000, 000000, 000000); /* but 2 more should show up in the lsb */ /* This one seems most clearly problematic */ test_one("144115188075855874", 056400, 000000, 000000, 000001); return 0; } -Olaf. -- Olaf 'Rhialto' Seibert -- rhialto at falu dot nl ___ Anyone who is capable of getting themselves made President should on \X/ no account be allowed to do the job. --Douglas Adams, "THGTTG"
Attachment:
signature.asc
Description: PGP signature