NetBSD-Bugs archive
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]
Re: bin/50638: Extreme slowness on loading gzipped kernels on oldCPUs
The following reply was made to PR bin/50638; it has been noted by GNATS.
From: Martin Husemann <martin%duskware.de@localhost>
To: gnats-bugs%NetBSD.org@localhost
Cc: tsutsui%ceres.dti.ne.jp@localhost
Subject: Re: bin/50638: Extreme slowness on loading gzipped kernels on oldCPUs
Date: Tue, 12 Jan 2016 12:41:24 +0100
--k1lZvvs/B4yU6o8G
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
I measured four variants by netbooting the slowest VAX I have available
right now with a hacked version of script(1), using the -r option to record
timestamps and a hack to display them (seconds:nanoseconds: $output).
I removed the twidling and size output. What looks like command input in
the first line is actually output from the VAX boot when it starts trying
a new kernel name, so no user interaction involved here anywhere.
non compressed kernel:
1452589100:570546000: > boot netbsd
1452589150:429907000: Copyright(c)...
= 49.8594s
gzip'd kernel with -current code in cread.c:
1452589566:425603000: > boot netbsd
1452590438:634284000: Copyright(c)...
= 872.209s
text data bss dec hex filename
61532 900 10236 72668 11bdc /ssd/hosts/vax/usr/mdec/boot
gzip'd kernel with Joerg's variant:
1452591021:055044000: > boot netbsd
1452591461:785939000: Copyright(c)...
= 440.731s
text data bss dec hex filename
61596 900 11264 73760 12020 /ssd/hosts/vax/usr/mdec/boot
gzip'd kernel and crc32() returning 0 always:
1452593154:151872000: > boot netbsd
1452593527:861325000: Copyright(c)...
= 373.709s
text data bss dec hex filename
61360 900 10236 72496 11b30 /ssd/hosts/vax/usr/mdec/boot
DYNAMIC_CRC_TABLE variant:
1452595355:235578000: > boot netbsd
1452595738:124262000: Copyright(c)...
= 382.889s
which sounds too quick, repeat DYNAMIC_CRC_TABLE test:
1452596046:563402000: > boot netbsd
1452596447:310237000: Copyright(c)...
= 400.747s
text data bss dec hex filename
62538 900 18428 81866 13fca /ssd/hosts/vax/usr/mdec/boot
The patch I used is attached. It is not optimized for size, but for ease
of testing all variants.
Martin
--k1lZvvs/B4yU6o8G
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename=patch
Index: cread.c
===================================================================
RCS file: /cvsroot/src/sys/lib/libsa/cread.c,v
retrieving revision 1.27
diff -u -r1.27 cread.c
--- cread.c 25 Jul 2015 07:06:11 -0000 1.27
+++ cread.c 12 Jan 2016 11:34:29 -0000
@@ -93,8 +93,45 @@
* a 4-bit table and at least 8K smaller than the libkern version.
*/
#ifndef ETHER_CRC_POLY_LE
-#define ETHER_CRC_POLY_LE 0xedb88320
+#define ETHER_CRC_POLY_LE 0xedb88320U
#endif
+
+#define CRC_VARIANT 2 // 0 = none, 1 = joerg's, 2 = -current, 3 = DYNAMIC_CRC_TABLE
+
+#if CRC_VARIANT == 0
+uint32_t
+crc32(uint32_t crc, const uint8_t *buf, size_t len)
+{
+ return 0;
+}
+#elif CRC_VARIANT == 1
+uint32_t
+crc32(uint32_t crc, const uint8_t *buf, size_t len)
+{
+ static volatile int crc_tbl_inited = 0;
+ static uint32_t crc_tbl[256];
+
+ if (!crc_tbl_inited) {
+ uint32_t crc2, b, i;
+ for (b = 0; b < 256; ++b) {
+ crc2 = b;
+ for (i = 8; i > 0; --i) {
+ if (crc2 & 1)
+ crc2 = (crc2 >> 1) ^ ETHER_CRC_POLY_LE;
+ else
+ crc2 = (crc2 >> 1);
+ }
+ crc_tbl[b] = crc2;
+ }
+ crc_tbl_inited = 1;
+ }
+
+ crc = crc ^ 0xffffffffU;
+ while (len--)
+ crc = crc_tbl[(crc ^ *buf++) & 0xff] ^ (crc >> 8);
+ return (crc ^ 0xffffffffU);
+}
+#elif CRC_VARIANT == 2
uint32_t
crc32(uint32_t crc, const uint8_t *const buf, size_t len)
{
@@ -115,6 +152,186 @@
}
return (crc ^ 0xffffffffU);
}
+#elif CRC_VARIANT == 3
+
+static volatile bool crc_table_empty = true;
+static uint32_t crc_table[8][256];
+static void make_crc_table(void);
+
+typedef uint32_t u4;
+
+/* Definitions for doing the crc four data bytes at a time. */
+#define REV(w) (((w)>>24)+(((w)>>8)&0xff00)+ \
+ (((w)&0xff00)<<8)+(((w)&0xff)<<24))
+
+/*
+ Generate tables for a byte-wise 32-bit CRC calculation on the polynomial:
+ x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1.
+
+ Polynomials over GF(2) are represented in binary, one bit per coefficient,
+ with the lowest powers in the most significant bit. Then adding polynomials
+ is just exclusive-or, and multiplying a polynomial by x is a right shift by
+ one. If we call the above polynomial p, and represent a byte as the
+ polynomial q, also with the lowest power in the most significant bit (so the
+ byte 0xb1 is the polynomial x^7+x^3+x+1), then the CRC is (q*x^32) mod p,
+ where a mod b means the remainder after dividing a by b.
+
+ This calculation is done using the shift-register method of multiplying and
+ taking the remainder. The register is initialized to zero, and for each
+ incoming bit, x^32 is added mod p to the register if the bit is a one (where
+ x^32 mod p is p+x^32 = x^26+...+1), and the register is multiplied mod p by
+ x (which is shifting right by one and adding x^32 mod p if the bit shifted
+ out is a one). We start with the highest power (least significant bit) of
+ q and repeat for all eight bits of q.
+
+ The first table is simply the CRC of all possible eight bit values. This is
+ all the information needed to generate CRCs on data a byte at a time for all
+ combinations of CRC register values and incoming bytes. The remaining tables
+ allow for word-at-a-time CRC calculation for both big-endian and little-
+ endian machines, where a word is four bytes.
+*/
+static void make_crc_table(void)
+{
+ uint32_t c;
+ int n, k;
+ uint32_t poly; /* polynomial exclusive-or pattern */
+ /* terms of polynomial defining this crc (except x^32): */
+ static volatile bool first = true; /* flag to limit concurrent making */
+ static const unsigned char p[] = {0,1,2,4,5,7,8,10,11,12,16,22,23,26};
+
+ /* See if another task is already doing this (not thread-safe, but better
+ than nothing -- significantly reduces duration of vulnerability in
+ case the advice about DYNAMIC_CRC_TABLE is ignored) */
+ if (first) {
+ first = false;
+
+ /* make exclusive-or pattern from polynomial (0xedb88320UL) */
+ poly = 0;
+ for (n = 0; n < sizeof(p)/sizeof(unsigned char); n++)
+ poly |= 1 << (31 - p[n]);
+
+ /* generate a crc for every 8-bit value */
+ for (n = 0; n < 256; n++) {
+ c = (uint32_t)n;
+ for (k = 0; k < 8; k++)
+ c = c & 1 ? poly ^ (c >> 1) : c >> 1;
+ crc_table[0][n] = c;
+ }
+
+ /* generate crc for each value followed by one, two, and three zeros,
+ and then the byte reversal of those as well as the first table */
+ for (n = 0; n < 256; n++) {
+ c = crc_table[0][n];
+ crc_table[4][n] = REV(c);
+ for (k = 1; k < 4; k++) {
+ c = crc_table[0][c & 0xff] ^ (c >> 8);
+ crc_table[k][n] = c;
+ crc_table[k + 4][n] = REV(c);
+ }
+ }
+
+ crc_table_empty = false;
+ }
+ else { /* not first */
+ /* wait for the other guy to finish (not efficient, but rare) */
+ while (crc_table_empty)
+ ;
+ }
+}
+
+#if BYTE_ORDER == LITTLE_ENDIAN
+/* ========================================================================= */
+#define DOLIT4 c ^= *buf4++; \
+ c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \
+ crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24]
+#define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4
+
+/* ========================================================================= */
+uint32_t crc32(uint32_t crc, const uint8_t *buf, size_t len)
+{
+ register u4 c;
+ register const u4 *buf4;
+
+ if (buf == NULL) return 0UL;
+
+ if (crc_table_empty)
+ make_crc_table();
+
+ c = (u4)crc;
+ c = ~c;
+ while (len && ((uintptr_t)buf & 3)) {
+ c = crc_table[0][(c ^ *buf++) & 0xff] ^ (c >> 8);
+ len--;
+ }
+
+ buf4 = (const u4 *)(const void *)buf;
+ while (len >= 32) {
+ DOLIT32;
+ len -= 32;
+ }
+ while (len >= 4) {
+ DOLIT4;
+ len -= 4;
+ }
+ buf = (const unsigned char *)buf4;
+
+ if (len) do {
+ c = crc_table[0][(c ^ *buf++) & 0xff] ^ (c >> 8);
+ } while (--len);
+ c = ~c;
+ return (uint32_t)c;
+}
+
+#else /* BIG_ENDIAN */
+
+/* ========================================================================= */
+#define DOBIG4 c ^= *++buf4; \
+ c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
+ crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24]
+#define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4
+
+/* ========================================================================= */
+uint32_t crc32(uint32_t crc, const uint8_t *buf, size_t len)
+{
+ register u4 c;
+ register const u4 *buf4;
+
+ if (buf == NULL) return 0UL;
+
+ if (crc_table_empty)
+ make_crc_table();
+
+ c = REV((u4)crc);
+ c = ~c;
+ while (len && ((uintptr_t)buf & 3)) {
+ c = crc_table[4][(c >> 24) ^ *buf++] ^ (c << 8);
+ len--;
+ }
+
+ buf4 = (const u4 *)(const void *)buf;
+ buf4--;
+ while (len >= 32) {
+ DOBIG32;
+ len -= 32;
+ }
+ while (len >= 4) {
+ DOBIG4;
+ len -= 4;
+ }
+ buf4++;
+ buf = (const unsigned char *)buf4;
+
+ if (len) do {
+ c = crc_table[4][(c >> 24) ^ *buf++] ^ (c << 8);
+ } while (--len);
+ c = ~c;
+ return (uint32_t)(REV(c));
+}
+#endif
+
+#else
+#error something is wrong
+#endif
/*
* compression utilities
--k1lZvvs/B4yU6o8G--
Home |
Main Index |
Thread Index |
Old Index