NetBSD-Bugs archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

Re: bin/50638: Extreme slowness on loading gzipped kernels on oldCPUs



The following reply was made to PR bin/50638; it has been noted by GNATS.

From: Martin Husemann <martin%duskware.de@localhost>
To: gnats-bugs%NetBSD.org@localhost
Cc: tsutsui%ceres.dti.ne.jp@localhost
Subject: Re: bin/50638: Extreme slowness on loading gzipped kernels on oldCPUs
Date: Tue, 12 Jan 2016 12:41:24 +0100

 --k1lZvvs/B4yU6o8G
 Content-Type: text/plain; charset=us-ascii
 Content-Disposition: inline
 
 I measured four variants by netbooting the slowest VAX I have available
 right now with a hacked version of script(1), using the -r option to record
 timestamps and a hack to display them (seconds:nanoseconds: $output).
 I removed the twidling and size output. What looks like command input in
 the first line is actually output from the VAX boot when it starts trying
 a new kernel name, so no user interaction involved here anywhere.
 
 non compressed kernel:
 
   1452589100:570546000: > boot netbsd
   1452589150:429907000: Copyright(c)...
   = 49.8594s
 
 gzip'd kernel with -current code in cread.c:
 
   1452589566:425603000: > boot netbsd
   1452590438:634284000: Copyright(c)...
   = 872.209s
 
    text    data     bss     dec     hex filename
   61532     900   10236   72668   11bdc /ssd/hosts/vax/usr/mdec/boot
 
 
 gzip'd kernel with Joerg's variant:
 
  1452591021:055044000: > boot netbsd
  1452591461:785939000: Copyright(c)...
  = 440.731s
 
    text    data     bss     dec     hex filename
   61596     900   11264   73760   12020 /ssd/hosts/vax/usr/mdec/boot
 
 
 gzip'd kernel and crc32() returning 0 always:
 
   1452593154:151872000: > boot netbsd
   1452593527:861325000: Copyright(c)...
   = 373.709s
 
    text    data     bss     dec     hex filename
   61360     900   10236   72496   11b30 /ssd/hosts/vax/usr/mdec/boot
 
 
 DYNAMIC_CRC_TABLE variant:
 
 
   1452595355:235578000: > boot netbsd
   1452595738:124262000: Copyright(c)...
   = 382.889s
 
 which sounds too quick, repeat DYNAMIC_CRC_TABLE test:
 
   1452596046:563402000: > boot netbsd
   1452596447:310237000: Copyright(c)...
   = 400.747s
 
    text    data     bss     dec     hex filename
   62538     900   18428   81866   13fca /ssd/hosts/vax/usr/mdec/boot
 
 
 
 The patch I used is attached. It is not optimized for size, but for ease
 of testing all variants.
 
 Martin
 
 --k1lZvvs/B4yU6o8G
 Content-Type: text/plain; charset=us-ascii
 Content-Disposition: attachment; filename=patch
 
 Index: cread.c
 ===================================================================
 RCS file: /cvsroot/src/sys/lib/libsa/cread.c,v
 retrieving revision 1.27
 diff -u -r1.27 cread.c
 --- cread.c	25 Jul 2015 07:06:11 -0000	1.27
 +++ cread.c	12 Jan 2016 11:34:29 -0000
 @@ -93,8 +93,45 @@
   * a 4-bit table and at least 8K smaller than the libkern version.
   */
  #ifndef ETHER_CRC_POLY_LE
 -#define ETHER_CRC_POLY_LE	0xedb88320
 +#define ETHER_CRC_POLY_LE	0xedb88320U
  #endif
 +
 +#define	CRC_VARIANT	2	// 0 = none, 1 = joerg's, 2 = -current, 3 = DYNAMIC_CRC_TABLE
 +
 +#if CRC_VARIANT == 0
 +uint32_t
 +crc32(uint32_t crc, const uint8_t *buf, size_t len)
 +{
 +	return 0;
 +}
 +#elif CRC_VARIANT == 1
 +uint32_t
 +crc32(uint32_t crc, const uint8_t *buf, size_t len)
 +{
 +	static volatile int crc_tbl_inited = 0;
 +	static uint32_t crc_tbl[256];
 +
 +	if (!crc_tbl_inited) {
 +		uint32_t crc2, b, i;
 +		for (b = 0; b < 256; ++b) {
 +			crc2 = b;
 +			for (i = 8; i > 0; --i) {
 +				if (crc2 & 1)
 +					crc2 = (crc2 >> 1) ^ ETHER_CRC_POLY_LE;
 +				else    
 +					crc2 = (crc2 >> 1);
 +			}
 +			crc_tbl[b] = crc2;
 +		}
 +		crc_tbl_inited = 1;
 +	}
 +
 +	crc = crc ^ 0xffffffffU;
 +	while (len--)
 +		crc = crc_tbl[(crc ^ *buf++) & 0xff] ^ (crc >> 8);
 +	return (crc ^ 0xffffffffU);
 +}
 +#elif CRC_VARIANT == 2
  uint32_t
  crc32(uint32_t crc, const uint8_t *const buf, size_t len)
  {
 @@ -115,6 +152,186 @@
  	}
  	return (crc ^ 0xffffffffU);
  }
 +#elif CRC_VARIANT == 3
 +
 +static volatile bool crc_table_empty = true;
 +static uint32_t crc_table[8][256];
 +static void make_crc_table(void);
 +
 +typedef uint32_t u4;
 +
 +/* Definitions for doing the crc four data bytes at a time. */
 +#define REV(w) (((w)>>24)+(((w)>>8)&0xff00)+ \
 +               (((w)&0xff00)<<8)+(((w)&0xff)<<24))
 +
 +/*
 +  Generate tables for a byte-wise 32-bit CRC calculation on the polynomial:
 +  x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1.
 +
 +  Polynomials over GF(2) are represented in binary, one bit per coefficient,
 +  with the lowest powers in the most significant bit.  Then adding polynomials
 +  is just exclusive-or, and multiplying a polynomial by x is a right shift by
 +  one.  If we call the above polynomial p, and represent a byte as the
 +  polynomial q, also with the lowest power in the most significant bit (so the
 +  byte 0xb1 is the polynomial x^7+x^3+x+1), then the CRC is (q*x^32) mod p,
 +  where a mod b means the remainder after dividing a by b.
 +
 +  This calculation is done using the shift-register method of multiplying and
 +  taking the remainder.  The register is initialized to zero, and for each
 +  incoming bit, x^32 is added mod p to the register if the bit is a one (where
 +  x^32 mod p is p+x^32 = x^26+...+1), and the register is multiplied mod p by
 +  x (which is shifting right by one and adding x^32 mod p if the bit shifted
 +  out is a one).  We start with the highest power (least significant bit) of
 +  q and repeat for all eight bits of q.
 +
 +  The first table is simply the CRC of all possible eight bit values.  This is
 +  all the information needed to generate CRCs on data a byte at a time for all
 +  combinations of CRC register values and incoming bytes.  The remaining tables
 +  allow for word-at-a-time CRC calculation for both big-endian and little-
 +  endian machines, where a word is four bytes.
 +*/
 +static void make_crc_table(void)
 +{
 +    uint32_t c;
 +    int n, k;
 +    uint32_t poly;                      /* polynomial exclusive-or pattern */
 +    /* terms of polynomial defining this crc (except x^32): */
 +    static volatile bool first = true;  /* flag to limit concurrent making */
 +    static const unsigned char p[] = {0,1,2,4,5,7,8,10,11,12,16,22,23,26};
 +
 +    /* See if another task is already doing this (not thread-safe, but better
 +       than nothing -- significantly reduces duration of vulnerability in
 +       case the advice about DYNAMIC_CRC_TABLE is ignored) */
 +    if (first) {
 +        first = false;
 +
 +        /* make exclusive-or pattern from polynomial (0xedb88320UL) */
 +        poly = 0;
 +        for (n = 0; n < sizeof(p)/sizeof(unsigned char); n++)
 +            poly |= 1 << (31 - p[n]);
 +
 +        /* generate a crc for every 8-bit value */
 +        for (n = 0; n < 256; n++) {
 +            c = (uint32_t)n;
 +            for (k = 0; k < 8; k++)
 +                c = c & 1 ? poly ^ (c >> 1) : c >> 1;
 +            crc_table[0][n] = c;
 +        }
 +
 +        /* generate crc for each value followed by one, two, and three zeros,
 +           and then the byte reversal of those as well as the first table */
 +        for (n = 0; n < 256; n++) {
 +            c = crc_table[0][n];
 +            crc_table[4][n] = REV(c);
 +            for (k = 1; k < 4; k++) {
 +                c = crc_table[0][c & 0xff] ^ (c >> 8);
 +                crc_table[k][n] = c;
 +                crc_table[k + 4][n] = REV(c);
 +            }
 +        }
 +
 +        crc_table_empty = false;
 +    }
 +    else {      /* not first */
 +        /* wait for the other guy to finish (not efficient, but rare) */
 +        while (crc_table_empty)
 +            ;
 +    }
 +}
 +
 +#if BYTE_ORDER == LITTLE_ENDIAN
 +/* ========================================================================= */
 +#define DOLIT4 c ^= *buf4++; \
 +        c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \
 +            crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24]
 +#define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4
 +
 +/* ========================================================================= */
 +uint32_t crc32(uint32_t crc, const uint8_t *buf, size_t len)
 +{
 +    register u4 c;
 +    register const u4 *buf4;
 +
 +    if (buf == NULL) return 0UL;
 +
 +    if (crc_table_empty)
 +        make_crc_table();
 +
 +    c = (u4)crc;
 +    c = ~c;
 +    while (len && ((uintptr_t)buf & 3)) {
 +        c = crc_table[0][(c ^ *buf++) & 0xff] ^ (c >> 8);
 +        len--;
 +    }
 +
 +    buf4 = (const u4 *)(const void *)buf;
 +    while (len >= 32) {
 +        DOLIT32;
 +        len -= 32;
 +    }
 +    while (len >= 4) {
 +        DOLIT4;
 +        len -= 4;
 +    }
 +    buf = (const unsigned char *)buf4;
 +
 +    if (len) do {
 +        c = crc_table[0][(c ^ *buf++) & 0xff] ^ (c >> 8);
 +    } while (--len);
 +    c = ~c;
 +    return (uint32_t)c;
 +}
 +
 +#else /* BIG_ENDIAN */
 +
 +/* ========================================================================= */
 +#define DOBIG4 c ^= *++buf4; \
 +        c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
 +            crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24]
 +#define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4
 +
 +/* ========================================================================= */
 +uint32_t crc32(uint32_t crc, const uint8_t *buf, size_t len)
 +{
 +    register u4 c;
 +    register const u4 *buf4;
 +
 +    if (buf == NULL) return 0UL;
 +
 +    if (crc_table_empty)
 +        make_crc_table();
 +
 +    c = REV((u4)crc);
 +    c = ~c;
 +    while (len && ((uintptr_t)buf & 3)) {
 +        c = crc_table[4][(c >> 24) ^ *buf++] ^ (c << 8);
 +        len--;
 +    }
 +
 +    buf4 = (const u4 *)(const void *)buf;
 +    buf4--;
 +    while (len >= 32) {
 +        DOBIG32;
 +        len -= 32;
 +    }
 +    while (len >= 4) {
 +        DOBIG4;
 +        len -= 4;
 +    }
 +    buf4++;
 +    buf = (const unsigned char *)buf4;
 +
 +    if (len) do {
 +        c = crc_table[4][(c >> 24) ^ *buf++] ^ (c << 8);
 +    } while (--len);
 +    c = ~c;
 +    return (uint32_t)(REV(c));
 +}
 +#endif
 +
 +#else
 +#error something is wrong
 +#endif
  
  /*
   * compression utilities
 
 --k1lZvvs/B4yU6o8G--
 


Home | Main Index | Thread Index | Old Index