Subject: Re: VAX code generation problem wrt IPSEC?
To: None <port-vax@netbsd.org, current-users@netbsd.org>
From: Olaf Seibert <rhialto@polderland.nl>
List: port-vax
Date: 11/12/2001 02:30:11
--opJtzjQTFsWo+cga
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
I haven't tested yet whether just compiling in_cksum.c with -O1 or -O2 fixes the IP
checksum problem; first I looked at the assembly code generated by gcc.
And I found something curious in the code generation for a loop:
while ((mlen -= 32) >= 0) {
sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3];
sum += w[4]; sum += w[5]; sum += w[6]; sum += w[7];
sum += w[8]; sum += w[9]; sum += w[10]; sum += w[11];
sum += w[12]; sum += w[13]; sum += w[14]; sum += w[15];
w += 16;
}
is (optimised) turned into
L68:
#APP
addl2 (r3)+,r6 I presume this adds 4 bytes
adwc (r3)+,r6 at a time, otherwise I can't see
adwc (r3)+,r6 how 16 += are turned into
adwc (r3)+,r6 8 adwc (r3)+
adwc (r3)+,r6
adwc (r3)+,r6
adwc (r3)+,r6
adwc (r3)+,r6
adwc $0,r6
#NO_APP
L80:
subl2 $32,r2
jgeq L68
but unoptimised into
L65:
subl3 $32,-12(fp),r0
movl r0,r1
movl r1,-12(fp)
tstl r1
jgeq L67
jbr L66
L67:
movl -8(fp),r5 r5 is sum, apparently
movl -4(fp),r4 r4 is w
#APP
addl2 (r4)+,r5 add a word, w[0]
#NO_APP
movl r5,-8(fp) store the sum
movl -8(fp),r5 and get it back
movl -4(fp),r4 next addition: now we should do
#APP w[1] but the increment has been lost!
adwc (r4)+,r5 so once again we add w[0]
#NO_APP
(repeated, also 8 times adwc, which is add word with carry, according
to a quick google search (boo, hiss, I should have proper vax assembly
docs - at least I know pdp-11), anyway using a 32-bit adwc for 16-bit
additions seems quite a liberty by the compiler in itself)
so that looks like it is a big problem.
Now first I'll go to sleep, then tomorrow I will check if I have not
been writing nonsense.
-Olaf.
--
___ Olaf 'Rhialto' Seibert - rhialto@ --Soep van de dag, wat zal dat zijn
\X/ xs4all.nl --wat kan dat wezen, beter maar het ergste vrezen -Boy Bensdorp
--opJtzjQTFsWo+cga
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename="in_cksum-O2.s"
#NO_APP
# Optimised gcc output (-O2, going by the attachment name) for in_cksum on
# VAX.  Operand order is source(s) first, destination last.
#
# Register use as it appears in the code below:
#   r4 = current mbuf pointer (first argument, 4(ap))
#   r5 = bytes still to be checksummed (second argument, 8(ap))
#   r6 = running sum
#   r7 = flag flipped each time a single odd byte is folded in; if it is
#        set at the end, the sum is rotated by 8 bits
#   r2 = bytes left in the current mbuf
#   r3 = data pointer, advanced by the autoincrement operands
#   r8 = constant 16 (used as both position and size for extzv)
# NOTE(review): offsets 0, 8 and 12 from r4 are presumably m_next, m_data
# and m_len; the struct layout is not part of this listing.
# NOTE(review): the #APP ... #NO_APP regions mark inline asm from the C
# source.  The portable in_cksum.c attached to the same mail contains no
# inline asm, so this listing was presumably generated from a different
# (VAX-specific) source file -- confirm.
gcc2_compiled.:
___gnu_compiled_c:
.text
LC0:
.ascii "cksum: out of data\12\0"
.align 1
.globl _in_cksum
.type _in_cksum,@function
_in_cksum:
# Procedure entry mask: bits 6-9 are set, i.e. r6-r9 are saved on entry.
.word 0x3c0
movl 4(ap),r4
movl 8(ap),r5
clrl r6
clrl r7
# No mbuf at all: go straight to the "out of data" check.
tstl r4
jeql L55
movl $16,r8
# ---- per-mbuf loop ----
L76:
# Nothing left to checksum: finish up.
tstl r5
jeql L77
# r2 = 12(r4).  movl sets Z, so a zero length skips to the next mbuf.
movl 12(r4),r2
jeql L56
movl 8(r4),r3
# r2 = min(r2, r5), then r5 -= r2.
cmpl r5,r2
jgeq L60
movl r5,r2
L60:
subl3 r2,r5,r5
# 15 bytes or fewer: skip the alignment code and the 32/16-byte blocks.
cmpl r2,$15
jleq L62
# r0 = r3 & 3.  Already longword aligned: enter the 32-byte loop test.
bicl3 $-4,r3,r0
jeql L80
# Fold the sum: r6 = (r6 & 0xffff) + (r6 >> 16).
bicl3 $-65536,r6,r0
extzv r8,r8,r6,r1
addl3 r0,r1,r6
# Odd address: add one byte, shift the sum left by 8, flip the swap flag.
jlbc r3,L64
movzbl (r3)+,r0
addl2 r0,r6
ashl $8,r6,r6
xorl2 $1,r7
decl r2
L64:
# Bit 1 of the pointer set: add one 16-bit word to reach longword
# alignment.
jbc $1,r3,L80
movzwl (r3)+,r0
addl2 r0,r6
subl2 $2,r2
jbr L80
# ---- 32 bytes per pass: eight longword adds chained through the carry
# flag; the trailing "adwc $0" adds the last carry back into the sum.
L68:
#APP
addl2 (r3)+,r6
adwc (r3)+,r6
adwc (r3)+,r6
adwc (r3)+,r6
adwc (r3)+,r6
adwc (r3)+,r6
adwc (r3)+,r6
adwc (r3)+,r6
adwc $0,r6
#NO_APP
L80:
# r2 -= 32; repeat while the result is still >= 0, then undo the last
# subtraction.
subl2 $32,r2
jgeq L68
addl2 $32,r2
# One 16-byte block if at least 16 bytes remain.
cmpl r2,$15
jleq L62
#APP
addl2 (r3)+,r6
adwc (r3)+,r6
adwc (r3)+,r6
adwc (r3)+,r6
adwc $0,r6
#NO_APP
subl2 $16,r2
L62:
# One 8-byte block if at least 8 bytes remain.
cmpl r2,$7
jleq L71
#APP
addl2 (r3)+,r6
adwc (r3)+,r6
adwc $0,r6
#NO_APP
subl2 $8,r2
L71:
# One longword if at least 4 bytes remain.
cmpl r2,$3
jleq L72
#APP
addl2 (r3)+,r6
adwc $0,r6
#NO_APP
subl2 $4,r2
L72:
# Nothing left in this mbuf: move on to the next one.
tstl r2
jleq L56
# Fold the sum to its two 16-bit halves again before the 16-bit / 8-bit
# tail.
bicl3 $-65536,r6,r1
extzv r8,r8,r6,r0
addl3 r1,r0,r6
# Trailing 16-bit word if at least 2 bytes remain.
cmpl r2,$1
jleq L74
movzwl (r3)+,r0
addl2 r0,r6
subl2 $2,r2
L74:
# Trailing single byte: add it, shift the sum left by 8, flip the swap
# flag.
tstl r2
jleq L56
movzbl (r3),r0
addl2 r0,r6
ashl $8,r6,r6
xorl2 $1,r7
L56:
# Follow the pointer stored in the first longword of the current mbuf;
# loop while it is non-zero.
movl (r4),r4
jneq L76
L55:
# The chain ended with bytes still requested: print the LC0 message.
tstl r5
jeql L77
pushab LC0
calls $1,_printf
L77:
# Swap flag set: rotate the sum by 8 bits.
tstl r7
jeql L78
#APP
rotl $8,r6,r6
#NO_APP
L78:
# Final fold to (low half + high half), subtract 65535 if the result
# exceeds 65535, and return r6 ^ 0xffff in r0.
bicl3 $-65536,r6,r1
movl $16,r0
extzv r0,r0,r6,r0
addl3 r1,r0,r6
movzwl $65535,r0
cmpl r6,r0
jlequ L79
subl3 r0,r6,r6
L79:
xorl3 $65535,r6,r0
ret
Lfe1:
.size _in_cksum,Lfe1-_in_cksum
--opJtzjQTFsWo+cga
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename="in_cksum-O0.s"
#NO_APP
# Unoptimised gcc output (-O0, going by the attachment name) for in_cksum
# on VAX.  Operand order is source(s) first, destination last.
#
# Every variable lives in memory and is reloaded around each statement:
#   4(ap)   = current mbuf pointer (first argument)
#   8(ap)   = bytes still to be checksummed (second argument)
#   -4(fp)  = data pointer
#   -8(fp)  = running sum
#   -12(fp) = bytes left in the current mbuf
#   -16(fp) = flag flipped each time a single odd byte is folded in
# NOTE(review): offsets 0, 8 and 12 from the mbuf pointer are presumably
# m_next, m_data and m_len; the struct layout is not part of this listing.
# NOTE(review): the #APP ... #NO_APP regions mark inline asm from the C
# source.  The portable in_cksum.c attached to the same mail contains no
# inline asm, so this listing was presumably generated from a different
# (VAX-specific) source file -- confirm.
gcc2_compiled.:
___gnu_compiled_c:
.text
LC0:
.ascii "cksum: out of data\12\0"
.align 1
.globl _in_cksum
.type _in_cksum,@function
_in_cksum:
# Procedure entry mask: no registers are saved on entry.
.word 0x0
# Reserve 16 bytes of locals and zero the sum, the byte count and the
# swap flag.
subl2 $16,sp
clrl -8(fp)
clrl -12(fp)
clrl -16(fp)
# ---- per-mbuf loop: continue only while both the mbuf pointer and the
# remaining length are non-zero ----
L53:
tstl 4(ap)
jeql L57
tstl 8(ap)
jneq L56
jbr L57
L57:
jbr L54
L56:
# -12(fp) = 12(mbuf); a zero length skips to the next mbuf.
movl 4(ap),r0
movl 12(r0),r1
movl r1,-12(fp)
tstl r1
jneq L58
jbr L55
L58:
# -4(fp) = 8(mbuf), the data pointer.
movl 4(ap),r0
movl 8(r0),-4(fp)
# -12(fp) = min(-12(fp), 8(ap)), then 8(ap) -= -12(fp).
cmpl 8(ap),-12(fp)
jgeq L59
movl 8(ap),-12(fp)
L59:
subl3 -12(fp),8(ap),8(ap)
# 15 bytes or fewer: skip the alignment code and the 32/16-byte blocks.
cmpl -12(fp),$15
jgtr L60
jbr L61
L60:
# r0 = pointer & 3.  Already longword aligned: nothing to fix up.
bicl3 $-4,-4(fp),r0
tstl r0
jeql L62
# Fold the sum: sum = (sum & 0xffff) + (sum >> 16).  r3 = 16 is the field
# position and r2 = 32 - 16 is the field size for extzv.
bicl3 $-65536,-8(fp),r0
movl -8(fp),r1
movl $16,r3
subb3 r3,$32,r2
extzv r3,r2,r1,r1
addl3 r0,r1,-8(fp)
# Odd address (pointer & 1): add one byte, shift the sum left by 8, flip
# the swap flag.  Here the pointer update is an explicit incl on -4(fp).
bicl3 $-2,-4(fp),r0
tstl r0
jeql L63
movzbl *-4(fp),r0
addl2 r0,-8(fp)
ashl $8,-8(fp),-8(fp)
xorl2 $1,-16(fp)
incl -4(fp)
decl -12(fp)
L63:
# Bit 1 of the pointer set (pointer & 2): add one 16-bit word.  Again the
# pointer is advanced explicitly in memory.
bicl3 $-3,-4(fp),r0
tstl r0
jeql L62
movzwl *-4(fp),r0
addl2 r0,-8(fp)
addl2 $2,-4(fp)
subl2 $2,-12(fp)
L64:
L62:
nop
# ---- 32-byte loop: count -= 32, continue while the result is >= 0 ----
L65:
subl3 $32,-12(fp),r0
movl r0,r1
movl r1,-12(fp)
tstl r1
jgeq L67
jbr L66
L67:
# Each step below reloads the sum into r5 and the pointer into r4 from the
# frame, runs one inline-asm add with an autoincrement operand, and stores
# only r5 back.  The incremented r4 is never written to -4(fp), so every
# step reloads the same pointer value, and -4(fp) is still unchanged when
# control returns to L65.
movl -8(fp),r5
movl -4(fp),r4
#APP
addl2 (r4)+,r5
#NO_APP
movl r5,-8(fp)
movl -8(fp),r5
movl -4(fp),r4
#APP
adwc (r4)+,r5
#NO_APP
movl r5,-8(fp)
movl -8(fp),r5
movl -4(fp),r4
#APP
adwc (r4)+,r5
#NO_APP
movl r5,-8(fp)
movl -8(fp),r5
movl -4(fp),r4
#APP
adwc (r4)+,r5
#NO_APP
movl r5,-8(fp)
movl -8(fp),r5
movl -4(fp),r4
#APP
adwc (r4)+,r5
#NO_APP
movl r5,-8(fp)
movl -8(fp),r5
movl -4(fp),r4
#APP
adwc (r4)+,r5
#NO_APP
movl r5,-8(fp)
movl -8(fp),r5
movl -4(fp),r4
#APP
adwc (r4)+,r5
#NO_APP
movl r5,-8(fp)
movl -8(fp),r5
movl -4(fp),r4
#APP
adwc (r4)+,r5
#NO_APP
movl r5,-8(fp)
# Add the last carry back into the sum.
movl -8(fp),r5
#APP
adwc $0,r5
#NO_APP
movl r5,-8(fp)
jbr L65
L66:
# Undo the last subtraction, then do one 16-byte block if at least 16
# bytes remain.  Same reload pattern as above, with r4 = sum and
# r5 = pointer; the incremented r5 is likewise not stored back.
addl2 $32,-12(fp)
cmpl -12(fp),$15
jleq L68
movl -8(fp),r4
movl -4(fp),r5
#APP
addl2 (r5)+,r4
#NO_APP
movl r4,-8(fp)
movl -8(fp),r4
movl -4(fp),r5
#APP
adwc (r5)+,r4
#NO_APP
movl r4,-8(fp)
movl -8(fp),r4
movl -4(fp),r5
#APP
adwc (r5)+,r4
#NO_APP
movl r4,-8(fp)
movl -8(fp),r4
movl -4(fp),r5
#APP
adwc (r5)+,r4
#NO_APP
movl r4,-8(fp)
movl -8(fp),r4
#APP
adwc $0,r4
#NO_APP
movl r4,-8(fp)
subl2 $16,-12(fp)
L68:
nop
L61:
# One 8-byte block if at least 8 bytes remain (r5 = sum, r4 = pointer;
# the incremented pointer is not stored back here either).
cmpl -12(fp),$7
jleq L69
movl -8(fp),r5
movl -4(fp),r4
#APP
addl2 (r4)+,r5
#NO_APP
movl r5,-8(fp)
movl -8(fp),r5
movl -4(fp),r4
#APP
adwc (r4)+,r5
#NO_APP
movl r5,-8(fp)
movl -8(fp),r5
#APP
adwc $0,r5
#NO_APP
movl r5,-8(fp)
subl2 $8,-12(fp)
L69:
# One longword if at least 4 bytes remain (r4 = sum, r5 = pointer; the
# incremented pointer is not stored back).
cmpl -12(fp),$3
jleq L70
movl -8(fp),r4
movl -4(fp),r5
#APP
addl2 (r5)+,r4
#NO_APP
movl r4,-8(fp)
movl -8(fp),r4
#APP
adwc $0,r4
#NO_APP
movl r4,-8(fp)
subl2 $4,-12(fp)
L70:
# Nothing left in this mbuf: move on to the next one.
tstl -12(fp)
jleq L55
# Fold the sum to its two 16-bit halves before the 16-bit / 8-bit tail.
bicl3 $-65536,-8(fp),r0
movl -8(fp),r1
movl $16,r3
subb3 r3,$32,r2
extzv r3,r2,r1,r1
addl3 r0,r1,-8(fp)
# Trailing 16-bit word if at least 2 bytes remain; the pointer is advanced
# explicitly in memory.
cmpl -12(fp),$1
jleq L72
movzwl *-4(fp),r0
addl2 r0,-8(fp)
addl2 $2,-4(fp)
subl2 $2,-12(fp)
L72:
# Trailing single byte: add it, shift the sum left by 8, flip the swap
# flag.
tstl -12(fp)
jleq L55
movzbl *-4(fp),r0
addl2 r0,-8(fp)
ashl $8,-8(fp),-8(fp)
xorl2 $1,-16(fp)
L73:
L71:
L55:
# Replace the mbuf pointer with the pointer stored in the first longword
# of the current mbuf, then re-test at L53.
movl *4(ap),4(ap)
jbr L53
L54:
# The loop ended with bytes still requested: print the LC0 message.
tstl 8(ap)
jeql L74
pushab LC0
calls $1,_printf
L74:
# Swap flag set: rotate the sum by 8 bits.
tstl -16(fp)
jeql L75
movl -8(fp),r4
#APP
rotl $8,r4,r4
#NO_APP
movl r4,-8(fp)
L75:
# Final fold to (low half + high half), subtract 65535 if the result
# exceeds 65535, and return sum ^ 0xffff in r0.
bicl3 $-65536,-8(fp),r0
movl -8(fp),r1
movl $16,r3
subb3 r3,$32,r2
extzv r3,r2,r1,r1
addl3 r0,r1,-8(fp)
cmpl -8(fp),$65535
jlequ L76
addl2 $-65535,-8(fp)
L76:
xorl3 $65535,-8(fp),r1
movl r1,r0
ret
# The second ret directly follows the first and cannot be reached by
# fall-through.
ret
Lfe1:
.size _in_cksum,Lfe1-_in_cksum
--opJtzjQTFsWo+cga
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename="in_cksum.c"
/* $NetBSD: in_cksum.c,v 1.14 2000/03/30 13:24:55 augustss Exp $ */
/*
* Copyright (c) 1988, 1992, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)in_cksum.c 8.1 (Berkeley) 6/10/93
*/
#include <sys/param.h>
#include <sys/mbuf.h>
#include <sys/systm.h>
#include <netinet/in.h>
/*
* Checksum routine for Internet Protocol family headers (Portable Version).
*
* This routine is very heavily used in the network
* code and should be modified for each CPU to be as fast as possible.
*/
/*
* ADDCARRY(x): if x exceeds 65535, subtract 65535 from it in place.
* The argument is expanded unparenthesised and more than once, so it has
* to be a plain modifiable variable.
*/
#define ADDCARRY(x) (x > 65535 ? x -= 65535 : x)
/*
* REDUCE: fold the running sum by adding its two 16-bit halves (read back
* through the l_util union), then apply ADDCARRY.  Relies on locals named
* `sum' and `l_util' being in scope where it is used.
*/
#define REDUCE {l_util.l = sum; sum = l_util.s[0] + l_util.s[1]; ADDCARRY(sum);}
/*
* in_cksum: sum the data of an mbuf chain as 16-bit words and return the
* complement of the folded sum.
*
* m   - first mbuf of the chain; following mbufs are reached via m_next.
* len - number of bytes to include in the checksum.
*
* Returns (~sum & 0xffff).  If the chain ends while len is still non-zero,
* "cksum: out of data" is printed and the sum of what was seen is used.
*/
int
in_cksum(m, len)
struct mbuf *m;
int len;
{
u_int16_t *w;
int sum = 0;
/* bytes left in the current mbuf; -1 means an odd byte is pending in s_util.c[0] */
int mlen = 0;
/* set while the sum is shifted by 8 bits because data started on an odd address */
int byte_swapped = 0;
/* assembles one 16-bit word from two separately read bytes */
union {
u_int8_t c[2];
u_int16_t s;
} s_util;
/* gives REDUCE access to the two 16-bit halves of the 32-bit sum */
union {
u_int16_t s[2];
u_int32_t l;
} l_util;
for (;m && len; m = m->m_next) {
if (m->m_len == 0)
continue;
w = mtod(m, u_int16_t *);
if (mlen == -1) {
/*
* The first byte of this mbuf is the continuation
* of a word spanning between this mbuf and the
* last mbuf.
*
* s_util.c[0] is already saved when scanning previous
* mbuf.
*/
s_util.c[1] = *(u_int8_t *)w;
sum += s_util.s;
w = (u_int16_t *)((u_int8_t *)w + 1);
mlen = m->m_len - 1;
len--;
} else
mlen = m->m_len;
/* Never read more than the caller asked for. */
if (len < mlen)
mlen = len;
len -= mlen;
/*
* Force to even boundary.
* The sum is folded and shifted left by 8, the odd byte is
* parked in s_util.c[0], and byte_swapped records that the
* shift has to be undone later.
*/
if ((1 & (long) w) && (mlen > 0)) {
REDUCE;
sum <<= 8;
s_util.c[0] = *(u_int8_t *)w;
w = (u_int16_t *)((int8_t *)w + 1);
mlen--;
byte_swapped = 1;
}
/*
* Unroll the loop to make overhead from
* branches &c small.
*/
/* 32 bytes (16 words) per pass. */
while ((mlen -= 32) >= 0) {
sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3];
sum += w[4]; sum += w[5]; sum += w[6]; sum += w[7];
sum += w[8]; sum += w[9]; sum += w[10]; sum += w[11];
sum += w[12]; sum += w[13]; sum += w[14]; sum += w[15];
w += 16;
}
/* Undo the subtraction that ended the loop above. */
mlen += 32;
/* 8 bytes (4 words) per pass. */
while ((mlen -= 8) >= 0) {
sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3];
w += 4;
}
mlen += 8;
/* Nothing left over and no pending byte swap: next mbuf, without folding. */
if (mlen == 0 && byte_swapped == 0)
continue;
REDUCE;
/* Remaining whole words; leaves mlen at -1 if one byte is left, else -2. */
while ((mlen -= 2) >= 0) {
sum += *w++;
}
if (byte_swapped) {
/* Shift by another 8 bits so the sum is back in its normal byte order. */
REDUCE;
sum <<= 8;
byte_swapped = 0;
if (mlen == -1) {
/* The trailing byte pairs up with the byte parked in s_util.c[0]. */
s_util.c[1] = *(u_int8_t *)w;
sum += s_util.s;
mlen = 0;
} else
/* The parked byte is still unpaired; carry it to the next mbuf. */
mlen = -1;
} else if (mlen == -1)
/* One trailing byte: park it until its partner is seen. */
s_util.c[0] = *(u_int8_t *)w;
}
if (len)
printf("cksum: out of data\n");
if (mlen == -1) {
/* The last mbuf has odd # of bytes. Follow the
standard (the odd byte may be shifted left by 8 bits
or not as determined by endian-ness of the machine) */
s_util.c[1] = 0;
sum += s_util.s;
}
REDUCE;
return (~sum & 0xffff);
}
--opJtzjQTFsWo+cga--