Subject: about powerpc version of in{,4}_cksum
To: None <port-powerpc@netbsd.org>
From: enami tsugutomo <enami@sm.sony.co.jp>
List: port-powerpc
Date: 07/30/2002 19:08:25
There are some bugs in, and some possible improvements to, the current
powerpc version of in{,4}_cksum (which is found in
sys/arch/powerpc/powerpc/in_cksum.c).  The highlights are:

1) addze 7,7 and addze %1,%1 are used to clear the carry bit, but they
   aren't correct.  The former fails to clear it if the junk register r7
   happens to contain 0xffffffff, and the latter may clobber a non-junk
   (i.e., necessary) register.

2) When adjusting to a 4-byte boundary, just adding a 16-bit value to the
   variable `sum' isn't enough, since `sum' may already hold a full 32-bit
   value at that point, depending on how a packet is divided into mbufs.
   So we need to take care of the carry bit.  This actually prevented my
   Mac (g4-500dp) from netbooting.  (We could REDUCE instead, but that
   results in a longer instruction sequence.)

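3) In an asm statement, the constraint letter "b" (base register) should
   be used instead of "r" for a pointer operand, since "r" allows the
   compiler to pick r0, which is read as the constant 0 rather than as a
   register when it appears in the base position of a load.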

4) Load with update is slower than an ordinary load (+ add) depending on
   the implementation (this is written in the PowerPC programming manual).
   With the changes appended below, in4_cksum for a particular packet
   (an NFS packet I used to debug 2) above) runs about 15% faster on my
   Mac.  I'm not sure how this affects other PowerPC variants, though.

Comments?

Many thanks to Tsubai for all the help.

enami.

Index: in_cksum.c
===================================================================
RCS file: /cvsroot/syssrc/sys/arch/powerpc/powerpc/in_cksum.c,v
retrieving revision 1.4
diff -c -r1.4 in_cksum.c
*** in_cksum.c	2002/07/29 09:14:37	1.4
--- in_cksum.c	2002/07/30 09:47:04
***************
*** 62,67 ****
--- 62,68 ----
  	uint8_t *w;
  	int mlen = 0;
  	int byte_swapped = 0;
+ 	int n;
  
  	union {
  		uint8_t  c[2];
***************
*** 103,109 ****
  		 * Force to a word boundary.
  		 */
  		if ((3 & (long) w) && (mlen > 0)) {
! 			if ((1 & (long) w) && (mlen > 0)) {
  				REDUCE;
  				sum <<= 8;
  				s_util.c[0] = *w++;
--- 104,110 ----
  		 * Force to a word boundary.
  		 */
  		if ((3 & (long) w) && (mlen > 0)) {
! 			if ((1 & (long) w)) {
  				REDUCE;
  				sum <<= 8;
  				s_util.c[0] = *w++;
***************
*** 111,192 ****
  				byte_swapped = 1;
  			}
  			if ((2 & (long) w) && (mlen > 1)) {
! 				sum += *(uint16_t *)w;
  				w += 2;
  				mlen -= 2;
  			}
  		}
  
  		if (mlen >= 64) {
- 			register int n __asm("r0");
- 			uint8_t *tmpw;
- 
  			n = mlen >> 6;
! 			tmpw = w - 4;
! 			asm volatile(
! 				"addze 7,7;"		/* clear carry */
! 				"mtctr %1;"		/* load loop count */
! 				"1:"
! 				"lwzu 7,4(%2);"		/* load current data word */
! 				"lwzu 8,4(%2);"
! 				"lwzu 9,4(%2);"
! 				"lwzu 10,4(%2);"
! 				"adde %0,%0,7;"		/* add to sum */
! 				"adde %0,%0,8;"
! 				"adde %0,%0,9;"
! 				"adde %0,%0,10;"
! 				"lwzu 7,4(%2);"
! 				"lwzu 8,4(%2);"
! 				"lwzu 9,4(%2);"
! 				"lwzu 10,4(%2);"
! 				"adde %0,%0,7;"
! 				"adde %0,%0,8;"
! 				"adde %0,%0,9;"
! 				"adde %0,%0,10;"
! 				"lwzu 7,4(%2);"
! 				"lwzu 8,4(%2);"
! 				"lwzu 9,4(%2);"
! 				"lwzu 10,4(%2);"
! 				"adde %0,%0,7;"
! 				"adde %0,%0,8;"
! 				"adde %0,%0,9;"
! 				"adde %0,%0,10;"
! 				"lwzu 7,4(%2);"
! 				"lwzu 8,4(%2);"
! 				"lwzu 9,4(%2);"
! 				"lwzu 10,4(%2);"
! 				"adde %0,%0,7;"
! 				"adde %0,%0,8;"
! 				"adde %0,%0,9;"
! 				"adde %0,%0,10;"
! 				"bdnz 1b;"		/* loop */
! 				"addze %0,%0;"		/* add carry bit */
! 				: "+r"(sum)
! 				: "r"(n), "r"(tmpw)
! 				: "7", "8", "9", "10");	/* clobber r7, r8, r9, r10 */
  			w += n * 64;
  			mlen -= n * 64;
  		}
  
  		if (mlen >= 8) {
- 			register int n __asm("r0");
- 			uint8_t *tmpw;
- 
  			n = mlen >> 3;
! 			tmpw = w - 4;
! 			asm volatile(
! 				"addze %1,%1;"		/* clear carry */
! 				"mtctr %1;"		/* load loop count */
! 				"1:"
! 				"lwzu 7,4(%2);"		/* load current data word */
! 				"lwzu 8,4(%2);"
! 				"adde %0,%0,7;"		/* add to sum */
! 				"adde %0,%0,8;"
! 				"bdnz 1b;"		/* loop */
! 				"addze %0,%0;"		/* add carry bit */
! 				: "+r"(sum)
! 				: "r"(n), "r"(tmpw)
! 				: "7", "8");		/* clobber r7, r8 */
  			w += n * 8;
  			mlen -= n * 8;
  		}
--- 112,199 ----
  				byte_swapped = 1;
  			}
  			if ((2 & (long) w) && (mlen > 1)) {
! 				/*
! 				 * Since the `sum' may contain full 32 bit
! 				 * value, we can't simply add any value.
! 				 */
! 				__asm __volatile(
! 				    "lhz 7,0(%1);"	/* load current data
! 							   half word */
! 				    "addc %0,%0,7;"	/* add to sum */
! 				    "addze %0,%0;"	/* add carry bit */
! 				    : "+r"(sum)
! 				    : "b"(w)
! 				    : "7");		/* clobber r7 */
  				w += 2;
  				mlen -= 2;
  			}
  		}
  
  		if (mlen >= 64) {
  			n = mlen >> 6;
! 			__asm __volatile(
! 			    "addic 0,0,0;"		/* clear carry */
! 			    "mtctr %1;"			/* load loop count */
! 			    "1:"
! 			    "lwz 7,4(%2);"		/* load current data
! 							   word */
! 			    "lwz 8,8(%2);"
! 			    "lwz 9,12(%2);"
! 			    "lwz 10,16(%2);"
! 			    "adde %0,%0,7;"		/* add to sum */
! 			    "adde %0,%0,8;"
! 			    "adde %0,%0,9;"
! 			    "adde %0,%0,10;"
! 			    "lwz 7,20(%2);"
! 			    "lwz 8,24(%2);"
! 			    "lwz 9,28(%2);"
! 			    "lwz 10,32(%2);"
! 			    "adde %0,%0,7;"
! 			    "adde %0,%0,8;"
! 			    "adde %0,%0,9;"
! 			    "adde %0,%0,10;"
! 			    "lwz 7,36(%2);"
! 			    "lwz 8,40(%2);"
! 			    "lwz 9,44(%2);"
! 			    "lwz 10,48(%2);"
! 			    "adde %0,%0,7;"
! 			    "adde %0,%0,8;"
! 			    "adde %0,%0,9;"
! 			    "adde %0,%0,10;"
! 			    "lwz 7,52(%2);"
! 			    "lwz 8,56(%2);"
! 			    "lwz 9,60(%2);"
! 			    "lwzu 10,64(%2);"
! 			    "adde %0,%0,7;"
! 			    "adde %0,%0,8;"
! 			    "adde %0,%0,9;"
! 			    "adde %0,%0,10;"
! 			    "bdnz 1b;"			/* loop */
! 			    "addze %0,%0;"		/* add carry bit */
! 			    : "+r"(sum)
! 			    : "r"(n), "b"(w - 4)
! 			    : "7", "8", "9", "10");	/* clobber r7, r8, r9,
! 							   r10 */
  			w += n * 64;
  			mlen -= n * 64;
  		}
  
  		if (mlen >= 8) {
  			n = mlen >> 3;
! 			__asm __volatile(
! 			    "addic 0,0,0;"		/* clear carry */
! 			    "mtctr %1;"			/* load loop count */
! 			    "1:"
! 			    "lwz 7,4(%2);"		/* load current data
! 							   word */
! 			    "lwzu 8,8(%2);"
! 			    "adde %0,%0,7;"		/* add to sum */
! 			    "adde %0,%0,8;"
! 			    "bdnz 1b;"			/* loop */
! 			    "addze %0,%0;"		/* add carry bit */
! 			    : "+r"(sum)
! 			    : "r"(n), "b"(w - 4)
! 			    : "7", "8");		/* clobber r7, r8 */
  			w += n * 8;
  			mlen -= n * 8;
  		}
***************
*** 229,234 ****
--- 236,242 ----
  int
  in_cksum(struct mbuf *m, int len)
  {
+ 
  	return (in_cksum_internal(m, 0, len, 0));
  }