Port-powerpc archive
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]
about powerpc version of in{,4}_cksum
There are some bugs in, and some possible improvements to, the current
powerpc version of in{,4}_cksum (which is found in
sys/arch/powerpc/powerpc/in_cksum.c). The highlights are:
1) addze 7,7 and addze %1,%1 are used to clear the carry bit, but
neither is correct. The former fails to clear it if the junk register
r7 happens to contain 0xffffffff, and the latter may clobber a non-junk
(i.e., necessary) register.
2) When adjusting to a 4-byte boundary, just adding a 16-bit value to
the variable `sum' isn't enough, since `sum' may already hold a full
32-bit value there, depending on how the packet is divided into mbufs.
So we need to take care of the carry bit. This actually prevented my Mac
(g4-500dp) from netbooting. (We could REDUCE instead, but that results
in a longer instruction sequence.)
3) In an asm statement, the constraint letter "b" (base register) should
be used instead of "r" for a pointer operand, since "r" allows r0, which is read as the constant 0 when used as a base register.
4) Load with update can be slower than an ordinary load (+ add),
depending on the implementation (this is noted in the PowerPC
programming manual). With the changes appended below, in4_cksum for a
particular packet (an NFS packet I used to debug 2) above) runs about
15% faster on my Mac. I'm not sure how this affects other PowerPC
variants, though.
Comments?
Many thanks to Tsubai for all the help.
enami.
Index: in_cksum.c
===================================================================
RCS file: /cvsroot/syssrc/sys/arch/powerpc/powerpc/in_cksum.c,v
retrieving revision 1.4
diff -c -r1.4 in_cksum.c
*** in_cksum.c 2002/07/29 09:14:37 1.4
--- in_cksum.c 2002/07/30 09:47:04
***************
*** 62,67 ****
--- 62,68 ----
uint8_t *w;
int mlen = 0;
int byte_swapped = 0;
+ int n;
union {
uint8_t c[2];
***************
*** 103,109 ****
* Force to a word boundary.
*/
if ((3 & (long) w) && (mlen > 0)) {
! if ((1 & (long) w) && (mlen > 0)) {
REDUCE;
sum <<= 8;
s_util.c[0] = *w++;
--- 104,110 ----
* Force to a word boundary.
*/
if ((3 & (long) w) && (mlen > 0)) {
! if ((1 & (long) w)) {
REDUCE;
sum <<= 8;
s_util.c[0] = *w++;
***************
*** 111,192 ****
byte_swapped = 1;
}
if ((2 & (long) w) && (mlen > 1)) {
! sum += *(uint16_t *)w;
w += 2;
mlen -= 2;
}
}
if (mlen >= 64) {
- register int n __asm("r0");
- uint8_t *tmpw;
-
n = mlen >> 6;
! tmpw = w - 4;
! asm volatile(
! "addze 7,7;" /* clear carry */
! "mtctr %1;" /* load loop count */
! "1:"
! "lwzu 7,4(%2);" /* load current data
word */
! "lwzu 8,4(%2);"
! "lwzu 9,4(%2);"
! "lwzu 10,4(%2);"
! "adde %0,%0,7;" /* add to sum */
! "adde %0,%0,8;"
! "adde %0,%0,9;"
! "adde %0,%0,10;"
! "lwzu 7,4(%2);"
! "lwzu 8,4(%2);"
! "lwzu 9,4(%2);"
! "lwzu 10,4(%2);"
! "adde %0,%0,7;"
! "adde %0,%0,8;"
! "adde %0,%0,9;"
! "adde %0,%0,10;"
! "lwzu 7,4(%2);"
! "lwzu 8,4(%2);"
! "lwzu 9,4(%2);"
! "lwzu 10,4(%2);"
! "adde %0,%0,7;"
! "adde %0,%0,8;"
! "adde %0,%0,9;"
! "adde %0,%0,10;"
! "lwzu 7,4(%2);"
! "lwzu 8,4(%2);"
! "lwzu 9,4(%2);"
! "lwzu 10,4(%2);"
! "adde %0,%0,7;"
! "adde %0,%0,8;"
! "adde %0,%0,9;"
! "adde %0,%0,10;"
! "bdnz 1b;" /* loop */
! "addze %0,%0;" /* add carry bit */
! : "+r"(sum)
! : "r"(n), "r"(tmpw)
! : "7", "8", "9", "10"); /* clobber r7, r8, r9,
r10 */
w += n * 64;
mlen -= n * 64;
}
if (mlen >= 8) {
- register int n __asm("r0");
- uint8_t *tmpw;
-
n = mlen >> 3;
! tmpw = w - 4;
! asm volatile(
! "addze %1,%1;" /* clear carry */
! "mtctr %1;" /* load loop count */
! "1:"
! "lwzu 7,4(%2);" /* load current data
word */
! "lwzu 8,4(%2);"
! "adde %0,%0,7;" /* add to sum */
! "adde %0,%0,8;"
! "bdnz 1b;" /* loop */
! "addze %0,%0;" /* add carry bit */
! : "+r"(sum)
! : "r"(n), "r"(tmpw)
! : "7", "8"); /* clobber r7, r8 */
w += n * 8;
mlen -= n * 8;
}
--- 112,199 ----
byte_swapped = 1;
}
if ((2 & (long) w) && (mlen > 1)) {
! /*
! * Since the `sum' may contain full 32 bit
! * value, we can't simply add any value.
! */
! __asm __volatile(
! "lhz 7,0(%1);" /* load current data
! half word */
! "addc %0,%0,7;" /* add to sum */
! "addze %0,%0;" /* add carry bit */
! : "+r"(sum)
! : "b"(w)
! : "7"); /* clobber r7 */
w += 2;
mlen -= 2;
}
}
if (mlen >= 64) {
n = mlen >> 6;
! __asm __volatile(
! "addic 0,0,0;" /* clear carry */
! "mtctr %1;" /* load loop count */
! "1:"
! "lwz 7,4(%2);" /* load current data
! word */
! "lwz 8,8(%2);"
! "lwz 9,12(%2);"
! "lwz 10,16(%2);"
! "adde %0,%0,7;" /* add to sum */
! "adde %0,%0,8;"
! "adde %0,%0,9;"
! "adde %0,%0,10;"
! "lwz 7,20(%2);"
! "lwz 8,24(%2);"
! "lwz 9,28(%2);"
! "lwz 10,32(%2);"
! "adde %0,%0,7;"
! "adde %0,%0,8;"
! "adde %0,%0,9;"
! "adde %0,%0,10;"
! "lwz 7,36(%2);"
! "lwz 8,40(%2);"
! "lwz 9,44(%2);"
! "lwz 10,48(%2);"
! "adde %0,%0,7;"
! "adde %0,%0,8;"
! "adde %0,%0,9;"
! "adde %0,%0,10;"
! "lwz 7,52(%2);"
! "lwz 8,56(%2);"
! "lwz 9,60(%2);"
! "lwzu 10,64(%2);"
! "adde %0,%0,7;"
! "adde %0,%0,8;"
! "adde %0,%0,9;"
! "adde %0,%0,10;"
! "bdnz 1b;" /* loop */
! "addze %0,%0;" /* add carry bit */
! : "+r"(sum)
! : "r"(n), "b"(w - 4)
! : "7", "8", "9", "10"); /* clobber r7, r8, r9,
! r10 */
w += n * 64;
mlen -= n * 64;
}
if (mlen >= 8) {
n = mlen >> 3;
! __asm __volatile(
! "addic 0,0,0;" /* clear carry */
! "mtctr %1;" /* load loop count */
! "1:"
! "lwz 7,4(%2);" /* load current data
! word */
! "lwzu 8,8(%2);"
! "adde %0,%0,7;" /* add to sum */
! "adde %0,%0,8;"
! "bdnz 1b;" /* loop */
! "addze %0,%0;" /* add carry bit */
! : "+r"(sum)
! : "r"(n), "b"(w - 4)
! : "7", "8"); /* clobber r7, r8 */
w += n * 8;
mlen -= n * 8;
}
***************
*** 229,234 ****
--- 236,242 ----
int
in_cksum(struct mbuf *m, int len)
{
+
return (in_cksum_internal(m, 0, len, 0));
}
Home |
Main Index |
Thread Index |
Old Index