Subject: copyin/out
To: port-arm@netbsd.org
From: Allen Briggs <briggs@wasabisystems.com>
List: port-arm
Date: 08/08/2002 23:41:57
Hi,
I've been working on a new copyin/copyout/kcopy that's significantly
better in some caching modes on the XScale and slightly better in
others.
My three main concerns are:
1) How does it perform on the other ARM architectures?
2) Is the code too large for the more limited of the arm32 archs?
3) Are there large, unaligned data copies going through the
   copyin/copyout path?
Basically, I've ditched the pte scan and I'm using ldr[b]t and str[b]t
to access user data. I've also unrolled some loops, and I've put in
code to prefetch with the 'pld' instruction on XScale (if we can
define something like __ARM_v5EDSP, we could key the prefetch on that).
This does allow us to garbage-collect cowfault(), too.
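For reference, the core idiom in the fast path is just a user-mode
load paired with a normal kernel store (copyout swaps which side
gets the 't' form):

    ldrt    r6, [r0], #4    /* load a word as if in user mode */
    str     r6, [r1], #4    /* plain store to the kernel buffer */

Since the 't' forms make the MMU check user permissions on the
access itself, there's no need to walk the ptes by hand first.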
(I've done some profiling with the new pmc(9) facilities)
Similar changes can be made to fusu.S, I believe--perhaps with more of
a gain there.
So, what do more experienced ARM-heads have to say about the attached
bcopyinout.S ?
With this, I'm seeing copyout run at about 63 MB/s on a simple test
(dd if=/dev/zero of=/dev/null count=1024 bs=1024k).
-allen
--
Allen Briggs briggs@wasabisystems.com
http://www.wasabisystems.com/ Quality NetBSD CDs, Sales, Support, Service
NetBSD development for Alpha, ARM, M68K, MIPS, PowerPC, SuperH, XScale, etc...
[attachment: bcopyinout.S]
/* $NetBSD: bcopyinout.S,v 1.5 2002/03/23 02:22:57 thorpej Exp $ */
/*
* Copyright (c) 2002 Wasabi Systems, Inc.
* All rights reserved.
*
* Written by Allen Briggs for Wasabi Systems, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed for the NetBSD Project by
* Wasabi Systems, Inc.
* 4. The name of Wasabi Systems, Inc. may not be used to endorse
* or promote products derived from this software without specific prior
* written permission.
*
* THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "assym.h"
#include <machine/asm.h>
#include <sys/errno.h>
.text
.align 0
Lcurpcb:
.word _C_LABEL(curpcb)
#define SAVE_REGS stmfd sp!, {r4-r11}
#define RESTORE_REGS ldmfd sp!, {r4-r11}
#if defined(__XSCALE__)
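/*
 * cpp provides no clean way to emit a literal '#' from a macro
 * body, so hide one behind a macro of its own; PREFETCH() then
 * expands to "pld [rx, #(o)]".
 */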
#define HELLOCPP #
#define PREFETCH(rx,o) pld [ rx , HELLOCPP (o) ]
#else
#define PREFETCH(rx,o)
#endif
/*
* r0 = user space address
* r1 = kernel space address
* r2 = length
*
* Copies bytes from user space to kernel space
*
* We save/restore r4-r11:
* r4-r11 are scratch
*/
ENTRY(copyin)
/* Quick exit if length is zero */
teq r2, #0
moveq r0, #0
moveq pc, lr
SAVE_REGS
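/*
 * Install Lcopyfault in curpcb->pcb_onfault so a fault on a user
 * access aborts the copy; the old handler is kept in r5 and
 * restored on the way out.
 */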
ldr r4, Lcurpcb
ldr r4, [r4]
ldr r5, [r4, #PCB_ONFAULT]
add r3, pc, #Lcopyfault - . - 8
str r3, [r4, #PCB_ONFAULT]
PREFETCH(r0, 0)
PREFETCH(r1, 0)
/*
* If not too many bytes, take the slow path.
*/
cmp r2, #0x08
blt Licleanup
/*
* Align destination to word boundary.
*/
and r6, r1, #0x3
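/* pc reads as '. + 8', i.e. the first .word of the table below */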
ldr pc, [pc, r6, lsl #2]
b Lialend
.word Lialend
.word Lial3
.word Lial2
.word Lial1
Lial3: ldrbt r6, [r0], #1
sub r2, r2, #1
strb r6, [r1], #1
Lial2: ldrbt r7, [r0], #1
sub r2, r2, #1
strb r7, [r1], #1
Lial1: ldrbt r6, [r0], #1
sub r2, r2, #1
strb r6, [r1], #1
Lialend:
/*
* If few bytes left, finish slow.
*/
cmp r2, #0x08
blt Licleanup
/*
* If source is not aligned, finish slow.
*/
ands r3, r0, #0x03
bne Licleanup
cmp r2, #0x60 /* Must be > 0x5f for unrolled cacheline */
blt Licleanup8
/*
* Align destination to cacheline boundary.
* If source and destination are nicely aligned, this can be a big
* win. If not, it's still cheaper to copy in groups of 32 even if
* we don't get the nice cacheline alignment.
*/
and r6, r1, #0x1f
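/* r1 is word-aligned here, so r6 is a multiple of 4 and indexes the word table directly */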
ldr pc, [pc, r6]
b Licaligned
.word Licaligned
.word Lical28
.word Lical24
.word Lical20
.word Lical16
.word Lical12
.word Lical8
.word Lical4
Lical28:ldrt r6, [r0], #4
sub r2, r2, #4
str r6, [r1], #4
Lical24:ldrt r7, [r0], #4
sub r2, r2, #4
str r7, [r1], #4
Lical20:ldrt r6, [r0], #4
sub r2, r2, #4
str r6, [r1], #4
Lical16:ldrt r7, [r0], #4
sub r2, r2, #4
str r7, [r1], #4
Lical12:ldrt r6, [r0], #4
sub r2, r2, #4
str r6, [r1], #4
Lical8: ldrt r7, [r0], #4
sub r2, r2, #4
str r7, [r1], #4
Lical4: ldrt r6, [r0], #4
sub r2, r2, #4
str r6, [r1], #4
/*
* We start with > 0x40 bytes to copy (>= 0x60 got us into this
* part of the code, and we may have knocked that down by as much
* as 0x1c getting aligned).
*
* This loop basically works out to:
* do {
* prefetch-next-cacheline(s)
* bytes -= 0x20;
* copy cacheline
* } while (bytes >= 0x40);
* bytes -= 0x20;
* copy cacheline
*/
Licaligned:
PREFETCH(r0, 32)
PREFETCH(r1, 32)
sub r2, r2, #0x20
/* Copy a cacheline */
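/* (8 words via r6-r11, loads and stores interleaved) */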
ldrt r6, [r0], #4
ldrt r7, [r0], #4
ldrt r8, [r0], #4
ldrt r9, [r0], #4
ldrt r10, [r0], #4
ldrt r11, [r0], #4
str r6, [r1], #4
str r7, [r1], #4
ldrt r6, [r0], #4
ldrt r7, [r0], #4
str r8, [r1], #4
str r9, [r1], #4
str r10, [r1], #4
str r11, [r1], #4
str r6, [r1], #4
str r7, [r1], #4
cmp r2, #0x40
bge Licaligned
sub r2, r2, #0x20
/* Copy a cacheline */
ldrt r6, [r0], #4
ldrt r7, [r0], #4
ldrt r8, [r0], #4
ldrt r9, [r0], #4
ldrt r10, [r0], #4
ldrt r11, [r0], #4
str r6, [r1], #4
str r7, [r1], #4
ldrt r6, [r0], #4
ldrt r7, [r0], #4
str r8, [r1], #4
str r9, [r1], #4
str r10, [r1], #4
str r11, [r1], #4
str r6, [r1], #4
str r7, [r1], #4
cmp r2, #0x08
blt Liprecleanup
Licleanup8:
ldrt r8, [r0], #4
ldrt r9, [r0], #4
sub r2, r2, #8
str r8, [r1], #4
str r9, [r1], #4
cmp r2, #8
bge Licleanup8
Liprecleanup:
/*
* If we're done, bail.
*/
cmp r2, #0
beq Liout
Licleanup:
and r6, r2, #0x3
ldr pc, [pc, r6, lsl #2]
b Licend
.word Lic4
.word Lic1
.word Lic2
.word Lic3
Lic4: ldrbt r6, [r0], #1
sub r2, r2, #1
strb r6, [r1], #1
Lic3: ldrbt r7, [r0], #1
sub r2, r2, #1
strb r7, [r1], #1
Lic2: ldrbt r6, [r0], #1
sub r2, r2, #1
strb r6, [r1], #1
Lic1: ldrbt r7, [r0], #1
subs r2, r2, #1
strb r7, [r1], #1
Licend:
bne Licleanup
Liout:
mov r0, #0
str r5, [r4, #PCB_ONFAULT]
RESTORE_REGS
mov pc, lr
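/*
 * Fault handler, reached via pcb_onfault when a user access above
 * faults; shared by copyin, copyout, and kcopy.
 */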
Lcopyfault:
mov r0, #EFAULT
str r5, [r4, #PCB_ONFAULT]
RESTORE_REGS
mov pc, lr
/*
* r0 = kernel space address
* r1 = user space address
* r2 = length
*
* Copies bytes from kernel space to user space
*
* We save/restore r4-r11:
* r4-r11 are scratch
*/
ENTRY(copyout)
/* Quick exit if length is zero */
teq r2, #0
moveq r0, #0
moveq pc, lr
SAVE_REGS
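/* Same on-fault setup as copyin; the 't' accesses move to the store side here. */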
ldr r4, Lcurpcb
ldr r4, [r4]
ldr r5, [r4, #PCB_ONFAULT]
add r3, pc, #Lcopyfault - . - 8
str r3, [r4, #PCB_ONFAULT]
PREFETCH(r0, 0)
PREFETCH(r1, 0)
/*
* If not too many bytes, take the slow path.
*/
cmp r2, #0x08
blt Lcleanup
/*
* Align destination to word boundary.
*/
and r6, r1, #0x3
ldr pc, [pc, r6, lsl #2]
b Lalend
.word Lalend
.word Lal3
.word Lal2
.word Lal1
Lal3: ldrb r6, [r0], #1
sub r2, r2, #1
strbt r6, [r1], #1
Lal2: ldrb r7, [r0], #1
sub r2, r2, #1
strbt r7, [r1], #1
Lal1: ldrb r6, [r0], #1
sub r2, r2, #1
strbt r6, [r1], #1
Lalend:
/*
* If few bytes left, finish slow.
*/
cmp r2, #0x08
blt Lcleanup
/*
* If source is not aligned, finish slow.
*/
ands r3, r0, #0x03
bne Lcleanup
cmp r2, #0x60 /* Must be > 0x5f for unrolled cacheline */
blt Lcleanup8
/*
* Align destination to cacheline boundary.
*/
and r6, r1, #0x1f
ldr pc, [pc, r6]
b Lcaligned
.word Lcaligned
.word Lcal28
.word Lcal24
.word Lcal20
.word Lcal16
.word Lcal12
.word Lcal8
.word Lcal4
Lcal28: ldr r6, [r0], #4
sub r2, r2, #4
strt r6, [r1], #4
Lcal24: ldr r7, [r0], #4
sub r2, r2, #4
strt r7, [r1], #4
Lcal20: ldr r6, [r0], #4
sub r2, r2, #4
strt r6, [r1], #4
Lcal16: ldr r7, [r0], #4
sub r2, r2, #4
strt r7, [r1], #4
Lcal12: ldr r6, [r0], #4
sub r2, r2, #4
strt r6, [r1], #4
Lcal8: ldr r7, [r0], #4
sub r2, r2, #4
strt r7, [r1], #4
Lcal4: ldr r6, [r0], #4
sub r2, r2, #4
strt r6, [r1], #4
/*
* We start with > 0x40 bytes to copy (>= 0x60 got us into this
* part of the code, and we may have knocked that down by as much
* as 0x1c getting aligned).
*
* This loop basically works out to:
* do {
* prefetch-next-cacheline(s)
* bytes -= 0x20;
* copy cacheline
* } while (bytes >= 0x40);
* bytes -= 0x20;
* copy cacheline
*/
Lcaligned:
PREFETCH(r0, 32)
PREFETCH(r1, 32)
sub r2, r2, #0x20
/* Copy a cacheline */
ldr r6, [r0], #4
ldr r7, [r0], #4
ldr r8, [r0], #4
ldr r9, [r0], #4
ldr r10, [r0], #4
ldr r11, [r0], #4
strt r6, [r1], #4
strt r7, [r1], #4
ldr r6, [r0], #4
ldr r7, [r0], #4
strt r8, [r1], #4
strt r9, [r1], #4
strt r10, [r1], #4
strt r11, [r1], #4
strt r6, [r1], #4
strt r7, [r1], #4
cmp r2, #0x40
bge Lcaligned
sub r2, r2, #0x20
/* Copy a cacheline */
ldr r6, [r0], #4
ldr r7, [r0], #4
ldr r8, [r0], #4
ldr r9, [r0], #4
ldr r10, [r0], #4
ldr r11, [r0], #4
strt r6, [r1], #4
strt r7, [r1], #4
ldr r6, [r0], #4
ldr r7, [r0], #4
strt r8, [r1], #4
strt r9, [r1], #4
strt r10, [r1], #4
strt r11, [r1], #4
strt r6, [r1], #4
strt r7, [r1], #4
cmp r2, #0x08
blt Lprecleanup
Lcleanup8:
ldr r8, [r0], #4
ldr r9, [r0], #4
sub r2, r2, #8
strt r8, [r1], #4
strt r9, [r1], #4
cmp r2, #8
bge Lcleanup8
Lprecleanup:
/*
* If we're done, bail.
*/
cmp r2, #0
beq Lout
Lcleanup:
and r6, r2, #0x3
ldr pc, [pc, r6, lsl #2]
b Lcend
.word Lc4
.word Lc1
.word Lc2
.word Lc3
Lc4: ldrb r6, [r0], #1
sub r2, r2, #1
strbt r6, [r1], #1
Lc3: ldrb r7, [r0], #1
sub r2, r2, #1
strbt r7, [r1], #1
Lc2: ldrb r6, [r0], #1
sub r2, r2, #1
strbt r6, [r1], #1
Lc1: ldrb r7, [r0], #1
subs r2, r2, #1
strbt r7, [r1], #1
Lcend:
bne Lcleanup
Lout:
mov r0, #0
str r5, [r4, #PCB_ONFAULT]
RESTORE_REGS
mov pc, lr
/*
* r0 = kernel space source address
* r1 = kernel space destination address
* r2 = length
*
* Copies bytes from kernel space to kernel space, aborting on page fault
*
* Copy of copyout, but without the ldrt/strt instructions.
*/
ENTRY(kcopy)
/* Quick exit if length is zero */
teq r2, #0
moveq r0, #0
moveq pc, lr
SAVE_REGS
ldr r4, Lcurpcb
ldr r4, [r4]
ldr r5, [r4, #PCB_ONFAULT]
add r3, pc, #Lcopyfault - . - 8
str r3, [r4, #PCB_ONFAULT]
PREFETCH(r0, 0)
PREFETCH(r1, 0)
/*
* If not too many bytes, take the slow path.
*/
cmp r2, #0x08
blt Lkcleanup
/*
* Align destination to word boundary.
*/
and r6, r1, #0x3
ldr pc, [pc, r6, lsl #2]
b Lkalend
.word Lkalend
.word Lkal3
.word Lkal2
.word Lkal1
Lkal3: ldrb r6, [r0], #1
sub r2, r2, #1
strb r6, [r1], #1
Lkal2: ldrb r7, [r0], #1
sub r2, r2, #1
strb r7, [r1], #1
Lkal1: ldrb r6, [r0], #1
sub r2, r2, #1
strb r6, [r1], #1
Lkalend:
/*
* If few bytes left, finish slow.
*/
cmp r2, #0x08
blt Lkcleanup
/*
* If source is not aligned, finish slow.
*/
ands r3, r0, #0x03
bne Lkcleanup
cmp r2, #0x60 /* Must be > 0x5f for unrolled cacheline */
blt Lkcleanup8
/*
* Align destination to cacheline boundary.
*/
and r6, r1, #0x1f
ldr pc, [pc, r6]
b Lkcaligned
.word Lkcaligned
.word Lkcal28
.word Lkcal24
.word Lkcal20
.word Lkcal16
.word Lkcal12
.word Lkcal8
.word Lkcal4
Lkcal28:ldr r6, [r0], #4
sub r2, r2, #4
str r6, [r1], #4
Lkcal24:ldr r7, [r0], #4
sub r2, r2, #4
str r7, [r1], #4
Lkcal20:ldr r6, [r0], #4
sub r2, r2, #4
str r6, [r1], #4
Lkcal16:ldr r7, [r0], #4
sub r2, r2, #4
str r7, [r1], #4
Lkcal12:ldr r6, [r0], #4
sub r2, r2, #4
str r6, [r1], #4
Lkcal8: ldr r7, [r0], #4
sub r2, r2, #4
str r7, [r1], #4
Lkcal4: ldr r6, [r0], #4
sub r2, r2, #4
str r6, [r1], #4
/*
* We start with > 0x40 bytes to copy (>= 0x60 got us into this
* part of the code, and we may have knocked that down by as much
* as 0x1c getting aligned).
*
* This loop basically works out to:
* do {
* prefetch-next-cacheline(s)
* bytes -= 0x20;
* copy cacheline
* } while (bytes >= 0x40);
* bytes -= 0x20;
* copy cacheline
*/
Lkcaligned:
PREFETCH(r0, 32)
PREFETCH(r1, 32)
sub r2, r2, #0x20
/* Copy a cacheline */
ldr r6, [r0], #4
ldr r7, [r0], #4
ldr r8, [r0], #4
ldr r9, [r0], #4
ldr r10, [r0], #4
ldr r11, [r0], #4
str r6, [r1], #4
str r7, [r1], #4
ldr r6, [r0], #4
ldr r7, [r0], #4
str r8, [r1], #4
str r9, [r1], #4
str r10, [r1], #4
str r11, [r1], #4
str r6, [r1], #4
str r7, [r1], #4
cmp r2, #0x40
bge Lkcaligned
sub r2, r2, #0x20
/* Copy a cacheline */
ldr r6, [r0], #4
ldr r7, [r0], #4
ldr r8, [r0], #4
ldr r9, [r0], #4
ldr r10, [r0], #4
ldr r11, [r0], #4
str r6, [r1], #4
str r7, [r1], #4
ldr r6, [r0], #4
ldr r7, [r0], #4
str r8, [r1], #4
str r9, [r1], #4
str r10, [r1], #4
str r11, [r1], #4
str r6, [r1], #4
str r7, [r1], #4
cmp r2, #0x08
blt Lkprecleanup
Lkcleanup8:
ldr r8, [r0], #4
ldr r9, [r0], #4
sub r2, r2, #8
str r8, [r1], #4
str r9, [r1], #4
cmp r2, #8
bge Lkcleanup8
Lkprecleanup:
/*
* If we're done, bail.
*/
cmp r2, #0
beq Lkout
Lkcleanup:
and r6, r2, #0x3
ldr pc, [pc, r6, lsl #2]
b Lkcend
.word Lkc4
.word Lkc1
.word Lkc2
.word Lkc3
Lkc4: ldrb r6, [r0], #1
sub r2, r2, #1
strb r6, [r1], #1
Lkc3: ldrb r7, [r0], #1
sub r2, r2, #1
strb r7, [r1], #1
Lkc2: ldrb r6, [r0], #1
sub r2, r2, #1
strb r6, [r1], #1
Lkc1: ldrb r7, [r0], #1
subs r2, r2, #1
strb r7, [r1], #1
Lkcend:
bne Lkcleanup
Lkout:
mov r0, #0
str r5, [r4, #PCB_ONFAULT]
RESTORE_REGS
mov pc, lr