Subject: Fast TLB MISS handler
To: None <port-sh3@NetBSD.org>
From: Valeriy E. Ushakov <uwe@ptc.spbu.ru>
List: port-sh3
Date: 03/22/2007 06:32:14
--envbJBWh7q8WU6mo
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline

I've finally had some time for sh3 hacking and decided to look into
implementing fast TLB MISS handling.

The results are mixed.  Attached patch works for me on sh3 for user
space misses only (kernel space misses are handled the old way, see
below).

Good news is that with that patch make build time in pkgtools/digest
on my 133MHz Jornada (nfs root) goes down from 11:00 to 9:30 user time
- some 13% improvement.

The bad news is that if I make kernel misses go through the fast path
too, kernel gets stuck trying to execute init.  By that time we are
already handling kernel misses from kernel stack and malloced data in
P3.  What it gets stuck at is:

. we call sys_execve -> execve1 -> check_exec
. vn_rdwr(UIO_READ, ... UIO_SYSSPACE, ...) -> nfs_bioread
. win = ubc_alloc(...)
. uiomove(win, ...)
. copyout_vmspace
. kcopy(src, dst, len), src pointing to what we've got from ubc_alloc

That's where it gets stuck.

I've tweaked miss handler to blink a led, and I can see the led
flicker very fast (you can only tell b/c its brightness is reduced
vs. plain "on" brightness).  The machine is pingable, but I cannot
break into ddb.

I figure that fast path should just load 0 into tlb, and on
instruction re-exec we should trap again but this time to the
tlb_exception handler with the same miss where pcb_onfault should take
care of things.

I bet I'm missing something very obvious here.  Ideas are welcome.

PS: I plan to tackle sh4 version later, when I figure out this sh3
issue, but if you have a sh4 and feel like giving it a try - you are
most welcome.

TIA

SY, Uwe
-- 
uwe@ptc.spbu.ru                         |       Zu Grunde kommen
http://snark.ptc.spbu.ru/~uwe/          |       Ist zu Grunde gehen

--envbJBWh7q8WU6mo
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename="fast_tlb_miss.diff"

Index: exception_vector.S
===================================================================
RCS file: /cvsroot/src/sys/arch/sh3/sh3/exception_vector.S,v
retrieving revision 1.28
diff -u -r1.28 exception_vector.S
--- exception_vector.S	18 Mar 2007 20:18:36 -0000	1.28
+++ exception_vector.S	22 Mar 2007 03:28:53 -0000
@@ -146,11 +146,100 @@
 /*
  * LINTSTUB: Var: char sh3_vector_tlbmiss[1];
  *
- * void sh3_vector_tlbmiss(void);
- *	Copied to VBR+0x400.  This code should be position independent
- *	and maximum 512 bytes long (== 0x600 - 0x400).
+ * Fast TLB MISS vector.  We do everything with exceptions disabled,
+ * so no P3 addresses please (including no stack).  As we don't save
+ * processor state we can only use BANK1 registers, and of those
+ * r6 and r7 are already taken.
+ *
+ * Copied to VBR+0x400.  This code should be position independent
+ * and maximum 512 bytes long (== 0x600 - 0x400).
  */
 NENTRY(sh3_vector_tlbmiss)
+	mov	#(SH3_PTEH & 0xff), r4
+	mov.l	@r4, r5
+
+	cmp/pz	r5		! user space address?
+	bt/s	.L3_user_va
+	 mov	r5, r2		! copy of VPN to compute index into ptd
+
+	!! XXX:	for now redirect kernel faults to the old vector
+	bra	_C_LABEL(sh3_vector_tlbmiss_old)
+	 nop
+
+	!! kernel space address, use pmap_kernel(), adjust VPN for indexing
+.L3_kernel_va:
+	mov.l	.L3_VM_MIN_KERNEL_ADDRESS, r0
+	mov.l	.L3_kernptd,  r1
+	bra	.L3_fetch_pte
+	 sub	r0, r2		! va -= VM_MIN_KERNEL_ADDRESS
+
+	!! user space address, use curlwp's pmap
+.L3_user_va:
+	mov.l	.L3_curptd,  r1
+
+	!! r1: points to ptd
+	!! r2: va, prepared for indexing into ptd
+	!! r3: pt_entry_t **ptd;
+.L3_fetch_pte:
+	mov.l	@r1, r3		! fetch ptd
+
+	!! __PMAP_PTP_INDEX(va)
+	mov	#-22, r1	! __PMAP_PTP_SHIFT
+	mov	r2, r0
+	shld	r1, r0		! va >> __PMAP_PTP_SHIFT
+	mov.l	.L3_ptp_index_mask, r1
+	and	r1, r0		! & (__PMAP_PTP_N - 1)
+	shll2	r0		! array index -> array offset
+	mov.l	@(r0, r3), r0	! ptp = ptd[idx]
+	tst	r0, r0		! if (ptp == NULL)
+	bt/s	.L3_load	!     pte = 0
+	 mov	#-12, r1	! PGSHIFT
+
+	!! __PMAP_PTP_OFSET(va)
+	shld	r1, r2		! va >> PGSHIFT
+	mov.l	.L3_ptp_offset_mask, r1
+	and	r1, r2		! & (__PMAP_PTP_PG_N - 1)
+	shll2	r2		! array index -> array offset
+	mov.l	@(r0, r2), r0	! pte = ptp[idx]
+
+	!! r0: pte or 0 if no mapping
+	!! r4: SH3_PTEH (still)
+	!! r5: *SH3_PTEH (still)
+.L3_load:
+	cmp/pz	r5		! user space address?
+	bf/s	.L3_load_kernel
+	 mov.l	r0, @(4, r4)	! *SH3_PTEL = pte
+
+	!! load mapping for a user space page
+	!! SH3_PTEH contains correct { VPN, ASID }
+.L3_load_user:
+	ldtlb
+	nop
+	nop
+	rte
+	 nop
+
+	!! load mapping for a kernel space page
+	!! we need to temporary set ASID to 0
+.L3_load_kernel:
+	mov.l	.L3_not_SH3_PTEH_ASID_MASK, r1
+	and	r5, r1		! & ~SH3_PTEH_ASID_MASK
+	mov.l	r1, @r4		! *SH3_PTEH = { VPN, ASID = 0 }
+	ldtlb
+	mov.l	r5, @r4		! restore ASID
+	nop
+	rte
+	 nop
+
+	.align 2
+.L3_curptd:			.long	_C_LABEL(curptd)
+.L3_kernptd:			.long	_C_LABEL(kernptd)
+.L3_ptp_index_mask:		.long	0x1ff
+.L3_ptp_offset_mask:		.long	0x3ff
+.L3_VM_MIN_KERNEL_ADDRESS:	.long	VM_MIN_KERNEL_ADDRESS
+.L3_not_SH3_PTEH_ASID_MASK:	.long	~SH3_PTEH_ASID_MASK
+
+NENTRY(sh3_vector_tlbmiss_old)
 	__EXCEPTION_ENTRY
 	mov	#(SH3_TEA & 0xff), r0
 	mov.l	@r0, r6		! 3rd arg: va = TEA
Index: genassym.cf
===================================================================
RCS file: /cvsroot/src/sys/arch/sh3/sh3/genassym.cf,v
retrieving revision 1.10
diff -u -r1.10 genassym.cf
--- genassym.cf	11 Dec 2005 12:19:00 -0000	1.10
+++ genassym.cf	22 Mar 2007 03:28:53 -0000
@@ -101,6 +101,7 @@
 define	UVMEXP_INTRS		offsetof(struct uvmexp, intrs)
 
 define	VM_MAXUSER_ADDRESS	VM_MAXUSER_ADDRESS
+define	VM_MIN_KERNEL_ADDRESS	VM_MIN_KERNEL_ADDRESS
 
 define	EFAULT			EFAULT
 define	ENAMETOOLONG		ENAMETOOLONG
Index: pmap.c
===================================================================
RCS file: /cvsroot/src/sys/arch/sh3/sh3/pmap.c,v
retrieving revision 1.62
diff -u -r1.62 pmap.c
--- pmap.c	12 Mar 2007 18:18:26 -0000	1.62
+++ pmap.c	22 Mar 2007 03:28:54 -0000
@@ -68,6 +68,10 @@
 paddr_t avail_start;		/* PA of first available physical page */
 paddr_t avail_end;		/* PA of last available physical page */
 
+/* For the fast tlb miss handler */
+pt_entry_t **kernptd;		/* p1 va of pmap_kernel()->pm_ptp */
+pt_entry_t **curptd;		/* p1 va of curlwp->...->pm_ptp */
+
 /* pmap pool */
 STATIC struct pool __pmap_pmap_pool;
 
@@ -118,6 +122,9 @@
 	pmap_kernel()->pm_ptp = (pt_entry_t **)uvm_pageboot_alloc(PAGE_SIZE);
 	memset(pmap_kernel()->pm_ptp, 0, PAGE_SIZE);
 
+	/* for tlb miss handler */
+	kernptd = pmap_kernel()->pm_ptp;
+
 	/* Enable MMU */
 	sh_mmu_start();
 	/* Mask all interrupt */
@@ -314,6 +321,7 @@
 
 	KDASSERT(pmap->pm_asid >=0 && pmap->pm_asid < 256);
 	sh_tlb_set_asid(pmap->pm_asid);
+	curptd = pmap->pm_ptp;
 }
 
 void

--envbJBWh7q8WU6mo--