tech-kern archive


Re: Time accounting on statclock()



matthew green <mrg%eterna.com.au@localhost> writes:

>> For example: if a clock interrupt that came in from userland got
>> deferred as pending (is this possible?) because the current spl
>> level was at, say, SPL_HIGH, it now seems to be the case that the
>> system accounts for the delayed execution by charging the *entire*
>> interval - from the last hardclock(), including whatever was
>> executing at SPL_HIGH, until the tick was actually serviced - to the
>> system instead of to the user process that was originally
>> interrupted.
>> 
>> To emphasise my point, I've added a patch below that I think should
>> reflect the accounting correctly.
>> 
>> I'm not sure I've understood this correctly, so I'd appreciate it if
>> someone who has a good understanding of this would be able to comment.
>> 
>> Many Thanks,
>> 
>> Cherry
>> 
>> 
>> 
>> --- kern_clock.c.~1.138.~       2018-09-21 11:28:02.792675611 +0000
>> +++ kern_clock.c        2018-10-16 12:06:38.753987323 +0000
>> @@ -352,6 +352,10 @@
>>         }
>> 
>>         if (CLKF_USERMODE(frame)) {
>> +               if (p == NULL) { /* Account deferred clock ticks to current user. */
>> +                       p = l->l_proc;
>> +                       mutex_spin_enter(&p->p_stmutex);
>> +               }
>>                 KASSERT(p != NULL);
>>                 if ((p->p_stflag & PST_PROFIL) && profsrc == PROFSRC_CLOCK)
>>                         addupc_intr(l, CLKF_PC(frame));
>
> i don't believe this can happen.  if it does, then something
> else is horribly wrong and should be fixed instead.
>
> eg, if i'm in usermode, then i can't be the idle thread, so
> p should always be valid.  (usermode can also never be at any
> IPL besides 0; anything else is an outright bug.)
>
> what problem are you actually trying to solve?
>

I'm attaching a diff that reworks the Xen interrupt handling to work
within the native x86/intr.c framework. I finally got it today to the
point where it doesn't miss interrupts (it survives pretty harsh
workloads - obviously it needs more testing). The goal here is to
provide a unified entry point for PV, PVHVM and PVH code, so we don't
have to worry about bitrotting assembler when things move around. When
I'm done, we'll be the only OS that supports all 3 modes.

To answer your question - I'm abusing the spl(9) framework to schedule
and queue pending events using the vector.S code paths. This means that
any pending interrupts on the Xen side are processed within spl(9)'s
iteration of interrupt handlers for a given IPL. The problem here is
that the clock interrupt handler examines the stack frame to determine
where it was interrupted from - and uses this information for time
accounting.
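
To make that concrete, here's a rough sketch of the queueing idea. The
names are the ones used in the patch (see xen_queue_event() and
xen_despatch_events() in hypervisor_machdep.c further down), trimmed to
just the mechanism:

	/*
	 * Instead of calling the handler directly from the event
	 * callback, mark the interrupt source's slot pending on the
	 * local CPU; the spl(9) machinery (Xspllower/Xdoreti) then
	 * runs the handler once the IPL drops far enough.
	 */
	static void
	xen_queue_event(struct cpu_info *ci, unsigned long port)
	{
		int slot = evtsource[port].slot;

		atomic_or_32(&ci->ci_ipending, 1U << slot);
		ci->ci_isources[slot]->is_evcnt.ev_count++;
	}

	static void
	xen_despatch_events(struct cpu_info *ci)
	{
		int i, spl = splraise(IPL_HYPERVISOR);

		/* walk every IPL back down to the interrupted level */
		for (i = IPL_HYPERVISOR; i >= spl; i--)
			spllower(i);
	}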

On x86, the clock handler is the only one that uses the USERMODE(tf)
macro in this way, and I'm sure there's a better way to do this. How do
non-x86 ports do this?
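
For context, the decision in question looks roughly like this (a
simplified paraphrase of statclock() in sys/kern/kern_clock.c, not the
exact code):

	if (CLKF_USERMODE(frame)) {
		/* charge the tick to the interrupted user process */
		if ((p->p_stflag & PST_PROFIL) && profsrc == PROFSRC_CLOCK)
			addupc_intr(l, CLKF_PC(frame));
		p->p_uticks++;
		...
	} else {
		/*
		 * kernel mode: charge it as interrupt time if we were
		 * interrupted out of an interrupt handler, otherwise
		 * as system time
		 */
		if (CLKF_INTR(frame))
			p->p_iticks++;
		else
			p->p_sticks++;
		...
	}

Once the tick is delivered through spllower() instead of straight off
the trap frame, CLKF_USERMODE(frame) no longer reflects where the tick
originally landed, which is the accounting problem I'm running into.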

-- 
~~cherry

diff -r 0d3b02e26e2b sys/arch/amd64/amd64/locore.S
--- a/sys/arch/amd64/amd64/locore.S	Fri Oct 26 05:35:00 2018 +0000
+++ b/sys/arch/amd64/amd64/locore.S	Tue Nov 06 16:14:38 2018 +0000
@@ -1567,9 +1567,9 @@
 
 	_ALIGN_TEXT
 LABEL(intrfastexit)
-	NOT_XEN(cli;)
-	SVS_LEAVE
-	IBRS_LEAVE
+	CLI(si)
+	/*SVS_LEAVE*/
+	/*IBRS_LEAVE*/
 	INTR_RESTORE_GPRS
 	addq	$(TF_REGSIZE+16),%rsp	/* iret frame */
 
diff -r 0d3b02e26e2b sys/arch/amd64/amd64/machdep.c
--- a/sys/arch/amd64/amd64/machdep.c	Fri Oct 26 05:35:00 2018 +0000
+++ b/sys/arch/amd64/amd64/machdep.c	Tue Nov 06 16:14:38 2018 +0000
@@ -1911,11 +1911,7 @@
 
 	init_x86_64_ksyms();
 
-#ifndef XEN
 	intr_default_setup();
-#else
-	events_default_setup();
-#endif
 
 	splraise(IPL_HIGH);
 	x86_enable_intr();
diff -r 0d3b02e26e2b sys/arch/amd64/amd64/spl.S
--- a/sys/arch/amd64/amd64/spl.S	Fri Oct 26 05:35:00 2018 +0000
+++ b/sys/arch/amd64/amd64/spl.S	Tue Nov 06 16:14:38 2018 +0000
@@ -140,7 +140,8 @@
 	addq	$(5 * 8),%rsp
 	jmp	*%r13			/* back to Xspllower/Xdoreti */
 IDTVEC_END(softintr)
-
+#endif /* !XEN */
+	
 /*
  * softintr_ret()
  *
@@ -156,6 +157,7 @@
 	jmp	*%r13			/* back to Xspllower/Xdoreti */
 END(softintr_ret)
 
+#ifndef XEN
 /*
  * void softint_trigger(uintptr_t machdep);
  *
@@ -165,7 +167,7 @@
 	orl	%edi,CPUVAR(IPENDING)	/* atomic on local cpu */
 	ret
 END(softint_trigger)
-
+#endif
 
 /*
  * Xrecurse_preempt()
@@ -174,10 +176,10 @@
  */
 IDTVEC(recurse_preempt)
 	movl	$IPL_PREEMPT,CPUVAR(ILEVEL)
-	sti
+	STI(si)
 	xorq	%rdi,%rdi
 	call	_C_LABEL(kpreempt)
-	cli
+	CLI(si)
 	jmp	*%r13			/* back to Xspllower */
 IDTVEC_END(recurse_preempt)
 
@@ -188,16 +190,16 @@
  */
 IDTVEC(resume_preempt)
 	movl	$IPL_PREEMPT,CPUVAR(ILEVEL)
-	sti
+	STI(di)
 	testq	$SEL_RPL,TF_CS(%rsp)
 	jnz	1f
 	movq	TF_RIP(%rsp),%rdi
 	call	_C_LABEL(kpreempt)	/* from kernel */
-	cli
+	CLI(si)
 	jmp	*%r13			/* back to Xdoreti */
 1:
 	call	_C_LABEL(preempt)	/* from user */
-	cli
+	CLI(si)
 	jmp	*%r13			/* back to Xdoreti */
 IDTVEC_END(resume_preempt)
 
@@ -223,24 +225,24 @@
 	cmpl	CPUVAR(ILEVEL),%edi
 	jae	1f
 	movl	CPUVAR(IUNMASK)(,%rdi,4),%edx
-	pushf
-	cli
+	PUSHF(si)
+	CLI(si)
 	testl	CPUVAR(IPENDING),%edx
 	jnz	2f
 	movl	%edi,CPUVAR(ILEVEL)
-	popf
+	POPF(si)
 1:
 	ret
 	ret
 2:
-	popf
+	POPF(si)
 	jmp	_C_LABEL(Xspllower)
 3:
 	.space 16
 	.align	16
 END(spllower)
 LABEL(spllower_end)
-#endif /* !XEN */
+
 
 /*
  * void	cx8_spllower(int s);
diff -r 0d3b02e26e2b sys/arch/amd64/amd64/vector.S
--- a/sys/arch/amd64/amd64/vector.S	Fri Oct 26 05:35:00 2018 +0000
+++ b/sys/arch/amd64/amd64/vector.S	Tue Nov 06 16:14:38 2018 +0000
@@ -302,8 +302,6 @@
 
 #define voidop(num)
 
-#ifndef XEN
-
 /*
  * This macro defines the generic stub code. Its arguments modify it
  * for specific PICs.
@@ -339,7 +337,7 @@
 1:									\
 	pushq	%r13			/* save for Xdoreti */		;\
 	movl	%ebx,CPUVAR(ILEVEL)					;\
-	sti								;\
+	STI(si)								;\
 	incl	CPUVAR(IDEPTH)						;\
 	movq	IS_HANDLERS(%r14),%rbx					;\
 6:									\
@@ -354,20 +352,20 @@
 	testq	%rbx,%rbx						;\
 	jnz	6b							;\
 5:									\
-	cli								;\
+	CLI(si)								;\
 	unmask(num)			/* unmask it in hardware */	;\
 	late_ack(num)							;\
-	sti								;\
+	STI(si)								;\
 	jmp	_C_LABEL(Xdoreti)	/* lower spl and do ASTs */	;\
 7:									\
-	cli								;\
+	CLI(si)								;\
 	orl	$(1 << num),CPUVAR(IPENDING)				;\
 	level_mask(num)							;\
 	late_ack(num)							;\
-	sti								;\
+	STI(si)								;\
 	jmp	_C_LABEL(Xdoreti)	/* lower spl and do ASTs */	;\
 10:									\
-	cli								;\
+	CLI(si)								;\
 	orl	$(1 << num),CPUVAR(IPENDING)				;\
 	level_mask(num)							;\
 	late_ack(num)							;\
@@ -388,6 +386,7 @@
 
 #define ICUADDR IO_ICU1
 
+#if !defined(XEN) || defined(DOM0OPS)
 INTRSTUB(legacy,0,i8259_asm_ack1,voidop,i8259_asm_mask,i8259_asm_unmask,
     voidop)
 INTRSTUB(legacy,1,i8259_asm_ack1,voidop,i8259_asm_mask,i8259_asm_unmask,
@@ -423,7 +422,8 @@
     voidop)
 INTRSTUB(legacy,15,i8259_asm_ack2,voidop,i8259_asm_mask,i8259_asm_unmask,
     voidop)
-
+#endif
+#if !defined(XEN)
 #if NIOAPIC > 0
 
 INTRSTUB(ioapic_edge,0,voidop,ioapic_asm_ack,voidop,voidop,voidop)
@@ -643,111 +643,124 @@
 INTRSTUB_ARRAY_32(x2apic_level)
 #endif
 
-#endif /* !defined(XEN) */
+#else /* defined(XEN) */
+
+// XXX: Move these to xen/include/xenev.h or some such
+// XXX: Pic / evtchn Locking
+#define xenev_asm_unmask(num)				 \
+	xorq	%rdi, %rdi				;\
+	movl	IS_PIN(%r14), %edi     			;\
+	callq	_C_LABEL(hypervisor_unmask_event)
 
-#if defined(XEN)
-/* Resume/recurse procedures for spl() */
-#define	XENINTRSTUB(name, num, early_ack, late_ack, mask, unmask, level_mask) \
-IDTVEC(recurse_ ## name ## num)						;\
-	INTR_RECURSE_HWFRAME						;\
-	subq	$8,%rsp							;\
-	pushq	$T_ASTFLT		/* trap # for doing ASTs */	;\
-	INTR_RECURSE_ENTRY						;\
-IDTVEC(resume_ ## name ## num)						\
-	movq	$IREENT_MAGIC,TF_ERR(%rsp)				;\
-	movl	%ebx,%r13d						;\
-	movq	CPUVAR(ISOURCES) + (num) * 8,%r14			;\
-1:									\
-	pushq	%r13							;\
-	movl	$num,CPUVAR(ILEVEL)					;\
-	STI(si)								;\
-	incl	CPUVAR(IDEPTH)						;\
-	movq	IS_HANDLERS(%r14),%rbx					;\
-6:									\
-	movq	IH_ARG(%rbx),%rdi					;\
-	movq	%rsp,%rsi						;\
-	call	*IH_FUN(%rbx)		/* call it */			;\
-	movq	IH_NEXT(%rbx),%rbx	/* next handler in chain */	;\
-	testq	%rbx,%rbx						;\
-	jnz	6b							;\
-5:									\
-	CLI(si)								;\
-	unmask(num)			/* unmask it in hardware */	;\
-	late_ack(num)							;\
-	STI(si)								;\
-	jmp	_C_LABEL(Xdoreti)	/* lower spl and do ASTs */	;\
+#define xenev_asm_mask(num)				 \
+	xorq	%rdi, %rdi				;\
+	movl	IS_PIN(%r14), %edi     			;\
+	callq	_C_LABEL(hypervisor_mask_event)
+
+#define xenev_asm_ack(num)				 \
+	xorq	%rdi, %rdi				;\
+	movl	IS_PIN(%r14), %edi     			;\
+	callq	_C_LABEL(hypervisor_ack_event)
 
-/* The unmask func for Xen events */
-#define hypervisor_asm_unmask(num)			\
-	movq	$num,%rdi				;\
-	call	_C_LABEL(hypervisor_enable_ipl)
-
-XENINTRSTUB(xenev,0,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
-XENINTRSTUB(xenev,1,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
-XENINTRSTUB(xenev,2,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
-XENINTRSTUB(xenev,3,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
-XENINTRSTUB(xenev,4,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
-XENINTRSTUB(xenev,5,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
-XENINTRSTUB(xenev,6,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
-XENINTRSTUB(xenev,7,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
-XENINTRSTUB(xenev,8,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
-XENINTRSTUB(xenev,9,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
-XENINTRSTUB(xenev,10,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
-XENINTRSTUB(xenev,11,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
-XENINTRSTUB(xenev,12,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
-XENINTRSTUB(xenev,13,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
-XENINTRSTUB(xenev,14,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
-XENINTRSTUB(xenev,15,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
-XENINTRSTUB(xenev,16,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
-XENINTRSTUB(xenev,17,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
-XENINTRSTUB(xenev,18,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
-XENINTRSTUB(xenev,19,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
-XENINTRSTUB(xenev,20,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
-XENINTRSTUB(xenev,21,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
-XENINTRSTUB(xenev,22,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
-XENINTRSTUB(xenev,23,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
-XENINTRSTUB(xenev,24,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
-XENINTRSTUB(xenev,25,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
-XENINTRSTUB(xenev,26,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
-XENINTRSTUB(xenev,27,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
-XENINTRSTUB(xenev,28,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
-XENINTRSTUB(xenev,29,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
-XENINTRSTUB(xenev,30,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
-XENINTRSTUB(xenev,31,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
+	
+INTRSTUB(xenev,0,voidop,voidop,xenev_asm_mask,xenev_asm_unmask,voidop)
+INTRSTUB(xenev,1,voidop,voidop,xenev_asm_mask,xenev_asm_unmask,voidop)
+INTRSTUB(xenev,2,voidop,voidop,xenev_asm_mask,xenev_asm_unmask,voidop)
+INTRSTUB(xenev,3,voidop,voidop,xenev_asm_mask,xenev_asm_unmask,voidop)
+INTRSTUB(xenev,4,voidop,voidop,xenev_asm_mask,xenev_asm_unmask,voidop)
+INTRSTUB(xenev,5,voidop,voidop,xenev_asm_mask,xenev_asm_unmask,voidop)
+INTRSTUB(xenev,6,voidop,voidop,xenev_asm_mask,xenev_asm_unmask,voidop)
+INTRSTUB(xenev,7,voidop,voidop,xenev_asm_mask,xenev_asm_unmask,voidop)
+INTRSTUB(xenev,8,voidop,voidop,xenev_asm_mask,xenev_asm_unmask,voidop)
+INTRSTUB(xenev,9,voidop,voidop,xenev_asm_mask,xenev_asm_unmask,voidop)
+INTRSTUB(xenev,10,voidop,voidop,xenev_asm_mask,xenev_asm_unmask,voidop)
+INTRSTUB(xenev,11,voidop,voidop,xenev_asm_mask,xenev_asm_unmask,voidop)
+INTRSTUB(xenev,12,voidop,voidop,xenev_asm_mask,xenev_asm_unmask,voidop)
+INTRSTUB(xenev,13,voidop,voidop,xenev_asm_mask,xenev_asm_unmask,voidop)
+INTRSTUB(xenev,14,voidop,voidop,xenev_asm_mask,xenev_asm_unmask,voidop)
+INTRSTUB(xenev,15,voidop,voidop,xenev_asm_mask,xenev_asm_unmask,voidop)
+INTRSTUB(xenev,16,voidop,voidop,xenev_asm_mask,xenev_asm_unmask,voidop)
+INTRSTUB(xenev,17,voidop,voidop,xenev_asm_mask,xenev_asm_unmask,voidop)
+INTRSTUB(xenev,18,voidop,voidop,xenev_asm_mask,xenev_asm_unmask,voidop)
+INTRSTUB(xenev,19,voidop,voidop,xenev_asm_mask,xenev_asm_unmask,voidop)
+INTRSTUB(xenev,20,voidop,voidop,xenev_asm_mask,xenev_asm_unmask,voidop)
+INTRSTUB(xenev,21,voidop,voidop,xenev_asm_mask,xenev_asm_unmask,voidop)
+INTRSTUB(xenev,22,voidop,voidop,xenev_asm_mask,xenev_asm_unmask,voidop)
+INTRSTUB(xenev,23,voidop,voidop,xenev_asm_mask,xenev_asm_unmask,voidop)
+INTRSTUB(xenev,24,voidop,voidop,xenev_asm_mask,xenev_asm_unmask,voidop)
+INTRSTUB(xenev,25,voidop,voidop,xenev_asm_mask,xenev_asm_unmask,voidop)
+INTRSTUB(xenev,26,voidop,voidop,xenev_asm_mask,xenev_asm_unmask,voidop)
+INTRSTUB(xenev,27,voidop,voidop,xenev_asm_mask,xenev_asm_unmask,voidop)
+INTRSTUB(xenev,28,voidop,voidop,xenev_asm_mask,xenev_asm_unmask,voidop)
+INTRSTUB(xenev,29,voidop,voidop,xenev_asm_mask,xenev_asm_unmask,voidop)
+INTRSTUB(xenev,30,voidop,voidop,xenev_asm_mask,xenev_asm_unmask,voidop)
+INTRSTUB(xenev,31,voidop,voidop,xenev_asm_mask,xenev_asm_unmask,voidop)
 
 LABEL(xenev_stubs)
-	.quad _C_LABEL(Xrecurse_xenev0), _C_LABEL(Xresume_xenev0)
-	.quad _C_LABEL(Xrecurse_xenev1) ,_C_LABEL(Xresume_xenev1)
-	.quad _C_LABEL(Xrecurse_xenev2) ,_C_LABEL(Xresume_xenev2)
-	.quad _C_LABEL(Xrecurse_xenev3) ,_C_LABEL(Xresume_xenev3)
-	.quad _C_LABEL(Xrecurse_xenev4) ,_C_LABEL(Xresume_xenev4)
-	.quad _C_LABEL(Xrecurse_xenev5) ,_C_LABEL(Xresume_xenev5)
-	.quad _C_LABEL(Xrecurse_xenev6) ,_C_LABEL(Xresume_xenev6)
-	.quad _C_LABEL(Xrecurse_xenev7) ,_C_LABEL(Xresume_xenev7)
-	.quad _C_LABEL(Xrecurse_xenev8) ,_C_LABEL(Xresume_xenev8)
-	.quad _C_LABEL(Xrecurse_xenev9) ,_C_LABEL(Xresume_xenev9)
-	.quad _C_LABEL(Xrecurse_xenev10), _C_LABEL(Xresume_xenev10)
-	.quad _C_LABEL(Xrecurse_xenev11), _C_LABEL(Xresume_xenev11)
-	.quad _C_LABEL(Xrecurse_xenev12), _C_LABEL(Xresume_xenev12)
-	.quad _C_LABEL(Xrecurse_xenev13), _C_LABEL(Xresume_xenev13)
-	.quad _C_LABEL(Xrecurse_xenev14), _C_LABEL(Xresume_xenev14)
-	.quad _C_LABEL(Xrecurse_xenev15), _C_LABEL(Xresume_xenev15)
-	.quad _C_LABEL(Xrecurse_xenev16), _C_LABEL(Xresume_xenev16)
-	.quad _C_LABEL(Xrecurse_xenev17), _C_LABEL(Xresume_xenev17)
-	.quad _C_LABEL(Xrecurse_xenev18), _C_LABEL(Xresume_xenev18)
-	.quad _C_LABEL(Xrecurse_xenev19), _C_LABEL(Xresume_xenev19)
-	.quad _C_LABEL(Xrecurse_xenev20), _C_LABEL(Xresume_xenev20)
-	.quad _C_LABEL(Xrecurse_xenev21), _C_LABEL(Xresume_xenev21)
-	.quad _C_LABEL(Xrecurse_xenev22), _C_LABEL(Xresume_xenev22)
-	.quad _C_LABEL(Xrecurse_xenev23), _C_LABEL(Xresume_xenev23)
-	.quad _C_LABEL(Xrecurse_xenev24), _C_LABEL(Xresume_xenev24)
-	.quad _C_LABEL(Xrecurse_xenev25), _C_LABEL(Xresume_xenev25)
-	.quad _C_LABEL(Xrecurse_xenev26), _C_LABEL(Xresume_xenev26)
-	.quad _C_LABEL(Xrecurse_xenev27), _C_LABEL(Xresume_xenev27)
-	.quad _C_LABEL(Xrecurse_xenev28), _C_LABEL(Xresume_xenev28)
-	.quad _C_LABEL(Xrecurse_xenev29), _C_LABEL(Xresume_xenev29)
-	.quad _C_LABEL(Xrecurse_xenev30), _C_LABEL(Xresume_xenev30)
-	.quad _C_LABEL(Xrecurse_xenev31), _C_LABEL(Xresume_xenev31)
+	.quad _C_LABEL(Xintr_xenev0), _C_LABEL(Xrecurse_xenev0)
+	.quad _C_LABEL(Xresume_xenev0)
+	.quad _C_LABEL(Xintr_xenev1), _C_LABEL(Xrecurse_xenev1)
+	.quad _C_LABEL(Xresume_xenev1)
+	.quad _C_LABEL(Xintr_xenev2), _C_LABEL(Xrecurse_xenev2)
+	.quad _C_LABEL(Xresume_xenev2)
+	.quad _C_LABEL(Xintr_xenev3), _C_LABEL(Xrecurse_xenev3)
+	.quad _C_LABEL(Xresume_xenev3)
+	.quad _C_LABEL(Xintr_xenev4), _C_LABEL(Xrecurse_xenev4)
+	.quad _C_LABEL(Xresume_xenev4)
+	.quad _C_LABEL(Xintr_xenev5), _C_LABEL(Xrecurse_xenev5)
+	.quad _C_LABEL(Xresume_xenev5)
+	.quad _C_LABEL(Xintr_xenev6), _C_LABEL(Xrecurse_xenev6)
+	.quad _C_LABEL(Xresume_xenev6)
+	.quad _C_LABEL(Xintr_xenev7), _C_LABEL(Xrecurse_xenev7)
+	.quad _C_LABEL(Xresume_xenev7)
+	.quad _C_LABEL(Xintr_xenev8), _C_LABEL(Xrecurse_xenev8)
+	.quad _C_LABEL(Xresume_xenev8)
+	.quad _C_LABEL(Xintr_xenev9), _C_LABEL(Xrecurse_xenev9)
+	.quad _C_LABEL(Xresume_xenev9)
+	.quad _C_LABEL(Xintr_xenev10), _C_LABEL(Xrecurse_xenev10)
+	.quad _C_LABEL(Xresume_xenev10)
+	.quad _C_LABEL(Xintr_xenev11), _C_LABEL(Xrecurse_xenev11)
+	.quad _C_LABEL(Xresume_xenev11)
+	.quad _C_LABEL(Xintr_xenev12), _C_LABEL(Xrecurse_xenev12)
+	.quad _C_LABEL(Xresume_xenev12)
+	.quad _C_LABEL(Xintr_xenev13), _C_LABEL(Xrecurse_xenev13)
+	.quad _C_LABEL(Xresume_xenev13)
+	.quad _C_LABEL(Xintr_xenev14), _C_LABEL(Xrecurse_xenev14)
+	.quad _C_LABEL(Xresume_xenev14)
+	.quad _C_LABEL(Xintr_xenev15), _C_LABEL(Xrecurse_xenev15)
+	.quad _C_LABEL(Xresume_xenev15)
+	.quad _C_LABEL(Xintr_xenev16), _C_LABEL(Xrecurse_xenev16)
+	.quad _C_LABEL(Xresume_xenev16)
+	.quad _C_LABEL(Xintr_xenev17), _C_LABEL(Xrecurse_xenev17)
+	.quad _C_LABEL(Xresume_xenev17)
+	.quad _C_LABEL(Xintr_xenev18), _C_LABEL(Xrecurse_xenev18)
+	.quad _C_LABEL(Xresume_xenev18)
+	.quad _C_LABEL(Xintr_xenev19), _C_LABEL(Xrecurse_xenev19)
+	.quad _C_LABEL(Xresume_xenev19)
+	.quad _C_LABEL(Xintr_xenev20), _C_LABEL(Xrecurse_xenev20)
+	.quad _C_LABEL(Xresume_xenev20)
+	.quad _C_LABEL(Xintr_xenev21), _C_LABEL(Xrecurse_xenev21)
+	.quad _C_LABEL(Xresume_xenev21)
+	.quad _C_LABEL(Xintr_xenev22), _C_LABEL(Xrecurse_xenev22)
+	.quad _C_LABEL(Xresume_xenev22)
+	.quad _C_LABEL(Xintr_xenev23), _C_LABEL(Xrecurse_xenev23)
+	.quad _C_LABEL(Xresume_xenev23)
+	.quad _C_LABEL(Xintr_xenev24), _C_LABEL(Xrecurse_xenev24)
+	.quad _C_LABEL(Xresume_xenev24)
+	.quad _C_LABEL(Xintr_xenev25), _C_LABEL(Xrecurse_xenev25)
+	.quad _C_LABEL(Xresume_xenev25)
+	.quad _C_LABEL(Xintr_xenev26), _C_LABEL(Xrecurse_xenev26)
+	.quad _C_LABEL(Xresume_xenev26)
+	.quad _C_LABEL(Xintr_xenev27), _C_LABEL(Xrecurse_xenev27)
+	.quad _C_LABEL(Xresume_xenev27)
+	.quad _C_LABEL(Xintr_xenev28), _C_LABEL(Xrecurse_xenev28)
+	.quad _C_LABEL(Xresume_xenev28)
+	.quad _C_LABEL(Xintr_xenev29), _C_LABEL(Xrecurse_xenev29)
+	.quad _C_LABEL(Xresume_xenev29)
+	.quad _C_LABEL(Xintr_xenev30), _C_LABEL(Xrecurse_xenev30)
+	.quad _C_LABEL(Xresume_xenev30)
+	.quad _C_LABEL(Xintr_xenev31), _C_LABEL(Xrecurse_xenev31)
+	.quad _C_LABEL(Xresume_xenev31)
 END(xenev_stubs)
 
 /*
@@ -762,13 +775,7 @@
 	pushq	$0		/* Dummy error code */
 	pushq	$T_ASTFLT
 	INTRENTRY
-	/* sti?? */
-	movq	%rsp,%rdi
-	subq	$8,%rdi;	/* don't forget if_ppl */
-	call	do_hypervisor_callback
-	testb	$SEL_RPL,TF_CS(%rsp)
-	jnz	doreti_checkast
-1:
+	callq	_C_LABEL(xen_do_event_2l)
 	INTRFASTEXIT
 END(hypervisor_callback)
 
@@ -787,4 +794,4 @@
 /*	jmp	HYPERVISOR_iret */
 END(failsafe_callback)
 
-#endif	/* !XEN */
+#endif	/* XEN */
diff -r 0d3b02e26e2b sys/arch/amd64/conf/XEN3_DOMU
--- a/sys/arch/amd64/conf/XEN3_DOMU	Fri Oct 26 05:35:00 2018 +0000
+++ b/sys/arch/amd64/conf/XEN3_DOMU	Tue Nov 06 16:14:38 2018 +0000
@@ -53,7 +53,7 @@
 options 	DDB_HISTORY_SIZE=512	# enable history editing in DDB
 #options 	KGDB		# remote debugger
 #options 	KGDB_DEVNAME="\"com\"",KGDB_DEVADDR=0x2f8,KGDB_DEVRATE=57600
-#makeoptions	DEBUG="-g"	# compile full symbol table
+makeoptions	DEBUG="-g"	# compile full symbol table
 makeoptions	COPTS="-O2 -fno-omit-frame-pointer"
 options DDB_COMMANDONENTER="trace;show registers"
 
diff -r 0d3b02e26e2b sys/arch/amd64/include/frameasm.h
--- a/sys/arch/amd64/include/frameasm.h	Fri Oct 26 05:35:00 2018 +0000
+++ b/sys/arch/amd64/include/frameasm.h	Tue Nov 06 16:14:38 2018 +0000
@@ -18,6 +18,7 @@
 /* Xen do not need swapgs, done by hypervisor */
 #define swapgs
 #define iretq	pushq $0 ; jmp HYPERVISOR_iret
+#define XEN_ONLY(x) x
 #define	XEN_ONLY2(x,y)	x,y
 #define	NOT_XEN(x)
 
@@ -29,11 +30,22 @@
  	movq CPUVAR(VCPU),%r ## temp_reg ;			\
 	movb $0,EVTCHN_UPCALL_MASK(%r ## temp_reg);
 
+#define PUSHF(temp_reg)						\
+ 	movq CPUVAR(VCPU),%r ## temp_reg ;			\
+	pushq EVTCHN_UPCALL_MASK(%r ## temp_reg);
+
+#define POPF(temp_reg) \
+ 	movq CPUVAR(VCPU),%r ## temp_reg ;			\
+	popq EVTCHN_UPCALL_MASK(%r ## temp_reg);
+
 #else /* XEN */
+#define XEN_ONLY(x)
 #define	XEN_ONLY2(x,y)
 #define	NOT_XEN(x)	x
 #define CLI(temp_reg) cli
 #define STI(temp_reg) sti
+#define PUSHF(temp_reg) pushf
+#define POPF(temp_reg) popf
 #endif	/* XEN */
 
 #define HP_NAME_CLAC		1
diff -r 0d3b02e26e2b sys/arch/x86/include/intr.h
--- a/sys/arch/x86/include/intr.h	Fri Oct 26 05:35:00 2018 +0000
+++ b/sys/arch/x86/include/intr.h	Tue Nov 06 16:14:38 2018 +0000
@@ -69,9 +69,7 @@
  */
 
 struct intrstub {
-#if !defined(XEN)
 	void *ist_entry;
-#endif
 	void *ist_recurse;
 	void *ist_resume;
 };
@@ -90,10 +88,6 @@
 	void *is_recurse;		/* entry for spllower */
 	void *is_resume;		/* entry for doreti */
 	lwp_t *is_lwp;			/* for soft interrupts */
-#if defined(XEN)
-	u_long ipl_evt_mask1;	/* pending events for this IPL */
-	u_long ipl_evt_mask2[NR_EVENT_CHANNELS];
-#endif
 	struct evcnt is_evcnt;		/* interrupt counter per cpu */
 	int is_flags;			/* see below */
 	int is_type;			/* level, edge */
@@ -117,18 +111,6 @@
  */
 
 struct intrhand {
-#if defined(XEN)
-	/*
-	 * Note: This is transitional and will go away.
-	 *
-	 * We ought to use a union here, but too much effort.
-	 * We use this field to tear down the cookie handed to us
-	 * via x86/intr.c:intr_disestablish();
-	 * Interestingly, the intr_establish_xname() function returns
-	 * a "void *" - so we abuse this for now.
-	 */
-	int	pic_type; /* Overloading wrt struct pintrhand */
-#endif
 	int	(*ih_fun)(void *);
 	void	*ih_arg;
 	int	ih_level;
@@ -138,9 +120,6 @@
 	struct	intrhand **ih_prevp;
 	int	ih_pin;
 	int	ih_slot;
-#if defined(XEN)
-	struct	intrhand *ih_evt_next;
-#endif
 	struct cpu_info *ih_cpu;
 };
 
diff -r 0d3b02e26e2b sys/arch/x86/include/intrdefs.h
--- a/sys/arch/x86/include/intrdefs.h	Fri Oct 26 05:35:00 2018 +0000
+++ b/sys/arch/x86/include/intrdefs.h	Tue Nov 06 16:14:38 2018 +0000
@@ -13,7 +13,8 @@
 #define	IPL_VM		0x6	/* low I/O, memory allocation */
 #define IPL_SCHED	0x7	/* medium I/O, scheduler, clock */
 #define	IPL_HIGH	0x8	/* high I/O, statclock, IPIs */
-#define	NIPL		9
+#define IPL_HYPERVISOR  0x9	/* Exclusively used by hypervisor callback */
+#define	NIPL		10
 
 /* Interrupt sharing types. */
 #define	IST_NONE	0	/* none */
diff -r 0d3b02e26e2b sys/arch/x86/x86/intr.c
--- a/sys/arch/x86/x86/intr.c	Fri Oct 26 05:35:00 2018 +0000
+++ b/sys/arch/x86/x86/intr.c	Tue Nov 06 16:14:38 2018 +0000
@@ -159,6 +159,8 @@
 
 #include <uvm/uvm_extern.h>
 
+#include <machine/cpuvar.h>
+
 #include <machine/i8259.h>
 #include <machine/pio.h>
 
@@ -227,7 +229,6 @@
 #endif
 #endif
 
-#if !defined(XEN)
 static int intr_allocate_slot_cpu(struct cpu_info *, struct pic *, int, int *,
 				  struct intrsource *);
 static int __noinline intr_allocate_slot(struct pic *, int, int,
@@ -238,14 +239,9 @@
 
 static void intr_establish_xcall(void *, void *);
 static void intr_disestablish_xcall(void *, void *);
-#endif
 
 static const char *legacy_intr_string(int, char *, size_t, struct pic *);
 
-#if defined(XEN) /* XXX: nuke conditional after integration */
-static const char *xen_intr_string(int, char *, size_t, struct pic *);
-#endif /* XXX: XEN */
-
 #if defined(INTRSTACKSIZE)
 static inline bool redzone_const_or_false(bool);
 static inline int redzone_const_or_zero(int);
@@ -258,7 +254,6 @@
 
 static struct intrsource *intr_get_io_intrsource(const char *);
 static void intr_free_io_intrsource_direct(struct intrsource *);
-#if !defined(XEN)
 static int intr_num_handlers(struct intrsource *);
 
 static int intr_find_unused_slot(struct cpu_info *, int *);
@@ -266,7 +261,10 @@
 static void intr_deactivate_xcall(void *, void *);
 static void intr_get_affinity(struct intrsource *, kcpuset_t *);
 static int intr_set_affinity(struct intrsource *, const kcpuset_t *);
-#endif /* XEN */
+
+#if defined(XEN)
+static const char *xen_intr_string(int, char *, size_t, struct pic *);
+#endif
 
 /*
  * Fill in default interrupt table (in case of spurious interrupt
@@ -275,7 +273,7 @@
 void
 intr_default_setup(void)
 {
-#if !defined(XEN)
+#if !defined(XEN) || defined(DOM0OPS)
 	int i;
 
 	/* icu vectors */
@@ -289,7 +287,8 @@
 	 */
 	i8259_default_setup();
 
-#else
+#endif
+#if defined(XEN)
 	events_default_setup();
 #endif /* !XEN */
 	mutex_init(&intr_distribute_lock, MUTEX_DEFAULT, IPL_NONE);
@@ -645,7 +644,6 @@
 	intr_free_io_intrsource_direct(isp);
 }
 
-#if !defined(XEN)
 static int
 intr_allocate_slot_cpu(struct cpu_info *ci, struct pic *pic, int pin,
 		       int *index, struct intrsource *chained)
@@ -655,7 +653,7 @@
 
 	KASSERT(mutex_owned(&cpu_lock));
 
-	if (pic == &i8259_pic) {
+	if (pic->pic_type == PIC_I8259) {
 		KASSERT(CPU_IS_PRIMARY(ci));
 		slot = pin;
 	} else {
@@ -686,7 +684,9 @@
 
 		isp = chained;
 		KASSERT(isp != NULL);
-		if (pic->pic_type == PIC_MSI || pic->pic_type == PIC_MSIX)
+		if (pic->pic_type == PIC_XEN)
+			via = "channel";
+		else if (pic->pic_type == PIC_MSI || pic->pic_type == PIC_MSIX)
 			via = "vec";
 		else
 			via = "pin";
@@ -742,13 +742,21 @@
 	 * PIC and APIC usage are essentially exclusive, so the reservation
 	 * of the ISA slots is ignored when assigning IOAPIC slots.
 	 */
-	if (pic == &i8259_pic) {
+	switch(pic->pic_type) {
+	case PIC_I8259:
 		/*
 		 * Must be directed to BP.
 		 */
 		ci = &cpu_info_primary;
-		error = intr_allocate_slot_cpu(ci, pic, pin, &slot, chained);
-	} else {
+		break;
+	case PIC_XEN:
+		/* 
+		 * Note: this will override any subsequent
+		 * xen_pic.pic_route() request.
+		 */
+		ci = evtsource[pin].ci = curcpu();
+		break;
+	default:
 		/*
 		 * Find least loaded AP/BP and try to allocate there.
 		 */
@@ -766,26 +774,29 @@
 			ci = &cpu_info_primary;
 #endif
 		}
-		KASSERT(ci != NULL);
-		error = intr_allocate_slot_cpu(ci, pic, pin, &slot, chained);
+		break;
+	}
+	KASSERT(ci != NULL);
+
+	error = intr_allocate_slot_cpu(ci, pic, pin, &slot, chained);
 
-		/*
-		 * If that did not work, allocate anywhere.
-		 */
-		if (error != 0) {
-			for (CPU_INFO_FOREACH(cii, ci)) {
-				if ((ci->ci_schedstate.spc_flags &
-				    SPCF_NOINTR) != 0) {
-					continue;
-				}
-				error = intr_allocate_slot_cpu(ci, pic,
-				    pin, &slot, chained);
-				if (error == 0) {
-					break;
-				}
+	/*
+	 * If that did not work, allocate anywhere.
+	 */
+	if (error != 0) {
+		for (CPU_INFO_FOREACH(cii, ci)) {
+			if ((ci->ci_schedstate.spc_flags &
+			    SPCF_NOINTR) != 0) {
+				continue;
+			}
+			error = intr_allocate_slot_cpu(ci, pic,
+			    pin, &slot, chained);
+			if (error == 0) {
+				break;
 			}
 		}
 	}
+
 	if (error != 0) {
 		return error;
 	}
@@ -795,10 +806,66 @@
 	 * Now allocate an IDT vector.
 	 * For the 8259 these are reserved up front.
 	 */
-	if (pic == &i8259_pic) {
+	switch (pic->pic_type) {
+#if !defined(XEN)
+	case PIC_I8259:
 		idtvec = ICU_OFFSET + pin;
-	} else {
+		break;
+	case PIC_IOAPIC:
+	case PIC_LAPIC:
+	case PIC_MSI:
+	case PIC_MSIX:
 		idtvec = idt_vec_alloc(APIC_LEVEL(level), IDT_INTR_HIGH);
+		break;
+#elif defined(DOM0OPS)
+		/*
+		 * Normally the pic on Xen PV is "xenev". However on
+		 * PV dom0, we have the situation where the real pic
+		 * needs to be programmed. Here 'idtvec' has nothing
+		 * to do with the cpu IDT. Instead it is a global
+		 * irq number which we query from the Hypervisor. This
+		 * number is then bound to an event whose handler acts
+		 * as the irq handler. We hide all this detail inside
+		 * the xen_vec_alloc() call, with the caveat that
+		 * what's returned is not a cpu idtvec as usual, but a
+		 * value from a separate 'gsi' number space whose
+		 * semantics only Xen needs to know.
+		 */
+
+	case PIC_I8259:
+	case PIC_IOAPIC:
+	case PIC_LAPIC:
+		idtvec = xen_vec_alloc(xen_pic_to_gsi(pic, pin));
+		break;
+
+	case PIC_MSI:  /* TODO: */
+	case PIC_MSIX: /* TODO: */
+		panic("TODO: MSI/MSIX on Xen\n");
+		/* NOTREACHED */
+#endif
+#if defined(XEN)
+	case PIC_XEN:
+		/*
+		 * for "xenev", the pin is the port number of the
+		 * event we're wiring in. This means that the
+		 * underlying binding needs to be done outside of this
+		 * call.
+		 *
+		 * If each pin source type had a separate pic
+		 * this problem would go away.
+		 *
+		 * We set idtvec = pin to not bail out with EBUSY
+		 * below. This is semantically equivalent to the
+		 * (ci, idt_vec) tuple, so there is no loss of
+		 * information.
+		 */
+		idtvec = pin;
+		break;
+#endif
+	default:
+		idtvec = 0; /* Unknown pic! */
+		panic("Slot requested for unknown pic!");
+		break;
 	}
 	if (idtvec == 0) {
 		evcnt_detach(&ci->ci_isources[slot]->is_evcnt);
@@ -822,8 +889,20 @@
 	if (isp->is_handlers != NULL)
 		return;
 	ci->ci_isources[slot] = NULL;
-	if (pic != &i8259_pic)
+	if (pic->pic_type != PIC_I8259 &&
+	    pic->pic_type != PIC_XEN)
 		idt_vec_free(idtvec);
+
+	switch(pic->pic_type) {
+	case PIC_I8259:
+	case PIC_XEN:
+		break;
+	default:
+#if !defined(DOM0OPS) /* Xen has no equivalent */
+		idt_vec_free(idtvec);
+#endif
+		break;
+	}
 }
 
 #ifdef MULTIPROCESSOR
@@ -848,9 +927,7 @@
 	return ret;
 }
 #endif /* MULTIPROCESSOR */
-#endif /* XEN */
 
-#if defined(DOM0OPS) || !defined(XEN)
 struct pic *
 intr_findpic(int num)
 {
@@ -861,14 +938,14 @@
 	if (pic != NULL)
 		return &pic->sc_pic;
 #endif
+#if !defined(XEN) || defined(DOM0OPS)
 	if (num < NUM_LEGACY_IRQS)
 		return &i8259_pic;
+#endif
 
 	return NULL;
 }
-#endif
 
-#if !defined(XEN)
 /*
  * Append device name to intrsource. If device A and device B share IRQ number,
  * the device name of the interrupt id is "device A, device B".
@@ -901,7 +978,7 @@
 
 	ih = arg1;
 
-	KASSERT(ih->ih_cpu == curcpu() || !mp_online);
+	KASSERT(ih->ih_cpu == curcpu() || !x86_mp_online);
 
 	ci = ih->ih_cpu;
 	source = ci->ci_isources[ih->ih_slot];
@@ -918,7 +995,15 @@
 	/* Hook in new IDT vector and SPL state. */
 	if (source->is_resume == NULL || source->is_idtvec != idt_vec) {
 		if (source->is_idtvec != 0 && source->is_idtvec != idt_vec)
-			idt_vec_free(source->is_idtvec);
+			switch(source->is_pic->pic_type) {
+#if defined(XEN)				
+			case PIC_XEN:
+				xen_channel_free(source->is_idtvec);
+				break;
+#endif
+			default: /* Everything else is assumed to be x86 */
+				idt_vec_free(source->is_idtvec);
+			}
 		source->is_idtvec = idt_vec;
 		if (source->is_type == IST_LEVEL) {
 			stubp = &source->is_pic->pic_level_stubs[ih->ih_slot];
@@ -927,7 +1012,15 @@
 		}
 		source->is_resume = stubp->ist_resume;
 		source->is_recurse = stubp->ist_recurse;
-		idt_vec_set(idt_vec, stubp->ist_entry);
+		switch(source->is_pic->pic_type) {
+#if defined(XEN)
+		case PIC_XEN:
+			xen_channel_set(idt_vec, stubp->ist_entry, ih->ih_slot, ih->ih_level);
+			break;
+#endif
+		default: /* Everything else is assumed to be x86 */
+			idt_vec_set(idt_vec, stubp->ist_entry);
+		}
 	}
 
 	/* Re-enable interrupts locally. */
@@ -952,8 +1045,15 @@
 
 	KASSERTMSG((legacy_irq == -1 || (0 <= legacy_irq && legacy_irq < 16)),
 	    "bad legacy IRQ value: %d", legacy_irq);
-	KASSERTMSG((legacy_irq != -1 || pic != &i8259_pic),
-	    "non-legacy IRQ on i8259");
+	KASSERTMSG((legacy_irq != -1 || pic->pic_type == PIC_I8259
+#if defined(XEN)		    
+		    || pic->pic_type == PIC_XEN
+#if defined(DOM0OPS)
+		    || (pic->pic_type != PIC_XEN && xen_pic_to_gsi(pic, pin) < 255)
+#endif /* DOM0OPS */
+#endif /* XEN */
+		    ),
+	    "non-legacy IRQ on %s", pic->pic_name);
 
 	ih = kmem_alloc(sizeof(*ih), KM_SLEEP);
 	intrstr = intr_create_intrid(legacy_irq, pic, pin, intrstr_buf,
@@ -1088,7 +1188,7 @@
 	 * Call out to the remote CPU to update its interrupt state.
 	 * Only make RPCs if the APs are up and running.
 	 */
-	if (ci == curcpu() || !mp_online) {
+	if (ci == curcpu() || !x86_mp_online) {
 		intr_establish_xcall(ih, (void *)(intptr_t)idt_vec);
 	} else {
 		where = xc_unicast(0, intr_establish_xcall, ih,
@@ -1098,6 +1198,18 @@
 
 	/* All set up, so add a route for the interrupt and unmask it. */
 	(*pic->pic_addroute)(pic, ci, pin, idt_vec, type);
+#if defined(XEN) && (defined(DOM0OPS) || NPCI > 0)
+	/* pirq/gsi -> event cascade binding */
+	if (pic->pic_type != PIC_XEN) {
+		unsigned int ev;
+		ev = xen_vec_alloc_event(idt_vec);
+		xen_pic.pic_addroute(&xen_pic, ci, ev, ev, IST_LEVEL);
+		xen_pic.pic_hwunmask(&xen_pic, ev);
+		/* gsi<->ev glue */
+		hypervisor_prime_pirq_event(xen_vec_to_gsi(idt_vec), ev);
+		hypervisor_ack_pirq_event(ev);
+	}
+#endif
 	(*pic->pic_hwunmask)(pic, pin);
 	mutex_exit(&cpu_lock);
 
@@ -1140,7 +1252,7 @@
 	ih = arg1;
 	ci = ih->ih_cpu;
 
-	KASSERT(ci == curcpu() || !mp_online);
+	KASSERT(ci == curcpu() || !x86_mp_online);
 
 	/* Disable interrupts locally. */
 	psl = x86_read_psl();
@@ -1205,117 +1317,12 @@
 	return num;
 }
 
-#else /* XEN */
-void *
-intr_establish(int legacy_irq, struct pic *pic, int pin,
-    int type, int level, int (*handler)(void *), void *arg,
-    bool known_mpsafe)
-{
-
-	return intr_establish_xname(legacy_irq, pic, pin, type, level,
-	    handler, arg, known_mpsafe, "XEN");
-}
-
-void *
-intr_establish_xname(int legacy_irq, struct pic *pic, int pin,
-    int type, int level, int (*handler)(void *), void *arg,
-    bool known_mpsafe, const char *xname)
-{
-	const char *intrstr;
-	char intrstr_buf[INTRIDBUF];
-
-	if (pic->pic_type == PIC_XEN) {
-		struct intrhand *rih;
-
-		/*
-		 * event_set_handler interprets `level != IPL_VM' to
-		 * mean MP-safe, so we require the caller to match that
-		 * for the moment.
-		 */
-		KASSERT(known_mpsafe == (level != IPL_VM));
-
-		intrstr = intr_create_intrid(legacy_irq, pic, pin, intrstr_buf,
-		    sizeof(intrstr_buf));
-
-		event_set_handler(pin, handler, arg, level, intrstr, xname);
-
-		rih = kmem_zalloc(sizeof(*rih), cold ? KM_NOSLEEP : KM_SLEEP);
-		if (rih == NULL) {
-			printf("%s: can't allocate handler info\n", __func__);
-			return NULL;
-		}
-
-		/*
-		 * XXX:
-		 * This is just a copy for API conformance.
-		 * The real ih is lost in the innards of
-		 * event_set_handler(); where the details of
-		 * biglock_wrapper etc are taken care of.
-		 * All that goes away when we nuke event_set_handler()
-		 * et. al. and unify with x86/intr.c
-		 */
-		rih->ih_pin = pin; /* port */
-		rih->ih_fun = rih->ih_realfun = handler;
-		rih->ih_arg = rih->ih_realarg = arg;
-		rih->pic_type = pic->pic_type;
-		return rih;
-	} 	/* Else we assume pintr */
-
-#if NPCI > 0 || NISA > 0
-	struct pintrhand *pih;
-	int gsi;
-	int vector, evtchn;
-
-	KASSERTMSG(legacy_irq == -1 || (0 <= legacy_irq && legacy_irq < NUM_XEN_IRQS),
-	    "bad legacy IRQ value: %d", legacy_irq);
-	KASSERTMSG(!(legacy_irq == -1 && pic == &i8259_pic),
-	    "non-legacy IRQon i8259 ");
-
-	gsi = xen_pic_to_gsi(pic, pin);
-
-	intrstr = intr_create_intrid(gsi, pic, pin, intrstr_buf,
-	    sizeof(intrstr_buf));
-
-	vector = xen_vec_alloc(gsi);
-
-	if (irq2port[gsi] == 0) {
-		extern struct cpu_info phycpu_info_primary; /* XXX */
-		struct cpu_info *ci = &phycpu_info_primary;
-
-		pic->pic_addroute(pic, ci, pin, vector, type);
-
-		evtchn = bind_pirq_to_evtch(gsi);
-		KASSERT(evtchn > 0);
-		KASSERT(evtchn < NR_EVENT_CHANNELS);
-		irq2port[gsi] = evtchn + 1;
-		xen_atomic_set_bit(&ci->ci_evtmask[0], evtchn);
-	} else {
-		/*
-		 * Shared interrupt - we can't rebind.
-		 * The port is shared instead.
-		 */
-		evtchn = irq2port[gsi] - 1;
-	}
-
-	pih = pirq_establish(gsi, evtchn, handler, arg, level,
-			     intrstr, xname);
-	pih->pic_type = pic->pic_type;
-	return pih;
-#endif /* NPCI > 0 || NISA > 0 */
-
-	/* FALLTHROUGH */
-	return NULL;
-}
-
-#endif /* XEN */
-
 /*
  * Deregister an interrupt handler.
  */
 void
 intr_disestablish(struct intrhand *ih)
 {
-#if !defined(XEN)
 	struct cpu_info *ci;
 	struct intrsource *isp;
 	uint64_t where;
@@ -1330,7 +1337,7 @@
 	(ci->ci_nintrhand)--;
 	KASSERT(ci->ci_nintrhand >= 0);
 	isp = ci->ci_isources[ih->ih_slot];
-	if (ci == curcpu() || !mp_online) {
+	if (ci == curcpu() || !x86_mp_online) {
 		intr_disestablish_xcall(ih, NULL);
 	} else {
 		where = xc_unicast(0, intr_disestablish_xcall, ih, NULL, ci);
@@ -1341,42 +1348,7 @@
 	}
 	mutex_exit(&cpu_lock);
 	kmem_free(ih, sizeof(*ih));
-#else /* XEN */
-	if (ih->pic_type == PIC_XEN) {
-		event_remove_handler(ih->ih_pin, ih->ih_realfun,
-		    ih->ih_realarg);
-		kmem_free(ih, sizeof(*ih));
-		return;
-	}
-#if defined(DOM0OPS)
-	/* 
-	 * Cache state, to prevent a use after free situation with
-	 * ih.
-	 */
-
-	struct pintrhand *pih = (struct pintrhand *)ih;
-
-	int pirq = pih->pirq;
-	int port = pih->evtch;
-	KASSERT(irq2port[pirq] != 0);
-
-	pirq_disestablish(pih);
-
-	if (evtsource[port] == NULL) {
-			/*
-			 * Last handler was removed by
-			 * event_remove_handler().
-			 *
-			 * We can safely unbind the pirq now.
-			 */
-
-			port = unbind_pirq_from_evtch(pirq);
-			KASSERT(port == pih->evtch);
-			irq2port[pirq] = 0;
-	}
-#endif
 	return;
-#endif /* XEN */
 }
 
 #if defined(XEN) /* nuke conditional post integration */
@@ -1490,8 +1462,9 @@
 void
 cpu_intr_init(struct cpu_info *ci)
 {
-#if !defined(XEN)
+#if NLAPIC > 0 || defined(__HAVE_PREEMPTION)
 	struct intrsource *isp;
+#endif
 #if NLAPIC > 0
 	static int first = 1;
 #if defined(MULTIPROCESSOR)
@@ -1539,13 +1512,6 @@
 #endif
 	intr_calculatemasks(ci);
 
-#else /* XEN */
-	int i; /* XXX: duplicate */
-	ci->ci_iunmask[0] = 0xfffffffe;
-	for (i = 1; i < NIPL; i++)
-		ci->ci_iunmask[i] = ci->ci_iunmask[i - 1] & ~(1 << i);
-#endif /* XEN */
-
 #if defined(INTRSTACKSIZE)
 	vaddr_t istack;
 
@@ -1901,11 +1867,8 @@
 	struct cpu_info *ci;
 
 	KASSERT(mutex_owned(&cpu_lock));
-	KASSERT(mp_online);
+	KASSERT(x86_mp_online);
 
-#if defined(XEN) /* XXX: remove */
-	return;
-#endif
 	/* Direct interrupts away from shielded CPUs. */
 	for (CPU_INFO_FOREACH(cii, ci)) {
 		if ((ci->ci_schedstate.spc_flags & SPCF_NOINTR) == 0) {
@@ -1928,7 +1891,6 @@
 	return ci->ci_nintrhand;
 }
 
-#if !defined(XEN)
 static int
 intr_find_unused_slot(struct cpu_info *ci, int *index)
 {
@@ -1970,7 +1932,7 @@
 
 	kpreempt_disable();
 
-	KASSERT(ih->ih_cpu == curcpu() || !mp_online);
+	KASSERT(ih->ih_cpu == curcpu() || !x86_mp_online);
 
 	ci = ih->ih_cpu;
 	slot = ih->ih_slot;
@@ -1989,7 +1951,15 @@
 	}
 	source->is_resume = stubp->ist_resume;
 	source->is_recurse = stubp->ist_recurse;
-	idt_vec_set(idt_vec, stubp->ist_entry);
+	switch(source->is_pic->pic_type) {
+#if defined(XEN)
+	case PIC_XEN:
+		xen_channel_set(idt_vec, stubp->ist_entry, slot, ih->ih_level);
+		break;
+#endif
+	default: /* Everything else is assumed to be x86 */
+		idt_vec_set(idt_vec, stubp->ist_entry);
+	}
 
 	x86_write_psl(psl);
 
@@ -2011,7 +1981,7 @@
 
 	kpreempt_disable();
 
-	KASSERT(ih->ih_cpu == curcpu() || !mp_online);
+	KASSERT(ih->ih_cpu == curcpu() || !x86_mp_online);
 
 	ci = ih->ih_cpu;
 	slot = ih->ih_slot;
@@ -2098,9 +2068,16 @@
 
 	/* i8259_pic supports only primary cpu, see i8259.c. */
 	pic = isp->is_pic;
-	if (pic == &i8259_pic) {
-		DPRINTF(("i8259 pic does not support set_affinity\n"));
+	switch(pic->pic_type) {
+#if !defined(XEN) || defined(DOM0OPS)
+	case PIC_I8259:
+#endif
+	case PIC_XEN:
+		DPRINTF(("%s pic does not support set_affinity\n",
+			    pic->pic_name));
 		return ENOTSUP;
+	default:
+		break;
 	}
 
 	ih = isp->is_handlers;
@@ -2129,7 +2106,7 @@
 	kpreempt_disable();
 
 	/* deactivate old interrupt setting */
-	if (oldci == curcpu() || !mp_online) {
+	if (oldci == curcpu() || !x86_mp_online) {
 		intr_deactivate_xcall(ih, NULL);
 	} else {
 		uint64_t where;
@@ -2147,7 +2124,7 @@
 		lih->ih_cpu = newci;
 		lih->ih_slot = newslot;
 	}
-	if (newci == curcpu() || !mp_online) {
+	if (newci == curcpu() || !x86_mp_online) {
 		intr_activate_xcall(ih, NULL);
 	} else {
 		uint64_t where;
@@ -2269,8 +2246,6 @@
 	mutex_exit(&cpu_lock);
 }
 
-#endif /* XEN */
-
 /*
  * MI interface for subr_interrupt.c
  */
@@ -2291,8 +2266,6 @@
 	mutex_exit(&cpu_lock);
 }
 
-#if !defined(XEN)
-
 /*
  * MI interface for subr_interrupt.c
  */
@@ -2438,7 +2411,6 @@
 
 	return ii_handler;
 }
-#endif /* !XEN */
 
 /*
  * MI interface for subr_interrupt.c
diff -r 0d3b02e26e2b sys/arch/xen/include/evtchn.h
--- a/sys/arch/xen/include/evtchn.h	Fri Oct 26 05:35:00 2018 +0000
+++ b/sys/arch/xen/include/evtchn.h	Tue Nov 06 16:14:38 2018 +0000
@@ -31,7 +31,7 @@
 
 #define NR_PIRQS	256
 
-extern struct evtsource *evtsource[];
+extern struct evtsource *evtsource;
 
 void events_default_setup(void);
 void events_init(void);
@@ -41,9 +41,8 @@
 unsigned int evtchn_do_event(int, struct intrframe *);
 void call_evtchn_do_event(int, struct intrframe *);
 void call_xenevt_event(int);
-int event_set_handler(int, int (*func)(void *), void *, int, const char *,
-    const char *);
-int event_remove_handler(int, int (*func)(void *), void *);
+void xen_channel_free(evtchn_port_t);
+void xen_channel_set(evtchn_port_t, void *, int, int);
 
 struct cpu_info;
 struct intrhand;
diff -r 0d3b02e26e2b sys/arch/xen/include/hypervisor.h
--- a/sys/arch/xen/include/hypervisor.h	Fri Oct 26 05:35:00 2018 +0000
+++ b/sys/arch/xen/include/hypervisor.h	Tue Nov 06 16:14:38 2018 +0000
@@ -150,12 +150,10 @@
 	  XEN_MINOR(xen_version) >= (minor)))
 
 /* hypervisor_machdep.c */
-void hypervisor_send_event(struct cpu_info *, unsigned int);
 void hypervisor_unmask_event(unsigned int);
 void hypervisor_mask_event(unsigned int);
+void hypervisor_ack_event(unsigned int);
 void hypervisor_clear_event(unsigned int);
-void hypervisor_enable_ipl(unsigned int);
-void hypervisor_set_ipending(uint32_t, int, int);
 void hypervisor_machdep_attach(void);
 void hypervisor_machdep_resume(void);
 
diff -r 0d3b02e26e2b sys/arch/xen/include/intr.h
--- a/sys/arch/xen/include/intr.h	Fri Oct 26 05:35:00 2018 +0000
+++ b/sys/arch/xen/include/intr.h	Tue Nov 06 16:14:38 2018 +0000
@@ -52,17 +52,13 @@
  */
 
 struct evtsource {
-	int ev_maxlevel;		/* max. IPL for this source */
-	uint32_t ev_imask;		/* interrupt mask */
-	struct intrhand *ev_handlers;	/* handler chain */
-	struct evcnt ev_evcnt;		/* interrupt counter */
-	struct cpu_info *ev_cpu;        /* cpu on which this event is bound */
-	char ev_intrname[32];		/* interrupt string */
-	char ev_xname[64];		/* handler device list */
+	void (*ist_entry);		/* stub handler for this channel */
+	struct cpu_info *ci;		/* XXX: kcpuset_t for multicast support */
+	int slot;			/* Slot corresponding to entry */
+	int level;			/* IPL_XXX interrupt priority  */
 };
 
 extern struct intrstub xenev_stubs[];
-extern int irq2port[NR_EVENT_CHANNELS]; /* actually port + 1, so that 0 is invaid */
 
 #ifdef MULTIPROCESSOR
 int xen_intr_biglock_wrapper(void *);
@@ -71,6 +67,9 @@
 #if defined(DOM0OPS) || NPCI > 0
 int xen_vec_alloc(int);
 int xen_pic_to_gsi(struct pic *, int);
+unsigned int xen_vec_alloc_event(int);
+int xen_gsi_to_vec(int);
+int xen_vec_to_gsi(int);
 #endif /* defined(DOM0OPS) || NPCI > 0 */
 
 #ifdef MULTIPROCESSOR
diff -r 0d3b02e26e2b sys/arch/xen/include/pic.h
--- a/sys/arch/xen/include/pic.h	Fri Oct 26 05:35:00 2018 +0000
+++ b/sys/arch/xen/include/pic.h	Tue Nov 06 16:14:38 2018 +0000
@@ -5,4 +5,6 @@
 
 #include <x86/pic.h>
 
+extern struct pic xen_pic;
+
 #endif /* _XEN_PIC_H_ */
diff -r 0d3b02e26e2b sys/arch/xen/x86/hypervisor_machdep.c
--- a/sys/arch/xen/x86/hypervisor_machdep.c	Fri Oct 26 05:35:00 2018 +0000
+++ b/sys/arch/xen/x86/hypervisor_machdep.c	Tue Nov 06 16:14:38 2018 +0000
@@ -1,5 +1,7 @@
 /*	$NetBSD: hypervisor_machdep.c,v 1.29 2018/10/26 05:33:21 cherry Exp $	*/
 
+/* Copyright 2018, The NetBSD Foundation */
+
 /*
  *
  * Copyright (c) 2004 Christian Limpach.
@@ -62,6 +64,7 @@
 
 #include <uvm/uvm_extern.h>
 
+#include <machine/cpuvar.h>
 #include <machine/vmparam.h>
 #include <machine/pmap.h>
 
@@ -85,284 +88,257 @@
 static void build_p2m_frame_list_list(void);
 static void update_p2m_frame_list_list(void);
 
-// #define PORT_DEBUG 4
-// #define EARLY_DEBUG_EVENT
+static volatile unsigned long pending[MAXCPUS] = {0}; /* local L1 cache. */
+static volatile unsigned long evtchn_pending[sizeof(unsigned long) * 8]; /* L2 */
 
-/* callback function type */
-typedef void (*iterate_func_t)(unsigned int, unsigned int,
-			       unsigned int, void *);
+/* 
+ * The following two functions abuse the spl(9) mechanism to queue
+ * pending events. There's no real reason to do this other than
+ * expedience. We could use any generic queuing mechanism; however, note
+ * that any such mechanism will need to take a few things into account:
+ *
+ *  i) The details of handler registration management in x86/intr.c
+ * ii) Interrupt spl(9) priority management.
+ *iii) Interrupt Stack entry and exit layout.
+ *
+ * All of these are available for free by abusing the spl code.
+ *
+ * However, a cleaner way to do this would be to route the code
+ * properly via the appropriate slot entry points, so that we fully
+ * simulate the code path that executes on native.
+ */
 
-static inline void
-evt_iterate_bits(volatile unsigned long *pendingl1,
-		 volatile unsigned long *pendingl2, 
-		 volatile unsigned long *mask,
-		 iterate_func_t iterate_pending, void *iterate_args)
+static void
+xen_queue_event(struct cpu_info *ci, unsigned long port)
 {
+	int slot = MAX_INTR_SOURCES;
+	slot = evtsource[port].slot;
+
+	/* Set pending bit for spl code */
+	atomic_or_32(&ci->ci_ipending, (1U << slot));
+
+	/* Update interrupt count on this VCPU */
+	ci->ci_isources[slot]->is_evcnt.ev_count++;
+}
+
+
+/* This is essentially a despatch function for queued interrupts */
+static void
+xen_despatch_events(struct cpu_info *ci)
+{
+	int i, spl;
+
+	spl = splraise(IPL_HYPERVISOR);
 
-	KASSERT(pendingl1 != NULL);
-	KASSERT(pendingl2 != NULL);
-	
-	unsigned long l1, l2;
-	unsigned int l1i, l2i, port;
+	/*
+	 * This bit uses spl(9) magic to brute force calling every
+	 * This bit uses spl(9) magic to brute-force calling every
+	 * pending handler at every SPL level down to the interruptee.
+
+	for (i = IPL_HYPERVISOR; i >= spl; i--) {
+		spllower(i);
+	}
+}
 
-	l1 = xen_atomic_xchg(pendingl1, 0);
-	while ((l1i = xen_ffs(l1)) != 0) {
+/* 
+ * Scan the Xen interrupt bitmap, and set the corresponding pending
+ * bits on the NetBSD side, for a given VCPU.
+ * If the port found was routed to a remote vcpu, then stop
+ * immediately and return the remote port value so that the caller can
+ * arrange to notify the remote VCPU.
+ *
+ * Returns NR_EVENT_CHANNELS when all pending events on the given vCPU
+ * are queued for despatch.
+ */
+
+static unsigned int
+xen_scan_queue_pending(struct cpu_info *ci)
+{
+	unsigned int l1i, l2i, port = NR_EVENT_CHANNELS;
+
+	volatile unsigned long *mask;
+	volatile unsigned long *pendingl2;
+	volatile shared_info_t *s = HYPERVISOR_shared_info;
+
+	KASSERT(ci != NULL);
+
+	mask = s->evtchn_mask;
+	pendingl2 = evtchn_pending;
+
+	/* 
+	 * Note that this is per-cpu.
+	 */
+
+	while ((l1i = xen_ffs(pending[ci->ci_cpuid])) != 0) {
 		l1i--;
-		l1 &= ~(1UL << l1i);
-
-		l2 = pendingl2[l1i] & (mask != NULL ? ~mask[l1i] : -1UL);
-		l2 &= curcpu()->ci_evtmask[l1i];
+ 		pending[ci->ci_cpuid] &= ~(1UL << l1i);
 
-		if (mask != NULL) xen_atomic_setbits_l(&mask[l1i], l2);
-		xen_atomic_clearbits_l(&pendingl2[l1i], l2);
-
-		while ((l2i = xen_ffs(l2)) != 0) {
+		/*
+		 * Note that we do not queue events on VCPUs without
+		 * ci->ci_evtmask[] set.
+		 */
+		while ((l2i = xen_ffs(pendingl2[l1i])) != 0) {
 			l2i--;
-			l2 &= ~(1UL << l2i);
 
 			port = (l1i << LONG_SHIFT) + l2i;
 
-			iterate_pending(port, l1i, l2i, iterate_args);
+			/*
+			 * Only unmasked events on this VCPU are
+			 * cleared and scheduled.
+			 */
+			if ((1U << l2i) & ~mask[l1i] & ci->ci_evtmask[l1i]) {
+				/* Queue this port for despatch */
+				KASSERT(evtsource[port].ci == ci);
+				xen_queue_event(ci, port);
+				pendingl2[l1i] &= ~(1UL << l2i);
+			} else {
+				/* Return remote port number for processing */
+				return port;
+			}
 		}
 	}
+
+	return NR_EVENT_CHANNELS; /* No remotes */
 }
 
+// #define PORT_DEBUG 4
+// #define EARLY_DEBUG_EVENT
+
 /*
- * Set per-cpu "pending" information for outstanding events that
- * cannot be processed now.
+ * This function is responsible for saving and clearing the Xen
+ * pending events bitmap we are given. This constitutes both a pending
+ * worklist and an ack to the hypervisor. The pending bitmap is local
+ * to NetBSD, and we use this saved bitmap to later queue up work
+ * using xen_scan_queue_pending().
  */
-   
-static inline void
-evt_set_pending(unsigned int port, unsigned int l1i,
-		unsigned int l2i, void *args)
+
+static void
+xen_ack_cache_pending(struct cpu_info *ci)
 {
 
-	KASSERT(args != NULL);
+	unsigned int l1i;
+	// volatile unsigned long *mask;
+	volatile unsigned long *pendingl1, *pendingl2;
+	volatile shared_info_t *s = HYPERVISOR_shared_info;
+	
+	volatile struct vcpu_info *vci;
 
-	int *ret = args;
+	KASSERT(ci != NULL);
+	vci = ci->ci_vcpu;
+
+	//	mask = s->evtchn_mask;
+	pendingl1 = &vci->evtchn_pending_sel;
+	pendingl2 = evtchn_pending;
+	
+	/* 
+	 * Prevent re-entry due to currently pending events. After
+	 * the cache-and-clear, any re-entry is due to new events.
+	 */
 
-	if (evtsource[port]) {
-		hypervisor_set_ipending(evtsource[port]->ev_imask, l1i, l2i);
-		evtsource[port]->ev_evcnt.ev_count++;
-		if (*ret == 0 && curcpu()->ci_ilevel <
-		    evtsource[port]->ev_maxlevel)
-			*ret = 1;
+	/* Cache and ack all pending events for later despatch */
+	while(xen_atomic_test_and_clear_bit(&vci->evtchn_upcall_pending, 0)) {
+		/* Cache and clear pending */
+		xen_atomic_setbits_l(&pending[ci->ci_cpuid],
+		    xen_atomic_xchg(pendingl1, 0));
+
+		unsigned long lpending = pending[ci->ci_cpuid];
+		while ((l1i = xen_ffs(lpending)) != 0) {
+			l1i--;
+
+			lpending &= ~(1UL << l1i);
+
+			xen_atomic_setbits_l(&pendingl2[l1i],
+				xen_atomic_xchg(&s->evtchn_pending[l1i], 0));
+		}
 	}
-#ifdef DOM0OPS
-	else  {
-		/* set pending event */
-		xenevt_setipending(l1i, l2i);
-	}
-#endif
+
+	return;
 }
 
-int stipending(void);
-int
-stipending(void)
+void xen_do_event_2l(void); /* Only used in vector.S asm */
+
+/*
+ * This function is reentrant, because Xen gives us no mechanism to
+ * blanket mask the event callback.
+ */
+void
+xen_do_event_2l(void)
 {
-	volatile shared_info_t *s = HYPERVISOR_shared_info;
+
 	struct cpu_info *ci;
 	volatile struct vcpu_info *vci;
-	int ret;
-
-	ret = 0;
+	
 	ci = curcpu();
 	vci = ci->ci_vcpu;
 
-#if 0
-	if (HYPERVISOR_shared_info->events)
-		printf("stipending events %08lx mask %08lx ilevel %d\n",
-		    HYPERVISOR_shared_info->events,
-		    HYPERVISOR_shared_info->events_mask, ci->ci_ilevel);
-#endif
+	/* Despatch: Crank pending through the despatch function */
+	xen_despatch_events(ci);
 
-#ifdef EARLY_DEBUG_EVENT
-	if (xen_atomic_test_bit(&s->evtchn_pending[0], debug_port)) {
-		xen_debug_handler(NULL);
-		xen_atomic_clear_bit(&s->evtchn_pending[0], debug_port);
-	}
-#endif
-
-	/*
-	 * we're only called after STIC, so we know that we'll have to
-	 * STI at the end
-	 */
+	do {
+		/* Save cache pending state on Guest and ack them */
+		xen_ack_cache_pending(ci);
 
-	while (vci->evtchn_upcall_pending) {
-		cli();
-
-		vci->evtchn_upcall_pending = 0;
-
-		evt_iterate_bits(&vci->evtchn_pending_sel,
-		    s->evtchn_pending, s->evtchn_mask,
-		    evt_set_pending, &ret);
+		/*
+		 * Capture and queue pending events on this vcpu for
+		 * later despatch
+		 */
+		unsigned remoteport;
+		while ((remoteport = xen_scan_queue_pending(ci)) != NR_EVENT_CHANNELS) {
+			struct cpu_info *rci = evtsource[remoteport].ci;
+			if (rci == curcpu()) /* No more pending */
+				break;
+			/* 
+			 * Mark pending for remote invocation of 
+			 * xen_do_event_2l()->xen_scan_queue_pending()
+			 */
+			xen_atomic_set_bit(&pending[rci->ci_cpuid],
+				remoteport >> LONG_SHIFT);
+			if (__predict_false(
+				    xen_send_ipi(evtsource[remoteport].ci, XEN_IPI_HVCB))) {
+				panic("xen_send_ipi(cpu%d, "
+				      "XEN_IPI_HVCB) failed\n",
+				      (int) ci->ci_cpuid);
+			}
+		}
+		xen_despatch_events(ci);
 
-		sti();
-	}
+	} while(vci->evtchn_upcall_pending);
 
-#if 0
-	if (ci->ci_ipending & 0x1)
-		printf("stipending events %08lx mask %08lx ilevel %d ipending %08x\n",
-		    HYPERVISOR_shared_info->events,
-		    HYPERVISOR_shared_info->events_mask, ci->ci_ilevel,
-		    ci->ci_ipending);
-#endif
-
-	return (ret);
+	xen_despatch_events(ci);
 }
 
-/* Iterate through pending events and call the event handler */
-
-static inline void
-evt_do_hypervisor_callback(unsigned int port, unsigned int l1i,
-			   unsigned int l2i, void *args)
-{
-	KASSERT(args != NULL);
-
-	struct cpu_info *ci = curcpu();
-	struct intrframe *regs = args;
-
-#ifdef PORT_DEBUG
-	if (port == PORT_DEBUG)
-		printf("do_hypervisor_callback event %d\n", port);
-#endif
-	if (evtsource[port]) {
-		ci->ci_idepth++;
-		evtchn_do_event(port, regs);
-		ci->ci_idepth--;
-	}
-#ifdef DOM0OPS
-	else  {
-		if (ci->ci_ilevel < IPL_HIGH) {
-			/* fast path */
-			int oipl = ci->ci_ilevel;
-			ci->ci_ilevel = IPL_HIGH;
-			ci->ci_idepth++;			
-			xenevt_event(port);
-			ci->ci_idepth--;
-			ci->ci_ilevel = oipl;
-		} else {
-			/* set pending event */
-			xenevt_setipending(l1i, l2i);
-		}
-	}
-#endif
-}
-
-void
-do_hypervisor_callback(struct intrframe *regs)
-{
-	volatile shared_info_t *s = HYPERVISOR_shared_info;
-	struct cpu_info *ci;
-	volatile struct vcpu_info *vci;
-	int level __diagused;
-
-	ci = curcpu();
-	vci = ci->ci_vcpu;
-	level = ci->ci_ilevel;
-
-	// DDD printf("do_hypervisor_callback\n");
-
-#ifdef EARLY_DEBUG_EVENT
-	if (xen_atomic_test_bit(&s->evtchn_pending[0], debug_port)) {
-		xen_debug_handler(NULL);
-		xen_atomic_clear_bit(&s->evtchn_pending[0], debug_port);
-	}
-#endif
-
-	while (vci->evtchn_upcall_pending) {
-		vci->evtchn_upcall_pending = 0;
-
-		evt_iterate_bits(&vci->evtchn_pending_sel,
-		    s->evtchn_pending, s->evtchn_mask,
-		    evt_do_hypervisor_callback, regs);
-	}
-
-#ifdef DIAGNOSTIC
-	if (level != ci->ci_ilevel)
-		printf("hypervisor done %08x level %d/%d ipending %08x\n",
-		    (uint)vci->evtchn_pending_sel,
-		    level, ci->ci_ilevel, ci->ci_ipending);
-#endif
-}
-
-void
-hypervisor_send_event(struct cpu_info *ci, unsigned int ev)
-{
-	KASSERT(ci != NULL);
-
-	volatile shared_info_t *s = HYPERVISOR_shared_info;
-	volatile struct vcpu_info *vci = ci->ci_vcpu;
-
-#ifdef PORT_DEBUG
-	if (ev == PORT_DEBUG)
-		printf("hypervisor_send_event %d\n", ev);
-#endif
-
-	xen_atomic_set_bit(&s->evtchn_pending[0], ev);
-
-	if (__predict_false(ci == curcpu())) {
-		xen_atomic_set_bit(&vci->evtchn_pending_sel,
-		    ev >> LONG_SHIFT);
-		xen_atomic_set_bit(&vci->evtchn_upcall_pending, 0);
-	}
-
-	xen_atomic_clear_bit(&s->evtchn_mask[0], ev);
-
-	if (__predict_true(ci == curcpu())) {
-		hypervisor_force_callback();
-	} else {
-		if (__predict_false(xen_send_ipi(ci, XEN_IPI_HVCB))) {
-			panic("xen_send_ipi(cpu%d, XEN_IPI_HVCB) failed\n",
-			    (int) ci->ci_cpuid);
-		}
-	}
-}
-
+/*
+ * On a real PIC, when a pin is unmasked and there is a pending
+ * interrupt, it gets asserted on the target CPU immediately after
+ * EOI. The CPU can choose to ignore this assertion.
+ *
+ * Note: We don't care what VCPU we're on *now*. If the event is
+ * routed non-locally, an IPI is triggered by XEN.
+ */
 void
 hypervisor_unmask_event(unsigned int ev)
 {
-	volatile shared_info_t *s = HYPERVISOR_shared_info;
-	CPU_INFO_ITERATOR cii;
 	struct cpu_info *ci;
-	volatile struct vcpu_info *vci;
+
+
+	KASSERT(ev > 0 && ev < NR_EVENT_CHANNELS);
+	ci = x86_mp_online ? evtsource[ev].ci : &cpu_info_primary;
+
+	KASSERT(ci != NULL); /* Has the event been routed ? */
 
 #ifdef PORT_DEBUG
 	if (ev == PORT_DEBUG)
 		printf("hypervisor_unmask_event %d\n", ev);
 #endif
 
-	xen_atomic_clear_bit(&s->evtchn_mask[0], ev);
-	/*
-	 * The following is basically the equivalent of
-	 * 'hw_resend_irq'. Just like a real IO-APIC we 'lose the
-	 * interrupt edge' if the channel is masked.
-	 */
-	if (!xen_atomic_test_bit(&s->evtchn_pending[0], ev))
-		return;
+	/* Xen unmasks the evtchn_mask[0]:ev bit for us. */
+	evtchn_op_t op;
+	op.cmd = EVTCHNOP_unmask;
+	op.u.unmask.port = ev;
+	if (HYPERVISOR_event_channel_op(&op) != 0)
+		panic("Failed to unmask event %d\n", ev);
 
-	for (CPU_INFO_FOREACH(cii, ci)) {
-		if (!xen_atomic_test_bit(&ci->ci_evtmask[0], ev))
-			continue;
-		vci = ci->ci_vcpu;
-		if (__predict_true(ci == curcpu())) {
-			if (!xen_atomic_test_and_set_bit(&vci->evtchn_pending_sel,
-				ev>>LONG_SHIFT))
-				xen_atomic_set_bit(&vci->evtchn_upcall_pending, 0);
-		}
-		if (!vci->evtchn_upcall_mask) {
-			if (__predict_true(ci == curcpu())) {
-				hypervisor_force_callback();
-			} else {
-				if (__predict_false(
-				    xen_send_ipi(ci, XEN_IPI_HVCB))) {
-					panic("xen_send_ipi(cpu%d, "
-					    "XEN_IPI_HVCB) failed\n",
-					    (int) ci->ci_cpuid);
-				}
-			}
-		}
-	}
+	return;
 }
 
 void
@@ -373,7 +349,7 @@
 	if (ev == PORT_DEBUG)
 		printf("hypervisor_mask_event %d\n", ev);
 #endif
-
+	KASSERT(ev < NR_EVENT_CHANNELS);
 	xen_atomic_set_bit(&s->evtchn_mask[0], ev);
 }
 
@@ -385,70 +361,87 @@
 	if (ev == PORT_DEBUG)
 		printf("hypervisor_clear_event %d\n", ev);
 #endif
-
+	KASSERT(ev > 0 && ev < NR_EVENT_CHANNELS);
 	xen_atomic_clear_bit(&s->evtchn_pending[0], ev);
 }
 
-static inline void
-evt_enable_event(unsigned int port, unsigned int l1i,
-		 unsigned int l2i, void *args)
+/*
+ * Scan for any other pending events on given vcpu.
+ * Note that this will be a recursive entry into the
+ * callback handler on the given vcpu.
+ */
+
+static void hypervisor_ack_cpu(struct cpu_info *ci)
 {
-	KASSERT(args == NULL);
-	hypervisor_unmask_event(port);
-#if NPCI > 0 || NISA > 0
-	hypervisor_ack_pirq_event(port);
-#endif /* NPCI > 0 || NISA > 0 */
-}
+	KASSERT(ci != NULL);
+	volatile struct vcpu_info *vci;
+
+	vci = ci->ci_vcpu;
 
-void
-hypervisor_enable_ipl(unsigned int ipl)
-{
-	struct cpu_info *ci = curcpu();
+	/* Save/cache pending event state on the guest and ack them */
+	xen_ack_cache_pending(ci);
+	
+	if (vci->evtchn_upcall_mask)	/* is CLI() in effect ? */
+		/* Cannot recurse. Job done here. */
+		return;
 
-	/*
-	 * enable all events for ipl. As we only set an event in ipl_evt_mask
-	 * for its lowest IPL, and pending IPLs are processed high to low,
-	 * we know that all callback for this event have been processed.
-	 */
-
-	evt_iterate_bits(&ci->ci_isources[ipl]->ipl_evt_mask1,
-	    ci->ci_isources[ipl]->ipl_evt_mask2, NULL, 
-	    evt_enable_event, NULL);
-
+	/* Arrange to invoke pending, unmasked events */
+	if (__predict_true(ci == curcpu())) {
+		hypervisor_force_callback();
+	} else {
+		if (__predict_false(
+			    xen_send_ipi(ci, XEN_IPI_HVCB))) {
+			panic("xen_send_ipi(cpu%d, "
+			      "XEN_IPI_HVCB) failed\n",
+			      (int) ci->ci_cpuid);
+		}
+	}
 }
 
-void
-hypervisor_set_ipending(uint32_t iplmask, int l1, int l2)
+/*
+ * The semantics here are essentially those of an "EOI".
+ * On a real PIC, EOI indicates that the current pending interrupt
+ * has been serviced. This effectively means that it, or lower
+ * priority interrupts, may now be serviced. If the corresponding
+ * pins are unmasked, pending interrupts will signal the CPU. If the
+ * CPU accepts incoming interrupts (i.e., rflags:PSL_I is set), then
+ * it will invoke the corresponding handler.
+ *
+ * We simulate this scenario by resetting the pending bit and
+ * manually scanning for existing pending bits.
+ *
+ * If there are any pending and unmasked events on *ANY* VCPU, we
+ * simulate their firing by inducing the XEN callback on the
+ * corresponding VCPU.
+ */
+ 
+
+void hypervisor_ack_event(unsigned int ev)
 {
 
-	/* This function is not re-entrant */
-	KASSERT(x86_read_psl() != 0);
-
-	int ipl;
-	struct cpu_info *ci = curcpu();
-
-	/* set pending bit for the appropriate IPLs */	
-	ci->ci_ipending |= iplmask;
-
+	KASSERT(ev > 0 && ev < NR_EVENT_CHANNELS);
+	CPU_INFO_ITERATOR cii;
+ 	struct cpu_info *ci;
+	
+#ifdef PORT_DEBUG
+	if (ev == PORT_DEBUG)
+		printf("hypervisor_ack_event %d\n", ev);
+#endif
 	/*
-	 * And set event pending bit for the lowest IPL. As IPL are handled
-	 * from high to low, this ensure that all callbacks will have been
-	 * called when we ack the event
-	 */
-	ipl = ffs(iplmask);
-	KASSERT(ipl > 0);
-	ipl--;
-	KASSERT(ipl < NIPL);
-	KASSERT(ci->ci_isources[ipl] != NULL);
-	ci->ci_isources[ipl]->ipl_evt_mask1 |= 1UL << l1;
-	ci->ci_isources[ipl]->ipl_evt_mask2[l1] |= 1UL << l2;
-	if (__predict_false(ci != curcpu())) {
-		if (xen_send_ipi(ci, XEN_IPI_HVCB)) {
-			panic("hypervisor_set_ipending: "
-			    "xen_send_ipi(cpu%d, XEN_IPI_HVCB) failed\n",
-			    (int) ci->ci_cpuid);
-		}
+	 * Scan for any other pending events on any vcpu
+	 * Force the corresponding vcpu to enter its interrupt
+	 * callback. Note that this will be a recursive entry on the
+	 * current vcpu.
+ 	 */
+	for (CPU_INFO_FOREACH(cii, ci)) {
+		hypervisor_ack_cpu(ci);
 	}
+
+#if NPCI > 0 || NISA > 0
+	hypervisor_ack_pirq_event(ev);
+#endif /* NPCI > 0 || NISA > 0 */
+
+	return;
 }
 
 void
diff -r 0d3b02e26e2b sys/arch/xen/x86/pintr.c
--- a/sys/arch/xen/x86/pintr.c	Fri Oct 26 05:35:00 2018 +0000
+++ b/sys/arch/xen/x86/pintr.c	Tue Nov 06 16:14:38 2018 +0000
@@ -135,16 +135,17 @@
 /* XXX: todo - compat with lapic.c and XEN for x2apic */
 bool x2apic_mode __read_mostly = false;
 /* for x86/i8259.c */
-struct intrstub legacy_stubs[NUM_LEGACY_IRQS] = {{0,0}};
+struct intrstub legacy_stubs[NUM_LEGACY_IRQS] = {{0,0,0}};
+
 /* for x86/ioapic.c */
-struct intrstub ioapic_edge_stubs[MAX_INTR_SOURCES] = {{0,0}};
-struct intrstub ioapic_level_stubs[MAX_INTR_SOURCES] = {{0,0}};
-struct intrstub x2apic_edge_stubs[MAX_INTR_SOURCES] = {{0,0}};
-struct intrstub x2apic_level_stubs[MAX_INTR_SOURCES] = {{0,0}};
+struct intrstub ioapic_edge_stubs[MAX_INTR_SOURCES] = {{0,0,0}};
+struct intrstub ioapic_level_stubs[MAX_INTR_SOURCES] = {{0,0,0}};
+struct intrstub x2apic_edge_stubs[MAX_INTR_SOURCES] = {{0,0,0}};
+struct intrstub x2apic_level_stubs[MAX_INTR_SOURCES] = {{0,0,0}};
 #include <machine/i82093var.h>
-int irq2port[NR_EVENT_CHANNELS] = {0}; /* actually port + 1, so that 0 is invaid */
 static int irq2vect[256] = {0};
 static int vect2irq[256] = {0};
+static int irq2port[NR_EVENT_CHANNELS]; /* actually port + 1, so that 0 is invalid */
 #endif /* NIOAPIC */
 #if NACPICA > 0
 #include <machine/mpconfig.h>
@@ -196,6 +197,7 @@
 	int gsi;
 
 	KASSERT(pic != NULL);
+	KASSERT(pic->pic_type != PIC_XEN); /* GSI is a h/w thing */
 
 	/*
 	 * We assume that mpbios/mpacpi have done the right thing.
@@ -205,7 +207,7 @@
 
 	switch (pic->pic_type) {
 	case PIC_I8259:
-		KASSERT(gsi < 16);
+		KASSERT(gsi < NUM_LEGACY_IRQS);
 		break;
 	case PIC_IOAPIC:
 		break;
@@ -218,5 +220,45 @@
 	return gsi;
 }
 
+/* We expect that the gsi has been routed already */
+unsigned int
+xen_vec_alloc_event(int vec)
+{
+	unsigned int evtchn;
+	int gsi;
 
+	KASSERT(vec != 0 && vec < 256);
+
+	gsi = vect2irq[vec];
+	if (irq2port[gsi] == 0) {
+		evtchn = bind_pirq_to_evtch(gsi);
+		KASSERT(evtchn > 0);
+		KASSERT(evtchn < NR_EVENT_CHANNELS);
+		irq2port[gsi] = evtchn + 1;
+	} else {
+		/*
+		 * Shared interrupt - we can't rebind.
+		 * The port is shared instead.
+		 */
+		evtchn = irq2port[gsi] - 1;
+	}
+
+	return evtchn;
+}
+
+/* Wrappers */
+
+int
+xen_gsi_to_vec(int gsi)
+{
+	KASSERT(gsi > 0 && gsi < 255);
+	return irq2vect[gsi];
+}
+
+int
+xen_vec_to_gsi(int vec)
+{
+	KASSERT(vec > 0 && vec < 255);
+	return vect2irq[vec];
+}
 #endif /* defined(DOM0OPS) || NPCI > 0 */
diff -r 0d3b02e26e2b sys/arch/xen/x86/xen_intr.c
--- a/sys/arch/xen/x86/xen_intr.c	Fri Oct 26 05:35:00 2018 +0000
+++ b/sys/arch/xen/x86/xen_intr.c	Tue Nov 06 16:14:38 2018 +0000
@@ -37,51 +37,6 @@
 #include <machine/cpu.h>
 #include <machine/intr.h>
 
-/*
- * Add a mask to cpl, and return the old value of cpl.
- */
-int
-splraise(int nlevel)
-{
-	int olevel;
-	struct cpu_info *ci = curcpu();
-
-	olevel = ci->ci_ilevel;
-	if (nlevel > olevel)
-		ci->ci_ilevel = nlevel;
-	__insn_barrier();
-	return (olevel);
-}
-
-/*
- * Restore a value to cpl (unmasking interrupts).  If any unmasked
- * interrupts are pending, call Xspllower() to process them.
- */
-void
-spllower(int nlevel)
-{
-	struct cpu_info *ci = curcpu();
-	uint32_t imask;
-	u_long psl;
-
-	if (ci->ci_ilevel <= nlevel)
-		return;
-
-	__insn_barrier();
-
-	imask = IUNMASK(ci, nlevel);
-	psl = x86_read_psl();
-	x86_disable_intr();
-	if (ci->ci_ipending & imask) {
-		KASSERT(psl == 0);
-		Xspllower(nlevel);
-		/* Xspllower does enable_intr() */
-	} else {
-		ci->ci_ilevel = nlevel;
-		x86_write_psl(psl);
-	}
-}
-
 void
 x86_disable_intr(void)
 {
diff -r 0d3b02e26e2b sys/arch/xen/xen/clock.c
--- a/sys/arch/xen/xen/clock.c	Fri Oct 26 05:35:00 2018 +0000
+++ b/sys/arch/xen/xen/clock.c	Tue Nov 06 16:14:38 2018 +0000
@@ -162,7 +162,7 @@
 idle_block(void)
 {
 
-	KASSERT(curcpu()->ci_ipending == 0);
+  //KASSERT(curcpu()->ci_ipending == 0);
 	HYPERVISOR_block();
 }
 
diff -r 0d3b02e26e2b sys/arch/xen/xen/evtchn.c
--- a/sys/arch/xen/xen/evtchn.c	Fri Oct 26 05:35:00 2018 +0000
+++ b/sys/arch/xen/xen/evtchn.c	Tue Nov 06 16:14:38 2018 +0000
@@ -1,6 +1,9 @@
 /*	$NetBSD: evtchn.c,v 1.82 2018/10/26 05:33:21 cherry Exp $	*/
 
 /*
+ * Copyright (c) 2018 The NetBSD Foundation
+ * Conversion to pic interface by Cherry G. Mathew <cherry%NetBSD.org@localhost>
+ *
  * Copyright (c) 2006 Manuel Bouyer.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -87,10 +90,11 @@
 static kmutex_t evtchn_lock;
 
 /* event handlers */
-struct evtsource *evtsource[NR_EVENT_CHANNELS];
+static struct evtsource evtsources[NR_EVENT_CHANNELS];
+struct evtsource *evtsource = evtsources; /* Export */
 
 /* channel locks */
-static kmutex_t evtlock[NR_EVENT_CHANNELS];
+//XXX: static kmutex_t evtlock[NR_EVENT_CHANNELS];
 
 /* Reference counts for bindings to event channels XXX: redo for SMP */
 static uint8_t evtch_bindcount[NR_EVENT_CHANNELS];
@@ -139,8 +143,10 @@
 };
 	
 /*
+ * Design Notes:
+ *
  * We try to stick to the traditional x86 PIC semantics wrt Xen
- * events.
+ * events. The traditional native semantics are as follows:
  *
  * PIC pins exist in a global namespace which may be hierarchical, and
  * are mapped to a cpu bus concept called 'IRQ' numbers, which are
@@ -179,16 +185,15 @@
  * PIC_XEN ,  'pin' , 'irq' and 'idt_vec' are all identical to the
  * port number of the event.
  *
- * In the case of dom0 physical irq bound events, we currently
- * event binding by exporting evtchn.h functions. From the POV of
+ * In the case of dom0 physical irq bound events, we currently do
+ * explicit event binding in x86/intr.c. From the POV of
  * PIC_LAPIC/PIC_IOAPIC, the 'pin' is the hardware pin, the 'irq' is
  * the x86 global irq number  - the port number is extracted out of a
  * global array (this is currently kludgy and breaks API abstraction)
- * and the binding happens during pic_addroute() of the ioapic.
+ * and the binding happens after pic_addroute() of the ioapic.
  *
- * Later when we integrate more tightly with x86/intr.c, we will be
- * able to conform better to (PIC_LAPIC/PIC_IOAPIC)->PIC_XEN
- * cascading model.
+ * Since our current pic.h doesn't provide mechanisms for clean
+ * cascading, this is the best abstraction we can do now.
  */
 
 int debug_port = -1;
@@ -245,7 +250,10 @@
 
 	/* No event-channel are 'live' right now. */
 	for (i = 0; i < NR_EVENT_CHANNELS; i++) {
-		evtsource[i] = NULL;
+		evtsource[i].ist_entry = NULL;
+		evtsource[i].ci = NULL;
+		evtsource[i].slot = -1;
+		evtsource[i].level = IPL_NONE;
 		evtch_bindcount[i] = 0;
 		hypervisor_mask_event(i);
 	}
@@ -267,7 +275,8 @@
 	 * evtsource[] to a non-NULL value so that evtchn_do_event will
 	 * be called.
 	 */
-	evtsource[debug_port] = (void *)-1;
+
+	evtsource[debug_port].ist_entry = (void *)-1;	/* XXX: Review */
 	xen_atomic_set_bit(&curcpu()->ci_evtmask[0], debug_port);
 	hypervisor_unmask_event(debug_port);
 #if NPCI > 0 || NISA > 0
@@ -290,7 +299,7 @@
 
 	hypervisor_mask_event(evtch);
 	/* Remove the non-NULL value set in events_init() */
-	evtsource[evtch] = NULL;
+	evtsource[evtch].ist_entry = NULL;
 	aprint_verbose("VIRQ_DEBUG interrupt disabled, "
 	    "event channel %d removed\n", evtch);
 
@@ -305,137 +314,31 @@
 	return true;
 }
 
-
-unsigned int
-evtchn_do_event(int evtch, struct intrframe *regs)
-{
-	struct cpu_info *ci;
-	int ilevel;
-	struct intrhand *ih;
-	int	(*ih_fun)(void *, void *);
-	uint32_t iplmask;
-	int i;
-	uint32_t iplbit;
-
-	KASSERTMSG(evtch >= 0, "negative evtch: %d", evtch);
-	KASSERTMSG(evtch < NR_EVENT_CHANNELS,
-	    "evtch number %d > NR_EVENT_CHANNELS", evtch);
-
-#ifdef IRQ_DEBUG
-	if (evtch == IRQ_DEBUG)
-		printf("evtchn_do_event: evtch %d\n", evtch);
-#endif
-	ci = curcpu();
-
-	/*
-	 * Shortcut for the debug handler, we want it to always run,
-	 * regardless of the IPL level.
-	 */
-	if (__predict_false(evtch == debug_port)) {
-		xen_debug_handler(NULL);
-		hypervisor_unmask_event(debug_port);
-#if NPCI > 0 || NISA > 0
-		hypervisor_ack_pirq_event(debug_port);
-#endif /* NPCI > 0 || NISA > 0 */		
-		return 0;
-	}
-
-	KASSERTMSG(evtsource[evtch] != NULL, "unknown event %d", evtch);
-	ci->ci_data.cpu_nintr++;
-	evtsource[evtch]->ev_evcnt.ev_count++;
-	ilevel = ci->ci_ilevel;
-
-	if (evtsource[evtch]->ev_cpu != ci /* XXX: get stats */) {
-		hypervisor_send_event(evtsource[evtch]->ev_cpu, evtch);
-		return 0;
-	}
-
-	if (evtsource[evtch]->ev_maxlevel <= ilevel) {
-#ifdef IRQ_DEBUG
-		if (evtch == IRQ_DEBUG)
-		    printf("evtsource[%d]->ev_maxlevel %d <= ilevel %d\n",
-		    evtch, evtsource[evtch]->ev_maxlevel, ilevel);
-#endif
-		hypervisor_set_ipending(evtsource[evtch]->ev_imask,
-					evtch >> LONG_SHIFT,
-					evtch & LONG_MASK);
-
-		/* leave masked */
+#define PRIuCPUID	"lu" /* XXX: move this somewhere more appropriate */
 
-		return 0;
-	}
-	ci->ci_ilevel = evtsource[evtch]->ev_maxlevel;
-	iplmask = evtsource[evtch]->ev_imask;
-	sti();
-	mutex_spin_enter(&evtlock[evtch]);
-	ih = evtsource[evtch]->ev_handlers;
-	while (ih != NULL) {
-		if (ih->ih_cpu != ci) {
-			hypervisor_send_event(ih->ih_cpu, evtch);
-			iplmask &= ~IUNMASK(ci, ih->ih_level);
-			ih = ih->ih_evt_next;
-			continue;
-		}
-		if (ih->ih_level <= ilevel) {
-#ifdef IRQ_DEBUG
-		if (evtch == IRQ_DEBUG)
-		    printf("ih->ih_level %d <= ilevel %d\n", ih->ih_level, ilevel);
-#endif
-			cli();
-			hypervisor_set_ipending(iplmask,
-			    evtch >> LONG_SHIFT, evtch & LONG_MASK);
-			/* leave masked */
-			mutex_spin_exit(&evtlock[evtch]);
-			goto splx;
-		}
-		iplmask &= ~IUNMASK(ci, ih->ih_level);
-		ci->ci_ilevel = ih->ih_level;
-		ih_fun = (void *)ih->ih_fun;
-		ih_fun(ih->ih_arg, regs);
-		ih = ih->ih_evt_next;
-	}
-	mutex_spin_exit(&evtlock[evtch]);
-	cli();
-	hypervisor_unmask_event(evtch);
-#if NPCI > 0 || NISA > 0
-	hypervisor_ack_pirq_event(evtch);
-#endif /* NPCI > 0 || NISA > 0 */		
-
-splx:
-	/*
-	 * C version of spllower(). ASTs will be checked when
-	 * hypevisor_callback() exits, so no need to check here.
-	 */
-	iplmask = (IUNMASK(ci, ilevel) & ci->ci_ipending);
-	while (iplmask != 0) {
-		iplbit = 1 << (NIPL - 1);
-		i = (NIPL - 1);
-		while (iplmask != 0 && i > ilevel) {
-			while (iplmask & iplbit) {
-				ci->ci_ipending &= ~iplbit;
-				ci->ci_ilevel = i;
-				for (ih = ci->ci_isources[i]->is_handlers;
-				    ih != NULL; ih = ih->ih_next) {
-					KASSERT(ih->ih_cpu == ci);
-					sti();
-					ih_fun = (void *)ih->ih_fun;
-					ih_fun(ih->ih_arg, regs);
-					cli();
-				}
-				hypervisor_enable_ipl(i);
-				/* more pending IPLs may have been registered */
-				iplmask =
-				    (IUNMASK(ci, ilevel) & ci->ci_ipending);
-			}
-			i--;
-			iplbit >>= 1;
-		}
-	}
-	ci->ci_ilevel = ilevel;
-	return 0;
+void
+xen_channel_free(evtchn_port_t evtchn)
+{
+	KASSERT((evtchn > 0) && (evtchn < NR_EVENT_CHANNELS));
+	
+	mutex_spin_enter(&evtchn_lock);
+	evtsource[evtchn].ist_entry = NULL; /* XXX: confirm invalid */
+	evtsource[evtchn].slot = -1;
+	mutex_spin_exit(&evtchn_lock);
 }
 
-#define PRIuCPUID	"lu" /* XXX: move this somewhere more appropriate */
+void
+xen_channel_set(evtchn_port_t evtchn, void *stub, int slot, int level)
+{
+	KASSERT((evtchn > 0) && (evtchn < NR_EVENT_CHANNELS));
+	KASSERT(stub != NULL);
+	
+	mutex_spin_enter(&evtchn_lock);
+	evtsource[evtchn].ist_entry = stub; /* XXX: confirm valid */
+	evtsource[evtchn].slot = slot;
+	evtsource[evtchn].level = level;
+	mutex_spin_exit(&evtchn_lock);
+}
 
 /* PIC callbacks */
 /* pic "pin"s are conceptually mapped to event port numbers */
@@ -448,7 +351,6 @@
 	KASSERT(evtchn < NR_EVENT_CHANNELS);
 
 	hypervisor_mask_event(evtchn);
-	
 }
 
 static void
@@ -463,49 +365,75 @@
 	
 }
 
-
 static void
 xen_evtchn_addroute(struct pic *pic, struct cpu_info *ci, int pin, int idt_vec, int type)
 {
-
 	evtchn_port_t evtchn = pin;
 
 	/* Events are simulated as level triggered interrupts */
 	KASSERT(type == IST_LEVEL); 
 
 	KASSERT(evtchn < NR_EVENT_CHANNELS);
-#if notyet
-	evtchn_port_t boundport = idt_vec;
-#endif
 	
 	KASSERT(pic->pic_type == PIC_XEN);
+	KASSERT(idt_vec == pin); /* passthrough */
+	
+	/* 
+	 * This is a bit tricky. On MP, events are typically
+	 * bound to the VCPU that instantiated them. This makes things
+	 * slightly non-deterministic for us. Some of the Xen APIs
+	 * related to events are non-decomposable, and therefore can't
+	 * be cleanly mapped to the pic->pic_route() abstraction. This
+	 * means that by the time we get here, the routing decision
+	 * has already been made for us. Thus we can only sanity check
+	 * the routing, unless rebinding is an option (but see comment
+	 * about caveats below). Note that we don't have the ability
+	 * to notify callers that any routing has failed or is
+	 * inconsistent due to the lack of a return value mechanism in
+	 * the pic_route() API call.
+	 */
+
+	if (evtsource[evtchn].ci == NULL) {
+		evtsource[evtchn].ci = ci;
+	}
+	else {
+		KASSERT(evtsource[evtchn].ci == ci);
+	}
+
+#if 0 /* 
+       * Note: if we figure out rebinding, then we should override the
+       * above. The route() code should be ideally able to rebind to
+       * the appropriate VCPU. There are some caveats, so disabling
+       * this for now.
+       */
+	op.cmd = EVTCHNOP_bind_vcpu;
+	op.u.bind_vcpu.vcpu = (uint32_t) ci->ci_cpuid;
+	op.u.bind_vcpu.port = evtchn;
+	if (HYPERVISOR_event_channel_op(&op) != 0)
+	  panic("Failed to bind channel %u to VCPU %"PRIuCPUID"\n", evtchn, ci->ci_cpuid);
+	evtchn = op.u.bind_vcpu.port;
+#endif
 
 	xen_atomic_set_bit(&ci->ci_evtmask[0], evtchn);
-
+	printf("%s: evtmask set on CPU #%d for event %u\n", __func__, (int) ci->ci_cpuid, evtchn);
 }
 
 static void
 xen_evtchn_delroute(struct pic *pic, struct cpu_info *ci, int pin, int idt_vec, int type)
 {
-	/*
-	 * XXX: In the future, this is a great place to
-	 * 'unbind' events to underlying events and cpus.
-	 * For now, just disable interrupt servicing on this cpu for
-	 * this pin aka cpu.
-	 */
 	evtchn_port_t evtchn = pin;
 
 	/* Events are simulated as level triggered interrupts */
 	KASSERT(type == IST_LEVEL); 
 
 	KASSERT(evtchn < NR_EVENT_CHANNELS);
-#if notyet
-	evtchn_port_t boundport = idt_vec;
-#endif
 	
 	KASSERT(pic->pic_type == PIC_XEN);
-
+	KASSERT(idt_vec == pin); /* passthrough */
+	
+	KASSERT(evtsource[evtchn].ci == ci);
 	xen_atomic_clear_bit(&ci->ci_evtmask[0], evtchn);
+	evtsource[evtchn].ci = NULL;
 }
 
 /*
@@ -576,6 +504,8 @@
 	evtchn_op_t op;
 	int evtchn;
 
+	KASSERT(virq < NR_VIRQS);
+	
 	mutex_spin_enter(&evtchn_lock);
 
 	/*
@@ -741,269 +671,8 @@
 	return evtchn;
 }
 
-struct pintrhand *
-pirq_establish(int pirq, int evtch, int (*func)(void *), void *arg, int level,
-    const char *intrname, const char *xname)
-{
-	struct pintrhand *ih;
-
-	ih = kmem_zalloc(sizeof(struct pintrhand),
-	    cold ? KM_NOSLEEP : KM_SLEEP);
-	if (ih == NULL) {
-		printf("pirq_establish: can't allocate handler info\n");
-		return NULL;
-	}
-
-	KASSERT(evtch > 0);
-
-	ih->pirq = pirq;
-	ih->evtch = evtch;
-	ih->func = func;
-	ih->arg = arg;
-
-	if (event_set_handler(evtch, pirq_interrupt, ih, level, intrname,
-	    xname) != 0) {
-		kmem_free(ih, sizeof(struct pintrhand));
-		return NULL;
-	}
-
-	hypervisor_prime_pirq_event(pirq, evtch);
-	hypervisor_unmask_event(evtch);
-	hypervisor_ack_pirq_event(evtch);
-	return ih;
-}
-
-void
-pirq_disestablish(struct pintrhand *ih)
-{
-	int error = event_remove_handler(ih->evtch, pirq_interrupt, ih);
-	if (error) {
-		printf("pirq_disestablish(%p): %d\n", ih, error);
-		return;
-	}
-	kmem_free(ih, sizeof(struct pintrhand));
-}
-
-int
-pirq_interrupt(void *arg)
-{
-	struct pintrhand *ih = arg;
-	int ret;
-
-	ret = ih->func(ih->arg);
-#ifdef IRQ_DEBUG
-	if (ih->evtch == IRQ_DEBUG)
-	    printf("pirq_interrupt irq %d ret %d\n", ih->pirq, ret);
-#endif
-	return ret;
-}
-
 #endif /* NPCI > 0 || NISA > 0 */
 
-
-/*
- * Recalculate the interrupt from scratch for an event source.
- */
-static void
-intr_calculatemasks(struct evtsource *evts, int evtch, struct cpu_info *ci)
-{
-	struct intrhand *ih;
-	int cpu_receive = 0;
-
-#ifdef MULTIPROCESSOR
-	KASSERT(!mutex_owned(&evtlock[evtch]));
-#endif
-	mutex_spin_enter(&evtlock[evtch]);
-	evts->ev_maxlevel = IPL_NONE;
-	evts->ev_imask = 0;
-	for (ih = evts->ev_handlers; ih != NULL; ih = ih->ih_evt_next) {
-		if (ih->ih_level > evts->ev_maxlevel)
-			evts->ev_maxlevel = ih->ih_level;
-		evts->ev_imask |= (1 << ih->ih_level);
-		if (ih->ih_cpu == ci)
-			cpu_receive = 1;
-	}
-	if (cpu_receive)
-		xen_atomic_set_bit(&curcpu()->ci_evtmask[0], evtch);
-	else
-		xen_atomic_clear_bit(&curcpu()->ci_evtmask[0], evtch);
-	mutex_spin_exit(&evtlock[evtch]);
-}
-
-int
-event_set_handler(int evtch, int (*func)(void *), void *arg, int level,
-    const char *intrname, const char *xname)
-{
-	struct cpu_info *ci = curcpu(); /* XXX: pass in ci ? */
-	struct evtsource *evts;
-	struct intrhand *ih, **ihp;
-	int s;
-#ifdef MULTIPROCESSOR
-	bool mpsafe = (level != IPL_VM);
-#endif /* MULTIPROCESSOR */
-
-#ifdef IRQ_DEBUG
-	printf("event_set_handler IRQ %d handler %p\n", evtch, func);
-#endif
-
-	KASSERTMSG(evtch >= 0, "negative evtch: %d", evtch);
-	KASSERTMSG(evtch < NR_EVENT_CHANNELS,
-	    "evtch number %d > NR_EVENT_CHANNELS", evtch);
-	KASSERT(intrname != NULL && xname != NULL);
-
-#if 0
-	printf("event_set_handler evtch %d handler %p level %d\n", evtch,
-	       handler, level);
-#endif
-	ih = kmem_zalloc(sizeof (struct intrhand), KM_NOSLEEP);
-	if (ih == NULL)
-		panic("can't allocate fixed interrupt source");
-
-
-	ih->ih_level = level;
-	ih->ih_fun = ih->ih_realfun = func;
-	ih->ih_arg = ih->ih_realarg = arg;
-	ih->ih_evt_next = NULL;
-	ih->ih_next = NULL;
-	ih->ih_cpu = ci;
-#ifdef MULTIPROCESSOR
-	if (!mpsafe) {
-		ih->ih_fun = xen_intr_biglock_wrapper;
-		ih->ih_arg = ih;
-	}
-#endif /* MULTIPROCESSOR */
-
-	s = splhigh();
-
-	/* register per-cpu handler for spllower() */
-	event_set_iplhandler(ci, ih, level);
-
-	/* register handler for event channel */
-	if (evtsource[evtch] == NULL) {
-		evts = kmem_zalloc(sizeof (struct evtsource),
-		    KM_NOSLEEP);
-		if (evts == NULL)
-			panic("can't allocate fixed interrupt source");
-
-		evts->ev_handlers = ih;
-		/*
-		 * XXX: We're assuming here that ci is the same cpu as
-		 * the one on which this event/port is bound on. The
-		 * api needs to be reshuffled so that this assumption
-		 * is more explicitly implemented.
-		 */
-		evts->ev_cpu = ci;
-		mutex_init(&evtlock[evtch], MUTEX_DEFAULT, IPL_HIGH);
-		evtsource[evtch] = evts;
-		strlcpy(evts->ev_intrname, intrname, sizeof(evts->ev_intrname));
-
-		evcnt_attach_dynamic(&evts->ev_evcnt, EVCNT_TYPE_INTR, NULL,
-		    device_xname(ci->ci_dev), evts->ev_intrname);
-	} else {
-		evts = evtsource[evtch];
-		/* sort by IPL order, higher first */
-		mutex_spin_enter(&evtlock[evtch]);
-		for (ihp = &evts->ev_handlers; ; ihp = &((*ihp)->ih_evt_next)) {
-			if ((*ihp)->ih_level < ih->ih_level) {
-				/* insert before *ihp */
-				ih->ih_evt_next = *ihp;
-				*ihp = ih;
-				break;
-			}
-			if ((*ihp)->ih_evt_next == NULL) {
-				(*ihp)->ih_evt_next = ih;
-				break;
-			}
-		}
-		mutex_spin_exit(&evtlock[evtch]);
-	}
-
-
-	// append device name
-	if (evts->ev_xname[0] != '\0')
-		strlcat(evts->ev_xname, ", ", sizeof(evts->ev_xname));
-	strlcat(evts->ev_xname, xname, sizeof(evts->ev_xname));
-
-	intr_calculatemasks(evts, evtch, ci);
-	splx(s);
-
-	return 0;
-}
-
-void
-event_set_iplhandler(struct cpu_info *ci,
-		     struct intrhand *ih,
-		     int level)
-{
-	struct intrsource *ipls;
-
-	KASSERT(ci == ih->ih_cpu);
-	if (ci->ci_isources[level] == NULL) {
-		ipls = kmem_zalloc(sizeof (struct intrsource),
-		    KM_NOSLEEP);
-		if (ipls == NULL)
-			panic("can't allocate fixed interrupt source");
-		ipls->is_recurse = xenev_stubs[level].ist_recurse;
-		ipls->is_resume = xenev_stubs[level].ist_resume;
-		ipls->is_handlers = ih;
-		ci->ci_isources[level] = ipls;
-	} else {
-		ipls = ci->ci_isources[level];
-		ih->ih_next = ipls->is_handlers;
-		ipls->is_handlers = ih;
-	}
-}
-
-int
-event_remove_handler(int evtch, int (*func)(void *), void *arg)
-{
-	struct intrsource *ipls;
-	struct evtsource *evts;
-	struct intrhand *ih;
-	struct intrhand **ihp;
-	struct cpu_info *ci;
-
-	evts = evtsource[evtch];
-	if (evts == NULL)
-		return ENOENT;
-
-	mutex_spin_enter(&evtlock[evtch]);
-	for (ihp = &evts->ev_handlers, ih = evts->ev_handlers;
-	    ih != NULL;
-	    ihp = &ih->ih_evt_next, ih = ih->ih_evt_next) {
-		if (ih->ih_realfun == func && ih->ih_realarg == arg)
-			break;
-	}
-	if (ih == NULL) {
-		mutex_spin_exit(&evtlock[evtch]);
-		return ENOENT;
-	}
-	ci = ih->ih_cpu;
-	*ihp = ih->ih_evt_next;
-
-	ipls = ci->ci_isources[ih->ih_level];
-	for (ihp = &ipls->is_handlers, ih = ipls->is_handlers;
-	    ih != NULL;
-	    ihp = &ih->ih_next, ih = ih->ih_next) {
-		if (ih->ih_realfun == func && ih->ih_realarg == arg)
-			break;
-	}
-	if (ih == NULL)
-		panic("event_remove_handler");
-	*ihp = ih->ih_next;
-	mutex_spin_exit(&evtlock[evtch]);
-	kmem_free(ih, sizeof (struct intrhand));
-	if (evts->ev_handlers == NULL) {
-		xen_atomic_clear_bit(&ci->ci_evtmask[0], evtch);
-		evcnt_detach(&evts->ev_evcnt);
-		kmem_free(evts, sizeof (struct evtsource));
-		evtsource[evtch] = NULL;
-	} else {
-		intr_calculatemasks(evts, evtch, ci);
-	}
-	return 0;
-}
-
 #if NPCI > 0 || NISA > 0
 void
 hypervisor_prime_pirq_event(int pirq, unsigned int evtch)
@@ -1079,130 +748,3 @@
 	return 0;
 }
 
-static struct evtsource *
-event_get_handler(const char *intrid)
-{
-	for (int i = 0; i < NR_EVENT_CHANNELS; i++) {
-		if (evtsource[i] == NULL || i == debug_port)
-			continue;
-
-		struct evtsource *evp = evtsource[i];
-
-		if (strcmp(evp->ev_intrname, intrid) == 0)
-			return evp;
-	}
-
-	return NULL;
-}
-
-/*
- * MI interface for subr_interrupt.c
- */
-uint64_t
-interrupt_get_count(const char *intrid, u_int cpu_idx)
-{
-	int count = 0;
-	struct evtsource *evp;
-
-	mutex_spin_enter(&evtchn_lock);
-
-	evp = event_get_handler(intrid);
-	if (evp != NULL && cpu_idx == cpu_index(evp->ev_cpu))
-		count = evp->ev_evcnt.ev_count;
-
-	mutex_spin_exit(&evtchn_lock);
-
-	return count;
-}
-
-/*
- * MI interface for subr_interrupt.c
- */
-void
-interrupt_get_assigned(const char *intrid, kcpuset_t *cpuset)
-{
-	struct evtsource *evp;
-
-	kcpuset_zero(cpuset);
-
-	mutex_spin_enter(&evtchn_lock);
-
-	evp = event_get_handler(intrid);
-	if (evp != NULL)
-		kcpuset_set(cpuset, cpu_index(evp->ev_cpu));
-
-	mutex_spin_exit(&evtchn_lock);
-}
-
-/*
- * MI interface for subr_interrupt.c
- */
-void
-interrupt_get_devname(const char *intrid, char *buf, size_t len)
-{
-	struct evtsource *evp;
-
-	mutex_spin_enter(&evtchn_lock);
-
-	evp = event_get_handler(intrid);
-	strlcpy(buf, evp ? evp->ev_xname : "unknown", len);
-
-	mutex_spin_exit(&evtchn_lock);
-}
-
-/*
- * MI interface for subr_interrupt.
- */
-struct intrids_handler *
-interrupt_construct_intrids(const kcpuset_t *cpuset)
-{
-	struct intrids_handler *ii_handler;
-	intrid_t *ids;
-	int i, count, off;
-	struct evtsource *evp;
-
-	if (kcpuset_iszero(cpuset))
-		return 0;
-
-	/*
-	 * Count the number of interrupts which affinity to any cpu of "cpuset".
-	 */
-	count = 0;
-	for (i = 0; i < NR_EVENT_CHANNELS; i++) {
-		evp = evtsource[i];
-
-		if (evp == NULL || i == debug_port)
-			continue;
-
-		if (!kcpuset_isset(cpuset, cpu_index(evp->ev_cpu)))
-			continue;
-
-		count++;
-	}
-
-	ii_handler = kmem_zalloc(sizeof(int) + sizeof(intrid_t) * count,
-	    KM_SLEEP);
-	if (ii_handler == NULL)
-		return NULL;
-	ii_handler->iih_nids = count;
-	if (count == 0)
-		return ii_handler;
-
-	ids = ii_handler->iih_intrids;
-	mutex_spin_enter(&evtchn_lock);
-	for (i = 0, off = 0; i < NR_EVENT_CHANNELS && off < count; i++) {
-		evp = evtsource[i];
-
-		if (evp == NULL || i == debug_port)
-			continue;
-
-		if (!kcpuset_isset(cpuset, cpu_index(evp->ev_cpu)))
-			continue;
-
-		snprintf(ids[off], sizeof(intrid_t), "%s", evp->ev_intrname);
-		off++;
-	}
-	mutex_spin_exit(&evtchn_lock);
-
-	return ii_handler;
-}

