tech-kern archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

Re: Time accounting on statclock()



"Cherry G. Mathew" <cherry%NetBSD.org@localhost> writes:

> Hello,
>
> So recently I've been looking at the interrupt subsystem for NetBSD/Xen,
> which is a little unusual in that several interrupts can batch
> together before a virtual cpu that is scheduled can be woken up to
> handle them. 
>
> When this happens, a single callback handler is invoked, which
> demultiplexes all the pending interrupts, and passes on to every one of
> them the same trapframe that was passed to the callback. This is
> particularly interesting for the clock handler, because what would have
> been multiple ticks during any dormant period for the guest OS would be
> only called once, and so the handler invokes hardclock() a number of
> times to make up for lost time. Once again, it accounts for it using the
> exact same trapframe that it got from the demultiplexing callback
> handler - which if it were from userland, would account every tick to
> that particular userland process (which was presumably dormant for the
> duration lost).
>
> This got me thinking about deferred interrupts using spl(9), and how
> time is accounted for those. (I will confess that I have an insidious
> interest, but I won't divulge what that is right now).
>
> For example: if a clock interrupt from userland got deferred as pending, even
> if it came in from userland (is this possible?), because the current
> spl level was at, say, SPL_HIGH, it now seems to be the case that the
> system accounts for the delayed execution by charging the *entire* time
> (from the last hardclock(), inclusive of whatever was executing at
> SPL_HIGH) to the system and not the userland process, thus charging the
> time interval between when the last hardclock() was called and when it
> was actually serviced to the system instead of the user process that
> was originally interrupted.
>
> To emphasise my point, I've added a patch below that I think should
> reflect the accounting correctly.
>
> I'm not sure I've understood this correctly, so I'd appreciate it if
> someone who has a good understanding of this would be able to comment.
>
> Many Thanks,
>
> Cherry
>
>
>
> --- kern_clock.c.~1.138.~       2018-09-21 11:28:02.792675611 +0000
> +++ kern_clock.c        2018-10-16 12:06:38.753987323 +0000
> @@ -352,6 +352,10 @@
>         }
>
>         if (CLKF_USERMODE(frame)) {
> +               if (p == NULL) { /* Account deferred clock ticks to
> current user. */
> +                       p = l->l_proc;
> +                       mutex_spin_enter(&p->p_stmutex);
> +               }
>                 KASSERT(p != NULL);
>                 if ((p->p_stflag & PST_PROFIL) && profsrc ==
>                 PROFSRC_CLOCK)
>                         addupc_intr(l, CLKF_PC(frame));


Here's a full patch implementing what I have in mind for native and
port-xen on arch/amd64


-- 
~~cherry

diff -r 5a1052452d40 sys/arch/amd64/amd64/amd64_trap.S
--- a/sys/arch/amd64/amd64/amd64_trap.S	Tue Oct 16 09:33:04 2018 +0000
+++ b/sys/arch/amd64/amd64/amd64_trap.S	Tue Oct 16 14:35:48 2018 +0000
@@ -628,6 +628,7 @@
  */
 ENTRY(alltraps)
 	INTRENTRY
+	movq	%rsp,CPUVAR(TF)
 .Lalltraps_noentry:
 	STI(si)
 
diff -r 5a1052452d40 sys/arch/amd64/amd64/genassym.cf
--- a/sys/arch/amd64/amd64/genassym.cf	Tue Oct 16 09:33:04 2018 +0000
+++ b/sys/arch/amd64/amd64/genassym.cf	Tue Oct 16 14:35:48 2018 +0000
@@ -259,6 +259,7 @@
 define  CPU_INFO_CPUID		offsetof(struct cpu_info, ci_cpuid)
 define	CPU_INFO_ISTATE		offsetof(struct cpu_info, ci_istate)
 define	CPU_INFO_CC_SKEW	offsetof(struct cpu_info, ci_data.cpu_cc_skew)
+define  CPU_INFO_TF		offsetof(struct cpu_info, ci_tf)
 
 define	ACPI_SUSPEND_GDT	offsetof(struct cpu_info, ci_suspend_gdt)
 define	ACPI_SUSPEND_IDT	offsetof(struct cpu_info, ci_suspend_idt)
diff -r 5a1052452d40 sys/arch/amd64/amd64/vector.S
--- a/sys/arch/amd64/amd64/vector.S	Tue Oct 16 09:33:04 2018 +0000
+++ b/sys/arch/amd64/amd64/vector.S	Tue Oct 16 14:35:48 2018 +0000
@@ -238,7 +238,6 @@
 	movl	$IPL_CLOCK,CPUVAR(ILEVEL)
 	sti
 	pushq	%rbx
-	movq	%rsp,%rsi
 	xorq	%rdi,%rdi
 	call	_C_LABEL(lapic_clockintr)
 	jmp	_C_LABEL(Xdoreti)
@@ -252,12 +251,14 @@
 	pushq	$0
 	pushq	$T_ASTFLT
 	INTRENTRY
+	movq	%rsp,CPUVAR(TF)
 	jmp	_C_LABEL(Xhandle_x2apic_ltimer)
 IDTVEC_END(intr_x2apic_ltimer)
 IDTVEC(intr_lapic_ltimer)
 	pushq	$0
 	pushq	$T_ASTFLT
 	INTRENTRY
+	movq	%rsp,CPUVAR(TF)
 	jmp	_C_LABEL(Xhandle_lapic_ltimer)
 IDTVEC_END(intr_lapic_ltimer)
 	TEXT_USER_END
@@ -346,7 +347,6 @@
 	movl	IH_LEVEL(%rbx),%r12d					;\
 	cmpl	%r13d,%r12d						;\
 	jle	7f							;\
-	movq	%rsp,%rsi						;\
 	movq	IH_ARG(%rbx),%rdi					;\
 	movl	%r12d,CPUVAR(ILEVEL)					;\
 	call	*IH_FUN(%rbx)		/* call it */			;\
@@ -382,6 +382,7 @@
 	pushq	$0			/* dummy error code */		;\
 	pushq	$T_ASTFLT		/* trap # for doing ASTs */	;\
 	INTRENTRY							;\
+	movq	%rsp,CPUVAR(TF)						;\
 	jmp	_C_LABEL(Xhandle_ ## name ## num)			;\
 IDTVEC_END(intr_ ## name ## num)					;\
 	TEXT_USER_END
@@ -665,7 +666,6 @@
 	movq	IS_HANDLERS(%r14),%rbx					;\
 6:									\
 	movq	IH_ARG(%rbx),%rdi					;\
-	movq	%rsp,%rsi						;\
 	call	*IH_FUN(%rbx)		/* call it */			;\
 	movq	IH_NEXT(%rbx),%rbx	/* next handler in chain */	;\
 	testq	%rbx,%rbx						;\
@@ -765,6 +765,8 @@
 	/* sti?? */
 	movq	%rsp,%rdi
 	subq	$8,%rdi;	/* don't forget if_ppl */
+	movq	%rdi,CPUVAR(TF)
+	xorq	%rdi, %rdi	/* We don't want to pass on as arg */
 	call	do_hypervisor_callback
 	testb	$SEL_RPL,TF_CS(%rsp)
 	jnz	doreti_checkast
@@ -782,6 +784,8 @@
 	INTRENTRY
 	movq	%rsp,%rdi
 	subq	$8,%rdi;	/* don't forget if_ppl */
+	movq	%rdi,CPUVAR(TF)
+	xorq	%rdi, %rdi	/* We don't want to pass on as arg */
 	call	xen_failsafe_handler
 	INTRFASTEXIT
 /*	jmp	HYPERVISOR_iret */
diff -r 5a1052452d40 sys/arch/i386/i386/genassym.cf
--- a/sys/arch/i386/i386/genassym.cf	Tue Oct 16 09:33:04 2018 +0000
+++ b/sys/arch/i386/i386/genassym.cf	Tue Oct 16 14:35:48 2018 +0000
@@ -262,6 +262,7 @@
 define	CPU_INFO_NINTR		offsetof(struct cpu_info, ci_data.cpu_nintr)
 define	CPU_INFO_CURPRIORITY	offsetof(struct cpu_info, ci_schedstate.spc_curpriority)
 define	CPU_INFO_CC_SKEW	offsetof(struct cpu_info, ci_data.cpu_cc_skew)
+define  CPU_INFO_TF		offsetof(struct cpu_info, ci_tf)
 
 
 define	CPU_INFO_VENDOR		offsetof(struct cpu_info, ci_vendor[0])
diff -r 5a1052452d40 sys/arch/i386/i386/vector.S
--- a/sys/arch/i386/i386/vector.S	Tue Oct 16 09:33:04 2018 +0000
+++ b/sys/arch/i386/i386/vector.S	Tue Oct 16 14:35:48 2018 +0000
@@ -1027,9 +1027,9 @@
 	cmpl	$ecrit,%eax
 	jb	critical_region_fixup
 11:	pushl	CPUVAR(ILEVEL)
-	push	%esp
+	movq	%esp, CPUVAR(TF)
 	call	do_hypervisor_callback
-	add	$8,%esp
+	add	$4,%esp
 	xorl	%eax,%eax
 	movb	TF_CS(%esp),%cl
 	test	$CHK_UPL,%cl		/* slow return to ring 2 or 3 */
diff -r 5a1052452d40 sys/arch/x86/include/cpu.h
--- a/sys/arch/x86/include/cpu.h	Tue Oct 16 09:33:04 2018 +0000
+++ b/sys/arch/x86/include/cpu.h	Tue Oct 16 14:35:48 2018 +0000
@@ -167,7 +167,7 @@
 	
 	const struct cpu_functions *ci_func;  /* start/stop functions */
 	struct trapframe *ci_ddb_regs;
-
+	struct trapframe *ci_tf; /* Saved for clock handlers */
 	u_int ci_cflush_lsize;	/* CLFLUSH insn line size */
 	struct x86_cache_info ci_cinfo[CAI_COUNT];
 
diff -r 5a1052452d40 sys/arch/x86/isa/clock.c
--- a/sys/arch/x86/isa/clock.c	Tue Oct 16 09:33:04 2018 +0000
+++ b/sys/arch/x86/isa/clock.c	Tue Oct 16 14:35:48 2018 +0000
@@ -188,7 +188,7 @@
 void		sysbeep(int, int);
 static void     tickle_tc(void);
 
-static int	clockintr(void *, struct intrframe *);
+static int	clockintr(void *);
 
 int 		sysbeepdetach(device_t, int);
 
@@ -371,11 +371,11 @@
 }
 
 static int
-clockintr(void *arg, struct intrframe *frame)
+clockintr(void *arg)
 {
 	tickle_tc();
 
-	hardclock((struct clockframe *)frame);
+	hardclock((struct clockframe *)curcpu()->ci_tf);
 
 #if NMCA > 0
 	if (MCA_system) {
diff -r 5a1052452d40 sys/arch/x86/x86/lapic.c
--- a/sys/arch/x86/x86/lapic.c	Tue Oct 16 09:33:04 2018 +0000
+++ b/sys/arch/x86/x86/lapic.c	Tue Oct 16 14:35:48 2018 +0000
@@ -90,7 +90,7 @@
 #include <x86/x86/vmtvar.h>	/* for vmt_hvcall() */
 
 /* Referenced from vector.S */
-void		lapic_clockintr(void *, struct intrframe *);
+void		lapic_clockintr(void *);
 
 static void	lapic_delay(unsigned int);
 static uint32_t lapic_gettick(void);
@@ -561,13 +561,13 @@
 extern u_int i8254_get_timecount(struct timecounter *);
 
 void
-lapic_clockintr(void *arg, struct intrframe *frame)
+lapic_clockintr(void *arg)
 {
 	struct cpu_info *ci = curcpu();
 
 	ci->ci_lapic_counter += lapic_tval;
 	ci->ci_isources[LIR_TIMER]->is_evcnt.ev_count++;
-	hardclock((struct clockframe *)frame);
+	hardclock((struct clockframe *)ci->ci_tf);
 }
 
 void
diff -r 5a1052452d40 sys/arch/xen/include/evtchn.h
--- a/sys/arch/xen/include/evtchn.h	Tue Oct 16 09:33:04 2018 +0000
+++ b/sys/arch/xen/include/evtchn.h	Tue Oct 16 14:35:48 2018 +0000
@@ -38,9 +38,7 @@
 bool events_suspend(void);
 bool events_resume(void);
 
-unsigned int evtchn_do_event(int, struct intrframe *);
-void call_evtchn_do_event(int, struct intrframe *);
-void call_xenevt_event(int);
+unsigned int evtchn_do_event(int);
 int event_set_handler(int, int (*func)(void *), void *, int, const char *,
     const char *);
 int event_remove_handler(int, int (*func)(void *), void *);
diff -r 5a1052452d40 sys/arch/xen/include/hypervisor.h
--- a/sys/arch/xen/include/hypervisor.h	Tue Oct 16 09:33:04 2018 +0000
+++ b/sys/arch/xen/include/hypervisor.h	Tue Oct 16 14:35:48 2018 +0000
@@ -129,7 +129,7 @@
 /* hypervisor.c */
 struct intrframe;
 struct cpu_info;
-void do_hypervisor_callback(struct intrframe *regs);
+void do_hypervisor_callback(void);
 void hypervisor_prime_pirq_event(int, unsigned int);
 void hypervisor_enable_event(unsigned int);
 
diff -r 5a1052452d40 sys/arch/xen/x86/hypervisor_machdep.c
--- a/sys/arch/xen/x86/hypervisor_machdep.c	Tue Oct 16 09:33:04 2018 +0000
+++ b/sys/arch/xen/x86/hypervisor_machdep.c	Tue Oct 16 14:35:48 2018 +0000
@@ -215,10 +215,9 @@
 evt_do_hypervisor_callback(unsigned int port, unsigned int l1i,
 			   unsigned int l2i, void *args)
 {
-	KASSERT(args != NULL);
+	KASSERT(args == NULL);
 
 	struct cpu_info *ci = curcpu();
-	struct intrframe *regs = args;
 
 #ifdef PORT_DEBUG
 	if (port == PORT_DEBUG)
@@ -226,7 +225,7 @@
 #endif
 	if (evtsource[port]) {
 		ci->ci_idepth++;
-		evtchn_do_event(port, regs);
+		evtchn_do_event(port);
 		ci->ci_idepth--;
 	}
 #ifdef DOM0OPS
@@ -248,7 +247,7 @@
 }
 
 void
-do_hypervisor_callback(struct intrframe *regs)
+do_hypervisor_callback(void)
 {
 	volatile shared_info_t *s = HYPERVISOR_shared_info;
 	struct cpu_info *ci;
@@ -273,7 +272,7 @@
 
 		evt_iterate_bits(&vci->evtchn_pending_sel,
 		    s->evtchn_pending, s->evtchn_mask,
-		    evt_do_hypervisor_callback, regs);
+		    evt_do_hypervisor_callback, NULL);
 	}
 
 #ifdef DIAGNOSTIC
diff -r 5a1052452d40 sys/arch/xen/xen/clock.c
--- a/sys/arch/xen/xen/clock.c	Tue Oct 16 09:33:04 2018 +0000
+++ b/sys/arch/xen/xen/clock.c	Tue Oct 16 14:35:48 2018 +0000
@@ -76,7 +76,7 @@
 static unsigned	xen_get_timecount(struct timecounter *);
 static int	xen_rtc_get(struct todr_chip_handle *, struct timeval *);
 static int	xen_rtc_set(struct todr_chip_handle *, struct timeval *);
-static int	xen_timer_handler(void *, struct clockframe *);
+static int	xen_timer_handler(void *);
 
 /*
  * xen timecounter:
@@ -802,7 +802,7 @@
  *	The cookie is the pointer to struct cpu_info.
  */
 static int
-xen_timer_handler(void *cookie, struct clockframe *frame)
+xen_timer_handler(void *cookie)
 {
 	struct cpu_info *ci = curcpu();
 	uint64_t last, now, delta, next;
@@ -836,7 +836,7 @@
 	while (delta >= NS_PER_TICK) {
 		ci->ci_xen_hardclock_systime_ns += NS_PER_TICK;
 		delta -= NS_PER_TICK;
-		hardclock(frame);
+		hardclock((struct clockframe *)ci->ci_tf);
 		if (__predict_false(delta >= NS_PER_TICK))
 			ci->ci_xen_missed_hardclock_evcnt.ev_count++;
 	}
diff -r 5a1052452d40 sys/arch/xen/xen/evtchn.c
--- a/sys/arch/xen/xen/evtchn.c	Tue Oct 16 09:33:04 2018 +0000
+++ b/sys/arch/xen/xen/evtchn.c	Tue Oct 16 14:35:48 2018 +0000
@@ -305,12 +305,12 @@
 
 
 unsigned int
-evtchn_do_event(int evtch, struct intrframe *regs)
+evtchn_do_event(int evtch)
 {
 	struct cpu_info *ci;
 	int ilevel;
 	struct intrhand *ih;
-	int	(*ih_fun)(void *, void *);
+	int	(*ih_fun)(void *);
 	uint32_t iplmask;
 	int i;
 	uint32_t iplbit;
@@ -386,7 +386,7 @@
 		iplmask &= ~IUNMASK(ci, ih->ih_level);
 		ci->ci_ilevel = ih->ih_level;
 		ih_fun = (void *)ih->ih_fun;
-		ih_fun(ih->ih_arg, regs);
+		ih_fun(ih->ih_arg);
 		ih = ih->ih_evt_next;
 	}
 	mutex_spin_exit(&evtlock[evtch]);
@@ -410,7 +410,7 @@
 					KASSERT(ih->ih_cpu == ci);
 					sti();
 					ih_fun = (void *)ih->ih_fun;
-					ih_fun(ih->ih_arg, regs);
+					ih_fun(ih->ih_arg);
 					cli();
 				}
 				hypervisor_enable_ipl(i);
diff -r 5a1052452d40 sys/kern/kern_clock.c
--- a/sys/kern/kern_clock.c	Tue Oct 16 09:33:04 2018 +0000
+++ b/sys/kern/kern_clock.c	Tue Oct 16 14:35:48 2018 +0000
@@ -352,6 +352,10 @@
 	}
 
 	if (CLKF_USERMODE(frame)) {
+		if (p == NULL) { /* Account deferred clock ticks to current user. */
+			p = l->l_proc;
+			mutex_spin_enter(&p->p_stmutex);
+		}
 		KASSERT(p != NULL);
 		if ((p->p_stflag & PST_PROFIL) && profsrc == PROFSRC_CLOCK)
 			addupc_intr(l, CLKF_PC(frame));


Home | Main Index | Thread Index | Old Index