NetBSD-Bugs archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

port-amd64/56943: nvmm - Fix TSC synchronization issues



>Number:         56943
>Category:       port-amd64
>Synopsis:       nvmm - Fix TSC synchronization issues
>Confidential:   no
>Severity:       critical
>Priority:       medium
>Responsible:    port-amd64-maintainer
>State:          open
>Class:          sw-bug
>Submitter-Id:   net
>Arrival-Date:   Wed Jul 27 05:45:00 +0000 2022
>Originator:     William J. Coldwell
>Release:        NetBSD 9.99.99
>Organization:
	The NetBSD Foundation
>Environment:
	
	
System: NetBSD mortalcoil.local 9.99.99 NetBSD 9.99.99 (GENERIC) #0: Fri Jul 22 19:53:09 UTC 2022 mkrepro%mkrepro.NetBSD.org@localhost:/usr/src/sys/arch/amd64/compile/GENERIC amd64
Architecture: x86_64
Machine: amd64
>Description:

* Save the guest TSC offset in cpudata as 'gtsc_offset', replacing the
  origin absolute TSC value stored as 'gtsc'.

* QEMU and other emulators probably have no intention of actually
  forcing the TSC state in the SETSTATE call, so don't act on it
  if it matches the value we previously returned.

  This allows the guest to inherit a completely synchronized TSC from
  the host.  Without it, the TSC's for the VCPUs wind up being badly
  out of sync.

* Updating MSR_TSC completely blows up TSC mp synchronization.  We
  assume QEMU did not intend to update the TSC if it tries to write
  0 or tries to write the value returned in the previous getstate.

* This allows kernels to use the TSC as a clock, which costs nothing,
  verses the ACPI or HPET which have horrible overhead and a global
  mutex in QEMU.

 Credit: Matthew Dillon <dillon%apollo.backplane.com@localhost>
 dragonfly.git commit 	c9096cab4ce6c0febd2dfe41f841c7e844fea2a9
>How-To-Repeat:
	run a vm under nvmm and watch the clock drift in TSC kernel messages
>Fix:
--- sys/dev/nvmm/x86/nvmm_x86_svm.c     2022-07-26 23:23:14.534930037 +0000
+++ /root/nvmm_x86_svm.c        2022-07-26 16:56:09.839160986 +0000
@@ -572,7 +572,8 @@
        uint64_t gxcr0;
        uint64_t gprs[NVMM_X64_NGPR];
        uint64_t drs[NVMM_X64_NDR];
-       uint64_t gtsc;
+       uint64_t gtsc_offset;
+       uint64_t gtsc_match;;
        struct xsave_header gfpu __aligned(64);
 
        /* VCPU configuration. */
@@ -1202,8 +1203,10 @@
                        svm_vmcb_cache_flush(vmcb, VMCB_CTRL_VMCB_CLEAN_CR);
                        goto handled;
                }
+
+               /* All bets are off if MSR_TSC is actually written to. */
                if (exit->u.wrmsr.msr == MSR_TSC) {
-                       cpudata->gtsc = exit->u.wrmsr.val;
+                       cpudata->gtsc_offset = exit->u.wrmsr.val - rdtsc();
                        cpudata->gtsc_want_update = true;
                        goto handled;
                }
@@ -1524,7 +1527,7 @@
                }
 
                if (__predict_false(cpudata->gtsc_want_update)) {
-                       vmcb->ctrl.tsc_offset = cpudata->gtsc - rdtsc();
+                       vmcb->ctrl.tsc_offset = cpudata->gtsc_offset;
                        svm_vmcb_cache_flush(vmcb, VMCB_CTRL_VMCB_CLEAN_I);
                }
 
@@ -1620,8 +1623,6 @@
                }
        }
 
-       cpudata->gtsc = rdtsc() + vmcb->ctrl.tsc_offset;
-
        svm_vcpu_guest_misc_leave(vcpu);
        svm_vcpu_guest_dbregs_leave(vcpu);
 
@@ -1892,8 +1893,25 @@
                    state->msrs[NVMM_X64_MSR_SYSENTER_EIP];
                vmcb->state.g_pat = state->msrs[NVMM_X64_MSR_PAT];
 
-               cpudata->gtsc = state->msrs[NVMM_X64_MSR_TSC];
-               cpudata->gtsc_want_update = true;
+               /*
+                * QEMU or whatever... probably did NOT want to set the TSC,
+                * because doing so would destroy tsc mp-synchronization
+                * across logical cpus.  Try to figure out what qemu meant
+                * to do.
+                *
+                * If writing the last TSC value we reported via getstate,
+                * assume that the hypervisor does not want to write to the
+                * TSC.
+                *
+                * QEMU appears to issue a setstate with the value 0 after
+                * a 'reboot', so for now also ignore this case.
+                */
+               if (state->msrs[NVMM_X64_MSR_TSC] != cpudata->gtsc_match &&
+                   state->msrs[NVMM_X64_MSR_TSC] != 0) {
+                       cpudata->gtsc_offset =
+                           state->msrs[NVMM_X64_MSR_TSC] - rdtsc();
+                       cpudata->gtsc_want_update = true;
+               }
        }
 
        if (flags & NVMM_X64_STATE_INTR) {
@@ -2016,10 +2034,13 @@
                state->msrs[NVMM_X64_MSR_SYSENTER_EIP] =
                    vmcb->state.sysenter_eip;
                state->msrs[NVMM_X64_MSR_PAT] = vmcb->state.g_pat;
-               state->msrs[NVMM_X64_MSR_TSC] = cpudata->gtsc;
+               state->msrs[NVMM_X64_MSR_TSC] = rdtsc() + cpudata->gtsc_offset;
 
                /* Hide SVME. */
                state->msrs[NVMM_X64_MSR_EFER] &= ~EFER_SVME;
+
+               /* Save reported TSC value for later setstate hack. */
+               cpudata->gtsc_match = state->msrs[NVMM_X64_MSR_TSC];
        }
 
        if (flags & NVMM_X64_STATE_INTR) {
--- sys/dev/nvmm/x86/nvmm_x86_vmx.c     2022-07-26 23:23:14.553341729 +0000
+++ /root/nvmm_x86_vmx.c        2022-07-26 16:56:17.198963938 +0000
@@ -817,7 +817,8 @@
        uint64_t gxcr0;
        uint64_t gprs[NVMM_X64_NGPR];
        uint64_t drs[NVMM_X64_NDR];
-       uint64_t gtsc;
+       uint64_t gtsc_offset;
+       uint64_t gtsc_match;
        struct xsave_header gfpu __aligned(64);
 
        /* VCPU configuration. */
@@ -1863,8 +1864,9 @@
                        goto handled;
                }
        } else {
+               /* All bets are off if MSR_TSC is actually written to. */
                if (exit->u.wrmsr.msr == MSR_TSC) {
-                       cpudata->gtsc = exit->u.wrmsr.val;
+                       cpudata->gtsc_offset = exit->u.wrmsr.val - rdtsc();
                        cpudata->gtsc_want_update = true;
                        goto handled;
                }
@@ -2235,7 +2237,7 @@
                }
 
                if (__predict_false(cpudata->gtsc_want_update)) {
-                       vmx_vmwrite(VMCS_TSC_OFFSET, cpudata->gtsc - rdtsc());
+                       vmx_vmwrite(VMCS_TSC_OFFSET, cpudata->gtsc_offset);
                        cpudata->gtsc_want_update = false;
                }
 
@@ -2345,8 +2347,6 @@
 
        cpudata->vmcs_launched = launched;
 
-       cpudata->gtsc = vmx_vmread(VMCS_TSC_OFFSET) + rdtsc();
-
        vmx_vcpu_guest_misc_leave(vcpu);
        vmx_vcpu_guest_dbregs_leave(vcpu);
 
@@ -2641,8 +2641,25 @@
                vmx_vmwrite(VMCS_GUEST_IA32_SYSENTER_EIP,
                    state->msrs[NVMM_X64_MSR_SYSENTER_EIP]);
 
-               cpudata->gtsc = state->msrs[NVMM_X64_MSR_TSC];
-               cpudata->gtsc_want_update = true;
+               /*
+                * QEMU or whatever... probably did NOT want to set the TSC,
+                * because doing so would destroy tsc mp-synchronization
+                * across logical cpus.  Try to figure out what qemu meant
+                * to do.
+                *
+                * If writing the last TSC value we reported via getstate,
+                * assume that the hypervisor does not want to write to the
+                * TSC.
+                *
+                * QEMU appears to issue a setstate with the value 0 after
+                * a 'reboot', so for now also ignore this case.
+                */
+               if (state->msrs[NVMM_X64_MSR_TSC] != cpudata->gtsc_match &&
+                   state->msrs[NVMM_X64_MSR_TSC] != 0) {
+                       cpudata->gtsc_offset =
+                           state->msrs[NVMM_X64_MSR_TSC] - rdtsc();
+                       cpudata->gtsc_want_update = true;
+               }
 
                /* ENTRY_CTLS_LONG_MODE must match EFER_LMA. */
                ctls1 = vmx_vmread(VMCS_ENTRY_CTLS);
@@ -2772,7 +2789,10 @@
                    vmx_vmread(VMCS_GUEST_IA32_SYSENTER_ESP);
                state->msrs[NVMM_X64_MSR_SYSENTER_EIP] =
                    vmx_vmread(VMCS_GUEST_IA32_SYSENTER_EIP);
-               state->msrs[NVMM_X64_MSR_TSC] = cpudata->gtsc;
+               state->msrs[NVMM_X64_MSR_TSC] = rdtsc() + cpudata->gtsc_offset;
+
+               /* Save reported TSC value for later setstate hack. */
+               cpudata->gtsc_match = state->msrs[NVMM_X64_MSR_TSC];
        }
 
        if (flags & NVMM_X64_STATE_INTR) {

>Unformatted:
 	
 	


Home | Main Index | Thread Index | Old Index