pkgsrc-Changes archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

CVS commit: pkgsrc/sysutils



Module Name:    pkgsrc
Committed By:   bouyer
Date:           Fri Dec 13 13:44:21 UTC 2019

Modified Files:
        pkgsrc/sysutils/xenkernel411: Makefile distinfo
        pkgsrc/sysutils/xentools411: Makefile distinfo
Added Files:
        pkgsrc/sysutils/xenkernel411/patches: patch-XSA307 patch-XSA308
            patch-XSA309 patch-XSA310 patch-XSA311
Removed Files:
        pkgsrc/sysutils/xenkernel411/patches: patch-XSA298 patch-XSA299
            patch-XSA302 patch-XSA304 patch-XSA305 patch-XSA306

Log Message:
Update xenkernel411 to 4.11.3nb1, and xentools411 to 4.11.3
(PKGREVISION not reset on xenkernel411 on purpose, to emphasize that it's
not a stock Xen 4.11.3 kernel).
Changes since 4.11.2:
- includes all security patches up to XSA306
- other minor bug fixes, hardware support and performance improvements

In addition, xenkernel411 includes all security patches released since 4.11.3,
up to XSA311


To generate a diff of this commit:
cvs rdiff -u -r1.11 -r1.12 pkgsrc/sysutils/xenkernel411/Makefile
cvs rdiff -u -r1.8 -r1.9 pkgsrc/sysutils/xenkernel411/distinfo
cvs rdiff -u -r1.2 -r0 pkgsrc/sysutils/xenkernel411/patches/patch-XSA298 \
    pkgsrc/sysutils/xenkernel411/patches/patch-XSA302 \
    pkgsrc/sysutils/xenkernel411/patches/patch-XSA304 \
    pkgsrc/sysutils/xenkernel411/patches/patch-XSA305
cvs rdiff -u -r1.1 -r0 pkgsrc/sysutils/xenkernel411/patches/patch-XSA299 \
    pkgsrc/sysutils/xenkernel411/patches/patch-XSA306
cvs rdiff -u -r0 -r1.1 pkgsrc/sysutils/xenkernel411/patches/patch-XSA307 \
    pkgsrc/sysutils/xenkernel411/patches/patch-XSA308 \
    pkgsrc/sysutils/xenkernel411/patches/patch-XSA309 \
    pkgsrc/sysutils/xenkernel411/patches/patch-XSA310 \
    pkgsrc/sysutils/xenkernel411/patches/patch-XSA311
cvs rdiff -u -r1.11 -r1.12 pkgsrc/sysutils/xentools411/Makefile
cvs rdiff -u -r1.7 -r1.8 pkgsrc/sysutils/xentools411/distinfo

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: pkgsrc/sysutils/xenkernel411/Makefile
diff -u pkgsrc/sysutils/xenkernel411/Makefile:1.11 pkgsrc/sysutils/xenkernel411/Makefile:1.12
--- pkgsrc/sysutils/xenkernel411/Makefile:1.11  Fri Dec  6 17:30:28 2019
+++ pkgsrc/sysutils/xenkernel411/Makefile       Fri Dec 13 13:44:21 2019
@@ -1,7 +1,7 @@
-# $NetBSD: Makefile,v 1.11 2019/12/06 17:30:28 bouyer Exp $
+# $NetBSD: Makefile,v 1.12 2019/12/13 13:44:21 bouyer Exp $
 
-VERSION=       4.11.2
-PKGREVISION=   3
+VERSION=       4.11.3
+PKGREVISION=   1
 DISTNAME=      xen-${VERSION}
 PKGNAME=       xenkernel411-${VERSION}
 CATEGORIES=    sysutils

Index: pkgsrc/sysutils/xenkernel411/distinfo
diff -u pkgsrc/sysutils/xenkernel411/distinfo:1.8 pkgsrc/sysutils/xenkernel411/distinfo:1.9
--- pkgsrc/sysutils/xenkernel411/distinfo:1.8   Fri Dec  6 17:30:28 2019
+++ pkgsrc/sysutils/xenkernel411/distinfo       Fri Dec 13 13:44:21 2019
@@ -1,16 +1,15 @@
-$NetBSD: distinfo,v 1.8 2019/12/06 17:30:28 bouyer Exp $
+$NetBSD: distinfo,v 1.9 2019/12/13 13:44:21 bouyer Exp $
 
-SHA1 (xen411/xen-4.11.2.tar.gz) = 82766db0eca7ce65962732af8a31bb5cce1eb7ce
-RMD160 (xen411/xen-4.11.2.tar.gz) = 6dcb1ac3e72381474912607b30b59fa55d87d38b
-SHA512 (xen411/xen-4.11.2.tar.gz) = 48d3d926d35eb56c79c06d0abc6e6be2564fadb43367cc7f46881c669a75016707672179c2cca1c4cfb14af2cefd46e2e7f99470cddf7df2886d8435a2de814e
-Size (xen411/xen-4.11.2.tar.gz) = 25164925 bytes
+SHA1 (xen411/xen-4.11.3.tar.gz) = 2d77152168d6f9dcea50db9cb8e3e6a0720a4a1b
+RMD160 (xen411/xen-4.11.3.tar.gz) = cfb2e699842867b60d25a01963c564a6c5e580da
+SHA512 (xen411/xen-4.11.3.tar.gz) = 2204e490e9fc357a05983a9bf4e7345e1d364fe00400ce473988dcb9ca7d4e2b921fe10f095cbbc64248130a92d22c6f0d154dcae250a57a7f915df32e3dc436
+Size (xen411/xen-4.11.3.tar.gz) = 25180826 bytes
 SHA1 (patch-Config.mk) = 9372a09efd05c9fbdbc06f8121e411fcb7c7ba65
-SHA1 (patch-XSA298) = 63e0f96ce3b945b16b98b51b423bafec14cf2be6
-SHA1 (patch-XSA299) = beb7ba1a8f9e0adda161c0da725ff053e674067e
-SHA1 (patch-XSA302) = 12fbb7dfea27f53c70c8115487a2e30595549c2b
-SHA1 (patch-XSA304) = f2c22732227e11a3e77c630f0264a689eed53399
-SHA1 (patch-XSA305) = eb5e0096cbf501fcbd7a5c5f9d1f932b557636b6
-SHA1 (patch-XSA306) = f57201b2ae5f6435ce6ba3c6aac3e9e10cdba3fb
+SHA1 (patch-XSA307) = afd88b8294b0dbbc32e1d1aa74eb887d2da6695a
+SHA1 (patch-XSA308) = bda9ef732e0b6578ce8f7f0f7aa0a4189da41e86
+SHA1 (patch-XSA309) = 78cf7306e9d1efcbf2ebf425025d46948ae83019
+SHA1 (patch-XSA310) = 77b711f4b75de1d473a6988eb6f2b48e37cc353a
+SHA1 (patch-XSA311) = 4d3e6cc39c2b95cb3339961271df2bc885667927
 SHA1 (patch-xen_Makefile) = 465388d80de414ca3bb84faefa0f52d817e423a6
 SHA1 (patch-xen_Rules.mk) = c743dc63f51fc280d529a7d9e08650292c171dac
 SHA1 (patch-xen_arch_x86_Rules.mk) = 0bedfc53a128a87b6a249ae04fbdf6a053bfb70b

Index: pkgsrc/sysutils/xentools411/Makefile
diff -u pkgsrc/sysutils/xentools411/Makefile:1.11 pkgsrc/sysutils/xentools411/Makefile:1.12
--- pkgsrc/sysutils/xentools411/Makefile:1.11   Mon Nov  4 21:28:58 2019
+++ pkgsrc/sysutils/xentools411/Makefile        Fri Dec 13 13:44:21 2019
@@ -1,6 +1,6 @@
-# $NetBSD: Makefile,v 1.11 2019/11/04 21:28:58 rillig Exp $
+# $NetBSD: Makefile,v 1.12 2019/12/13 13:44:21 bouyer Exp $
 #
-VERSION=       4.11.2
+VERSION=       4.11.3
 VERSION_IPXE=  356f6c1b64d7a97746d1816cef8ca22bdd8d0b5d
 DIST_IPXE=     ipxe-git-${VERSION_IPXE}.tar.gz
 

Index: pkgsrc/sysutils/xentools411/distinfo
diff -u pkgsrc/sysutils/xentools411/distinfo:1.7 pkgsrc/sysutils/xentools411/distinfo:1.8
--- pkgsrc/sysutils/xentools411/distinfo:1.7    Sun Nov  3 10:07:16 2019
+++ pkgsrc/sysutils/xentools411/distinfo        Fri Dec 13 13:44:21 2019
@@ -1,13 +1,13 @@
-$NetBSD: distinfo,v 1.7 2019/11/03 10:07:16 maya Exp $
+$NetBSD: distinfo,v 1.8 2019/12/13 13:44:21 bouyer Exp $
 
 SHA1 (xen411/ipxe-git-356f6c1b64d7a97746d1816cef8ca22bdd8d0b5d.tar.gz) = 272b8c904dc0127690eca2c5c20c67479e40da34
 RMD160 (xen411/ipxe-git-356f6c1b64d7a97746d1816cef8ca22bdd8d0b5d.tar.gz) = cfcb4a314c15da19b36132b27126f3bd9699d0e5
 SHA512 (xen411/ipxe-git-356f6c1b64d7a97746d1816cef8ca22bdd8d0b5d.tar.gz) = 
bbcce5e55040e7e29adebd4a5253a046016a6e2e7ff34cf801a42d147e1ec1af57e0297318249bfa9c5bbeac969fe4b37c18cbf845a80b2136d65387a4fc31da
 Size (xen411/ipxe-git-356f6c1b64d7a97746d1816cef8ca22bdd8d0b5d.tar.gz) = 3732065 bytes
-SHA1 (xen411/xen-4.11.2.tar.gz) = 82766db0eca7ce65962732af8a31bb5cce1eb7ce
-RMD160 (xen411/xen-4.11.2.tar.gz) = 6dcb1ac3e72381474912607b30b59fa55d87d38b
-SHA512 (xen411/xen-4.11.2.tar.gz) = 48d3d926d35eb56c79c06d0abc6e6be2564fadb43367cc7f46881c669a75016707672179c2cca1c4cfb14af2cefd46e2e7f99470cddf7df2886d8435a2de814e
-Size (xen411/xen-4.11.2.tar.gz) = 25164925 bytes
+SHA1 (xen411/xen-4.11.3.tar.gz) = 2d77152168d6f9dcea50db9cb8e3e6a0720a4a1b
+RMD160 (xen411/xen-4.11.3.tar.gz) = cfb2e699842867b60d25a01963c564a6c5e580da
+SHA512 (xen411/xen-4.11.3.tar.gz) = 2204e490e9fc357a05983a9bf4e7345e1d364fe00400ce473988dcb9ca7d4e2b921fe10f095cbbc64248130a92d22c6f0d154dcae250a57a7f915df32e3dc436
+Size (xen411/xen-4.11.3.tar.gz) = 25180826 bytes
 SHA1 (patch-.._ipxe_src_core_settings.c) = 1eab2fbd8b22dde2b8aa830ae7701603486f74e4
 SHA1 (patch-.._ipxe_src_net_fcels.c) = eda41b25c3d5f5bef33caa9a6af28c40cb91e66b
 SHA1 (patch-Config.mk) = c41005a60de2f94a72b0206030eb021c137653d3

Added files:

Index: pkgsrc/sysutils/xenkernel411/patches/patch-XSA307
diff -u /dev/null pkgsrc/sysutils/xenkernel411/patches/patch-XSA307:1.1
--- /dev/null   Fri Dec 13 13:44:21 2019
+++ pkgsrc/sysutils/xenkernel411/patches/patch-XSA307   Fri Dec 13 13:44:21 2019
@@ -0,0 +1,101 @@
+$NetBSD: patch-XSA307,v 1.1 2019/12/13 13:44:21 bouyer Exp $
+
+From: Jan Beulich <jbeulich%suse.com@localhost>
+Subject: x86+Arm32: make find_next_{,zero_}bit() have well defined behavior
+
+These functions getting used with the 2nd and 3rd arguments being equal
+wasn't well defined: Arm64 reliably returns the value of the 2nd
+argument in this case, while on x86 for bitmaps up to 64 bits wide the
+return value was undefined (due to the undefined behavior of a shift of
+a value by the number of bits it's wide) when the incoming value was 64.
+On Arm32 an actual out of bounds access would happen when the
+size/offset value is a multiple of 32; if this access doesn't fault, the
+return value would have been sufficiently correct afaict.
+
+Make the functions consistently tolerate the last two arguments being
+equal (and in fact the 3rd argument being greater or equal to the 2nd),
+in favor of finding and fixing all the use sites that violate the
+original more strict assumption.
+
+This is XSA-307.
+
+Signed-off-by: Jan Beulich <jbeulich%suse.com@localhost>
+Acked-by: Julien Grall <julien%xen.org@localhost>
+---
+The most obvious (albeit still indirect) exposure to guests is
+evtchn_check_pollers(), which imo makes this a security issue at least
+for Arm32.
+
+This was originally already discussed between (at least) Andrew and me,
+and I don't really recall who brought up the issue first.
+
+Note that Arm's Linux origin of the code may call for syncing
+publication with them. Then again I don't want to tell them just to see
+them go public ahead of us.
+
+--- xen/arch/arm/arm32/lib/findbit.S.orig
++++ xen/arch/arm/arm32/lib/findbit.S
+@@ -42,8 +42,8 @@ ENDPROC(_find_first_zero_bit_le)
+  * Prototype: int find_next_zero_bit(void *addr, unsigned int maxbit, int offset)
+  */
+ ENTRY(_find_next_zero_bit_le)
+-              teq     r1, #0
+-              beq     3b
++              cmp     r1, r2
++              bls     3b
+               ands    ip, r2, #7
+               beq     1b                      @ If new byte, goto old routine
+  ARM(         ldrb    r3, [r0, r2, lsr #3]    )
+@@ -83,8 +83,8 @@ ENDPROC(_find_first_bit_le)
+  * Prototype: int find_next_zero_bit(void *addr, unsigned int maxbit, int offset)
+  */
+ ENTRY(_find_next_bit_le)
+-              teq     r1, #0
+-              beq     3b
++              cmp     r1, r2
++              bls     3b
+               ands    ip, r2, #7
+               beq     1b                      @ If new byte, goto old routine
+  ARM(         ldrb    r3, [r0, r2, lsr #3]    )
+@@ -117,8 +117,8 @@ ENTRY(_find_first_zero_bit_be)
+ ENDPROC(_find_first_zero_bit_be)
+ 
+ ENTRY(_find_next_zero_bit_be)
+-              teq     r1, #0
+-              beq     3b
++              cmp     r1, r2
++              bls     3b
+               ands    ip, r2, #7
+               beq     1b                      @ If new byte, goto old routine
+               eor     r3, r2, #0x18           @ big endian byte ordering
+@@ -151,8 +151,8 @@ ENTRY(_find_first_bit_be)
+ ENDPROC(_find_first_bit_be)
+ 
+ ENTRY(_find_next_bit_be)
+-              teq     r1, #0
+-              beq     3b
++              cmp     r1, r2
++              bls     3b
+               ands    ip, r2, #7
+               beq     1b                      @ If new byte, goto old routine
+               eor     r3, r2, #0x18           @ big endian byte ordering
+--- xen/include/asm-x86/bitops.h.orig
++++ xen/include/asm-x86/bitops.h
+@@ -358,7 +358,7 @@ static always_inline unsigned int __scan
+     const unsigned long *a__ = (addr);                                      \
+     unsigned int s__ = (size);                                              \
+     unsigned int o__ = (off);                                               \
+-    if ( __builtin_constant_p(size) && !s__ )                               \
++    if ( o__ >= s__ )                                                       \
+         r__ = s__;                                                          \
+     else if ( __builtin_constant_p(size) && s__ <= BITS_PER_LONG )          \
+         r__ = o__ + __scanbit(*(const unsigned long *)(a__) >> o__, s__);   \
+@@ -390,7 +390,7 @@ static always_inline unsigned int __scan
+     const unsigned long *a__ = (addr);                                      \
+     unsigned int s__ = (size);                                              \
+     unsigned int o__ = (off);                                               \
+-    if ( __builtin_constant_p(size) && !s__ )                               \
++    if ( o__ >= s__ )                                                       \
+         r__ = s__;                                                          \
+     else if ( __builtin_constant_p(size) && s__ <= BITS_PER_LONG )          \
+         r__ = o__ + __scanbit(~*(const unsigned long *)(a__) >> o__, s__);  \
Index: pkgsrc/sysutils/xenkernel411/patches/patch-XSA308
diff -u /dev/null pkgsrc/sysutils/xenkernel411/patches/patch-XSA308:1.1
--- /dev/null   Fri Dec 13 13:44:21 2019
+++ pkgsrc/sysutils/xenkernel411/patches/patch-XSA308   Fri Dec 13 13:44:21 2019
@@ -0,0 +1,76 @@
+$NetBSD: patch-XSA308,v 1.1 2019/12/13 13:44:21 bouyer Exp $
+
+From: Andrew Cooper <andrew.cooper3%citrix.com@localhost>
+Subject: x86/vtx: Work around SingleStep + STI/MovSS VMEntry failures
+
+See patch comment for technical details.
+
+Concerning the timeline, this was first discovered in the aftermath of
+XSA-156 which caused #DB to be intercepted unconditionally, but only in
+its SingleStep + STI form which is restricted to privileged software.
+
+After working with Intel and identifying the problematic vmentry check,
+this workaround was suggested, and the patch was posted in an RFC
+series.  Outstanding work for that series (not breaking Introspection)
+is still pending, and this fix from it (which wouldn't have been good
+enough in its original form) wasn't committed.
+
+A vmentry failure was reported to xen-devel, and debugging identified
+this bug in its SingleStep + MovSS form by way of INT1, which does not
+involve the use of any privileged instructions, and proving this to be a
+security issue.
+
+This is XSA-308
+
+Reported-by: Håkon Alstadheim <hakon%alstadheim.priv.no@localhost>
+Signed-off-by: Andrew Cooper <andrew.cooper3%citrix.com@localhost>
+Reviewed-by: Jan Beulich <jbeulich%suse.com@localhost>
+Acked-by: Kevin Tian <kevin.tian%intel.com@localhost>
+
+diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
+index 6a5eeb5c13..59b836f43f 100644
+--- xen/arch/x86/hvm/vmx/vmx.c.orig
++++ xen/arch/x86/hvm/vmx/vmx.c
+@@ -3816,6 +3816,42 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
+             HVMTRACE_1D(TRAP_DEBUG, exit_qualification);
+             __restore_debug_registers(v);
+             write_debugreg(6, exit_qualification | DR_STATUS_RESERVED_ONE);
++
++            /*
++             * Work around SingleStep + STI/MovSS VMEntry failures.
++             *
++             * We intercept #DB unconditionally to work around CVE-2015-8104 /
++             * XSA-156 (guest-kernel induced host DoS).
++             *
++             * STI/MovSS shadows block/defer interrupts/exceptions (exact
++             * details are complicated and poorly documented).  Debug
++             * exceptions delayed for any reason are stored in the
++             * PENDING_DBG_EXCEPTIONS field.
++             *
++             * The falling edge of PENDING_DBG causes #DB to be delivered,
++             * resulting in a VMExit, as #DB is intercepted.  The VMCS still
++             * reports blocked-by-STI/MovSS.
++             *
++             * The VMEntry checks when EFLAGS.TF is set don't like a VMCS in
++             * this state.  Despite a #DB queued in VMENTRY_INTR_INFO, the
++             * state is rejected as DR6.BS isn't pending.  Fix this up.
++             */
++            if ( unlikely(regs->eflags & X86_EFLAGS_TF) )
++            {
++                unsigned long int_info;
++
++                __vmread(GUEST_INTERRUPTIBILITY_INFO, &int_info);
++
++                if ( int_info & (VMX_INTR_SHADOW_STI | VMX_INTR_SHADOW_MOV_SS) )
++                {
++                    unsigned long pending_dbg;
++
++                    __vmread(GUEST_PENDING_DBG_EXCEPTIONS, &pending_dbg);
++                    __vmwrite(GUEST_PENDING_DBG_EXCEPTIONS,
++                              pending_dbg | DR_STEP);
++                }
++            }
++
+             if ( !v->domain->debugger_attached )
+             {
+                 unsigned long insn_len = 0;
Index: pkgsrc/sysutils/xenkernel411/patches/patch-XSA309
diff -u /dev/null pkgsrc/sysutils/xenkernel411/patches/patch-XSA309:1.1
--- /dev/null   Fri Dec 13 13:44:21 2019
+++ pkgsrc/sysutils/xenkernel411/patches/patch-XSA309   Fri Dec 13 13:44:21 2019
@@ -0,0 +1,60 @@
+$NetBSD: patch-XSA309,v 1.1 2019/12/13 13:44:21 bouyer Exp $
+
+From 523e3974ed2213719a19218f5b246e382ceef18a Mon Sep 17 00:00:00 2001
+From: George Dunlap <george.dunlap%citrix.com@localhost>
+Date: Wed, 30 Oct 2019 17:05:28 +0000
+Subject: [PATCH] x86/mm: Don't reset linear_pt_count on partial validation
+
+"Linear pagetables" is a technique which involves either pointing a
+pagetable at itself, or to another pagetable the same or higher level.
+Xen has limited support for linear pagetables: A page may either point
+to itself, or point to another page of the same level (i.e., L2 to L2,
+L3 to L3, and so on).
+
+XSA-240 introduced an additional restriction that limited the "depth"
+of such chains by allowing pages to either *point to* other pages of
+the same level, or *be pointed to* by other pages of the same level,
+but not both.  To implement this, we keep track of the number of
+outstanding times a page points to or is pointed to another page
+table, to prevent both from happening at the same time.
+
+Unfortunately, the original commit introducing this reset this count
+when resuming validation of a partially-validated pagetable, dropping
+some "linear_pt_entry" counts.
+
+On debug builds on systems where guests used this feature, this might
+lead to crashes that look like this:
+
+    Assertion 'oc > 0' failed at mm.c:874
+
+Worse, if an attacker could engineer such a situation to occur, they
+might be able to make loops or other abitrary chains of linear
+pagetables, leading to the denial-of-service situation outlined in
+XSA-240.
+
+This is XSA-309.
+
+Reported-by: Manuel Bouyer <bouyer%antioche.eu.org@localhost>
+Signed-off-by: George Dunlap <george.dunlap%citrix.com@localhost>
+Reviewed-by: Jan Beulich <jbeulich%suse.com@localhost>
+---
+ xen/arch/x86/mm.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
+index 7d4dd80a85..01393fb0da 100644
+--- xen/arch/x86/mm.c.orig
++++ xen/arch/x86/mm.c
+@@ -3059,8 +3059,8 @@ static int _get_page_type(struct page_info *page, unsigned long type,
+         {
+             page->nr_validated_ptes = 0;
+             page->partial_flags = 0;
++            page->linear_pt_count = 0;
+         }
+-        page->linear_pt_count = 0;
+         rc = alloc_page_type(page, type, preemptible);
+     }
+ 
+-- 
+2.24.0
+
Index: pkgsrc/sysutils/xenkernel411/patches/patch-XSA310
diff -u /dev/null pkgsrc/sysutils/xenkernel411/patches/patch-XSA310:1.1
--- /dev/null   Fri Dec 13 13:44:21 2019
+++ pkgsrc/sysutils/xenkernel411/patches/patch-XSA310   Fri Dec 13 13:44:21 2019
@@ -0,0 +1,348 @@
+$NetBSD: patch-XSA310,v 1.1 2019/12/13 13:44:21 bouyer Exp $
+
+From 7c537dc8d28a03064a14171ed5c6fc329531816a Mon Sep 17 00:00:00 2001
+From: George Dunlap <george.dunlap%citrix.com@localhost>
+Date: Tue, 19 Nov 2019 11:40:34 +0000
+Subject: [PATCH 1/3] x86/mm: Set old_guest_table when destroying vcpu
+ pagetables
+
+Changeset 6c4efc1eba ("x86/mm: Don't drop a type ref unless you held a
+ref to begin with"), part of XSA-299, changed the calling discipline
+of put_page_type() such that if put_page_type() returned -ERESTART
+(indicating a partially de-validated page), subsequent calls to
+put_page_type() must be called with PTF_partial_set.  If called on a
+partially de-validated page but without PTF_partial_set, Xen will
+BUG(), because to do otherwise would risk opening up the kind of
+privilege escalation bug described in XSA-299.
+
+One place this was missed was in vcpu_destroy_pagetables().
+put_page_and_type_preemptible() is called, but on -ERESTART, the
+entire operation is simply restarted, causing put_page_type() to be
+called on a partially de-validated page without PTF_partial_set.  The
+result was that if such an operation were interrupted, Xen would hit a
+BUG().
+
+Fix this by having vcpu_destroy_pagetables() consistently pass off
+interrupted de-validations to put_old_page_type():
+- Unconditionally clear references to the page, even if
+  put_page_and_type failed
+- Set old_guest_table and old_guest_table_partial appropriately
+
+While here, do some refactoring:
+
+ - Move clearing of arch.cr3 to the top of the function
+
+ - Now that clearing is unconditional, move the unmap to the same
+   conditional as the l4tab mapping.  This also allows us to reduce
+   the scope of the l4tab variable.
+
+ - Avoid code duplication by looping to drop references on
+   guest_table_user
+
+This is part of XSA-310.
+
+Reported-by: Sarah Newman <srn%prgmr.com@localhost>
+Signed-off-by: George Dunlap <george.dunlap%citrix.com@localhost>
+Reviewed-by: Jan Beulich <jbeulich%suse.com@localhost>
+---
+Added in v2.
+
+Changes in v3:
+- Minor comment / whitespace fixes
+---
+ xen/arch/x86/mm.c | 75 +++++++++++++++++++++++++++++------------------
+ 1 file changed, 47 insertions(+), 28 deletions(-)
+
+diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
+index 01393fb0da..a759afc9e3 100644
+--- xen/arch/x86/mm.c.orig
++++ xen/arch/x86/mm.c
+@@ -3142,40 +3142,36 @@ int put_old_guest_table(struct vcpu *v)
+ int vcpu_destroy_pagetables(struct vcpu *v)
+ {
+     unsigned long mfn = pagetable_get_pfn(v->arch.guest_table);
+-    struct page_info *page;
+-    l4_pgentry_t *l4tab = NULL;
++    struct page_info *page = NULL;
+     int rc = put_old_guest_table(v);
++    bool put_guest_table_user = false;
+ 
+     if ( rc )
+         return rc;
+ 
++    v->arch.cr3 = 0;
++
++    /*
++     * Get the top-level guest page; either the guest_table itself, for
++     * 64-bit, or the top-level l4 entry for 32-bit.  Either way, remove
++     * the reference to that page.
++     */
+     if ( is_pv_32bit_vcpu(v) )
+     {
+-        l4tab = map_domain_page(_mfn(mfn));
+-        mfn = l4e_get_pfn(*l4tab);
+-    }
++        l4_pgentry_t *l4tab = map_domain_page(_mfn(mfn));
+ 
+-    if ( mfn )
+-    {
+-        page = mfn_to_page(_mfn(mfn));
+-        if ( paging_mode_refcounts(v->domain) )
+-            put_page(page);
+-        else
+-            rc = put_page_and_type_preemptible(page);
+-    }
+-
+-    if ( l4tab )
+-    {
+-        if ( !rc )
+-            l4e_write(l4tab, l4e_empty());
++        mfn = l4e_get_pfn(*l4tab);
++        l4e_write(l4tab, l4e_empty());
+         unmap_domain_page(l4tab);
+     }
+-    else if ( !rc )
++    else
+     {
+         v->arch.guest_table = pagetable_null();
++        put_guest_table_user = true;
++    }
+ 
+-        /* Drop ref to guest_table_user (from MMUEXT_NEW_USER_BASEPTR) */
+-        mfn = pagetable_get_pfn(v->arch.guest_table_user);
++    /* Free that page if non-zero */
++    do {
+         if ( mfn )
+         {
+             page = mfn_to_page(_mfn(mfn));
+@@ -3183,18 +3179,41 @@ int vcpu_destroy_pagetables(struct vcpu *v)
+                 put_page(page);
+             else
+                 rc = put_page_and_type_preemptible(page);
++            mfn = 0;
+         }
+-        if ( !rc )
+-            v->arch.guest_table_user = pagetable_null();
+-    }
+ 
+-    v->arch.cr3 = 0;
++        if ( !rc && put_guest_table_user )
++        {
++            /* Drop ref to guest_table_user (from MMUEXT_NEW_USER_BASEPTR) */
++            mfn = pagetable_get_pfn(v->arch.guest_table_user);
++            v->arch.guest_table_user = pagetable_null();
++            put_guest_table_user = false;
++        }
++    } while ( mfn );
+ 
+     /*
+-     * put_page_and_type_preemptible() is liable to return -EINTR. The
+-     * callers of us expect -ERESTART so convert it over.
++     * If a "put" operation was interrupted, finish things off in
++     * put_old_guest_table() when the operation is restarted.
+      */
+-    return rc != -EINTR ? rc : -ERESTART;
++    switch ( rc )
++    {
++    case -EINTR:
++    case -ERESTART:
++        v->arch.old_guest_ptpg = NULL;
++        v->arch.old_guest_table = page;
++        v->arch.old_guest_table_partial = (rc == -ERESTART);
++        rc = -ERESTART;
++        break;
++    default:
++        /*
++         * Failure to 'put' a page may cause it to leak, but that's
++         * less bad than a crash.
++         */
++        ASSERT(rc == 0);
++        break;
++    }
++
++    return rc;
+ }
+ 
+ int new_guest_cr3(mfn_t mfn)
+-- 
+2.24.0
+
+From 128cb126aee9b4a2855ab898fdfbfe7009fbf1f5 Mon Sep 17 00:00:00 2001
+From: George Dunlap <george.dunlap%citrix.com@localhost>
+Date: Thu, 31 Oct 2019 11:17:38 +0000
+Subject: [PATCH 2/3] x86/mm: alloc/free_lN_table: Retain partial_flags on
+ -EINTR
+
+When validating or de-validating pages (in alloc_lN_table and
+free_lN_table respectively), the `partial_flags` local variable is
+used to keep track of whether the "current" PTE started the entire
+operation in a "may be partial" state.
+
+One of the patches in XSA-299 addressed the fact that it is possible
+for a previously-partially-validated entry to subsequently be found to
+have invalid entries (indicated by returning -EINVAL); in which case
+page->partial_flags needs to be set to indicate that the current PTE
+may have the partial bit set (and thus _put_page_type() should be
+called with PTF_partial_set).
+
+Unfortunately, the patches in XSA-299 assumed that once
+put_page_from_lNe() returned -ERESTART on a page, it was not possible
+for it to return -EINTR.  This turns out to be true for
+alloc_lN_table() and free_lN_table, but not for _get_page_type() and
+_put_page_type(): both can return -EINTR when called on pages with
+PGT_partial set.  In these cases, the pages PGT_partial will still be
+set; failing to set partial_flags appropriately may allow an attacker
+to do a privilege escalation similar to those described in XSA-299.
+
+Fix this by always copying the local partial_flags variable into
+page->partial_flags when exiting early.
+
+NB that on the "get" side, no adjustment to nr_validated_entries is
+needed: whether pte[i] is partially validated or entirely
+un-validated, we want nr_validated_entries = i.  On the "put" side,
+however, we need to adjust nr_validated_entries appropriately: if
+pte[i] is entirely validated, we want nr_validated_entries = i + 1; if
+pte[i] is partially validated, we want nr_validated_entries = i.
+
+This is part of XSA-310.
+
+Reported-by: Sarah Newman <srn%prgmr.com@localhost>
+Signed-off-by: George Dunlap <george.dunlap%citrix.com@localhost>
+Reviewed-by: Jan Beulich <jbeulich%suse.com@localhost>
+---
+ xen/arch/x86/mm.c | 16 ++++++++--------
+ 1 file changed, 8 insertions(+), 8 deletions(-)
+
+diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
+index a759afc9e3..97c8d73b7b 100644
+--- xen/arch/x86/mm.c.orig
++++ xen/arch/x86/mm.c
+@@ -1557,7 +1557,7 @@ static int alloc_l2_table(struct page_info *page, unsigned long type)
+         if ( rc == -EINTR && i )
+         {
+             page->nr_validated_ptes = i;
+-            page->partial_flags = 0;
++            page->partial_flags = partial_flags;;
+             rc = -ERESTART;
+         }
+         else if ( rc < 0 && rc != -EINTR )
+@@ -1660,7 +1660,7 @@ static int alloc_l3_table(struct page_info *page)
+         else if ( rc == -EINTR && i )
+         {
+             page->nr_validated_ptes = i;
+-            page->partial_flags = 0;
++            page->partial_flags = partial_flags;
+             rc = -ERESTART;
+         }
+         if ( rc < 0 )
+@@ -1982,8 +1982,8 @@ static int free_l2_table(struct page_info *page)
+     }
+     else if ( rc == -EINTR && i < L2_PAGETABLE_ENTRIES - 1 )
+     {
+-        page->nr_validated_ptes = i + 1;
+-        page->partial_flags = 0;
++        page->nr_validated_ptes = i + !(partial_flags & PTF_partial_set);
++        page->partial_flags = partial_flags;
+         rc = -ERESTART;
+     }
+ 
+@@ -2030,8 +2030,8 @@ static int free_l3_table(struct page_info *page)
+     }
+     else if ( rc == -EINTR && i < L3_PAGETABLE_ENTRIES - 1 )
+     {
+-        page->nr_validated_ptes = i + 1;
+-        page->partial_flags = 0;
++        page->nr_validated_ptes = i + !(partial_flags & PTF_partial_set);
++        page->partial_flags = partial_flags;
+         rc = -ERESTART;
+     }
+     return rc > 0 ? 0 : rc;
+@@ -2061,8 +2061,8 @@ static int free_l4_table(struct page_info *page)
+     }
+     else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 )
+     {
+-        page->nr_validated_ptes = i + 1;
+-        page->partial_flags = 0;
++        page->nr_validated_ptes = i + !(partial_flags & PTF_partial_set);
++        page->partial_flags = partial_flags;
+         rc = -ERESTART;
+     }
+ 
+-- 
+2.24.0
+
+From e9f835982a726ae16997c566b5eafab74f8b4cb7 Mon Sep 17 00:00:00 2001
+From: George Dunlap <george.dunlap%citrix.com@localhost>
+Date: Mon, 28 Oct 2019 14:33:51 +0000
+Subject: [PATCH 3/3] x86/mm: relinquish_memory: Grab an extra type ref when
+ setting PGT_partial
+
+The PGT_partial bit in page->type_info holds both a type count and a
+general ref count.  During domain tear-down, when free_page_type()
+returns -ERESTART, relinquish_memory() correctly handles the general
+ref count, but fails to grab an extra type count when setting
+PGT_partial.  When this bit is eventually cleared, type_count underflows
+and triggers the following BUG in page_alloc.c:free_domheap_pages():
+
+    BUG_ON((pg[i].u.inuse.type_info & PGT_count_mask) != 0);
+
+As far as we can tell, this page underflow cannot be exploited any any
+other way: The page can't be used as a pagetable by the dying domain
+because it's dying; it can't be used as a pagetable by any other
+domain since it belongs to the dying domain; and ownership can't
+transfer to any other domain without hitting the BUG_ON() in
+free_domheap_pages().
+
+(steal_page() won't work on a page in this state, since it requires
+PGC_allocated to be set, and PGC_allocated will already have been
+cleared.)
+
+Fix this by grabbing an extra type ref if setting PGT_partial in
+relinquish_memory.
+
+This is part of XSA-310.
+
+Reported-by: Sarah Newman <srn%prgmr.com@localhost>
+Signed-off-by: George Dunlap <george.dunlap%citrix.com@localhost>
+Acked-by: Jan Beulich <jbeulich%suse.com@localhost>
+---
+v2:
+- Move discussion of potential exploits into the commit message
+- Keep PGT_partial and put_page() ordering
+---
+ xen/arch/x86/domain.c | 19 +++++++++++++++++++
+ 1 file changed, 19 insertions(+)
+
+diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
+index f1dd86e12e..51880fc50d 100644
+--- xen/arch/x86/domain.c.orig
++++ xen/arch/x86/domain.c
+@@ -2049,6 +2049,25 @@ static int relinquish_memory(
+                     goto out;
+                 case -ERESTART:
+                     page_list_add(page, list);
++                    /*
++                     * PGT_partial holds a type ref and a general ref.
++                     * If we came in with PGT_partial set, then we 1)
++                     * don't need to grab an extra type count, and 2)
++                     * do need to drop the extra page ref we grabbed
++                     * at the top of the loop.  If we didn't come in
++                     * with PGT_partial set, we 1) do need to drab an
++                     * extra type count, but 2) can transfer the page
++                     * ref we grabbed above to it.
++                     *
++                     * Note that we must increment type_info before
++                     * setting PGT_partial.  Theoretically it should
++                     * be safe to drop the page ref before setting
++                     * PGT_partial, but do it afterwards just to be
++                     * extra safe.
++                     */
++                    if ( !(x & PGT_partial) )
++                        page->u.inuse.type_info++;
++                    smp_wmb();
+                     page->u.inuse.type_info |= PGT_partial;
+                     if ( x & PGT_partial )
+                         put_page(page);
+-- 
+2.24.0
+
Index: pkgsrc/sysutils/xenkernel411/patches/patch-XSA311
diff -u /dev/null pkgsrc/sysutils/xenkernel411/patches/patch-XSA311:1.1
--- /dev/null   Fri Dec 13 13:44:21 2019
+++ pkgsrc/sysutils/xenkernel411/patches/patch-XSA311   Fri Dec 13 13:44:21 2019
@@ -0,0 +1,189 @@
+$NetBSD: patch-XSA311,v 1.1 2019/12/13 13:44:21 bouyer Exp $
+
+From: Andrew Cooper <andrew.cooper3%citrix.com@localhost>
+Subject: AMD/IOMMU: Cease using a dynamic height for the IOMMU pagetables
+
+update_paging_mode() has multiple bugs:
+
+ 1) Booting with iommu=debug will cause it to inform you that it is called
+    without the pdev_list lock held.
+ 2) When growing by more than a single level, it leaks the newly allocated
+    table(s) in the case of a further error.
+
+Furthermore, the choice of default level for a domain has issues:
+
+ 1) All HVM guests grow from 2 to 3 levels during construction because of the
+    position of the VRAM just below the 4G boundary, so defaulting to 2 is a
+    waste of effort.
+ 2) The limit for PV guests doesn't take memory hotplug into account, and
+    isn't dynamic at runtime like HVM guests.  This means that a PV guest may
+    get RAM which it can't map in the IOMMU.
+
+The dynamic height is a property unique to AMD, and adds a substantial
+quantity of complexity for what is a marginal performance improvement.  Remove
+the complexity by removing the dynamic height.
+
+PV guests now get 3 or 4 levels based on any hotplug regions in the host.
+This only makes a difference for hardware which previously had all RAM below
+the 512G boundary, and a hotplug region above.
+
+HVM guests now get 4 levels (which will be sufficient until 256TB guests
+become a thing), because we don't currently have the information to know when
+3 would be safe to use.
+
+The overhead of this extra level is not expected to be noticeable.  It costs
+one page (4k) per domain, and one extra IO-TLB paging structure cache entry
+which is very hot and less likely to be evicted.
+
+This is XSA-311.
+
+Reported-by: XXX PERSON <XXX EMAIL>
+Signed-off-by: Andrew Cooper <andrew.cooper3%citrix.com@localhost>
+Acked-by: Jan Beulich <jbeulich%suse.com@localhost>
+
+--- xen/drivers/passthrough/amd/iommu_map.c.orig
++++ xen/drivers/passthrough/amd/iommu_map.c
+@@ -569,97 +569,6 @@ static int iommu_pde_from_gfn(struct dom
+     return 0;
+ }
+ 
+-static int update_paging_mode(struct domain *d, unsigned long gfn)
+-{
+-    u16 bdf;
+-    void *device_entry;
+-    unsigned int req_id, level, offset;
+-    unsigned long flags;
+-    struct pci_dev *pdev;
+-    struct amd_iommu *iommu = NULL;
+-    struct page_info *new_root = NULL;
+-    struct page_info *old_root = NULL;
+-    void *new_root_vaddr;
+-    unsigned long old_root_mfn;
+-    struct domain_iommu *hd = dom_iommu(d);
+-
+-    if ( gfn == gfn_x(INVALID_GFN) )
+-        return -EADDRNOTAVAIL;
+-    ASSERT(!(gfn >> DEFAULT_DOMAIN_ADDRESS_WIDTH));
+-
+-    level = hd->arch.paging_mode;
+-    old_root = hd->arch.root_table;
+-    offset = gfn >> (PTE_PER_TABLE_SHIFT * (level - 1));
+-
+-    ASSERT(spin_is_locked(&hd->arch.mapping_lock) && is_hvm_domain(d));
+-
+-    while ( offset >= PTE_PER_TABLE_SIZE )
+-    {
+-        /* Allocate and install a new root table.
+-         * Only upper I/O page table grows, no need to fix next level bits */
+-        new_root = alloc_amd_iommu_pgtable();
+-        if ( new_root == NULL )
+-        {
+-            AMD_IOMMU_DEBUG("%s Cannot allocate I/O page table\n",
+-                            __func__);
+-            return -ENOMEM;
+-        }
+-
+-        new_root_vaddr = __map_domain_page(new_root);
+-        old_root_mfn = mfn_x(page_to_mfn(old_root));
+-        set_iommu_pde_present(new_root_vaddr, old_root_mfn, level,
+-                              !!IOMMUF_writable, !!IOMMUF_readable);
+-        level++;
+-        old_root = new_root;
+-        offset >>= PTE_PER_TABLE_SHIFT;
+-        unmap_domain_page(new_root_vaddr);
+-    }
+-
+-    if ( new_root != NULL )
+-    {
+-        hd->arch.paging_mode = level;
+-        hd->arch.root_table = new_root;
+-
+-        if ( !pcidevs_locked() )
+-            AMD_IOMMU_DEBUG("%s Try to access pdev_list "
+-                            "without aquiring pcidevs_lock.\n", __func__);
+-
+-        /* Update device table entries using new root table and paging mode */
+-        for_each_pdev( d, pdev )
+-        {
+-            bdf = PCI_BDF2(pdev->bus, pdev->devfn);
+-            iommu = find_iommu_for_device(pdev->seg, bdf);
+-            if ( !iommu )
+-            {
+-                AMD_IOMMU_DEBUG("%s Fail to find iommu.\n", __func__);
+-                return -ENODEV;
+-            }
+-
+-            spin_lock_irqsave(&iommu->lock, flags);
+-            do {
+-                req_id = get_dma_requestor_id(pdev->seg, bdf);
+-                device_entry = iommu->dev_table.buffer +
+-                               (req_id * IOMMU_DEV_TABLE_ENTRY_SIZE);
+-
+-                /* valid = 0 only works for dom0 passthrough mode */
+-                amd_iommu_set_root_page_table((u32 *)device_entry,
+-                                              page_to_maddr(hd->arch.root_table),
+-                                              d->domain_id,
+-                                              hd->arch.paging_mode, 1);
+-
+-                amd_iommu_flush_device(iommu, req_id);
+-                bdf += pdev->phantom_stride;
+-            } while ( PCI_DEVFN2(bdf) != pdev->devfn &&
+-                      PCI_SLOT(bdf) == PCI_SLOT(pdev->devfn) );
+-            spin_unlock_irqrestore(&iommu->lock, flags);
+-        }
+-
+-        /* For safety, invalidate all entries */
+-        amd_iommu_flush_all_pages(d);
+-    }
+-    return 0;
+-}
+-
+ int amd_iommu_map_page(struct domain *d, unsigned long gfn, unsigned long mfn,
+                        unsigned int flags)
+ {
+@@ -685,19 +594,6 @@ int amd_iommu_map_page(struct domain *d,
+         return rc;
+     }
+ 
+-    /* Since HVM domain is initialized with 2 level IO page table,
+-     * we might need a deeper page table for lager gfn now */
+-    if ( is_hvm_domain(d) )
+-    {
+-        if ( update_paging_mode(d, gfn) )
+-        {
+-            spin_unlock(&hd->arch.mapping_lock);
+-            AMD_IOMMU_DEBUG("Update page mode failed gfn = %lx\n", gfn);
+-            domain_crash(d);
+-            return -EFAULT;
+-        }
+-    }
+-
+     if ( iommu_pde_from_gfn(d, gfn, pt_mfn, true) || (pt_mfn[1] == 0) )
+     {
+         spin_unlock(&hd->arch.mapping_lock);
+--- xen/drivers/passthrough/amd/pci_amd_iommu.c.orig
++++ xen/drivers/passthrough/amd/pci_amd_iommu.c
+@@ -242,11 +242,17 @@ static int amd_iommu_domain_init(struct
+ {
+     struct domain_iommu *hd = dom_iommu(d);
+ 
+-    /* For pv and dom0, stick with get_paging_mode(max_page)
+-     * For HVM dom0, use 2 level page table at first */
+-    hd->arch.paging_mode = is_hvm_domain(d) ?
+-                      IOMMU_PAGING_MODE_LEVEL_2 :
+-                      get_paging_mode(max_page);
++    /*
++     * Choose the number of levels for the IOMMU page tables.
++     * - PV needs 3 or 4, depending on whether there is RAM (including hotplug
++     *   RAM) above the 512G boundary.
++     * - HVM could in principle use 3 or 4 depending on how much guest
++     *   physical address space we give it, but this isn't known yet so use 4
++     *   unilaterally.
++     */
++    hd->arch.paging_mode = is_hvm_domain(d)
++        ? IOMMU_PAGING_MODE_LEVEL_4 : get_paging_mode(get_upper_mfn_bound());
++
+     return 0;
+ }
+ 



Home | Main Index | Thread Index | Old Index