NetBSD-Bugs archive


kern/57188: cow bug in uvm on multiprocessor systems



>Number:         57188
>Category:       kern
>Synopsis:       cow bug in uvm on multiprocessor systems
>Confidential:   no
>Severity:       serious
>Priority:       medium
>Responsible:    kern-bug-people
>State:          open
>Class:          sw-bug
>Submitter-Id:   net
>Arrival-Date:   Mon Jan 16 21:55:00 +0000 2023
>Originator:     Taylor R Campbell
>Release:        current
>Organization:
The NetBSD Cowndation
>Environment:
washing away in atmospheric rivers of climate change
>Description:
(This is the same bug as FreeBSD had: https://reviews.freebsd.org/D14347)

Consider the following sketch (given in detail further below):

struct { long x; char pad[512]; int y; } *f = mmap(...);

/* thread A */
for (;;)
        f->y = 1;  /* trigger cow; send tlb ipi */

/* thread B */
for (long x = 0;; x++) {
        lock();
        f->x = x; /* write to new page after cow tlb flush */
        unlock();
}

/* thread C */
for (;;) {
        lock();
        long x0 = f->x; /* read from old page before tlb flush */
        /* tlb ipi may be handled here */
        long x1 = f->x; /* read from new page after tlb flush */
        unlock();
        assert(x0 == x1);
}

/* main thread */
for (;;) {
        pid_t pid = fork();
        if (pid == 0) _exit(0);
        waitpid(pid, NULL, 0);
}

The following order of events can happen if each thread is running on a separate CPU.

1. Main thread forks.  Address space marked nonwritable for copy-on-write (COW).
   => Old (current) _physical_ page has f->x = 123.
2. Thread A triggers COW by writing to f->y.
   (a) uvm allocates new physical page.
   (b) uvm copies old physical page to new physical page.
   (c) uvm enters new page in pmap with read/write permission.
   (d) pmap sends TLB IPI to all CPUs so they witness it.  (Step 2 is
       sketched as pseudocode just after this list.)
3. Thread B executes TLB IPI and sees new physical page.
4. Thread B does lock(); f->x = 124; unlock().
   => New _physical_ page has f->x = 124.
5. Thread C does lock(); x0 = f->x.
   => No TLB IPI yet, so x0 = 123 from old physical page.
6. Thread C executes TLB IPI and sees new physical page.
7. Thread C does x1 = f->x; unlock().
   => TLB IPI ran, so x1 = 124 from new physical page.
8. Assertion fires because x0 = 123 != 124 = x1.
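
In kernel terms, step 2 looks roughly like the following pseudocode sketch, heavily simplified from uvmfault_promote() and its callers in sys/uvm/uvm_fault.c; locking, amap/anon bookkeeping, and error handling are elided, and exactly where the shootdown IPIs go out varies by pmap implementation:

pg = uvm_pagealloc(...);                /* 2(a) allocate new physical page */
uvm_pagecopy(opg, pg);                  /* 2(b) copy old page to new page */
pmap_enter(pmap, va, VM_PAGE_TO_PHYS(pg),
    VM_PROT_READ|VM_PROT_WRITE, flags); /* 2(c) map new page read/write */
pmap_update(pmap);                      /* 2(d) send TLB shootdown IPIs */

Until a CPU handles the IPI from 2(d), its stale TLB entry can still satisfy reads of the old physical page at the same virtual address.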

Note the program itself is structured so that all access to f->x is serialized by a lock, which internally executes all appropriate memory barriers as usual.

The problem is that the pmap update is interleaved with program execution in different orders on different CPUs/threads: in this scenario, thread B sees it before lock(); f->x = 124; unlock(), but thread C sees it after lock(); x0 = f->x, even though there is a happens-before relation ordering thread B's f->x = 124 store before thread C's x0 = f->x load in the C program.
>How-To-Repeat:
#include <sys/mman.h>
#include <sys/wait.h>

#include <assert.h>
#include <err.h>
#include <pthread.h>
#include <stddef.h>
#include <stdio.h>
#include <unistd.h>

struct foo {
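	/*
	 * x and y live in the same page, so thread A's stores to y
	 * trigger COW of the page that also holds x; the padding
	 * presumably keeps x and y on separate cache lines so that
	 * A's stores don't contend with B's and C's accesses to x.
	 */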
	long x;
	char pad[512];
	int y;
};

static pthread_mutex_t mutex;

static void
lock(void)
{
	int error;

	error = pthread_mutex_lock(&mutex);
	if (error)
		errc(1, error, "lock@%p", __builtin_return_address(0));
}

static void
unlock(void)
{
	int error;

	error = pthread_mutex_unlock(&mutex);
	if (error)
		errc(1, error, "lock@%p", __builtin_return_address(0));
}

static void * __dead
a(void *cookie)
{
	volatile struct foo *f = cookie;

	for (;; pthread_testcancel())
		f->y = 1;
}

static void * __dead
b(void *cookie)
{
	volatile struct foo *f = cookie;
	long x;

	for (x = 0;; x++, pthread_testcancel()) {
		lock();
		f->x = x;
		unlock();
	}
}

static void
fence_it_up(void)
{
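	/*
	 * Compiler and CPU barrier between the two loads of f->x in
	 * c(): cpuid is a serializing instruction on x86, and the
	 * extra delay presumably widens the window in which the TLB
	 * shootdown IPI can be taken between the loads.
	 */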
	int eax, ebx, ecx, edx;

	asm volatile("xorl %%eax,%%eax; cpuid; pause"
	    : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx));
}

static void * __dead
c(void *cookie)
{
	volatile struct foo *f = cookie;
	long x0, x1;

	for (;; pthread_testcancel()) {
		lock();
		x0 = f->x;
		fence_it_up();
		x1 = f->x;
		unlock();

		if (x0 != x1)
			errx(1, "%ld != %ld", x0, x1);
	}
}

static void
cow_it_up(void)
{
	long rax = 2;		/* SYS_FORK */

	asm volatile(
		"	syscall\n"
		"	cmpq $0,%%rax\n"
		"	jne 1f\n"
		"	movq $0,%%rdi\n"	/* child */
		"	movq $1,%%rax\n" /* SYS_EXIT */
		"	syscall\n"
		"	hlt\n"
		"1:"
		: /*output*/ "+a"(rax)
		: /*input*/
		: /*clobber*/ "memory");
	if (rax == -1)
		err(1, "fork");
	if (waitpid(rax, NULL, 0) == -1)
		err(1, "waitpid");
}
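
/*
 * Hypothetical portable variant of cow_it_up(), not used above or
 * below: the same fork-and-wait via the libc wrappers.  The
 * raw-syscall version presumably bypasses libc's fork path
 * (pthread_atfork handlers and the like), which may matter for
 * timing.
 */
static void __unused
cow_it_up_portable(void)
{
	pid_t pid;

	pid = fork();
	if (pid == -1)
		err(1, "fork");
	if (pid == 0)
		_exit(0);	/* child exits; parent's mapping stays COW */
	if (waitpid(pid, NULL, 0) == -1)
		err(1, "waitpid");
}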

int
main(void)
{
	static void *(*f[])(void *) = {a, b, c};
	pthread_t t[__arraycount(f)];
	size_t pagesize = sysconf(_SC_PAGESIZE);
	void *p;
	unsigned i;
	int error;

	assert(pagesize >= sizeof(struct foo));

	p = mmap(/*addr*/NULL, pagesize, PROT_READ|PROT_WRITE,
	    MAP_ANON|MAP_PRIVATE, /*fd*/-1, /*offset*/0);
	if (p == MAP_FAILED)
		err(1, "mmap");

	error = pthread_mutex_init(&mutex, NULL);
	if (error)
		errc(1, error, "pthread_mutex_init");

	for (i = 0; i < __arraycount(f); i++) {
		error = pthread_create(&t[i], NULL, f[i], p);
		if (error)
			errc(1, error, "pthread_create %u", i);
	}
	for (i = 0; i < 100000; i++) {
		if ((i % 10000) == 0)
			fprintf(stderr, "%u\n", i);
		cow_it_up();
	}
	for (i = 0; i < __arraycount(f); i++) {
		fprintf(stderr, "cancel %u\n", i);
		error = pthread_cancel(t[i]);
		if (error)
			errc(1, error, "pthread_cancel %u", i);
	}
	for (i = 0; i < __arraycount(f); i++) {
		fprintf(stderr, "join %u\n", i);
		error = pthread_join(t[i], NULL);
		if (error)
			errc(1, error, "pthread_join %u", i);
	}

	error = pthread_mutex_destroy(&mutex);
	if (error)
		errc(1, error, "pthread_mutex_destroy");

	if (munmap(p, pagesize) == -1)
		err(1, "munmap");

	return 0;
}
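
(Build with something like cc -O2 -pthread cowbug.c, assuming the file is saved as cowbug.c, and run on an idle multiprocessor machine; the raw syscall stubs make the reproducer amd64-only as written.)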

>Fix:
Various fixes are possible.  A cheap fix is this:

diff --git a/sys/uvm/uvm_fault.c b/sys/uvm/uvm_fault.c
index 7e5133f124e9..97a1986afb87 100644
--- a/sys/uvm/uvm_fault.c
+++ b/sys/uvm/uvm_fault.c
@@ -635,6 +635,7 @@ uvmfault_promote(struct uvm_faultinfo *ufi,
 
 	/* copy page [pg now dirty] */
 	if (opg) {
+		pmap_page_protect(opg, VM_PROT_NONE);
 		uvm_pagecopy(opg, pg);
 	}
 	KASSERT(uvm_pagegetdirty(pg) == UVM_PAGE_STATUS_DIRTY);

This inserts a step between 2(a) and 2(b) to block all access to the old physical page until the COW has completed.  The scenario above can't happen with this change: by the time thread B can write to the new physical page in step (4), it is guaranteed that thread C _either_ sees the new physical page too, _or_ sees the old physical page mapped PROT_NONE, so that the attempt to read in step (5) will block until the COW has completed and it sees the new physical page instead.
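
With the change, the pseudocode sketch from the description becomes (same hedging as before: simplified, with locking and error handling elided):

pg = uvm_pagealloc(...);                /* 2(a) */
pmap_page_protect(opg, VM_PROT_NONE);   /* unmap old page everywhere */
uvm_pagecopy(opg, pg);                  /* 2(b) */
pmap_enter(pmap, va, VM_PAGE_TO_PHYS(pg),
    VM_PROT_READ|VM_PROT_WRITE, flags); /* 2(c) */
pmap_update(pmap);                      /* 2(d) */

A thread that touches the page while it is mapped PROT_NONE takes a fault and blocks until the promotion completes, after which it can only see the new page's contents.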

This solution is suboptimal, but it is quick to implement, it's already been tested, and it's easy to be confident in its correctness, so it's fit for pullup to release branches.

This change _also_ has the effect of suppressing the bug described in https://github.com/golang/go/issues/34988, but I have no theory for how it could have that effect other than by changing timing or something.  The bug here depends on the TLB IPI handler running between the two loads, which separates them by at least hundreds of nanoseconds, if not microseconds or more; in that bug, we observed different values loaded over the course of about 10ns, in which time it is not physically possible for the TLB IPI handler to have run.


