tech-kern archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

Re: kernel memory allocation failures



On Fri, Dec 11, 2015 at 12:16:19PM -0500, Christos Zoulas wrote:
> On Dec 11,  8:54am, chuq%chuq.com@localhost (Chuck Silvers) wrote:
> -- Subject: Re: kernel memory allocation failures
> 
> | I haven't looked see exactly what the internal conditions are that lead to
> | kmem_alloc(KM_SLEEP) returning NULL, so it's not clear whether having
> | kmem_alloc() retry or panic in that situation would be better.
> | but either of those would be better than adding code to all the callers.
> 
>     http://nxr.netbsd.org/xref/src/sys/kern/subr_vmem.c#256
> 
> Where we are calling the pool code with PR_NOWAIT and the flags are not paid
> attention to at all (except in the recursive calls).

the attached patch makes bt_alloc() retry the bt_refill() when called with VM_SLEEP,
which I think removes all the ways that vmem_alloc() with VM_SLEEP
(and thus kmem_alloc() with KM_SLEEP) can fail.


> And there is also:
> 
>     http://nxr.netbsd.org/xref/src/sys/kern/subr_vmem.c#272
> 
> where the 3 bt_refill() calls are not even being checked (but at least
> they use flags :-)

the error is not checked because these are just a precaution,
the function will not do anything different if these fail.
I'll cast them to void.


I'll commit this patch in a few days if there are no complaints.

-Chuck
Index: src/sys/kern/subr_vmem.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/sys/kern/subr_vmem.c,v
retrieving revision 1.93
diff -u -p -r1.93 subr_vmem.c
--- src/sys/kern/subr_vmem.c	24 Aug 2015 22:50:32 -0000	1.93
+++ src/sys/kern/subr_vmem.c	15 Feb 2016 20:05:36 -0000
@@ -194,6 +194,16 @@ static LIST_HEAD(, vmem_btag) vmem_btag_
 static size_t vmem_btag_freelist_count = 0;
 static struct pool vmem_btag_pool;
 
+static void
+vmem_kick_pdaemon(void)
+{
+#if defined(_KERNEL)
+	mutex_spin_enter(&uvm_fpageqlock);
+	uvm_kick_pdaemon();
+	mutex_spin_exit(&uvm_fpageqlock);
+#endif
+}
+
 /* ---- boundary tag */
 
 static int bt_refill(vmem_t *vm, vm_flag_t flags);
@@ -270,11 +280,11 @@ bt_refill(vmem_t *vm, vm_flag_t flags)
 	VMEM_UNLOCK(vm);
 
 	if (kmem_meta_arena != NULL) {
-		bt_refill(kmem_arena, (flags & ~VM_FITMASK)
+		(void)bt_refill(kmem_arena, (flags & ~VM_FITMASK)
 		    | VM_INSTANTFIT | VM_POPULATING);
-		bt_refill(kmem_va_meta_arena, (flags & ~VM_FITMASK)
+		(void)bt_refill(kmem_va_meta_arena, (flags & ~VM_FITMASK)
 		    | VM_INSTANTFIT | VM_POPULATING);
-		bt_refill(kmem_meta_arena, (flags & ~VM_FITMASK)
+		(void)bt_refill(kmem_meta_arena, (flags & ~VM_FITMASK)
 		    | VM_INSTANTFIT | VM_POPULATING);
 	}
 
@@ -289,7 +299,21 @@ bt_alloc(vmem_t *vm, vm_flag_t flags)
 	while (vm->vm_nfreetags <= BT_MINRESERVE && (flags & VM_POPULATING) == 0) {
 		VMEM_UNLOCK(vm);
 		if (bt_refill(vm, VM_NOSLEEP | VM_INSTANTFIT)) {
-			return NULL;
+			if ((flags & VM_NOSLEEP) != 0) {
+				return NULL;
+			}
+
+			/*
+			 * It would be nice to wait for something specific here
+			 * but there are multiple ways that a retry could
+			 * succeed and we can't wait for multiple things
+			 * simultaneously.  So we'll just sleep for an arbitrary
+			 * short period of time and retry regardless.
+			 * This should be a very rare case.
+			 */
+
+			vmem_kick_pdaemon();
+			kpause("btalloc", false, 1, NULL);
 		}
 		VMEM_LOCK(vm);
 	}
@@ -1177,11 +1201,7 @@ retry:
 	/* XXX */
 
 	if ((flags & VM_SLEEP) != 0) {
-#if defined(_KERNEL)
-		mutex_spin_enter(&uvm_fpageqlock);
-		uvm_kick_pdaemon();
-		mutex_spin_exit(&uvm_fpageqlock);
-#endif
+		vmem_kick_pdaemon();
 		VMEM_LOCK(vm);
 		VMEM_CONDVAR_WAIT(vm);
 		VMEM_UNLOCK(vm);
Index: src/sys/kern/subr_kmem.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/sys/kern/subr_kmem.c,v
retrieving revision 1.61
diff -u -p -r1.61 subr_kmem.c
--- src/sys/kern/subr_kmem.c	27 Jul 2015 09:24:28 -0000	1.61
+++ src/sys/kern/subr_kmem.c	14 Feb 2016 18:13:48 -0000
@@ -383,9 +383,13 @@ kmem_intr_free(void *p, size_t requested
 void *
 kmem_alloc(size_t size, km_flag_t kmflags)
 {
+	void *v;
+
 	KASSERTMSG((!cpu_intr_p() && !cpu_softintr_p()),
 	    "kmem(9) should not be used from the interrupt context");
-	return kmem_intr_alloc(size, kmflags);
+	v = kmem_intr_alloc(size, kmflags);
+	KASSERT(v || (kmflags & KM_NOSLEEP) != 0);
+	return v;
 }
 
 /*
@@ -396,9 +400,13 @@ kmem_alloc(size_t size, km_flag_t kmflag
 void *
 kmem_zalloc(size_t size, km_flag_t kmflags)
 {
+	void *v;
+
 	KASSERTMSG((!cpu_intr_p() && !cpu_softintr_p()),
 	    "kmem(9) should not be used from the interrupt context");
-	return kmem_intr_zalloc(size, kmflags);
+	v = kmem_intr_zalloc(size, kmflags);
+	KASSERT(v || (kmflags & KM_NOSLEEP) != 0);
+	return v;
 }
 
 /*
Index: src/share/man/man9/vmem.9
===================================================================
RCS file: /home/chs/netbsd/cvs/src/share/man/man9/vmem.9,v
retrieving revision 1.15
diff -u -p -r1.15 vmem.9
--- src/share/man/man9/vmem.9	29 Jan 2013 22:02:17 -0000	1.15
+++ src/share/man/man9/vmem.9	15 Feb 2016 19:39:43 -0000
@@ -83,7 +83,7 @@ other than virtual memory.
 .Fn vmem_create
 creates a new vmem arena.
 .Pp
-.Bl -tag -width qcache_max
+.Bl -tag -offset indent -width qcache_max
 .It Fa name
 The string to describe the vmem.
 .It Fa base
@@ -118,7 +118,7 @@ calls
 to import a span of size at least
 .Fa size .
 .Fa allocfn
-should accept the same
+must accept the same
 .Fa flags
 as
 .Fn vmem_alloc .
@@ -169,7 +169,8 @@ It is merely a hint and can be ignored.
 Either of:
 .Bl -tag -width VM_NOSLEEP
 .It Dv VM_SLEEP
-Can sleep until enough resources are available.
+If the allocation cannot be satisfied immediately, sleep until enough
+resources are available.
 .It Dv VM_NOSLEEP
 Don't sleep.
 Immediately return
@@ -184,7 +185,7 @@ Interrupt level to be blocked for alloca
 .Fn vmem_xcreate
 creates a new vmem arena.
 .Pp
-.Bl -tag -width qcache_max
+.Bl -tag -offset indent -width qcache_max
 .It Fa name
 The string to describe the vmem.
 .It Fa base
@@ -220,7 +221,7 @@ calls
 to import a span of size at least
 .Fa size .
 .Fa allocfn
-should accept the same
+must accept the same
 .Fa flags
 as
 .Fn vmem_alloc .
@@ -274,7 +275,8 @@ It is merely a hint and can be ignored.
 Either of:
 .Bl -tag -width VM_NOSLEEP
 .It Dv VM_SLEEP
-Can sleep until enough resources are available.
+If the allocation cannot be satisfied immediately, sleep until enough
+resources are available.
 .It Dv VM_NOSLEEP
 Don't sleep.
 Immediately return
@@ -297,23 +299,26 @@ Returns
 on success,
 .Dv ENOMEM
 on failure.
-.Fa flags
-should be one of:
+.Bl -tag -offset indent -width flags
+.It Fa flags
+Either of:
 .Bl -tag -width VM_NOSLEEP
 .It Dv VM_SLEEP
-Can sleep until enough resources are available.
+If the allocation cannot be satisfied immediately, sleep until enough
+resources are available.
 .It Dv VM_NOSLEEP
 Don't sleep.
 Immediately return
 .Dv ENOMEM
 if there are not enough resources available.
 .El
+.El
 .Pp
 .\" - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 .Fn vmem_xalloc
 allocates a resource from the arena.
 .Pp
-.Bl -tag -width nocross
+.Bl -tag -offset indent -width nocross
 .It Fa vm
 The arena which we allocate from.
 .It Fa size
@@ -333,10 +338,10 @@ If
 .Fa align
 is zero,
 .Fa phase
-should be zero.
+must be zero.
 Otherwise,
 .Fa phase
-should be smaller than
+must be smaller than
 .Fa align .
 .It Fa nocross
 Request a resource which doesn't cross
@@ -353,7 +358,7 @@ if the caller does not care.
 .It Fa flags
 A bitwise OR of an allocation strategy and a sleep flag.
 .Pp
-The allocation strategy is one of:
+The allocation strategy must be one of:
 .Bl -tag -width VM_INSTANTFIT
 .It Dv VM_BESTFIT
 Prefer space efficiency.
@@ -361,10 +366,11 @@ Prefer space efficiency.
 Prefer performance.
 .El
 .Pp
-The sleep flag should be one of:
+The sleep flag must be one of:
 .Bl -tag -width VM_NOSLEEP
 .It Dv VM_SLEEP
-Can sleep until enough resources are available.
+If the allocation cannot be satisfied immediately, sleep until enough
+resources are available.
 .It Dv VM_NOSLEEP
 Don't sleep.
 Immediately return
@@ -386,14 +392,14 @@ frees resource allocated by
 .Fn vmem_xalloc
 to the arena.
 .Pp
-.Bl -tag -width addr
+.Bl -tag -offset indent -width addr
 .It Fa vm
 The arena which we free to.
 .It Fa addr
 The resource being freed.
-It must be the one returned by
+It must have been allocated via
 .Fn vmem_xalloc .
-Notably, it must not be the one from
+Notably, it must not have been allocated via
 .Fn vmem_alloc .
 Otherwise, the behaviour is undefined.
 .It Fa size
@@ -408,7 +414,7 @@ argument used for
 .Fn vmem_alloc
 allocates a resource from the arena.
 .Pp
-.Bl -tag -width flags
+.Bl -tag -offset indent -width flags
 .It Fa vm
 The arena which we allocate from.
 .It Fa size
@@ -416,7 +422,7 @@ Specify the size of the allocation.
 .It Fa flags
 A bitwise OR of an allocation strategy and a sleep flag.
 .Pp
-The allocation strategy is one of:
+The allocation strategy must be one of:
 .Bl -tag -width VM_INSTANTFIT
 .It Dv VM_BESTFIT
 Prefer space efficiency.
@@ -424,10 +430,11 @@ Prefer space efficiency.
 Prefer performance.
 .El
 .Pp
-The sleep flag should be one of:
+The sleep flag must be one of:
 .Bl -tag -width VM_NOSLEEP
 .It Dv VM_SLEEP
-Can sleep until enough resources are available.
+If the allocation cannot be satisfied immediately, sleep until enough
+resources are available.
 .It Dv VM_NOSLEEP
 Don't sleep.
 Immediately return
@@ -449,14 +456,14 @@ frees resource allocated by
 .Fn vmem_alloc
 to the arena.
 .Pp
-.Bl -tag -width addr
+.Bl -tag -offset indent -width addr
 .It Fa vm
 The arena which we free to.
 .It Fa addr
 The resource being freed.
-It must be the one returned by
+It must have been allocated via
 .Fn vmem_alloc .
-Notably, it must not be the one from
+Notably, it must not have been allocated via
 .Fn vmem_xalloc .
 Otherwise, the behaviour is undefined.
 .It Fa size
@@ -471,10 +478,10 @@ argument used for
 .Fn vmem_destroy
 destroys a vmem arena.
 .Pp
-.Bl -tag -width vm
+.Bl -tag -offset indent -width vm
 .It Fa vm
 The vmem arena being destroyed.
-The caller should ensure that no one will use it anymore.
+The caller must ensure that no one will use it anymore.
 .El
 .\" ------------------------------------------------------------
 .Sh RETURN VALUES
Index: src/share/man/man9/kmem.9
===================================================================
RCS file: /home/chs/netbsd/cvs/src/share/man/man9/kmem.9,v
retrieving revision 1.19
diff -u -p -r1.19 kmem.9
--- src/share/man/man9/kmem.9	11 Dec 2015 10:05:17 -0000	1.19
+++ src/share/man/man9/kmem.9	14 Feb 2016 18:22:31 -0000
@@ -77,15 +77,9 @@ Either of the following:
 .It Dv KM_SLEEP
 If the allocation cannot be satisfied immediately, sleep until enough
 memory is available.
-Note that this does not mean that if
+If
 .Dv KM_SLEEP
 is specified, then the allocation cannot fail.
-Under resource stress conditions, the allocation can fail and the
-function will return
-.Dv NULL .
-One such scenario is when the allocation size is larger than it can ever
-be allocated; another is when the system memory resources are exhausted
-to even allocate pools of pages.
 .It Dv KM_NOSLEEP
 Don't sleep.
 Immediately return
@@ -143,9 +137,6 @@ It must be the one returned by
 .Fn kmem_alloc
 or
 .Fn kmem_zalloc .
-One such scenario is when the allocation size is larger than it can ever
-be allocated; another is when the system memory resources are exhausted
-to even allocate pools of pages.
 .It Fa size
 The size of the memory being freed, in bytes.
 It must be the same as the
@@ -185,10 +176,6 @@ It should be noted that
 .Fn kmem_free
 may also block.
 .Pp
-Always check the return value of the allocators, even when
-.Dv KM_SLEEP
-is specified to avoid kernel crashes during resource stress conditions.
-.Pp
 For some locks this is permissible or even unavoidable.
 For others, particularly locks that may be taken from soft interrupt context,
 it is a serious problem.


Home | Main Index | Thread Index | Old Index