Subject: more locks..
To: None <tech-smp@netbsd.org>
From: Paul Kranenburg <pk@cs.few.eur.nl>
List: tech-smp
Date: 02/02/2003 23:44:28
--ELM74055137-22799-0_
Content-Transfer-Encoding: 7bit
Content-Type: text/plain; charset=US-ASCII
I've looked into making several kernel modules MP-safer. Attached is
a set of diffs for the buffer cache (mostly vfs_bio.c) for your
inspection. With this patch, my 2-cpu machine can run the buffer cache
code concurrently, although I didn't push it very hard (just ffs & lfs
on a scsi disk).
Note that the patch omits a bunch of files that need trivial changes, such
as initializing a buffer's simple lock.
-pk
--ELM74055137-22799-0_
Content-Transfer-Encoding: 7bit
Content-Type: text/plain; charset=US-ASCII
Content-Disposition: attachment; filename=DIFF
Index: sys/buf.h
===================================================================
RCS file: /cvsroot/src/sys/sys/buf.h,v
retrieving revision 1.56
diff -u -r1.56 buf.h
--- sys/buf.h 2003/02/01 21:07:02 1.56
+++ sys/buf.h 2003/02/02 22:27:07
@@ -82,6 +82,7 @@
#include <sys/pool.h>
#include <sys/queue.h>
+#include <sys/lock.h>
struct buf;
struct mount;
@@ -158,6 +159,7 @@
TAILQ_ENTRY(buf) b_actq; /* Device driver queue when active. */
struct proc *b_proc; /* Associated proc if B_PHYS set. */
volatile long b_flags; /* B_* flags. */
+ struct simplelock b_interlock; /* Lock for b_flags changes */
int b_error; /* Errno value. */
long b_bufsize; /* Allocated buffer size. */
long b_bcount; /* Valid bytes in buffer. */
Index: kern/vfs_bio.c
===================================================================
RCS file: /cvsroot/src/sys/kern/vfs_bio.c,v
retrieving revision 1.86
diff -u -r1.86 vfs_bio.c
--- kern/vfs_bio.c 2003/01/18 10:06:37 1.86
+++ kern/vfs_bio.c 2003/02/02 22:27:17
@@ -103,26 +103,35 @@
int needbuffer;
/*
+ * Buffer queue lock.
+ * Take this lock first if also taking some buffer's b_interlock.
+ */
+struct simplelock bqueue_slock = SIMPLELOCK_INITIALIZER;
+extern struct simplelock global_v_numoutput_slock;/*XXX*/
+
+/*
* Buffer pool for I/O buffers.
*/
struct pool bufpool;
/*
+ * bread()/breadn() helper.
+ */
+static __inline struct buf *bio_doread(struct vnode *, daddr_t, int,
+ struct ucred *, int);
+int count_lock_queue(void);
+
+/*
* Insq/Remq for the buffer free lists.
+ * Call with buffer queue locked.
*/
#define binsheadfree(bp, dp) TAILQ_INSERT_HEAD(dp, bp, b_freelist)
#define binstailfree(bp, dp) TAILQ_INSERT_TAIL(dp, bp, b_freelist)
-static __inline struct buf *bio_doread __P((struct vnode *, daddr_t, int,
- struct ucred *, int));
-int count_lock_queue __P((void));
-
void
bremfree(bp)
struct buf *bp;
{
- int s = splbio();
-
struct bqueues *dp = NULL;
/*
@@ -140,7 +149,6 @@
panic("bremfree: lost tail");
}
TAILQ_REMOVE(dp, bp, b_freelist);
- splx(s);
}
/*
@@ -168,6 +176,7 @@
for (i = 0; i < nbuf; i++) {
bp = &buf[i];
memset((char *)bp, 0, sizeof(*bp));
+ simple_lock_init(&bp->b_interlock);
bp->b_dev = NODEV;
bp->b_vnbufs.le_next = NOLIST;
LIST_INIT(&bp->b_dep);
@@ -206,7 +215,7 @@
/*
* If buffer does not have data valid, start a read.
* Note that if buffer is B_INVAL, getblk() won't return it.
- * Therefore, it's valid if it's I/O has completed or been delayed.
+ * Therefore, it's valid if its I/O has completed or been delayed.
*/
if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) {
/* Start I/O for the buffer. */
@@ -307,6 +316,8 @@
struct vnode *vp;
struct mount *mp;
+ KASSERT(ISSET(bp->b_flags, B_BUSY));
+
vp = bp->b_vp;
if (vp != NULL) {
if (vp->v_type == VBLK)
@@ -345,6 +356,7 @@
wasdelayed = ISSET(bp->b_flags, B_DELWRI);
s = splbio();
+ simple_lock(&bp->b_interlock);
CLR(bp->b_flags, (B_READ | B_DONE | B_ERROR | B_DELWRI));
@@ -358,7 +370,10 @@
p->p_stats->p_ru.ru_oublock++;
/* Initiate disk write. Make sure the appropriate party is charged. */
+ simple_lock(&global_v_numoutput_slock);
bp->b_vp->v_numoutput++;
+ simple_unlock(&global_v_numoutput_slock);
+ simple_unlock(&bp->b_interlock);
splx(s);
VOP_STRATEGY(bp);
@@ -407,6 +422,8 @@
const struct bdevsw *bdev;
int s;
+ KASSERT(ISSET(bp->b_flags, B_BUSY));
+
/* If this is a tape block, write the block now. */
/* XXX NOTE: the memory filesystem usurpes major device */
/* XXX number 4095, which is a bad idea. */
@@ -425,6 +442,7 @@
* (3) Make sure it's on its vnode's correct block list.
*/
s = splbio();
+ simple_lock(&bp->b_interlock);
if (!ISSET(bp->b_flags, B_DELWRI)) {
SET(bp->b_flags, B_DELWRI);
@@ -434,6 +452,7 @@
/* Otherwise, the "write" is done, so mark and release the buffer. */
CLR(bp->b_flags, B_NEEDCOMMIT|B_DONE);
+ simple_unlock(&bp->b_interlock);
splx(s);
brelse(bp);
@@ -446,8 +465,15 @@
bawrite(bp)
struct buf *bp;
{
+ int s;
+ KASSERT(ISSET(bp->b_flags, B_BUSY));
+
+ s = splbio();
+ simple_lock(&bp->b_interlock);
SET(bp->b_flags, B_ASYNC);
+ simple_unlock(&bp->b_interlock);
+ splx(s);
VOP_BWRITE(bp);
}
@@ -462,7 +488,10 @@
struct proc *p = l->l_proc;
int s;
+ KASSERT(ISSET(bp->b_flags, B_BUSY));
+
s = splbio();
+ simple_lock(&bp->b_interlock);
CLR(bp->b_flags, B_AGE);
@@ -472,6 +501,7 @@
reassignbuf(bp, bp->b_vp);
}
+ simple_unlock(&bp->b_interlock);
splx(s);
}
@@ -488,15 +518,17 @@
KASSERT(ISSET(bp->b_flags, B_BUSY));
+ /* Block disk interrupts. */
+ s = splbio();
+ simple_lock(&bqueue_slock);
+ simple_lock(&bp->b_interlock);
+
/* Wake up any processes waiting for any buffer to become free. */
if (needbuffer) {
needbuffer = 0;
wakeup(&needbuffer);
}
- /* Block disk interrupts. */
- s = splbio();
-
/* Wake up any proceeses waiting for _this_ buffer to become free. */
if (ISSET(bp->b_flags, B_WANTED)) {
CLR(bp->b_flags, B_WANTED|B_AGE);
@@ -584,6 +616,8 @@
SET(bp->b_flags, B_CACHE);
/* Allow disk interrupts. */
+ simple_unlock(&bp->b_interlock);
+ simple_unlock(&bqueue_slock);
splx(s);
}
@@ -629,17 +663,21 @@
int s, err;
start:
+ s = splbio();
+ simple_lock(&bqueue_slock);
bp = incore(vp, blkno);
if (bp != NULL) {
- s = splbio();
+ simple_lock(&bp->b_interlock);
if (ISSET(bp->b_flags, B_BUSY)) {
+ simple_unlock(&bqueue_slock);
if (curproc == uvm.pagedaemon_proc) {
+ simple_unlock(&bp->b_interlock);
splx(s);
return NULL;
}
SET(bp->b_flags, B_WANTED);
- err = tsleep(bp, slpflag | (PRIBIO + 1), "getblk",
- slptimeo);
+ err = ltsleep(bp, slpflag | (PRIBIO + 1) | PNORELOCK,
+ "getblk", slptimeo, &bp->b_interlock);
splx(s);
if (err)
return (NULL);
@@ -652,17 +690,20 @@
#endif
SET(bp->b_flags, B_BUSY);
bremfree(bp);
- splx(s);
} else {
- if ((bp = getnewbuf(slpflag, slptimeo)) == NULL)
+ if ((bp = getnewbuf(slpflag, slptimeo)) == NULL) {
+ simple_unlock(&bqueue_slock);
+ splx(s);
goto start;
+ }
binshash(bp, BUFHASH(vp, blkno));
bp->b_blkno = bp->b_lblkno = bp->b_rawblkno = blkno;
- s = splbio();
bgetvp(vp, bp);
- splx(s);
}
+ simple_unlock(&bp->b_interlock);
+ simple_unlock(&bqueue_slock);
+ splx(s);
allocbuf(bp, size);
return (bp);
}
@@ -675,11 +716,18 @@
int size;
{
struct buf *bp;
+ int s;
+ s = splbio();
+ simple_lock(&bqueue_slock);
while ((bp = getnewbuf(0, 0)) == 0)
;
+
SET(bp->b_flags, B_INVAL);
binshash(bp, &invalhash);
+ simple_unlock(&bqueue_slock);
+ simple_unlock(&bp->b_interlock);
+ splx(s);
allocbuf(bp, size);
return (bp);
}
@@ -717,12 +765,18 @@
int amt;
/* find a buffer */
+ s = splbio();
+ simple_lock(&bqueue_slock);
while ((nbp = getnewbuf(0, 0)) == NULL)
;
SET(nbp->b_flags, B_INVAL);
binshash(nbp, &invalhash);
+ simple_unlock(&nbp->b_interlock);
+ simple_unlock(&bqueue_slock);
+ splx(s);
+
/* and steal its pages, up to the amount we need */
amt = min(nbp->b_bufsize, (desired_size - bp->b_bufsize));
pagemove((nbp->b_data + nbp->b_bufsize - amt),
@@ -738,7 +792,6 @@
if (nbp->b_bufsize < 0)
panic("allocbuf: negative bufsize");
#endif
-
brelse(nbp);
}
@@ -750,13 +803,17 @@
*/
if (bp->b_bufsize > desired_size) {
s = splbio();
+ simple_lock(&bqueue_slock);
if ((nbp = TAILQ_FIRST(&bufqueues[BQ_EMPTY])) == NULL) {
/* No free buffer head */
+ simple_unlock(&bqueue_slock);
splx(s);
goto out;
}
+ /* No need to lock nbp since it came from the empty queue */
bremfree(nbp);
- SET(nbp->b_flags, B_BUSY);
+ SET(nbp->b_flags, B_BUSY | B_INVAL);
+ simple_unlock(&bqueue_slock);
splx(s);
/* move the page to it and note this change */
@@ -765,7 +822,6 @@
nbp->b_bufsize = bp->b_bufsize - desired_size;
bp->b_bufsize = desired_size;
nbp->b_bcount = 0;
- SET(nbp->b_flags, B_INVAL);
/* release the newly-filled buffer and leave */
brelse(nbp);
@@ -779,24 +835,28 @@
* Find a buffer which is available for use.
* Select something from a free list.
* Preference is to AGE list, then LRU list.
+ *
+ * Called with buffer queues locked.
+ * Return buffer locked.
*/
struct buf *
getnewbuf(slpflag, slptimeo)
int slpflag, slptimeo;
{
struct buf *bp;
- int s;
start:
- s = splbio();
+ LOCK_ASSERT(simple_lock_held(&bqueue_slock));
+
if ((bp = TAILQ_FIRST(&bufqueues[BQ_AGE])) != NULL ||
(bp = TAILQ_FIRST(&bufqueues[BQ_LRU])) != NULL) {
+ simple_lock(&bp->b_interlock);
bremfree(bp);
} else {
/* wait for a free buffer of any kind */
needbuffer = 1;
- tsleep(&needbuffer, slpflag|(PRIBIO+1), "getnewbuf", slptimeo);
- splx(s);
+ ltsleep(&needbuffer, slpflag|(PRIBIO+1),
+ "getnewbuf", slptimeo, &bqueue_slock);
return (NULL);
}
@@ -808,7 +868,7 @@
*/
CLR(bp->b_flags, B_VFLUSH);
SET(bp->b_flags, B_AGE);
- splx(s);
+ simple_unlock(&bp->b_interlock);
goto start;
}
@@ -820,12 +880,12 @@
* (since we might sleep while starting the write).
*/
if (ISSET(bp->b_flags, B_DELWRI)) {
- splx(s);
/*
* This buffer has gone through the LRU, so make sure it gets
* reused ASAP.
*/
SET(bp->b_flags, B_AGE);
+ simple_unlock(&bp->b_interlock);
bawrite(bp);
return (NULL);
}
@@ -833,7 +893,6 @@
/* disassociate us from our vnode, if we had one... */
if (bp->b_vp)
brelvp(bp);
- splx(s);
if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate)
(*bioops.io_deallocate)(bp);
@@ -859,21 +918,25 @@
biowait(bp)
struct buf *bp;
{
- int s;
+ int s, error;
s = splbio();
+ simple_lock(&bp->b_interlock);
while (!ISSET(bp->b_flags, B_DONE | B_DELWRI))
- tsleep(bp, PRIBIO + 1, "biowait", 0);
- splx(s);
+ ltsleep(bp, PRIBIO + 1, "biowait", 0, &bp->b_interlock);
/* check for interruption of I/O (e.g. via NFS), then errors. */
if (ISSET(bp->b_flags, B_EINTR)) {
CLR(bp->b_flags, B_EINTR);
- return (EINTR);
+ error = EINTR;
} else if (ISSET(bp->b_flags, B_ERROR))
- return (bp->b_error ? bp->b_error : EIO);
+ error = bp->b_error ? bp->b_error : EIO;
else
- return (0);
+ error = 0;
+
+ simple_unlock(&bp->b_interlock);
+ splx(s);
+ return (error);
}
/*
@@ -898,6 +961,7 @@
{
int s = splbio();
+ simple_lock(&bp->b_interlock);
if (ISSET(bp->b_flags, B_DONE))
panic("biodone already");
SET(bp->b_flags, B_DONE); /* note that it's done */
@@ -910,12 +974,15 @@
if (ISSET(bp->b_flags, B_CALL)) { /* if necessary, call out */
CLR(bp->b_flags, B_CALL); /* but note callout done */
+ simple_unlock(&bp->b_interlock);
(*bp->b_iodone)(bp);
} else {
- if (ISSET(bp->b_flags, B_ASYNC)) /* if async, release */
+ if (ISSET(bp->b_flags, B_ASYNC)) { /* if async, release */
+ simple_unlock(&bp->b_interlock);
brelse(bp);
- else { /* or just wakeup the buffer */
+ } else { /* or just wakeup the buffer */
CLR(bp->b_flags, B_WANTED);
+ simple_unlock(&bp->b_interlock);
wakeup(bp);
}
}
@@ -932,8 +999,10 @@
struct buf *bp;
int n = 0;
+ simple_lock(&bqueue_slock);
TAILQ_FOREACH(bp, &bufqueues[BQ_LOCKED], b_freelist)
n++;
+ simple_unlock(&bqueue_slock);
return (n);
}
Index: kern/vfs_subr.c
===================================================================
RCS file: /cvsroot/src/sys/kern/vfs_subr.c,v
retrieving revision 1.186
diff -u -r1.186 vfs_subr.c
--- kern/vfs_subr.c 2003/02/01 06:23:45 1.186
+++ kern/vfs_subr.c 2003/02/02 22:27:36
@@ -158,6 +158,9 @@
struct simplelock vnode_free_list_slock = SIMPLELOCK_INITIALIZER;
struct simplelock spechash_slock = SIMPLELOCK_INITIALIZER;
+/* XXX - gross; single global lock to protect v_numoutput */
+struct simplelock global_v_numoutput_slock = SIMPLELOCK_INITIALIZER;
+
/*
* These define the root filesystem and device.
*/
@@ -640,12 +643,18 @@
struct vnode *vp;
if ((vp = bp->b_vp) != NULL) {
+ /* XXX global lock hack
+ * can't use v_interlock here since this is called
+ * in interrupt context from biodone().
+ */
+ simple_lock(&global_v_numoutput_slock);
if (--vp->v_numoutput < 0)
panic("vwakeup: neg numoutput, vp %p", vp);
if ((vp->v_flag & VBWAIT) && vp->v_numoutput <= 0) {
vp->v_flag &= ~VBWAIT;
wakeup((caddr_t)&vp->v_numoutput);
}
+ simple_unlock(&global_v_numoutput_slock);
}
}
@@ -691,10 +700,12 @@
restart:
for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
nbp = LIST_NEXT(bp, b_vnbufs);
+ simple_lock(&bp->b_interlock);
if (bp->b_flags & B_BUSY) {
bp->b_flags |= B_WANTED;
- error = tsleep((caddr_t)bp, slpflag | (PRIBIO + 1),
- "vinvalbuf", slptimeo);
+ error = ltsleep((caddr_t)bp,
+ slpflag | (PRIBIO + 1) | PNORELOCK,
+ "vinvalbuf", slptimeo, &bp->b_interlock);
if (error) {
splx(s);
return (error);
@@ -702,15 +713,18 @@
goto restart;
}
bp->b_flags |= B_BUSY | B_INVAL | B_VFLUSH;
+ simple_unlock(&bp->b_interlock);
brelse(bp);
}
for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
nbp = LIST_NEXT(bp, b_vnbufs);
+ simple_lock(&bp->b_interlock);
if (bp->b_flags & B_BUSY) {
bp->b_flags |= B_WANTED;
- error = tsleep((caddr_t)bp, slpflag | (PRIBIO + 1),
- "vinvalbuf", slptimeo);
+ error = ltsleep((caddr_t)bp,
+ slpflag | (PRIBIO + 1) | PNORELOCK,
+ "vinvalbuf", slptimeo, &bp->b_interlock);
if (error) {
splx(s);
return (error);
@@ -727,10 +741,12 @@
printf("buffer still DELWRI\n");
#endif
bp->b_flags |= B_BUSY | B_VFLUSH;
+ simple_unlock(&bp->b_interlock);
VOP_BWRITE(bp);
goto restart;
}
bp->b_flags |= B_BUSY | B_INVAL | B_VFLUSH;
+ simple_unlock(&bp->b_interlock);
brelse(bp);
}
@@ -773,10 +789,11 @@
nbp = LIST_NEXT(bp, b_vnbufs);
if (bp->b_lblkno < lbn)
continue;
+ simple_lock(&bp->b_interlock);
if (bp->b_flags & B_BUSY) {
bp->b_flags |= B_WANTED;
- error = tsleep(bp, slpflag | (PRIBIO + 1),
- "vtruncbuf", slptimeo);
+ error = ltsleep(bp, slpflag | (PRIBIO + 1) | PNORELOCK,
+ "vtruncbuf", slptimeo, &bp->b_interlock);
if (error) {
splx(s);
return (error);
@@ -784,6 +801,7 @@
goto restart;
}
bp->b_flags |= B_BUSY | B_INVAL | B_VFLUSH;
+ simple_unlock(&bp->b_interlock);
brelse(bp);
}
@@ -791,10 +809,11 @@
nbp = LIST_NEXT(bp, b_vnbufs);
if (bp->b_lblkno < lbn)
continue;
+ simple_lock(&bp->b_interlock);
if (bp->b_flags & B_BUSY) {
bp->b_flags |= B_WANTED;
- error = tsleep(bp, slpflag | (PRIBIO + 1),
- "vtruncbuf", slptimeo);
+ error = ltsleep(bp, slpflag | (PRIBIO + 1) | PNORELOCK,
+ "vtruncbuf", slptimeo, &bp->b_interlock);
if (error) {
splx(s);
return (error);
@@ -802,6 +821,7 @@
goto restart;
}
bp->b_flags |= B_BUSY | B_INVAL | B_VFLUSH;
+ simple_unlock(&bp->b_interlock);
brelse(bp);
}
@@ -826,11 +846,15 @@
s = splbio();
for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
nbp = LIST_NEXT(bp, b_vnbufs);
- if ((bp->b_flags & B_BUSY))
+ simple_lock(&bp->b_interlock);
+ if ((bp->b_flags & B_BUSY)) {
+ simple_unlock(&bp->b_interlock);
continue;
+ }
if ((bp->b_flags & B_DELWRI) == 0)
panic("vflushbuf: not dirty, bp %p", bp);
bp->b_flags |= B_BUSY | B_VFLUSH;
+ simple_unlock(&bp->b_interlock);
splx(s);
/*
* Wait for I/O associated with indirect blocks to complete,
@@ -846,10 +870,13 @@
splx(s);
return;
}
+ simple_lock(&global_v_numoutput_slock);
while (vp->v_numoutput) {
vp->v_flag |= VBWAIT;
- tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "vflushbuf", 0);
+ ltsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "vflushbuf", 0,
+ &global_v_numoutput_slock);
}
+ simple_unlock(&global_v_numoutput_slock);
splx(s);
if (!LIST_EMPTY(&vp->v_dirtyblkhd)) {
vprint("vflushbuf: dirty", vp);
Index: ufs/lfs/lfs_bio.c
===================================================================
RCS file: /cvsroot/src/sys/ufs/lfs/lfs_bio.c,v
retrieving revision 1.56
diff -u -r1.56 lfs_bio.c
--- ufs/lfs/lfs_bio.c 2003/01/24 21:55:26 1.56
+++ ufs/lfs/lfs_bio.c 2003/02/02 22:27:41
@@ -641,10 +641,11 @@
if (bp == NULL)
panic("bp is NULL after malloc in lfs_newbuf");
#endif
+ simple_lock_init(&bp->b_interlock);
s = splbio();
bgetvp(vp, bp);
splx(s);
-
+
bp->b_saveaddr = (caddr_t)fs;
bp->b_bufsize = size;
bp->b_bcount = size;
Index: ufs/lfs/lfs_segment.c
===================================================================
RCS file: /cvsroot/src/sys/ufs/lfs/lfs_segment.c,v
retrieving revision 1.99
diff -u -r1.99 lfs_segment.c
--- ufs/lfs/lfs_segment.c 2003/02/01 06:23:53 1.99
+++ ufs/lfs/lfs_segment.c 2003/02/02 22:27:59
@@ -1492,6 +1492,7 @@
#define BQUEUES 4 /* XXX */
#define BQ_EMPTY 3 /* XXX */
extern TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];
+extern struct simplelock bqueue_slock;
#define BUFHASH(dvp, lbn) \
(&bufhashtbl[((long)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
@@ -1527,7 +1528,9 @@
/* Get an empty buffer header, or maybe one with something on it */
s = splbio();
+ simple_lock(&bqueue_slock);
if((bp = bufqueues[BQ_EMPTY].tqh_first) != NULL) {
+ simple_lock(&bp->b_interlock);
bremfree(bp);
/* clear out various other fields */
bp->b_flags = B_BUSY;
@@ -1546,12 +1549,12 @@
if (bp->b_vp)
brelvp(bp);
}
- splx(s);
while (!bp)
bp = getnewbuf(0, 0);
- s = splbio();
bgetvp(vp, bp);
binshash(bp,&invalhash);
+ simple_unlock(&bp->b_interlock);
+ simple_unlock(&bqueue_slock);
splx(s);
bp->b_bcount = 0;
bp->b_blkno = bp->b_lblkno = addr;
--ELM74055137-22799-0_--