Source-Changes-HG archive

[src/trunk]: src/sys/kern improve performance of journal writes by paralleliz...



details:   https://anonhg.NetBSD.org/src/rev/f003f2445661
branches:  trunk
changeset: 822866:f003f2445661
user:      jdolecek <jdolecek%NetBSD.org@localhost>
date:      Mon Apr 10 21:34:37 2017 +0000

description:
improve performance of journal writes by parallelizing the I/O - use 4 bufs
by default, and add a sysctl vfs.wapbl.journal_iobufs to control the count

this also removes the need to allocate an iobuf during commit, so it
might help avoid deadlocks during memory shortages like PR kern/47030

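The new knob is a plain read-write integer node (see the sysctl_createv() hunk
below), so it can be inspected or tuned from userland. The following is a
minimal, hypothetical userland sketch, not part of the commit, that reads and
optionally sets vfs.wapbl.journal_iobufs via sysctlbyname(3); since the diff
allocates the buffers when the journal is started, a changed value presumably
only takes effect for file systems mounted (or remounted) afterwards.

/*
 * Hypothetical example, not from the commit: query and tune the
 * vfs.wapbl.journal_iobufs knob added by this change.  Setting a
 * new value requires appropriate privilege.
 */
#include <sys/sysctl.h>

#include <err.h>
#include <stdio.h>
#include <stdlib.h>

int
main(int argc, char *argv[])
{
	int cur, new;
	size_t len = sizeof(cur);

	/* Read the current number of journal I/O buffers (default 4). */
	if (sysctlbyname("vfs.wapbl.journal_iobufs", &cur, &len, NULL, 0) == -1)
		err(EXIT_FAILURE, "sysctlbyname(get)");
	printf("vfs.wapbl.journal_iobufs = %d\n", cur);

	/* Optionally set a new count, e.g. "./wapbl_iobufs 8". */
	if (argc > 1) {
		new = atoi(argv[1]);
		if (sysctlbyname("vfs.wapbl.journal_iobufs", NULL, NULL,
		    &new, sizeof(new)) == -1)
			err(EXIT_FAILURE, "sysctlbyname(set)");
	}
	return EXIT_SUCCESS;
}
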
diffstat:

 sys/kern/vfs_wapbl.c |  250 ++++++++++++++++++++++++++++++++++++++++----------
 1 files changed, 198 insertions(+), 52 deletions(-)

diffs (truncated from 415 to 300 lines):

diff -r d8b7bb0eb798 -r f003f2445661 sys/kern/vfs_wapbl.c
--- a/sys/kern/vfs_wapbl.c      Mon Apr 10 19:52:38 2017 +0000
+++ b/sys/kern/vfs_wapbl.c      Mon Apr 10 21:34:37 2017 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: vfs_wapbl.c,v 1.94 2017/04/10 19:52:38 jdolecek Exp $  */
+/*     $NetBSD: vfs_wapbl.c,v 1.95 2017/04/10 21:34:37 jdolecek Exp $  */
 
 /*-
  * Copyright (c) 2003, 2008, 2009 The NetBSD Foundation, Inc.
@@ -36,7 +36,7 @@
 #define WAPBL_INTERNAL
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c,v 1.94 2017/04/10 19:52:38 jdolecek Exp $");
+__KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c,v 1.95 2017/04/10 21:34:37 jdolecek Exp $");
 
 #include <sys/param.h>
 #include <sys/bitops.h>
@@ -72,6 +72,7 @@
 static int wapbl_flush_disk_cache = 1;
 static int wapbl_verbose_commit = 0;
 static int wapbl_allow_fuadpo = 0;     /* switched off by default for now */
+static int wapbl_journal_iobufs = 4;
 
 static inline size_t wapbl_space_free(size_t, off_t, off_t);
 
@@ -191,6 +192,8 @@
        char wl_ev_group[EVCNT_STRING_MAX];     /* r    */
        struct evcnt wl_ev_commit;              /* l    */
        struct evcnt wl_ev_journalwrite;        /* l    */
+       struct evcnt wl_ev_jbufs_bio_nowait;    /* l    */
+       struct evcnt wl_ev_jbufs_bio_wait;      /* l    */
        struct evcnt wl_ev_metawrite;           /* lm   */
        struct evcnt wl_ev_cacheflush;          /* l    */
 #endif
@@ -228,9 +231,9 @@
        SIMPLEQ_HEAD(, wapbl_entry) wl_entries; /* On disk transaction
                                                   accounting */
 
-       u_char *wl_buffer;      /* l:   buffer for wapbl_buffered_write() */
-       daddr_t wl_buffer_dblk; /* l:   buffer disk block address */
-       size_t wl_buffer_used;  /* l:   buffer current use */
+       /* buffers for wapbl_buffered_write() */
+       TAILQ_HEAD(, buf) wl_iobufs;            /* l: Free or filling bufs */
+       TAILQ_HEAD(, buf) wl_iobufs_busy;       /* l: In-transit bufs */
 
        int wl_dkcache;         /* r:   disk cache flags */
 #define WAPBL_USE_FUA(wl)      \
@@ -360,6 +363,15 @@
        if (rv)
                return rv;
 
+       rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode,
+                      CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
+                      CTLTYPE_INT, "journal_iobufs",
+                      SYSCTL_DESCR("count of bufs used for journal I/O (max async count)"),
+                      NULL, 0, &wapbl_journal_iobufs, 0,
+                      CTL_CREATE, CTL_EOL);
+       if (rv)
+               return rv;
+
        return rv;
 }
 
@@ -401,6 +413,10 @@
            NULL, wl->wl_ev_group, "commit");
        evcnt_attach_dynamic(&wl->wl_ev_journalwrite, EVCNT_TYPE_MISC,
            NULL, wl->wl_ev_group, "journal sync block write");
+       evcnt_attach_dynamic(&wl->wl_ev_jbufs_bio_nowait, EVCNT_TYPE_MISC,
+           NULL, wl->wl_ev_group, "journal I/O bufs no wait");
+       evcnt_attach_dynamic(&wl->wl_ev_jbufs_bio_wait, EVCNT_TYPE_MISC,
+           NULL, wl->wl_ev_group, "journal I/O bufs biowait");
        evcnt_attach_dynamic(&wl->wl_ev_metawrite, EVCNT_TYPE_MISC,
            NULL, wl->wl_ev_group, "metadata finished block write");
        evcnt_attach_dynamic(&wl->wl_ev_cacheflush, EVCNT_TYPE_MISC,
@@ -412,6 +428,8 @@
 {
        evcnt_detach(&wl->wl_ev_commit);
        evcnt_detach(&wl->wl_ev_journalwrite);
+       evcnt_detach(&wl->wl_ev_jbufs_bio_nowait);
+       evcnt_detach(&wl->wl_ev_jbufs_bio_wait);
        evcnt_detach(&wl->wl_ev_metawrite);
        evcnt_detach(&wl->wl_ev_cacheflush);
 }
@@ -605,9 +623,6 @@
        wl->wl_dealloclim = wl->wl_bufbytes_max / mp->mnt_stat.f_bsize / 2;
        TAILQ_INIT(&wl->wl_dealloclist);
 
-       wl->wl_buffer = wapbl_alloc(MAXPHYS);
-       wl->wl_buffer_used = 0;
-
        wapbl_inodetrk_init(wl, WAPBL_INODETRK_SIZE);
 
        wapbl_evcnt_init(wl);
@@ -630,6 +645,25 @@
                wl->wl_wc_scratch = wapbl_alloc(len);
        }
 
+       TAILQ_INIT(&wl->wl_iobufs);
+       TAILQ_INIT(&wl->wl_iobufs_busy);
+       for (int i = 0; i < wapbl_journal_iobufs; i++) {
+               struct buf *bp;
+
+               if ((bp = geteblk(MAXPHYS)) == NULL)
+                       goto errout;
+
+               mutex_enter(&bufcache_lock);
+               mutex_enter(devvp->v_interlock);
+               bgetvp(devvp, bp);
+               mutex_exit(devvp->v_interlock);
+               mutex_exit(&bufcache_lock);
+
+               bp->b_dev = devvp->v_rdev;
+
+               TAILQ_INSERT_TAIL(&wl->wl_iobufs, bp, b_wapbllist);
+       }
+
        /*
         * if there was an existing set of unlinked but
         * allocated inodes, preserve it in the new
@@ -656,7 +690,13 @@
        wapbl_discard(wl);
        wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
        wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
-       wapbl_free(wl->wl_buffer, MAXPHYS);
+       while (!TAILQ_EMPTY(&wl->wl_iobufs)) {
+               struct buf *bp;
+
+               bp = TAILQ_FIRST(&wl->wl_iobufs);
+               TAILQ_REMOVE(&wl->wl_iobufs, bp, b_wapbllist);
+               brelse(bp, BC_INVAL);
+       }
        wapbl_inodetrk_free(wl);
        wapbl_free(wl, sizeof(*wl));
 
@@ -832,10 +872,17 @@
        KASSERT(wl->wl_inohashcnt == 0);
        KASSERT(TAILQ_EMPTY(&wl->wl_dealloclist));
        KASSERT(wl->wl_dealloccnt == 0);
+       KASSERT(TAILQ_EMPTY(&wl->wl_iobufs_busy));
 
        wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
        wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
-       wapbl_free(wl->wl_buffer, MAXPHYS);
+       while (!TAILQ_EMPTY(&wl->wl_iobufs)) {
+               struct buf *bp;
+
+               bp = TAILQ_FIRST(&wl->wl_iobufs);
+               TAILQ_REMOVE(&wl->wl_iobufs, bp, b_wapbllist);
+               brelse(bp, BC_INVAL);
+       }
        wapbl_inodetrk_free(wl);
 
        wapbl_evcnt_free(wl);
@@ -853,14 +900,10 @@
  * Unbuffered disk I/O
  */
 
-static int
-wapbl_doio(void *data, size_t len, struct vnode *devvp, daddr_t pbn, int flags)
+static void
+wapbl_doio_accounting(struct vnode *devvp, int flags)
 {
        struct pstats *pstats = curlwp->l_proc->p_stats;
-       struct buf *bp;
-       int error;
-
-       KASSERT(devvp->v_type == VBLK);
 
        if ((flags & (B_WRITE | B_READ)) == B_WRITE) {
                mutex_enter(devvp->v_interlock);
@@ -871,6 +914,18 @@
                pstats->p_ru.ru_inblock++;
        }
 
+}
+
+static int
+wapbl_doio(void *data, size_t len, struct vnode *devvp, daddr_t pbn, int flags)
+{
+       struct buf *bp;
+       int error;
+
+       KASSERT(devvp->v_type == VBLK);
+
+       wapbl_doio_accounting(devvp, flags);
+
        bp = getiobuf(devvp, true);
        bp->b_flags = flags;
        bp->b_cflags = BC_BUSY; /* mandatory, asserted by biowait() */
@@ -935,24 +990,77 @@
  */
 
 /*
+ * wapbl_buffered_write_async(wl, bp)
+ *
+ *     Send buffer for asynchronous write.
+ */
+static void
+wapbl_buffered_write_async(struct wapbl *wl, struct buf *bp)
+{
+       wapbl_doio_accounting(wl->wl_devvp, bp->b_flags);
+
+       KASSERT(TAILQ_FIRST(&wl->wl_iobufs) == bp);
+       TAILQ_REMOVE(&wl->wl_iobufs, bp, b_wapbllist);
+
+       bp->b_flags = B_WRITE | WAPBL_JFLAGS(wl);
+       bp->b_cflags = BC_BUSY; /* mandatory, asserted by biowait() */
+       bp->b_oflags = 0;
+       bp->b_bcount = bp->b_resid;
+       BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
+
+       VOP_STRATEGY(wl->wl_devvp, bp);
+
+       wl->wl_ev_journalwrite.ev_count++;
+
+       TAILQ_INSERT_TAIL(&wl->wl_iobufs_busy, bp, b_wapbllist);
+}
+
+/*
  * wapbl_buffered_flush(wl)
  *
  *     Flush any buffered writes from wapbl_buffered_write.
  */
 static int
-wapbl_buffered_flush(struct wapbl *wl)
+wapbl_buffered_flush(struct wapbl *wl, bool full)
 {
-       int error;
-
-       if (wl->wl_buffer_used == 0)
-               return 0;
-
-       error = wapbl_doio(wl->wl_buffer, wl->wl_buffer_used,
-           wl->wl_devvp, wl->wl_buffer_dblk,
-           B_WRITE | WAPBL_JFLAGS(wl));
-       wl->wl_buffer_used = 0;
-
-       wl->wl_ev_journalwrite.ev_count++;
+       int error = 0;
+       struct buf *bp, *bnext;
+       bool only_done = true, found = false;
+
+       /* if there is outstanding buffered write, send it now */
+       if ((bp = TAILQ_FIRST(&wl->wl_iobufs)) && bp->b_resid > 0)
+               wapbl_buffered_write_async(wl, bp);
+
+       /* wait for I/O to complete */
+again:
+       TAILQ_FOREACH_SAFE(bp, &wl->wl_iobufs_busy, b_wapbllist, bnext) {
+               if (!full && only_done) {
+                       /* skip unfinished */
+                       if (!ISSET(bp->b_oflags, BO_DONE))
+                               continue;
+               }
+                       
+               if (ISSET(bp->b_oflags, BO_DONE))
+                       wl->wl_ev_jbufs_bio_nowait.ev_count++;
+               else
+                       wl->wl_ev_jbufs_bio_wait.ev_count++;
+
+               TAILQ_REMOVE(&wl->wl_iobufs_busy, bp, b_wapbllist);
+               error = biowait(bp);
+
+               /* reset for reuse */
+               bp->b_blkno = bp->b_resid = 0;
+               TAILQ_INSERT_TAIL(&wl->wl_iobufs, bp, b_wapbllist);
+               found = true;
+
+               if (!full)
+                       break;
+       }
+
+       if (!found && only_done && !TAILQ_EMPTY(&wl->wl_iobufs_busy)) {
+               only_done = false;
+               goto again;
+       }
 
        return error;
 }
@@ -967,49 +1075,63 @@
 static int
 wapbl_buffered_write(void *data, size_t len, struct wapbl *wl, daddr_t pbn)
 {
-       int error;
        size_t resid;
+       struct buf *bp;
+
+again:
+       bp = TAILQ_FIRST(&wl->wl_iobufs);
+
+       if (bp == NULL) {
+               /* No more buffers, wait for any previous I/O to finish. */
+               wapbl_buffered_flush(wl, false);
+
+               bp = TAILQ_FIRST(&wl->wl_iobufs);
+               KASSERT(bp != NULL);
+       }
 
        /*


