tech-kern archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

Re: wapbl_flush() speedup



On Dec 5, 2012, at 4:48 PM, Edgar Fuß <ef%math.uni-bonn.de@localhost> wrote:

>> The attached diff tries to coalesce writes to the journal in MAXPHYS
>> sized and aligned blocks.
> First, many thanks to hannken@ for ending my monthlong performance nightmare!
> 
> Btw., this also explains my tstile lockups: writing the journal could take
> tens of seconds; because the journal was locked, most actions on the fs would
> stall for that time.
> 
>> Comments or objections anyone?
> I have a few questions. But I am by far no file system expert. So identifiers
> confusing me may be evident for an expert or it may be just my ignorance that
> some invariants are not obvious to me.
> 
>> +    daddr_t wl_buffer_addr; /* l:   buffer base address */
> With "base address" I was thinking of a memory address, not a disc address.
> Is it common practice to use xxx_addr for daddrs?
> 
>> +    size_t wl_buffer_len;   /* l:   buffer current use */
> "len" made me think of allocated length, not used length.
> Again, that use may be common practice.
> 
>> +     * If not adjacent to buffered dat flush first.
> s/dat/data/
> 
>> +        pbn != wl->wl_buffer_addr + btodb(wl->wl_buffer_len)) {
> Is it obvious to anyone but me why wl_buffer_addr is initialized at this 
> point?
> I'm now sure it is, it just took me some time and a private mail to learn.
> 
>> +    KASSERT(resid > 0);
> It took me even more time to learn why resid is non-negative here.
> Would the explanation be worth a comment?
> However, I'm still not sure why it can't be zero.
> 
>> +            error = wapbl_doio(wl->wl_buffer, wl->wl_buffer_len,
>> +                wl->wl_devvp, wl->wl_buffer_addr, B_WRITE);
> I suppose it's on purpose not to use wapbl_write() here.

Yes, wapbl_buffered_write is a helper like wapbl_write and there is
no need to go through wapbl_write.

An updated diff (names and comments) is attached.

--
J. Hannken-Illjes - hannken%eis.cs.tu-bs.de@localhost - TU Braunschweig 
(Germany)

Index: vfs_wapbl.c
===================================================================
RCS file: /cvsroot/src/sys/kern/vfs_wapbl.c,v
retrieving revision 1.53
diff -p -u -2 -r1.53 vfs_wapbl.c
--- vfs_wapbl.c 17 Nov 2012 10:10:17 -0000      1.53
+++ vfs_wapbl.c 5 Dec 2012 17:08:33 -0000
@@ -185,4 +185,8 @@ struct wapbl {
        SIMPLEQ_HEAD(, wapbl_entry) wl_entries; /* On disk transaction
                                                   accounting */
+
+       u_char *wl_buffer;      /* l:   buffer for wapbl_buffered_write() */
+       daddr_t wl_buffer_dblk; /* l:   buffer disk block address */
+       size_t wl_buffer_used;  /* l:   buffer current use */
 };
 
@@ -490,4 +494,7 @@ wapbl_start(struct wapbl ** wlp, struct 
            wl->wl_dealloclim);
 
+       wl->wl_buffer = wapbl_alloc(MAXPHYS);
+       wl->wl_buffer_used = 0;
+
        wapbl_inodetrk_init(wl, WAPBL_INODETRK_SIZE);
 
@@ -538,4 +545,5 @@ wapbl_start(struct wapbl ** wlp, struct 
        wapbl_free(wl->wl_dealloclens,
            sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim);
+       wapbl_free(wl->wl_buffer, MAXPHYS);
        wapbl_inodetrk_free(wl);
        wapbl_free(wl, sizeof(*wl));
@@ -717,4 +725,5 @@ wapbl_stop(struct wapbl *wl, int force)
        wapbl_free(wl->wl_dealloclens,
            sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim);
+       wapbl_free(wl->wl_buffer, MAXPHYS);
        wapbl_inodetrk_free(wl);
 
@@ -792,4 +801,78 @@ wapbl_read(void *data, size_t len, struc
 
 /*
+ * Flush buffered data if any.
+ */
+static int
+wapbl_buffered_flush(struct wapbl *wl)
+{
+       int error;
+
+       if (wl->wl_buffer_used == 0)
+               return 0;
+
+       error = wapbl_doio(wl->wl_buffer, wl->wl_buffer_used,
+           wl->wl_devvp, wl->wl_buffer_dblk, B_WRITE);
+       wl->wl_buffer_used = 0;
+
+       return error;
+}
+
+/*
+ * Write data to the log.
+ * Try to coalesce writes and emit MAXPHYS aligned blocks.
+ */
+static int
+wapbl_buffered_write(void *data, size_t len, struct wapbl *wl, daddr_t pbn)
+{
+       int error;
+       size_t resid;
+
+       /*
+        * If not adjacent to buffered data flush first.  Disk block
+        * address is always valid for non-empty buffer.
+        */
+       if (wl->wl_buffer_used > 0 &&
+           pbn != wl->wl_buffer_dblk + btodb(wl->wl_buffer_used)) {
+               error = wapbl_buffered_flush(wl);
+               if (error)
+                       return error;
+       }
+       /*
+        * If this write goes to an empty buffer we have to
+        * save the disk block address first.
+        */
+       if (wl->wl_buffer_used == 0)
+               wl->wl_buffer_dblk = pbn;
+       /*
+        * Remaining space so this buffer ends on a MAXPHYS boundary.
+        *
+        * Cannot become less or equal zero as the buffer would have been
+        * flushed on the last call then.
+        */
+       resid = MAXPHYS - dbtob(wl->wl_buffer_dblk % btodb(MAXPHYS)) -
+           wl->wl_buffer_used;
+       KASSERT(resid > 0);
+       KASSERT(dbtob(btodb(resid)) == resid);
+       if (len >= resid) {
+               memcpy(wl->wl_buffer + wl->wl_buffer_used, data, resid);
+               wl->wl_buffer_used += resid;
+               error = wapbl_doio(wl->wl_buffer, wl->wl_buffer_used,
+                   wl->wl_devvp, wl->wl_buffer_dblk, B_WRITE);
+               data = (uint8_t *)data + resid;
+               len -= resid;
+               wl->wl_buffer_dblk = pbn + btodb(resid);
+               wl->wl_buffer_used = 0;
+               if (error)
+                       return error;
+       }
+       if (len > 0) {
+               memcpy(wl->wl_buffer + wl->wl_buffer_used, data, len);
+               wl->wl_buffer_used += len;
+       }
+
+       return 0;
+}
+
+/*
  * Off is byte offset returns new offset for next write
  * handles log wraparound
@@ -814,5 +897,5 @@ wapbl_circ_write(struct wapbl *wl, void 
                pbn = btodb(pbn << wl->wl_log_dev_bshift);
 #endif
-               error = wapbl_write(data, slen, wl->wl_devvp, pbn);
+               error = wapbl_buffered_write(data, slen, wl, pbn);
                if (error)
                        return error;
@@ -825,5 +908,5 @@ wapbl_circ_write(struct wapbl *wl, void 
        pbn = btodb(pbn << wl->wl_log_dev_bshift);
 #endif
-       error = wapbl_write(data, len, wl->wl_devvp, pbn);
+       error = wapbl_buffered_write(data, len, wl, pbn);
        if (error)
                return error;
@@ -1968,4 +2051,7 @@ wapbl_write_commit(struct wapbl *wl, off
        daddr_t pbn;
 
+       error = wapbl_buffered_flush(wl);
+       if (error)
+               return error;
        /*
         * flush disk cache to ensure that blocks we've written are actually
@@ -1999,5 +2085,8 @@ wapbl_write_commit(struct wapbl *wl, off
        pbn = btodb(pbn << wc->wc_log_dev_bshift);
 #endif
-       error = wapbl_write(wc, wc->wc_len, wl->wl_devvp, pbn);
+       error = wapbl_buffered_write(wc, wc->wc_len, wl, pbn);
+       if (error)
+               return error;
+       error = wapbl_buffered_flush(wl);
        if (error)
                return error;


Home | Main Index | Thread Index | Old Index