tech-kern archive


[PATCH] bufq_priocscan enhancement



Hi all,
sorry for not being active for so long. I am resending this e-mail to this
list since I previously sent it to developers@.

I attach a patch for 5.1 with an enhancement to bufq_priocscan. It
reimplements CSCAN with rb-trees instead of queues, reducing the cost of
servicing n requests from O(n^2) to O(n log n).
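
The idea in short: pending requests live in an rb-tree sorted by disk
position, and the scan head walks the tree in order, wrapping around to the
minimum when it falls off the end. Conceptually it boils down to the
following (the patch actually caches the next buffer in cq_next instead of
searching, but the effect is the same):

    /* one CSCAN step, schematically: "last" is a dummy buf holding the
     * position just served; RB_NFIND finds the first element >= the
     * key, or NULL once the sweep has passed the last request */
    bp = RB_NFIND(pstree_blkno, &tree, &last);
    if (bp == NULL)
            bp = RB_MIN(pstree_blkno, &tree);   /* wrap around */

Insert, remove and lookup are each O(log n).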

Below is a description of the performance problem that I was hit by,
followed by a short comment on the patch and some further considerations.

Problem description
----------------------------
I have noticed that running
dd if=/dev/zero of=/dev/wd0f bs=1048576
performs very poorly (around 300KB/s) on my Core 2 Duo with a regular SATA
disk. One of the cores is used entirely for interrupts and the other
entirely for system time, which makes the system completely unusable (it
takes several minutes to ssh to it and kill dd).

A look at the kprof output shows:
  %   cumulative   self              self     total
 time   seconds   seconds    calls  ms/call  ms/call  name
 39.69     74.71    74.71   134145     0.56     0.56  bufq_priocscan_put
 23.33    118.63    43.92                             x86_pause
 19.53    155.39    36.76    49863     0.74     0.74  x86_mwait
 15.47    184.52    29.13   607203     0.05     0.05  _kernel_lock
so apart from the poor bufq_priocscan_put performance itself, it also
causes lock contention in the interrupt path.

After changing the implementation of cscan in bufq_priocscan, dd reaches
around 45MB/s with 50% interrupt time and around 7% system time. The system
stays usable during that operation and seems more interactive during
regular use too (an observable difference in Firefox).
I have not run any other benchmarks such as dbench yet.

Implementation
----------------------
There used to be two queues, and requests were inserted into them in the
right order by searching linearly from the start. I have simply replaced
the queues with rb-trees. The code is not very pretty, however, because I
needed to support both the cylinder and the raw block number sort orders
(a lot of code duplication); maybe I can hack around it with some macros,
or maybe someone has a better idea how to do it in an elegant fashion.
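
For instance, something like the following might cover the value-returning
tree operations (an untested sketch just to show the shape; it relies on a
sortby variable being in scope, and cscan_tree_init would keep its switch):

    #define CSCAN_DISPATCH(expr_cyl, expr_blkno)                        \
            ((sortby) == BUFQ_SORT_CYLINDER ? (expr_cyl) :              \
            (sortby) == BUFQ_SORT_RAWBLOCK ? (expr_blkno) :             \
            (panic("%s: unknown sort order %d", __func__, sortby),      \
            (expr_blkno)))

    static __inline struct buf *
    cscan_tree_min(struct cscan_queue *cq, int sortby)
    {
            return CSCAN_DISPATCH(
                RB_MIN(pstree_cyl, &cq->cq_tree_cyl),
                RB_MIN(pstree_blkno, &cq->cq_tree_blkno));
    }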

The other ugly thing I am aware of is that the code will work incorrectly
(or crash) if it ever becomes possible to change the sort order at run time
(fortunately, grepping the code shows this does not happen today). Maybe I
can add some more assertions to catch it if that ever changes.
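
For example (cq_sortby would be a new field that cscan_init records; the
name is made up here):

    /* hypothetical guard at the top of cscan_put/cscan_get: fail fast
     * if anyone ever changes the sort order under us */
    KASSERT(sortby == cq->cq_sortby);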

I'd be grateful for a review and some opinions.

Some more considerations
--------------------------------------
I also noticed that there is still a serious starvation issue, and it is
not only theoretical. If you run "find /" and then start the above
mentioned dd on the same drive, find stalls until dd finishes. It doesn't
advance even by one file.

Maybe I can implement some other strategy which is aware of processes (like
CFQ in Linux) and assigns processes a sort of timeslice for disk I/O to
avoid such starvation. AFAIK that would require a change in the bufq
interface, though. If we want to avoid the interface change, maybe at least
something like the deadline strategy in Linux would help (assign each
request a deadline and execute starving ones before any others).
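
The get side of the deadline variant could look roughly like this (a sketch
only; cq_fifo and b_deadline are invented names, nothing like them exists
in the tree yet):

    /* requests would additionally sit on a FIFO with an expiry tick */
    struct buf *bp = TAILQ_FIRST(&cq->cq_fifo);

    if (bp != NULL && hardclock_ticks - bp->b_deadline >= 0)
            return bp;              /* a starving request jumps the queue */
    return cscan_get(cq, remove, sortby);   /* otherwise normal CSCAN */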

Moreover, I think (but I have not verified this in any way) that we can
reduce the time spent in interrupts by merging adjacent disk requests in
the scheduler or by increasing the buffer cache page size (which may also
be beneficial on 4KB-sector drives).
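
For the merging part, the test I have in mind is roughly this
(unimplemented sketch; the hard part, building one transfer and fanning the
completion back out to both buffers, is not shown):

    /* two queued requests are candidates for coalescing when they are
     * the same kind of I/O and physically contiguous on disk */
    static __inline int
    buf_mergeable(const struct buf *bp, const struct buf *bq)
    {
            return (bp->b_flags & B_READ) == (bq->b_flags & B_READ) &&
                bp->b_rawblkno + (bp->b_bcount >> DEV_BSHIFT) ==
                bq->b_rawblkno;
    }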

These enhancements, though, would probably take me some months to finish.

Regards

-- 
Marek Dopiera
marek%dopiera.pl@localhost

Patch
-------
Index: kern/bufq_priocscan.c
===================================================================
RCS file: /cvsroot/src/sys/kern/bufq_priocscan.c,v
retrieving revision 1.12
diff -u -r1.12 bufq_priocscan.c
--- kern/bufq_priocscan.c       3 May 2008 05:18:36 -0000       1.12
+++ kern/bufq_priocscan.c       7 Jun 2011 19:57:20 -0000
@@ -35,96 +35,266 @@
 #include <sys/bufq.h>
 #include <sys/bufq_impl.h>
 #include <sys/malloc.h>
+#include <sys/tree.h>
+
+struct cscan_key
+{
+       int ck_lastcylinder;            /* b_cylinder of the last request */
+       daddr_t ck_lastrawblkno;        /* b_rawblkno of the last request */
+};
 
 /*
  * Cyclical scan (CSCAN)
  */
-TAILQ_HEAD(bqhead, buf);
+RB_HEAD(pstree_cyl, buf);
+RB_HEAD(pstree_blkno, buf);
 struct cscan_queue {
-       struct bqhead cq_head[2];       /* actual lists of buffers */
-       int cq_idx;                     /* current list index */
-       int cq_lastcylinder;            /* b_cylinder of the last request */
-       daddr_t cq_lastrawblkno;        /* b_rawblkno of the last request */
+       union {
+               struct pstree_cyl u_tree_cyl;
+               struct pstree_blkno u_tree_blkno;
+       } cq_u;
+#define cq_tree_cyl cq_u.u_tree_cyl
+#define cq_tree_blkno cq_u.u_tree_blkno
+       struct buf * cq_next;
+       struct cscan_key cq_last;
 };
 
-static inline int cscan_empty(const struct cscan_queue *);
+static __inline void
+cscan_tree_insert(struct cscan_queue * cq, struct buf * buf, int sortby);
+static __inline void
+cscan_tree_remove(struct cscan_queue * cq, struct buf * buf, int sortby);
+static __inline struct buf *
+cscan_tree_next(struct cscan_queue * cq, struct buf * buf, int sortby);
+static __inline struct buf *
+cscan_tree_min(struct cscan_queue * cq, int sortby);
+static __inline struct buf *
+cscan_tree_find(struct cscan_queue * cq, struct buf * bp, int sortby);
+static __inline void
+cscan_tree_init(struct cscan_queue * cq, int sortby);
+static inline int cscan_tree_empty(const struct cscan_queue *, int sortby);
 static void cscan_put(struct cscan_queue *, struct buf *, int);
-static struct buf *cscan_get(struct cscan_queue *, int);
-static void cscan_init(struct cscan_queue *);
+static struct buf *cscan_get(struct cscan_queue *, int remove, int sortby);
+static void cscan_init(struct cscan_queue *, int sortby);
+static __inline off_t
+buf_sort_order_cyl(const struct buf *bp, const struct buf *bq);
+static __inline off_t
+buf_sort_order_blkno(const struct buf *bp, const struct buf *bq);
+static __inline off_t
+buf_sort_order_wrapper(
+       const struct buf *bp,
+       const struct buf *bq,
+       int sortby
+       );
+RB_PROTOTYPE(pstree_cyl, buf, b_actt, buf_sort_order_cyl);
+RB_GENERATE(pstree_cyl, buf, b_actt, buf_sort_order_cyl);
+RB_PROTOTYPE(pstree_blkno, buf, b_actt, buf_sort_order_blkno);
+RB_GENERATE(pstree_blkno, buf, b_actt, buf_sort_order_blkno);
 
-static inline int
-cscan_empty(const struct cscan_queue *q)
+static __inline off_t
+buf_sort_order_cyl(const struct buf *bp, const struct buf *bq)
 {
-
-       return TAILQ_EMPTY(&q->cq_head[0]) && TAILQ_EMPTY(&q->cq_head[1]);
+       if (bp->b_cylinder != bq->b_cylinder)
+               return (bp->b_cylinder < bq->b_cylinder) ? -1 : 1;
+       else if (bp->b_rawblkno != bq->b_rawblkno)
+               return (bp->b_rawblkno < bq->b_rawblkno) ? -1 : 1;
+       else
+               /*
+                * Tie-break on the pointer so that buffers with the same
+                * blkno can coexist; return -1/0/1 so that the value is
+                * not truncated when RB_GENERATE stores it into an int.
+                */
+               return (bp < bq) ? -1 : (bp > bq) ? 1 : 0;
 }
 
-static void
-cscan_put(struct cscan_queue *q, struct buf *bp, int sortby)
+static __inline off_t
+buf_sort_order_blkno(const struct buf *bp, const struct buf *bq)
 {
-       struct buf tmp;
-       struct buf *it;
-       struct bqhead *bqh;
-       int idx;
-
-       tmp.b_cylinder = q->cq_lastcylinder;
-       tmp.b_rawblkno = q->cq_lastrawblkno;
-
-       if (buf_inorder(bp, &tmp, sortby))
-               idx = 1 - q->cq_idx;
+       if (bp->b_rawblkno != bq->b_rawblkno)
+               return (bp->b_rawblkno < bq->b_rawblkno) ? -1 : 1;
        else
-               idx = q->cq_idx;
+               /*
+                * Tie-break on the pointer so that buffers with the same
+                * blkno can coexist; -1/0/1 avoids truncation when
+                * RB_GENERATE stores the result into an int.
+                */
+               return (bp < bq) ? -1 : (bp > bq) ? 1 : 0;
+}
 
-       bqh = &q->cq_head[idx];
+static __inline off_t
+buf_sort_order_wrapper(
+       const struct buf *bp,
+       const struct buf *bq,
+       int sortby
+       )
+{
+       switch (sortby)
+       {
+               case BUFQ_SORT_CYLINDER: return buf_sort_order_cyl(bp, bq);
+               case BUFQ_SORT_RAWBLOCK: return buf_sort_order_blkno(bp, bq);
+               default: panic("%s: unknown sort order %d", __func__, sortby);
+       }
+}
 
-       TAILQ_FOREACH(it, bqh, b_actq)
-               if (buf_inorder(bp, it, sortby))
+static __inline void
+cscan_tree_insert(struct cscan_queue * cq, struct buf * buf, int sortby)
+{
+       struct buf * collision;
+       switch (sortby)
+       {
+               case BUFQ_SORT_CYLINDER:
+                       collision = RB_INSERT(pstree_cyl,
+                           &cq->cq_tree_cyl, buf);
                        break;
+               case BUFQ_SORT_RAWBLOCK:
+                       collision = RB_INSERT(pstree_blkno,
+                           &cq->cq_tree_blkno, buf);
+                       break;
+               default: panic("%s: unknown sort order %d", __func__, sortby);
+       }
+       KDASSERT(!collision);
+}
 
-       if (it != NULL)
-               TAILQ_INSERT_BEFORE(it, bp, b_actq);
-       else
-               TAILQ_INSERT_TAIL(bqh, bp, b_actq);
+static __inline void
+cscan_tree_remove(struct cscan_queue * cq, struct buf * buf, int sortby)
+{
+       switch (sortby)
+       {
+               case BUFQ_SORT_CYLINDER:
+                       RB_REMOVE(pstree_cyl, &cq->cq_tree_cyl, buf);
+                       break;
+               case BUFQ_SORT_RAWBLOCK:
+                       RB_REMOVE(pstree_blkno, &cq->cq_tree_blkno, buf);
+                       break;
+               default: panic("%s: unknown sort order %d", __func__, sortby);
+       }
 }
 
-static struct buf *
-cscan_get(struct cscan_queue *q, int remove)
+static __inline struct buf *
+cscan_tree_next(struct cscan_queue * cq, struct buf * buf, int sortby)
 {
-       int idx = q->cq_idx;
-       struct bqhead *bqh;
-       struct buf *bp;
+       switch (sortby)
+       {
+               case BUFQ_SORT_CYLINDER:
+                       return RB_NEXT(pstree_cyl, &cq->cq_tree_cyl, buf);
+               case BUFQ_SORT_RAWBLOCK:
+                       return RB_NEXT(pstree_blkno, &cq->cq_tree_blkno, buf);
+               default: panic("%s: unknown sort order %d", __func__, sortby);
+       }
+}
+
+static __inline struct buf *
+cscan_tree_min(struct cscan_queue * cq, int sortby)
+{
+       switch (sortby)
+       {
+               case BUFQ_SORT_CYLINDER:
+                       return RB_MIN(pstree_cyl, &cq->cq_tree_cyl);
+               case BUFQ_SORT_RAWBLOCK:
+                       return RB_MIN(pstree_blkno, &cq->cq_tree_blkno);
+               default: panic("%s: unknown sort order %d", __func__, sortby);
+       }
+}
 
-       bqh = &q->cq_head[idx];
-       bp = TAILQ_FIRST(bqh);
+static __inline struct buf *
+cscan_tree_find(struct cscan_queue * cq, struct buf * bp, int sortby)
+{
+       switch (sortby)
+       {
+               case BUFQ_SORT_CYLINDER:
+                       return RB_FIND(pstree_cyl, &cq->cq_tree_cyl, bp);
+               case BUFQ_SORT_RAWBLOCK:
+                       return RB_FIND(pstree_blkno, &cq->cq_tree_blkno, bp);
+               default: panic("%s: unknown sort order %d", __func__, sortby);
+       }
+}
 
-       if (bp == NULL) {
-               /* switch queue */
-               idx = 1 - idx;
-               bqh = &q->cq_head[idx];
-               bp = TAILQ_FIRST(bqh);
+static __inline void
+cscan_tree_init(struct cscan_queue * cq, int sortby)
+{
+       switch (sortby)
+       {
+               case BUFQ_SORT_CYLINDER:
+                       RB_INIT(&cq->cq_tree_cyl);
+                       break;
+               case BUFQ_SORT_RAWBLOCK:
+                       RB_INIT(&cq->cq_tree_blkno);
+                       break;
+               default: panic("%s: unknown sort order %d", __func__, sortby);
        }
+}
 
-       KDASSERT((bp != NULL && !cscan_empty(q)) ||
-                (bp == NULL && cscan_empty(q)));
+static inline int
+cscan_tree_empty(const struct cscan_queue *q, int sortby)
+{
+       switch (sortby)
+       {
+               case BUFQ_SORT_CYLINDER:
+                       return RB_EMPTY(&q->cq_tree_cyl);
+               case BUFQ_SORT_RAWBLOCK:
+                       return RB_EMPTY(&q->cq_tree_blkno);
+               default: panic("%s: unknown sort order %d", __func__, sortby);
+       }
+}
 
-       if (bp != NULL && remove) {
-               q->cq_idx = idx;
-               TAILQ_REMOVE(bqh, bp, b_actq);
+static void
+cscan_put(struct cscan_queue *cq, struct buf *bp, int sortby)
+{
+       int const was_empty __attribute__((unused)) =
+           cscan_tree_empty(cq, sortby);
+       cscan_tree_insert(cq, bp, sortby);
+       if (cq->cq_next == NULL)
+       {
+               KDASSERT(was_empty);
+               cq->cq_next = bp;
+       }
+       else {
+               struct buf tmp;
+               tmp.b_cylinder = cq->cq_last.ck_lastcylinder;
+               tmp.b_rawblkno = cq->cq_last.ck_lastrawblkno;
+               if (
+                       buf_sort_order_wrapper(bp, cq->cq_next, sortby) < 0 &&
+                       buf_sort_order_wrapper(&tmp, bp, sortby) < 0
+                       )
+                       cq->cq_next = bp;
+       }
+}
 
-               q->cq_lastcylinder = bp->b_cylinder;
-               q->cq_lastrawblkno =
-                   bp->b_rawblkno + (bp->b_bcount >> DEV_BSHIFT);
+static struct buf *
+cscan_get(struct cscan_queue *cq, int remove, int sortby)
+{
+       struct buf * const res = cq->cq_next;
+       if (!res)
+       {
+               KDASSERT(cscan_tree_empty(cq, sortby));
+               return NULL;
        }
+       if (remove)
+       {
+               cq->cq_next = cscan_tree_next(cq, res, sortby);
+               cq->cq_last.ck_lastcylinder = res->b_cylinder;
+               cq->cq_last.ck_lastrawblkno = res->b_rawblkno;
+               cscan_tree_remove(cq, res, sortby);
+               if (cq->cq_next == NULL)
+               {
+                       cq->cq_next = cscan_tree_min(cq, sortby);
+                       cq->cq_last = (struct cscan_key){ 0, 0 };
+               }
+       }
+       return res;
+}
+
+static struct buf *
+cscan_remove(struct cscan_queue *cq, struct buf * bp, int sortby)
+{
+       struct buf * bq = cscan_tree_find(cq, bp, sortby);
+       
+       if (!bq)
+               return NULL;
+
+       KDASSERT(bq == bp);
 
-       return (bp);
+       cq->cq_next = cscan_tree_next(cq, bp, sortby);
+       cscan_tree_remove(cq, bp, sortby);
+       if (cq->cq_next == NULL)
+       {
+               cq->cq_next = cscan_tree_min(cq, sortby);
+               cq->cq_last = (struct cscan_key){ 0, 0 };
+       }
+       return bp;
 }
 
 static void
-cscan_init(struct cscan_queue *q)
+cscan_init(struct cscan_queue *cq, int sortby)
 {
-
-       TAILQ_INIT(&q->cq_head[0]);
-       TAILQ_INIT(&q->cq_head[1]);
+       cscan_tree_init(cq, sortby);
+       cq->cq_next = NULL;
+       cq->cq_last = (struct cscan_key) {0, 0};
 }
 
 
@@ -209,12 +379,13 @@
        const struct cscan_queue *cq;
        struct buf *bp;
        bool single; /* true if there's only one non-empty queue */
+       const int sortby = bufq->bq_flags & BUFQ_SORT_MASK;
 
        pq = &q->bq_queue[0];
        epq = pq + PRIOCSCAN_NQUEUE;
        for (; pq < epq; pq++) {
                cq = &pq->q_queue;
-               if (!cscan_empty(cq))
+               if (!cscan_tree_empty(cq, sortby))
                        break;
        }
        if (pq == epq) {
@@ -226,7 +397,7 @@
        single = true;
        for (npq = first + 1; npq < epq; npq++) {
                cq = &npq->q_queue;
-               if (!cscan_empty(cq)) {
+               if (!cscan_tree_empty(cq, sortby)) {
                        single = false;
                        if (pq->q_burst > 0)
                                break;
@@ -254,7 +425,7 @@
                for (i = 0; i < PRIOCSCAN_NQUEUE; i++) {
                        pq = &q->bq_queue[i];
                        cq = &pq->q_queue;
-                       if (!cscan_empty(cq) && pq->q_burst)
+                       if (!cscan_tree_empty(cq, sortby) && pq->q_burst)
                                panic("%s: inconsist", __func__);
                }
 #endif /* DEBUG */
@@ -275,8 +446,8 @@
                pq = first;
        }
 
-       KDASSERT(!cscan_empty(&pq->q_queue));
-       bp = cscan_get(&pq->q_queue, remove);
+       KDASSERT(!cscan_tree_empty(&pq->q_queue, sortby));
+       bp = cscan_get(&pq->q_queue, remove, sortby);
        KDASSERT(bp != NULL);
        KDASSERT(&pq->q_queue == bufq_priocscan_selectqueue(q, bp));
 
@@ -287,20 +458,12 @@
 bufq_priocscan_cancel(struct bufq_state *bufq, struct buf *buf)
 {
        struct cscan_queue *q = bufq->bq_private;
-       struct bqhead *bqh;
-       struct buf *bq;
-       int idx;
+       const int sortby = bufq->bq_flags & BUFQ_SORT_MASK;
 
-       for (idx = 0; idx < 2; idx++) {
-               bqh = &q->cq_head[idx];
-               bq = TAILQ_FIRST(bqh);
-               while (bq) {
-                       if (bq == buf) {
-                               TAILQ_REMOVE(bqh, bq, b_actq);
-                               return buf;
-                       }
-                       bq = TAILQ_NEXT(bq, b_actq);
-               }
-       }
+       if (cscan_remove(q, buf, sortby))
+               return buf;
        return NULL;
 }
@@ -310,6 +473,7 @@
 {
        struct bufq_priocscan *q;
        int i;
+       const int sortby = bufq->bq_flags & BUFQ_SORT_MASK;
 
        bufq->bq_get = bufq_priocscan_get;
        bufq->bq_put = bufq_priocscan_put;
@@ -321,6 +485,6 @@
        for (i = 0; i < PRIOCSCAN_NQUEUE; i++) {
                struct cscan_queue *cq = &q->bq_queue[i].q_queue;
 
-               cscan_init(cq);
+               cscan_init(cq, sortby);
        }
 }
Index: sys/buf.h
===================================================================
RCS file: /cvsroot/src/sys/sys/buf.h,v
retrieving revision 1.110
diff -u -r1.110 buf.h
--- sys/buf.h   31 Jul 2008 05:38:05 -0000      1.110
+++ sys/buf.h   7 Jun 2011 19:57:23 -0000
@@ -69,6 +69,7 @@
 #ifndef _SYS_BUF_H_
 #define        _SYS_BUF_H_
 
+#include <sys/tree.h>
 #include <sys/pool.h>
 #include <sys/queue.h>
 #include <sys/mutex.h>
@@ -125,11 +126,13 @@
 struct buf {
        union {
                TAILQ_ENTRY(buf) u_actq;
+               RB_ENTRY(buf) u_actt;
 #if defined(_KERNEL) /* u_work is smaller than u_actq. XXX */
                struct work u_work;
 #endif /* defined(_KERNEL) */
        } b_u;                                  /* b: device driver queue */
 #define        b_actq  b_u.u_actq
+#define b_actt b_u.u_actt
 #define        b_work  b_u.u_work
        void                    (*b_iodone)(struct buf *);/* b: call when done */
        int                     b_error;        /* b: errno value. */

