tech-kern archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

WAPBL truncate resource starvation stopgap fix suggestion



Hi,

I've noticed it's possible to panic system in
wapbl_register_deallocation() on small (200MB) filesystem just by
creating directory with many entries and then removing it, since
system runs out of dealloc array space - wl_dealloclim ended up being
63.

I've also noticed kern/49175, which also is related to the
deallocation path, patch there is trying to force flush on
wapbl_end(), even though there is similar flush being actually done by
wapbl_start().

ufs_truncate() takes a lot of care to not create big transactions
(actually too much), truncating by indirect block boundaries, and
doing the wapbl_end()/wapbl_start() to allow flush. However, the call
to wapbl_flush() in wapbl_start() is not effective, because it doesn't
do anything if !waitfor and wl_bufcount == 0, as is the case during
the truncate. Thus, for any bigger truncate run it would just continue
filling wl_deallocblks past 50% and eventually run out.

I suggest change as outlined in attached diff, does two things:
1. commit transaction in wapbl_flush() even in !waitfor case when
wapbl_transaction_len() > 0
2. allow many more revocation records, using up to 25% of journal size

I'm leaving for three weeks on vac, so sending this for review now to
gather feedback. I'm very new to WAPBL and I don't understand all
consequences of such changes.

When I return, I'll also look on the actual main problem kern/49175,
i.e. the excessive amount of VOP_TRUNCATE() calls triggered by
ufs_wapbl_truncate(). Joerg wrote me there was some floating patch to
refactor ffs_truncate() to be allow partial success, I'll check the
archives for that when I get back.

Jaromir
Index: vfs_wapbl.c
===================================================================
RCS file: /cvsroot/src/sys/kern/vfs_wapbl.c,v
retrieving revision 1.78
diff -u -p -r1.78 vfs_wapbl.c
--- vfs_wapbl.c	19 May 2016 18:32:29 -0000	1.78
+++ vfs_wapbl.c	22 Aug 2016 21:27:28 -0000
@@ -202,6 +202,10 @@ struct wapbl {
 	size_t wl_unsynced_bufbytes; /* Byte count of unsynced buffers */
 #endif
 
+#if _KERNEL
+	size_t wl_brperblock;	/* r Block records per journal block */
+#endif
+
 	daddr_t *wl_deallocblks;/* lm:	address of block */
 	int *wl_dealloclens;	/* lm:	size of block */
 	int wl_dealloccnt;	/* lm:	total count */
@@ -498,9 +502,12 @@ wapbl_start(struct wapbl ** wlp, struct 
 	/* XXX fix actual number of buffers reserved per filesystem. */
 	wl->wl_bufcount_max = (nbuf / 2) * 1024;
 
-	/* XXX tie this into resource estimation */
-	wl->wl_dealloclim = wl->wl_bufbytes_max / mp->mnt_stat.f_bsize / 2;
-	
+	/* allow up to 25% of journal to be taken by revocations */
+	wl->wl_brperblock = ((1<<wl->wl_log_dev_bshift)
+		- offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
+	    sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
+
+	wl->wl_dealloclim  = wl->wl_bufbytes_max / 4 / wl->wl_brperblock;
 	wl->wl_deallocblks = wapbl_alloc(sizeof(*wl->wl_deallocblks) *
 	    wl->wl_dealloclim);
 	wl->wl_dealloclens = wapbl_alloc(sizeof(*wl->wl_dealloclens) *
@@ -1560,13 +1567,9 @@ wapbl_flush(struct wapbl *wl, int waitfo
 	 * issued any deferred block writes for this transaction, check
 	 * whether there are any blocks to write to the log.  If not,
 	 * skip waiting for space or writing any log entries.
-	 *
-	 * XXX Shouldn't this also check wl_dealloccnt and
-	 * wl_inohashcnt?  Perhaps wl_dealloccnt doesn't matter if the
-	 * file system didn't produce any blocks as a consequence of
-	 * it, but the same does not seem to be so of wl_inohashcnt.
 	 */
-	if (wl->wl_bufcount == 0) {
+	flushsize = wapbl_transaction_len(wl);
+	if (flushsize == 0) {
 		goto wait_out;
 	}
 
@@ -1578,8 +1581,6 @@ wapbl_flush(struct wapbl *wl, int waitfo
 		      wl->wl_bufbytes));
 #endif
 
-	/* Calculate amount of space needed to flush */
-	flushsize = wapbl_transaction_len(wl);
 	if (wapbl_verbose_commit) {
 		struct timespec ts;
 		getnanotime(&ts);
@@ -1940,6 +1941,7 @@ wapbl_register_deallocation(struct wapbl
 	wl->wl_deallocblks[wl->wl_dealloccnt] = blk;
 	wl->wl_dealloclens[wl->wl_dealloccnt] = len;
 	wl->wl_dealloccnt++;
+
 	WAPBL_PRINTF(WAPBL_PRINT_ALLOC,
 	    ("wapbl_register_deallocation: blk=%"PRId64" len=%d\n", blk, len));
 	mutex_exit(&wl->wl_mtx);
@@ -2228,7 +2230,6 @@ wapbl_write_blocks(struct wapbl *wl, off
 	struct wapbl_wc_blocklist *wc =
 	    (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
 	int blocklen = 1<<wl->wl_log_dev_bshift;
-	int bph;
 	struct buf *bp;
 	off_t off = *offp;
 	int error;
@@ -2236,9 +2237,6 @@ wapbl_write_blocks(struct wapbl *wl, off
 
 	KASSERT(rw_write_held(&wl->wl_rwlock));
 
-	bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
-	    sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
-
 	bp = LIST_FIRST(&wl->wl_bufs);
 
 	while (bp) {
@@ -2250,7 +2248,7 @@ wapbl_write_blocks(struct wapbl *wl, off
 		wc->wc_type = WAPBL_WC_BLOCKS;
 		wc->wc_len = blocklen;
 		wc->wc_blkcount = 0;
-		while (bp && (wc->wc_blkcount < bph)) {
+		while (bp && (wc->wc_blkcount < wl->wl_brperblock)) {
 			/*
 			 * Make sure all the physical block numbers are up to
 			 * date.  If this is not always true on a given
@@ -2289,7 +2287,7 @@ wapbl_write_blocks(struct wapbl *wl, off
 			return error;
 		bp = obp;
 		cnt = 0;
-		while (bp && (cnt++ < bph)) {
+		while (bp && (cnt++ < wl->wl_brperblock)) {
 			error = wapbl_circ_write(wl, bp->b_data,
 			    bp->b_bcount, &off);
 			if (error)
@@ -2326,22 +2324,18 @@ wapbl_write_revocations(struct wapbl *wl
 	    (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
 	int i;
 	int blocklen = 1<<wl->wl_log_dev_bshift;
-	int bph;
 	off_t off = *offp;
 	int error;
 
 	if (wl->wl_dealloccnt == 0)
 		return 0;
 
-	bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
-	    sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
-
 	i = 0;
 	while (i < wl->wl_dealloccnt) {
 		wc->wc_type = WAPBL_WC_REVOCATIONS;
 		wc->wc_len = blocklen;
 		wc->wc_blkcount = 0;
-		while ((i < wl->wl_dealloccnt) && (wc->wc_blkcount < bph)) {
+		while ((i < wl->wl_dealloccnt) && (wc->wc_blkcount < wl->wl_brperblock)) {
 			wc->wc_blocks[wc->wc_blkcount].wc_daddr =
 			    wl->wl_deallocblks[i];
 			wc->wc_blocks[wc->wc_blkcount].wc_dlen =


Home | Main Index | Thread Index | Old Index