NetBSD-Bugs archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

Re: kern/49175



The following reply was made to PR kern/49175; it has been noted by GNATS.

From: "Sergio L. Pascual" <slp%sinrega.org@localhost>
To: gnats-bugs%netbsd.org@localhost
Cc: 
Subject: Re: kern/49175
Date: Fri, 09 Jan 2015 09:38:01 +0100

 This is a multi-part message in MIME format.
 
 --nextPart23712401.KJzYPYpgBM
 Content-Transfer-Encoding: 7Bit
 Content-Type: text/plain; charset="us-ascii"
 
 The problem comes from ufs_inode.c:284, where indirect blocks are being freed 
 one by one, apparently to avoid generating a large transaction and/or 
 exhausting WABPL resources. This is extremely expensive, for both sparse and 
 regular files. I think, at the very least, we should be able to free those 
 blocks in reasonably sized chunks.
 
 I've attached here a reimplementation of my original patch over the changes 
 applied by Christos over the related code. It addresses this issue by checking 
 the maximum the size for a WAPBL transaction, blocking others from using WAPBL 
 resources by taking a RW_WRITER lock on wl_rwlock, and then issuing such 
 transaction.
 
 Additionally, it yield()s a little every 10 batches (an arbitrary number I've 
 just made up, but sounds reasonable to me), to allow other tasks to use some 
 CPU time.
 
 Sergio.
 
 --nextPart23712401.KJzYPYpgBM
 Content-Disposition: attachment; filename="wapbl_batch_dealloc_v3.patch"
 Content-Transfer-Encoding: 7Bit
 Content-Type: text/x-patch; charset="UTF-8"; name="wapbl_batch_dealloc_v3.patch"
 
 diff --git a/sys/kern/vfs_wapbl.c b/sys/kern/vfs_wapbl.c
 index 2448053..a005db9 100644
 --- a/sys/kern/vfs_wapbl.c
 +++ b/sys/kern/vfs_wapbl.c
 @@ -683,7 +683,7 @@ wapbl_stop(struct wapbl *wl, int force)
  	int error;
  
  	WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_stop called\n"));
 -	error = wapbl_flush(wl, 1);
 +	error = wapbl_flush(wl, WAPBL_WAITFOR);
  	if (error) {
  		if (force)
  			wapbl_discard(wl);
 @@ -911,7 +911,7 @@ wapbl_circ_write(struct wapbl *wl, void *data, size_t len, off_t *offp)
  /****************************************************************/
  
  int
 -wapbl_begin(struct wapbl *wl, const char *file, int line)
 +wapbl_begin(struct wapbl *wl, int *deallocmax, const char *file, int line)
  {
  	int doflush;
  	unsigned lockcount;
 @@ -950,9 +950,15 @@ wapbl_begin(struct wapbl *wl, const char *file, int line)
  			return error;
  	}
  
 -	rw_enter(&wl->wl_rwlock, RW_READER);
 +	if (deallocmax)
 +		rw_enter(&wl->wl_rwlock, RW_WRITER);
 +	else
 +		rw_enter(&wl->wl_rwlock, RW_READER);
 +
  	mutex_enter(&wl->wl_mtx);
  	wl->wl_lock_count++;
 +	if (deallocmax)
 +		*deallocmax = wl->wl_dealloclim - wl->wl_dealloccnt;
  	mutex_exit(&wl->wl_mtx);
  
  #if defined(WAPBL_DEBUG_PRINT)
 @@ -967,8 +973,10 @@ wapbl_begin(struct wapbl *wl, const char *file, int line)
  }
  
  void
 -wapbl_end(struct wapbl *wl)
 +wapbl_end(struct wapbl *wl, int dealloc)
  {
 +	int doflush = 0;
 +	unsigned lockcount;
  
  #if defined(WAPBL_DEBUG_PRINT)
  	WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
 @@ -990,12 +998,28 @@ wapbl_end(struct wapbl *wl)
  	}
  #endif
  
 +	if (dealloc) {
 +		mutex_enter(&wl->wl_mtx);
 +		lockcount = wl->wl_lock_count;
 +		doflush = ((wl->wl_bufbytes + (lockcount * MAXPHYS)) >
 +			  wl->wl_bufbytes_max / 2) ||
 +			  ((wl->wl_bufcount + (lockcount * 10)) >
 +			  wl->wl_bufcount_max / 2) ||
 +			  (wapbl_transaction_len(wl) > wl->wl_circ_size / 2) ||
 +			  (wl->wl_dealloccnt >= (wl->wl_dealloclim / 2));
 +		mutex_exit(&wl->wl_mtx);
 +
 +		if (doflush)
 +			wapbl_flush(wl, WAPBL_WAITFOR | WAPBL_LOCKTAKEN);
 +	}
 +
  	mutex_enter(&wl->wl_mtx);
  	KASSERT(wl->wl_lock_count > 0);
  	wl->wl_lock_count--;
  	mutex_exit(&wl->wl_mtx);
  
 -	rw_exit(&wl->wl_rwlock);
 +	if (!doflush)
 +		rw_exit(&wl->wl_rwlock);
  }
  
  void
 @@ -1435,7 +1459,7 @@ wapbl_biodone(struct buf *bp)
   * Write transactions to disk + start I/O for contents
   */
  int
 -wapbl_flush(struct wapbl *wl, int waitfor)
 +wapbl_flush(struct wapbl *wl, int flags)
  {
  	struct buf *bp;
  	struct wapbl_entry *we;
 @@ -1452,7 +1476,7 @@ wapbl_flush(struct wapbl *wl, int waitfor)
  	 * This assumes that the flush callback does not need to be called
  	 * unless there are other outstanding bufs.
  	 */
 -	if (!waitfor) {
 +	if (!(flags & WAPBL_WAITFOR)) {
  		size_t nbufs;
  		mutex_enter(&wl->wl_mtx);	/* XXX need mutex here to
  						   protect the KASSERTS */
 @@ -1468,7 +1492,8 @@ wapbl_flush(struct wapbl *wl, int waitfor)
  	 * XXX we may consider using LK_UPGRADE here
  	 * if we want to call flush from inside a transaction
  	 */
 -	rw_enter(&wl->wl_rwlock, RW_WRITER);
 +	if (!(flags & WAPBL_LOCKTAKEN))
 +		rw_enter(&wl->wl_rwlock, RW_WRITER);
  	wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens,
  	    wl->wl_dealloccnt);
  
 @@ -1639,7 +1664,7 @@ wapbl_flush(struct wapbl *wl, int waitfor)
  	 * If the waitfor flag is set, don't return until everything is
  	 * fully flushed and the on disk log is empty.
  	 */
 -	if (waitfor) {
 +	if (flags & WAPBL_WAITFOR) {
  		error = wapbl_truncate(wl, wl->wl_circ_size - 
  			wl->wl_reserved_bytes, wapbl_lazy_truncate);
  	}
 diff --git a/sys/sys/mount.h b/sys/sys/mount.h
 index 048fcdd..13c4993 100644
 --- a/sys/sys/mount.h
 +++ b/sys/sys/mount.h
 @@ -312,8 +312,8 @@ struct wapbl_ops {
  	void (*wo_wapbl_add_buf)(struct wapbl *, struct buf *);
  	void (*wo_wapbl_remove_buf)(struct wapbl *, struct buf *);
  	void (*wo_wapbl_resize_buf)(struct wapbl *, struct buf *, long, long);
 -	int (*wo_wapbl_begin)(struct wapbl *, const char *, int);
 -	void (*wo_wapbl_end)(struct wapbl *);
 +	int (*wo_wapbl_begin)(struct wapbl *, int *, const char *, int);
 +	void (*wo_wapbl_end)(struct wapbl *, int);
  	void (*wo_wapbl_junlock_assert)(struct wapbl *);
  	void (*wo_wapbl_biodone)(struct buf *);
  };
 @@ -336,9 +336,9 @@ struct wapbl_ops {
      (OLDSZ), (OLDCNT))
  #define WAPBL_BEGIN(MP)							\
      (*(MP)->mnt_wapbl_op->wo_wapbl_begin)((MP)->mnt_wapbl,		\
 -    __FILE__, __LINE__)
 +    NULL, __FILE__, __LINE__)
  #define WAPBL_END(MP)							\
 -    (*(MP)->mnt_wapbl_op->wo_wapbl_end)((MP)->mnt_wapbl)
 +    (*(MP)->mnt_wapbl_op->wo_wapbl_end)((MP)->mnt_wapbl, 0)
  #define WAPBL_JUNLOCK_ASSERT(MP)					\
      (*(MP)->mnt_wapbl_op->wo_wapbl_junlock_assert)((MP)->mnt_wapbl)
  
 diff --git a/sys/sys/wapbl.h b/sys/sys/wapbl.h
 index 735ec26..f64a1d3 100644
 --- a/sys/sys/wapbl.h
 +++ b/sys/sys/wapbl.h
 @@ -125,11 +125,11 @@ int	wapbl_stop(struct wapbl *, int);
   * level if called while a transaction is already in progress
   * by the current process.
   */
 -int	wapbl_begin(struct wapbl *, const char *, int);
 +int	wapbl_begin(struct wapbl *, int *, const char *, int);
  
  
  /* End a transaction or decrement the transaction recursion level */
 -void	wapbl_end(struct wapbl *);
 +void	wapbl_end(struct wapbl *, int);
  
  /*
   * Add a new buffer to the current transaction.  The buffers
 @@ -144,6 +144,10 @@ void	wapbl_remove_buf(struct wapbl *, struct buf *);
  
  void	wapbl_resize_buf(struct wapbl *, struct buf *, long, long);
  
 +/* Flags for wapbl_flush */
 +#define WAPBL_WAITFOR   0x1 /* don't return until everything is flushed */
 +#define WAPBL_LOCKTAKEN 0x2 /* a writer lock for wl_rwlock is already taken */
 +
  /*
   * This will flush all completed transactions to disk and
   * start asynchronous writes on the associated buffers
 diff --git a/sys/ufs/ffs/ffs_vfsops.c b/sys/ufs/ffs/ffs_vfsops.c
 index 1d1e32a..c2f45a6 100644
 --- a/sys/ufs/ffs/ffs_vfsops.c
 +++ b/sys/ufs/ffs/ffs_vfsops.c
 @@ -1634,7 +1634,7 @@ ffs_flushfiles(struct mount *mp, int flags, struct lwp *l)
  	if (error)
  		return error;
  	if (mp->mnt_wapbl) {
 -		error = wapbl_flush(mp->mnt_wapbl, 1);
 +		error = wapbl_flush(mp->mnt_wapbl, WAPBL_WAITFOR);
  		if (flags & FORCECLOSE)
  			error = 0;
  	}
 @@ -2087,7 +2087,7 @@ ffs_suspendctl(struct mount *mp, int cmd)
  			error = fstrans_setstate(mp, FSTRANS_SUSPENDED);
  #ifdef WAPBL
  		if (error == 0 && mp->mnt_wapbl)
 -			error = wapbl_flush(mp->mnt_wapbl, 1);
 +			error = wapbl_flush(mp->mnt_wapbl, WAPBL_WAITFOR);
  #endif
  		if (error != 0) {
  			(void) fstrans_setstate(mp, FSTRANS_NORMAL);
 diff --git a/sys/ufs/ffs/ffs_wapbl.c b/sys/ufs/ffs/ffs_wapbl.c
 index 37deeec..a42fc31 100644
 --- a/sys/ufs/ffs/ffs_wapbl.c
 +++ b/sys/ufs/ffs/ffs_wapbl.c
 @@ -360,7 +360,7 @@ ffs_wapbl_start(struct mount *mp)
  					goto out;
  				}
  				UFS_WAPBL_END(mp);
 -				error = wapbl_flush(mp->mnt_wapbl, 1);
 +				error = wapbl_flush(mp->mnt_wapbl, WAPBL_WAITFOR);
  				if (error)
  					goto out;
  			}
 @@ -409,7 +409,7 @@ ffs_wapbl_stop(struct mount *mp, int force)
  		 * as the only change in the final flush since otherwise
  		 * a transaction may reorder writes.
  		 */
 -		error = wapbl_flush(mp->mnt_wapbl, 1);
 +		error = wapbl_flush(mp->mnt_wapbl, WAPBL_WAITFOR);
  		if (error && !force)
  			return error;
  		if (error && force)
 diff --git a/sys/ufs/ufs/ufs_inode.c b/sys/ufs/ufs/ufs_inode.c
 index db38d6e..4a8e7a2 100644
 --- a/sys/ufs/ufs/ufs_inode.c
 +++ b/sys/ufs/ufs/ufs_inode.c
 @@ -277,25 +277,64 @@ ufs_wapbl_truncate(struct vnode *vp, uint64_t newsize, kauth_cred_t cred)
  {
  	struct inode *ip = VTOI(vp);
  	int error = 0;
 -	uint64_t base, incr;
 +	int count = 0;
 +	int maxblks;
 +	int blksize = 1 << vp->v_mount->mnt_fs_bshift;
  
 -	base = UFS_NDADDR << vp->v_mount->mnt_fs_bshift;
 -	incr = MNINDIR(ip->i_ump) << vp->v_mount->mnt_fs_bshift;/* Power of 2 */
 -	while (ip->i_size > base + incr &&
 -	    (newsize == 0 || ip->i_size > newsize + incr)) {
 +	error = UFS_WAPBL_BEGIN_DEALLOC(vp->v_mount, &maxblks);
 +	if (error)
 +		return error;
 +
 +	while (ip->i_size > newsize) {
  		/*
  		 * round down to next full indirect
  		 * block boundary.
  		 */
 -		uint64_t nsize = base + ((ip->i_size - base - 1) & ~(incr - 1));
 +		uint64_t nsize;
 +		uint64_t allocsize;
 +		uint64_t maxbytes = maxblks << vp->v_mount->mnt_fs_bshift;
 +
 +		/*
 +		 * Non-allocated blocks do not require a transaction, so
 +		 * limit based on the number of actually allocated blocks.
 +		 */
 +		if (ip->i_ump->um_fstype == UFS1) {
 +			allocsize = dbtob((u_quad_t)ip->i_ffs1_blocks);
 +		} else {
 +			allocsize = dbtob(ip->i_ffs2_blocks);
 +		}
 +
 +		if (allocsize > maxbytes &&
 +		    ip->i_size - newsize > maxbytes) {
 +			/*
 +			 * Truncate to the minimum block aligned offset
 +			 * maxbytes allows us to target.
 +			 */
 +			nsize = ip->i_size - maxbytes;
 +			nsize -= nsize & (blksize - 1);
 +		} else {
 +			nsize = newsize;
 +		}
 +
  		error = UFS_TRUNCATE(vp, nsize, 0, cred);
  		if (error)
  			break;
 -		UFS_WAPBL_END(vp->v_mount);
 -		error = UFS_WAPBL_BEGIN(vp->v_mount);
 +		UFS_WAPBL_END_DEALLOC(vp->v_mount);
 +		if (count > 10) {
 +			/*
 +			 * If this is an unusually large operation,
 +			 * be a good neighbor and rest a little.
 +			 */
 +			yield();
 +			count = 0;
 +		} else {
 +			count++;
 +		}
 +		error = UFS_WAPBL_BEGIN_DEALLOC(vp->v_mount, &maxblks);
  		if (error)
  			return error;
  	}
 +	UFS_WAPBL_END_DEALLOC(vp->v_mount);
  	return error;
  }
  
 @@ -304,16 +343,10 @@ ufs_truncate(struct vnode *vp, uint64_t newsize, kauth_cred_t cred)
  {
  	int error;
  
 -	error = UFS_WAPBL_BEGIN(vp->v_mount);
 -	if (error)
 -		return error;
 -
  	if (vp->v_mount->mnt_wapbl)
  		error = ufs_wapbl_truncate(vp, newsize, cred);
 -
 -	if (error == 0)
 +	else
  		error = UFS_TRUNCATE(vp, newsize, 0, cred);
 -	UFS_WAPBL_END(vp->v_mount);
  
  	return error;
  }
 diff --git a/sys/ufs/ufs/ufs_wapbl.h b/sys/ufs/ufs/ufs_wapbl.h
 index 1eab045..47a379f 100644
 --- a/sys/ufs/ufs/ufs_wapbl.h
 +++ b/sys/ufs/ufs/ufs_wapbl.h
 @@ -95,7 +95,8 @@ void	ufs_wapbl_verify_inodes(struct mount *, const char *);
  #endif
  
  static __inline int
 -ufs_wapbl_begin2(struct mount *mp, struct vnode *vp1, struct vnode *vp2,
 +ufs_wapbl_begin2(struct mount *mp, int *dealloc,
 +		 struct vnode *vp1, struct vnode *vp2,
  		 const char *file, int line)
  {
  	if (mp->mnt_wapbl) {
 @@ -105,7 +106,7 @@ ufs_wapbl_begin2(struct mount *mp, struct vnode *vp1, struct vnode *vp2,
  			vref(vp1);
  		if (vp2)
  			vref(vp2);
 -		error = wapbl_begin(mp->mnt_wapbl, file, line);
 +		error = wapbl_begin(mp->mnt_wapbl, dealloc, file, line);
  		if (error)
  			return error;
  #ifdef WAPBL_DEBUG_INODES
 @@ -117,14 +118,14 @@ ufs_wapbl_begin2(struct mount *mp, struct vnode *vp1, struct vnode *vp2,
  }
  
  static __inline void
 -ufs_wapbl_end2(struct mount *mp, struct vnode *vp1, struct vnode *vp2)
 +ufs_wapbl_end2(struct mount *mp, int dealloc, struct vnode *vp1, struct vnode *vp2)
  {
  	if (mp->mnt_wapbl) {
  #ifdef WAPBL_DEBUG_INODES
  		if (mp->mnt_wapbl->wl_lock.lk_exclusivecount == 1)
  			ufs_wapbl_verify_inodes(mp, "wapbl_end");
  #endif
 -		wapbl_end(mp->mnt_wapbl);
 +		wapbl_end(mp->mnt_wapbl, dealloc);
  		if (vp2)
  			vrele(vp2);
  		if (vp1)
 @@ -133,11 +134,14 @@ ufs_wapbl_end2(struct mount *mp, struct vnode *vp1, struct vnode *vp2)
  }
  
  #define	UFS_WAPBL_BEGIN(mp)						\
 -	ufs_wapbl_begin2(mp, NULL, NULL, __FUNCTION__, __LINE__)
 +	ufs_wapbl_begin2(mp, NULL, NULL, NULL, __FUNCTION__, __LINE__)
  #define	UFS_WAPBL_BEGIN1(mp, v1)					\
 -	ufs_wapbl_begin2(mp, v1, NULL, __FUNCTION__, __LINE__)
 -#define	UFS_WAPBL_END(mp)	ufs_wapbl_end2(mp, NULL, NULL)
 -#define	UFS_WAPBL_END1(mp, v1)	ufs_wapbl_end2(mp, v1, NULL)
 +	ufs_wapbl_begin2(mp, NULL, v1, NULL, __FUNCTION__, __LINE__)
 +#define	UFS_WAPBL_BEGIN_DEALLOC(mp, dealloc)				\
 +	ufs_wapbl_begin2(mp, dealloc, NULL, NULL, __FUNCTION__, __LINE__)
 +#define	UFS_WAPBL_END(mp)	ufs_wapbl_end2(mp, 0, NULL, NULL)
 +#define	UFS_WAPBL_END1(mp, v1)	ufs_wapbl_end2(mp, 0, v1, NULL)
 +#define	UFS_WAPBL_END_DEALLOC(mp)	ufs_wapbl_end2(mp, 1, NULL, NULL)
  
  #define	UFS_WAPBL_UPDATE(vp, access, modify, flags)			\
  	if ((vp)->v_mount->mnt_wapbl) {					\
 @@ -165,8 +169,10 @@ ufs_wapbl_end2(struct mount *mp, struct vnode *vp1, struct vnode *vp2)
  #else /* ! WAPBL */
  #define	UFS_WAPBL_BEGIN(mp) (__USE(mp), 0)
  #define	UFS_WAPBL_BEGIN1(mp, v1) 0
 +#define	UFS_WAPBL_BEGIN_DEALLOC(mp, dealloc) (__USE(mp), __USE(dealloc), 0)
  #define	UFS_WAPBL_END(mp)	do { } while (0)
  #define	UFS_WAPBL_END1(mp, v1)
 +#define	UFS_WAPBL_END_DEALLOC(mp, dealloc)
  #define	UFS_WAPBL_UPDATE(vp, access, modify, flags)	do { } while (0)
  #define	UFS_WAPBL_JLOCK_ASSERT(mp)
  #define	UFS_WAPBL_JUNLOCK_ASSERT(mp)
 
 --nextPart23712401.KJzYPYpgBM--
 


Home | Main Index | Thread Index | Old Index