tech-kern archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

Re: Exposing FUA as alternative to DIOCCACHESYNC for WAPBL



Attached is the final version of the patch. It uses the MEDIA prefix for the
flags, but keeps FUA/DPO - i.e. the names are B_MEDIA_FUA and B_MEDIA_DPO.
For wapbl it introduces a sysctl to enable the feature; the default is off
for now.

I plan to commit this later in the week or early next week, unless
there are some serious objections.

Jaromir

2017-03-05 23:22 GMT+01:00 Jaromír Doleček <jaromir.dolecek%gmail.com@localhost>:
> Here is an updated patch. It was updated to check for FUA support
> on SCSI, using the MODE SENSE device-specific flag. The code was tested
> with QEMU-emulated bha(4) and nvme. The WAPBL code was updated to use the
> flag. It keeps the flag naming for now.
>
> In the patch, WAPBL sets the flag for journal writes, and also for the
> metadata buffer for bawrite() call after journal commit.
>
> There is a possible layer violation for the metadata write - b_flags are
> supposed to be set by the owner of the buffer. Not sure how strict we
> want/need to be there - perhaps introduce another flag field? Also, the
> flag probably needs to be unset in the biodone hook, so that the code
> guarantees a buffer in the buffer cache doesn't accidentally carry it over
> to another I/O.
>
> Jaromir
? dev/ic/TODO.nvme
Index: sys/buf.h
===================================================================
RCS file: /cvsroot/src/sys/sys/buf.h,v
retrieving revision 1.126
diff -u -p -r1.126 buf.h
--- sys/buf.h	26 Dec 2016 23:12:33 -0000	1.126
+++ sys/buf.h	27 Mar 2017 22:31:22 -0000
@@ -198,16 +198,21 @@ struct buf {
 #define	B_RAW		0x00080000	/* Set by physio for raw transfers. */
 #define	B_READ		0x00100000	/* Read buffer. */
 #define	B_DEVPRIVATE	0x02000000	/* Device driver private flag. */
+#define	B_MEDIA_FUA	0x08000000	/* Set Force Unit Access for media. */
+#define	B_MEDIA_DPO	0x10000000	/* Set Disable Page Out for media. */
 
 #define BUF_FLAGBITS \
     "\20\1AGE\3ASYNC\4BAD\5BUSY\10DELWRI" \
     "\12DONE\13COWDONE\15GATHERED\16INVAL\17LOCKED\20NOCACHE" \
-    "\23PHYS\24RAW\25READ\32DEVPRIVATE\33VFLUSH"
+    "\23PHYS\24RAW\25READ\32DEVPRIVATE\33VFLUSH\34MEDIAFUA\35MEDIADPO"
 
 /* Avoid weird code due to B_WRITE being a "pseudo flag" */
 #define BUF_ISREAD(bp)	(((bp)->b_flags & B_READ) == B_READ)
 #define BUF_ISWRITE(bp)	(((bp)->b_flags & B_READ) == B_WRITE)
 
+/* Media flags, to be passed for nested I/O */
+#define B_MEDIA_FLAGS	(B_MEDIA_FUA|B_MEDIA_DPO)
+
 /*
  * This structure describes a clustered I/O.  It is stored in the b_saveaddr
  * field of the buffer on which I/O is done.  At I/O completion, cluster
Index: sys/dkio.h
===================================================================
RCS file: /cvsroot/src/sys/sys/dkio.h,v
retrieving revision 1.22
diff -u -p -r1.22 dkio.h
--- sys/dkio.h	8 Dec 2015 20:36:15 -0000	1.22
+++ sys/dkio.h	27 Mar 2017 22:31:22 -0000
@@ -85,6 +85,8 @@
 #define	DKCACHE_RCHANGE	0x000100 /* read enable is changeable */
 #define	DKCACHE_WCHANGE	0x000200 /* write enable is changeable */
 #define	DKCACHE_SAVE	0x010000 /* cache parameters are savable/save them */
+#define	DKCACHE_FUA	0x020000 /* Force Unit Access supported */
+#define	DKCACHE_DPO	0x040000 /* Disable Page Out supported */
 
 		/* sync disk cache */
 #define	DIOCCACHESYNC	_IOW('d', 118, int)	/* sync cache (force?) */
Index: kern/vfs_bio.c
===================================================================
RCS file: /cvsroot/src/sys/kern/vfs_bio.c,v
retrieving revision 1.271
diff -u -p -r1.271 vfs_bio.c
--- kern/vfs_bio.c	21 Mar 2017 10:46:49 -0000	1.271
+++ kern/vfs_bio.c	27 Mar 2017 22:31:22 -0000
@@ -2027,7 +2027,7 @@ nestiobuf_iodone(buf_t *bp)
 void
 nestiobuf_setup(buf_t *mbp, buf_t *bp, int offset, size_t size)
 {
-	const int b_read = mbp->b_flags & B_READ;
+	const int b_pass = mbp->b_flags & (B_READ|B_MEDIA_FLAGS);
 	struct vnode *vp = mbp->b_vp;
 
 	KASSERT(mbp->b_bcount >= offset + size);
@@ -2035,14 +2035,14 @@ nestiobuf_setup(buf_t *mbp, buf_t *bp, i
 	bp->b_dev = mbp->b_dev;
 	bp->b_objlock = mbp->b_objlock;
 	bp->b_cflags = BC_BUSY;
-	bp->b_flags = B_ASYNC | b_read;
+	bp->b_flags = B_ASYNC | b_pass;
 	bp->b_iodone = nestiobuf_iodone;
 	bp->b_data = (char *)mbp->b_data + offset;
 	bp->b_resid = bp->b_bcount = size;
 	bp->b_bufsize = bp->b_bcount;
 	bp->b_private = mbp;
 	BIO_COPYPRIO(bp, mbp);
-	if (!b_read && vp != NULL) {
+	if (BUF_ISWRITE(bp) && vp != NULL) {
 		mutex_enter(vp->v_interlock);
 		vp->v_numoutput++;
 		mutex_exit(vp->v_interlock);
Index: kern/vfs_wapbl.c
===================================================================
RCS file: /cvsroot/src/sys/kern/vfs_wapbl.c,v
retrieving revision 1.92
diff -u -p -r1.92 vfs_wapbl.c
--- kern/vfs_wapbl.c	17 Mar 2017 03:19:46 -0000	1.92
+++ kern/vfs_wapbl.c	27 Mar 2017 22:31:23 -0000
@@ -71,6 +71,7 @@ __KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c,
 static struct sysctllog *wapbl_sysctl;
 static int wapbl_flush_disk_cache = 1;
 static int wapbl_verbose_commit = 0;
+static int wapbl_use_fua = 0; 	/* switched off by default for now */
 
 static inline size_t wapbl_space_free(size_t, off_t, off_t);
 
@@ -230,6 +231,16 @@ struct wapbl {
 	u_char *wl_buffer;	/* l:   buffer for wapbl_buffered_write() */
 	daddr_t wl_buffer_dblk;	/* l:   buffer disk block address */
 	size_t wl_buffer_used;	/* l:   buffer current use */
+
+	int wl_dkcache;		/* r: 	disk cache flags */
+#define WAPBL_USE_FUA(wl)	\
+		(wapbl_use_fua && ISSET((wl)->wl_dkcache, DKCACHE_FUA))
+#define WAPBL_JFLAGS(wl)	\
+		(WAPBL_USE_FUA(wl) ? (wl)->wl_jwrite_flags : 0)
+#define WAPBL_MFLAGS(wl)	\
+		(WAPBL_USE_FUA(wl) ? (wl)->wl_mwrite_flags : 0)
+	int wl_jwrite_flags;	/* r: 	journal write flags */
+	int wl_mwrite_flags;	/* r:	metadata write flags */
 };
 
 #ifdef WAPBL_DEBUG_PRINT
@@ -281,6 +292,8 @@ static void wapbl_deallocation_free(stru
 static void wapbl_evcnt_init(struct wapbl *);
 static void wapbl_evcnt_free(struct wapbl *);
 
+static void wapbl_dkcache_init(struct wapbl *);
+
 #if 0
 int wapbl_replay_verify(struct wapbl_replay *, struct vnode *);
 #endif
@@ -335,6 +348,18 @@ wapbl_sysctl_init(void)
 		       SYSCTL_DESCR("show time and size of wapbl log commits"),
 		       NULL, 0, &wapbl_verbose_commit, 0,
 		       CTL_CREATE, CTL_EOL);
+	if (rv)
+		return rv;
+
+	rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode,
+		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
+		       CTLTYPE_INT, "use_fua",
+		       SYSCTL_DESCR("use FUA/DPO instead of cash flush if available"),
+		       NULL, 0, &wapbl_use_fua, 0,
+		       CTL_CREATE, CTL_EOL);
+	if (rv)
+		return rv;
+
 	return rv;
 }
 
@@ -391,6 +416,30 @@ wapbl_evcnt_free(struct wapbl *wl)
 	evcnt_detach(&wl->wl_ev_cacheflush);
 }
 
+static void
+wapbl_dkcache_init(struct wapbl *wl)
+{
+	int error;
+
+	/* Get disk cache flags */
+	error = VOP_IOCTL(wl->wl_devvp, DIOCGCACHE, &wl->wl_dkcache,
+	    FWRITE, FSCRED);
+	if (error) {
+		/* behave as if there was a write cache */
+		wl->wl_dkcache = DKCACHE_WRITE;
+	}
+
+	/* Use FUA instead of cache flush if available */
+	if (ISSET(wl->wl_dkcache, DKCACHE_FUA)) {
+		wl->wl_jwrite_flags |= B_MEDIA_FUA;
+		wl->wl_mwrite_flags |= B_MEDIA_FUA;
+	}
+
+	/* Use DPO for journal writes if available */
+	if (ISSET(wl->wl_dkcache, DKCACHE_DPO))
+		wl->wl_jwrite_flags |= B_MEDIA_DPO;
+}
+
 static int
 wapbl_start_flush_inodes(struct wapbl *wl, struct wapbl_replay *wr)
 {
@@ -563,6 +612,8 @@ wapbl_start(struct wapbl ** wlp, struct 
 
 	wapbl_evcnt_init(wl);
 
+	wapbl_dkcache_init(wl);
+
 	/* Initialize the commit header */
 	{
 		struct wapbl_wc_header *wc;
@@ -809,7 +860,6 @@ wapbl_doio(void *data, size_t len, struc
 	struct buf *bp;
 	int error;
 
-	KASSERT((flags & ~(B_WRITE | B_READ)) == 0);
 	KASSERT(devvp->v_type == VBLK);
 
 	if ((flags & (B_WRITE | B_READ)) == B_WRITE) {
@@ -823,7 +873,7 @@ wapbl_doio(void *data, size_t len, struc
 
 	bp = getiobuf(devvp, true);
 	bp->b_flags = flags;
-	bp->b_cflags = BC_BUSY; /* silly & dubious */
+	bp->b_cflags = BC_BUSY;	/* mandatory, asserted by biowait() */
 	bp->b_dev = devvp->v_rdev;
 	bp->b_data = data;
 	bp->b_bufsize = bp->b_resid = bp->b_bcount = len;
@@ -898,7 +948,8 @@ wapbl_buffered_flush(struct wapbl *wl)
 		return 0;
 
 	error = wapbl_doio(wl->wl_buffer, wl->wl_buffer_used,
-	    wl->wl_devvp, wl->wl_buffer_dblk, B_WRITE);
+	    wl->wl_devvp, wl->wl_buffer_dblk,
+	    B_WRITE | WAPBL_JFLAGS(wl));
 	wl->wl_buffer_used = 0;
 
 	wl->wl_ev_journalwrite.ev_count++;
@@ -948,12 +999,10 @@ wapbl_buffered_write(void *data, size_t 
 	if (len >= resid) {
 		memcpy(wl->wl_buffer + wl->wl_buffer_used, data, resid);
 		wl->wl_buffer_used += resid;
-		error = wapbl_doio(wl->wl_buffer, wl->wl_buffer_used,
-		    wl->wl_devvp, wl->wl_buffer_dblk, B_WRITE);
+		error = wapbl_buffered_flush(wl);
 		data = (uint8_t *)data + resid;
 		len -= resid;
 		wl->wl_buffer_dblk = pbn + btodb(resid);
-		wl->wl_buffer_used = 0;
 		if (error)
 			return error;
 	}
@@ -1500,6 +1549,13 @@ wapbl_biodone(struct buf *bp)
 	}
 
 	/*
+	 * Make sure that the buf doesn't retain the media flags, so that
+	 * e.g. wapbl_use_fua has immediate effect on any following I/O.
+	 * The flags will be set again if needed by another I/O.
+	 */
+	bp->b_flags &= ~B_MEDIA_FLAGS;
+
+	/*
 	 * Release the buffer here. wapbl_flush() may wait for the
 	 * log to become empty and we better unbusy the buffer before
 	 * wapbl_flush() returns.
@@ -1754,6 +1810,10 @@ wapbl_flush(struct wapbl *wl, int waitfo
 		}
 		bp->b_iodone = wapbl_biodone;
 		bp->b_private = we;
+
+		/* make sure the block is saved sync when FUA in use */
+		bp->b_flags |= WAPBL_MFLAGS(wl);
+
 		bremfree(bp);
 		wapbl_remove_buf_locked(wl, bp);
 		mutex_exit(&wl->wl_mtx);
@@ -2201,7 +2261,8 @@ wapbl_cache_sync(struct wapbl *wl, const
 	int force = 1;
 	int error;
 
-	if (!wapbl_flush_disk_cache) {
+	/* Skip full cache sync if disabled, or when using FUA */
+	if (!wapbl_flush_disk_cache || WAPBL_USE_FUA(wl)) {
 		return 0;
 	}
 	if (verbose) {
Index: dev/ic/ld_nvme.c
===================================================================
RCS file: /cvsroot/src/sys/dev/ic/ld_nvme.c,v
retrieving revision 1.14
diff -u -p -r1.14 ld_nvme.c
--- dev/ic/ld_nvme.c	28 Feb 2017 20:55:09 -0000	1.14
+++ dev/ic/ld_nvme.c	27 Mar 2017 22:31:23 -0000
@@ -152,11 +152,15 @@ static int
 ld_nvme_start(struct ld_softc *ld, struct buf *bp)
 {
 	struct ld_nvme_softc *sc = device_private(ld->sc_dv);
+	int flags = BUF_ISWRITE(bp) ? 0 : NVME_NS_CTX_F_READ;
+
+	if (bp->b_flags & B_MEDIA_FUA)
+		flags |= NVME_NS_CTX_F_FUA;
 
 	return nvme_ns_dobio(sc->sc_nvme, sc->sc_nsid, sc,
 	    bp, bp->b_data, bp->b_bcount,
 	    sc->sc_ld.sc_secsize, bp->b_rawblkno,
-	    BUF_ISWRITE(bp) ? 0 : NVME_NS_CTX_F_READ,
+	    flags,
 	    ld_nvme_biodone);
 }
 
@@ -221,7 +225,11 @@ ld_nvme_getcache(struct ld_softc *ld, in
 	int error;
 	struct ld_nvme_softc *sc = device_private(ld->sc_dv);
 
-	*addr = 0;
+	/*
+	 * DPO not supported, Dataset Management (DSM) field doesn't specify
+	 * the same semantics.
+	 */ 
+	*addr = DKCACHE_FUA;
 
 	if (!nvme_has_volatile_write_cache(sc->sc_nvme)) {
 		/* cache simply not present */
Index: dev/ic/nvme.c
===================================================================
RCS file: /cvsroot/src/sys/dev/ic/nvme.c,v
retrieving revision 1.25
diff -u -p -r1.25 nvme.c
--- dev/ic/nvme.c	28 Feb 2017 20:53:50 -0000	1.25
+++ dev/ic/nvme.c	27 Mar 2017 22:31:23 -0000
@@ -727,6 +727,9 @@ nvme_ns_io_fill(struct nvme_queue *q, st
 
 	htolem64(&sqe->slba, ccb->nnc_blkno);
 
+	if (ISSET(ccb->nnc_flags, NVME_NS_CTX_F_FUA))
+		htolem16(&sqe->ioflags, NVM_SQE_IO_FUA);
+
 	/* guaranteed by upper layers, but check just in case */
 	KASSERT((ccb->nnc_datasize % ccb->nnc_secsize) == 0);
 	htolem16(&sqe->nlb, (ccb->nnc_datasize / ccb->nnc_secsize) - 1);
Index: dev/ic/nvmevar.h
===================================================================
RCS file: /cvsroot/src/sys/dev/ic/nvmevar.h,v
retrieving revision 1.12
diff -u -p -r1.12 nvmevar.h
--- dev/ic/nvmevar.h	28 Feb 2017 20:53:50 -0000	1.12
+++ dev/ic/nvmevar.h	27 Mar 2017 22:31:23 -0000
@@ -64,6 +64,7 @@ struct nvme_ccb {
 	uint16_t	nnc_flags;
 #define	NVME_NS_CTX_F_READ	__BIT(0)
 #define	NVME_NS_CTX_F_POLL	__BIT(1)
+#define	NVME_NS_CTX_F_FUA	__BIT(2)
 
 	struct buf	*nnc_buf;
 	daddr_t		nnc_blkno;
Index: dev/scsipi/scsipi_disk.h
===================================================================
RCS file: /cvsroot/src/sys/dev/scsipi/scsipi_disk.h,v
retrieving revision 1.21
diff -u -p -r1.21 scsipi_disk.h
--- dev/scsipi/scsipi_disk.h	25 Dec 2007 18:33:42 -0000	1.21
+++ dev/scsipi/scsipi_disk.h	27 Mar 2017 22:31:23 -0000
@@ -62,9 +62,10 @@ struct scsipi_rw_10 {
 	u_int8_t opcode;
 	u_int8_t byte2;
 #define	SRWB_RELADDR	0x01	/* obsolete */
-#define	SRWB_FUA_NV	0x02	/* force unit access non-volatile cache */
-#define	SRWB_FUA	0x08	/* force unit access */
-#define	SRWB_DPO	0x10	/* disable page out */
+#define	SRWB_FUA_NV	0x02	/* force unit access non-volatile cache (SCSI-3) */
+#define	SRWB_RESV2	0x04	/* reserved (SCSI-2) */
+#define	SRWB_FUA	0x08	/* force unit access volatile cache (SCSI-2) */
+#define	SRWB_DPO	0x10	/* disable page out (SCSI-2) */
 #define	SRWB_PROTECT(x) ((x) << 5)
 	u_int8_t addr[4];
 	u_int8_t reserved;
@@ -159,4 +160,7 @@ struct scsipi_capacity_descriptor {
 #define	SCSIPI_CAP_DESC_CODE_FORMATTED		0x2
 #define	SCSIPI_CAP_DESC_CODE_NONE		0x3
 
+/* defines for the device specific byte in the mode select/sense header */
+#define	SMH_DSP_DPOFUA		0x10
+
 #endif /* _DEV_SCSIPI_SCSIPI_DISK_H_ */
Index: dev/scsipi/sd.c
===================================================================
RCS file: /cvsroot/src/sys/dev/scsipi/sd.c,v
retrieving revision 1.322
diff -u -p -r1.322 sd.c
--- dev/scsipi/sd.c	21 Dec 2016 21:28:30 -0000	1.322
+++ dev/scsipi/sd.c	27 Mar 2017 22:31:23 -0000
@@ -654,6 +654,7 @@ sd_diskstart(device_t dev, struct buf *b
 	struct scsipi_generic *cmdp;
 	struct scsipi_xfer *xs;
 	int error, flags, nblks, cmdlen;
+	int cdb_flags;
 
 	mutex_enter(chan_mtx(chan));
 
@@ -698,12 +699,27 @@ sd_diskstart(device_t dev, struct buf *b
 		nblks = howmany(bp->b_bcount, sd->params.blksize);
 
 	/*
+	 * Pass FUA and/or DPO if requested. Must be done before CDB
+	 * selection, as 6-byte CDB doesn't support the flags.
+	 */
+	cdb_flags = 0;
+
+	if (bp->b_flags & B_MEDIA_FUA)
+		cdb_flags |= SRWB_FUA;
+
+	if (bp->b_flags & B_MEDIA_DPO)
+		cdb_flags |= SRWB_DPO;
+
+	/*
 	 * Fill out the scsi command.  Use the smallest CDB possible
-	 * (6-byte, 10-byte, or 16-byte).
+	 * (6-byte, 10-byte, or 16-byte). If we need FUA or DPO,
+	 * need to use 10-byte or bigger, as the 6-byte doesn't support
+	 * the flags.
 	 */
 	if (((bp->b_rawblkno & 0x1fffff) == bp->b_rawblkno) &&
 	    ((nblks & 0xff) == nblks) &&
-	    !(periph->periph_quirks & PQUIRK_ONLYBIG)) {
+	    !(periph->periph_quirks & PQUIRK_ONLYBIG) &&
+	    !cdb_flags) {
 		/* 6-byte CDB */
 		memset(&cmd_small, 0, sizeof(cmd_small));
 		cmd_small.opcode = (bp->b_flags & B_READ) ?
@@ -732,6 +748,9 @@ sd_diskstart(device_t dev, struct buf *b
 		cmdp = (struct scsipi_generic *)&cmd16;
 	}
 
+	if (cdb_flags)
+		cmdp->bytes[0] = cdb_flags;
+
 	/*
 	 * Figure out what flags to use.
 	 */
@@ -1796,7 +1815,9 @@ sd_getcache(struct sd_softc *sd, int *bi
 	int error, bits = 0;
 	int big;
 	union scsi_disk_pages *pages;
+	uint8_t dev_spec;
 
+	/* only SCSI-2 and later supported */
 	if (periph->periph_version < 2)
 		return (EOPNOTSUPP);
 
@@ -1806,10 +1827,13 @@ sd_getcache(struct sd_softc *sd, int *bi
 	if (error)
 		return (error);
 
-	if (big)
+	if (big) {
 		pages = (void *)(&scsipi_sense.header.big + 1);
-	else
+		dev_spec = scsipi_sense.header.big.dev_spec;
+	} else {
 		pages = (void *)(&scsipi_sense.header.small + 1);
+		dev_spec = scsipi_sense.header.small.dev_spec;
+	}
 
 	if ((pages->caching_params.flags & CACHING_RCD) == 0)
 		bits |= DKCACHE_READ;
@@ -1818,6 +1842,13 @@ sd_getcache(struct sd_softc *sd, int *bi
 	if (pages->caching_params.pg_code & PGCODE_PS)
 		bits |= DKCACHE_SAVE;
 
+	/*
+	 * Support for FUA/DPO, defined starting with SCSI-2. Use only
+	 * if device claims to support it, according to the MODE SENSE.
+	 */
+	if (ISSET(dev_spec, SMH_DSP_DPOFUA))
+		bits |= DKCACHE_FUA | DKCACHE_DPO;
+
 	memset(&scsipi_sense, 0, sizeof(scsipi_sense));
 	error = sd_mode_sense(sd, SMS_DBD, &scsipi_sense,
 	    sizeof(scsipi_sense.pages.caching_params),


Home | Main Index | Thread Index | Old Index