tech-kern archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

Re: Exposing FUA as alternative to DIOCCACHESYNC for WAPBL



Here is an updated patch. It now checks for FUA support on SCSI
devices, using the device-specific flag from MODE SENSE. The code was
tested with QEMU-emulated bha(4) and nvme. The WAPBL code was updated
to use the flag. The patch keeps the flag naming for now.

In the patch, WAPBL sets the flag for journal writes, and also on the
metadata buffer for the bawrite() call after the journal commit.

There is a possible layer violation for the metadata write - b_flags
are supposed to be set by the owner of the buffer. I'm not sure how
strict we want/need to be there - perhaps introduce another flag field?
Also, the flag probably needs to be unset in the biodone hook, so that
the code guarantees the buffer in the buffer cache doesn't accidentally
carry it over to another I/O.

Jaromir
? dev/ic/TODO.nvme
Index: sys/buf.h
===================================================================
RCS file: /cvsroot/src/sys/sys/buf.h,v
retrieving revision 1.126
diff -u -p -r1.126 buf.h
--- sys/buf.h	26 Dec 2016 23:12:33 -0000	1.126
+++ sys/buf.h	5 Mar 2017 22:08:35 -0000
@@ -198,11 +198,13 @@ struct buf {
 #define	B_RAW		0x00080000	/* Set by physio for raw transfers. */
 #define	B_READ		0x00100000	/* Read buffer. */
 #define	B_DEVPRIVATE	0x02000000	/* Device driver private flag. */
+#define	B_FUA		0x08000000	/* Force Unit Access flag (mandatory). */
+#define	B_DPO		0x10000000	/* Disable Page Out flag (advisory). */
 
 #define BUF_FLAGBITS \
     "\20\1AGE\3ASYNC\4BAD\5BUSY\10DELWRI" \
     "\12DONE\13COWDONE\15GATHERED\16INVAL\17LOCKED\20NOCACHE" \
-    "\23PHYS\24RAW\25READ\32DEVPRIVATE\33VFLUSH"
+    "\23PHYS\24RAW\25READ\32DEVPRIVATE\33VFLUSH\34FUA\35DPO"
 
 /* Avoid weird code due to B_WRITE being a "pseudo flag" */
 #define BUF_ISREAD(bp)	(((bp)->b_flags & B_READ) == B_READ)
Index: sys/dkio.h
===================================================================
RCS file: /cvsroot/src/sys/sys/dkio.h,v
retrieving revision 1.22
diff -u -p -r1.22 dkio.h
--- sys/dkio.h	8 Dec 2015 20:36:15 -0000	1.22
+++ sys/dkio.h	5 Mar 2017 22:08:35 -0000
@@ -85,6 +85,8 @@
 #define	DKCACHE_RCHANGE	0x000100 /* read enable is changeable */
 #define	DKCACHE_WCHANGE	0x000200 /* write enable is changeable */
 #define	DKCACHE_SAVE	0x010000 /* cache parameters are savable/save them */
+#define	DKCACHE_FUA	0x020000 /* Force Unit Access supported */
+#define	DKCACHE_DPO	0x040000 /* Disable Page Out supported */
 
 		/* sync disk cache */
 #define	DIOCCACHESYNC	_IOW('d', 118, int)	/* sync cache (force?) */
Index: kern/vfs_wapbl.c
===================================================================
RCS file: /cvsroot/src/sys/kern/vfs_wapbl.c,v
retrieving revision 1.87
diff -u -p -r1.87 vfs_wapbl.c
--- kern/vfs_wapbl.c	5 Mar 2017 13:57:29 -0000	1.87
+++ kern/vfs_wapbl.c	5 Mar 2017 22:08:35 -0000
@@ -70,6 +70,7 @@ __KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c,
 static struct sysctllog *wapbl_sysctl;
 static int wapbl_flush_disk_cache = 1;
 static int wapbl_verbose_commit = 0;
+static int wapbl_use_fua = 1;
 
 static inline size_t wapbl_space_free(size_t, off_t, off_t);
 
@@ -229,6 +230,12 @@ struct wapbl {
 	u_char *wl_buffer;	/* l:   buffer for wapbl_buffered_write() */
 	daddr_t wl_buffer_dblk;	/* l:   buffer disk block address */
 	size_t wl_buffer_used;	/* l:   buffer current use */
+
+	int wl_dkcache;		/* r: 	disk cache flags */
+#define WAPBL_USE_FUA(wl)	\
+		(wapbl_use_fua && ISSET(wl->wl_dkcache, DKCACHE_FUA))
+	int wl_jwrite_flags;	/* r: 	journal write flags */
+	int wl_mwrite_flags;	/* r:	metadata write flags */
 };
 
 #ifdef WAPBL_DEBUG_PRINT
@@ -280,6 +287,8 @@ static void wapbl_deallocation_free(stru
 static void wapbl_evcnt_init(struct wapbl *);
 static void wapbl_evcnt_free(struct wapbl *);
 
+static void wapbl_dkcache_init(struct wapbl *);
+
 #if 0
 int wapbl_replay_verify(struct wapbl_replay *, struct vnode *);
 #endif
@@ -390,6 +399,30 @@ wapbl_evcnt_free(struct wapbl *wl)
 	evcnt_detach(&wl->wl_ev_cacheflush);
 }
 
+static void
+wapbl_dkcache_init(struct wapbl *wl)
+{
+	int error;
+
+	/* Get disk cache flags */
+	error = VOP_IOCTL(wl->wl_devvp, DIOCGCACHE, &wl->wl_dkcache,
+	    FWRITE, FSCRED);
+	if (error) {
+		/* behave as if there is a write cache */
+		wl->wl_dkcache = DKCACHE_WRITE;
+	}
+
+	/* Use FUA instead of cache flush if available */
+	if (WAPBL_USE_FUA(wl)) {
+		wl->wl_jwrite_flags |= B_FUA;
+		wl->wl_mwrite_flags |= B_FUA;
+	}
+
+	/* Use DPO for journal writes if available */
+	if (ISSET(wl->wl_dkcache, DKCACHE_DPO))
+		wl->wl_jwrite_flags |= B_DPO;
+}
+
 static int
 wapbl_start_flush_inodes(struct wapbl *wl, struct wapbl_replay *wr)
 {
@@ -562,6 +595,8 @@ wapbl_start(struct wapbl ** wlp, struct 
 
 	wapbl_evcnt_init(wl);
 
+	wapbl_dkcache_init(wl);
+
 	/* Initialize the commit header */
 	{
 		struct wapbl_wc_header *wc;
@@ -808,7 +843,6 @@ wapbl_doio(void *data, size_t len, struc
 	struct buf *bp;
 	int error;
 
-	KASSERT((flags & ~(B_WRITE | B_READ)) == 0);
 	KASSERT(devvp->v_type == VBLK);
 
 	if ((flags & (B_WRITE | B_READ)) == B_WRITE) {
@@ -822,7 +856,7 @@ wapbl_doio(void *data, size_t len, struc
 
 	bp = getiobuf(devvp, true);
 	bp->b_flags = flags;
-	bp->b_cflags = BC_BUSY; /* silly & dubious */
+	bp->b_cflags = BC_BUSY;	/* mandatory, asserted by biowait() */
 	bp->b_dev = devvp->v_rdev;
 	bp->b_data = data;
 	bp->b_bufsize = bp->b_resid = bp->b_bcount = len;
@@ -897,7 +931,8 @@ wapbl_buffered_flush(struct wapbl *wl)
 		return 0;
 
 	error = wapbl_doio(wl->wl_buffer, wl->wl_buffer_used,
-	    wl->wl_devvp, wl->wl_buffer_dblk, B_WRITE);
+	    wl->wl_devvp, wl->wl_buffer_dblk,
+	    B_WRITE | wl->wl_jwrite_flags);
 	wl->wl_buffer_used = 0;
 
 	wl->wl_ev_journalwrite.ev_count++;
@@ -947,12 +982,10 @@ wapbl_buffered_write(void *data, size_t 
 	if (len >= resid) {
 		memcpy(wl->wl_buffer + wl->wl_buffer_used, data, resid);
 		wl->wl_buffer_used += resid;
-		error = wapbl_doio(wl->wl_buffer, wl->wl_buffer_used,
-		    wl->wl_devvp, wl->wl_buffer_dblk, B_WRITE);
+		error = wapbl_buffered_flush(wl);
 		data = (uint8_t *)data + resid;
 		len -= resid;
 		wl->wl_buffer_dblk = pbn + btodb(resid);
-		wl->wl_buffer_used = 0;
 		if (error)
 			return error;
 	}
@@ -1498,6 +1531,9 @@ wapbl_biodone(struct buf *bp)
 		mutex_exit(&wl->wl_mtx);
 	}
 
+	/* XXX unset FUA again here? */
+	/* bp->b_flags &= ~wl->wl_mwrite_flags; */
+
 	/*
 	 * Release the buffer here. wapbl_flush() may wait for the
 	 * log to become empty and we better unbusy the buffer before
@@ -1753,6 +1789,10 @@ wapbl_flush(struct wapbl *wl, int waitfo
 		}
 		bp->b_iodone = wapbl_biodone;
 		bp->b_private = we;
+
+		/* make sure the block is saved sync when FUA in use */
+		bp->b_flags |= wl->wl_mwrite_flags;
+
 		bremfree(bp);
 		wapbl_remove_buf_locked(wl, bp);
 		mutex_exit(&wl->wl_mtx);
@@ -2200,7 +2240,7 @@ wapbl_cache_sync(struct wapbl *wl, const
 	int force = 1;
 	int error;
 
-	if (!wapbl_flush_disk_cache) {
+	if (!wapbl_flush_disk_cache || WAPBL_USE_FUA(wl)) {
 		return 0;
 	}
 	if (verbose) {
Index: dev/ic/ld_nvme.c
===================================================================
RCS file: /cvsroot/src/sys/dev/ic/ld_nvme.c,v
retrieving revision 1.14
diff -u -p -r1.14 ld_nvme.c
--- dev/ic/ld_nvme.c	28 Feb 2017 20:55:09 -0000	1.14
+++ dev/ic/ld_nvme.c	5 Mar 2017 22:08:35 -0000
@@ -152,11 +152,15 @@ static int
 ld_nvme_start(struct ld_softc *ld, struct buf *bp)
 {
 	struct ld_nvme_softc *sc = device_private(ld->sc_dv);
+	int flags = BUF_ISWRITE(bp) ? 0 : NVME_NS_CTX_F_READ;
+
+	if (bp->b_flags & B_FUA)
+		flags |= NVME_NS_CTX_F_FUA;
 
 	return nvme_ns_dobio(sc->sc_nvme, sc->sc_nsid, sc,
 	    bp, bp->b_data, bp->b_bcount,
 	    sc->sc_ld.sc_secsize, bp->b_rawblkno,
-	    BUF_ISWRITE(bp) ? 0 : NVME_NS_CTX_F_READ,
+	    flags,
 	    ld_nvme_biodone);
 }
 
@@ -221,7 +225,11 @@ ld_nvme_getcache(struct ld_softc *ld, in
 	int error;
 	struct ld_nvme_softc *sc = device_private(ld->sc_dv);
 
-	*addr = 0;
+	/*
+	 * DPO not supported, Dataset Management (DSM) field doesn't specify
+	 * the same semantics.
+	 */ 
+	*addr = DKCACHE_FUA;
 
 	if (!nvme_has_volatile_write_cache(sc->sc_nvme)) {
 		/* cache simply not present */
Index: dev/ic/nvme.c
===================================================================
RCS file: /cvsroot/src/sys/dev/ic/nvme.c,v
retrieving revision 1.25
diff -u -p -r1.25 nvme.c
--- dev/ic/nvme.c	28 Feb 2017 20:53:50 -0000	1.25
+++ dev/ic/nvme.c	5 Mar 2017 22:08:35 -0000
@@ -727,6 +727,9 @@ nvme_ns_io_fill(struct nvme_queue *q, st
 
 	htolem64(&sqe->slba, ccb->nnc_blkno);
 
+	if (ISSET(ccb->nnc_flags, NVME_NS_CTX_F_FUA))
+		htolem16(&sqe->ioflags, NVM_SQE_IO_FUA);
+
 	/* guaranteed by upper layers, but check just in case */
 	KASSERT((ccb->nnc_datasize % ccb->nnc_secsize) == 0);
 	htolem16(&sqe->nlb, (ccb->nnc_datasize / ccb->nnc_secsize) - 1);
Index: dev/ic/nvmevar.h
===================================================================
RCS file: /cvsroot/src/sys/dev/ic/nvmevar.h,v
retrieving revision 1.12
diff -u -p -r1.12 nvmevar.h
--- dev/ic/nvmevar.h	28 Feb 2017 20:53:50 -0000	1.12
+++ dev/ic/nvmevar.h	5 Mar 2017 22:08:35 -0000
@@ -64,6 +64,7 @@ struct nvme_ccb {
 	uint16_t	nnc_flags;
 #define	NVME_NS_CTX_F_READ	__BIT(0)
 #define	NVME_NS_CTX_F_POLL	__BIT(1)
+#define	NVME_NS_CTX_F_FUA	__BIT(2)
 
 	struct buf	*nnc_buf;
 	daddr_t		nnc_blkno;
Index: dev/scsipi/scsipi_disk.h
===================================================================
RCS file: /cvsroot/src/sys/dev/scsipi/scsipi_disk.h,v
retrieving revision 1.21
diff -u -p -r1.21 scsipi_disk.h
--- dev/scsipi/scsipi_disk.h	25 Dec 2007 18:33:42 -0000	1.21
+++ dev/scsipi/scsipi_disk.h	5 Mar 2017 22:08:35 -0000
@@ -62,9 +62,10 @@ struct scsipi_rw_10 {
 	u_int8_t opcode;
 	u_int8_t byte2;
 #define	SRWB_RELADDR	0x01	/* obsolete */
-#define	SRWB_FUA_NV	0x02	/* force unit access non-volatile cache */
-#define	SRWB_FUA	0x08	/* force unit access */
-#define	SRWB_DPO	0x10	/* disable page out */
+#define	SRWB_FUA_NV	0x02	/* force unit access non-volatile cache (SCSI-3) */
+#define	SRWB_RESV2	0x04	/* reserved (SCSI-2) */
+#define	SRWB_FUA	0x08	/* force unit access volatile cache (SCSI-2) */
+#define	SRWB_DPO	0x10	/* disable page out (SCSI-2) */
 #define	SRWB_PROTECT(x) ((x) << 5)
 	u_int8_t addr[4];
 	u_int8_t reserved;
@@ -159,4 +160,7 @@ struct scsipi_capacity_descriptor {
 #define	SCSIPI_CAP_DESC_CODE_FORMATTED		0x2
 #define	SCSIPI_CAP_DESC_CODE_NONE		0x3
 
+/* defines for the device specific byte in the mode select/sense header */
+#define	SMH_DSP_DPOFUA		0x10
+
 #endif /* _DEV_SCSIPI_SCSIPI_DISK_H_ */
Index: dev/scsipi/sd.c
===================================================================
RCS file: /cvsroot/src/sys/dev/scsipi/sd.c,v
retrieving revision 1.322
diff -u -p -r1.322 sd.c
--- dev/scsipi/sd.c	21 Dec 2016 21:28:30 -0000	1.322
+++ dev/scsipi/sd.c	5 Mar 2017 22:08:35 -0000
@@ -654,6 +654,7 @@ sd_diskstart(device_t dev, struct buf *b
 	struct scsipi_generic *cmdp;
 	struct scsipi_xfer *xs;
 	int error, flags, nblks, cmdlen;
+	int cdb_flags;
 
 	mutex_enter(chan_mtx(chan));
 
@@ -698,12 +699,27 @@ sd_diskstart(device_t dev, struct buf *b
 		nblks = howmany(bp->b_bcount, sd->params.blksize);
 
 	/*
+	 * Pass FUA and/or DPO if requested. Must be done before CDB
+	 * selection, as 6-byte CDB doesn't support the flags.
+	 */
+	cdb_flags = 0;
+
+	if (bp->b_flags & B_FUA)
+		cdb_flags |= SRWB_FUA;
+
+	if (bp->b_flags & B_DPO)
+		cdb_flags |= SRWB_DPO;
+
+	/*
 	 * Fill out the scsi command.  Use the smallest CDB possible
-	 * (6-byte, 10-byte, or 16-byte).
+	 * (6-byte, 10-byte, or 16-byte). If we need FUA or DPO,
+	 * need to use 10-byte or bigger, as the 6-byte doesn't support
+	 * the flags.
 	 */
 	if (((bp->b_rawblkno & 0x1fffff) == bp->b_rawblkno) &&
 	    ((nblks & 0xff) == nblks) &&
-	    !(periph->periph_quirks & PQUIRK_ONLYBIG)) {
+	    !(periph->periph_quirks & PQUIRK_ONLYBIG) &&
+	    !cdb_flags) {
 		/* 6-byte CDB */
 		memset(&cmd_small, 0, sizeof(cmd_small));
 		cmd_small.opcode = (bp->b_flags & B_READ) ?
@@ -732,6 +748,9 @@ sd_diskstart(device_t dev, struct buf *b
 		cmdp = (struct scsipi_generic *)&cmd16;
 	}
 
+	if (cdb_flags)
+		cmdp->bytes[0] = cdb_flags;
+
 	/*
 	 * Figure out what flags to use.
 	 */
@@ -1796,7 +1815,9 @@ sd_getcache(struct sd_softc *sd, int *bi
 	int error, bits = 0;
 	int big;
 	union scsi_disk_pages *pages;
+	uint8_t dev_spec;
 
+	/* only SCSI-2 and later supported */
 	if (periph->periph_version < 2)
 		return (EOPNOTSUPP);
 
@@ -1806,10 +1827,13 @@ sd_getcache(struct sd_softc *sd, int *bi
 	if (error)
 		return (error);
 
-	if (big)
+	if (big) {
 		pages = (void *)(&scsipi_sense.header.big + 1);
-	else
+		dev_spec = scsipi_sense.header.big.dev_spec;
+	} else {
 		pages = (void *)(&scsipi_sense.header.small + 1);
+		dev_spec = scsipi_sense.header.small.dev_spec;
+	}
 
 	if ((pages->caching_params.flags & CACHING_RCD) == 0)
 		bits |= DKCACHE_READ;
@@ -1818,6 +1842,13 @@ sd_getcache(struct sd_softc *sd, int *bi
 	if (pages->caching_params.pg_code & PGCODE_PS)
 		bits |= DKCACHE_SAVE;
 
+	/*
+	 * Support for FUA/DPO, defined starting with SCSI-2. Use only
+	 * if device claims to support it, according to the MODE SENSE.
+	 */
+	if (ISSET(dev_spec, SMH_DSP_DPOFUA))
+		bits |= DKCACHE_FUA | DKCACHE_DPO;
+
 	memset(&scsipi_sense, 0, sizeof(scsipi_sense));
 	error = sd_mode_sense(sd, SMS_DBD, &scsipi_sense,
 	    sizeof(scsipi_sense.pages.caching_params),


Home | Main Index | Thread Index | Old Index