tech-kern archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

Exposing FUA as alternative to DIOCCACHESYNC for WAPBL



Hi,

I'm working on an interface for WAPBL to use the Force Unit Access (FUA)
feature on compatible hardware (currently SCSI and NVMe), as a
replacement for full disk cache flushes. I'd also like to add support
for DPO (Disable Page Out), as that is a trivial extension of FUA
support, at least for SCSI.

Scope is currently limited to I/O via the traditional buffer cache
(metadata only) interface, as that is all that is needed for the
WAPBL case. First support direct use over sd(4)/ld(4), then fix the
layered drivers like dk(4), cgd(4) and eventually raid(4).

In order to be a reliable cache flush replacement, the FUA flag needs
to be exposed in a way that it's used only when the underlying driver
and hardware supports it. When it's specified, the driver needs to
honor it.

For DPO, it should be safe to simply ignore the flag when not
supported, because it's just disk cache optimization.

Since I wasn't able to spot any specific support for this in FreeBSD
or OpenBSD, I've come up with a new, and rather simplistic, kernel
change to expose the feature. It basically relies on the caller to DTRT.

Code extends DIOCGCACHE to return also information about FUA/DPO
support by the underlying hardware. Then, I added new flags for buffer
cache I/O, and modified the drivers to pass the flags to the hardware
when present in the struct buf.

It's supposed to be used as follows: the caller checks FUA support via
DIOCGCACHE once at the beginning, then issues any further I/O requests
with the FUA flag if DIOCGCACHE indicated that FUA is supported. This
way there is no need to modify drivers to refuse the flag when it is
not supported. It assumes that if DIOCGCACHE indicates support, then
the I/O will reach the same hardware with the struct buf flag kept
intact.

So far it seems all drivers keep struct buf b_flags when splitting or
processing the I/O buffer, i.e. cgd(4), dk(4), raid(4) all seem to do
it. This needs to be tested of course, and we need to fix any disk
pseudo devices which don't pass the flags intact.

Attached is the patch. It's really small and compile-tested only at
the moment. The main goal now is to get feedback if this approach is
sufficient and suitable, or whether we need something completely
different or complex.

Comments?

Jaromir
Index: sys/buf.h
===================================================================
RCS file: /cvsroot/src/sys/sys/buf.h,v
retrieving revision 1.126
diff -u -p -r1.126 buf.h
--- sys/buf.h	26 Dec 2016 23:12:33 -0000	1.126
+++ sys/buf.h	1 Mar 2017 21:19:41 -0000
@@ -198,11 +198,13 @@ struct buf {
 #define	B_RAW		0x00080000	/* Set by physio for raw transfers. */
 #define	B_READ		0x00100000	/* Read buffer. */
 #define	B_DEVPRIVATE	0x02000000	/* Device driver private flag. */
+#define	B_FUA		0x08000000	/* Force Unit Access flag (mandatory). */
+#define	B_DPO		0x10000000	/* Disable Page Out flag (advisory). */
 
 #define BUF_FLAGBITS \
     "\20\1AGE\3ASYNC\4BAD\5BUSY\10DELWRI" \
     "\12DONE\13COWDONE\15GATHERED\16INVAL\17LOCKED\20NOCACHE" \
-    "\23PHYS\24RAW\25READ\32DEVPRIVATE\33VFLUSH"
+    "\23PHYS\24RAW\25READ\32DEVPRIVATE\33VFLUSH\34FUA\35DPO"
 
 /* Avoid weird code due to B_WRITE being a "pseudo flag" */
 #define BUF_ISREAD(bp)	(((bp)->b_flags & B_READ) == B_READ)
Index: sys/dkio.h
===================================================================
RCS file: /cvsroot/src/sys/sys/dkio.h,v
retrieving revision 1.22
diff -u -p -r1.22 dkio.h
--- sys/dkio.h	8 Dec 2015 20:36:15 -0000	1.22
+++ sys/dkio.h	1 Mar 2017 21:19:41 -0000
@@ -85,6 +85,8 @@
 #define	DKCACHE_RCHANGE	0x000100 /* read enable is changeable */
 #define	DKCACHE_WCHANGE	0x000200 /* write enable is changeable */
 #define	DKCACHE_SAVE	0x010000 /* cache parameters are savable/save them */
+#define	DKCACHE_FUA	0x020000 /* Force Unit Access supported */
+#define	DKCACHE_DPO	0x040000 /* Disable Page Out supported */
 
 		/* sync disk cache */
 #define	DIOCCACHESYNC	_IOW('d', 118, int)	/* sync cache (force?) */
Index: dev/ic/ld_nvme.c
===================================================================
RCS file: /cvsroot/src/sys/dev/ic/ld_nvme.c,v
retrieving revision 1.14
diff -u -p -r1.14 ld_nvme.c
--- dev/ic/ld_nvme.c	28 Feb 2017 20:55:09 -0000	1.14
+++ dev/ic/ld_nvme.c	1 Mar 2017 21:19:41 -0000
@@ -152,11 +152,15 @@ static int
 ld_nvme_start(struct ld_softc *ld, struct buf *bp)
 {
 	struct ld_nvme_softc *sc = device_private(ld->sc_dv);
+	int flags = BUF_ISWRITE(bp) ? 0 : NVME_NS_CTX_F_READ;
+
+	if (bp->b_flags & B_FUA)
+		flags |= NVME_NS_CTX_F_FUA;
 
 	return nvme_ns_dobio(sc->sc_nvme, sc->sc_nsid, sc,
 	    bp, bp->b_data, bp->b_bcount,
 	    sc->sc_ld.sc_secsize, bp->b_rawblkno,
-	    BUF_ISWRITE(bp) ? 0 : NVME_NS_CTX_F_READ,
+	    flags,
 	    ld_nvme_biodone);
 }
 
@@ -221,7 +225,11 @@ ld_nvme_getcache(struct ld_softc *ld, in
 	int error;
 	struct ld_nvme_softc *sc = device_private(ld->sc_dv);
 
-	*addr = 0;
+	/*
+	 * DPO not supported, Dataset Management (DSM) field doesn't specify
+	 * the same semantics.
+	 */ 
+	*addr = DKCACHE_FUA;
 
 	if (!nvme_has_volatile_write_cache(sc->sc_nvme)) {
 		/* cache simply not present */
Index: dev/ic/nvme.c
===================================================================
RCS file: /cvsroot/src/sys/dev/ic/nvme.c,v
retrieving revision 1.25
diff -u -p -r1.25 nvme.c
--- dev/ic/nvme.c	28 Feb 2017 20:53:50 -0000	1.25
+++ dev/ic/nvme.c	1 Mar 2017 21:19:43 -0000
@@ -727,6 +727,9 @@ nvme_ns_io_fill(struct nvme_queue *q, st
 
 	htolem64(&sqe->slba, ccb->nnc_blkno);
 
+	if (ISSET(ccb->nnc_flags, NVME_NS_CTX_F_FUA))
+		htolem16(&sqe->ioflags, NVM_SQE_IO_FUA);
+
 	/* guaranteed by upper layers, but check just in case */
 	KASSERT((ccb->nnc_datasize % ccb->nnc_secsize) == 0);
 	htolem16(&sqe->nlb, (ccb->nnc_datasize / ccb->nnc_secsize) - 1);
Index: dev/ic/nvmevar.h
===================================================================
RCS file: /cvsroot/src/sys/dev/ic/nvmevar.h,v
retrieving revision 1.12
diff -u -p -r1.12 nvmevar.h
--- dev/ic/nvmevar.h	28 Feb 2017 20:53:50 -0000	1.12
+++ dev/ic/nvmevar.h	1 Mar 2017 21:19:43 -0000
@@ -64,6 +64,7 @@ struct nvme_ccb {
 	uint16_t	nnc_flags;
 #define	NVME_NS_CTX_F_READ	__BIT(0)
 #define	NVME_NS_CTX_F_POLL	__BIT(1)
+#define	NVME_NS_CTX_F_FUA	__BIT(2)
 
 	struct buf	*nnc_buf;
 	daddr_t		nnc_blkno;
Index: dev/scsipi/scsi_spc.h
===================================================================
RCS file: /cvsroot/src/sys/dev/scsipi/scsi_spc.h,v
retrieving revision 1.5
diff -u -p -r1.5 scsi_spc.h
--- dev/scsipi/scsi_spc.h	6 Feb 2010 23:13:59 -0000	1.5
+++ dev/scsipi/scsi_spc.h	1 Mar 2017 21:19:43 -0000
@@ -147,6 +147,7 @@ struct scsi_mode_parameter_header_6 {
 	uint8_t data_length;
 	uint8_t medium_type;
 	uint8_t dev_spec;
+#define SMPH_DPOFUA		0x01
 	uint8_t blk_desc_len;		/* unused on ATAPI */
 };
 
Index: dev/scsipi/scsipi_disk.h
===================================================================
RCS file: /cvsroot/src/sys/dev/scsipi/scsipi_disk.h,v
retrieving revision 1.21
diff -u -p -r1.21 scsipi_disk.h
--- dev/scsipi/scsipi_disk.h	25 Dec 2007 18:33:42 -0000	1.21
+++ dev/scsipi/scsipi_disk.h	1 Mar 2017 21:19:43 -0000
@@ -62,9 +62,10 @@ struct scsipi_rw_10 {
 	u_int8_t opcode;
 	u_int8_t byte2;
 #define	SRWB_RELADDR	0x01	/* obsolete */
-#define	SRWB_FUA_NV	0x02	/* force unit access non-volatile cache */
-#define	SRWB_FUA	0x08	/* force unit access */
-#define	SRWB_DPO	0x10	/* disable page out */
+#define	SRWB_FUA_NV	0x02	/* force unit access non-volatile cache (SCSI-3) */
+#define	SRWB_RESV2	0x04	/* reserved (SCSI-2) */
+#define	SRWB_FUA	0x08	/* force unit access volatile cache (SCSI-2) */
+#define	SRWB_DPO	0x10	/* disable page out (SCSI-2) */
 #define	SRWB_PROTECT(x) ((x) << 5)
 	u_int8_t addr[4];
 	u_int8_t reserved;
Index: dev/scsipi/sd.c
===================================================================
RCS file: /cvsroot/src/sys/dev/scsipi/sd.c,v
retrieving revision 1.322
diff -u -p -r1.322 sd.c
--- dev/scsipi/sd.c	21 Dec 2016 21:28:30 -0000	1.322
+++ dev/scsipi/sd.c	1 Mar 2017 21:19:43 -0000
@@ -733,6 +733,15 @@ sd_diskstart(device_t dev, struct buf *b
 	}
 
 	/*
+	 * Pass FUA and/or DPO if requested.
+	 */
+	if (bp->b_flags & B_FUA)
+		cmdp->bytes[0] |= SRWB_FUA;
+
+	if (bp->b_flags & B_DPO)
+		cmdp->bytes[0] |= SRWB_DPO;
+
+	/*
 	 * Figure out what flags to use.
 	 */
 	flags = XS_CTL_NOSLEEP|XS_CTL_ASYNC|XS_CTL_SIMPLE_TAG;
@@ -1818,6 +1827,10 @@ sd_getcache(struct sd_softc *sd, int *bi
 	if (pages->caching_params.pg_code & PGCODE_PS)
 		bits |= DKCACHE_SAVE;
 
+	/* available starting with SCSI-2 */
+	/* XXX possibly need to confirm via the DPOFUA flag in mode sense data */
+	bits |= DKCACHE_FUA | DKCACHE_DPO;
+
 	memset(&scsipi_sense, 0, sizeof(scsipi_sense));
 	error = sd_mode_sense(sd, SMS_DBD, &scsipi_sense,
 	    sizeof(scsipi_sense.pages.caching_params),
Index: sbin/dkctl/dkctl.c
===================================================================
RCS file: /cvsroot/src/sbin/dkctl/dkctl.c,v
retrieving revision 1.23
diff -u -p -r1.23 dkctl.c
--- sbin/dkctl/dkctl.c	6 Jan 2016 23:03:13 -0000	1.23
+++ sbin/dkctl/dkctl.c	1 Mar 2017 21:22:33 -0000
@@ -306,6 +306,16 @@ disk_getcache(int argc, char *argv[])
 
 	printf("%s: cache parameters are %ssavable\n", dvname,
 	    (bits & DKCACHE_SAVE) ? "" : "not ");
+
+#ifdef DKCACHE_FUA
+	printf("%s: cache Force Unit Access (FUA) %ssupported\n", dvname,
+	    (bits & DKCACHE_FUA) ? "" : "not ");
+#endif /* DKCACHE_FUA */
+
+#ifdef DKCACHE_DPO
+	printf("%s: cache Disable Page Out (DPO) %ssupported\n", dvname,
+	    (bits & DKCACHE_DPO) ? "" : "not ");
+#endif /* DKCACHE_DPO */
 }
 
 static void


Home | Main Index | Thread Index | Old Index