tech-kern archive
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]
Exposing FUA as alternative to DIOCCACHESYNC for WAPBL
Hi,
I'm working on an interface for WAPBL to use the Force Unit Access (FUA)
feature on compatible hardware (currently SCSI and NVMe), as a
replacement for full disk cache flushes. I'd also like to add support
for DPO (Disable Page Out), as that is a trivial extension of FUA
support, at least for SCSI.
The scope is currently limited to I/O via the traditional buffer cache
(metadata only) interface for now, as that is all that is needed for
the WAPBL case. The plan is to first support direct use over sd(4)/ld(4),
then fix the layered drivers like dk(4), cgd(4) and eventually raid(4).
In order to be a reliable cache flush replacement, the FUA flag needs
to be exposed in such a way that it's used only when the underlying
driver and hardware support it. When it's specified, the driver needs to
honor it.
For DPO, it should be safe to simply ignore the flag when it is not
supported, because it's just a disk cache optimization.
Since I wasn't able to spot any specific support for this in FreeBSD
or OpenBSD, I've come up with a new, and rather simplistic, kernel change
to expose the feature. It basically relies on the caller to DTRT.
The code extends DIOCGCACHE to also return information about FUA/DPO
support in the underlying hardware. Then, I added new flags for buffer
cache I/O, and modified the drivers to pass the flags to the hardware
when they are present in the struct buf.
It's supposed to be used as follows: the caller checks for FUA support
via DIOCGCACHE once at the beginning, and then issues any further I/O
requests with the FUA flag only when DIOCGCACHE indicated that FUA is
supported. This way there is no need to modify drivers to refuse the
flag when it is not supported. It assumes that if DIOCGCACHE indicates
support, then the I/O will also reach the same hardware with the
struct buf flag kept.
So far it seems that all drivers keep the struct buf b_flags when
splitting or processing the I/O buffer, e.g. cgd(4), dk(4), raid(4) all
seem to do it. This needs to be tested of course, and we need to fix any
disk pseudo-devices which don't pass the flags intact.
Attached is the patch. It's really small and compile-tested only at
the moment. The main goal now is to get feedback on whether this approach
is sufficient and suitable, or whether we need something completely
different or more complex.
Comments?
Jaromir
Index: sys/buf.h
===================================================================
RCS file: /cvsroot/src/sys/sys/buf.h,v
retrieving revision 1.126
diff -u -p -r1.126 buf.h
--- sys/buf.h 26 Dec 2016 23:12:33 -0000 1.126
+++ sys/buf.h 1 Mar 2017 21:19:41 -0000
@@ -198,11 +198,13 @@ struct buf {
#define B_RAW 0x00080000 /* Set by physio for raw transfers. */
#define B_READ 0x00100000 /* Read buffer. */
#define B_DEVPRIVATE 0x02000000 /* Device driver private flag. */
+#define B_FUA 0x08000000 /* Force Unit Access flag (mandatory). */
+#define B_DPO 0x10000000 /* Disable Page Out flag (advisory). */
#define BUF_FLAGBITS \
"\20\1AGE\3ASYNC\4BAD\5BUSY\10DELWRI" \
"\12DONE\13COWDONE\15GATHERED\16INVAL\17LOCKED\20NOCACHE" \
- "\23PHYS\24RAW\25READ\32DEVPRIVATE\33VFLUSH"
+ "\23PHYS\24RAW\25READ\32DEVPRIVATE\33VFLUSH\34FUA\35DPO"
/* Avoid weird code due to B_WRITE being a "pseudo flag" */
#define BUF_ISREAD(bp) (((bp)->b_flags & B_READ) == B_READ)
Index: sys/dkio.h
===================================================================
RCS file: /cvsroot/src/sys/sys/dkio.h,v
retrieving revision 1.22
diff -u -p -r1.22 dkio.h
--- sys/dkio.h 8 Dec 2015 20:36:15 -0000 1.22
+++ sys/dkio.h 1 Mar 2017 21:19:41 -0000
@@ -85,6 +85,8 @@
#define DKCACHE_RCHANGE 0x000100 /* read enable is changeable */
#define DKCACHE_WCHANGE 0x000200 /* write enable is changeable */
#define DKCACHE_SAVE 0x010000 /* cache parameters are savable/save them */
+#define DKCACHE_FUA 0x020000 /* Force Unit Access supported */
+#define DKCACHE_DPO 0x040000 /* Disable Page Out supported */
/* sync disk cache */
#define DIOCCACHESYNC _IOW('d', 118, int) /* sync cache (force?) */
Index: dev/ic/ld_nvme.c
===================================================================
RCS file: /cvsroot/src/sys/dev/ic/ld_nvme.c,v
retrieving revision 1.14
diff -u -p -r1.14 ld_nvme.c
--- dev/ic/ld_nvme.c 28 Feb 2017 20:55:09 -0000 1.14
+++ dev/ic/ld_nvme.c 1 Mar 2017 21:19:41 -0000
@@ -152,11 +152,15 @@ static int
ld_nvme_start(struct ld_softc *ld, struct buf *bp)
{
struct ld_nvme_softc *sc = device_private(ld->sc_dv);
+ int flags = BUF_ISWRITE(bp) ? 0 : NVME_NS_CTX_F_READ;
+
+ if (bp->b_flags & B_FUA)
+ flags |= NVME_NS_CTX_F_FUA;
return nvme_ns_dobio(sc->sc_nvme, sc->sc_nsid, sc,
bp, bp->b_data, bp->b_bcount,
sc->sc_ld.sc_secsize, bp->b_rawblkno,
- BUF_ISWRITE(bp) ? 0 : NVME_NS_CTX_F_READ,
+ flags,
ld_nvme_biodone);
}
@@ -221,7 +225,11 @@ ld_nvme_getcache(struct ld_softc *ld, in
int error;
struct ld_nvme_softc *sc = device_private(ld->sc_dv);
- *addr = 0;
+ /*
+ * DPO not supported, Dataset Management (DSM) field doesn't specify
+ * the same semantics.
+ */
+ *addr = DKCACHE_FUA;
if (!nvme_has_volatile_write_cache(sc->sc_nvme)) {
/* cache simply not present */
Index: dev/ic/nvme.c
===================================================================
RCS file: /cvsroot/src/sys/dev/ic/nvme.c,v
retrieving revision 1.25
diff -u -p -r1.25 nvme.c
--- dev/ic/nvme.c 28 Feb 2017 20:53:50 -0000 1.25
+++ dev/ic/nvme.c 1 Mar 2017 21:19:43 -0000
@@ -727,6 +727,9 @@ nvme_ns_io_fill(struct nvme_queue *q, st
htolem64(&sqe->slba, ccb->nnc_blkno);
+ if (ISSET(ccb->nnc_flags, NVME_NS_CTX_F_FUA))
+ htolem16(&sqe->ioflags, NVM_SQE_IO_FUA);
+
/* guaranteed by upper layers, but check just in case */
KASSERT((ccb->nnc_datasize % ccb->nnc_secsize) == 0);
htolem16(&sqe->nlb, (ccb->nnc_datasize / ccb->nnc_secsize) - 1);
Index: dev/ic/nvmevar.h
===================================================================
RCS file: /cvsroot/src/sys/dev/ic/nvmevar.h,v
retrieving revision 1.12
diff -u -p -r1.12 nvmevar.h
--- dev/ic/nvmevar.h 28 Feb 2017 20:53:50 -0000 1.12
+++ dev/ic/nvmevar.h 1 Mar 2017 21:19:43 -0000
@@ -64,6 +64,7 @@ struct nvme_ccb {
uint16_t nnc_flags;
#define NVME_NS_CTX_F_READ __BIT(0)
#define NVME_NS_CTX_F_POLL __BIT(1)
+#define NVME_NS_CTX_F_FUA __BIT(2)
struct buf *nnc_buf;
daddr_t nnc_blkno;
Index: dev/scsipi/scsi_spc.h
===================================================================
RCS file: /cvsroot/src/sys/dev/scsipi/scsi_spc.h,v
retrieving revision 1.5
diff -u -p -r1.5 scsi_spc.h
--- dev/scsipi/scsi_spc.h 6 Feb 2010 23:13:59 -0000 1.5
+++ dev/scsipi/scsi_spc.h 1 Mar 2017 21:19:43 -0000
@@ -147,6 +147,7 @@ struct scsi_mode_parameter_header_6 {
uint8_t data_length;
uint8_t medium_type;
uint8_t dev_spec;
+#define SMPH_DPOFUA 0x01
uint8_t blk_desc_len; /* unused on ATAPI */
};
Index: dev/scsipi/scsipi_disk.h
===================================================================
RCS file: /cvsroot/src/sys/dev/scsipi/scsipi_disk.h,v
retrieving revision 1.21
diff -u -p -r1.21 scsipi_disk.h
--- dev/scsipi/scsipi_disk.h 25 Dec 2007 18:33:42 -0000 1.21
+++ dev/scsipi/scsipi_disk.h 1 Mar 2017 21:19:43 -0000
@@ -62,9 +62,10 @@ struct scsipi_rw_10 {
u_int8_t opcode;
u_int8_t byte2;
#define SRWB_RELADDR 0x01 /* obsolete */
-#define SRWB_FUA_NV 0x02 /* force unit access non-volatile cache */
-#define SRWB_FUA 0x08 /* force unit access */
-#define SRWB_DPO 0x10 /* disable page out */
+#define SRWB_FUA_NV 0x02 /* force unit access non-volatile cache (SCSI-3) */
+#define SRWB_RESV2 0x04 /* reserved (SCSI-2) */
+#define SRWB_FUA 0x08 /* force unit access volatile cache (SCSI-2) */
+#define SRWB_DPO 0x10 /* disable page out (SCSI-2) */
#define SRWB_PROTECT(x) ((x) << 5)
u_int8_t addr[4];
u_int8_t reserved;
Index: dev/scsipi/sd.c
===================================================================
RCS file: /cvsroot/src/sys/dev/scsipi/sd.c,v
retrieving revision 1.322
diff -u -p -r1.322 sd.c
--- dev/scsipi/sd.c 21 Dec 2016 21:28:30 -0000 1.322
+++ dev/scsipi/sd.c 1 Mar 2017 21:19:43 -0000
@@ -733,6 +733,15 @@ sd_diskstart(device_t dev, struct buf *b
}
/*
+ * Pass FUA and/or DPO if requested.
+ */
+ if (bp->b_flags & B_FUA)
+ cmdp->bytes[0] |= SRWB_FUA;
+
+ if (bp->b_flags & B_DPO)
+ cmdp->bytes[0] |= SRWB_DPO;
+
+ /*
* Figure out what flags to use.
*/
flags = XS_CTL_NOSLEEP|XS_CTL_ASYNC|XS_CTL_SIMPLE_TAG;
@@ -1818,6 +1827,10 @@ sd_getcache(struct sd_softc *sd, int *bi
if (pages->caching_params.pg_code & PGCODE_PS)
bits |= DKCACHE_SAVE;
+ /* available starting with SCSI-2 */
+ /* XXX possibly need to confirm via the DPOFUA flag in mode sense data */
+ bits |= DKCACHE_FUA | DKCACHE_DPO;
+
memset(&scsipi_sense, 0, sizeof(scsipi_sense));
error = sd_mode_sense(sd, SMS_DBD, &scsipi_sense,
sizeof(scsipi_sense.pages.caching_params),
Index: sbin/dkctl/dkctl.c
===================================================================
RCS file: /cvsroot/src/sbin/dkctl/dkctl.c,v
retrieving revision 1.23
diff -u -p -r1.23 dkctl.c
--- sbin/dkctl/dkctl.c 6 Jan 2016 23:03:13 -0000 1.23
+++ sbin/dkctl/dkctl.c 1 Mar 2017 21:22:33 -0000
@@ -306,6 +306,16 @@ disk_getcache(int argc, char *argv[])
printf("%s: cache parameters are %ssavable\n", dvname,
(bits & DKCACHE_SAVE) ? "" : "not ");
+
+#ifdef DKCACHE_FUA
+ printf("%s: cache Force Unit Access (FUA) %ssupported\n", dvname,
+ (bits & DKCACHE_FUA) ? "" : "not ");
+#endif /* DKCACHE_FUA */
+
+#ifdef DKCACHE_DPO
+ printf("%s: cache Disable Page Out (DPO) %ssupported\n", dvname,
+ (bits & DKCACHE_DPO) ? "" : "not ");
+#endif /* DKCACHE_DPO */
}
static void
Home |
Main Index |
Thread Index |
Old Index