tech-kern archive
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]
Re: Exposing FUA as alternative to DIOCCACHESYNC for WAPBL
Attached is the final version of the patch. It uses the MEDIA prefix for the
flags, but keeps FUA/DPO - i.e. the names are B_MEDIA_FUA, B_MEDIA_DPO.
For wapbl it introduces a sysctl to enable the feature; the default is off
for now.
I plan to commit this later in the week or early next week, unless
there are some serious objections.
Jaromir
2017-03-05 23:22 GMT+01:00 Jaromír Doleček <jaromir.dolecek%gmail.com@localhost>:
> Here is an updated patch. It was updated to check for the FUA support
> for SCSI, using the MODE SENSE device-specific flag. Code was tested
> with QEMU emulated bha(4) and nvme. WAPBL code was updated to use the
> flag. It keeps the flag naming for now.
>
> In the patch, WAPBL sets the flag for journal writes, and also for the
> metadata buffer for bawrite() call after journal commit.
>
> There is a possible layer violation for the metadata write - b_flags are
> supposed to be set by the owner of the buffer. Not sure how strict we
> want/need to be there - perhaps introduce another flag field? Also, the
> flag probably needs to be unset in the biodone hook, so that the code
> guarantees the buffer in the buffer cache doesn't accidentally carry it
> over to another I/O.
>
> Jaromir
? dev/ic/TODO.nvme
Index: sys/buf.h
===================================================================
RCS file: /cvsroot/src/sys/sys/buf.h,v
retrieving revision 1.126
diff -u -p -r1.126 buf.h
--- sys/buf.h 26 Dec 2016 23:12:33 -0000 1.126
+++ sys/buf.h 27 Mar 2017 22:31:22 -0000
@@ -198,16 +198,21 @@ struct buf {
#define B_RAW 0x00080000 /* Set by physio for raw transfers. */
#define B_READ 0x00100000 /* Read buffer. */
#define B_DEVPRIVATE 0x02000000 /* Device driver private flag. */
+#define B_MEDIA_FUA 0x08000000 /* Set Force Unit Access for media. */
+#define B_MEDIA_DPO 0x10000000 /* Set Disable Page Out for media. */
#define BUF_FLAGBITS \
"\20\1AGE\3ASYNC\4BAD\5BUSY\10DELWRI" \
"\12DONE\13COWDONE\15GATHERED\16INVAL\17LOCKED\20NOCACHE" \
- "\23PHYS\24RAW\25READ\32DEVPRIVATE\33VFLUSH"
+ "\23PHYS\24RAW\25READ\32DEVPRIVATE\33VFLUSH\34MEDIAFUA\35MEDIADPO"
/* Avoid weird code due to B_WRITE being a "pseudo flag" */
#define BUF_ISREAD(bp) (((bp)->b_flags & B_READ) == B_READ)
#define BUF_ISWRITE(bp) (((bp)->b_flags & B_READ) == B_WRITE)
+/* Media flags, to be passed for nested I/O */
+#define B_MEDIA_FLAGS (B_MEDIA_FUA|B_MEDIA_DPO)
+
/*
* This structure describes a clustered I/O. It is stored in the b_saveaddr
* field of the buffer on which I/O is done. At I/O completion, cluster
Index: sys/dkio.h
===================================================================
RCS file: /cvsroot/src/sys/sys/dkio.h,v
retrieving revision 1.22
diff -u -p -r1.22 dkio.h
--- sys/dkio.h 8 Dec 2015 20:36:15 -0000 1.22
+++ sys/dkio.h 27 Mar 2017 22:31:22 -0000
@@ -85,6 +85,8 @@
#define DKCACHE_RCHANGE 0x000100 /* read enable is changeable */
#define DKCACHE_WCHANGE 0x000200 /* write enable is changeable */
#define DKCACHE_SAVE 0x010000 /* cache parameters are savable/save them */
+#define DKCACHE_FUA 0x020000 /* Force Unit Access supported */
+#define DKCACHE_DPO 0x040000 /* Disable Page Out supported */
/* sync disk cache */
#define DIOCCACHESYNC _IOW('d', 118, int) /* sync cache (force?) */
Index: kern/vfs_bio.c
===================================================================
RCS file: /cvsroot/src/sys/kern/vfs_bio.c,v
retrieving revision 1.271
diff -u -p -r1.271 vfs_bio.c
--- kern/vfs_bio.c 21 Mar 2017 10:46:49 -0000 1.271
+++ kern/vfs_bio.c 27 Mar 2017 22:31:22 -0000
@@ -2027,7 +2027,7 @@ nestiobuf_iodone(buf_t *bp)
void
nestiobuf_setup(buf_t *mbp, buf_t *bp, int offset, size_t size)
{
- const int b_read = mbp->b_flags & B_READ;
+ const int b_pass = mbp->b_flags & (B_READ|B_MEDIA_FLAGS);
struct vnode *vp = mbp->b_vp;
KASSERT(mbp->b_bcount >= offset + size);
@@ -2035,14 +2035,14 @@ nestiobuf_setup(buf_t *mbp, buf_t *bp, i
bp->b_dev = mbp->b_dev;
bp->b_objlock = mbp->b_objlock;
bp->b_cflags = BC_BUSY;
- bp->b_flags = B_ASYNC | b_read;
+ bp->b_flags = B_ASYNC | b_pass;
bp->b_iodone = nestiobuf_iodone;
bp->b_data = (char *)mbp->b_data + offset;
bp->b_resid = bp->b_bcount = size;
bp->b_bufsize = bp->b_bcount;
bp->b_private = mbp;
BIO_COPYPRIO(bp, mbp);
- if (!b_read && vp != NULL) {
+ if (BUF_ISWRITE(bp) && vp != NULL) {
mutex_enter(vp->v_interlock);
vp->v_numoutput++;
mutex_exit(vp->v_interlock);
Index: kern/vfs_wapbl.c
===================================================================
RCS file: /cvsroot/src/sys/kern/vfs_wapbl.c,v
retrieving revision 1.92
diff -u -p -r1.92 vfs_wapbl.c
--- kern/vfs_wapbl.c 17 Mar 2017 03:19:46 -0000 1.92
+++ kern/vfs_wapbl.c 27 Mar 2017 22:31:23 -0000
@@ -71,6 +71,7 @@ __KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c,
static struct sysctllog *wapbl_sysctl;
static int wapbl_flush_disk_cache = 1;
static int wapbl_verbose_commit = 0;
+static int wapbl_use_fua = 0; /* switched off by default for now */
static inline size_t wapbl_space_free(size_t, off_t, off_t);
@@ -230,6 +231,16 @@ struct wapbl {
u_char *wl_buffer; /* l: buffer for wapbl_buffered_write() */
daddr_t wl_buffer_dblk; /* l: buffer disk block address */
size_t wl_buffer_used; /* l: buffer current use */
+
+ int wl_dkcache; /* r: disk cache flags */
+#define WAPBL_USE_FUA(wl) \
+ (wapbl_use_fua && ISSET((wl)->wl_dkcache, DKCACHE_FUA))
+#define WAPBL_JFLAGS(wl) \
+ (WAPBL_USE_FUA(wl) ? (wl)->wl_jwrite_flags : 0)
+#define WAPBL_MFLAGS(wl) \
+ (WAPBL_USE_FUA(wl) ? (wl)->wl_mwrite_flags : 0)
+ int wl_jwrite_flags; /* r: journal write flags */
+ int wl_mwrite_flags; /* r: metadata write flags */
};
#ifdef WAPBL_DEBUG_PRINT
@@ -281,6 +292,8 @@ static void wapbl_deallocation_free(stru
static void wapbl_evcnt_init(struct wapbl *);
static void wapbl_evcnt_free(struct wapbl *);
+static void wapbl_dkcache_init(struct wapbl *);
+
#if 0
int wapbl_replay_verify(struct wapbl_replay *, struct vnode *);
#endif
@@ -335,6 +348,18 @@ wapbl_sysctl_init(void)
SYSCTL_DESCR("show time and size of wapbl log commits"),
NULL, 0, &wapbl_verbose_commit, 0,
CTL_CREATE, CTL_EOL);
+ if (rv)
+ return rv;
+
+ rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode,
+ CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
+ CTLTYPE_INT, "use_fua",
+ SYSCTL_DESCR("use FUA/DPO instead of cash flush if available"),
+ NULL, 0, &wapbl_use_fua, 0,
+ CTL_CREATE, CTL_EOL);
+ if (rv)
+ return rv;
+
return rv;
}
@@ -391,6 +416,30 @@ wapbl_evcnt_free(struct wapbl *wl)
evcnt_detach(&wl->wl_ev_cacheflush);
}
+static void
+wapbl_dkcache_init(struct wapbl *wl)
+{
+ int error;
+
+ /* Get disk cache flags */
+ error = VOP_IOCTL(wl->wl_devvp, DIOCGCACHE, &wl->wl_dkcache,
+ FWRITE, FSCRED);
+ if (error) {
+ /* behave as if there was a write cache */
+ wl->wl_dkcache = DKCACHE_WRITE;
+ }
+
+ /* Use FUA instead of cache flush if available */
+ if (ISSET(wl->wl_dkcache, DKCACHE_FUA)) {
+ wl->wl_jwrite_flags |= B_MEDIA_FUA;
+ wl->wl_mwrite_flags |= B_MEDIA_FUA;
+ }
+
+ /* Use DPO for journal writes if available */
+ if (ISSET(wl->wl_dkcache, DKCACHE_DPO))
+ wl->wl_jwrite_flags |= B_MEDIA_DPO;
+}
+
static int
wapbl_start_flush_inodes(struct wapbl *wl, struct wapbl_replay *wr)
{
@@ -563,6 +612,8 @@ wapbl_start(struct wapbl ** wlp, struct
wapbl_evcnt_init(wl);
+ wapbl_dkcache_init(wl);
+
/* Initialize the commit header */
{
struct wapbl_wc_header *wc;
@@ -809,7 +860,6 @@ wapbl_doio(void *data, size_t len, struc
struct buf *bp;
int error;
- KASSERT((flags & ~(B_WRITE | B_READ)) == 0);
KASSERT(devvp->v_type == VBLK);
if ((flags & (B_WRITE | B_READ)) == B_WRITE) {
@@ -823,7 +873,7 @@ wapbl_doio(void *data, size_t len, struc
bp = getiobuf(devvp, true);
bp->b_flags = flags;
- bp->b_cflags = BC_BUSY; /* silly & dubious */
+ bp->b_cflags = BC_BUSY; /* mandatory, asserted by biowait() */
bp->b_dev = devvp->v_rdev;
bp->b_data = data;
bp->b_bufsize = bp->b_resid = bp->b_bcount = len;
@@ -898,7 +948,8 @@ wapbl_buffered_flush(struct wapbl *wl)
return 0;
error = wapbl_doio(wl->wl_buffer, wl->wl_buffer_used,
- wl->wl_devvp, wl->wl_buffer_dblk, B_WRITE);
+ wl->wl_devvp, wl->wl_buffer_dblk,
+ B_WRITE | WAPBL_JFLAGS(wl));
wl->wl_buffer_used = 0;
wl->wl_ev_journalwrite.ev_count++;
@@ -948,12 +999,10 @@ wapbl_buffered_write(void *data, size_t
if (len >= resid) {
memcpy(wl->wl_buffer + wl->wl_buffer_used, data, resid);
wl->wl_buffer_used += resid;
- error = wapbl_doio(wl->wl_buffer, wl->wl_buffer_used,
- wl->wl_devvp, wl->wl_buffer_dblk, B_WRITE);
+ error = wapbl_buffered_flush(wl);
data = (uint8_t *)data + resid;
len -= resid;
wl->wl_buffer_dblk = pbn + btodb(resid);
- wl->wl_buffer_used = 0;
if (error)
return error;
}
@@ -1500,6 +1549,13 @@ wapbl_biodone(struct buf *bp)
}
/*
+ * Make sure that the buf doesn't retain the media flags, so that
+ * e.g. wapbl_use_fua has immediate effect on any following I/O.
+ * The flags will be set again if needed by another I/O.
+ */
+ bp->b_flags &= ~B_MEDIA_FLAGS;
+
+ /*
* Release the buffer here. wapbl_flush() may wait for the
* log to become empty and we better unbusy the buffer before
* wapbl_flush() returns.
@@ -1754,6 +1810,10 @@ wapbl_flush(struct wapbl *wl, int waitfo
}
bp->b_iodone = wapbl_biodone;
bp->b_private = we;
+
+ /* make sure the block is saved sync when FUA in use */
+ bp->b_flags |= WAPBL_MFLAGS(wl);
+
bremfree(bp);
wapbl_remove_buf_locked(wl, bp);
mutex_exit(&wl->wl_mtx);
@@ -2201,7 +2261,8 @@ wapbl_cache_sync(struct wapbl *wl, const
int force = 1;
int error;
- if (!wapbl_flush_disk_cache) {
+ /* Skip full cache sync if disabled, or when using FUA */
+ if (!wapbl_flush_disk_cache || WAPBL_USE_FUA(wl)) {
return 0;
}
if (verbose) {
Index: dev/ic/ld_nvme.c
===================================================================
RCS file: /cvsroot/src/sys/dev/ic/ld_nvme.c,v
retrieving revision 1.14
diff -u -p -r1.14 ld_nvme.c
--- dev/ic/ld_nvme.c 28 Feb 2017 20:55:09 -0000 1.14
+++ dev/ic/ld_nvme.c 27 Mar 2017 22:31:23 -0000
@@ -152,11 +152,15 @@ static int
ld_nvme_start(struct ld_softc *ld, struct buf *bp)
{
struct ld_nvme_softc *sc = device_private(ld->sc_dv);
+ int flags = BUF_ISWRITE(bp) ? 0 : NVME_NS_CTX_F_READ;
+
+ if (bp->b_flags & B_MEDIA_FUA)
+ flags |= NVME_NS_CTX_F_FUA;
return nvme_ns_dobio(sc->sc_nvme, sc->sc_nsid, sc,
bp, bp->b_data, bp->b_bcount,
sc->sc_ld.sc_secsize, bp->b_rawblkno,
- BUF_ISWRITE(bp) ? 0 : NVME_NS_CTX_F_READ,
+ flags,
ld_nvme_biodone);
}
@@ -221,7 +225,11 @@ ld_nvme_getcache(struct ld_softc *ld, in
int error;
struct ld_nvme_softc *sc = device_private(ld->sc_dv);
- *addr = 0;
+ /*
+ * DPO not supported, Dataset Management (DSM) field doesn't specify
+ * the same semantics.
+ */
+ *addr = DKCACHE_FUA;
if (!nvme_has_volatile_write_cache(sc->sc_nvme)) {
/* cache simply not present */
Index: dev/ic/nvme.c
===================================================================
RCS file: /cvsroot/src/sys/dev/ic/nvme.c,v
retrieving revision 1.25
diff -u -p -r1.25 nvme.c
--- dev/ic/nvme.c 28 Feb 2017 20:53:50 -0000 1.25
+++ dev/ic/nvme.c 27 Mar 2017 22:31:23 -0000
@@ -727,6 +727,9 @@ nvme_ns_io_fill(struct nvme_queue *q, st
htolem64(&sqe->slba, ccb->nnc_blkno);
+ if (ISSET(ccb->nnc_flags, NVME_NS_CTX_F_FUA))
+ htolem16(&sqe->ioflags, NVM_SQE_IO_FUA);
+
/* guaranteed by upper layers, but check just in case */
KASSERT((ccb->nnc_datasize % ccb->nnc_secsize) == 0);
htolem16(&sqe->nlb, (ccb->nnc_datasize / ccb->nnc_secsize) - 1);
Index: dev/ic/nvmevar.h
===================================================================
RCS file: /cvsroot/src/sys/dev/ic/nvmevar.h,v
retrieving revision 1.12
diff -u -p -r1.12 nvmevar.h
--- dev/ic/nvmevar.h 28 Feb 2017 20:53:50 -0000 1.12
+++ dev/ic/nvmevar.h 27 Mar 2017 22:31:23 -0000
@@ -64,6 +64,7 @@ struct nvme_ccb {
uint16_t nnc_flags;
#define NVME_NS_CTX_F_READ __BIT(0)
#define NVME_NS_CTX_F_POLL __BIT(1)
+#define NVME_NS_CTX_F_FUA __BIT(2)
struct buf *nnc_buf;
daddr_t nnc_blkno;
Index: dev/scsipi/scsipi_disk.h
===================================================================
RCS file: /cvsroot/src/sys/dev/scsipi/scsipi_disk.h,v
retrieving revision 1.21
diff -u -p -r1.21 scsipi_disk.h
--- dev/scsipi/scsipi_disk.h 25 Dec 2007 18:33:42 -0000 1.21
+++ dev/scsipi/scsipi_disk.h 27 Mar 2017 22:31:23 -0000
@@ -62,9 +62,10 @@ struct scsipi_rw_10 {
u_int8_t opcode;
u_int8_t byte2;
#define SRWB_RELADDR 0x01 /* obsolete */
-#define SRWB_FUA_NV 0x02 /* force unit access non-volatile cache */
-#define SRWB_FUA 0x08 /* force unit access */
-#define SRWB_DPO 0x10 /* disable page out */
+#define SRWB_FUA_NV 0x02 /* force unit access non-volatile cache (SCSI-3) */
+#define SRWB_RESV2 0x04 /* reserved (SCSI-2) */
+#define SRWB_FUA 0x08 /* force unit access volatile cache (SCSI-2) */
+#define SRWB_DPO 0x10 /* disable page out (SCSI-2) */
#define SRWB_PROTECT(x) ((x) << 5)
u_int8_t addr[4];
u_int8_t reserved;
@@ -159,4 +160,7 @@ struct scsipi_capacity_descriptor {
#define SCSIPI_CAP_DESC_CODE_FORMATTED 0x2
#define SCSIPI_CAP_DESC_CODE_NONE 0x3
+/* defines for the device specific byte in the mode select/sense header */
+#define SMH_DSP_DPOFUA 0x10
+
#endif /* _DEV_SCSIPI_SCSIPI_DISK_H_ */
Index: dev/scsipi/sd.c
===================================================================
RCS file: /cvsroot/src/sys/dev/scsipi/sd.c,v
retrieving revision 1.322
diff -u -p -r1.322 sd.c
--- dev/scsipi/sd.c 21 Dec 2016 21:28:30 -0000 1.322
+++ dev/scsipi/sd.c 27 Mar 2017 22:31:23 -0000
@@ -654,6 +654,7 @@ sd_diskstart(device_t dev, struct buf *b
struct scsipi_generic *cmdp;
struct scsipi_xfer *xs;
int error, flags, nblks, cmdlen;
+ int cdb_flags;
mutex_enter(chan_mtx(chan));
@@ -698,12 +699,27 @@ sd_diskstart(device_t dev, struct buf *b
nblks = howmany(bp->b_bcount, sd->params.blksize);
/*
+ * Pass FUA and/or DPO if requested. Must be done before CDB
+ * selection, as 6-byte CDB doesn't support the flags.
+ */
+ cdb_flags = 0;
+
+ if (bp->b_flags & B_MEDIA_FUA)
+ cdb_flags |= SRWB_FUA;
+
+ if (bp->b_flags & B_MEDIA_DPO)
+ cdb_flags |= SRWB_DPO;
+
+ /*
* Fill out the scsi command. Use the smallest CDB possible
- * (6-byte, 10-byte, or 16-byte).
+ * (6-byte, 10-byte, or 16-byte). If we need FUA or DPO,
+ * need to use 10-byte or bigger, as the 6-byte doesn't support
+ * the flags.
*/
if (((bp->b_rawblkno & 0x1fffff) == bp->b_rawblkno) &&
((nblks & 0xff) == nblks) &&
- !(periph->periph_quirks & PQUIRK_ONLYBIG)) {
+ !(periph->periph_quirks & PQUIRK_ONLYBIG) &&
+ !cdb_flags) {
/* 6-byte CDB */
memset(&cmd_small, 0, sizeof(cmd_small));
cmd_small.opcode = (bp->b_flags & B_READ) ?
@@ -732,6 +748,9 @@ sd_diskstart(device_t dev, struct buf *b
cmdp = (struct scsipi_generic *)&cmd16;
}
+ if (cdb_flags)
+ cmdp->bytes[0] = cdb_flags;
+
/*
* Figure out what flags to use.
*/
@@ -1796,7 +1815,9 @@ sd_getcache(struct sd_softc *sd, int *bi
int error, bits = 0;
int big;
union scsi_disk_pages *pages;
+ uint8_t dev_spec;
+ /* only SCSI-2 and later supported */
if (periph->periph_version < 2)
return (EOPNOTSUPP);
@@ -1806,10 +1827,13 @@ sd_getcache(struct sd_softc *sd, int *bi
if (error)
return (error);
- if (big)
+ if (big) {
pages = (void *)(&scsipi_sense.header.big + 1);
- else
+ dev_spec = scsipi_sense.header.big.dev_spec;
+ } else {
pages = (void *)(&scsipi_sense.header.small + 1);
+ dev_spec = scsipi_sense.header.small.dev_spec;
+ }
if ((pages->caching_params.flags & CACHING_RCD) == 0)
bits |= DKCACHE_READ;
@@ -1818,6 +1842,13 @@ sd_getcache(struct sd_softc *sd, int *bi
if (pages->caching_params.pg_code & PGCODE_PS)
bits |= DKCACHE_SAVE;
+ /*
+ * Support for FUA/DPO, defined starting with SCSI-2. Use only
+ * if device claims to support it, according to the MODE SENSE.
+ */
+ if (ISSET(dev_spec, SMH_DSP_DPOFUA))
+ bits |= DKCACHE_FUA | DKCACHE_DPO;
+
memset(&scsipi_sense, 0, sizeof(scsipi_sense));
error = sd_mode_sense(sd, SMS_DBD, &scsipi_sense,
sizeof(scsipi_sense.pages.caching_params),
Home |
Main Index |
Thread Index |
Old Index