tech-kern archive
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]
SSD "trim" support
Here is a prototype implementation of TRIM (or DELETE, or whatever you
call it) support for ffs.
It is inspired by the FreeBSD implementation in that the TRIM is issued
asynchronously from ffs_blkfree(), before the blocks are actually
marked free in the filesystem.
Since ffs (at least NetBSD's) frees blocks in reverse order
I thought it was a good idea to collapse adjacent blocks
already at the ffs level where this a-priori knowledge is
present. This is different from FreeBSD; whether this is worth
the effort is subject to research.
This implementation handles only a single block range
per transaction. SSDs can handle at least 64 ranges with
one command. Extending the code should be simple.
I've given this some testing on an Intel and a Kingston
SSD. Is anyone interested in reviewing this, or in doing more
tests and optimizations?
(The md(4) backend is for testing only, nothing serious.)
best regards
Matthias
-------------------------------------------------------------------------------
-------------------------------------------------------------------------------
Forschungszentrum Juelich GmbH
52425 Juelich
Sitz der Gesellschaft: Juelich
Eingetragen im Handelsregister des Amtsgerichts Dueren Nr. HR B 3498
Vorsitzender des Aufsichtsrats: MinDir Dr. Karl Eugen Huthmacher
Geschaeftsfuehrung: Prof. Dr. Achim Bachem (Vorsitzender),
Karsten Beneke (stellv. Vorsitzender), Prof. Dr.-Ing. Harald Bolt,
Prof. Dr. Sebastian M. Schmidt
-------------------------------------------------------------------------------
-------------------------------------------------------------------------------
Kennen Sie schon unsere app? http://www.fz-juelich.de/app
# HG changeset patch
# Parent 1d86822e09d45741e2a6270e799eaecddf808c10
diff -r 1d86822e09d4 sbin/atactl/atactl.c
--- a/sbin/atactl/atactl.c Mon Feb 27 16:37:53 2012 +0100
+++ b/sbin/atactl/atactl.c Tue Feb 28 22:57:23 2012 +0100
@@ -177,6 +177,7 @@
{ WDC_VER_ATA5, "ATA-5" },
{ WDC_VER_ATA6, "ATA-6" },
{ WDC_VER_ATA7, "ATA-7" },
+ { WDC_VER_ATA8, "ATA-8" },
{ 0, NULL },
};
@@ -1041,6 +1042,10 @@
inqbuf->atap_sata_features_supp, ata_sata_feat);
}
+ if ((inqbuf->atap_ata_major & WDC_VER_ATA8) &&
+ (inqbuf->support_dsm & ATA_SUPPORT_DSM_TRIM))
+ printf("TRIM supported\n");
+
return;
}
diff -r 1d86822e09d4 sys/dev/ata/atareg.h
--- a/sys/dev/ata/atareg.h Mon Feb 27 16:37:53 2012 +0100
+++ b/sys/dev/ata/atareg.h Tue Feb 28 22:57:23 2012 +0100
@@ -90,6 +90,7 @@
/* Commands for Disk Controller. */
#define WDCC_NOP 0x00 /* Always fail with "aborted
command" */
+#define ATA_DATA_SET_MANAGEMENT 0x06
#define WDCC_RECAL 0x10 /* disk restore code -- resets
cntlr */
#define WDCC_READ 0x20 /* disk read code */
@@ -387,6 +388,7 @@
#define WDC_VER_ATA5 0x0020
#define WDC_VER_ATA6 0x0040
#define WDC_VER_ATA7 0x0080
+#define WDC_VER_ATA8 0x0100
uint16_t atap_ata_minor; /* 81: Minor version number */
uint16_t atap_cmd_set1; /* 82: command set supported */
#define WDC_CMD1_NOP 0x4000 /* NOP */
@@ -451,7 +453,8 @@
uint16_t atap_apm_val; /* 91: current APM value */
uint16_t __reserved5[8]; /* 92-99: reserved */
uint16_t atap_max_lba[4]; /* 100-103: Max. user LBA addr */
- uint16_t __reserved6[2]; /* 104-105: reserved */
+ uint16_t __reserved6; /* 104: reserved */
+ uint16_t max_dsm_blocks; /* 105: DSM (ATA-8/ACS-2) */
uint16_t atap_secsz; /* 106: physical/logical sector size */
#define ATA_SECSZ_VALID_MASK 0xc000
#define ATA_SECSZ_VALID 0x4000
@@ -480,7 +483,10 @@
#define ATA_CFA_MODE1_DIS 0x1000 /* CFA Mode 1 Disabled */
#define ATA_CFA_MODE1_REQ 0x2000 /* CFA Mode 1 Required */
#define ATA_CFA_WORD160 0x8000 /* Word 160 supported */
- uint16_t __reserved10[15]; /* 161-175: reserved for CFA */
+ uint16_t __reserved10[8]; /* 161-168: reserved for CFA */
+ uint16_t support_dsm; /* 169: DSM (ATA-8/ACS-2) */
+#define ATA_SUPPORT_DSM_TRIM 0x0001
+ uint16_t __reserved10a[6]; /* 170-175: reserved for CFA */
uint8_t atap_media_serial[60]; /* 176-205: media serial number */
uint16_t __reserved11[3]; /* 206-208: */
uint16_t atap_logical_align; /* 209: logical/physical alignment */
diff -r 1d86822e09d4 sys/dev/ata/wd.c
--- a/sys/dev/ata/wd.c Mon Feb 27 16:37:53 2012 +0100
+++ b/sys/dev/ata/wd.c Tue Feb 28 22:57:23 2012 +0100
@@ -178,6 +178,7 @@
void wddone(void *);
int wd_get_params(struct wd_softc *, u_int8_t, struct ataparams *);
int wd_flushcache(struct wd_softc *, int);
+int wd_trim(struct wd_softc *, int, struct disk_trim_range *);
bool wd_shutdown(device_t, int);
int wd_getcache(struct wd_softc *, int *);
@@ -1508,6 +1509,19 @@
return 0;
}
+ case DIOCGTRIMPARAMS: {
+ struct disk_trim_params * tp;
+
+ if (!(wd->sc_params.support_dsm & ATA_SUPPORT_DSM_TRIM))
+ return ENOTTY;
+ tp = (struct disk_trim_params *)addr;
+ tp->maxsize = 0xffff; /*wd->sc_params.max_dsm_blocks*/
+ printf("wd: maxtrimsize %ld\n", tp->maxsize);
+ return 0;
+ }
+ case DIOCTRIM:
+ return wd_trim(wd, WDPART(dev), (struct disk_trim_range *)addr);
+
default:
return ENOTTY;
}
@@ -1913,6 +1927,57 @@
return 0;
}
+/*
+ * wd_trim: ask the drive to discard one range of sectors with the ATA
+ * DATA SET MANAGEMENT command (TRIM bit set in the features register).
+ *
+ * wd:   softc of the disk
+ * part: partition the request came in on; for anything but RAW_PART
+ *       tr->bno is relative to the start of that partition
+ * tr:   first sector and sector count of the range
+ *
+ * Returns 0 on success (or when tr->size is 0), EINVAL if the range
+ * cannot be encoded in a single 8-byte range entry or does not fit in
+ * the partition, ENODEV if the drive aborted the command, and EIO on
+ * any other failure.
+ */
+int
+wd_trim(struct wd_softc *wd, int part, struct disk_trim_range *tr)
+{
+	const size_t reqsize = 512;	/* one payload block, see r_count */
+	struct ata_command ata_c;
+	unsigned char *req;
+	daddr_t bno = tr->bno;
+	int rv;
+
+	/*
+	 * A range entry is 6 bytes of LBA followed by a 2-byte count.
+	 * Reject what does not fit instead of silently truncating it.
+	 */
+	if (bno < 0 || tr->size < 0 || tr->size > 0xffff)
+		return EINVAL;
+	if (tr->size == 0)
+		return 0;
+
+	if (part != RAW_PART) {
+		const struct partition *pp =
+		    &wd->sc_dk.dk_label->d_partitions[part];
+
+		/* never let a partition device trim outside itself */
+		if ((uint64_t)bno + (uint64_t)tr->size > pp->p_size)
+			return EINVAL;
+		bno += pp->p_offset;
+	}
+	if (((uint64_t)bno >> 48) != 0)
+		return EINVAL;
+
+	/* zeroed, so the unused range entries stay empty */
+	req = kmem_zalloc(reqsize, KM_SLEEP);
+	req[0] = bno & 0xff;
+	req[1] = (bno >> 8) & 0xff;
+	req[2] = (bno >> 16) & 0xff;
+	req[3] = (bno >> 24) & 0xff;
+	req[4] = (bno >> 32) & 0xff;
+	req[5] = (bno >> 40) & 0xff;
+	req[6] = tr->size & 0xff;
+	req[7] = (tr->size >> 8) & 0xff;
+
+	memset(&ata_c, 0, sizeof(struct ata_command));
+	ata_c.r_command = ATA_DATA_SET_MANAGEMENT;
+	ata_c.r_count = 1;
+	ata_c.r_features = ATA_SUPPORT_DSM_TRIM;
+	ata_c.r_st_bmask = WDCS_DRDY;
+	ata_c.r_st_pmask = WDCS_DRDY;
+	ata_c.timeout = 30000; /* 30s timeout */
+	ata_c.data = req;
+	ata_c.bcount = reqsize;
+	ata_c.flags |= AT_WRITE | AT_WAIT;
+
+	/* AT_WAIT: the command is finished with the buffer on return */
+	rv = wd->atabus->ata_exec_command(wd->drvp, &ata_c);
+	kmem_free(req, reqsize);
+
+	if (rv != ATACMD_COMPLETE) {
+		aprint_error_dev(wd->sc_dev,
+		    "trim command didn't complete\n");
+		return EIO;
+	}
+	if ((ata_c.flags & AT_ERROR) && ata_c.r_error == WDCE_ABRT)
+		return ENODEV;	/* command not supported */
+	if (ata_c.flags & (AT_ERROR | AT_TIMEOU | AT_DF)) {
+		char sbuf[sizeof(at_errbits) + 64];
+
+		snprintb(sbuf, sizeof(sbuf), at_errbits, ata_c.flags);
+		aprint_error_dev(wd->sc_dev, "wd_trim: status=%s\n",
+		    sbuf);
+		return EIO;
+	}
+	return 0;
+}
+
+
bool
wd_shutdown(device_t dev, int how)
{
diff -r 1d86822e09d4 sys/dev/md.c
--- a/sys/dev/md.c Mon Feb 27 16:37:53 2012 +0100
+++ b/sys/dev/md.c Tue Feb 28 22:57:23 2012 +0100
@@ -60,6 +60,7 @@
#include <sys/proc.h>
#include <sys/conf.h>
#include <sys/disklabel.h>
+#include <sys/kmem.h>
#include <uvm/uvm_extern.h>
@@ -448,6 +449,9 @@
struct md_conf *umd;
struct disklabel *lp;
struct partinfo *pp;
+ struct disk_trim_params *tp;
+ struct disk_trim_range *tr;
+ struct buf trim_buf;
int error;
if ((sc = device_lookup_private(&md_cd, MD_UNIT(dev))) == NULL)
@@ -469,6 +473,34 @@
&sc->sc_dkdev.dk_label->d_partitions[DISKPART(dev)];
mutex_exit(&sc->sc_lock);
return 0;
+
+ case DIOCGTRIMPARAMS:
+ if (sc->sc_type != MD_UMEM_SERVER) {
+ mutex_exit(&sc->sc_lock);
+ return ENOTTY;
+ }
+ mutex_exit(&sc->sc_lock);
+ tp = (struct disk_trim_params *)data;
+ tp->maxsize = 200; /* XXX arbitrary */
+ return 0;
+
+ case DIOCTRIM:
+ tr = (struct disk_trim_range *)data;
+ KASSERT(sc->sc_type == MD_UMEM_SERVER);
+ memset(&trim_buf, 0, sizeof(trim_buf));
+ buf_init(&trim_buf);
+ /* XXX no partition support!? */
+ trim_buf.b_blkno = tr->bno;
+ trim_buf.b_bcount = tr->size;
+ trim_buf.b_data = (void *)0xdeadbeef;
+ trim_buf.b_cflags = BC_BUSY;
+ bufq_put(sc->sc_buflist, &trim_buf);
+ cv_signal(&sc->sc_cv);
+ mutex_exit(&sc->sc_lock);
+ biowait(&trim_buf);
+ error = trim_buf.b_error;
+ buf_destroy(&trim_buf);
+ return error;
}
}
@@ -635,7 +667,14 @@
size_t off; /* offset into "device" */
size_t xfer; /* amount to transfer */
int error;
- bool is_read;
+ bool is_read, is_trim;
+ static char trimpattern[DEV_BSIZE];
+ static int trimpattern_inited = 0;
+
+ if (!trimpattern_inited) {
+ memset(trimpattern, 'x', DEV_BSIZE);
+ trimpattern_inited = 1;
+ }
KASSERT(mutex_owned(&sc->sc_lock));
@@ -651,6 +690,7 @@
mutex_exit(&sc->sc_lock);
error = 0;
is_read = ((bp->b_flags & B_READ) == B_READ);
+ is_trim = (bp->b_data == (void *)0xdeadbeef);
bp->b_resid = bp->b_bcount;
off = (bp->b_blkno << DEV_BSHIFT);
if (off >= sc->sc_size) {
@@ -664,7 +704,15 @@
xfer = (sc->sc_size - off);
addr = (char *)sc->sc_addr + off;
disk_busy(&sc->sc_dkdev);
- if (is_read)
+ if (is_trim) {
+ int i;
+ for (i = 0; i < bp->b_bcount; i++) {
+ error = copyout(trimpattern, addr, DEV_BSIZE);
+ if (error)
+ break;
+ addr = (char *)addr + DEV_BSIZE;
+ }
+ } else if (is_read)
error = copyin(addr, bp->b_data, xfer);
else
error = copyout(bp->b_data, addr, xfer);
diff -r 1d86822e09d4 sys/sys/dkio.h
--- a/sys/sys/dkio.h Mon Feb 27 16:37:53 2012 +0100
+++ b/sys/sys/dkio.h Tue Feb 28 22:57:23 2012 +0100
@@ -109,4 +109,15 @@
#define DIOCTUR _IOR('d', 128, int) /* test unit ready */
+struct disk_trim_params { /* filled in by DIOCGTRIMPARAMS */
+ long maxsize; /* maximum trim size reported by the driver, in DEV_BSIZE units */
+};
+#define DIOCGTRIMPARAMS _IOR('d', 129, struct disk_trim_params) /* query trim limits */
+
+struct disk_trim_range { /* argument of DIOCTRIM: one contiguous range */
+ daddr_t bno; /* first block of the range; the ffs caller in this patch passes fsbtodb() output */
+ long size; /* length of the range; the ffs caller passes a byte count >> DEV_BSHIFT */
+};
+#define DIOCTRIM _IOW('d', 130, struct disk_trim_range) /* trim one range */
+
#endif /* _SYS_DKIO_H_ */
diff -r 1d86822e09d4 sys/ufs/ffs/ffs_alloc.c
--- a/sys/ufs/ffs/ffs_alloc.c Mon Feb 27 16:37:53 2012 +0100
+++ b/sys/ufs/ffs/ffs_alloc.c Tue Feb 28 22:57:23 2012 +0100
@@ -1552,9 +1552,8 @@
*
* => um_lock not held on entry or exit
*/
-void
-ffs_blkfree(struct fs *fs, struct vnode *devvp, daddr_t bno, long size,
- ino_t inum)
+static void
+ffs_blkfree_cg(struct fs *fs, struct vnode *devvp, daddr_t bno, long size)
{
struct cg *cgp;
struct buf *bp;
@@ -1574,12 +1573,6 @@
ump = VFSTOUFS(devvp->v_specmountpoint);
KASSERT(fs == ump->um_fs);
cgblkno = fsbtodb(fs, cgtod(fs, cg));
- if (ffs_snapblkfree(fs, devvp, bno, size, inum))
- return;
-
- error = ffs_check_bad_allocation(__func__, fs, bno, size, dev, inum);
- if (error)
- return;
error = bread(devvp, cgblkno, (int)fs->fs_cgsize,
NOCRED, B_MODIFY, &bp);
@@ -1598,6 +1591,230 @@
bdwrite(bp);
}
+struct trimdata { /* one contiguous range waiting to be trimmed and then freed */
+ struct work wk; /* must be first: ffs_trimcb() casts the work pointer back to this struct */
+ struct vnode *devvp; /* device vnode used for DIOCTRIM and ffs_blkfree_cg() */
+ daddr_t bno; /* start of the range as passed to ffs_blkfree() (converted with fsbtodb() for the ioctl) */
+ long size; /* length of the range in bytes (shifted by DEV_BSHIFT for the ioctl) */
+};
+
+struct trimstuff { /* per-mount trim state, stored in ump->um_trimstuff */
+ struct fs *fs; /* filesystem this state belongs to */
+ struct trimdata *entry; /* the single deferred range still open for merging, or NULL */
+ long maxsize; /* upper bound, in bytes, for a merged range */
+ kmutex_t entrylk; /* taken around accesses to entry */
+ struct workqueue *wq; /* runs ffs_trimcb() for each enqueued range */
+ int wqcnt, wqdraining; /* enqueued-but-unfinished items; set while ffs_trimstuff_finish() waits */
+ kmutex_t wqlk; /* taken around wqcnt/wqdraining updates */
+ kcondvar_t wqcv; /* signalled by ffs_trimcb() when wqcnt drops to 0 while draining */
+ /* timer for flush? */
+};
+
+int dotrim = 0;
+/*#define TRIMDEBUG*/
+
+/*
+ * Hand the whole range described by td to ffs_blkfree_cg(), one piece
+ * at a time.
+ *
+ * td->bno and td->size are consumed: on return td->size is 0 and
+ * td->bno has been advanced past the range.  The trimdata itself is
+ * not freed here.
+ */
+static void
+ffs_blkfree_td(struct fs *fs, struct trimdata *td)
+{
+ long todo;
+
+ while (td->size) {
+ /*
+  * Each piece is the smaller of what is left of the range and
+  * lfragtosize(fs_frag - fragnum(bno)) -- presumably the bytes
+  * from td->bno up to the end of its filesystem block, so that
+  * a single ffs_blkfree_cg() call never spans two blocks
+  * (the macros are defined outside this patch; confirm).
+  */
+ todo = min(td->size,
+ lfragtosize(fs, (fs->fs_frag - fragnum(fs, td->bno))));
+ ffs_blkfree_cg(fs, td->devvp, td->bno, todo);
+ td->bno += numfrags(fs, todo);
+ td->size -= todo;
+ }
+}
+
+/*
+ * Workqueue callback (registered by ffs_trimstuff_init()): issue
+ * DIOCTRIM for one queued range, then free the range in the filesystem
+ * and release the trimdata.
+ *
+ * wk:  the work item; it is the first member of a struct trimdata.
+ * arg: the struct trimstuff that was passed to workqueue_create().
+ */
+static void
+ffs_trimcb(struct work *wk, void *arg)
+{
+ struct trimdata *td = (void *)wk;
+ struct trimstuff *ts = arg;
+ struct fs *fs = ts->fs;
+ struct disk_trim_range ta;
+ int error;
+
+ /* convert the range to the units the ioctl is given: fsbtodb() / DEV_BSHIFT */
+ ta.bno = fsbtodb(fs, td->bno);
+ ta.size = td->size >> DEV_BSHIFT;
+ /* the result is only reported under TRIMDEBUG; the blocks are freed either way */
+ error = VOP_IOCTL(td->devvp, DIOCTRIM, &ta, FWRITE, FSCRED);
+#ifdef TRIMDEBUG
+ printf("trim(%lld,%ld):%d\n", td->bno, td->size, error);
+#endif
+
+ /* only now, after the trim attempt, are the blocks handed back */
+ ffs_blkfree_td(fs, td);
+ kmem_free(td, sizeof(*td));
+ /* account for the finished item and wake ffs_trimstuff_finish() if it waits for the last one */
+ mutex_enter(&ts->wqlk);
+ ts->wqcnt--;
+ if (ts->wqdraining && !ts->wqcnt)
+ cv_signal(&ts->wqcv);
+ mutex_exit(&ts->wqlk);
+}
+
+/*
+ * Set up trim state for a filesystem on devvp.
+ *
+ * Returns an opaque pointer (a struct trimstuff) for um_trimstuff, or
+ * NULL when trimming is not used: the global dotrim is 0, the device
+ * fails DIOCGTRIMPARAMS, its maximum trim size is smaller than one
+ * filesystem block, or the workqueue cannot be created.
+ */
+void *
+ffs_trimstuff_init(struct vnode *devvp, struct fs *fs)
+{
+ struct disk_trim_params tp;
+ struct trimstuff *ts;
+ int error;
+
+ if (!dotrim)
+ return NULL;
+ error = VOP_IOCTL(devvp, DIOCGTRIMPARAMS, &tp, FREAD, FSCRED);
+ if (error) {
+ printf("DIOCGTRIMPARAMS: %d\n", error);
+ return NULL;
+ }
+ /* tp.maxsize is in DEV_BSIZE units; require room for at least one fs block */
+ if (tp.maxsize * DEV_BSIZE < fs->fs_bsize) {
+ printf("tp.maxsize=%ld, fs_bsize=%d\n", tp.maxsize,
fs->fs_bsize);
+ return NULL;
+ }
+
+ ts = kmem_zalloc(sizeof (*ts), KM_SLEEP);
+ error = workqueue_create(&ts->wq, "trimwq", ffs_trimcb, ts,
+ 0, 0, 0);
+ if (error) {
+ kmem_free(ts, sizeof (*ts));
+ return NULL;
+ }
+ mutex_init(&ts->entrylk, MUTEX_DEFAULT, IPL_NONE);
+ mutex_init(&ts->wqlk, MUTEX_DEFAULT, IPL_NONE);
+ cv_init(&ts->wqcv, "trimwqcv");
+ /*
+  * NOTE(review): max() raises the merge limit to at least 100 KiB
+  * even when the device reported less, so a merged range could
+  * exceed the device limit; min() may have been intended -- confirm.
+  */
+ ts->maxsize = max(tp.maxsize * DEV_BSIZE, 100*1024); /* XXX */
+ ts->fs = fs;
+ return ts;
+}
+
+/*
+ * Tear down the trim state created by ffs_trimstuff_init().
+ *
+ * vts:   the pointer returned by ffs_trimstuff_init() (must not be NULL)
+ * flags: mount flags from the caller; currently unused
+ *
+ * Waits until every enqueued range has been processed by ffs_trimcb(),
+ * frees a still-deferred range in the filesystem without trimming it,
+ * and then releases all resources.  No new ranges may be submitted
+ * through ffs_blkfree() while or after this runs.
+ */
+void
+ffs_trimstuff_finish(void *vts, int flags)
+{
+	struct trimstuff *ts = vts;
+	struct trimdata *td;
+
+	/*
+	 * Wait for the workqueue to drain.  This has to loop: giving up
+	 * after a timeout or an early wakeup would destroy the locks and
+	 * free ts while queued callbacks still reference them.
+	 */
+	mutex_enter(&ts->wqlk);
+	ts->wqdraining = 1;
+	while (ts->wqcnt != 0) {
+		if (cv_timedwait(&ts->wqcv, &ts->wqlk, mstohz(5000)))
+			printf("ffs_trimstuff drain timeout\n");
+	}
+	mutex_exit(&ts->wqlk);
+
+	/* pick up the range that was still waiting for a neighbour */
+	mutex_enter(&ts->entrylk);
+	td = ts->entry;
+	ts->entry = NULL;
+	mutex_exit(&ts->entrylk);
+	if (td) {
+		/* XXX don't tell disk, its optional */
+		ffs_blkfree_td(ts->fs, td);
+#ifdef TRIMDEBUG
+		printf("finish(%lld,%ld)\n", td->bno, td->size);
+#endif
+		kmem_free(td, sizeof(*td));
+	}
+
+	/*
+	 * Destroy the workqueue first so its worker is gone before the
+	 * cv and mutexes that ffs_trimcb() uses are torn down.
+	 */
+	workqueue_destroy(ts->wq);
+	cv_destroy(&ts->wqcv);
+	mutex_destroy(&ts->entrylk);
+	mutex_destroy(&ts->wqlk);
+	kmem_free(ts, sizeof(*ts));
+}
+
+
+/*
+ * Free a block or fragment range, optionally trimming it on the device
+ * first.
+ *
+ * After the snapshot and bad-allocation checks, a mount without trim
+ * state (um_trimstuff == NULL) frees the range immediately through
+ * ffs_blkfree_cg().  Otherwise the range is either merged into the one
+ * deferred entry kept in the trim state, parked there as the new
+ * deferred entry, or enqueued on the workqueue, whose callback issues
+ * the trim and only then frees the blocks.
+ *
+ * fs/devvp: filesystem and its device vnode
+ * bno/size: start of the range and its length in bytes
+ * inum:     passed on to ffs_snapblkfree() and ffs_check_bad_allocation()
+ */
+void
+ffs_blkfree(struct fs *fs, struct vnode *devvp, daddr_t bno, long size,
+ ino_t inum)
+{
+ struct ufsmount *ump;
+ int error;
+ dev_t dev;
+ struct trimstuff *ts;
+ struct trimdata *td;
+
+ dev = devvp->v_rdev;
+ ump = VFSTOUFS(devvp->v_specmountpoint);
+ /* nothing more to do here when ffs_snapblkfree() returns nonzero */
+ if (ffs_snapblkfree(fs, devvp, bno, size, inum))
+ return;
+
+ error = ffs_check_bad_allocation(__func__, fs, bno, size, dev, inum);
+ if (error)
+ return;
+
+ /* trim not set up for this mount: free right away */
+ if (!ump->um_trimstuff) {
+ ffs_blkfree_cg(fs, devvp, bno, size);
+ return;
+ }
+
+#ifdef TRIMDEBUG
+ printf("blkfree(%lld,%ld)\n", bno, size);
+#endif
+ ts = ump->um_trimstuff;
+ td = NULL;
+
+ /*
+  * Step 1: try to merge with the deferred entry.  On leaving this
+  * section td is either NULL or an entry that has been detached from
+  * ts->entry and must be enqueued; size == 0 means the new range was
+  * absorbed into td.
+  */
+ mutex_enter(&ts->entrylk);
+ if (ts->entry) {
+ td = ts->entry;
+ /* ffs deallocs backwards, check for prepend only */
+ if (td->bno == bno + numfrags(fs, size)
+ && td->size + size <= ts->maxsize) {
+ td->bno = bno;
+ td->size += size;
+ /* still below the limit: keep collecting */
+ if (td->size < ts->maxsize) {
+#ifdef TRIMDEBUG
+ printf("defer(%lld,%ld)\n", td->bno, td->size);
+#endif
+ mutex_exit(&ts->entrylk);
+ return;
+ }
+ size = 0; /* mark done */
+ }
+ ts->entry = NULL;
+ }
+ mutex_exit(&ts->entrylk);
+
+ /* Step 2: hand the detached entry to the workqueue */
+ if (td) {
+#ifdef TRIMDEBUG
+ printf("enq old(%lld,%ld)\n", td->bno, td->size);
+#endif
+ /* count the item before enqueueing so the drain in finish sees it */
+ mutex_enter(&ts->wqlk);
+ ts->wqcnt++;
+ mutex_exit(&ts->wqlk);
+ workqueue_enqueue(ts->wq, &td->wk, NULL);
+ }
+ if (!size)
+ return;
+
+ /* Step 3: the new range was not merged; start a fresh entry for it */
+ td = kmem_alloc(sizeof(*td), KM_SLEEP);
+ td->devvp = devvp;
+ td->bno = bno;
+ td->size = size;
+
+ if (td->size < ts->maxsize) { /* XXX always the case */
+ mutex_enter(&ts->entrylk);
+ /* the lock was dropped above, so the slot may have been taken meanwhile */
+ if (!ts->entry) { /* possible race? */
+#ifdef TRIMDEBUG
+ printf("defer(%lld,%ld)\n", td->bno, td->size);
+#endif
+ ts->entry = td;
+ td = NULL;
+ }
+ mutex_exit(&ts->entrylk);
+ }
+ /* could not be deferred: enqueue it directly */
+ if (td) {
+#ifdef TRIMDEBUG
+ printf("enq new(%lld,%ld)\n", td->bno, td->size);
+#endif
+ mutex_enter(&ts->wqlk);
+ ts->wqcnt++;
+ mutex_exit(&ts->wqlk);
+ workqueue_enqueue(ts->wq, &td->wk, NULL);
+ }
+}
+
+
/*
* Free a block or fragment from a snapshot cg copy.
*
diff -r 1d86822e09d4 sys/ufs/ffs/ffs_extern.h
--- a/sys/ufs/ffs/ffs_extern.h Mon Feb 27 16:37:53 2012 +0100
+++ b/sys/ufs/ffs/ffs_extern.h Tue Feb 28 22:57:23 2012 +0100
@@ -98,6 +98,8 @@
int ffs_blkalloc(struct inode *, daddr_t, long);
int ffs_blkalloc_ump(struct ufsmount *, daddr_t, long);
void ffs_blkfree(struct fs *, struct vnode *, daddr_t, long, ino_t);
+void *ffs_trimstuff_init(struct vnode *, struct fs *);
+void ffs_trimstuff_finish(void *, int);
void ffs_blkfree_snap(struct fs *, struct vnode *, daddr_t, long, ino_t);
int ffs_vfree(struct vnode *, ino_t, int);
int ffs_checkfreefile(struct fs *, struct vnode *, ino_t);
diff -r 1d86822e09d4 sys/ufs/ffs/ffs_vfsops.c
--- a/sys/ufs/ffs/ffs_vfsops.c Mon Feb 27 16:37:53 2012 +0100
+++ b/sys/ufs/ffs/ffs_vfsops.c Tue Feb 28 22:57:23 2012 +0100
@@ -1279,6 +1279,9 @@
ufs_extattr_uepm_init(&ump->um_extattr);
#endif /* UFS_EXTATTR */
+ /* XXX mount option */
+ ump->um_trimstuff = ffs_trimstuff_init(devvp, fs);
+
return (0);
out:
#ifdef WAPBL
@@ -1435,6 +1438,11 @@
extern int doforce;
#endif
+ if (ump->um_trimstuff) {
+ ffs_trimstuff_finish(ump->um_trimstuff, mntflags);
+ ump->um_trimstuff = NULL;
+ }
+
flags = 0;
if (mntflags & MNT_FORCE)
flags |= FORCECLOSE;
diff -r 1d86822e09d4 sys/ufs/ufs/ufsmount.h
--- a/sys/ufs/ufs/ufsmount.h Mon Feb 27 16:37:53 2012 +0100
+++ b/sys/ufs/ufs/ufsmount.h Tue Feb 28 22:57:23 2012 +0100
@@ -124,6 +124,8 @@
void *um_snapinfo; /* snapshot private data */
const struct ufs_ops *um_ops;
+
+ void *um_trimstuff;
};
struct ufs_ops {
Home |
Main Index |
Thread Index |
Old Index