NetBSD-Bugs archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

Re: kern/58111: Tracking issue for potential ZFS data corruption



The following reply was made to PR kern/58111; it has been noted by GNATS.

From: Simon Burge <simonb%NetBSD.org@localhost>
To: gnats-bugs%netbsd.org@localhost
Cc: kern-bug-people%netbsd.org@localhost, gnats-admin%netbsd.org@localhost,
    netbsd-bugs%netbsd.org@localhost
Subject: Re: kern/58111: Tracking issue for potential ZFS data corruption
Date: Fri, 05 Apr 2024 22:31:14 +1100

 > >Number:         58111
 > >Category:       kern
 > >Synopsis:       Tracking issue for potential ZFS data corruption
 
 Attached is a reproducer for the ZFS corruption bug.  There's a
 coreutils patch to make "cp" use the FIOSEEKHOLE and FIOSEEKDATA
 ioctls.  Coreutils "cp" and "dd" should be installed on the target as
 "/usr/bin/gcp" and "gdd" (in your path), and the script needs bash as
 /bin/bash.  It should probably use NetBSD's dd with msgfmt=quiet and
 NetBSD's /bin/sh with some tweaks, but I was lazy.
 
 Running this reproducer on an 8 CPU qemu NetBSD amd64 VM with an 8GB ZFS
 pool got 11 groups of failures over 8 hours on an unpatched host.
 
 Cheers,
 Simon.
 --
 # This is a shell archive.  Save it in a file, remove anything before
 # this line, and then unpack it by entering "sh file".  Note, it may
 # create directories; files and directories will be owned by you and
 # have default permissions.
 #
 # This archive contains:
 #
 #	reproducer.sh
 #	repro10.sh
 #	coreutils-copy.c.diff
 #
 echo x - 'reproducer.sh'
 sed 's/^X//' >'reproducer.sh' << 'END-of-reproducer.sh'
 X#!/bin/bash
 X#
 X# Run this script multiple times in parallel inside your pool's mount
 X# to reproduce https://github.com/openzfs/zfs/issues/15526.  Like:
 X#
 X# ./reproducer.sh & ./reproducer.sh & ./reproducer.sh & ./reproducer.sh & wait
 X#
 X
 X#if [ $(cat /sys/module/zfs/parameters/zfs_bclone_enabled) != "1" ] ; then
 X#	echo "please set /sys/module/zfs/parameters/zfs_bclone_enabled = 1"
 X#	exit
 X#fi
 X
 X#CP=/home/rich/coreutils-9.1/src/cp
 X#CP=/home/rich/coreutils-9.3/src/cp
 X#CP=/home/rich/coreutils/src/cp
 XCP=/usr/bin/gcp
 X
 Xprefix="reproducer_${BASHPID}_"
 Xgdd if=/dev/urandom of=${prefix}0 bs=1M count=1 status=none
 X
 X##### echo "writing files"
 Xend=500
 Xh=0
 Xfor i in `seq 1 2 $end` ; do
 X	let "j=$i+1"
 X	${CP} ${prefix}$h ${prefix}$i
 X	${CP} ${prefix}$i ${prefix}$j
 X	let "h++"
 Xdone
 X
 X##### echo "checking files"
 Xfor i in `seq 1 $end` ; do
 X	diff ${prefix}0 ${prefix}$i
 Xdone
 END-of-reproducer.sh
 echo x - 'repro10.sh'
 sed 's/^X//' >'repro10.sh' << 'END-of-repro10.sh'
 X#!/bin/sh
 X
 Xdate
 X# echo cleaning up previous test
 Xrm -f reproducer_* 2> /dev/null
 X
 Xscriptdir=$(dirname $0)
 X
 X${scriptdir}/reproducer.sh &
 X${scriptdir}/reproducer.sh &
 X${scriptdir}/reproducer.sh &
 X${scriptdir}/reproducer.sh &
 X${scriptdir}/reproducer.sh &
 X${scriptdir}/reproducer.sh &
 X${scriptdir}/reproducer.sh &
 X${scriptdir}/reproducer.sh &
 X${scriptdir}/reproducer.sh &
 X${scriptdir}/reproducer.sh &
 Xwait
 END-of-repro10.sh
 echo x - 'coreutils-copy.c.diff'
 sed 's/^X//' >'coreutils-copy.c.diff' << 'END-of-coreutils-copy.c.diff'
 X--- src/copy.c.orig	2023-08-29 21:39:27.000000000 +1000
 X+++ src/copy.c	2024-04-05 02:48:41.652462664 +1100
 X@@ -534,6 +534,7 @@
 X   return true;
 X }
 X 
 X+#define SEEK_HOLE	// XXX netbsd
 X #ifdef SEEK_HOLE
 X /* Perform an efficient extent copy, if possible.  This avoids
 X    the overhead of detecting holes in hole-introducing/preserving
 X@@ -562,7 +563,10 @@
 X 
 X   while (0 <= ext_start)
 X     {
 X-      off_t ext_end = lseek (src_fd, ext_start, SEEK_HOLE);
 X+      //XXX off_t ext_end = lseek (src_fd, ext_start, SEEK_HOLE);
 X+      off_t ext_end = ext_start;
 X+      if (ioctl(src_fd, FIOSEEKHOLE, &ext_end) < 0)
 X+	ext_end = -1;
 X       if (ext_end < 0)
 X         {
 X           if (errno != ENXIO)
 X@@ -641,7 +645,10 @@
 X           break;
 X         }
 X 
 X-      ext_start = lseek (src_fd, dest_pos, SEEK_DATA);
 X+      //XXX ext_start = lseek (src_fd, dest_pos, SEEK_DATA);
 X+      ext_start = dest_pos;
 X+      if (ioctl(src_fd, FIOSEEKDATA, &ext_start) < 0)
 X+	ext_start = -1;
 X       if (ext_start < 0 && errno != ENXIO)
 X         goto cannot_lseek;
 X     }
 X@@ -1141,13 +1148,19 @@
 X 
 X   /* Only attempt SEEK_HOLE if this heuristic
 X      suggests the file is sparse.  */
 X+#if 0	// XXX skip this check!
 X   if (! (HAVE_STRUCT_STAT_ST_BLOCKS
 X          && S_ISREG (sb->st_mode)
 X          && ST_NBLOCKS (*sb) < sb->st_size / ST_NBLOCKSIZE))
 X     return PLAIN_SCANTYPE;
 X+#endif	// XXX
 X 
 X #ifdef SEEK_HOLE
 X-  off_t ext_start = lseek (fd, 0, SEEK_DATA);
 X+  //XXX off_t ext_start = lseek (fd, 0, SEEK_DATA);
 X+  off_t ext_start = 0;
 X+  if (ioctl(fd, FIOSEEKDATA, &ext_start) < 0)
 X+    ext_start = -1;
 X+
 X   if (0 <= ext_start || errno == ENXIO)
 X     {
 X       scan_inference->ext_start = ext_start;
 END-of-coreutils-copy.c.diff
 exit
 


Home | Main Index | Thread Index | Old Index