Subject: Re: data corruption on aceride
To: Hiroki Sato <hrs@allbsd.org>
From: Manuel Bouyer <bouyer@antioche.eu.org>
List: port-sparc64
Date: 06/22/2005 22:03:12
--n8g4imXOkfNTN/H1
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline

On Sun, Jun 19, 2005 at 09:07:20PM +0900, Hiroki Sato wrote:
>  I am using netbsd-2-0 as of March, but I have data corruption
>  issue on Sun Blade 100 for nearly a year.
>  The corruption does not occur immediately, but find(1) performed
>  by the /etc/security script triggers the problem gradually
>  and it starts reporting "bad file descriptor".  Until
>  the problem occurs it takes several weeks or months.
> 
>  After DMA of the aceride driver is disabled by using "flags 0x0002,"
>  the symptom disappears and the box becomes stable for these months.
>  Should I try the latest current kernel?

Could you please try the attached patch against the netbsd-2 branch ?
I'm testing a similar change to a current kernel and it seems to work, but
as I didn't see the corruption probleme before ...

-- 
Manuel Bouyer <bouyer@antioche.eu.org>
     NetBSD: 26 ans d'experience feront toujours la difference
--

--n8g4imXOkfNTN/H1
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename=diff

Index: ic/wdc.c
===================================================================
RCS file: /cvsroot/src/sys/dev/ic/wdc.c,v
retrieving revision 1.172.2.7.2.2
diff -u -r1.172.2.7.2.2 wdc.c
--- ic/wdc.c	16 Apr 2005 10:59:50 -0000	1.172.2.7.2.2
+++ ic/wdc.c	22 Jun 2005 19:53:59 -0000
@@ -697,23 +697,13 @@
 	delay(5000);
 #endif
 
-	if (wdc != NULL && (wdc->cap & WDC_CAPABILITY_SELECT))
-		wdc->select(chp,0);
-	bus_space_write_1(chp->cmd_iot, chp->cmd_iohs[wd_sdh], 0, WDSD_IBM);
-	delay(10);	/* 400ns delay */
-	/* assert SRST, wait for reset to complete */
-	bus_space_write_1(chp->ctl_iot, chp->ctl_ioh, wd_aux_ctlr,
-	    WDCTL_RST | WDCTL_IDS | WDCTL_4BIT); 
-	DELAY(1000);
-	bus_space_write_1(chp->ctl_iot, chp->ctl_ioh, wd_aux_ctlr,
-	    WDCTL_IDS | WDCTL_4BIT);
+	if (wdc != NULL)
+		wdc->reset(chp, RESET_POLL);
+	else
+		wdc_do_reset(chp, RESET_POLL);
 	DELAY(2000);
 	(void) bus_space_read_1(chp->cmd_iot, chp->cmd_iohs[wd_error], 0);
 	bus_space_write_1(chp->ctl_iot, chp->ctl_ioh, wd_aux_ctlr, WDCTL_4BIT);
-	delay(10);	/* 400ns delay */
-	/* ACK interrupt in case there is one pending left (Promise ATA100) */
-	if (wdc != NULL && (wdc->cap & WDC_CAPABILITY_IRQACK))
-		wdc->irqack(chp);
 	splx(s);
 
 	ret_value = __wdcwait_reset(chp, ret_value, poll);
@@ -782,6 +772,8 @@
 	callout_init(&chp->ch_callout);
 	if (wdc->drv_probe == NULL)
 		wdc->drv_probe = wdc_drvprobe;
+	if (wdc->reset == NULL)
+		wdc->reset = wdc_do_reset;
 	if (inited == 0) {
 		/* Initialize the ata_xfer pool. */
 		pool_init(&wdc_xfer_pool, sizeof(struct ata_xfer), 0,
@@ -1033,9 +1025,33 @@
 {
 	struct wdc_softc *wdc = chp->ch_wdc;
 	int drv_mask1, drv_mask2;
+
+	wdc->reset(chp, poll);
+
+	drv_mask1 = (chp->ch_drive[0].drive_flags & DRIVE) ? 0x01:0x00;
+	drv_mask1 |= (chp->ch_drive[1].drive_flags & DRIVE) ? 0x02:0x00;
+	drv_mask2 = __wdcwait_reset(chp, drv_mask1,
+	    (poll == RESET_SLEEP) ? 0 : 1);
+	if (drv_mask2 != drv_mask1) {
+		printf("%s channel %d: reset failed for",
+		    wdc->sc_dev.dv_xname, chp->ch_channel);
+		if ((drv_mask1 & 0x01) != 0 && (drv_mask2 & 0x01) == 0)
+			printf(" drive 0");
+		if ((drv_mask1 & 0x02) != 0 && (drv_mask2 & 0x02) == 0)
+			printf(" drive 1");
+		printf("\n");
+	}
+	bus_space_write_1(chp->ctl_iot, chp->ctl_ioh, wd_aux_ctlr, WDCTL_4BIT);
+	return  (drv_mask1 != drv_mask2) ? 1 : 0;
+}
+
+void
+wdc_do_reset(struct wdc_channel *chp, int poll)
+{
+	struct wdc_softc *wdc = chp->ch_wdc;
 	int s = 0;
 
-	if (wdc->cap & WDC_CAPABILITY_SELECT)
+	if (wdc && wdc->cap & WDC_CAPABILITY_SELECT)
 		wdc->select(chp,0);
 	if (poll != RESET_SLEEP)
 		s = splbio();
@@ -1043,6 +1059,10 @@
 	bus_space_write_1(chp->cmd_iot, chp->cmd_iohs[wd_sdh], 0, WDSD_IBM);
 	delay(10);	/* 400ns delay */
 	bus_space_write_1(chp->ctl_iot, chp->ctl_ioh, wd_aux_ctlr,
+	    WDCTL_IDS | WDCTL_4BIT);
+	delay(10);	/* 400ns delay */
+	/* assert SRST, wait for reset to complete */
+	bus_space_write_1(chp->ctl_iot, chp->ctl_ioh, wd_aux_ctlr,
 	    WDCTL_RST | WDCTL_IDS | WDCTL_4BIT);
 	delay(2000);
 	(void) bus_space_read_1(chp->cmd_iot, chp->cmd_iohs[wd_error], 0);
@@ -1050,26 +1070,10 @@
 	    WDCTL_4BIT | WDCTL_IDS);
 	delay(10);	/* 400ns delay */
 	if (poll != RESET_SLEEP) {
-		if (wdc->cap & WDC_CAPABILITY_IRQACK)
+		if (wdc && wdc->cap & WDC_CAPABILITY_IRQACK)
 			wdc->irqack(chp);
 		splx(s);
 	}
-
-	drv_mask1 = (chp->ch_drive[0].drive_flags & DRIVE) ? 0x01:0x00;
-	drv_mask1 |= (chp->ch_drive[1].drive_flags & DRIVE) ? 0x02:0x00;
-	drv_mask2 = __wdcwait_reset(chp, drv_mask1,
-	    (poll == RESET_SLEEP) ? 0 : 1);
-	if (drv_mask2 != drv_mask1) {
-		printf("%s channel %d: reset failed for",
-		    wdc->sc_dev.dv_xname, chp->ch_channel);
-		if ((drv_mask1 & 0x01) != 0 && (drv_mask2 & 0x01) == 0)
-			printf(" drive 0");
-		if ((drv_mask1 & 0x02) != 0 && (drv_mask2 & 0x02) == 0)
-			printf(" drive 1");
-		printf("\n");
-	}
-	bus_space_write_1(chp->ctl_iot, chp->ctl_ioh, wd_aux_ctlr, WDCTL_4BIT);
-	return  (drv_mask1 != drv_mask2) ? 1 : 0;
 }
 
 static int
Index: ic/wdcvar.h
===================================================================
RCS file: /cvsroot/src/sys/dev/ic/wdcvar.h,v
retrieving revision 1.55.2.1
diff -u -r1.55.2.1 wdcvar.h
--- ic/wdcvar.h	18 Apr 2004 02:23:41 -0000	1.55.2.1
+++ ic/wdcvar.h	22 Jun 2005 19:53:59 -0000
@@ -165,6 +165,8 @@
 
 	/* if WDC_CAPABILITY_IRQACK set in 'cap' */
 	void		(*irqack)(struct wdc_channel *);
+	/* Optional callback to perform a bus reset */
+	void		(*reset)(struct wdc_channel *, int);
 };
 
 /*
@@ -205,6 +207,7 @@
 void	wdccommandshort(struct wdc_channel *, int, int);
 void	wdctimeout(void *arg);
 void	wdc_reset_channel(struct ata_drive_datas *, int);
+void	wdc_do_reset(struct wdc_channel *, int);
 
 int	wdc_exec_command(struct ata_drive_datas *, struct wdc_command*);
 #define WDC_COMPLETE 0x01
Index: pci/aceride.c
===================================================================
RCS file: /cvsroot/src/sys/dev/pci/aceride.c,v
retrieving revision 1.6
diff -u -r1.6 aceride.c
--- pci/aceride.c	3 Jan 2004 22:56:53 -0000	1.6
+++ pci/aceride.c	22 Jun 2005 19:53:59 -0000
@@ -38,6 +38,8 @@
 #include <dev/pci/pciidevar.h>
 #include <dev/pci/pciide_acer_reg.h>
 
+static int acer_pcib_match(struct pci_attach_args *);
+static void acer_do_reset(struct wdc_channel *, int);
 static void acer_chip_map(struct pciide_softc*, struct pci_attach_args*);
 static void acer_setup_channel(struct wdc_channel*);
 static int  acer_pci_intr(void *);
@@ -45,7 +47,11 @@
 static int  aceride_match(struct device *, struct cfdata *, void *);
 static void aceride_attach(struct device *, struct device *, void *);
 
-CFATTACH_DECL(aceride, sizeof(struct pciide_softc),
+struct aceride_softc {
+	struct pciide_softc pciide_sc;
+	struct pci_attach_args pcib_pa;
+};
+CFATTACH_DECL(aceride, sizeof(struct aceride_softc),
     aceride_match, aceride_attach, NULL, NULL);
 
 static const struct pciide_product_desc pciide_acer_products[] =  {
@@ -86,6 +92,21 @@
 
 }
 
+static int
+acer_pcib_match(struct pci_attach_args *pa)
+{
+	/*
+	 * we need to access the PCI config space of the pcib, see
+	 * acer_do_reset()
+	 */
+	if (PCI_CLASS(pa->pa_class) == PCI_CLASS_BRIDGE &&
+	    PCI_SUBCLASS(pa->pa_class) == PCI_SUBCLASS_BRIDGE_ISA &&
+	    PCI_VENDOR(pa->pa_id) == PCI_VENDOR_ALI &&
+	    PCI_PRODUCT(pa->pa_id) == PCI_PRODUCT_ALI_M1543)
+		return 1;
+	return 0;
+}
+
 static void
 acer_chip_map(struct pciide_softc *sc, struct pci_attach_args *pa)
 {
@@ -94,6 +115,7 @@
 	pcireg_t cr, interface;
 	bus_size_t cmdsize, ctlsize;
 	pcireg_t rev = PCI_REVISION(pa->pa_class);
+	struct aceride_softc *acer_sc = (struct aceride_softc *)sc;
 
 	if (pciide_chipen(sc, pa) == 0)
 		return;
@@ -152,6 +174,15 @@
 		    | ACER_0x4B_CDETECT);
 	}
 
+	if (rev == 0xC3) {
+		/* install reset bug workaround */
+		if (pci_find_device(&acer_sc->pcib_pa, acer_pcib_match) == 0) {
+			printf("%s: WARNING: can't find pci-isa bridge\n",
+			    sc->sc_wdcdev.sc_dev.dv_xname);
+		} else
+			sc->sc_wdcdev.reset = acer_do_reset;
+	}
+
 	for (channel = 0; channel < sc->sc_wdcdev.nchannels; channel++) {
 		cp = &sc->pciide_channels[channel];
 		if (pciide_chansetup(sc, channel, interface) == 0)
@@ -169,6 +200,31 @@
 }
 
 static void
+acer_do_reset(struct wdc_channel *chp, int poll)
+{
+	struct pciide_channel *cp = (struct pciide_channel*)chp;
+	struct pciide_softc *sc = (struct pciide_softc *)cp->wdc_channel.ch_wdc;
+	struct aceride_softc *acer_sc = (struct aceride_softc *)sc;
+	u_int8_t reg;
+
+	/*
+	 * From OpenSolaris: after a reset we need to disable/enable the
+	 * corresponding channel, or data corruption will occur in
+	 * UltraDMA modes
+	 */
+
+	wdc_do_reset(chp, poll);
+	reg = pciide_pci_read(acer_sc->pcib_pa.pa_pc, acer_sc->pcib_pa.pa_tag,
+	    ACER_PCIB_CTRL);
+	printf("acer_do_reset reg 0x%x\n", reg);
+	pciide_pci_write(acer_sc->pcib_pa.pa_pc, acer_sc->pcib_pa.pa_tag,
+	    ACER_PCIB_CTRL, reg & ACER_PCIB_CTRL_ENCHAN(chp->ch_channel));
+	delay(1000);
+	pciide_pci_write(acer_sc->pcib_pa.pa_pc, acer_sc->pcib_pa.pa_tag,
+	    ACER_PCIB_CTRL, reg);
+}
+
+static void
 acer_setup_channel(struct wdc_channel *chp)
 {
 	struct ata_drive_datas *drvp;
Index: pci/pciide_acer_reg.h
===================================================================
RCS file: /cvsroot/src/sys/dev/pci/pciide_acer_reg.h,v
retrieving revision 1.7
diff -u -r1.7 pciide_acer_reg.h
--- pci/pciide_acer_reg.h	5 Oct 2003 17:48:49 -0000	1.7
+++ pci/pciide_acer_reg.h	22 Jun 2005 19:53:59 -0000
@@ -90,6 +90,10 @@
 #define ACER_0x79_REVC2_EN	0x4
 #define ACER_0x79_EN		0x2
 
+/* OpenSolaris: channel enable/disable in the PCI-ISA bridge */
+#define ACER_PCIB_CTRL	0x58
+#define ACER_PCIB_CTRL_ENCHAN(chan) (0x4 << (chan))
+
 /*
  * IDE bus frequency (1 byte)
  * This should be setup by the BIOS - can we rely on this ?

--n8g4imXOkfNTN/H1--