Subject: Dealing with bad blocks on fixed disks
To: None <tech-kern@netbsd.org>
From: Darren Reed <darrenr@reed.wattle.id.au>
List: tech-kern
Date: 04/13/2003 10:49:01
Having been bitten again(!) by a disk with errors, it occurred to me that
NetBSD's handling of these errors was sub-optimal - the kernel would keep
trying block X and attempts to access it would always result in an error
that was considered "unrecoverable".

I discussed this briefly with Jason who suggested a linked list might be
suitable for the job, because, (a) only disks with errors would be impacted
and (b) if the list is non-NULL, the disk is already "in bad shape".  The
logic seems fine to me :-)  A few hours later and I've come up with some
patches, below.  Of the things I'm not sure about, malloc/free and cleaning
up the SLIST created are top of the list.

To round out the functionality, I added DIOCBSLIST (to get a list of the
bad sectors) and DIOCBSFLUSH (flush the list of bad sectors).  I'm not
100% sure if I've done DIOCBSLIST "correctly" but given the lack of sysctl
at that level, I didn't see any other way to go about it.  Amongst other
things, the impact of changes to the bad sector structure on any
application that uses DIOCBSLIST is of concern and whether or not that
can be designed around.  At first glance, it would appear that atactl(8)
would be the appropriate place to start with for a user interface to these
ioctls to the disks ?

Of the disk devices I've looked at, wd.c, ld.c and sd.c, only wd.c seems
to obvious to me for how to fix the problem.  In trawling through the sd.c
source code, I found sd_reassign_blocks() but no reference to it anywhere
in dev/scsipi/* and unless I'm mistaken, this is now an orphan function ?
Does someone want to comment on this or about SCSI disks and how we handle
failures to read disks and whether or not using the same algorithm as I've
used below for wd.c would be appropriate for SCSI ?

So far, testing of the changes to wd.c, excluding the ioctls, seems to
indicate the patches work as designed :)

Darren

Index: sys/disk.h
===================================================================
RCS file: /cvsroot/src/sys/sys/disk.h,v
retrieving revision 1.20
diff -c -r1.20 disk.h
*** sys/disk.h	2002/11/01 11:32:02	1.20
--- sys/disk.h	2003/04/13 00:29:04
***************
*** 182,187 ****
--- 182,197 ----
   */
  TAILQ_HEAD(disklist_head, disk);	/* the disklist is a TAILQ */
  
+ /*
+  * Bad sector lists per fixed disk
+  */
+ struct disk_badsectors {
+ 	SLIST_ENTRY(disk_badsectors)	dbs_next;
+ 	u_int64_t	dbs_min;	/* min. sector number */
+ 	u_int64_t	dbs_max;	/* max. sector number */
+ 	struct timeval	dbs_failedat;	/* first failure at */
+ };
+ 
  #ifdef _KERNEL
  extern	int disk_count;			/* number of disks in global disklist */
  
Index: sys/dkio.h
===================================================================
RCS file: /cvsroot/src/sys/sys/dkio.h,v
retrieving revision 1.6
diff -c -r1.6 dkio.h
*** sys/dkio.h	2002/01/09 04:12:13	1.6
--- sys/dkio.h	2003/04/13 00:29:04
***************
*** 88,91 ****
--- 88,95 ----
  		/* sync disk cache */
  #define	DIOCCACHESYNC	_IOW('d', 118, int)	/* sync cache (force?) */
  
+ 		/* bad sector list */
+ #define	DIOCBSLIST	_IOWR('d', 119, caddr_t)	/* get list */
+ #define	DIOCBSFLUSH	_IO('d', 120)			/* flush list */
+ 
  #endif /* _SYS_DKIO_H_ */
Index: dev/ata/wd.c
===================================================================
RCS file: /cvsroot/src/sys/dev/ata/wd.c,v
retrieving revision 1.240
diff -c -r1.240 wd.c
*** dev/ata/wd.c	2003/04/03 22:18:23	1.240
--- dev/ata/wd.c	2003/04/13 00:29:08
***************
*** 288,293 ****
--- 288,294 ----
  #else
  	bufq_alloc(&wd->sc_q, BUFQ_DISKSORT|BUFQ_SORT_RAWBLOCK);
  #endif
+ 	SLIST_INIT(&wd->sc_bslist);
  
  	wd->atabus = adev->adev_bustype;
  	wd->openings = adev->adev_openings;
***************
*** 423,428 ****
--- 424,436 ----
  	struct buf *bp;
  	int s, bmaj, cmaj, i, mn;
  
+ 	/* Clean out the bad sector list */
+ 	while (!SLIST_EMPTY(&sc->sc_bslist)) {
+ 		void *head = SLIST_FIRST(&sc->sc_bslist);
+ 		SLIST_REMOVE_HEAD(&sc->sc_bslist, dbs_next);
+ 		free(head, M_TEMP);
+ 	}
+ 
  	/* locate the major number */
  	bmaj = bdevsw_lookup_major(&wd_bdevsw);
  	cmaj = cdevsw_lookup_major(&wd_cdevsw);
***************
*** 525,530 ****
--- 533,555 ----
  
  	bp->b_rawblkno = blkno;
  
+ 	/*
+ 	 * If the transfer about to be attempted contains only a block that
+ 	 * is known to be bad then return an error for the transfer without
+ 	 * even attempting to start a transfer up under the premis that we
+ 	 * will just end up doing more retries for a transfer that will end
+ 	 * up failing again.
+ 	 * XXX:SMP - mutex required to protect with DIOCBSFLUSH
+ 	 */
+ 	if (__predict_false(!SLIST_EMPTY(&wd->sc_bslist))) {
+ 		struct disk_badsectors *dbs;
+ 		daddr_t maxblk = blkno + bp->b_bcount;
+ 
+ 		SLIST_FOREACH(dbs, &wd->sc_bslist, dbs_next)
+ 			if (dbs->dbs_min >= blkno && dbs->dbs_max < maxblk)
+ 				goto bad;
+ 	}
+ 
  	/* Queue transfer on drive, activate drive and controller if idle. */
  	s = splbio();
  	BUFQ_PUT(&wd->sc_q, bp);
***************
*** 744,749 ****
--- 769,790 ----
  			return;
  		}
  		printf("\n");
+ 
+ 		/*
+ 		 * Not all errors indicate a failed block but those that do,
+ 		 * put the block on the bad-block list for the device.
+ 		 */
+ 		if ((wd->drvp->ata_vers >= 4 && wd->sc_wdc_bio.r_error & 64) ||
+ 		    (wd->drvp->ata_vers < 4 && wd->sc_wdc_bio.r_error & 192)) {
+ 			struct disk_badsectors *dbs;
+ 
+ 			dbs = malloc(sizeof *dbs, M_TEMP, M_WAITOK);
+ 			dbs->dbs_min = bp->b_rawblkno;
+ 			dbs->dbs_max = dbs->dbs_min + bp->b_bcount - 1;
+ 			microtime(&dbs->dbs_failedat);
+ 			SLIST_INSERT_HEAD(&wd->sc_bslist, dbs, dbs_next);
+ 		}
+ 
  		bp->b_flags |= B_ERROR;
  		bp->b_error = EIO;
  		break;
***************
*** 1136,1141 ****
--- 1177,1213 ----
  		bad144intern(wd);
  		return 0;
  #endif
+ 
+ 	case DIOCBSLIST :
+ 	{
+ 		caddr_t laddr = *(caddr_t *)addr;
+ 		struct disk_badsectors *dbs;
+ 		size_t available;
+ 		u_int32_t count;
+ 
+ 		count = 0;
+ 		copyin(laddr, &available, sizeof(available));
+ 		laddr += sizeof(count);
+ 		SLIST_FOREACH(dbs, &wd->sc_bslist, dbs_next) {
+ 			if (available < sizeof(dbs))
+ 				break;
+ 			available -= sizeof(dbs);
+ 			copyout(dbs, laddr, sizeof(*dbs));
+ 			laddr += sizeof(*dbs);
+ 			count++;
+ 		}
+ 		copyout(&count, *(caddr_t *)addr, sizeof(count));
+ 		return 0;
+ 	}
+ 
+ 	case DIOCBSFLUSH :
+ 		/* Clean out the bad sector list */
+ 		while (!SLIST_EMPTY(&wd->sc_bslist)) {
+ 			void *head = SLIST_FIRST(&wd->sc_bslist);
+ 			SLIST_REMOVE_HEAD(&wd->sc_bslist, dbs_next);
+ 			free(head, M_TEMP);
+ 		}
+ 		return 0;
  
  	case DIOCGDINFO:
  		*(struct disklabel *)addr = *(wd->sc_dk.dk_label);
Index: dev/ata/wdvar.h
===================================================================
RCS file: /cvsroot/src/sys/dev/ata/wdvar.h,v
retrieving revision 1.15
diff -c -r1.15 wdvar.h
*** dev/ata/wdvar.h	2003/04/03 22:18:23	1.15
--- dev/ata/wdvar.h	2003/04/13 00:29:08
***************
*** 136,141 ****
--- 136,143 ----
  
  	void *sc_sdhook;		/* our shutdown hook */
  
+ 	SLIST_HEAD(, disk_badsectors)	sc_bslist;
+ 
  #if NRND > 0
  	rndsource_element_t	rnd_source;
  #endif