Source-Changes-HG archive
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]
[src/trunk]: src/sys/dev/raidframe Vastly improve the error handling in the c...
details: https://anonhg.NetBSD.org/src/rev/6c75a96125db
branches: trunk
changeset: 573704:6c75a96125db
user: oster <oster%NetBSD.org@localhost>
date: Sat Feb 05 23:32:43 2005 +0000
description:
Vastly improve the error handling in the case of a read/write error
that occurs during a reconstruction. We go from zero error handling
and likely panicing if something goes amiss, to gracefully bailing and
leaving the system in the best, usable state possible.
- introduce rf_DrainReconEventQueue() to allow easy cleaning of the
reconstruction event queue
- change how we cleanup the floating recon buffers in
rf_FreeReconControl(). Detect the end of the list rather
than traversing according to a count.
- keep track of the number of pending reconstruction writes. In the
event of a read error, use this to wait long enough for the pending
writes to (hopefully) drain.
- more cleanup is still needed on this code, but I didn't want to
start mixing major functional changes with minor cleanups.
XXX: There is a known issue with pool items left outstanding due to
the IO failure, and this can show up in the form of a panic at the
tail end of a shutdown. This problem is much less severe than before
these changes, and the hope/plan is that this problem will go away
once this code gets overhauled again.
diffstat:
sys/dev/raidframe/rf_reconstruct.c | 308 +++++++++++++++++++++++++++++++-----
sys/dev/raidframe/rf_reconstruct.h | 8 +-
sys/dev/raidframe/rf_reconutil.c | 14 +-
sys/dev/raidframe/rf_revent.c | 29 +++-
sys/dev/raidframe/rf_revent.h | 4 +-
5 files changed, 307 insertions(+), 56 deletions(-)
diffs (truncated from 666 to 300 lines):
diff -r 1db845a5d8f1 -r 6c75a96125db sys/dev/raidframe/rf_reconstruct.c
--- a/sys/dev/raidframe/rf_reconstruct.c Sat Feb 05 20:19:00 2005 +0000
+++ b/sys/dev/raidframe/rf_reconstruct.c Sat Feb 05 23:32:43 2005 +0000
@@ -1,4 +1,4 @@
-/* $NetBSD: rf_reconstruct.c,v 1.81 2005/01/22 02:24:31 oster Exp $ */
+/* $NetBSD: rf_reconstruct.c,v 1.82 2005/02/05 23:32:43 oster Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
@@ -33,7 +33,7 @@
************************************************************/
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: rf_reconstruct.c,v 1.81 2005/01/22 02:24:31 oster Exp $");
+__KERNEL_RCSID(0, "$NetBSD: rf_reconstruct.c,v 1.82 2005/02/05 23:32:43 oster Exp $");
#include <sys/time.h>
#include <sys/buf.h>
@@ -94,6 +94,11 @@
#endif /* RF_DEBUG_RECON */
+#define RF_RECON_DONE_READS 1
+#define RF_RECON_READ_ERROR 2
+#define RF_RECON_WRITE_ERROR 3
+#define RF_RECON_READ_STOPPED 4
+
#define RF_MAX_FREE_RECONBUFFER 32
#define RF_MIN_FREE_RECONBUFFER 16
@@ -320,6 +325,16 @@
rf_update_component_labels(raidPtr,
RF_NORMAL_COMPONENT_UPDATE);
+ } else {
+ /* Reconstruct failed. */
+
+ RF_LOCK_MUTEX(raidPtr->mutex);
+ /* Failed disk goes back to "failed" status */
+ raidPtr->Disks[col].status = rf_ds_failed;
+
+ /* Spare disk goes back to "spare" status. */
+ spareDiskPtr->status = rf_ds_spare;
+ RF_UNLOCK_MUTEX(raidPtr->mutex);
}
return (rc);
}
@@ -496,10 +511,6 @@
reconDesc->maxReconExecTicks = 0;
rc = rf_ContinueReconstructFailedDisk(reconDesc);
- RF_LOCK_MUTEX(raidPtr->mutex);
- raidPtr->reconInProgress--;
- RF_UNLOCK_MUTEX(raidPtr->mutex);
-
if (!rc) {
RF_LOCK_MUTEX(raidPtr->mutex);
/* Need to set these here, as at this point it'll be claiming
@@ -536,8 +547,18 @@
rf_update_component_labels(raidPtr,
RF_NORMAL_COMPONENT_UPDATE);
+ } else {
+ /* Reconstruct-in-place failed. Disk goes back to
+ "failed" status, regardless of what it was before. */
+ RF_LOCK_MUTEX(raidPtr->mutex);
+ raidPtr->Disks[col].status = rf_ds_failed;
+ RF_UNLOCK_MUTEX(raidPtr->mutex);
+ }
- }
+ RF_LOCK_MUTEX(raidPtr->mutex);
+ raidPtr->reconInProgress--;
+ RF_UNLOCK_MUTEX(raidPtr->mutex);
+
RF_SIGNAL_COND(raidPtr->waitForReconCond);
return (rc);
}
@@ -552,9 +573,12 @@
RF_ReconMap_t *mapPtr;
RF_ReconCtrl_t *tmp_reconctrl;
RF_ReconEvent_t *event;
+ RF_CallbackDesc_t *p;
struct timeval etime, elpsd;
unsigned long xor_s, xor_resid_us;
int i, ds;
+ int status;
+ int recon_error, write_error;
raidPtr->accumXorTimeUs = 0;
#if RF_ACC_TRACE > 0
@@ -609,19 +633,65 @@
* they've completed all work */
mapPtr = raidPtr->reconControl->reconMap;
-
+ recon_error = 0;
+ write_error = 0;
+
while (reconDesc->numDisksDone < raidPtr->numCol - 1) {
event = rf_GetNextReconEvent(reconDesc);
RF_ASSERT(event);
-
- if (ProcessReconEvent(raidPtr, event))
+
+ status = ProcessReconEvent(raidPtr, event);
+
+ /* the normal case is that a read completes, and all is well. */
+ if (status == RF_RECON_DONE_READS) {
reconDesc->numDisksDone++;
+ } else if ((status == RF_RECON_READ_ERROR) ||
+ (status == RF_RECON_WRITE_ERROR)) {
+ /* an error was encountered while reconstructing...
+ Pretend we've finished this disk.
+ */
+ recon_error = 1;
+ raidPtr->reconControl->error = 1;
+
+ /* bump the numDisksDone count for reads,
+ but not for writes */
+ if (status == RF_RECON_READ_ERROR)
+ reconDesc->numDisksDone++;
+
+ /* write errors are special -- when we are
+ done dealing with the reads that are
+ finished, we don't want to wait for any
+ writes */
+ if (status == RF_RECON_WRITE_ERROR)
+ write_error = 1;
+
+ } else if (status == RF_RECON_READ_STOPPED) {
+ /* count this component as being "done" */
+ reconDesc->numDisksDone++;
+ }
+
+ if (recon_error) {
+
+ /* make sure any stragglers are woken up so that
+ their theads will complete, and we can get out
+ of here with all IO processed */
+
+ while (raidPtr->reconControl->headSepCBList) {
+ p = raidPtr->reconControl->headSepCBList;
+ raidPtr->reconControl->headSepCBList = p->next;
+ p->next = NULL;
+ rf_CauseReconEvent(raidPtr, p->col, NULL, RF_REVENT_HEADSEPCLEAR);
+ rf_FreeCallbackDesc(p);
+ }
+ }
+
raidPtr->reconControl->numRUsTotal =
mapPtr->totalRUs;
raidPtr->reconControl->numRUsComplete =
mapPtr->totalRUs -
rf_UnitsLeftToReconstruct(mapPtr);
+
#if RF_DEBUG_RECON
raidPtr->reconControl->percentComplete =
(raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
@@ -637,19 +707,107 @@
}
/* at this point all the reads have completed. We now wait
* for any pending writes to complete, and then we're done */
-
- while (rf_UnitsLeftToReconstruct(raidPtr->reconControl->reconMap) > 0) {
+
+ while (!recon_error && rf_UnitsLeftToReconstruct(raidPtr->reconControl->reconMap) > 0) {
event = rf_GetNextReconEvent(reconDesc);
RF_ASSERT(event);
-
- (void) ProcessReconEvent(raidPtr, event); /* ignore return code */
+
+ status = ProcessReconEvent(raidPtr, event);
+ if (status == RF_RECON_WRITE_ERROR) {
+ recon_error = 1;
+ raidPtr->reconControl->error = 1;
+ /* an error was encountered at the very end... bail */
+ } else {
#if RF_DEBUG_RECON
- raidPtr->reconControl->percentComplete = 100 - (rf_UnitsLeftToReconstruct(mapPtr) * 100 / mapPtr->totalRUs);
- if (rf_prReconSched) {
- rf_PrintReconSchedule(raidPtr->reconControl->reconMap, &(raidPtr->reconControl->starttime));
+ raidPtr->reconControl->percentComplete = 100 - (rf_UnitsLeftToReconstruct(mapPtr) * 100 / mapPtr->totalRUs);
+ if (rf_prReconSched) {
+ rf_PrintReconSchedule(raidPtr->reconControl->reconMap, &(raidPtr->reconControl->starttime));
+ }
+#endif
}
+ }
+
+ if (recon_error) {
+ /* we've encountered an error in reconstructing. */
+ printf("raid%d: reconstruction failed.\n", raidPtr->raidid);
+
+ /* we start by blocking IO to the RAID set. */
+ rf_SuspendNewRequestsAndWait(raidPtr);
+
+ RF_LOCK_MUTEX(raidPtr->mutex);
+ /* mark set as being degraded, rather than
+ rf_rs_reconstructing as we were before the problem.
+ After this is done we can update status of the
+ component disks without worrying about someone
+ trying to read from a failed component.
+ */
+ raidPtr->status = rf_rs_degraded;
+ RF_UNLOCK_MUTEX(raidPtr->mutex);
+
+ /* resume IO */
+ rf_ResumeNewRequests(raidPtr);
+
+ /* At this point there are two cases:
+ 1) If we've experienced a read error, then we've
+ already waited for all the reads we're going to get,
+ and we just need to wait for the writes.
+
+ 2) If we've experienced a write error, we've also
+ already waited for all the reads to complete,
+ but there is little point in waiting for the writes --
+ when they do complete, they will just be ignored.
+
+ So we just wait for writes to complete if we didn't have a
+ write error.
+ */
+
+ if (!write_error) {
+ /* wait for writes to complete */
+ while (raidPtr->reconControl->pending_writes > 0) {
+ event = rf_GetNextReconEvent(reconDesc);
+ status = ProcessReconEvent(raidPtr, event);
+
+ if (status == RF_RECON_WRITE_ERROR) {
+ raidPtr->reconControl->error = 1;
+ /* an error was encountered at the very end... bail.
+ This will be very bad news for the user, since
+ at this point there will have been a read error
+ on one component, and a write error on another!
+ */
+ break;
+ }
+ }
+ }
+
+
+ /* cleanup */
+
+ /* drain the event queue - after waiting for the writes above,
+ there shouldn't be much (if anything!) left in the queue. */
+
+ rf_DrainReconEventQueue(reconDesc);
+
+ /* XXX As much as we'd like to free the recon control structure
+ and the reconDesc, we have no way of knowing if/when those will
+ be touched by IO that has yet to occur. It is rather poor to be
+ basically causing a 'memory leak' here, but there doesn't seem to be
+ a cleaner alternative at this time. Perhaps when the reconstruct code
+ gets a makeover this problem will go away.
+ */
+#if 0
+ rf_FreeReconControl(raidPtr);
#endif
+
+#if RF_ACC_TRACE > 0
+ RF_Free(raidPtr->recon_tracerecs, raidPtr->numCol * sizeof(RF_AccTraceEntry_t));
+#endif
+ /* XXX see comment above */
+#if 0
+ FreeReconDesc(reconDesc);
+#endif
+
+ return (1);
}
/* Success: mark the dead disk as reconstructed. We quiesce
@@ -683,7 +841,6 @@
(int) raidPtr->reconControl->starttime.tv_sec,
(int) raidPtr->reconControl->starttime.tv_usec,
(int) etime.tv_sec, (int) etime.tv_usec);
-
#if RF_RECON_STATS > 0
printf("raid%d: Total head-sep stall count was %d\n",
raidPtr->raidid, (int) reconDesc->hsStallCount);
@@ -695,11 +852,10 @@
FreeReconDesc(reconDesc);
return (0);
+
}
/*****************************************************************************
* do the right thing upon each reconstruction event.
- * returns nonzero if and only if there is nothing left unread on the
- * indicated disk
*****************************************************************************/
static int
ProcessReconEvent(RF_Raid_t *raidPtr, RF_ReconEvent_t *event)
@@ -708,6 +864,8 @@
RF_ReconBuffer_t *rbuf;
RF_SectorCount_t sectorsPerRU;
Home |
Main Index |
Thread Index |
Old Index