Source-Changes-HG archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

[src/trunk]: src/sys/dev/raidframe Fix a long-standing bug related to rebooting while a reconstruct-to-spare is underway but not yet complete.



details:   https://anonhg.NetBSD.org/src/rev/0361c97736fe
branches:  trunk
changeset: 333698:0361c97736fe
user:      oster <oster%NetBSD.org@localhost>
date:      Fri Nov 14 14:29:16 2014 +0000

description:
Fix a long-standing bug related to rebooting while a
reconstruct-to-spare is underway but not yet complete.

The issue was that a component was being marked as a used_spare when
the rebuild started, not when the rebuild was actually finished.
Marking it as a used_spare meant that the component label on the spare
was being updated such that after a reboot the component would be
considered up-to-date, regardless of whether the rebuild actually
completed!

This fix includes:
 1) Add an additional state "rf_ds_rebuilding_spare" which is used
    to denote that a spare is currently being rebuilt from the live
    components.
 2) Update the comments on the disk states, which were out-of-sync
    with reality.
 3) When rebuilding to a spare component, that spare now enters the
    state rf_ds_rebuilding_spare instead of the state rf_ds_used_spare.
 4) When the rebuild is actually complete, the spare component
    enters the rf_ds_used_spare state.  rf_ds_used_spare is now used
    exclusively for the case where the rebuild to the spare has
    completed successfully.

XXX: Someday we need to teach raidctl(8) about this new state, and
take out the backwards compatibility code in rf_netbsdkintf.c (see
RAIDFRAME_GET_INFO in raidioctl()).  For today, this fix needs to be
generic enough that it can get backported without major grief.

XXX: Needs pullup to netbsd-5*, netbsd-6*, and netbsd-7

Fixes PR#49244.

diffstat:

 sys/dev/raidframe/raidframevar.h   |  20 ++++++++++----------
 sys/dev/raidframe/rf_netbsdkintf.c |   8 ++++++--
 sys/dev/raidframe/rf_reconstruct.c |  15 +++++++++++----
 3 files changed, 27 insertions(+), 16 deletions(-)

diffs (115 lines):

diff -r 02bed496030a -r 0361c97736fe sys/dev/raidframe/raidframevar.h
--- a/sys/dev/raidframe/raidframevar.h  Fri Nov 14 13:30:48 2014 +0000
+++ b/sys/dev/raidframe/raidframevar.h  Fri Nov 14 14:29:16 2014 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: raidframevar.h,v 1.16 2014/02/28 10:16:51 skrll Exp $ */
+/*     $NetBSD: raidframevar.h,v 1.17 2014/11/14 14:29:16 oster Exp $ */
 /*-
  * Copyright (c) 1996, 1997, 1998 The NetBSD Foundation, Inc.
  * All rights reserved.
@@ -383,17 +383,17 @@
  * IF YOU ADD A STATE, CHECK TO SEE IF YOU NEED TO MODIFY RF_DEAD_DISK().
  */
 enum RF_DiskStatus_e {
-        rf_ds_optimal,          /* no problems */
-        rf_ds_failed,           /* reconstruction ongoing */
-        rf_ds_reconstructing,   /* reconstruction complete to spare, dead disk
-                                 * not yet replaced */
-        rf_ds_dist_spared,      /* reconstruction complete to distributed
+       rf_ds_optimal,          /* no problems */
+       rf_ds_failed,           /* disk has failed */
+       rf_ds_reconstructing,   /* reconstruction ongoing */
+       rf_ds_dist_spared,      /* reconstruction complete to distributed
                                  * spare space, dead disk not yet replaced */
-        rf_ds_spared,           /* reconstruction complete to distributed
-                                 * spare space, dead disk not yet replaced */
-        rf_ds_spare,            /* an available spare disk */
-        rf_ds_used_spare        /* a spare which has been used, and hence is
+       rf_ds_spared,           /* reconstruction complete, dead disk not 
+                                  yet replaced */
+       rf_ds_spare,            /* an available spare disk */
+       rf_ds_used_spare,       /* a spare which has been used, and hence is
                                  * not available */
+       rf_ds_rebuilding_spare  /* a spare which is being rebuilt to */
 };
 typedef enum RF_DiskStatus_e RF_DiskStatus_t;
 
diff -r 02bed496030a -r 0361c97736fe sys/dev/raidframe/rf_netbsdkintf.c
--- a/sys/dev/raidframe/rf_netbsdkintf.c        Fri Nov 14 13:30:48 2014 +0000
+++ b/sys/dev/raidframe/rf_netbsdkintf.c        Fri Nov 14 14:29:16 2014 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: rf_netbsdkintf.c,v 1.315 2014/11/04 07:51:55 mlelstv Exp $     */
+/*     $NetBSD: rf_netbsdkintf.c,v 1.316 2014/11/14 14:29:16 oster Exp $       */
 
 /*-
  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
@@ -101,7 +101,7 @@
  ***********************************************************/
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.315 2014/11/04 07:51:55 mlelstv Exp $");
+__KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.316 2014/11/14 14:29:16 oster Exp $");
 
 #ifdef _KERNEL_OPT
 #include "opt_compat_netbsd.h"
@@ -1532,6 +1532,10 @@
                }
                for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
                        d_cfg->spares[i] = raidPtr->Disks[j];
+                       if (d_cfg->spares[i].status == rf_ds_rebuilding_spare) {
+                               /* XXX: raidctl(8) expects to see this as a used spare */
+                               d_cfg->spares[i].status = rf_ds_used_spare;
+                       }
                }
                retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
                RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
diff -r 02bed496030a -r 0361c97736fe sys/dev/raidframe/rf_reconstruct.c
--- a/sys/dev/raidframe/rf_reconstruct.c        Fri Nov 14 13:30:48 2014 +0000
+++ b/sys/dev/raidframe/rf_reconstruct.c        Fri Nov 14 14:29:16 2014 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: rf_reconstruct.c,v 1.120 2014/06/14 07:39:00 hannken Exp $     */
+/*     $NetBSD: rf_reconstruct.c,v 1.121 2014/11/14 14:29:16 oster Exp $       */
 /*
  * Copyright (c) 1995 Carnegie-Mellon University.
  * All rights reserved.
@@ -33,7 +33,7 @@
  ************************************************************/
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: rf_reconstruct.c,v 1.120 2014/06/14 07:39:00 hannken Exp $");
+__KERNEL_RCSID(0, "$NetBSD: rf_reconstruct.c,v 1.121 2014/11/14 14:29:16 oster Exp $");
 
 #include <sys/param.h>
 #include <sys/time.h>
@@ -263,7 +263,7 @@
                for (scol = raidPtr->numCol; scol < raidPtr->numCol + raidPtr->numSpare; scol++) {
                        if (raidPtr->Disks[scol].status == rf_ds_spare) {
                                spareDiskPtr = &raidPtr->Disks[scol];
-                               spareDiskPtr->status = rf_ds_used_spare;
+                               spareDiskPtr->status = rf_ds_rebuilding_spare;
                                break;
                        }
                }
@@ -310,6 +310,13 @@
                /* XXX doesn't hold for RAID 6!!*/
 
                rf_lock_mutex2(raidPtr->mutex);
+               /* The failed disk has already been marked as rf_ds_spared 
+                  (or rf_ds_dist_spared) in
+                  rf_ContinueReconstructFailedDisk() 
+                  so we just update the spare disk as being a used spare
+               */
+
+               spareDiskPtr->status = rf_ds_used_spare;
                raidPtr->parity_good = RF_RAID_CLEAN;
                rf_unlock_mutex2(raidPtr->mutex);
 
@@ -483,7 +490,7 @@
        rf_unlock_mutex2(raidPtr->mutex);
 
        spareDiskPtr = &raidPtr->Disks[col];
-       spareDiskPtr->status = rf_ds_used_spare;
+       spareDiskPtr->status = rf_ds_rebuilding_spare;
 
        printf("raid%d: initiating in-place reconstruction on column %d\n",
               raidPtr->raidid, col);



Home | Main Index | Thread Index | Old Index