written to, that device.
 
       state
-        A file recording the current state of the device in the array
+       A file recording the current state of the device in the array
        which can be a comma separated list of
              faulty   - device has been kicked from active use due to
-                         a detected fault or it has unacknowledged bad
-                         blocks
+                        a detected fault, or it has unacknowledged bad
+                        blocks
              in_sync  - device is a fully in-sync member of the array
              writemostly - device will only be subject to read
-                        requests if there are no other options.
+                        requests if there are no other options.
                         This applies only to raid1 arrays.
              blocked  - device has failed, and the failure hasn't been
                         acknowledged yet by the metadata handler.
                         This includes spares that are in the process
                         of being recovered to
              write_error - device has ever seen a write error.
+             want_replacement - device is (mostly) working but probably
+                        should be replaced, either due to errors or
+                        due to user request.
+             replacement - device is a replacement for another active
+                        device with the same raid_disk.
+
+
        This list may grow in future.
        This can be written to.
        Writing "faulty"  simulates a failure on the device.
        Writing "in_sync" sets the in_sync flag.
        Writing "write_error" sets writeerrorseen flag.
        Writing "-write_error" clears writeerrorseen flag.
+       Writing "want_replacement" is allowed at any time except to a
+               replacement device or a spare.  It sets the flag.
+       Writing "-want_replacement" is allowed at any time.  It clears
+               the flag.
+       Writing "replacement" or "-replacement" is only allowed before
+               starting the array.  It sets or clears the flag.
+
 
        This file responds to select/poll. Any change to 'faulty'
        or 'blocked' causes an event.
 
                }
                if (sb->devflags & WriteMostly1)
                        set_bit(WriteMostly, &rdev->flags);
+               if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
+                       set_bit(Replacement, &rdev->flags);
        } else /* MULTIPATH are always insync */
                set_bit(In_sync, &rdev->flags);
 
                sb->recovery_offset =
                        cpu_to_le64(rdev->recovery_offset);
        }
+       if (test_bit(Replacement, &rdev->flags))
+               sb->feature_map |=
+                       cpu_to_le32(MD_FEATURE_REPLACEMENT);
 
        if (mddev->reshape_position != MaxSector) {
                sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
                len += sprintf(page+len, "%swrite_error", sep);
                sep = ",";
        }
+       if (test_bit(WantReplacement, &rdev->flags)) {
+               len += sprintf(page+len, "%swant_replacement", sep);
+               sep = ",";
+       }
+       if (test_bit(Replacement, &rdev->flags)) {
+               len += sprintf(page+len, "%sreplacement", sep);
+               sep = ",";
+       }
+
        return len+sprintf(page+len, "\n");
 }
 
        } else if (cmd_match(buf, "-write_error")) {
                clear_bit(WriteErrorSeen, &rdev->flags);
                err = 0;
+       } else if (cmd_match(buf, "want_replacement")) {
+               /* Any non-spare device that is not a replacement can
+                * become want_replacement at any time, but we then need to
+                * check if recovery is needed.  For a spare or replacement
+                * the flag is not set, yet the write still succeeds. */
+               if (rdev->raid_disk >= 0 &&
+                   !test_bit(Replacement, &rdev->flags))
+                       set_bit(WantReplacement, &rdev->flags);
+               set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
+               md_wakeup_thread(rdev->mddev->thread);
+               err = 0;
+       } else if (cmd_match(buf, "-want_replacement")) {
+               /* Clearing 'want_replacement' is always allowed.
+                * Once replacement starts it is too late though.
+                */
+               err = 0;
+               clear_bit(WantReplacement, &rdev->flags);
+       } else if (cmd_match(buf, "replacement")) {
+               /* Can only set a device as a replacement when array has not
+                * yet been started.  Once running, replacement is automatic
+                * from spares, or by assigning 'slot'.
+                */
+               if (rdev->mddev->pers)
+                       err = -EBUSY;
+               else {
+                       set_bit(Replacement, &rdev->flags);
+                       err = 0;
+               }
+       } else if (cmd_match(buf, "-replacement")) {
+               /* Similarly, can only clear Replacement before start */
+               if (rdev->mddev->pers)
+                       err = -EBUSY;
+               else {
+                       clear_bit(Replacement, &rdev->flags);
+                       err = 0;
+               }
        }
        if (!err)
                sysfs_notify_dirent_safe(rdev->sysfs_state);
                        if (test_bit(Faulty, &rdev->flags)) {
                                seq_printf(seq, "(F)");
                                continue;
-                       } else if (rdev->raid_disk < 0)
+                       }
+                       if (rdev->raid_disk < 0)
                                seq_printf(seq, "(S)"); /* spare */
+                       if (test_bit(Replacement, &rdev->flags))
+                               seq_printf(seq, "(R)");
                        sectors += rdev->sectors;
                }
 
 
         * This reduces the burden of testing multiple flags in many cases
         */
 
-       unsigned long   flags;
-#define        Faulty          1               /* device is known to have a fault */
-#define        In_sync         2               /* device is in_sync with rest of array */
-#define        WriteMostly     4               /* Avoid reading if at all possible */
-#define        AutoDetected    7               /* added by auto-detect */
-#define Blocked                8               /* An error occurred but has not yet
-                                        * been acknowledged by the metadata
-                                        * handler, so don't allow writes
-                                        * until it is cleared */
-#define WriteErrorSeen 9               /* A write error has been seen on this
-                                        * device
-                                        */
-#define FaultRecorded  10              /* Intermediate state for clearing
-                                        * Blocked.  The Fault is/will-be
-                                        * recorded in the metadata, but that
-                                        * metadata hasn't been stored safely
-                                        * on disk yet.
-                                        */
-#define BlockedBadBlocks 11            /* A writer is blocked because they
-                                        * found an unacknowledged bad-block.
-                                        * This can safely be cleared at any
-                                        * time, and the writer will re-check.
-                                        * It may be set at any time, and at
-                                        * worst the writer will timeout and
-                                        * re-check.  So setting it as
-                                        * accurately as possible is good, but
-                                        * not absolutely critical.
-                                        */
+       unsigned long   flags;  /* bit set of 'enum flag_bits' bits. */
        wait_queue_head_t blocked_wait;
 
        int desc_nr;                    /* descriptor index in the superblock */
                sector_t size;          /* in sectors */
        } badblocks;
 };
+enum flag_bits {               /* bit indices into md_rdev 'flags' */
+       Faulty,                 /* device is known to have a fault */
+       In_sync,                /* device is in_sync with rest of array */
+       WriteMostly,            /* Avoid reading if at all possible */
+       AutoDetected,           /* added by auto-detect */
+       Blocked,                /* An error occurred but has not yet
+                                * been acknowledged by the metadata
+                                * handler, so don't allow writes
+                                * until it is cleared */
+       WriteErrorSeen,         /* A write error has been seen on this
+                                * device
+                                */
+       FaultRecorded,          /* Intermediate state for clearing
+                                * Blocked.  The Fault is/will-be
+                                * recorded in the metadata, but that
+                                * metadata hasn't been stored safely
+                                * on disk yet.
+                                */
+       BlockedBadBlocks,       /* A writer is blocked because they
+                                * found an unacknowledged bad-block.
+                                * This can safely be cleared at any
+                                * time, and the writer will re-check.
+                                * It may be set at any time, and at
+                                * worst the writer will timeout and
+                                * re-check.  So setting it as
+                                * accurately as possible is good, but
+                                * not absolutely critical.
+                                */
+       WantReplacement,        /* This device is a candidate to be
+                                * hot-replaced, either because it has
+                                * reported some faults, or because
+                                * of explicit request (writing
+                                * "want_replacement" to 'state'). */
+       Replacement,            /* This device is a replacement for
+                                * a want_replacement device with the
+                                * same raid_disk number; recorded in the
+                                * superblock as MD_FEATURE_REPLACEMENT */
+};
 
 #define BB_LEN_MASK    (0x00000000000001FFULL)
 #define BB_OFFSET_MASK (0x7FFFFFFFFFFFFE00ULL)
 static inline int sysfs_link_rdev(struct mddev *mddev, struct md_rdev *rdev)
 {
        char nm[20];
-       sprintf(nm, "rd%d", rdev->raid_disk);
-       return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
+       if (!test_bit(Replacement, &rdev->flags)) { /* link only non-replacement devices */
+               sprintf(nm, "rd%d", rdev->raid_disk); /* link name is "rd" + slot number */
+               return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
+       } else /* replacement: presumably its raid_disk matches the device it replaces, so no link of its own */
+               return 0; /* nothing created; reported as success */
 }
 
 static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev)
 {
        char nm[20];
-       sprintf(nm, "rd%d", rdev->raid_disk);
-       sysfs_remove_link(&mddev->kobj, nm);
+       if (!test_bit(Replacement, &rdev->flags)) { /* mirrors sysfs_link_rdev(): no link is made for a replacement */
+               sprintf(nm, "rd%d", rdev->raid_disk); /* same "rd" + slot name used when linking */
+               sysfs_remove_link(&mddev->kobj, nm);
+       }
 }
 
 /*
 
                                           */
 #define        MD_FEATURE_RESHAPE_ACTIVE       4
 #define        MD_FEATURE_BAD_BLOCKS           8 /* badblock list is not empty */
-
-#define        MD_FEATURE_ALL                  (1|2|4|8)
+#define        MD_FEATURE_REPLACEMENT          16 /* This device is replacing an
+                                           * active device with same 'role'.
+                                           * 'recovery_offset' is also set.
+                                           */
+#define        MD_FEATURE_ALL                  (1|2|4|8|16)
 
 #endif