#include "inode-map.h"
 #include "check-integrity.h"
 #include "rcu-string.h"
+#include "dev-replace.h"
 
 #ifdef CONFIG_X86
 #include <asm/cpufeature.h>
                goto fail_tree_roots;
        }
 
-       btrfs_close_extra_devices(fs_devices);
+       /*
+        * keep the device that is marked to be the target device for the
+        * dev_replace procedure
+        */
+       btrfs_close_extra_devices(fs_info, fs_devices, 0);
 
        if (!fs_devices->latest_bdev) {
                printk(KERN_CRIT "btrfs: failed to read devices on %s\n",
                goto fail_block_groups;
        }
 
+       ret = btrfs_init_dev_replace(fs_info);
+       if (ret) {
+               pr_err("btrfs: failed to init dev_replace: %d\n", ret);
+               goto fail_block_groups;
+       }
+
+       btrfs_close_extra_devices(fs_info, fs_devices, 1);
+
        ret = btrfs_init_space_info(fs_info);
        if (ret) {
                printk(KERN_ERR "Failed to initial space info: %d\n", ret);
                return ret;
        }
 
+       ret = btrfs_resume_dev_replace_async(fs_info);
+       if (ret) {
+               pr_warn("btrfs: failed to resume dev_replace\n");
+               close_ctree(tree_root);
+               return ret;
+       }
+
        return 0;
 
 fail_qgroup:
        /* pause restriper - we want to resume on mount */
        btrfs_pause_balance(fs_info);
 
+       btrfs_dev_replace_suspend_for_unmount(fs_info);
+
        btrfs_scrub_cancel(fs_info);
 
        /* wait for any defraggers to finish */
 
 #include "volumes.h"
 #include "disk-io.h"
 #include "transaction.h"
+#include "dev-replace.h"
 
 #undef DEBUG
 
        int nzones = 0;
        int i;
        unsigned long index = logical >> PAGE_CACHE_SHIFT;
+       int dev_replace_is_ongoing;
 
        spin_lock(&fs_info->reada_lock);
        re = radix_tree_lookup(&fs_info->reada_tree, index);
        }
 
        /* insert extent in reada_tree + all per-device trees, all or nothing */
+       btrfs_dev_replace_lock(&fs_info->dev_replace);
        spin_lock(&fs_info->reada_lock);
        ret = radix_tree_insert(&fs_info->reada_tree, index, re);
        if (ret == -EEXIST) {
                BUG_ON(!re_exist);
                re_exist->refcnt++;
                spin_unlock(&fs_info->reada_lock);
+               btrfs_dev_replace_unlock(&fs_info->dev_replace);
                goto error;
        }
        if (ret) {
                spin_unlock(&fs_info->reada_lock);
+               btrfs_dev_replace_unlock(&fs_info->dev_replace);
                goto error;
        }
        prev_dev = NULL;
+       dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(
+                       &fs_info->dev_replace);
        for (i = 0; i < nzones; ++i) {
                dev = bbio->stripes[i].dev;
                if (dev == prev_dev) {
                        /* cannot read ahead on missing device */
                        continue;
                }
+               if (dev_replace_is_ongoing &&
+                   dev == fs_info->dev_replace.tgtdev) {
+                       /*
+                        * as this device is selected for reading only as
+                        * a last resort, skip it for read ahead.
+                        */
+                       continue;
+               }
                prev_dev = dev;
                ret = radix_tree_insert(&dev->reada_extents, index, re);
                if (ret) {
                        BUG_ON(fs_info == NULL);
                        radix_tree_delete(&fs_info->reada_tree, index);
                        spin_unlock(&fs_info->reada_lock);
+                       btrfs_dev_replace_unlock(&fs_info->dev_replace);
                        goto error;
                }
        }
        spin_unlock(&fs_info->reada_lock);
+       btrfs_dev_replace_unlock(&fs_info->dev_replace);
 
        kfree(bbio);
        return re;
 
                return -EIO;
        }
 
-       if (dev->scrub_device) {
+       btrfs_dev_replace_lock(&fs_info->dev_replace);
+       if (dev->scrub_device ||
+           (!is_dev_replace &&
+            btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
+               btrfs_dev_replace_unlock(&fs_info->dev_replace);
                mutex_unlock(&fs_info->scrub_lock);
                mutex_unlock(&fs_info->fs_devices->device_list_mutex);
                scrub_workers_put(fs_info);
                return -EINPROGRESS;
        }
+       btrfs_dev_replace_unlock(&fs_info->dev_replace);
        sctx = scrub_setup_ctx(dev, is_dev_replace);
        if (IS_ERR(sctx)) {
                mutex_unlock(&fs_info->scrub_lock);
 
 #include "export.h"
 #include "compression.h"
 #include "rcu-string.h"
+#include "dev-replace.h"
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/btrfs.h>
                return 0;
 
        if (*flags & MS_RDONLY) {
+               /*
+                * this also happens on 'umount -rf' or on shutdown, when
+                * the filesystem is busy.
+                */
                sb->s_flags |= MS_RDONLY;
 
+               btrfs_dev_replace_suspend_for_unmount(fs_info);
+               btrfs_scrub_cancel(fs_info);
+
                ret = btrfs_commit_super(root);
                if (ret)
                        goto restore;
                if (ret)
                        goto restore;
 
+               ret = btrfs_resume_dev_replace_async(fs_info);
+               if (ret) {
+                       pr_warn("btrfs: failed to resume dev_replace\n");
+                       goto restore;
+               }
                sb->s_flags &= ~MS_RDONLY;
        }
 
 
 #include "tree-log.h"
 #include "inode-map.h"
 #include "volumes.h"
+#include "dev-replace.h"
 
 #define BTRFS_ROOT_TRANS_TAG 0
 
                return ret;
 
        ret = btrfs_run_dev_stats(trans, root->fs_info);
-       BUG_ON(ret);
+       WARN_ON(ret);
+       ret = btrfs_run_dev_replace(trans, root->fs_info);
+       WARN_ON(ret);
 
        ret = btrfs_run_qgroups(trans, root->fs_info);
        BUG_ON(ret);
        switch_commit_root(fs_info->extent_root);
        up_write(&fs_info->extent_commit_sem);
 
+       btrfs_after_dev_replace_commit(fs_info);
+
        return 0;
 }
 
 
 #include "check-integrity.h"
 #include "rcu-string.h"
 #include "math.h"
+#include "dev-replace.h"
 
 static int init_first_rw_device(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root,
        return ERR_PTR(-ENOMEM);
 }
 
-void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
+void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info,
+                              struct btrfs_fs_devices *fs_devices, int step)
 {
        struct btrfs_device *device, *next;
 
                        continue;
                }
 
+               if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
+                       /*
+                        * In the first step, keep the device which has
+                        * the correct fsid and the devid that is used
+                        * for the dev_replace procedure.
+                        * In the second step, the dev_replace state has
+                        * been read from the device tree, so it is known
+                        * whether the procedure is really active, i.e.
+                        * whether this device is in use and must be kept
+                        * or whether it should be removed.
+                        */
+                       if (step == 0 || device->is_tgtdev_for_dev_replace) {
+                               continue;
+                       }
+               }
                if (device->bdev) {
                        blkdev_put(device->bdev, device->mode);
                        device->bdev = NULL;
                if (device->writeable) {
                        list_del_init(&device->dev_alloc_list);
                        device->writeable = 0;
-                       fs_devices->rw_devices--;
+                       if (!device->is_tgtdev_for_dev_replace)
+                               fs_devices->rw_devices--;
                }
                list_del_init(&device->dev_list);
                fs_devices->num_devices--;
                if (device->bdev)
                        fs_devices->open_devices--;
 
-               if (device->writeable) {
+               if (device->writeable && !device->is_tgtdev_for_dev_replace) {
                        list_del_init(&device->dev_alloc_list);
                        fs_devices->rw_devices--;
                }
                        fs_devices->rotating = 1;
 
                fs_devices->open_devices++;
-               if (device->writeable) {
+               if (device->writeable && !device->is_tgtdev_for_dev_replace) {
                        fs_devices->rw_devices++;
                        list_add(&device->dev_alloc_list,
                                 &fs_devices->alloc_list);
                root->fs_info->avail_system_alloc_bits |
                root->fs_info->avail_metadata_alloc_bits;
 
-       if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) &&
-           root->fs_info->fs_devices->num_devices <= 4) {
+       num_devices = root->fs_info->fs_devices->num_devices;
+       btrfs_dev_replace_lock(&root->fs_info->dev_replace);
+       if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) {
+               WARN_ON(num_devices < 1);
+               num_devices--;
+       }
+       btrfs_dev_replace_unlock(&root->fs_info->dev_replace);
+
+       if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) {
                printk(KERN_ERR "btrfs: unable to go below four devices "
                       "on raid10\n");
                ret = -EINVAL;
                goto out;
        }
 
-       if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) &&
-           root->fs_info->fs_devices->num_devices <= 2) {
+       if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) {
                printk(KERN_ERR "btrfs: unable to go below two "
                       "devices on raid1\n");
                ret = -EINVAL;
        u64 allowed;
        int mixed = 0;
        int ret;
+       u64 num_devices;
 
        if (btrfs_fs_closing(fs_info) ||
            atomic_read(&fs_info->balance_pause_req) ||
                }
        }
 
+       num_devices = fs_info->fs_devices->num_devices;
+       btrfs_dev_replace_lock(&fs_info->dev_replace);
+       if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
+               BUG_ON(num_devices < 1);
+               num_devices--;
+       }
+       btrfs_dev_replace_unlock(&fs_info->dev_replace);
        allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
-       if (fs_info->fs_devices->num_devices == 1)
+       if (num_devices == 1)
                allowed |= BTRFS_BLOCK_GROUP_DUP;
-       else if (fs_info->fs_devices->num_devices < 4)
+       else if (num_devices < 4)
                allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
        else
                allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
                devices_info[ndevs].total_avail = total_avail;
                devices_info[ndevs].dev = device;
                ++ndevs;
+               WARN_ON(ndevs > fs_devices->rw_devices);
        }
 
        /*
        device->io_align = btrfs_device_io_align(leaf, dev_item);
        device->io_width = btrfs_device_io_width(leaf, dev_item);
        device->sector_size = btrfs_device_sector_size(leaf, dev_item);
+       WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
        device->is_tgtdev_for_dev_replace = 0;
 
        ptr = (unsigned long)btrfs_device_uuid(dev_item);
 
 int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
                          struct btrfs_fs_devices **fs_devices_ret);
 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
-void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices);
+void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info,
+                              struct btrfs_fs_devices *fs_devices, int step);
 int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
                                         char *device_path,
                                         struct btrfs_device **device);