diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0e410478ad27..76b82506bf92 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -45,6 +45,7 @@ #include "inode-map.h" #include "check-integrity.h" #include "rcu-string.h" +#include "dev-replace.h" #ifdef CONFIG_X86 #include @@ -2438,7 +2439,11 @@ int open_ctree(struct super_block *sb, goto fail_tree_roots; } - btrfs_close_extra_devices(fs_devices); + /* + * keep the device that is marked to be the target device for the + * dev_replace procedure + */ + btrfs_close_extra_devices(fs_info, fs_devices, 0); if (!fs_devices->latest_bdev) { printk(KERN_CRIT "btrfs: failed to read devices on %s\n", @@ -2510,6 +2515,14 @@ int open_ctree(struct super_block *sb, goto fail_block_groups; } + ret = btrfs_init_dev_replace(fs_info); + if (ret) { + pr_err("btrfs: failed to init dev_replace: %d\n", ret); + goto fail_block_groups; + } + + btrfs_close_extra_devices(fs_info, fs_devices, 1); + ret = btrfs_init_space_info(fs_info); if (ret) { printk(KERN_ERR "Failed to initial space info: %d\n", ret); @@ -2658,6 +2671,13 @@ int open_ctree(struct super_block *sb, return ret; } + ret = btrfs_resume_dev_replace_async(fs_info); + if (ret) { + pr_warn("btrfs: failed to resume dev_replace\n"); + close_ctree(tree_root); + return ret; + } + return 0; fail_qgroup: @@ -3300,6 +3320,8 @@ int close_ctree(struct btrfs_root *root) /* pause restriper - we want to resume on mount */ btrfs_pause_balance(fs_info); + btrfs_dev_replace_suspend_for_unmount(fs_info); + btrfs_scrub_cancel(fs_info); /* wait for any defraggers to finish */ diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 9f363e17ec74..c705a48e676b 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -27,6 +27,7 @@ #include "volumes.h" #include "disk-io.h" #include "transaction.h" +#include "dev-replace.h" #undef DEBUG @@ -331,6 +332,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, int nzones = 0; int i; unsigned long index = logical >> PAGE_CACHE_SHIFT; + int dev_replace_is_ongoing; spin_lock(&fs_info->reada_lock); re = radix_tree_lookup(&fs_info->reada_tree, index); @@ -392,6 +394,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, } /* insert extent in reada_tree + all per-device trees, all or nothing */ + btrfs_dev_replace_lock(&fs_info->dev_replace); spin_lock(&fs_info->reada_lock); ret = radix_tree_insert(&fs_info->reada_tree, index, re); if (ret == -EEXIST) { @@ -399,13 +402,17 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, BUG_ON(!re_exist); re_exist->refcnt++; spin_unlock(&fs_info->reada_lock); + btrfs_dev_replace_unlock(&fs_info->dev_replace); goto error; } if (ret) { spin_unlock(&fs_info->reada_lock); + btrfs_dev_replace_unlock(&fs_info->dev_replace); goto error; } prev_dev = NULL; + dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing( + &fs_info->dev_replace); for (i = 0; i < nzones; ++i) { dev = bbio->stripes[i].dev; if (dev == prev_dev) { @@ -422,6 +429,14 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, /* cannot read ahead on missing device */ continue; } + if (dev_replace_is_ongoing && + dev == fs_info->dev_replace.tgtdev) { + /* + * as this device is selected for reading only as + * a last resort, skip it for read ahead. + */ + continue; + } prev_dev = dev; ret = radix_tree_insert(&dev->reada_extents, index, re); if (ret) { @@ -434,10 +449,12 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, BUG_ON(fs_info == NULL); radix_tree_delete(&fs_info->reada_tree, index); spin_unlock(&fs_info->reada_lock); + btrfs_dev_replace_unlock(&fs_info->dev_replace); goto error; } } spin_unlock(&fs_info->reada_lock); + btrfs_dev_replace_unlock(&fs_info->dev_replace); kfree(bbio); return re; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 61157a26cf2a..30cbf6921c0b 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2843,12 +2843,17 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, return -EIO; } - if (dev->scrub_device) { + btrfs_dev_replace_lock(&fs_info->dev_replace); + if (dev->scrub_device || + (!is_dev_replace && + btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) { + btrfs_dev_replace_unlock(&fs_info->dev_replace); mutex_unlock(&fs_info->scrub_lock); mutex_unlock(&fs_info->fs_devices->device_list_mutex); scrub_workers_put(fs_info); return -EINPROGRESS; } + btrfs_dev_replace_unlock(&fs_info->dev_replace); sctx = scrub_setup_ctx(dev, is_dev_replace); if (IS_ERR(sctx)) { mutex_unlock(&fs_info->scrub_lock); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index ad4380684b9b..def4f24b58df 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -55,6 +55,7 @@ #include "export.h" #include "compression.h" #include "rcu-string.h" +#include "dev-replace.h" #define CREATE_TRACE_POINTS #include @@ -1225,8 +1226,15 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) return 0; if (*flags & MS_RDONLY) { + /* + * this also happens on 'umount -rf' or on shutdown, when + * the filesystem is busy. + */ sb->s_flags |= MS_RDONLY; + btrfs_dev_replace_suspend_for_unmount(fs_info); + btrfs_scrub_cancel(fs_info); + ret = btrfs_commit_super(root); if (ret) goto restore; @@ -1263,6 +1271,11 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) if (ret) goto restore; + ret = btrfs_resume_dev_replace_async(fs_info); + if (ret) { + pr_warn("btrfs: failed to resume dev_replace\n"); + goto restore; + } sb->s_flags &= ~MS_RDONLY; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 7b297354e738..bcc6b65be3b0 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -30,6 +30,7 @@ #include "tree-log.h" #include "inode-map.h" #include "volumes.h" +#include "dev-replace.h" #define BTRFS_ROOT_TRANS_TAG 0 @@ -845,7 +846,9 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, return ret; ret = btrfs_run_dev_stats(trans, root->fs_info); - BUG_ON(ret); + WARN_ON(ret); + ret = btrfs_run_dev_replace(trans, root->fs_info); + WARN_ON(ret); ret = btrfs_run_qgroups(trans, root->fs_info); BUG_ON(ret); @@ -868,6 +871,8 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, switch_commit_root(fs_info->extent_root); up_write(&fs_info->extent_commit_sem); + btrfs_after_dev_replace_commit(fs_info); + return 0; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 5777e6a9aab1..a4e0963bf457 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -36,6 +36,7 @@ #include "check-integrity.h" #include "rcu-string.h" #include "math.h" +#include "dev-replace.h" static int init_first_rw_device(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -505,7 +506,8 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) return ERR_PTR(-ENOMEM); } -void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) +void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info, + struct btrfs_fs_devices *fs_devices, int step) { struct btrfs_device *device, *next; @@ -528,6 +530,21 @@ void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) continue; } + if (device->devid == BTRFS_DEV_REPLACE_DEVID) { + /* + * In the first step, keep the device which has + * the correct fsid and the devid that is used + * for the dev_replace procedure. + * In the second step, the dev_replace state is + * read from the device tree and it is known + * whether the procedure is really active or + * not, which means whether this device is + * used or whether it should be removed. + */ + if (step == 0 || device->is_tgtdev_for_dev_replace) { + continue; + } + } if (device->bdev) { blkdev_put(device->bdev, device->mode); device->bdev = NULL; @@ -536,7 +553,8 @@ void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) if (device->writeable) { list_del_init(&device->dev_alloc_list); device->writeable = 0; - fs_devices->rw_devices--; + if (!device->is_tgtdev_for_dev_replace) + fs_devices->rw_devices--; } list_del_init(&device->dev_list); fs_devices->num_devices--; @@ -594,7 +612,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) if (device->bdev) fs_devices->open_devices--; - if (device->writeable) { + if (device->writeable && !device->is_tgtdev_for_dev_replace) { list_del_init(&device->dev_alloc_list); fs_devices->rw_devices--; } @@ -718,7 +736,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, fs_devices->rotating = 1; fs_devices->open_devices++; - if (device->writeable) { + if (device->writeable && !device->is_tgtdev_for_dev_replace) { fs_devices->rw_devices++; list_add(&device->dev_alloc_list, &fs_devices->alloc_list); @@ -1350,16 +1368,22 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) root->fs_info->avail_system_alloc_bits | root->fs_info->avail_metadata_alloc_bits; - if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && - root->fs_info->fs_devices->num_devices <= 4) { + num_devices = root->fs_info->fs_devices->num_devices; + btrfs_dev_replace_lock(&root->fs_info->dev_replace); + if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) { + WARN_ON(num_devices < 1); + num_devices--; + } + btrfs_dev_replace_unlock(&root->fs_info->dev_replace); + + if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) { printk(KERN_ERR "btrfs: unable to go below four devices " "on raid10\n"); ret = -EINVAL; goto out; } - if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && - root->fs_info->fs_devices->num_devices <= 2) { + if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) { printk(KERN_ERR "btrfs: unable to go below two " "devices on raid1\n"); ret = -EINVAL; @@ -2935,6 +2959,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl, u64 allowed; int mixed = 0; int ret; + u64 num_devices; if (btrfs_fs_closing(fs_info) || atomic_read(&fs_info->balance_pause_req) || @@ -2963,10 +2988,17 @@ int btrfs_balance(struct btrfs_balance_control *bctl, } } + num_devices = fs_info->fs_devices->num_devices; + btrfs_dev_replace_lock(&fs_info->dev_replace); + if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { + BUG_ON(num_devices < 1); + num_devices--; + } + btrfs_dev_replace_unlock(&fs_info->dev_replace); allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; - if (fs_info->fs_devices->num_devices == 1) + if (num_devices == 1) allowed |= BTRFS_BLOCK_GROUP_DUP; - else if (fs_info->fs_devices->num_devices < 4) + else if (num_devices < 4) allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); else allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | @@ -3591,6 +3623,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, devices_info[ndevs].total_avail = total_avail; devices_info[ndevs].dev = device; ++ndevs; + WARN_ON(ndevs > fs_devices->rw_devices); } /* @@ -4773,6 +4806,7 @@ static void fill_device_from_item(struct extent_buffer *leaf, device->io_align = btrfs_device_io_align(leaf, dev_item); device->io_width = btrfs_device_io_width(leaf, dev_item); device->sector_size = btrfs_device_sector_size(leaf, dev_item); + WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID); device->is_tgtdev_for_dev_replace = 0; ptr = (unsigned long)btrfs_device_uuid(dev_item); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 58d79375deaf..37d0157167b0 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -268,7 +268,8 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, struct btrfs_fs_devices **fs_devices_ret); int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); -void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices); +void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info, + struct btrfs_fs_devices *fs_devices, int step); int btrfs_find_device_missing_or_by_path(struct btrfs_root *root, char *device_path, struct btrfs_device **device);