int ret;
 
        if (!root->reloc_root)
-               return 0;
+               goto out;
 
        reloc_root = root->reloc_root;
        root_item = &reloc_root->root_item;
        ret = btrfs_update_root(trans, root->fs_info->tree_root,
                                &reloc_root->root_key, root_item);
        BUG_ON(ret);
+
+out:
        return 0;
 }
 
        u64 num_bytes = 0;
        int ret;
 
-       spin_lock(&root->fs_info->trans_lock);
+       mutex_lock(&root->fs_info->reloc_mutex);
        rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
        rc->merging_rsv_size += rc->nodes_relocated * 2;
-       spin_unlock(&root->fs_info->trans_lock);
+       mutex_unlock(&root->fs_info->reloc_mutex);
+
 again:
        if (!err) {
                num_bytes = rc->merging_rsv_size;
        int ret;
 again:
        root = rc->extent_root;
-       spin_lock(&root->fs_info->trans_lock);
+
+       /*
+        * this serializes us with btrfs_record_root_in_transaction,
+        * we have to make sure nobody is in the middle of
+        * adding their roots to the list while we are
+        * doing this splice
+        */
+       mutex_lock(&root->fs_info->reloc_mutex);
        list_splice_init(&rc->reloc_roots, &reloc_roots);
-       spin_unlock(&root->fs_info->trans_lock);
+       mutex_unlock(&root->fs_info->reloc_mutex);
 
        while (!list_empty(&reloc_roots)) {
                found = 1;
 static void set_reloc_control(struct reloc_control *rc)
 {
        struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
-       spin_lock(&fs_info->trans_lock);
+
+       mutex_lock(&fs_info->reloc_mutex);
        fs_info->reloc_ctl = rc;
-       spin_unlock(&fs_info->trans_lock);
+       mutex_unlock(&fs_info->reloc_mutex);
 }
 
 static void unset_reloc_control(struct reloc_control *rc)
 {
        struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
-       spin_lock(&fs_info->trans_lock);
+
+       mutex_lock(&fs_info->reloc_mutex);
        fs_info->reloc_ctl = NULL;
-       spin_unlock(&fs_info->trans_lock);
+       mutex_unlock(&fs_info->reloc_mutex);
 }
 
 static int check_extent_flags(u64 flags)
 
  * to make sure the old root from before we joined the transaction is deleted
  * when the transaction commits
  */
-int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
+static int record_root_in_trans(struct btrfs_trans_handle *trans,
                               struct btrfs_root *root)
 {
        if (root->ref_cows && root->last_trans < trans->transid) {
                WARN_ON(root == root->fs_info->extent_root);
                WARN_ON(root->commit_root != root->node);
 
+               /*
+                * see below for in_trans_setup usage rules
+                * we have the reloc mutex held now, so there
+                * is only one writer in this function
+                */
+               root->in_trans_setup = 1;
+
+               /* make sure readers find in_trans_setup before
+                * they find our root->last_trans update
+                */
+               smp_wmb();
+
                spin_lock(&root->fs_info->fs_roots_radix_lock);
                if (root->last_trans == trans->transid) {
                        spin_unlock(&root->fs_info->fs_roots_radix_lock);
                        return 0;
                }
-               root->last_trans = trans->transid;
                radix_tree_tag_set(&root->fs_info->fs_roots_radix,
                           (unsigned long)root->root_key.objectid,
                           BTRFS_ROOT_TRANS_TAG);
                spin_unlock(&root->fs_info->fs_roots_radix_lock);
+               root->last_trans = trans->transid;
+
+               /* this is pretty tricky.  We don't want to
+                * take the relocation lock in btrfs_record_root_in_trans
+                * unless we're really doing the first setup for this root in
+                * this transaction.
+                *
+                * Normally we'd use root->last_trans as a flag to decide
+                * if we want to take the expensive mutex.
+                *
+                * But, we have to set root->last_trans before we
+                * init the relocation root, otherwise, we trip over warnings
+                * in ctree.c.  The solution used here is to flag ourselves
+                * with root->in_trans_setup.  When this is 1, we're still
+                * fixing up the reloc trees and everyone must wait.
+                *
+                * When this is zero, they can trust root->last_trans and fly
+                * through btrfs_record_root_in_trans without having to take the
+                * lock.  smp_wmb() makes sure that all the writes above are
+                * done before we pop in the zero below
+                */
                btrfs_init_reloc_root(trans, root);
+               smp_wmb();
+               root->in_trans_setup = 0;
        }
        return 0;
 }
 
+
+int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
+                              struct btrfs_root *root)
+{
+       if (!root->ref_cows)
+               return 0;
+
+       /*
+        * see record_root_in_trans for comments about in_trans_setup usage
+        * and barriers
+        */
+       smp_rmb();
+       if (root->last_trans == trans->transid &&
+           !root->in_trans_setup)
+               return 0;
+
+       mutex_lock(&root->fs_info->reloc_mutex);
+       record_root_in_trans(trans, root);
+       mutex_unlock(&root->fs_info->reloc_mutex);
+
+       return 0;
+}
+
 /* wait for commit against the current transaction to become unblocked
  * when this is done, it is safe to start a new transaction, but the current
  * transaction might not be fully on disk.
        parent = dget_parent(dentry);
        parent_inode = parent->d_inode;
        parent_root = BTRFS_I(parent_inode)->root;
-       btrfs_record_root_in_trans(trans, parent_root);
+       record_root_in_trans(trans, parent_root);
 
        /*
         * insert the directory item
        ret = btrfs_update_inode(trans, parent_root, parent_inode);
        BUG_ON(ret);
 
-       btrfs_record_root_in_trans(trans, root);
+       record_root_in_trans(trans, root);
        btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
        memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
        btrfs_check_and_init_root_item(new_root_item);
        } while (atomic_read(&cur_trans->num_writers) > 1 ||
                 (should_grow && cur_trans->num_joined != joined));
 
+       /*
+        * the reloc mutex makes sure that we stop
+        * the balancing code from coming in and moving
+        * extents around in the middle of the commit
+        */
+       mutex_lock(&root->fs_info->reloc_mutex);
+
        ret = create_pending_snapshots(trans, root->fs_info);
        BUG_ON(ret);
 
        root->fs_info->running_transaction = NULL;
        root->fs_info->trans_no_join = 0;
        spin_unlock(&root->fs_info->trans_lock);
+       mutex_unlock(&root->fs_info->reloc_mutex);
 
        wake_up(&root->fs_info->transaction_wait);