return path;
 }
 
+/*
+ * set all locked nodes in the path to blocking locks.  This should
+ * be done before scheduling.
+ */
+noinline void btrfs_set_path_blocking(struct btrfs_path *p)
+{
+       int i;
+       for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
+               if (p->nodes[i] && p->locks[i])
+                       btrfs_set_lock_blocking(p->nodes[i]);
+       }
+}
+
+/*
+ * reset all the locked nodes in the path to spinning locks.
+ */
+noinline void btrfs_clear_path_blocking(struct btrfs_path *p)
+{
+       int i;
+       for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
+               if (p->nodes[i] && p->locks[i])
+                       btrfs_clear_lock_blocking(p->nodes[i]);
+       }
+}
+
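A minimal usage sketch (hypothetical helper; the real call sites are in the btrfs_search_slot hunks below): convert the whole path to blocking before any call that can schedule, then convert back before resuming the lock-step search.

        /*
         * hypothetical illustration, not part of this patch: bracket
         * schedule-prone work so waiters sleep on the wait queue
         * instead of spinning on held locks
         */
        static void example_cow_step(struct btrfs_trans_handle *trans,
                                     struct btrfs_root *root,
                                     struct btrfs_path *p, int level)
        {
                int ret;

                btrfs_set_path_blocking(p);     /* safe to schedule from here */
                ret = btrfs_cow_block(trans, root, p->nodes[level],
                                      p->nodes[level + 1], p->slots[level + 1],
                                      &p->nodes[level], 0);
                BUG_ON(ret);
                btrfs_clear_path_blocking(p);   /* spinning locks re-taken */
        }
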
 /* this also releases the path */
 void btrfs_free_path(struct btrfs_path *p)
 {
        if (IS_ERR(cow))
                return PTR_ERR(cow);
 
+       /* cow is set to blocking by btrfs_init_new_buffer */
+
        copy_extent_buffer(cow, buf, 0, 0, cow->len);
        btrfs_set_header_bytenr(cow, cow->start);
        btrfs_set_header_generation(cow, trans->transid);
        }
 
        search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);
+
+       if (parent)
+               btrfs_set_lock_blocking(parent);
+       btrfs_set_lock_blocking(buf);
+
        ret = __btrfs_cow_block(trans, root, buf, parent,
                                 parent_slot, cow_ret, search_start, 0,
                                 prealloc_dest);
        if (parent_nritems == 1)
                return 0;
 
+       btrfs_set_lock_blocking(parent);
+
        for (i = start_slot; i < end_slot; i++) {
                int close = 1;
 
                        search_start = last_block;
 
                btrfs_tree_lock(cur);
+               btrfs_set_lock_blocking(cur);
                err = __btrfs_cow_block(trans, root, cur, parent, i,
                                        &cur, search_start,
                                        min(16 * blocksize,
                return 0;
 
        mid = path->nodes[level];
+
        WARN_ON(!path->locks[level]);
        WARN_ON(btrfs_header_generation(mid) != trans->transid);
 
                /* promote the child to a root */
                child = read_node_slot(root, mid, 0);
                btrfs_tree_lock(child);
+               btrfs_set_lock_blocking(child);
                BUG_ON(!child);
                ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0);
                BUG_ON(ret);
 
                add_root_to_dirty_list(root);
                btrfs_tree_unlock(child);
+
                path->locks[level] = 0;
                path->nodes[level] = NULL;
                clean_tree_block(trans, root, mid);
        left = read_node_slot(root, parent, pslot - 1);
        if (left) {
                btrfs_tree_lock(left);
+               btrfs_set_lock_blocking(left);
                wret = btrfs_cow_block(trans, root, left,
                                       parent, pslot - 1, &left, 0);
                if (wret) {
        right = read_node_slot(root, parent, pslot + 1);
        if (right) {
                btrfs_tree_lock(right);
+               btrfs_set_lock_blocking(right);
                wret = btrfs_cow_block(trans, root, right,
                                       parent, pslot + 1, &right, 0);
                if (wret) {
                u32 left_nr;
 
                btrfs_tree_lock(left);
+               btrfs_set_lock_blocking(left);
+
                left_nr = btrfs_header_nritems(left);
                if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
                        wret = 1;
         */
        if (right) {
                u32 right_nr;
+
                btrfs_tree_lock(right);
+               btrfs_set_lock_blocking(right);
+
                right_nr = btrfs_header_nritems(right);
                if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
                        wret = 1;
        }
 }
 
+/*
+ * returns -EAGAIN if it had to drop the path, or zero if everything was in
+ * cache
+ */
+static noinline int reada_for_balance(struct btrfs_root *root,
+                                     struct btrfs_path *path, int level)
+{
+       int slot;
+       int nritems;
+       struct extent_buffer *parent;
+       struct extent_buffer *eb;
+       u64 gen;
+       u64 block1 = 0;
+       u64 block2 = 0;
+       int ret = 0;
+       int blocksize;
+
+       parent = path->nodes[level + 1];
+       if (!parent)
+               return 0;
+
+       nritems = btrfs_header_nritems(parent);
+       slot = path->slots[level + 1];
+       blocksize = btrfs_level_size(root, level);
+
+       if (slot > 0) {
+               block1 = btrfs_node_blockptr(parent, slot - 1);
+               gen = btrfs_node_ptr_generation(parent, slot - 1);
+               eb = btrfs_find_tree_block(root, block1, blocksize);
+               if (eb && btrfs_buffer_uptodate(eb, gen))
+                       block1 = 0;
+               free_extent_buffer(eb);
+       }
+       if (slot + 1 < nritems) {
+               block2 = btrfs_node_blockptr(parent, slot + 1);
+               gen = btrfs_node_ptr_generation(parent, slot + 1);
+               eb = btrfs_find_tree_block(root, block2, blocksize);
+               if (eb && btrfs_buffer_uptodate(eb, gen))
+                       block2 = 0;
+               free_extent_buffer(eb);
+       }
+       if (block1 || block2) {
+               ret = -EAGAIN;
+               btrfs_release_path(root, path);
+               if (block1)
+                       readahead_tree_block(root, block1, blocksize, 0);
+               if (block2)
+                       readahead_tree_block(root, block2, blocksize, 0);
+
+               if (block1) {
+                       eb = read_tree_block(root, block1, blocksize, 0);
+                       free_extent_buffer(eb);
+               }
+               if (block2) {
+                       eb = read_tree_block(root, block2, blocksize, 0);
+                       free_extent_buffer(eb);
+               }
+       }
+       return ret;
+}
+
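Because reada_for_balance releases every lock in the path before issuing reads, callers must treat -EAGAIN as "restart the search from the top". The btrfs_search_slot hunks below do exactly that:

        sret = reada_for_balance(root, p, level);
        if (sret)
                goto again;     /* path was dropped, restart from the root */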
+
 /*
  * when we walk down the tree, it is usually safe to unlock the higher layers
  * in the tree.  The exceptions are when our path goes through slot 0, because
        }
 }
 
+/*
+ * This releases any locks held in the path starting at level and
+ * going all the way up to the root.
+ *
+ * btrfs_search_slot will keep the lock held on higher nodes in a few
+ * corner cases, such as COW of the block at slot zero in the node.  This
+ * ignores those rules, and it should only be called when there are no
+ * more updates to be done higher up in the tree.
+ */
+noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
+{
+       int i;
+
+       if (path->keep_locks || path->lowest_level)
+               return;
+
+       for (i = level; i < BTRFS_MAX_LEVEL; i++) {
+               if (!path->nodes[i])
+                       break;
+               if (!path->locks[i])
+                       break;
+               btrfs_tree_unlock(path->nodes[i]);
+               path->locks[i] = 0;
+       }
+}
+
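A typical use shows up in the btrfs_update_inode hunk near the end of this patch: once the item has been found and only the leaf will be modified, everything above the leaf can be unlocked early.

        btrfs_unlock_up_safe(path, 1);  /* drop locks at level 1 and above */
        leaf = path->nodes[0];          /* still locked, safe to modify */
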
 /*
  * look for key in the tree.  path is filled in with nodes along the way
  * if key is found, we return zero and you can find the item in the leaf
                         */
                        if (prealloc_block.objectid &&
                            prealloc_block.offset != b->len) {
+                               btrfs_set_path_blocking(p);
                                btrfs_free_reserved_extent(root,
                                           prealloc_block.objectid,
                                           prealloc_block.offset);
                                goto again;
                        }
 
+                       btrfs_set_path_blocking(p);
+
                        wret = btrfs_cow_block(trans, root, b,
                                               p->nodes[level + 1],
                                               p->slots[level + 1],
                if (!p->skip_locking)
                        p->locks[level] = 1;
 
+               btrfs_clear_path_blocking(p);
+
+               /*
+                * we have a lock on b and as long as we aren't changing
+                * the tree, there is no way for the items in b to change.
+                * It is safe to drop the lock on our parent before we
+                * go through the expensive btree search on b.
+                *
+                * If cow is true, then we might be changing slot zero,
+                * which may require changing the parent.  So, we can't
+                * drop the lock until after we know which slot we're
+                * operating on.
+                */
+               if (!cow)
+                       btrfs_unlock_up_safe(p, level + 1);
+
                ret = check_block(root, p, level);
                if (ret) {
                        ret = -1;
                }
 
                ret = bin_search(b, key, level, &slot);
+
                if (level != 0) {
                        if (ret && slot > 0)
                                slot -= 1;
                        if ((p->search_for_split || ins_len > 0) &&
                            btrfs_header_nritems(b) >=
                            BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
-                               int sret = split_node(trans, root, p, level);
+                               int sret;
+
+                               sret = reada_for_balance(root, p, level);
+                               if (sret)
+                                       goto again;
+
+                               btrfs_set_path_blocking(p);
+                               sret = split_node(trans, root, p, level);
+                               btrfs_clear_path_blocking(p);
+
                                BUG_ON(sret > 0);
                                if (sret) {
                                        ret = sret;
                                b = p->nodes[level];
                                slot = p->slots[level];
                        } else if (ins_len < 0) {
-                               int sret = balance_level(trans, root, p,
-                                                        level);
+                               int sret;
+
+                               sret = reada_for_balance(root, p, level);
+                               if (sret)
+                                       goto again;
+
+                               btrfs_set_path_blocking(p);
+                               sret = balance_level(trans, root, p, level);
+                               btrfs_clear_path_blocking(p);
+
                                if (sret) {
                                        ret = sret;
                                        goto done;
                                 * of the btree by dropping locks before
                                 * we read.
                                 */
-                               if (level > 1) {
+                               if (level > 0) {
                                        btrfs_release_path(NULL, p);
                                        if (tmp)
                                                free_extent_buffer(tmp);
                                                free_extent_buffer(tmp);
                                        goto again;
                                } else {
+                                       btrfs_set_path_blocking(p);
                                        if (tmp)
                                                free_extent_buffer(tmp);
                                        if (should_reada)
                                        b = read_node_slot(root, b, slot);
                                }
                        }
-                       if (!p->skip_locking)
-                               btrfs_tree_lock(b);
+                       if (!p->skip_locking) {
+                               int lret;
+
+                               btrfs_clear_path_blocking(p);
+                               lret = btrfs_try_spin_lock(b);
+
+                               if (!lret) {
+                                       btrfs_set_path_blocking(p);
+                                       btrfs_tree_lock(b);
+                                       btrfs_clear_path_blocking(p);
+                               }
+                       }
                } else {
                        p->slots[level] = slot;
                        if (ins_len > 0 &&
                            btrfs_leaf_free_space(root, b) < ins_len) {
-                               int sret = split_leaf(trans, root, key,
+                               int sret;
+
+                               btrfs_set_path_blocking(p);
+                               sret = split_leaf(trans, root, key,
                                                      p, ins_len, ret == 0);
+                               btrfs_clear_path_blocking(p);
+
                                BUG_ON(sret > 0);
                                if (sret) {
                                        ret = sret;
        }
        ret = 1;
 done:
+       /*
+        * we don't really know what they plan on doing with the path
+        * from here on, so for now just mark it as blocking
+        */
+       btrfs_set_path_blocking(p);
        if (prealloc_block.objectid) {
                btrfs_free_reserved_extent(root,
                           prealloc_block.objectid,
                           prealloc_block.offset);
        }
-
        return ret;
 }
 
        ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0);
        BUG_ON(ret);
 
+       btrfs_set_lock_blocking(eb);
+
        parent = eb;
        while (1) {
                level = btrfs_header_level(parent);
                        eb = read_tree_block(root, bytenr, blocksize,
                                             generation);
                        btrfs_tree_lock(eb);
+                       btrfs_set_lock_blocking(eb);
                }
 
                /*
                                eb = read_tree_block(root, bytenr, blocksize,
                                                generation);
                                btrfs_tree_lock(eb);
+                               btrfs_set_lock_blocking(eb);
                        }
 
                        ret = btrfs_cow_block(trans, root, eb, parent, slot,
 
        right = read_node_slot(root, upper, slot + 1);
        btrfs_tree_lock(right);
+       btrfs_set_lock_blocking(right);
+
        free_space = btrfs_leaf_free_space(root, right);
        if (free_space < data_size)
                goto out_unlock;
 
        left = read_node_slot(root, path->nodes[1], slot - 1);
        btrfs_tree_lock(left);
+       btrfs_set_lock_blocking(left);
+
        free_space = btrfs_leaf_free_space(root, left);
        if (free_space < data_size) {
                ret = 1;
        path->keep_locks = 0;
        BUG_ON(ret);
 
+       /*
+        * make sure any changes to the path from split_leaf leave it
+        * in a blocking state
+        */
+       btrfs_set_path_blocking(path);
+
        leaf = path->nodes[0];
        BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
 
                BUG();
        }
 out:
+       btrfs_unlock_up_safe(path, 1);
        return ret;
 }
 
                 */
                if (slot >= nritems) {
                        path->slots[level] = slot;
+                       btrfs_set_path_blocking(path);
                        sret = btrfs_find_next_key(root, path, min_key, level,
                                                  cache_only, min_trans);
                        if (sret == 0) {
                                btrfs_release_path(root, path);
                                goto again;
                        } else {
+                               btrfs_clear_path_blocking(path);
                                goto out;
                        }
                }
                        unlock_up(path, level, 1);
                        goto out;
                }
+               btrfs_set_path_blocking(path);
                cur = read_node_slot(root, cur, slot);
 
                btrfs_tree_lock(cur);
+
                path->locks[level - 1] = 1;
                path->nodes[level - 1] = cur;
                unlock_up(path, level, 1);
+               btrfs_clear_path_blocking(path);
        }
 out:
        if (ret == 0)
                memcpy(min_key, &found_key, sizeof(found_key));
+       btrfs_set_path_blocking(path);
        return ret;
 }
 
        if (ret < 0)
                return ret;
 
+       btrfs_set_path_blocking(path);
        nritems = btrfs_header_nritems(path->nodes[0]);
        /*
         * by releasing the path above we dropped all our locks.  A balance
                        free_extent_buffer(next);
                }
 
+               /* the path was set to blocking above */
                if (level == 1 && (path->locks[1] || path->skip_locking) &&
                    path->reada)
                        reada_for_search(root, path, level, slot, 0);
                if (!path->skip_locking) {
                        WARN_ON(!btrfs_tree_locked(c));
                        btrfs_tree_lock(next);
+                       btrfs_set_lock_blocking(next);
                }
                break;
        }
                        path->locks[level] = 1;
                if (!level)
                        break;
+
+               btrfs_set_path_blocking(path);
                if (level == 1 && path->locks[1] && path->reada)
                        reada_for_search(root, path, level, slot, 0);
                next = read_node_slot(root, next, 0);
                if (!path->skip_locking) {
                        WARN_ON(!btrfs_tree_locked(path->nodes[level]));
                        btrfs_tree_lock(next);
+                       btrfs_set_lock_blocking(next);
                }
        }
 done:
 
        while (1) {
                if (path->slots[0] == 0) {
+                       btrfs_set_path_blocking(path);
                        ret = btrfs_prev_leaf(root, path);
                        if (ret != 0)
                                return ret;
 
 struct btrfs_path *btrfs_alloc_path(void);
 void btrfs_free_path(struct btrfs_path *p);
 void btrfs_init_path(struct btrfs_path *p);
+void btrfs_set_path_blocking(struct btrfs_path *p);
+void btrfs_clear_path_blocking(struct btrfs_path *p);
+void btrfs_unlock_up_safe(struct btrfs_path *p, int level);
+
 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                   struct btrfs_path *path, int slot, int nr);
 int btrfs_del_leaf(struct btrfs_trans_handle *trans,
 
        ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
 
        if (ret == 0)
-               buf->flags |= EXTENT_UPTODATE;
+               set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags);
        else
                WARN_ON(1);
        return buf;
        if (btrfs_header_generation(buf) ==
            root->fs_info->running_transaction->transid) {
                WARN_ON(!btrfs_tree_locked(buf));
+
+               /* ugh, clear_extent_buffer_dirty can be expensive */
+               btrfs_set_lock_blocking(buf);
+
                clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
                                          buf);
        }
        u64 transid = btrfs_header_generation(buf);
        struct inode *btree_inode = root->fs_info->btree_inode;
 
+       btrfs_set_lock_blocking(buf);
+
        WARN_ON(!btrfs_tree_locked(buf));
        if (transid != root->fs_info->generation) {
                printk(KERN_CRIT "btrfs transid mismatch buffer %llu, "
        int ret;
        ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
        if (ret == 0)
-               buf->flags |= EXTENT_UPTODATE;
+               set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags);
        return ret;
 }
 
 
        btrfs_set_header_generation(buf, trans->transid);
        btrfs_tree_lock(buf);
        clean_tree_block(trans, root, buf);
+
+       btrfs_set_lock_blocking(buf);
        btrfs_set_buffer_uptodate(buf);
+
        if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
                set_extent_dirty(&root->dirty_log_pages, buf->start,
                         buf->start + buf->len - 1, GFP_NOFS);
                         buf->start + buf->len - 1, GFP_NOFS);
        }
        trans->blocks_used++;
+       /* this returns a buffer locked for blocking */
        return buf;
 }
 
 
                next = read_tree_block(root, bytenr, blocksize, ptr_gen);
                btrfs_tree_lock(next);
+               btrfs_set_lock_blocking(next);
 
                ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize,
                                              &refs);
 
        eb = kmem_cache_zalloc(extent_buffer_cache, mask);
        eb->start = start;
        eb->len = len;
-       mutex_init(&eb->mutex);
+       spin_lock_init(&eb->lock);
+       init_waitqueue_head(&eb->lock_wq);
+
 #if LEAK_DEBUG
        spin_lock_irqsave(&leak_lock, flags);
        list_add(&eb->leak_list, &buffers);
                unlock_page(p);
        }
        if (uptodate)
-               eb->flags |= EXTENT_UPTODATE;
-       eb->flags |= EXTENT_BUFFER_FILLED;
+               set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
 
        spin_lock(&tree->buffer_lock);
        exists = buffer_tree_insert(tree, start, &eb->rb_node);
        unsigned long num_pages;
 
        num_pages = num_extent_pages(eb->start, eb->len);
-       eb->flags &= ~EXTENT_UPTODATE;
+       clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
 
        clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
                              GFP_NOFS);
        struct page *page;
        int pg_uptodate = 1;
 
-       if (eb->flags & EXTENT_UPTODATE)
+       if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
                return 1;
 
        ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
        struct bio *bio = NULL;
        unsigned long bio_flags = 0;
 
-       if (eb->flags & EXTENT_UPTODATE)
+       if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
                return 0;
 
        if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
        }
        if (all_uptodate) {
                if (start_i == 0)
-                       eb->flags |= EXTENT_UPTODATE;
+                       set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
                goto unlock_exit;
        }
 
        }
 
        if (!ret)
-               eb->flags |= EXTENT_UPTODATE;
+               set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
        return ret;
 
 unlock_exit:
                unmap_extent_buffer(eb, eb->map_token, km);
                eb->map_token = NULL;
                save = 1;
-               WARN_ON(!mutex_is_locked(&eb->mutex));
        }
        err = map_private_extent_buffer(eb, start, min_len, token, map,
                                       map_start, map_len, km);
 
 /* flags for bio submission */
 #define EXTENT_BIO_COMPRESSED 1
 
+/* these are bit numbers for test/set bit */
+#define EXTENT_BUFFER_UPTODATE 0
+#define EXTENT_BUFFER_BLOCKING 1
+
 /*
  * page->private values.  Every page that is controlled by the extent
  * map has page->private set to one.
        unsigned long map_start;
        unsigned long map_len;
        struct page *first_page;
+       unsigned long bflags;
        atomic_t refs;
-       int flags;
        struct list_head leak_list;
        struct rb_node rb_node;
-       struct mutex mutex;
+
+       /* the spinlock is used to protect most operations */
+       spinlock_t lock;
+
+       /*
+        * when we keep the lock held while blocking, waiters go onto
+        * the wq
+        */
+       wait_queue_head_t lock_wq;
 };
 
 struct extent_map_tree;
 
 #include "tree-log.h"
 #include "ref-cache.h"
 #include "compression.h"
+#include "locking.h"
 
 struct btrfs_iget_args {
        u64 ino;
        BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
 
        alloc_group_block = btrfs_inode_block_group(leaf, inode_item);
+
        BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0,
                                                alloc_group_block, 0);
        btrfs_free_path(path);
                goto failed;
        }
 
+       btrfs_unlock_up_safe(path, 1);
        leaf = path->nodes[0];
        inode_item = btrfs_item_ptr(leaf, path->slots[0],
                                  struct btrfs_inode_item);
 
 #include "locking.h"
 
 /*
- * locks the per buffer mutex in an extent buffer.  This uses adaptive locks
- * and the spin is not tuned very extensively.  The spinning does make a big
- * difference in almost every workload, but spinning for the right amount of
- * time needs some help.
- *
- * In general, we want to spin as long as the lock holder is doing btree
- * searches, and we should give up if they are in more expensive code.
+ * btrfs_header_level() isn't free, so don't call it when lockdep isn't
+ * on
  */
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+static inline void spin_nested(struct extent_buffer *eb)
+{
+       spin_lock_nested(&eb->lock, BTRFS_MAX_LEVEL - btrfs_header_level(eb));
+}
+#else
+static inline void spin_nested(struct extent_buffer *eb)
+{
+       spin_lock(&eb->lock);
+}
+#endif
 
-int btrfs_tree_lock(struct extent_buffer *eb)
+/*
+ * Setting a lock to blocking will drop the spinlock and set the
+ * flag that forces other procs who want the lock to wait.  After
+ * this you can safely schedule with the lock held.
+ */
+void btrfs_set_lock_blocking(struct extent_buffer *eb)
 {
-       int i;
+       if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
+               set_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags);
+               spin_unlock(&eb->lock);
+       }
+       /* exit with the spin lock released and the bit set */
+}
 
-       if (mutex_trylock(&eb->mutex))
-               return 0;
+/*
+ * clearing the blocking flag will take the spinlock again.
+ * After this you can't safely schedule.
+ */
+void btrfs_clear_lock_blocking(struct extent_buffer *eb)
+{
+       if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
+               spin_nested(eb);
+               clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags);
+               smp_mb__after_clear_bit();
+       }
+       /* exit with the spin lock held */
+}
+
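A single-buffer sketch of the intended transitions (hypothetical, condensed from the call sites elsewhere in this patch):

        btrfs_tree_lock(eb);                    /* spinning lock held */
        btrfs_set_lock_blocking(eb);            /* spinlock dropped, may schedule */
        btrfs_wait_tree_block_writeback(eb);    /* sleeps for I/O */
        btrfs_clear_lock_blocking(eb);          /* spinning lock re-taken */
        btrfs_tree_unlock(eb);
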
+/*
+ * unfortunately, many of the places that currently set a lock to blocking
+ * don't end up blocking for very long, and often they don't block
+ * at all.  For a dbench 50 run, if we don't spin on the blocking bit
+ * at all, the context switch rate can jump up to 400,000/sec or more.
+ *
+ * So, we're still stuck with this crummy spin on the blocking bit,
+ * at least until the most common causes of the short blocks
+ * can be dealt with.
+ */
+static int btrfs_spin_on_block(struct extent_buffer *eb)
+{
+       int i;
        for (i = 0; i < 512; i++) {
                cpu_relax();
-               if (mutex_trylock(&eb->mutex))
+               if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
+                       return 1;
+               if (need_resched())
+                       break;
+       }
+       return 0;
+}
+
+/*
+ * This is somewhat different from trylock.  It will take the
+ * spinlock but if it finds the lock is set to blocking, it will
+ * return without the lock held.
+ *
+ * returns 1 if it was able to take the lock and zero otherwise
+ *
+ * After this call, scheduling is not safe without first calling
+ * btrfs_set_lock_blocking()
+ */
+int btrfs_try_spin_lock(struct extent_buffer *eb)
+{
+       int i;
+
+       spin_nested(eb);
+       if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
+               return 1;
+       spin_unlock(&eb->lock);
+
+       /* spin for a bit on the BLOCKING flag */
+       for (i = 0; i < 2; i++) {
+               if (!btrfs_spin_on_block(eb))
+                       break;
+
+               spin_nested(eb);
+               if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
+                       return 1;
+               spin_unlock(&eb->lock);
+       }
+       return 0;
+}
+
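Its intended use is the locking step in the btrfs_search_slot hunk above: try the cheap spinning acquisition first, and fall back to the full, sleepable lock only when contended:

        btrfs_clear_path_blocking(p);
        lret = btrfs_try_spin_lock(b);
        if (!lret) {
                /* contended: mark the path blocking and take the full lock */
                btrfs_set_path_blocking(p);
                btrfs_tree_lock(b);
                btrfs_clear_path_blocking(p);
        }
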
+/*
+ * the autoremove wake function will return 0 if it tried to wake up
+ * a process that was already awake, which means that process won't
+ * count as an exclusive wakeup.  The waitq code will continue waking
+ * procs until it finds one that was actually sleeping.
+ *
+ * For btrfs, this isn't quite what we want.  We want a single proc
+ * to be notified that the lock is ready for taking.  If that proc
+ * already happens to be awake, great, it will loop around and try for
+ * the lock.
+ *
+ * So, btrfs_wake_function always returns 1, even when the proc that we
+ * tried to wake up was already awake.
+ */
+static int btrfs_wake_function(wait_queue_t *wait, unsigned mode,
+                              int sync, void *key)
+{
+       autoremove_wake_function(wait, mode, sync, key);
+       return 1;
+}
+
+/*
+ * returns with the extent buffer spinlocked.
+ *
+ * This will spin and/or wait as required to take the lock, and then
+ * return with the spinlock held.
+ *
+ * After this call, scheduling is not safe without first calling
+ * btrfs_set_lock_blocking()
+ */
+int btrfs_tree_lock(struct extent_buffer *eb)
+{
+       DEFINE_WAIT(wait);
+       wait.func = btrfs_wake_function;
+
+       while (1) {
+               spin_nested(eb);
+
+               /* nobody is blocking, exit with the spinlock held */
+               if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
                        return 0;
+
+               /*
+                * we have the spinlock, but the real owner is blocking.
+                * wait for them
+                */
+               spin_unlock(&eb->lock);
+
+               /*
+                * spin for a bit, and if the blocking flag goes away,
+                * loop around
+                */
+               if (btrfs_spin_on_block(eb))
+                       continue;
+
+               prepare_to_wait_exclusive(&eb->lock_wq, &wait,
+                                         TASK_UNINTERRUPTIBLE);
+
+               if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
+                       schedule();
+
+               finish_wait(&eb->lock_wq, &wait);
        }
-       cpu_relax();
-       mutex_lock_nested(&eb->mutex, BTRFS_MAX_LEVEL - btrfs_header_level(eb));
        return 0;
 }
 
+/*
+ * Very quick trylock, this does not spin or schedule.  It returns
+ * 1 with the spinlock held if it was able to take the lock, or it
+ * returns zero if it was unable to take the lock.
+ *
+ * After this call, scheduling is not safe without first calling
+ * btrfs_set_lock_blocking()
+ */
 int btrfs_try_tree_lock(struct extent_buffer *eb)
 {
-       return mutex_trylock(&eb->mutex);
+       if (spin_trylock(&eb->lock)) {
+               if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
+                       /*
+                        * we've got the spinlock, but the real owner is
+                        * blocking.  Drop the spinlock and return failure
+                        */
+                       spin_unlock(&eb->lock);
+                       return 0;
+               }
+               return 1;
+       }
+       /* someone else has the spinlock; give up */
+       return 0;
 }
 
 int btrfs_tree_unlock(struct extent_buffer *eb)
 {
-       mutex_unlock(&eb->mutex);
+       /*
+        * if we were a blocking owner, we don't have the spinlock held;
+        * just clear the bit and look for waiters
+        */
+       if (test_and_clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
+               smp_mb__after_clear_bit();
+       else
+               spin_unlock(&eb->lock);
+
+       if (waitqueue_active(&eb->lock_wq))
+               wake_up(&eb->lock_wq);
        return 0;
 }
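
Note that unlock is legal from either state: a blocking owner releases without re-taking the spinlock. The tree-log hunks at the end of this patch rely on this and unlock straight out of the blocking state:

        btrfs_set_lock_blocking(next);          /* go blocking for the wait */
        btrfs_wait_tree_block_writeback(next);  /* sleeps for writeback */
        btrfs_tree_unlock(next);                /* no clear_lock_blocking needed */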
 
 int btrfs_tree_locked(struct extent_buffer *eb)
 {
-       return mutex_is_locked(&eb->mutex);
+       return test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags) ||
+                       spin_is_locked(&eb->lock);
 }
 
 /*
 {
        int i;
        struct extent_buffer *eb;
+
        for (i = level; i <= level + 1 && i < BTRFS_MAX_LEVEL; i++) {
                eb = path->nodes[i];
                if (!eb)
                        break;
                smp_mb();
-               if (!list_empty(&eb->mutex.wait_list))
+               if (spin_is_contended(&eb->lock) ||
+                   waitqueue_active(&eb->lock_wq))
                        return 1;
        }
        return 0;
 
 int btrfs_tree_lock(struct extent_buffer *eb);
 int btrfs_tree_unlock(struct extent_buffer *eb);
 int btrfs_tree_locked(struct extent_buffer *eb);
+
 int btrfs_try_tree_lock(struct extent_buffer *eb);
+int btrfs_try_spin_lock(struct extent_buffer *eb);
+
 int btrfs_path_lock_waiting(struct btrfs_path *path, int level);
+
+void btrfs_set_lock_blocking(struct extent_buffer *eb);
+void btrfs_clear_lock_blocking(struct extent_buffer *eb);
 #endif
 
                u32 nritems;
 
                root_node = btrfs_lock_root_node(root);
+               btrfs_set_lock_blocking(root_node);
                nritems = btrfs_header_nritems(root_node);
                root->defrag_max.objectid = 0;
                /* from above we know this is not a leaf */
 
 
                                btrfs_tree_lock(next);
                                clean_tree_block(trans, root, next);
+                               btrfs_set_lock_blocking(next);
                                btrfs_wait_tree_block_writeback(next);
                                btrfs_tree_unlock(next);
 
                next = path->nodes[*level];
                btrfs_tree_lock(next);
                clean_tree_block(trans, root, next);
+               btrfs_set_lock_blocking(next);
                btrfs_wait_tree_block_writeback(next);
                btrfs_tree_unlock(next);
 
 
                                btrfs_tree_lock(next);
                                clean_tree_block(trans, root, next);
+                               btrfs_set_lock_blocking(next);
                                btrfs_wait_tree_block_writeback(next);
                                btrfs_tree_unlock(next);
 
 
                        btrfs_tree_lock(next);
                        clean_tree_block(trans, log, next);
+                       btrfs_set_lock_blocking(next);
                        btrfs_wait_tree_block_writeback(next);
                        btrfs_tree_unlock(next);