obj-m  := btrfs.o
 btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
           hash.o file-item.o inode-item.o inode-map.o disk-io.o \
-          transaction.o bit-radix.o inode.o file.o
+          transaction.o bit-radix.o inode.o file.o tree-defrag.o
 
 #btrfs-y := ctree.o disk-io.o radix-tree.o extent-tree.o print-tree.o \
 #        root-tree.o dir-item.o hash.o file-item.o inode-item.o \
 
        memset(p, 0, sizeof(*p));
 }
 
-static int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root
+static int __btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root
                           *root, struct buffer_head *buf, struct buffer_head
                           *parent, int parent_slot, struct buffer_head
-                          **cow_ret)
+                          **cow_ret, u64 search_start, u64 empty_size)
 {
        struct buffer_head *cow;
        struct btrfs_node *cow_node;
-       int ret;
+       int ret = 0;
+       int different_trans = 0;
 
+       WARN_ON(root->ref_cows && trans->transid != root->last_trans);
        WARN_ON(!buffer_uptodate(buf));
-       if (trans->transaction != root->fs_info->running_transaction) {
-               printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
-                      root->fs_info->running_transaction->transid);
-               WARN_ON(1);
-       }
-       if (trans->transid != root->fs_info->generation) {
-               printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
-                      root->fs_info->generation);
-               WARN_ON(1);
-       }
-       if (btrfs_header_generation(btrfs_buffer_header(buf)) ==
-                                   trans->transid) {
-               *cow_ret = buf;
-               return 0;
-       }
-       cow = btrfs_alloc_free_block(trans, root, buf->b_blocknr);
+       cow = btrfs_alloc_free_block(trans, root, search_start, empty_size);
        if (IS_ERR(cow))
                return PTR_ERR(cow);
+
        cow_node = btrfs_buffer_node(cow);
        if (buf->b_size != root->blocksize || cow->b_size != root->blocksize)
                WARN_ON(1);
+
        memcpy(cow_node, btrfs_buffer_node(buf), root->blocksize);
        btrfs_set_header_blocknr(&cow_node->header, bh_blocknr(cow));
        btrfs_set_header_generation(&cow_node->header, trans->transid);
        btrfs_set_header_owner(&cow_node->header, root->root_key.objectid);
-       ret = btrfs_inc_ref(trans, root, buf);
-       if (ret)
-               return ret;
+
+       WARN_ON(btrfs_header_generation(btrfs_buffer_header(buf)) >
+               trans->transid);
+       if (btrfs_header_generation(btrfs_buffer_header(buf)) !=
+                                   trans->transid) {
+               different_trans = 1;
+               ret = btrfs_inc_ref(trans, root, buf);
+               if (ret)
+                       return ret;
+       } else {
+               WARN_ON(!root->ref_cows);
+               clean_tree_block(trans, root, buf);
+       }
+
        if (buf == root->node) {
                root->node = cow;
                get_bh(cow);
                btrfs_set_node_blockptr(btrfs_buffer_node(parent), parent_slot,
                                        bh_blocknr(cow));
                btrfs_mark_buffer_dirty(parent);
+               WARN_ON(btrfs_header_generation(btrfs_buffer_header(parent)) !=
+                                   trans->transid);
                btrfs_free_extent(trans, root, bh_blocknr(buf), 1, 1);
        }
        btrfs_block_release(root, buf);
        return 0;
 }
 
+int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root
+                          *root, struct buffer_head *buf, struct buffer_head
+                          *parent, int parent_slot, struct buffer_head
+                          **cow_ret)
+{
+       u64 search_start;
+       if (trans->transaction != root->fs_info->running_transaction) {
+               printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
+                      root->fs_info->running_transaction->transid);
+               WARN_ON(1);
+       }
+       if (trans->transid != root->fs_info->generation) {
+               printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
+                      root->fs_info->generation);
+               WARN_ON(1);
+       }
+       if (btrfs_header_generation(btrfs_buffer_header(buf)) ==
+                                   trans->transid) {
+               *cow_ret = buf;
+               return 0;
+       }
+
+       search_start = bh_blocknr(buf) & ~((u64)65535);
+       return __btrfs_cow_block(trans, root, buf, parent,
+                                parent_slot, cow_ret, search_start, 0);
+}
+
+static int close_blocks(u64 blocknr, u64 other)
+{
+       if (blocknr < other && other - blocknr < 8)
+               return 1;
+       if (blocknr > other && blocknr - other < 8)
+               return 1;
+       return 0;
+}
+
+int btrfs_realloc_node(struct btrfs_trans_handle *trans,
+                      struct btrfs_root *root, struct buffer_head *parent,
+                      int cache_only)
+{
+       struct btrfs_node *parent_node;
+       struct buffer_head *cur_bh;
+       struct buffer_head *tmp_bh;
+       u64 blocknr;
+       u64 search_start = 0;
+       u64 other;
+       u32 parent_nritems;
+       int start_slot;
+       int end_slot;
+       int i;
+       int err = 0;
+
+       if (trans->transaction != root->fs_info->running_transaction) {
+               printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
+                      root->fs_info->running_transaction->transid);
+               WARN_ON(1);
+       }
+       if (trans->transid != root->fs_info->generation) {
+               printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
+                      root->fs_info->generation);
+               WARN_ON(1);
+       }
+       parent_node = btrfs_buffer_node(parent);
+       parent_nritems = btrfs_header_nritems(&parent_node->header);
+
+       start_slot = 0;
+       end_slot = parent_nritems;
+
+       if (parent_nritems == 1)
+               return 0;
+
+       for (i = start_slot; i < end_slot; i++) {
+               int close = 1;
+               blocknr = btrfs_node_blockptr(parent_node, i);
+               if (i > 0) {
+                       other = btrfs_node_blockptr(parent_node, i - 1);
+                       close = close_blocks(blocknr, other);
+               }
+               if (close && i < end_slot - 1) {
+                       other = btrfs_node_blockptr(parent_node, i + 1);
+                       close = close_blocks(blocknr, other);
+               }
+               if (close)
+                       continue;
+
+               cur_bh = btrfs_find_tree_block(root, blocknr);
+               if (!cur_bh || !buffer_uptodate(cur_bh) ||
+                   buffer_locked(cur_bh)) {
+                       if (cache_only) {
+                               brelse(cur_bh);
+                               continue;
+                       }
+                       brelse(cur_bh);
+                       cur_bh = read_tree_block(root, blocknr);
+               }
+               if (search_start == 0) {
+                       search_start = bh_blocknr(cur_bh) & ~((u64)65535);
+               }
+               err = __btrfs_cow_block(trans, root, cur_bh, parent, i,
+                                       &tmp_bh, search_start,
+                                       min(8, end_slot - i));
+               if (err)
+                       break;
+               search_start = bh_blocknr(tmp_bh);
+               brelse(tmp_bh);
+       }
+       return err;
+}
+
 /*
  * The leaf data grows from end-to-front in the node.
  * this returns the address of the start of the last item,
 
                parent_slot = path->slots[level + 1];
                parent_key = &parent->ptrs[parent_slot].key;
+
                BUG_ON(memcmp(parent_key, &leaf->items[0].key,
                       sizeof(struct btrfs_disk_key)));
                BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
  * readahead one full node of leaves
  */
 static void reada_for_search(struct btrfs_root *root, struct btrfs_path *path,
-                            int slot)
+                            int level, int slot)
 {
        struct btrfs_node *node;
        int i;
        unsigned long gang[8];
        struct buffer_head *bh;
 
-       if (!path->nodes[1])
+       if (level == 0)
+               return;
+
+       if (!path->nodes[level])
                return;
 
-       node = btrfs_buffer_node(path->nodes[1]);
+       node = btrfs_buffer_node(path->nodes[level]);
        search = btrfs_node_blockptr(node, slot);
        bh = btrfs_find_tree_block(root, search);
        if (bh) {
                for (i = 0; i < ret; i++) {
                        blocknr = gang[i];
                        clear_radix_bit(&found, blocknr);
-                       if (nread > 64)
+                       if (nread > 32)
                                continue;
                        if (direction > 0 && cluster_start <= blocknr &&
                            cluster_start + 8 > blocknr) {
        struct buffer_head *b;
        struct buffer_head *cow_buf;
        struct btrfs_node *c;
-       struct btrfs_root_item *root_item = &root->root_item;
        u64 blocknr;
        int slot;
        int ret;
        int should_reada = p->reada;
        u8 lowest_level = 0;
 
-       if (btrfs_root_refs(root_item) == 0 && root->ref_cows) {
-               lowest_level = root_item->drop_level;
-               WARN_ON(ins_len || cow);
-       }
-
+       lowest_level = p->lowest_level;
+       WARN_ON(lowest_level && ins_len);
        WARN_ON(p->nodes[0] != NULL);
        WARN_ON(!mutex_is_locked(&root->fs_info->fs_mutex));
 again:
                        if (level == lowest_level)
                                break;
                        blocknr = btrfs_node_blockptr(c, slot);
-                       if (level == 1 && should_reada)
-                               reada_for_search(root, p, slot);
+                       if (should_reada)
+                               reada_for_search(root, p, level, slot);
                        b = read_tree_block(root, btrfs_node_blockptr(c, slot));
 
                } else {
        BUG_ON(path->nodes[level]);
        BUG_ON(path->nodes[level-1] != root->node);
 
-       t = btrfs_alloc_free_block(trans, root, root->node->b_blocknr);
+       t = btrfs_alloc_free_block(trans, root, root->node->b_blocknr, 0);
        if (IS_ERR(t))
                return PTR_ERR(t);
        c = btrfs_buffer_node(t);
        }
 
        c_nritems = btrfs_header_nritems(&c->header);
-       split_buffer = btrfs_alloc_free_block(trans, root, t->b_blocknr);
+       split_buffer = btrfs_alloc_free_block(trans, root, t->b_blocknr, 0);
        if (IS_ERR(split_buffer))
                return PTR_ERR(split_buffer);
 
        nritems = btrfs_header_nritems(&l->header);
        mid = (nritems + 1)/ 2;
 
-       right_buffer = btrfs_alloc_free_block(trans, root, l_buf->b_blocknr);
+       right_buffer = btrfs_alloc_free_block(trans, root, l_buf->b_blocknr, 0);
        if (IS_ERR(right_buffer))
                return PTR_ERR(right_buffer);
 
 
        if (!double_split)
                return ret;
-       right_buffer = btrfs_alloc_free_block(trans, root, l_buf->b_blocknr);
+       right_buffer = btrfs_alloc_free_block(trans, root, l_buf->b_blocknr, 0);
        if (IS_ERR(right_buffer))
                return PTR_ERR(right_buffer);
 
                blocknr = btrfs_node_blockptr(c_node, slot);
                if (next)
                        btrfs_block_release(root, next);
-               if (level == 1 && path->reada)
-                       reada_for_search(root, path, slot);
+               if (path->reada)
+                       reada_for_search(root, path, level, slot);
                next = read_tree_block(root, blocknr);
                break;
        }
                path->slots[level] = 0;
                if (!level)
                        break;
-               if (level == 1 && path->reada)
-                       reada_for_search(root, path, slot);
+               if (path->reada)
+                       reada_for_search(root, path, level, slot);
                next = read_tree_block(root,
                       btrfs_node_blockptr(btrfs_buffer_node(next), 0));
        }
 
        struct buffer_head *nodes[BTRFS_MAX_LEVEL];
        int slots[BTRFS_MAX_LEVEL];
        int reada;
+       int lowest_level;
 };
 
 /*
        u64 highest_inode;
        u64 last_inode_alloc;
        int ref_cows;
+       struct btrfs_key defrag_progress;
+       int defrag_running;
+       int defrag_level;
 };
 
 /* the lower bits in the key flags defines the item type */
 int btrfs_inc_root_ref(struct btrfs_trans_handle *trans,
                       struct btrfs_root *root);
 struct buffer_head *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
-                                           struct btrfs_root *root, u64 hint);
+                                           struct btrfs_root *root, u64 hint,
+                                           u64 empty_size);
 int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
                       struct btrfs_root *root, u64 owner,
-                      u64 num_blocks, u64 search_start,
+                      u64 num_blocks, u64 empty_size, u64 search_start,
                       u64 search_end, struct btrfs_key *ins, int data);
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                  struct buffer_head *buf);
 int btrfs_free_block_groups(struct btrfs_fs_info *info);
 int btrfs_read_block_groups(struct btrfs_root *root);
 /* ctree.c */
+int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root
+                          *root, struct buffer_head *buf, struct buffer_head
+                          *parent, int parent_slot, struct buffer_head
+                          **cow_ret);
 int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root
                      *root, struct btrfs_path *path, u32 data_size);
 int btrfs_truncate_item(struct btrfs_trans_handle *trans,
 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
                      *root, struct btrfs_key *key, struct btrfs_path *p, int
                      ins_len, int cow);
+int btrfs_realloc_node(struct btrfs_trans_handle *trans,
+                      struct btrfs_root *root, struct buffer_head *parent,
+                      int cache_only);
 void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p);
 struct btrfs_path *btrfs_alloc_path(void);
 void btrfs_free_path(struct btrfs_path *p);
 int btrfs_drop_extents(struct btrfs_trans_handle *trans,
                       struct btrfs_root *root, struct inode *inode,
                       u64 start, u64 end, u64 *hint_block);
+/* tree-defrag.c */
+int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
+                       struct btrfs_root *root, int cache_only);
 #endif
 
                     struct buffer_head *buf)
 {
        WARN_ON(atomic_read(&buf->b_count) == 0);
+       lock_buffer(buf);
        clear_buffer_dirty(buf);
+       unlock_buffer(buf);
        return 0;
 }
 
        root->last_inode_alloc = 0;
        memset(&root->root_key, 0, sizeof(root->root_key));
        memset(&root->root_item, 0, sizeof(root->root_item));
+       memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
+       root->defrag_running = 0;
+       root->defrag_level = 0;
        root->root_key.objectid = objectid;
        return 0;
 }
        fs_info->closing = 1;
        btrfs_transaction_flush_work(root);
        mutex_lock(&fs_info->fs_mutex);
+       btrfs_defrag_dirty_roots(root->fs_info);
        trans = btrfs_start_transaction(root, 1);
        ret = btrfs_commit_transaction(trans, root);
        /* run commit again to  drop the original snapshot */
 {
        struct btrfs_root *root = BTRFS_I(bh->b_page->mapping->host)->root;
        u64 transid = btrfs_header_generation(btrfs_buffer_header(bh));
+
        WARN_ON(!atomic_read(&bh->b_count));
+
        if (transid != root->fs_info->generation) {
                printk(KERN_CRIT "transid mismatch buffer %llu, found %Lu running %Lu\n",
                        (unsigned long long)bh->b_blocknr,
 
 #include "transaction.h"
 
 static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
-                           *orig_root, u64 num_blocks, u64 search_start,
+                           *orig_root, u64 num_blocks, u64 empty_size,
+                           u64 search_start,
                            u64 search_end, u64 hint_block,
                            struct btrfs_key *ins, u64 exclude_start,
                            u64 exclude_nr, int data);
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
-       ret = find_free_extent(trans, root->fs_info->extent_root, 0, 0,
+       ret = find_free_extent(trans, root->fs_info->extent_root, 0, 0, 0,
                               (u64)-1, 0, &ins, 0, 0, 0);
        if (ret) {
                btrfs_free_path(path);
        struct btrfs_block_group_item *bi;
        struct btrfs_key ins;
 
-       ret = find_free_extent(trans, extent_root, 0, 0, (u64)-1, 0, &ins,
+       ret = find_free_extent(trans, extent_root, 0, 0, 0, (u64)-1, 0, &ins,
                               0, 0, 0);
        /* FIXME, set bit to recalc cache groups on next mount */
        if (ret)
 static int try_remove_page(struct address_space *mapping, unsigned long index)
 {
        int ret;
+       return 0;
        ret = invalidate_mapping_pages(mapping, index, index);
        return ret;
 }
        if (!path)
                return -ENOMEM;
 
-       ret = find_free_extent(trans, root, 0, 0, (u64)-1, 0, &ins, 0, 0, 0);
+       ret = find_free_extent(trans, root, 0, 0, 0, (u64)-1, 0, &ins, 0, 0, 0);
        if (ret) {
                btrfs_free_path(path);
                return ret;
  * Any available blocks before search_start are skipped.
  */
 static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
-                           *orig_root, u64 num_blocks, u64 search_start, u64
-                           search_end, u64 hint_block,
+                           *orig_root, u64 num_blocks, u64 empty_size,
+                           u64 search_start, u64 search_end, u64 hint_block,
                            struct btrfs_key *ins, u64 exclude_start,
                            u64 exclude_nr, int data)
 {
                                                     data, 1);
        }
 
+       total_needed += empty_size;
        path = btrfs_alloc_path();
 
 check_failed:
                        goto error;
                }
                search_start = orig_search_start;
-               if (wrapped)
+               if (wrapped) {
+                       if (!full_scan)
+                               total_needed -= empty_size;
                        full_scan = 1;
-               else
+               } else
                        wrapped = 1;
                goto new_group;
        }
                        ret = -ENOSPC;
                        goto error;
                }
-               if (wrapped)
+               if (wrapped) {
+                       if (!full_scan)
+                               total_needed -= empty_size;
                        full_scan = 1;
-               else
+               } else
                        wrapped = 1;
        }
        block_group = btrfs_lookup_block_group(info, search_start);
  */
 int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
                       struct btrfs_root *root, u64 owner,
-                      u64 num_blocks, u64 hint_block,
+                      u64 num_blocks, u64 empty_size, u64 hint_block,
                       u64 search_end, struct btrfs_key *ins, int data)
 {
        int ret;
         * in the correct block group.
         */
        if (data) {
-               ret = find_free_extent(trans, root, 0, 0,
+               ret = find_free_extent(trans, root, 0, 0, 0,
                                       search_end, 0, &prealloc_key, 0, 0, 0);
                BUG_ON(ret);
                if (ret)
        }
 
        /* do the real allocation */
-       ret = find_free_extent(trans, root, num_blocks, search_start,
-                              search_end, hint_block, ins,
+       ret = find_free_extent(trans, root, num_blocks, empty_size,
+                              search_start, search_end, hint_block, ins,
                               exclude_start, exclude_nr, data);
        BUG_ON(ret);
        if (ret)
                exclude_start = ins->objectid;
                exclude_nr = ins->offset;
                hint_block = exclude_start + exclude_nr;
-               ret = find_free_extent(trans, root, 0, search_start,
+               ret = find_free_extent(trans, root, 0, 0, search_start,
                                       search_end, hint_block,
                                       &prealloc_key, exclude_start,
                                       exclude_nr, 0);
  * returns the tree buffer or NULL.
  */
 struct buffer_head *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
-                                          struct btrfs_root *root, u64 hint)
+                                          struct btrfs_root *root, u64 hint,
+                                          u64 empty_size)
 {
        struct btrfs_key ins;
        int ret;
        struct buffer_head *buf;
 
        ret = btrfs_alloc_extent(trans, root, root->root_key.objectid,
-                                1, hint, (unsigned long)-1, &ins, 0);
+                                1, empty_size, hint,
+                                (unsigned long)-1, &ins, 0);
        if (ret) {
                BUG_ON(ret > 0);
                return ERR_PTR(ret);
                btrfs_free_extent(trans, root, ins.objectid, 1, 0);
                return ERR_PTR(-ENOMEM);
        }
+       WARN_ON(buffer_dirty(buf));
        set_buffer_uptodate(buf);
        set_buffer_checked(buf);
        set_radix_bit(&trans->transaction->dirty_pages, buf->b_page->index);
                struct btrfs_key key;
                struct btrfs_disk_key *found_key;
                struct btrfs_node *node;
+
                btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
+               level = root_item->drop_level;
+               path->lowest_level = level;
                wret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-               if (ret < 0) {
+               if (wret < 0) {
                        ret = wret;
                        goto out;
                }
-               level = root_item->drop_level;
                node = btrfs_buffer_node(path->nodes[level]);
                found_key = &node->ptrs[path->slots[level]].key;
                WARN_ON(memcmp(found_key, &root_item->drop_progress,
                        ret = wret;
                num_walks++;
                if (num_walks > 10) {
-                       struct btrfs_key key;
-                       btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
                        ret = -EAGAIN;
                        get_bh(root->node);
                        break;
        for (i = 0; i <= orig_level; i++) {
                if (path->nodes[i]) {
                        btrfs_block_release(root, path->nodes[i]);
+                       path->nodes[i] = 0;
                }
        }
 out:
 
        if (isize >= PAGE_CACHE_SIZE || pos + write_bytes < inode->i_size ||
            pos + write_bytes - start_pos > BTRFS_MAX_INLINE_DATA_SIZE(root)) {
                err = btrfs_alloc_extent(trans, root, inode->i_ino,
-                                        num_blocks, hint_block, (u64)-1,
+                                        num_blocks, 0, hint_block, (u64)-1,
                                         &ins, 1);
                if (err)
                        goto failed_truncate;
 
                                 &alloc_hint);
        if (ret)
                goto out;
-       ret = btrfs_alloc_extent(trans, root, inode->i_ino, 1,
+       ret = btrfs_alloc_extent(trans, root, inode->i_ino, 1, 0,
                                 alloc_hint, (u64)-1, &ins, 1);
        if (ret)
                goto out;
        if (create & BTRFS_GET_BLOCK_CREATE) {
                struct btrfs_key ins;
                ret = btrfs_alloc_extent(trans, root, inode->i_ino,
-                                        1, alloc_hint, (u64)-1,
+                                        1, 0, alloc_hint, (u64)-1,
                                         &ins, 1);
                if (ret) {
                        err = ret;
        trans = btrfs_start_transaction(root, 1);
        BUG_ON(!trans);
 
-       subvol = btrfs_alloc_free_block(trans, root, 0);
+       subvol = btrfs_alloc_free_block(trans, root, 0, 0);
        if (IS_ERR(subvol))
                return PTR_ERR(subvol);
        leaf = btrfs_buffer_leaf(subvol);
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_ioctl_vol_args vol_args;
+       struct btrfs_trans_handle *trans;
        int ret = 0;
+       int err;
        struct btrfs_dir_item *di;
        int namelen;
        struct btrfs_path *path;
                else
                        ret = create_snapshot(root, vol_args.name, namelen);
                break;
+
+       case BTRFS_IOC_DEFRAG:
+               mutex_lock(&root->fs_info->fs_mutex);
+               trans = btrfs_start_transaction(root, 1);
+               memset(&root->defrag_progress, 0,
+                      sizeof(root->defrag_progress));
+               while (1) {
+                       root->defrag_running = 1;
+                       err = btrfs_defrag_leaves(trans, root, 0);
+
+                       btrfs_end_transaction(trans, root);
+                       mutex_unlock(&root->fs_info->fs_mutex);
+
+                       btrfs_btree_balance_dirty(root);
+
+                       mutex_lock(&root->fs_info->fs_mutex);
+                       trans = btrfs_start_transaction(root, 1);
+                       if (err != -EAGAIN)
+                               break;
+               }
+               root->defrag_running = 0;
+               btrfs_end_transaction(trans, root);
+               mutex_unlock(&root->fs_info->fs_mutex);
+               ret = 0;
+               break;
        default:
                return -ENOTTY;
        }
 
 
 #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
                                   struct btrfs_ioctl_vol_args)
-#define BTRFS_IOC_ADD_DISK _IOW(BTRFS_IOCTL_MAGIC, 2, \
+#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
                                   struct btrfs_ioctl_vol_args)
 #endif
 
 static struct workqueue_struct *trans_wq;
 
 #define BTRFS_ROOT_TRANS_TAG 0
+#define BTRFS_ROOT_DEFRAG_TAG 1
 
 static void put_transaction(struct btrfs_transaction *transaction)
 {
        return 0;
 }
 
+static int record_root_in_trans(struct btrfs_root *root)
+{
+       u64 running_trans_id = root->fs_info->running_transaction->transid;
+       if (root->ref_cows && root->last_trans < running_trans_id) {
+               WARN_ON(root == root->fs_info->extent_root);
+               if (root->root_item.refs != 0) {
+                       radix_tree_tag_set(&root->fs_info->fs_roots_radix,
+                                  (unsigned long)root->root_key.objectid,
+                                  BTRFS_ROOT_TRANS_TAG);
+                       radix_tree_tag_set(&root->fs_info->fs_roots_radix,
+                                  (unsigned long)root->root_key.objectid,
+                                  BTRFS_ROOT_DEFRAG_TAG);
+                       root->commit_root = root->node;
+                       get_bh(root->node);
+               } else {
+                       WARN_ON(1);
+               }
+               root->last_trans = running_trans_id;
+       }
+       return 0;
+}
+
 struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
                                                   int num_blocks)
 {
        struct btrfs_trans_handle *h =
                kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
        int ret;
-       u64 running_trans_id;
 
        mutex_lock(&root->fs_info->trans_mutex);
        ret = join_transaction(root);
        BUG_ON(ret);
-       running_trans_id = root->fs_info->running_transaction->transid;
 
-       if (root != root->fs_info->tree_root && root->last_trans <
-           running_trans_id) {
-               WARN_ON(root == root->fs_info->extent_root);
-               WARN_ON(root->ref_cows != 1);
-               if (root->root_item.refs != 0) {
-                       radix_tree_tag_set(&root->fs_info->fs_roots_radix,
-                                          (unsigned long)root->root_key.objectid,
-                                          BTRFS_ROOT_TRANS_TAG);
-                       root->commit_root = root->node;
-                       get_bh(root->node);
-               } else {
-                       WARN_ON(1);
-               }
-       }
-       root->last_trans = running_trans_id;
-       h->transid = running_trans_id;
+       record_root_in_trans(root);
+       h->transid = root->fs_info->running_transaction->transid;
        h->transaction = root->fs_info->running_transaction;
        h->blocks_reserved = num_blocks;
        h->blocks_used = 0;
                                              gang[i]);
                        if (!page)
                                continue;
+                       if (PageWriteback(page)) {
+                               if (PageDirty(page))
+                                       wait_on_page_writeback(page);
+                               else {
+                                       unlock_page(page);
+                                       page_cache_release(page);
+                                       continue;
+                               }
+                       }
                        err = write_one_page(page, 0);
                        if (err)
                                werr = err;
        return err;
 }
 
+int btrfs_defrag_dirty_roots(struct btrfs_fs_info *info)
+{
+       struct btrfs_root *gang[1];
+       struct btrfs_root *root;
+       struct btrfs_root *tree_root = info->tree_root;
+       struct btrfs_trans_handle *trans;
+       int i;
+       int ret;
+       int err = 0;
+       u64 last = 0;
+
+       trans = btrfs_start_transaction(tree_root, 1);
+       while(1) {
+               ret = radix_tree_gang_lookup_tag(&info->fs_roots_radix,
+                                                (void **)gang, last,
+                                                ARRAY_SIZE(gang),
+                                                BTRFS_ROOT_DEFRAG_TAG);
+               if (ret == 0)
+                       break;
+               for (i = 0; i < ret; i++) {
+                       root = gang[i];
+                       last = root->root_key.objectid + 1;
+                       radix_tree_tag_clear(&info->fs_roots_radix,
+                                    (unsigned long)root->root_key.objectid,
+                                    BTRFS_ROOT_DEFRAG_TAG);
+                       if (root->defrag_running)
+                               continue;
+
+                       while (1) {
+                               mutex_lock(&root->fs_info->trans_mutex);
+                               record_root_in_trans(root);
+                               mutex_unlock(&root->fs_info->trans_mutex);
+
+                               root->defrag_running = 1;
+                               err = btrfs_defrag_leaves(trans, root, 1);
+                               btrfs_end_transaction(trans, tree_root);
+                               mutex_unlock(&info->fs_mutex);
+
+                               btrfs_btree_balance_dirty(root);
+
+                               mutex_lock(&info->fs_mutex);
+                               trans = btrfs_start_transaction(tree_root, 1);
+                               if (err != -EAGAIN)
+                                       break;
+                       }
+                       root->defrag_running = 0;
+               }
+       }
+       btrfs_end_transaction(trans, tree_root);
+       return err;
+}
+
 static int drop_dirty_roots(struct btrfs_root *tree_root,
                            struct list_head *list)
 {
                goto out;
        }
        mutex_unlock(&root->fs_info->trans_mutex);
+       btrfs_defrag_dirty_roots(root->fs_info);
        trans = btrfs_start_transaction(root, 1);
        ret = btrfs_commit_transaction(trans, root);
 out:
 
 void btrfs_init_transaction_sys(void);
 void btrfs_exit_transaction_sys(void);
 int btrfs_add_dead_root(struct btrfs_root *root, struct list_head *dead_list);
+int btrfs_defrag_dirty_roots(struct btrfs_fs_info *info);
 
 #endif
 
--- /dev/null
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/sched.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "print-tree.h"
+#include "transaction.h"
+
+static void reada_defrag(struct btrfs_root *root,
+                        struct btrfs_node *node)
+{
+       int i;
+       u32 nritems;
+       u64 blocknr;
+       int ret;
+
+       nritems = btrfs_header_nritems(&node->header);
+       for (i = 0; i < nritems; i++) {
+               blocknr = btrfs_node_blockptr(node, i);
+               ret = readahead_tree_block(root, blocknr);
+               if (ret)
+                       break;
+       }
+}
+
+static int defrag_walk_down(struct btrfs_trans_handle *trans,
+                           struct btrfs_root *root,
+                           struct btrfs_path *path, int *level,
+                           int cache_only)
+{
+       struct buffer_head *next;
+       struct buffer_head *cur;
+       u64 blocknr;
+       int ret = 0;
+
+       WARN_ON(*level < 0);
+       WARN_ON(*level >= BTRFS_MAX_LEVEL);
+
+       while(*level > 0) {
+               WARN_ON(*level < 0);
+               WARN_ON(*level >= BTRFS_MAX_LEVEL);
+               cur = path->nodes[*level];
+
+               if (!cache_only && *level > 1 && path->slots[*level] == 0)
+                       reada_defrag(root, btrfs_buffer_node(cur));
+
+               if (btrfs_header_level(btrfs_buffer_header(cur)) != *level)
+                       WARN_ON(1);
+
+               if (path->slots[*level] >=
+                   btrfs_header_nritems(btrfs_buffer_header(cur)))
+                       break;
+
+               if (*level == 1) {
+                       ret = btrfs_realloc_node(trans, root,
+                                                path->nodes[*level],
+                                                cache_only);
+                       break;
+               }
+               blocknr = btrfs_node_blockptr(btrfs_buffer_node(cur),
+                                             path->slots[*level]);
+
+               if (cache_only) {
+                       next = btrfs_find_tree_block(root, blocknr);
+                       if (!next || !buffer_uptodate(next) ||
+                          buffer_locked(next)) {
+                               brelse(next);
+                               path->slots[*level]++;
+                               continue;
+                       }
+               } else {
+                       next = read_tree_block(root, blocknr);
+               }
+               ret = btrfs_cow_block(trans, root, next, path->nodes[*level],
+                                     path->slots[*level], &next);
+               BUG_ON(ret);
+               ret = btrfs_realloc_node(trans, root, next, cache_only);
+               BUG_ON(ret);
+               WARN_ON(*level <= 0);
+               if (path->nodes[*level-1])
+                       btrfs_block_release(root, path->nodes[*level-1]);
+               path->nodes[*level-1] = next;
+               *level = btrfs_header_level(btrfs_buffer_header(next));
+               path->slots[*level] = 0;
+       }
+       WARN_ON(*level < 0);
+       WARN_ON(*level >= BTRFS_MAX_LEVEL);
+       btrfs_block_release(root, path->nodes[*level]);
+       path->nodes[*level] = NULL;
+       *level += 1;
+       WARN_ON(ret);
+       return 0;
+}
+
+static int defrag_walk_up(struct btrfs_trans_handle *trans,
+                         struct btrfs_root *root,
+                         struct btrfs_path *path, int *level,
+                         int cache_only)
+{
+       int i;
+       int slot;
+       struct btrfs_node *node;
+
+       for(i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
+               slot = path->slots[i];
+               if (slot < btrfs_header_nritems(
+                   btrfs_buffer_header(path->nodes[i])) - 1) {
+                       path->slots[i]++;
+                       *level = i;
+                       node = btrfs_buffer_node(path->nodes[i]);
+                       WARN_ON(i == 0);
+                       btrfs_disk_key_to_cpu(&root->defrag_progress,
+                                             &node->ptrs[path->slots[i]].key);
+                       root->defrag_level = i;
+                       return 0;
+               } else {
+                       btrfs_block_release(root, path->nodes[*level]);
+                       path->nodes[*level] = NULL;
+                       *level = i + 1;
+               }
+       }
+       return 1;
+}
+
+int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
+                       struct btrfs_root *root, int cache_only)
+{
+       struct btrfs_path *path = NULL;
+       struct buffer_head *tmp;
+       int ret = 0;
+       int wret;
+       int level;
+       int orig_level;
+       int i;
+       int num_runs = 0;
+
+       if (root->ref_cows == 0) {
+               goto out;
+       }
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       level = btrfs_header_level(btrfs_buffer_header(root->node));
+       orig_level = level;
+       if (level == 0) {
+               goto out;
+       }
+       if (root->defrag_progress.objectid == 0) {
+               get_bh(root->node);
+               ret = btrfs_cow_block(trans, root, root->node, NULL, 0, &tmp);
+               BUG_ON(ret);
+               ret = btrfs_realloc_node(trans, root, root->node, cache_only);
+               BUG_ON(ret);
+               path->nodes[level] = root->node;
+               path->slots[level] = 0;
+       } else {
+               level = root->defrag_level;
+               path->lowest_level = level;
+               wret = btrfs_search_slot(trans, root, &root->defrag_progress,
+                                        path, 0, 1);
+
+               if (wret < 0) {
+                       ret = wret;
+                       goto out;
+               }
+               while(level > 0 && !path->nodes[level])
+                       level--;
+               if (!path->nodes[level]) {
+                       ret = 0;
+                       goto out;
+               }
+       }
+
+       while(1) {
+               wret = defrag_walk_down(trans, root, path, &level, cache_only);
+               if (wret > 0)
+                       break;
+               if (wret < 0)
+                       ret = wret;
+
+               wret = defrag_walk_up(trans, root, path, &level, cache_only);
+               if (wret > 0)
+                       break;
+               if (wret < 0)
+                       ret = wret;
+               if (num_runs++ > 8) {
+                       ret = -EAGAIN;
+                       break;
+               }
+       }
+       for (i = 0; i <= orig_level; i++) {
+               if (path->nodes[i]) {
+                       btrfs_block_release(root, path->nodes[i]);
+                       path->nodes[i] = 0;
+               }
+       }
+out:
+       if (path)
+               btrfs_free_path(path);
+       if (ret != -EAGAIN) {
+               memset(&root->defrag_progress, 0,
+                      sizeof(root->defrag_progress));
+       }
+       return ret;
+}