int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
 
 int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
-                               struct btrfs_fs_info *fs_info, u64 bytenr,
-                               u64 time_seq, struct ulist **roots);
+                        struct btrfs_fs_info *fs_info, u64 bytenr,
+                        u64 time_seq, struct ulist **roots);
 char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
                        u32 name_len, unsigned long name_off,
                        struct extent_buffer *eb_in, u64 parent,
 
 }
 
 /*
- * Increment the upper half of tree_mod_seq, set lower half zero.
- *
- * Must be called with fs_info->tree_mod_seq_lock held.
- */
-static inline u64 btrfs_inc_tree_mod_seq_major(struct btrfs_fs_info *fs_info)
-{
-       u64 seq = atomic64_read(&fs_info->tree_mod_seq);
-       seq &= 0xffffffff00000000ull;
-       seq += 1ull << 32;
-       atomic64_set(&fs_info->tree_mod_seq, seq);
-       return seq;
-}
-
-/*
- * Increment the lower half of tree_mod_seq.
- *
- * Must be called with fs_info->tree_mod_seq_lock held. The way major numbers
- * are generated should not technically require a spin lock here. (Rationale:
- * incrementing the minor while incrementing the major seq number is between its
- * atomic64_read and atomic64_set calls doesn't duplicate sequence numbers, it
- * just returns a unique sequence number as usual.) We have decided to leave
- * that requirement in here and rethink it once we notice it really imposes a
- * problem on some workload.
+ * Pull a new tree mod seq number for our operation.
  */
-static inline u64 btrfs_inc_tree_mod_seq_minor(struct btrfs_fs_info *fs_info)
+static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
 {
        return atomic64_inc_return(&fs_info->tree_mod_seq);
 }
 
-/*
- * return the last minor in the previous major tree_mod_seq number
- */
-u64 btrfs_tree_mod_seq_prev(u64 seq)
-{
-       return (seq & 0xffffffff00000000ull) - 1ull;
-}
-
 /*
  * This adds a new blocker to the tree mod log's blocker list if the @elem
  * passed does not already have a sequence number set. So when a caller expects
 u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
                           struct seq_list *elem)
 {
-       u64 seq;
-
        tree_mod_log_write_lock(fs_info);
        spin_lock(&fs_info->tree_mod_seq_lock);
        if (!elem->seq) {
-               elem->seq = btrfs_inc_tree_mod_seq_major(fs_info);
+               elem->seq = btrfs_inc_tree_mod_seq(fs_info);
                list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
        }
-       seq = btrfs_inc_tree_mod_seq_minor(fs_info);
        spin_unlock(&fs_info->tree_mod_seq_lock);
        tree_mod_log_write_unlock(fs_info);
 
-       return seq;
+       return elem->seq;
 }
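
For context, the get/put pairing these helpers are meant for (mirrored by check_ref() later in this patch; error handling trimmed) looks roughly like this sketch:

	/* Sketch, not part of the patch: pin a tree mod seq across a backref walk. */
	static int walk_roots_sketch(struct btrfs_trans_handle *trans,
				     struct btrfs_fs_info *fs_info, u64 bytenr)
	{
		struct seq_list elem = {};	/* blocker; seq filled in by get */
		struct ulist *roots = NULL;
		int ret;

		btrfs_get_tree_mod_seq(fs_info, &elem);
		ret = btrfs_find_all_roots(trans, fs_info, bytenr, elem.seq, &roots);
		if (ret >= 0)
			ulist_free(roots);
		btrfs_put_tree_mod_seq(fs_info, &elem);
		return ret;
	}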
 
 void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
 
        BUG_ON(!tm);
 
-       spin_lock(&fs_info->tree_mod_seq_lock);
-       tm->seq = btrfs_inc_tree_mod_seq_minor(fs_info);
-       spin_unlock(&fs_info->tree_mod_seq_lock);
+       tm->seq = btrfs_inc_tree_mod_seq(fs_info);
 
        tm_root = &fs_info->tree_mod_log;
        new = &tm_root->rb_node;
 
 
        /* holds configuration and tracking. Protected by qgroup_lock */
        struct rb_root qgroup_tree;
+       struct rb_root qgroup_op_tree;
        spinlock_t qgroup_lock;
+       spinlock_t qgroup_op_lock;
+       atomic_t qgroup_op_seq;
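
qgroup.h itself is not part of this excerpt, so as a reading aid here is a reconstruction of the record these new fields manage, pieced together from the call sites later in the patch (field names taken from the usage in qgroup.c; treat this as a sketch, not the header's text):

	/* Reconstructed from usage below; the real definition lives in qgroup.h. */
	enum btrfs_qgroup_operation_type {
		BTRFS_QGROUP_OPER_ADD_EXCL,
		BTRFS_QGROUP_OPER_ADD_SHARED,
		BTRFS_QGROUP_OPER_SUB_EXCL,
		BTRFS_QGROUP_OPER_SUB_SHARED,
	};

	struct btrfs_qgroup_operation {
		u64 ref_root;			/* root the ref acts on */
		u64 bytenr;			/* extent being referenced */
		u64 num_bytes;
		u64 seq;			/* from fs_info->qgroup_op_seq */
		enum btrfs_qgroup_operation_type type;
		struct seq_list elem;		/* tree mod seq blocker, if mod_seq */
		struct rb_node n;		/* node in fs_info->qgroup_op_tree */
		struct list_head list;		/* entry in trans->qgroup_ref_list */
	};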
 
        /*
         * used to avoid frequently calling ulist_alloc()/ulist_free()
                         u64 min_alloc_size, u64 empty_size, u64 hint_byte,
                         struct btrfs_key *ins, int is_data);
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-                 struct extent_buffer *buf, int full_backref, int for_cow);
+                 struct extent_buffer *buf, int full_backref, int no_quota);
 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-                 struct extent_buffer *buf, int full_backref, int for_cow);
+                 struct extent_buffer *buf, int full_backref, int no_quota);
 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root,
                                u64 bytenr, u64 num_bytes, u64 flags,
 int btrfs_free_extent(struct btrfs_trans_handle *trans,
                      struct btrfs_root *root,
                      u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
-                     u64 owner, u64 offset, int for_cow);
+                     u64 owner, u64 offset, int no_quota);
 
 int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
 int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
                         struct btrfs_root *root,
                         u64 bytenr, u64 num_bytes, u64 parent,
-                        u64 root_objectid, u64 owner, u64 offset, int for_cow);
+                        u64 root_objectid, u64 owner, u64 offset, int no_quota);
 
 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
                                    struct btrfs_root *root);
 int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
                                         struct btrfs_fs_info *fs_info);
 int __get_raid_index(u64 flags);
-
 int btrfs_start_nocow_write(struct btrfs_root *root);
 void btrfs_end_nocow_write(struct btrfs_root *root);
 /* ctree.c */
                           struct seq_list *elem);
 void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
                            struct seq_list *elem);
-u64 btrfs_tree_mod_seq_prev(u64 seq);
 int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq);
 
 /* root-item.c */
 int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
                         u64 start, int err);
 
-/* qgroup.c */
-struct qgroup_update {
-       struct list_head list;
-       struct btrfs_delayed_ref_node *node;
-       struct btrfs_delayed_extent_op *extent_op;
-};
-
-int btrfs_quota_enable(struct btrfs_trans_handle *trans,
-                      struct btrfs_fs_info *fs_info);
-int btrfs_quota_disable(struct btrfs_trans_handle *trans,
-                       struct btrfs_fs_info *fs_info);
-int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info);
-void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info);
-int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info);
-int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
-                             struct btrfs_fs_info *fs_info, u64 src, u64 dst);
-int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
-                             struct btrfs_fs_info *fs_info, u64 src, u64 dst);
-int btrfs_create_qgroup(struct btrfs_trans_handle *trans,
-                       struct btrfs_fs_info *fs_info, u64 qgroupid,
-                       char *name);
-int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
-                             struct btrfs_fs_info *fs_info, u64 qgroupid);
-int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
-                      struct btrfs_fs_info *fs_info, u64 qgroupid,
-                      struct btrfs_qgroup_limit *limit);
-int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info);
-void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info);
-struct btrfs_delayed_extent_op;
-int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
-                           struct btrfs_delayed_ref_node *node,
-                           struct btrfs_delayed_extent_op *extent_op);
-int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
-                            struct btrfs_fs_info *fs_info,
-                            struct btrfs_delayed_ref_node *node,
-                            struct btrfs_delayed_extent_op *extent_op);
-int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
-                     struct btrfs_fs_info *fs_info);
-int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
-                        struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid,
-                        struct btrfs_qgroup_inherit *inherit);
-int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes);
-void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes);
-
-void assert_qgroups_uptodate(struct btrfs_trans_handle *trans);
-
 static inline int is_fstree(u64 rootid)
 {
        if (rootid == BTRFS_FS_TREE_OBJECTID ||
 
                return -1;
        if (ref1->type > ref2->type)
                return 1;
+       if (ref1->no_quota > ref2->no_quota)
+               return 1;
+       if (ref1->no_quota < ref2->no_quota)
+               return -1;
        /* merging of sequenced refs is not allowed */
        if (compare_seq) {
                if (ref1->seq < ref2->seq)
                     struct btrfs_delayed_ref_head *head_ref,
                     struct btrfs_delayed_ref_node *ref, u64 bytenr,
                     u64 num_bytes, u64 parent, u64 ref_root, int level,
-                    int action, int for_cow)
+                    int action, int no_quota)
 {
        struct btrfs_delayed_ref_node *existing;
        struct btrfs_delayed_tree_ref *full_ref;
        if (action == BTRFS_ADD_DELAYED_EXTENT)
                action = BTRFS_ADD_DELAYED_REF;
 
+       if (is_fstree(ref_root))
+               seq = atomic64_read(&fs_info->tree_mod_seq);
        delayed_refs = &trans->transaction->delayed_refs;
 
        /* first set the basic ref node struct up */
        ref->action = action;
        ref->is_head = 0;
        ref->in_tree = 1;
-
-       if (need_ref_seq(for_cow, ref_root))
-               seq = btrfs_get_tree_mod_seq(fs_info, &trans->delayed_ref_elem);
+       ref->no_quota = no_quota;
        ref->seq = seq;
 
        full_ref = btrfs_delayed_node_to_tree_ref(ref);
                     struct btrfs_delayed_ref_head *head_ref,
                     struct btrfs_delayed_ref_node *ref, u64 bytenr,
                     u64 num_bytes, u64 parent, u64 ref_root, u64 owner,
-                    u64 offset, int action, int for_cow)
+                    u64 offset, int action, int no_quota)
 {
        struct btrfs_delayed_ref_node *existing;
        struct btrfs_delayed_data_ref *full_ref;
 
        delayed_refs = &trans->transaction->delayed_refs;
 
+       if (is_fstree(ref_root))
+               seq = atomic64_read(&fs_info->tree_mod_seq);
+
        /* first set the basic ref node struct up */
        atomic_set(&ref->refs, 1);
        ref->bytenr = bytenr;
        ref->action = action;
        ref->is_head = 0;
        ref->in_tree = 1;
-
-       if (need_ref_seq(for_cow, ref_root))
-               seq = btrfs_get_tree_mod_seq(fs_info, &trans->delayed_ref_elem);
+       ref->no_quota = no_quota;
        ref->seq = seq;
 
        full_ref = btrfs_delayed_node_to_data_ref(ref);
                               u64 bytenr, u64 num_bytes, u64 parent,
                               u64 ref_root,  int level, int action,
                               struct btrfs_delayed_extent_op *extent_op,
-                              int for_cow)
+                              int no_quota)
 {
        struct btrfs_delayed_tree_ref *ref;
        struct btrfs_delayed_ref_head *head_ref;
        struct btrfs_delayed_ref_root *delayed_refs;
 
+       if (!is_fstree(ref_root) || !fs_info->quota_enabled)
+               no_quota = 0;
+
        BUG_ON(extent_op && extent_op->is_data);
        ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS);
        if (!ref)
 
        add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr,
                                   num_bytes, parent, ref_root, level, action,
-                                  for_cow);
+                                  no_quota);
        spin_unlock(&delayed_refs->lock);
-       if (need_ref_seq(for_cow, ref_root))
-               btrfs_qgroup_record_ref(trans, &ref->node, extent_op);
 
        return 0;
 }
                               u64 parent, u64 ref_root,
                               u64 owner, u64 offset, int action,
                               struct btrfs_delayed_extent_op *extent_op,
-                              int for_cow)
+                              int no_quota)
 {
        struct btrfs_delayed_data_ref *ref;
        struct btrfs_delayed_ref_head *head_ref;
        struct btrfs_delayed_ref_root *delayed_refs;
 
+       if (!is_fstree(ref_root) || !fs_info->quota_enabled)
+               no_quota = 0;
+
        BUG_ON(extent_op && !extent_op->is_data);
        ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS);
        if (!ref)
 
        add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr,
                                   num_bytes, parent, ref_root, owner, offset,
-                                  action, for_cow);
+                                  action, no_quota);
        spin_unlock(&delayed_refs->lock);
-       if (need_ref_seq(for_cow, ref_root))
-               btrfs_qgroup_record_ref(trans, &ref->node, extent_op);
 
        return 0;
 }
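
Note the atomic64_read() snapshot above: fs-tree refs keep a sequence number so that backref walks holding an older tree mod seq blocker see a stable view. A rough sketch of the gating on the processing side, assuming the btrfs_check_delayed_seq() helper whose declaration tail is visible in the delayed-ref.h hunk below:

	/* Sketch: a sequenced ref is held back while an older blocker is live. */
	if (ref->seq &&
	    btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
		/* leave the ref queued and retry it on a later pass */
	}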
 
 
        unsigned int action:8;
        unsigned int type:8;
+       unsigned int no_quota:1;
        /* is this node still in the rbtree? */
        unsigned int is_head:1;
        unsigned int in_tree:1;
                               u64 bytenr, u64 num_bytes, u64 parent,
                               u64 ref_root, int level, int action,
                               struct btrfs_delayed_extent_op *extent_op,
-                              int for_cow);
+                              int no_quota);
 int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
                               struct btrfs_trans_handle *trans,
                               u64 bytenr, u64 num_bytes,
                               u64 parent, u64 ref_root,
                               u64 owner, u64 offset, int action,
                               struct btrfs_delayed_extent_op *extent_op,
-                              int for_cow);
+                              int no_quota);
 int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
                                struct btrfs_trans_handle *trans,
                                u64 bytenr, u64 num_bytes,
                            struct btrfs_delayed_ref_root *delayed_refs,
                            u64 seq);
 
-/*
- * delayed refs with a ref_seq > 0 must be held back during backref walking.
- * this only applies to items in one of the fs-trees. for_cow items never need
- * to be held back, so they won't get a ref_seq number.
- */
-static inline int need_ref_seq(int for_cow, u64 rootid)
-{
-       if (for_cow)
-               return 0;
-
-       if (rootid == BTRFS_FS_TREE_OBJECTID)
-               return 1;
-
-       if ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID)
-               return 1;
-
-       return 0;
-}
-
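
The rootid tests that need_ref_seq performed live on in is_fstree(), whose opening lines appear as context in the ctree.h hunk above; its full definition amounts to this sketch:

	/* Sketch of is_fstree(), the check that subsumes need_ref_seq's rootid test. */
	static inline int is_fstree(u64 rootid)
	{
		if (rootid == BTRFS_FS_TREE_OBJECTID ||
		    (s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID)
			return 1;
		return 0;
	}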
 /*
  * a node might live in a head or a regular ref, this lets you
  * test for the proper type to use.
 
 #include "dev-replace.h"
 #include "raid56.h"
 #include "sysfs.h"
+#include "qgroup.h"
 
 #ifdef CONFIG_X86
 #include <asm/cpufeature.h>
        spin_lock_init(&fs_info->free_chunk_lock);
        spin_lock_init(&fs_info->tree_mod_seq_lock);
        spin_lock_init(&fs_info->super_lock);
+       spin_lock_init(&fs_info->qgroup_op_lock);
        spin_lock_init(&fs_info->buffer_lock);
        rwlock_init(&fs_info->tree_mod_log_lock);
        mutex_init(&fs_info->reloc_mutex);
        atomic_set(&fs_info->async_submit_draining, 0);
        atomic_set(&fs_info->nr_async_bios, 0);
        atomic_set(&fs_info->defrag_running, 0);
+       atomic_set(&fs_info->qgroup_op_seq, 0);
        atomic64_set(&fs_info->tree_mod_seq, 0);
        fs_info->sb = sb;
        fs_info->max_inline = 8192 * 1024;
        spin_lock_init(&fs_info->qgroup_lock);
        mutex_init(&fs_info->qgroup_ioctl_lock);
        fs_info->qgroup_tree = RB_ROOT;
+       fs_info->qgroup_op_tree = RB_ROOT;
        INIT_LIST_HEAD(&fs_info->dirty_qgroups);
        fs_info->qgroup_seq = 1;
        fs_info->quota_enabled = 0;
 
 #include "free-space-cache.h"
 #include "math.h"
 #include "sysfs.h"
+#include "qgroup.h"
 
 #undef SCRAMBLE_DELAYED_REFS
 
                                u64 bytenr, u64 num_bytes, u64 parent,
                                u64 root_objectid, u64 owner_objectid,
                                u64 owner_offset, int refs_to_drop,
-                               struct btrfs_delayed_extent_op *extra_op);
+                               struct btrfs_delayed_extent_op *extra_op,
+                               int no_quota);
 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
                                    struct extent_buffer *leaf,
                                    struct btrfs_extent_item *ei);
                                     struct btrfs_root *root,
                                     u64 parent, u64 root_objectid,
                                     u64 flags, struct btrfs_disk_key *key,
-                                    int level, struct btrfs_key *ins);
+                                    int level, struct btrfs_key *ins,
+                                    int no_quota);
 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
                          struct btrfs_root *extent_root, u64 flags,
                          int force);
 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
                                           struct btrfs_root *root,
                                           struct btrfs_path *path,
-                                          int refs_to_drop)
+                                          int refs_to_drop, int *last_ref)
 {
        struct btrfs_key key;
        struct btrfs_extent_data_ref *ref1 = NULL;
 
        if (num_refs == 0) {
                ret = btrfs_del_item(trans, root, path);
+               *last_ref = 1;
        } else {
                if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
                        btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
                                  struct btrfs_path *path,
                                  struct btrfs_extent_inline_ref *iref,
                                  int refs_to_mod,
-                                 struct btrfs_delayed_extent_op *extent_op)
+                                 struct btrfs_delayed_extent_op *extent_op,
+                                 int *last_ref)
 {
        struct extent_buffer *leaf;
        struct btrfs_extent_item *ei;
                else
                        btrfs_set_shared_data_ref_count(leaf, sref, refs);
        } else {
+               *last_ref = 1;
                size =  btrfs_extent_inline_ref_size(type);
                item_size = btrfs_item_size_nr(leaf, path->slots[0]);
                ptr = (unsigned long)iref;
        if (ret == 0) {
                BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
                update_inline_extent_backref(root, path, iref,
-                                            refs_to_add, extent_op);
+                                            refs_to_add, extent_op, NULL);
        } else if (ret == -ENOENT) {
                setup_inline_extent_backref(root, path, iref, parent,
                                            root_objectid, owner, offset,
                                 struct btrfs_root *root,
                                 struct btrfs_path *path,
                                 struct btrfs_extent_inline_ref *iref,
-                                int refs_to_drop, int is_data)
+                                int refs_to_drop, int is_data, int *last_ref)
 {
        int ret = 0;
 
        BUG_ON(!is_data && refs_to_drop != 1);
        if (iref) {
                update_inline_extent_backref(root, path, iref,
-                                            -refs_to_drop, NULL);
+                                            -refs_to_drop, NULL, last_ref);
        } else if (is_data) {
-               ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
+               ret = remove_extent_data_ref(trans, root, path, refs_to_drop,
+                                            last_ref);
        } else {
+               *last_ref = 1;
                ret = btrfs_del_item(trans, root, path);
        }
        return ret;
 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
                         struct btrfs_root *root,
                         u64 bytenr, u64 num_bytes, u64 parent,
-                        u64 root_objectid, u64 owner, u64 offset, int for_cow)
+                        u64 root_objectid, u64 owner, u64 offset,
+                        int no_quota)
 {
        int ret;
        struct btrfs_fs_info *fs_info = root->fs_info;
                ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
                                        num_bytes,
                                        parent, root_objectid, (int)owner,
-                                       BTRFS_ADD_DELAYED_REF, NULL, for_cow);
+                                       BTRFS_ADD_DELAYED_REF, NULL, no_quota);
        } else {
                ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
                                        num_bytes,
                                        parent, root_objectid, owner, offset,
-                                       BTRFS_ADD_DELAYED_REF, NULL, for_cow);
+                                       BTRFS_ADD_DELAYED_REF, NULL, no_quota);
        }
        return ret;
 }
                                  u64 bytenr, u64 num_bytes,
                                  u64 parent, u64 root_objectid,
                                  u64 owner, u64 offset, int refs_to_add,
+                                 int no_quota,
                                  struct btrfs_delayed_extent_op *extent_op)
 {
+       struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_path *path;
        struct extent_buffer *leaf;
        struct btrfs_extent_item *item;
+       struct btrfs_key key;
        u64 refs;
        int ret;
+       enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_ADD_EXCL;
 
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
 
+       if (!is_fstree(root_objectid) || !root->fs_info->quota_enabled)
+               no_quota = 1;
+
        path->reada = 1;
        path->leave_spinning = 1;
        /* this will setup the path even if it fails to insert the back ref */
-       ret = insert_inline_extent_backref(trans, root->fs_info->extent_root,
-                                          path, bytenr, num_bytes, parent,
+       ret = insert_inline_extent_backref(trans, fs_info->extent_root, path,
+                                          bytenr, num_bytes, parent,
                                           root_objectid, owner, offset,
                                           refs_to_add, extent_op);
-       if (ret != -EAGAIN)
+       if ((ret < 0 && ret != -EAGAIN) || (!ret && no_quota))
                goto out;
+       /*
+        * Ok we were able to insert an inline extent ref and it appears to be
+        * a new reference, so deal with the qgroup accounting.
+        */
+       if (!ret && !no_quota) {
+               ASSERT(root->fs_info->quota_enabled);
+               leaf = path->nodes[0];
+               btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+               item = btrfs_item_ptr(leaf, path->slots[0],
+                                     struct btrfs_extent_item);
+               if (btrfs_extent_refs(leaf, item) > (u64)refs_to_add)
+                       type = BTRFS_QGROUP_OPER_ADD_SHARED;
+               btrfs_release_path(path);
 
+               ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
+                                             bytenr, num_bytes, type, 0);
+               goto out;
+       }
+
+       /*
+        * Ok we had -EAGAIN, which means we didn't have space to insert an
+        * inline extent ref, so just update the reference count and add a
+        * normal backref.
+        */
        leaf = path->nodes[0];
+       btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
        item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
        refs = btrfs_extent_refs(leaf, item);
+       if (refs)
+               type = BTRFS_QGROUP_OPER_ADD_SHARED;
        btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
        if (extent_op)
                __run_delayed_extent_op(extent_op, leaf, item);
        btrfs_mark_buffer_dirty(leaf);
        btrfs_release_path(path);
 
+       if (!no_quota) {
+               ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
+                                             bytenr, num_bytes, type, 0);
+               if (ret)
+                       goto out;
+       }
+
        path->reada = 1;
        path->leave_spinning = 1;
-
        /* now insert the actual backref */
        ret = insert_extent_backref(trans, root->fs_info->extent_root,
                                    path, bytenr, parent, root_objectid,
 
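
Restating the rule used in both qgroup paths above: if anyone else already held a reference (a pre-existing refcount that is non-zero, equivalently a post-update count greater than refs_to_add), the op is recorded as ADD_SHARED; only a brand-new extent reference is ADD_EXCL. A hypothetical helper capturing that test:

	/* Hypothetical restatement of the EXCL vs SHARED decision above. */
	static enum btrfs_qgroup_operation_type
	qgroup_add_type(u64 refs_before_update)
	{
		return refs_before_update ? BTRFS_QGROUP_OPER_ADD_SHARED :
					    BTRFS_QGROUP_OPER_ADD_EXCL;
	}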
        if (node->type == BTRFS_SHARED_DATA_REF_KEY)
                parent = ref->parent;
-       else
-               ref_root = ref->root;
+       ref_root = ref->root;
 
        if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
                if (extent_op)
                                             node->num_bytes, parent,
                                             ref_root, ref->objectid,
                                             ref->offset, node->ref_mod,
-                                            extent_op);
+                                            node->no_quota, extent_op);
        } else if (node->action == BTRFS_DROP_DELAYED_REF) {
                ret = __btrfs_free_extent(trans, root, node->bytenr,
                                          node->num_bytes, parent,
                                          ref_root, ref->objectid,
                                          ref->offset, node->ref_mod,
-                                         extent_op);
+                                         extent_op, node->no_quota);
        } else {
                BUG();
        }
 
        if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
                parent = ref->parent;
-       else
-               ref_root = ref->root;
+       ref_root = ref->root;
 
        ins.objectid = node->bytenr;
        if (skinny_metadata) {
                                                parent, ref_root,
                                                extent_op->flags_to_set,
                                                &extent_op->key,
-                                               ref->level, &ins);
+                                               ref->level, &ins,
+                                               node->no_quota);
        } else if (node->action == BTRFS_ADD_DELAYED_REF) {
                ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
                                             node->num_bytes, parent, ref_root,
-                                            ref->level, 0, 1, extent_op);
+                                            ref->level, 0, 1, node->no_quota,
+                                            extent_op);
        } else if (node->action == BTRFS_DROP_DELAYED_REF) {
                ret = __btrfs_free_extent(trans, root, node->bytenr,
                                          node->num_bytes, parent, ref_root,
-                                         ref->level, 0, 1, extent_op);
+                                         ref->level, 0, 1, extent_op,
+                                         node->no_quota);
        } else {
                BUG();
        }
 }
 #endif
 
-int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
-                                        struct btrfs_fs_info *fs_info)
-{
-       struct qgroup_update *qgroup_update;
-       int ret = 0;
-
-       if (list_empty(&trans->qgroup_ref_list) !=
-           !trans->delayed_ref_elem.seq) {
-               /* list without seq or seq without list */
-               btrfs_err(fs_info,
-                       "qgroup accounting update error, list is%s empty, seq is %#x.%x",
-                       list_empty(&trans->qgroup_ref_list) ? "" : " not",
-                       (u32)(trans->delayed_ref_elem.seq >> 32),
-                       (u32)trans->delayed_ref_elem.seq);
-               BUG();
-       }
-
-       if (!trans->delayed_ref_elem.seq)
-               return 0;
-
-       while (!list_empty(&trans->qgroup_ref_list)) {
-               qgroup_update = list_first_entry(&trans->qgroup_ref_list,
-                                                struct qgroup_update, list);
-               list_del(&qgroup_update->list);
-               if (!ret)
-                       ret = btrfs_qgroup_account_ref(
-                                       trans, fs_info, qgroup_update->node,
-                                       qgroup_update->extent_op);
-               kfree(qgroup_update);
-       }
-
-       btrfs_put_tree_mod_seq(fs_info, &trans->delayed_ref_elem);
-
-       return ret;
-}
-
 static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
 {
        u64 num_bytes;
        if (root == root->fs_info->extent_root)
                root = root->fs_info->tree_root;
 
-       btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
-
        delayed_refs = &trans->transaction->delayed_refs;
        if (count == 0) {
                count = atomic_read(&delayed_refs->num_entries) * 2;
                goto again;
        }
 out:
+       ret = btrfs_delayed_qgroup_accounting(trans, root->fs_info);
+       if (ret)
+               return ret;
        assert_qgroups_uptodate(trans);
        return 0;
 }
 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root,
                           struct extent_buffer *buf,
-                          int full_backref, int inc, int for_cow)
+                          int full_backref, int inc, int no_quota)
 {
        u64 bytenr;
        u64 num_bytes;
                        key.offset -= btrfs_file_extent_offset(buf, fi);
                        ret = process_func(trans, root, bytenr, num_bytes,
                                           parent, ref_root, key.objectid,
-                                          key.offset, for_cow);
+                                          key.offset, no_quota);
                        if (ret)
                                goto fail;
                } else {
                        num_bytes = btrfs_level_size(root, level - 1);
                        ret = process_func(trans, root, bytenr, num_bytes,
                                           parent, ref_root, level - 1, 0,
-                                          for_cow);
+                                          no_quota);
                        if (ret)
                                goto fail;
                }
 }
 
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-                 struct extent_buffer *buf, int full_backref, int for_cow)
+                 struct extent_buffer *buf, int full_backref, int no_quota)
 {
-       return __btrfs_mod_ref(trans, root, buf, full_backref, 1, for_cow);
+       return __btrfs_mod_ref(trans, root, buf, full_backref, 1, no_quota);
 }
 
 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-                 struct extent_buffer *buf, int full_backref, int for_cow)
+                 struct extent_buffer *buf, int full_backref, int no_quota)
 {
-       return __btrfs_mod_ref(trans, root, buf, full_backref, 0, for_cow);
+       return __btrfs_mod_ref(trans, root, buf, full_backref, 0, no_quota);
 }
 
 static int write_one_cache_group(struct btrfs_trans_handle *trans,
                                u64 bytenr, u64 num_bytes, u64 parent,
                                u64 root_objectid, u64 owner_objectid,
                                u64 owner_offset, int refs_to_drop,
-                               struct btrfs_delayed_extent_op *extent_op)
+                               struct btrfs_delayed_extent_op *extent_op,
+                               int no_quota)
 {
        struct btrfs_key key;
        struct btrfs_path *path;
        int num_to_del = 1;
        u32 item_size;
        u64 refs;
+       int last_ref = 0;
+       enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_SUB_EXCL;
        bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
                                                 SKINNY_METADATA);
 
+       if (!info->quota_enabled || !is_fstree(root_objectid))
+               no_quota = 1;
+
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
                        BUG_ON(iref);
                        ret = remove_extent_backref(trans, extent_root, path,
                                                    NULL, refs_to_drop,
-                                                   is_data);
+                                                   is_data, &last_ref);
                        if (ret) {
                                btrfs_abort_transaction(trans, extent_root, ret);
                                goto out;
        refs -= refs_to_drop;
 
        if (refs > 0) {
+               type = BTRFS_QGROUP_OPER_SUB_SHARED;
                if (extent_op)
                        __run_delayed_extent_op(extent_op, leaf, ei);
                /*
                if (found_extent) {
                        ret = remove_extent_backref(trans, extent_root, path,
                                                    iref, refs_to_drop,
-                                                   is_data);
+                                                   is_data, &last_ref);
                        if (ret) {
                                btrfs_abort_transaction(trans, extent_root, ret);
                                goto out;
                        }
                }
 
+               last_ref = 1;
                ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
                                      num_to_del);
                if (ret) {
                        goto out;
                }
        }
+       btrfs_release_path(path);
+
+       /* Deal with the quota accounting */
+       if (!ret && last_ref && !no_quota) {
+               int mod_seq = 0;
+
+               if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID &&
+                   type == BTRFS_QGROUP_OPER_SUB_SHARED)
+                       mod_seq = 1;
+
+               ret = btrfs_qgroup_record_ref(trans, info, root_objectid,
+                                             bytenr, num_bytes, type,
+                                             mod_seq);
+       }
 out:
        btrfs_free_path(path);
        return ret;
 /* Can return -ENOMEM */
 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                      u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
-                     u64 owner, u64 offset, int for_cow)
+                     u64 owner, u64 offset, int no_quota)
 {
        int ret;
        struct btrfs_fs_info *fs_info = root->fs_info;
                ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
                                        num_bytes,
                                        parent, root_objectid, (int)owner,
-                                       BTRFS_DROP_DELAYED_REF, NULL, for_cow);
+                                       BTRFS_DROP_DELAYED_REF, NULL, no_quota);
        } else {
                ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
                                                num_bytes,
                                                parent, root_objectid, owner,
                                                offset, BTRFS_DROP_DELAYED_REF,
-                                               NULL, for_cow);
+                                               NULL, no_quota);
        }
        return ret;
 }
        btrfs_mark_buffer_dirty(path->nodes[0]);
        btrfs_free_path(path);
 
+       /* Always set parent to 0 here since it's exclusive anyway. */
+       ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
+                                     ins->objectid, ins->offset,
+                                     BTRFS_QGROUP_OPER_ADD_EXCL, 0);
+       if (ret)
+               return ret;
+
        ret = update_block_group(root, ins->objectid, ins->offset, 1);
        if (ret) { /* -ENOENT, logic error */
                btrfs_err(fs_info, "update block group failed for %llu %llu",
                                     struct btrfs_root *root,
                                     u64 parent, u64 root_objectid,
                                     u64 flags, struct btrfs_disk_key *key,
-                                    int level, struct btrfs_key *ins)
+                                    int level, struct btrfs_key *ins,
+                                    int no_quota)
 {
        int ret;
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_path *path;
        struct extent_buffer *leaf;
        u32 size = sizeof(*extent_item) + sizeof(*iref);
+       u64 num_bytes = ins->offset;
        bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
                                                 SKINNY_METADATA);
 
 
        if (skinny_metadata) {
                iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
+               num_bytes = root->leafsize;
        } else {
                block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
                btrfs_set_tree_block_key(leaf, block_info, key);
        btrfs_mark_buffer_dirty(leaf);
        btrfs_free_path(path);
 
+       if (!no_quota) {
+               ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
+                                             ins->objectid, num_bytes,
+                                             BTRFS_QGROUP_OPER_ADD_EXCL, 0);
+               if (ret)
+                       return ret;
+       }
+
        ret = update_block_group(root, ins->objectid, root->leafsize, 1);
        if (ret) { /* -ENOENT, logic error */
                btrfs_err(fs_info, "update block group failed for %llu %llu",
 
 #include "tree-log.h"
 #include "locking.h"
 #include "volumes.h"
+#include "qgroup.h"
 
 static struct kmem_cache *btrfs_inode_defrag_cachep;
 /*
                                                disk_bytenr, num_bytes, 0,
                                                root->root_key.objectid,
                                                new_key.objectid,
-                                               start - extent_offset, 0);
+                                               start - extent_offset, 1);
                                BUG_ON(ret); /* -ENOMEM */
                        }
                        key.offset = start;
 
                ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
                                           root->root_key.objectid,
-                                          ino, orig_offset, 0);
+                                          ino, orig_offset, 1);
                BUG_ON(ret); /* -ENOMEM */
 
                if (split == start) {
 
 #include "dev-replace.h"
 #include "props.h"
 #include "sysfs.h"
+#include "qgroup.h"
 
 #ifdef CONFIG_64BIT
 /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI
        return ret;
 }
 
+/*
+ * Helper to check whether this root currently has a ref on the given disk
+ * bytenr.  If it does then we need to update the quota for this root.  This
+ * doesn't do anything if quotas aren't enabled.
+ */
+static int check_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+                    u64 disko)
+{
+       struct seq_list tree_mod_seq_elem = {};
+       struct ulist *roots;
+       struct ulist_iterator uiter;
+       struct ulist_node *root_node = NULL;
+       int ret;
+
+       if (!root->fs_info->quota_enabled)
+               return 1;
+
+       btrfs_get_tree_mod_seq(root->fs_info, &tree_mod_seq_elem);
+       ret = btrfs_find_all_roots(trans, root->fs_info, disko,
+                                  tree_mod_seq_elem.seq, &roots);
+       if (ret < 0)
+               goto out;
+       ret = 0;
+       ULIST_ITER_INIT(&uiter);
+       while ((root_node = ulist_next(roots, &uiter))) {
+               if (root_node->val == root->objectid) {
+                       ret = 1;
+                       break;
+               }
+       }
+       ulist_free(roots);
+out:
+       btrfs_put_tree_mod_seq(root->fs_info, &tree_mod_seq_elem);
+       return ret;
+}
+
 /**
  * btrfs_clone() - clone a range from inode file to another
  *
        u32 nritems;
        int slot;
        int ret;
+       int no_quota;
        u64 len = olen_aligned;
+       u64 last_disko = 0;
 
        ret = -ENOMEM;
        buf = vmalloc(btrfs_level_size(root, 0));
 
                nritems = btrfs_header_nritems(path->nodes[0]);
 process_slot:
+               no_quota = 1;
                if (path->slots[0] >= nritems) {
                        ret = btrfs_next_leaf(BTRFS_I(src)->root, path);
                        if (ret < 0)
                                                             datao);
                                btrfs_set_file_extent_num_bytes(leaf, extent,
                                                                datal);
+
+                               /*
+                                * We need to look up the roots that point at
+                                * this bytenr and see if the new root does.  If
+                                * it does not we need to make sure we update
+                                * quotas appropriately.
+                                */
+                               if (disko && root != BTRFS_I(src)->root &&
+                                   disko != last_disko) {
+                                       no_quota = check_ref(trans, root,
+                                                            disko);
+                                       if (no_quota < 0) {
+                                               btrfs_abort_transaction(trans,
+                                                                       root,
+                                                                       no_quota);
+                                               btrfs_end_transaction(trans,
+                                                                     root);
+                                               ret = no_quota;
+                                               goto out;
+                                       }
+                               }
+
                                if (disko) {
                                        inode_add_bytes(inode, datal);
                                        ret = btrfs_inc_extent_ref(trans, root,
                                                        root->root_key.objectid,
                                                        btrfs_ino(inode),
                                                        new_key.offset - datao,
-                                                       0);
+                                                       no_quota);
                                        if (ret) {
                                                btrfs_abort_transaction(trans,
                                                                        root,
 
 #include "ulist.h"
 #include "backref.h"
 #include "extent_io.h"
+#include "qgroup.h"
 
 /* TODO XXX FIXME
  *  - subvol delete -> delete when ref goes to 0? delete limits also?
        /*
         * temp variables for accounting operations
         */
-       u64 tag;
-       u64 refcnt;
+       u64 old_refcnt;
+       u64 new_refcnt;
 };
 
 /*
        struct btrfs_qgroup *member;
 };
 
+#define ptr_to_u64(x) ((u64)(uintptr_t)(x))
+#define u64_to_ptr(x) ((struct btrfs_qgroup *)(uintptr_t)(x))
+
 static int
 qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
                   int init_flags);
        mutex_unlock(&fs_info->qgroup_ioctl_lock);
        return ret;
 }
+
+static int comp_oper(struct btrfs_qgroup_operation *oper1,
+                    struct btrfs_qgroup_operation *oper2)
+{
+       if (oper1->bytenr < oper2->bytenr)
+               return -1;
+       if (oper1->bytenr > oper2->bytenr)
+               return 1;
+       if (oper1->seq < oper2->seq)
+               return -1;
+       if (oper1->seq > oper2->seq)
+               return 1;
+       if (oper1->ref_root < oper2->ref_root)
+               return -1;
+       if (oper1->ref_root > oper2->ref_root)
+               return 1;
+       if (oper1->type < oper2->type)
+               return -1;
+       if (oper1->type > oper2->type)
+               return 1;
+       return 0;
+}
+
+static int insert_qgroup_oper(struct btrfs_fs_info *fs_info,
+                             struct btrfs_qgroup_operation *oper)
+{
+       struct rb_node **p;
+       struct rb_node *parent = NULL;
+       struct btrfs_qgroup_operation *cur;
+       int cmp;
+
+       spin_lock(&fs_info->qgroup_op_lock);
+       p = &fs_info->qgroup_op_tree.rb_node;
+       while (*p) {
+               parent = *p;
+               cur = rb_entry(parent, struct btrfs_qgroup_operation, n);
+               cmp = comp_oper(cur, oper);
+               if (cmp < 0) {
+                       p = &(*p)->rb_right;
+               } else if (cmp) {
+                       p = &(*p)->rb_left;
+               } else {
+                       spin_unlock(&fs_info->qgroup_op_lock);
+                       return -EEXIST;
+               }
+       }
+       rb_link_node(&oper->n, parent, p);
+       rb_insert_color(&oper->n, &fs_info->qgroup_op_tree);
+       spin_unlock(&fs_info->qgroup_op_lock);
+       return 0;
+}
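
For symmetry, the removal side presumably reverses this under the same lock once an operation has been accounted (the accounting path is not part of this excerpt, so this is a sketch):

	/* Hypothetical removal counterpart to insert_qgroup_oper(). */
	static void remove_qgroup_oper(struct btrfs_fs_info *fs_info,
				       struct btrfs_qgroup_operation *oper)
	{
		spin_lock(&fs_info->qgroup_op_lock);
		rb_erase(&oper->n, &fs_info->qgroup_op_tree);
		spin_unlock(&fs_info->qgroup_op_lock);
	}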
 
 /*
- * btrfs_qgroup_record_ref is called when the ref is added or deleted. it puts
- * the modification into a list that's later used by btrfs_end_transaction to
- * pass the recorded modifications on to btrfs_qgroup_account_ref.
+ * Record a quota operation for processing later on.
+ * @trans: the transaction we are adding the delayed op to.
+ * @fs_info: the fs_info for this fs.
+ * @ref_root: the root of the reference we are acting on.
+ * @bytenr: the bytenr we are acting on.
+ * @num_bytes: the number of bytes in the reference.
+ * @type: the type of operation this is.
+ * @mod_seq: whether we need to get a sequence number for looking up roots.
+ *
+ * We just add it to our trans qgroup_ref_list and carry on; these operations
+ * are processed in order at some later point.  If the reference root isn't a
+ * fs root then we don't bother doing anything.
+ *
+ * MUST BE HOLDING THE REF LOCK.
  */
 int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
-                           struct btrfs_delayed_ref_node *node,
-                           struct btrfs_delayed_extent_op *extent_op)
+                           struct btrfs_fs_info *fs_info, u64 ref_root,
+                           u64 bytenr, u64 num_bytes,
+                           enum btrfs_qgroup_operation_type type, int mod_seq)
 {
-       struct qgroup_update *u;
+       struct btrfs_qgroup_operation *oper;
+       int ret;
 
-       BUG_ON(!trans->delayed_ref_elem.seq);
-       u = kmalloc(sizeof(*u), GFP_NOFS);
-       if (!u)
+       if (!is_fstree(ref_root) || !fs_info->quota_enabled)
+               return 0;
+
+       oper = kmalloc(sizeof(*oper), GFP_NOFS);
+       if (!oper)
                return -ENOMEM;
 
-       u->node = node;
-       u->extent_op = extent_op;
-       list_add_tail(&u->list, &trans->qgroup_ref_list);
+       oper->ref_root = ref_root;
+       oper->bytenr = bytenr;
+       oper->num_bytes = num_bytes;
+       oper->type = type;
+       oper->seq = atomic_inc_return(&fs_info->qgroup_op_seq);
+       INIT_LIST_HEAD(&oper->elem.list);
+       oper->elem.seq = 0;
+       ret = insert_qgroup_oper(fs_info, oper);
+       if (ret) {
+               /* Shouldn't happen so have an assert for developers */
+               ASSERT(0);
+               kfree(oper);
+               return ret;
+       }
+       list_add_tail(&oper->list, &trans->qgroup_ref_list);
+
+       if (mod_seq)
+               btrfs_get_tree_mod_seq(fs_info, &oper->elem);
 
        return 0;
 }
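
Usage mirrors the extent-tree call sites earlier in the patch; for example, dropping the last shared ref on a data extent records the op and pins a tree mod seq by passing mod_seq = 1:

	/* Sketch, mirroring the __btrfs_free_extent() call site above: */
	ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
				      bytenr, num_bytes,
				      BTRFS_QGROUP_OPER_SUB_SHARED, 1);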
 
-static int qgroup_account_ref_step1(struct btrfs_fs_info *fs_info,
-                                   struct ulist *roots, struct ulist *tmp,
-                                   u64 seq)
+/*
+ * The easy accounting: if we are adding/removing the only ref for an extent
+ * then this qgroup and all of the parent qgroups get their reference and
+ * exclusive counts adjusted.
+ */
+static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
+                                 struct btrfs_qgroup_operation *oper)
+{
+       struct btrfs_qgroup *qgroup;
+       struct ulist *tmp;
+       struct btrfs_qgroup_list *glist;
+       struct ulist_node *unode;
+       struct ulist_iterator uiter;
+       int sign = 0;
+       int ret = 0;
+
+       tmp = ulist_alloc(GFP_NOFS);
+       if (!tmp)
+               return -ENOMEM;
+
+       spin_lock(&fs_info->qgroup_lock);
+       if (!fs_info->quota_root)
+               goto out;
+       qgroup = find_qgroup_rb(fs_info, oper->ref_root);
+       if (!qgroup)
+               goto out;
+       switch (oper->type) {
+       case BTRFS_QGROUP_OPER_ADD_EXCL:
+               sign = 1;
+               break;
+       case BTRFS_QGROUP_OPER_SUB_EXCL:
+               sign = -1;
+               break;
+       default:
+               ASSERT(0);
+       }
+       qgroup->rfer += sign * oper->num_bytes;
+       qgroup->rfer_cmpr += sign * oper->num_bytes;
+
+       WARN_ON(sign < 0 && qgroup->excl < oper->num_bytes);
+       qgroup->excl += sign * oper->num_bytes;
+       qgroup->excl_cmpr += sign * oper->num_bytes;
+
+       qgroup_dirty(fs_info, qgroup);
+
+       /* Get all of the parent groups that contain this qgroup */
+       list_for_each_entry(glist, &qgroup->groups, next_group) {
+               ret = ulist_add(tmp, glist->group->qgroupid,
+                               ptr_to_u64(glist->group), GFP_ATOMIC);
+               if (ret < 0)
+                       goto out;
+       }
+
+       /* Iterate all of the parents and adjust their reference counts */
+       ULIST_ITER_INIT(&uiter);
+       while ((unode = ulist_next(tmp, &uiter))) {
+               qgroup = u64_to_ptr(unode->aux);
+               qgroup->rfer += sign * oper->num_bytes;
+               qgroup->rfer_cmpr += sign * oper->num_bytes;
+               qgroup->excl += sign * oper->num_bytes;
+               if (sign < 0)
+                       WARN_ON(qgroup->excl < oper->num_bytes);
+               qgroup->excl_cmpr += sign * oper->num_bytes;
+               qgroup_dirty(fs_info, qgroup);
+
+               /* Add any parents of the parents */
+               list_for_each_entry(glist, &qgroup->groups, next_group) {
+                       ret = ulist_add(tmp, glist->group->qgroupid,
+                                       ptr_to_u64(glist->group), GFP_ATOMIC);
+                       if (ret < 0)
+                               goto out;
+               }
+       }
+       ret = 0;
+out:
+       spin_unlock(&fs_info->qgroup_lock);
+       ulist_free(tmp);
+       return ret;
+}
+
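
A worked toy example of the arithmetic (hypothetical numbers, plain userspace C rather than kernel code): qgroup 0/257 is a member of 1/100 and gains its first 16 KiB extent, so an ADD_EXCL operation with sign = 1 bumps rfer and excl at both levels.

	#include <assert.h>

	int main(void)
	{
		long long rfer = 0, excl = 0;	/* qgroup 0/257 */
		long long prfer = 0, pexcl = 0;	/* parent qgroup 1/100 */
		int sign = 1;			/* BTRFS_QGROUP_OPER_ADD_EXCL */
		long long num_bytes = 16384;

		/* the owning qgroup... */
		rfer += sign * num_bytes;
		excl += sign * num_bytes;
		/* ...and every parent reached by the ulist walk */
		prfer += sign * num_bytes;
		pexcl += sign * num_bytes;

		assert(rfer == 16384 && excl == 16384);
		assert(prfer == 16384 && pexcl == 16384);
		return 0;
	}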
+/*
+ * Walk all of the roots that pointed to our bytenr and adjust their refcnts
+ * properly.
+ */
+static int qgroup_calc_old_refcnt(struct btrfs_fs_info *fs_info,
+                                 u64 root_to_skip, struct ulist *tmp,
+                                 struct ulist *roots, struct ulist *qgroups,
+                                 u64 seq, int *old_roots, int rescan)
 {
        struct ulist_node *unode;
        struct ulist_iterator uiter;
 
        ULIST_ITER_INIT(&uiter);
        while ((unode = ulist_next(roots, &uiter))) {
+               /* We don't count our current root here */
+               if (unode->val == root_to_skip)
+                       continue;
                qg = find_qgroup_rb(fs_info, unode->val);
                if (!qg)
                        continue;
+               /*
+                * We could have a pending removal of this same ref, so we may
+                * not have actually found our ref root when doing
+                * btrfs_find_all_roots, so we need to keep track of how many
+                * old roots we find in case we removed ours and added a
+                * different one at the same time.  I don't think this could
+                * happen in practice but that sort of thinking leads to pain
+                * and suffering and to the dark side.
+                */
+               (*old_roots)++;
 
                ulist_reinit(tmp);
-                                               /* XXX id not needed */
-               ret = ulist_add(tmp, qg->qgroupid,
-                               (u64)(uintptr_t)qg, GFP_ATOMIC);
+               ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg),
+                               GFP_ATOMIC);
+               if (ret < 0)
+                       return ret;
+               ret = ulist_add(tmp, qg->qgroupid, ptr_to_u64(qg), GFP_ATOMIC);
                if (ret < 0)
                        return ret;
                ULIST_ITER_INIT(&tmp_uiter);
                while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
                        struct btrfs_qgroup_list *glist;
 
-                       qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux;
-                       if (qg->refcnt < seq)
-                               qg->refcnt = seq + 1;
+                       qg = u64_to_ptr(tmp_unode->aux);
+                       /*
+                        * We use this sequence number to keep from having to
+                        * run the whole list and 0 out the refcnt every time.
+                        * We basically use the sequence as the known 0 count and
+                        * then add 1 every time we see a qgroup.  This is how we
+                        * get how many of the roots actually point up to the
+                        * upper level qgroups in order to determine exclusive
+                        * counts.
+                        *
+                        * For rescan we want to set old_refcnt to seq so our
+                        * exclusive calculations end up correct.
+                        */
+                       if (rescan)
+                               qg->old_refcnt = seq;
+                       else if (qg->old_refcnt < seq)
+                               qg->old_refcnt = seq + 1;
                        else
-                               ++qg->refcnt;
+                               qg->old_refcnt++;
 
+                       if (qg->new_refcnt < seq)
+                               qg->new_refcnt = seq + 1;
+                       else
+                               qg->new_refcnt++;
                        list_for_each_entry(glist, &qg->groups, next_group) {
+                               ret = ulist_add(qgroups, glist->group->qgroupid,
+                                               ptr_to_u64(glist->group),
+                                               GFP_ATOMIC);
+                               if (ret < 0)
+                                       return ret;
                                ret = ulist_add(tmp, glist->group->qgroupid,
-                                               (u64)(uintptr_t)glist->group,
+                                               ptr_to_u64(glist->group),
                                                GFP_ATOMIC);
                                if (ret < 0)
                                        return ret;
                        }
                }
        }
+       return 0;
+}
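
The refcnt-versus-seq trick above is easiest to see with numbers; a minimal userspace illustration (hypothetical values, not kernel code) of why nothing ever needs zeroing between runs, since any refcnt <= seq reads as zero:

	#include <assert.h>
	#include <stdint.h>

	/* Same bumping rule as qgroup_calc_old_refcnt() above. */
	static void see_root(uint64_t *refcnt, uint64_t seq)
	{
		if (*refcnt < seq)
			*refcnt = seq + 1;	/* first sighting in this run */
		else
			(*refcnt)++;		/* every later sighting */
	}

	int main(void)
	{
		uint64_t refcnt = 37;	/* stale value left from an earlier run */
		uint64_t seq = 100;	/* this run's baseline */

		see_root(&refcnt, seq);
		see_root(&refcnt, seq);
		assert(refcnt - seq == 2);	/* two roots seen in this run */
		return 0;
	}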
 
+/*
+ * We need to walk forward in our operation tree and account for any roots that
+ * were deleted after we made this operation.
+ */
+static int qgroup_account_deleted_refs(struct btrfs_fs_info *fs_info,
+                                      struct btrfs_qgroup_operation *oper,
+                                      struct ulist *tmp,
+                                      struct ulist *qgroups, u64 seq,
+                                      int *old_roots)
+{
+       struct ulist_node *unode;
+       struct ulist_iterator uiter;
+       struct btrfs_qgroup *qg;
+       struct btrfs_qgroup_operation *tmp_oper;
+       struct rb_node *n;
+       int ret;
+
+       ulist_reinit(tmp);
+
+       /*
+        * We only walk forward in the tree since we're only interested in
+        * removals that happened _after_ our operation.
+        */
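+       /*
+        * Operations are keyed on bytenr first, so rb_next walks the
+        * remaining operations recorded against our bytenr.
+        */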
+       spin_lock(&fs_info->qgroup_op_lock);
+       n = rb_next(&oper->n);
+       spin_unlock(&fs_info->qgroup_op_lock);
+       if (!n)
+               return 0;
+       tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n);
+       while (tmp_oper->bytenr == oper->bytenr) {
+               /*
+                * If it's not a removal we don't care, additions work out
+                * properly with our refcnt tracking.
+                */
+               if (tmp_oper->type != BTRFS_QGROUP_OPER_SUB_SHARED &&
+                   tmp_oper->type != BTRFS_QGROUP_OPER_SUB_EXCL)
+                       goto next;
+               qg = find_qgroup_rb(fs_info, tmp_oper->ref_root);
+               if (!qg)
+                       goto next;
+               ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg),
+                               GFP_ATOMIC);
+               if (ret) {
+                       if (ret < 0)
+                               return ret;
+                       /*
+                        * We only want to increase old_roots if this qgroup is
+                        * not already in the list of qgroups.  If it is already
+                        * there then that means it must have been re-added or
+                        * the delete will be discarded because we had an
+                        * existing ref that we haven't looked up yet.  In this
+                        * case we don't want to increase old_roots.  So if ret
+                        * == 1 then we know that this is the first time we've
+                        * seen this qgroup and we can bump the old_roots.
+                        */
+                       (*old_roots)++;
+                       ret = ulist_add(tmp, qg->qgroupid, ptr_to_u64(qg),
+                                       GFP_ATOMIC);
+                       if (ret < 0)
+                               return ret;
+               }
+next:
+               spin_lock(&fs_info->qgroup_op_lock);
+               n = rb_next(&tmp_oper->n);
+               spin_unlock(&fs_info->qgroup_op_lock);
+               if (!n)
+                       break;
+               tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n);
+       }
+
+       /* Ok now process the qgroups we found */
+       ULIST_ITER_INIT(&uiter);
+       while ((unode = ulist_next(tmp, &uiter))) {
+               struct btrfs_qgroup_list *glist;
+
+               qg = u64_to_ptr(unode->aux);
+               if (qg->old_refcnt < seq)
+                       qg->old_refcnt = seq + 1;
+               else
+                       qg->old_refcnt++;
+               if (qg->new_refcnt < seq)
+                       qg->new_refcnt = seq + 1;
+               else
+                       qg->new_refcnt++;
+               list_for_each_entry(glist, &qg->groups, next_group) {
+                       ret = ulist_add(qgroups, glist->group->qgroupid,
+                                       ptr_to_u64(glist->group), GFP_ATOMIC);
+                       if (ret < 0)
+                               return ret;
+                       ret = ulist_add(tmp, glist->group->qgroupid,
+                                       ptr_to_u64(glist->group), GFP_ATOMIC);
+                       if (ret < 0)
+                               return ret;
+               }
+       }
        return 0;
 }
 
-static int qgroup_account_ref_step2(struct btrfs_fs_info *fs_info,
-                                   struct ulist *roots, struct ulist *tmp,
-                                   u64 seq, int sgn, u64 num_bytes,
-                                   struct btrfs_qgroup *qgroup)
+/*
+ * Update the refcnts for the root this operation acts on: bump new_refcnt
+ * for an addition, old_refcnt for a removal.
+ */
+static int qgroup_calc_new_refcnt(struct btrfs_fs_info *fs_info,
+                                 struct btrfs_qgroup_operation *oper,
+                                 struct btrfs_qgroup *qgroup,
+                                 struct ulist *tmp, struct ulist *qgroups,
+                                 u64 seq)
 {
        struct ulist_node *unode;
        struct ulist_iterator uiter;
        struct btrfs_qgroup *qg;
-       struct btrfs_qgroup_list *glist;
        int ret;
 
        ulist_reinit(tmp);
-       ret = ulist_add(tmp, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC);
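+       /*
+        * tmp drives the walk up the qgroup hierarchy; qgroups accumulates
+        * everything we touch so qgroup_adjust_counters can visit it all at
+        * the end.
+        */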
+       ret = ulist_add(qgroups, qgroup->qgroupid, ptr_to_u64(qgroup),
+                       GFP_ATOMIC);
+       if (ret < 0)
+               return ret;
+       ret = ulist_add(tmp, qgroup->qgroupid, ptr_to_u64(qgroup),
+                       GFP_ATOMIC);
        if (ret < 0)
                return ret;
-
        ULIST_ITER_INIT(&uiter);
        while ((unode = ulist_next(tmp, &uiter))) {
-               qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
-               if (qg->refcnt < seq) {
-                       /* not visited by step 1 */
-                       qg->rfer += sgn * num_bytes;
-                       qg->rfer_cmpr += sgn * num_bytes;
-                       if (roots->nnodes == 0) {
-                               qg->excl += sgn * num_bytes;
-                               qg->excl_cmpr += sgn * num_bytes;
-                       }
-                       qgroup_dirty(fs_info, qg);
-               }
-               WARN_ON(qg->tag >= seq);
-               qg->tag = seq;
+               struct btrfs_qgroup_list *glist;
 
+               qg = u64_to_ptr(unode->aux);
+               if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) {
+                       if (qg->new_refcnt < seq)
+                               qg->new_refcnt = seq + 1;
+                       else
+                               qg->new_refcnt++;
+               } else {
+                       if (qg->old_refcnt < seq)
+                               qg->old_refcnt = seq + 1;
+                       else
+                               qg->old_refcnt++;
+               }
                list_for_each_entry(glist, &qg->groups, next_group) {
                        ret = ulist_add(tmp, glist->group->qgroupid,
-                                       (uintptr_t)glist->group, GFP_ATOMIC);
+                                       ptr_to_u64(glist->group), GFP_ATOMIC);
+                       if (ret < 0)
+                               return ret;
+                       ret = ulist_add(qgroups, glist->group->qgroupid,
+                                       ptr_to_u64(glist->group), GFP_ATOMIC);
                        if (ret < 0)
                                return ret;
                }
        }
-
        return 0;
 }
 
-static int qgroup_account_ref_step3(struct btrfs_fs_info *fs_info,
-                                   struct ulist *roots, struct ulist *tmp,
-                                   u64 seq, int sgn, u64 num_bytes)
+/*
+ * This adjusts the referenced and exclusive counters for every qgroup we
+ * touched, based on how each qgroup's effective refcounts changed across the
+ * operation.
+ */
+static int qgroup_adjust_counters(struct btrfs_fs_info *fs_info,
+                                 u64 root_to_skip, u64 num_bytes,
+                                 struct ulist *qgroups, u64 seq,
+                                 int old_roots, int new_roots, int rescan)
 {
        struct ulist_node *unode;
        struct ulist_iterator uiter;
        struct btrfs_qgroup *qg;
-       struct ulist_node *tmp_unode;
-       struct ulist_iterator tmp_uiter;
-       int ret;
+       u64 cur_new_count, cur_old_count;
 
        ULIST_ITER_INIT(&uiter);
-       while ((unode = ulist_next(roots, &uiter))) {
-               qg = find_qgroup_rb(fs_info, unode->val);
-               if (!qg)
-                       continue;
+       while ((unode = ulist_next(qgroups, &uiter))) {
+               bool dirty = false;
 
-               ulist_reinit(tmp);
-               ret = ulist_add(tmp, qg->qgroupid, (uintptr_t)qg, GFP_ATOMIC);
-               if (ret < 0)
-                       return ret;
+               qg = u64_to_ptr(unode->aux);
+               /*
+                * Wasn't referenced before but is now, add to the reference
+                * counters.
+                */
+               if (qg->old_refcnt <= seq && qg->new_refcnt > seq) {
+                       qg->rfer += num_bytes;
+                       qg->rfer_cmpr += num_bytes;
+                       dirty = true;
+               }
 
-               ULIST_ITER_INIT(&tmp_uiter);
-               while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
-                       struct btrfs_qgroup_list *glist;
+               /*
+                * Was referenced before but isn't now, subtract from the
+                * reference counters.
+                */
+               if (qg->old_refcnt > seq && qg->new_refcnt <= seq) {
+                       qg->rfer -= num_bytes;
+                       qg->rfer_cmpr -= num_bytes;
+                       dirty = true;
+               }
 
-                       qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux;
-                       if (qg->tag == seq)
-                               continue;
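+               /*
+                * Convert the raw refcnts into effective counts relative to
+                * seq; anything at or below seq was never reached in that
+                * state.
+                */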
+               if (qg->old_refcnt < seq)
+                       cur_old_count = 0;
+               else
+                       cur_old_count = qg->old_refcnt - seq;
+               if (qg->new_refcnt < seq)
+                       cur_new_count = 0;
+               else
+                       cur_new_count = qg->new_refcnt - seq;
 
-                       if (qg->refcnt - seq == roots->nnodes) {
-                               qg->excl -= sgn * num_bytes;
-                               qg->excl_cmpr -= sgn * num_bytes;
-                               qgroup_dirty(fs_info, qg);
-                       }
+               /*
+                * If our refcount was the same as the number of roots
+                * previously, but our new count isn't the same as the number
+                * of roots now, then we went from having an exclusive
+                * reference on this range to not having one.
+                */
+               if (old_roots && cur_old_count == old_roots &&
+                   (cur_new_count != new_roots || new_roots == 0)) {
+                       WARN_ON(cur_new_count != new_roots && new_roots == 0);
+                       qg->excl -= num_bytes;
+                       qg->excl_cmpr -= num_bytes;
+                       dirty = true;
+               }
 
-                       list_for_each_entry(glist, &qg->groups, next_group) {
-                               ret = ulist_add(tmp, glist->group->qgroupid,
-                                               (uintptr_t)glist->group,
-                                               GFP_ATOMIC);
-                               if (ret < 0)
-                                       return ret;
-                       }
+               /*
+                * If we didn't reference all the roots before but now we do,
+                * we have an exclusive reference to this range.
+                */
+               if ((!old_roots || (old_roots && cur_old_count != old_roots))
+                   && cur_new_count == new_roots) {
+                       qg->excl += num_bytes;
+                       qg->excl_cmpr += num_bytes;
+                       dirty = true;
                }
-       }
 
+               if (dirty)
+                       qgroup_dirty(fs_info, qg);
+       }
        return 0;
 }
 
 /*
- * btrfs_qgroup_account_ref is called for every ref that is added to or deleted
- * from the fs. First, all roots referencing the extent are searched, and
- * then the space is accounted accordingly to the different roots. The
- * accounting algorithm works in 3 steps documented inline.
+ * If we removed a data extent and there were other references for that bytenr
+ * then we need to lookup all of the referenced roots to check whether we
+ * still reference this bytenr.  If we do then we can just discard this
+ * operation.
  */
-int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
-                            struct btrfs_fs_info *fs_info,
-                            struct btrfs_delayed_ref_node *node,
-                            struct btrfs_delayed_extent_op *extent_op)
+static int check_existing_refs(struct btrfs_trans_handle *trans,
+                              struct btrfs_fs_info *fs_info,
+                              struct btrfs_qgroup_operation *oper)
 {
-       struct btrfs_root *quota_root;
-       u64 ref_root;
-       struct btrfs_qgroup *qgroup;
        struct ulist *roots = NULL;
-       u64 seq;
+       struct ulist_node *unode;
+       struct ulist_iterator uiter;
        int ret = 0;
-       int sgn;
 
-       if (!fs_info->quota_enabled)
-               return 0;
-
-       BUG_ON(!fs_info->quota_root);
+       ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr,
+                                  oper->elem.seq, &roots);
+       if (ret < 0)
+               return ret;
+       ret = 0;
 
-       if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
-           node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
-               struct btrfs_delayed_tree_ref *ref;
-               ref = btrfs_delayed_node_to_tree_ref(node);
-               ref_root = ref->root;
-       } else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
-                  node->type == BTRFS_SHARED_DATA_REF_KEY) {
-               struct btrfs_delayed_data_ref *ref;
-               ref = btrfs_delayed_node_to_data_ref(node);
-               ref_root = ref->root;
-       } else {
-               BUG();
+       ULIST_ITER_INIT(&uiter);
+       while ((unode = ulist_next(roots, &uiter))) {
+               if (unode->val == oper->ref_root) {
+                       ret = 1;
+                       break;
+               }
        }
+       ulist_free(roots);
+       btrfs_put_tree_mod_seq(fs_info, &oper->elem);
 
-       if (!is_fstree(ref_root)) {
-               /*
-                * non-fs-trees are not being accounted
-                */
-               return 0;
-       }
+       return ret;
+}
 
-       switch (node->action) {
-       case BTRFS_ADD_DELAYED_REF:
-       case BTRFS_ADD_DELAYED_EXTENT:
-               sgn = 1;
-               seq = btrfs_tree_mod_seq_prev(node->seq);
-               break;
-       case BTRFS_DROP_DELAYED_REF:
-               sgn = -1;
-               seq = node->seq;
-               break;
-       case BTRFS_UPDATE_DELAYED_HEAD:
-               return 0;
-       default:
-               BUG();
-       }
+/*
+ * If we share a reference across multiple roots then we may need to adjust
+ * various qgroups' referenced and exclusive counters.  The basic premise is
+ * this:
+ *
+ * 1) We have seq to represent a 0 count.  Instead of looping through all of the
+ * qgroups and resetting their refcount to 0 we just constantly bump this
+ * sequence number to act as the base reference count.  This means that if
+ * anybody is equal to or below this sequence they were never referenced.  We
+ * jack this sequence up by the number of roots we found each time in order to
+ * make sure we don't have any overlap.
+ *
+ * 2) We first search all the roots that reference the area _except_ the root
+ * we're acting on currently.  This makes up the old_refcnt of all the qgroups
+ * before.
+ *
+ * 3) We walk all of the qgroups referenced by the root we are currently acting
+ * on, and will either adjust old_refcnt in the case of a removal or the
+ * new_refcnt in the case of an addition.
+ *
+ * 4) Finally we walk all the qgroups that are referenced by this range
+ * including the root we are acting on currently.  We will adjust the counters
+ * based on the number of roots we had and will have after this operation.
+ *
+ * Take this example as an illustration
+ *
+ *                     [qgroup 1/0]
+ *                  /         |          \
+ *             [qg 0/0]   [qg 0/1]     [qg 0/2]
+ *                \          |            /
+ *               [        extent           ]
+ *
+ * Say we are adding a reference that is covered by qg 0/0.  The first step
+ * would give a refcnt of 1 to qg 0/1 and 0/2 and a refcnt of 2 to qg 1/0 with
+ * old_roots being 2.  Because it is adding new_roots will be 1.  We then go
+ * through qg 0/0 which will get the new_refcnt set to 1 and add 1 to qg 1/0's
+ * new_refcnt, bringing it to 3.  We then walk through all of the qgroups, we
+ * notice that the old refcnt for qg 0/0 < the new refcnt, so we added a
+ * reference and thus must add the size to the referenced bytes.  Everything
+ * else is the same so nothing else changes.
+ */
+static int qgroup_shared_accounting(struct btrfs_trans_handle *trans,
+                                   struct btrfs_fs_info *fs_info,
+                                   struct btrfs_qgroup_operation *oper)
+{
+       struct ulist *roots = NULL;
+       struct ulist *qgroups, *tmp;
+       struct btrfs_qgroup *qgroup;
+       struct seq_list elem = {};
+       u64 seq;
+       int old_roots = 0;
+       int new_roots = 0;
+       int ret = 0;
 
-       mutex_lock(&fs_info->qgroup_rescan_lock);
-       if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
-               if (fs_info->qgroup_rescan_progress.objectid <= node->bytenr) {
-                       mutex_unlock(&fs_info->qgroup_rescan_lock);
+       if (oper->elem.seq) {
+               ret = check_existing_refs(trans, fs_info, oper);
+               if (ret < 0)
+                       return ret;
+               if (ret)
                        return 0;
-               }
        }
-       mutex_unlock(&fs_info->qgroup_rescan_lock);
 
-       /*
-        * the delayed ref sequence number we pass depends on the direction of
-        * the operation. for add operations, we pass
-        * tree_mod_log_prev_seq(node->seq) to skip
-        * the delayed ref's current sequence number, because we need the state
-        * of the tree before the add operation. for delete operations, we pass
-        * (node->seq) to include the delayed ref's current sequence number,
-        * because we need the state of the tree after the delete operation.
-        */
-       ret = btrfs_find_all_roots(trans, fs_info, node->bytenr, seq, &roots);
-       if (ret < 0)
-               return ret;
-
-       spin_lock(&fs_info->qgroup_lock);
+       qgroups = ulist_alloc(GFP_NOFS);
+       if (!qgroups)
+               return -ENOMEM;
 
-       quota_root = fs_info->quota_root;
-       if (!quota_root)
-               goto unlock;
+       tmp = ulist_alloc(GFP_NOFS);
+       if (!tmp) {
+               ulist_free(qgroups);
+               return -ENOMEM;
+       }
 
-       qgroup = find_qgroup_rb(fs_info, ref_root);
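+       /*
+        * Take a tree mod seq blocker around the backref walk so it sees a
+        * consistent view of the extent's references.
+        */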
+       btrfs_get_tree_mod_seq(fs_info, &elem);
+       ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, elem.seq,
+                                  &roots);
+       btrfs_put_tree_mod_seq(fs_info, &elem);
+       if (ret < 0) {
+               ulist_free(qgroups);
+               ulist_free(tmp);
+               return ret;
+       }
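+       /* Refcnt and counter updates below are serialized by qgroup_lock. */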
+       spin_lock(&fs_info->qgroup_lock);
+       qgroup = find_qgroup_rb(fs_info, oper->ref_root);
        if (!qgroup)
-               goto unlock;
+               goto out;
+       seq = fs_info->qgroup_seq;
 
        /*
-        * step 1: for each old ref, visit all nodes once and inc refcnt
+        * So roots is the list of all the roots currently pointing at the
+        * bytenr, including the ref we are adding if we are adding, or not if
+        * we are removing a ref.  So we pass in the ref_root to skip that root
+        * in our calculations.  We set old_refcnt and new_refcnt because who
+        * the hell knows what everything looked like before, and it doesn't
+        * matter except...
         */
-       ulist_reinit(fs_info->qgroup_ulist);
-       seq = fs_info->qgroup_seq;
-       fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */
+       ret = qgroup_calc_old_refcnt(fs_info, oper->ref_root, tmp, roots,
+                                    qgroups, seq, &old_roots, 0);
+       if (ret < 0)
+               goto out;
 
-       ret = qgroup_account_ref_step1(fs_info, roots, fs_info->qgroup_ulist,
-                                      seq);
-       if (ret)
-               goto unlock;
+       /*
+        * Now adjust the refcounts of the qgroups that care about this
+        * reference, either the old_count in the case of removal or new_count
+        * in the case of an addition.
+        */
+       ret = qgroup_calc_new_refcnt(fs_info, oper, qgroup, tmp, qgroups,
+                                    seq);
+       if (ret < 0)
+               goto out;
 
        /*
-        * step 2: walk from the new root
+        * ...in the case of removals.  If we had a removal before we got around
+        * to processing this operation then we need to find that guy and count
+        * his references as if they really existed so we don't end up screwing
+        * up the exclusive counts.  Then whenever we go to process the delete
+        * everything will be grand and we can account for whatever exclusive
+        * changes need to be made there.  We also have to pass in old_roots so
+        * we have an accurate count of the roots as it pertains to this
+        * operation's view of the world.
         */
-       ret = qgroup_account_ref_step2(fs_info, roots, fs_info->qgroup_ulist,
-                                      seq, sgn, node->num_bytes, qgroup);
-       if (ret)
-               goto unlock;
+       ret = qgroup_account_deleted_refs(fs_info, oper, tmp, qgroups, seq,
+                                         &old_roots);
+       if (ret < 0)
+               goto out;
 
        /*
-        * step 3: walk again from old refs
+        * Our root is not included in old_roots (calc_old_refcnt skipped it),
+        * so for an addition bump new_roots to include it, and for a removal
+        * bump old_roots since our root referenced the extent before.
         */
-       ret = qgroup_account_ref_step3(fs_info, roots, fs_info->qgroup_ulist,
-                                      seq, sgn, node->num_bytes);
-       if (ret)
-               goto unlock;
+       if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) {
+               new_roots = old_roots + 1;
+       } else {
+               new_roots = old_roots;
+               old_roots++;
+       }
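+       /*
+        * Bump the global sequence past the largest refcnt we could have
+        * handed out above so the next accounting pass starts from a fresh
+        * zero point.
+        */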
+       fs_info->qgroup_seq += old_roots + 1;
 
-unlock:
+       /*
+        * And now the magic happens, bless Arne for having a pretty elegant
+        * solution for this.
+        */
+       qgroup_adjust_counters(fs_info, oper->ref_root, oper->num_bytes,
+                              qgroups, seq, old_roots, new_roots, 0);
+out:
        spin_unlock(&fs_info->qgroup_lock);
+       ulist_free(qgroups);
        ulist_free(roots);
+       ulist_free(tmp);
+       return ret;
+}
+
+/*
+ * btrfs_qgroup_account is called for every queued qgroup operation.  Exclusive
+ * operations are accounted directly against the subvolume's qgroups, while
+ * shared operations require a search of all roots referencing the extent so
+ * the space can be accounted accordingly.
+ */
+static int btrfs_qgroup_account(struct btrfs_trans_handle *trans,
+                               struct btrfs_fs_info *fs_info,
+                               struct btrfs_qgroup_operation *oper)
+{
+       int ret = 0;
+
+       if (!fs_info->quota_enabled)
+               return 0;
+
+       BUG_ON(!fs_info->quota_root);
+
+       mutex_lock(&fs_info->qgroup_rescan_lock);
+       if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
+               if (fs_info->qgroup_rescan_progress.objectid <= oper->bytenr) {
+                       mutex_unlock(&fs_info->qgroup_rescan_lock);
+                       return 0;
+               }
+       }
+       mutex_unlock(&fs_info->qgroup_rescan_lock);
+
+       ASSERT(is_fstree(oper->ref_root));
+
+       switch (oper->type) {
+       case BTRFS_QGROUP_OPER_ADD_EXCL:
+       case BTRFS_QGROUP_OPER_SUB_EXCL:
+               ret = qgroup_excl_accounting(fs_info, oper);
+               break;
+       case BTRFS_QGROUP_OPER_ADD_SHARED:
+       case BTRFS_QGROUP_OPER_SUB_SHARED:
+               ret = qgroup_shared_accounting(trans, fs_info, oper);
+               break;
+       default:
+               ASSERT(0);
+       }
+       return ret;
+}
 
+/*
+ * Needs to be called every time we run delayed refs, even if there is an
+ * error, in order to clean up outstanding operations.
+ */
+int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans,
+                                   struct btrfs_fs_info *fs_info)
+{
+       struct btrfs_qgroup_operation *oper;
+       int ret = 0;
+
+       while (!list_empty(&trans->qgroup_ref_list)) {
+               oper = list_first_entry(&trans->qgroup_ref_list,
+                                       struct btrfs_qgroup_operation, list);
+               list_del_init(&oper->list);
+               if (!ret && !trans->aborted)
+                       ret = btrfs_qgroup_account(trans, fs_info, oper);
+               spin_lock(&fs_info->qgroup_op_lock);
+               rb_erase(&oper->n, &fs_info->qgroup_op_tree);
+               spin_unlock(&fs_info->qgroup_op_lock);
+               btrfs_put_tree_mod_seq(fs_info, &oper->elem);
+               kfree(oper);
+       }
        return ret;
 }
 
                srcgroup = find_qgroup_rb(fs_info, srcid);
                if (!srcgroup)
                        goto unlock;
-               dstgroup->rfer = srcgroup->rfer - level_size;
-               dstgroup->rfer_cmpr = srcgroup->rfer_cmpr - level_size;
+
+               /*
+                * We call inherit after we clone the root in order to make sure
+                * our counts don't go crazy, so at this point the only
+                * difference between the two roots should be the root node.
+                */
+               dstgroup->rfer = srcgroup->rfer;
+               dstgroup->rfer_cmpr = srcgroup->rfer_cmpr;
+               dstgroup->excl = level_size;
+               dstgroup->excl_cmpr = level_size;
                srcgroup->excl = level_size;
                srcgroup->excl_cmpr = level_size;
                qgroup_dirty(fs_info, dstgroup);
                struct btrfs_qgroup *qg;
                struct btrfs_qgroup_list *glist;
 
-               qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
+               qg = u64_to_ptr(unode->aux);
 
                if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
                    qg->reserved + (s64)qg->rfer + num_bytes >
        while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
                struct btrfs_qgroup *qg;
 
-               qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
+               qg = u64_to_ptr(unode->aux);
 
                qg->reserved += num_bytes;
        }
                struct btrfs_qgroup *qg;
                struct btrfs_qgroup_list *glist;
 
-               qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
+               qg = u64_to_ptr(unode->aux);
 
                qg->reserved -= num_bytes;
 
  */
 static int
 qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
-                  struct btrfs_trans_handle *trans, struct ulist *tmp,
-                  struct extent_buffer *scratch_leaf)
+                  struct btrfs_trans_handle *trans, struct ulist *qgroups,
+                  struct ulist *tmp, struct extent_buffer *scratch_leaf)
 {
        struct btrfs_key found;
        struct ulist *roots = NULL;
-       struct ulist_node *unode;
-       struct ulist_iterator uiter;
        struct seq_list tree_mod_seq_elem = {};
+       u64 num_bytes;
        u64 seq;
+       int new_roots;
        int slot;
        int ret;
 
        mutex_unlock(&fs_info->qgroup_rescan_lock);
 
        for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) {
-               u64 num_bytes;
-
                btrfs_item_key_to_cpu(scratch_leaf, &found, slot);
                if (found.type != BTRFS_EXTENT_ITEM_KEY &&
                    found.type != BTRFS_METADATA_ITEM_KEY)
                else
                        num_bytes = found.offset;
 
-               ret = btrfs_find_all_roots(trans, fs_info, found.objectid,
-                                          tree_mod_seq_elem.seq, &roots);
+               ulist_reinit(qgroups);
+               ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0,
+                                          &roots);
                if (ret < 0)
                        goto out;
                spin_lock(&fs_info->qgroup_lock);
                seq = fs_info->qgroup_seq;
                fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */
 
-               ret = qgroup_account_ref_step1(fs_info, roots, tmp, seq);
-               if (ret) {
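+               /*
+                * During rescan there is no old state: old_refcnt is pinned
+                * at seq and every root found counts toward new_roots, so the
+                * whole extent is accounted as newly referenced.
+                */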
+               new_roots = 0;
+               ret = qgroup_calc_old_refcnt(fs_info, 0, tmp, roots, qgroups,
+                                            seq, &new_roots, 1);
+               if (ret < 0) {
                        spin_unlock(&fs_info->qgroup_lock);
                        ulist_free(roots);
                        goto out;
                }
 
-               /*
-                * step2 of btrfs_qgroup_account_ref works from a single root,
-                * we're doing all at once here.
-                */
-               ulist_reinit(tmp);
-               ULIST_ITER_INIT(&uiter);
-               while ((unode = ulist_next(roots, &uiter))) {
-                       struct btrfs_qgroup *qg;
-
-                       qg = find_qgroup_rb(fs_info, unode->val);
-                       if (!qg)
-                               continue;
-
-                       ret = ulist_add(tmp, qg->qgroupid, (uintptr_t)qg,
-                                       GFP_ATOMIC);
-                       if (ret < 0) {
-                               spin_unlock(&fs_info->qgroup_lock);
-                               ulist_free(roots);
-                               goto out;
-                       }
-               }
-
-               /* this loop is similar to step 2 of btrfs_qgroup_account_ref */
-               ULIST_ITER_INIT(&uiter);
-               while ((unode = ulist_next(tmp, &uiter))) {
-                       struct btrfs_qgroup *qg;
-                       struct btrfs_qgroup_list *glist;
-
-                       qg = (struct btrfs_qgroup *)(uintptr_t) unode->aux;
-                       qg->rfer += num_bytes;
-                       qg->rfer_cmpr += num_bytes;
-                       WARN_ON(qg->tag >= seq);
-                       if (qg->refcnt - seq == roots->nnodes) {
-                               qg->excl += num_bytes;
-                               qg->excl_cmpr += num_bytes;
-                       }
-                       qgroup_dirty(fs_info, qg);
-
-                       list_for_each_entry(glist, &qg->groups, next_group) {
-                               ret = ulist_add(tmp, glist->group->qgroupid,
-                                               (uintptr_t)glist->group,
-                                               GFP_ATOMIC);
-                               if (ret < 0) {
-                                       spin_unlock(&fs_info->qgroup_lock);
-                                       ulist_free(roots);
-                                       goto out;
-                               }
-                       }
+               ret = qgroup_adjust_counters(fs_info, 0, num_bytes, qgroups,
+                                            seq, 0, new_roots, 1);
+               if (ret < 0) {
+                       spin_unlock(&fs_info->qgroup_lock);
+                       ulist_free(roots);
+                       goto out;
                }
-
                spin_unlock(&fs_info->qgroup_lock);
                ulist_free(roots);
-               ret = 0;
        }
-
 out:
        btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
 
                                                     qgroup_rescan_work);
        struct btrfs_path *path;
        struct btrfs_trans_handle *trans = NULL;
-       struct ulist *tmp = NULL;
+       struct ulist *tmp = NULL, *qgroups = NULL;
        struct extent_buffer *scratch_leaf = NULL;
        int err = -ENOMEM;
 
        path = btrfs_alloc_path();
        if (!path)
                goto out;
+       qgroups = ulist_alloc(GFP_NOFS);
+       if (!qgroups)
+               goto out;
        tmp = ulist_alloc(GFP_NOFS);
        if (!tmp)
                goto out;
                        err = -EINTR;
                } else {
                        err = qgroup_rescan_leaf(fs_info, path, trans,
-                                                tmp, scratch_leaf);
+                                                qgroups, tmp, scratch_leaf);
                }
                if (err > 0)
                        btrfs_commit_transaction(trans, fs_info->fs_root);
 
 out:
        kfree(scratch_leaf);
-       ulist_free(tmp);
+       ulist_free(qgroups);
+       ulist_free(tmp);
        btrfs_free_path(path);
 
        mutex_lock(&fs_info->qgroup_rescan_lock);
 
--- /dev/null
+++ b/fs/btrfs/qgroup.h
+/*
+ * Copyright (C) 2014 Facebook.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_QGROUP__
+#define __BTRFS_QGROUP__
+
+/*
+ * A description of the operations.  All of these operations only happen when
+ * we are adding the first reference for that subvolume in the case of adding
+ * space, or on the last reference delete in the case of subtraction.
+ *
+ * BTRFS_QGROUP_OPER_ADD_EXCL: adding bytes where this subvolume is the only
+ * one pointing at the bytes we are adding.  This is called on the first
+ * allocation.
+ *
+ * BTRFS_QGROUP_OPER_ADD_SHARED: adding bytes where this bytenr is going to be
+ * shared between subvols.  This is called on the creation of a ref that
+ * already has refs from a different subvolume, so basically reflink.
+ *
+ * BTRFS_QGROUP_OPER_SUB_EXCL: removing bytes where this subvolume is the only
+ * one referencing the range.
+ *
+ * BTRFS_QGROUP_OPER_SUB_SHARED: removing bytes where this subvolume shares
+ * refs with other subvolumes.
+ */
+enum btrfs_qgroup_operation_type {
+       BTRFS_QGROUP_OPER_ADD_EXCL,
+       BTRFS_QGROUP_OPER_ADD_SHARED,
+       BTRFS_QGROUP_OPER_SUB_EXCL,
+       BTRFS_QGROUP_OPER_SUB_SHARED,
+};
+
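+/*
+ * One record per qgroup-relevant extent reference change.  Operations live in
+ * fs_info->qgroup_op_tree, keyed so that operations on the same bytenr sort
+ * together, and on the transaction's qgroup_ref_list until
+ * btrfs_delayed_qgroup_accounting() applies and frees them.
+ */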
+struct btrfs_qgroup_operation {
+       u64 ref_root;
+       u64 bytenr;
+       u64 num_bytes;
+       u64 seq;
+       enum btrfs_qgroup_operation_type type;
+       struct seq_list elem;
+       struct rb_node n;
+       struct list_head list;
+};
+
+int btrfs_quota_enable(struct btrfs_trans_handle *trans,
+                      struct btrfs_fs_info *fs_info);
+int btrfs_quota_disable(struct btrfs_trans_handle *trans,
+                       struct btrfs_fs_info *fs_info);
+int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info);
+void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info);
+int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info);
+int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
+                             struct btrfs_fs_info *fs_info, u64 src, u64 dst);
+int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
+                             struct btrfs_fs_info *fs_info, u64 src, u64 dst);
+int btrfs_create_qgroup(struct btrfs_trans_handle *trans,
+                       struct btrfs_fs_info *fs_info, u64 qgroupid,
+                       char *name);
+int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
+                             struct btrfs_fs_info *fs_info, u64 qgroupid);
+int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
+                      struct btrfs_fs_info *fs_info, u64 qgroupid,
+                      struct btrfs_qgroup_limit *limit);
+int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info);
+void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info);
+struct btrfs_delayed_extent_op;
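+/*
+ * Called when a ref is added or dropped to queue a qgroup operation; queued
+ * operations are applied when delayed refs are run, via
+ * btrfs_delayed_qgroup_accounting().
+ */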
+int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
+                           struct btrfs_fs_info *fs_info, u64 ref_root,
+                           u64 bytenr, u64 num_bytes,
+                           enum btrfs_qgroup_operation_type type,
+                           int mod_seq);
+int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans,
+                                   struct btrfs_fs_info *fs_info);
+void btrfs_remove_qgroup_operation(struct btrfs_trans_handle *trans,
+                                  struct btrfs_fs_info *fs_info,
+                                  struct btrfs_qgroup_operation *oper);
+int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
+                     struct btrfs_fs_info *fs_info);
+int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
+                        struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid,
+                        struct btrfs_qgroup_inherit *inherit);
+int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes);
+void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes);
+
+void assert_qgroups_uptodate(struct btrfs_trans_handle *trans);
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
+                              u64 rfer, u64 excl);
+#endif
+
+#endif /* __BTRFS_QGROUP__ */
 
 #include "inode-map.h"
 #include "volumes.h"
 #include "dev-replace.h"
+#include "qgroup.h"
 
 #define BTRFS_ROOT_TRANS_TAG 0
 
                return 0;
        }
 
-       /*
-        * do the qgroup accounting as early as possible
-        */
-       err = btrfs_delayed_refs_qgroup_accounting(trans, info);
-
        btrfs_trans_release_metadata(trans, root);
        trans->block_rsv = NULL;
 
-       if (trans->qgroup_reserved) {
-               /*
-                * the same root has to be passed here between start_transaction
-                * and end_transaction. Subvolume quota depends on this.
-                */
-               btrfs_qgroup_free(trans->root, trans->qgroup_reserved);
-               trans->qgroup_reserved = 0;
-       }
-
        if (!list_empty(&trans->new_bgs))
                btrfs_create_pending_block_groups(trans, root);
 
                btrfs_run_delayed_refs(trans, root, cur);
        }
 
+       if (trans->qgroup_reserved) {
+               /*
+                * the same root has to be passed here between start_transaction
+                * and end_transaction. Subvolume quota depends on this.
+                */
+               btrfs_qgroup_free(trans->root, trans->qgroup_reserved);
+               trans->qgroup_reserved = 0;
+       }
+
        btrfs_trans_release_metadata(trans, root);
        trans->block_rsv = NULL;
 
                        goto no_free_objectid;
        }
 
-       pending->error = btrfs_qgroup_inherit(trans, fs_info,
-                                             root->root_key.objectid,
-                                             objectid, pending->inherit);
-       if (pending->error)
-               goto no_free_objectid;
-
        key.objectid = objectid;
        key.offset = (u64)-1;
        key.type = BTRFS_ROOT_ITEM_KEY;
                goto fail;
        }
 
+       /*
+        * We need to flush delayed refs in order to make sure all of our quota
+        * operations have been done before we call btrfs_qgroup_inherit.
+        */
+       ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+       if (ret) {
+               btrfs_abort_transaction(trans, root, ret);
+               goto fail;
+       }
+
+       pending->error = btrfs_qgroup_inherit(trans, fs_info,
+                                             root->root_key.objectid,
+                                             objectid, pending->inherit);
+       if (pending->error)
+               goto no_free_objectid;
+
        /* see comments in should_cow_block() */
        set_bit(BTRFS_ROOT_FORCE_COW, &root->state);
        smp_wmb();
         * them now so that they hinder processing of more delayed refs
         * as little as possible.
         */
-       if (ret) {
-               btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
-               return ret;
-       }
-
-       ret = btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
        if (ret)
                return ret;