A number of workloads do not require copy-on-write data or checksumming.
Mount with -o nodatasum to disable checksums, or with -o nodatacow to
disable both copy on write and checksumming.
In nodatacow mode, copy on write is still performed when a given extent
is referenced by a snapshot.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
        memset(p, 0, sizeof(*p));
 }
 
-static int __btrfs_cow_block(struct btrfs_trans_handle *trans,
+/*
+ * Make a copy of the tree block 'buf' in a newly allocated block whose
+ * header owner is 'new_root_objectid'.
+ *
+ * A stack copy of 'root' with its root_key.objectid replaced by
+ * 'new_root_objectid' is used for the allocation and for btrfs_inc_ref(),
+ * so the caller's 'root' is not modified.
+ *
+ * On success 0 is returned and the new extent buffer is stored in
+ * '*cow_ret'; the caller owns that reference and must drop it with
+ * free_extent_buffer().  On failure a negative errno is returned and
+ * '*cow_ret' is left untouched.
+ */
+int btrfs_copy_root(struct btrfs_trans_handle *trans,
+                     struct btrfs_root *root,
+                     struct extent_buffer *buf,
+                     struct extent_buffer **cow_ret, u64 new_root_objectid)
+{
+       struct extent_buffer *cow;
+       u32 nritems;
+       int ret;
+       int level;
+       struct btrfs_key first_key;
+       struct btrfs_root new_root;
+
+       memcpy(&new_root, root, sizeof(new_root));
+       new_root.root_key.objectid = new_root_objectid;
+
+       WARN_ON(root->ref_cows && trans->transid !=
+               root->fs_info->running_transaction->transid);
+       WARN_ON(root->ref_cows && trans->transid != root->last_trans);
+
+       /* only first_key.objectid is consumed below */
+       level = btrfs_header_level(buf);
+       nritems = btrfs_header_nritems(buf);
+       if (nritems) {
+               if (level == 0)
+                       btrfs_item_key_to_cpu(buf, &first_key, 0);
+               else
+                       btrfs_node_key_to_cpu(buf, &first_key, 0);
+       } else {
+               first_key.objectid = 0;
+       }
+       cow = __btrfs_alloc_free_block(trans, &new_root, buf->len,
+                                      new_root_objectid,
+                                      trans->transid, first_key.objectid,
+                                      level, buf->start, 0);
+       if (IS_ERR(cow))
+               return PTR_ERR(cow);
+
+       copy_extent_buffer(cow, buf, 0, 0, cow->len);
+       btrfs_set_header_bytenr(cow, cow->start);
+       btrfs_set_header_generation(cow, trans->transid);
+       btrfs_set_header_owner(cow, new_root_objectid);
+
+       WARN_ON(btrfs_header_generation(buf) > trans->transid);
+       ret = btrfs_inc_ref(trans, &new_root, buf);
+       if (ret) {
+               /*
+                * Nobody will ever see 'cow', so drop our reference instead
+                * of leaking it.
+                * NOTE(review): the block allocated above is not returned to
+                * the allocator here - confirm whether that is needed.
+                */
+               free_extent_buffer(cow);
+               return ret;
+       }
+
+       btrfs_mark_buffer_dirty(cow);
+       *cow_ret = cow;
+       return 0;
+}
+
+int __btrfs_cow_block(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root,
                             struct extent_buffer *buf,
                             struct extent_buffer *parent, int parent_slot,
 
 #define BTRFS_STRING_ITEM_KEY  253
 
 #define BTRFS_MOUNT_NODATASUM          0x1
+#define BTRFS_MOUNT_NODATACOW          0x2
 
 #define btrfs_clear_opt(o, opt)                ((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)          ((o) |= BTRFS_MOUNT_##opt)
        btrfs_item_offset_nr(leaf, slot)))
 
 /* extent-tree.c */
+u32 btrfs_count_snapshots_in_path(struct btrfs_root *root,
+                                 struct btrfs_path *count_path,
+                                 u64 first_extent);
 int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
                         struct btrfs_root *root);
 int btrfs_copy_pinned(struct btrfs_root *root, struct extent_map_tree *copy);
                    struct btrfs_root *root, struct extent_buffer *buf,
                    struct extent_buffer *parent, int parent_slot,
                    struct extent_buffer **cow_ret);
+int btrfs_copy_root(struct btrfs_trans_handle *trans,
+                     struct btrfs_root *root,
+                     struct extent_buffer *buf,
+                     struct extent_buffer **cow_ret, u64 new_root_objectid);
 int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root
                      *root, struct btrfs_path *path, u32 data_size);
 int btrfs_truncate_item(struct btrfs_trans_handle *trans,
 
        return 0;
 }
 
+/*
+ * Count how many different roots hold back references on 'first_extent'
+ * and on the tree blocks of 'count_path'.
+ *
+ * The extent tree is searched first for 'first_extent', then for
+ * count_path->nodes[0], nodes[1], ... until the block that is root->node
+ * has been examined.  Every BTRFS_EXTENT_REF_KEY item found for the
+ * current byte number is inspected; the first root objectid seen is
+ * remembered and each reference whose root differs from it bumps the
+ * count.
+ *
+ * Returns:
+ *   1   every reference that was examined belongs to the same root
+ *   >1  a second root was found (the walk stops right away)
+ *   0   no back reference was found for one of the blocks, or the
+ *       search path could not be allocated
+ * If a search fails or an extent item is missing part way through the
+ * walk, the count accumulated so far is returned.
+ */
+u32 btrfs_count_snapshots_in_path(struct btrfs_root *root,
+                                 struct btrfs_path *count_path,
+                                 u64 first_extent)
+{
+       struct btrfs_root *extent_root = root->fs_info->extent_root;
+       struct btrfs_path *path;
+       u64 bytenr;
+       u64 found_objectid;
+       u64 root_objectid = 0;
+       u32 total_count = 0;
+       u32 cur_count;
+       u32 nritems;
+       int ret;
+       struct btrfs_key key;
+       struct btrfs_key found_key;
+       struct extent_buffer *l;
+       struct btrfs_extent_ref *ref_item;
+       int level = -1;
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return 0;
+again:
+       /* level -1 is the data extent itself, >= 0 are blocks of count_path */
+       if (level == -1)
+               bytenr = first_extent;
+       else
+               bytenr = count_path->nodes[level]->start;
+
+       cur_count = 0;
+       key.objectid = bytenr;
+       key.offset = 0;
+
+       btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
+       ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
+       if (ret < 0)
+               goto out;
+       BUG_ON(ret == 0);
+
+       l = path->nodes[0];
+       btrfs_item_key_to_cpu(l, &found_key, path->slots[0]);
+
+       if (found_key.objectid != bytenr ||
+           found_key.type != BTRFS_EXTENT_ITEM_KEY) {
+               goto out;
+       }
+
+       while (1) {
+               /*
+                * btrfs_next_leaf() may have moved the path to a new leaf,
+                * so the leaf pointer has to be reloaded on every pass.
+                */
+               l = path->nodes[0];
+               nritems = btrfs_header_nritems(l);
+               if (path->slots[0] >= nritems) {
+                       ret = btrfs_next_leaf(extent_root, path);
+                       if (ret == 0)
+                               continue;
+                       break;
+               }
+               btrfs_item_key_to_cpu(l, &found_key, path->slots[0]);
+               if (found_key.objectid != bytenr)
+                       break;
+               if (found_key.type != BTRFS_EXTENT_REF_KEY) {
+                       path->slots[0]++;
+                       continue;
+               }
+
+               cur_count++;
+               ref_item = btrfs_item_ptr(l, path->slots[0],
+                                         struct btrfs_extent_ref);
+               found_objectid = btrfs_ref_root(l, ref_item);
+
+               if (found_objectid != root_objectid)
+                       total_count++;
+
+               if (total_count > 1)
+                       goto out;
+
+               /* remember the first root seen; it is kept across levels */
+               if (root_objectid == 0)
+                       root_objectid = found_objectid;
+
+               path->slots[0]++;
+       }
+       if (cur_count == 0) {
+               total_count = 0;
+               goto out;
+       }
+       if (total_count > 1)
+               goto out;
+       /* stop once the root node of 'root' itself has been checked */
+       if (level >= 0 && root->node == count_path->nodes[level])
+               goto out;
+       level++;
+       btrfs_release_path(root, path);
+       goto again;
+
+out:
+       btrfs_free_path(path);
+       return total_count;
+}
+
 int btrfs_inc_root_ref(struct btrfs_trans_handle *trans,
                       struct btrfs_root *root, u64 owner_objectid)
 {
        if (!path)
                return -ENOMEM;
 
-       if (ref_generation && owner_objectid == 0 && root_objectid == 3) {
-//printk("drop backref root %Lu gen %Lu byte %Lu\n", root_objectid, ref_generation, bytenr );
-       }
        ret = lookup_extent_backref(trans, extent_root, path,
                                    bytenr, root_objectid,
                                    ref_generation,
 
        [S_IFLNK >> S_SHIFT]    = BTRFS_FT_SYMLINK,
 };
 
-static int run_delalloc_range(struct inode *inode, u64 start, u64 end)
+static int cow_file_range(struct inode *inode, u64 start, u64 end)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_trans_handle *trans;
-       struct btrfs_key ins;
        u64 alloc_hint = 0;
        u64 num_bytes;
-       int ret;
        u64 blocksize = root->sectorsize;
+       struct btrfs_key ins;
+       int ret;
 
-       mutex_lock(&root->fs_info->fs_mutex);
        trans = btrfs_start_transaction(root, 1);
-       btrfs_set_trans_block_group(trans, inode);
        BUG_ON(!trans);
+       btrfs_set_trans_block_group(trans, inode);
+
        num_bytes = (end - start + blocksize) & ~(blocksize - 1);
+       num_bytes = max(blocksize,  num_bytes);
        ret = btrfs_drop_extents(trans, root, inode,
                                 start, start + num_bytes, start, &alloc_hint);
 
                                       ins.offset);
 out:
        btrfs_end_transaction(trans, root);
+       return ret;
+}
+
+/*
+ * Walk the file range [start, end] of 'inode' one file extent at a time.
+ *
+ * A piece of the range is left alone (written in place) only when it is
+ * covered by a regular file extent with a non-zero disk byte number for
+ * which btrfs_count_snapshots_in_path() returns exactly 1.  Everything
+ * else - no extent item, inline or other extent types, a zero disk byte
+ * number, or any other count - is handed to cow_file_range().
+ *
+ * Returns 0 once the whole range has been processed, or a negative errno
+ * if the path allocation, the extent lookup or cow_file_range() fails.
+ */
+static int run_delalloc_nocow(struct inode *inode, u64 start, u64 end)
+{
+       u64 extent_start;
+       u64 extent_end;
+       u64 bytenr;
+       u64 cow_end;
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       struct extent_buffer *leaf;
+       int found_type;
+       struct btrfs_path *path;
+       struct btrfs_file_extent_item *item;
+       int ret;
+       struct btrfs_key found_key;
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+again:
+       ret = btrfs_lookup_file_extent(NULL, root, path,
+                                      inode->i_ino, start, 0);
+       if (ret < 0) {
+               btrfs_free_path(path);
+               return ret;
+       }
+
+       /* unless an extent narrows it down, COW the rest of the range */
+       cow_end = end;
+       if (ret != 0) {
+               /* no exact match: look at the item just before the slot */
+               if (path->slots[0] == 0)
+                       goto not_found;
+               path->slots[0]--;
+       }
+
+       leaf = path->nodes[0];
+       item = btrfs_item_ptr(leaf, path->slots[0],
+                             struct btrfs_file_extent_item);
+
+       /* are we inside the extent that was found? */
+       btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+       found_type = btrfs_key_type(&found_key);
+       if (found_key.objectid != inode->i_ino ||
+           found_type != BTRFS_EXTENT_DATA_KEY) {
+               goto not_found;
+       }
+
+       /*
+        * Only regular extents can be written in place.  Sending every
+        * other type to the COW path also guarantees that 'start' always
+        * advances, so the loop cannot spin on an unexpected extent type.
+        */
+       found_type = btrfs_file_extent_type(leaf, item);
+       if (found_type != BTRFS_FILE_EXTENT_REG)
+               goto not_found;
+
+       extent_start = found_key.offset;
+       extent_end = extent_start + btrfs_file_extent_num_bytes(leaf, item);
+       if (start < extent_start || start >= extent_end)
+               goto not_found;
+
+       /* from here on, a COW only needs to cover this extent */
+       cow_end = min(end, extent_end - 1);
+       bytenr = btrfs_file_extent_disk_bytenr(leaf, item);
+       if (bytenr == 0)
+               goto not_found;
+
+       /*
+        * NOTE(review): the lookup below is done with the extent offset
+        * added to the disk byte number - confirm this is the intended
+        * search key when the offset is non-zero.
+        */
+       bytenr += btrfs_file_extent_offset(leaf, item);
+       if (btrfs_count_snapshots_in_path(root, path, bytenr) != 1)
+               goto not_found;
+
+       start = extent_end;
+loop:
+       if (start > end) {
+               btrfs_free_path(path);
+               return 0;
+       }
+       btrfs_release_path(root, path);
+       goto again;
+
+not_found:
+       ret = cow_file_range(inode, start, cow_end);
+       if (ret < 0) {
+               /* do not silently continue after a failed COW */
+               btrfs_free_path(path);
+               return ret;
+       }
+       start = cow_end + 1;
+       goto loop;
+}
+
+/*
+ * Process the range [start, end] of 'inode' with fs_mutex held.
+ * With the NODATACOW mount option set the range goes through
+ * run_delalloc_nocow(), otherwise through cow_file_range().  The
+ * helper's return value is passed back to the caller.
+ */
+static int run_delalloc_range(struct inode *inode, u64 start, u64 end)
+{
+       int ret;
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+
+       mutex_lock(&root->fs_info->fs_mutex);
+       if (!btrfs_test_opt(root, NODATACOW))
+               ret = cow_file_range(inode, start, end);
+       else
+               ret = run_delalloc_nocow(inode, start, end);
        mutex_unlock(&root->fs_info->fs_mutex);
        return ret;
 }
 
        btrfs_cow_one_page(inode, page, PAGE_CACHE_SIZE);
 
-       set_page_extent_mapped(page);
-       set_page_dirty(page);
-
        if (pos > inode->i_size) {
                i_size_write(inode, pos);
                mark_inode_dirty(inode);
        key.objectid = objectid;
        key.offset = 1;
        btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+
        extent_buffer_get(root->node);
        btrfs_cow_block(trans, root, root->node, NULL, 0, &tmp);
        free_extent_buffer(tmp);
-       btrfs_set_root_bytenr(&new_root_item, root->node->start);
-       btrfs_set_root_level(&new_root_item, btrfs_header_level(root->node));
+
+       btrfs_copy_root(trans, root, root->node, &tmp, objectid);
+
+       btrfs_set_root_bytenr(&new_root_item, tmp->start);
+       btrfs_set_root_level(&new_root_item, btrfs_header_level(tmp));
        ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
                                &new_root_item);
+       free_extent_buffer(tmp);
        if (ret)
                goto fail;
 
 
        if (ret)
                goto fail;
-
-       ret = btrfs_inc_root_ref(trans, root, objectid);
-       if (ret)
-               goto fail;
 fail:
        nr = trans->blocks_used;
        err = btrfs_commit_transaction(trans, root);
 
 }
 
 enum {
-       Opt_subvol, Opt_nodatasum, Opt_err,
+       Opt_subvol, Opt_nodatasum, Opt_nodatacow, Opt_err,
 };
 
 static match_table_t tokens = {
        {Opt_subvol, "subvol=%s"},
        {Opt_nodatasum, "nodatasum"},
+       {Opt_nodatacow, "nodatacow"},
        {Opt_err, NULL}
 };
 
        struct btrfs_fs_info *info = NULL;
        substring_t args[MAX_OPT_ARGS];
 
-       if (root)
-               info = root->fs_info;
-
        if (!options)
                return 1;
 
+       /*
+        * strsep changes the string, duplicate it because parse_options
+        * gets called twice
+        */
+       options = kstrdup(options, GFP_NOFS);
+       if (!options)
+               return -ENOMEM;
+
+       if (root)
+               info = root->fs_info;
+
        while ((p = strsep (&options, ",")) != NULL) {
                int token;
                if (!*p)
                token = match_token(p, tokens, args);
                switch (token) {
                case Opt_subvol:
-                       if (subvol_name)
+                       if (subvol_name) {
                                *subvol_name = match_strdup(&args[0]);
+                       }
                        break;
                case Opt_nodatasum:
-                       if (root)
+                       if (info) {
+                               printk("btrfs: setting nodatacsum\n");
                                btrfs_set_opt(info->mount_opt, NODATASUM);
+                       }
+                       break;
+               case Opt_nodatacow:
+                       if (info) {
+                               printk("btrfs: setting nodatacow\n");
+                               btrfs_set_opt(info->mount_opt, NODATACOW);
+                               btrfs_set_opt(info->mount_opt, NODATASUM);
+                       }
                        break;
                default:
-                       return 0;
+                       break;
                }
        }
+       kfree(options);
        return 1;
 }