tristate "Btrfs filesystem (EXPERIMENTAL) Unstable disk format"
        depends on EXPERIMENTAL
        select LIBCRC32C
+       select ZLIB_INFLATE
+       select ZLIB_DEFLATE
        help
          Btrfs is a new filesystem with extents, writable snapshotting,
          support for multiple devices and many more features.
 
           transaction.o inode.o file.o tree-defrag.o \
           extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
           extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
-          ref-cache.o export.o tree-log.o acl.o free-space-cache.o
+          ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \
+          compression.o
 else
 
 # Normal Makefile
 
--- /dev/null
+/*
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/kernel.h>
+#include <linux/bio.h>
+#include <linux/buffer_head.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/time.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/smp_lock.h>
+#include <linux/backing-dev.h>
+#include <linux/mpage.h>
+#include <linux/swap.h>
+#include <linux/writeback.h>
+#include <linux/bit_spinlock.h>
+#include <linux/version.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "btrfs_inode.h"
+#include "volumes.h"
+#include "ordered-data.h"
+#include "compat.h"
+#include "compression.h"
+#include "extent_io.h"
+#include "extent_map.h"
+
+struct compressed_bio {
+       /* number of bios pending for this compressed extent */
+       atomic_t pending_bios;
+
+       /* the pages with the compressed data on them */
+       struct page **compressed_pages;
+
+       /* inode that owns this data */
+       struct inode *inode;
+
+       /* starting offset in the inode for our pages */
+       u64 start;
+
+       /* number of bytes in the inode we're working on */
+       unsigned long len;
+
+       /* number of bytes on disk */
+       unsigned long compressed_len;
+
+       /* number of compressed pages in the array */
+       unsigned long nr_pages;
+
+       /* IO errors */
+       int errors;
+
+       /* for reads, this is the bio we are copying the data into */
+       struct bio *orig_bio;
+};
+
+static struct bio *compressed_bio_alloc(struct block_device *bdev,
+                                       u64 first_byte, gfp_t gfp_flags)
+{
+       struct bio *bio;
+       int nr_vecs;
+
+       nr_vecs = bio_get_nr_vecs(bdev);
+       bio = bio_alloc(gfp_flags, nr_vecs);
+
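+       /* under memory pressure (PF_MEMALLOC) retry with progressively
+        * fewer vecs instead of failing outright
+        */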
+       if (bio == NULL && (current->flags & PF_MEMALLOC)) {
+               while (!bio && (nr_vecs /= 2))
+                       bio = bio_alloc(gfp_flags, nr_vecs);
+       }
+
+       if (bio) {
+               bio->bi_size = 0;
+               bio->bi_bdev = bdev;
+               bio->bi_sector = first_byte >> 9;
+       }
+       return bio;
+}
+
+/* when we finish reading compressed pages from the disk, we
+ * decompress them and then run the bio end_io routines on the
+ * decompressed pages (in the inode address space).
+ *
+ * This allows the checksumming and other IO error handling routines
+ * to work normally
+ *
+ * The compressed pages are freed here, and this function must be
+ * run in process context.
+ */
+static void end_compressed_bio_read(struct bio *bio, int err)
+{
+       struct extent_io_tree *tree;
+       struct compressed_bio *cb = bio->bi_private;
+       struct inode *inode;
+       struct page *page;
+       unsigned long index;
+       int ret;
+
+       if (err)
+               cb->errors = 1;
+
+       /* if there are more bios still pending for this compressed
+        * extent, just exit
+        */
+       if (!atomic_dec_and_test(&cb->pending_bios))
+               goto out;
+
+       /* ok, we're the last bio for this extent, let's start
+        * the decompression.
+        */
+       inode = cb->inode;
+       tree = &BTRFS_I(inode)->io_tree;
+       ret = btrfs_zlib_decompress_biovec(cb->compressed_pages,
+                                       cb->start,
+                                       cb->orig_bio->bi_io_vec,
+                                       cb->orig_bio->bi_vcnt,
+                                       cb->compressed_len);
+       if (ret)
+               cb->errors = 1;
+
+       /* release the compressed pages */
+       for (index = 0; index < cb->nr_pages; index++) {
+               page = cb->compressed_pages[index];
+               page->mapping = NULL;
+               page_cache_release(page);
+       }
+
+       /* do io completion on the original bio */
+       if (cb->errors)
+               bio_io_error(cb->orig_bio);
+       else
+               bio_endio(cb->orig_bio, 0);
+
+       /* finally free the cb struct */
+       kfree(cb->compressed_pages);
+       kfree(cb);
+out:
+       bio_put(bio);
+}
+
+/*
+ * Clear the writeback bits on all of the file
+ * pages for a compressed write
+ */
+static noinline int end_compressed_writeback(struct inode *inode, u64 start,
+                                            unsigned long ram_size)
+{
+       unsigned long index = start >> PAGE_CACHE_SHIFT;
+       unsigned long end_index = (start + ram_size - 1) >> PAGE_CACHE_SHIFT;
+       struct page *pages[16];
+       unsigned long nr_pages = end_index - index + 1;
+       int i;
+       int ret;
+
+       while (nr_pages > 0) {
+               ret = find_get_pages_contig(inode->i_mapping, index,
+                                    min(nr_pages, ARRAY_SIZE(pages)), pages);
+               if (ret == 0) {
+                       nr_pages -= 1;
+                       index += 1;
+                       continue;
+               }
+               for (i = 0; i < ret; i++) {
+                       end_page_writeback(pages[i]);
+                       page_cache_release(pages[i]);
+               }
+               nr_pages -= ret;
+               index += ret;
+       }
+       /* the inode may be gone now */
+       return 0;
+}
+
+/*
+ * do the cleanup once all the compressed pages hit the disk.
+ * This will clear writeback on the file pages and free the compressed
+ * pages.
+ *
+ * This also calls the writeback end hooks for the file pages so that
+ * metadata and checksums can be updated in the file.
+ */
+static void end_compressed_bio_write(struct bio *bio, int err)
+{
+       struct extent_io_tree *tree;
+       struct compressed_bio *cb = bio->bi_private;
+       struct inode *inode;
+       struct page *page;
+       unsigned long index;
+
+       if (err)
+               cb->errors = 1;
+
+       /* if there are more bios still pending for this compressed
+        * extent, just exit
+        */
+       if (!atomic_dec_and_test(&cb->pending_bios))
+               goto out;
+
+       /* ok, we're the last bio for this extent, step one is to
+        * call back into the FS and do all the end_io operations
+        */
+       inode = cb->inode;
+       tree = &BTRFS_I(inode)->io_tree;
+       tree->ops->writepage_end_io_hook(cb->compressed_pages[0],
+                                        cb->start,
+                                        cb->start + cb->len - 1,
+                                        NULL, 1);
+
+       end_compressed_writeback(inode, cb->start, cb->len);
+       /* note, our inode could be gone now */
+
+       /*
+        * release the compressed pages, these came from alloc_page and
+        * are not attached to the inode at all
+        */
+       for (index = 0; index < cb->nr_pages; index++) {
+               page = cb->compressed_pages[index];
+               page->mapping = NULL;
+               page_cache_release(page);
+       }
+
+       /* finally free the cb struct */
+       kfree(cb->compressed_pages);
+       kfree(cb);
+out:
+       bio_put(bio);
+}
+
+/*
+ * worker function to build and submit bios for previously compressed pages.
+ * The corresponding pages in the inode should be marked for writeback
+ * and the compressed pages should hold a reference that is dropped
+ * when the IO is complete.
+ *
+ * This also checksums the file bytes and gets things ready for
+ * the end io hooks.
+ */
+int btrfs_submit_compressed_write(struct inode *inode, u64 start,
+                                unsigned long len, u64 disk_start,
+                                unsigned long compressed_len,
+                                struct page **compressed_pages,
+                                unsigned long nr_pages)
+{
+       struct bio *bio = NULL;
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       struct compressed_bio *cb;
+       unsigned long bytes_left;
+       struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+       int page_index = 0;
+       struct page *page;
+       u64 first_byte = disk_start;
+       struct block_device *bdev;
+       int ret;
+
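+       /* compressed extents must start on a page boundary in the file */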
+       WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1));
+       cb = kmalloc(sizeof(*cb), GFP_NOFS);
+       atomic_set(&cb->pending_bios, 0);
+       cb->errors = 0;
+       cb->inode = inode;
+       cb->start = start;
+       cb->len = len;
+       cb->compressed_pages = compressed_pages;
+       cb->compressed_len = compressed_len;
+       cb->orig_bio = NULL;
+       cb->nr_pages = nr_pages;
+
+       bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+
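+       /* checksum the uncompressed file bytes before building the bios */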
+       ret = btrfs_csum_file_bytes(root, inode, start, len);
+       BUG_ON(ret);
+
+       bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
+       bio->bi_private = cb;
+       bio->bi_end_io = end_compressed_bio_write;
+       atomic_inc(&cb->pending_bios);
+
+       /* create and submit bios for the compressed pages */
+       bytes_left = compressed_len;
+       while (bytes_left > 0) {
+               page = compressed_pages[page_index];
+               page->mapping = inode->i_mapping;
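+               /* if the bio already has data, ask the fs whether this page
+                * can be merged into it; if not we submit below and start
+                * a new bio
+                */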
+               if (bio->bi_size)
+                       ret = io_tree->ops->merge_bio_hook(page, 0,
+                                                          PAGE_CACHE_SIZE,
+                                                          bio, 0);
+               else
+                       ret = 0;
+
+               if (ret || bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) <
+                   PAGE_CACHE_SIZE) {
+                       bio_get(bio);
+
+                       ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+                       BUG_ON(ret);
+
+                       ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
+                       BUG_ON(ret);
+
+                       bio_put(bio);
+
+                       bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
+                       atomic_inc(&cb->pending_bios);
+                       bio->bi_private = cb;
+                       bio->bi_end_io = end_compressed_bio_write;
+                       bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
+               }
+               page_index++;
+               bytes_left -= PAGE_CACHE_SIZE;
+               first_byte += PAGE_CACHE_SIZE;
+       }
+       bio_get(bio);
+
+       ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+       BUG_ON(ret);
+
+       ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
+       BUG_ON(ret);
+
+       bio_put(bio);
+       return 0;
+}
+
+/*
+ * for a compressed read, the bio we get passed has all the inode pages
+ * in it.  We don't actually do IO on those pages but allocate new ones
+ * to hold the compressed pages on disk.
+ *
+ * bio->bi_sector points to the compressed extent on disk
+ * bio->bi_io_vec points to all of the inode pages
+ * bio->bi_vcnt is a count of pages
+ *
+ * After the compressed pages are read, we copy the bytes into the
+ * bio we were passed and then run its end_io handlers.
+ */
+int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
+                                int mirror_num, unsigned long bio_flags)
+{
+       struct extent_io_tree *tree;
+       struct extent_map_tree *em_tree;
+       struct compressed_bio *cb;
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       unsigned long uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
+       unsigned long compressed_len;
+       unsigned long nr_pages;
+       unsigned long page_index;
+       struct page *page;
+       struct block_device *bdev;
+       struct bio *comp_bio;
+       u64 cur_disk_byte = (u64)bio->bi_sector << 9;
+       struct extent_map *em;
+       int ret;
+
+       tree = &BTRFS_I(inode)->io_tree;
+       em_tree = &BTRFS_I(inode)->extent_tree;
+
+       /* we need the actual starting offset of this extent in the file */
+       spin_lock(&em_tree->lock);
+       em = lookup_extent_mapping(em_tree,
+                                  page_offset(bio->bi_io_vec->bv_page),
+                                  PAGE_CACHE_SIZE);
+       spin_unlock(&em_tree->lock);
+
+       cb = kmalloc(sizeof(*cb), GFP_NOFS);
+       atomic_set(&cb->pending_bios, 0);
+       cb->errors = 0;
+       cb->inode = inode;
+
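+       /* the extent map gives the logical start of this compressed extent
+        * in the file and its compressed length on disk
+        */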
+       cb->start = em->start;
+       compressed_len = em->block_len;
+       free_extent_map(em);
+
+       cb->len = uncompressed_len;
+       cb->compressed_len = compressed_len;
+       cb->orig_bio = bio;
+
+       nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) /
+                                PAGE_CACHE_SIZE;
+       cb->compressed_pages = kmalloc(sizeof(struct page *) * nr_pages,
+                                      GFP_NOFS);
+       bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+
+       for (page_index = 0; page_index < nr_pages; page_index++) {
+               cb->compressed_pages[page_index] = alloc_page(GFP_NOFS |
+                                                             __GFP_HIGHMEM);
+       }
+       cb->nr_pages = nr_pages;
+
+       comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS);
+       comp_bio->bi_private = cb;
+       comp_bio->bi_end_io = end_compressed_bio_read;
+       atomic_inc(&cb->pending_bios);
+
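+       /* pack the compressed pages into bios, submitting each bio when it
+        * fills up or the fs refuses to merge another page
+        */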
+       for (page_index = 0; page_index < nr_pages; page_index++) {
+               page = cb->compressed_pages[page_index];
+               page->mapping = inode->i_mapping;
+               if (comp_bio->bi_size)
+                       ret = tree->ops->merge_bio_hook(page, 0,
+                                                       PAGE_CACHE_SIZE,
+                                                       comp_bio, 0);
+               else
+                       ret = 0;
+
+               if (ret || bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0) <
+                   PAGE_CACHE_SIZE) {
+                       bio_get(comp_bio);
+
+                       ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
+                       BUG_ON(ret);
+
+                       ret = btrfs_map_bio(root, READ, comp_bio, 0, 0);
+                       BUG_ON(ret);
+
+                       bio_put(comp_bio);
+
+                       comp_bio = compressed_bio_alloc(bdev, cur_disk_byte,
+                                                       GFP_NOFS);
+                       atomic_inc(&cb->pending_bios);
+                       comp_bio->bi_private = cb;
+                       comp_bio->bi_end_io = end_compressed_bio_read;
+                       bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0);
+               }
+               cur_disk_byte += PAGE_CACHE_SIZE;
+       }
+       bio_get(comp_bio);
+
+       ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
+       BUG_ON(ret);
+
+       ret = btrfs_map_bio(root, READ, comp_bio, 0, 0);
+       BUG_ON(ret);
+
+       bio_put(comp_bio);
+       return 0;
+}
 
--- /dev/null
+/*
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifndef __BTRFS_COMPRESSION_
+#define __BTRFS_COMPRESSION_
+
+int btrfs_zlib_decompress(unsigned char *data_in,
+                         struct page *dest_page,
+                         unsigned long start_byte,
+                         size_t srclen, size_t destlen);
+int btrfs_zlib_compress_pages(struct address_space *mapping,
+                             u64 start, unsigned long len,
+                             struct page **pages,
+                             unsigned long nr_dest_pages,
+                             unsigned long *out_pages,
+                             unsigned long *total_in,
+                             unsigned long *total_out,
+                             unsigned long max_out);
+int btrfs_zlib_decompress_biovec(struct page **pages_in,
+                             u64 disk_start,
+                             struct bio_vec *bvec,
+                             int vcnt,
+                             size_t srclen);
+void btrfs_zlib_exit(void);
+int btrfs_submit_compressed_write(struct inode *inode, u64 start,
+                                 unsigned long len, u64 disk_start,
+                                 unsigned long compressed_len,
+                                 struct page **compressed_pages,
+                                 unsigned long nr_pages);
+int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
+                                int mirror_num, unsigned long bio_flags);
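+
+/*
+ * Illustrative write-path call order (a sketch only; the local variable
+ * names below are examples, not part of this header).  The pages are
+ * compressed first, then submitted once an extent at disk_start has
+ * been reserved:
+ *
+ *     ret = btrfs_zlib_compress_pages(inode->i_mapping, start, len,
+ *                                     pages, nr_dest_pages, &nr_out,
+ *                                     &total_in, &total_out, max_out);
+ *     if (ret == 0)
+ *             ret = btrfs_submit_compressed_write(inode, start, total_in,
+ *                                                 disk_start, total_out,
+ *                                                 pages, nr_out);
+ */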
+#endif
 
        __le32 nsec;
 } __attribute__ ((__packed__));
 
-/*
- * there is no padding here on purpose.  If you want to extent the inode,
- * make a new item type
- */
+typedef enum {
+       BTRFS_COMPRESS_NONE = 0,
+       BTRFS_COMPRESS_ZLIB = 1,
+       BTRFS_COMPRESS_LAST = 2,
+} btrfs_compression_type;
+
+/* we don't understand any encryption methods right now */
+typedef enum {
+       BTRFS_ENCRYPTION_NONE = 0,
+       BTRFS_ENCRYPTION_LAST = 1,
+} btrfs_encryption_type;
+
 struct btrfs_inode_item {
        /* nfs style generation number */
        __le64 generation;
        __le64 rdev;
        __le16 flags;
        __le16 compat_flags;
+
        struct btrfs_timespec atime;
        struct btrfs_timespec ctime;
        struct btrfs_timespec mtime;
 #define BTRFS_FILE_EXTENT_INLINE 1
 
 struct btrfs_file_extent_item {
+       /*
+        * transaction id that created this extent
+        */
        __le64 generation;
+       /*
+        * max number of bytes to hold this extent in ram
+        * when we split a compressed extent we can't know how big
+        * each of the resulting pieces will be.  So, this is
+        * an upper limit on the size of the extent in ram instead of
+        * an exact limit.
+        */
+       __le64 ram_bytes;
+
+       /*
+        * 32 bits for the various ways we might encode the data,
+        * including compression and encryption.  If any of these
+        * are set to something a given disk format doesn't understand
+        * it is treated like an incompat flag for reading and writing,
+        * but not for stat.
+        */
+       u8 compression;
+       u8 encryption;
+       __le16 other_encoding; /* spare for later use */
+
+       /* are we inline data or a real extent? */
        u8 type;
+
        /*
         * disk space consumed by the extent, checksum blocks are included
         * in these numbers
         */
        __le64 offset;
        /*
-        * the logical number of file blocks (no csums included)
+        * the logical number of file blocks (no csums included).  This
+        * always reflects the size uncompressed and without encoding.
         */
        __le64 num_bytes;
+
 } __attribute__ ((__packed__));
 
 struct btrfs_csum_item {
 #define BTRFS_MOUNT_NOBARRIER          (1 << 2)
 #define BTRFS_MOUNT_SSD                        (1 << 3)
 #define BTRFS_MOUNT_DEGRADED           (1 << 4)
+#define BTRFS_MOUNT_COMPRESS           (1 << 5)
 
 #define btrfs_clear_opt(o, opt)                ((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)          ((o) |= BTRFS_MOUNT_##opt)
 #define BTRFS_INODE_NODATASUM          (1 << 0)
 #define BTRFS_INODE_NODATACOW          (1 << 1)
 #define BTRFS_INODE_READONLY           (1 << 2)
+#define BTRFS_INODE_NOCOMPRESS         (1 << 3)
 #define btrfs_clear_flag(inode, flag)  (BTRFS_I(inode)->flags &= \
                                         ~BTRFS_INODE_##flag)
 #define btrfs_set_flag(inode, flag)    (BTRFS_I(inode)->flags |= \
        return offsetof(struct btrfs_file_extent_item, disk_bytenr) + datasize;
 }
 
-static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb,
-                                              struct btrfs_item *e)
-{
-       unsigned long offset;
-       offset = offsetof(struct btrfs_file_extent_item, disk_bytenr);
-       return btrfs_item_size(eb, e) - offset;
-}
-
 BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item,
                   disk_bytenr, 64);
 BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item,
                  offset, 64);
 BTRFS_SETGET_FUNCS(file_extent_num_bytes, struct btrfs_file_extent_item,
                   num_bytes, 64);
+BTRFS_SETGET_FUNCS(file_extent_ram_bytes, struct btrfs_file_extent_item,
+                  ram_bytes, 64);
+BTRFS_SETGET_FUNCS(file_extent_compression, struct btrfs_file_extent_item,
+                  compression, 8);
+BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item,
+                  encryption, 8);
+BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item,
+                  other_encoding, 16);
+
+/* this returns the number of file bytes represented by the inline item.
+ * If an item is compressed, this is the uncompressed size
+ */
+static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb,
+                                              struct btrfs_file_extent_item *e)
+{
+       return btrfs_file_extent_ram_bytes(eb, e);
+}
+
+/*
+ * this returns the number of bytes used by the item on disk, minus the
+ * size of any extent headers.  If a file is compressed on disk, this is
+ * the compressed size
+ */
+static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb,
+                                                   struct btrfs_item *e)
+{
+       unsigned long offset;
+       offset = offsetof(struct btrfs_file_extent_item, disk_bytenr);
+       return btrfs_item_size(eb, e) - offset;
+}
 
 static inline struct btrfs_root *btrfs_sb(struct super_block *sb)
 {
 int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
                          struct bio *bio);
 int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
-                              struct btrfs_root *root,
-                              u64 objectid, u64 pos, u64 disk_offset,
-                              u64 disk_num_bytes,
-                            u64 num_bytes, u64 offset);
+                            struct btrfs_root *root,
+                            u64 objectid, u64 pos,
+                            u64 disk_offset, u64 disk_num_bytes,
+                            u64 num_bytes, u64 offset, u64 ram_bytes,
+                            u8 compression, u8 encryption, u16 other_encoding);
 int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root,
                             struct btrfs_path *path, u64 objectid,
                           struct btrfs_ordered_sum *sums);
 int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
                       struct bio *bio);
+int btrfs_csum_file_bytes(struct btrfs_root *root, struct inode *inode,
+                         u64 start, unsigned long len);
 struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
                                          struct btrfs_root *root,
                                          struct btrfs_path *path,
                                  int namelen);
 
 int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
-                        size_t size, struct bio *bio);
+                        size_t size, struct bio *bio, unsigned long bio_flags);
 
 unsigned long btrfs_force_ra(struct address_space *mapping,
                              struct file_ra_state *ra, struct file *file,
 
        extent_submit_bio_hook_t *submit_bio_hook;
        int rw;
        int mirror_num;
+       unsigned long bio_flags;
        struct btrfs_work work;
 };
 
        }
        em->start = 0;
        em->len = (u64)-1;
+       em->block_len = (u64)-1;
        em->block_start = 0;
        em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
 
                wake_up(&fs_info->async_submit_wait);
 
        async->submit_bio_hook(async->inode, async->rw, async->bio,
-                              async->mirror_num);
+                              async->mirror_num, async->bio_flags);
        kfree(async);
 }
 
 int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
                        int rw, struct bio *bio, int mirror_num,
+                       unsigned long bio_flags,
                        extent_submit_bio_hook_t *submit_bio_hook)
 {
        struct async_submit_bio *async;
        async->submit_bio_hook = submit_bio_hook;
        async->work.func = run_one_async_submit;
        async->work.flags = 0;
+       async->bio_flags = bio_flags;
 
        while(atomic_read(&fs_info->async_submit_draining) &&
              atomic_read(&fs_info->nr_async_submits)) {
 }
 
 static int __btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
-                                int mirror_num)
+                                int mirror_num, unsigned long bio_flags)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
        int ret;
 }
 
 static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
-                                int mirror_num)
+                                int mirror_num, unsigned long bio_flags)
 {
        /*
         * kthread helpers are used to submit writes so that checksumming
         * can happen in parallel across all CPUs
         */
        if (!(rw & (1 << BIO_RW))) {
-               return __btree_submit_bio_hook(inode, rw, bio, mirror_num);
+               return __btree_submit_bio_hook(inode, rw, bio, mirror_num, 0);
        }
        return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
-                                  inode, rw, bio, mirror_num,
+                                  inode, rw, bio, mirror_num, 0,
                                   __btree_submit_bio_hook);
 }
 
        fs_info->btree_inode = new_inode(sb);
        fs_info->btree_inode->i_ino = 1;
        fs_info->btree_inode->i_nlink = 1;
+
        fs_info->thread_pool_size = min(num_online_cpus() + 2, 8);
 
        INIT_LIST_HEAD(&fs_info->ordered_extents);
         */
        btrfs_init_workers(&fs_info->workers, "worker",
                           fs_info->thread_pool_size);
+
        btrfs_init_workers(&fs_info->submit_workers, "submit",
                           min_t(u64, fs_devices->num_devices,
                           fs_info->thread_pool_size));
        }
 
        fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
+       fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
+                                   4 * 1024 * 1024 / PAGE_CACHE_SIZE);
 
        nodesize = btrfs_super_nodesize(disk_super);
        leafsize = btrfs_super_leafsize(disk_super);
 
                        int metadata);
 int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
                        int rw, struct bio *bio, int mirror_num,
+                       unsigned long bio_flags,
                        extent_submit_bio_hook_t *submit_bio_hook);
 int btrfs_congested_async(struct btrfs_fs_info *info, int iodone);
 unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info);
 
 
        em->start = extent_key->objectid - offset;
        em->len = extent_key->offset;
+       em->block_len = extent_key->offset;
        em->block_start = extent_key->objectid;
        em->bdev = root->fs_info->fs_devices->latest_bdev;
        set_bit(EXTENT_FLAG_PINNED, &em->flags);
 };
 
 struct disk_extent {
+       u64 ram_bytes;
        u64 disk_bytenr;
        u64 disk_num_bytes;
        u64 offset;
        u64 num_bytes;
+       u8 compression;
+       u8 encryption;
+       u16 other_encoding;
 };
 
 static int is_cowonly_root(u64 root_objectid)
                        btrfs_file_extent_disk_num_bytes(leaf, fi);
                exts[nr].offset = btrfs_file_extent_offset(leaf, fi);
                exts[nr].num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
+               exts[nr].ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
+               exts[nr].compression = btrfs_file_extent_compression(leaf, fi);
+               exts[nr].encryption = btrfs_file_extent_encryption(leaf, fi);
+               exts[nr].other_encoding = btrfs_file_extent_other_encoding(leaf,
+                                                                          fi);
                WARN_ON(exts[nr].offset > 0);
                WARN_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes);
 
                                                new_extents[0].disk_bytenr);
                        btrfs_set_file_extent_disk_num_bytes(leaf, fi,
                                                new_extents[0].disk_num_bytes);
+                       btrfs_set_file_extent_ram_bytes(leaf, fi,
+                                               new_extents[0].ram_bytes);
                        ext_offset += new_extents[0].offset;
                        btrfs_set_file_extent_offset(leaf, fi, ext_offset);
                        btrfs_mark_buffer_dirty(leaf);
                                                new_extents[i].disk_bytenr);
                                btrfs_set_file_extent_disk_num_bytes(leaf, fi,
                                                new_extents[i].disk_num_bytes);
+                               btrfs_set_file_extent_ram_bytes(leaf, fi,
+                                               new_extents[i].ram_bytes);
+
+                               btrfs_set_file_extent_compression(leaf, fi,
+                                               new_extents[i].compression);
+                               btrfs_set_file_extent_encryption(leaf, fi,
+                                               new_extents[i].encryption);
+                               btrfs_set_file_extent_other_encoding(leaf, fi,
+                                               new_extents[i].other_encoding);
+
                                btrfs_set_file_extent_num_bytes(leaf, fi,
                                                        extent_len);
                                ext_offset += new_extents[i].offset;
                ref->extents[ext_index].num_bytes = new_extent->disk_num_bytes;
 
                btrfs_set_file_extent_generation(leaf, fi, trans->transid);
+               btrfs_set_file_extent_ram_bytes(leaf, fi,
+                                               new_extent->ram_bytes);
                btrfs_set_file_extent_disk_bytenr(leaf, fi,
                                                new_extent->disk_bytenr);
                btrfs_set_file_extent_disk_num_bytes(leaf, fi,
        BUG_ON(err);
 
        err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0,
-                                      group->key.offset, 0);
+                                      group->key.offset, 0, group->key.offset,
+                                      0, 0, 0);
        BUG_ON(err);
 
        inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
 
 static LIST_HEAD(buffers);
 static LIST_HEAD(states);
 
+#define LEAK_DEBUG 1
 #ifdef LEAK_DEBUG
 static spinlock_t leak_lock = SPIN_LOCK_UNLOCKED;
 #endif
  *
  * 1 is returned if we find something, 0 if nothing was in the tree
  */
-static noinline u64 find_lock_delalloc_range(struct extent_io_tree *tree,
-                                            u64 *start, u64 *end, u64 max_bytes)
+static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
+                                       u64 *start, u64 *end, u64 max_bytes)
 {
        struct rb_node *node;
        struct extent_state *state;
        u64 total_bytes = 0;
 
        spin_lock_irq(&tree->lock);
+
        /*
         * this search will find all the extents that end after
         * our range starts.
         */
-search_again:
        node = tree_search(tree, cur_start);
        if (!node) {
                if (!found)
                                *end = state->end;
                        goto out;
                }
-               if (!found && !(state->state & EXTENT_BOUNDARY)) {
-                       struct extent_state *prev_state;
-                       struct rb_node *prev_node = node;
-                       while(1) {
-                               prev_node = rb_prev(prev_node);
-                               if (!prev_node)
-                                       break;
-                               prev_state = rb_entry(prev_node,
-                                                     struct extent_state,
-                                                     rb_node);
-                               if ((prev_state->end + 1 != state->start) ||
-                                   !(prev_state->state & EXTENT_DELALLOC))
-                                       break;
-                               if ((cur_start - prev_state->start) * 2 >
-                                    max_bytes)
-                                       break;
-                               state = prev_state;
-                               node = prev_node;
-                       }
-               }
-               if (state->state & EXTENT_LOCKED) {
-                       DEFINE_WAIT(wait);
-                       atomic_inc(&state->refs);
-                       prepare_to_wait(&state->wq, &wait,
-                                       TASK_UNINTERRUPTIBLE);
-                       spin_unlock_irq(&tree->lock);
-                       schedule();
-                       spin_lock_irq(&tree->lock);
-                       finish_wait(&state->wq, &wait);
-                       free_extent_state(state);
-                       goto search_again;
-               }
-               set_state_cb(tree, state, EXTENT_LOCKED);
-               state->state |= EXTENT_LOCKED;
                if (!found)
                        *start = state->start;
                found++;
        return found;
 }
 
+static noinline int __unlock_for_delalloc(struct inode *inode,
+                                         struct page *locked_page,
+                                         u64 start, u64 end)
+{
+       int ret;
+       struct page *pages[16];
+       unsigned long index = start >> PAGE_CACHE_SHIFT;
+       unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+       unsigned long nr_pages = end_index - index + 1;
+       int i;
+
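+       /* if the whole range is inside locked_page there is nothing to unlock */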
+       if (index == locked_page->index && end_index == index)
+               return 0;
+
+       while (nr_pages > 0) {
+               ret = find_get_pages_contig(inode->i_mapping, index,
+                                    min(nr_pages, ARRAY_SIZE(pages)), pages);
+               for (i = 0; i < ret; i++) {
+                       if (pages[i] != locked_page)
+                               unlock_page(pages[i]);
+                       page_cache_release(pages[i]);
+               }
+               nr_pages -= ret;
+               index += ret;
+               cond_resched();
+       }
+       return 0;
+}
+
+static noinline int lock_delalloc_pages(struct inode *inode,
+                                       struct page *locked_page,
+                                       u64 delalloc_start,
+                                       u64 delalloc_end)
+{
+       unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT;
+       unsigned long start_index = index;
+       unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT;
+       unsigned long pages_locked = 0;
+       struct page *pages[16];
+       unsigned long nrpages;
+       int ret;
+       int i;
+
+       /* the caller is responsible for locking the start index */
+       if (index == locked_page->index && index == end_index)
+               return 0;
+
+       /* skip the page at the start index */
+       nrpages = end_index - index + 1;
+       while (nrpages > 0) {
+               ret = find_get_pages_contig(inode->i_mapping, index,
+                                    min(nrpages, ARRAY_SIZE(pages)), pages);
+               if (ret == 0) {
+                       ret = -EAGAIN;
+                       goto done;
+               }
+               /* now we have an array of pages, lock them all */
+               for (i = 0; i < ret; i++) {
+                       /*
+                        * the caller is taking responsibility for
+                        * locked_page
+                        */
+                       if (pages[i] != locked_page)
+                               lock_page(pages[i]);
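+                       /* drop only the reference taken by
+                        * find_get_pages_contig; the page stays locked
+                        */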
+                       page_cache_release(pages[i]);
+               }
+               pages_locked += ret;
+               nrpages -= ret;
+               index += ret;
+               cond_resched();
+       }
+       ret = 0;
+done:
+       if (ret && pages_locked) {
+               __unlock_for_delalloc(inode, locked_page,
+                             delalloc_start,
+                             ((u64)(start_index + pages_locked - 1)) <<
+                             PAGE_CACHE_SHIFT);
+       }
+       return ret;
+}
+
+/*
+ * find a contiguous range of bytes in the file marked as delalloc, not
+ * more than 'max_bytes'.  start and end are used to return the range.
+ *
+ * 1 is returned if we find something, 0 if nothing was in the tree
+ */
+static noinline u64 find_lock_delalloc_range(struct inode *inode,
+                                            struct extent_io_tree *tree,
+                                            struct page *locked_page,
+                                            u64 *start, u64 *end,
+                                            u64 max_bytes)
+{
+       u64 delalloc_start;
+       u64 delalloc_end;
+       u64 found;
+       int ret;
+       int loops = 0;
+
+again:
+       /* step one, find a bunch of delalloc bytes starting at start */
+       delalloc_start = *start;
+       delalloc_end = 0;
+       found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
+                                   max_bytes);
+       if (!found) {
+               *start = delalloc_start;
+               *end = delalloc_end;
+               return found;
+       }
+
+       /*
+        * make sure to limit the number of pages we try to lock down
+        * if we're looping.
+        */
+       if (delalloc_end + 1 - delalloc_start > max_bytes && loops) {
+               delalloc_end = (delalloc_start + PAGE_CACHE_SIZE - 1) &
+                       ~((u64)PAGE_CACHE_SIZE - 1);
+       }
+       /* step two, lock all the pages after the page that has start */
+       ret = lock_delalloc_pages(inode, locked_page,
+                                 delalloc_start, delalloc_end);
+       if (ret == -EAGAIN) {
+               /* some of the pages are gone, let's avoid looping by
+                * shortening the size of the delalloc range we're searching
+                */
+               if (!loops) {
+                       unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
+                       max_bytes = PAGE_CACHE_SIZE - offset;
+                       loops = 1;
+                       goto again;
+               } else {
+                       found = 0;
+                       goto out_failed;
+               }
+       }
+       BUG_ON(ret);
+
+       /* step three, lock the state bits for the whole range */
+       lock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
+
+       /* then test to make sure it is all still delalloc */
+       ret = test_range_bit(tree, delalloc_start, delalloc_end,
+                            EXTENT_DELALLOC, 1);
+       if (!ret) {
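+               /* part of the range stopped being delalloc while we were
+                * locking pages; unlock everything and search again
+                */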
+               unlock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
+               __unlock_for_delalloc(inode, locked_page,
+                             delalloc_start, delalloc_end);
+               cond_resched();
+               goto again;
+       }
+       *start = delalloc_start;
+       *end = delalloc_end;
+out_failed:
+       return found;
+}
+
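+/*
+ * clear the locked and delalloc bits for this range in the io tree, then
+ * unlock and release every page in the range except locked_page, optionally
+ * clearing dirty and setting or ending writeback on each page
+ */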
+int extent_clear_unlock_delalloc(struct inode *inode,
+                               struct extent_io_tree *tree,
+                               u64 start, u64 end, struct page *locked_page,
+                               int clear_dirty, int set_writeback,
+                               int end_writeback)
+{
+       int ret;
+       struct page *pages[16];
+       unsigned long index = start >> PAGE_CACHE_SHIFT;
+       unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+       unsigned long nr_pages = end_index - index + 1;
+       int i;
+       int clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC;
+
+       if (clear_dirty)
+               clear_bits |= EXTENT_DIRTY;
+
+       clear_extent_bit(tree, start, end, clear_bits, 1, 0, GFP_NOFS);
+
+       while (nr_pages > 0) {
+               ret = find_get_pages_contig(inode->i_mapping, index,
+                                    min(nr_pages, ARRAY_SIZE(pages)), pages);
+               for (i = 0; i < ret; i++) {
+                       if (pages[i] == locked_page) {
+                               page_cache_release(pages[i]);
+                               continue;
+                       }
+                       if (clear_dirty)
+                               clear_page_dirty_for_io(pages[i]);
+                       if (set_writeback)
+                               set_page_writeback(pages[i]);
+                       if (end_writeback)
+                               end_page_writeback(pages[i]);
+                       unlock_page(pages[i]);
+                       page_cache_release(pages[i]);
+               }
+               nr_pages -= ret;
+               index += ret;
+               cond_resched();
+       }
+       return 0;
+}
+EXPORT_SYMBOL(extent_clear_unlock_delalloc);
+
 /*
  * count the number of bytes in the tree that have a given bit(s)
  * set.  This can be fairly slow, except for EXTENT_DIRTY which is
        return bio;
 }
 
-static int submit_one_bio(int rw, struct bio *bio, int mirror_num)
+static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
+                         unsigned long bio_flags)
 {
        int ret = 0;
        struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
        struct page *page = bvec->bv_page;
        struct extent_io_tree *tree = bio->bi_private;
-       struct rb_node *node;
-       struct extent_state *state;
        u64 start;
        u64 end;
 
        start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
        end = start + bvec->bv_len - 1;
 
-       spin_lock_irq(&tree->lock);
-       node = __etree_search(tree, start, NULL, NULL);
-       BUG_ON(!node);
-       state = rb_entry(node, struct extent_state, rb_node);
-       while(state->end < end) {
-               node = rb_next(node);
-               state = rb_entry(node, struct extent_state, rb_node);
-       }
-       BUG_ON(state->end != end);
-       spin_unlock_irq(&tree->lock);
-
        bio->bi_private = NULL;
 
        bio_get(bio);
 
        if (tree->ops && tree->ops->submit_bio_hook)
                tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
-                                          mirror_num);
+                                          mirror_num, bio_flags);
        else
                submit_bio(rw, bio);
        if (bio_flagged(bio, BIO_EOPNOTSUPP))
                              struct bio **bio_ret,
                              unsigned long max_pages,
                              bio_end_io_t end_io_func,
-                             int mirror_num)
+                             int mirror_num,
+                             unsigned long prev_bio_flags,
+                             unsigned long bio_flags)
 {
        int ret = 0;
        struct bio *bio;
        int nr;
+       int contig = 0;
+       int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED;
+       int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
+       size_t page_size = min(size, PAGE_CACHE_SIZE);
 
        if (bio_ret && *bio_ret) {
                bio = *bio_ret;
-               if (bio->bi_sector + (bio->bi_size >> 9) != sector ||
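+               /* every page of a compressed extent points at the start of
+                * the extent on disk, so contiguous there means same sector
+                */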
+               if (old_compressed)
+                       contig = bio->bi_sector == sector;
+               else
+                       contig = bio->bi_sector + (bio->bi_size >> 9) ==
+                               sector;
+
+               if (prev_bio_flags != bio_flags || !contig ||
                    (tree->ops && tree->ops->merge_bio_hook &&
-                    tree->ops->merge_bio_hook(page, offset, size, bio)) ||
-                   bio_add_page(bio, page, size, offset) < size) {
-                       ret = submit_one_bio(rw, bio, mirror_num);
+                    tree->ops->merge_bio_hook(page, offset, page_size, bio,
+                                              bio_flags)) ||
+                   bio_add_page(bio, page, page_size, offset) < page_size) {
+                       ret = submit_one_bio(rw, bio, mirror_num,
+                                            prev_bio_flags);
                        bio = NULL;
                } else {
                        return 0;
                }
        }
-       nr = bio_get_nr_vecs(bdev);
+       if (this_compressed)
+               nr = BIO_MAX_PAGES;
+       else
+               nr = bio_get_nr_vecs(bdev);
+
        bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
        if (!bio) {
                printk("failed to allocate bio nr %d\n", nr);
        }
 
-
-       bio_add_page(bio, page, size, offset);
+       bio_add_page(bio, page, page_size, offset);
        bio->bi_end_io = end_io_func;
        bio->bi_private = tree;
 
        if (bio_ret) {
                *bio_ret = bio;
        } else {
-               ret = submit_one_bio(rw, bio, mirror_num);
+               ret = submit_one_bio(rw, bio, mirror_num, bio_flags);
        }
 
        return ret;
 static int __extent_read_full_page(struct extent_io_tree *tree,
                                   struct page *page,
                                   get_extent_t *get_extent,
-                                  struct bio **bio, int mirror_num)
+                                  struct bio **bio, int mirror_num,
+                                  unsigned long *bio_flags)
 {
        struct inode *inode = page->mapping->host;
        u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
        int nr = 0;
        size_t page_offset = 0;
        size_t iosize;
+       size_t disk_io_size;
        size_t blocksize = inode->i_sb->s_blocksize;
+       unsigned long this_bio_flag = 0;
 
        set_page_extent_mapped(page);
 
        end = page_end;
        lock_extent(tree, start, end, GFP_NOFS);
 
+       if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
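+               /* zero any tail of the last page past eof so the read
+                * never exposes stale data
+                */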
+               char *userpage;
+               size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1);
+
+               if (zero_offset) {
+                       iosize = PAGE_CACHE_SIZE - zero_offset;
+                       userpage = kmap_atomic(page, KM_USER0);
+                       memset(userpage + zero_offset, 0, iosize);
+                       flush_dcache_page(page);
+                       kunmap_atomic(userpage, KM_USER0);
+               }
+       }
        while (cur <= end) {
                if (cur >= last_byte) {
                        char *userpage;
                }
                BUG_ON(end < cur);
 
+               if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+                       this_bio_flag = EXTENT_BIO_COMPRESSED;
+
                iosize = min(extent_map_end(em) - cur, end - cur + 1);
                cur_end = min(extent_map_end(em) - 1, end);
                iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
-               sector = (em->block_start + extent_offset) >> 9;
+               if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
+                       disk_io_size = em->block_len;
+                       sector = em->block_start >> 9;
+               } else {
+                       sector = (em->block_start + extent_offset) >> 9;
+                       disk_io_size = iosize;
+               }
                bdev = em->bdev;
                block_start = em->block_start;
                free_extent_map(em);
                        unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
                        pnr -= page->index;
                        ret = submit_extent_page(READ, tree, page,
-                                        sector, iosize, page_offset,
+                                        sector, disk_io_size, page_offset,
                                         bdev, bio, pnr,
-                                        end_bio_extent_readpage, mirror_num);
+                                        end_bio_extent_readpage, mirror_num,
+                                        *bio_flags,
+                                        this_bio_flag);
                        nr++;
+                       *bio_flags = this_bio_flag;
                }
                if (ret)
                        SetPageError(page);
                            get_extent_t *get_extent)
 {
        struct bio *bio = NULL;
+       unsigned long bio_flags = 0;
        int ret;
 
-       ret = __extent_read_full_page(tree, page, get_extent, &bio, 0);
+       ret = __extent_read_full_page(tree, page, get_extent, &bio, 0,
+                                     &bio_flags);
        if (bio)
-               submit_one_bio(READ, bio, 0);
+               submit_one_bio(READ, bio, 0, bio_flags);
        return ret;
 }
 EXPORT_SYMBOL(extent_read_full_page);
        unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
        u64 nr_delalloc;
        u64 delalloc_end;
+       int page_started;
+       int compressed;
 
        WARN_ON(!PageLocked(page));
        pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
 
        delalloc_start = start;
        delalloc_end = 0;
+       page_started = 0;
        while(delalloc_end < page_end) {
-               nr_delalloc = find_lock_delalloc_range(tree, &delalloc_start,
+               nr_delalloc = find_lock_delalloc_range(inode, tree,
+                                                      page,
+                                                      &delalloc_start,
                                                       &delalloc_end,
                                                       128 * 1024 * 1024);
                if (nr_delalloc == 0) {
                        delalloc_start = delalloc_end + 1;
                        continue;
                }
-               tree->ops->fill_delalloc(inode, delalloc_start,
-                                        delalloc_end);
-               clear_extent_bit(tree, delalloc_start,
-                                delalloc_end,
-                                EXTENT_LOCKED | EXTENT_DELALLOC,
-                                1, 0, GFP_NOFS);
+               tree->ops->fill_delalloc(inode, page, delalloc_start,
+                                        delalloc_end, &page_started);
                delalloc_start = delalloc_end + 1;
        }
+
+       /* did the fill delalloc function already unlock and start the IO? */
+       if (page_started)
+               return 0;
+
        lock_extent(tree, start, page_end, GFP_NOFS);
        unlock_start = start;
 
        if (tree->ops && tree->ops->writepage_start_hook) {
-               ret = tree->ops->writepage_start_hook(page, start, page_end);
+               ret = tree->ops->writepage_start_hook(page, start,
+                                                     page_end);
                if (ret == -EAGAIN) {
                        unlock_extent(tree, start, page_end, GFP_NOFS);
                        redirty_page_for_writepage(wbc, page);
                sector = (em->block_start + extent_offset) >> 9;
                bdev = em->bdev;
                block_start = em->block_start;
+               compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
                free_extent_map(em);
                em = NULL;
 
-               if (block_start == EXTENT_MAP_HOLE ||
+               /*
+                * compressed and inline extents are written through other
+                * paths in the FS
+                */
+               if (compressed || block_start == EXTENT_MAP_HOLE ||
                    block_start == EXTENT_MAP_INLINE) {
                        clear_extent_dirty(tree, cur,
                                           cur + iosize - 1, GFP_NOFS);
                        unlock_extent(tree, unlock_start, cur + iosize -1,
                                      GFP_NOFS);
 
-                       if (tree->ops && tree->ops->writepage_end_io_hook)
+                       /*
+                        * end_io notification does not happen here for
+                        * compressed extents
+                        */
+                       if (!compressed && tree->ops &&
+                           tree->ops->writepage_end_io_hook)
                                tree->ops->writepage_end_io_hook(page, cur,
                                                         cur + iosize - 1,
                                                         NULL, 1);
-                       cur = cur + iosize;
+                       else if (compressed) {
+                               /* we don't want to end_page_writeback on
+                                * a compressed extent.  this happens
+                                * elsewhere
+                                */
+                               nr++;
+                       }
+
+                       cur += iosize;
                        pg_offset += iosize;
                        unlock_start = cur;
                        continue;
                }
-
                /* leave this out until we have a page_mkwrite call */
                if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
                                   EXTENT_DIRTY, 0)) {
                        pg_offset += iosize;
                        continue;
                }
+
                clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
                if (tree->ops && tree->ops->writepage_io_hook) {
                        ret = tree->ops->writepage_io_hook(page, cur,
                        ret = submit_extent_page(WRITE, tree, page, sector,
                                                 iosize, pg_offset, bdev,
                                                 &epd->bio, max_nr,
-                                                end_bio_extent_writepage, 0);
+                                                end_bio_extent_writepage,
+                                                0, 0, 0);
                        if (ret)
                                SetPageError(page);
                }
        extent_write_cache_pages(tree, mapping, &wbc_writepages,
                                 __extent_writepage, &epd);
        if (epd.bio) {
-               submit_one_bio(WRITE, epd.bio, 0);
+               submit_one_bio(WRITE, epd.bio, 0, 0);
        }
        return ret;
 }
        ret = extent_write_cache_pages(tree, mapping, wbc,
                                       __extent_writepage, &epd);
        if (epd.bio) {
-               submit_one_bio(WRITE, epd.bio, 0);
+               submit_one_bio(WRITE, epd.bio, 0, 0);
        }
        return ret;
 }
        struct bio *bio = NULL;
        unsigned page_idx;
        struct pagevec pvec;
+       unsigned long bio_flags = 0;
 
        pagevec_init(&pvec, 0);
        for (page_idx = 0; page_idx < nr_pages; page_idx++) {
                        if (!pagevec_add(&pvec, page))
                                __pagevec_lru_add(&pvec);
                        __extent_read_full_page(tree, page, get_extent,
-                                               &bio, 0);
+                                               &bio, 0, &bio_flags);
                }
                page_cache_release(page);
        }
                __pagevec_lru_add(&pvec);
        BUG_ON(!list_empty(pages));
        if (bio)
-               submit_one_bio(READ, bio, 0);
+               submit_one_bio(READ, bio, 0, bio_flags);
        return 0;
 }
 EXPORT_SYMBOL(extent_readpages);
                        ret = submit_extent_page(READ, tree, page,
                                         sector, iosize, page_offset, em->bdev,
                                         NULL, 1,
-                                        end_bio_extent_preparewrite, 0);
+                                        end_bio_extent_preparewrite, 0,
+                                        0, 0);
                        iocount++;
                        block_start = block_start + iosize;
                } else {
                        }
                        if (!test_range_bit(tree, em->start,
                                            extent_map_end(em) - 1,
-                                           EXTENT_LOCKED, 0)) {
+                                           EXTENT_LOCKED | EXTENT_WRITEBACK |
+                                           EXTENT_ORDERED,
+                                           0)) {
                                remove_extent_mapping(map, em);
                                /* once for the rb tree */
                                free_extent_map(em);
        int inc_all_pages = 0;
        unsigned long num_pages;
        struct bio *bio = NULL;
+       unsigned long bio_flags = 0;
 
        if (eb->flags & EXTENT_UPTODATE)
                return 0;
                        ClearPageError(page);
                        err = __extent_read_full_page(tree, page,
                                                      get_extent, &bio,
-                                                     mirror_num);
+                                                     mirror_num, &bio_flags);
                        if (err) {
                                ret = err;
                                printk("err %d from __extent_read_full_page\n", ret);
        }
 
        if (bio)
-               submit_one_bio(READ, bio, mirror_num);
+               submit_one_bio(READ, bio, mirror_num, bio_flags);
 
        if (ret || !wait) {
                if (ret)
 
 #define EXTENT_BOUNDARY (1 << 11)
 #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
 
+/* flags for bio submission */
+#define EXTENT_BIO_COMPRESSED 1
+
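
A minimal sketch of how the new flag is meant to travel: the read side sets EXTENT_BIO_COMPRESSED when the extent map carries EXTENT_FLAG_COMPRESSED, and the flags ride along into the submit hook. The wrapper function below is illustrative only and is not part of the patch:

static void example_submit_read(struct extent_map *em, struct bio *bio,
				int mirror_num)
{
	unsigned long bio_flags = 0;

	/* extent maps for compressed extents have this bit set when the
	 * map is created in cow_file_range()/btrfs_get_extent()
	 */
	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
		bio_flags |= EXTENT_BIO_COMPRESSED;

	/* submit_one_bio() now forwards the flags to the submit_bio_hook,
	 * which routes compressed reads to btrfs_submit_compressed_read()
	 */
	submit_one_bio(READ, bio, mirror_num, bio_flags);
}
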
 /*
  * page->private values.  Every page that is controlled by the extent
  * map has page->private set to one.
 struct extent_state;
 
 typedef        int (extent_submit_bio_hook_t)(struct inode *inode, int rw,
-                                      struct bio *bio, int mirror_num);
+                                      struct bio *bio, int mirror_num,
+                                      unsigned long bio_flags);
 struct extent_io_ops {
-       int (*fill_delalloc)(struct inode *inode, u64 start, u64 end);
+       int (*fill_delalloc)(struct inode *inode, struct page *locked_page,
+                            u64 start, u64 end, int *page_started);
        int (*writepage_start_hook)(struct page *page, u64 start, u64 end);
        int (*writepage_io_hook)(struct page *page, u64 start, u64 end);
        extent_submit_bio_hook_t *submit_bio_hook;
        int (*merge_bio_hook)(struct page *page, unsigned long offset,
-                             size_t size, struct bio *bio);
+                             size_t size, struct bio *bio,
+                             unsigned long bio_flags);
        int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
        int (*readpage_io_failed_hook)(struct bio *bio, struct page *page,
                                       u64 start, u64 end,
 int release_extent_buffer_tail_pages(struct extent_buffer *eb);
 int extent_range_uptodate(struct extent_io_tree *tree,
                          u64 start, u64 end);
+int extent_clear_unlock_delalloc(struct inode *inode,
+                               struct extent_io_tree *tree,
+                               u64 start, u64 end, struct page *locked_page,
+                               int clear_dirty, int set_writeback,
+                               int clear_writeback);
 #endif
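
A usage sketch of the new helper, mirroring the two call patterns that cow_file_range() uses later in this patch; the trailing flags are (clear_dirty, set_writeback, clear_writeback):

	/* compressed extents: unlock the whole range, clear dirty and set
	 * writeback now; writeback is ended later when the compressed
	 * bios complete
	 */
	extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
				     start, start + ram_size - 1, NULL,
				     1, 1, 0);

	/* uncompressed extents: leave locked_page locked for the caller
	 * and leave the dirty/writeback bits alone
	 */
	extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
				     start, start + ram_size - 1,
				     locked_page, 0, 0, 0);
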
 
        if (test_bit(EXTENT_FLAG_PINNED, &prev->flags))
                return 0;
 
+       /*
+        * don't merge compressed extents, we need to know their
+        * actual size
+        */
+       if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags))
+               return 0;
+
        if (extent_map_end(prev) == next->start &&
            prev->flags == next->flags &&
            prev->bdev == next->bdev &&
                if (rb && mergable_maps(merge, em)) {
                        em->start = merge->start;
                        em->len += merge->len;
+                       em->block_len += merge->block_len;
                        em->block_start = merge->block_start;
                        merge->in_tree = 0;
                        rb_erase(&merge->rb_node, &tree->map);
                merge = rb_entry(rb, struct extent_map, rb_node);
        if (rb && mergable_maps(em, merge)) {
                em->len += merge->len;
+               em->block_len += merge->len;
                rb_erase(&merge->rb_node, &tree->map);
                merge->in_tree = 0;
                free_extent_map(merge);
 
 
 /* bits for the flags field */
 #define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */
+#define EXTENT_FLAG_COMPRESSED 1
 
 struct extent_map {
        struct rb_node rb_node;
        u64 start;
        u64 len;
        u64 block_start;
+       u64 block_len;
        unsigned long flags;
        struct block_device *bdev;
        atomic_t refs;
 
 static inline u64 extent_map_block_end(struct extent_map *em)
 {
-       if (em->block_start + em->len < em->block_start)
+       if (em->block_start + em->block_len < em->block_start)
                return (u64)-1;
-       return em->block_start + em->len;
+       return em->block_start + em->block_len;
 }
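
A worked example with assumed sizes makes the new split between len and block_len concrete:

/*
 * Example (assumed numbers): a 128K file extent whose data compressed
 * down to 8K on disk would carry
 *
 *	em->len       = 131072   bytes of file data covered
 *	em->block_len = 8192     bytes actually allocated on disk
 *
 * so extent_map_block_end() advances by block_len, not len, and
 * mergable_maps() refuses to merge such extents so block_len keeps
 * describing the real on-disk chunk.
 */
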
 
 void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask);
 
                             struct btrfs_root *root,
                             u64 objectid, u64 pos,
                             u64 disk_offset, u64 disk_num_bytes,
-                            u64 num_bytes, u64 offset)
+                            u64 num_bytes, u64 offset, u64 ram_bytes,
+                            u8 compression, u8 encryption, u16 other_encoding)
 {
        int ret = 0;
        struct btrfs_file_extent_item *item;
        btrfs_set_file_extent_disk_num_bytes(leaf, item, disk_num_bytes);
        btrfs_set_file_extent_offset(leaf, item, offset);
        btrfs_set_file_extent_num_bytes(leaf, item, num_bytes);
+       btrfs_set_file_extent_ram_bytes(leaf, item, ram_bytes);
        btrfs_set_file_extent_generation(leaf, item, trans->transid);
        btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
+       btrfs_set_file_extent_compression(leaf, item, compression);
+       btrfs_set_file_extent_encryption(leaf, item, encryption);
+       btrfs_set_file_extent_other_encoding(leaf, item, other_encoding);
+
        btrfs_mark_buffer_dirty(leaf);
 out:
        btrfs_free_path(path);
        return 0;
 }
 
+int btrfs_csum_file_bytes(struct btrfs_root *root, struct inode *inode,
+                         u64 start, unsigned long len)
+{
+       struct btrfs_ordered_sum *sums;
+       struct btrfs_sector_sum *sector_sum;
+       struct btrfs_ordered_extent *ordered;
+       char *data;
+       struct page *page;
+       unsigned long total_bytes = 0;
+       unsigned long this_sum_bytes = 0;
+
+       sums = kzalloc(btrfs_ordered_sum_size(root, len), GFP_NOFS);
+       if (!sums)
+               return -ENOMEM;
+
+       sector_sum = sums->sums;
+       sums->file_offset = start;
+       sums->len = len;
+       INIT_LIST_HEAD(&sums->list);
+       ordered = btrfs_lookup_ordered_extent(inode, sums->file_offset);
+       BUG_ON(!ordered);
+
+       while(len > 0) {
+               if (start >= ordered->file_offset + ordered->len ||
+                   start < ordered->file_offset) {
+                       sums->len = this_sum_bytes;
+                       this_sum_bytes = 0;
+                       btrfs_add_ordered_sum(inode, ordered, sums);
+                       btrfs_put_ordered_extent(ordered);
+
+                       sums = kzalloc(btrfs_ordered_sum_size(root, len),
+                                      GFP_NOFS);
+                       BUG_ON(!sums);
+                       sector_sum = sums->sums;
+                       sums->len = len;
+                       sums->file_offset = start;
+                       ordered = btrfs_lookup_ordered_extent(inode,
+                                                     sums->file_offset);
+                       BUG_ON(!ordered);
+               }
+
+               page = find_get_page(inode->i_mapping,
+                                    start >> PAGE_CACHE_SHIFT);
+
+               data = kmap_atomic(page, KM_USER0);
+               sector_sum->sum = ~(u32)0;
+               sector_sum->sum = btrfs_csum_data(root, data, sector_sum->sum,
+                                                 PAGE_CACHE_SIZE);
+               kunmap_atomic(data, KM_USER0);
+               btrfs_csum_final(sector_sum->sum,
+                                (char *)&sector_sum->sum);
+               sector_sum->offset = page_offset(page);
+               page_cache_release(page);
+
+               sector_sum++;
+               total_bytes += PAGE_CACHE_SIZE;
+               this_sum_bytes += PAGE_CACHE_SIZE;
+               start += PAGE_CACHE_SIZE;
+
+               WARN_ON(len < PAGE_CACHE_SIZE);
+               len -= PAGE_CACHE_SIZE;
+       }
+       btrfs_add_ordered_sum(inode, ordered, sums);
+       btrfs_put_ordered_extent(ordered);
+       return 0;
+}
+
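
A hedged usage sketch: the compressed write path is expected to call this helper over the uncompressed page-cache data before the compressed bios are built, so len should be page aligned and the whole range must already be covered by ordered extents (both requirements follow from the loop above):

	/* checksum the uncompressed data for [start, start + num_bytes);
	 * the sums are attached to the ordered extent(s) covering the range
	 */
	ret = btrfs_csum_file_bytes(root, inode, start, num_bytes);
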
 int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
                       struct bio *bio)
 {
 
        }
 }
 
-/* this does all the hard work for inserting an inline extent into
- * the btree.  Any existing inline extent is extended as required to make room,
- * otherwise things are inserted as required into the btree
- */
-static int noinline insert_inline_extent(struct btrfs_trans_handle *trans,
-                               struct btrfs_root *root, struct inode *inode,
-                               u64 offset, size_t size,
-                               struct page **pages, size_t page_offset,
-                               int num_pages)
-{
-       struct btrfs_key key;
-       struct btrfs_path *path;
-       struct extent_buffer *leaf;
-       char *kaddr;
-       unsigned long ptr;
-       struct btrfs_file_extent_item *ei;
-       struct page *page;
-       u32 datasize;
-       int err = 0;
-       int ret;
-       int i;
-       ssize_t cur_size;
-
-       path = btrfs_alloc_path();
-       if (!path)
-               return -ENOMEM;
-
-       btrfs_set_trans_block_group(trans, inode);
-
-       key.objectid = inode->i_ino;
-       key.offset = offset;
-       btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
-
-       ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
-       if (ret < 0) {
-               err = ret;
-               goto fail;
-       }
-       if (ret == 1) {
-               struct btrfs_key found_key;
-
-               if (path->slots[0] == 0)
-                       goto insert;
-
-               path->slots[0]--;
-               leaf = path->nodes[0];
-               btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-
-               if (found_key.objectid != inode->i_ino)
-                       goto insert;
-
-               if (found_key.type != BTRFS_EXTENT_DATA_KEY)
-                       goto insert;
-               ei = btrfs_item_ptr(leaf, path->slots[0],
-                                   struct btrfs_file_extent_item);
-
-               if (btrfs_file_extent_type(leaf, ei) !=
-                   BTRFS_FILE_EXTENT_INLINE) {
-                       goto insert;
-               }
-               btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
-               ret = 0;
-       }
-       if (ret == 0) {
-               u32 found_size;
-               u64 found_end;
-
-               leaf = path->nodes[0];
-               ei = btrfs_item_ptr(leaf, path->slots[0],
-                                   struct btrfs_file_extent_item);
-
-               if (btrfs_file_extent_type(leaf, ei) !=
-                   BTRFS_FILE_EXTENT_INLINE) {
-                       err = ret;
-                       btrfs_print_leaf(root, leaf);
-                       printk("found wasn't inline offset %Lu inode %lu\n",
-                              offset, inode->i_ino);
-                       goto fail;
-               }
-               found_size = btrfs_file_extent_inline_len(leaf,
-                                         btrfs_item_nr(leaf, path->slots[0]));
-               found_end = key.offset + found_size;
-
-               if (found_end < offset + size) {
-                       btrfs_release_path(root, path);
-                       ret = btrfs_search_slot(trans, root, &key, path,
-                                               offset + size - found_end, 1);
-                       BUG_ON(ret != 0);
-
-                       ret = btrfs_extend_item(trans, root, path,
-                                               offset + size - found_end);
-                       if (ret) {
-                               err = ret;
-                               goto fail;
-                       }
-                       leaf = path->nodes[0];
-                       ei = btrfs_item_ptr(leaf, path->slots[0],
-                                           struct btrfs_file_extent_item);
-                       inode_add_bytes(inode, offset + size - found_end);
-               }
-               if (found_end < offset) {
-                       ptr = btrfs_file_extent_inline_start(ei) + found_size;
-                       memset_extent_buffer(leaf, 0, ptr, offset - found_end);
-               }
-       } else {
-insert:
-               btrfs_release_path(root, path);
-               datasize = offset + size - key.offset;
-               inode_add_bytes(inode, datasize);
-               datasize = btrfs_file_extent_calc_inline_size(datasize);
-               ret = btrfs_insert_empty_item(trans, root, path, &key,
-                                             datasize);
-               if (ret) {
-                       err = ret;
-                       printk("got bad ret %d\n", ret);
-                       goto fail;
-               }
-               leaf = path->nodes[0];
-               ei = btrfs_item_ptr(leaf, path->slots[0],
-                                   struct btrfs_file_extent_item);
-               btrfs_set_file_extent_generation(leaf, ei, trans->transid);
-               btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
-       }
-       ptr = btrfs_file_extent_inline_start(ei) + offset - key.offset;
-
-       cur_size = size;
-       i = 0;
-       while (size > 0) {
-               page = pages[i];
-               kaddr = kmap_atomic(page, KM_USER0);
-               cur_size = min_t(size_t, PAGE_CACHE_SIZE - page_offset, size);
-               write_extent_buffer(leaf, kaddr + page_offset, ptr, cur_size);
-               kunmap_atomic(kaddr, KM_USER0);
-               page_offset = 0;
-               ptr += cur_size;
-               size -= cur_size;
-               if (i >= num_pages) {
-                       printk("i %d num_pages %d\n", i, num_pages);
-               }
-               i++;
-       }
-       btrfs_mark_buffer_dirty(leaf);
-fail:
-       btrfs_free_path(path);
-       return err;
-}
-
 /*
  * after copy_from_user, pages need to be dirtied and we need to make
  * sure holes are created between the current EOF and the start of
        u64 start_pos;
        u64 end_of_last_block;
        u64 end_pos = pos + write_bytes;
-       u64 inline_size;
-       int did_inline = 0;
        loff_t isize = i_size_read(inode);
 
        start_pos = pos & ~((u64)root->sectorsize - 1);
                        err = btrfs_insert_file_extent(trans, root,
                                                       inode->i_ino,
                                                       last_pos_in_file,
-                                                      0, 0, hole_size, 0);
+                                                      0, 0, hole_size, 0,
+                                                      hole_size, 0, 0, 0);
                        btrfs_drop_extent_cache(inode, last_pos_in_file,
                                        last_pos_in_file + hole_size - 1, 0);
                        mutex_unlock(&BTRFS_I(inode)->extent_mutex);
                        goto failed;
        }
 
-       /*
-        * either allocate an extent for the new bytes or setup the key
-        * to show we are doing inline data in the extent
+       /* check for reserved extents on each page, we don't want
+        * to reset the delalloc bit on things that already have
+        * extents reserved.
         */
-       inline_size = end_pos;
-       if (isize >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
-           inline_size > root->fs_info->max_inline ||
-           (inline_size & (root->sectorsize -1)) == 0 ||
-           inline_size >= BTRFS_MAX_INLINE_DATA_SIZE(root)) {
-               /* check for reserved extents on each page, we don't want
-                * to reset the delalloc bit on things that already have
-                * extents reserved.
-                */
-               btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
-               for (i = 0; i < num_pages; i++) {
-                       struct page *p = pages[i];
-                       SetPageUptodate(p);
-                       ClearPageChecked(p);
-                       set_page_dirty(p);
-               }
-       } else {
-               u64 aligned_end;
-               /* step one, delete the existing extents in this range */
-               aligned_end = (pos + write_bytes + root->sectorsize - 1) &
-                       ~((u64)root->sectorsize - 1);
-               mutex_lock(&BTRFS_I(inode)->extent_mutex);
-               err = btrfs_drop_extents(trans, root, inode, start_pos,
-                                        aligned_end, aligned_end, &hint_byte);
-               if (err)
-                       goto failed;
-               if (isize > inline_size)
-                       inline_size = min_t(u64, isize, aligned_end);
-               inline_size -= start_pos;
-               err = insert_inline_extent(trans, root, inode, start_pos,
-                                          inline_size, pages, 0, num_pages);
-               btrfs_drop_extent_cache(inode, start_pos, aligned_end - 1, 0);
-               BUG_ON(err);
-               mutex_unlock(&BTRFS_I(inode)->extent_mutex);
-
-               /*
-                * an ugly way to do all the prop accounting around
-                * the page bits and mapping tags
-                */
-               set_page_writeback(pages[0]);
-               end_page_writeback(pages[0]);
-               did_inline = 1;
+       btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
+       for (i = 0; i < num_pages; i++) {
+               struct page *p = pages[i];
+               SetPageUptodate(p);
+               ClearPageChecked(p);
+               set_page_dirty(p);
        }
        if (end_pos > isize) {
                i_size_write(inode, end_pos);
-               if (did_inline)
-                       BTRFS_I(inode)->disk_i_size = end_pos;
                btrfs_update_inode(trans, root, inode);
        }
 failed:
        int ret;
        int testend = 1;
        unsigned long flags;
+       int compressed = 0;
 
        WARN_ON(end < start);
        if (end == (u64)-1) {
                        free_extent_map(em);
                        continue;
                }
+               compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
                clear_bit(EXTENT_FLAG_PINNED, &em->flags);
                remove_extent_mapping(em_tree, em);
 
                        split->start = em->start;
                        split->len = start - em->start;
                        split->block_start = em->block_start;
+
+                       if (compressed)
+                               split->block_len = em->block_len;
+                       else
+                               split->block_len = split->len;
+
                        split->bdev = em->bdev;
                        split->flags = flags;
                        ret = add_extent_mapping(em_tree, split);
                        split->bdev = em->bdev;
                        split->flags = flags;
 
-                       split->block_start = em->block_start + diff;
+                       if (compressed) {
+                               split->block_len = em->block_len;
+                               split->block_start = em->block_start;
+                       } else {
+                               split->block_len = split->len;
+                               split->block_start = em->block_start + diff;
+                       }
 
                        ret = add_extent_mapping(em_tree, split);
                        BUG_ON(ret);
                        struct btrfs_item *item;
                        item = btrfs_item_nr(leaf, slot);
                        extent_end = found_key.offset +
-                            btrfs_file_extent_inline_len(leaf, item);
+                            btrfs_file_extent_inline_len(leaf, extent);
                        extent_end = (extent_end + root->sectorsize - 1) &
                                ~((u64)root->sectorsize -1 );
                }
        u64 extent_end = 0;
        u64 search_start = start;
        u64 leaf_start;
+       u64 ram_bytes = 0;
+       u8 compression = 0;
+       u8 encryption = 0;
+       u16 other_encoding = 0;
        u64 root_gen;
        u64 root_owner;
        struct extent_buffer *leaf;
        int recow;
        int ret;
 
+       inline_limit = 0;
        btrfs_drop_extent_cache(inode, start, end - 1, 0);
 
        path = btrfs_alloc_path();
                        extent = btrfs_item_ptr(leaf, slot,
                                                struct btrfs_file_extent_item);
                        found_type = btrfs_file_extent_type(leaf, extent);
+                       compression = btrfs_file_extent_compression(leaf,
+                                                                   extent);
+                       encryption = btrfs_file_extent_encryption(leaf,
+                                                                 extent);
+                       other_encoding = btrfs_file_extent_other_encoding(leaf,
+                                                                 extent);
                        if (found_type == BTRFS_FILE_EXTENT_REG) {
                                extent_end =
                                     btrfs_file_extent_disk_bytenr(leaf,
 
                                extent_end = key.offset +
                                     btrfs_file_extent_num_bytes(leaf, extent);
+                               ram_bytes = btrfs_file_extent_ram_bytes(leaf,
+                                                               extent);
                                found_extent = 1;
                        } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
-                               struct btrfs_item *item;
-                               item = btrfs_item_nr(leaf, slot);
                                found_inline = 1;
                                extent_end = key.offset +
-                                    btrfs_file_extent_inline_len(leaf, item);
+                                    btrfs_file_extent_inline_len(leaf, extent);
                        }
                } else {
                        extent_end = search_start;
                        search_start = (extent_end + mask) & ~mask;
                } else
                        search_start = extent_end;
-               if (end <= extent_end && start >= key.offset && found_inline) {
+
+               if (end <= extent_end && start >= key.offset && found_inline)
                        *hint_byte = EXTENT_MAP_INLINE;
-                       goto out;
-               }
 
                if (found_extent) {
                        read_extent_buffer(leaf, &old, (unsigned long)extent,
                        write_extent_buffer(leaf, &old,
                                            (unsigned long)extent, sizeof(old));
 
+                       btrfs_set_file_extent_compression(leaf, extent,
+                                                         compression);
+                       btrfs_set_file_extent_encryption(leaf, extent,
+                                                        encryption);
+                       btrfs_set_file_extent_other_encoding(leaf, extent,
+                                                            other_encoding);
                        btrfs_set_file_extent_offset(leaf, extent,
                                    le64_to_cpu(old.offset) + end - key.offset);
                        WARN_ON(le64_to_cpu(old.num_bytes) <
                                (extent_end - end));
                        btrfs_set_file_extent_num_bytes(leaf, extent,
                                                        extent_end - end);
+
+                       /*
+                        * set the ram bytes to the size of the full extent
+                        * before splitting.  This is a worst case flag,
+                        * but it's the best we can do because we don't know
+                        * how splitting affects compression
+                        */
+                       btrfs_set_file_extent_ram_bytes(leaf, extent,
+                                                       ram_bytes);
                        btrfs_set_file_extent_type(leaf, extent,
                                                   BTRFS_FILE_EXTENT_REG);
 
 
 #include "compat.h"
 #include "tree-log.h"
 #include "ref-cache.h"
+#include "compression.h"
 
 struct btrfs_iget_args {
        u64 ino;
 };
 
 static void btrfs_truncate(struct inode *inode);
+static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end);
 
 /*
  * a very lame attempt at stopping writes when the FS is 85% full.  There
        return ret;
 }
 
+/*
+ * this does all the hard work for inserting an inline extent into
+ * the btree.  The caller should have done a btrfs_drop_extents so that
+ * no overlapping inline items exist in the btree
+ */
+static int noinline insert_inline_extent(struct btrfs_trans_handle *trans,
+                               struct btrfs_root *root, struct inode *inode,
+                               u64 start, size_t size, size_t compressed_size,
+                               struct page **compressed_pages)
+{
+       struct btrfs_key key;
+       struct btrfs_path *path;
+       struct extent_buffer *leaf;
+       struct page *page = NULL;
+       char *kaddr;
+       unsigned long ptr;
+       struct btrfs_file_extent_item *ei;
+       int err = 0;
+       int ret;
+       size_t cur_size = size;
+       size_t datasize;
+       unsigned long offset;
+       int use_compress = 0;
+
+       if (compressed_size && compressed_pages) {
+               use_compress = 1;
+               cur_size = compressed_size;
+       }
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       btrfs_set_trans_block_group(trans, inode);
+
+       key.objectid = inode->i_ino;
+       key.offset = start;
+       btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
+       inode_add_bytes(inode, size);
+       datasize = btrfs_file_extent_calc_inline_size(cur_size);
+
+       ret = btrfs_insert_empty_item(trans, root, path, &key,
+                                     datasize);
+       if (ret) {
+               err = ret;
+               printk("got bad ret %d\n", ret);
+               goto fail;
+       }
+       leaf = path->nodes[0];
+       ei = btrfs_item_ptr(leaf, path->slots[0],
+                           struct btrfs_file_extent_item);
+       btrfs_set_file_extent_generation(leaf, ei, trans->transid);
+       btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
+       btrfs_set_file_extent_encryption(leaf, ei, 0);
+       btrfs_set_file_extent_other_encoding(leaf, ei, 0);
+       btrfs_set_file_extent_ram_bytes(leaf, ei, size);
+       ptr = btrfs_file_extent_inline_start(ei);
+
+       if (use_compress) {
+               struct page *cpage;
+               int i = 0;
+               while(compressed_size > 0) {
+                       cpage = compressed_pages[i];
+                       cur_size = min(compressed_size,
+                                      PAGE_CACHE_SIZE);
+
+                       kaddr = kmap(cpage);
+                       write_extent_buffer(leaf, kaddr, ptr, cur_size);
+                       kunmap(cpage);
+
+                       i++;
+                       ptr += cur_size;
+                       compressed_size -= cur_size;
+               }
+               btrfs_set_file_extent_compression(leaf, ei,
+                                                 BTRFS_COMPRESS_ZLIB);
+       } else {
+               page = find_get_page(inode->i_mapping,
+                                    start >> PAGE_CACHE_SHIFT);
+               btrfs_set_file_extent_compression(leaf, ei, 0);
+               kaddr = kmap_atomic(page, KM_USER0);
+               offset = start & (PAGE_CACHE_SIZE - 1);
+               write_extent_buffer(leaf, kaddr + offset, ptr, size);
+               kunmap_atomic(kaddr, KM_USER0);
+               page_cache_release(page);
+       }
+       btrfs_mark_buffer_dirty(leaf);
+       btrfs_free_path(path);
+
+       BTRFS_I(inode)->disk_i_size = inode->i_size;
+       btrfs_update_inode(trans, root, inode);
+       return 0;
+fail:
+       btrfs_free_path(path);
+       return err;
+}
+
+
+/*
+ * conditionally insert an inline extent into the file.  This
+ * does the checks required to make sure the data is small enough
+ * to fit as an inline extent.
+ */
+static int cow_file_range_inline(struct btrfs_trans_handle *trans,
+                                struct btrfs_root *root,
+                                struct inode *inode, u64 start, u64 end,
+                                size_t compressed_size,
+                                struct page **compressed_pages)
+{
+       u64 isize = i_size_read(inode);
+       u64 actual_end = min(end + 1, isize);
+       u64 inline_len = actual_end - start;
+       u64 aligned_end = (end + root->sectorsize - 1) &
+                       ~((u64)root->sectorsize - 1);
+       u64 hint_byte;
+       u64 data_len = inline_len;
+       int ret;
+
+       if (compressed_size)
+               data_len = compressed_size;
+
+       if (start > 0 ||
+           data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
+           (!compressed_size &&
+           (actual_end & (root->sectorsize - 1)) == 0) ||
+           end + 1 < isize ||
+           data_len > root->fs_info->max_inline) {
+               return 1;
+       }
+
+       mutex_lock(&BTRFS_I(inode)->extent_mutex);
+       ret = btrfs_drop_extents(trans, root, inode, start,
+                                aligned_end, aligned_end, &hint_byte);
+       BUG_ON(ret);
+
+       if (isize > actual_end)
+               inline_len = min_t(u64, isize, actual_end);
+       ret = insert_inline_extent(trans, root, inode, start,
+                                  inline_len, compressed_size,
+                                  compressed_pages);
+       BUG_ON(ret);
+       btrfs_drop_extent_cache(inode, start, aligned_end, 0);
+       mutex_unlock(&BTRFS_I(inode)->extent_mutex);
+       return 0;
+}
+
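
To make the gating above concrete, here is an assumed example of a write that does qualify for an inline extent:

/*
 * Assumed example: a 3000 byte file written from offset 0 with
 * compression enabled:
 *
 *	start = 0, end = 4095, isize = 3000
 *	actual_end = min(end + 1, isize) = 3000
 *	data_len   = compressed_size (or 3000 if it didn't compress)
 *
 * The extent is only inlined when start == 0, data_len stays under
 * both BTRFS_MAX_INLINE_DATA_SIZE() and fs_info->max_inline, the
 * write reaches i_size (end + 1 >= isize), and an uncompressed
 * payload is not an exact multiple of the sectorsize.
 */
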
 /*
  * when extent_io.c finds a delayed allocation range in the file,
  * the call backs end up in this code.  The basic idea is to
  * allocate extents on disk for the range, and create ordered data structs
  * in ram to track those extents.
+ *
+ * locked_page is the page that writepage had locked already.  We use
+ * it to make sure we don't do extra locks or unlocks.
+ *
+ * *page_started is set to one if we unlock locked_page and do everything
+ * required to start IO on it.  It may be clean and already done with
+ * IO when we return.
  */
-static int cow_file_range(struct inode *inode, u64 start, u64 end)
+static int cow_file_range(struct inode *inode, struct page *locked_page,
+                         u64 start, u64 end, int *page_started)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_trans_handle *trans;
        u64 alloc_hint = 0;
        u64 num_bytes;
+       unsigned long ram_size;
+       u64 orig_start;
+       u64 disk_num_bytes;
        u64 cur_alloc_size;
        u64 blocksize = root->sectorsize;
-       u64 orig_num_bytes;
+       u64 actual_end;
        struct btrfs_key ins;
        struct extent_map *em;
        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
        int ret = 0;
+       struct page **pages = NULL;
+       unsigned long nr_pages;
+       unsigned long nr_pages_ret = 0;
+       unsigned long total_compressed = 0;
+       unsigned long total_in = 0;
+       unsigned long max_compressed = 128 * 1024;
+       unsigned long max_uncompressed = 256 * 1024;
+       int i;
+       int will_compress;
 
        trans = btrfs_join_transaction(root, 1);
        BUG_ON(!trans);
        btrfs_set_trans_block_group(trans, inode);
+       orig_start = start;
+
+       /*
+        * compression made this loop a bit ugly, but the basic idea is to
+        * compress some pages but keep the total size of the compressed
+        * extent relatively small.  If compression is off, this goto target
+        * is never used.
+        */
+again:
+       will_compress = 0;
+       nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
+       nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);
 
+       actual_end = min_t(u64, i_size_read(inode), end + 1);
+       total_compressed = actual_end - start;
+
+       /* we want to make sure that amount of ram required to uncompress
+        * an extent is reasonable, so we limit the total size in ram
+        * of a compressed extent to 256k
+        */
+       total_compressed = min(total_compressed, max_uncompressed);
        num_bytes = (end - start + blocksize) & ~(blocksize - 1);
        num_bytes = max(blocksize,  num_bytes);
-       orig_num_bytes = num_bytes;
+       disk_num_bytes = num_bytes;
+       total_in = 0;
+       ret = 0;
 
-       if (alloc_hint == EXTENT_MAP_INLINE)
-               goto out;
+       /* we do compression for mount -o compress and when the
+        * inode has not been flagged as nocompress
+        */
+       if (!btrfs_test_flag(inode, NOCOMPRESS) &&
+           btrfs_test_opt(root, COMPRESS)) {
+               WARN_ON(pages);
+               pages = kmalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
+
+               /* we want to make sure the amount of IO required to satisfy
+                * a random read is reasonably small, so we limit the size
+                * of a compressed extent to 128k
+                */
+               ret = btrfs_zlib_compress_pages(inode->i_mapping, start,
+                                               total_compressed, pages,
+                                               nr_pages, &nr_pages_ret,
+                                               &total_in,
+                                               &total_compressed,
+                                               max_compressed);
+
+               if (!ret) {
+                       unsigned long offset = total_compressed &
+                               (PAGE_CACHE_SIZE - 1);
+                       struct page *page = pages[nr_pages_ret - 1];
+                       char *kaddr;
+
+                       /* zero the tail end of the last page, we might be
+                        * sending it down to disk
+                        */
+                       if (offset) {
+                               kaddr = kmap_atomic(page, KM_USER0);
+                               memset(kaddr + offset, 0,
+                                      PAGE_CACHE_SIZE - offset);
+                               kunmap_atomic(kaddr, KM_USER0);
+                       }
+                       will_compress = 1;
+               }
+       }
+       if (start == 0) {
+               /* lets try to make an inline extent */
+               if (ret || total_in < (end - start + 1)) {
+                       /* we didn't compress the entire range, try
+                        * to make an uncompressed inline extent.  This
+                        * is almost sure to fail, but maybe inline sizes
+                        * will get bigger later
+                        */
+                       ret = cow_file_range_inline(trans, root, inode,
+                                                   start, end, 0, NULL);
+               } else {
+                       ret = cow_file_range_inline(trans, root, inode,
+                                                   start, end,
+                                                   total_compressed, pages);
+               }
+               if (ret == 0) {
+                       extent_clear_unlock_delalloc(inode,
+                                                    &BTRFS_I(inode)->io_tree,
+                                                    start, end, NULL,
+                                                    1, 1, 1);
+                       *page_started = 1;
+                       ret = 0;
+                       goto free_pages_out;
+               }
+       }
+
+       if (will_compress) {
+               /*
+                * we aren't doing an inline extent, so round the compressed
+                * size up to a block size boundary so the allocator does
+                * sane things
+                */
+               total_compressed = (total_compressed + blocksize - 1) &
+                       ~(blocksize - 1);
+
+               /*
+                * one last check to make sure the compression is really a
+                * win, compare the page count read with the blocks on disk
+                */
+               total_in = (total_in + PAGE_CACHE_SIZE - 1) &
+                       ~(PAGE_CACHE_SIZE - 1);
+               if (total_compressed >= total_in) {
+                       will_compress = 0;
+               } else {
+                       disk_num_bytes = total_compressed;
+                       num_bytes = total_in;
+               }
+       }
+       if (!will_compress && pages) {
+               /*
+                * the compression code ran but failed to make things smaller,
+                * free any pages it allocated and our page pointer array
+                */
+               for (i = 0; i < nr_pages_ret; i++) {
+                       page_cache_release(pages[i]);
+               }
+               kfree(pages);
+               pages = NULL;
+               total_compressed = 0;
+               nr_pages_ret = 0;
+
+               /* flag the file so we don't compress in the future */
+               btrfs_set_flag(inode, NOCOMPRESS);
+       }
+
+       BUG_ON(disk_num_bytes >
+              btrfs_super_total_bytes(&root->fs_info->super_copy));
 
-       BUG_ON(num_bytes > btrfs_super_total_bytes(&root->fs_info->super_copy));
        mutex_lock(&BTRFS_I(inode)->extent_mutex);
        btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
        mutex_unlock(&BTRFS_I(inode)->extent_mutex);
 
-       while(num_bytes > 0) {
-               cur_alloc_size = min(num_bytes, root->fs_info->max_extent);
+       while(disk_num_bytes > 0) {
+               unsigned long min_bytes;
+
+               /*
+                * the max size of a compressed extent is pretty small,
+                * make the code a little less complex by forcing
+                * the allocator to find a whole compressed extent at once
+                */
+               if (will_compress)
+                       min_bytes = disk_num_bytes;
+               else
+                       min_bytes = root->sectorsize;
+
+               cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent);
                ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
-                                          root->sectorsize, 0, alloc_hint,
+                                          min_bytes, 0, alloc_hint,
                                           (u64)-1, &ins, 1);
                if (ret) {
                        WARN_ON(1);
-                       goto out;
+                       goto free_pages_out_fail;
                }
                em = alloc_extent_map(GFP_NOFS);
                em->start = start;
-               em->len = ins.offset;
+
+               if (will_compress) {
+                       ram_size = num_bytes;
+                       em->len = num_bytes;
+               } else {
+                       /* ramsize == disk size */
+                       ram_size = ins.offset;
+                       em->len = ins.offset;
+               }
+
                em->block_start = ins.objectid;
+               em->block_len = ins.offset;
                em->bdev = root->fs_info->fs_devices->latest_bdev;
+
                mutex_lock(&BTRFS_I(inode)->extent_mutex);
                set_bit(EXTENT_FLAG_PINNED, &em->flags);
+
+               if (will_compress)
+                       set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+
                while(1) {
                        spin_lock(&em_tree->lock);
                        ret = add_extent_mapping(em_tree, em);
                                break;
                        }
                        btrfs_drop_extent_cache(inode, start,
-                                               start + ins.offset - 1, 0);
+                                               start + ram_size - 1, 0);
                }
                mutex_unlock(&BTRFS_I(inode)->extent_mutex);
 
                cur_alloc_size = ins.offset;
                ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
-                                              ins.offset, 0);
+                                              ram_size, cur_alloc_size, 0,
+                                              will_compress);
                BUG_ON(ret);
-               if (num_bytes < cur_alloc_size) {
-                       printk("num_bytes %Lu cur_alloc %Lu\n", num_bytes,
+
+               if (disk_num_bytes < cur_alloc_size) {
+                       printk("num_bytes %Lu cur_alloc %Lu\n", disk_num_bytes,
                               cur_alloc_size);
                        break;
                }
+
+               if (will_compress) {
+                       /*
+                        * we're doing compression and we need to submit
+                        * the compressed extents down to the device.
+                        *
+                        * We lock down all the file pages, clearing their
+                        * dirty bits and setting them writeback.  Everyone
+                        * that wants to modify the page will wait on the
+                        * ordered extent above.
+                        *
+                        * The writeback bits on the file pages are
+                        * cleared when the compressed pages are on disk
+                        */
+                       btrfs_end_transaction(trans, root);
+
+                       if (start <= page_offset(locked_page) &&
+                           page_offset(locked_page) < start + ram_size) {
+                               *page_started = 1;
+                       }
+
+                       extent_clear_unlock_delalloc(inode,
+                                                    &BTRFS_I(inode)->io_tree,
+                                                    start,
+                                                    start + ram_size - 1,
+                                                    NULL, 1, 1, 0);
+
+                       ret = btrfs_submit_compressed_write(inode, start,
+                                                ram_size, ins.objectid,
+                                                cur_alloc_size, pages,
+                                                nr_pages_ret);
+
+                       BUG_ON(ret);
+                       trans = btrfs_join_transaction(root, 1);
+                       if (start + ram_size < end) {
+                               start += ram_size;
+                               alloc_hint = ins.objectid + ins.offset;
+                               /* pages will be freed at end_bio time */
+                               pages = NULL;
+                               goto again;
+                       } else {
+                               /* we've written everything, time to go */
+                               break;
+                       }
+               }
+               /* we're not doing compressed IO, don't unlock the first
+                * page (which the caller expects to stay locked), don't
+                * clear any dirty bits and don't set any writeback bits
+                */
+               extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
+                                            start, start + ram_size - 1,
+                                            locked_page, 0, 0, 0);
+               disk_num_bytes -= cur_alloc_size;
                num_bytes -= cur_alloc_size;
                alloc_hint = ins.objectid + ins.offset;
                start += cur_alloc_size;
        }
+
+       ret = 0;
 out:
        btrfs_end_transaction(trans, root);
+
        return ret;
+
+free_pages_out_fail:
+       extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
+                                    start, end, locked_page, 0, 0, 0);
+free_pages_out:
+       for (i = 0; i < nr_pages_ret; i++)
+               page_cache_release(pages[i]);
+       if (pages)
+               kfree(pages);
+
+       goto out;
 }
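
A condensed summary of the control flow that compression adds to cow_file_range(), as implemented above:

/*
 * again:
 *	compress up to 256K of input with zlib when mount -o compress is
 *	set and the inode is not flagged NOCOMPRESS
 *	start == 0: try cow_file_range_inline(); done if that succeeds
 *	compression didn't shrink the data: free the pages, flag the
 *	inode NOCOMPRESS and fall back to a plain allocation
 *	reserve the extent (a single whole extent when compressing),
 *	insert the extent_map and the ordered extent
 *	compressed:   unlock/clean the pages, btrfs_submit_compressed_write(),
 *		      then goto again for the rest of the range
 *	uncompressed: clear delalloc but keep locked_page locked for the
 *		      caller
 */
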
 
 /*
  * If no cow copies or snapshots exist, we write directly to the existing
  * blocks on disk
  */
-static int run_delalloc_nocow(struct inode *inode, u64 start, u64 end)
+static int run_delalloc_nocow(struct inode *inode, struct page *locked_page,
+                             u64 start, u64 end, int *page_started)
 {
        u64 extent_start;
        u64 extent_end;
                extent_end = extent_start + extent_num_bytes;
                err = 0;
 
+               if (btrfs_file_extent_compression(leaf, item) ||
+                   btrfs_file_extent_encryption(leaf, item) ||
+                   btrfs_file_extent_other_encoding(leaf, item))
+                       goto not_found;
+
                if (loops && start != extent_start)
                        goto not_found;
 
                bytenr += btrfs_file_extent_offset(leaf, item);
                extent_num_bytes = min(end + 1, extent_end) - start;
                ret = btrfs_add_ordered_extent(inode, start, bytenr,
-                                               extent_num_bytes, 1);
+                                               extent_num_bytes,
+                                               extent_num_bytes, 1, 0);
                if (ret) {
                        err = ret;
                        goto out;
 not_found:
                btrfs_end_transaction(trans, root);
                btrfs_free_path(path);
-               return cow_file_range(inode, start, end);
+               return cow_file_range(inode, locked_page, start, end,
+                                     page_started);
        }
 out:
        WARN_ON(err);
 /*
  * extent_io.c call back to do delayed allocation processing
  */
-static int run_delalloc_range(struct inode *inode, u64 start, u64 end)
+static int run_delalloc_range(struct inode *inode, struct page *locked_page,
+                             u64 start, u64 end, int *page_started)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
        int ret;
 
        if (btrfs_test_opt(root, NODATACOW) ||
            btrfs_test_flag(inode, NODATACOW))
-               ret = run_delalloc_nocow(inode, start, end);
+               ret = run_delalloc_nocow(inode, locked_page, start, end,
+                                        page_started);
        else
-               ret = cow_file_range(inode, start, end);
+               ret = cow_file_range(inode, locked_page, start, end,
+                                    page_started);
 
        return ret;
 }
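
For orientation, a sketch of how the extent_io.c side is expected to invoke the widened fill_delalloc hook; the surrounding writepage code is not part of this section, so the local names here are assumptions:

	int page_started = 0;

	/* hand the delalloc range plus the page writepage already locked
	 * to the filesystem; the hook reports whether it took over IO on it
	 */
	if (tree->ops && tree->ops->fill_delalloc)
		tree->ops->fill_delalloc(inode, locked_page, delalloc_start,
					 delalloc_end, &page_started);

	if (page_started) {
		/* the hook unlocked locked_page and started (or finished)
		 * IO on it, so the normal writepage path must skip that page
		 */
	}
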
  * we don't create bios that span stripes or chunks
  */
 int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
-                        size_t size, struct bio *bio)
+                        size_t size, struct bio *bio,
+                        unsigned long bio_flags)
 {
        struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
        struct btrfs_mapping_tree *map_tree;
  * are inserted into the btree
  */
 int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
-                         int mirror_num)
+                         int mirror_num, unsigned long bio_flags)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
        int ret = 0;
  * or reading the csums from the tree before a read
  */
 int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
-                         int mirror_num)
+                         int mirror_num, unsigned long bio_flags)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
        int ret = 0;
 
        if (!(rw & (1 << BIO_RW))) {
                btrfs_lookup_bio_sums(root, inode, bio);
+
+               if (bio_flags & EXTENT_BIO_COMPRESSED) {
+                       return btrfs_submit_compressed_read(inode, bio,
+                                                   mirror_num, bio_flags);
+               }
+
                goto mapit;
        }
        return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
                                   inode, rw, bio, mirror_num,
-                                  __btrfs_submit_bio_hook);
+                                  bio_flags, __btrfs_submit_bio_hook);
 mapit:
        return btrfs_map_bio(root, rw, bio, mirror_num, 0);
 }
  * good idea.  This causes problems because we want to make sure COW
  * properly happens and the data=ordered rules are followed.
  *
- * In our case any range that doesn't have the EXTENT_ORDERED bit set
+ * In our case any range that doesn't have the ORDERED bit set
  * hasn't been properly setup for IO.  We kick off an async process
  * to fix it up.  The async helper will wait for ordered extents, set
  * the delalloc bit and make it safe to write the page.
        btrfs_set_file_extent_disk_bytenr(leaf, extent_item,
                                          ordered_extent->start);
        btrfs_set_file_extent_disk_num_bytes(leaf, extent_item,
-                                            ordered_extent->len);
+                                            ordered_extent->disk_len);
        btrfs_set_file_extent_offset(leaf, extent_item, 0);
+
+       if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
+               btrfs_set_file_extent_compression(leaf, extent_item, 1);
+       else
+               btrfs_set_file_extent_compression(leaf, extent_item, 0);
+       btrfs_set_file_extent_encryption(leaf, extent_item, 0);
+       btrfs_set_file_extent_other_encoding(leaf, extent_item, 0);
+
+       /* ram bytes = extent_num_bytes for now */
        btrfs_set_file_extent_num_bytes(leaf, extent_item,
                                        ordered_extent->len);
+       btrfs_set_file_extent_ram_bytes(leaf, extent_item,
+                                       ordered_extent->len);
        btrfs_mark_buffer_dirty(leaf);
 
        btrfs_drop_extent_cache(inode, ordered_extent->file_offset,
        mutex_unlock(&BTRFS_I(inode)->extent_mutex);
 
        ins.objectid = ordered_extent->start;
-       ins.offset = ordered_extent->len;
+       ins.offset = ordered_extent->disk_len;
        ins.type = BTRFS_EXTENT_ITEM_KEY;
        ret = btrfs_alloc_reserved_extent(trans, root, leaf->start,
                                          root->root_key.objectid,
        int ret;
        int rw;
        u64 logical;
+       unsigned long bio_flags = 0;
 
        ret = get_state_private(failure_tree, start, &private);
        if (ret) {
                }
                logical = start - em->start;
                logical = em->block_start + logical;
+               if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+                       bio_flags = EXTENT_BIO_COMPRESSED;
                failrec->logical = logical;
                free_extent_map(em);
                set_extent_bits(failure_tree, start, end, EXTENT_LOCKED |
                rw = READ;
 
        BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
-                                                     failrec->last_mirror);
+                                                     failrec->last_mirror,
+                                                     bio_flags);
        return 0;
 }
 
                                item_end +=
                                    btrfs_file_extent_num_bytes(leaf, fi);
                        } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
-                               struct btrfs_item *item = btrfs_item_nr(leaf,
-                                                               path->slots[0]);
                                item_end += btrfs_file_extent_inline_len(leaf,
-                                                                        item);
+                                                                        fi);
                        }
                        item_end--;
                }
                                root_owner = btrfs_header_owner(leaf);
                        }
                } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
-                       if (!del_item) {
+                       /*
+                        * we can't truncate inline items that have had
+                        * special encodings
+                        */
+                       if (!del_item &&
+                           btrfs_file_extent_compression(leaf, fi) == 0 &&
+                           btrfs_file_extent_encryption(leaf, fi) == 0 &&
+                           btrfs_file_extent_other_encoding(leaf, fi) == 0) {
                                u32 size = new_size - found_key.offset;
 
                                if (root->ref_cows) {
                        err = btrfs_insert_file_extent(trans, root,
                                                       inode->i_ino,
                                                       hole_start, 0, 0,
-                                                      hole_size, 0);
+                                                      hole_size, 0, hole_size,
+                                                      0, 0, 0);
                        btrfs_drop_extent_cache(inode, hole_start,
                                                (u64)-1, 0);
                        btrfs_check_file(root, inode);
        start_diff = map_start - em->start;
        em->start = map_start;
        em->len = map_len;
-       if (em->block_start < EXTENT_MAP_LAST_BYTE)
+       if (em->block_start < EXTENT_MAP_LAST_BYTE &&
+           !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
                em->block_start += start_diff;
+               em->block_len -= start_diff;
+       }
        return add_extent_mapping(em_tree, em);
 }
 
+static noinline int uncompress_inline(struct btrfs_path *path,
+                                     struct inode *inode, struct page *page,
+                                     size_t pg_offset, u64 extent_offset,
+                                     struct btrfs_file_extent_item *item)
+{
+       int ret;
+       struct extent_buffer *leaf = path->nodes[0];
+       char *tmp;
+       size_t max_size;
+       unsigned long inline_size;
+       unsigned long ptr;
+
+       WARN_ON(pg_offset != 0);
+       max_size = btrfs_file_extent_ram_bytes(leaf, item);
+       inline_size = btrfs_file_extent_inline_item_len(leaf,
+                                       btrfs_item_nr(leaf, path->slots[0]));
+       tmp = kmalloc(inline_size, GFP_NOFS);
+       if (!tmp)
+               return -ENOMEM;
+       ptr = btrfs_file_extent_inline_start(item);
+
+       read_extent_buffer(leaf, tmp, ptr, inline_size);
+
+       max_size = min(PAGE_CACHE_SIZE, max_size);
+       ret = btrfs_zlib_decompress(tmp, page, extent_offset,
+                                   inline_size, max_size);
+       if (ret) {
+               char *kaddr = kmap_atomic(page, KM_USER0);
+               unsigned long copy_size = min_t(u64,
+                                 PAGE_CACHE_SIZE - pg_offset,
+                                 max_size - extent_offset);
+               memset(kaddr + pg_offset, 0, copy_size);
+               kunmap_atomic(kaddr, KM_USER0);
+       }
+       kfree(tmp);
+       return 0;
+}
+
 /*
  * a bit scary, this does extent mapping from logical file offset to the disk.
  * the ugly parts come from merging extents from the disk with the
        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
        struct btrfs_trans_handle *trans = NULL;
+       int compressed;
 
 again:
        spin_lock(&em_tree->lock);
        em->bdev = root->fs_info->fs_devices->latest_bdev;
        em->start = EXTENT_MAP_HOLE;
        em->len = (u64)-1;
+       em->block_len = (u64)-1;
 
        if (!path) {
                path = btrfs_alloc_path();
 
        found_type = btrfs_file_extent_type(leaf, item);
        extent_start = found_key.offset;
+       compressed = btrfs_file_extent_compression(leaf, item);
        if (found_type == BTRFS_FILE_EXTENT_REG) {
                extent_end = extent_start +
                       btrfs_file_extent_num_bytes(leaf, item);
                        em->block_start = EXTENT_MAP_HOLE;
                        goto insert;
                }
-               bytenr += btrfs_file_extent_offset(leaf, item);
-               em->block_start = bytenr;
                em->start = extent_start;
                em->len = extent_end - extent_start;
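+               /*
+                * compressed extents are read as a single unit, so
+                * block_start points at the start of the compressed data
+                * on disk and block_len covers the whole compressed
+                * length; regular extents map straight through, offset by
+                * the file extent offset.
+                */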
+               if (compressed) {
+                       set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+                       em->block_start = bytenr;
+                       em->block_len = btrfs_file_extent_disk_num_bytes(leaf,
+                                                                        item);
+               } else {
+                       bytenr += btrfs_file_extent_offset(leaf, item);
+                       em->block_start = bytenr;
+                       em->block_len = em->len;
+               }
                goto insert;
        } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
                u64 page_start;
                size_t extent_offset;
                size_t copy_size;
 
-               size = btrfs_file_extent_inline_len(leaf, btrfs_item_nr(leaf,
-                                                   path->slots[0]));
+               size = btrfs_file_extent_inline_len(leaf, item);
                extent_end = (extent_start + size + root->sectorsize - 1) &
                        ~((u64)root->sectorsize - 1);
                if (start < extent_start || start >= extent_end) {
                }
                em->block_start = EXTENT_MAP_INLINE;
 
-               if (!page) {
+               if (!page || create) {
                        em->start = extent_start;
-                       em->len = size;
+                       em->len = (size + root->sectorsize - 1) &
+                               ~((u64)root->sectorsize - 1);
                        goto out;
                }
 
                em->start = extent_start + extent_offset;
                em->len = (copy_size + root->sectorsize - 1) &
                        ~((u64)root->sectorsize - 1);
-               map = kmap(page);
+               if (compressed)
+                       set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
                ptr = btrfs_file_extent_inline_start(item) + extent_offset;
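+               /*
+                * when filling a page that isn't uptodate yet, inline data
+                * is either inflated with uncompress_inline or copied
+                * straight out of the leaf
+                */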
                if (create == 0 && !PageUptodate(page)) {
-                       read_extent_buffer(leaf, map + pg_offset, ptr,
-                                          copy_size);
+                       if (btrfs_file_extent_compression(leaf, item) ==
+                           BTRFS_COMPRESS_ZLIB) {
+                               ret = uncompress_inline(path, inode, page,
+                                                       pg_offset,
+                                                       extent_offset, item);
+                               BUG_ON(ret);
+                       } else {
+                               map = kmap(page);
+                               read_extent_buffer(leaf, map + pg_offset, ptr,
+                                                  copy_size);
+                               kunmap(page);
+                       }
                        flush_dcache_page(page);
                } else if (create && PageUptodate(page)) {
                        if (!trans) {
                                trans = btrfs_join_transaction(root, 1);
                                goto again;
                        }
+                       map = kmap(page);
                        write_extent_buffer(leaf, map + pg_offset, ptr,
                                            copy_size);
+                       kunmap(page);
                        btrfs_mark_buffer_dirty(leaf);
                }
-               kunmap(page);
                set_extent_uptodate(io_tree, em->start,
                                    extent_map_end(em) - 1, GFP_NOFS);
                goto insert;
        btrfs_set_file_extent_generation(leaf, ei, trans->transid);
        btrfs_set_file_extent_type(leaf, ei,
                                   BTRFS_FILE_EXTENT_INLINE);
+       btrfs_set_file_extent_encryption(leaf, ei, 0);
+       btrfs_set_file_extent_compression(leaf, ei, 0);
+       btrfs_set_file_extent_other_encoding(leaf, ei, 0);
+       btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);
+
        ptr = btrfs_file_extent_inline_start(ei);
        write_extent_buffer(leaf, symname, ptr, name_len);
        btrfs_mark_buffer_dirty(leaf);
 
  * inserted.
  */
 int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
-                            u64 start, u64 len, int nocow)
+                            u64 start, u64 len, u64 disk_len, int nocow,
+                            int compressed)
 {
        struct btrfs_ordered_inode_tree *tree;
        struct rb_node *node;
        entry->file_offset = file_offset;
        entry->start = start;
        entry->len = len;
+       entry->disk_len = disk_len;
        entry->inode = inode;
        if (nocow)
                set_bit(BTRFS_ORDERED_NOCOW, &entry->flags);
+       if (compressed)
+               set_bit(BTRFS_ORDERED_COMPRESSED, &entry->flags);
 
        /* one ref for the tree */
        atomic_set(&entry->refs, 1);
         * for pdflush to find them
         */
        btrfs_fdatawrite_range(inode->i_mapping, start, end, WB_SYNC_NONE);
-       if (wait)
+       if (wait) {
                wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
                                                 &entry->flags));
+       }
 }
 
 /*
 
 
 #define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */
 
+#define BTRFS_ORDERED_COMPRESSED 3 /* writing a compressed extent */
+
 struct btrfs_ordered_extent {
        /* logical offset in the file */
        u64 file_offset;
        /* disk byte number */
        u64 start;
 
-       /* length of the extent in bytes */
+       /* ram length of the extent in bytes */
        u64 len;
 
+       /* extent length on disk */
+       u64 disk_len;
+
        /* flags (described above) */
        unsigned long flags;
 
 int btrfs_dec_test_ordered_pending(struct inode *inode,
                                       u64 file_offset, u64 io_size);
 int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
-                            u64 start, u64 len, int nocow);
+                            u64 start, u64 len, u64 disk_len, int nocow,
+                            int compressed);
 int btrfs_add_ordered_sum(struct inode *inode,
                          struct btrfs_ordered_extent *entry,
                          struct btrfs_ordered_sum *sum);
 
                        if (btrfs_file_extent_type(l, fi) ==
                            BTRFS_FILE_EXTENT_INLINE) {
                                printk("\t\tinline extent data size %u\n",
-                                  btrfs_file_extent_inline_len(l, item));
+                                  btrfs_file_extent_inline_len(l, fi));
                                break;
                        }
                        printk("\t\textent data disk bytenr %llu nr %llu\n",
                               (unsigned long long)btrfs_file_extent_disk_bytenr(l, fi),
                               (unsigned long long)btrfs_file_extent_disk_num_bytes(l, fi));
-                       printk("\t\textent data offset %llu nr %llu\n",
+                       printk("\t\textent data offset %llu nr %llu ram %llu\n",
                          (unsigned long long)btrfs_file_extent_offset(l, fi),
-                         (unsigned long long)btrfs_file_extent_num_bytes(l, fi));
+                         (unsigned long long)btrfs_file_extent_num_bytes(l, fi),
+                         (unsigned long long)btrfs_file_extent_ram_bytes(l, fi));
                        break;
                case BTRFS_BLOCK_GROUP_ITEM_KEY:
                        bi = btrfs_item_ptr(l, i,
 
 #include "volumes.h"
 #include "version.h"
 #include "export.h"
+#include "compression.h"
 
 #define BTRFS_SUPER_MAGIC 0x9123683E
 
 enum {
        Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow,
        Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier,
-       Opt_ssd, Opt_thread_pool, Opt_noacl,  Opt_err,
+       Opt_ssd, Opt_thread_pool, Opt_noacl,  Opt_compress, Opt_err,
 };
 
 static match_table_t tokens = {
        {Opt_max_inline, "max_inline=%s"},
        {Opt_alloc_start, "alloc_start=%s"},
        {Opt_thread_pool, "thread_pool=%d"},
+       {Opt_compress, "compress"},
        {Opt_ssd, "ssd"},
        {Opt_noacl, "noacl"},
        {Opt_err, NULL},
                        btrfs_set_opt(info->mount_opt, NODATACOW);
                        btrfs_set_opt(info->mount_opt, NODATASUM);
                        break;
+               case Opt_compress:
+                       printk(KERN_INFO "btrfs: use compression\n");
+                       btrfs_set_opt(info->mount_opt, COMPRESS);
+                       break;
                case Opt_ssd:
                        printk(KERN_INFO "btrfs: use ssd allocation scheme\n");
                        btrfs_set_opt(info->mount_opt, SSD);
        err = btrfs_interface_init();
        if (err)
                goto free_extent_map;
+
        err = register_filesystem(&btrfs_fs_type);
        if (err)
                goto unregister_ioctl;
        unregister_filesystem(&btrfs_fs_type);
        btrfs_exit_sysfs();
        btrfs_cleanup_fs_uuids();
+       btrfs_zlib_exit();
 }
 
 module_init(init_btrfs_fs)
 
        if (found_type == BTRFS_FILE_EXTENT_REG)
                extent_end = start + btrfs_file_extent_num_bytes(eb, item);
        else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
-               size = btrfs_file_extent_inline_len(eb,
-                                                   btrfs_item_nr(eb, slot));
+               size = btrfs_file_extent_inline_len(eb, item);
                extent_end = (start + size + mask) & ~mask;
        } else {
                ret = 0;
 
        em->start = key.offset;
        em->len = *num_bytes;
        em->block_start = 0;
+       em->block_len = em->len;
 
        if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
                ret = btrfs_add_system_chunk(trans, chunk_root, &key,
        em->start = logical;
        em->len = length;
        em->block_start = 0;
+       em->block_len = em->len;
 
        map->num_stripes = num_stripes;
        map->io_width = btrfs_chunk_io_width(leaf, chunk);
 
--- /dev/null
+/*
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Based on jffs2 zlib code:
+ * Copyright © 2001-2007 Red Hat, Inc.
+ * Created by David Woodhouse <dwmw2@infradead.org>
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/zlib.h>
+#include <linux/zutil.h>
+#include <linux/vmalloc.h>
+#include <linux/init.h>
+#include <linux/err.h>
+#include <linux/sched.h>
+#include <linux/pagemap.h>
+#include <linux/bio.h>
+
+/* Plan: call deflate() with avail_in == *sourcelen,
+       avail_out = *dstlen - 12 and flush == Z_FINISH.
+       If it doesn't manage to finish, call it again with
+       avail_in == 0 and avail_out set to the remaining 12
+       bytes for it to clean up.
+   Q: Is 12 bytes sufficient?
+*/
+#define STREAM_END_SPACE 12
+
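+/*
+ * one workspace holds the zlib state for a single compress or
+ * decompress call: def_strm for deflate, inf_strm for inflate, and
+ * buf, a one page scratch buffer that inflate output is staged in
+ * before it is copied out to the destination pages.
+ */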
+struct workspace {
+       z_stream inf_strm;
+       z_stream def_strm;
+       char *buf;
+       struct list_head list;
+};
+
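+/*
+ * idle workspaces are kept on a list for reuse.  The total number of
+ * allocated workspaces is capped at roughly one per online cpu;
+ * callers that hit the cap sleep on workspace_wait until one is freed.
+ */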
+static LIST_HEAD(idle_workspace);
+static DEFINE_SPINLOCK(workspace_lock);
+static unsigned long num_workspace;
+static atomic_t alloc_workspace = ATOMIC_INIT(0);
+static DECLARE_WAIT_QUEUE_HEAD(workspace_wait);
+
+/*
+ * this finds an available zlib workspace or allocates a new one.
+ * an ERR_PTR is returned if the allocation fails.
+ */
+static struct workspace *find_zlib_workspace(void)
+{
+       struct workspace *workspace;
+       int ret;
+       int cpus = num_online_cpus();
+
+again:
+       spin_lock(&workspace_lock);
+       if (!list_empty(&idle_workspace)) {
+               workspace = list_entry(idle_workspace.next, struct workspace,
+                                      list);
+               list_del(&workspace->list);
+               num_workspace--;
+               spin_unlock(&workspace_lock);
+               return workspace;
+       }
+       spin_unlock(&workspace_lock);
+       if (atomic_read(&alloc_workspace) > cpus) {
+               DEFINE_WAIT(wait);
+               prepare_to_wait(&workspace_wait, &wait, TASK_UNINTERRUPTIBLE);
+               if (atomic_read(&alloc_workspace) > cpus)
+                       schedule();
+               finish_wait(&workspace_wait, &wait);
+               goto again;
+       }
+       atomic_inc(&alloc_workspace);
+       workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
+       if (!workspace) {
+               ret = -ENOMEM;
+               goto fail;
+       }
+
+       workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize());
+       if (!workspace->def_strm.workspace) {
+               ret = -ENOMEM;
+               goto fail;
+       }
+       workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize());
+       if (!workspace->inf_strm.workspace) {
+               ret = -ENOMEM;
+               goto fail_inflate;
+       }
+       workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS);
+       if (!workspace->buf) {
+               ret = -ENOMEM;
+               goto fail_kmalloc;
+       }
+       return workspace;
+
+fail_kmalloc:
+       vfree(workspace->inf_strm.workspace);
+fail_inflate:
+       vfree(workspace->def_strm.workspace);
+fail:
+       kfree(workspace);
+       atomic_dec(&alloc_workspace);
+       wake_up(&workspace_wait);
+       return ERR_PTR(ret);
+}
+
+/*
+ * put a workspace struct back on the list or free it if we have enough
+ * idle ones sitting around
+ */
+static int free_workspace(struct workspace *workspace)
+{
+       spin_lock(&workspace_lock);
+       if (num_workspace < num_online_cpus()) {
+               list_add_tail(&workspace->list, &idle_workspace);
+               num_workspace++;
+               spin_unlock(&workspace_lock);
+               if (waitqueue_active(&workspace_wait))
+                       wake_up(&workspace_wait);
+               return 0;
+       }
+       spin_unlock(&workspace_lock);
+       vfree(workspace->def_strm.workspace);
+       vfree(workspace->inf_strm.workspace);
+       kfree(workspace->buf);
+       kfree(workspace);
+
+       atomic_dec(&alloc_workspace);
+       if (waitqueue_active(&workspace_wait))
+               wake_up(&workspace_wait);
+       return 0;
+}
+
+/*
+ * cleanup function for module exit
+ */
+static void free_workspaces(void)
+{
+       struct workspace *workspace;
+       while (!list_empty(&idle_workspace)) {
+               workspace = list_entry(idle_workspace.next, struct workspace,
+                                      list);
+               list_del(&workspace->list);
+               vfree(workspace->def_strm.workspace);
+               vfree(workspace->inf_strm.workspace);
+               kfree(workspace->buf);
+               kfree(workspace);
+               atomic_dec(&alloc_workspace);
+       }
+}
+
+/*
+ * given an address space and start/len, compress the bytes.
+ *
+ * pages are allocated to hold the compressed result and stored
+ * in 'pages'
+ *
+ * out_pages is used to return the number of pages allocated.  There
+ * may be pages allocated even if we return an error
+ *
+ * total_in is used to return the number of bytes actually read.  It
+ * may be smaller than len if we had to exit early because we
+ * ran out of room in the pages array or because we crossed the
+ * max_out threshold.
+ *
+ * total_out is used to return the total number of compressed bytes
+ *
+ * max_out tells us the max number of bytes that we're allowed to
+ * stuff into pages
+ */
+int btrfs_zlib_compress_pages(struct address_space *mapping,
+                             u64 start, unsigned long len,
+                             struct page **pages,
+                             unsigned long nr_dest_pages,
+                             unsigned long *out_pages,
+                             unsigned long *total_in,
+                             unsigned long *total_out,
+                             unsigned long max_out)
+{
+       int ret;
+       struct workspace *workspace;
+       char *data_in;
+       char *cpage_out;
+       int nr_pages = 0;
+       struct page *in_page = NULL;
+       struct page *out_page = NULL;
+       int out_written = 0;
+       int in_read = 0;
+       unsigned long bytes_left;
+
+       *out_pages = 0;
+       *total_out = 0;
+       *total_in = 0;
+
+       workspace = find_zlib_workspace();
+       if (IS_ERR(workspace))
+               return -1;
+
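+       /* initialize deflate at a fixed compression level of 3 */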
+       if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
+               printk(KERN_WARNING "deflateInit failed\n");
+               ret = -1;
+               goto out;
+       }
+
+       workspace->def_strm.total_in = 0;
+       workspace->def_strm.total_out = 0;
+
+       in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
+       data_in = kmap(in_page);
+
+       out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+       cpage_out = kmap(out_page);
+       pages[0] = out_page;
+       nr_pages = 1;
+
+       workspace->def_strm.next_in = data_in;
+       workspace->def_strm.next_out = cpage_out;
+       workspace->def_strm.avail_out = PAGE_CACHE_SIZE;
+       workspace->def_strm.avail_in = min(len, PAGE_CACHE_SIZE);
+
+       out_written = 0;
+       in_read = 0;
+
+       while (workspace->def_strm.total_in < len) {
+               ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH);
+               if (ret != Z_OK) {
+                       printk(KERN_DEBUG "btrfs deflate in loop returned %d\n",
+                              ret);
+                       zlib_deflateEnd(&workspace->def_strm);
+                       ret = -1;
+                       goto out;
+               }
+
+               /* we're making it bigger, give up */
+               if (workspace->def_strm.total_in > 8192 &&
+                   workspace->def_strm.total_in <
+                   workspace->def_strm.total_out) {
+                       ret = -1;
+                       goto out;
+               }
+               /* we need another page for writing out.  Test this
+                * before the total_in so we will pull in a new page for
+                * the stream end if required
+                */
+               if (workspace->def_strm.avail_out == 0) {
+                       kunmap(out_page);
+                       if (nr_pages == nr_dest_pages) {
+                               out_page = NULL;
+                               ret = -1;
+                               goto out;
+                       }
+                       out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+                       cpage_out = kmap(out_page);
+                       pages[nr_pages] = out_page;
+                       nr_pages++;
+                       workspace->def_strm.avail_out = PAGE_CACHE_SIZE;
+                       workspace->def_strm.next_out = cpage_out;
+               }
+               /* we're all done */
+               if (workspace->def_strm.total_in >= len)
+                       break;
+
+               /* we've read in a full page, get a new one */
+               if (workspace->def_strm.avail_in == 0) {
+                       if (workspace->def_strm.total_out > max_out)
+                               break;
+
+                       bytes_left = len - workspace->def_strm.total_in;
+                       kunmap(in_page);
+                       page_cache_release(in_page);
+
+                       start += PAGE_CACHE_SIZE;
+                       in_page = find_get_page(mapping,
+                                               start >> PAGE_CACHE_SHIFT);
+                       data_in = kmap(in_page);
+                       workspace->def_strm.avail_in = min(bytes_left,
+                                                          PAGE_CACHE_SIZE);
+                       workspace->def_strm.next_in = data_in;
+               }
+       }
+       workspace->def_strm.avail_in = 0;
+       ret = zlib_deflate(&workspace->def_strm, Z_FINISH);
+       zlib_deflateEnd(&workspace->def_strm);
+
+       if (ret != Z_STREAM_END) {
+               ret = -1;
+               goto out;
+       }
+
+       if (workspace->def_strm.total_out >= workspace->def_strm.total_in) {
+               ret = -1;
+               goto out;
+       }
+
+       ret = 0;
+       *total_out = workspace->def_strm.total_out;
+       *total_in = workspace->def_strm.total_in;
+out:
+       *out_pages = nr_pages;
+       if (out_page)
+               kunmap(out_page);
+
+       if (in_page) {
+               kunmap(in_page);
+               page_cache_release(in_page);
+       }
+       free_workspace(workspace);
+       return ret;
+}
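+
+/*
+ * a rough usage sketch (not part of this patch, names are only
+ * illustrative): to compress one chunk of an inode's data a caller
+ * might do
+ *
+ *     struct page *cpages[16];
+ *     unsigned long nr_pages, total_in, total_out;
+ *     int ret;
+ *
+ *     ret = btrfs_zlib_compress_pages(inode->i_mapping, start, len,
+ *                                     cpages, 16, &nr_pages,
+ *                                     &total_in, &total_out, len);
+ *
+ * on success cpages[0..nr_pages-1] hold total_out compressed bytes
+ * covering the first total_in bytes of the range; the caller must
+ * free the returned pages whether or not the call succeeded.
+ */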
+
+/*
+ * pages_in is an array of pages with compressed data.
+ *
+ * disk_start is the starting logical offset of this array in the file
+ *
+ * bvec is a bio_vec of pages from the file that we want to decompress into
+ *
+ * vcnt is the count of pages in the biovec
+ *
+ * srclen is the number of bytes in pages_in
+ *
+ * The basic idea is that we have a bio that was created by readpages.
+ * The pages in the bio are for the uncompressed data, and they may not
+ * be contiguous.  They all correspond to the range of bytes covered by
+ * the compressed extent.
+ */
+int btrfs_zlib_decompress_biovec(struct page **pages_in,
+                             u64 disk_start,
+                             struct bio_vec *bvec,
+                             int vcnt,
+                             size_t srclen)
+{
+       int ret = 0;
+       int wbits = MAX_WBITS;
+       struct workspace *workspace;
+       char *data_in;
+       size_t total_out = 0;
+       unsigned long page_bytes_left;
+       unsigned long page_in_index = 0;
+       unsigned long page_out_index = 0;
+       struct page *page_out;
+       unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) /
+                                       PAGE_CACHE_SIZE;
+       unsigned long buf_start;
+       unsigned long buf_offset;
+       unsigned long bytes;
+       unsigned long working_bytes;
+       unsigned long pg_offset;
+       unsigned long start_byte;
+       unsigned long current_buf_start;
+       char *kaddr;
+
+       workspace = find_zlib_workspace();
+       if (IS_ERR(workspace))
+               return -ENOMEM;
+
+       data_in = kmap(pages_in[page_in_index]);
+       workspace->inf_strm.next_in = data_in;
+       workspace->inf_strm.avail_in = min(srclen, PAGE_CACHE_SIZE);
+       workspace->inf_strm.total_in = 0;
+
+       workspace->inf_strm.total_out = 0;
+       workspace->inf_strm.next_out = workspace->buf;
+       workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
+       page_out = bvec[page_out_index].bv_page;
+       page_bytes_left = PAGE_CACHE_SIZE;
+       pg_offset = 0;
+
+       /* If it's deflate, and it's got no preset dictionary, then
+          we can tell zlib to skip the adler32 check. */
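+       /* a valid zlib header has CM == Z_DEFLATED in the low nibble of
+        * the first byte and the two header bytes, read big endian,
+        * divisible by 31; passing negative windowBits to
+        * zlib_inflateInit2 below makes zlib expect a raw deflate stream
+        * with no header and no adler32 trailer.
+        */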
+       if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
+           ((data_in[0] & 0x0f) == Z_DEFLATED) &&
+           !(((data_in[0]<<8) + data_in[1]) % 31)) {
+
+               wbits = -((data_in[0] >> 4) + 8);
+               workspace->inf_strm.next_in += 2;
+               workspace->inf_strm.avail_in -= 2;
+       }
+
+       if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
+               printk(KERN_WARNING "inflateInit failed\n");
+               ret = -1;
+               goto out;
+       }
+       while (workspace->inf_strm.total_in < srclen) {
+               ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
+               if (ret != Z_OK && ret != Z_STREAM_END)
+                       break;
+
+               /*
+                * buf_start is the byte offset into the uncompressed data
+                * of the first byte in our workspace buffer
+                */
+               buf_start = total_out;
+
+               /* total_out is the total uncompressed bytes produced so far */
+               total_out = workspace->inf_strm.total_out;
+
+               working_bytes = total_out - buf_start;
+
+               /*
+                * start_byte is the offset of the page we're currently
+                * copying into, relative to the start of the data in this
+                * compressed extent.
+                */
+               start_byte = page_offset(page_out) - disk_start;
+
+               if (working_bytes == 0) {
+                       /* we didn't make progress in this inflate
+                        * call, we're done
+                        */
+                       if (ret != Z_STREAM_END)
+                               ret = -1;
+                       break;
+               }
+
+               /* we haven't yet hit data corresponding to this page */
+               if (total_out <= start_byte)
+                       goto next;
+
+               /*
+                * the start of the data we care about is offset into
+                * the middle of our working buffer
+                */
+               if (total_out > start_byte && buf_start < start_byte) {
+                       buf_offset = start_byte - buf_start;
+                       working_bytes -= buf_offset;
+               } else {
+                       buf_offset = 0;
+               }
+               current_buf_start = buf_start;
+
+               /* copy bytes from the working buffer into the pages */
+               while (working_bytes > 0) {
+                       bytes = min(PAGE_CACHE_SIZE - pg_offset,
+                                   PAGE_CACHE_SIZE - buf_offset);
+                       bytes = min(bytes, working_bytes);
+                       kaddr = kmap_atomic(page_out, KM_USER0);
+                       memcpy(kaddr + pg_offset, workspace->buf + buf_offset,
+                              bytes);
+                       kunmap_atomic(kaddr, KM_USER0);
+                       flush_dcache_page(page_out);
+
+                       pg_offset += bytes;
+                       page_bytes_left -= bytes;
+                       buf_offset += bytes;
+                       working_bytes -= bytes;
+                       current_buf_start += bytes;
+
+                       /* check if we need to pick another page */
+                       if (page_bytes_left == 0) {
+                               page_out_index++;
+                               if (page_out_index >= vcnt) {
+                                       ret = 0;
+                                       goto done;
+                               }
+                               page_out = bvec[page_out_index].bv_page;
+                               pg_offset = 0;
+                               page_bytes_left = PAGE_CACHE_SIZE;
+                               start_byte = page_offset(page_out) - disk_start;
+
+                               /*
+                                * make sure our new page is covered by this
+                                * working buffer
+                                */
+                               if (total_out <= start_byte)
+                                       goto next;
+
+                               /* the next page in the biovec might not
+                                * be adjacent to the last page, but it
+                                * might still be found inside this working
+                                * buffer.  bump our offset pointer
+                                */
+                               if (total_out > start_byte &&
+                                   current_buf_start < start_byte) {
+                                       buf_offset = start_byte - buf_start;
+                                       working_bytes = total_out - start_byte;
+                                       current_buf_start = buf_start +
+                                               buf_offset;
+                               }
+                       }
+               }
+next:
+               workspace->inf_strm.next_out = workspace->buf;
+               workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
+
+               if (workspace->inf_strm.avail_in == 0) {
+                       unsigned long tmp;
+                       kunmap(pages_in[page_in_index]);
+                       page_in_index++;
+                       if (page_in_index >= total_pages_in) {
+                               data_in = NULL;
+                               break;
+                       }
+                       data_in = kmap(pages_in[page_in_index]);
+                       workspace->inf_strm.next_in = data_in;
+                       tmp = srclen - workspace->inf_strm.total_in;
+                       workspace->inf_strm.avail_in = min(tmp,
+                                                          PAGE_CACHE_SIZE);
+               }
+       }
+       if (ret != Z_STREAM_END)
+               ret = -1;
+       else
+               ret = 0;
+done:
+       zlib_inflateEnd(&workspace->inf_strm);
+       if (data_in)
+               kunmap(pages_in[page_in_index]);
+out:
+       free_workspace(workspace);
+       return ret;
+}
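+
+/*
+ * a rough usage sketch (not part of this patch, names follow the
+ * compressed_bio fields used elsewhere in this series): the endio path
+ * for a compressed read can hand the whole extent to this helper with
+ *
+ *     ret = btrfs_zlib_decompress_biovec(cb->compressed_pages, cb->start,
+ *                                        cb->orig_bio->bi_io_vec,
+ *                                        cb->orig_bio->bi_vcnt,
+ *                                        cb->compressed_len);
+ */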
+
+/*
+ * a less complex decompression routine.  Our compressed data fits in a
+ * single page, and we want to read a single page out of it.
+ * start_byte tells us the offset into the uncompressed data we're
+ * interested in
+ */
+int btrfs_zlib_decompress(unsigned char *data_in,
+                         struct page *dest_page,
+                         unsigned long start_byte,
+                         size_t srclen, size_t destlen)
+{
+       int ret = 0;
+       int wbits = MAX_WBITS;
+       struct workspace *workspace;
+       unsigned long bytes_left = destlen;
+       unsigned long total_out = 0;
+       char *kaddr;
+
+       if (destlen > PAGE_CACHE_SIZE)
+               return -ENOMEM;
+
+       workspace = find_zlib_workspace();
+       if (IS_ERR(workspace))
+               return -ENOMEM;
+
+       workspace->inf_strm.next_in = data_in;
+       workspace->inf_strm.avail_in = srclen;
+       workspace->inf_strm.total_in = 0;
+
+       workspace->inf_strm.next_out = workspace->buf;
+       workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
+       workspace->inf_strm.total_out = 0;
+       /* If it's deflate, and it's got no preset dictionary, then
+          we can tell zlib to skip the adler32 check. */
+       if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
+           ((data_in[0] & 0x0f) == Z_DEFLATED) &&
+           !(((data_in[0]<<8) + data_in[1]) % 31)) {
+
+               wbits = -((data_in[0] >> 4) + 8);
+               workspace->inf_strm.next_in += 2;
+               workspace->inf_strm.avail_in -= 2;
+       }
+
+       if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
+               printk(KERN_WARNING "inflateInit failed\n");
+               ret = -1;
+               goto out;
+       }
+
+       while (bytes_left > 0) {
+               unsigned long buf_start;
+               unsigned long buf_offset;
+               unsigned long bytes;
+               unsigned long pg_offset = 0;
+
+               ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
+               if (ret != Z_OK && ret != Z_STREAM_END)
+                       break;
+
+               buf_start = total_out;
+               total_out = workspace->inf_strm.total_out;
+
+               if (total_out == buf_start) {
+                       ret = -1;
+                       break;
+               }
+
+               if (total_out <= start_byte)
+                       goto next;
+
+               if (total_out > start_byte && buf_start < start_byte) {
+                       buf_offset = start_byte - buf_start;
+               } else {
+                       buf_offset = 0;
+               }
+
+               bytes = min(PAGE_CACHE_SIZE - pg_offset,
+                           PAGE_CACHE_SIZE - buf_offset);
+               bytes = min(bytes, bytes_left);
+
+               kaddr = kmap_atomic(dest_page, KM_USER0);
+               memcpy(kaddr + pg_offset, workspace->buf + buf_offset, bytes);
+               kunmap_atomic(kaddr, KM_USER0);
+
+               pg_offset += bytes;
+               bytes_left -= bytes;
+next:
+               workspace->inf_strm.next_out = workspace->buf;
+               workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
+       }
+       if (ret != Z_STREAM_END && bytes_left != 0)
+               ret = -1;
+       else
+               ret = 0;
+       zlib_inflateEnd(&workspace->inf_strm);
+out:
+       free_workspace(workspace);
+       return ret;
+}
+
+void btrfs_zlib_exit(void)
+{
+       free_workspaces();
+}