ret = read_extent_buffer_pages(io_tree, eb, start,
                                               WAIT_COMPLETE,
                                               btree_get_extent, mirror_num);
-               if (!ret &&
-                   !verify_parent_transid(io_tree, eb, parent_transid))
+               if (!ret && !verify_parent_transid(io_tree, eb, parent_transid))
                        return ret;
 
                /*
        u64 found_start;
        unsigned long len;
        struct extent_buffer *eb;
-       int ret;
 
        tree = &BTRFS_I(page->mapping->host)->io_tree;
 
-       if (page->private == EXTENT_PAGE_PRIVATE) {
-               WARN_ON(1);
+       if (page->private == EXTENT_PAGE_PRIVATE)
                goto out;
-       }
        if (!page->private) {
                WARN_ON(1);
                goto out;
        len = page->private >> 2;
        WARN_ON(len == 0);
 
-       eb = alloc_extent_buffer(tree, start, len, page);
-       if (eb == NULL) {
-               WARN_ON(1);
-               goto out;
-       }
-       ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE,
-                                            btrfs_header_generation(eb));
-       BUG_ON(ret);
-       WARN_ON(!btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN));
+       eb = find_extent_buffer(tree, start, len);
 
        found_start = btrfs_header_bytenr(eb);
        if (found_start != start) {
                WARN_ON(1);
                goto err;
        }
-       if (eb->first_page != page) {
+       if (eb->pages[0] != page) {
                WARN_ON(1);
                goto err;
        }
        return 0;
 }
 
+/*
+ * Search backwards from @page's file offset, at most @max_walk bytes,
+ * for the extent_buffer that contains the page.  Returns the buffer
+ * (caller must drop the reference with free_extent_buffer()) or NULL
+ * if no buffer covering @page was found.
+ */
+struct extent_buffer *find_eb_for_page(struct extent_io_tree *tree,
+                                      struct page *page, int max_walk)
+{
+       struct extent_buffer *eb;
+       u64 start = page_offset(page);
+       u64 target = start;
+       u64 min_start;
+
+       /* clamp the walk so the u64 subtraction below cannot underflow */
+       if (start < max_walk)
+               min_start = 0;
+       else
+               min_start = start - max_walk;
+
+       while (start >= min_start) {
+               eb = find_extent_buffer(tree, start, 0);
+               if (eb) {
+                       /*
+                        * we found an extent buffer and it contains our page
+                        * hooray!
+                        */
+                       if (eb->start <= target &&
+                           eb->start + eb->len > target)
+                               return eb;
+
+                       /* we found an extent buffer that wasn't for us */
+                       free_extent_buffer(eb);
+                       return NULL;
+               }
+               if (start == 0)
+                       break;
+               start -= PAGE_CACHE_SIZE;
+       }
+       return NULL;
+}
+
 static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
                               struct extent_state *state)
 {
        struct extent_buffer *eb;
        struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
        int ret = 0;
+       int reads_done;
 
-       tree = &BTRFS_I(page->mapping->host)->io_tree;
-       if (page->private == EXTENT_PAGE_PRIVATE)
-               goto out;
        if (!page->private)
                goto out;
 
+       tree = &BTRFS_I(page->mapping->host)->io_tree;
        len = page->private >> 2;
-       WARN_ON(len == 0);
 
-       eb = alloc_extent_buffer(tree, start, len, page);
-       if (eb == NULL) {
+       eb = find_eb_for_page(tree, page, max(root->leafsize, root->nodesize));
+       if (!eb) {
                ret = -EIO;
                goto out;
        }
+       reads_done = atomic_dec_and_test(&eb->pages_reading);
+       if (!reads_done)
+               goto err;
 
        found_start = btrfs_header_bytenr(eb);
-       if (found_start != start) {
+       if (found_start != eb->start) {
                printk_ratelimited(KERN_INFO "btrfs bad tree block start "
                               "%llu %llu\n",
                               (unsigned long long)found_start,
                ret = -EIO;
                goto err;
        }
-       if (eb->first_page != page) {
-               printk(KERN_INFO "btrfs bad first page %lu %lu\n",
-                      eb->first_page->index, page->index);
-               WARN_ON(1);
-               ret = -EIO;
-               goto err;
-       }
        if (check_tree_block_fsid(root, eb)) {
                printk_ratelimited(KERN_INFO "btrfs bad fsid on block %llu\n",
                               (unsigned long long)eb->start);
                ret = -EIO;
        }
 
-       end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
-       end = eb->start + end - 1;
 err:
        if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) {
                clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
                btree_readahead_hook(root, eb, eb->start, ret);
        }
 
+       if (ret && eb)
+               clear_extent_buffer_uptodate(tree, eb, NULL);
        free_extent_buffer(eb);
 out:
        return ret;
        len = page->private >> 2;
        WARN_ON(len == 0);
 
-       eb = alloc_extent_buffer(tree, start, len, page);
+       eb = alloc_extent_buffer(tree, start, len);
        if (eb == NULL)
                goto out;
 
 static int btree_writepage(struct page *page, struct writeback_control *wbc)
 {
        struct extent_io_tree *tree;
-       struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
-       struct extent_buffer *eb;
-       int was_dirty;
-
        tree = &BTRFS_I(page->mapping->host)->io_tree;
+
        if (!(current->flags & PF_MEMALLOC)) {
                return extent_write_full_page(tree, page,
                                              btree_get_extent, wbc);
        }
 
        redirty_page_for_writepage(wbc, page);
-       eb = btrfs_find_tree_block(root, page_offset(page), PAGE_CACHE_SIZE);
-       WARN_ON(!eb);
-
-       was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
-       if (!was_dirty) {
-               spin_lock(&root->fs_info->delalloc_lock);
-               root->fs_info->dirty_metadata_bytes += PAGE_CACHE_SIZE;
-               spin_unlock(&root->fs_info->delalloc_lock);
-       }
-       free_extent_buffer(eb);
-
        unlock_page(page);
        return 0;
 }
 {
        struct extent_io_tree *tree;
        struct extent_map_tree *map;
+       struct extent_buffer *eb;
+       struct btrfs_root *root;
        int ret;
 
        if (PageWriteback(page) || PageDirty(page))
        tree = &BTRFS_I(page->mapping->host)->io_tree;
        map = &BTRFS_I(page->mapping->host)->extent_tree;
 
+       root = BTRFS_I(page->mapping->host)->root;
+       if (page->private == EXTENT_PAGE_PRIVATE) {
+               eb = find_eb_for_page(tree, page, max(root->leafsize, root->nodesize));
+               free_extent_buffer(eb);
+               if (eb)
+                       return 0;
+       }
        /*
         * We need to mask out eg. __GFP_HIGHMEM and __GFP_DMA32 as we're doing
         * slab allocation from alloc_extent_state down the callchain where
        struct extent_buffer *eb;
 
        eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
-                                bytenr, blocksize, NULL);
+                                bytenr, blocksize);
        return eb;
 }
 
 
 int btrfs_write_tree_block(struct extent_buffer *buf)
 {
-       return filemap_fdatawrite_range(buf->first_page->mapping, buf->start,
+       return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start,
                                        buf->start + buf->len - 1);
 }
 
 int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
 {
-       return filemap_fdatawait_range(buf->first_page->mapping,
+       return filemap_fdatawait_range(buf->pages[0]->mapping,
                                       buf->start, buf->start + buf->len - 1);
 }
 
        return 0;
 }
 
-static int bio_ready_for_csum(struct bio *bio)
-{
-       u64 length = 0;
-       u64 buf_len = 0;
-       u64 start = 0;
-       struct page *page;
-       struct extent_io_tree *io_tree = NULL;
-       struct bio_vec *bvec;
-       int i;
-       int ret;
-
-       bio_for_each_segment(bvec, bio, i) {
-               page = bvec->bv_page;
-               if (page->private == EXTENT_PAGE_PRIVATE) {
-                       length += bvec->bv_len;
-                       continue;
-               }
-               if (!page->private) {
-                       length += bvec->bv_len;
-                       continue;
-               }
-               length = bvec->bv_len;
-               buf_len = page->private >> 2;
-               start = page_offset(page) + bvec->bv_offset;
-               io_tree = &BTRFS_I(page->mapping->host)->io_tree;
-       }
-       /* are we fully contained in this bio? */
-       if (buf_len <= length)
-               return 1;
-
-       ret = extent_range_uptodate(io_tree, start + length,
-                                   start + buf_len - 1);
-       return ret;
-}
-
 /*
  * called by the kthread helper functions to finally call the bio end_io
  * functions.  This is where read checksum verification actually happens
        bio = end_io_wq->bio;
        fs_info = end_io_wq->info;
 
-       /* metadata bio reads are special because the whole tree block must
-        * be checksummed at once.  This makes sure the entire block is in
-        * ram and up to date before trying to verify things.  For
-        * blocksize <= pagesize, it is basically a noop
-        */
-       if (!(bio->bi_rw & REQ_WRITE) && end_io_wq->metadata &&
-           !bio_ready_for_csum(bio)) {
-               btrfs_queue_worker(&fs_info->endio_meta_workers,
-                                  &end_io_wq->work);
-               return;
-       }
        error = end_io_wq->error;
        bio->bi_private = end_io_wq->private;
        bio->bi_end_io = end_io_wq->end_io;
                goto fail_alloc;
        }
 
+       if (btrfs_super_leafsize(disk_super) !=
+           btrfs_super_nodesize(disk_super)) {
+               printk(KERN_ERR "BTRFS: couldn't mount because metadata "
+                      "blocksizes don't match.  node %d leaf %d\n",
+                      btrfs_super_nodesize(disk_super),
+                      btrfs_super_leafsize(disk_super));
+               err = -EINVAL;
+               goto fail_alloc;
+       }
+       if (btrfs_super_leafsize(disk_super) > BTRFS_MAX_METADATA_BLOCKSIZE) {
+               printk(KERN_ERR "BTRFS: couldn't mount because metadata "
+                      "blocksize (%d) was too large\n",
+                      btrfs_super_leafsize(disk_super));
+               err = -EINVAL;
+               goto fail_alloc;
+       }
+
        features = btrfs_super_incompat_flags(disk_super);
        features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
        if (tree_root->fs_info->compress_type & BTRFS_COMPRESS_LZO)
                features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
+
+       /*
+        * flag our filesystem as having big metadata blocks if
+        * they are bigger than the page size
+        */
+       if (btrfs_super_leafsize(disk_super) > PAGE_CACHE_SIZE) {
+               if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA))
+                       printk(KERN_INFO "btrfs flagging fs with big metadata feature\n");
+               features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
+       }
+
        btrfs_set_super_incompat_flags(disk_super, features);
 
        features = btrfs_super_compat_ro_flags(disk_super) &
 int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid)
 {
        int ret;
-       struct inode *btree_inode = buf->first_page->mapping->host;
+       struct inode *btree_inode = buf->pages[0]->mapping->host;
 
        ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf,
                                     NULL);
 
 int btrfs_set_buffer_uptodate(struct extent_buffer *buf)
 {
-       struct inode *btree_inode = buf->first_page->mapping->host;
+       struct inode *btree_inode = buf->pages[0]->mapping->host;
        return set_extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree,
                                          buf);
 }
 
 void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
 {
-       struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
+       struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root;
        u64 transid = btrfs_header_generation(buf);
        struct inode *btree_inode = root->fs_info->btree_inode;
        int was_dirty;
 
 int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
 {
-       struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
+       struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root;
        int ret;
        ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
        if (ret == 0)
 
 inline struct page *extent_buffer_page(struct extent_buffer *eb,
                                              unsigned long i)
 {
-       struct page *p;
-       struct address_space *mapping;
-
-       if (i == 0)
-               return eb->first_page;
-       i += eb->start >> PAGE_CACHE_SHIFT;
-       mapping = eb->first_page->mapping;
-       if (!mapping)
-               return NULL;
-
-       /*
-        * extent_buffer_page is only called after pinning the page
-        * by increasing the reference count.  So we know the page must
-        * be in the radix tree.
-        */
-       rcu_read_lock();
-       p = radix_tree_lookup(&mapping->page_tree, i);
-       rcu_read_unlock();
-
-       return p;
+       return eb->pages[i];
 }
 
 inline unsigned long num_extent_pages(u64 start, u64 len)
                (start >> PAGE_CACHE_SHIFT);
 }
 
+/*
+ * Free the extent_buffer struct itself: unhook it from the leak-debug
+ * list, free the out-of-line page pointer array (when one was
+ * allocated instead of using eb->inline_pages) and return the eb to
+ * its slab cache.  The backing pages are NOT released here.
+ */
+static void __free_extent_buffer(struct extent_buffer *eb)
+{
+#if LEAK_DEBUG
+       unsigned long flags;
+       spin_lock_irqsave(&leak_lock, flags);
+       list_del(&eb->leak_list);
+       spin_unlock_irqrestore(&leak_lock, flags);
+#endif
+       if (eb->pages && eb->pages != eb->inline_pages)
+               kfree(eb->pages);
+       kmem_cache_free(extent_buffer_cache, eb);
+}
+
 static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
                                                   u64 start,
                                                   unsigned long len,
        spin_unlock_irqrestore(&leak_lock, flags);
 #endif
        atomic_set(&eb->refs, 1);
+       atomic_set(&eb->pages_reading, 0);
+
+       if (len > MAX_INLINE_EXTENT_BUFFER_SIZE) {
+               struct page **pages;
+               int num_pages = (len + PAGE_CACHE_SIZE - 1) >>
+                       PAGE_CACHE_SHIFT;
+               /*
+                * the array must hold num_pages page pointers, not
+                * num_pages bytes: kzalloc(num_pages, mask) would
+                * under-allocate by a factor of sizeof(struct page *)
+                * and later eb->pages[i] stores would run off the end
+                */
+               pages = kzalloc(num_pages * sizeof(struct page *), mask);
+               if (!pages) {
+                       __free_extent_buffer(eb);
+                       return NULL;
+               }
+               eb->pages = pages;
+       } else {
+               /* small buffers use the array embedded in the eb itself */
+               eb->pages = eb->inline_pages;
+       }
 
        return eb;
 }
 
-static void __free_extent_buffer(struct extent_buffer *eb)
-{
-#if LEAK_DEBUG
-       unsigned long flags;
-       spin_lock_irqsave(&leak_lock, flags);
-       list_del(&eb->leak_list);
-       spin_unlock_irqrestore(&leak_lock, flags);
-#endif
-       kmem_cache_free(extent_buffer_cache, eb);
-}
-
 /*
  * Helper for releasing extent buffer page.
  */
        unsigned long index;
        struct page *page;
 
-       if (!eb->first_page)
-               return;
-
        index = num_extent_pages(eb->start, eb->len);
        if (start_idx >= index)
                return;
 }
 
 struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
-                                         u64 start, unsigned long len,
-                                         struct page *page0)
+                                         u64 start, unsigned long len)
 {
        unsigned long num_pages = num_extent_pages(start, len);
        unsigned long i;
        eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
        if (eb && atomic_inc_not_zero(&eb->refs)) {
                rcu_read_unlock();
-               mark_page_accessed(eb->first_page);
+               mark_page_accessed(eb->pages[0]);
                return eb;
        }
        rcu_read_unlock();
        if (!eb)
                return NULL;
 
-       if (page0) {
-               eb->first_page = page0;
-               i = 1;
-               index++;
-               page_cache_get(page0);
-               mark_page_accessed(page0);
-               set_page_extent_mapped(page0);
-               set_page_extent_head(page0, len);
-               uptodate = PageUptodate(page0);
-       } else {
-               i = 0;
-       }
-       for (; i < num_pages; i++, index++) {
+       for (i = 0; i < num_pages; i++, index++) {
                p = find_or_create_page(mapping, index, GFP_NOFS);
                if (!p) {
                        WARN_ON(1);
                        goto free_eb;
                }
-               set_page_extent_mapped(p);
                mark_page_accessed(p);
-               if (i == 0) {
-                       eb->first_page = p;
-                       set_page_extent_head(p, len);
-               } else {
-                       set_page_private(p, EXTENT_PAGE_PRIVATE);
-               }
+               eb->pages[i] = p;
                if (!PageUptodate(p))
                        uptodate = 0;
 
                 * see below about how we avoid a nasty race with release page
                 * and why we unlock later
                 */
-               if (i != 0)
-                       unlock_page(p);
        }
        if (uptodate)
                set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
         * after the extent buffer is in the radix tree so
         * it doesn't get lost
         */
-       set_page_extent_mapped(eb->first_page);
-       set_page_extent_head(eb->first_page, eb->len);
-       if (!page0)
-               unlock_page(eb->first_page);
+       set_page_extent_mapped(eb->pages[0]);
+       set_page_extent_head(eb->pages[0], eb->len);
+       SetPageChecked(eb->pages[0]);
+       for (i = 1; i < num_pages; i++) {
+               p = extent_buffer_page(eb, i);
+               set_page_extent_mapped(p);
+               ClearPageChecked(p);
+               unlock_page(p);
+       }
+       unlock_page(eb->pages[0]);
        return eb;
 
 free_eb:
-       if (eb->first_page && !page0)
-               unlock_page(eb->first_page);
+       for (i = 0; i < num_pages; i++) {
+               if (eb->pages[i])
+                       unlock_page(eb->pages[i]);
+       }
 
        if (!atomic_dec_and_test(&eb->refs))
                return exists;
        eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
        if (eb && atomic_inc_not_zero(&eb->refs)) {
                rcu_read_unlock();
-               mark_page_accessed(eb->first_page);
+               mark_page_accessed(eb->pages[0]);
                return eb;
        }
        rcu_read_unlock();
        int ret = 0;
        int locked_pages = 0;
        int all_uptodate = 1;
-       int inc_all_pages = 0;
        unsigned long num_pages;
+       unsigned long num_reads = 0;
        struct bio *bio = NULL;
        unsigned long bio_flags = 0;
 
                        lock_page(page);
                }
                locked_pages++;
-               if (!PageUptodate(page))
+               if (!PageUptodate(page)) {
+                       num_reads++;
                        all_uptodate = 0;
+               }
        }
        if (all_uptodate) {
                if (start_i == 0)
                goto unlock_exit;
        }
 
+       atomic_set(&eb->pages_reading, num_reads);
        for (i = start_i; i < num_pages; i++) {
                page = extent_buffer_page(eb, i);
-
-               WARN_ON(!PagePrivate(page));
-
                set_page_extent_mapped(page);
                if (i == 0)
                        set_page_extent_head(page, eb->len);
-
-               if (inc_all_pages)
-                       page_cache_get(page);
                if (!PageUptodate(page)) {
-                       if (start_i == 0)
-                               inc_all_pages = 1;
                        ClearPageError(page);
                        err = __extent_read_full_page(tree, page,
                                                      get_extent, &bio,
 {
        char *dst_kaddr = page_address(dst_page);
        char *src_kaddr;
+       int must_memmove = 0;
 
        if (dst_page != src_page) {
                src_kaddr = page_address(src_page);
        } else {
                src_kaddr = dst_kaddr;
-               BUG_ON(areas_overlap(src_off, dst_off, len));
+               if (areas_overlap(src_off, dst_off, len))
+                       must_memmove = 1;
        }
 
-       memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
+       if (must_memmove)
+               memmove(dst_kaddr + dst_off, src_kaddr + src_off, len);
+       else
+               memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
 }
 
 void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
                       "len %lu len %lu\n", dst_offset, len, dst->len);
                BUG_ON(1);
        }
-       if (!areas_overlap(src_offset, dst_offset, len)) {
+       if (dst_offset < src_offset) {
                memcpy_extent_buffer(dst, dst_offset, src_offset, len);
                return;
        }
                return ret;
        }
 
-       if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
+       if (atomic_read(&eb->refs) > 1 ||
+           test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
                ret = 0;
                goto out;
        }
                ret = 0;
                goto out;
        }
-
        radix_tree_delete(&tree->buffer, start >> PAGE_CACHE_SHIFT);
 out:
        spin_unlock(&tree->buffer_lock);