* Delayed allocation stuff
  */
 
-static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd);
-
-/*
- * mpage_da_submit_io - walks through extent of pages and try to write
- * them with writepage() call back
- *
- * @mpd->inode: inode
- * @mpd->first_page: first page of the extent
- * @mpd->next_page: page after the last page of the extent
- *
- * By the time mpage_da_submit_io() is called we expect all blocks
- * to be allocated. this may be wrong if allocation failed.
- *
- * As pages are already locked by write_cache_pages(), we can't use it
- */
-static int mpage_da_submit_io(struct mpage_da_data *mpd,
-                             struct ext4_map_blocks *map)
-{
-       struct pagevec pvec;
-       unsigned long index, end;
-       int ret = 0, err, nr_pages, i;
-       struct inode *inode = mpd->inode;
-       struct address_space *mapping = inode->i_mapping;
-       loff_t size = i_size_read(inode);
-       unsigned int len, block_start;
-       struct buffer_head *bh, *page_bufs = NULL;
-       sector_t pblock = 0, cur_logical = 0;
-       struct ext4_io_submit io_submit;
-
-       BUG_ON(mpd->next_page <= mpd->first_page);
-       ext4_io_submit_init(&io_submit, mpd->wbc);
-       io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS);
-       if (!io_submit.io_end) {
-               ext4_da_block_invalidatepages(mpd);
-               return -ENOMEM;
-       }
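+/*
+ * State of the delayed allocation writeback: the range of pages being
+ * written back and the extent of blocks currently being mapped.
+ */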
+struct mpage_da_data {
+       struct inode *inode;
+       struct writeback_control *wbc;
+       pgoff_t first_page;     /* The first page to write */
+       pgoff_t next_page;      /* Current page to examine */
+       pgoff_t last_page;      /* Last page to examine */
        /*
-        * We need to start from the first_page to the next_page - 1
-        * to make sure we also write the mapped dirty buffer_heads.
-        * If we look at mpd->b_blocknr we would only be looking
-        * at the currently mapped buffer_heads.
+        * Extent to map - this can begin after first_page because that page
+        * can already be fully mapped. We somewhat abuse m_flags to store
+        * whether the extent is delalloc or unwritten.
         */
-       index = mpd->first_page;
-       end = mpd->next_page - 1;
-
-       pagevec_init(&pvec, 0);
-       while (index <= end) {
-               nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
-               if (nr_pages == 0)
-                       break;
-               for (i = 0; i < nr_pages; i++) {
-                       int skip_page = 0;
-                       struct page *page = pvec.pages[i];
-
-                       index = page->index;
-                       if (index > end)
-                               break;
-
-                       if (index == size >> PAGE_CACHE_SHIFT)
-                               len = size & ~PAGE_CACHE_MASK;
-                       else
-                               len = PAGE_CACHE_SIZE;
-                       if (map) {
-                               cur_logical = index << (PAGE_CACHE_SHIFT -
-                                                       inode->i_blkbits);
-                               pblock = map->m_pblk + (cur_logical -
-                                                       map->m_lblk);
-                       }
-                       index++;
-
-                       BUG_ON(!PageLocked(page));
-                       BUG_ON(PageWriteback(page));
-
-                       bh = page_bufs = page_buffers(page);
-                       block_start = 0;
-                       do {
-                               if (map && (cur_logical >= map->m_lblk) &&
-                                   (cur_logical <= (map->m_lblk +
-                                                    (map->m_len - 1)))) {
-                                       if (buffer_delay(bh)) {
-                                               clear_buffer_delay(bh);
-                                               bh->b_blocknr = pblock;
-                                       }
-                                       if (buffer_unwritten(bh) ||
-                                           buffer_mapped(bh))
-                                               BUG_ON(bh->b_blocknr != pblock);
-                                       if (map->m_flags & EXT4_MAP_UNINIT)
-                                               set_buffer_uninit(bh);
-                                       clear_buffer_unwritten(bh);
-                               }
-
-                               /*
-                                * skip page if block allocation undone and
-                                * block is dirty
-                                */
-                               if (ext4_bh_delay_or_unwritten(NULL, bh))
-                                       skip_page = 1;
-                               bh = bh->b_this_page;
-                               block_start += bh->b_size;
-                               cur_logical++;
-                               pblock++;
-                       } while (bh != page_bufs);
-
-                       if (skip_page) {
-                               unlock_page(page);
-                               continue;
-                       }
-
-                       clear_page_dirty_for_io(page);
-                       err = ext4_bio_write_page(&io_submit, page, len,
-                                                 mpd->wbc);
-                       if (!err)
-                               mpd->pages_written++;
-                       /*
-                        * In error case, we have to continue because
-                        * remaining pages are still locked
-                        */
-                       if (ret == 0)
-                               ret = err;
-               }
-               pagevec_release(&pvec);
-       }
-       ext4_io_submit(&io_submit);
-       /* Drop io_end reference we got from init */
-       ext4_put_io_end_defer(io_submit.io_end);
-       return ret;
-}
+       struct ext4_map_blocks map;
+       struct ext4_io_submit io_submit;        /* IO submission data */
+};
 
-static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)
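+/*
+ * mpage_release_unused_pages - unlock pages we did not write out
+ *
+ * @mpd - range of locked pages (mpd->first_page to mpd->next_page - 1)
+ * @invalidate - whether to also invalidate the pages, used when block
+ *              allocation failed and the pages cannot be written out
+ *
+ * Unlock the pages in the range. When @invalidate is true, the pages'
+ * buffers are invalidated as well and the range is removed from the extent
+ * status tree.
+ */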
+static void mpage_release_unused_pages(struct mpage_da_data *mpd,
+                                      bool invalidate)
 {
        int nr_pages, i;
        pgoff_t index, end;
        struct pagevec pvec;
        struct inode *inode = mpd->inode;
        struct address_space *mapping = inode->i_mapping;
-       ext4_lblk_t start, last;
+
+       /* Nothing to do if no pages were collected (next_page == 0). */
+       if (mpd->first_page >= mpd->next_page)
+               return;
 
        index = mpd->first_page;
        end   = mpd->next_page - 1;
-
-       start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
-       last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits);
-       ext4_es_remove_extent(inode, start, last - start + 1);
+       if (invalidate) {
+               ext4_lblk_t start, last;
+               start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+               last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+               ext4_es_remove_extent(inode, start, last - start + 1);
+       }
 
        pagevec_init(&pvec, 0);
        while (index <= end) {
                                break;
                        BUG_ON(!PageLocked(page));
                        BUG_ON(PageWriteback(page));
-                       block_invalidatepage(page, 0, PAGE_CACHE_SIZE);
-                       ClearPageUptodate(page);
+                       if (invalidate) {
+                               block_invalidatepage(page, 0, PAGE_CACHE_SIZE);
+                               ClearPageUptodate(page);
+                       }
                        unlock_page(page);
                }
                index = pvec.pages[nr_pages - 1]->index + 1;
                pagevec_release(&pvec);
        }
-       return;
 }
 
 static void ext4_print_free_blocks(struct inode *inode)
        return;
 }
 
-/*
- * mpage_da_map_and_submit - go through given space, map them
- *       if necessary, and then submit them for I/O
- *
- * @mpd - bh describing space
- *
- * The function skips space we know is already mapped to disk blocks.
- *
- */
-static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
-{
-       int err, blks, get_blocks_flags;
-       struct ext4_map_blocks map, *mapp = NULL;
-       sector_t next = mpd->b_blocknr;
-       unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
-       loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
-       handle_t *handle = NULL;
-
-       /*
-        * If the blocks are mapped already, or we couldn't accumulate
-        * any blocks, then proceed immediately to the submission stage.
-        */
-       if ((mpd->b_size == 0) ||
-           ((mpd->b_state  & (1 << BH_Mapped)) &&
-            !(mpd->b_state & (1 << BH_Delay)) &&
-            !(mpd->b_state & (1 << BH_Unwritten))))
-               goto submit_io;
-
-       handle = ext4_journal_current_handle();
-       BUG_ON(!handle);
-
-       /*
-        * Call ext4_map_blocks() to allocate any delayed allocation
-        * blocks, or to convert an uninitialized extent to be
-        * initialized (in the case where we have written into
-        * one or more preallocated blocks).
-        *
-        * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to
-        * indicate that we are on the delayed allocation path.  This
-        * affects functions in many different parts of the allocation
-        * call path.  This flag exists primarily because we don't
-        * want to change *many* call functions, so ext4_map_blocks()
-        * will set the EXT4_STATE_DELALLOC_RESERVED flag once the
-        * inode's allocation semaphore is taken.
-        *
-        * If the blocks in questions were delalloc blocks, set
-        * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting
-        * variables are updated after the blocks have been allocated.
-        */
-       map.m_lblk = next;
-       map.m_len = max_blocks;
-       /*
-        * We're in delalloc path and it is possible that we're going to
-        * need more metadata blocks than previously reserved. However
-        * we must not fail because we're in writeback and there is
-        * nothing we can do about it so it might result in data loss.
-        * So use reserved blocks to allocate metadata if possible.
-        */
-       get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
-                          EXT4_GET_BLOCKS_METADATA_NOFAIL;
-       if (ext4_should_dioread_nolock(mpd->inode))
-               get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
-       if (mpd->b_state & (1 << BH_Delay))
-               get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
-
-
-       blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags);
-       if (blks < 0) {
-               struct super_block *sb = mpd->inode->i_sb;
-
-               err = blks;
-               /*
-                * If get block returns EAGAIN or ENOSPC and there
-                * appears to be free blocks we will just let
-                * mpage_da_submit_io() unlock all of the pages.
-                */
-               if (err == -EAGAIN)
-                       goto submit_io;
-
-               if (err == -ENOSPC && ext4_count_free_clusters(sb)) {
-                       mpd->retval = err;
-                       goto submit_io;
-               }
-
-               /*
-                * get block failure will cause us to loop in
-                * writepages, because a_ops->writepage won't be able
-                * to make progress. The page will be redirtied by
-                * writepage and writepages will again try to write
-                * the same.
-                */
-               if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) {
-                       ext4_msg(sb, KERN_CRIT,
-                                "delayed block allocation failed for inode %lu "
-                                "at logical offset %llu with max blocks %zd "
-                                "with error %d", mpd->inode->i_ino,
-                                (unsigned long long) next,
-                                mpd->b_size >> mpd->inode->i_blkbits, err);
-                       ext4_msg(sb, KERN_CRIT,
-                               "This should not happen!! Data will be lost");
-                       if (err == -ENOSPC)
-                               ext4_print_free_blocks(mpd->inode);
-               }
-               /* invalidate all the pages */
-               ext4_da_block_invalidatepages(mpd);
-
-               /* Mark this page range as having been completed */
-               mpd->io_done = 1;
-               return;
-       }
-       BUG_ON(blks == 0);
-
-       mapp = ↦
-       if (map.m_flags & EXT4_MAP_NEW) {
-               struct block_device *bdev = mpd->inode->i_sb->s_bdev;
-               int i;
-
-               for (i = 0; i < map.m_len; i++)
-                       unmap_underlying_metadata(bdev, map.m_pblk + i);
-       }
-
-       /*
-        * Update on-disk size along with block allocation.
-        */
-       disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits;
-       if (disksize > i_size_read(mpd->inode))
-               disksize = i_size_read(mpd->inode);
-       if (disksize > EXT4_I(mpd->inode)->i_disksize) {
-               ext4_update_i_disksize(mpd->inode, disksize);
-               err = ext4_mark_inode_dirty(handle, mpd->inode);
-               if (err)
-                       ext4_error(mpd->inode->i_sb,
-                                  "Failed to mark inode %lu dirty",
-                                  mpd->inode->i_ino);
-       }
-
-submit_io:
-       mpage_da_submit_io(mpd, mapp);
-       mpd->io_done = 1;
-}
-
-#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
-               (1 << BH_Delay) | (1 << BH_Unwritten))
-
-/*
- * mpage_add_bh_to_extent - try to add one more block to extent of blocks
- *
- * @mpd->lbh - extent of blocks
- * @logical - logical number of the block in the file
- * @b_state - b_state of the buffer head added
- *
- * the function is used to collect contig. blocks in same state
- */
-static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, sector_t logical,
-                                  unsigned long b_state)
-{
-       sector_t next;
-       int blkbits = mpd->inode->i_blkbits;
-       int nrblocks = mpd->b_size >> blkbits;
-
-       /*
-        * XXX Don't go larger than mballoc is willing to allocate
-        * This is a stopgap solution.  We eventually need to fold
-        * mpage_da_submit_io() into this function and then call
-        * ext4_map_blocks() multiple times in a loop
-        */
-       if (nrblocks >= (8*1024*1024 >> blkbits))
-               goto flush_it;
-
-       /* check if the reserved journal credits might overflow */
-       if (!ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS)) {
-               if (nrblocks >= EXT4_MAX_TRANS_DATA) {
-                       /*
-                        * With non-extent format we are limited by the journal
-                        * credit available.  Total credit needed to insert
-                        * nrblocks contiguous blocks is dependent on the
-                        * nrblocks.  So limit nrblocks.
-                        */
-                       goto flush_it;
-               }
-       }
-       /*
-        * First block in the extent
-        */
-       if (mpd->b_size == 0) {
-               mpd->b_blocknr = logical;
-               mpd->b_size = 1 << blkbits;
-               mpd->b_state = b_state & BH_FLAGS;
-               return;
-       }
-
-       next = mpd->b_blocknr + nrblocks;
-       /*
-        * Can we merge the block to our big extent?
-        */
-       if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) {
-               mpd->b_size += 1 << blkbits;
-               return;
-       }
-
-flush_it:
-       /*
-        * We couldn't merge the block to our extent, so we
-        * need to flush current  extent and start new one
-        */
-       mpage_da_map_and_submit(mpd);
-       return;
-}
-
 static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
 {
        return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh);
        return ret;
 }
 
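+/* Buffer state bits we track when collecting the extent to map */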
+#define BH_FLAGS ((1 << BH_Unwritten) | (1 << BH_Delay))
+
 /*
  * mballoc gives us at most this number of blocks...
  * XXX: That seems to be only a limitation of ext4_mb_normalize_request().
  */
 #define MAX_WRITEPAGES_EXTENT_LEN 2048
 
+/*
+ * mpage_add_bh_to_extent - try to add bh to extent of blocks to map
+ *
+ * @mpd - extent of blocks
+ * @lblk - logical number of the block in the file
+ * @b_state - b_state of the buffer head added
+ *
+ * The function is used to collect contiguous blocks in the same state
+ * (delayed or unwritten). Returns 1 if the block was added to the extent
+ * being built, 0 if it cannot be merged and the current extent must be
+ * mapped first.
+ */
+static int mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk,
+                                 unsigned long b_state)
+{
+       struct ext4_map_blocks *map = &mpd->map;
+
+       /* Don't go larger than mballoc is willing to allocate */
+       if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN)
+               return 0;
+
+       /* First block in the extent? */
+       if (map->m_len == 0) {
+               map->m_lblk = lblk;
+               map->m_len = 1;
+               map->m_flags = b_state & BH_FLAGS;
+               return 1;
+       }
+
+       /* Can we merge the block to our big extent? */
+       if (lblk == map->m_lblk + map->m_len &&
+           (b_state & BH_FLAGS) == map->m_flags) {
+               map->m_len++;
+               return 1;
+       }
+       return 0;
+}
+
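+/*
+ * add_page_bufs_to_extent - add page's buffers needing mapping to the extent
+ *
+ * @mpd - extent to map
+ * @head - first buffer in the page
+ * @bh - buffer to start the scan from
+ * @lblk - logical block number of @bh
+ *
+ * Scan buffers from @bh to the end of the page and add dirty delayed or
+ * unwritten buffers within i_size to mpd->map. Returns true if the whole
+ * page was scanned (or end of file was reached), false once the extent to
+ * map is complete and must be mapped before we can continue.
+ */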
+static bool add_page_bufs_to_extent(struct mpage_da_data *mpd,
+                                   struct buffer_head *head,
+                                   struct buffer_head *bh,
+                                   ext4_lblk_t lblk)
+{
+       struct inode *inode = mpd->inode;
+       ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1)
+                                                       >> inode->i_blkbits;
+
+       do {
+               BUG_ON(buffer_locked(bh));
+
+               if (!buffer_dirty(bh) || !buffer_mapped(bh) ||
+                   (!buffer_delay(bh) && !buffer_unwritten(bh)) ||
+                   lblk >= blocks) {
+                       /* Found extent to map? */
+                       if (mpd->map.m_len)
+                               return false;
+                       if (lblk >= blocks)
+                               return true;
+                       continue;
+               }
+               if (!mpage_add_bh_to_extent(mpd, lblk, bh->b_state))
+                       return false;
+       } while (lblk++, (bh = bh->b_this_page) != head);
+       return true;
+}
+
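+/*
+ * mpage_submit_page - submit a single fully mapped page for IO
+ *
+ * Compute how many bytes of the page lie within i_size, clear the page's
+ * dirty bit and hand the page over to mpd->io_submit for writeback. On
+ * success the page is accounted against wbc->nr_to_write; in either case
+ * mpd->first_page is advanced past the page.
+ */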
+static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page)
+{
+       int len;
+       loff_t size = i_size_read(mpd->inode);
+       int err;
+
+       BUG_ON(page->index != mpd->first_page);
+       if (page->index == size >> PAGE_CACHE_SHIFT)
+               len = size & ~PAGE_CACHE_MASK;
+       else
+               len = PAGE_CACHE_SIZE;
+       clear_page_dirty_for_io(page);
+       err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc);
+       if (!err)
+               mpd->wbc->nr_to_write--;
+       mpd->first_page++;
+
+       return err;
+}
+
+/*
+ * mpage_map_and_submit_buffers - update buffers corresponding to changed
+ *                               extent and submit fully mapped pages for IO
+ *
+ * @mpd - description of extent to map, on return next extent to map
+ *
+ * Scan buffers corresponding to the changed extent (we expect the
+ * corresponding pages to be already locked) and update buffer state according
+ * to the new extent state. We map delalloc buffers to their physical location
+ * and clear the unwritten bit; when we write into uninitialized extents we
+ * mark buffers as uninit so that extent conversion is performed after IO
+ * finishes. If the last page is not fully mapped, we update @map to describe
+ * the next extent within that page that needs mapping. Otherwise we submit
+ * the page for IO.
+ */
+static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
+{
+       struct pagevec pvec;
+       int nr_pages, i;
+       struct inode *inode = mpd->inode;
+       struct buffer_head *head, *bh;
+       int bpp_bits = PAGE_CACHE_SHIFT - inode->i_blkbits;
+       ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1)
+                                                       >> inode->i_blkbits;
+       pgoff_t start, end;
+       ext4_lblk_t lblk;
+       sector_t pblock;
+       int err;
+
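+       /*
+        * Compute the range of pages covered by the extent to map, e.g. with
+        * 4k pages and 1k blocks bpp_bits == 2 and each page covers 4 blocks.
+        */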
+       start = mpd->map.m_lblk >> bpp_bits;
+       end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits;
+       lblk = start << bpp_bits;
+       pblock = mpd->map.m_pblk;
+
+       pagevec_init(&pvec, 0);
+       while (start <= end) {
+               nr_pages = pagevec_lookup(&pvec, inode->i_mapping, start,
+                                         PAGEVEC_SIZE);
+               if (nr_pages == 0)
+                       break;
+               for (i = 0; i < nr_pages; i++) {
+                       struct page *page = pvec.pages[i];
+
+                       if (page->index > end)
+                               break;
+                       /* Up to 'end' pages must be contiguous */
+                       BUG_ON(page->index != start);
+                       bh = head = page_buffers(page);
+                       do {
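+                               /* Skip buffers before the mapped extent */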
+                               if (lblk < mpd->map.m_lblk)
+                                       continue;
+                               if (lblk >= mpd->map.m_lblk + mpd->map.m_len) {
+                                       /*
+                                        * Buffer after end of mapped extent.
+                                        * Find next buffer in the page to map.
+                                        */
+                                       mpd->map.m_len = 0;
+                                       mpd->map.m_flags = 0;
+                                       add_page_bufs_to_extent(mpd, head, bh,
+                                                               lblk);
+                                       pagevec_release(&pvec);
+                                       return 0;
+                               }
+                               if (buffer_delay(bh)) {
+                                       clear_buffer_delay(bh);
+                                       bh->b_blocknr = pblock++;
+                               }
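+                               /*
+                                * Writes into uninitialized extents: mark the
+                                * buffer for conversion to initialized after
+                                * IO completes.
+                                */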
+                               if (mpd->map.m_flags & EXT4_MAP_UNINIT)
+                                       set_buffer_uninit(bh);
+                               clear_buffer_unwritten(bh);
+                       } while (++lblk < blocks &&
+                                (bh = bh->b_this_page) != head);
+
+                       /*
+                        * FIXME: This is going to break if dioread_nolock
+                        * supports blocksize < pagesize as we will try to
+                        * convert potentially unmapped parts of inode.
+                        */
+                       mpd->io_submit.io_end->size += PAGE_CACHE_SIZE;
+                       /* Page fully mapped - let IO run! */
+                       err = mpage_submit_page(mpd, page);
+                       if (err < 0) {
+                               pagevec_release(&pvec);
+                               return err;
+                       }
+                       start++;
+               }
+               pagevec_release(&pvec);
+       }
+       /* Extent fully mapped and it ends on a page boundary. We are done. */
+       mpd->map.m_len = 0;
+       mpd->map.m_flags = 0;
+       return 0;
+}
+
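+/*
+ * mpage_map_one_extent - map one contiguous extent of the range described by
+ *                       mpd->map via ext4_map_blocks()
+ *
+ * Allocate delayed blocks or convert uninitialized extents for the range in
+ * mpd->map. ext4_map_blocks() may map fewer blocks than requested; on return
+ * mpd->map describes what was actually mapped.
+ */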
+static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
+{
+       struct inode *inode = mpd->inode;
+       struct ext4_map_blocks *map = &mpd->map;
+       int get_blocks_flags;
+       int err;
+
+       trace_ext4_da_write_pages_extent(inode, map);
+       /*
+        * Call ext4_map_blocks() to allocate any delayed allocation blocks, or
+        * to convert an uninitialized extent to be initialized (in the case
+        * where we have written into one or more preallocated blocks).  It is
+        * possible that we're going to need more metadata blocks than
+        * previously reserved. However we must not fail because we're in
+        * writeback and there is nothing we can do about it so it might result
+        * in data loss.  So use reserved blocks to allocate metadata if
+        * possible.
+        *
+        * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if the blocks
+        * in question are delalloc blocks.  This affects functions in many
+        * different parts of the allocation call path.  This flag exists
+        * primarily because we don't want to change *many* call functions, so
+        * ext4_map_blocks() will set the EXT4_STATE_DELALLOC_RESERVED flag
+        * once the inode's allocation semaphore is taken.
+        */
+       get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
+                          EXT4_GET_BLOCKS_METADATA_NOFAIL;
+       if (ext4_should_dioread_nolock(inode))
+               get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
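+       /* m_flags carries the BH_* bits collected in mpage_add_bh_to_extent() */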
+       if (map->m_flags & (1 << BH_Delay))
+               get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
+
+       err = ext4_map_blocks(handle, inode, map, get_blocks_flags);
+       if (err < 0)
+               return err;
+
+       BUG_ON(map->m_len == 0);
+       if (map->m_flags & EXT4_MAP_NEW) {
+               struct block_device *bdev = inode->i_sb->s_bdev;
+               int i;
+
+               for (i = 0; i < map->m_len; i++)
+                       unmap_underlying_metadata(bdev, map->m_pblk + i);
+       }
+       return 0;
+}
+
+/*
+ * mpage_map_and_submit_extent - map extent starting at mpd->map.m_lblk of
+ *                              length mpd->map.m_len and submit pages
+ *                              underlying it for IO
+ *
+ * @handle - handle for journal operations
+ * @mpd - extent to map
+ *
+ * The function maps the extent starting at mpd->map.m_lblk of length
+ * mpd->map.m_len. If the extent is delayed, blocks are allocated; if it is
+ * unwritten, we may need to convert the blocks to initialized or to split the
+ * described range out of a larger unwritten extent. Note that we need not map
+ * all of the described range since allocation can return fewer blocks than
+ * requested or the range may be covered by several unwritten extents. We
+ * cannot map more because we are limited by reserved transaction credits. On
+ * the other hand we always make sure that the last touched page is fully
+ * mapped so that it can be written out (and thus forward progress is
+ * guaranteed). After mapping we submit all mapped pages for IO.
+ */
+static int mpage_map_and_submit_extent(handle_t *handle,
+                                      struct mpage_da_data *mpd)
+{
+       struct inode *inode = mpd->inode;
+       struct ext4_map_blocks *map = &mpd->map;
+       int err;
+       loff_t disksize;
+
+       mpd->io_submit.io_end->offset =
+                               ((loff_t)map->m_lblk) << inode->i_blkbits;
+       while (map->m_len) {
+               err = mpage_map_one_extent(handle, mpd);
+               if (err < 0) {
+                       struct super_block *sb = inode->i_sb;
+
+                       /*
+                        * Need to commit transaction to free blocks. Let upper
+                        * layers sort it out.
+                        */
+                       if (err == -ENOSPC && ext4_count_free_clusters(sb))
+                               return -ENOSPC;
+
+                       if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) {
+                               ext4_msg(sb, KERN_CRIT,
+                                        "Delayed block allocation failed for "
+                                        "inode %lu at logical offset %llu with"
+                                        " max blocks %u with error %d",
+                                        inode->i_ino,
+                                        (unsigned long long)map->m_lblk,
+                                        (unsigned)map->m_len, err);
+                               ext4_msg(sb, KERN_CRIT,
+                                        "This should not happen!! Data will "
+                                        "be lost");
+                               if (err == -ENOSPC)
+                                       ext4_print_free_blocks(inode);
+                       }
+                       /* invalidate all the pages */
+                       mpage_release_unused_pages(mpd, true);
+                       return err;
+               }
+               /*
+                * Update buffer state, submit mapped pages, and get us new
+                * extent to map
+                */
+               err = mpage_map_and_submit_buffers(mpd);
+               if (err < 0)
+                       return err;
+       }
+
+       /* Update on-disk size after IO is submitted */
+       disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT;
+       if (disksize > i_size_read(inode))
+               disksize = i_size_read(inode);
+       if (disksize > EXT4_I(inode)->i_disksize) {
+               int err2;
+
+               ext4_update_i_disksize(inode, disksize);
+               err2 = ext4_mark_inode_dirty(handle, inode);
+               if (err2)
+                       ext4_error(inode->i_sb,
+                                  "Failed to mark inode %lu dirty",
+                                  inode->i_ino);
+               if (!err)
+                       err = err2;
+       }
+       return err;
+}
+
 /*
  * Calculate the total number of credits to reserve for one writepages
  * iteration. This is called from ext4_da_writepages(). We map an extent of
 }
 
 /*
- * write_cache_pages_da - walk the list of dirty pages of the given
- * address space and accumulate pages that need writing, and call
- * mpage_da_map_and_submit to map a single contiguous memory region
- * and then write them.
+ * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages
+ *                              and underlying extent to map
+ *
+ * @mpd - where to look for pages
+ *
+ * Walk dirty pages in the mapping. If they are fully mapped, submit them for
+ * IO immediately. When we find a page which isn't fully mapped we start
+ * accumulating an extent of buffers underlying these pages that needs mapping
+ * (formed by either delayed or unwritten buffers). We also lock the pages
+ * containing these buffers. The extent found is returned in the @mpd
+ * structure (starting at mpd->map.m_lblk with length mpd->map.m_len blocks).
+ *
+ * Note that this function can attach bios to one io_end structure which are
+ * neither logically nor physically contiguous. Although this may seem an
+ * unnecessary complication, it is actually inevitable in the blocksize <
+ * pagesize case, as we need to track IO to all buffers underlying a page in
+ * one io_end.
  */
-static int write_cache_pages_da(handle_t *handle,
-                               struct address_space *mapping,
-                               struct writeback_control *wbc,
-                               struct mpage_da_data *mpd,
-                               pgoff_t *done_index)
+static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
 {
-       struct buffer_head      *bh, *head;
-       struct inode            *inode = mapping->host;
-       struct pagevec          pvec;
-       unsigned int            nr_pages;
-       sector_t                logical;
-       pgoff_t                 index, end;
-       long                    nr_to_write = wbc->nr_to_write;
-       int                     i, tag, ret = 0;
-
-       memset(mpd, 0, sizeof(struct mpage_da_data));
-       mpd->wbc = wbc;
-       mpd->inode = inode;
-       pagevec_init(&pvec, 0);
-       index = wbc->range_start >> PAGE_CACHE_SHIFT;
-       end = wbc->range_end >> PAGE_CACHE_SHIFT;
+       struct address_space *mapping = mpd->inode->i_mapping;
+       struct pagevec pvec;
+       unsigned int nr_pages;
+       pgoff_t index = mpd->first_page;
+       pgoff_t end = mpd->last_page;
+       int tag;
+       int i, err = 0;
+       int blkbits = mpd->inode->i_blkbits;
+       ext4_lblk_t lblk;
+       struct buffer_head *head;
 
-       if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
+       if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages)
                tag = PAGECACHE_TAG_TOWRITE;
        else
                tag = PAGECACHE_TAG_DIRTY;
 
-       *done_index = index;
+       pagevec_init(&pvec, 0);
+       mpd->map.m_len = 0;
+       mpd->next_page = index;
        while (index <= end) {
                nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
                              min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
                if (nr_pages == 0)
-                       return 0;
+                       goto out;
 
                for (i = 0; i < nr_pages; i++) {
                        struct page *page = pvec.pages[i];
                        if (page->index > end)
                                goto out;
 
-                       *done_index = page->index + 1;
-
-                       /*
-                        * If we can't merge this page, and we have
-                        * accumulated an contiguous region, write it
-                        */
-                       if ((mpd->next_page != page->index) &&
-                           (mpd->next_page != mpd->first_page)) {
-                               mpage_da_map_and_submit(mpd);
-                               goto ret_extent_tail;
-                       }
+                       /* If we can't merge this page, we are done. */
+                       if (mpd->map.m_len > 0 && mpd->next_page != page->index)
+                               goto out;
 
                        lock_page(page);
-
                        /*
-                        * If the page is no longer dirty, or its
-                        * mapping no longer corresponds to inode we
-                        * are writing (which means it has been
-                        * truncated or invalidated), or the page is
-                        * already under writeback and we are not
-                        * doing a data integrity writeback, skip the page
+                        * If the page is no longer dirty, or its mapping no
+                        * longer corresponds to inode we are writing (which
+                        * means it has been truncated or invalidated), or the
+                        * page is already under writeback and we are not doing
+                        * a data integrity writeback, skip the page
                         */
                        if (!PageDirty(page) ||
                            (PageWriteback(page) &&
-                            (wbc->sync_mode == WB_SYNC_NONE)) ||
+                            (mpd->wbc->sync_mode == WB_SYNC_NONE)) ||
                            unlikely(page->mapping != mapping)) {
                                unlock_page(page);
                                continue;
                        wait_on_page_writeback(page);
                        BUG_ON(PageWriteback(page));
 
-                       /*
-                        * If we have inline data and arrive here, it means that
-                        * we will soon create the block for the 1st page, so
-                        * we'd better clear the inline data here.
-                        */
-                       if (ext4_has_inline_data(inode)) {
-                               BUG_ON(ext4_test_inode_state(inode,
-                                               EXT4_STATE_MAY_INLINE_DATA));
-                               ext4_destroy_inline_data(handle, inode);
-                       }
-
-                       if (mpd->next_page != page->index)
+                       if (mpd->map.m_len == 0)
                                mpd->first_page = page->index;
                        mpd->next_page = page->index + 1;
-                       logical = (sector_t) page->index <<
-                               (PAGE_CACHE_SHIFT - inode->i_blkbits);
-
                        /* Add all dirty buffers to mpd */
+                       lblk = ((ext4_lblk_t)page->index) <<
+                               (PAGE_CACHE_SHIFT - blkbits);
                        head = page_buffers(page);
-                       bh = head;
-                       do {
-                               BUG_ON(buffer_locked(bh));
-                               /*
-                                * We need to try to allocate unmapped blocks
-                                * in the same page.  Otherwise we won't make
-                                * progress with the page in ext4_writepage
-                                */
-                               if (ext4_bh_delay_or_unwritten(NULL, bh)) {
-                                       mpage_add_bh_to_extent(mpd, logical,
-                                                              bh->b_state);
-                                       if (mpd->io_done)
-                                               goto ret_extent_tail;
-                               } else if (buffer_dirty(bh) &&
-                                          buffer_mapped(bh)) {
-                                       /*
-                                        * mapped dirty buffer. We need to
-                                        * update the b_state because we look
-                                        * at b_state in mpage_da_map_blocks.
-                                        * We don't update b_size because if we
-                                        * find an unmapped buffer_head later
-                                        * we need to use the b_state flag of
-                                        * that buffer_head.
-                                        */
-                                       if (mpd->b_size == 0)
-                                               mpd->b_state =
-                                                       bh->b_state & BH_FLAGS;
-                               }
-                               logical++;
-                       } while ((bh = bh->b_this_page) != head);
-
-                       if (nr_to_write > 0) {
-                               nr_to_write--;
-                               if (nr_to_write == 0 &&
-                                   wbc->sync_mode == WB_SYNC_NONE)
-                                       /*
-                                        * We stop writing back only if we are
-                                        * not doing integrity sync. In case of
-                                        * integrity sync we have to keep going
-                                        * because someone may be concurrently
-                                        * dirtying pages, and we might have
-                                        * synced a lot of newly appeared dirty
-                                        * pages, but have not synced all of the
-                                        * old dirty pages.
-                                        */
+                       if (!add_page_bufs_to_extent(mpd, head, head, lblk))
+                               goto out;
+                       /* So far everything mapped? Submit the page for IO. */
+                       if (mpd->map.m_len == 0) {
+                               err = mpage_submit_page(mpd, page);
+                               if (err < 0)
                                        goto out;
                        }
+
+                       /*
+                        * Accumulated enough dirty pages? This doesn't apply
+                        * to WB_SYNC_ALL mode. For integrity sync we have to
+                        * keep going because someone may be concurrently
+                        * dirtying pages, and we might have synced a lot of
+                        * newly appeared dirty pages, but have not synced all
+                        * of the old dirty pages.
+                        */
+                       if (mpd->wbc->sync_mode == WB_SYNC_NONE &&
+                           mpd->next_page - mpd->first_page >=
+                                                       mpd->wbc->nr_to_write)
+                               goto out;
                }
                pagevec_release(&pvec);
                cond_resched();
        }
        return 0;
-ret_extent_tail:
-       ret = MPAGE_DA_EXTENT_TAIL;
 out:
        pagevec_release(&pvec);
-       cond_resched();
-       return ret;
+       return err;
 }
 
-
 static int ext4_da_writepages(struct address_space *mapping,
                              struct writeback_control *wbc)
 {
-       pgoff_t index;
+       pgoff_t writeback_index = 0;
+       long nr_to_write = wbc->nr_to_write;
        int range_whole = 0;
+       int cycled = 1;
        handle_t *handle = NULL;
        struct mpage_da_data mpd;
        struct inode *inode = mapping->host;
-       int pages_written = 0;
-       int range_cyclic, cycled = 1, io_done = 0;
        int needed_blocks, ret = 0;
-       loff_t range_start = wbc->range_start;
        struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
-       pgoff_t done_index = 0;
-       pgoff_t end;
+       bool done;
        struct blk_plug plug;
 
        trace_ext4_da_writepages(inode, wbc);
        if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
                return -EROFS;
 
+       /*
+        * If we have inline data and arrive here, it means that
+        * we will soon create the block for the 1st page, so
+        * we'd better clear the inline data here.
+        */
+       if (ext4_has_inline_data(inode)) {
+               /* Just inode will be modified... */
+               handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
+               if (IS_ERR(handle)) {
+                       ret = PTR_ERR(handle);
+                       goto out_writepages;
+               }
+               BUG_ON(ext4_test_inode_state(inode,
+                               EXT4_STATE_MAY_INLINE_DATA));
+               ext4_destroy_inline_data(handle, inode);
+               ext4_journal_stop(handle);
+       }
+
        if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
                range_whole = 1;
 
-       range_cyclic = wbc->range_cyclic;
        if (wbc->range_cyclic) {
-               index = mapping->writeback_index;
-               if (index)
+               writeback_index = mapping->writeback_index;
+               if (writeback_index)
                        cycled = 0;
-               wbc->range_start = index << PAGE_CACHE_SHIFT;
-               wbc->range_end  = LLONG_MAX;
-               wbc->range_cyclic = 0;
-               end = -1;
+               mpd.first_page = writeback_index;
+               mpd.last_page = -1;
        } else {
-               index = wbc->range_start >> PAGE_CACHE_SHIFT;
-               end = wbc->range_end >> PAGE_CACHE_SHIFT;
+               mpd.first_page = wbc->range_start >> PAGE_CACHE_SHIFT;
+               mpd.last_page = wbc->range_end >> PAGE_CACHE_SHIFT;
        }
 
+       mpd.inode = inode;
+       mpd.wbc = wbc;
+       ext4_io_submit_init(&mpd.io_submit, wbc);
 retry:
        if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
-               tag_pages_for_writeback(mapping, index, end);
-
+               tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page);
+       done = false;
        blk_start_plug(&plug);
-       while (!ret && wbc->nr_to_write > 0) {
+       while (!done && mpd.first_page <= mpd.last_page) {
+               /* For each extent of pages we use a new io_end */
+               mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
+               if (!mpd.io_submit.io_end) {
+                       ret = -ENOMEM;
+                       break;
+               }
 
                /*
-                * we  insert one extent at a time. So we need
-                * credit needed for single extent allocation.
-                * journalled mode is currently not supported
-                * by delalloc
+                * We have two constraints: we map one extent at a time, and we
+                * must always write out a whole page (this makes a difference
+                * when blocksize < pagesize) so that we don't block on IO when
+                * we try to write out the rest of the page. Journalled mode is
+                * not supported by delalloc.
                 */
                BUG_ON(ext4_should_journal_data(inode));
                needed_blocks = ext4_da_writepages_trans_blocks(inode);
 
-               /* start a new transaction*/
+               /* start a new transaction */
                handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
                                            needed_blocks);
                if (IS_ERR(handle)) {
                        ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
                               "%ld pages, ino %lu; err %d", __func__,
                                wbc->nr_to_write, inode->i_ino, ret);
-                       blk_finish_plug(&plug);
-                       goto out_writepages;
+                       /* Release allocated io_end */
+                       ext4_put_io_end(mpd.io_submit.io_end);
+                       break;
                }
 
-               /*
-                * Now call write_cache_pages_da() to find the next
-                * contiguous region of logical blocks that need
-                * blocks to be allocated by ext4 and submit them.
-                */
-               ret = write_cache_pages_da(handle, mapping,
-                                          wbc, &mpd, &done_index);
-               /*
-                * If we have a contiguous extent of pages and we
-                * haven't done the I/O yet, map the blocks and submit
-                * them for I/O.
-                */
-               if (!mpd.io_done && mpd.next_page != mpd.first_page) {
-                       mpage_da_map_and_submit(&mpd);
-                       ret = MPAGE_DA_EXTENT_TAIL;
+               trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc);
+               ret = mpage_prepare_extent_to_map(&mpd);
+               if (!ret) {
+                       if (mpd.map.m_len)
+                               ret = mpage_map_and_submit_extent(handle, &mpd);
+                       else {
+                               /*
+                                * We scanned the whole range (or exhausted
+                                * nr_to_write), submitted what was mapped and
+                                * didn't find anything needing mapping. We are
+                                * done.
+                                */
+                               done = true;
+                       }
                }
-               trace_ext4_da_write_pages(inode, &mpd);
-               wbc->nr_to_write -= mpd.pages_written;
-
                ext4_journal_stop(handle);
-
-               if ((mpd.retval == -ENOSPC) && sbi->s_journal) {
-                       /* commit the transaction which would
+               /* Submit prepared bio */
+               ext4_io_submit(&mpd.io_submit);
+               /* Unlock pages we didn't use */
+               mpage_release_unused_pages(&mpd, false);
+               /* Drop our io_end reference we got from init */
+               ext4_put_io_end(mpd.io_submit.io_end);
+
+               if (ret == -ENOSPC && sbi->s_journal) {
+                       /*
+                        * Commit the transaction which would
                         * free blocks released in the transaction
                         * and try again
                         */
                        jbd2_journal_force_commit_nested(sbi->s_journal);
                        ret = 0;
-               } else if (ret == MPAGE_DA_EXTENT_TAIL) {
-                       /*
-                        * Got one extent now try with rest of the pages.
-                        * If mpd.retval is set -EIO, journal is aborted.
-                        * So we don't need to write any more.
-                        */
-                       pages_written += mpd.pages_written;
-                       ret = mpd.retval;
-                       io_done = 1;
-               } else if (wbc->nr_to_write)
-                       /*
-                        * There is no more writeout needed
-                        * or we requested for a noblocking writeout
-                        * and we found the device congested
-                        */
+                       continue;
+               }
+               /* Fatal error - ENOMEM, EIO... */
+               if (ret)
                        break;
        }
        blk_finish_plug(&plug);
-       if (!io_done && !cycled) {
+       if (!ret && !cycled) {
                cycled = 1;
-               index = 0;
-               wbc->range_start = index << PAGE_CACHE_SHIFT;
-               wbc->range_end  = mapping->writeback_index - 1;
+               mpd.last_page = writeback_index - 1;
+               mpd.first_page = 0;
                goto retry;
        }
 
        /* Update index */
-       wbc->range_cyclic = range_cyclic;
        if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
                /*
-                * set the writeback_index so that range_cyclic
+                * Set the writeback_index so that range_cyclic
                 * mode will write it back later
                 */
-               mapping->writeback_index = done_index;
+               mapping->writeback_index = mpd.first_page;
 
 out_writepages:
-       wbc->range_start = range_start;
-       trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
+       trace_ext4_da_writepages_result(inode, wbc, ret,
+                                       nr_to_write - wbc->nr_to_write);
        return ret;
 }