return ret;
 }
 
+/* Caller must provide a bhs[] with all NULL or non-NULL entries, so it
+ * will be easier to handle read failure.
+ */
 int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
                           unsigned int nr, struct buffer_head *bhs[])
 {
        int status = 0;
        unsigned int i;
        struct buffer_head *bh;
+       int new_bh = 0;
 
        trace_ocfs2_read_blocks_sync((unsigned long long)block, nr);
 
        if (!nr)
                goto bail;
 
+       /* Don't put buffer head and re-assign it to NULL if it is allocated
+        * outside since the caller can't be aware of this alternation!
+        */
+       new_bh = (bhs[0] == NULL);
+
        for (i = 0 ; i < nr ; i++) {
                if (bhs[i] == NULL) {
                        bhs[i] = sb_getblk(osb->sb, block++);
                        if (bhs[i] == NULL) {
                                status = -ENOMEM;
                                mlog_errno(status);
-                               goto bail;
+                               break;
                        }
                }
                bh = bhs[i];
                submit_bh(REQ_OP_READ, 0, bh);
        }
 
+read_failure:
        for (i = nr; i > 0; i--) {
                bh = bhs[i - 1];
 
+               if (unlikely(status)) {
+                       if (new_bh && bh) {
+                               /* If middle bh fails, let previous bh
+                                * finish its read and then put it to
+                                * aovoid bh leak
+                                */
+                               if (!buffer_jbd(bh))
+                                       wait_on_buffer(bh);
+                               put_bh(bh);
+                               bhs[i - 1] = NULL;
+                       } else if (bh && buffer_uptodate(bh)) {
+                               clear_buffer_uptodate(bh);
+                       }
+                       continue;
+               }
+
                /* No need to wait on the buffer if it's managed by JBD. */
                if (!buffer_jbd(bh))
                        wait_on_buffer(bh);
                         * so we can safely record this and loop back
                         * to cleanup the other buffers. */
                        status = -EIO;
-                       put_bh(bh);
-                       bhs[i - 1] = NULL;
+                       goto read_failure;
                }
        }
 
        return status;
 }
 
+/* Caller must provide a bhs[] with all NULL or non-NULL entries, so it
+ * will be easier to handle read failure.
+ */
 int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
                      struct buffer_head *bhs[], int flags,
                      int (*validate)(struct super_block *sb,
        int i, ignore_cache = 0;
        struct buffer_head *bh;
        struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
+       int new_bh = 0;
 
        trace_ocfs2_read_blocks_begin(ci, (unsigned long long)block, nr, flags);
 
                goto bail;
        }
 
+       /* Don't put buffer head and re-assign it to NULL if it is allocated
+        * outside since the caller can't be aware of this alternation!
+        */
+       new_bh = (bhs[0] == NULL);
+
        ocfs2_metadata_cache_io_lock(ci);
        for (i = 0 ; i < nr ; i++) {
                if (bhs[i] == NULL) {
                                ocfs2_metadata_cache_io_unlock(ci);
                                status = -ENOMEM;
                                mlog_errno(status);
-                               goto bail;
+                               /* Don't forget to put previous bh! */
+                               break;
                        }
                }
                bh = bhs[i];
                }
        }
 
-       status = 0;
-
+read_failure:
        for (i = (nr - 1); i >= 0; i--) {
                bh = bhs[i];
 
                if (!(flags & OCFS2_BH_READAHEAD)) {
-                       if (status) {
-                               /* Clear the rest of the buffers on error */
-                               put_bh(bh);
-                               bhs[i] = NULL;
+                       if (unlikely(status)) {
+                               /* Clear the buffers on error including those
+                                * ever succeeded in reading
+                                */
+                               if (new_bh && bh) {
+                                       /* If middle bh fails, let previous bh
+                                        * finish its read and then put it to
+                                        * aovoid bh leak
+                                        */
+                                       if (!buffer_jbd(bh))
+                                               wait_on_buffer(bh);
+                                       put_bh(bh);
+                                       bhs[i] = NULL;
+                               } else if (bh && buffer_uptodate(bh)) {
+                                       clear_buffer_uptodate(bh);
+                               }
                                continue;
                        }
                        /* We know this can't have changed as we hold the
                                 * uptodate. */
                                status = -EIO;
                                clear_buffer_needs_validate(bh);
-                               put_bh(bh);
-                               bhs[i] = NULL;
-                               continue;
+                               goto read_failure;
                        }
 
                        if (buffer_needs_validate(bh)) {
                                BUG_ON(buffer_jbd(bh));
                                clear_buffer_needs_validate(bh);
                                status = validate(sb, bh);
-                               if (status) {
-                                       put_bh(bh);
-                                       bhs[i] = NULL;
-                                       continue;
-                               }
+                               if (status)
+                                       goto read_failure;
                        }
                }