#include <trace/events/jbd2.h>
 
 /*
- * Default IO end handler for temporary BJ_IO buffer_heads.
+ * IO end handler for temporary buffer_heads handling writes to the journal.
  */
 static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
 {
+       struct buffer_head *orig_bh = bh->b_private;
+
        BUFFER_TRACE(bh, "");
        if (uptodate)
                set_buffer_uptodate(bh);
        else
                clear_buffer_uptodate(bh);
+       if (orig_bh) {
+               clear_bit_unlock(BH_Shadow, &orig_bh->b_state);
+               smp_mb__after_clear_bit();
+               wake_up_bit(&orig_bh->b_state, BH_Shadow);
+       }
        unlock_buffer(bh);
 }
 
                bh = jh2bh(jh);
                clear_buffer_jwrite(bh);
                J_ASSERT_BH(bh, buffer_jbddirty(bh));
+               J_ASSERT_BH(bh, !buffer_shadow(bh));
 
                /* The metadata is now released for reuse, but we need
                    to remember it against this transaction so that when
                    required. */
                JBUFFER_TRACE(jh, "file as BJ_Forget");
                jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
-               /*
-                * Wake up any transactions which were waiting for this IO to
-                * complete. The barrier must be here so that changes by
-                * jbd2_journal_file_buffer() take effect before wake_up_bit()
-                * does the waitqueue check.
-                */
-               smp_mb();
-               wake_up_bit(&bh->b_state, BH_Unshadow);
                JBUFFER_TRACE(jh, "brelse shadowed buffer");
                __brelse(bh);
        }
 
        new_bh->b_size = bh_in->b_size;
        new_bh->b_bdev = journal->j_dev;
        new_bh->b_blocknr = blocknr;
+       new_bh->b_private = bh_in;
        set_buffer_mapped(new_bh);
        set_buffer_dirty(new_bh);
 
        spin_lock(&journal->j_list_lock);
        __jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow);
        spin_unlock(&journal->j_list_lock);
+       set_buffer_shadow(bh_in);
        jbd_unlock_bh_state(bh_in);
 
        return do_escape | (done_copy_out << 1);
 
               bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
 }
 
+static int sleep_on_shadow_bh(void *word)
+{
+       io_schedule();
+       return 0;
+}
+
 /*
  * If the buffer is already part of the current transaction, then there
  * is nothing we need to do.  If it is already part of a prior
                 * journaled.  If the primary copy is already going to
                 * disk then we cannot do copy-out here. */
 
-               if (jh->b_jlist == BJ_Shadow) {
-                       DEFINE_WAIT_BIT(wait, &bh->b_state, BH_Unshadow);
-                       wait_queue_head_t *wqh;
-
-                       wqh = bit_waitqueue(&bh->b_state, BH_Unshadow);
-
+               if (buffer_shadow(bh)) {
                        JBUFFER_TRACE(jh, "on shadow: sleep");
                        jbd_unlock_bh_state(bh);
-                       /* commit wakes up all shadow buffers after IO */
-                       for ( ; ; ) {
-                               prepare_to_wait(wqh, &wait.wait,
-                                               TASK_UNINTERRUPTIBLE);
-                               if (jh->b_jlist != BJ_Shadow)
-                                       break;
-                               schedule();
-                       }
-                       finish_wait(wqh, &wait.wait);
+                       wait_on_bit(&bh->b_state, BH_Shadow,
+                                   sleep_on_shadow_bh, TASK_UNINTERRUPTIBLE);
                        goto repeat;
                }
 
-               /* Only do the copy if the currently-owning transaction
-                * still needs it.  If it is on the Forget list, the
-                * committing transaction is past that stage.  The
-                * buffer had better remain locked during the kmalloc,
-                * but that should be true --- we hold the journal lock
-                * still and the buffer is already on the BUF_JOURNAL
-                * list so won't be flushed.
+               /*
+                * Only do the copy if the currently-owning transaction still
+                * needs it. If buffer isn't on BJ_Metadata list, the
+                * committing transaction is past that stage (here we use the
+                * fact that BH_Shadow is set under bh_state lock together with
+                * refiling to BJ_Shadow list and at this point we know the
+                * buffer doesn't have BH_Shadow set).
                 *
                 * Subtle point, though: if this is a get_undo_access,
                 * then we will be relying on the frozen_data to contain
                 * the new value of the committed_data record after the
                 * transaction, so we HAVE to force the frozen_data copy
-                * in that case. */
-
-               if (jh->b_jlist != BJ_Forget || force_copy) {
+                * in that case.
+                */
+               if (jh->b_jlist == BJ_Metadata || force_copy) {
                        JBUFFER_TRACE(jh, "generate frozen data");
                        if (!frozen_buffer) {
                                JBUFFER_TRACE(jh, "allocate memory for buffer");
 
 
 #include <linux/fs.h>
 #include <linux/sched.h>
+
+enum jbd_state_bits {
+       BH_JBD                  /* Has an attached ext3 journal_head */
+         = BH_PrivateStart,
+       BH_JWrite,              /* Being written to log (@@@ DEBUGGING) */
+       BH_Freed,               /* Has been freed (truncated) */
+       BH_Revoked,             /* Has been revoked from the log */
+       BH_RevokeValid,         /* Revoked flag is valid */
+       BH_JBDDirty,            /* Is dirty but journaled */
+       BH_State,               /* Pins most journal_head state */
+       BH_JournalHead,         /* Pins bh->b_private and jh->b_bh */
+       BH_Unshadow,            /* Dummy bit, for BJ_Shadow wakeup filtering */
+       BH_JBDPrivateStart,     /* First bit available for private use by FS */
+};
+
+BUFFER_FNS(JBD, jbd)
+BUFFER_FNS(JWrite, jwrite)
+BUFFER_FNS(JBDDirty, jbddirty)
+TAS_BUFFER_FNS(JBDDirty, jbddirty)
+BUFFER_FNS(Revoked, revoked)
+TAS_BUFFER_FNS(Revoked, revoked)
+BUFFER_FNS(RevokeValid, revokevalid)
+TAS_BUFFER_FNS(RevokeValid, revokevalid)
+BUFFER_FNS(Freed, freed)
+
 #include <linux/jbd_common.h>
 
 #define J_ASSERT(assert)       BUG_ON(!(assert))
 
 
 #include <linux/fs.h>
 #include <linux/sched.h>
+
+enum jbd_state_bits {
+       BH_JBD                  /* Has an attached ext3 journal_head */
+         = BH_PrivateStart,
+       BH_JWrite,              /* Being written to log (@@@ DEBUGGING) */
+       BH_Freed,               /* Has been freed (truncated) */
+       BH_Revoked,             /* Has been revoked from the log */
+       BH_RevokeValid,         /* Revoked flag is valid */
+       BH_JBDDirty,            /* Is dirty but journaled */
+       BH_State,               /* Pins most journal_head state */
+       BH_JournalHead,         /* Pins bh->b_private and jh->b_bh */
+       BH_Shadow,              /* IO on shadow buffer is running */
+       BH_Verified,            /* Metadata block has been verified ok */
+       BH_JBDPrivateStart,     /* First bit available for private use by FS */
+};
+
+BUFFER_FNS(JBD, jbd)
+BUFFER_FNS(JWrite, jwrite)
+BUFFER_FNS(JBDDirty, jbddirty)
+TAS_BUFFER_FNS(JBDDirty, jbddirty)
+BUFFER_FNS(Revoked, revoked)
+TAS_BUFFER_FNS(Revoked, revoked)
+BUFFER_FNS(RevokeValid, revokevalid)
+TAS_BUFFER_FNS(RevokeValid, revokevalid)
+BUFFER_FNS(Freed, freed)
+BUFFER_FNS(Shadow, shadow)
+BUFFER_FNS(Verified, verified)
+
 #include <linux/jbd_common.h>
 
 #define J_ASSERT(assert)       BUG_ON(!(assert))
 
 #ifndef _LINUX_JBD_STATE_H
 #define _LINUX_JBD_STATE_H
 
-enum jbd_state_bits {
-       BH_JBD                  /* Has an attached ext3 journal_head */
-         = BH_PrivateStart,
-       BH_JWrite,              /* Being written to log (@@@ DEBUGGING) */
-       BH_Freed,               /* Has been freed (truncated) */
-       BH_Revoked,             /* Has been revoked from the log */
-       BH_RevokeValid,         /* Revoked flag is valid */
-       BH_JBDDirty,            /* Is dirty but journaled */
-       BH_State,               /* Pins most journal_head state */
-       BH_JournalHead,         /* Pins bh->b_private and jh->b_bh */
-       BH_Unshadow,            /* Dummy bit, for BJ_Shadow wakeup filtering */
-       BH_Verified,            /* Metadata block has been verified ok */
-       BH_JBDPrivateStart,     /* First bit available for private use by FS */
-};
-
-BUFFER_FNS(JBD, jbd)
-BUFFER_FNS(JWrite, jwrite)
-BUFFER_FNS(JBDDirty, jbddirty)
-TAS_BUFFER_FNS(JBDDirty, jbddirty)
-BUFFER_FNS(Revoked, revoked)
-TAS_BUFFER_FNS(Revoked, revoked)
-BUFFER_FNS(RevokeValid, revokevalid)
-TAS_BUFFER_FNS(RevokeValid, revokevalid)
-BUFFER_FNS(Freed, freed)
-BUFFER_FNS(Verified, verified)
-
 static inline struct buffer_head *jh2bh(struct journal_head *jh)
 {
        return jh->b_bh;