/*
  * Flush iclog to disk if this is the last reference to the given iclog and the
- * it is in the WANT_SYNC state.
+ * it is in the WANT_SYNC state.  If the caller passes in a non-zero
+ * @old_tail_lsn and the current log tail does not match, there may be metadata
+ * on disk that must be persisted before this iclog is written.  To satisfy that
+ * requirement, set the XLOG_ICL_NEED_FLUSH flag as a condition for writing this
+ * iclog with the new log tail value.
  */
 int
 xlog_state_release_iclog(
        struct xlog             *log,
-       struct xlog_in_core     *iclog)
+       struct xlog_in_core     *iclog,
+       xfs_lsn_t               old_tail_lsn)
 {
        xfs_lsn_t               tail_lsn;
        lockdep_assert_held(&log->l_icloglock);
        if (iclog->ic_state == XLOG_STATE_IOERROR)
                return -EIO;
 
+       /*
+        * Grabbing the current log tail needs to be atomic w.r.t. the writing
+        * of the tail LSN into the iclog so we guarantee that the log tail does
+        * not move between deciding if a cache flush is required and writing
+        * the LSN into the iclog below.
+        */
+       if (old_tail_lsn || iclog->ic_state == XLOG_STATE_WANT_SYNC) {
+               tail_lsn = xlog_assign_tail_lsn(log->l_mp);
+
+               if (old_tail_lsn && tail_lsn != old_tail_lsn)
+                       iclog->ic_flags |= XLOG_ICL_NEED_FLUSH;
+       }
+
        if (!atomic_dec_and_test(&iclog->ic_refcnt))
                return 0;
 
                return 0;
        }
 
-       /* update tail before writing to iclog */
-       tail_lsn = xlog_assign_tail_lsn(log->l_mp);
        iclog->ic_state = XLOG_STATE_SYNCING;
        iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
        xlog_verify_tail_lsn(log, iclog, tail_lsn);
         * iclog containing the unmount record is written.
         */
        iclog->ic_flags |= (XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA);
-       error = xlog_state_release_iclog(log, iclog);
+       error = xlog_state_release_iclog(log, iclog, 0);
        xlog_wait_on_iclog(iclog);
 
        if (tic) {
        return 0;
 
 release_iclog:
-       error = xlog_state_release_iclog(log, iclog);
+       error = xlog_state_release_iclog(log, iclog, 0);
        spin_unlock(&log->l_icloglock);
        return error;
 }
                ASSERT(optype & XLOG_COMMIT_TRANS);
                *commit_iclog = iclog;
        } else {
-               error = xlog_state_release_iclog(log, iclog);
+               error = xlog_state_release_iclog(log, iclog, 0);
        }
        spin_unlock(&log->l_icloglock);
 
                 * reference to the iclog.
                 */
                if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1))
-                       error = xlog_state_release_iclog(log, iclog);
+                       error = xlog_state_release_iclog(log, iclog, 0);
                spin_unlock(&log->l_icloglock);
                if (error)
                        return error;
                        atomic_inc(&iclog->ic_refcnt);
                        lsn = be64_to_cpu(iclog->ic_header.h_lsn);
                        xlog_state_switch_iclogs(log, iclog, 0);
-                       if (xlog_state_release_iclog(log, iclog))
+                       if (xlog_state_release_iclog(log, iclog, 0))
                                goto out_error;
 
                        if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn)
                }
                atomic_inc(&iclog->ic_refcnt);
                xlog_state_switch_iclogs(log, iclog, 0);
-               if (xlog_state_release_iclog(log, iclog))
+               if (xlog_state_release_iclog(log, iclog, 0))
                        goto out_error;
                if (log_flushed)
                        *log_flushed = 1;
 
        struct xfs_trans_header thdr;
        struct xfs_log_iovec    lhdr;
        struct xfs_log_vec      lvhdr = { NULL };
+       xfs_lsn_t               preflush_tail_lsn;
        xfs_lsn_t               commit_lsn;
-       xfs_lsn_t               push_seq;
+       xfs_csn_t               push_seq;
        struct bio              bio;
        DECLARE_COMPLETION_ONSTACK(bdev_flush);
 
         * because we hold the flush lock exclusively. Hence we can now issue
         * a cache flush to ensure all the completed metadata in the journal we
         * are about to overwrite is on stable storage.
+        *
+        * Because we are issuing this cache flush before we've written the
+        * tail lsn to the iclog, we can have metadata IO completions move the
+        * tail forwards between the completion of this flush and the iclog
+        * being written. In this case, we need to re-issue the cache flush
+        * before the iclog write. To detect whether the log tail moves, sample
+        * the tail LSN *before* we issue the flush.
         */
+       preflush_tail_lsn = atomic64_read(&log->l_tail_lsn);
        xfs_flush_bdev_async(&bio, log->l_mp->m_ddev_targp->bt_bdev,
                                &bdev_flush);
 
         * storage.
         */
        commit_iclog->ic_flags |= XLOG_ICL_NEED_FUA;
-       xlog_state_release_iclog(log, commit_iclog);
+       xlog_state_release_iclog(log, commit_iclog, preflush_tail_lsn);
        spin_unlock(&log->l_icloglock);
        return;