return 0;
 }
 
+static int
+gen7_render_ring_cs_stall_wa(struct intel_ring_buffer *ring)
+{
+       int ret;
+
+       ret = intel_ring_begin(ring, 4);
+       if (ret)
+               return ret;
+
+       intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(4));
+       intel_ring_emit(ring, PIPE_CONTROL_CS_STALL |
+                             PIPE_CONTROL_STALL_AT_SCOREBOARD);
+       intel_ring_emit(ring, 0);
+       intel_ring_emit(ring, 0);
+       intel_ring_advance(ring);
+
+       return 0;
+}
+
 static int
 gen7_render_ring_flush(struct intel_ring_buffer *ring,
                       u32 invalidate_domains, u32 flush_domains)
        u32 scratch_addr = pc->gtt_offset + 128;
        int ret;
 
+       /*
+        * Ensure that any following seqno writes only happen when the render
+        * cache is indeed flushed.
+        *
+        * Workaround: 4th PIPE_CONTROL command (except the ones with only
+        * read-cache invalidate bits set) must have the CS_STALL bit set. We
+        * don't try to be clever and just set it unconditionally.
+        */
+       flags |= PIPE_CONTROL_CS_STALL;
+
        /* Just flush everything.  Experiments have shown that reducing the
         * number of bits based on the write domains has little performance
         * impact.
        if (flush_domains) {
                flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
                flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
-               /*
-                * Ensure that any following seqno writes only happen
-                * when the render cache is indeed flushed.
-                */
-               flags |= PIPE_CONTROL_CS_STALL;
        }
        if (invalidate_domains) {
                flags |= PIPE_CONTROL_TLB_INVALIDATE;
                 * TLB invalidate requires a post-sync write.
                 */
                flags |= PIPE_CONTROL_QW_WRITE;
+
+               /* Workaround: we must issue a pipe_control with CS-stall bit
+                * set before a pipe_control command that has the state cache
+                * invalidate bit set. */
+               gen7_render_ring_cs_stall_wa(ring);
        }
 
        ret = intel_ring_begin(ring, 4);