static bool cpu_write_needs_clflush(struct drm_i915_gem_object *obj)
 {
-       if (obj->base.write_domain == I915_GEM_DOMAIN_CPU)
+       if (obj->cache_dirty)
                return false;
 
        if (!i915_gem_object_is_coherent(obj))
        return st;
 }
 
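+/*
+ * Place the object in the CPU read/write domain and, if CPU writes will
+ * need a clflush before the GPU can see them, note that by marking the
+ * cache as dirty.
+ */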
+static void __start_cpu_write(struct drm_i915_gem_object *obj)
+{
+       obj->base.read_domains = I915_GEM_DOMAIN_CPU;
+       obj->base.write_domain = I915_GEM_DOMAIN_CPU;
+       if (cpu_write_needs_clflush(obj))
+               obj->cache_dirty = true;
+}
+
 static void
 __i915_gem_object_release_shmem(struct drm_i915_gem_object *obj,
                                struct sg_table *pages,
            !i915_gem_object_is_coherent(obj))
                drm_clflush_sg(pages);
 
-       obj->base.read_domains = I915_GEM_DOMAIN_CPU;
-       obj->base.write_domain = I915_GEM_DOMAIN_CPU;
+       __start_cpu_write(obj);
 }
 
 static void
                               args->size, &args->handle);
 }
 
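+/*
+ * GPU writes to a cacheable (neither uncached nor write-through) object
+ * may land in the CPU cache and so need a clflush before they are
+ * visible to non-coherent readers such as the display.
+ */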
+static bool gpu_write_needs_clflush(struct drm_i915_gem_object *obj)
+{
+       return !(obj->cache_level == I915_CACHE_NONE ||
+                obj->cache_level == I915_CACHE_WT);
+}
+
 /**
  * Creates a new mm object and returns a handle to it.
  * @dev: drm device pointer
        case I915_GEM_DOMAIN_CPU:
                i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
                break;
+
+       case I915_GEM_DOMAIN_RENDER:
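+               /* GPU writes may be sitting in the CPU cache (LLC/snoop) */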
+               if (gpu_write_needs_clflush(obj))
+                       obj->cache_dirty = true;
+               break;
        }
 
        obj->base.write_domain = 0;
         * optimizes for the case when the gpu will dirty the data
         * anyway again before the next pread happens.
         */
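+       /*
+        * If the cache is already tracked as dirty, it holds data at least
+        * as new as memory, so there is nothing stale to invalidate.
+        */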
-       if (!(obj->base.read_domains & I915_GEM_DOMAIN_CPU))
+       if (!obj->cache_dirty &&
+           !(obj->base.read_domains & I915_GEM_DOMAIN_CPU))
                *needs_clflush = CLFLUSH_BEFORE;
 
 out:
         * This optimizes for the case when the gpu will use the data
         * right away and we therefore have to clflush anyway.
         */
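+       /*
+        * If the cache is already tracked as dirty, a clflush is already
+        * owed before the GPU can use this object, and that flush will
+        * also cover the writes made here.
+        */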
-       if (obj->base.write_domain != I915_GEM_DOMAIN_CPU)
+       if (!obj->cache_dirty) {
                *needs_clflush |= CLFLUSH_AFTER;
 
-       /* Same trick applies to invalidate partially written cachelines read
-        * before writing.
-        */
-       if (!(obj->base.read_domains & I915_GEM_DOMAIN_CPU))
-               *needs_clflush |= CLFLUSH_BEFORE;
+               /*
+                * Same trick applies to invalidate partially written
+                * cachelines read before writing.
+                */
+               if (!(obj->base.read_domains & I915_GEM_DOMAIN_CPU))
+                       *needs_clflush |= CLFLUSH_BEFORE;
+       }
 
 out:
        intel_fb_obj_invalidate(obj, ORIGIN_CPU);
 
 static void __i915_gem_object_flush_for_display(struct drm_i915_gem_object *obj)
 {
-       if (obj->base.write_domain != I915_GEM_DOMAIN_CPU && !obj->cache_dirty)
-               return;
-
-       i915_gem_clflush_object(obj, I915_CLFLUSH_FORCE);
+       /*
+        * We manually flush the CPU domain so that we can override and
+        * force the flush for the display, and perform it asynchronously.
+        */
+       flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
+       if (obj->cache_dirty)
+               i915_gem_clflush_object(obj, I915_CLFLUSH_FORCE);
        obj->base.write_domain = 0;
 }
 
                }
        }
 
-       if (obj->base.write_domain == I915_GEM_DOMAIN_CPU &&
-           i915_gem_object_is_coherent(obj))
-               obj->cache_dirty = true;
-
        list_for_each_entry(vma, &obj->vma_list, obj_link)
                vma->node.color = cache_level;
        obj->cache_level = cache_level;
+       obj->cache_dirty = true; /* Always invalidate stale cachelines */
 
        return 0;
 }
        if (ret)
                return ret;
 
-       if (obj->base.write_domain == I915_GEM_DOMAIN_CPU)
-               return 0;
-
        flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
 
        /* Flush the CPU cache if it's still invalid. */
        /* It should now be out of any other write domains, and we can update
         * the domain values for our changes.
         */
-       GEM_BUG_ON((obj->base.write_domain & ~I915_GEM_DOMAIN_CPU) != 0);
+       GEM_BUG_ON(obj->base.write_domain & ~I915_GEM_DOMAIN_CPU);
 
        /* If we're writing through the CPU, then the GPU read domains will
         * need to be invalidated at next use.
         */
-       if (write) {
-               obj->base.read_domains = I915_GEM_DOMAIN_CPU;
-               obj->base.write_domain = I915_GEM_DOMAIN_CPU;
-       }
+       if (write)
+               __start_cpu_write(obj);
 
        return 0;
 }
        } else
                obj->cache_level = I915_CACHE_NONE;
 
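+       /*
+        * Assume the CPU cache needs flushing before first use unless the
+        * object is coherent with the CPU.
+        */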
+       obj->cache_dirty = !i915_gem_object_is_coherent(obj);
+
        trace_i915_gem_object_create(obj);
 
        return obj;
 
        mutex_lock(&dev_priv->drm.struct_mutex);
        for (p = phases; *p; p++) {
-               list_for_each_entry(obj, *p, global_link) {
-                       obj->base.read_domains = I915_GEM_DOMAIN_CPU;
-                       obj->base.write_domain = I915_GEM_DOMAIN_CPU;
-               }
+               list_for_each_entry(obj, *p, global_link)
+                       __start_cpu_write(obj);
        }
        mutex_unlock(&dev_priv->drm.struct_mutex);
 
 
 static void __i915_do_clflush(struct drm_i915_gem_object *obj)
 {
        drm_clflush_sg(obj->mm.pages);
-       obj->cache_dirty = false;
-
        intel_fb_obj_flush(obj, ORIGIN_CPU);
 }
 
        struct clflush *clflush = container_of(work, typeof(*clflush), work);
        struct drm_i915_gem_object *obj = clflush->obj;
 
-       if (!obj->cache_dirty)
-               goto out;
-
        if (i915_gem_object_pin_pages(obj)) {
                DRM_ERROR("Failed to acquire obj->pages for clflushing\n");
                goto out;
         * anything not backed by physical memory we consider to be always
         * coherent and not need clflushing.
         */
-       if (!i915_gem_object_has_struct_page(obj))
+       if (!i915_gem_object_has_struct_page(obj)) {
+               obj->cache_dirty = false;
                return;
-
-       obj->cache_dirty = true;
+       }
 
        /* If the GPU is snooping the contents of the CPU cache,
         * we do not need to manually clear the CPU cache lines.  However,
        if (!(flags & I915_CLFLUSH_SYNC))
                clflush = kmalloc(sizeof(*clflush), GFP_KERNEL);
        if (clflush) {
+               GEM_BUG_ON(!obj->cache_dirty);
+
                dma_fence_init(&clflush->dma,
                               &i915_clflush_ops,
                               &clflush_lock,
        } else {
                GEM_BUG_ON(obj->base.write_domain != I915_GEM_DOMAIN_CPU);
        }
+
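+       /* Once the flush is issued or queued, the cache is no longer dirty. */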
+       obj->cache_dirty = false;
 }
 
                return DBG_USE_CPU_RELOC > 0;
 
        return (HAS_LLC(to_i915(obj->base.dev)) ||
-               obj->base.write_domain == I915_GEM_DOMAIN_CPU ||
+               obj->cache_dirty ||
                obj->cache_level != I915_CACHE_NONE);
 }
 
                if (vma->exec_entry->flags & EXEC_OBJECT_ASYNC)
                        continue;
 
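+               /* Flush dirty CPU cachelines before the GPU uses the object. */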
-               if (obj->base.write_domain & I915_GEM_DOMAIN_CPU) {
+               if (obj->cache_dirty)
                        i915_gem_clflush_object(obj, 0);
-                       obj->base.write_domain = 0;
-               }
 
                ret = i915_gem_request_await_object
                        (eb->request, obj, obj->base.pending_write_domain);
        return 0;
 }
 
-static bool gpu_write_needs_clflush(struct drm_i915_gem_object *obj)
-{
-       return !(obj->cache_level == I915_CACHE_NONE ||
-                obj->cache_level == I915_CACHE_WT);
-}
-
 void i915_vma_move_to_active(struct i915_vma *vma,
                             struct drm_i915_gem_request *req,
                             unsigned int flags)
        i915_gem_active_set(&vma->last_read[idx], req);
        list_move_tail(&vma->vm_link, &vma->vm->active_list);
 
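+       /*
+        * The request pulls the object into the GPU read domains; a write
+        * leaves it in the render write domain.
+        */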
+       obj->base.write_domain = 0;
        if (flags & EXEC_OBJECT_WRITE) {
+               obj->base.write_domain = I915_GEM_DOMAIN_RENDER;
+
                if (intel_fb_obj_invalidate(obj, ORIGIN_CS))
                        i915_gem_active_set(&obj->frontbuffer_write, req);
 
-               /* update for the implicit flush after a batch */
-               obj->base.write_domain &= ~I915_GEM_GPU_DOMAINS;
-               if (!obj->cache_dirty && gpu_write_needs_clflush(obj))
-                       obj->cache_dirty = true;
+               obj->base.read_domains = 0;
        }
+       obj->base.read_domains |= I915_GEM_GPU_DOMAINS;
 
        if (flags & EXEC_OBJECT_NEEDS_FENCE)
                i915_gem_active_set(&vma->last_fence, req);