Another month, another story in the cache coherency saga. This time, we
come to the realisation that i915_gem_object_is_coherent() has been
reporting whether we can read from the target without requiring a cache
invalidate; but we were also using it to test whether we could write
into the object without requiring a cache flush. So split the tracking
into two: one flag to decide whether an invalidate is needed before
reads, and one to decide whether a flush is needed after writes.
See commit e27ab73d17ef ("drm/i915: Mark CPU cache as dirty on every
transition for CPU writes") for the previous entry in this saga.
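
Roughly, the split boils down to the following sketch (the flag names
are those introduced below; needs_invalidate/needs_flush are
illustrative locals, not code from the patch):

	/* Do we need a cache invalidate before the CPU reads back? */
	needs_invalidate =
		!(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ);

	/* Do we need a cache flush after the CPU has written? */
	needs_flush =
		!(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE);
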
v2: Be verbose
v3: Remove unused function (i915_gem_object_is_coherent)
v4: Fix inverted coherency check prior to execbuf (from v2)
v5: Add comment for nasty code where we are optimising on gcc's behalf.
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=101109
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=101555
Testcase: igt/kms_mmap_write_crc
Testcase: igt/kms_pwrite_crc
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Cc: Dongwon Kim <dongwon.kim@intel.com>
Cc: Matt Roper <matthew.d.roper@intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Tested-by: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Acked-by: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20170811111116.10373-1-chris@chris-wilson.co.uk
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
          i915_gem_gtt.o \
          i915_gem_internal.o \
          i915_gem.o \
+         i915_gem_object.o \
          i915_gem_render_state.o \
          i915_gem_request.o \
          i915_gem_shrinker.o \
 
                     unsigned long addr, unsigned long pfn, unsigned long size,
                     struct io_mapping *iomap);
 
-static inline bool i915_gem_object_is_coherent(struct drm_i915_gem_object *obj)
-{
-       return (obj->cache_level != I915_CACHE_NONE ||
-               HAS_LLC(to_i915(obj->base.dev)));
-}
-
 #endif
 
        if (obj->cache_dirty)
                return false;
 
-       if (!obj->cache_coherent)
+       if (!(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE))
                return true;
 
        return obj->pin_display;
 
        if (needs_clflush &&
            (obj->base.read_domains & I915_GEM_DOMAIN_CPU) == 0 &&
-           !obj->cache_coherent)
+           !(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ))
                drm_clflush_sg(pages);
 
        __start_cpu_write(obj);
        if (ret)
                return ret;
 
-       if (obj->cache_coherent || !static_cpu_has(X86_FEATURE_CLFLUSH)) {
+       if (obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ ||
+           !static_cpu_has(X86_FEATURE_CLFLUSH)) {
                ret = i915_gem_object_set_to_cpu_domain(obj, false);
                if (ret)
                        goto err_unpin;
        if (ret)
                return ret;
 
-       if (obj->cache_coherent || !static_cpu_has(X86_FEATURE_CLFLUSH)) {
+       if (obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE ||
+           !static_cpu_has(X86_FEATURE_CLFLUSH)) {
                ret = i915_gem_object_set_to_cpu_domain(obj, true);
                if (ret)
                        goto err_unpin;
 
        list_for_each_entry(vma, &obj->vma_list, obj_link)
                vma->node.color = cache_level;
-       obj->cache_level = cache_level;
-       obj->cache_coherent = i915_gem_object_is_coherent(obj);
+       i915_gem_object_set_cache_coherency(obj, cache_level);
        obj->cache_dirty = true; /* Always invalidate stale cachelines */
 
        return 0;
 {
        struct drm_i915_gem_object *obj;
        struct address_space *mapping;
+       unsigned int cache_level;
        gfp_t mask;
        int ret;
 
        obj->base.write_domain = I915_GEM_DOMAIN_CPU;
        obj->base.read_domains = I915_GEM_DOMAIN_CPU;
 
-       if (HAS_LLC(dev_priv)) {
+       if (HAS_LLC(dev_priv))
                /* On some devices, we can have the GPU use the LLC (the CPU
                 * cache) for about a 10% performance improvement
                 * compared to uncached.  Graphics requests other than
                 * However, we maintain the display planes as UC, and so
                 * need to rebind when first used as such.
                 */
-               obj->cache_level = I915_CACHE_LLC;
-       } else
-               obj->cache_level = I915_CACHE_NONE;
+               cache_level = I915_CACHE_LLC;
+       else
+               cache_level = I915_CACHE_NONE;
 
-       obj->cache_coherent = i915_gem_object_is_coherent(obj);
-       obj->cache_dirty = !obj->cache_coherent;
+       i915_gem_object_set_cache_coherency(obj, cache_level);
 
        trace_i915_gem_object_create(obj);
 
 
         * snooping behaviour occurs naturally as the result of our domain
         * tracking.
         */
-       if (!(flags & I915_CLFLUSH_FORCE) && obj->cache_coherent)
+       if (!(flags & I915_CLFLUSH_FORCE) &&
+           obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ)
                return false;
 
        trace_i915_gem_object_clflush(obj);
 
                        eb->request->capture_list = capture;
                }
 
-               if (unlikely(obj->cache_dirty && !obj->cache_coherent)) {
+               /*
+                * If the GPU is not _reading_ through the CPU cache, we need
+                * to make sure that any writes (both previous GPU writes from
+                * before a change in snooping levels and normal CPU writes)
+                * caught in that cache are flushed to main memory.
+                *
+                * We want to say
+                *   obj->cache_dirty &&
+                *   !(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ)
+                * but gcc's optimiser doesn't handle that as well and emits
+                * two jumps instead of one. Maybe one day...
+                */
+               if (unlikely(obj->cache_dirty & ~obj->cache_coherent)) {
                        if (i915_gem_clflush_object(obj, 0))
                                entry->flags &= ~EXEC_OBJECT_ASYNC;
                }
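
Aside: the one-jump trick above works because cache_dirty is a single
bit and I915_BO_CACHE_COHERENT_FOR_READ is BIT(0); a standalone
userspace sketch (not part of the patch) checking the equivalence
exhaustively over both fields:

	#include <assert.h>

	#define BIT(n) (1u << (n))
	#define I915_BO_CACHE_COHERENT_FOR_READ  BIT(0)
	#define I915_BO_CACHE_COHERENT_FOR_WRITE BIT(1)

	int main(void)
	{
		unsigned int dirty, coherent;

		/* cache_dirty is 1 bit wide, cache_coherent 2 bits wide */
		for (dirty = 0; dirty <= 1; dirty++) {
			for (coherent = 0; coherent <= 3; coherent++) {
				int wanted = dirty &&
					!(coherent & I915_BO_CACHE_COHERENT_FOR_READ);
				int emitted = dirty & ~coherent;

				/* same truth value, one branch fewer */
				assert(!wanted == !emitted);
			}
		}
		return 0;
	}
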
 
                                phys_addr_t size)
 {
        struct drm_i915_gem_object *obj;
+       unsigned int cache_level;
 
        GEM_BUG_ON(!size);
        GEM_BUG_ON(!IS_ALIGNED(size, PAGE_SIZE));
 
        obj->base.read_domains = I915_GEM_DOMAIN_CPU;
        obj->base.write_domain = I915_GEM_DOMAIN_CPU;
-       obj->cache_level = HAS_LLC(i915) ? I915_CACHE_LLC : I915_CACHE_NONE;
-       obj->cache_coherent = i915_gem_object_is_coherent(obj);
-       obj->cache_dirty = !obj->cache_coherent;
+
+       cache_level = HAS_LLC(i915) ? I915_CACHE_LLC : I915_CACHE_NONE;
+       i915_gem_object_set_cache_coherency(obj, cache_level);
 
        return obj;
 }
 
--- /dev/null
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ */
+
+#include "i915_drv.h"
+#include "i915_gem_object.h"
+
+/**
+ * i915_gem_object_set_cache_coherency - Mark up the object's coherency levels
+ * for a given cache_level
+ * @obj: #drm_i915_gem_object
+ * @cache_level: cache level
+ */
+void i915_gem_object_set_cache_coherency(struct drm_i915_gem_object *obj,
+                                        unsigned int cache_level)
+{
+       obj->cache_level = cache_level;
+
+       if (cache_level != I915_CACHE_NONE)
+               obj->cache_coherent = (I915_BO_CACHE_COHERENT_FOR_READ |
+                                      I915_BO_CACHE_COHERENT_FOR_WRITE);
+       else if (HAS_LLC(to_i915(obj->base.dev)))
+               obj->cache_coherent = I915_BO_CACHE_COHERENT_FOR_READ;
+       else
+               obj->cache_coherent = 0;
+
+       obj->cache_dirty =
+               !(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE);
+}
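
For reference, the states the new helper produces (read off the code
above):

	cache_level                     FOR_READ  FOR_WRITE  cache_dirty
	snooped (!= I915_CACHE_NONE)       yes       yes         no
	I915_CACHE_NONE on LLC systems     yes       no          yes
	I915_CACHE_NONE without LLC        no        no          yes
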
 
 
 #include <drm/i915_drm.h>
 
+#include "i915_gem_request.h"
 #include "i915_selftest.h"
 
+struct drm_i915_gem_object;
+
 struct drm_i915_gem_object_ops {
        unsigned int flags;
 #define I915_GEM_OBJECT_HAS_STRUCT_PAGE BIT(0)
         */
        unsigned long gt_ro:1;
        unsigned int cache_level:3;
+       unsigned int cache_coherent:2;
+#define I915_BO_CACHE_COHERENT_FOR_READ BIT(0)
+#define I915_BO_CACHE_COHERENT_FOR_WRITE BIT(1)
        unsigned int cache_dirty:1;
-       unsigned int cache_coherent:1;
 
        atomic_t frontbuffer_bits;
        unsigned int frontbuffer_ggtt_origin; /* write once */
        return engine;
 }
 
+void i915_gem_object_set_cache_coherency(struct drm_i915_gem_object *obj,
+                                        unsigned int cache_level);
 void i915_gem_object_flush_if_display(struct drm_i915_gem_object *obj);
 
 #endif
 
                               struct drm_mm_node *stolen)
 {
        struct drm_i915_gem_object *obj;
+       unsigned int cache_level;
 
        obj = i915_gem_object_alloc(dev_priv);
        if (obj == NULL)
 
        obj->stolen = stolen;
        obj->base.read_domains = I915_GEM_DOMAIN_CPU | I915_GEM_DOMAIN_GTT;
-       obj->cache_level = HAS_LLC(dev_priv) ? I915_CACHE_LLC : I915_CACHE_NONE;
-       obj->cache_coherent = true; /* assumptions! more like cache_oblivious */
+       cache_level = HAS_LLC(dev_priv) ? I915_CACHE_LLC : I915_CACHE_NONE;
+       i915_gem_object_set_cache_coherency(obj, cache_level);
 
        if (i915_gem_object_pin_pages(obj))
                goto cleanup;
 
        i915_gem_object_init(obj, &i915_gem_userptr_ops);
        obj->base.read_domains = I915_GEM_DOMAIN_CPU;
        obj->base.write_domain = I915_GEM_DOMAIN_CPU;
-       obj->cache_level = I915_CACHE_LLC;
-       obj->cache_coherent = i915_gem_object_is_coherent(obj);
-       obj->cache_dirty = !obj->cache_coherent;
+       i915_gem_object_set_cache_coherency(obj, I915_CACHE_LLC);
 
        obj->userptr.ptr = args->user_ptr;
        obj->userptr.read_only = !!(args->flags & I915_USERPTR_READ_ONLY);
 
                dma_addr_t dma_size)
 {
        struct drm_i915_gem_object *obj;
+       unsigned int cache_level;
 
        GEM_BUG_ON(!phys_size || phys_size > dma_size);
        GEM_BUG_ON(!IS_ALIGNED(phys_size, PAGE_SIZE));
 
        obj->base.read_domains = I915_GEM_DOMAIN_CPU;
        obj->base.write_domain = I915_GEM_DOMAIN_CPU;
-       obj->cache_level = HAS_LLC(i915) ? I915_CACHE_LLC : I915_CACHE_NONE;
-       obj->cache_coherent = i915_gem_object_is_coherent(obj);
-       obj->cache_dirty = !obj->cache_coherent;
+       cache_level = HAS_LLC(i915) ? I915_CACHE_LLC : I915_CACHE_NONE;
+       i915_gem_object_set_cache_coherency(obj, cache_level);
        obj->scratch = phys_size;
 
        return obj;