FAULT_AND_CONTINUE /* Unsupported */
 };
 #define GEN8_CTX_ID_SHIFT 32
+#define CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT  0x17
 
 static int intel_lr_context_pin(struct intel_engine_cs *ring,
                struct intel_context *ctx);
        return 0;
 }
 
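+/*
+ * Emit one command DWORD into the WA batch page. Note that this macro
+ * references a local 'index' variable and returns from the *enclosing*
+ * function on overflow, so it must only be used inside the
+ * gen8_init_*ctx_bb() helpers below.
+ */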
+#define wa_ctx_emit(batch, cmd)                                                \
+       do {                                                            \
+               if (WARN_ON(index >= (PAGE_SIZE / sizeof(uint32_t)))) { \
+                       return -ENOSPC;                                 \
+               }                                                       \
+               batch[index++] = (cmd);                                 \
+       } while (0)
+
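+/* Align the start offset of a wa_ctx batch and record it in wa_ctx. */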
+static inline uint32_t wa_ctx_start(struct i915_wa_ctx_bb *wa_ctx,
+                                   uint32_t offset,
+                                   uint32_t start_alignment)
+{
+       return wa_ctx->offset = ALIGN(offset, start_alignment);
+}
+
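+/* Record the size of a finished batch and warn if it breaks the required alignment. */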
+static inline int wa_ctx_end(struct i915_wa_ctx_bb *wa_ctx,
+                            uint32_t offset,
+                            uint32_t size_alignment)
+{
+       wa_ctx->size = offset - wa_ctx->offset;
+
+       WARN(wa_ctx->size % size_alignment,
+            "wa_ctx_bb failed sanity checks: size %d is not aligned to %d\n",
+            wa_ctx->size, size_alignment);
+       return 0;
+}
+
+/**
+ * gen8_init_indirectctx_bb() - initialize indirect ctx batch with WA
+ *
+ * @ring: only applicable for RCS
+ * @wa_ctx: structure representing wa_ctx
+ *  offset: specifies start of the batch, should be cache-aligned. This is
+ *    updated with the offset value received as input.
+ *  size: size of the batch in DWORDS but HW expects in terms of cachelines
+ * @batch: page in which WA are loaded
+ * @offset: This field specifies the start of the batch; it should be
+ *  cache-aligned, otherwise it is adjusted accordingly.
+ *  Typically we only have one indirect_ctx and one per_ctx batch buffer,
+ *  initialized once and shared across all contexts, but this field lets us
+ *  keep multiple batches at different offsets and select among them based on
+ *  some criterion. At the moment this batch always starts at the beginning
+ *  of the page and we don't have multiple wa_ctx batch buffers.
+ *
+ *  The number of WA applied is not known in advance; we use this field to
+ *  return the number of DWORDS written.
+ *
+ *  Note that this batch does not contain MI_BATCH_BUFFER_END; instead it is
+ *  padded with MI_NOOPs up to the next cacheline boundary.
+ *  MI_BATCH_BUFFER_END is added to the per-ctx batch, and together the two
+ *  make a complete batch buffer.
+ *
+ * Return: non-zero if we exceed the PAGE_SIZE limit.
+ */
+
+static int gen8_init_indirectctx_bb(struct intel_engine_cs *ring,
+                                   struct i915_wa_ctx_bb *wa_ctx,
+                                   uint32_t *const batch,
+                                   uint32_t *offset)
+{
+       uint32_t index = wa_ctx_start(wa_ctx, *offset, CACHELINE_DWORDS);
+
+       /* FIXME: Replace me with WA */
+       wa_ctx_emit(batch, MI_NOOP);
+
+       /* Pad to end of cacheline */
+       while (index % CACHELINE_DWORDS)
+               wa_ctx_emit(batch, MI_NOOP);
+
+       /*
+        * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
+        * execution depends on the length specified in terms of cache lines
+        * in the register CTX_RCS_INDIRECT_CTX
+        */
+
+       return wa_ctx_end(wa_ctx, *offset = index, CACHELINE_DWORDS);
+}
+
+/**
+ * gen8_init_perctx_bb() - initialize per ctx batch with WA
+ *
+ * @ring: only applicable for RCS
+ * @wa_ctx: structure representing wa_ctx
+ *  offset: specifies start of the batch, should be cache-aligned.
+ *  size: size of the batch in DWORDS but HW expects in terms of cachelines
+ * @batch: page in which WA are loaded
+ * @offset: This field specifies the start of this batch.
+ *  This batch starts immediately after the indirect_ctx batch. Since we
+ *  ensure that indirect_ctx ends on a cacheline, this batch is aligned
+ *  automatically.
+ *
+ *  The number of DWORDS written is returned through this field.
+ *
+ *  This batch is terminated with MI_BATCH_BUFFER_END, so no cacheline
+ *  padding is needed; padding after MI_BATCH_BUFFER_END would be redundant.
+ *
+ * Return: non-zero if we exceed the PAGE_SIZE limit.
+ */
+static int gen8_init_perctx_bb(struct intel_engine_cs *ring,
+                              struct i915_wa_ctx_bb *wa_ctx,
+                              uint32_t *const batch,
+                              uint32_t *offset)
+{
+       uint32_t index = wa_ctx_start(wa_ctx, *offset, CACHELINE_DWORDS);
+
+       wa_ctx_emit(batch, MI_BATCH_BUFFER_END);
+
+       return wa_ctx_end(wa_ctx, *offset = index, 1);
+}
+
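+/* Allocate the page backing both WA batches and pin it into the global GTT. */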
+static int lrc_setup_wa_ctx_obj(struct intel_engine_cs *ring, u32 size)
+{
+       int ret;
+
+       ring->wa_ctx.obj = i915_gem_alloc_object(ring->dev, PAGE_ALIGN(size));
+       if (!ring->wa_ctx.obj) {
+               DRM_DEBUG_DRIVER("alloc LRC WA ctx backing obj failed.\n");
+               return -ENOMEM;
+       }
+
+       ret = i915_gem_obj_ggtt_pin(ring->wa_ctx.obj, PAGE_SIZE, 0);
+       if (ret) {
+               DRM_DEBUG_DRIVER("pin LRC WA ctx backing obj failed: %d\n",
+                                ret);
+               drm_gem_object_unreference(&ring->wa_ctx.obj->base);
+               return ret;
+       }
+
+       return 0;
+}
+
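+/* Unpin and release the WA batch page, if one was allocated. */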
+static void lrc_destroy_wa_ctx_obj(struct intel_engine_cs *ring)
+{
+       if (ring->wa_ctx.obj) {
+               i915_gem_object_ggtt_unpin(ring->wa_ctx.obj);
+               drm_gem_object_unreference(&ring->wa_ctx.obj->base);
+               ring->wa_ctx.obj = NULL;
+       }
+}
+
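+/*
+ * The two WA batches share a single page: the indirect_ctx batch starts at
+ * the top of the page and is padded with MI_NOOPs to a cacheline boundary;
+ * the per_ctx batch follows immediately after and ends with
+ * MI_BATCH_BUFFER_END.
+ */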
+static int intel_init_workaround_bb(struct intel_engine_cs *ring)
+{
+       int ret;
+       uint32_t *batch;
+       uint32_t offset;
+       struct page *page;
+       struct i915_ctx_workarounds *wa_ctx = &ring->wa_ctx;
+
+       WARN_ON(ring->id != RCS);
+
+       ret = lrc_setup_wa_ctx_obj(ring, PAGE_SIZE);
+       if (ret) {
+               DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n", ret);
+               return ret;
+       }
+
+       page = i915_gem_object_get_page(wa_ctx->obj, 0);
+       batch = kmap_atomic(page);
+       offset = 0;
+
+       if (INTEL_INFO(ring->dev)->gen == 8) {
+               ret = gen8_init_indirectctx_bb(ring,
+                                              &wa_ctx->indirect_ctx,
+                                              batch,
+                                              &offset);
+               if (ret)
+                       goto out;
+
+               ret = gen8_init_perctx_bb(ring,
+                                         &wa_ctx->per_ctx,
+                                         batch,
+                                         &offset);
+               if (ret)
+                       goto out;
+       } else {
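+               /* gen == 8 is handled above, so this fires only for gen9+ */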
+               WARN(INTEL_INFO(ring->dev)->gen >= 8,
+                    "WA batch buffer is not initialized for Gen%d\n",
+                    INTEL_INFO(ring->dev)->gen);
+               lrc_destroy_wa_ctx_obj(ring);
+       }
+
+out:
+       kunmap_atomic(batch);
+       if (ret)
+               lrc_destroy_wa_ctx_obj(ring);
+
+       return ret;
+}
+
 static int gen8_init_common_ring(struct intel_engine_cs *ring)
 {
        struct drm_device *dev = ring->dev;
                kunmap(sg_page(ring->status_page.obj->pages->sgl));
                ring->status_page.obj = NULL;
        }
+
+       lrc_destroy_wa_ctx_obj(ring);
 }
 
 static int logical_ring_init(struct drm_device *dev, struct intel_engine_cs *ring)
        if (ret)
                return ret;
 
-       return intel_init_pipe_control(ring);
+       ret = intel_init_workaround_bb(ring);
+       if (ret) {
+               /*
+                * We continue even if we fail to initialize WA batch
+                * because we only expect rare glitches, nothing critical
+                * enough to prevent us from using the GPU.
+                */
+               DRM_ERROR("WA batch buffer initialization failed: %d\n",
+                         ret);
+       }
+
+       ret = intel_init_pipe_control(ring);
+       if (ret)
+               lrc_destroy_wa_ctx_obj(ring);
+
+       return ret;
 }
 
 static int logical_bsd_ring_init(struct drm_device *dev)
        reg_state[CTX_SECOND_BB_STATE] = ring->mmio_base + 0x118;
        reg_state[CTX_SECOND_BB_STATE+1] = 0;
        if (ring->id == RCS) {
-               /* TODO: according to BSpec, the register state context
-                * for CHV does not have these. OTOH, these registers do
-                * exist in CHV. I'm waiting for a clarification */
                reg_state[CTX_BB_PER_CTX_PTR] = ring->mmio_base + 0x1c0;
                reg_state[CTX_BB_PER_CTX_PTR+1] = 0;
                reg_state[CTX_RCS_INDIRECT_CTX] = ring->mmio_base + 0x1c4;
                reg_state[CTX_RCS_INDIRECT_CTX+1] = 0;
                reg_state[CTX_RCS_INDIRECT_CTX_OFFSET] = ring->mmio_base + 0x1c8;
                reg_state[CTX_RCS_INDIRECT_CTX_OFFSET+1] = 0;
+               if (ring->wa_ctx.obj) {
+                       struct i915_ctx_workarounds *wa_ctx = &ring->wa_ctx;
+                       uint32_t ggtt_offset = i915_gem_obj_ggtt_offset(wa_ctx->obj);
+
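+                       /*
+                        * Pack the GGTT address of the indirect_ctx batch
+                        * with its size in cachelines; the batch offset is
+                        * cacheline aligned, so the low bits of the address
+                        * are free to carry the size.
+                        */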
+                       reg_state[CTX_RCS_INDIRECT_CTX+1] =
+                               (ggtt_offset + wa_ctx->indirect_ctx.offset * sizeof(uint32_t)) |
+                               (wa_ctx->indirect_ctx.size / CACHELINE_DWORDS);
+
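+                       /*
+                        * 0x17 is the hardware default for Gen8; the register
+                        * field starts at bit 6, hence the shift.
+                        */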
+                       reg_state[CTX_RCS_INDIRECT_CTX_OFFSET+1] =
+                               CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT << 6;
+
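+                       /* bit 0 flags the per-ctx batch pointer as valid */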
+                       reg_state[CTX_BB_PER_CTX_PTR+1] =
+                               (ggtt_offset + wa_ctx->per_ctx.offset * sizeof(uint32_t)) |
+                               0x01;
+               }
        }
        reg_state[CTX_LRI_HEADER_1] = MI_LOAD_REGISTER_IMM(9);
        reg_state[CTX_LRI_HEADER_1] |= MI_LRI_FORCE_POSTED;