#define MI_BATCH_BUFFER_START  MI_INSTR(0x31, 0)
 #define   MI_BATCH_GTT             (2<<6) /* aliased with (1<<7) on gen4 */
 #define MI_BATCH_BUFFER_START_GEN8     MI_INSTR(0x31, 1)
-#define   MI_BATCH_RESOURCE_STREAMER (1<<10)
+#define   MI_BATCH_RESOURCE_STREAMER REG_BIT(10)
+#define   MI_BATCH_PREDICATE         REG_BIT(15) /* HSW+ on RCS only */
 
 /*
  * 3D instructions used by the kernel
 #define   PIPE_CONTROL_CS_STALL                                (1<<20)
 #define   PIPE_CONTROL_TLB_INVALIDATE                  (1<<18)
 #define   PIPE_CONTROL_MEDIA_STATE_CLEAR               (1<<16)
+#define   PIPE_CONTROL_WRITE_TIMESTAMP                 (3<<14)
 #define   PIPE_CONTROL_QW_WRITE                                (1<<14)
 #define   PIPE_CONTROL_POST_SYNC_OP_MASK                (3<<14)
 #define   PIPE_CONTROL_DEPTH_STALL                     (1<<13)
 
        /* 8 bytes */
        INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA = 256,
 
+       /* 6 * 8 bytes */
+       INTEL_GT_SCRATCH_FIELD_PERF_CS_GPR = 2048,
+
+       /* 4 bytes */
+       INTEL_GT_SCRATCH_FIELD_PERF_PREDICATE_RESULT_1 = 2096,
 };
 
 #endif /* __INTEL_GT_TYPES_H__ */
 
                        i915_wedged_get, i915_wedged_set,
                        "%llu\n");
 
+static int
+i915_perf_noa_delay_set(void *data, u64 val)
+{
+       struct drm_i915_private *i915 = data;
+       const u32 clk = RUNTIME_INFO(i915)->cs_timestamp_frequency_khz;
+
+       /*
+        * Anything above this limit would lead to infinite waits, as the
+        * timestamp difference is computed on the CS with only 32 bits.
+        */
+       if (val > div_u64(mul_u32_u32(U32_MAX, 1000 * 1000), clk))
+               return -EINVAL;
+
+       atomic64_set(&i915->perf.noa_programming_delay, val);
+       return 0;
+}
+
+static int
+i915_perf_noa_delay_get(void *data, u64 *val)
+{
+       struct drm_i915_private *i915 = data;
+
+       *val = atomic64_read(&i915->perf.noa_programming_delay);
+       return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(i915_perf_noa_delay_fops,
+                       i915_perf_noa_delay_get,
+                       i915_perf_noa_delay_set,
+                       "%llu\n");
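+/*
+ * Value in ns; usage sketch (DRI minor number assumed):
+ *   echo 500000 > /sys/kernel/debug/dri/0/i915_perf_noa_delay
+ */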
+
 #define DROP_UNBOUND   BIT(0)
 #define DROP_BOUND     BIT(1)
 #define DROP_RETIRE    BIT(2)
        const char *name;
        const struct file_operations *fops;
 } i915_debugfs_files[] = {
+       {"i915_perf_noa_delay", &i915_perf_noa_delay_fops},
        {"i915_wedged", &i915_wedged_fops},
        {"i915_cache_sharing", &i915_cache_sharing_fops},
        {"i915_gem_drop_caches", &i915_drop_caches_fops},
 
 #include "gem/i915_gem_context.h"
 #include "gt/intel_engine_pm.h"
 #include "gt/intel_engine_user.h"
+#include "gt/intel_gt.h"
 #include "gt/intel_lrc_reg.h"
 
 #include "i915_drv.h"
                free_oa_config_bo(oa_bo);
 }
 
+static void
+free_noa_wait(struct i915_perf_stream *stream)
+{
+       i915_vma_unpin_and_release(&stream->noa_wait, 0);
+}
+
 static void i915_oa_stream_destroy(struct i915_perf_stream *stream)
 {
        struct i915_perf *perf = stream->perf;
                oa_put_render_ctx_id(stream);
 
        free_oa_configs(stream);
+       free_noa_wait(stream);
 
        if (perf->spurious_report_rs.missed) {
                DRM_NOTE("%d spurious OA report notices suppressed due to ratelimiting\n",
        return ret;
 }
 
+static u32 *save_restore_register(struct i915_perf_stream *stream, u32 *cs,
+                                 bool save, i915_reg_t reg, u32 offset,
+                                 u32 dword_count)
+{
+       u32 cmd;
+       u32 d;
+
+       cmd = save ? MI_STORE_REGISTER_MEM : MI_LOAD_REGISTER_MEM;
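+       /*
+        * The Gen8+ versions of SRM/LRM take a 64bit address and are one
+        * dword longer; incrementing the opcode dword also bumps the
+        * length field.
+        */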
+       if (INTEL_GEN(stream->perf->i915) >= 8)
+               cmd++;
+
+       for (d = 0; d < dword_count; d++) {
+               *cs++ = cmd;
+               *cs++ = i915_mmio_reg_offset(reg) + 4 * d;
+               *cs++ = intel_gt_scratch_offset(stream->engine->gt,
+                                               offset) + 4 * d;
+               *cs++ = 0;
+       }
+
+       return cs;
+}
+
+static int alloc_noa_wait(struct i915_perf_stream *stream)
+{
+       struct drm_i915_private *i915 = stream->perf->i915;
+       struct drm_i915_gem_object *bo;
+       struct i915_vma *vma;
+       const u64 delay_ticks = 0xffffffffffffffff -
+               DIV64_U64_ROUND_UP(
+                       atomic64_read(&stream->perf->noa_programming_delay) *
+                       RUNTIME_INFO(i915)->cs_timestamp_frequency_khz,
+                       1000000ull);
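+       /*
+        * The delay is programmed as ((2^64 - 1) - <delay in ticks>) so
+        * that adding the measured timestamp delta to it in the CS ALU
+        * overflows (sets the carry flag) exactly once the delay has
+        * elapsed.
+        */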
+       const u32 base = stream->engine->mmio_base;
+#define CS_GPR(x) GEN8_RING_CS_GPR(base, x)
+       u32 *batch, *ts0, *cs, *jump;
+       int ret, i;
+       enum {
+               START_TS,
+               NOW_TS,
+               DELTA_TS,
+               JUMP_PREDICATE,
+               DELTA_TARGET,
+               N_CS_GPR
+       };
+
+       bo = i915_gem_object_create_internal(i915, 4096);
+       if (IS_ERR(bo)) {
+               DRM_ERROR("Failed to allocate NOA wait batchbuffer\n");
+               return PTR_ERR(bo);
+       }
+
+       /*
+        * We pin the buffer in GGTT because multiple OA config BOs will
+        * jump into it, so its address needs to stay fixed for the
+        * lifetime of the i915/perf stream.
+        */
+       vma = i915_gem_object_ggtt_pin(bo, NULL, 0, 0, PIN_HIGH);
+       if (IS_ERR(vma)) {
+               ret = PTR_ERR(vma);
+               goto err_unref;
+       }
+
+       batch = cs = i915_gem_object_pin_map(bo, I915_MAP_WB);
+       if (IS_ERR(batch)) {
+               ret = PTR_ERR(batch);
+               goto err_unpin;
+       }
+
+       /* Save registers. */
+       for (i = 0; i < N_CS_GPR; i++)
+               cs = save_restore_register(
+                       stream, cs, true /* save */, CS_GPR(i),
+                       INTEL_GT_SCRATCH_FIELD_PERF_CS_GPR + 8 * i, 2);
+       cs = save_restore_register(
+               stream, cs, true /* save */, MI_PREDICATE_RESULT_1,
+               INTEL_GT_SCRATCH_FIELD_PERF_PREDICATE_RESULT_1, 1);
+
+       /* First timestamp snapshot location. */
+       ts0 = cs;
+
+       /*
+        * Initial snapshot of the timestamp register to implement the wait.
+        * We work with 32bit values, so clear out the top 32 bits of the
+        * register because the ALU works on 64 bits.
+        */
+       *cs++ = MI_LOAD_REGISTER_IMM(1);
+       *cs++ = i915_mmio_reg_offset(CS_GPR(START_TS)) + 4;
+       *cs++ = 0;
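+       /* MI_LOAD_REGISTER_REG is 3 dwords; (3 - 2) encodes its length field. */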
+       *cs++ = MI_LOAD_REGISTER_REG | (3 - 2);
+       *cs++ = i915_mmio_reg_offset(RING_TIMESTAMP(base));
+       *cs++ = i915_mmio_reg_offset(CS_GPR(START_TS));
+
+       /*
+        * This is the location we're going to jump back into until the
+        * required amount of time has passed.
+        */
+       jump = cs;
+
+       /*
+        * Take another snapshot of the timestamp register. Take care to
+        * clear the top 32 bits of CS_GPR(NOW_TS) as we're using it for
+        * other operations below.
+        */
+       *cs++ = MI_LOAD_REGISTER_IMM(1);
+       *cs++ = i915_mmio_reg_offset(CS_GPR(NOW_TS)) + 4;
+       *cs++ = 0;
+       *cs++ = MI_LOAD_REGISTER_REG | (3 - 2);
+       *cs++ = i915_mmio_reg_offset(RING_TIMESTAMP(base));
+       *cs++ = i915_mmio_reg_offset(CS_GPR(NOW_TS));
+
+       /*
+        * Do a diff between the 2 timestamps and store the result back into
+        * CS_GPR(1).
+        */
+       *cs++ = MI_MATH(5);
+       *cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(NOW_TS));
+       *cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(START_TS));
+       *cs++ = MI_MATH_SUB;
+       *cs++ = MI_MATH_STORE(MI_MATH_REG(DELTA_TS), MI_MATH_REG_ACCU);
+       *cs++ = MI_MATH_STORE(MI_MATH_REG(JUMP_PREDICATE), MI_MATH_REG_CF);
+
+       /*
+        * Transfer the carry flag (set to 1 if ts1 < ts0, meaning the
+        * timestamp has rolled over the 32 bits) into the predicate register
+        * to be used for the predicated jump.
+        */
+       *cs++ = MI_LOAD_REGISTER_REG | (3 - 2);
+       *cs++ = i915_mmio_reg_offset(CS_GPR(JUMP_PREDICATE));
+       *cs++ = i915_mmio_reg_offset(MI_PREDICATE_RESULT_1);
+
+       /* Restart from the beginning if we had timestamps roll over. */
+       *cs++ = (INTEL_GEN(i915) < 8 ?
+                MI_BATCH_BUFFER_START :
+                MI_BATCH_BUFFER_START_GEN8) |
+               MI_BATCH_PREDICATE;
+       *cs++ = i915_ggtt_offset(vma) + (ts0 - batch) * 4;
+       *cs++ = 0;
+
+       /*
+        * Now add the difference between the two previous timestamps to
+        * delay_ticks, i.e.:
+        *      ((1 << 64) - 1) - <delay in timestamp ticks>
+        *
+        * When the addition sets the carry flag, the elapsed time is
+        * longer than the expected delay and we can exit the wait loop.
+        */
+       *cs++ = MI_LOAD_REGISTER_IMM(2);
+       *cs++ = i915_mmio_reg_offset(CS_GPR(DELTA_TARGET));
+       *cs++ = lower_32_bits(delay_ticks);
+       *cs++ = i915_mmio_reg_offset(CS_GPR(DELTA_TARGET)) + 4;
+       *cs++ = upper_32_bits(delay_ticks);
+
+       *cs++ = MI_MATH(4);
+       *cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(DELTA_TS));
+       *cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(DELTA_TARGET));
+       *cs++ = MI_MATH_ADD;
+       *cs++ = MI_MATH_STOREINV(MI_MATH_REG(JUMP_PREDICATE), MI_MATH_REG_CF);
+
+       /*
+        * Transfer the result into the predicate register to be used for the
+        * predicated jump.
+        */
+       *cs++ = MI_LOAD_REGISTER_REG | (3 - 2);
+       *cs++ = i915_mmio_reg_offset(CS_GPR(JUMP_PREDICATE));
+       *cs++ = i915_mmio_reg_offset(MI_PREDICATE_RESULT_1);
+
+       /* Predicate the jump. */
+       *cs++ = (INTEL_GEN(i915) < 8 ?
+                MI_BATCH_BUFFER_START :
+                MI_BATCH_BUFFER_START_GEN8) |
+               MI_BATCH_PREDICATE;
+       *cs++ = i915_ggtt_offset(vma) + (jump - batch) * 4;
+       *cs++ = 0;
+
+       /* Restore registers. */
+       for (i = 0; i < N_CS_GPR; i++)
+               cs = save_restore_register(
+                       stream, cs, false /* restore */, CS_GPR(i),
+                       INTEL_GT_SCRATCH_FIELD_PERF_CS_GPR + 8 * i, 2);
+       cs = save_restore_register(
+               stream, cs, false /* restore */, MI_PREDICATE_RESULT_1,
+               INTEL_GT_SCRATCH_FIELD_PERF_PREDICATE_RESULT_1, 1);
+
+       /* And return to the ring. */
+       *cs++ = MI_BATCH_BUFFER_END;
+
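+       /* The generated batch must fit within the single page allocated above. */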
+       GEM_BUG_ON(cs - batch > PAGE_SIZE / sizeof(*batch));
+
+       i915_gem_object_flush_map(bo);
+       i915_gem_object_unpin_map(bo);
+
+       stream->noa_wait = vma;
+       return 0;
+
+err_unpin:
+       __i915_vma_unpin(vma);
+err_unref:
+       i915_gem_object_put(bo);
+       return ret;
+}
+
 static void config_oa_regs(struct intel_uncore *uncore,
                           const struct i915_oa_reg *regs,
                           u32 n_regs)
                }
        }
 
+       ret = alloc_noa_wait(stream);
+       if (ret) {
+               DRM_DEBUG("Unable to allocate NOA wait batch buffer\n");
+               goto err_noa_wait_alloc;
+       }
+
        stream->oa_config = i915_perf_get_oa_config(perf, props->metrics_set);
        if (!stream->oa_config) {
                DRM_DEBUG("Invalid OA config id=%i\n", props->metrics_set);
        intel_engine_pm_put(stream->engine);
 
 err_config:
+       free_noa_wait(stream);
+
+err_noa_wait_alloc:
        if (stream->ctx)
                oa_put_render_ctx_id(stream);
 
                ratelimit_set_flags(&perf->spurious_report_rs,
                                    RATELIMIT_MSG_ON_RELEASE);
 
+               atomic64_set(&perf->noa_programming_delay,
+                            500 * 1000 /* 500us */);
+
                perf->i915 = i915;
        }
 }
        memset(&perf->ops, 0, sizeof(perf->ops));
        perf->i915 = NULL;
 }
+
+#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
+#include "selftests/i915_perf.c"
+#endif
 
                 */
                u32 head;
        } oa_buffer;
+
+       /**
+        * @noa_wait: A batch buffer doing a wait on the GPU for the NOA
+        * logic to be reprogrammed.
+        */
+       struct i915_vma *noa_wait;
 };
 
 /**
 
        struct i915_oa_ops ops;
        const struct i915_oa_format *oa_formats;
+
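+       /* NOA programming delay in ns, adjustable through debugfs. */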
+       atomic64_t noa_programming_delay;
 };
 
 #endif /* _I915_PERF_TYPES_H_ */
 
 #define MI_PREDICATE_SRC0_UDW  _MMIO(0x2400 + 4)
 #define MI_PREDICATE_SRC1      _MMIO(0x2408)
 #define MI_PREDICATE_SRC1_UDW  _MMIO(0x2408 + 4)
-
+#define MI_PREDICATE_DATA       _MMIO(0x2410)
+#define MI_PREDICATE_RESULT     _MMIO(0x2418)
+#define MI_PREDICATE_RESULT_1   _MMIO(0x241c)
 #define MI_PREDICATE_RESULT_2  _MMIO(0x2214)
 #define  LOWER_SLICE_ENABLED   (1 << 0)
 #define  LOWER_SLICE_DISABLED  (0 << 0)
 
 selftest(hangcheck, intel_hangcheck_live_selftests)
 selftest(execlists, intel_execlists_live_selftests)
 selftest(guc, intel_guc_live_selftest)
+selftest(perf, i915_perf_live_selftests)
 
--- /dev/null
+/*
+ * SPDX-License-Identifier: MIT
+ *
+ * Copyright © 2019 Intel Corporation
+ */
+
+#include <linux/kref.h>
+
+#include "gem/i915_gem_pm.h"
+#include "gt/intel_gt.h"
+
+#include "i915_selftest.h"
+
+#include "igt_flush_test.h"
+#include "lib_sw_fence.h"
+
+static struct i915_perf_stream *
+test_stream(struct i915_perf *perf)
+{
+       struct drm_i915_perf_open_param param = {};
+       struct perf_open_properties props = {
+               .engine = intel_engine_lookup_user(perf->i915,
+                                                  I915_ENGINE_CLASS_RENDER,
+                                                  0),
+               .sample_flags = SAMPLE_OA_REPORT,
+               .oa_format = I915_OA_FORMAT_C4_B8,
+               .metrics_set = 1,
+       };
+       struct i915_perf_stream *stream;
+
+       stream = kzalloc(sizeof(*stream), GFP_KERNEL);
+       if (!stream)
+               return NULL;
+
+       stream->perf = perf;
+
+       mutex_lock(&perf->lock);
+       if (i915_oa_stream_init(stream, &param, &props)) {
+               kfree(stream);
+               stream = NULL;
+       }
+       mutex_unlock(&perf->lock);
+
+       return stream;
+}
+
+static void stream_destroy(struct i915_perf_stream *stream)
+{
+       struct i915_perf *perf = stream->perf;
+
+       mutex_lock(&perf->lock);
+       i915_perf_destroy_locked(stream);
+       mutex_unlock(&perf->lock);
+}
+
+static int live_sanitycheck(void *arg)
+{
+       struct drm_i915_private *i915 = arg;
+       struct i915_perf_stream *stream;
+
+       /* Quick check we can create a perf stream */
+
+       stream = test_stream(&i915->perf);
+       if (!stream)
+               return -EINVAL;
+
+       stream_destroy(stream);
+       return 0;
+}
+
+static int write_timestamp(struct i915_request *rq, int slot)
+{
+       u32 *cs;
+       int len;
+
+       cs = intel_ring_begin(rq, 6);
+       if (IS_ERR(cs))
+               return PTR_ERR(cs);
+
+       len = 5;
+       if (INTEL_GEN(rq->i915) >= 8)
+               len++;
+
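+       /*
+        * Emit a PIPE_CONTROL with a timestamp post-sync op.
+        * STORE_DATA_INDEX makes the write address an offset into the
+        * engine's status page; Gen8+ takes one extra dword for the
+        * 64bit address, hence the longer length.
+        */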
+       *cs++ = GFX_OP_PIPE_CONTROL(len);
+       *cs++ = PIPE_CONTROL_GLOBAL_GTT_IVB |
+               PIPE_CONTROL_STORE_DATA_INDEX |
+               PIPE_CONTROL_WRITE_TIMESTAMP;
+       *cs++ = slot * sizeof(u32);
+       *cs++ = 0;
+       *cs++ = 0;
+       *cs++ = 0;
+
+       intel_ring_advance(rq, cs);
+
+       return 0;
+}
+
+static ktime_t poll_status(struct i915_request *rq, int slot)
+{
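+       /*
+        * Busy-spin until the GPU writes the slot (or the request
+        * completes), then immediately take a CPU timestamp.
+        */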
+       while (!intel_read_status_page(rq->engine, slot) &&
+              !i915_request_completed(rq))
+               cpu_relax();
+
+       return ktime_get();
+}
+
+static int live_noa_delay(void *arg)
+{
+       struct drm_i915_private *i915 = arg;
+       struct i915_perf_stream *stream;
+       struct i915_request *rq;
+       ktime_t t0, t1;
+       u64 expected;
+       u32 delay;
+       int err;
+       int i;
+
+       /* Check that the GPU delay matches expectations */
+
+       stream = test_stream(&i915->perf);
+       if (!stream)
+               return -ENOMEM;
+
+       expected = atomic64_read(&stream->perf->noa_programming_delay);
+
+       if (stream->engine->class != RENDER_CLASS) {
+               err = -ENODEV;
+               goto out;
+       }
+
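+       /* Clear the four HWSP dwords that will hold the two 64bit timestamps. */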
+       for (i = 0; i < 4; i++)
+               intel_write_status_page(stream->engine, 0x100 + i, 0);
+
+       rq = i915_request_create(stream->engine->kernel_context);
+       if (IS_ERR(rq)) {
+               err = PTR_ERR(rq);
+               goto out;
+       }
+
+       if (rq->engine->emit_init_breadcrumb &&
+           i915_request_timeline(rq)->has_initial_breadcrumb) {
+               err = rq->engine->emit_init_breadcrumb(rq);
+               if (err) {
+                       i915_request_add(rq);
+                       goto out;
+               }
+       }
+
+       err = write_timestamp(rq, 0x100);
+       if (err) {
+               i915_request_add(rq);
+               goto out;
+       }
+
+       err = rq->engine->emit_bb_start(rq,
+                                       i915_ggtt_offset(stream->noa_wait), 0,
+                                       I915_DISPATCH_SECURE);
+       if (err) {
+               i915_request_add(rq);
+               goto out;
+       }
+
+       err = write_timestamp(rq, 0x102);
+       if (err) {
+               i915_request_add(rq);
+               goto out;
+       }
+
+       i915_request_get(rq);
+       i915_request_add(rq);
+
+       preempt_disable();
+       t0 = poll_status(rq, 0x100);
+       t1 = poll_status(rq, 0x102);
+       preempt_enable();
+
+       pr_info("CPU delay: %lluns, expected %lluns\n",
+               ktime_sub(t1, t0), expected);
+
+       delay = intel_read_status_page(stream->engine, 0x102);
+       delay -= intel_read_status_page(stream->engine, 0x100);
+       delay = div_u64(mul_u32_u32(delay, 1000 * 1000),
+                       RUNTIME_INFO(i915)->cs_timestamp_frequency_khz);
+       pr_info("GPU delay: %uns, expected %lluns\n",
+               delay, expected);
+
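+       /* Fail if the measured delay is outside [75%, 150%] of expected. */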
+       if (4 * delay < 3 * expected || 2 * delay > 3 * expected) {
+               pr_err("GPU delay [%uus] outside of expected threshold! [%lluus, %lluus]\n",
+                      delay / 1000,
+                      div_u64(3 * expected, 4000),
+                      div_u64(3 * expected, 2000));
+               err = -EINVAL;
+       }
+
+       i915_request_put(rq);
+out:
+       stream_destroy(stream);
+       return err;
+}
+
+int i915_perf_live_selftests(struct drm_i915_private *i915)
+{
+       static const struct i915_subtest tests[] = {
+               SUBTEST(live_sanitycheck),
+               SUBTEST(live_noa_delay),
+       };
+       struct i915_perf *perf = &i915->perf;
+
+       if (!perf->metrics_kobj || !perf->ops.enable_metric_set)
+               return 0;
+
+       if (intel_gt_is_wedged(&i915->gt))
+               return 0;
+
+       return i915_subtests(tests, i915);
+}