return 0;
 
 err_unpin:
-       __i915_vma_unpin(vma);
+       i915_vma_unpin_and_release(&vma, 0);
 err_unref:
        i915_gem_object_put(bo);
        return ret;
 }
 
-static void config_oa_regs(struct intel_uncore *uncore,
-                          const struct i915_oa_reg *regs,
-                          u32 n_regs)
+static u32 *write_cs_mi_lri(u32 *cs,
+                           const struct i915_oa_reg *reg_data,
+                           u32 n_regs)
 {
        u32 i;
 
        for (i = 0; i < n_regs; i++) {
-               const struct i915_oa_reg *reg = regs + i;
+               if ((i % MI_LOAD_REGISTER_IMM_MAX_REGS) == 0) {
+                       u32 n_lri = min_t(u32,
+                                         n_regs - i,
+                                         MI_LOAD_REGISTER_IMM_MAX_REGS);
+
+                       *cs++ = MI_LOAD_REGISTER_IMM(n_lri);
+               }
+               *cs++ = i915_mmio_reg_offset(reg_data[i].addr);
+               *cs++ = reg_data[i].value;
+       }
+
+       return cs;
+}
+
+static int num_lri_dwords(int num_regs)
+{
+       int count = 0;
+
+       if (num_regs > 0) {
+               count += DIV_ROUND_UP(num_regs, MI_LOAD_REGISTER_IMM_MAX_REGS);
+               count += num_regs * 2;
+       }
+
+       return count;
+}
+
+static struct i915_oa_config_bo *
+alloc_oa_config_buffer(struct i915_perf_stream *stream,
+                      struct i915_oa_config *oa_config)
+{
+       struct drm_i915_gem_object *obj;
+       struct i915_oa_config_bo *oa_bo;
+       size_t config_length = 0;
+       u32 *cs;
+       int err;
+
+       oa_bo = kzalloc(sizeof(*oa_bo), GFP_KERNEL);
+       if (!oa_bo)
+               return ERR_PTR(-ENOMEM);
+
+       config_length += num_lri_dwords(oa_config->mux_regs_len);
+       config_length += num_lri_dwords(oa_config->b_counter_regs_len);
+       config_length += num_lri_dwords(oa_config->flex_regs_len);
+       config_length++; /* MI_BATCH_BUFFER_END */
+       config_length = ALIGN(sizeof(u32) * config_length, I915_GTT_PAGE_SIZE);
+
+       obj = i915_gem_object_create_shmem(stream->perf->i915, config_length);
+       if (IS_ERR(obj)) {
+               err = PTR_ERR(obj);
+               goto err_free;
+       }
+
+       cs = i915_gem_object_pin_map(obj, I915_MAP_WB);
+       if (IS_ERR(cs)) {
+               err = PTR_ERR(cs);
+               goto err_oa_bo;
+       }
 
-               intel_uncore_write(uncore, reg->addr, reg->value);
+       cs = write_cs_mi_lri(cs,
+                            oa_config->mux_regs,
+                            oa_config->mux_regs_len);
+       cs = write_cs_mi_lri(cs,
+                            oa_config->b_counter_regs,
+                            oa_config->b_counter_regs_len);
+       cs = write_cs_mi_lri(cs,
+                            oa_config->flex_regs,
+                            oa_config->flex_regs_len);
+
+       *cs++ = MI_BATCH_BUFFER_END;
+
+       i915_gem_object_flush_map(obj);
+       i915_gem_object_unpin_map(obj);
+
+       oa_bo->vma = i915_vma_instance(obj,
+                                      &stream->engine->gt->ggtt->vm,
+                                      NULL);
+       if (IS_ERR(oa_bo->vma)) {
+               err = PTR_ERR(oa_bo->vma);
+               goto err_oa_bo;
        }
+
+       oa_bo->oa_config = i915_oa_config_get(oa_config);
+       llist_add(&oa_bo->node, &stream->oa_config_bos);
+
+       return oa_bo;
+
+err_oa_bo:
+       i915_gem_object_put(obj);
+err_free:
+       kfree(oa_bo);
+       return ERR_PTR(err);
 }
 
-static void delay_after_mux(void)
+static struct i915_vma *
+get_oa_vma(struct i915_perf_stream *stream, struct i915_oa_config *oa_config)
 {
+       struct i915_oa_config_bo *oa_bo;
+
        /*
-        * It apparently takes a fairly long time for a new MUX
-        * configuration to be be applied after these register writes.
-        * This delay duration was derived empirically based on the
-        * render_basic config but hopefully it covers the maximum
-        * configuration latency.
-        *
-        * As a fallback, the checks in _append_oa_reports() to skip
-        * invalid OA reports do also seem to work to discard reports
-        * generated before this config has completed - albeit not
-        * silently.
-        *
-        * Unfortunately this is essentially a magic number, since we
-        * don't currently know of a reliable mechanism for predicting
-        * how long the MUX config will take to apply and besides
-        * seeing invalid reports we don't know of a reliable way to
-        * explicitly check that the MUX config has landed.
-        *
-        * It's even possible we've miss characterized the underlying
-        * problem - it just seems like the simplest explanation why
-        * a delay at this location would mitigate any invalid reports.
+        * Look for the buffer in the already allocated BOs attached
+        * to the stream.
         */
-       usleep_range(15000, 20000);
+       llist_for_each_entry(oa_bo, stream->oa_config_bos.first, node) {
+               if (oa_bo->oa_config == oa_config &&
+                   memcmp(oa_bo->oa_config->uuid,
+                          oa_config->uuid,
+                          sizeof(oa_config->uuid)) == 0)
+                       goto out;
+       }
+
+       oa_bo = alloc_oa_config_buffer(stream, oa_config);
+       if (IS_ERR(oa_bo))
+               return ERR_CAST(oa_bo);
+
+out:
+       return i915_vma_get(oa_bo->vma);
+}
+
+static int emit_oa_config(struct i915_perf_stream *stream,
+                         struct intel_context *ce)
+{
+       struct i915_request *rq;
+       struct i915_vma *vma;
+       int err;
+
+       vma = get_oa_vma(stream, stream->oa_config);
+       if (IS_ERR(vma))
+               return PTR_ERR(vma);
+
+       err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL | PIN_HIGH);
+       if (err)
+               goto err_vma_put;
+
+       rq = i915_request_create(ce);
+       if (IS_ERR(rq)) {
+               err = PTR_ERR(rq);
+               goto err_vma_unpin;
+       }
+
+       i915_vma_lock(vma);
+       err = i915_request_await_object(rq, vma->obj, 0);
+       if (!err)
+               err = i915_vma_move_to_active(vma, rq, 0);
+       i915_vma_unlock(vma);
+       if (err)
+               goto err_add_request;
+
+       err = rq->engine->emit_bb_start(rq,
+                                       vma->node.start, 0,
+                                       I915_DISPATCH_SECURE);
+err_add_request:
+       i915_request_add(rq);
+err_vma_unpin:
+       i915_vma_unpin(vma);
+err_vma_put:
+       i915_vma_put(vma);
+       return err;
 }
 
 static int hsw_enable_metric_set(struct i915_perf_stream *stream)
 {
        struct intel_uncore *uncore = stream->uncore;
-       const struct i915_oa_config *oa_config = stream->oa_config;
 
        /*
         * PRM:
        intel_uncore_rmw(uncore, GEN6_UCGCTL1,
                         0, GEN6_CSUNIT_CLOCK_GATE_DISABLE);
 
-       config_oa_regs(uncore, oa_config->mux_regs, oa_config->mux_regs_len);
-       delay_after_mux();
-
-       config_oa_regs(uncore, oa_config->b_counter_regs,
-                      oa_config->b_counter_regs_len);
-
-       return 0;
+       return emit_oa_config(stream, stream->engine->kernel_context);
 }
 
 static void hsw_disable_metric_set(struct i915_perf_stream *stream)
        if (ret)
                return ret;
 
-       config_oa_regs(uncore, oa_config->mux_regs, oa_config->mux_regs_len);
-       delay_after_mux();
-
-       config_oa_regs(uncore, oa_config->b_counter_regs,
-                      oa_config->b_counter_regs_len);
-
-       return 0;
+       return emit_oa_config(stream, stream->engine->kernel_context);
 }
 
 static void gen8_disable_metric_set(struct i915_perf_stream *stream)