From 0f1fdf5592259cc68ee5ec968c6cddb26d0ecf77 Mon Sep 17 00:00:00 2001
From: Zhanjun Dong <zhanjun.dong@intel.com>
Date: Fri, 4 Oct 2024 12:34:28 -0700
Subject: [PATCH 01/16] drm/xe/guc: Save manual engine capture into capture
 list

Save manual engine capture into capture list.
This removes duplicate register definitions across manual-capture vs
guc-err-capture.

Signed-off-by: Zhanjun Dong <zhanjun.dong@intel.com>
Reviewed-by: Alan Previn <alan.previn.teres.alexis@intel.com>
Signed-off-by: Matt Roper <matthew.d.roper@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20241004193428.3311145-7-zhanjun.dong@intel.com
---
 drivers/gpu/drm/xe/xe_devcoredump.c     |   2 +-
 drivers/gpu/drm/xe/xe_guc_capture.c     | 153 +++++++++++++-
 drivers/gpu/drm/xe/xe_guc_capture.h     |   3 +-
 drivers/gpu/drm/xe/xe_hw_engine.c       | 254 +-----------------------
 drivers/gpu/drm/xe/xe_hw_engine.h       |   2 -
 drivers/gpu/drm/xe/xe_hw_engine_types.h |  59 ------
 6 files changed, 163 insertions(+), 310 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c
index 30f9e8cb328c..99842a35dbf0 100644
--- a/drivers/gpu/drm/xe/xe_devcoredump.c
+++ b/drivers/gpu/drm/xe/xe_devcoredump.c
@@ -117,7 +117,7 @@ static ssize_t __xe_devcoredump_read(char *buffer, size_t count,
 	drm_puts(&p, "\n**** HW Engines ****\n");
 	for (i = 0; i < XE_NUM_HW_ENGINES; i++)
 		if (ss->hwe[i])
-			xe_hw_engine_snapshot_print(ss->hwe[i], &p);
+			xe_engine_snapshot_print(ss->hwe[i], &p);
 
 	drm_puts(&p, "\n**** VM state ****\n");
 	xe_vm_snapshot_print(ss->vm, &p);
diff --git a/drivers/gpu/drm/xe/xe_guc_capture.c b/drivers/gpu/drm/xe/xe_guc_capture.c
index 3853b778c4f0..41262bda20ed 100644
--- a/drivers/gpu/drm/xe/xe_guc_capture.c
+++ b/drivers/gpu/drm/xe/xe_guc_capture.c
@@ -935,7 +935,7 @@ guc_capture_init_node(struct xe_guc *guc, struct __guc_capture_parsed_output *no
  *                   guc->capture->cachelist and populated with the error-capture
  *                   data from GuC and then it's added into guc->capture->outlist linked
  *                   list. This list is used for matchup and printout by xe_devcoredump_read
- *                   and xe_hw_engine_snapshot_print, (when user invokes the devcoredump sysfs).
+ *                   and xe_engine_snapshot_print, (when user invokes the devcoredump sysfs).
  *
  * GUC --> notify context reset:
  * -----------------------------
@@ -943,12 +943,13 @@ guc_capture_init_node(struct xe_guc *guc, struct __guc_capture_parsed_output *no
  *                   L--> xe_devcoredump
  *                          L--> devcoredump_snapshot
  *                               --> xe_hw_engine_snapshot_capture
+ *                               --> xe_engine_manual_capture(For manual capture)
  *
  * User Sysfs / Debugfs
  * --------------------
  *      --> xe_devcoredump_read->
  *             L--> xxx_snapshot_print
- *                    L--> xe_hw_engine_snapshot_print
+ *                    L--> xe_engine_snapshot_print
  *                         Print register lists values saved at
  *                         guc->capture->outlist
  *
@@ -1524,6 +1525,129 @@ guc_capture_create_prealloc_nodes(struct xe_guc *guc)
 	__guc_capture_create_prealloc_nodes(guc);
 }
 
+static void
+read_reg_to_node(struct xe_hw_engine *hwe, const struct __guc_mmio_reg_descr_group *list,
+		 struct guc_mmio_reg *regs)
+{
+	int i;
+
+	if (!list || list->num_regs == 0)
+		return;
+
+	if (!regs)
+		return;
+
+	for (i = 0; i < list->num_regs; i++) {
+		struct __guc_mmio_reg_descr desc = list->list[i];
+		u32 value;
+
+		if (!list->list)
+			return;
+
+		if (list->type == GUC_STATE_CAPTURE_TYPE_ENGINE_INSTANCE) {
+			value = xe_hw_engine_mmio_read32(hwe, desc.reg);
+		} else {
+			if (list->type == GUC_STATE_CAPTURE_TYPE_ENGINE_CLASS &&
+			    FIELD_GET(GUC_REGSET_STEERING_NEEDED, desc.flags)) {
+				int group, instance;
+
+				group = FIELD_GET(GUC_REGSET_STEERING_GROUP, desc.flags);
+				instance = FIELD_GET(GUC_REGSET_STEERING_INSTANCE, desc.flags);
+				value = xe_gt_mcr_unicast_read(hwe->gt, XE_REG_MCR(desc.reg.addr),
+							       group, instance);
+			} else {
+				value = xe_mmio_read32(&hwe->gt->mmio, desc.reg);
+			}
+		}
+
+		regs[i].value = value;
+		regs[i].offset = desc.reg.addr;
+		regs[i].flags = desc.flags;
+		regs[i].mask = desc.mask;
+	}
+}
+
+/**
+ * xe_engine_manual_capture - Take a manual engine snapshot from engine.
+ * @hwe: Xe HW Engine.
+ * @snapshot: The engine snapshot
+ *
+ * Take engine snapshot from engine read.
+ *
+ * Returns: None
+ */
+void
+xe_engine_manual_capture(struct xe_hw_engine *hwe, struct xe_hw_engine_snapshot *snapshot)
+{
+	struct xe_gt *gt = hwe->gt;
+	struct xe_device *xe = gt_to_xe(gt);
+	struct xe_guc *guc = &gt->uc.guc;
+	struct xe_devcoredump *devcoredump = &xe->devcoredump;
+	enum guc_capture_list_class_type capture_class;
+	const struct __guc_mmio_reg_descr_group *list;
+	struct __guc_capture_parsed_output *new;
+	enum guc_state_capture_type type;
+	u16 guc_id = 0;
+	u32 lrca = 0;
+
+	new = guc_capture_get_prealloc_node(guc);
+	if (!new)
+		return;
+
+	capture_class = xe_engine_class_to_guc_capture_class(hwe->class);
+	for (type = GUC_STATE_CAPTURE_TYPE_GLOBAL; type < GUC_STATE_CAPTURE_TYPE_MAX; type++) {
+		struct gcap_reg_list_info *reginfo = &new->reginfo[type];
+		/*
+		 * regsinfo->regs is allocated based on guc->capture->max_mmio_per_node
+		 * which is based on the descriptor list driving the population so
+		 * should not overflow
+		 */
+
+		/* Get register list for the type/class */
+		list = xe_guc_capture_get_reg_desc_list(gt, GUC_CAPTURE_LIST_INDEX_PF, type,
+							capture_class, false);
+		if (!list) {
+			xe_gt_dbg(gt, "Empty GuC capture register descriptor for %s",
+				  hwe->name);
+			continue;
+		}
+
+		read_reg_to_node(hwe, list, reginfo->regs);
+		reginfo->num_regs = list->num_regs;
+
+		/* Capture steering registers for rcs/ccs */
+		if (capture_class == GUC_CAPTURE_LIST_CLASS_RENDER_COMPUTE) {
+			list = xe_guc_capture_get_reg_desc_list(gt, GUC_CAPTURE_LIST_INDEX_PF,
+								type, capture_class, true);
+			if (list) {
+				read_reg_to_node(hwe, list, &reginfo->regs[reginfo->num_regs]);
+				reginfo->num_regs += list->num_regs;
+			}
+		}
+	}
+
+	if (devcoredump && devcoredump->captured) {
+		struct xe_guc_submit_exec_queue_snapshot *ge = devcoredump->snapshot.ge;
+
+		if (ge) {
+			guc_id = ge->guc.id;
+			if (ge->lrc[0])
+				lrca = ge->lrc[0]->context_desc;
+		}
+	}
+
+	new->eng_class = xe_engine_class_to_guc_class(hwe->class);
+	new->eng_inst = hwe->instance;
+	new->guc_id = guc_id;
+	new->lrca = lrca;
+	new->is_partial = 0;
+	new->locked = 1;
+	new->source = XE_ENGINE_CAPTURE_SOURCE_MANUAL;
+
+	guc_capture_add_node_to_outlist(guc->capture, new);
+	devcoredump->snapshot.matched_node = new;
+}
+
 static struct guc_mmio_reg *
 guc_capture_find_reg(struct gcap_reg_list_info *reginfo, u32 addr, u32 flags)
 {
@@ -1609,7 +1733,7 @@ snapshot_print_by_list_order(struct xe_hw_engine_snapshot *snapshot, struct drm_
  *
  * This function prints out a given Xe HW Engine snapshot object.
  */
-void xe_engine_guc_capture_print(struct xe_hw_engine_snapshot *snapshot, struct drm_printer *p)
+void xe_engine_snapshot_print(struct xe_hw_engine_snapshot *snapshot, struct drm_printer *p)
 {
 	const char *grptype[GUC_STATE_CAPTURE_GROUP_TYPE_MAX] = {
 		"full-capture",
@@ -1648,6 +1772,8 @@ void xe_engine_guc_capture_print(struct xe_hw_engine_snapshot *snapshot, struct
 	drm_printf(p, "\tCoverage: %s\n", grptype[devcore_snapshot->matched_node->is_partial]);
 	drm_printf(p, "\tForcewake: domain 0x%x, ref %d\n",
 		   snapshot->forcewake.domain, snapshot->forcewake.ref);
+	drm_printf(p, "\tReserved: %s\n",
+		   str_yes_no(snapshot->kernel_reserved));
 
 	for (type = GUC_STATE_CAPTURE_TYPE_GLOBAL; type < GUC_STATE_CAPTURE_TYPE_MAX; type++) {
 		list = xe_guc_capture_get_reg_desc_list(gt, GUC_CAPTURE_LIST_INDEX_PF, type,
@@ -1757,8 +1883,27 @@ xe_engine_snapshot_capture_for_job(struct xe_sched_job *job)
 			continue;
 		}
 
-		if (!coredump->snapshot.hwe[id])
+		if (!coredump->snapshot.hwe[id]) {
 			coredump->snapshot.hwe[id] = xe_hw_engine_snapshot_capture(hwe, job);
+		} else {
+			struct __guc_capture_parsed_output *new;
+
+			new = xe_guc_capture_get_matching_and_lock(job);
+			if (new) {
+				struct xe_guc *guc =  &q->gt->uc.guc;
+
+				/*
+				 * If we are in here, it means we found a fresh
+				 * GuC-err-capture node for this engine after
+				 * previously failing to find a match in the
+				 * early part of guc_exec_queue_timedout_job.
+				 * Thus we must free the manually captured node
+				 */
+				guc_capture_free_outlist_node(guc->capture,
+							      coredump->snapshot.matched_node);
+				coredump->snapshot.matched_node = new;
+			}
+		}
 
 		break;
 	}
diff --git a/drivers/gpu/drm/xe/xe_guc_capture.h b/drivers/gpu/drm/xe/xe_guc_capture.h
index fe695ab08a74..97a795d13dd1 100644
--- a/drivers/gpu/drm/xe/xe_guc_capture.h
+++ b/drivers/gpu/drm/xe/xe_guc_capture.h
@@ -51,8 +51,9 @@ const struct __guc_mmio_reg_descr_group *
 xe_guc_capture_get_reg_desc_list(struct xe_gt *gt, u32 owner, u32 type,
 				 enum guc_capture_list_class_type capture_class, bool is_ext);
 struct __guc_capture_parsed_output *xe_guc_capture_get_matching_and_lock(struct xe_sched_job *job);
+void xe_engine_manual_capture(struct xe_hw_engine *hwe, struct xe_hw_engine_snapshot *snapshot);
+void xe_engine_snapshot_print(struct xe_hw_engine_snapshot *snapshot, struct drm_printer *p);
 void xe_engine_snapshot_capture_for_job(struct xe_sched_job *job);
-void xe_engine_guc_capture_print(struct xe_hw_engine_snapshot *snapshot, struct drm_printer *p);
 void xe_guc_capture_steered_list_init(struct xe_guc *guc);
 void xe_guc_capture_put_matched_nodes(struct xe_guc *guc);
 int xe_guc_capture_init(struct xe_guc *guc);
diff --git a/drivers/gpu/drm/xe/xe_hw_engine.c b/drivers/gpu/drm/xe/xe_hw_engine.c
index 18a06c2f78d1..1557acee3523 100644
--- a/drivers/gpu/drm/xe/xe_hw_engine.c
+++ b/drivers/gpu/drm/xe/xe_hw_engine.c
@@ -826,117 +826,6 @@ void xe_hw_engine_handle_irq(struct xe_hw_engine *hwe, u16 intr_vec)
 		xe_hw_fence_irq_run(hwe->fence_irq);
 }
 
-static bool
-is_slice_common_per_gslice(struct xe_device *xe)
-{
-	return GRAPHICS_VERx100(xe) >= 1255;
-}
-
-static void
-xe_hw_engine_snapshot_instdone_capture(struct xe_hw_engine *hwe,
-				       struct xe_hw_engine_snapshot *snapshot)
-{
-	struct xe_gt *gt = hwe->gt;
-	struct xe_mmio *mmio = &gt->mmio;
-	struct xe_device *xe = gt_to_xe(gt);
-	unsigned int dss;
-	u16 group, instance;
-
-	snapshot->reg.instdone.ring = xe_hw_engine_mmio_read32(hwe, RING_INSTDONE(0));
-
-	if (snapshot->hwe->class != XE_ENGINE_CLASS_RENDER)
-		return;
-
-	if (is_slice_common_per_gslice(xe) == false) {
-		snapshot->reg.instdone.slice_common[0] =
-			xe_mmio_read32(mmio, SC_INSTDONE);
-		snapshot->reg.instdone.slice_common_extra[0] =
-			xe_mmio_read32(mmio, SC_INSTDONE_EXTRA);
-		snapshot->reg.instdone.slice_common_extra2[0] =
-			xe_mmio_read32(mmio, SC_INSTDONE_EXTRA2);
-	} else {
-		for_each_geometry_dss(dss, gt, group, instance) {
-			snapshot->reg.instdone.slice_common[dss] =
-				xe_gt_mcr_unicast_read(gt, XEHPG_SC_INSTDONE, group, instance);
-			snapshot->reg.instdone.slice_common_extra[dss] =
-				xe_gt_mcr_unicast_read(gt, XEHPG_SC_INSTDONE_EXTRA, group, instance);
-			snapshot->reg.instdone.slice_common_extra2[dss] =
-				xe_gt_mcr_unicast_read(gt, XEHPG_SC_INSTDONE_EXTRA2, group, instance);
-		}
-	}
-
-	for_each_geometry_dss(dss, gt, group, instance) {
-		snapshot->reg.instdone.sampler[dss] =
-			xe_gt_mcr_unicast_read(gt, SAMPLER_INSTDONE, group, instance);
-		snapshot->reg.instdone.row[dss] =
-			xe_gt_mcr_unicast_read(gt, ROW_INSTDONE, group, instance);
-
-		if (GRAPHICS_VERx100(xe) >= 1255)
-			snapshot->reg.instdone.geom_svg[dss] =
-				xe_gt_mcr_unicast_read(gt, XEHPG_INSTDONE_GEOM_SVGUNIT,
-						       group, instance);
-	}
-}
-
-static void
-xe_hw_engine_manual_capture(struct xe_hw_engine *hwe, struct xe_hw_engine_snapshot *snapshot)
-{
-	u64 val;
-
-	snapshot->reg.ring_execlist_status =
-		xe_hw_engine_mmio_read32(hwe, RING_EXECLIST_STATUS_LO(0));
-	val = xe_hw_engine_mmio_read32(hwe, RING_EXECLIST_STATUS_HI(0));
-	snapshot->reg.ring_execlist_status |= val << 32;
-
-	snapshot->reg.ring_execlist_sq_contents =
-		xe_hw_engine_mmio_read32(hwe, RING_EXECLIST_SQ_CONTENTS_LO(0));
-	val = xe_hw_engine_mmio_read32(hwe, RING_EXECLIST_SQ_CONTENTS_HI(0));
-	snapshot->reg.ring_execlist_sq_contents |= val << 32;
-
-	snapshot->reg.ring_acthd = xe_hw_engine_mmio_read32(hwe, RING_ACTHD(0));
-	val = xe_hw_engine_mmio_read32(hwe, RING_ACTHD_UDW(0));
-	snapshot->reg.ring_acthd |= val << 32;
-
-	snapshot->reg.ring_bbaddr = xe_hw_engine_mmio_read32(hwe, RING_BBADDR(0));
-	val = xe_hw_engine_mmio_read32(hwe, RING_BBADDR_UDW(0));
-	snapshot->reg.ring_bbaddr |= val << 32;
-
-	snapshot->reg.ring_dma_fadd =
-		xe_hw_engine_mmio_read32(hwe, RING_DMA_FADD(0));
-	val = xe_hw_engine_mmio_read32(hwe, RING_DMA_FADD_UDW(0));
-	snapshot->reg.ring_dma_fadd |= val << 32;
-
-	snapshot->reg.ring_hwstam = xe_hw_engine_mmio_read32(hwe, RING_HWSTAM(0));
-	snapshot->reg.ring_hws_pga = xe_hw_engine_mmio_read32(hwe, RING_HWS_PGA(0));
-	snapshot->reg.ring_start = xe_hw_engine_mmio_read32(hwe, RING_START(0));
-	if (GRAPHICS_VERx100(hwe->gt->tile->xe) >= 2000) {
-		val = xe_hw_engine_mmio_read32(hwe, RING_START_UDW(0));
-		snapshot->reg.ring_start |= val << 32;
-	}
-	if (xe_gt_has_indirect_ring_state(hwe->gt)) {
-		snapshot->reg.indirect_ring_state =
-			xe_hw_engine_mmio_read32(hwe, INDIRECT_RING_STATE(0));
-	}
-
-	snapshot->reg.ring_head =
-		xe_hw_engine_mmio_read32(hwe, RING_HEAD(0)) & HEAD_ADDR;
-	snapshot->reg.ring_tail =
-		xe_hw_engine_mmio_read32(hwe, RING_TAIL(0)) & TAIL_ADDR;
-	snapshot->reg.ring_ctl = xe_hw_engine_mmio_read32(hwe, RING_CTL(0));
-	snapshot->reg.ring_mi_mode =
-		xe_hw_engine_mmio_read32(hwe, RING_MI_MODE(0));
-	snapshot->reg.ring_mode = xe_hw_engine_mmio_read32(hwe, RING_MODE(0));
-	snapshot->reg.ring_imr = xe_hw_engine_mmio_read32(hwe, RING_IMR(0));
-	snapshot->reg.ring_esr = xe_hw_engine_mmio_read32(hwe, RING_ESR(0));
-	snapshot->reg.ring_emr = xe_hw_engine_mmio_read32(hwe, RING_EMR(0));
-	snapshot->reg.ring_eir = xe_hw_engine_mmio_read32(hwe, RING_EIR(0));
-	snapshot->reg.ipehr = xe_hw_engine_mmio_read32(hwe, RING_IPEHR(0));
-	xe_hw_engine_snapshot_instdone_capture(hwe, snapshot);
-
-	if (snapshot->hwe->class == XE_ENGINE_CLASS_COMPUTE)
-		snapshot->reg.rcu_mode = xe_mmio_read32(&hwe->gt->mmio, RCU_MODE);
-}
-
 /**
  * xe_hw_engine_snapshot_capture - Take a quick snapshot of the HW Engine.
  * @hwe: Xe HW Engine.
@@ -952,7 +841,6 @@ struct xe_hw_engine_snapshot *
 xe_hw_engine_snapshot_capture(struct xe_hw_engine *hwe, struct xe_sched_job *job)
 {
 	struct xe_hw_engine_snapshot *snapshot;
-	size_t len;
 	struct __guc_capture_parsed_output *node;
 
 	if (!xe_hw_engine_is_valid(hwe))
@@ -963,28 +851,6 @@ xe_hw_engine_snapshot_capture(struct xe_hw_engine *hwe, struct xe_sched_job *job
 	if (!snapshot)
 		return NULL;
 
-	/* Because XE_MAX_DSS_FUSE_BITS is defined in xe_gt_types.h and it
-	 * includes xe_hw_engine_types.h the length of this 3 registers can't be
-	 * set in struct xe_hw_engine_snapshot, so here doing additional
-	 * allocations.
-	 */
-	len = (XE_MAX_DSS_FUSE_BITS * sizeof(u32));
-	snapshot->reg.instdone.slice_common = kzalloc(len, GFP_ATOMIC);
-	snapshot->reg.instdone.slice_common_extra = kzalloc(len, GFP_ATOMIC);
-	snapshot->reg.instdone.slice_common_extra2 = kzalloc(len, GFP_ATOMIC);
-	snapshot->reg.instdone.sampler = kzalloc(len, GFP_ATOMIC);
-	snapshot->reg.instdone.row = kzalloc(len, GFP_ATOMIC);
-	snapshot->reg.instdone.geom_svg = kzalloc(len, GFP_ATOMIC);
-	if (!snapshot->reg.instdone.slice_common ||
-	    !snapshot->reg.instdone.slice_common_extra ||
-	    !snapshot->reg.instdone.slice_common_extra2 ||
-	    !snapshot->reg.instdone.sampler ||
-	    !snapshot->reg.instdone.row ||
-	    !snapshot->reg.instdone.geom_svg) {
-		xe_hw_engine_snapshot_free(snapshot);
-		return NULL;
-	}
-
 	snapshot->name = kstrdup(hwe->name, GFP_ATOMIC);
 	snapshot->hwe = hwe;
 	snapshot->logical_instance = hwe->logical_instance;
@@ -1013,114 +879,13 @@ xe_hw_engine_snapshot_capture(struct xe_hw_engine *hwe, struct xe_sched_job *job
 	}
 
 	/* otherwise, do manual capture */
-	xe_hw_engine_manual_capture(hwe, snapshot);
+	xe_engine_manual_capture(hwe, snapshot);
 	snapshot->source = XE_ENGINE_CAPTURE_SOURCE_MANUAL;
 	xe_gt_dbg(hwe->gt, "Proceeding with manual engine snapshot");
 
 	return snapshot;
 }
 
-static void
-xe_hw_engine_snapshot_instdone_print(struct xe_hw_engine_snapshot *snapshot, struct drm_printer *p)
-{
-	struct xe_gt *gt = snapshot->hwe->gt;
-	struct xe_device *xe = gt_to_xe(gt);
-	u16 group, instance;
-	unsigned int dss;
-
-	drm_printf(p, "\tRING_INSTDONE: 0x%08x\n", snapshot->reg.instdone.ring);
-
-	if (snapshot->hwe->class != XE_ENGINE_CLASS_RENDER)
-		return;
-
-	if (is_slice_common_per_gslice(xe) == false) {
-		drm_printf(p, "\tSC_INSTDONE[0]: 0x%08x\n",
-			   snapshot->reg.instdone.slice_common[0]);
-		drm_printf(p, "\tSC_INSTDONE_EXTRA[0]: 0x%08x\n",
-			   snapshot->reg.instdone.slice_common_extra[0]);
-		drm_printf(p, "\tSC_INSTDONE_EXTRA2[0]: 0x%08x\n",
-			   snapshot->reg.instdone.slice_common_extra2[0]);
-	} else {
-		for_each_geometry_dss(dss, gt, group, instance) {
-			drm_printf(p, "\tSC_INSTDONE[%u]: 0x%08x\n", dss,
-				   snapshot->reg.instdone.slice_common[dss]);
-			drm_printf(p, "\tSC_INSTDONE_EXTRA[%u]: 0x%08x\n", dss,
-				   snapshot->reg.instdone.slice_common_extra[dss]);
-			drm_printf(p, "\tSC_INSTDONE_EXTRA2[%u]: 0x%08x\n", dss,
-				   snapshot->reg.instdone.slice_common_extra2[dss]);
-		}
-	}
-
-	for_each_geometry_dss(dss, gt, group, instance) {
-		drm_printf(p, "\tSAMPLER_INSTDONE[%u]: 0x%08x\n", dss,
-			   snapshot->reg.instdone.sampler[dss]);
-		drm_printf(p, "\tROW_INSTDONE[%u]: 0x%08x\n", dss,
-			   snapshot->reg.instdone.row[dss]);
-
-		if (GRAPHICS_VERx100(xe) >= 1255)
-			drm_printf(p, "\tINSTDONE_GEOM_SVGUNIT[%u]: 0x%08x\n",
-				   dss, snapshot->reg.instdone.geom_svg[dss]);
-	}
-}
-
-static void __xe_hw_engine_manual_print(struct xe_hw_engine_snapshot *snapshot,
-					struct drm_printer *p)
-{
-	drm_printf(p, "%s (physical), logical instance=%d\n",
-		   snapshot->name ? snapshot->name : "",
-		   snapshot->logical_instance);
-	drm_printf(p, "\tForcewake: domain 0x%x, ref %d\n",
-		   snapshot->forcewake.domain, snapshot->forcewake.ref);
-	drm_printf(p, "\tReserved: %s\n",
-		   str_yes_no(snapshot->kernel_reserved));
-	drm_printf(p, "\tHWSTAM: 0x%08x\n", snapshot->reg.ring_hwstam);
-	drm_printf(p, "\tRING_HWS_PGA: 0x%08x\n", snapshot->reg.ring_hws_pga);
-	drm_printf(p, "\tRING_EXECLIST_STATUS: 0x%016llx\n",
-		   snapshot->reg.ring_execlist_status);
-	drm_printf(p, "\tRING_EXECLIST_SQ_CONTENTS: 0x%016llx\n",
-		   snapshot->reg.ring_execlist_sq_contents);
-	drm_printf(p, "\tRING_START: 0x%016llx\n", snapshot->reg.ring_start);
-	drm_printf(p, "\tRING_HEAD: 0x%08x\n", snapshot->reg.ring_head);
-	drm_printf(p, "\tRING_TAIL: 0x%08x\n", snapshot->reg.ring_tail);
-	drm_printf(p, "\tRING_CTL: 0x%08x\n", snapshot->reg.ring_ctl);
-	drm_printf(p, "\tRING_MI_MODE: 0x%08x\n", snapshot->reg.ring_mi_mode);
-	drm_printf(p, "\tRING_MODE: 0x%08x\n",
-		   snapshot->reg.ring_mode);
-	drm_printf(p, "\tRING_IMR: 0x%08x\n", snapshot->reg.ring_imr);
-	drm_printf(p, "\tRING_ESR: 0x%08x\n", snapshot->reg.ring_esr);
-	drm_printf(p, "\tRING_EMR: 0x%08x\n", snapshot->reg.ring_emr);
-	drm_printf(p, "\tRING_EIR: 0x%08x\n", snapshot->reg.ring_eir);
-	drm_printf(p, "\tACTHD: 0x%016llx\n", snapshot->reg.ring_acthd);
-	drm_printf(p, "\tBBADDR: 0x%016llx\n", snapshot->reg.ring_bbaddr);
-	drm_printf(p, "\tDMA_FADDR: 0x%016llx\n", snapshot->reg.ring_dma_fadd);
-	drm_printf(p, "\tINDIRECT_RING_STATE: 0x%08x\n",
-		   snapshot->reg.indirect_ring_state);
-	drm_printf(p, "\tIPEHR: 0x%08x\n", snapshot->reg.ipehr);
-	xe_hw_engine_snapshot_instdone_print(snapshot, p);
-
-	if (snapshot->hwe->class == XE_ENGINE_CLASS_COMPUTE)
-		drm_printf(p, "\tRCU_MODE: 0x%08x\n",
-			   snapshot->reg.rcu_mode);
-}
-
-/**
- * xe_hw_engine_snapshot_print - Print out a given Xe HW Engine snapshot.
- * @snapshot: Xe HW Engine snapshot object.
- * @p: drm_printer where it will be printed out.
- *
- * This function prints out a given Xe HW Engine snapshot object.
- */
-void xe_hw_engine_snapshot_print(struct xe_hw_engine_snapshot *snapshot,
-				 struct drm_printer *p)
-{
-	if (!snapshot)
-		return;
-
-	if (snapshot->source == XE_ENGINE_CAPTURE_SOURCE_MANUAL)
-		__xe_hw_engine_manual_print(snapshot, p);
-	else
-		xe_engine_guc_capture_print(snapshot, p);
-}
 /**
  * xe_hw_engine_snapshot_free - Free all allocated objects for a given snapshot.
  * @snapshot: Xe HW Engine snapshot object.
@@ -1130,15 +895,18 @@ void xe_hw_engine_snapshot_print(struct xe_hw_engine_snapshot *snapshot,
  */
 void xe_hw_engine_snapshot_free(struct xe_hw_engine_snapshot *snapshot)
 {
+	struct xe_gt *gt;
 	if (!snapshot)
 		return;
 
-	kfree(snapshot->reg.instdone.slice_common);
-	kfree(snapshot->reg.instdone.slice_common_extra);
-	kfree(snapshot->reg.instdone.slice_common_extra2);
-	kfree(snapshot->reg.instdone.sampler);
-	kfree(snapshot->reg.instdone.row);
-	kfree(snapshot->reg.instdone.geom_svg);
+	gt = snapshot->hwe->gt;
+	/*
+	 * xe_guc_capture_put_matched_nodes is called here and from
+	 * xe_devcoredump_snapshot_free, to cover the 2 calling paths
+	 * of hw_engines - debugfs and devcoredump free.
+	 */
+	xe_guc_capture_put_matched_nodes(&gt->uc.guc);
+
 	kfree(snapshot->name);
 	kfree(snapshot);
 }
@@ -1155,7 +923,7 @@ void xe_hw_engine_print(struct xe_hw_engine *hwe, struct drm_printer *p)
 	struct xe_hw_engine_snapshot *snapshot;
 
 	snapshot = xe_hw_engine_snapshot_capture(hwe, NULL);
-	xe_hw_engine_snapshot_print(snapshot, p);
+	xe_engine_snapshot_print(snapshot, p);
 	xe_hw_engine_snapshot_free(snapshot);
 }
 
diff --git a/drivers/gpu/drm/xe/xe_hw_engine.h b/drivers/gpu/drm/xe/xe_hw_engine.h
index c2428326a366..da0a6922a26f 100644
--- a/drivers/gpu/drm/xe/xe_hw_engine.h
+++ b/drivers/gpu/drm/xe/xe_hw_engine.h
@@ -58,8 +58,6 @@ u32 xe_hw_engine_mask_per_class(struct xe_gt *gt,
 struct xe_hw_engine_snapshot *
 xe_hw_engine_snapshot_capture(struct xe_hw_engine *hwe, struct xe_sched_job *job);
 void xe_hw_engine_snapshot_free(struct xe_hw_engine_snapshot *snapshot);
-void xe_hw_engine_snapshot_print(struct xe_hw_engine_snapshot *snapshot,
-				 struct drm_printer *p);
 void xe_hw_engine_print(struct xe_hw_engine *hwe, struct drm_printer *p);
 void xe_hw_engine_setup_default_lrc_state(struct xe_hw_engine *hwe);
 
diff --git a/drivers/gpu/drm/xe/xe_hw_engine_types.h b/drivers/gpu/drm/xe/xe_hw_engine_types.h
index 55805c78d9d1..719f27ef00a5 100644
--- a/drivers/gpu/drm/xe/xe_hw_engine_types.h
+++ b/drivers/gpu/drm/xe/xe_hw_engine_types.h
@@ -182,65 +182,6 @@ struct xe_hw_engine_snapshot {
 	u32 mmio_base;
 	/** @kernel_reserved: Engine reserved, can't be used by userspace */
 	bool kernel_reserved;
-	/** @reg: Useful MMIO register snapshot */
-	struct {
-		/** @reg.ring_execlist_status: RING_EXECLIST_STATUS */
-		u64 ring_execlist_status;
-		/** @reg.ring_execlist_sq_contents: RING_EXECLIST_SQ_CONTENTS */
-		u64 ring_execlist_sq_contents;
-		/** @reg.ring_acthd: RING_ACTHD */
-		u64 ring_acthd;
-		/** @reg.ring_bbaddr: RING_BBADDR */
-		u64 ring_bbaddr;
-		/** @reg.ring_dma_fadd: RING_DMA_FADD */
-		u64 ring_dma_fadd;
-		/** @reg.ring_hwstam: RING_HWSTAM */
-		u32 ring_hwstam;
-		/** @reg.ring_hws_pga: RING_HWS_PGA */
-		u32 ring_hws_pga;
-		/** @reg.ring_start: RING_START */
-		u64 ring_start;
-		/** @reg.ring_head: RING_HEAD */
-		u32 ring_head;
-		/** @reg.ring_tail: RING_TAIL */
-		u32 ring_tail;
-		/** @reg.ring_ctl: RING_CTL */
-		u32 ring_ctl;
-		/** @reg.ring_mi_mode: RING_MI_MODE */
-		u32 ring_mi_mode;
-		/** @reg.ring_mode: RING_MODE */
-		u32 ring_mode;
-		/** @reg.ring_imr: RING_IMR */
-		u32 ring_imr;
-		/** @reg.ring_esr: RING_ESR */
-		u32 ring_esr;
-		/** @reg.ring_emr: RING_EMR */
-		u32 ring_emr;
-		/** @reg.ring_eir: RING_EIR */
-		u32 ring_eir;
-		/** @reg.indirect_ring_state: INDIRECT_RING_STATE */
-		u32 indirect_ring_state;
-		/** @reg.ipehr: IPEHR */
-		u32 ipehr;
-		/** @reg.rcu_mode: RCU_MODE */
-		u32 rcu_mode;
-		struct {
-			/** @reg.instdone.ring: RING_INSTDONE */
-			u32 ring;
-			/** @reg.instdone.slice_common: SC_INSTDONE */
-			u32 *slice_common;
-			/** @reg.instdone.slice_common_extra: SC_INSTDONE_EXTRA */
-			u32 *slice_common_extra;
-			/** @reg.instdone.slice_common_extra2: SC_INSTDONE_EXTRA2 */
-			u32 *slice_common_extra2;
-			/** @reg.instdone.sampler: SAMPLER_INSTDONE */
-			u32 *sampler;
-			/** @reg.instdone.row: ROW_INSTDONE */
-			u32 *row;
-			/** @reg.instdone.geom_svg: INSTDONE_GEOM_SVGUNIT */
-			u32 *geom_svg;
-		} instdone;
-	} reg;
 };
 
 #endif
-- 
2.51.0


From 67ec9f87bd6c57db1251bb2244d242f7ca5a0b6a Mon Sep 17 00:00:00 2001
From: Matthew Auld <matthew.auld@intel.com>
Date: Mon, 7 Oct 2024 08:45:42 +0100
Subject: [PATCH 02/16] drm/xe/bmg: improve cache flushing behaviour

The BSpec says that EN_L3_RW_CCS_CACHE_FLUSH must be toggled
on for manual global invalidation to take effect and actually flush
device cache, however this also turns on flushing for things like
pipecontrol, which occurs between submissions for compute/render. This
sounds like massive overkill for our needs, where we already have the
manual flushing on the display side with the global invalidation. Some
observations on BMG:

1. Disabling l2 caching for host writes and stubbing out the driver
   global invalidation but keeping EN_L3_RW_CCS_CACHE_FLUSH enabled, has
   no impact on wb-transient-vs-display IGT, which makes sense since the
   pipecontrol is now flushing the device cache after the render copy.
   Without EN_L3_RW_CCS_CACHE_FLUSH the test then fails, which is also
   expected since device cache is now dirty and display engine can't see
   the writes.

2. Disabling EN_L3_RW_CCS_CACHE_FLUSH, but keeping the driver global
   invalidation also has no impact on wb-transient-vs-display. This
   suggests that the global invalidation still works as expected and is
   flushing the device cache without EN_L3_RW_CCS_CACHE_FLUSH turned on.

With that drop EN_L3_RW_CCS_CACHE_FLUSH. This helps some workloads since
we no longer flush the device cache between submissions as part of
pipecontrol.

Edit: We now also have clarification from HW side that BSpec was indeed
wrong here.

v2:
  - Rebase and update commit message.

BSpec: 71718
Signed-off-by: Matthew Auld <matthew.auld@intel.com>
Cc: Vitasta Wattal <vitasta.wattal@intel.com>
Cc: Matt Roper <matthew.d.roper@intel.com>
Cc: Nirmoy Das <nirmoy.das@intel.com>
Reviewed-by: Nirmoy Das <nirmoy.das@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20241007074541.33937-2-matthew.auld@intel.com
---
 drivers/gpu/drm/xe/regs/xe_gt_regs.h | 3 ---
 drivers/gpu/drm/xe/xe_gt.c           | 1 -
 2 files changed, 4 deletions(-)

diff --git a/drivers/gpu/drm/xe/regs/xe_gt_regs.h b/drivers/gpu/drm/xe/regs/xe_gt_regs.h
index 55092089bff6..f531eeeb8957 100644
--- a/drivers/gpu/drm/xe/regs/xe_gt_regs.h
+++ b/drivers/gpu/drm/xe/regs/xe_gt_regs.h
@@ -401,9 +401,6 @@
 
 #define XE2_GLOBAL_INVAL			XE_REG(0xb404)
 
-#define SCRATCH1LPFC				XE_REG(0xb474)
-#define   EN_L3_RW_CCS_CACHE_FLUSH		REG_BIT(0)
-
 #define XE2LPM_L3SQCREG2			XE_REG_MCR(0xb604)
 
 #define XE2LPM_L3SQCREG3			XE_REG_MCR(0xb608)
diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c
index 2d5e78311b76..1c79660fb086 100644
--- a/drivers/gpu/drm/xe/xe_gt.c
+++ b/drivers/gpu/drm/xe/xe_gt.c
@@ -108,7 +108,6 @@ static void xe_gt_enable_host_l2_vram(struct xe_gt *gt)
 		return;
 
 	if (!xe_gt_is_media_type(gt)) {
-		xe_mmio_write32(&gt->mmio, SCRATCH1LPFC, EN_L3_RW_CCS_CACHE_FLUSH);
 		reg = xe_gt_mcr_unicast_read_any(gt, XE2_GAMREQSTRM_CTRL);
 		reg |= CG_DIS_CNTLBUS;
 		xe_gt_mcr_multicast_write(gt, XE2_GAMREQSTRM_CTRL, reg);
-- 
2.51.0


From 7a7593e5885bc172050a75ddf2bb6aeb96c8a8a0 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Thomas=20Hellstr=C3=B6m?= <thomas.hellstrom@linux.intel.com>
Date: Fri, 4 Oct 2024 16:11:20 +0200
Subject: [PATCH 03/16] drm/xe/tests: Fix the shrinker test compiler warnings.
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

The xe_bo_shrink_kunit test has an uninitialized value and illegal
integer size conversions on 32-bit. Fix.

v2:
- Use div64_u64 to ensure the u64 division compiles everywhere. (Matt Auld)

Reported-by: Nathan Chancellor <nathan@kernel.org>
Closes: https://lore.kernel.org/20240913195649.GA61514@thelio-3990X/
Fixes: 5a90b60db5e6 ("drm/xe: Add a xe_bo subtest for shrinking / swapping")
Cc: dri-devel@lists.freedesktop.org
Cc: Matthew Auld <matthew.auld@intel.com>
Reviewed-by: Matthew Auld <matthew.auld@intel.com> #v1
Signed-off-by: Thomas HellstrÃ¶m <thomas.hellstrom@linux.intel.com>
Reviewed-by: Matthew Auld <matthew.auld@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20241004141121.186177-1-thomas.hellstrom@linux.intel.com
---
 drivers/gpu/drm/xe/tests/xe_bo.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/xe/tests/xe_bo.c b/drivers/gpu/drm/xe/tests/xe_bo.c
index 7d3fd720478b..cd811aa2b227 100644
--- a/drivers/gpu/drm/xe/tests/xe_bo.c
+++ b/drivers/gpu/drm/xe/tests/xe_bo.c
@@ -7,6 +7,7 @@
 #include <kunit/visibility.h>
 
 #include <linux/iosys-map.h>
+#include <linux/math64.h>
 #include <linux/random.h>
 #include <linux/swap.h>
 
@@ -440,7 +441,7 @@ static int shrink_test_run_device(struct xe_device *xe)
 	LIST_HEAD(bos);
 	struct xe_bo_link *link, *next;
 	struct sysinfo si;
-	size_t ram, ram_and_swap, purgeable, alloced, to_alloc, limit;
+	u64 ram, ram_and_swap, purgeable = 0, alloced, to_alloc, limit;
 	unsigned int interrupted = 0, successful = 0, count = 0;
 	struct rnd_state prng;
 	u64 rand_seed;
@@ -469,7 +470,7 @@ static int shrink_test_run_device(struct xe_device *xe)
 	ram_and_swap = ram + get_nr_swap_pages() * PAGE_SIZE;
 	if (to_alloc > ram_and_swap)
 		purgeable = to_alloc - ram_and_swap;
-	purgeable += purgeable / 5;
+	purgeable += div64_u64(purgeable, 5);
 
 	kunit_info(test, "Free ram is %lu bytes. Will allocate twice of that.\n",
 		   (unsigned long)ram);
-- 
2.51.0


From 081cb8948cfe322076cd23f22f85ba68f73e2c4b Mon Sep 17 00:00:00 2001
From: Gustavo Sousa <gustavo.sousa@intel.com>
Date: Tue, 8 Oct 2024 13:46:26 -0700
Subject: [PATCH 04/16] drm/xe/xe3: Add initial set of workarounds

Implement the initial set of workarounds for Xe3 IPs.

Signed-off-by: Gustavo Sousa <gustavo.sousa@intel.com>
Signed-off-by: Matt Atwood <matthew.s.atwood@intel.com>
Reviewed-by: Matt Roper <matthew.d.roper@intel.com>
Signed-off-by: Matt Roper <matthew.d.roper@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20241008204626.55802-2-matthew.s.atwood@intel.com
---
 drivers/gpu/drm/xe/regs/xe_engine_regs.h |  1 +
 drivers/gpu/drm/xe/regs/xe_gt_regs.h     |  3 ++
 drivers/gpu/drm/xe/xe_wa.c               | 47 ++++++++++++++++++++++++
 drivers/gpu/drm/xe/xe_wa_oob.rules       |  1 +
 4 files changed, 52 insertions(+)

diff --git a/drivers/gpu/drm/xe/regs/xe_engine_regs.h b/drivers/gpu/drm/xe/regs/xe_engine_regs.h
index 81b71903675e..7c78496e6213 100644
--- a/drivers/gpu/drm/xe/regs/xe_engine_regs.h
+++ b/drivers/gpu/drm/xe/regs/xe_engine_regs.h
@@ -186,6 +186,7 @@
 
 #define VDBOX_CGCTL3F10(base)			XE_REG((base) + 0x3f10)
 #define   IECPUNIT_CLKGATE_DIS			REG_BIT(22)
+#define   RAMDFTUNIT_CLKGATE_DIS		REG_BIT(9)
 
 #define VDBOX_CGCTL3F18(base)			XE_REG((base) + 0x3f18)
 #define   ALNUNIT_CLKGATE_DIS			REG_BIT(13)
diff --git a/drivers/gpu/drm/xe/regs/xe_gt_regs.h b/drivers/gpu/drm/xe/regs/xe_gt_regs.h
index f531eeeb8957..42dc55cb23f4 100644
--- a/drivers/gpu/drm/xe/regs/xe_gt_regs.h
+++ b/drivers/gpu/drm/xe/regs/xe_gt_regs.h
@@ -286,6 +286,9 @@
 #define   GAMTLBVEBOX0_CLKGATE_DIS		REG_BIT(16)
 #define   LTCDD_CLKGATE_DIS			REG_BIT(10)
 
+#define UNSLCGCTL9454				XE_REG(0x9454)
+#define   LSCFE_CLKGATE_DIS			REG_BIT(4)
+
 #define XEHP_SLICE_UNIT_LEVEL_CLKGATE		XE_REG_MCR(0x94d4)
 #define   L3_CR2X_CLKGATE_DIS			REG_BIT(17)
 #define   L3_CLKGATE_DIS			REG_BIT(16)
diff --git a/drivers/gpu/drm/xe/xe_wa.c b/drivers/gpu/drm/xe/xe_wa.c
index 94ea76b098ed..0ee532eb26d6 100644
--- a/drivers/gpu/drm/xe/xe_wa.c
+++ b/drivers/gpu/drm/xe/xe_wa.c
@@ -252,6 +252,34 @@ static const struct xe_rtp_entry_sr gt_was[] = {
 	  XE_RTP_ENTRY_FLAG(FOREACH_ENGINE),
 	},
 
+	/* Xe3_LPG */
+
+	{ XE_RTP_NAME("14021871409"),
+	  XE_RTP_RULES(GRAPHICS_VERSION(3000), GRAPHICS_STEP(A0, B0)),
+	  XE_RTP_ACTIONS(SET(UNSLCGCTL9454, LSCFE_CLKGATE_DIS))
+	},
+
+	/* Xe3_LPM */
+
+	{ XE_RTP_NAME("16021867713"),
+	  XE_RTP_RULES(MEDIA_VERSION(3000),
+		       ENGINE_CLASS(VIDEO_DECODE)),
+	  XE_RTP_ACTIONS(SET(VDBOX_CGCTL3F1C(0), MFXPIPE_CLKGATE_DIS)),
+	  XE_RTP_ENTRY_FLAG(FOREACH_ENGINE),
+	},
+	{ XE_RTP_NAME("16021865536"),
+	  XE_RTP_RULES(MEDIA_VERSION(3000),
+		       ENGINE_CLASS(VIDEO_DECODE)),
+	  XE_RTP_ACTIONS(SET(VDBOX_CGCTL3F10(0), IECPUNIT_CLKGATE_DIS)),
+	  XE_RTP_ENTRY_FLAG(FOREACH_ENGINE),
+	},
+	{ XE_RTP_NAME("14021486841"),
+	  XE_RTP_RULES(MEDIA_VERSION(3000), MEDIA_STEP(A0, B0),
+		       ENGINE_CLASS(VIDEO_DECODE)),
+	  XE_RTP_ACTIONS(SET(VDBOX_CGCTL3F10(0), RAMDFTUNIT_CLKGATE_DIS)),
+	  XE_RTP_ENTRY_FLAG(FOREACH_ENGINE),
+	},
+
 	{}
 };
 
@@ -568,6 +596,13 @@ static const struct xe_rtp_entry_sr engine_was[] = {
 			     XE_RTP_ACTION_FLAG(ENGINE_BASE)))
 	},
 
+	/* Xe3_LPG */
+
+	{ XE_RTP_NAME("14021402888"),
+	  XE_RTP_RULES(GRAPHICS_VERSION_RANGE(3000, 3001), FUNC(xe_rtp_match_first_render_or_compute)),
+	  XE_RTP_ACTIONS(SET(HALF_SLICE_CHICKEN7, CLEAR_OPTIMIZATION_DISABLE))
+	},
+
 	{}
 };
 
@@ -739,6 +774,18 @@ static const struct xe_rtp_entry_sr lrc_was[] = {
 	  XE_RTP_ACTIONS(SET(CHICKEN_RASTER_1, DIS_CLIP_NEGATIVE_BOUNDING_BOX))
 	},
 
+	/* Xe3_LPG */
+	{ XE_RTP_NAME("14021490052"),
+	  XE_RTP_RULES(GRAPHICS_VERSION(3000), GRAPHICS_STEP(A0, B0),
+		       ENGINE_CLASS(RENDER)),
+	  XE_RTP_ACTIONS(SET(FF_MODE,
+			     DIS_MESH_PARTIAL_AUTOSTRIP |
+			     DIS_MESH_AUTOSTRIP),
+			 SET(VFLSKPD,
+			     DIS_PARTIAL_AUTOSTRIP |
+			     DIS_AUTOSTRIP))
+	},
+
 	{}
 };
 
diff --git a/drivers/gpu/drm/xe/xe_wa_oob.rules b/drivers/gpu/drm/xe/xe_wa_oob.rules
index 0154fbe154e9..264d6e116499 100644
--- a/drivers/gpu/drm/xe/xe_wa_oob.rules
+++ b/drivers/gpu/drm/xe/xe_wa_oob.rules
@@ -33,6 +33,7 @@
 		GRAPHICS_VERSION(2004)
 22019338487	MEDIA_VERSION(2000)
 		GRAPHICS_VERSION(2001)
+		MEDIA_VERSION(3000), MEDIA_STEP(A0, B0)
 22019338487_display	PLATFORM(LUNARLAKE)
 16023588340	GRAPHICS_VERSION(2001)
 14019789679	GRAPHICS_VERSION(1255)
-- 
2.51.0


From 8fb1da9f9bfb02f710a7f826d50781b0b030cf53 Mon Sep 17 00:00:00 2001
From: Aradhya Bhatia <aradhya.bhatia@intel.com>
Date: Wed, 9 Oct 2024 12:25:42 +0530
Subject: [PATCH 05/16] drm/xe/xe2lpg: Extend Wa_15016589081 for xe2lpg

Add workaround (wa) 15016589081 which applies to Xe2_v3_LPG_MD.

Xe2_v3_LPG_MD is a Lunar Lake platform with GFX version: 20.04.
This wa is type: permanent, and hence is applicable on all steppings.

Signed-off-by: Aradhya Bhatia <aradhya.bhatia@intel.com>
Reviewed-by: Tejas Upadhyay <tejas.upadhyay@intel.com>
Signed-off-by: Matt Roper <matthew.d.roper@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20241009065542.283151-1-aradhya.bhatia@intel.com
---
 drivers/gpu/drm/xe/xe_wa.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/gpu/drm/xe/xe_wa.c b/drivers/gpu/drm/xe/xe_wa.c
index 0ee532eb26d6..0c90438e8e72 100644
--- a/drivers/gpu/drm/xe/xe_wa.c
+++ b/drivers/gpu/drm/xe/xe_wa.c
@@ -746,6 +746,10 @@ static const struct xe_rtp_entry_sr lrc_was[] = {
 			     DIS_PARTIAL_AUTOSTRIP |
 			     DIS_AUTOSTRIP))
 	},
+	{ XE_RTP_NAME("15016589081"),
+	  XE_RTP_RULES(GRAPHICS_VERSION(2004), ENGINE_CLASS(RENDER)),
+	  XE_RTP_ACTIONS(SET(CHICKEN_RASTER_1, DIS_CLIP_NEGATIVE_BOUNDING_BOX))
+	},
 
 	/* Xe2_HPG */
 	{ XE_RTP_NAME("15010599737"),
-- 
2.51.0


From cfcbc0520d5055825f0647ab922b655688605183 Mon Sep 17 00:00:00 2001
From: Matthew Auld <matthew.auld@intel.com>
Date: Wed, 9 Oct 2024 09:48:09 +0100
Subject: [PATCH 06/16] drm/xe: fix unbalanced rpm put() with fence_fini()

Currently we can call fence_fini() twice if something goes wrong when
sending the GuC CT for the tlb request, since we signal the fence and
return an error, leading to the caller also calling fini() on the error
path in the case of stack version of the flow, which leads to an extra
rpm put() which might later cause device to enter suspend when it
shouldn't. It looks like we can just drop the fini() call since the
fence signaller side will already call this for us.

There are known mysterious splats with device going to sleep even with
an rpm ref, and this could be one candidate.

v2 (Matt B):
  - Prefer warning if we detect double fini()

Fixes: 0a382f9bc5dc ("drm/xe: Hold a PM ref when GT TLB invalidations are inflight")
Signed-off-by: Matthew Auld <matthew.auld@intel.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Nirmoy Das <nirmoy.das@intel.com>
Reviewed-by: Matthew Brost <matthew.brost@intel.com>
Reviewed-by: Nirmoy Das <nirmoy.das@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20241009084808.204432-3-matthew.auld@intel.com
---
 drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c | 29 +++++++++------------
 drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h |  1 -
 drivers/gpu/drm/xe/xe_vm.c                  |  8 ++----
 3 files changed, 15 insertions(+), 23 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
index 98616de0c5bb..a530a933eedc 100644
--- a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
+++ b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
@@ -37,6 +37,15 @@ static long tlb_timeout_jiffies(struct xe_gt *gt)
 	return hw_tlb_timeout + 2 * delay;
 }
 
+static void xe_gt_tlb_invalidation_fence_fini(struct xe_gt_tlb_invalidation_fence *fence)
+{
+	if (WARN_ON_ONCE(!fence->gt))
+		return;
+
+	xe_pm_runtime_put(gt_to_xe(fence->gt));
+	fence->gt = NULL; /* fini() should be called once */
+}
+
 static void
 __invalidation_fence_signal(struct xe_device *xe, struct xe_gt_tlb_invalidation_fence *fence)
 {
@@ -204,7 +213,7 @@ static int send_tlb_invalidation(struct xe_guc *guc,
 						   tlb_timeout_jiffies(gt));
 		}
 		spin_unlock_irq(&gt->tlb_invalidation.pending_lock);
-	} else if (ret < 0) {
+	} else {
 		__invalidation_fence_signal(xe, fence);
 	}
 	if (!ret) {
@@ -267,10 +276,8 @@ int xe_gt_tlb_invalidation_ggtt(struct xe_gt *gt)
 
 		xe_gt_tlb_invalidation_fence_init(gt, &fence, true);
 		ret = xe_gt_tlb_invalidation_guc(gt, &fence);
-		if (ret < 0) {
-			xe_gt_tlb_invalidation_fence_fini(&fence);
+		if (ret)
 			return ret;
-		}
 
 		xe_gt_tlb_invalidation_fence_wait(&fence);
 	} else if (xe_device_uc_enabled(xe) && !xe_device_wedged(xe)) {
@@ -498,7 +505,8 @@ static const struct dma_fence_ops invalidation_fence_ops = {
  * @stack: fence is stack variable
  *
  * Initialize TLB invalidation fence for use. xe_gt_tlb_invalidation_fence_fini
- * must be called if fence is not signaled.
+ * will be automatically called when fence is signalled (all fences must signal),
+ * even on error.
  */
 void xe_gt_tlb_invalidation_fence_init(struct xe_gt *gt,
 				       struct xe_gt_tlb_invalidation_fence *fence,
@@ -518,14 +526,3 @@ void xe_gt_tlb_invalidation_fence_init(struct xe_gt *gt,
 		dma_fence_get(&fence->base);
 	fence->gt = gt;
 }
-
-/**
- * xe_gt_tlb_invalidation_fence_fini - Finalize TLB invalidation fence
- * @fence: TLB invalidation fence to finalize
- *
- * Drop PM ref which fence took durinig init.
- */
-void xe_gt_tlb_invalidation_fence_fini(struct xe_gt_tlb_invalidation_fence *fence)
-{
-	xe_pm_runtime_put(gt_to_xe(fence->gt));
-}
diff --git a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h
index a84065fa324c..f430d5797af7 100644
--- a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h
+++ b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h
@@ -28,7 +28,6 @@ int xe_guc_tlb_invalidation_done_handler(struct xe_guc *guc, u32 *msg, u32 len);
 void xe_gt_tlb_invalidation_fence_init(struct xe_gt *gt,
 				       struct xe_gt_tlb_invalidation_fence *fence,
 				       bool stack);
-void xe_gt_tlb_invalidation_fence_fini(struct xe_gt_tlb_invalidation_fence *fence);
 
 static inline void
 xe_gt_tlb_invalidation_fence_wait(struct xe_gt_tlb_invalidation_fence *fence)
diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
index ce9dca4d4e87..c99380271de6 100644
--- a/drivers/gpu/drm/xe/xe_vm.c
+++ b/drivers/gpu/drm/xe/xe_vm.c
@@ -3199,10 +3199,8 @@ int xe_vm_invalidate_vma(struct xe_vma *vma)
 
 			ret = xe_gt_tlb_invalidation_vma(tile->primary_gt,
 							 &fence[fence_id], vma);
-			if (ret < 0) {
-				xe_gt_tlb_invalidation_fence_fini(&fence[fence_id]);
+			if (ret)
 				goto wait;
-			}
 			++fence_id;
 
 			if (!tile->media_gt)
@@ -3214,10 +3212,8 @@ int xe_vm_invalidate_vma(struct xe_vma *vma)
 
 			ret = xe_gt_tlb_invalidation_vma(tile->media_gt,
 							 &fence[fence_id], vma);
-			if (ret < 0) {
-				xe_gt_tlb_invalidation_fence_fini(&fence[fence_id]);
+			if (ret)
 				goto wait;
-			}
 			++fence_id;
 		}
 	}
-- 
2.51.0


From a187c1b0a800565a4db6372268692aff99df7f53 Mon Sep 17 00:00:00 2001
From: Matthew Auld <matthew.auld@intel.com>
Date: Wed, 9 Oct 2024 09:48:10 +0100
Subject: [PATCH 07/16] drm/xe: fix unbalanced rpm put() with declare_wedged()

Technically the or_reset() means we call the action on failure, however
that would lead to unbalanced rpm put(). Move the get() earlier to fix
this. It should be extremely unlikely to ever trigger this in practice.

Fixes: 452bca0edbd0 ("drm/xe: Don't suspend device upon wedge")
Signed-off-by: Matthew Auld <matthew.auld@intel.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Nirmoy Das <nirmoy.das@intel.com>
Reviewed-by: Matthew Brost <matthew.brost@intel.com>
Reviewed-by: Nirmoy Das <nirmoy.das@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20241009084808.204432-4-matthew.auld@intel.com
---
 drivers/gpu/drm/xe/xe_device.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
index cd241a8e1838..962751c966d1 100644
--- a/drivers/gpu/drm/xe/xe_device.c
+++ b/drivers/gpu/drm/xe/xe_device.c
@@ -1016,13 +1016,13 @@ void xe_device_declare_wedged(struct xe_device *xe)
 		return;
 	}
 
+	xe_pm_runtime_get_noresume(xe);
+
 	if (drmm_add_action_or_reset(&xe->drm, xe_device_wedged_fini, xe)) {
 		drm_err(&xe->drm, "Failed to register xe_device_wedged_fini clean-up. Although device is wedged.\n");
 		return;
 	}
 
-	xe_pm_runtime_get_noresume(xe);
-
 	if (!atomic_xchg(&xe->wedged.flag, 1)) {
 		xe->needs_flr_on_fini = true;
 		drm_err(&xe->drm,
-- 
2.51.0


From 46bcb0a1214ac6677df8660ac0f6bdf1eff27e8f Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.i.king@gmail.com>
Date: Wed, 9 Oct 2024 17:05:10 +0100
Subject: [PATCH 08/16] drm/xe/guc: Fix inverted logic on snapshot->copy check
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Currently the check to see if snapshot->copy has been allocated is
inverted and ends up dereferencing snapshot->copy when free'ing
objects in the array when it is null or not free'ing the objects
when snapshot->copy is allocated. Fix this by using the correct
non-null pointer check logic.

Fixes: d8ce1a977226 ("drm/xe/guc: Use a two stage dump for GuC logs and add more info")
Signed-off-by: Colin Ian King <colin.i.king@gmail.com>
Reviewed-by: John Harrison <John.C.Harrison@Intel.com>
Signed-off-by: Thomas HellstrÃ¶m <thomas.hellstrom@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20241009160510.372195-1-colin.i.king@gmail.com
---
 drivers/gpu/drm/xe/xe_guc_log.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/xe/xe_guc_log.c b/drivers/gpu/drm/xe/xe_guc_log.c
index 93921f04153f..cc70f448d879 100644
--- a/drivers/gpu/drm/xe/xe_guc_log.c
+++ b/drivers/gpu/drm/xe/xe_guc_log.c
@@ -122,7 +122,7 @@ void xe_guc_log_snapshot_free(struct xe_guc_log_snapshot *snapshot)
 	if (!snapshot)
 		return;
 
-	if (!snapshot->copy) {
+	if (snapshot->copy) {
 		for (i = 0; i < snapshot->num_chunks; i++)
 			kfree(snapshot->copy[i]);
 		kfree(snapshot->copy);
-- 
2.51.0


From a4de6beb83fc5adee788518350247c629568901e Mon Sep 17 00:00:00 2001
From: Imre Deak <imre.deak@intel.com>
Date: Wed, 9 Oct 2024 22:43:57 +0300
Subject: [PATCH 09/16] drm/xe/display: Separate the d3cold and non-d3cold
 runtime PM handling

For clarity separate the d3cold and non-d3cold runtime PM handling. The
only change in behavior is disabling polling later during runtime
resume. This shouldn't make a difference, since the poll disabling is
handled from a work, which could run at any point wrt. the runtime
resume handler. The work will also require a runtime PM reference,
syncing it with the resume handler.

Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Reviewed-by: Jonathan Cavitt <jonathan.cavitt@intel.com>
Signed-off-by: Imre Deak <imre.deak@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20241009194358.1321200-4-imre.deak@intel.com
---
 drivers/gpu/drm/xe/display/xe_display.c | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/xe/display/xe_display.c b/drivers/gpu/drm/xe/display/xe_display.c
index 26b2cae11d46..9ef75a20c3c0 100644
--- a/drivers/gpu/drm/xe/display/xe_display.c
+++ b/drivers/gpu/drm/xe/display/xe_display.c
@@ -341,6 +341,9 @@ static void __xe_display_pm_suspend(struct xe_device *xe, bool runtime)
 	intel_opregion_suspend(display, s2idle ? PCI_D1 : PCI_D3cold);
 
 	intel_dmc_suspend(xe);
+
+	if (runtime && has_display(xe))
+		intel_hpd_poll_enable(xe);
 }
 
 void xe_display_pm_suspend(struct xe_device *xe)
@@ -383,8 +386,10 @@ void xe_display_pm_runtime_suspend(struct xe_device *xe)
 	if (!xe->info.probe_display)
 		return;
 
-	if (xe->d3cold.allowed)
+	if (xe->d3cold.allowed) {
 		__xe_display_pm_suspend(xe, true);
+		return;
+	}
 
 	intel_hpd_poll_enable(xe);
 }
@@ -447,9 +452,11 @@ static void __xe_display_pm_resume(struct xe_device *xe, bool runtime)
 		intel_display_driver_resume(xe);
 		drm_kms_helper_poll_enable(&xe->drm);
 		intel_display_driver_enable_user_access(xe);
-		intel_hpd_poll_disable(xe);
 	}
 
+	if (has_display(xe))
+		intel_hpd_poll_disable(xe);
+
 	intel_opregion_resume(display);
 
 	intel_fbdev_set_suspend(&xe->drm, FBINFO_STATE_RUNNING, false);
@@ -467,10 +474,12 @@ void xe_display_pm_runtime_resume(struct xe_device *xe)
 	if (!xe->info.probe_display)
 		return;
 
-	intel_hpd_poll_disable(xe);
-
-	if (xe->d3cold.allowed)
+	if (xe->d3cold.allowed) {
 		__xe_display_pm_resume(xe, true);
+		return;
+	}
+
+	intel_hpd_poll_disable(xe);
 }
 
 
-- 
2.51.0


From bbc4a30de095f0349d3c278500345a1b620d495e Mon Sep 17 00:00:00 2001
From: Imre Deak <imre.deak@intel.com>
Date: Wed, 9 Oct 2024 22:43:58 +0300
Subject: [PATCH 10/16] drm/xe/display: Add missing HPD interrupt enabling
 during non-d3cold RPM resume

Atm the display HPD interrupts that got disabled during runtime
suspend, are re-enabled only if d3cold is enabled. Fix things by
also re-enabling the interrupts if d3cold is disabled.

Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Reviewed-by: Jonathan Cavitt <jonathan.cavitt@intel.com>
Signed-off-by: Imre Deak <imre.deak@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20241009194358.1321200-5-imre.deak@intel.com
---
 drivers/gpu/drm/xe/display/xe_display.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/xe/display/xe_display.c b/drivers/gpu/drm/xe/display/xe_display.c
index 9ef75a20c3c0..5c6b74c36b60 100644
--- a/drivers/gpu/drm/xe/display/xe_display.c
+++ b/drivers/gpu/drm/xe/display/xe_display.c
@@ -479,6 +479,7 @@ void xe_display_pm_runtime_resume(struct xe_device *xe)
 		return;
 	}
 
+	intel_hpd_init(xe);
 	intel_hpd_poll_disable(xe);
 }
 
-- 
2.51.0


From 90521df5fc43980e4575bd8c5b1cb62afe1a9f5f Mon Sep 17 00:00:00 2001
From: Matthew Brost <matthew.brost@intel.com>
Date: Wed, 2 Oct 2024 17:16:56 -0700
Subject: [PATCH 11/16] drm/xe: Take job list lock in xe_sched_add_pending_job

A fragile micro optimization in xe_sched_add_pending_job relied on both
the GPU scheduler being stopped and fence signaling stopped to safely
add a job to the pending list without the job list lock in
xe_sched_add_pending_job. Remove this optimization and just take the job
list lock.

Fixes: 7ddb9403dd74 ("drm/xe: Sample ctx timestamp to determine if jobs have timed out")
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Reviewed-by: Matthew Auld <matthew.auld@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20241003001657.3517883-2-matthew.brost@intel.com
---
 drivers/gpu/drm/xe/xe_gpu_scheduler.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/xe/xe_gpu_scheduler.h b/drivers/gpu/drm/xe/xe_gpu_scheduler.h
index 5ad5629a6c60..64b2ae6839db 100644
--- a/drivers/gpu/drm/xe/xe_gpu_scheduler.h
+++ b/drivers/gpu/drm/xe/xe_gpu_scheduler.h
@@ -63,7 +63,9 @@ xe_sched_invalidate_job(struct xe_sched_job *job, int threshold)
 static inline void xe_sched_add_pending_job(struct xe_gpu_scheduler *sched,
 					    struct xe_sched_job *job)
 {
+	spin_lock(&sched->base.job_list_lock);
 	list_add(&job->drm.list, &sched->base.pending_list);
+	spin_unlock(&sched->base.job_list_lock);
 }
 
 static inline
-- 
2.51.0


From ea2f6a77d0c40d97f4a4dc93fee4afe15d94926d Mon Sep 17 00:00:00 2001
From: Matthew Brost <matthew.brost@intel.com>
Date: Wed, 2 Oct 2024 17:16:57 -0700
Subject: [PATCH 12/16] drm/xe: Don't free job in TDR

Freeing job in TDR is not safe as TDR can pass the run_job thread
resulting in UAF. It is only safe for free job to naturally be called by
the scheduler. Rather free job in TDR, add to pending list.

Closes: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/2811
Cc: Matthew Auld <matthew.auld@intel.com>
Fixes: e275d61c5f3f ("drm/xe/guc: Handle timing out of signaled jobs gracefully")
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Reviewed-by: Matthew Auld <matthew.auld@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20241003001657.3517883-3-matthew.brost@intel.com
---
 drivers/gpu/drm/xe/xe_guc_submit.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
index c1ebc693a617..0e5649b394b6 100644
--- a/drivers/gpu/drm/xe/xe_guc_submit.c
+++ b/drivers/gpu/drm/xe/xe_guc_submit.c
@@ -1105,10 +1105,13 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
 
 	/*
 	 * TDR has fired before free job worker. Common if exec queue
-	 * immediately closed after last fence signaled.
+	 * immediately closed after last fence signaled. Add back to pending
+	 * list so job can be freed and kick scheduler ensuring free job is not
+	 * lost.
 	 */
 	if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &job->fence->flags)) {
-		guc_exec_queue_free_job(drm_job);
+		xe_sched_add_pending_job(sched, job);
+		xe_sched_submission_start(sched);
 
 		return DRM_GPU_SCHED_STAT_NOMINAL;
 	}
-- 
2.51.0


From b8b1163248759ba18509f7443a2d19b15b4c1df8 Mon Sep 17 00:00:00 2001
From: Matthew Brost <matthew.brost@intel.com>
Date: Wed, 11 Sep 2024 08:26:22 -0700
Subject: [PATCH 13/16] drm/xe: Use bookkeep slots for external BO's in exec
 IOCTL
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Fix external BO's dma-resv usage in exec IOCTL using bookkeep slots
rather than write slots. This leaves syncing to user space rather than
the KMD blindly enforcing write semantics on every external BO.

Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs")
Cc: JosÃ© Roberto de Souza <jose.souza@intel.com>
Cc: Kenneth Graunke <kenneth.w.graunke@intel.com>
Cc: Paulo Zanoni <paulo.r.zanoni@intel.com>
Reported-by: Simona Vetter <simona.vetter@ffwll.ch>
Closes: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/2673
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Reviewed-by: JosÃ© Roberto de Souza <jose.souza@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Link: https://patchwork.freedesktop.org/patch/msgid/20240911152622.903058-1-matthew.brost@intel.com
---
 drivers/gpu/drm/xe/xe_exec.c | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_exec.c b/drivers/gpu/drm/xe/xe_exec.c
index 7b38485817dc..f23ac1e2ed88 100644
--- a/drivers/gpu/drm/xe/xe_exec.c
+++ b/drivers/gpu/drm/xe/xe_exec.c
@@ -41,11 +41,6 @@
  * user knows an exec writes to a BO and reads from the BO in the next exec, it
  * is the user's responsibility to pass in / out fence between the two execs).
  *
- * Implicit dependencies for external BOs are handled by using the dma-buf
- * implicit dependency uAPI (TODO: add link). To make this works each exec must
- * install the job's fence into the DMA_RESV_USAGE_WRITE slot of every external
- * BO mapped in the VM.
- *
  * We do not allow a user to trigger a bind at exec time rather we have a VM
  * bind IOCTL which uses the same in / out fence interface as exec. In that
  * sense, a VM bind is basically the same operation as an exec from the user
@@ -59,8 +54,8 @@
  * behind any pending kernel operations on any external BOs in VM or any BOs
  * private to the VM. This is accomplished by the rebinds waiting on BOs
  * DMA_RESV_USAGE_KERNEL slot (kernel ops) and kernel ops waiting on all BOs
- * slots (inflight execs are in the DMA_RESV_USAGE_BOOKING for private BOs and
- * in DMA_RESV_USAGE_WRITE for external BOs).
+ * slots (inflight execs are in the DMA_RESV_USAGE_BOOKKEEP for private BOs and
+ * for external BOs).
  *
  * Rebinds / dma-resv usage applies to non-compute mode VMs only as for compute
  * mode VMs we use preempt fences and a rebind worker (TODO: add link).
@@ -304,7 +299,8 @@ retry:
 	xe_sched_job_arm(job);
 	if (!xe_vm_in_lr_mode(vm))
 		drm_gpuvm_resv_add_fence(&vm->gpuvm, exec, &job->drm.s_fence->finished,
-					 DMA_RESV_USAGE_BOOKKEEP, DMA_RESV_USAGE_WRITE);
+					 DMA_RESV_USAGE_BOOKKEEP,
+					 DMA_RESV_USAGE_BOOKKEEP);
 
 	for (i = 0; i < num_syncs; i++) {
 		xe_sync_entry_signal(&syncs[i], &job->drm.s_fence->finished);
-- 
2.51.0


From 9d559cdcb21f42188d4c3ff3b4fe42b240f4af5d Mon Sep 17 00:00:00 2001
From: Lucas De Marchi <lucas.demarchi@intel.com>
Date: Thu, 10 Oct 2024 20:56:16 -0700
Subject: [PATCH 14/16] drm/xe/query: Increase timestamp width

Starting with Xe2 the timestamp is a full 64 bit counter, contrary to
the 36 bit that was available before. Although 36 should be sufficient
for any reasonable delta calculation (for Xe2, of about 30min), it's
surprising to userspace to get something truncated. Also if the
timestamp being compared to is coming from the GPU and the application
is not careful enough to apply the width there, a delta calculation
would be wrong.

Extend it to full 64-bits starting with Xe2.

v2: Expand width=64 to media gt, as it's just a wrong tagging in the
spec - empirical tests show it goes beyond 36 bits and match the engines
for the main gt

Bspec: 60411
Cc: Szymon Morek <szymon.morek@intel.com>
Reviewed-by: Matt Roper <matthew.d.roper@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20241011035618.1057602-1-lucas.demarchi@intel.com
Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
---
 drivers/gpu/drm/xe/xe_query.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/xe/xe_query.c b/drivers/gpu/drm/xe/xe_query.c
index 158629971eab..07093af0b32e 100644
--- a/drivers/gpu/drm/xe/xe_query.c
+++ b/drivers/gpu/drm/xe/xe_query.c
@@ -164,7 +164,11 @@ query_engine_cycles(struct xe_device *xe,
 			  cpu_clock);
 
 	xe_force_wake_put(gt_to_fw(gt), XE_FORCEWAKE_ALL);
-	resp.width = 36;
+
+	if (GRAPHICS_VER(xe) >= 20)
+		resp.width = 64;
+	else
+		resp.width = 36;
 
 	/* Only write to the output fields of user query */
 	if (put_user(resp.cpu_timestamp, &query_ptr->cpu_timestamp))
-- 
2.51.0


From 5c84985b07acc0fefd2d619c0bb03eed18f769b5 Mon Sep 17 00:00:00 2001
From: Lucas De Marchi <lucas.demarchi@intel.com>
Date: Thu, 10 Oct 2024 20:56:17 -0700
Subject: [PATCH 15/16] drm/xe/query: Move timestamp reg to
 hwe_read_timestamp()

__read_timestamps() is actually reading the timestamp from a certain
hwe. Use it as parameter, move register declarations to be inside that
function and rename it.

Reviewed-by: Sai Teja Pottumuttu <sai.teja.pottumuttu@intel.com>
Reviewed-by: Matt Roper <matthew.d.roper@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20241011035618.1057602-2-lucas.demarchi@intel.com
Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
---
 drivers/gpu/drm/xe/xe_query.c | 22 +++++++---------------
 1 file changed, 7 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_query.c b/drivers/gpu/drm/xe/xe_query.c
index 07093af0b32e..69307ff4146a 100644
--- a/drivers/gpu/drm/xe/xe_query.c
+++ b/drivers/gpu/drm/xe/xe_query.c
@@ -85,16 +85,13 @@ static __ktime_func_t __clock_id_to_func(clockid_t clk_id)
 }
 
 static void
-__read_timestamps(struct xe_gt *gt,
-		  struct xe_reg lower_reg,
-		  struct xe_reg upper_reg,
-		  u64 *engine_ts,
-		  u64 *cpu_ts,
-		  u64 *cpu_delta,
-		  __ktime_func_t cpu_clock)
+hwe_read_timestamp(struct xe_hw_engine *hwe, u64 *engine_ts, u64 *cpu_ts,
+		   u64 *cpu_delta, __ktime_func_t cpu_clock)
 {
-	struct xe_mmio *mmio = &gt->mmio;
+	struct xe_mmio *mmio = &hwe->gt->mmio;
 	u32 upper, lower, old_upper, loop = 0;
+	struct xe_reg upper_reg = RING_TIMESTAMP_UDW(hwe->mmio_base),
+		      lower_reg = RING_TIMESTAMP(hwe->mmio_base);
 
 	upper = xe_mmio_read32(mmio, upper_reg);
 	do {
@@ -155,13 +152,8 @@ query_engine_cycles(struct xe_device *xe,
 	if (xe_force_wake_get(gt_to_fw(gt), XE_FORCEWAKE_ALL))
 		return -EIO;
 
-	__read_timestamps(gt,
-			  RING_TIMESTAMP(hwe->mmio_base),
-			  RING_TIMESTAMP_UDW(hwe->mmio_base),
-			  &resp.engine_cycles,
-			  &resp.cpu_timestamp,
-			  &resp.cpu_delta,
-			  cpu_clock);
+	hwe_read_timestamp(hwe, &resp.engine_cycles, &resp.cpu_timestamp,
+			   &resp.cpu_delta, cpu_clock);
 
 	xe_force_wake_put(gt_to_fw(gt), XE_FORCEWAKE_ALL);
 
-- 
2.51.0


From 735be7acc52fe8f9e29c4327de0993f2c946acba Mon Sep 17 00:00:00 2001
From: Lucas De Marchi <lucas.demarchi@intel.com>
Date: Thu, 10 Oct 2024 20:56:18 -0700
Subject: [PATCH 16/16] drm/xe/query: Tidy up error EFAULT returns

Move the error handling together in a single branch since all of them
are doing similar thing and return the same error.

Reviewed-by: Sai Teja Pottumuttu <sai.teja.pottumuttu@intel.com>
Reviewed-by: Matt Roper <matthew.d.roper@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20241011035618.1057602-3-lucas.demarchi@intel.com
Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
---
 drivers/gpu/drm/xe/xe_query.c | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_query.c b/drivers/gpu/drm/xe/xe_query.c
index 69307ff4146a..5093a243e9fe 100644
--- a/drivers/gpu/drm/xe/xe_query.c
+++ b/drivers/gpu/drm/xe/xe_query.c
@@ -163,16 +163,10 @@ query_engine_cycles(struct xe_device *xe,
 		resp.width = 36;
 
 	/* Only write to the output fields of user query */
-	if (put_user(resp.cpu_timestamp, &query_ptr->cpu_timestamp))
-		return -EFAULT;
-
-	if (put_user(resp.cpu_delta, &query_ptr->cpu_delta))
-		return -EFAULT;
-
-	if (put_user(resp.engine_cycles, &query_ptr->engine_cycles))
-		return -EFAULT;
-
-	if (put_user(resp.width, &query_ptr->width))
+	if (put_user(resp.cpu_timestamp, &query_ptr->cpu_timestamp) ||
+	    put_user(resp.cpu_delta, &query_ptr->cpu_delta) ||
+	    put_user(resp.engine_cycles, &query_ptr->engine_cycles) ||
+	    put_user(resp.width, &query_ptr->width))
 		return -EFAULT;
 
 	return 0;
-- 
2.51.0