From 3b04c2cfd71c54117237c72f2a08ff0ae1f602e2 Mon Sep 17 00:00:00 2001 From: Matthew Auld Date: Wed, 11 Sep 2024 16:55:30 +0100 Subject: [PATCH 01/16] drm/xe/bo: add some annotations in bo_put() MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit If the put() triggers bo destroy then there is at least one potential sleeping lock. Also annotate bos_lock and ggtt lock. Signed-off-by: Matthew Auld Cc: Himal Prasad Ghimiray Cc: Tejas Upadhyay Cc: "Thomas Hellström" Reviewed-by: Matthew Brost Reviewed-by: Tejas Upadhyay Link: https://patchwork.freedesktop.org/patch/msgid/20240911155527.178910-8-matthew.auld@intel.com --- drivers/gpu/drm/xe/xe_bo.c | 14 ++++++++++++++ drivers/gpu/drm/xe/xe_bo.h | 6 +----- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c index a4dadd5a424c..5f2f1ec46b57 100644 --- a/drivers/gpu/drm/xe/xe_bo.c +++ b/drivers/gpu/drm/xe/xe_bo.c @@ -2350,6 +2350,20 @@ void xe_bo_put_commit(struct llist_head *deferred) drm_gem_object_free(&bo->ttm.base.refcount); } +void xe_bo_put(struct xe_bo *bo) +{ + might_sleep(); + if (bo) { +#ifdef CONFIG_PROC_FS + if (bo->client) + might_lock(&bo->client->bos_lock); +#endif + if (bo->ggtt_node && bo->ggtt_node->ggtt) + might_lock(&bo->ggtt_node->ggtt->lock); + drm_gem_object_put(&bo->ttm.base); + } +} + /** * xe_bo_dumb_create - Create a dumb bo as backing for a fb * @file_priv: ... 
diff --git a/drivers/gpu/drm/xe/xe_bo.h b/drivers/gpu/drm/xe/xe_bo.h index dbfb3209615d..6e4be52306df 100644 --- a/drivers/gpu/drm/xe/xe_bo.h +++ b/drivers/gpu/drm/xe/xe_bo.h @@ -126,11 +126,7 @@ static inline struct xe_bo *xe_bo_get(struct xe_bo *bo) return bo; } -static inline void xe_bo_put(struct xe_bo *bo) -{ - if (bo) - drm_gem_object_put(&bo->ttm.base); -} +void xe_bo_put(struct xe_bo *bo); static inline void __xe_bo_unset_bulk_move(struct xe_bo *bo) { -- 2.51.0 From f96dbf7c321d70834d46f3aedb75a671e839b51e Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Tue, 10 Sep 2024 18:18:20 -0700 Subject: [PATCH 02/16] drm/xe: Do not run GPU page fault handler on a closed VM Closing a VM removes page table memory thus we shouldn't touch page tables when a VM is closed. Do not run the GPU page fault handler once the VM is closed to avoid touching page tables. Signed-off-by: Matthew Brost Reviewed-by: Himal Prasad Ghimiray Link: https://patchwork.freedesktop.org/patch/msgid/20240911011820.825127-1-matthew.brost@intel.com --- drivers/gpu/drm/xe/xe_gt_pagefault.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/gpu/drm/xe/xe_gt_pagefault.c b/drivers/gpu/drm/xe/xe_gt_pagefault.c index 730eec07795e..00af059a8971 100644 --- a/drivers/gpu/drm/xe/xe_gt_pagefault.c +++ b/drivers/gpu/drm/xe/xe_gt_pagefault.c @@ -212,6 +212,12 @@ static int handle_pagefault(struct xe_gt *gt, struct pagefault *pf) * TODO: Change to read lock? Using write lock for simplicity. 
*/ down_write(&vm->lock); + + if (xe_vm_is_closed(vm)) { + err = -ENOENT; + goto unlock_vm; + } + vma = lookup_vma(vm, pf->page_addr); if (!vma) { err = -EINVAL; -- 2.51.0 From bbb1ed0b4437ef728569457a136540ce2e6b11c4 Mon Sep 17 00:00:00 2001 From: Yu Jiaoliang Date: Fri, 6 Sep 2024 15:01:09 +0800 Subject: [PATCH 03/16] drm/xe: Use ERR_CAST to return an error-valued pointer Instead of directly casting and returning an error-valued pointer, use ERR_CAST to make the error handling more explicit and improve code clarity. Signed-off-by: Yu Jiaoliang Reviewed-by: Matthew Brost Signed-off-by: Matthew Brost Link: https://patchwork.freedesktop.org/patch/msgid/20240906070109.1852860-1-yujiaoliang@vivo.com --- drivers/gpu/drm/xe/xe_sa.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_sa.c b/drivers/gpu/drm/xe/xe_sa.c index fe2cb2a96f78..e055bed7ae55 100644 --- a/drivers/gpu/drm/xe/xe_sa.c +++ b/drivers/gpu/drm/xe/xe_sa.c @@ -53,7 +53,7 @@ struct xe_sa_manager *xe_sa_bo_manager_init(struct xe_tile *tile, u32 size, u32 if (IS_ERR(bo)) { drm_err(&xe->drm, "failed to allocate bo for sa manager: %ld\n", PTR_ERR(bo)); - return (struct xe_sa_manager *)bo; + return ERR_CAST(bo); } sa_manager->bo = bo; sa_manager->is_iomem = bo->vmap.is_iomem; -- 2.51.0 From 9ba0e0f30ca42a98af3689460063edfb6315718a Mon Sep 17 00:00:00 2001 From: =?utf8?q?Jos=C3=A9=20Roberto=20de=20Souza?= Date: Thu, 12 Sep 2024 08:38:42 -0700 Subject: [PATCH 04/16] drm/xe/oa: Fix overflow in oa batch buffer MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit By default xe_bb_create_job() appends a MI_BATCH_BUFFER_END to batch buffer, this is not a problem if batch buffer is only used once but oa reuses the batch buffer for the same metric and at each call it appends a MI_BATCH_BUFFER_END, printing the warning below and then overflowing. 
[ 381.072016] ------------[ cut here ]------------ [ 381.072019] xe 0000:00:02.0: [drm] Assertion `bb->len * 4 + bb_prefetch(q->gt) <= size` failed! platform: LUNARLAKE subplatform: 1 graphics: Xe2_LPG / Xe2_HPG 20.04 step B0 media: Xe2_LPM / Xe2_HPM 20.00 step B0 tile: 0 VRAM 0 B GT: 0 type 1 So check whether the batch buffer already ends with MI_BATCH_BUFFER_END and append it only if it does not. v2: - simply fix, suggestion from Ashutosh Cc: Ashutosh Dixit Signed-off-by: José Roberto de Souza Reviewed-by: Ashutosh Dixit Link: https://patchwork.freedesktop.org/patch/msgid/20240912153842.35813-1-jose.souza@intel.com --- drivers/gpu/drm/xe/xe_bb.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_bb.c b/drivers/gpu/drm/xe/xe_bb.c index a13e0b3a169e..ef777dbdf4ec 100644 --- a/drivers/gpu/drm/xe/xe_bb.c +++ b/drivers/gpu/drm/xe/xe_bb.c @@ -65,7 +65,8 @@ __xe_bb_create_job(struct xe_exec_queue *q, struct xe_bb *bb, u64 *addr) { u32 size = drm_suballoc_size(bb->bo); - bb->cs[bb->len++] = MI_BATCH_BUFFER_END; + if (bb->len == 0 || bb->cs[bb->len - 1] != MI_BATCH_BUFFER_END) + bb->cs[bb->len++] = MI_BATCH_BUFFER_END; xe_gt_assert(q->gt, bb->len * 4 + bb_prefetch(q->gt) <= size); -- 2.51.0 From cdb389a4c9bc2faea866b517afc3aa3faef46022 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Fri, 13 Sep 2024 14:02:54 +0800 Subject: [PATCH 05/16] drm/xe/irq: Remove unneeded semicolon Remove unnecessary semicolon in pick_engine_gt(). 
Reported-by: Abaci Robot Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=8757 Signed-off-by: Jiapeng Chong Reviewed-by: Himal Prasad Ghimiray Reviewed-by: Shekhar Chauhan Link: https://patchwork.freedesktop.org/patch/msgid/20240913060254.26678-1-jiapeng.chong@linux.alibaba.com Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_irq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_irq.c b/drivers/gpu/drm/xe/xe_irq.c index 07577b418205..e501002e8a04 100644 --- a/drivers/gpu/drm/xe/xe_irq.c +++ b/drivers/gpu/drm/xe/xe_irq.c @@ -280,7 +280,7 @@ static struct xe_gt *pick_engine_gt(struct xe_tile *tile, return tile->media_gt; default: break; - }; + } fallthrough; default: return tile->primary_gt; -- 2.51.0 From 02fdf821ed79f59c40d766a85947aa7cc25d4364 Mon Sep 17 00:00:00 2001 From: Michal Wajdeczko Date: Thu, 12 Sep 2024 22:38:12 +0200 Subject: [PATCH 06/16] drm/xe/guc: Fix GUC_{SUBMIT,FIRMWARE}_VER helper macros MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Those macros rely on non-existing MAKE_VER_STRUCT macro, while the correct one that should be used is named MAKE_GUC_VER_STRUCT. 
Fixes: 4eb0aab6e443 ("drm/xe/guc: Bump minimum required GuC version to v70.29.2") Signed-off-by: Michal Wajdeczko Cc: Julia Filipchuk Cc: John Harrison Reviewed-by: Michał Winiarski Link: https://patchwork.freedesktop.org/patch/msgid/20240912203817.1880-2-michal.wajdeczko@intel.com --- drivers/gpu/drm/xe/xe_guc.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_guc.h b/drivers/gpu/drm/xe/xe_guc.h index c3e6b51f7a09..42116b167c98 100644 --- a/drivers/gpu/drm/xe/xe_guc.h +++ b/drivers/gpu/drm/xe/xe_guc.h @@ -18,8 +18,10 @@ */ #define MAKE_GUC_VER(maj, min, pat) (((maj) << 16) | ((min) << 8) | (pat)) #define MAKE_GUC_VER_STRUCT(ver) MAKE_GUC_VER((ver).major, (ver).minor, (ver).patch) -#define GUC_SUBMIT_VER(guc) MAKE_VER_STRUCT((guc)->fw.versions.found[XE_UC_FW_VER_COMPATIBILITY]) -#define GUC_FIRMWARE_VER(guc) MAKE_VER_STRUCT((guc)->fw.versions.found[XE_UC_FW_VER_RELEASE]) +#define GUC_SUBMIT_VER(guc) \ + MAKE_GUC_VER_STRUCT((guc)->fw.versions.found[XE_UC_FW_VER_COMPATIBILITY]) +#define GUC_FIRMWARE_VER(guc) \ + MAKE_GUC_VER_STRUCT((guc)->fw.versions.found[XE_UC_FW_VER_RELEASE]) struct drm_printer; -- 2.51.0 From 804ce41f66e22d20751dd98e696ae3e0a958e4ac Mon Sep 17 00:00:00 2001 From: Michal Wajdeczko Date: Thu, 12 Sep 2024 22:38:13 +0200 Subject: [PATCH 07/16] drm/xe/guc: Add PF2GUC_SAVE_RESTORE_VF to ABI MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit In upcoming patches we will add support to the PF driver to save and restore a VF state maintained by the GuC to allow VF migration. Add necessary H2G definitions to our GuC firmware ABI header. 
Signed-off-by: Michal Wajdeczko Cc: Michał Winiarski Cc: Tomasz Lis Reviewed-by: Michał Winiarski Link: https://patchwork.freedesktop.org/patch/msgid/20240912203817.1880-3-michal.wajdeczko@intel.com --- .../gpu/drm/xe/abi/guc_actions_sriov_abi.h | 61 +++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/drivers/gpu/drm/xe/abi/guc_actions_sriov_abi.h b/drivers/gpu/drm/xe/abi/guc_actions_sriov_abi.h index 181180f5945c..b6a1852749dd 100644 --- a/drivers/gpu/drm/xe/abi/guc_actions_sriov_abi.h +++ b/drivers/gpu/drm/xe/abi/guc_actions_sriov_abi.h @@ -557,4 +557,65 @@ #define VF2GUC_QUERY_SINGLE_KLV_RESPONSE_MSG_2_VALUE64 GUC_HXG_REQUEST_MSG_n_DATAn #define VF2GUC_QUERY_SINGLE_KLV_RESPONSE_MSG_3_VALUE96 GUC_HXG_REQUEST_MSG_n_DATAn +/** + * DOC: PF2GUC_SAVE_RESTORE_VF + * + * This message is used by the PF to migrate VF info state maintained by the GuC. + * + * This message must be sent as `CTB HXG Message`_. + * + * Available since GuC version 70.25.0 + * + * +---+-------+--------------------------------------------------------------+ + * | | Bits | Description | + * +===+=======+==============================================================+ + * | 0 | 31 | ORIGIN = GUC_HXG_ORIGIN_HOST_ | + * | +-------+--------------------------------------------------------------+ + * | | 30:28 | TYPE = GUC_HXG_TYPE_REQUEST_ | + * | +-------+--------------------------------------------------------------+ + * | | 27:16 | DATA0 = **OPCODE** - operation to take: | + * | | | | + * | | | - _`GUC_PF_OPCODE_VF_SAVE` = 0 | + * | | | - _`GUC_PF_OPCODE_VF_RESTORE` = 1 | + * | +-------+--------------------------------------------------------------+ + * | | 15:0 | ACTION = _`GUC_ACTION_PF2GUC_SAVE_RESTORE_VF` = 0x550B | + * +---+-------+--------------------------------------------------------------+ + * | 1 | 31:0 | **VFID** - VF identifier | + * +---+-------+--------------------------------------------------------------+ + * | 2 | 31:0 | **ADDR_LO** - lower 32-bits of GGTT offset to 
the buffer | + * | | | where the VF info will be save to or restored from. | + * +---+-------+--------------------------------------------------------------+ + * | 3 | 31:0 | **ADDR_HI** - upper 32-bits of GGTT offset to the buffer | + * | | | where the VF info will be save to or restored from. | + * +---+-------+--------------------------------------------------------------+ + * | 4 | 27:0 | **SIZE** - size of the buffer (in dwords) | + * | +-------+--------------------------------------------------------------+ + * | | 31:28 | MBZ | + * +---+-------+--------------------------------------------------------------+ + * + * +---+-------+--------------------------------------------------------------+ + * | | Bits | Description | + * +===+=======+==============================================================+ + * | 0 | 31 | ORIGIN = GUC_HXG_ORIGIN_GUC_ | + * | +-------+--------------------------------------------------------------+ + * | | 30:28 | TYPE = GUC_HXG_TYPE_RESPONSE_SUCCESS_ | + * | +-------+--------------------------------------------------------------+ + * | | 27:0 | DATA0 = **USED** - size of used buffer space (in dwords) | + * +---+-------+--------------------------------------------------------------+ + */ +#define GUC_ACTION_PF2GUC_SAVE_RESTORE_VF 0x550Bu + +#define PF2GUC_SAVE_RESTORE_VF_REQUEST_MSG_LEN (GUC_HXG_EVENT_MSG_MIN_LEN + 4u) +#define PF2GUC_SAVE_RESTORE_VF_REQUEST_MSG_0_OPCODE GUC_HXG_EVENT_MSG_0_DATA0 +#define GUC_PF_OPCODE_VF_SAVE 0u +#define GUC_PF_OPCODE_VF_RESTORE 1u +#define PF2GUC_SAVE_RESTORE_VF_REQUEST_MSG_1_VFID GUC_HXG_EVENT_MSG_n_DATAn +#define PF2GUC_SAVE_RESTORE_VF_REQUEST_MSG_2_ADDR_LO GUC_HXG_EVENT_MSG_n_DATAn +#define PF2GUC_SAVE_RESTORE_VF_REQUEST_MSG_3_ADDR_HI GUC_HXG_EVENT_MSG_n_DATAn +#define PF2GUC_SAVE_RESTORE_VF_REQUEST_MSG_4_SIZE (0xfffffffu << 0) +#define PF2GUC_SAVE_RESTORE_VF_REQUEST_MSG_4_MBZ (0xfu << 28) + +#define PF2GUC_SAVE_RESTORE_VF_RESPONSE_MSG_LEN GUC_HXG_RESPONSE_MSG_MIN_LEN +#define 
PF2GUC_SAVE_RESTORE_VF_RESPONSE_MSG_0_USED GUC_HXG_RESPONSE_MSG_0_DATA0 + #endif -- 2.51.0 From d86e3737c7ab907690c20bcde7c1f78f42fce6c2 Mon Sep 17 00:00:00 2001 From: Michal Wajdeczko Date: Fri, 13 Sep 2024 14:00:13 +0200 Subject: [PATCH 08/16] drm/xe/pf: Add functions to save and restore VF GuC state MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit To successfully migrate a VM with attached GPU VF we also need to migrate VF's GuC state. Add necessary functions that interact with GuC to save and restore a VF GuC state. We will start using them in upcoming patches. Since VF migration requires many more changes in the driver, enable those functions only under debug config. Signed-off-by: Michal Wajdeczko Cc: Michał Winiarski Cc: Tomasz Lis Reviewed-by: Michał Winiarski Link: https://patchwork.freedesktop.org/patch/msgid/20240913120013.1924-1-michal.wajdeczko@intel.com --- drivers/gpu/drm/xe/Makefile | 1 + drivers/gpu/drm/xe/xe_gt_sriov_pf.c | 2 + drivers/gpu/drm/xe/xe_gt_sriov_pf_migration.c | 334 ++++++++++++++++++ drivers/gpu/drm/xe/xe_gt_sriov_pf_migration.h | 17 + .../drm/xe/xe_gt_sriov_pf_migration_types.h | 40 +++ drivers/gpu/drm/xe/xe_gt_sriov_pf_types.h | 6 + 6 files changed, 400 insertions(+) create mode 100644 drivers/gpu/drm/xe/xe_gt_sriov_pf_migration.c create mode 100644 drivers/gpu/drm/xe/xe_gt_sriov_pf_migration.h create mode 100644 drivers/gpu/drm/xe/xe_gt_sriov_pf_migration_types.h diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile index edfd812e0f41..8f1c5c329f79 100644 --- a/drivers/gpu/drm/xe/Makefile +++ b/drivers/gpu/drm/xe/Makefile @@ -129,6 +129,7 @@ xe-$(CONFIG_PCI_IOV) += \ xe_gt_sriov_pf.o \ xe_gt_sriov_pf_config.o \ xe_gt_sriov_pf_control.o \ + xe_gt_sriov_pf_migration.o \ xe_gt_sriov_pf_monitor.o \ xe_gt_sriov_pf_policy.o \ xe_gt_sriov_pf_service.o \ diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf.c b/drivers/gpu/drm/xe/xe_gt_sriov_pf.c index 065a9878f8e9..e71fc3d2bda2 100644 
--- a/drivers/gpu/drm/xe/xe_gt_sriov_pf.c +++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf.c @@ -13,6 +13,7 @@ #include "xe_gt_sriov_pf_config.h" #include "xe_gt_sriov_pf_control.h" #include "xe_gt_sriov_pf_helpers.h" +#include "xe_gt_sriov_pf_migration.h" #include "xe_gt_sriov_pf_service.h" #include "xe_mmio.h" @@ -89,6 +90,7 @@ void xe_gt_sriov_pf_init_hw(struct xe_gt *gt) pf_enable_ggtt_guest_update(gt); xe_gt_sriov_pf_service_update(gt); + xe_gt_sriov_pf_migration_init(gt); } static u32 pf_get_vf_regs_stride(struct xe_device *xe) diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf_migration.c b/drivers/gpu/drm/xe/xe_gt_sriov_pf_migration.c new file mode 100644 index 000000000000..b7188fa6ac07 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf_migration.c @@ -0,0 +1,334 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2024 Intel Corporation + */ + +#include + +#include "abi/guc_actions_sriov_abi.h" +#include "xe_bo.h" +#include "xe_gt_sriov_pf_helpers.h" +#include "xe_gt_sriov_pf_migration.h" +#include "xe_gt_sriov_printk.h" +#include "xe_guc.h" +#include "xe_guc_ct.h" +#include "xe_sriov.h" + +/* Return: number of dwords saved/restored/required or a negative error code on failure */ +static int guc_action_vf_save_restore(struct xe_guc *guc, u32 vfid, u32 opcode, + u64 addr, u32 ndwords) +{ + u32 request[PF2GUC_SAVE_RESTORE_VF_REQUEST_MSG_LEN] = { + FIELD_PREP(GUC_HXG_MSG_0_ORIGIN, GUC_HXG_ORIGIN_HOST) | + FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_REQUEST) | + FIELD_PREP(GUC_HXG_REQUEST_MSG_0_ACTION, GUC_ACTION_PF2GUC_SAVE_RESTORE_VF) | + FIELD_PREP(PF2GUC_SAVE_RESTORE_VF_REQUEST_MSG_0_OPCODE, opcode), + FIELD_PREP(PF2GUC_SAVE_RESTORE_VF_REQUEST_MSG_1_VFID, vfid), + FIELD_PREP(PF2GUC_SAVE_RESTORE_VF_REQUEST_MSG_2_ADDR_LO, lower_32_bits(addr)), + FIELD_PREP(PF2GUC_SAVE_RESTORE_VF_REQUEST_MSG_3_ADDR_HI, upper_32_bits(addr)), + FIELD_PREP(PF2GUC_SAVE_RESTORE_VF_REQUEST_MSG_4_SIZE, ndwords), + }; + + return xe_guc_ct_send_block(&guc->ct, request, ARRAY_SIZE(request)); 
+} + +/* Return: size of the state in dwords or a negative error code on failure */ +static int pf_send_guc_query_vf_state_size(struct xe_gt *gt, unsigned int vfid) +{ + int ret; + + ret = guc_action_vf_save_restore(&gt->uc.guc, vfid, GUC_PF_OPCODE_VF_SAVE, 0, 0); + return ret ?: -ENODATA; +} + +/* Return: number of state dwords saved or a negative error code on failure */ +static int pf_send_guc_save_vf_state(struct xe_gt *gt, unsigned int vfid, + void *buff, size_t size) +{ + const int ndwords = size / sizeof(u32); + struct xe_tile *tile = gt_to_tile(gt); + struct xe_device *xe = tile_to_xe(tile); + struct xe_guc *guc = &gt->uc.guc; + struct xe_bo *bo; + int ret; + + xe_gt_assert(gt, size % sizeof(u32) == 0); + xe_gt_assert(gt, size == ndwords * sizeof(u32)); + + bo = xe_bo_create_pin_map(xe, tile, NULL, + ALIGN(size, PAGE_SIZE), + ttm_bo_type_kernel, + XE_BO_FLAG_SYSTEM | + XE_BO_FLAG_GGTT | + XE_BO_FLAG_GGTT_INVALIDATE); + if (IS_ERR(bo)) + return PTR_ERR(bo); + + ret = guc_action_vf_save_restore(guc, vfid, GUC_PF_OPCODE_VF_SAVE, + xe_bo_ggtt_addr(bo), ndwords); + if (!ret) + ret = -ENODATA; + else if (ret > ndwords) + ret = -EPROTO; + else if (ret > 0) + xe_map_memcpy_from(xe, buff, &bo->vmap, 0, ret * sizeof(u32)); + + xe_bo_unpin_map_no_vm(bo); + return ret; +} + +/* Return: number of state dwords restored or a negative error code on failure */ +static int pf_send_guc_restore_vf_state(struct xe_gt *gt, unsigned int vfid, + const void *buff, size_t size) +{ + const int ndwords = size / sizeof(u32); + struct xe_tile *tile = gt_to_tile(gt); + struct xe_device *xe = tile_to_xe(tile); + struct xe_guc *guc = &gt->uc.guc; + struct xe_bo *bo; + int ret; + + xe_gt_assert(gt, size % sizeof(u32) == 0); + xe_gt_assert(gt, size == ndwords * sizeof(u32)); + + bo = xe_bo_create_pin_map(xe, tile, NULL, + ALIGN(size, PAGE_SIZE), + ttm_bo_type_kernel, + XE_BO_FLAG_SYSTEM | + XE_BO_FLAG_GGTT | + XE_BO_FLAG_GGTT_INVALIDATE); + if (IS_ERR(bo)) + return PTR_ERR(bo); + + 
xe_map_memcpy_to(xe, &bo->vmap, 0, buff, size); + + ret = guc_action_vf_save_restore(guc, vfid, GUC_PF_OPCODE_VF_RESTORE, + xe_bo_ggtt_addr(bo), ndwords); + if (!ret) + ret = -ENODATA; + else if (ret > ndwords) + ret = -EPROTO; + + xe_bo_unpin_map_no_vm(bo); + return ret; +} + +static bool pf_migration_supported(struct xe_gt *gt) +{ + xe_gt_assert(gt, IS_SRIOV_PF(gt_to_xe(gt))); + return gt->sriov.pf.migration.supported; +} + +static struct mutex *pf_migration_mutex(struct xe_gt *gt) +{ + xe_gt_assert(gt, IS_SRIOV_PF(gt_to_xe(gt))); + return &gt->sriov.pf.migration.snapshot_lock; +} + +static struct xe_gt_sriov_state_snapshot *pf_pick_vf_snapshot(struct xe_gt *gt, + unsigned int vfid) +{ + xe_gt_assert(gt, IS_SRIOV_PF(gt_to_xe(gt))); + xe_gt_assert(gt, vfid <= xe_sriov_pf_get_totalvfs(gt_to_xe(gt))); + lockdep_assert_held(pf_migration_mutex(gt)); + + return &gt->sriov.pf.vfs[vfid].snapshot; +} + +static unsigned int pf_snapshot_index(struct xe_gt *gt, struct xe_gt_sriov_state_snapshot *snapshot) +{ + return container_of(snapshot, struct xe_gt_sriov_metadata, snapshot) - gt->sriov.pf.vfs; +} + +static void pf_free_guc_state(struct xe_gt *gt, struct xe_gt_sriov_state_snapshot *snapshot) +{ + struct xe_device *xe = gt_to_xe(gt); + + drmm_kfree(&xe->drm, snapshot->guc.buff); + snapshot->guc.buff = NULL; + snapshot->guc.size = 0; +} + +static int pf_alloc_guc_state(struct xe_gt *gt, + struct xe_gt_sriov_state_snapshot *snapshot, + size_t size) +{ + struct xe_device *xe = gt_to_xe(gt); + void *p; + + pf_free_guc_state(gt, snapshot); + + if (!size) + return -ENODATA; + + if (size % sizeof(u32)) + return -EINVAL; + + if (size > SZ_2M) + return -EFBIG; + + p = drmm_kzalloc(&xe->drm, size, GFP_KERNEL); + if (!p) + return -ENOMEM; + + snapshot->guc.buff = p; + snapshot->guc.size = size; + return 0; +} + +static void pf_dump_guc_state(struct xe_gt *gt, struct xe_gt_sriov_state_snapshot *snapshot) +{ + if (IS_ENABLED(CONFIG_DRM_XE_DEBUG_SRIOV)) { + unsigned int vfid __maybe_unused 
= pf_snapshot_index(gt, snapshot); + + xe_gt_sriov_dbg_verbose(gt, "VF%u GuC state is %zu dwords:\n", + vfid, snapshot->guc.size / sizeof(u32)); + print_hex_dump_bytes("state: ", DUMP_PREFIX_OFFSET, + snapshot->guc.buff, min(SZ_64, snapshot->guc.size)); + } +} + +static int pf_save_vf_guc_state(struct xe_gt *gt, unsigned int vfid) +{ + struct xe_gt_sriov_state_snapshot *snapshot = pf_pick_vf_snapshot(gt, vfid); + size_t size; + int ret; + + ret = pf_send_guc_query_vf_state_size(gt, vfid); + if (ret < 0) + goto fail; + size = ret * sizeof(u32); + xe_gt_sriov_dbg_verbose(gt, "VF%u state size is %d dwords (%zu bytes)\n", vfid, ret, size); + + ret = pf_alloc_guc_state(gt, snapshot, size); + if (ret < 0) + goto fail; + + ret = pf_send_guc_save_vf_state(gt, vfid, snapshot->guc.buff, size); + if (ret < 0) + goto fail; + size = ret * sizeof(u32); + xe_gt_assert(gt, size); + xe_gt_assert(gt, size <= snapshot->guc.size); + snapshot->guc.size = size; + + pf_dump_guc_state(gt, snapshot); + return 0; + +fail: + xe_gt_sriov_dbg(gt, "Unable to save VF%u state (%pe)\n", vfid, ERR_PTR(ret)); + pf_free_guc_state(gt, snapshot); + return ret; +} + +/** + * xe_gt_sriov_pf_migration_save_guc_state() - Take a GuC VF state snapshot. + * @gt: the &xe_gt + * @vfid: the VF identifier + * + * This function is for PF only. + * + * Return: 0 on success or a negative error code on failure. 
+ */ +int xe_gt_sriov_pf_migration_save_guc_state(struct xe_gt *gt, unsigned int vfid) +{ + int err; + + xe_gt_assert(gt, IS_SRIOV_PF(gt_to_xe(gt))); + xe_gt_assert(gt, vfid != PFID); + xe_gt_assert(gt, vfid <= xe_sriov_pf_get_totalvfs(gt_to_xe(gt))); + + if (!pf_migration_supported(gt)) + return -ENOPKG; + + mutex_lock(pf_migration_mutex(gt)); + err = pf_save_vf_guc_state(gt, vfid); + mutex_unlock(pf_migration_mutex(gt)); + + return err; +} + +static int pf_restore_vf_guc_state(struct xe_gt *gt, unsigned int vfid) +{ + struct xe_gt_sriov_state_snapshot *snapshot = pf_pick_vf_snapshot(gt, vfid); + int ret; + + if (!snapshot->guc.size) + return -ENODATA; + + xe_gt_sriov_dbg_verbose(gt, "restoring %zu dwords of VF%u GuC state\n", + snapshot->guc.size / sizeof(u32), vfid); + ret = pf_send_guc_restore_vf_state(gt, vfid, snapshot->guc.buff, snapshot->guc.size); + if (ret < 0) + goto fail; + + xe_gt_sriov_dbg_verbose(gt, "restored %d dwords of VF%u GuC state\n", ret, vfid); + return 0; + +fail: + xe_gt_sriov_dbg(gt, "Failed to restore VF%u GuC state (%pe)\n", vfid, ERR_PTR(ret)); + return ret; +} + +/** + * xe_gt_sriov_pf_migration_restore_guc_state() - Restore a GuC VF state. + * @gt: the &xe_gt + * @vfid: the VF identifier + * + * This function is for PF only. + * + * Return: 0 on success or a negative error code on failure. 
+ */ +int xe_gt_sriov_pf_migration_restore_guc_state(struct xe_gt *gt, unsigned int vfid) +{ + int ret; + + xe_gt_assert(gt, IS_SRIOV_PF(gt_to_xe(gt))); + xe_gt_assert(gt, vfid != PFID); + xe_gt_assert(gt, vfid <= xe_sriov_pf_get_totalvfs(gt_to_xe(gt))); + + if (!pf_migration_supported(gt)) + return -ENOPKG; + + mutex_lock(pf_migration_mutex(gt)); + ret = pf_restore_vf_guc_state(gt, vfid); + mutex_unlock(pf_migration_mutex(gt)); + + return ret; +} + +static bool pf_check_migration_support(struct xe_gt *gt) +{ + /* GuC 70.25 with save/restore v2 is required */ + xe_gt_assert(gt, GUC_FIRMWARE_VER(&gt->uc.guc) >= MAKE_GUC_VER(70, 25, 0)); + + /* XXX: for now this is for feature enabling only */ + return IS_ENABLED(CONFIG_DRM_XE_DEBUG); +} + +/** + * xe_gt_sriov_pf_migration_init() - Initialize support for VF migration. + * @gt: the &xe_gt + * + * This function is for PF only. + * + * Return: 0 on success or a negative error code on failure. + */ +int xe_gt_sriov_pf_migration_init(struct xe_gt *gt) +{ + struct xe_device *xe = gt_to_xe(gt); + int err; + + xe_gt_assert(gt, IS_SRIOV_PF(xe)); + + gt->sriov.pf.migration.supported = pf_check_migration_support(gt); + + if (!pf_migration_supported(gt)) + return 0; + + err = drmm_mutex_init(&xe->drm, &gt->sriov.pf.migration.snapshot_lock); + if (err) + return err; + + return 0; +} diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf_migration.h b/drivers/gpu/drm/xe/xe_gt_sriov_pf_migration.h new file mode 100644 index 000000000000..6643d730a9ab --- /dev/null +++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf_migration.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2024 Intel Corporation + */ + +#ifndef _XE_GT_SRIOV_PF_MIGRATION_H_ +#define _XE_GT_SRIOV_PF_MIGRATION_H_ + +#include + +struct xe_gt; + +int xe_gt_sriov_pf_migration_init(struct xe_gt *gt); +int xe_gt_sriov_pf_migration_save_guc_state(struct xe_gt *gt, unsigned int vfid); +int xe_gt_sriov_pf_migration_restore_guc_state(struct xe_gt *gt, unsigned int vfid); + 
+#endif diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf_migration_types.h b/drivers/gpu/drm/xe/xe_gt_sriov_pf_migration_types.h new file mode 100644 index 000000000000..1f3110b6d44f --- /dev/null +++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf_migration_types.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2024 Intel Corporation + */ + +#ifndef _XE_GT_SRIOV_PF_MIGRATION_TYPES_H_ +#define _XE_GT_SRIOV_PF_MIGRATION_TYPES_H_ + +#include +#include + +/** + * struct xe_gt_sriov_state_snapshot - GT-level per-VF state snapshot data. + * + * Used by the PF driver to maintain per-VF migration data. + */ +struct xe_gt_sriov_state_snapshot { + /** @guc: GuC VF state snapshot */ + struct { + /** @guc.buff: buffer with the VF state */ + u32 *buff; + /** @guc.size: size of the buffer (must be dwords aligned) */ + u32 size; + } guc; +}; + +/** + * struct xe_gt_sriov_pf_migration - GT-level data. + * + * Used by the PF driver to maintain non-VF specific per-GT data. + */ +struct xe_gt_sriov_pf_migration { + /** @supported: indicates whether the feature is supported */ + bool supported; + + /** @snapshot_lock: protects all VFs snapshots */ + struct mutex snapshot_lock; +}; + +#endif diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf_types.h b/drivers/gpu/drm/xe/xe_gt_sriov_pf_types.h index 28e1b130bf87..0426b1a77069 100644 --- a/drivers/gpu/drm/xe/xe_gt_sriov_pf_types.h +++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf_types.h @@ -10,6 +10,7 @@ #include "xe_gt_sriov_pf_config_types.h" #include "xe_gt_sriov_pf_control_types.h" +#include "xe_gt_sriov_pf_migration_types.h" #include "xe_gt_sriov_pf_monitor_types.h" #include "xe_gt_sriov_pf_policy_types.h" #include "xe_gt_sriov_pf_service_types.h" @@ -29,6 +30,9 @@ struct xe_gt_sriov_metadata { /** @version: negotiated VF/PF ABI version */ struct xe_gt_sriov_pf_service_version version; + + /** @snapshot: snapshot of the VF state data */ + struct xe_gt_sriov_state_snapshot snapshot; }; /** @@ -36,6 +40,7 @@ struct 
xe_gt_sriov_metadata { * @service: service data. * @control: control data. * @policy: policy data. + * @migration: migration data. * @spare: PF-only provisioning configuration. * @vfs: metadata for all VFs. */ @@ -43,6 +48,7 @@ struct xe_gt_sriov_pf { struct xe_gt_sriov_pf_service service; struct xe_gt_sriov_pf_control control; struct xe_gt_sriov_pf_policy policy; + struct xe_gt_sriov_pf_migration migration; struct xe_gt_sriov_spare_config spare; struct xe_gt_sriov_metadata *vfs; }; -- 2.51.0 From 14423f08c3eaad3ad198e308865d984692d6c5f7 Mon Sep 17 00:00:00 2001 From: Michal Wajdeczko Date: Thu, 12 Sep 2024 22:38:15 +0200 Subject: [PATCH 09/16] drm/xe/pf: Save VF GuC state when pausing VF MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Since usually pausing the VF is done as a first step to migrate that VF, immediately save VF GuC state as a final step of the VF pausing to have that data ready to export when needed. Signed-off-by: Michal Wajdeczko Cc: Michał Winiarski Cc: Tomasz Lis Reviewed-by: Michał Winiarski Link: https://patchwork.freedesktop.org/patch/msgid/20240912203817.1880-5-michal.wajdeczko@intel.com --- drivers/gpu/drm/xe/xe_gt_sriov_pf_control.c | 41 ++++++++++++++++++- .../gpu/drm/xe/xe_gt_sriov_pf_control_types.h | 2 + 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf_control.c b/drivers/gpu/drm/xe/xe_gt_sriov_pf_control.c index b4fd5a81aff1..1f50aec3a059 100644 --- a/drivers/gpu/drm/xe/xe_gt_sriov_pf_control.c +++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf_control.c @@ -13,6 +13,7 @@ #include "xe_gt_sriov_pf_config.h" #include "xe_gt_sriov_pf_control.h" #include "xe_gt_sriov_pf_helpers.h" +#include "xe_gt_sriov_pf_migration.h" #include "xe_gt_sriov_pf_monitor.h" #include "xe_gt_sriov_pf_service.h" #include "xe_gt_sriov_printk.h" @@ -177,6 +178,7 @@ static const char *control_bit_to_string(enum xe_gt_sriov_control_bits bit) CASE2STR(PAUSE_SEND_PAUSE); 
CASE2STR(PAUSE_WAIT_GUC); CASE2STR(PAUSE_GUC_DONE); + CASE2STR(PAUSE_SAVE_GUC); CASE2STR(PAUSE_FAILED); CASE2STR(PAUSED); CASE2STR(RESUME_WIP); @@ -416,6 +418,10 @@ static void pf_enter_vf_ready(struct xe_gt *gt, unsigned int vfid) * : | : / * : v : / * : PAUSE_GUC_DONE o-----restart + * : | : + * : | o---<--busy : + * : v / / : + * : PAUSE_SAVE_GUC : * : / : * : / : * :....o..............o...............o...........: @@ -435,6 +441,7 @@ static void pf_exit_vf_pause_wip(struct xe_gt *gt, unsigned int vfid) pf_escape_vf_state(gt, vfid, XE_GT_SRIOV_STATE_PAUSE_SEND_PAUSE); pf_escape_vf_state(gt, vfid, XE_GT_SRIOV_STATE_PAUSE_WAIT_GUC); pf_escape_vf_state(gt, vfid, XE_GT_SRIOV_STATE_PAUSE_GUC_DONE); + pf_escape_vf_state(gt, vfid, XE_GT_SRIOV_STATE_PAUSE_SAVE_GUC); } } @@ -465,12 +472,41 @@ static void pf_enter_vf_pause_rejected(struct xe_gt *gt, unsigned int vfid) pf_enter_vf_pause_failed(gt, vfid); } +static void pf_enter_vf_pause_save_guc(struct xe_gt *gt, unsigned int vfid) +{ + if (!pf_enter_vf_state(gt, vfid, XE_GT_SRIOV_STATE_PAUSE_SAVE_GUC)) + pf_enter_vf_state_machine_bug(gt, vfid); +} + +static bool pf_exit_vf_pause_save_guc(struct xe_gt *gt, unsigned int vfid) +{ + int err; + + if (!pf_exit_vf_state(gt, vfid, XE_GT_SRIOV_STATE_PAUSE_SAVE_GUC)) + return false; + + err = xe_gt_sriov_pf_migration_save_guc_state(gt, vfid); + if (err) { + /* retry if busy */ + if (err == -EBUSY) { + pf_enter_vf_pause_save_guc(gt, vfid); + return true; + } + /* give up on error */ + if (err == -EIO) + pf_enter_vf_mismatch(gt, vfid); + } + + pf_enter_vf_pause_completed(gt, vfid); + return true; +} + static bool pf_exit_vf_pause_guc_done(struct xe_gt *gt, unsigned int vfid) { if (!pf_exit_vf_state(gt, vfid, XE_GT_SRIOV_STATE_PAUSE_GUC_DONE)) return false; - pf_enter_vf_pause_completed(gt, vfid); + pf_enter_vf_pause_save_guc(gt, vfid); return true; } @@ -1339,6 +1375,9 @@ static bool pf_process_vf_state_machine(struct xe_gt *gt, unsigned int vfid) if (pf_exit_vf_pause_guc_done(gt, 
vfid)) return true; + if (pf_exit_vf_pause_save_guc(gt, vfid)) + return true; + if (pf_exit_vf_resume_send_resume(gt, vfid)) return true; diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf_control_types.h b/drivers/gpu/drm/xe/xe_gt_sriov_pf_control_types.h index 11830aafea45..f02f941b4ad2 100644 --- a/drivers/gpu/drm/xe/xe_gt_sriov_pf_control_types.h +++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf_control_types.h @@ -27,6 +27,7 @@ * @XE_GT_SRIOV_STATE_PAUSE_SEND_PAUSE: indicates that the PF is about to send a PAUSE command. * @XE_GT_SRIOV_STATE_PAUSE_WAIT_GUC: indicates that the PF awaits for a response from the GuC. * @XE_GT_SRIOV_STATE_PAUSE_GUC_DONE: indicates that the PF has received a response from the GuC. + * @XE_GT_SRIOV_STATE_PAUSE_SAVE_GUC: indicates that the PF needs to save the VF GuC state. * @XE_GT_SRIOV_STATE_PAUSE_FAILED: indicates that a VF pause operation has failed. * @XE_GT_SRIOV_STATE_PAUSED: indicates that the VF is paused. * @XE_GT_SRIOV_STATE_RESUME_WIP: indicates the a VF resume operation is in progress. @@ -56,6 +57,7 @@ enum xe_gt_sriov_control_bits { XE_GT_SRIOV_STATE_PAUSE_SEND_PAUSE, XE_GT_SRIOV_STATE_PAUSE_WAIT_GUC, XE_GT_SRIOV_STATE_PAUSE_GUC_DONE, + XE_GT_SRIOV_STATE_PAUSE_SAVE_GUC, XE_GT_SRIOV_STATE_PAUSE_FAILED, XE_GT_SRIOV_STATE_PAUSED, -- 2.51.0 From d620448fb5673d0705e50e3f73b890a10cfe7c80 Mon Sep 17 00:00:00 2001 From: Michal Wajdeczko Date: Thu, 12 Sep 2024 22:38:16 +0200 Subject: [PATCH 10/16] drm/xe/pf: Allow to view and replace VF GuC state over debugfs MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit For feature enabling and testing purposes, allow to view saved VF GuC state and to replace it, but only under strict debug config. 
Signed-off-by: Michal Wajdeczko Cc: Michał Winiarski Cc: Tomasz Lis Reviewed-by: Michał Winiarski Link: https://patchwork.freedesktop.org/patch/msgid/20240912203817.1880-6-michal.wajdeczko@intel.com --- drivers/gpu/drm/xe/xe_gt_sriov_pf_debugfs.c | 46 ++++++++++ drivers/gpu/drm/xe/xe_gt_sriov_pf_migration.c | 85 +++++++++++++++++++ drivers/gpu/drm/xe/xe_gt_sriov_pf_migration.h | 7 ++ 3 files changed, 138 insertions(+) diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf_debugfs.c b/drivers/gpu/drm/xe/xe_gt_sriov_pf_debugfs.c index 2290ddaf9594..e990a63ec998 100644 --- a/drivers/gpu/drm/xe/xe_gt_sriov_pf_debugfs.c +++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf_debugfs.c @@ -17,6 +17,7 @@ #include "xe_gt_sriov_pf_control.h" #include "xe_gt_sriov_pf_debugfs.h" #include "xe_gt_sriov_pf_helpers.h" +#include "xe_gt_sriov_pf_migration.h" #include "xe_gt_sriov_pf_monitor.h" #include "xe_gt_sriov_pf_policy.h" #include "xe_gt_sriov_pf_service.h" @@ -375,6 +376,44 @@ static const struct file_operations control_ops = { .llseek = default_llseek, }; +/* + * /sys/kernel/debug/dri/0/ + * ├── gt0 + * │   ├── vf1 + * │   │   ├── guc_state + */ +static ssize_t guc_state_read(struct file *file, char __user *buf, + size_t count, loff_t *pos) +{ + struct dentry *dent = file_dentry(file); + struct dentry *parent = dent->d_parent; + struct xe_gt *gt = extract_gt(parent); + unsigned int vfid = extract_vfid(parent); + + return xe_gt_sriov_pf_migration_read_guc_state(gt, vfid, buf, count, pos); +} + +static ssize_t guc_state_write(struct file *file, const char __user *buf, + size_t count, loff_t *pos) +{ + struct dentry *dent = file_dentry(file); + struct dentry *parent = dent->d_parent; + struct xe_gt *gt = extract_gt(parent); + unsigned int vfid = extract_vfid(parent); + + if (*pos) + return -EINVAL; + + return xe_gt_sriov_pf_migration_write_guc_state(gt, vfid, buf, count); +} + +static const struct file_operations guc_state_ops = { + .owner = THIS_MODULE, + .read = guc_state_read, + .write = 
guc_state_write, + .llseek = default_llseek, +}; + /** * xe_gt_sriov_pf_debugfs_register - Register SR-IOV PF specific entries in GT debugfs. * @gt: the &xe_gt to register @@ -423,5 +462,12 @@ void xe_gt_sriov_pf_debugfs_register(struct xe_gt *gt, struct dentry *root) pf_add_config_attrs(gt, vfdentry, VFID(n)); debugfs_create_file("control", 0600, vfdentry, NULL, &control_ops); + + /* for testing/debugging purposes only! */ + if (IS_ENABLED(CONFIG_DRM_XE_DEBUG)) { + debugfs_create_file("guc_state", + IS_ENABLED(CONFIG_DRM_XE_DEBUG_SRIOV) ? 0600 : 0400, + vfdentry, NULL, &guc_state_ops); + } } } diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf_migration.c b/drivers/gpu/drm/xe/xe_gt_sriov_pf_migration.c index b7188fa6ac07..c712111aa30d 100644 --- a/drivers/gpu/drm/xe/xe_gt_sriov_pf_migration.c +++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf_migration.c @@ -297,6 +297,91 @@ int xe_gt_sriov_pf_migration_restore_guc_state(struct xe_gt *gt, unsigned int vf return ret; } +#ifdef CONFIG_DEBUG_FS +/** + * xe_gt_sriov_pf_migration_read_guc_state() - Read a GuC VF state. + * @gt: the &xe_gt + * @vfid: the VF identifier + * @buf: the user space buffer to read to + * @count: the maximum number of bytes to read + * @pos: the current position in the buffer + * + * This function is for PF only. + * + * This function reads up to @count bytes from the saved VF GuC state buffer + * at offset @pos into the user space address starting at @buf. + * + * Return: the number of bytes read or a negative error code on failure. 
+ */ +ssize_t xe_gt_sriov_pf_migration_read_guc_state(struct xe_gt *gt, unsigned int vfid, + char __user *buf, size_t count, loff_t *pos) +{ + struct xe_gt_sriov_state_snapshot *snapshot; + ssize_t ret; + + xe_gt_assert(gt, IS_SRIOV_PF(gt_to_xe(gt))); + xe_gt_assert(gt, vfid != PFID); + xe_gt_assert(gt, vfid <= xe_sriov_pf_get_totalvfs(gt_to_xe(gt))); + + if (!pf_migration_supported(gt)) + return -ENOPKG; + + mutex_lock(pf_migration_mutex(gt)); + snapshot = pf_pick_vf_snapshot(gt, vfid); + if (snapshot->guc.size) + ret = simple_read_from_buffer(buf, count, pos, snapshot->guc.buff, + snapshot->guc.size); + else + ret = -ENODATA; + mutex_unlock(pf_migration_mutex(gt)); + + return ret; +} + +/** + * xe_gt_sriov_pf_migration_write_guc_state() - Write a GuC VF state. + * @gt: the &xe_gt + * @vfid: the VF identifier + * @buf: the user space buffer with GuC VF state + * @size: the size of GuC VF state (in bytes) + * + * This function is for PF only. + * + * This function reads @size bytes of the VF GuC state stored at user space + * address @buf and writes it into an internal VF state buffer. + * + * Return: the number of bytes used or a negative error code on failure. 
+ */ +ssize_t xe_gt_sriov_pf_migration_write_guc_state(struct xe_gt *gt, unsigned int vfid, + const char __user *buf, size_t size) +{ + struct xe_gt_sriov_state_snapshot *snapshot; + loff_t pos = 0; + ssize_t ret; + + xe_gt_assert(gt, IS_SRIOV_PF(gt_to_xe(gt))); + xe_gt_assert(gt, vfid != PFID); + xe_gt_assert(gt, vfid <= xe_sriov_pf_get_totalvfs(gt_to_xe(gt))); + + if (!pf_migration_supported(gt)) + return -ENOPKG; + + mutex_lock(pf_migration_mutex(gt)); + snapshot = pf_pick_vf_snapshot(gt, vfid); + ret = pf_alloc_guc_state(gt, snapshot, size); + if (!ret) { + ret = simple_write_to_buffer(snapshot->guc.buff, size, &pos, buf, size); + if (ret < 0) + pf_free_guc_state(gt, snapshot); + else + pf_dump_guc_state(gt, snapshot); + } + mutex_unlock(pf_migration_mutex(gt)); + + return ret; +} +#endif /* CONFIG_DEBUG_FS */ + static bool pf_check_migration_support(struct xe_gt *gt) { /* GuC 70.25 with save/restore v2 is required */ diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf_migration.h b/drivers/gpu/drm/xe/xe_gt_sriov_pf_migration.h index 6643d730a9ab..09faeae00ddb 100644 --- a/drivers/gpu/drm/xe/xe_gt_sriov_pf_migration.h +++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf_migration.h @@ -14,4 +14,11 @@ int xe_gt_sriov_pf_migration_init(struct xe_gt *gt); int xe_gt_sriov_pf_migration_save_guc_state(struct xe_gt *gt, unsigned int vfid); int xe_gt_sriov_pf_migration_restore_guc_state(struct xe_gt *gt, unsigned int vfid); +#ifdef CONFIG_DEBUG_FS +ssize_t xe_gt_sriov_pf_migration_read_guc_state(struct xe_gt *gt, unsigned int vfid, + char __user *buf, size_t count, loff_t *pos); +ssize_t xe_gt_sriov_pf_migration_write_guc_state(struct xe_gt *gt, unsigned int vfid, + const char __user *buf, size_t count); +#endif + #endif -- 2.51.0 From 20e3aa503feb2deafd4185f50cee0da047f62e21 Mon Sep 17 00:00:00 2001 From: Michal Wajdeczko Date: Thu, 12 Sep 2024 22:38:17 +0200 Subject: [PATCH 11/16] drm/xe/pf: Allow to trigger VF GuC state restore from debugfs MIME-Version: 1.0 Content-Type: text/plain; 
charset=utf8 Content-Transfer-Encoding: 8bit For feature enabling and testing purposes, allow to restore saved or replaced VF GuC state from debugfs, bypassing normal migration flow. This is available only under strict debug config. Signed-off-by: Michal Wajdeczko Cc: Michał Winiarski Cc: Tomasz Lis Reviewed-by: Michał Winiarski Link: https://patchwork.freedesktop.org/patch/msgid/20240912203817.1880-7-michal.wajdeczko@intel.com --- drivers/gpu/drm/xe/xe_gt_sriov_pf_debugfs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf_debugfs.c b/drivers/gpu/drm/xe/xe_gt_sriov_pf_debugfs.c index e990a63ec998..ccbcf6e572d0 100644 --- a/drivers/gpu/drm/xe/xe_gt_sriov_pf_debugfs.c +++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf_debugfs.c @@ -313,6 +313,9 @@ static const struct { { "stop", xe_gt_sriov_pf_control_stop_vf }, { "pause", xe_gt_sriov_pf_control_pause_vf }, { "resume", xe_gt_sriov_pf_control_resume_vf }, +#ifdef CONFIG_DRM_XE_DEBUG_SRIOV + { "restore!", xe_gt_sriov_pf_migration_restore_guc_state }, +#endif }; static ssize_t control_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) -- 2.51.0 From fdc81c43f0c14ace6383024a02585e3fcbd1ceba Mon Sep 17 00:00:00 2001 From: He Lugang Date: Wed, 11 Sep 2024 18:22:15 +0800 Subject: [PATCH 12/16] drm/xe: use devm_add_action_or_reset() helper Use devm_add_action_or_reset() to release resources in case of failure, because the cleanup function will be automatically called. 
Reviewed-by: Rodrigo Vivi Signed-off-by: He Lugang Link: https://patchwork.freedesktop.org/patch/msgid/9631BC17D1E028A2+20240911102215.84865-1-helugang@uniontech.com Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_gt_freq.c | 4 ++-- drivers/gpu/drm/xe/xe_gt_sysfs.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_gt_freq.c b/drivers/gpu/drm/xe/xe_gt_freq.c index 552435951f11..6bd39b2c5003 100644 --- a/drivers/gpu/drm/xe/xe_gt_freq.c +++ b/drivers/gpu/drm/xe/xe_gt_freq.c @@ -237,11 +237,11 @@ int xe_gt_freq_init(struct xe_gt *gt) if (!gt->freq) return -ENOMEM; - err = devm_add_action(xe->drm.dev, freq_fini, gt->freq); + err = sysfs_create_files(gt->freq, freq_attrs); if (err) return err; - err = sysfs_create_files(gt->freq, freq_attrs); + err = devm_add_action_or_reset(xe->drm.dev, freq_fini, gt->freq); if (err) return err; diff --git a/drivers/gpu/drm/xe/xe_gt_sysfs.c b/drivers/gpu/drm/xe/xe_gt_sysfs.c index a05c3699e8b9..ec2b8246204b 100644 --- a/drivers/gpu/drm/xe/xe_gt_sysfs.c +++ b/drivers/gpu/drm/xe/xe_gt_sysfs.c @@ -51,5 +51,5 @@ int xe_gt_sysfs_init(struct xe_gt *gt) gt->sysfs = &kg->base; - return devm_add_action(xe->drm.dev, gt_sysfs_fini, gt); + return devm_add_action_or_reset(xe->drm.dev, gt_sysfs_fini, gt); } -- 2.51.0 From 37173392741c425191b959acb3adf70c9a4610c0 Mon Sep 17 00:00:00 2001 From: Matthew Auld Date: Mon, 16 Sep 2024 09:49:12 +0100 Subject: [PATCH 13/16] drm/xe/vram: fix ccs offset calculation Spec says SW is expected to round up to the nearest 128K, if not already aligned for the CC unit view of CCS. We are seeing the assert sometimes pop on BMG to tell us that there is a hole between GSM and CCS, as well as popping other asserts with having a vram size with strange alignment, which is likely caused by misaligned offset here. v2 (Shuicheng): - Do the round_up() on final SW address. 
BSpec: 68023 Fixes: b5c2ca0372dc ("drm/xe/xe2hpg: Determine flat ccs offset for vram") Signed-off-by: Matthew Auld Cc: Himal Prasad Ghimiray Cc: Akshata Jahagirdar Cc: Lucas De Marchi Cc: Shuicheng Lin Cc: Matt Roper Cc: stable@vger.kernel.org # v6.10+ Reviewed-by: Himal Prasad Ghimiray Tested-by: Shuicheng Lin Reviewed-by: Lucas De Marchi Link: https://patchwork.freedesktop.org/patch/msgid/20240916084911.13119-2-matthew.auld@intel.com Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_vram.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/xe/xe_vram.c b/drivers/gpu/drm/xe/xe_vram.c index 7e765b1499b1..2a623bfcda7e 100644 --- a/drivers/gpu/drm/xe/xe_vram.c +++ b/drivers/gpu/drm/xe/xe_vram.c @@ -182,6 +182,7 @@ static inline u64 get_flat_ccs_offset(struct xe_gt *gt, u64 tile_size) offset = offset_hi << 32; /* HW view bits 39:32 */ offset |= offset_lo << 6; /* HW view bits 31:6 */ offset *= num_enabled; /* convert to SW view */ + offset = round_up(offset, SZ_128K); /* SW must round up to nearest 128K */ /* We don't expect any holes */ xe_assert_msg(xe, offset == (xe_mmio_read64_2x32(&gt_to_tile(gt)->mmio, GSMBASE) - -- 2.51.0 From bc6763187270154ab48715d26bc129eba34e471a Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Mon, 16 Sep 2024 12:21:49 -0700 Subject: [PATCH 14/16] drm/xe/rtp: Remove unneeded semicolon Fix coccicheck report with regard to unneeded semicolon. 
This is currently the only case according to make coccicheck \ MODE=report \ COCCI=scripts/coccinelle/misc/semicolon.cocci \ M=drivers/gpu/drm/xe Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202409151152.pJ4ukp5k-lkp@intel.com/ Reviewed-by: Jagmeet Randhawa Link: https://patchwork.freedesktop.org/patch/msgid/20240916192149.855996-1-lucas.demarchi@intel.com Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_rtp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_rtp.c b/drivers/gpu/drm/xe/xe_rtp.c index 86c705d18c0d..b13d4d62f0b1 100644 --- a/drivers/gpu/drm/xe/xe_rtp.c +++ b/drivers/gpu/drm/xe/xe_rtp.c @@ -196,7 +196,7 @@ static void rtp_get_context(struct xe_rtp_process_ctx *ctx, *gt = (*hwe)->gt; *xe = gt_to_xe(*gt); break; - }; + } } /** -- 2.51.0 From 8a677d5b0a20ed4715cb4a8b0f8a08712ec0997c Mon Sep 17 00:00:00 2001 From: Rodrigo Vivi Date: Tue, 17 Sep 2024 16:32:43 -0400 Subject: [PATCH 15/16] drm/xe/display: Remove i915_drv.h include Change HAS_DISPLAY towards intel_display and remove one of the last includes of i915_drv.h in Xe. 
Reviewed-by: Jani Nikula Link: https://patchwork.freedesktop.org/patch/msgid/20240917203243.659393-1-rodrigo.vivi@intel.com Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/display/xe_display.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/gpu/drm/xe/display/xe_display.c b/drivers/gpu/drm/xe/display/xe_display.c index 86009a17e4c2..94ac537f2514 100644 --- a/drivers/gpu/drm/xe/display/xe_display.c +++ b/drivers/gpu/drm/xe/display/xe_display.c @@ -13,7 +13,6 @@ #include #include "soc/intel_dram.h" -#include "i915_drv.h" /* FIXME: HAS_DISPLAY() depends on this */ #include "intel_acpi.h" #include "intel_audio.h" #include "intel_bw.h" @@ -34,7 +33,7 @@ static bool has_display(struct xe_device *xe) { - return HAS_DISPLAY(xe); + return HAS_DISPLAY(&xe->display); } /** -- 2.51.0 From ec2d1539e159f53eae708e194c449cfefa004994 Mon Sep 17 00:00:00 2001 From: Rodrigo Vivi Date: Thu, 12 Sep 2024 17:45:07 -0400 Subject: [PATCH 16/16] drm/xe: Restore pci state upon resume The pci state was saved, but not restored. Restore right after the power state transition request like every other driver. v2: Use right fixes tag, since this was there initially, but accidentally removed. Fixes: f6761c68c0ac ("drm/xe/display: Improve s2idle handling.") Cc: Maarten Lankhorst Cc: Lucas De Marchi Reviewed-by: Jonathan Cavitt Signed-off-by: Rodrigo Vivi Link: https://patchwork.freedesktop.org/patch/msgid/20240912214507.456897-1-rodrigo.vivi@intel.com Signed-off-by: Maarten Lankhorst --- drivers/gpu/drm/xe/xe_pci.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/xe/xe_pci.c b/drivers/gpu/drm/xe/xe_pci.c index a1d08e20cd34..2c2d269ebe8e 100644 --- a/drivers/gpu/drm/xe/xe_pci.c +++ b/drivers/gpu/drm/xe/xe_pci.c @@ -944,6 +944,8 @@ static int xe_pci_resume(struct device *dev) if (err) return err; + pci_restore_state(pdev); + err = pci_enable_device(pdev); if (err) return err; -- 2.51.0