From: Graham Sider Date: Mon, 6 Feb 2023 19:04:42 +0000 (-0500) Subject: drm/amdgpu/bu: Add use_mtype_cc_wa module param X-Git-Tag: v6.5-rc1~153^2~7^2~339 X-Git-Url: https://www.infradead.org/git/?a=commitdiff_plain;h=895797d9193b38e759bc01268a8e3887e521f682;p=linux.git drm/amdgpu/bu: Add use_mtype_cc_wa module param By default, set use_mtype_cc_wa to 1 to set PTE coherence flag MTYPE_CC instead of MTYPE_RW by default. This is required for the time being to mitigate a bug causing XCCs to hit stale data due to TCC marking fully dirty lines as exclusive. Signed-off-by: Graham Sider Reviewed-by: Joseph Greathouse Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index cb9373f8c25a..cd2a29a7e26d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -212,6 +212,7 @@ extern int amdgpu_noretry; extern int amdgpu_force_asic_type; extern int amdgpu_smartshift_bias; extern int amdgpu_use_xgmi_p2p; +extern bool amdgpu_use_mtype_cc_wa; #ifdef CONFIG_HSA_AMD extern int sched_policy; extern bool debug_evictions; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index da4e50aef95a..8bc37826a99f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -822,6 +822,13 @@ MODULE_PARM_DESC(no_queue_eviction_on_vm_fault, "No queue eviction on VM fault ( module_param_named(no_queue_eviction_on_vm_fault, amdgpu_no_queue_eviction_on_vm_fault, int, 0444); #endif +/** + * DOC: use_mtype_cc_wa (bool) + */ +bool amdgpu_use_mtype_cc_wa = true; +MODULE_PARM_DESC(use_mtype_cc_wa, "Use MTYPE_CC workaround (0 = use MTYPE_RW where applicable, 1 = use MTYPE_CC where applicable (default))"); +module_param_named(use_mtype_cc_wa, amdgpu_use_mtype_cc_wa, bool, 0444); + /** * DOC: pcie_p2p (bool) * Enable PCIe P2P (requires large-BAR). Default value: true (on) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index 2eb67b53e497..8623b93c05ee 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -1187,6 +1187,7 @@ static void gmc_v9_0_get_coherence_flags(struct amdgpu_device *adev, bool coherent = bo->flags & AMDGPU_GEM_CREATE_COHERENT; bool uncached = bo->flags & AMDGPU_GEM_CREATE_UNCACHED; unsigned int mtype; + unsigned int mtype_default; bool snoop = false; switch (adev->ip_versions[GC_HWIP][0]) { @@ -1230,7 +1231,10 @@ static void gmc_v9_0_get_coherence_flags(struct amdgpu_device *adev, /* FIXME: Needs more work for handling multiple memory * partitions (> NPS1 mode) e.g. NPS4 for both APU and dGPU * modes. + * FIXME: Temporarily using MTYPE_CC instead of MTYPE_RW where applicable. + * To force use of MTYPE_RW, set use_mtype_cc_wa=0 */ + mtype_default = amdgpu_use_mtype_cc_wa ? MTYPE_CC : MTYPE_RW; snoop = true; if (uncached) { mtype = MTYPE_UC; @@ -1245,14 +1249,14 @@ static void gmc_v9_0_get_coherence_flags(struct amdgpu_device *adev, * socket should be treated as remote access so MTYPE_RW * cannot be used always. */ - mtype = MTYPE_RW; + mtype = mtype_default; } else if (adev->flags & AMD_IS_APU) { /* APU on carve out mode */ - mtype = MTYPE_RW; + mtype = mtype_default; } else { /* dGPU */ if (is_vram && bo_adev == adev) - mtype = MTYPE_RW; + mtype = mtype_default; else if (is_vram) mtype = MTYPE_NC; else diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index 2b2129dd1e4a..477ef9294203 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -1198,9 +1198,12 @@ svm_range_get_pte_flags(struct kfd_node *node, if (uncached) { mapping_flags |= AMDGPU_VM_MTYPE_UC; } else if (domain == SVM_RANGE_VRAM_DOMAIN) { - /* local HBM region close to partition */ + /* local HBM region close to partition + * FIXME: Temporarily using MTYPE_CC instead of MTYPE_RW where applicable. + * To force use of MTYPE_RW, set use_mtype_cc_wa=0 + */ if (bo_node == node) - mapping_flags |= AMDGPU_VM_MTYPE_RW; + mapping_flags |= amdgpu_use_mtype_cc_wa ? AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_RW; /* local HBM region far from partition or remote XGMI GPU */ else if (svm_nodes_in_same_hive(bo_node, node)) mapping_flags |= AMDGPU_VM_MTYPE_NC;