From 8949843762631d9d0fc526dfb61a272dca29fc6f Mon Sep 17 00:00:00 2001 From: Arunpravin Paneer Selvam Date: Wed, 30 Oct 2024 10:38:46 +0530 Subject: [PATCH 01/16] drm/amdgpu: Enable userq fence interrupt support MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Add support to handle the userqueue protected fence signal hardware interrupt. Create a xarray which maps the doorbell index to the fence driver address. This would help to retrieve the fence driver information when an userq fence interrupt is triggered. Firmware sends the doorbell offset value and this info is compared with the queue's mqd doorbell offset value. If they are same, we process the userq fence interrupt. v1:(Christian): - use xa_load to extract the fence driver. - move the amdgpu_userq_fence_driver_process call within the xa_lock as there is a chance that fence_drv might be freed. Signed-off-by: Arunpravin Paneer Selvam Acked-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 ++ .../gpu/drm/amd/amdgpu/amdgpu_userq_fence.c | 6 +++++ drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 25 +++++++++---------- 3 files changed, 20 insertions(+), 13 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index cfaa5d77b20a..fdd271edb0b9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4336,6 +4336,8 @@ int amdgpu_device_init(struct amdgpu_device *adev, spin_lock_init(&adev->virt.rlcg_reg_lock); spin_lock_init(&adev->wb.lock); + xa_init_flags(&adev->userq_xa, XA_FLAGS_LOCK_IRQ); + INIT_LIST_HEAD(&adev->reset_list); INIT_LIST_HEAD(&adev->ras_list); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c index 1a9565b61266..cd473c985e36 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c @@ -71,6 +71,7 @@ int amdgpu_userq_fence_driver_alloc(struct amdgpu_device *adev, struct amdgpu_usermode_queue *userq) { struct amdgpu_userq_fence_driver *fence_drv; + unsigned long flags; int r; fence_drv = kzalloc(sizeof(*fence_drv), GFP_KERNEL); @@ -98,6 +99,11 @@ int amdgpu_userq_fence_driver_alloc(struct amdgpu_device *adev, fence_drv->context = dma_fence_context_alloc(1); get_task_comm(fence_drv->timeline_name, current); + xa_lock_irqsave(&adev->userq_xa, flags); + __xa_store(&adev->userq_xa, userq->doorbell_index, + fence_drv, GFP_KERNEL); + xa_unlock_irqrestore(&adev->userq_xa, flags); + userq->fence_drv = fence_drv; return 0; diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c index 62a6da083a8f..b061b654d1ec 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c @@ -49,6 +49,7 @@ #include "nbio_v4_3.h" #include "mes_v11_0.h" #include "mes_v11_0_userqueue.h" +#include "amdgpu_userq_fence.h" #define GFX11_NUM_GFX_RINGS 1 #define GFX11_MEC_HPD_SIZE 2048 @@ -6334,25 +6335,23 @@ static int gfx_v11_0_eop_irq(struct amdgpu_device *adev, struct amdgpu_irq_src *source, struct amdgpu_iv_entry *entry) { - int i; + u32 doorbell_offset = entry->src_data[0]; u8 me_id, pipe_id, queue_id; struct amdgpu_ring *ring; - uint32_t mes_queue_id = entry->src_data[0]; + int i; DRM_DEBUG("IH: CP EOP\n"); - if (adev->enable_mes && (mes_queue_id & AMDGPU_FENCE_MES_QUEUE_FLAG)) { - struct amdgpu_mes_queue *queue; + if (adev->enable_mes && doorbell_offset) { + struct amdgpu_userq_fence_driver *fence_drv = NULL; + struct xarray *xa = &adev->userq_xa; + unsigned long flags; - mes_queue_id &= AMDGPU_FENCE_MES_QUEUE_ID_MASK; - - spin_lock(&adev->mes.queue_id_lock); - queue = idr_find(&adev->mes.queue_id_idr, mes_queue_id); - if (queue) { - DRM_DEBUG("process mes queue id = %d\n", mes_queue_id); - amdgpu_fence_process(queue->ring); - } - spin_unlock(&adev->mes.queue_id_lock); + xa_lock_irqsave(xa, flags); + fence_drv = xa_load(xa, doorbell_offset); + if (fence_drv) + amdgpu_userq_fence_driver_process(fence_drv); + xa_unlock_irqrestore(xa, flags); } else { me_id = (entry->ring_id & 0x0c) >> 2; pipe_id = (entry->ring_id & 0x03) >> 0; -- 2.51.0 From 70773bef4e091ff6d2a91e3dfb4f29013eb81f1f Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Wed, 25 Sep 2024 18:09:49 +0200 Subject: [PATCH 02/16] drm/amdgpu: update userqueue BOs and PDs MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit This patch updates the VM_IOCTL to allow userspace to synchronize the mapping/unmapping of a BO in the page table. The major changes are: - it adds a drm_timeline object as an input parameter to the VM IOCTL. - this object is used by the kernel to sync the update of the BO in the page table during the mapping of the object. - the kernel also synchronizes the tlb flush of the page table entry of this object during the unmapping (Added in this series: https://patchwork.freedesktop.org/series/131276/ and https://patchwork.freedesktop.org/patch/584182/) - the userspace can wait on this timeline, and then the BO is ready to be consumed by the GPU. The UAPI for the same has been approved here: https://gitlab.freedesktop.org/mesa/drm/-/merge_requests/392 V2: - remove the eviction fence coupling V3: - added the drm timeline support instead of input/output fence (Christian) V4: - made timeline 64-bit (Christian) - bug fix (Arvind) V5: GLCTS bug fix (Arvind) V6: Rename syncobj_handle -> timeline_syncobj_out Rename point -> timeline_point_in (Marek) V7: Addressed review comments from Christian: - do not send last_update fence in case of vm_clear_freed, instead return the fence from gen_va_update_vm - move the functions to update bo_mapping to amdgpu_gem.c - do not use amdgpu_userq_update_vm anymore in userq_create() V8: Addressed review comments from Christian: - Split amdgpu_gem_update_bo_mapping function. - amdgpu_gem_va_update_vm should return stub for error. V9: Addressed review comments from Christian: - Rename the function amdgpu_gem_update_timeline_node. - amdgpu_gem_update_timeline_node should be void function. - when timeline_point is zero don't allocate a chain and call drm_syncobj_replace_fence() instead of drm_syncobj_add_point(). V11: rebase V12: Fix 32-bit holes issue in sturct drm_amdgpu_gem_va. V13: Fix the review comment by renaming timeline syncobj (Marek) Cc: Alex Deucher Cc: Felix Kuehling Cc: Christian König Reviewed-by: Alex Deucher Reviewed-by: Christian König Signed-off-by: Arvind Yadav Signed-off-by: Shashank Sharma Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 113 ++++++++++++++++++++++-- include/uapi/drm/amdgpu_drm.h | 9 ++ 2 files changed, 113 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c index 69429df09477..542a1b70f2ee 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c @@ -36,6 +36,7 @@ #include #include #include +#include #include "amdgpu.h" #include "amdgpu_display.h" @@ -44,6 +45,75 @@ #include "amdgpu_xgmi.h" #include "amdgpu_vm.h" +static int +amdgpu_gem_update_timeline_node(struct drm_file *filp, + uint32_t syncobj_handle, + uint64_t point, + struct drm_syncobj **syncobj, + struct dma_fence_chain **chain) +{ + if (!syncobj_handle) + return 0; + + /* Find the sync object */ + *syncobj = drm_syncobj_find(filp, syncobj_handle); + if (!*syncobj) + return -ENOENT; + + if (!point) + return 0; + + /* Allocate the chain node */ + *chain = dma_fence_chain_alloc(); + if (!*chain) { + drm_syncobj_put(*syncobj); + return -ENOMEM; + } + + return 0; +} + +static void +amdgpu_gem_update_bo_mapping(struct drm_file *filp, + struct amdgpu_bo_va *bo_va, + uint32_t operation, + uint64_t point, + struct dma_fence *fence, + struct drm_syncobj *syncobj, + struct dma_fence_chain *chain) +{ + struct amdgpu_bo *bo = bo_va ? bo_va->base.bo : NULL; + struct amdgpu_fpriv *fpriv = filp->driver_priv; + struct amdgpu_vm *vm = &fpriv->vm; + struct dma_fence *last_update; + + if (!syncobj) + return; + + /* Find the last update fence */ + switch (operation) { + case AMDGPU_VA_OP_MAP: + case AMDGPU_VA_OP_REPLACE: + if (bo && (bo->tbo.base.resv == vm->root.bo->tbo.base.resv)) + last_update = vm->last_update; + else + last_update = bo_va->last_pt_update; + break; + case AMDGPU_VA_OP_UNMAP: + case AMDGPU_VA_OP_CLEAR: + last_update = fence; + break; + default: + return; + } + + /* Add fence to timeline */ + if (!point) + drm_syncobj_replace_fence(syncobj, last_update); + else + drm_syncobj_add_point(syncobj, chain, last_update, point); +} + static vm_fault_t amdgpu_gem_fault(struct vm_fault *vmf) { struct ttm_buffer_object *bo = vmf->vma->vm_private_data; @@ -638,18 +708,23 @@ out: * * Update the bo_va directly after setting its address. Errors are not * vital here, so they are not reported back to userspace. + * + * Returns resulting fence if freed BO(s) got cleared from the PT. + * otherwise stub fence in case of error. */ -static void amdgpu_gem_va_update_vm(struct amdgpu_device *adev, - struct amdgpu_vm *vm, - struct amdgpu_bo_va *bo_va, - uint32_t operation) +static struct dma_fence * +amdgpu_gem_va_update_vm(struct amdgpu_device *adev, + struct amdgpu_vm *vm, + struct amdgpu_bo_va *bo_va, + uint32_t operation) { + struct dma_fence *fence = dma_fence_get_stub(); int r; if (!amdgpu_vm_ready(vm)) - return; + return fence; - r = amdgpu_vm_clear_freed(adev, vm, NULL); + r = amdgpu_vm_clear_freed(adev, vm, &fence); if (r) goto error; @@ -665,6 +740,8 @@ static void amdgpu_gem_va_update_vm(struct amdgpu_device *adev, error: if (r && r != -ERESTARTSYS) DRM_ERROR("Couldn't update BO_VA (%d)\n", r); + + return fence; } /** @@ -713,6 +790,9 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data, struct amdgpu_fpriv *fpriv = filp->driver_priv; struct amdgpu_bo *abo; struct amdgpu_bo_va *bo_va; + struct drm_syncobj *timeline_syncobj = NULL; + struct dma_fence_chain *timeline_chain = NULL; + struct dma_fence *fence; struct drm_exec exec; uint64_t va_flags; uint64_t vm_size; @@ -827,9 +907,24 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data, default: break; } - if (!r && !(args->flags & AMDGPU_VM_DELAY_UPDATE) && !adev->debug_vm) - amdgpu_gem_va_update_vm(adev, &fpriv->vm, bo_va, - args->operation); + if (!r && !(args->flags & AMDGPU_VM_DELAY_UPDATE) && !adev->debug_vm) { + + r = amdgpu_gem_update_timeline_node(filp, + args->vm_timeline_syncobj_out, + args->vm_timeline_point, + &timeline_syncobj, + &timeline_chain); + + fence = amdgpu_gem_va_update_vm(adev, &fpriv->vm, bo_va, + args->operation); + + if (!r) + amdgpu_gem_update_bo_mapping(filp, bo_va, + args->operation, + args->vm_timeline_point, + fence, timeline_syncobj, + timeline_chain); + } error: drm_exec_fini(&exec); diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h index 1a21259cb8c4..ca82935ff93a 100644 --- a/include/uapi/drm/amdgpu_drm.h +++ b/include/uapi/drm/amdgpu_drm.h @@ -857,6 +857,15 @@ struct drm_amdgpu_gem_va { __u64 offset_in_bo; /** Specify mapping size. Must be correctly aligned. */ __u64 map_size; + /** + * vm_timeline_point is a sequence number used to add new timeline point. + */ + __u64 vm_timeline_point; + /** + * The vm page table update fence is installed in given vm_timeline_syncobj_out + * at vm_timeline_point. + */ + __u32 vm_timeline_syncobj_out; }; #define AMDGPU_HW_IP_GFX 0 -- 2.51.0 From ac4a1f7f13309f8235a0c4719c34094de3303dfd Mon Sep 17 00:00:00 2001 From: Arunpravin Paneer Selvam Date: Wed, 30 Oct 2024 10:46:49 +0530 Subject: [PATCH 03/16] drm/amdgpu: Remove the MES self test MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Remove MES self test as this conflicts the userqueue fence interrupts. v2:(Christian) - remove the amdgpu_mes_self_test() function and any now unused code. Signed-off-by: Arunpravin Paneer Selvam Acked-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 - drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 169 --------------------- drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 2 - drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 14 +- drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 13 +- 5 files changed, 2 insertions(+), 199 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index fdd271edb0b9..4da18ed0beea 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -5144,9 +5144,6 @@ exit: } adev->in_suspend = false; - if (adev->enable_mes) - amdgpu_mes_self_test(adev); - if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) DRM_WARN("smart shift update failed\n"); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c index f8202e6ed4b1..acdca3110c24 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c @@ -1405,175 +1405,6 @@ out_unlock: return r; } -static int amdgpu_mes_test_create_gang_and_queues(struct amdgpu_device *adev, - int pasid, int *gang_id, - int queue_type, int num_queue, - struct amdgpu_ring **added_rings, - struct amdgpu_mes_ctx_data *ctx_data) -{ - struct amdgpu_ring *ring; - struct amdgpu_mes_gang_properties gprops = {0}; - int r, j; - - /* create a gang for the process */ - gprops.priority = AMDGPU_MES_PRIORITY_LEVEL_NORMAL; - gprops.gang_quantum = adev->mes.default_gang_quantum; - gprops.inprocess_gang_priority = AMDGPU_MES_PRIORITY_LEVEL_NORMAL; - gprops.priority_level = AMDGPU_MES_PRIORITY_LEVEL_NORMAL; - gprops.global_priority_level = AMDGPU_MES_PRIORITY_LEVEL_NORMAL; - - r = amdgpu_mes_add_gang(adev, pasid, &gprops, gang_id); - if (r) { - DRM_ERROR("failed to add gang\n"); - return r; - } - - /* create queues for the gang */ - for (j = 0; j < num_queue; j++) { - r = amdgpu_mes_add_ring(adev, *gang_id, queue_type, j, - ctx_data, &ring); - if (r) { - DRM_ERROR("failed to add ring\n"); - break; - } - - DRM_INFO("ring %s was added\n", ring->name); - added_rings[j] = ring; - } - - return 0; -} - -static int amdgpu_mes_test_queues(struct amdgpu_ring **added_rings) -{ - struct amdgpu_ring *ring; - int i, r; - - for (i = 0; i < AMDGPU_MES_CTX_MAX_RINGS; i++) { - ring = added_rings[i]; - if (!ring) - continue; - - r = amdgpu_ring_test_helper(ring); - if (r) - return r; - - r = amdgpu_ring_test_ib(ring, 1000 * 10); - if (r) { - DRM_DEV_ERROR(ring->adev->dev, - "ring %s ib test failed (%d)\n", - ring->name, r); - return r; - } else - DRM_INFO("ring %s ib test pass\n", ring->name); - } - - return 0; -} - -int amdgpu_mes_self_test(struct amdgpu_device *adev) -{ - struct amdgpu_vm *vm = NULL; - struct amdgpu_mes_ctx_data ctx_data = {0}; - struct amdgpu_ring *added_rings[AMDGPU_MES_CTX_MAX_RINGS] = { NULL }; - int gang_ids[3] = {0}; - int queue_types[][2] = { { AMDGPU_RING_TYPE_GFX, 1 }, - { AMDGPU_RING_TYPE_COMPUTE, 1 }, - { AMDGPU_RING_TYPE_SDMA, 1} }; - int i, r, pasid, k = 0; - - pasid = amdgpu_pasid_alloc(16); - if (pasid < 0) { - dev_warn(adev->dev, "No more PASIDs available!"); - pasid = 0; - } - - vm = kzalloc(sizeof(*vm), GFP_KERNEL); - if (!vm) { - r = -ENOMEM; - goto error_pasid; - } - - r = amdgpu_vm_init(adev, vm, -1); - if (r) { - DRM_ERROR("failed to initialize vm\n"); - goto error_pasid; - } - - r = amdgpu_mes_ctx_alloc_meta_data(adev, &ctx_data); - if (r) { - DRM_ERROR("failed to alloc ctx meta data\n"); - goto error_fini; - } - - ctx_data.meta_data_gpu_addr = AMDGPU_VA_RESERVED_BOTTOM; - r = amdgpu_mes_ctx_map_meta_data(adev, vm, &ctx_data); - if (r) { - DRM_ERROR("failed to map ctx meta data\n"); - goto error_vm; - } - - r = amdgpu_mes_create_process(adev, pasid, vm); - if (r) { - DRM_ERROR("failed to create MES process\n"); - goto error_vm; - } - - for (i = 0; i < ARRAY_SIZE(queue_types); i++) { - /* On GFX v10.3, fw hasn't supported to map sdma queue. */ - if (amdgpu_ip_version(adev, GC_HWIP, 0) >= - IP_VERSION(10, 3, 0) && - amdgpu_ip_version(adev, GC_HWIP, 0) < - IP_VERSION(11, 0, 0) && - queue_types[i][0] == AMDGPU_RING_TYPE_SDMA) - continue; - - r = amdgpu_mes_test_create_gang_and_queues(adev, pasid, - &gang_ids[i], - queue_types[i][0], - queue_types[i][1], - &added_rings[k], - &ctx_data); - if (r) - goto error_queues; - - k += queue_types[i][1]; - } - - /* start ring test and ib test for MES queues */ - amdgpu_mes_test_queues(added_rings); - -error_queues: - /* remove all queues */ - for (i = 0; i < ARRAY_SIZE(added_rings); i++) { - if (!added_rings[i]) - continue; - amdgpu_mes_remove_ring(adev, added_rings[i]); - } - - for (i = 0; i < ARRAY_SIZE(gang_ids); i++) { - if (!gang_ids[i]) - continue; - amdgpu_mes_remove_gang(adev, gang_ids[i]); - } - - amdgpu_mes_destroy_process(adev, pasid); - -error_vm: - amdgpu_mes_ctx_unmap_meta_data(adev, &ctx_data); - -error_fini: - amdgpu_vm_fini(adev, vm); - -error_pasid: - if (pasid) - amdgpu_pasid_free(pasid); - - amdgpu_mes_ctx_free_meta_data(&ctx_data); - kfree(vm); - return 0; -} - int amdgpu_mes_init_microcode(struct amdgpu_device *adev, int pipe) { const struct mes_firmware_header_v1_0 *mes_hdr; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h index 52dd54a32fb4..78362a838212 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h @@ -470,8 +470,6 @@ int amdgpu_mes_ctx_map_meta_data(struct amdgpu_device *adev, int amdgpu_mes_ctx_unmap_meta_data(struct amdgpu_device *adev, struct amdgpu_mes_ctx_data *ctx_data); -int amdgpu_mes_self_test(struct amdgpu_device *adev); - int amdgpu_mes_doorbell_process_slice(struct amdgpu_device *adev); /* diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c index 4cfd86aa2ea3..ccc19a40f03d 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c @@ -1712,22 +1712,10 @@ static int mes_v11_0_early_init(struct amdgpu_ip_block *ip_block) return 0; } -static int mes_v11_0_late_init(struct amdgpu_ip_block *ip_block) -{ - struct amdgpu_device *adev = ip_block->adev; - - /* it's only intended for use in mes_self_test case, not for s0ix and reset */ - if (!amdgpu_in_reset(adev) && !adev->in_s0ix && !adev->in_suspend && - (amdgpu_ip_version(adev, GC_HWIP, 0) != IP_VERSION(11, 0, 3))) - amdgpu_mes_self_test(adev); - - return 0; -} - static const struct amd_ip_funcs mes_v11_0_ip_funcs = { .name = "mes_v11_0", .early_init = mes_v11_0_early_init, - .late_init = mes_v11_0_late_init, + .late_init = NULL, .sw_init = mes_v11_0_sw_init, .sw_fini = mes_v11_0_sw_fini, .hw_init = mes_v11_0_hw_init, diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c index 62aba0b5dbe2..801928555eff 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c @@ -1820,21 +1820,10 @@ static int mes_v12_0_early_init(struct amdgpu_ip_block *ip_block) return 0; } -static int mes_v12_0_late_init(struct amdgpu_ip_block *ip_block) -{ - struct amdgpu_device *adev = ip_block->adev; - - /* it's only intended for use in mes_self_test case, not for s0ix and reset */ - if (!amdgpu_in_reset(adev) && !adev->in_s0ix && !adev->in_suspend) - amdgpu_mes_self_test(adev); - - return 0; -} - static const struct amd_ip_funcs mes_v12_0_ip_funcs = { .name = "mes_v12_0", .early_init = mes_v12_0_early_init, - .late_init = mes_v12_0_late_init, + .late_init = NULL, .sw_init = mes_v12_0_sw_init, .sw_fini = mes_v12_0_sw_fini, .hw_init = mes_v12_0_hw_init, -- 2.51.0 From e7cf21fbb2773cfcf70189be524041b21e6509dc Mon Sep 17 00:00:00 2001 From: Arunpravin Paneer Selvam Date: Wed, 30 Oct 2024 10:51:58 +0530 Subject: [PATCH 04/16] drm/amdgpu: Few optimization and fixes for userq fence driver MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Few optimization and fixes for userq fence driver. v1:(Christian): - Remove unnecessary comments. - In drm_exec_init call give num_bo_handles as last parameter it would making allocation of the array more efficient - Handle return value of __xa_store() and improve the error handling of amdgpu_userq_fence_driver_alloc(). v2:(Christian): - Revert userq_xa xarray init to XA_FLAGS_LOCK_IRQ. - move the xa_unlock before the error check of the call xa_err(__xa_store()) and moved this change to a separate patch as this is adding a missing error handling. - Removed the unnecessary comments. Signed-off-by: Arunpravin Paneer Selvam Reviewed-by: Christian König Signed-off-by: Alex Deucher --- .../gpu/drm/amd/amdgpu/amdgpu_userq_fence.c | 44 ++++++++++++------- .../gpu/drm/amd/amdgpu/amdgpu_userq_fence.h | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c | 6 +-- .../gpu/drm/amd/include/amdgpu_userqueue.h | 2 +- 4 files changed, 32 insertions(+), 22 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c index cd473c985e36..b475643ab5ec 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c @@ -77,7 +77,8 @@ int amdgpu_userq_fence_driver_alloc(struct amdgpu_device *adev, fence_drv = kzalloc(sizeof(*fence_drv), GFP_KERNEL); if (!fence_drv) { DRM_ERROR("Failed to allocate memory for fence driver\n"); - return -ENOMEM; + r = -ENOMEM; + goto free_fence_drv; } /* Acquire seq64 memory */ @@ -85,7 +86,8 @@ int amdgpu_userq_fence_driver_alloc(struct amdgpu_device *adev, &fence_drv->cpu_addr); if (r) { kfree(fence_drv); - return -ENOMEM; + r = -ENOMEM; + goto free_seq64; } memset(fence_drv->cpu_addr, 0, sizeof(u64)); @@ -95,7 +97,7 @@ int amdgpu_userq_fence_driver_alloc(struct amdgpu_device *adev, spin_lock_init(&fence_drv->fence_list_lock); fence_drv->adev = adev; - fence_drv->uq_fence_drv_xa_ref = &userq->uq_fence_drv_xa; + fence_drv->fence_drv_xa_ptr = &userq->fence_drv_xa; fence_drv->context = dma_fence_context_alloc(1); get_task_comm(fence_drv->timeline_name, current); @@ -107,6 +109,13 @@ int amdgpu_userq_fence_driver_alloc(struct amdgpu_device *adev, userq->fence_drv = fence_drv; return 0; + +free_seq64: + amdgpu_seq64_free(adev, fence_drv->gpu_addr); +free_fence_drv: + kfree(fence_drv); + + return r; } void amdgpu_userq_fence_driver_process(struct amdgpu_userq_fence_driver *fence_drv) @@ -148,7 +157,7 @@ void amdgpu_userq_fence_driver_destroy(struct kref *ref) struct amdgpu_device *adev = fence_drv->adev; struct amdgpu_userq_fence *fence, *tmp; struct xarray *xa = &adev->userq_xa; - unsigned long index; + unsigned long index, flags; struct dma_fence *f; spin_lock(&fence_drv->fence_list_lock); @@ -165,11 +174,11 @@ void amdgpu_userq_fence_driver_destroy(struct kref *ref) } spin_unlock(&fence_drv->fence_list_lock); - xa_lock(xa); + xa_lock_irqsave(xa, flags); xa_for_each(xa, index, xa_fence_drv) if (xa_fence_drv == fence_drv) __xa_erase(xa, index); - xa_unlock(xa); + xa_unlock_irqrestore(xa, flags); /* Free seq64 memory */ amdgpu_seq64_free(adev, fence_drv->gpu_addr); @@ -213,12 +222,12 @@ int amdgpu_userq_fence_create(struct amdgpu_usermode_queue *userq, amdgpu_userq_fence_driver_get(fence_drv); dma_fence_get(fence); - if (!xa_empty(&userq->uq_fence_drv_xa)) { + if (!xa_empty(&userq->fence_drv_xa)) { struct amdgpu_userq_fence_driver *stored_fence_drv; unsigned long index, count = 0; int i = 0; - xa_for_each(&userq->uq_fence_drv_xa, index, stored_fence_drv) + xa_for_each(&userq->fence_drv_xa, index, stored_fence_drv) count++; userq_fence->fence_drv_array = @@ -227,9 +236,9 @@ int amdgpu_userq_fence_create(struct amdgpu_usermode_queue *userq, GFP_KERNEL); if (userq_fence->fence_drv_array) { - xa_for_each(&userq->uq_fence_drv_xa, index, stored_fence_drv) { + xa_for_each(&userq->fence_drv_xa, index, stored_fence_drv) { userq_fence->fence_drv_array[i] = stored_fence_drv; - xa_erase(&userq->uq_fence_drv_xa, index); + xa_erase(&userq->fence_drv_xa, index); i++; } } @@ -379,7 +388,6 @@ int amdgpu_userq_signal_ioctl(struct drm_device *dev, void *data, struct drm_exec exec; u64 wptr; - /* Array of syncobj handles */ num_syncobj_handles = args->num_syncobj_handles; syncobj_handles = memdup_user(u64_to_user_ptr(args->syncobj_handles_array), sizeof(u32) * num_syncobj_handles); @@ -401,7 +409,6 @@ int amdgpu_userq_signal_ioctl(struct drm_device *dev, void *data, } } - /* Array of bo handles */ num_bo_handles = args->num_bo_handles; bo_handles = memdup_user(u64_to_user_ptr(args->bo_handles_array), sizeof(u32) * num_bo_handles); @@ -423,7 +430,9 @@ int amdgpu_userq_signal_ioctl(struct drm_device *dev, void *data, } } - drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, 0); + drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, num_bo_handles); + + /* Lock all BOs with retry handling */ drm_exec_until_all_locked(&exec) { r = drm_exec_prepare_array(&exec, gobj, num_bo_handles, 1); drm_exec_retry_on_contention(&exec); @@ -520,7 +529,6 @@ int amdgpu_userq_wait_ioctl(struct drm_device *dev, void *data, goto free_timeline_handles; } - /* Array of GEM object handles */ gobj = kmalloc_array(num_bo_handles, sizeof(*gobj), GFP_KERNEL); if (!gobj) { r = -ENOMEM; @@ -535,7 +543,9 @@ int amdgpu_userq_wait_ioctl(struct drm_device *dev, void *data, } } - drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, 0); + drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, num_bo_handles); + + /* Lock all BOs with retry handling */ drm_exec_until_all_locked(&exec) { r = drm_exec_prepare_array(&exec, gobj, num_bo_handles, 0); drm_exec_retry_on_contention(&exec); @@ -706,8 +716,8 @@ int amdgpu_userq_wait_ioctl(struct drm_device *dev, void *data, * Otherwise, we would gather those references until we don't * have any more space left and crash. */ - if (fence_drv->uq_fence_drv_xa_ref) { - r = xa_alloc(fence_drv->uq_fence_drv_xa_ref, &index, fence_drv, + if (fence_drv->fence_drv_xa_ptr) { + r = xa_alloc(fence_drv->fence_drv_xa_ptr, &index, fence_drv, xa_limit_32b, GFP_KERNEL); if (r) goto free_fences; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.h index f72424248cc5..89c82ba38b50 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.h @@ -54,7 +54,7 @@ struct amdgpu_userq_fence_driver { spinlock_t fence_list_lock; struct list_head fences; struct amdgpu_device *adev; - struct xarray *uq_fence_drv_xa_ref; + struct xarray *fence_drv_xa_ptr; char timeline_name[TASK_COMM_LEN]; }; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c index 34c1297d7970..15c568fb062b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c @@ -47,8 +47,8 @@ static void amdgpu_userq_walk_and_drop_fence_drv(struct xarray *xa) static void amdgpu_userq_fence_driver_free(struct amdgpu_usermode_queue *userq) { - amdgpu_userq_walk_and_drop_fence_drv(&userq->uq_fence_drv_xa); - xa_destroy(&userq->uq_fence_drv_xa); + amdgpu_userq_walk_and_drop_fence_drv(&userq->fence_drv_xa); + xa_destroy(&userq->fence_drv_xa); /* Drop the fence_drv reference held by user queue */ amdgpu_userq_fence_driver_put(userq->fence_drv); } @@ -260,7 +260,7 @@ amdgpu_userqueue_create(struct drm_file *filp, union drm_amdgpu_userq *args) } queue->doorbell_index = index; - xa_init_flags(&queue->uq_fence_drv_xa, XA_FLAGS_ALLOC); + xa_init_flags(&queue->fence_drv_xa, XA_FLAGS_ALLOC); r = amdgpu_userq_fence_driver_alloc(adev, queue); if (r) { DRM_ERROR("Failed to alloc fence driver\n"); diff --git a/drivers/gpu/drm/amd/include/amdgpu_userqueue.h b/drivers/gpu/drm/amd/include/amdgpu_userqueue.h index 7df837fedce0..b942f3f5ea35 100644 --- a/drivers/gpu/drm/amd/include/amdgpu_userqueue.h +++ b/drivers/gpu/drm/amd/include/amdgpu_userqueue.h @@ -47,7 +47,7 @@ struct amdgpu_usermode_queue { struct amdgpu_userq_obj db_obj; struct amdgpu_userq_obj fw_obj; struct amdgpu_userq_obj wptr_obj; - struct xarray uq_fence_drv_xa; + struct xarray fence_drv_xa; struct amdgpu_userq_fence_driver *fence_drv; }; -- 2.51.0 From fbea3d3174f4215ca1c867a1c035c1fa5ad01015 Mon Sep 17 00:00:00 2001 From: Arunpravin Paneer Selvam Date: Wed, 30 Oct 2024 10:55:22 +0530 Subject: [PATCH 05/16] drm/amdgpu: Add the missing error handling for xa_store() call MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Add the missing error handling for xa_store() call in the function amdgpu_userq_fence_driver_alloc(). Signed-off-by: Arunpravin Paneer Selvam Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c index b475643ab5ec..969a3a75d815 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c @@ -102,9 +102,11 @@ int amdgpu_userq_fence_driver_alloc(struct amdgpu_device *adev, get_task_comm(fence_drv->timeline_name, current); xa_lock_irqsave(&adev->userq_xa, flags); - __xa_store(&adev->userq_xa, userq->doorbell_index, - fence_drv, GFP_KERNEL); + r = xa_err(__xa_store(&adev->userq_xa, userq->doorbell_index, + fence_drv, GFP_KERNEL)); xa_unlock_irqrestore(&adev->userq_xa, flags); + if (r) + goto free_seq64; userq->fence_drv = fence_drv; -- 2.51.0 From d8675102ba322a4ccd0dd8bc5adf799246314ddf Mon Sep 17 00:00:00 2001 From: Arunpravin Paneer Selvam Date: Wed, 30 Oct 2024 10:56:57 +0530 Subject: [PATCH 06/16] drm/amdgpu: add vm root BO lock before accessing the vm MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Add a vm root BO lock before accessing the userqueue VM. v1:(Christian) - Keep the VM locked until you are done with the mapping. - Grab a temporary BO reference, drop the VM lock and acquire the BO. When you are done with everything just drop the BO lock and then the temporary BO reference. Signed-off-by: Arunpravin Paneer Selvam Reviewed-by: Christian König Signed-off-by: Alex Deucher --- .../gpu/drm/amd/amdgpu/amdgpu_userq_fence.c | 24 ++++++++++++------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c index 969a3a75d815..6c9346f822c1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c @@ -321,7 +321,6 @@ static const struct dma_fence_ops amdgpu_userq_fence_ops = { /** * amdgpu_userq_fence_read_wptr - Read the userq wptr value * - * @filp: drm file private data structure * @queue: user mode queue structure pointer * @wptr: write pointer value * @@ -331,25 +330,29 @@ static const struct dma_fence_ops amdgpu_userq_fence_ops = { * * Returns wptr value on success, error on failure. */ -static int amdgpu_userq_fence_read_wptr(struct drm_file *filp, - struct amdgpu_usermode_queue *queue, +static int amdgpu_userq_fence_read_wptr(struct amdgpu_usermode_queue *queue, u64 *wptr) { - struct amdgpu_fpriv *fpriv = filp->driver_priv; struct amdgpu_bo_va_mapping *mapping; - struct amdgpu_vm *vm = &fpriv->vm; struct amdgpu_bo *bo; u64 addr, *ptr; int r; + r = amdgpu_bo_reserve(queue->vm->root.bo, false); + if (r) + return r; + addr = queue->userq_prop->wptr_gpu_addr; addr &= AMDGPU_GMC_HOLE_MASK; - mapping = amdgpu_vm_bo_lookup_mapping(vm, addr >> PAGE_SHIFT); - if (!mapping) + mapping = amdgpu_vm_bo_lookup_mapping(queue->vm, addr >> PAGE_SHIFT); + if (!mapping) { + DRM_ERROR("Failed to lookup amdgpu_bo_va_mapping\n"); return -EINVAL; + } - bo = mapping->bo_va->base.bo; + bo = amdgpu_bo_ref(mapping->bo_va->base.bo); + amdgpu_bo_unreserve(queue->vm->root.bo); r = amdgpu_bo_reserve(bo, true); if (r) { DRM_ERROR("Failed to reserve userqueue wptr bo"); @@ -366,11 +369,14 @@ static int amdgpu_userq_fence_read_wptr(struct drm_file *filp, amdgpu_bo_kunmap(bo); amdgpu_bo_unreserve(bo); + amdgpu_bo_unref(&bo); return 0; map_error: amdgpu_bo_unreserve(bo); + amdgpu_bo_unref(&bo); + return r; } @@ -449,7 +455,7 @@ int amdgpu_userq_signal_ioctl(struct drm_device *dev, void *data, goto exec_fini; } - r = amdgpu_userq_fence_read_wptr(filp, queue, &wptr); + r = amdgpu_userq_fence_read_wptr(queue, &wptr); if (r) goto exec_fini; -- 2.51.0 From cb4a73f46f253b5f7a30b1e0488c8ef2832e8747 Mon Sep 17 00:00:00 2001 From: Arunpravin Paneer Selvam Date: Wed, 30 Oct 2024 10:59:04 +0530 Subject: [PATCH 07/16] drm/amdgpu: Add separate array of read and write for BO handles MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Drop AMDGPU_USERQ_BO_WRITE as this should not be a global option of the IOCTL, It should be option per buffer. Hence adding separate array for read and write BO handles. v2(Marek): - Internal kernel details shouldn't be here. This file should only document the observed behavior, not the implementation . v3: - Fix DAL CI clang issue. v4: - Added Alex RB to merge the kernel UAPI changes since he has already approved the amdgpu_drm.h changes. Signed-off-by: Arunpravin Paneer Selvam Reviewed-by: Alex Deucher Acked-by: Christian König Suggested-by: Marek Olšák Suggested-by: Christian König Signed-off-by: Alex Deucher --- .../gpu/drm/amd/amdgpu/amdgpu_userq_fence.c | 246 +++++++++++++----- include/uapi/drm/amdgpu_drm.h | 50 ++-- 2 files changed, 215 insertions(+), 81 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c index 6c9346f822c1..9f1ca8659335 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c @@ -386,12 +386,14 @@ int amdgpu_userq_signal_ioctl(struct drm_device *dev, void *data, struct amdgpu_fpriv *fpriv = filp->driver_priv; struct amdgpu_userq_mgr *userq_mgr = &fpriv->userq_mgr; struct drm_amdgpu_userq_signal *args = data; + struct drm_gem_object **gobj_write = NULL; + struct drm_gem_object **gobj_read = NULL; struct amdgpu_usermode_queue *queue; - struct drm_gem_object **gobj = NULL; struct drm_syncobj **syncobj = NULL; + u32 *bo_handles_write, num_write_bo_handles; u32 *syncobj_handles, num_syncobj_handles; - u32 *bo_handles, num_bo_handles; - int r, i, entry, boentry; + u32 *bo_handles_read, num_read_bo_handles; + int r, i, entry, rentry, wentry; struct dma_fence *fence; struct drm_exec exec; u64 wptr; @@ -417,32 +419,63 @@ int amdgpu_userq_signal_ioctl(struct drm_device *dev, void *data, } } - num_bo_handles = args->num_bo_handles; - bo_handles = memdup_user(u64_to_user_ptr(args->bo_handles_array), - sizeof(u32) * num_bo_handles); - if (IS_ERR(bo_handles)) + num_read_bo_handles = args->num_read_bo_handles; + bo_handles_read = memdup_user(u64_to_user_ptr(args->bo_read_handles), + sizeof(u32) * num_read_bo_handles); + if (IS_ERR(bo_handles_read)) { + r = PTR_ERR(bo_handles_read); goto free_syncobj; + } + + /* Array of pointers to the GEM read objects */ + gobj_read = kmalloc_array(num_read_bo_handles, sizeof(*gobj_read), GFP_KERNEL); + if (!gobj_read) { + r = -ENOMEM; + goto free_bo_handles_read; + } + + for (rentry = 0; rentry < num_read_bo_handles; rentry++) { + gobj_read[rentry] = drm_gem_object_lookup(filp, bo_handles_read[rentry]); + if (!gobj_read[rentry]) { + r = -ENOENT; + goto put_gobj_read; + } + } + + num_write_bo_handles = args->num_write_bo_handles; + bo_handles_write = memdup_user(u64_to_user_ptr(args->bo_write_handles), + sizeof(u32) * num_write_bo_handles); + if (IS_ERR(bo_handles_write)) { + r = PTR_ERR(bo_handles_write); + goto put_gobj_read; + } - /* Array of pointers to the GEM objects */ - gobj = kmalloc_array(num_bo_handles, sizeof(*gobj), GFP_KERNEL); - if (!gobj) { + /* Array of pointers to the GEM write objects */ + gobj_write = kmalloc_array(num_write_bo_handles, sizeof(*gobj_write), GFP_KERNEL); + if (!gobj_write) { r = -ENOMEM; - goto free_bo_handles; + goto free_bo_handles_write; } - for (boentry = 0; boentry < num_bo_handles; boentry++) { - gobj[boentry] = drm_gem_object_lookup(filp, bo_handles[boentry]); - if (!gobj[boentry]) { + for (wentry = 0; wentry < num_write_bo_handles; wentry++) { + gobj_write[wentry] = drm_gem_object_lookup(filp, bo_handles_write[wentry]); + if (!gobj_write[wentry]) { r = -ENOENT; - goto put_gobj; + goto put_gobj_write; } } - drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, num_bo_handles); + drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, + (num_read_bo_handles + num_write_bo_handles)); /* Lock all BOs with retry handling */ drm_exec_until_all_locked(&exec) { - r = drm_exec_prepare_array(&exec, gobj, num_bo_handles, 1); + r = drm_exec_prepare_array(&exec, gobj_read, num_read_bo_handles, 1); + drm_exec_retry_on_contention(&exec); + if (r) + goto exec_fini; + + r = drm_exec_prepare_array(&exec, gobj_write, num_write_bo_handles, 1); drm_exec_retry_on_contention(&exec); if (r) goto exec_fini; @@ -464,10 +497,21 @@ int amdgpu_userq_signal_ioctl(struct drm_device *dev, void *data, if (r) goto exec_fini; - for (i = 0; i < num_bo_handles; i++) - dma_resv_add_fence(gobj[i]->resv, fence, - dma_resv_usage_rw(args->bo_flags & - AMDGPU_USERQ_BO_WRITE)); + for (i = 0; i < num_read_bo_handles; i++) { + if (!gobj_read || !gobj_read[i]->resv) + continue; + + dma_resv_add_fence(gobj_read[i]->resv, fence, + DMA_RESV_USAGE_READ); + } + + for (i = 0; i < num_write_bo_handles; i++) { + if (!gobj_write || !gobj_write[i]->resv) + continue; + + dma_resv_add_fence(gobj_write[i]->resv, fence, + DMA_RESV_USAGE_WRITE); + } /* Add the created fence to syncobj/BO's */ for (i = 0; i < num_syncobj_handles; i++) @@ -478,12 +522,18 @@ int amdgpu_userq_signal_ioctl(struct drm_device *dev, void *data, exec_fini: drm_exec_fini(&exec); -put_gobj: - while (boentry-- > 0) - drm_gem_object_put(gobj[boentry]); - kfree(gobj); -free_bo_handles: - kfree(bo_handles); +put_gobj_write: + while (wentry-- > 0) + drm_gem_object_put(gobj_write[wentry]); + kfree(gobj_write); +free_bo_handles_write: + kfree(bo_handles_write); +put_gobj_read: + while (rentry-- > 0) + drm_gem_object_put(gobj_read[rentry]); + kfree(gobj_read); +free_bo_handles_read: + kfree(bo_handles_read); free_syncobj: while (entry-- > 0) if (syncobj[entry]) @@ -498,28 +548,37 @@ free_syncobj_handles: int amdgpu_userq_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) { - u32 *syncobj_handles, *timeline_points, *timeline_handles, *bo_handles; - u32 num_syncobj, num_bo_handles, num_points; + u32 *syncobj_handles, *timeline_points, *timeline_handles, *bo_handles_read, *bo_handles_write; + u32 num_syncobj, num_read_bo_handles, num_write_bo_handles, num_points; struct drm_amdgpu_userq_fence_info *fence_info = NULL; struct drm_amdgpu_userq_wait *wait_info = data; + struct drm_gem_object **gobj_write; + struct drm_gem_object **gobj_read; struct dma_fence **fences = NULL; - struct drm_gem_object **gobj; + int r, i, rentry, wentry, cnt; struct drm_exec exec; - int r, i, entry, cnt; u64 num_fences = 0; - num_bo_handles = wait_info->num_bo_handles; - bo_handles = memdup_user(u64_to_user_ptr(wait_info->bo_handles_array), - sizeof(u32) * num_bo_handles); - if (IS_ERR(bo_handles)) - return PTR_ERR(bo_handles); + num_read_bo_handles = wait_info->num_read_bo_handles; + bo_handles_read = memdup_user(u64_to_user_ptr(wait_info->bo_read_handles), + sizeof(u32) * num_read_bo_handles); + if (IS_ERR(bo_handles_read)) + return PTR_ERR(bo_handles_read); + + num_write_bo_handles = wait_info->num_write_bo_handles; + bo_handles_write = memdup_user(u64_to_user_ptr(wait_info->bo_write_handles), + sizeof(u32) * num_write_bo_handles); + if (IS_ERR(bo_handles_write)) { + r = PTR_ERR(bo_handles_write); + goto free_bo_handles_read; + } num_syncobj = wait_info->num_syncobj_handles; syncobj_handles = memdup_user(u64_to_user_ptr(wait_info->syncobj_handles_array), sizeof(u32) * num_syncobj); if (IS_ERR(syncobj_handles)) { r = PTR_ERR(syncobj_handles); - goto free_bo_handles; + goto free_bo_handles_write; } num_points = wait_info->num_points; @@ -537,29 +596,51 @@ int amdgpu_userq_wait_ioctl(struct drm_device *dev, void *data, goto free_timeline_handles; } - gobj = kmalloc_array(num_bo_handles, sizeof(*gobj), GFP_KERNEL); - if (!gobj) { + gobj_read = kmalloc_array(num_read_bo_handles, sizeof(*gobj_read), GFP_KERNEL); + if (!gobj_read) { r = -ENOMEM; goto free_timeline_points; } - for (entry = 0; entry < num_bo_handles; entry++) { - gobj[entry] = drm_gem_object_lookup(filp, bo_handles[entry]); - if (!gobj[entry]) { + for (rentry = 0; rentry < num_read_bo_handles; rentry++) { + gobj_read[rentry] = drm_gem_object_lookup(filp, bo_handles_read[rentry]); + if (!gobj_read[rentry]) { + r = -ENOENT; + goto put_gobj_read; + } + } + + gobj_write = kmalloc_array(num_write_bo_handles, sizeof(*gobj_write), GFP_KERNEL); + if (!gobj_write) { + r = -ENOMEM; + goto put_gobj_read; + } + + for (wentry = 0; wentry < num_write_bo_handles; wentry++) { + gobj_write[wentry] = drm_gem_object_lookup(filp, bo_handles_write[wentry]); + if (!gobj_write[wentry]) { r = -ENOENT; - goto put_gobj; + goto put_gobj_write; } } - drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, num_bo_handles); + drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, + (num_read_bo_handles + num_write_bo_handles)); /* Lock all BOs with retry handling */ drm_exec_until_all_locked(&exec) { - r = drm_exec_prepare_array(&exec, gobj, num_bo_handles, 0); + r = drm_exec_prepare_array(&exec, gobj_read, num_read_bo_handles, 1); drm_exec_retry_on_contention(&exec); if (r) { drm_exec_fini(&exec); - goto put_gobj; + goto put_gobj_write; + } + + r = drm_exec_prepare_array(&exec, gobj_write, num_write_bo_handles, 1); + drm_exec_retry_on_contention(&exec); + if (r) { + drm_exec_fini(&exec); + goto put_gobj_write; } } @@ -600,13 +681,21 @@ int amdgpu_userq_wait_ioctl(struct drm_device *dev, void *data, } /* Count GEM objects fence */ - for (i = 0; i < num_bo_handles; i++) { + for (i = 0; i < num_read_bo_handles; i++) { struct dma_resv_iter resv_cursor; struct dma_fence *fence; - dma_resv_for_each_fence(&resv_cursor, gobj[i]->resv, - dma_resv_usage_rw(wait_info->bo_wait_flags & - AMDGPU_USERQ_BO_WRITE), fence) + dma_resv_for_each_fence(&resv_cursor, gobj_read[i]->resv, + DMA_RESV_USAGE_READ, fence) + num_fences++; + } + + for (i = 0; i < num_write_bo_handles; i++) { + struct dma_resv_iter resv_cursor; + struct dma_fence *fence; + + dma_resv_for_each_fence(&resv_cursor, gobj_write[i]->resv, + DMA_RESV_USAGE_WRITE, fence) num_fences++; } @@ -632,14 +721,30 @@ int amdgpu_userq_wait_ioctl(struct drm_device *dev, void *data, goto free_fence_info; } - /* Retrieve GEM objects fence */ - for (i = 0; i < num_bo_handles; i++) { + /* Retrieve GEM read objects fence */ + for (i = 0; i < num_read_bo_handles; i++) { + struct dma_resv_iter resv_cursor; + struct dma_fence *fence; + + dma_resv_for_each_fence(&resv_cursor, gobj_read[i]->resv, + DMA_RESV_USAGE_READ, fence) { + if (WARN_ON_ONCE(num_fences >= wait_info->num_fences)) { + r = -EINVAL; + goto free_fences; + } + + fences[num_fences++] = fence; + dma_fence_get(fence); + } + } + + /* Retrieve GEM write objects fence */ + for (i = 0; i < num_write_bo_handles; i++) { struct dma_resv_iter resv_cursor; struct dma_fence *fence; - dma_resv_for_each_fence(&resv_cursor, gobj[i]->resv, - dma_resv_usage_rw(wait_info->bo_wait_flags & - AMDGPU_USERQ_BO_WRITE), fence) { + dma_resv_for_each_fence(&resv_cursor, gobj_write[i]->resv, + DMA_RESV_USAGE_WRITE, fence) { if (WARN_ON_ONCE(num_fences >= wait_info->num_fences)) { r = -EINVAL; goto free_fences; @@ -755,14 +860,19 @@ int amdgpu_userq_wait_ioctl(struct drm_device *dev, void *data, } drm_exec_fini(&exec); - for (i = 0; i < num_bo_handles; i++) - drm_gem_object_put(gobj[i]); - kfree(gobj); + for (i = 0; i < num_read_bo_handles; i++) + drm_gem_object_put(gobj_read[i]); + kfree(gobj_read); + + for (i = 0; i < num_write_bo_handles; i++) + drm_gem_object_put(gobj_write[i]); + kfree(gobj_write); kfree(timeline_points); kfree(timeline_handles); kfree(syncobj_handles); - kfree(bo_handles); + kfree(bo_handles_write); + kfree(bo_handles_read); return 0; @@ -774,18 +884,24 @@ free_fence_info: kfree(fence_info); exec_fini: drm_exec_fini(&exec); -put_gobj: - while (entry-- > 0) - drm_gem_object_put(gobj[entry]); - kfree(gobj); +put_gobj_write: + while (wentry-- > 0) + drm_gem_object_put(gobj_write[wentry]); + kfree(gobj_write); +put_gobj_read: + while (rentry-- > 0) + drm_gem_object_put(gobj_read[rentry]); + kfree(gobj_read); free_timeline_points: kfree(timeline_points); free_timeline_handles: kfree(timeline_handles); free_syncobj_handles: kfree(syncobj_handles); -free_bo_handles: - kfree(bo_handles); +free_bo_handles_write: + kfree(bo_handles_write); +free_bo_handles_read: + kfree(bo_handles_read); return r; } diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h index ca82935ff93a..02cf03e811d5 100644 --- a/include/uapi/drm/amdgpu_drm.h +++ b/include/uapi/drm/amdgpu_drm.h @@ -452,9 +452,6 @@ struct drm_amdgpu_userq_mqd_compute_gfx11 { __u64 eop_va; }; -/* dma_resv usage flag */ -#define AMDGPU_USERQ_BO_WRITE 1 - /* userq signal/wait ioctl */ struct drm_amdgpu_userq_signal { /** @@ -484,20 +481,30 @@ struct drm_amdgpu_userq_signal { */ __u64 syncobj_point; /** - * @bo_handles_array: An array of GEM BO handles used by the userq fence creation - * IOCTL to install the created dma_fence object which can be utilized by - * userspace to synchronize the BO usage between user processes. + * @bo_read_handles: The list of BO handles that the submitted user queue job + * is using for read only. This will update BO fences in the kernel. + */ + __u64 bo_read_handles; + /** + * @bo_write_handles: The list of BO handles that the submitted user queue job + * is using for write only. This will update BO fences in the kernel. + */ + __u64 bo_write_handles; + /** + * @num_read_bo_handles: A count that represents the number of read BO handles in + * @bo_read_handles. */ - __u64 bo_handles_array; + __u32 num_read_bo_handles; /** - * @num_bo_handles: A count that represents the number of GEM BO handles in - * @bo_handles_array. + * @num_write_bo_handles: A count that represents the number of write BO handles in + * @bo_write_handles. */ - __u32 num_bo_handles; + __u32 num_write_bo_handles; /** * @bo_flags: flags to indicate BOs synchronize for READ or WRITE */ __u32 bo_flags; + __u32 pad; }; struct drm_amdgpu_userq_fence_info { @@ -551,20 +558,31 @@ struct drm_amdgpu_userq_wait { */ __u64 syncobj_timeline_points; /** - * @bo_handles_array: An array of GEM BO handles defined to fetch the fence - * wait information of every BO handles in the array. + * @bo_read_handles: The list of read BO handles submitted by the user queue + * job to get the va/value pairs. */ - __u64 bo_handles_array; + __u64 bo_read_handles; + /** + * @bo_write_handles: The list of write BO handles submitted by the user queue + * job to get the va/value pairs. + */ + __u64 bo_write_handles; /** * @num_syncobj_handles: A count that represents the number of syncobj handles in * @syncobj_handles_array. */ __u32 num_syncobj_handles; /** - * @num_bo_handles: A count that represents the number of GEM BO handles in - * @bo_handles_array. + * @num_read_bo_handles: A count that represents the number of read BO handles in + * @bo_read_handles. + */ + __u32 num_read_bo_handles; + /** + * @num_write_bo_handles: A count that represents the number of write BO handles in + * @bo_write_handles. */ - __u32 num_bo_handles; + __u32 num_write_bo_handles; + __u32 pad; /** * @userq_fence_info: An array of fence information (va and value) pair of each * objects stored in @syncobj_handles_array and @bo_handles_array. -- 2.51.0 From f7cb6a28e172bd470803d64697f7a9c708609da2 Mon Sep 17 00:00:00 2001 From: Arunpravin Paneer Selvam Date: Wed, 30 Oct 2024 11:19:26 +0530 Subject: [PATCH 08/16] drm/amdgpu: Add gpu_addr support to seq64 allocation MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Add gpu address support to seq64 alloc function. v1:(Christian) - Add the user of this new interface change to the same patch. Signed-off-by: Arunpravin Paneer Selvam Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c | 10 ++++++++-- drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.h | 3 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c | 8 ++++---- drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.h | 1 + 4 files changed, 15 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c index e22cb2b5cd92..0defad71044c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c @@ -163,7 +163,8 @@ error: * Returns: * 0 on success or a negative error code on failure */ -int amdgpu_seq64_alloc(struct amdgpu_device *adev, u64 *va, u64 **cpu_addr) +int amdgpu_seq64_alloc(struct amdgpu_device *adev, u64 *va, + u64 *gpu_addr, u64 **cpu_addr) { unsigned long bit_pos; @@ -172,7 +173,12 @@ int amdgpu_seq64_alloc(struct amdgpu_device *adev, u64 *va, u64 **cpu_addr) return -ENOSPC; __set_bit(bit_pos, adev->seq64.used); + *va = bit_pos * sizeof(u64) + amdgpu_seq64_get_va_base(adev); + + if (gpu_addr) + *gpu_addr = bit_pos * sizeof(u64) + adev->seq64.gpu_addr; + *cpu_addr = bit_pos + adev->seq64.cpu_base_addr; return 0; @@ -233,7 +239,7 @@ int amdgpu_seq64_init(struct amdgpu_device *adev) */ r = amdgpu_bo_create_kernel(adev, AMDGPU_VA_RESERVED_SEQ64_SIZE, PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, - &adev->seq64.sbo, NULL, + &adev->seq64.sbo, &adev->seq64.gpu_addr, (void **)&adev->seq64.cpu_base_addr); if (r) { dev_warn(adev->dev, "(%d) create seq64 failed\n", r); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.h index 4203b2ab318d..26a249aaaee1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.h @@ -32,13 +32,14 @@ struct amdgpu_seq64 { struct amdgpu_bo *sbo; u32 num_sem; + u64 gpu_addr; u64 *cpu_base_addr; DECLARE_BITMAP(used, AMDGPU_MAX_SEQ64_SLOTS); }; void amdgpu_seq64_fini(struct amdgpu_device *adev); int amdgpu_seq64_init(struct amdgpu_device *adev); -int amdgpu_seq64_alloc(struct amdgpu_device *adev, u64 *gpu_addr, u64 **cpu_addr); +int amdgpu_seq64_alloc(struct amdgpu_device *adev, u64 *va, u64 *gpu_addr, u64 **cpu_addr); void amdgpu_seq64_free(struct amdgpu_device *adev, u64 gpu_addr); int amdgpu_seq64_map(struct amdgpu_device *adev, struct amdgpu_vm *vm, struct amdgpu_bo_va **bo_va); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c index 9f1ca8659335..d7697d3f55e5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c @@ -82,7 +82,7 @@ int amdgpu_userq_fence_driver_alloc(struct amdgpu_device *adev, } /* Acquire seq64 memory */ - r = amdgpu_seq64_alloc(adev, &fence_drv->gpu_addr, + r = amdgpu_seq64_alloc(adev, &fence_drv->va, &fence_drv->gpu_addr, &fence_drv->cpu_addr); if (r) { kfree(fence_drv); @@ -113,7 +113,7 @@ int amdgpu_userq_fence_driver_alloc(struct amdgpu_device *adev, return 0; free_seq64: - amdgpu_seq64_free(adev, fence_drv->gpu_addr); + amdgpu_seq64_free(adev, fence_drv->va); free_fence_drv: kfree(fence_drv); @@ -183,7 +183,7 @@ void amdgpu_userq_fence_driver_destroy(struct kref *ref) xa_unlock_irqrestore(xa, flags); /* Free seq64 memory */ - amdgpu_seq64_free(adev, fence_drv->gpu_addr); + amdgpu_seq64_free(adev, fence_drv->va); kfree(fence_drv); } @@ -839,7 +839,7 @@ int amdgpu_userq_wait_ioctl(struct drm_device *dev, void *data, } /* Store drm syncobj's gpu va address and value */ - fence_info[cnt].va = fence_drv->gpu_addr; + fence_info[cnt].va = fence_drv->va; fence_info[cnt].value = fences[i]->seqno; dma_fence_put(fences[i]); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.h index 89c82ba38b50..f1a90840ac1f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.h @@ -44,6 +44,7 @@ struct amdgpu_userq_fence { struct amdgpu_userq_fence_driver { struct kref refcount; + u64 va; u64 gpu_addr; u64 *cpu_addr; u64 context; -- 2.51.0 From 189ee986b0142c8176aa09c47e66b611ca29d731 Mon Sep 17 00:00:00 2001 From: Arunpravin Paneer Selvam Date: Wed, 30 Oct 2024 11:26:37 +0530 Subject: [PATCH 09/16] drm/amdgpu: add userq specific kernel config for fence ioctls MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Keep the user queue fence signal and wait IOCTLs in the kernel config CONFIG_DRM_AMDGPU_NAVI3X_USERQ. v2(Christian): - Remove the userq specific config added for kernel queues fence init function. v3(Alex): - It will be better to return an error(-ENOTSUPP) in these cases. Signed-off-by: Arunpravin Paneer Selvam Acked-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c index d7697d3f55e5..85af0d520092 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c @@ -318,6 +318,7 @@ static const struct dma_fence_ops amdgpu_userq_fence_ops = { .release = amdgpu_userq_fence_release, }; +#ifdef CONFIG_DRM_AMDGPU_NAVI3X_USERQ /** * amdgpu_userq_fence_read_wptr - Read the userq wptr value * @@ -544,7 +545,15 @@ free_syncobj_handles: return r; } +#else +int amdgpu_userq_signal_ioctl(struct drm_device *dev, void *data, + struct drm_file *filp) +{ + return -ENOTSUPP; +} +#endif +#ifdef CONFIG_DRM_AMDGPU_NAVI3X_USERQ int amdgpu_userq_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) { @@ -905,3 +914,10 @@ free_bo_handles_read: return r; } +#else +int amdgpu_userq_wait_ioctl(struct drm_device *dev, void *data, + struct drm_file *filp) +{ + return -ENOTSUPP; +} +#endif -- 2.51.0 From 38c67ec9aa4be7bc0ae55e7bebe95f615fa09a1e Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Wed, 25 Sep 2024 18:10:41 +0200 Subject: [PATCH 10/16] drm/amdgpu: Add input fence to sync bo map/unmap MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit This patch adds input fences to VM_IOCTL for buffer object. The kernel will map/unmap the BO only when the fence is signaled. The UAPI for the same has been approved here: https://gitlab.freedesktop.org/mesa/drm/-/merge_requests/392 V2: Bug fix (Arvind) V3: Bug fix (Arvind) V4: Rename UAPI objects as per UAPI review (Marek) V5: Addressed review comemnts from Christian - function should return error. - Add 'TODO' comment - The input fence should be independent of the operation. V6: Addressed review comemnts from Christian - Release the memory allocated by memdup_user(). V7: Addressed review comemnts from Christian - Drop the debug print and add "return r;" for the error handling. V11: Rebase v12: Fix 32-bit holes issue in sturct drm_amdgpu_gem_va. v13: Fix deadlock issue. v14: Fix merge conflict. v15: Fix review comment by renaming syncobj handles. Cc: Alex Deucher Cc: Christian Koenig Reviewed-by: Alex Deucher Reviewed-by: Christian König Signed-off-by: Arvind Yadav Signed-off-by: Shashank Sharma Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 46 +++++++++++++++++++++++++ include/uapi/drm/amdgpu_drm.h | 4 +++ 2 files changed, 50 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c index 542a1b70f2ee..bdf46cb4327f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c @@ -45,6 +45,45 @@ #include "amdgpu_xgmi.h" #include "amdgpu_vm.h" +static int +amdgpu_gem_add_input_fence(struct drm_file *filp, + uint64_t syncobj_handles_array, + uint32_t num_syncobj_handles) +{ + struct dma_fence *fence; + uint32_t *syncobj_handles; + int ret, i; + + if (!num_syncobj_handles) + return 0; + + syncobj_handles = memdup_user(u64_to_user_ptr(syncobj_handles_array), + sizeof(uint32_t) * num_syncobj_handles); + if (IS_ERR(syncobj_handles)) + return PTR_ERR(syncobj_handles); + + for (i = 0; i < num_syncobj_handles; i++) { + + if (!syncobj_handles[i]) { + ret = -EINVAL; + goto free_memdup; + } + + ret = drm_syncobj_find_fence(filp, syncobj_handles[i], 0, 0, &fence); + if (ret) + goto free_memdup; + + dma_fence_wait(fence, false); + + /* TODO: optimize async handling */ + dma_fence_put(fence); + } + +free_memdup: + kfree(syncobj_handles); + return ret; +} + static int amdgpu_gem_update_timeline_node(struct drm_file *filp, uint32_t syncobj_handle, @@ -854,6 +893,12 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data, abo = NULL; } + r = amdgpu_gem_add_input_fence(filp, + args->input_fence_syncobj_handles, + args->num_syncobj_handles); + if (r) + goto error_put_gobj; + drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT | DRM_EXEC_IGNORE_DUPLICATES, 0); drm_exec_until_all_locked(&exec) { @@ -928,6 +973,7 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data, error: drm_exec_fini(&exec); +error_put_gobj: drm_gem_object_put(gobj); return r; } diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h index 02cf03e811d5..0910a6f8c5f2 100644 --- a/include/uapi/drm/amdgpu_drm.h +++ b/include/uapi/drm/amdgpu_drm.h @@ -884,6 +884,10 @@ struct drm_amdgpu_gem_va { * at vm_timeline_point. */ __u32 vm_timeline_syncobj_out; + /** the number of syncobj handles in @input_fence_syncobj_handles */ + __u32 num_syncobj_handles; + /** Array of sync object handle to wait for given input fences */ + __u64 input_fence_syncobj_handles; }; #define AMDGPU_HW_IP_GFX 0 -- 2.51.0 From 5f2f78314c5c3e4d87d638eea5a9592e89947e9e Mon Sep 17 00:00:00 2001 From: Shashank Sharma Date: Tue, 28 Nov 2023 19:52:30 +0530 Subject: [PATCH 11/16] Revert "drm/amdgpu: don't allow userspace to create a doorbell BO" This reverts commit 6be2ad4f0073c541146caa66c5ae936c955a8224. This patch was to block userspace to use doorbell manager UAPI until usermode queue UAPI gets approved. UQ UAPI got approved in the following MR: https://gitlab.freedesktop.org/mesa/drm/-/merge_requests/392 Cc: Christian Koenig Cc: Alex Deucher Reviewed-by: Alex Deucher Signed-off-by: Shashank Sharma Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c index bdf46cb4327f..06171eab6e92 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c @@ -430,10 +430,6 @@ int amdgpu_gem_create_ioctl(struct drm_device *dev, void *data, uint32_t handle, initial_domain; int r; - /* reject DOORBELLs until userspace code to use it is available */ - if (args->in.domains & AMDGPU_GEM_DOMAIN_DOORBELL) - return -EINVAL; - /* reject invalid gem flags */ if (flags & ~(AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED | AMDGPU_GEM_CREATE_NO_CPU_ACCESS | -- 2.51.0 From 2e06b175fff5ba1415b74fed441c0d066b8c8dab Mon Sep 17 00:00:00 2001 From: Shashank Sharma Date: Mon, 11 Nov 2024 12:34:30 +0100 Subject: [PATCH 12/16] drm/amdgpu: fix userqueue UAPI comments This patch fixes some of the pending UAPI review comments from the libDRM/UAPI review process. - It updates some outdated comments in the userqueue UAPI header highlighted during the libdrm UAPI review. - It removes the GDS BO support which was found unused. - It also removes the unused flags parameter from the UAPI. - It also adds a padding variables in userqueue in/out structures. (Pierre-Eric and Marek) - clarify comments on top of drm_amdgpu_userq_in - clarify comment for queue_id (in) - clarify comment for mqd - clarify comment for compute MQD size - clarify comment for queue_id (out) - remove GDB object from BO object list - remove the unused flags parameter Cc: Alex Deucher Cc: Christian Koenig Reviewed-by: Alex Deucher Signed-off-by: Shashank Sharma Signed-off-by: Arvind Yadav Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c | 6 --- .../gpu/drm/amd/amdgpu/mes_v11_0_userqueue.c | 4 +- include/uapi/drm/amdgpu_drm.h | 54 +++++++++---------- 3 files changed, 26 insertions(+), 38 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c index 15c568fb062b..e946c6d1dd6b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c @@ -225,11 +225,6 @@ amdgpu_userqueue_create(struct drm_file *filp, union drm_amdgpu_userq *args) return -EINVAL; } - if (args->in.flags) { - DRM_ERROR("Usermode queue flags not supported yet\n"); - return -EINVAL; - } - mutex_lock(&uq_mgr->userq_mutex); uq_funcs = adev->userq_funcs[args->in.ip_type]; @@ -248,7 +243,6 @@ amdgpu_userqueue_create(struct drm_file *filp, union drm_amdgpu_userq *args) queue->doorbell_handle = args->in.doorbell_handle; queue->doorbell_index = args->in.doorbell_offset; queue->queue_type = args->in.ip_type; - queue->flags = args->in.flags; queue->vm = &fpriv->vm; /* Convert relative doorbell offset into absolute doorbell index */ diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0_userqueue.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0_userqueue.c index b3aa49ff1a87..fe4efe5ba6ac 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0_userqueue.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0_userqueue.c @@ -201,8 +201,8 @@ static int mes_v11_0_userq_create_ctx_space(struct amdgpu_userq_mgr *uq_mgr, mqd->shadow_base_lo = mqd_gfx_v11->shadow_va & 0xFFFFFFFC; mqd->shadow_base_hi = upper_32_bits(mqd_gfx_v11->shadow_va); - mqd->gds_bkup_base_lo = mqd_gfx_v11->gds_va & 0xFFFFFFFC; - mqd->gds_bkup_base_hi = upper_32_bits(mqd_gfx_v11->gds_va); + mqd->gds_bkup_base_lo = 0; + mqd->gds_bkup_base_hi = 0; mqd->fw_work_area_base_lo = mqd_gfx_v11->csa_va & 0xFFFFFFFC; mqd->fw_work_area_base_hi = upper_32_bits(mqd_gfx_v11->csa_va); diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h index 0910a6f8c5f2..6158496cc1d0 100644 --- a/include/uapi/drm/amdgpu_drm.h +++ b/include/uapi/drm/amdgpu_drm.h @@ -325,35 +325,28 @@ union drm_amdgpu_ctx { union drm_amdgpu_ctx_out out; }; -/* user queue IOCTL */ +/* user queue IOCTL operations */ #define AMDGPU_USERQ_OP_CREATE 1 #define AMDGPU_USERQ_OP_FREE 2 -/* Flag to indicate secure buffer related workload, unused for now */ -#define AMDGPU_USERQ_MQD_FLAGS_SECURE (1 << 0) -/* Flag to indicate AQL workload, unused for now */ -#define AMDGPU_USERQ_MQD_FLAGS_AQL (1 << 1) - /* - * MQD (memory queue descriptor) is a set of parameters which allow - * the GPU to uniquely define and identify a usermode queue. This - * structure defines the MQD for GFX-V11 IP ver 0. + * This structure is a container to pass input configuration + * info for all supported userqueue related operations. + * For operation AMDGPU_USERQ_OP_CREATE: user is expected + * to set all fields, excep the parameter 'queue_id'. + * For operation AMDGPU_USERQ_OP_FREE: the only input parameter expected + * to be set is 'queue_id', eveything else is ignored. */ struct drm_amdgpu_userq_in { /** AMDGPU_USERQ_OP_* */ __u32 op; - /** Queue handle for USERQ_OP_FREE */ + /** Queue id passed for operation USERQ_OP_FREE */ __u32 queue_id; /** the target GPU engine to execute workload (AMDGPU_HW_IP_*) */ __u32 ip_type; - /** - * @flags: flags to indicate special function for queue like secure - * buffer (TMZ). Unused for now. - */ - __u32 flags; /** * @doorbell_handle: the handle of doorbell GEM object - * associated to this client. + * associated with this userqueue client. */ __u32 doorbell_handle; /** @@ -362,7 +355,7 @@ struct drm_amdgpu_userq_in { * and doorbell_offset in the doorbell bo. */ __u32 doorbell_offset; - + __u32 _pad; /** * @queue_va: Virtual address of the GPU memory which holds the queue * object. The queue holds the workload packets. @@ -387,25 +380,31 @@ struct drm_amdgpu_userq_in { */ __u64 wptr_va; /** - * @mqd: Queue descriptor for USERQ_OP_CREATE + * @mqd: MQD (memory queue descriptor) is a set of parameters which allow + * the GPU to uniquely define and identify a usermode queue. + * * MQD data can be of different size for different GPU IP/engine and * their respective versions/revisions, so this points to a __u64 * - * which holds MQD of this usermode queue. + * which holds IP specific MQD of this usermode queue. */ __u64 mqd; /** * @size: size of MQD data in bytes, it must match the MQD structure * size of the respective engine/revision defined in UAPI for ex, for - * gfx_v11 workloads, size = sizeof(drm_amdgpu_userq_mqd_gfx_v11). + * gfx11 workloads, size = sizeof(drm_amdgpu_userq_mqd_gfx11). */ __u64 mqd_size; }; +/* The structure to carry output of userqueue ops */ struct drm_amdgpu_userq_out { - /** Queue handle */ + /** + * For operation AMDGPU_USERQ_OP_CREATE: This field contains a unique + * queue ID to represent the newly created userqueue in the system, otherwise + * it should be ignored. + */ __u32 queue_id; - /** Flags */ - __u32 flags; + __u32 _pad; }; union drm_amdgpu_userq { @@ -420,11 +419,6 @@ struct drm_amdgpu_userq_mqd_gfx11 { * Use AMDGPU_INFO_IOCTL to find the exact size of the object. */ __u64 shadow_va; - /** - * @gds_va: Virtual address of the GPU memory to hold the GDS buffer. - * Use AMDGPU_INFO_IOCTL to find the exact size of the object. - */ - __u64 gds_va; /** * @csa_va: Virtual address of the GPU memory to hold the CSA buffer. * Use AMDGPU_INFO_IOCTL to find the exact size of the object. @@ -446,8 +440,8 @@ struct drm_amdgpu_userq_mqd_sdma_gfx11 { struct drm_amdgpu_userq_mqd_compute_gfx11 { /** * @eop_va: Virtual address of the GPU memory to hold the EOP buffer. - * This must be a from a separate GPU object, and must be at least 1 page - * sized. + * This must be a from a separate GPU object, and use AMDGPU_INFO IOCTL + * to get the size. */ __u64 eop_va; }; -- 2.51.0 From d9e697f19bda777a7934e3bbdc67889b06d58019 Mon Sep 17 00:00:00 2001 From: Shashank Sharma Date: Wed, 30 Oct 2024 15:32:27 +0100 Subject: [PATCH 13/16] drm/amdgpu: bypass SRIOV check for shadow size info Currently, the shadow FW space size and alignment information is protected under a flag (adev->gfx.cp_gfx_shadow) which gets set only in case of SRIOV setups. if (amdgpu_sriov_vf(adev)) adev->gfx.cp_gfx_shadow = true; But we need this information for GFX Userqueues, so that user can create these objects while creating userqueue. This patch series creates a method to get this information bypassing the dependency on this check. This patch: - adds a new input parameter flag to the gfx.funcs->get_gfx_shadow_info fptr definition, so that it can accommodate the information without the check (adev->gfx.cp_gfx_shadow) on request. - updates the existing definition of amdgpu_gfx_get_gfx_shadow_info to adjust with this new flag. Next patch in the series is adding a UAPI which will consume this info. V2: split this patch from the new UAPI patch Cc: Alex Deucher Cc: Christian Koenig Cc: Arvind Yadav Reviewed-by: Alex Deucher Signed-off-by: Shashank Sharma Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 5 +++-- drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 19 +++++++++++++------ 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h index 5d70b3ae3fe1..319e6e547c73 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h @@ -305,7 +305,8 @@ struct amdgpu_gfx_funcs { void (*init_spm_golden)(struct amdgpu_device *adev); void (*update_perfmon_mgcg)(struct amdgpu_device *adev, bool enable); int (*get_gfx_shadow_info)(struct amdgpu_device *adev, - struct amdgpu_gfx_shadow_info *shadow_info); + struct amdgpu_gfx_shadow_info *shadow_info, + bool skip_check); enum amdgpu_gfx_partition (*query_partition_mode)(struct amdgpu_device *adev); int (*switch_partition_mode)(struct amdgpu_device *adev, @@ -503,7 +504,7 @@ struct amdgpu_gfx_ras_mem_id_entry { #define amdgpu_gfx_select_se_sh(adev, se, sh, instance, xcc_id) ((adev)->gfx.funcs->select_se_sh((adev), (se), (sh), (instance), (xcc_id))) #define amdgpu_gfx_select_me_pipe_q(adev, me, pipe, q, vmid, xcc_id) ((adev)->gfx.funcs->select_me_pipe_q((adev), (me), (pipe), (q), (vmid), (xcc_id))) #define amdgpu_gfx_init_spm_golden(adev) (adev)->gfx.funcs->init_spm_golden((adev)) -#define amdgpu_gfx_get_gfx_shadow_info(adev, si) ((adev)->gfx.funcs->get_gfx_shadow_info((adev), (si))) +#define amdgpu_gfx_get_gfx_shadow_info(adev, si) ((adev)->gfx.funcs->get_gfx_shadow_info((adev), (si), false)) /** * amdgpu_gfx_create_bitmask - create a bitmask diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c index b061b654d1ec..933c4ad5eff9 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c @@ -1086,14 +1086,21 @@ static void gfx_v11_0_select_me_pipe_q(struct amdgpu_device *adev, #define MQD_FWWORKAREA_SIZE 484 #define MQD_FWWORKAREA_ALIGNMENT 256 -static int gfx_v11_0_get_gfx_shadow_info(struct amdgpu_device *adev, +static void gfx_v11_0_get_gfx_shadow_info_nocheck(struct amdgpu_device *adev, struct amdgpu_gfx_shadow_info *shadow_info) { - if (adev->gfx.cp_gfx_shadow) { - shadow_info->shadow_size = MQD_SHADOW_BASE_SIZE; - shadow_info->shadow_alignment = MQD_SHADOW_BASE_ALIGNMENT; - shadow_info->csa_size = MQD_FWWORKAREA_SIZE; - shadow_info->csa_alignment = MQD_FWWORKAREA_ALIGNMENT; + shadow_info->shadow_size = MQD_SHADOW_BASE_SIZE; + shadow_info->shadow_alignment = MQD_SHADOW_BASE_ALIGNMENT; + shadow_info->csa_size = MQD_FWWORKAREA_SIZE; + shadow_info->csa_alignment = MQD_FWWORKAREA_ALIGNMENT; +} + +static int gfx_v11_0_get_gfx_shadow_info(struct amdgpu_device *adev, + struct amdgpu_gfx_shadow_info *shadow_info, + bool skip_check) +{ + if (adev->gfx.cp_gfx_shadow || skip_check) { + gfx_v11_0_get_gfx_shadow_info_nocheck(adev, shadow_info); return 0; } else { memset(shadow_info, 0, sizeof(struct amdgpu_gfx_shadow_info)); -- 2.51.0 From 2761bb9a31f1b863037547d73dc6aac1461ceab6 Mon Sep 17 00:00:00 2001 From: Arunpravin Paneer Selvam Date: Mon, 11 Nov 2024 12:43:07 +0530 Subject: [PATCH 14/16] drm/amdgpu: Modify userq signal/wait struct field names MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Modify kernel UAPI userq signal/wait struct field names and description corresponding to the libdrm UAPI review comments. libdrm MR: https://gitlab.freedesktop.org/mesa/drm/-/merge_requests/392 Signed-off-by: Arunpravin Paneer Selvam Reviewed-by: Christian König Signed-off-by: Alex Deucher --- .../gpu/drm/amd/amdgpu/amdgpu_userq_fence.c | 20 ++-- include/uapi/drm/amdgpu_drm.h | 102 +++++++----------- 2 files changed, 46 insertions(+), 76 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c index 85af0d520092..6157a540c929 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c @@ -400,7 +400,7 @@ int amdgpu_userq_signal_ioctl(struct drm_device *dev, void *data, u64 wptr; num_syncobj_handles = args->num_syncobj_handles; - syncobj_handles = memdup_user(u64_to_user_ptr(args->syncobj_handles_array), + syncobj_handles = memdup_user(u64_to_user_ptr(args->syncobj_handles), sizeof(u32) * num_syncobj_handles); if (IS_ERR(syncobj_handles)) return PTR_ERR(syncobj_handles); @@ -420,7 +420,7 @@ int amdgpu_userq_signal_ioctl(struct drm_device *dev, void *data, } } - num_read_bo_handles = args->num_read_bo_handles; + num_read_bo_handles = args->num_bo_read_handles; bo_handles_read = memdup_user(u64_to_user_ptr(args->bo_read_handles), sizeof(u32) * num_read_bo_handles); if (IS_ERR(bo_handles_read)) { @@ -443,7 +443,7 @@ int amdgpu_userq_signal_ioctl(struct drm_device *dev, void *data, } } - num_write_bo_handles = args->num_write_bo_handles; + num_write_bo_handles = args->num_bo_write_handles; bo_handles_write = memdup_user(u64_to_user_ptr(args->bo_write_handles), sizeof(u32) * num_write_bo_handles); if (IS_ERR(bo_handles_write)) { @@ -558,23 +558,23 @@ int amdgpu_userq_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) { u32 *syncobj_handles, *timeline_points, *timeline_handles, *bo_handles_read, *bo_handles_write; - u32 num_syncobj, num_read_bo_handles, num_write_bo_handles, num_points; + u32 num_syncobj, num_read_bo_handles, num_write_bo_handles; struct drm_amdgpu_userq_fence_info *fence_info = NULL; struct drm_amdgpu_userq_wait *wait_info = data; struct drm_gem_object **gobj_write; struct drm_gem_object **gobj_read; struct dma_fence **fences = NULL; + u16 num_points, num_fences = 0; int r, i, rentry, wentry, cnt; struct drm_exec exec; - u64 num_fences = 0; - num_read_bo_handles = wait_info->num_read_bo_handles; + num_read_bo_handles = wait_info->num_bo_read_handles; bo_handles_read = memdup_user(u64_to_user_ptr(wait_info->bo_read_handles), sizeof(u32) * num_read_bo_handles); if (IS_ERR(bo_handles_read)) return PTR_ERR(bo_handles_read); - num_write_bo_handles = wait_info->num_write_bo_handles; + num_write_bo_handles = wait_info->num_bo_write_handles; bo_handles_write = memdup_user(u64_to_user_ptr(wait_info->bo_write_handles), sizeof(u32) * num_write_bo_handles); if (IS_ERR(bo_handles_write)) { @@ -583,14 +583,14 @@ int amdgpu_userq_wait_ioctl(struct drm_device *dev, void *data, } num_syncobj = wait_info->num_syncobj_handles; - syncobj_handles = memdup_user(u64_to_user_ptr(wait_info->syncobj_handles_array), + syncobj_handles = memdup_user(u64_to_user_ptr(wait_info->syncobj_handles), sizeof(u32) * num_syncobj); if (IS_ERR(syncobj_handles)) { r = PTR_ERR(syncobj_handles); goto free_bo_handles_write; } - num_points = wait_info->num_points; + num_points = wait_info->num_syncobj_timeline_handles; timeline_handles = memdup_user(u64_to_user_ptr(wait_info->syncobj_timeline_handles), sizeof(u32) * num_points); if (IS_ERR(timeline_handles)) { @@ -858,7 +858,7 @@ int amdgpu_userq_wait_ioctl(struct drm_device *dev, void *data, wait_info->num_fences = cnt; /* Copy userq fence info to user space */ - if (copy_to_user(u64_to_user_ptr(wait_info->userq_fence_info), + if (copy_to_user(u64_to_user_ptr(wait_info->out_fences), fence_info, wait_info->num_fences * sizeof(*fence_info))) { r = -EFAULT; goto free_fences; diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h index 6158496cc1d0..72dc16dbca7f 100644 --- a/include/uapi/drm/amdgpu_drm.h +++ b/include/uapi/drm/amdgpu_drm.h @@ -453,27 +453,17 @@ struct drm_amdgpu_userq_signal { * to retrieve the WPTR. */ __u32 queue_id; + __u32 pad; /** - * @flags: flags to indicate special function for userq fence creation. - * Unused for now. - */ - __u32 flags; - /** - * @syncobj_handles_array: An array of syncobj handles used by the userq fence - * creation IOCTL to install the created dma_fence object which can be - * utilized by userspace to explicitly synchronize GPU commands. + * @syncobj_handles: The list of syncobj handles submitted by the user queue + * job to be signaled. */ - __u64 syncobj_handles_array; + __u64 syncobj_handles; /** * @num_syncobj_handles: A count that represents the number of syncobj handles in - * @syncobj_handles_array. + * @syncobj_handles. */ __u64 num_syncobj_handles; - /** - * @syncobj_point: A given point on the timeline to be signaled. - * Unused for now. - */ - __u64 syncobj_point; /** * @bo_read_handles: The list of BO handles that the submitted user queue job * is using for read only. This will update BO fences in the kernel. @@ -485,20 +475,15 @@ struct drm_amdgpu_userq_signal { */ __u64 bo_write_handles; /** - * @num_read_bo_handles: A count that represents the number of read BO handles in + * @num_bo_read_handles: A count that represents the number of read BO handles in * @bo_read_handles. */ - __u32 num_read_bo_handles; + __u32 num_bo_read_handles; /** - * @num_write_bo_handles: A count that represents the number of write BO handles in + * @num_bo_write_handles: A count that represents the number of write BO handles in * @bo_write_handles. */ - __u32 num_write_bo_handles; - /** - * @bo_flags: flags to indicate BOs synchronize for READ or WRITE - */ - __u32 bo_flags; - __u32 pad; + __u32 num_bo_write_handles; }; struct drm_amdgpu_userq_fence_info { @@ -517,38 +502,18 @@ struct drm_amdgpu_userq_fence_info { struct drm_amdgpu_userq_wait { /** - * @waitq_id: Queue handle used to retrieve the queue information to store - * the fence driver references in the wait user queue structure. - */ - __u32 waitq_id; - /** - * @flags: flags to specify special function for userq wait information. - * Unused for now. - */ - __u32 flags; - /** - * @bo_wait_flags: flags to define the BOs for READ or WRITE to store the - * matching fence wait info pair in @userq_fence_info. - */ - __u32 bo_wait_flags; - /** - * @num_points: A count that represents the number of timeline syncobj handles in - * syncobj_handles_array. - */ - __u32 num_points; - /** - * @syncobj_handles_array: An array of syncobj handles defined to get the - * fence wait information of every syncobj handles in the array. + * @syncobj_handles: The list of syncobj handles submitted by the user queue + * job to get the va/value pairs. */ - __u64 syncobj_handles_array; + __u64 syncobj_handles; /** - * @syncobj_timeline_handles: An array of timeline syncobj handles defined to get the - * fence wait information of every timeline syncobj handles in the array. + * @syncobj_timeline_handles: The list of timeline syncobj handles submitted by + * the user queue job to get the va/value pairs at given @syncobj_timeline_points. */ - __u64 syncobj_timeline_handles; + __u64 syncobj_timeline_handles; /** - * @syncobj_timeline_points: An array of timeline syncobj points defined to get the - * fence wait points of every timeline syncobj handles in the syncobj_handles_array. + * @syncobj_timeline_points: The list of timeline syncobj points submitted by the + * user queue job for the corresponding @syncobj_timeline_handles. */ __u64 syncobj_timeline_points; /** @@ -561,32 +526,37 @@ struct drm_amdgpu_userq_wait { * job to get the va/value pairs. */ __u64 bo_write_handles; + /** + * @num_syncobj_timeline_handles: A count that represents the number of timeline + * syncobj handles in @syncobj_timeline_handles. + */ + __u16 num_syncobj_timeline_handles; + /** + * @num_fences: This field can be used both as input and output. As input it defines + * the maximum number of fences that can be returned and as output it will specify + * how many fences were actually returned from the ioctl. + */ + __u16 num_fences; /** * @num_syncobj_handles: A count that represents the number of syncobj handles in - * @syncobj_handles_array. + * @syncobj_handles. */ __u32 num_syncobj_handles; /** - * @num_read_bo_handles: A count that represents the number of read BO handles in + * @num_bo_read_handles: A count that represents the number of read BO handles in * @bo_read_handles. */ - __u32 num_read_bo_handles; + __u32 num_bo_read_handles; /** - * @num_write_bo_handles: A count that represents the number of write BO handles in + * @num_bo_write_handles: A count that represents the number of write BO handles in * @bo_write_handles. */ - __u32 num_write_bo_handles; - __u32 pad; - /** - * @userq_fence_info: An array of fence information (va and value) pair of each - * objects stored in @syncobj_handles_array and @bo_handles_array. - */ - __u64 userq_fence_info; + __u32 num_bo_write_handles; /** - * @num_fences: A count that represents the number of actual fences installed in - * each syncobj and bo handles. + * @out_fences: The field is a return value from the ioctl containing the list of + * address/value pairs to wait for. */ - __u64 num_fences; + __u64 out_fences; }; /* vm ioctl */ -- 2.51.0 From aed7caf2d4fc659cacfd4358ae4893e2a50480f1 Mon Sep 17 00:00:00 2001 From: Shashank Sharma Date: Thu, 24 Oct 2024 17:07:42 +0200 Subject: [PATCH 15/16] drm/amdgpu: add get_gfx_shadow_info callback for gfx12 This callback gets the size and alignment requirements for the gfx shadow buffer for preemption. Cc: Alex Deucher Cc: Christian Koenig Reviewed-by: Alex Deucher Signed-off-by: Shashank Sharma Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c | 29 ++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c index b4211e6e6486..4f9a73bae332 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c @@ -910,6 +910,34 @@ static void gfx_v12_0_select_me_pipe_q(struct amdgpu_device *adev, soc24_grbm_select(adev, me, pipe, q, vm); } +/* all sizes are in bytes */ +#define MQD_SHADOW_BASE_SIZE 73728 +#define MQD_SHADOW_BASE_ALIGNMENT 256 +#define MQD_FWWORKAREA_SIZE 484 +#define MQD_FWWORKAREA_ALIGNMENT 256 + +static void gfx_v12_0_get_gfx_shadow_info_nocheck(struct amdgpu_device *adev, + struct amdgpu_gfx_shadow_info *shadow_info) +{ + shadow_info->shadow_size = MQD_SHADOW_BASE_SIZE; + shadow_info->shadow_alignment = MQD_SHADOW_BASE_ALIGNMENT; + shadow_info->csa_size = MQD_FWWORKAREA_SIZE; + shadow_info->csa_alignment = MQD_FWWORKAREA_ALIGNMENT; +} + +static int gfx_v12_0_get_gfx_shadow_info(struct amdgpu_device *adev, + struct amdgpu_gfx_shadow_info *shadow_info, + bool skip_check) +{ + if (adev->gfx.cp_gfx_shadow || skip_check) { + gfx_v12_0_get_gfx_shadow_info_nocheck(adev, shadow_info); + return 0; + } + + memset(shadow_info, 0, sizeof(struct amdgpu_gfx_shadow_info)); + return -EINVAL; +} + static const struct amdgpu_gfx_funcs gfx_v12_0_gfx_funcs = { .get_gpu_clock_counter = &gfx_v12_0_get_gpu_clock_counter, .select_se_sh = &gfx_v12_0_select_se_sh, @@ -918,6 +946,7 @@ static const struct amdgpu_gfx_funcs gfx_v12_0_gfx_funcs = { .read_wave_vgprs = &gfx_v12_0_read_wave_vgprs, .select_me_pipe_q = &gfx_v12_0_select_me_pipe_q, .update_perfmon_mgcg = &gfx_v12_0_update_perf_clk, + .get_gfx_shadow_info = &gfx_v12_0_get_gfx_shadow_info, }; static int gfx_v12_0_gpu_early_init(struct amdgpu_device *adev) -- 2.51.0 From 90c448fef3120d79bd8031665213981c966dbaf4 Mon Sep 17 00:00:00 2001 From: Shashank Sharma Date: Wed, 30 Oct 2024 15:39:42 +0100 Subject: [PATCH 16/16] drm/amdgpu: add new AMDGPU_INFO subquery for userq objects This patch adds a new subquery (AMDGPU_INFO_UQ_FW_AREAS) in AMDGPU_INFO_IOCTL to get the size and alignment of shadow and csa objects from the FW setup. This information is required for the userqueue consumers. V2: Added Alex's suggestions and addressed review comments: - make this query IP specific (GFX/SDMA etc) - give a better title (AMDGPU_INFO_UQ_METADATA) - restructured the code as per sample code shared by Alex V3: Split the UAPI patch from shadow_size_fn modifications V4: Addressed review comments from UAPI review (Marek/Pierre-Eric) - Change the query name to AMDGPU_INFO_UQ_FW_AREAS - remove unused inpur parameter for AMDGPU_HW_IP* UAPI link: https://gitlab.freedesktop.org/mesa/drm/-/merge_requests/400/ Cc: Alex Deucher Cc: Christian Koenig Cc: Arvind Yadav Reviewed-by: Alex Deucher Signed-off-by: Shashank Sharma Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 36 ++++++++++++++++++++++ include/uapi/drm/amdgpu_drm.h | 40 +++++++++++++++++++++++++ 2 files changed, 76 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c index 1fcf0ef06315..2e07776dd408 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c @@ -371,6 +371,26 @@ static int amdgpu_firmware_info(struct drm_amdgpu_info_firmware *fw_info, return 0; } +static int amdgpu_userq_metadata_info_gfx(struct amdgpu_device *adev, + struct drm_amdgpu_info *info, + struct drm_amdgpu_info_uq_metadata_gfx *meta) +{ + int ret = -EOPNOTSUPP; + + if (adev->gfx.funcs->get_gfx_shadow_info) { + struct amdgpu_gfx_shadow_info shadow = {}; + + adev->gfx.funcs->get_gfx_shadow_info(adev, &shadow, true); + meta->shadow_size = shadow.shadow_size; + meta->shadow_alignment = shadow.shadow_alignment; + meta->csa_size = shadow.csa_size; + meta->csa_alignment = shadow.csa_alignment; + ret = 0; + } + + return ret; +} + static int amdgpu_hw_ip_info(struct amdgpu_device *adev, struct drm_amdgpu_info *info, struct drm_amdgpu_info_hw_ip *result) @@ -1294,6 +1314,22 @@ out: return copy_to_user(out, &gpuvm_fault, min((size_t)size, sizeof(gpuvm_fault))) ? -EFAULT : 0; } + case AMDGPU_INFO_UQ_FW_AREAS: { + struct drm_amdgpu_info_uq_metadata meta_info = {}; + + switch (info->query_hw_ip.type) { + case AMDGPU_HW_IP_GFX: + ret = amdgpu_userq_metadata_info_gfx(adev, info, &meta_info.gfx); + if (ret) + return ret; + + ret = copy_to_user(out, &meta_info, + min((size_t)size, sizeof(meta_info))) ? -EFAULT : 0; + return 0; + default: + return -EINVAL; + } + } default: DRM_DEBUG_KMS("Invalid request %d\n", info->query); return -EINVAL; diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h index 72dc16dbca7f..5dbd9037afe7 100644 --- a/include/uapi/drm/amdgpu_drm.h +++ b/include/uapi/drm/amdgpu_drm.h @@ -1193,6 +1193,8 @@ struct drm_amdgpu_cs_chunk_cp_gfx_shadow { #define AMDGPU_INFO_MAX_IBS 0x22 /* query last page fault info */ #define AMDGPU_INFO_GPUVM_FAULT 0x23 +/* query FW object size and alignment */ +#define AMDGPU_INFO_UQ_FW_AREAS 0x24 #define AMDGPU_INFO_MMR_SE_INDEX_SHIFT 0 #define AMDGPU_INFO_MMR_SE_INDEX_MASK 0xff @@ -1469,6 +1471,27 @@ struct drm_amdgpu_info_hw_ip { __u32 ip_discovery_version; }; +/* GFX metadata BO sizes and alignment info (in bytes) */ +struct drm_amdgpu_info_uq_fw_areas_gfx { + /* shadow area size */ + __u32 shadow_size; + /* shadow area base virtual mem alignment */ + __u32 shadow_alignment; + /* context save area size */ + __u32 csa_size; + /* context save area base virtual mem alignment */ + __u32 csa_alignment; +}; + +/* IP specific fw related information used in the + * subquery AMDGPU_INFO_UQ_FW_AREAS + */ +struct drm_amdgpu_info_uq_fw_areas { + union { + struct drm_amdgpu_info_uq_fw_areas_gfx gfx; + }; +}; + struct drm_amdgpu_info_num_handles { /** Max handles as supported by firmware for UVD */ __u32 uvd_max_handles; @@ -1532,6 +1555,23 @@ struct drm_amdgpu_info_gpuvm_fault { __u32 vmhub; }; +struct drm_amdgpu_info_uq_metadata_gfx { + /* shadow area size for gfx11 */ + __u32 shadow_size; + /* shadow area base virtual alignment for gfx11 */ + __u32 shadow_alignment; + /* context save area size for gfx11 */ + __u32 csa_size; + /* context save area base virtual alignment for gfx11 */ + __u32 csa_alignment; +}; + +struct drm_amdgpu_info_uq_metadata { + union { + struct drm_amdgpu_info_uq_metadata_gfx gfx; + }; +}; + /* * Supported GPU families */ -- 2.51.0