]> www.infradead.org Git - users/hch/block.git/commitdiff
drm/amdkfd: CRIU resume shared virtual memory ranges
author Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
Mon, 8 Nov 2021 22:33:42 +0000 (17:33 -0500)
committer Alex Deucher <alexander.deucher@amd.com>
Mon, 7 Feb 2022 22:59:53 +0000 (17:59 -0500)
In the CRIU resume stage, resume all the shared virtual memory ranges from
the data stored inside the resuming kfd process during the CRIU restore
phase. Also set up xnack mode and free up the resources.

KFD_IOCTL_SVM_ATTR_CLR_FLAGS is not available for querying via the get_attr
interface, but we must clear the flags during restore as there might be
some default flags set when the prange is created. Also handle the
invalid PREFETCH attribute values saved during checkpoint by replacing
them with another dummy KFD_IOCTL_SVM_ATTR_SET_FLAGS attribute.

(rajneesh: Fixed the checkpatch reported problems)
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
drivers/gpu/drm/amd/amdkfd/kfd_svm.c
drivers/gpu/drm/amd/amdkfd/kfd_svm.h

index c143f242a84d7f999b7be26ddf57005767d40724..64e3b4e3a7126068ebfe0fc2fd8ab731ed258409 100644 (file)
@@ -2766,7 +2766,17 @@ static int criu_resume(struct file *filep,
        }
 
        mutex_lock(&target->mutex);
+       ret = kfd_criu_resume_svm(target);
+       if (ret) {
+               pr_err("kfd_criu_resume_svm failed for %i\n", args->pid);
+               goto exit;
+       }
+
        ret =  amdgpu_amdkfd_criu_resume(target->kgd_process_info);
+       if (ret)
+               pr_err("amdgpu_amdkfd_criu_resume failed for %i\n", args->pid);
+
+exit:
        mutex_unlock(&target->mutex);
 
        kfd_unref_process(target);
index 41ac049b3316179972f25fb995f0e012002bcf7b..41f03d165bade5899b214b0a56e463e749b22a09 100644 (file)
@@ -3487,6 +3487,109 @@ fill_values:
        return 0;
 }
 
+int kfd_criu_resume_svm(struct kfd_process *p)
+{
+       struct kfd_ioctl_svm_attribute *set_attr_new, *set_attr = NULL;
+       int nattr_common = 4, nattr_accessibility = 1;
+       struct criu_svm_metadata *criu_svm_md = NULL;
+       struct svm_range_list *svms = &p->svms;
+       struct criu_svm_metadata *next = NULL;
+       uint32_t set_flags = 0xffffffff;
+       int i, j, num_attrs, ret = 0;
+       uint64_t set_attr_size;
+       struct mm_struct *mm;
+
+       if (list_empty(&svms->criu_svm_metadata_list)) {
+               pr_debug("No SVM data from CRIU restore stage 2\n");
+               return ret;
+       }
+
+       mm = get_task_mm(p->lead_thread);
+       if (!mm) {
+               pr_err("failed to get mm for the target process\n");
+               return -ESRCH;
+       }
+
+       num_attrs = nattr_common + (nattr_accessibility * p->n_pdds);
+
+       i = j = 0;
+       list_for_each_entry(criu_svm_md, &svms->criu_svm_metadata_list, list) {
+               pr_debug("criu_svm_md[%d]\n\tstart: 0x%llx size: 0x%llx (npages)\n",
+                        i, criu_svm_md->data.start_addr, criu_svm_md->data.size);
+
+               for (j = 0; j < num_attrs; j++) {
+                       pr_debug("\ncriu_svm_md[%d]->attrs[%d].type : 0x%x \ncriu_svm_md[%d]->attrs[%d].value : 0x%x\n",
+                                i, j, criu_svm_md->data.attrs[j].type,
+                                i, j, criu_svm_md->data.attrs[j].value);
+                       switch (criu_svm_md->data.attrs[j].type) {
+                       /* During Checkpoint operation, the query for
+                        * KFD_IOCTL_SVM_ATTR_PREFETCH_LOC attribute might
+                        * return KFD_IOCTL_SVM_LOCATION_UNDEFINED if they were
+                        * not used by the range which was checkpointed. Care
+                        * must be taken to not restore with an invalid value
+                        * otherwise the gpuidx value will be invalid and
+                        * set_attr would eventually fail so just replace those
+                        * with another dummy attribute such as
+                        * KFD_IOCTL_SVM_ATTR_SET_FLAGS.
+                        */
+                       case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC:
+                               if (criu_svm_md->data.attrs[j].value ==
+                                   KFD_IOCTL_SVM_LOCATION_UNDEFINED) {
+                                       criu_svm_md->data.attrs[j].type =
+                                               KFD_IOCTL_SVM_ATTR_SET_FLAGS;
+                                       criu_svm_md->data.attrs[j].value = 0;
+                               }
+                               break;
+                       case KFD_IOCTL_SVM_ATTR_SET_FLAGS:
+                               set_flags = criu_svm_md->data.attrs[j].value;
+                               break;
+                       default:
+                               break;
+                       }
+               }
+
+               /* CLR_FLAGS is not available via get_attr during checkpoint but
+                * it needs to be inserted before restoring the ranges so
+                * allocate extra space for it before calling set_attr
+                */
+               set_attr_size = sizeof(struct kfd_ioctl_svm_attribute) *
+                                               (num_attrs + 1);
+               set_attr_new = krealloc(set_attr, set_attr_size,
+                                           GFP_KERNEL);
+               if (!set_attr_new) {
+                       ret = -ENOMEM;
+                       goto exit;
+               }
+               set_attr = set_attr_new;
+
+               memcpy(set_attr, criu_svm_md->data.attrs, num_attrs *
+                                       sizeof(struct kfd_ioctl_svm_attribute));
+               set_attr[num_attrs].type = KFD_IOCTL_SVM_ATTR_CLR_FLAGS;
+               set_attr[num_attrs].value = ~set_flags;
+
+               ret = svm_range_set_attr(p, mm, criu_svm_md->data.start_addr,
+                                        criu_svm_md->data.size, num_attrs + 1,
+                                        set_attr);
+               if (ret) {
+                       pr_err("CRIU: failed to set range attributes\n");
+                       goto exit;
+               }
+
+               i++;
+       }
+exit:
+       kfree(set_attr);
+       list_for_each_entry_safe(criu_svm_md, next, &svms->criu_svm_metadata_list, list) {
+               pr_debug("freeing criu_svm_md[]\n\tstart: 0x%llx\n",
+                                               criu_svm_md->data.start_addr);
+               kfree(criu_svm_md);
+       }
+
+       mmput(mm);
+       return ret;
+
+}
+
 int kfd_criu_restore_svm(struct kfd_process *p,
                         uint8_t __user *user_priv_ptr,
                         uint64_t *priv_data_offset,
index 3b5948f67de2917aedc835146f3df92d15ec2e72..66c77f00ac3e4e8c554768b5df8b685e9e405d62 100644 (file)
@@ -192,6 +192,7 @@ int kfd_criu_restore_svm(struct kfd_process *p,
                         uint8_t __user *user_priv_ptr,
                         uint64_t *priv_data_offset,
                         uint64_t max_priv_data_size);
+int kfd_criu_resume_svm(struct kfd_process *p);
 struct kfd_process_device *
 svm_range_get_pdd_by_adev(struct svm_range *prange, struct amdgpu_device *adev);
 void svm_range_list_lock_and_flush_work(struct svm_range_list *svms, struct mm_struct *mm);
@@ -253,6 +254,11 @@ static inline int kfd_criu_restore_svm(struct kfd_process *p,
        return -EINVAL;
 }
 
+static inline int kfd_criu_resume_svm(struct kfd_process *p)
+{
+       return 0;
+}
+
 #define KFD_IS_SVM_API_SUPPORTED(dev) false
 
 #endif /* IS_ENABLED(CONFIG_HSA_AMD_SVM) */