return r;
 }
 
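+/*
+ * Return the physical address (in bytes) of the VRAM page at *offset pages
+ * into the drm_mm_node array starting at *mm_node. Walk forward to the node
+ * that contains the offset, updating *mm_node and *offset, and return 0 if
+ * the node has not been validated.
+ */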
+static uint64_t
+svm_migrate_node_physical_addr(struct amdgpu_device *adev,
+                              struct drm_mm_node **mm_node, uint64_t *offset)
+{
+       struct drm_mm_node *node = *mm_node;
+       uint64_t pos = *offset;
+
+       if (node->start == AMDGPU_BO_INVALID_OFFSET) {
+               pr_debug("drm node is not validated\n");
+               return 0;
+       }
+
+       pr_debug("vram node start 0x%llx npages 0x%llx\n", node->start,
+                node->size);
+
+       if (pos >= node->size) {
+               do {
+                       pos -= node->size;
+                       node++;
+               } while (pos >= node->size);
+
+               *mm_node = node;
+               *offset = pos;
+       }
+
+       return (node->start + pos) << PAGE_SHIFT;
+}
+
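+/* Convert a physical VRAM address to the pfn of its device private page */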
+unsigned long
+svm_migrate_addr_to_pfn(struct amdgpu_device *adev, unsigned long addr)
+{
+       return (addr + adev->kfd.dev->pgmap.range.start) >> PAGE_SHIFT;
+}
+
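+/*
+ * Take a reference on the device page backing @pfn, lock it for migration
+ * and record the owning prange in page->zone_device_data.
+ */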
+static void
+svm_migrate_get_vram_page(struct svm_range *prange, unsigned long pfn)
+{
+       struct page *page;
+
+       page = pfn_to_page(pfn);
+       page->zone_device_data = prange;
+       get_page(page);
+       lock_page(page);
+}
+
+static void
+svm_migrate_put_vram_page(struct amdgpu_device *adev, unsigned long addr)
+{
+       struct page *page;
+
+       page = pfn_to_page(svm_migrate_addr_to_pfn(adev, addr));
+       unlock_page(page);
+       put_page(page);
+}
+
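+/*
+ * Copy the system memory pages collected in @migrate to the VRAM backing of
+ * @prange using svm_migrate_copy_memory_gart. @scratch holds the source DMA
+ * addresses (npages dma_addr_t) followed by the destination VRAM addresses
+ * (npages uint64_t).
+ */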
+static int
+svm_migrate_copy_to_vram(struct amdgpu_device *adev, struct svm_range *prange,
+                        struct migrate_vma *migrate, struct dma_fence **mfence,
+                        dma_addr_t *scratch)
+{
+       uint64_t npages = migrate->cpages;
+       struct device *dev = adev->dev;
+       struct drm_mm_node *node;
+       dma_addr_t *src;
+       uint64_t *dst;
+       uint64_t vram_addr;
+       uint64_t offset;
+       uint64_t i, j;
+       int r = -ENOMEM;
+
+       pr_debug("svms 0x%p [0x%lx 0x%lx]\n", prange->svms, prange->start,
+                prange->last);
+
+       src = scratch;
+       dst = (uint64_t *)(scratch + npages);
+
+       r = svm_range_vram_node_new(adev, prange, true);
+       if (r) {
+               pr_debug("failed %d to get 0x%llx pages from vram\n", r, npages);
+               goto out;
+       }
+
+       node = prange->ttm_res->mm_node;
+       offset = prange->offset;
+       vram_addr = svm_migrate_node_physical_addr(adev, &node, &offset);
+       if (!vram_addr) {
+               WARN_ONCE(1, "vram node address is 0\n");
+               r = -ENOMEM;
+               goto out;
+       }
+
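+       /*
+        * j counts contiguous pages batched for one copy; the batch is
+        * flushed when a source page was not collected or when the
+        * destination reaches the end of the current VRAM node.
+        */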
+       for (i = j = 0; i < npages; i++) {
+               struct page *spage;
+
+               dst[i] = vram_addr + (j << PAGE_SHIFT);
+               migrate->dst[i] = svm_migrate_addr_to_pfn(adev, dst[i]);
+               svm_migrate_get_vram_page(prange, migrate->dst[i]);
+
+               migrate->dst[i] = migrate_pfn(migrate->dst[i]);
+               migrate->dst[i] |= MIGRATE_PFN_LOCKED;
+
+               if (migrate->src[i] & MIGRATE_PFN_VALID) {
+                       spage = migrate_pfn_to_page(migrate->src[i]);
+                       src[i] = dma_map_page(dev, spage, 0, PAGE_SIZE,
+                                             DMA_TO_DEVICE);
+                       r = dma_mapping_error(dev, src[i]);
+                       if (r) {
+                               pr_debug("failed %d dma_map_page\n", r);
+                               goto out_free_vram_pages;
+                       }
+               } else {
+                       if (j) {
+                               r = svm_migrate_copy_memory_gart(
+                                               adev, src + i - j,
+                                               dst + i - j, j,
+                                               FROM_RAM_TO_VRAM,
+                                               mfence);
+                               if (r)
+                                       goto out_free_vram_pages;
+                               offset += j;
+                               vram_addr = (node->start + offset) << PAGE_SHIFT;
+                               j = 0;
+                       } else {
+                               offset++;
+                               vram_addr += PAGE_SIZE;
+                       }
+                       if (offset >= node->size) {
+                               node++;
+                               pr_debug("next node size 0x%llx\n", node->size);
+                               vram_addr = node->start << PAGE_SHIFT;
+                               offset = 0;
+                       }
+                       continue;
+               }
+
+               pr_debug("dma mapping src to 0x%llx, page_to_pfn 0x%lx\n",
+                        src[i] >> PAGE_SHIFT, page_to_pfn(spage));
+
+               if (j + offset >= node->size - 1 && i < npages - 1) {
+                       r = svm_migrate_copy_memory_gart(adev, src + i - j,
+                                                        dst + i - j, j + 1,
+                                                        FROM_RAM_TO_VRAM,
+                                                        mfence);
+                       if (r)
+                               goto out_free_vram_pages;
+
+                       node++;
+                       pr_debug("next node size 0x%llx\n", node->size);
+                       vram_addr = node->start << PAGE_SHIFT;
+                       offset = 0;
+                       j = 0;
+               } else {
+                       j++;
+               }
+       }
+
+       r = svm_migrate_copy_memory_gart(adev, src + i - j, dst + i - j, j,
+                                        FROM_RAM_TO_VRAM, mfence);
+
+out_free_vram_pages:
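+       /*
+        * On error, drop the references and page locks taken on the VRAM
+        * pages above and clear the dst entries so those pages are not
+        * migrated.
+        */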
+       if (r) {
+               pr_debug("failed %d to copy memory to vram\n", r);
+               while (i--) {
+                       svm_migrate_put_vram_page(adev, dst[i]);
+                       migrate->dst[i] = 0;
+               }
+       }
+
+out:
+       return r;
+}
+
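+/*
+ * Migrate the pages of @prange that fall in [@start, @end) of @vma from
+ * system memory to VRAM. Retry up to three times if migrate_vma_setup
+ * collects fewer pages than expected.
+ */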
+static int
+svm_migrate_vma_to_vram(struct amdgpu_device *adev, struct svm_range *prange,
+                       struct vm_area_struct *vma, uint64_t start,
+                       uint64_t end)
+{
+       uint64_t npages = (end - start) >> PAGE_SHIFT;
+       struct dma_fence *mfence = NULL;
+       struct migrate_vma migrate;
+       dma_addr_t *scratch;
+       size_t size;
+       void *buf;
+       int r = -ENOMEM;
+       int retry = 0;
+
+       memset(&migrate, 0, sizeof(migrate));
+       migrate.vma = vma;
+       migrate.start = start;
+       migrate.end = end;
+       migrate.flags = MIGRATE_VMA_SELECT_SYSTEM;
+       migrate.pgmap_owner = adev;
+
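+       /*
+        * Single allocation for the migrate src/dst pfn arrays plus the
+        * scratch space (source DMA addresses and destination VRAM
+        * addresses) used by svm_migrate_copy_to_vram.
+        */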
+       size = 2 * sizeof(*migrate.src) + sizeof(uint64_t) + sizeof(dma_addr_t);
+       size *= npages;
+       buf = kvmalloc(size, GFP_KERNEL | __GFP_ZERO);
+       if (!buf)
+               goto out;
+
+       migrate.src = buf;
+       migrate.dst = migrate.src + npages;
+       scratch = (dma_addr_t *)(migrate.dst + npages);
+
+retry:
+       r = migrate_vma_setup(&migrate);
+       if (r) {
+               pr_debug("failed %d prepare migrate svms 0x%p [0x%lx 0x%lx]\n",
+                        r, prange->svms, prange->start, prange->last);
+               goto out_free;
+       }
+       if (migrate.cpages != npages) {
+               pr_debug("collect 0x%lx/0x%llx pages, retry\n", migrate.cpages,
+                        npages);
+               migrate_vma_finalize(&migrate);
+               if (retry++ >= 3) {
+                       r = -ENOMEM;
+                       pr_debug("failed %d migrate svms 0x%p [0x%lx 0x%lx]\n",
+                                r, prange->svms, prange->start, prange->last);
+                       goto out_free;
+               }
+
+               goto retry;
+       }
+
+       if (migrate.cpages) {
+               svm_migrate_copy_to_vram(adev, prange, &migrate, &mfence,
+                                        scratch);
+               migrate_vma_pages(&migrate);
+               svm_migrate_copy_done(adev, mfence);
+               migrate_vma_finalize(&migrate);
+       }
+
+       svm_range_dma_unmap(adev->dev, scratch, 0, npages);
+       svm_range_free_dma_mappings(prange);
+
+out_free:
+       kvfree(buf);
+out:
+       return r;
+}
+
+/**
+ * svm_migrate_ram_to_vram - migrate svm range from system to device
+ * @prange: range structure
+ * @best_loc: the device to migrate to
+ *
+ * Context: Process context, caller must hold mmap read lock, svms lock and
+ * prange lock
+ *
+ * Return:
+ * 0 - OK, otherwise error code
+ */
+int svm_migrate_ram_to_vram(struct svm_range *prange, uint32_t best_loc)
+{
+       unsigned long addr, start, end;
+       struct vm_area_struct *vma;
+       struct amdgpu_device *adev;
+       struct mm_struct *mm;
+       int r = 0;
+
+       if (prange->actual_loc == best_loc) {
+               pr_debug("svms 0x%p [0x%lx 0x%lx] already on best_loc 0x%x\n",
+                        prange->svms, prange->start, prange->last, best_loc);
+               return 0;
+       }
+
+       adev = svm_range_get_adev_by_id(prange, best_loc);
+       if (!adev) {
+               pr_debug("failed to get device by id 0x%x\n", best_loc);
+               return -ENODEV;
+       }
+
+       pr_debug("svms 0x%p [0x%lx 0x%lx] to gpu 0x%x\n", prange->svms,
+                prange->start, prange->last, best_loc);
+
+       mm = current->mm;
+
+       /* FIXME: workaround for page locking bug with invalid pages */
+       svm_range_prefault(prange, mm);
+
+       start = prange->start << PAGE_SHIFT;
+       end = (prange->last + 1) << PAGE_SHIFT;
+
+       for (addr = start; addr < end;) {
+               unsigned long next;
+
+               vma = find_vma(mm, addr);
+               if (!vma || addr < vma->vm_start)
+                       break;
+
+               next = min(vma->vm_end, end);
+               r = svm_migrate_vma_to_vram(adev, prange, vma, addr, next);
+               if (r) {
+                       pr_debug("failed to migrate\n");
+                       break;
+               }
+               addr = next;
+       }
+
+       if (!r)
+               prange->actual_loc = best_loc;
+
+       return r;
+}
+
 static void svm_migrate_page_free(struct page *page)
 {
 }
 
        FROM_VRAM_TO_RAM
 };
 
+int svm_migrate_ram_to_vram(struct svm_range *prange, uint32_t best_loc);
+
 #if defined(CONFIG_DEVICE_PRIVATE)
 int svm_migrate_init(struct amdgpu_device *adev);
 void svm_migrate_fini(struct amdgpu_device *adev);
 
 #include "amdgpu_xgmi.h"
 #include "kfd_priv.h"
 #include "kfd_svm.h"
+#include "kfd_migrate.h"
 
 #define AMDGPU_SVM_RANGE_RESTORE_DELAY_MS 1
 
        return r;
 }
 
-static void svm_range_dma_unmap(struct device *dev, dma_addr_t *dma_addr,
-                               unsigned long offset, unsigned long npages)
+void svm_range_dma_unmap(struct device *dev, dma_addr_t *dma_addr,
+                        unsigned long offset, unsigned long npages)
 {
        enum dma_data_direction dir = DMA_BIDIRECTIONAL;
        int i;
        }
 }
 
-static void svm_range_free_dma_mappings(struct svm_range *prange)
+void svm_range_free_dma_mappings(struct svm_range *prange)
 {
        struct kfd_process_device *pdd;
        dma_addr_t *dma_addr;
        svm_range_vram_node_free(prange);
        svm_range_free_dma_mappings(prange);
        mutex_destroy(&prange->lock);
+       mutex_destroy(&prange->migrate_mutex);
        kfree(prange);
 }
 
        INIT_LIST_HEAD(&prange->deferred_list);
        INIT_LIST_HEAD(&prange->child_list);
        atomic_set(&prange->invalid, 0);
+       mutex_init(&prange->migrate_mutex);
        mutex_init(&prange->lock);
        svm_range_set_default_attributes(&prange->preferred_loc,
                                         &prange->prefetch_loc,
                        pr_debug("failed %d to dma map range\n", r);
                        goto unreserve_out;
                }
+
+               prange->validated_once = true;
        }
 
        svm_range_lock(prange);
                         prange->svms, prange, prange->start, prange->last,
                         invalid);
 
+               /*
+                * If the range is migrating, wait for the migration to finish.
+                */
+               mutex_lock(&prange->migrate_mutex);
+
                r = svm_range_validate_and_map(mm, prange, MAX_GPU_INSTANCE,
                                               false, true);
-               if (r) {
+               if (r)
                        pr_debug("failed %d to map 0x%lx to gpus\n", r,
                                 prange->start);
-                       goto unlock_out;
-               }
+
+               mutex_unlock(&prange->migrate_mutex);
+               if (r)
+                       goto out_reschedule;
 
                if (atomic_cmpxchg(&prange->invalid, invalid, 0) != invalid)
-                       goto unlock_out;
+                       goto out_reschedule;
        }
 
        if (atomic_cmpxchg(&svms->evicted_ranges, evicted_ranges, 0) !=
            evicted_ranges)
-               goto unlock_out;
+               goto out_reschedule;
 
        evicted_ranges = 0;
 
 
        pr_debug("restore svm ranges successfully\n");
 
-unlock_out:
+out_reschedule:
        mutex_unlock(&svms->lock);
        mmap_write_unlock(mm);
        mutex_unlock(&process_info->lock);
                list_del_init(&prange->deferred_list);
                spin_unlock(&svms->deferred_list_lock);
 
+               mutex_lock(&prange->migrate_mutex);
                while (!list_empty(&prange->child_list)) {
                        struct svm_range *pchild;
 
                        list_del_init(&pchild->child_list);
                        svm_range_handle_list_op(svms, pchild);
                }
+               mutex_unlock(&prange->migrate_mutex);
 
                svm_range_handle_list_op(svms, prange);
                mutex_unlock(&svms->lock);
        return 0;
 }
 
+/* svm_range_best_location - decide the best actual location
+ * @prange: svm range structure
+ *
+ * For xnack off:
+ * If the range maps to a single GPU, the best actual location is the prefetch
+ * location, which can be CPU or GPU.
+ *
+ * If the range maps to multiple GPUs, the best actual location can only be the
+ * prefetch_loc GPU when all GPUs are connected over XGMI in the same hive. If
+ * the GPUs are connected over PCIe, the best actual location is always CPU,
+ * because a GPU cannot access the vram of other GPUs, assuming PCIe small bar
+ * (large bar support is not upstream).
+ *
+ * For xnack on:
+ * The best actual location is the prefetch location. If the GPUs are in the
+ * same XGMI hive, the range maps to multiple GPUs. Otherwise, the range maps
+ * only to the actual location GPU, and a vm fault from another GPU will
+ * trigger migration.
+ *
+ * Context: Process context
+ *
+ * Return:
+ * 0 for CPU, or the GPU id
+ */
+static uint32_t svm_range_best_location(struct svm_range *prange)
+{
+       DECLARE_BITMAP(bitmap, MAX_GPU_INSTANCE);
+       uint32_t best_loc = prange->prefetch_loc;
+       struct kfd_process_device *pdd;
+       struct amdgpu_device *bo_adev;
+       struct amdgpu_device *adev;
+       struct kfd_process *p;
+       uint32_t gpuidx;
+
+       p = container_of(prange->svms, struct kfd_process, svms);
+
+       /* xnack on */
+       if (p->xnack_enabled)
+               goto out;
+
+       /* xnack off */
+       if (!best_loc || best_loc == KFD_IOCTL_SVM_LOCATION_UNDEFINED)
+               goto out;
+
+       bo_adev = svm_range_get_adev_by_id(prange, best_loc);
+       bitmap_or(bitmap, prange->bitmap_access, prange->bitmap_aip,
+                 MAX_GPU_INSTANCE);
+
+       for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) {
+               pdd = kfd_process_device_from_gpuidx(p, gpuidx);
+               if (!pdd) {
+                       pr_debug("failed to get device by idx 0x%x\n", gpuidx);
+                       continue;
+               }
+               adev = (struct amdgpu_device *)pdd->dev->kgd;
+
+               if (adev == bo_adev)
+                       continue;
+
+               if (!amdgpu_xgmi_same_hive(adev, bo_adev)) {
+                       best_loc = 0;
+                       break;
+               }
+       }
+
+out:
+       pr_debug("xnack %d svms 0x%p [0x%lx 0x%lx] best loc 0x%x\n",
+                p->xnack_enabled, &p->svms, prange->start, prange->last,
+                best_loc);
+
+       return best_loc;
+}
+
+/* FIXME: This is a workaround for a page locking bug when some pages are
+ * invalid during migration to VRAM
+ */
+void svm_range_prefault(struct svm_range *prange, struct mm_struct *mm)
+{
+       struct hmm_range *hmm_range;
+       int r;
+
+       if (prange->validated_once)
+               return;
+
+       r = amdgpu_hmm_range_get_pages(&prange->notifier, mm, NULL,
+                                      prange->start << PAGE_SHIFT,
+                                      prange->npages, &hmm_range,
+                                      false, true);
+       if (!r) {
+               amdgpu_hmm_range_get_pages_done(hmm_range);
+               prange->validated_once = true;
+       }
+}
+
+/* svm_range_trigger_migration - start page migration if prefetch loc changed
+ * @mm: current process mm_struct
+ * @prange: svm range structure
+ * @migrated: output, true if migration is triggered
+ *
+ * If the range prefetch_loc is a GPU and the actual loc is cpu (0), migrate
+ * the range from ram to vram.
+ * If the range prefetch_loc is cpu (0) and the actual loc is a GPU, migrate
+ * the range from vram to ram.
+ *
+ * If GPU vm fault retry is not enabled, migration interacts with the MMU
+ * notifier and the restore work:
+ * 1. migrate_vma_setup invalidates pages, the MMU notifier callback
+ *    svm_range_evict stops all queues and schedules the restore work
+ * 2. svm_range_restore_work waits for the migration to finish by
+ *    a. svm_range_validate_vram taking prange->migrate_mutex
+ *    b. svm_range_validate_ram HMM get pages waiting for the CPU fault
+ *       handler to return
+ * 3. the restore work updates the GPU mappings and resumes all queues.
+ *
+ * Context: Process context
+ *
+ * Return:
+ * 0 - OK, otherwise - error code of migration
+ */
+static int
+svm_range_trigger_migration(struct mm_struct *mm, struct svm_range *prange,
+                           bool *migrated)
+{
+       uint32_t best_loc;
+       int r = 0;
+
+       *migrated = false;
+       best_loc = svm_range_best_location(prange);
+
+       if (best_loc == KFD_IOCTL_SVM_LOCATION_UNDEFINED ||
+           best_loc == prange->actual_loc)
+               return 0;
+
+       if (best_loc && !prange->actual_loc &&
+           !(prange->flags & KFD_IOCTL_SVM_FLAG_HOST_ACCESS))
+               return 0;
+
+       if (best_loc) {
+               pr_debug("migrate from ram to vram\n");
+               r = svm_migrate_ram_to_vram(prange, best_loc);
+
+               if (!r)
+                       *migrated = true;
+       }
+
+       return r;
+}
+
 static int
 svm_range_set_attr(struct kfd_process *p, uint64_t start, uint64_t size,
                   uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs)
         * case because the rollback wouldn't be guaranteed to work either.
         */
        list_for_each_entry(prange, &update_list, update_list) {
+               bool migrated;
+
+               mutex_lock(&prange->migrate_mutex);
+
+               r = svm_range_trigger_migration(mm, prange, &migrated);
+               if (r)
+                       goto out_unlock_range;
+
+               if (migrated) {
+                       pr_debug("restore_work will update mappings of GPUs\n");
+                       mutex_unlock(&prange->migrate_mutex);
+                       continue;
+               }
+
                r = svm_range_validate_and_map(mm, prange, MAX_GPU_INSTANCE,
                                               true, true);
-               if (r) {
-                       pr_debug("failed %d to map 0x%lx to gpus\n", r,
-                                prange->start);
+               if (r)
+                       pr_debug("failed %d to map svm range\n", r);
+
+out_unlock_range:
+               mutex_unlock(&prange->migrate_mutex);
+               if (r)
                        break;
-               }
        }
 
        svm_range_debug_dump(svms);
 
  * struct svm_range - shared virtual memory range
  *
  * @svms:       list of svm ranges, structure defined in kfd_process
+ * @migrate_mutex: to serialize range migration, validation and mapping update
  * @start:      range start address in pages
  * @last:       range last address in pages
  * @it_node:    node [start, last] stored in interval tree, start, last are page
  */
 struct svm_range {
        struct svm_range_list           *svms;
+       struct mutex                    migrate_mutex;
        unsigned long                   start;
        unsigned long                   last;
        struct interval_tree_node       it_node;
        struct list_head                child_list;
        DECLARE_BITMAP(bitmap_access, MAX_GPU_INSTANCE);
        DECLARE_BITMAP(bitmap_aip, MAX_GPU_INSTANCE);
+       bool                            validated_once;
 };
 
 static inline void svm_range_lock(struct svm_range *prange)
 int svm_range_vram_node_new(struct amdgpu_device *adev,
                            struct svm_range *prange, bool clear);
 void svm_range_vram_node_free(struct svm_range *prange);
+void svm_range_dma_unmap(struct device *dev, dma_addr_t *dma_addr,
+                        unsigned long offset, unsigned long npages);
+void svm_range_free_dma_mappings(struct svm_range *prange);
+void svm_range_prefault(struct svm_range *prange, struct mm_struct *mm);
 
 #endif /* KFD_SVM_H_ */