It is recommended to always set the soft limit below the hard limit;
         otherwise, the hard limit will take precedence.
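For concreteness, a minimal userspace sketch of applying this recommendation;
the controller mount point and the "workload" group are assumptions, while
memory.limit_in_bytes and memory.soft_limit_in_bytes are the standard v1
control files::

  #include <stdio.h>

  /* Write one value to a memcg control file; returns 0 on success. */
  static int memcg_write(const char *path, const char *val)
  {
          FILE *f = fopen(path, "w");

          if (!f)
                  return -1;
          fputs(val, f);
          return fclose(f);
  }

  int main(void)
  {
          /* Hard limit 512M, soft limit 256M: the soft limit stays below. */
          if (memcg_write("/sys/fs/cgroup/memory/workload/memory.limit_in_bytes", "512M"))
                  return 1;
          if (memcg_write("/sys/fs/cgroup/memory/workload/memory.soft_limit_in_bytes", "256M"))
                  return 1;
          return 0;
  }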
  
- 8. Move charges at task migration
- =================================
 +.. _cgroup-v1-memory-move-charges:
 +
+ 8. Move charges at task migration (DEPRECATED!)
+ ===============================================
+ 
+ THIS IS DEPRECATED!
+ 
+ It's expensive and unreliable! It's better practice to launch workload
+ tasks directly from inside their target cgroup. Use dedicated workload
+ cgroups to allow fine-grained policy adjustments without having to
+ move physical pages between control domains.
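A hedged sketch of that better practice: the launcher first enters the target
cgroup by writing its own PID to the group's cgroup.procs file and only then
execs the workload, so every page is charged to the right group from the
start. The group path and the "my-workload" binary name are assumptions::

  #include <stdio.h>
  #include <unistd.h>

  int main(void)
  {
          /* Assumed target group, created beforehand by the administrator. */
          FILE *f = fopen("/sys/fs/cgroup/memory/workload/cgroup.procs", "w");

          if (!f)
                  return 1;
          fprintf(f, "%d", getpid());
          if (fclose(f))
                  return 1;

          /* From here on, all allocations are charged to "workload". */
          execlp("my-workload", "my-workload", (char *)NULL);
          return 1;       /* only reached if exec failed */
  }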
  
  Users can move charges associated with a task along with task migration, that
  is, uncharge task's pages from the old cgroup and charge them to the new cgroup.
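Purely to illustrate the (deprecated) mechanism described above, a hedged
userspace sketch: charge moving is enabled on the destination group via
memory.move_charge_at_immigrate (bit 0 moves anonymous pages, bit 1 moves file
pages), and the migration itself is the write of the task's PID into that
group's cgroup.procs. The mount point and group name are assumptions::

  #include <stdio.h>
  #include <unistd.h>

  #define DEST "/sys/fs/cgroup/memory/target"   /* assumed destination group */

  static int write_str(const char *path, const char *val)
  {
          FILE *f = fopen(path, "w");

          if (!f)
                  return -1;
          fputs(val, f);
          return fclose(f);
  }

  int main(void)
  {
          char pid[32];

          /* Move both anonymous and file charges on migration. */
          if (write_str(DEST "/memory.move_charge_at_immigrate", "3"))
                  return 1;

          /* Migrating the task is what triggers the charge move. */
          snprintf(pid, sizeof(pid), "%d", getpid());
          return write_str(DEST "/cgroup.procs", pid) ? 1 : 0;
  }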
 
- =============
 -.. _numaperf:
 -
+ =======================
+ NUMA Memory Performance
+ =======================
+ 
  NUMA Locality
  =============
  
 
--- /dev/null
-       vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND;
 +// SPDX-License-Identifier: GPL-2.0-only
 +/*
 + * Copyright (C) 2020-2023 Intel Corporation
 + */
 +
 +#include <linux/dma-buf.h>
 +#include <linux/highmem.h>
 +#include <linux/module.h>
 +#include <linux/set_memory.h>
 +#include <linux/xarray.h>
 +
 +#include <drm/drm_cache.h>
 +#include <drm/drm_debugfs.h>
 +#include <drm/drm_file.h>
 +#include <drm/drm_utils.h>
 +
 +#include "ivpu_drv.h"
 +#include "ivpu_gem.h"
 +#include "ivpu_hw.h"
 +#include "ivpu_mmu.h"
 +#include "ivpu_mmu_context.h"
 +
 +MODULE_IMPORT_NS(DMA_BUF);
 +
 +static const struct drm_gem_object_funcs ivpu_gem_funcs;
 +
 +static struct lock_class_key prime_bo_lock_class_key;
 +
 +static int __must_check prime_alloc_pages_locked(struct ivpu_bo *bo)
 +{
 +      /* Pages are managed by the underlying dma-buf */
 +      return 0;
 +}
 +
 +static void prime_free_pages_locked(struct ivpu_bo *bo)
 +{
 +      /* Pages are managed by the underlying dma-buf */
 +}
 +
 +static int prime_map_pages_locked(struct ivpu_bo *bo)
 +{
 +      struct ivpu_device *vdev = ivpu_bo_to_vdev(bo);
 +      struct sg_table *sgt;
 +
 +      sgt = dma_buf_map_attachment_unlocked(bo->base.import_attach, DMA_BIDIRECTIONAL);
 +      if (IS_ERR(sgt)) {
 +              ivpu_err(vdev, "Failed to map attachment: %ld\n", PTR_ERR(sgt));
 +              return PTR_ERR(sgt);
 +      }
 +
 +      bo->sgt = sgt;
 +      return 0;
 +}
 +
 +static void prime_unmap_pages_locked(struct ivpu_bo *bo)
 +{
 +      dma_buf_unmap_attachment_unlocked(bo->base.import_attach, bo->sgt, DMA_BIDIRECTIONAL);
 +      bo->sgt = NULL;
 +}
 +
 +static const struct ivpu_bo_ops prime_ops = {
 +      .type = IVPU_BO_TYPE_PRIME,
 +      .name = "prime",
 +      .alloc_pages = prime_alloc_pages_locked,
 +      .free_pages = prime_free_pages_locked,
 +      .map_pages = prime_map_pages_locked,
 +      .unmap_pages = prime_unmap_pages_locked,
 +};
 +
 +static int __must_check shmem_alloc_pages_locked(struct ivpu_bo *bo)
 +{
 +      int npages = bo->base.size >> PAGE_SHIFT;
 +      struct page **pages;
 +
 +      pages = drm_gem_get_pages(&bo->base);
 +      if (IS_ERR(pages))
 +              return PTR_ERR(pages);
 +
 +      if (bo->flags & DRM_IVPU_BO_WC)
 +              set_pages_array_wc(pages, npages);
 +      else if (bo->flags & DRM_IVPU_BO_UNCACHED)
 +              set_pages_array_uc(pages, npages);
 +
 +      bo->pages = pages;
 +      return 0;
 +}
 +
 +static void shmem_free_pages_locked(struct ivpu_bo *bo)
 +{
 +      if (ivpu_bo_cache_mode(bo) != DRM_IVPU_BO_CACHED)
 +              set_pages_array_wb(bo->pages, bo->base.size >> PAGE_SHIFT);
 +
 +      drm_gem_put_pages(&bo->base, bo->pages, true, false);
 +      bo->pages = NULL;
 +}
 +
 +static int ivpu_bo_map_pages_locked(struct ivpu_bo *bo)
 +{
 +      int npages = bo->base.size >> PAGE_SHIFT;
 +      struct ivpu_device *vdev = ivpu_bo_to_vdev(bo);
 +      struct sg_table *sgt;
 +      int ret;
 +
 +      sgt = drm_prime_pages_to_sg(&vdev->drm, bo->pages, npages);
 +      if (IS_ERR(sgt)) {
 +              ivpu_err(vdev, "Failed to allocate sgtable\n");
 +              return PTR_ERR(sgt);
 +      }
 +
 +      ret = dma_map_sgtable(vdev->drm.dev, sgt, DMA_BIDIRECTIONAL, 0);
 +      if (ret) {
 +              ivpu_err(vdev, "Failed to map BO in IOMMU: %d\n", ret);
 +              goto err_free_sgt;
 +      }
 +
 +      bo->sgt = sgt;
 +      return 0;
 +
 +err_free_sgt:
 +      sg_free_table(sgt);
 +      kfree(sgt);
 +      return ret;
 +}
 +
 +static void ivpu_bo_unmap_pages_locked(struct ivpu_bo *bo)
 +{
 +      struct ivpu_device *vdev = ivpu_bo_to_vdev(bo);
 +
 +      dma_unmap_sgtable(vdev->drm.dev, bo->sgt, DMA_BIDIRECTIONAL, 0);
 +      sg_free_table(bo->sgt);
 +      kfree(bo->sgt);
 +      bo->sgt = NULL;
 +}
 +
 +static const struct ivpu_bo_ops shmem_ops = {
 +      .type = IVPU_BO_TYPE_SHMEM,
 +      .name = "shmem",
 +      .alloc_pages = shmem_alloc_pages_locked,
 +      .free_pages = shmem_free_pages_locked,
 +      .map_pages = ivpu_bo_map_pages_locked,
 +      .unmap_pages = ivpu_bo_unmap_pages_locked,
 +};
 +
 +static int __must_check internal_alloc_pages_locked(struct ivpu_bo *bo)
 +{
 +      unsigned int i, npages = bo->base.size >> PAGE_SHIFT;
 +      struct page **pages;
 +      int ret;
 +
 +      pages = kvmalloc_array(npages, sizeof(*bo->pages), GFP_KERNEL);
 +      if (!pages)
 +              return -ENOMEM;
 +
 +      for (i = 0; i < npages; i++) {
 +              pages[i] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO);
 +              if (!pages[i]) {
 +                      ret = -ENOMEM;
 +                      goto err_free_pages;
 +              }
 +              cond_resched();
 +      }
 +
 +      bo->pages = pages;
 +      return 0;
 +
 +err_free_pages:
 +      while (i--)
 +              put_page(pages[i]);
 +      kvfree(pages);
 +      return ret;
 +}
 +
 +static void internal_free_pages_locked(struct ivpu_bo *bo)
 +{
 +      unsigned int i, npages = bo->base.size >> PAGE_SHIFT;
 +
 +      for (i = 0; i < npages; i++)
 +              put_page(bo->pages[i]);
 +
 +      kvfree(bo->pages);
 +      bo->pages = NULL;
 +}
 +
 +static const struct ivpu_bo_ops internal_ops = {
 +      .type = IVPU_BO_TYPE_INTERNAL,
 +      .name = "internal",
 +      .alloc_pages = internal_alloc_pages_locked,
 +      .free_pages = internal_free_pages_locked,
 +      .map_pages = ivpu_bo_map_pages_locked,
 +      .unmap_pages = ivpu_bo_unmap_pages_locked,
 +};
 +
 +static int __must_check ivpu_bo_alloc_and_map_pages_locked(struct ivpu_bo *bo)
 +{
 +      struct ivpu_device *vdev = ivpu_bo_to_vdev(bo);
 +      int ret;
 +
 +      lockdep_assert_held(&bo->lock);
 +      drm_WARN_ON(&vdev->drm, bo->sgt);
 +
 +      ret = bo->ops->alloc_pages(bo);
 +      if (ret) {
 +              ivpu_err(vdev, "Failed to allocate pages for BO: %d\n", ret);
 +              return ret;
 +      }
 +
 +      ret = bo->ops->map_pages(bo);
 +      if (ret) {
 +              ivpu_err(vdev, "Failed to map pages for BO: %d\n", ret);
 +              goto err_free_pages;
 +      }
 +      return ret;
 +
 +err_free_pages:
 +      bo->ops->free_pages(bo);
 +      return ret;
 +}
 +
 +static void ivpu_bo_unmap_and_free_pages(struct ivpu_bo *bo)
 +{
 +      mutex_lock(&bo->lock);
 +
 +      WARN_ON(!bo->sgt);
 +      bo->ops->unmap_pages(bo);
 +      WARN_ON(bo->sgt);
 +      bo->ops->free_pages(bo);
 +      WARN_ON(bo->pages);
 +
 +      mutex_unlock(&bo->lock);
 +}
 +
 +/*
 + * ivpu_bo_pin() - pin the backing physical pages and map them to VPU.
 + *
 + * This function pins the physical memory pages, maps them into the IOMMU
 + * address space, and finally updates the VPU MMU page tables so that the
 + * VPU can translate a VPU address into an IOMMU address.
 + */
 +int __must_check ivpu_bo_pin(struct ivpu_bo *bo)
 +{
 +      struct ivpu_device *vdev = ivpu_bo_to_vdev(bo);
 +      int ret = 0;
 +
 +      mutex_lock(&bo->lock);
 +
 +      if (!bo->vpu_addr) {
 +              ivpu_err(vdev, "vpu_addr not set for BO ctx_id: %d handle: %d\n",
 +                       bo->ctx->id, bo->handle);
 +              ret = -EINVAL;
 +              goto unlock;
 +      }
 +
 +      if (!bo->sgt) {
 +              ret = ivpu_bo_alloc_and_map_pages_locked(bo);
 +              if (ret)
 +                      goto unlock;
 +      }
 +
 +      if (!bo->mmu_mapped) {
 +              ret = ivpu_mmu_context_map_sgt(vdev, bo->ctx, bo->vpu_addr, bo->sgt,
 +                                             ivpu_bo_is_snooped(bo));
 +              if (ret) {
 +                      ivpu_err(vdev, "Failed to map BO in MMU: %d\n", ret);
 +                      goto unlock;
 +              }
 +              bo->mmu_mapped = true;
 +      }
 +
 +unlock:
 +      mutex_unlock(&bo->lock);
 +
 +      return ret;
 +}
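For context, a minimal hedged sketch of how a caller such as the job-submission
path might use ivpu_bo_pin(): every BO a job references is pinned before the
VPU is handed its addresses. The array-plus-count signature below is
illustrative only; ivpu_bo_pin() itself is the function above.

/* Illustrative helper (not part of the driver): pin a set of BOs. */
static int __maybe_unused example_pin_bos(struct ivpu_bo **bos, u32 bo_count)
{
        u32 i;
        int ret;

        for (i = 0; i < bo_count; i++) {
                ret = ivpu_bo_pin(bos[i]);
                if (ret)
                        return ret;
        }
        return 0;
}

On failure, the BOs pinned so far stay pinned; their mappings are torn down
when the BOs are eventually freed, as in ivpu_bo_free() below.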
 +
 +static int
 +ivpu_bo_alloc_vpu_addr(struct ivpu_bo *bo, struct ivpu_mmu_context *ctx,
 +                     const struct ivpu_addr_range *range)
 +{
 +      struct ivpu_device *vdev = ivpu_bo_to_vdev(bo);
 +      int ret;
 +
 +      if (!range) {
 +              if (bo->flags & DRM_IVPU_BO_HIGH_MEM)
 +                      range = &vdev->hw->ranges.user_high;
 +              else
 +                      range = &vdev->hw->ranges.user_low;
 +      }
 +
 +      mutex_lock(&ctx->lock);
 +      ret = ivpu_mmu_context_insert_node_locked(ctx, range, bo->base.size, &bo->mm_node);
 +      if (!ret) {
 +              bo->ctx = ctx;
 +              bo->vpu_addr = bo->mm_node.start;
 +              list_add_tail(&bo->ctx_node, &ctx->bo_list);
 +      }
 +      mutex_unlock(&ctx->lock);
 +
 +      return ret;
 +}
 +
 +static void ivpu_bo_free_vpu_addr(struct ivpu_bo *bo)
 +{
 +      struct ivpu_device *vdev = ivpu_bo_to_vdev(bo);
 +      struct ivpu_mmu_context *ctx = bo->ctx;
 +
 +      ivpu_dbg(vdev, BO, "remove from ctx: ctx %d vpu_addr 0x%llx allocated %d mmu_mapped %d\n",
 +               ctx->id, bo->vpu_addr, (bool)bo->sgt, bo->mmu_mapped);
 +
 +      mutex_lock(&bo->lock);
 +
 +      if (bo->mmu_mapped) {
 +              drm_WARN_ON(&vdev->drm, !bo->sgt);
 +              ivpu_mmu_context_unmap_sgt(vdev, ctx, bo->vpu_addr, bo->sgt);
 +              bo->mmu_mapped = false;
 +      }
 +
 +      mutex_lock(&ctx->lock);
 +      list_del(&bo->ctx_node);
 +      bo->vpu_addr = 0;
 +      bo->ctx = NULL;
 +      ivpu_mmu_context_remove_node_locked(ctx, &bo->mm_node);
 +      mutex_unlock(&ctx->lock);
 +
 +      mutex_unlock(&bo->lock);
 +}
 +
 +void ivpu_bo_remove_all_bos_from_context(struct ivpu_mmu_context *ctx)
 +{
 +      struct ivpu_bo *bo, *tmp;
 +
 +      list_for_each_entry_safe(bo, tmp, &ctx->bo_list, ctx_node)
 +              ivpu_bo_free_vpu_addr(bo);
 +}
 +
 +static struct ivpu_bo *
 +ivpu_bo_alloc(struct ivpu_device *vdev, struct ivpu_mmu_context *mmu_context,
 +            u64 size, u32 flags, const struct ivpu_bo_ops *ops,
 +            const struct ivpu_addr_range *range, u64 user_ptr)
 +{
 +      struct ivpu_bo *bo;
 +      int ret = 0;
 +
 +      if (drm_WARN_ON(&vdev->drm, size == 0 || !PAGE_ALIGNED(size)))
 +              return ERR_PTR(-EINVAL);
 +
 +      switch (flags & DRM_IVPU_BO_CACHE_MASK) {
 +      case DRM_IVPU_BO_CACHED:
 +      case DRM_IVPU_BO_UNCACHED:
 +      case DRM_IVPU_BO_WC:
 +              break;
 +      default:
 +              return ERR_PTR(-EINVAL);
 +      }
 +
 +      bo = kzalloc(sizeof(*bo), GFP_KERNEL);
 +      if (!bo)
 +              return ERR_PTR(-ENOMEM);
 +
 +      mutex_init(&bo->lock);
 +      bo->base.funcs = &ivpu_gem_funcs;
 +      bo->flags = flags;
 +      bo->ops = ops;
 +      bo->user_ptr = user_ptr;
 +
 +      if (ops->type == IVPU_BO_TYPE_SHMEM)
 +              ret = drm_gem_object_init(&vdev->drm, &bo->base, size);
 +      else
 +              drm_gem_private_object_init(&vdev->drm, &bo->base, size);
 +
 +      if (ret) {
 +              ivpu_err(vdev, "Failed to initialize drm object\n");
 +              goto err_free;
 +      }
 +
 +      if (flags & DRM_IVPU_BO_MAPPABLE) {
 +              ret = drm_gem_create_mmap_offset(&bo->base);
 +              if (ret) {
 +                      ivpu_err(vdev, "Failed to allocate mmap offset\n");
 +                      goto err_release;
 +              }
 +      }
 +
 +      if (mmu_context) {
 +              ret = ivpu_bo_alloc_vpu_addr(bo, mmu_context, range);
 +              if (ret) {
 +                      ivpu_err(vdev, "Failed to add BO to context: %d\n", ret);
 +                      goto err_release;
 +              }
 +      }
 +
 +      return bo;
 +
 +err_release:
 +      drm_gem_object_release(&bo->base);
 +err_free:
 +      kfree(bo);
 +      return ERR_PTR(ret);
 +}
 +
 +static void ivpu_bo_free(struct drm_gem_object *obj)
 +{
 +      struct ivpu_bo *bo = to_ivpu_bo(obj);
 +      struct ivpu_device *vdev = ivpu_bo_to_vdev(bo);
 +
 +      if (bo->ctx)
 +              ivpu_dbg(vdev, BO, "free: ctx %d vpu_addr 0x%llx allocated %d mmu_mapped %d\n",
 +                       bo->ctx->id, bo->vpu_addr, (bool)bo->sgt, bo->mmu_mapped);
 +      else
 +              ivpu_dbg(vdev, BO, "free: ctx (released) allocated %d mmu_mapped %d\n",
 +                       (bool)bo->sgt, bo->mmu_mapped);
 +
 +      drm_WARN_ON(&vdev->drm, !dma_resv_test_signaled(obj->resv, DMA_RESV_USAGE_READ));
 +
 +      vunmap(bo->kvaddr);
 +
 +      if (bo->ctx)
 +              ivpu_bo_free_vpu_addr(bo);
 +
 +      if (bo->sgt)
 +              ivpu_bo_unmap_and_free_pages(bo);
 +
 +      if (bo->base.import_attach)
 +              drm_prime_gem_destroy(&bo->base, bo->sgt);
 +
 +      drm_gem_object_release(&bo->base);
 +
 +      mutex_destroy(&bo->lock);
 +      kfree(bo);
 +}
 +
 +static int ivpu_bo_mmap(struct drm_gem_object *obj, struct vm_area_struct *vma)
 +{
 +      struct ivpu_bo *bo = to_ivpu_bo(obj);
 +      struct ivpu_device *vdev = ivpu_bo_to_vdev(bo);
 +
 +      ivpu_dbg(vdev, BO, "mmap: ctx %u handle %u vpu_addr 0x%llx size %zu type %s\n",
 +               bo->ctx->id, bo->handle, bo->vpu_addr, bo->base.size, bo->ops->name);
 +
 +      if (obj->import_attach) {
 +              /* Drop the reference drm_gem_mmap_obj() acquired. */
 +              drm_gem_object_put(obj);
 +              vma->vm_private_data = NULL;
 +              return dma_buf_mmap(obj->dma_buf, vma, 0);
 +      }
 +
++      vm_flags_set(vma, VM_PFNMAP | VM_DONTEXPAND);
 +      vma->vm_page_prot = ivpu_bo_pgprot(bo, vm_get_page_prot(vma->vm_flags));
 +
 +      return 0;
 +}
 +
 +static struct sg_table *ivpu_bo_get_sg_table(struct drm_gem_object *obj)
 +{
 +      struct ivpu_bo *bo = to_ivpu_bo(obj);
 +      loff_t npages = obj->size >> PAGE_SHIFT;
 +      int ret = 0;
 +
 +      mutex_lock(&bo->lock);
 +
 +      if (!bo->sgt)
 +              ret = ivpu_bo_alloc_and_map_pages_locked(bo);
 +
 +      mutex_unlock(&bo->lock);
 +
 +      if (ret)
 +              return ERR_PTR(ret);
 +
 +      return drm_prime_pages_to_sg(obj->dev, bo->pages, npages);
 +}
 +
 +static vm_fault_t ivpu_vm_fault(struct vm_fault *vmf)
 +{
 +      struct vm_area_struct *vma = vmf->vma;
 +      struct drm_gem_object *obj = vma->vm_private_data;
 +      struct ivpu_bo *bo = to_ivpu_bo(obj);
 +      loff_t npages = obj->size >> PAGE_SHIFT;
 +      pgoff_t page_offset;
 +      struct page *page;
 +      vm_fault_t ret;
 +      int err;
 +
 +      mutex_lock(&bo->lock);
 +
 +      if (!bo->sgt) {
 +              err = ivpu_bo_alloc_and_map_pages_locked(bo);
 +              if (err) {
 +                      ret = vmf_error(err);
 +                      goto unlock;
 +              }
 +      }
 +
 +      /* We don't use vmf->pgoff since that has the fake offset */
 +      page_offset = (vmf->address - vma->vm_start) >> PAGE_SHIFT;
 +      if (page_offset >= npages) {
 +              ret = VM_FAULT_SIGBUS;
 +      } else {
 +              page = bo->pages[page_offset];
 +              ret = vmf_insert_pfn(vma, vmf->address, page_to_pfn(page));
 +      }
 +
 +unlock:
 +      mutex_unlock(&bo->lock);
 +
 +      return ret;
 +}
 +
 +static const struct vm_operations_struct ivpu_vm_ops = {
 +      .fault = ivpu_vm_fault,
 +      .open = drm_gem_vm_open,
 +      .close = drm_gem_vm_close,
 +};
 +
 +static const struct drm_gem_object_funcs ivpu_gem_funcs = {
 +      .free = ivpu_bo_free,
 +      .mmap = ivpu_bo_mmap,
 +      .vm_ops = &ivpu_vm_ops,
 +      .get_sg_table = ivpu_bo_get_sg_table,
 +};
 +
 +int
 +ivpu_bo_create_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 +{
 +      struct ivpu_file_priv *file_priv = file->driver_priv;
 +      struct ivpu_device *vdev = file_priv->vdev;
 +      struct drm_ivpu_bo_create *args = data;
 +      u64 size = PAGE_ALIGN(args->size);
 +      struct ivpu_bo *bo;
 +      int ret;
 +
 +      if (args->flags & ~DRM_IVPU_BO_FLAGS)
 +              return -EINVAL;
 +
 +      if (size == 0)
 +              return -EINVAL;
 +
 +      bo = ivpu_bo_alloc(vdev, &file_priv->ctx, size, args->flags, &shmem_ops, NULL, 0);
 +      if (IS_ERR(bo)) {
 +              ivpu_err(vdev, "Failed to create BO: %pe (ctx %u size %llu flags 0x%x)\n",
 +                       bo, file_priv->ctx.id, args->size, args->flags);
 +              return PTR_ERR(bo);
 +      }
 +
 +      ret = drm_gem_handle_create(file, &bo->base, &bo->handle);
 +      if (!ret) {
 +              args->vpu_addr = bo->vpu_addr;
 +              args->handle = bo->handle;
 +      }
 +
 +      ivpu_dbg(vdev, BO, "alloc shmem: ctx %u vpu_addr 0x%llx size %zu flags 0x%x\n",
 +               file_priv->ctx.id, bo->vpu_addr, bo->base.size, bo->flags);
 +
 +      /* Drop the local reference last; if handle creation failed this frees the BO. */
 +      drm_gem_object_put(&bo->base);
 +
 +      return ret;
 +}
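To make the ioctl flow concrete, a hedged userspace sketch that creates a
mappable BO, queries its mmap offset via the BO_INFO ioctl, and maps it into
the caller's address space. The DRM_IOCTL_IVPU_BO_CREATE and
DRM_IOCTL_IVPU_BO_INFO macros and the struct names are assumed to come from
the drm/ivpu_accel.h uapi header; the fd is assumed to have been opened on an
accel node such as /dev/accel/accel0.

#include <fcntl.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>

#include <drm/ivpu_accel.h>     /* uapi header name is an assumption */

/* Create a 4 KiB mappable BO and map it into the caller's address space. */
static void *example_bo_create_and_map(int fd, uint32_t *handle)
{
        struct drm_ivpu_bo_create create = { .size = 4096, .flags = DRM_IVPU_BO_MAPPABLE };
        struct drm_ivpu_bo_info info = { 0 };
        void *ptr;

        if (ioctl(fd, DRM_IOCTL_IVPU_BO_CREATE, &create))
                return NULL;

        info.handle = create.handle;
        if (ioctl(fd, DRM_IOCTL_IVPU_BO_INFO, &info))
                return NULL;

        /* The fake offset returned by BO_INFO selects this BO in mmap(). */
        ptr = mmap(NULL, info.size, PROT_READ | PROT_WRITE, MAP_SHARED,
                   fd, info.mmap_offset);
        if (ptr == MAP_FAILED)
                return NULL;

        *handle = create.handle;
        return ptr;
}

Here create.vpu_addr is the address the VPU sees, while the CPU mapping goes
through ivpu_bo_mmap() above and honours the BO's caching flags.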
 +
 +struct ivpu_bo *
 +ivpu_bo_alloc_internal(struct ivpu_device *vdev, u64 vpu_addr, u64 size, u32 flags)
 +{
 +      const struct ivpu_addr_range *range;
 +      struct ivpu_addr_range fixed_range;
 +      struct ivpu_bo *bo;
 +      pgprot_t prot;
 +      int ret;
 +
 +      drm_WARN_ON(&vdev->drm, !PAGE_ALIGNED(vpu_addr));
 +      drm_WARN_ON(&vdev->drm, !PAGE_ALIGNED(size));
 +
 +      if (vpu_addr) {
 +              fixed_range.start = vpu_addr;
 +              fixed_range.end = vpu_addr + size;
 +              range = &fixed_range;
 +      } else {
 +              range = &vdev->hw->ranges.global_low;
 +      }
 +
 +      bo = ivpu_bo_alloc(vdev, &vdev->gctx, size, flags, &internal_ops, range, 0);
 +      if (IS_ERR(bo)) {
 +              ivpu_err(vdev, "Failed to create BO: %pe (vpu_addr 0x%llx size %llu flags 0x%x)\n",
 +                       bo, vpu_addr, size, flags);
 +              return NULL;
 +      }
 +
 +      ret = ivpu_bo_pin(bo);
 +      if (ret)
 +              goto err_put;
 +
 +      if (ivpu_bo_cache_mode(bo) != DRM_IVPU_BO_CACHED)
 +              drm_clflush_pages(bo->pages, bo->base.size >> PAGE_SHIFT);
 +
 +      prot = ivpu_bo_pgprot(bo, PAGE_KERNEL);
 +      bo->kvaddr = vmap(bo->pages, bo->base.size >> PAGE_SHIFT, VM_MAP, prot);
 +      if (!bo->kvaddr) {
 +              ivpu_err(vdev, "Failed to map BO into kernel virtual memory\n");
 +              goto err_put;
 +      }
 +
 +      ivpu_dbg(vdev, BO, "alloc internal: ctx 0 vpu_addr 0x%llx size %zu flags 0x%x\n",
 +               bo->vpu_addr, bo->base.size, flags);
 +
 +      return bo;
 +
 +err_put:
 +      drm_gem_object_put(&bo->base);
 +      return NULL;
 +}
 +
 +void ivpu_bo_free_internal(struct ivpu_bo *bo)
 +{
 +      drm_gem_object_put(&bo->base);
 +}
 +
 +struct drm_gem_object *ivpu_gem_prime_import(struct drm_device *dev, struct dma_buf *buf)
 +{
 +      struct ivpu_device *vdev = to_ivpu_device(dev);
 +      struct dma_buf_attachment *attach;
 +      struct ivpu_bo *bo;
 +
 +      attach = dma_buf_attach(buf, dev->dev);
 +      if (IS_ERR(attach))
 +              return ERR_CAST(attach);
 +
 +      get_dma_buf(buf);
 +
 +      bo = ivpu_bo_alloc(vdev, NULL, buf->size, DRM_IVPU_BO_MAPPABLE, &prime_ops, NULL, 0);
 +      if (IS_ERR(bo)) {
 +              ivpu_err(vdev, "Failed to import BO: %pe (size %lu)\n", bo, buf->size);
 +              goto err_detach;
 +      }
 +
 +      lockdep_set_class(&bo->lock, &prime_bo_lock_class_key);
 +
 +      bo->base.import_attach = attach;
 +
 +      return &bo->base;
 +
 +err_detach:
 +      dma_buf_detach(buf, attach);
 +      dma_buf_put(buf);
 +      return ERR_CAST(bo);
 +}
 +
 +int ivpu_bo_info_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 +{
 +      struct ivpu_file_priv *file_priv = file->driver_priv;
 +      struct ivpu_device *vdev = to_ivpu_device(dev);
 +      struct drm_ivpu_bo_info *args = data;
 +      struct drm_gem_object *obj;
 +      struct ivpu_bo *bo;
 +      int ret = 0;
 +
 +      obj = drm_gem_object_lookup(file, args->handle);
 +      if (!obj)
 +              return -ENOENT;
 +
 +      bo = to_ivpu_bo(obj);
 +
 +      mutex_lock(&bo->lock);
 +
 +      if (!bo->ctx) {
 +              ret = ivpu_bo_alloc_vpu_addr(bo, &file_priv->ctx, NULL);
 +              if (ret) {
 +                      ivpu_err(vdev, "Failed to allocate vpu_addr: %d\n", ret);
 +                      goto unlock;
 +              }
 +      }
 +
 +      args->flags = bo->flags;
 +      args->mmap_offset = drm_vma_node_offset_addr(&obj->vma_node);
 +      args->vpu_addr = bo->vpu_addr;
 +      args->size = obj->size;
 +unlock:
 +      mutex_unlock(&bo->lock);
 +      drm_gem_object_put(obj);
 +      return ret;
 +}
 +
 +int ivpu_bo_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 +{
 +      struct drm_ivpu_bo_wait *args = data;
 +      struct drm_gem_object *obj;
 +      unsigned long timeout;
 +      long ret;
 +
 +      timeout = drm_timeout_abs_to_jiffies(args->timeout_ns);
 +
 +      obj = drm_gem_object_lookup(file, args->handle);
 +      if (!obj)
 +              return -EINVAL;
 +
 +      ret = dma_resv_wait_timeout(obj->resv, DMA_RESV_USAGE_READ, true, timeout);
 +      if (ret == 0) {
 +              ret = -ETIMEDOUT;
 +      } else if (ret > 0) {
 +              ret = 0;
 +              args->job_status = to_ivpu_bo(obj)->job_status;
 +      }
 +
 +      drm_gem_object_put(obj);
 +
 +      return ret;
 +}
 +
 +static void ivpu_bo_print_info(struct ivpu_bo *bo, struct drm_printer *p)
 +{
 +      unsigned long dma_refcount = 0;
 +
 +      if (bo->base.dma_buf && bo->base.dma_buf->file)
 +              dma_refcount = atomic_long_read(&bo->base.dma_buf->file->f_count);
 +
 +      drm_printf(p, "%5u %6d %16llx %10lu %10u %12lu %14s\n",
 +                 bo->ctx->id, bo->handle, bo->vpu_addr, bo->base.size,
 +                 kref_read(&bo->base.refcount), dma_refcount, bo->ops->name);
 +}
 +
 +void ivpu_bo_list(struct drm_device *dev, struct drm_printer *p)
 +{
 +      struct ivpu_device *vdev = to_ivpu_device(dev);
 +      struct ivpu_file_priv *file_priv;
 +      unsigned long ctx_id;
 +      struct ivpu_bo *bo;
 +
 +      drm_printf(p, "%5s %6s %16s %10s %10s %12s %14s\n",
 +                 "ctx", "handle", "vpu_addr", "size", "refcount", "dma_refcount", "type");
 +
 +      mutex_lock(&vdev->gctx.lock);
 +      list_for_each_entry(bo, &vdev->gctx.bo_list, ctx_node)
 +              ivpu_bo_print_info(bo, p);
 +      mutex_unlock(&vdev->gctx.lock);
 +
 +      xa_for_each(&vdev->context_xa, ctx_id, file_priv) {
 +              file_priv = ivpu_file_priv_get_by_ctx_id(vdev, ctx_id);
 +              if (!file_priv)
 +                      continue;
 +
 +              mutex_lock(&file_priv->ctx.lock);
 +              list_for_each_entry(bo, &file_priv->ctx.bo_list, ctx_node)
 +                      ivpu_bo_print_info(bo, p);
 +              mutex_unlock(&file_priv->ctx.lock);
 +
 +              ivpu_file_priv_put(&file_priv);
 +      }
 +}
 +
 +void ivpu_bo_list_print(struct drm_device *dev)
 +{
 +      struct drm_printer p = drm_info_printer(dev->dev);
 +
 +      ivpu_bo_list(dev, &p);
 +}
 
  
        /* Tell the block layer that this is not a rotational device */
        blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
+       blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, disk->queue);
        blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, disk->queue);
 +      blk_queue_flag_set(QUEUE_FLAG_NOWAIT, disk->queue);
        err = add_disk(disk);
        if (err)
                goto out_cleanup_disk;
 
   */
  static void end_buffer_async_read_io(struct buffer_head *bh, int uptodate)
  {
-       struct inode *inode = bh->b_page->mapping->host;
 -      /* Decrypt if needed */
 -      if (uptodate &&
 -          fscrypt_inode_uses_fs_layer_crypto(bh->b_folio->mapping->host)) {
 -              struct decrypt_bh_ctx *ctx = kmalloc(sizeof(*ctx), GFP_ATOMIC);
++      struct inode *inode = bh->b_folio->mapping->host;
 +      bool decrypt = fscrypt_inode_uses_fs_layer_crypto(inode);
 +      bool verify = need_fsverity(bh);
 +
 +      /* Decrypt (with fscrypt) and/or verify (with fsverity) if needed. */
 +      if (uptodate && (decrypt || verify)) {
 +              struct postprocess_bh_ctx *ctx =
 +                      kmalloc(sizeof(*ctx), GFP_ATOMIC);
  
                if (ctx) {
 -                      INIT_WORK(&ctx->work, decrypt_bh);
                        ctx->bh = bh;
 -                      fscrypt_enqueue_decrypt_work(&ctx->work);
 +                      if (decrypt) {
 +                              INIT_WORK(&ctx->work, decrypt_bh);
 +                              fscrypt_enqueue_decrypt_work(&ctx->work);
 +                      } else {
 +                              INIT_WORK(&ctx->work, verify_bh);
 +                              fsverity_enqueue_verify_work(&ctx->work);
 +                      }
                        return;
                }
                uptodate = 0;
 
        return rc;
  }
  
 -static struct cifs_writedata *
 -wdata_alloc_and_fillpages(pgoff_t tofind, struct address_space *mapping,
 -                        pgoff_t end, pgoff_t *index,
 -                        unsigned int *found_pages)
 +/*
 + * Extend the region to be written back to include subsequent contiguously
 + * dirty pages if possible, but don't sleep while doing so.
 + */
 +static void cifs_extend_writeback(struct address_space *mapping,
 +                                long *_count,
 +                                loff_t start,
 +                                int max_pages,
 +                                size_t max_len,
 +                                unsigned int *_len)
  {
 -      struct cifs_writedata *wdata;
 -      struct folio_batch fbatch;
 -      unsigned int i, idx, p, nr;
 -      wdata = cifs_writedata_alloc((unsigned int)tofind,
 -                                   cifs_writev_complete);
 -      if (!wdata)
 -              return NULL;
 -
 -      folio_batch_init(&fbatch);
 -      *found_pages = 0;
 -
 -again:
 -      nr = filemap_get_folios_tag(mapping, index, end,
 -                              PAGECACHE_TAG_DIRTY, &fbatch);
 -      if (!nr)
 -              goto out; /* No dirty pages left in the range */
 -
 -      for (i = 0; i < nr; i++) {
 -              struct folio *folio = fbatch.folios[i];
 -
 -              idx = 0;
 -              p = folio_nr_pages(folio);
 -add_more:
 -              wdata->pages[*found_pages] = folio_page(folio, idx);
 -              folio_get(folio);
 -              if (++*found_pages == tofind) {
 -                      folio_batch_release(&fbatch);
 -                      goto out;
 -              }
 -              if (++idx < p)
 -                      goto add_more;
 -      }
 -      folio_batch_release(&fbatch);
 -      goto again;
 -out:
 -      return wdata;
 -}
 +      struct folio_batch batch;
 +      struct folio *folio;
 +      unsigned int psize, nr_pages;
 +      size_t len = *_len;
 +      pgoff_t index = (start + len) / PAGE_SIZE;
 +      bool stop = true;
 +      unsigned int i;
 +      XA_STATE(xas, &mapping->i_pages, index);
  
 -static unsigned int
 -wdata_prepare_pages(struct cifs_writedata *wdata, unsigned int found_pages,
 -                  struct address_space *mapping,
 -                  struct writeback_control *wbc,
 -                  pgoff_t end, pgoff_t *index, pgoff_t *next, bool *done)
 -{
 -      unsigned int nr_pages = 0, i;
 -      struct page *page;
 +      folio_batch_init(&batch);
  
 -      for (i = 0; i < found_pages; i++) {
 -              page = wdata->pages[i];
 -              /*
 -               * At this point we hold neither the i_pages lock nor the
 -               * page lock: the page may be truncated or invalidated
 -               * (changing page->mapping to NULL), or even swizzled
 -               * back from swapper_space to tmpfs file mapping
 +      do {
 +              /* Firstly, we gather up a batch of contiguous dirty pages
 +               * under the RCU read lock - but we can't clear the dirty flags
 +               * there if any of those pages are mapped.
                 */
 +              rcu_read_lock();
  
 -              if (nr_pages == 0)
 -                      lock_page(page);
 -              else if (!trylock_page(page))
 -                      break;
 -
 -              if (unlikely(page->mapping != mapping)) {
 -                      unlock_page(page);
 -                      break;
 -              }
 +              xas_for_each(&xas, folio, ULONG_MAX) {
 +                      stop = true;
 +                      if (xas_retry(&xas, folio))
 +                              continue;
 +                      if (xa_is_value(folio))
 +                              break;
 +                      if (folio_index(folio) != index)
 +                              break;
 +                      if (!folio_try_get_rcu(folio)) {
 +                              xas_reset(&xas);
 +                              continue;
 +                      }
 +                      nr_pages = folio_nr_pages(folio);
 +                      if (nr_pages > max_pages)
 +                              break;
  
 -              if (!wbc->range_cyclic && page->index > end) {
 -                      *done = true;
 -                      unlock_page(page);
 -                      break;
 -              }
 +                      /* Has the page moved or been split? */
 +                      if (unlikely(folio != xas_reload(&xas))) {
 +                              folio_put(folio);
 +                              break;
 +                      }
  
 -              if (*next && (page->index != *next)) {
 -                      /* Not next consecutive page */
 -                      unlock_page(page);
 -                      break;
 -              }
 +                      if (!folio_trylock(folio)) {
 +                              folio_put(folio);
 +                              break;
 +                      }
 +                      if (!folio_test_dirty(folio) || folio_test_writeback(folio)) {
 +                              folio_unlock(folio);
 +                              folio_put(folio);
 +                              break;
 +                      }
  
 -              if (wbc->sync_mode != WB_SYNC_NONE)
 -                      wait_on_page_writeback(page);
 +                      max_pages -= nr_pages;
 +                      psize = folio_size(folio);
 +                      len += psize;
 +                      stop = false;
 +                      if (max_pages <= 0 || len >= max_len || *_count <= 0)
 +                              stop = true;
  
 -              if (PageWriteback(page) ||
 -                              !clear_page_dirty_for_io(page)) {
 -                      unlock_page(page);
 -                      break;
 +                      index += nr_pages;
 +                      if (!folio_batch_add(&batch, folio))
 +                              break;
 +                      if (stop)
 +                              break;
                }
  
 -              /*
 -               * This actually clears the dirty bit in the radix tree.
 -               * See cifs_writepage() for more commentary.
 +              if (!stop)
 +                      xas_pause(&xas);
 +              rcu_read_unlock();
 +
 +              /* Now, if we obtained any pages, we can shift them to being
 +               * writable and mark them for caching.
                 */
 -              set_page_writeback(page);
 -              if (page_offset(page) >= i_size_read(mapping->host)) {
 -                      *done = true;
 -                      unlock_page(page);
 -                      end_page_writeback(page);
 +              if (!folio_batch_count(&batch))
                        break;
 -              }
  
 -              wdata->pages[i] = page;
 -              *next = page->index + 1;
 -              ++nr_pages;
 -      }
 +              for (i = 0; i < folio_batch_count(&batch); i++) {
 +                      folio = batch.folios[i];
 +                      /* The folio should be locked, dirty and not undergoing
 +                       * writeback from the loop above.
 +                       */
 +                      if (!folio_clear_dirty_for_io(folio))
 +                              WARN_ON(1);
 +                      if (folio_start_writeback(folio))
 +                              WARN_ON(1);
  
 -      /* reset index to refind any pages skipped */
 -      if (nr_pages == 0)
 -              *index = wdata->pages[0]->index + 1;
 +                      *_count -= folio_nr_pages(folio);
 +                      folio_unlock(folio);
 +              }
  
 -      /* put any pages we aren't going to use */
 -      for (i = nr_pages; i < found_pages; i++) {
 -              put_page(wdata->pages[i]);
 -              wdata->pages[i] = NULL;
 -      }
 +              folio_batch_release(&batch);
 +              cond_resched();
 +      } while (!stop);
  
 -      return nr_pages;
 +      *_len = len;
  }
  
 -static int
 -wdata_send_pages(struct cifs_writedata *wdata, unsigned int nr_pages,
 -               struct address_space *mapping, struct writeback_control *wbc)
 +/*
 + * Write back the locked page and any subsequent non-locked dirty pages.
 + */
 +static ssize_t cifs_write_back_from_locked_folio(struct address_space *mapping,
 +                                               struct writeback_control *wbc,
 +                                               struct folio *folio,
 +                                               loff_t start, loff_t end)
  {
 +      struct inode *inode = mapping->host;
 +      struct TCP_Server_Info *server;
 +      struct cifs_writedata *wdata;
 +      struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
 +      struct cifs_credits credits_on_stack;
 +      struct cifs_credits *credits = &credits_on_stack;
 +      struct cifsFileInfo *cfile = NULL;
 +      unsigned int xid, wsize, len;
 +      loff_t i_size = i_size_read(inode);
 +      size_t max_len;
 +      long count = wbc->nr_to_write;
        int rc;
  
 -      wdata->sync_mode = wbc->sync_mode;
 -      wdata->nr_pages = nr_pages;
 -      wdata->offset = page_offset(wdata->pages[0]);
 -      wdata->pagesz = PAGE_SIZE;
 -      wdata->tailsz = min(i_size_read(mapping->host) -
 -                      page_offset(wdata->pages[nr_pages - 1]),
 -                      (loff_t)PAGE_SIZE);
 -      wdata->bytes = ((nr_pages - 1) * PAGE_SIZE) + wdata->tailsz;
 -      wdata->pid = wdata->cfile->pid;
 -
 -      rc = adjust_credits(wdata->server, &wdata->credits, wdata->bytes);
 -      if (rc)
 -              return rc;
 -
 -      if (wdata->cfile->invalidHandle)
 -              rc = -EAGAIN;
 -      else
 -              rc = wdata->server->ops->async_writev(wdata,
 -                                                    cifs_writedata_release);
 +      /* The folio should be locked, dirty and not undergoing writeback. */
 +      if (folio_start_writeback(folio))
 +              WARN_ON(1);
  
 -      return rc;
 -}
 +      count -= folio_nr_pages(folio);
 +      len = folio_size(folio);
  
 -static int
 -cifs_writepage_locked(struct page *page, struct writeback_control *wbc);
 +      xid = get_xid();
 +      server = cifs_pick_channel(cifs_sb_master_tcon(cifs_sb)->ses);
  
 -static int cifs_write_one_page(struct folio *folio,
 -              struct writeback_control *wbc, void *data)
 -{
 -      struct address_space *mapping = data;
 -      int ret;
 +      rc = cifs_get_writable_file(CIFS_I(inode), FIND_WR_ANY, &cfile);
 +      if (rc) {
 +              cifs_dbg(VFS, "No writable handle in writepages rc=%d\n", rc);
 +              goto err_xid;
 +      }
  
 -      ret = cifs_writepage_locked(&folio->page, wbc);
 -      folio_unlock(folio);
 -      mapping_set_error(mapping, ret);
 -      return ret;
 -}
 +      rc = server->ops->wait_mtu_credits(server, cifs_sb->ctx->wsize,
 +                                         &wsize, credits);
 +      if (rc != 0)
 +              goto err_close;
  
 -static int cifs_writepages(struct address_space *mapping,
 -                         struct writeback_control *wbc)
 -{
 -      struct inode *inode = mapping->host;
 -      struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
 -      struct TCP_Server_Info *server;
 -      bool done = false, scanned = false, range_whole = false;
 -      pgoff_t end, index;
 -      struct cifs_writedata *wdata;
 -      struct cifsFileInfo *cfile = NULL;
 -      int rc = 0;
 -      int saved_rc = 0;
 -      unsigned int xid;
 +      wdata = cifs_writedata_alloc(cifs_writev_complete);
 +      if (!wdata) {
 +              rc = -ENOMEM;
 +              goto err_uncredit;
 +      }
  
 -      /*
 -       * If wsize is smaller than the page cache size, default to writing
 -       * one page at a time.
 +      wdata->sync_mode = wbc->sync_mode;
 +      wdata->offset = folio_pos(folio);
 +      wdata->pid = cfile->pid;
 +      wdata->credits = credits_on_stack;
 +      wdata->cfile = cfile;
 +      wdata->server = server;
 +      cfile = NULL;
 +
 +      /* Find all consecutive lockable dirty pages, stopping when we find a
 +       * page that is not immediately lockable, is not dirty or is missing,
 +       * or we reach the end of the range.
         */
 -      if (cifs_sb->ctx->wsize < PAGE_SIZE)
 -              return write_cache_pages(mapping, wbc, cifs_write_one_page,
 -                              mapping);
 +      if (start < i_size) {
 +              /* Trim the write to the EOF; the extra data is ignored.  Also
 +               * put an upper limit on the size of a single storedata op.
 +               */
 +              max_len = wsize;
 +              max_len = min_t(unsigned long long, max_len, end - start + 1);
 +              max_len = min_t(unsigned long long, max_len, i_size - start);
  
 -      xid = get_xid();
 -      if (wbc->range_cyclic) {
 -              index = mapping->writeback_index; /* Start from prev offset */
 -              end = -1;
 -      } else {
 -              index = wbc->range_start >> PAGE_SHIFT;
 -              end = wbc->range_end >> PAGE_SHIFT;
 -              if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
 -                      range_whole = true;
 -              scanned = true;
 +              if (len < max_len) {
 +                      int max_pages = INT_MAX;
 +
 +#ifdef CONFIG_CIFS_SMB_DIRECT
 +                      if (server->smbd_conn)
 +                              max_pages = server->smbd_conn->max_frmr_depth;
 +#endif
 +                      max_pages -= folio_nr_pages(folio);
 +
 +                      if (max_pages > 0)
 +                              cifs_extend_writeback(mapping, &count, start,
 +                                                    max_pages, max_len, &len);
 +              }
 +              len = min_t(loff_t, len, max_len);
        }
 -      server = cifs_pick_channel(cifs_sb_master_tcon(cifs_sb)->ses);
  
 -retry:
 -      while (!done && index <= end) {
 -              unsigned int i, nr_pages, found_pages, wsize;
 -              pgoff_t next = 0, tofind, saved_index = index;
 -              struct cifs_credits credits_on_stack;
 -              struct cifs_credits *credits = &credits_on_stack;
 -              int get_file_rc = 0;
 +      wdata->bytes = len;
  
 -              if (cfile)
 -                      cifsFileInfo_put(cfile);
 +      /* We now have a contiguous set of dirty pages, each with writeback
 +       * set; the first page is still locked at this point, but all the rest
 +       * have been unlocked.
 +       */
 +      folio_unlock(folio);
  
 -              rc = cifs_get_writable_file(CIFS_I(inode), FIND_WR_ANY, &cfile);
 +      if (start < i_size) {
 +              iov_iter_xarray(&wdata->iter, ITER_SOURCE, &mapping->i_pages,
 +                              start, len);
  
 -              /* in case of an error store it to return later */
 +              rc = adjust_credits(wdata->server, &wdata->credits, wdata->bytes);
                if (rc)
 -                      get_file_rc = rc;
 +                      goto err_wdata;
  
 -              rc = server->ops->wait_mtu_credits(server, cifs_sb->ctx->wsize,
 -                                                 &wsize, credits);
 -              if (rc != 0) {
 -                      done = true;
 -                      break;
 +              if (wdata->cfile->invalidHandle)
 +                      rc = -EAGAIN;
 +              else
 +                      rc = wdata->server->ops->async_writev(wdata,
 +                                                            cifs_writedata_release);
 +              if (rc >= 0) {
 +                      kref_put(&wdata->refcount, cifs_writedata_release);
 +                      goto err_close;
                }
 +      } else {
 +              /* The dirty region was entirely beyond the EOF. */
 +              cifs_pages_written_back(inode, start, len);
 +              rc = 0;
 +      }
  
 -              tofind = min((wsize / PAGE_SIZE) - 1, end - index) + 1;
 +err_wdata:
 +      kref_put(&wdata->refcount, cifs_writedata_release);
 +err_uncredit:
 +      add_credits_and_wake_if(server, credits, 0);
 +err_close:
 +      if (cfile)
 +              cifsFileInfo_put(cfile);
 +err_xid:
 +      free_xid(xid);
 +      if (rc == 0) {
 +              wbc->nr_to_write = count;
 +      } else if (is_retryable_error(rc)) {
 +              cifs_pages_write_redirty(inode, start, len);
 +      } else {
 +              cifs_pages_write_failed(inode, start, len);
 +              mapping_set_error(mapping, rc);
 +      }
 +      /* Indication to update ctime and mtime as close is deferred */
 +      set_bit(CIFS_INO_MODIFIED_ATTR, &CIFS_I(inode)->flags);
 +      return rc;
 +}
  
 -              wdata = wdata_alloc_and_fillpages(tofind, mapping, end, &index,
 -                                                &found_pages);
 -              if (!wdata) {
 -                      rc = -ENOMEM;
 -                      done = true;
 -                      add_credits_and_wake_if(server, credits, 0);
 -                      break;
 -              }
 +/*
 + * write a region of pages back to the server
 + */
 +static int cifs_writepages_region(struct address_space *mapping,
 +                                struct writeback_control *wbc,
 +                                loff_t start, loff_t end, loff_t *_next)
 +{
-       struct folio *folio;
-       struct page *head_page;
-       ssize_t ret;
-       int n, skips = 0;
++      struct folio_batch fbatch;
++      int skips = 0;
  
 -              if (found_pages == 0) {
 -                      kref_put(&wdata->refcount, cifs_writedata_release);
 -                      add_credits_and_wake_if(server, credits, 0);
++      folio_batch_init(&fbatch);
 +      do {
++              int nr;
 +              pgoff_t index = start / PAGE_SIZE;
 +
-               n = find_get_pages_range_tag(mapping, &index, end / PAGE_SIZE,
-                                            PAGECACHE_TAG_DIRTY, 1, &head_page);
-               if (!n)
++              nr = filemap_get_folios_tag(mapping, &index, end / PAGE_SIZE,
++                                          PAGECACHE_TAG_DIRTY, &fbatch);
++              if (!nr)
                        break;
 -              }
  
-               folio = page_folio(head_page);
-               start = folio_pos(folio); /* May regress with THPs */
 -              nr_pages = wdata_prepare_pages(wdata, found_pages, mapping, wbc,
 -                                             end, &index, &next, &done);
++              for (int i = 0; i < nr; i++) {
++                      ssize_t ret;
++                      struct folio *folio = fbatch.folios[i];
  
-               /* At this point we hold neither the i_pages lock nor the
-                * page lock: the page may be truncated or invalidated
-                * (changing page->mapping to NULL), or even swizzled
-                * back from swapper_space to tmpfs file mapping
-                */
-               if (wbc->sync_mode != WB_SYNC_NONE) {
-                       ret = folio_lock_killable(folio);
-                       if (ret < 0) {
-                               folio_put(folio);
-                               return ret;
 -              /* nothing to write? */
 -              if (nr_pages == 0) {
 -                      kref_put(&wdata->refcount, cifs_writedata_release);
 -                      add_credits_and_wake_if(server, credits, 0);
 -                      continue;
 -              }
++redo_folio:
++                      start = folio_pos(folio); /* May regress with THPs */
+ 
 -              wdata->credits = credits_on_stack;
 -              wdata->cfile = cfile;
 -              wdata->server = server;
 -              cfile = NULL;
++                      /* At this point we hold neither the i_pages lock nor the
++                       * page lock: the page may be truncated or invalidated
++                       * (changing page->mapping to NULL), or even swizzled
++                       * back from swapper_space to tmpfs file mapping
++                       */
++                      if (wbc->sync_mode != WB_SYNC_NONE) {
++                              ret = folio_lock_killable(folio);
++                              if (ret < 0)
++                                      goto write_error;
++                      } else {
++                              if (!folio_trylock(folio))
++                                      goto skip_write;
 +                      }
-               } else {
-                       if (!folio_trylock(folio)) {
-                               folio_put(folio);
-                               return 0;
+ 
 -              if (!wdata->cfile) {
 -                      cifs_dbg(VFS, "No writable handle in writepages rc=%d\n",
 -                               get_file_rc);
 -                      if (is_retryable_error(get_file_rc))
 -                              rc = get_file_rc;
 -                      else
 -                              rc = -EBADF;
 -              } else
 -                      rc = wdata_send_pages(wdata, nr_pages, mapping, wbc);
++                      if (folio_mapping(folio) != mapping ||
++                          !folio_test_dirty(folio)) {
++                              folio_unlock(folio);
++                              goto skip_write;
 +                      }
-               }
  
-               if (folio_mapping(folio) != mapping ||
-                   !folio_test_dirty(folio)) {
-                       start += folio_size(folio);
-                       folio_unlock(folio);
-                       folio_put(folio);
-                       continue;
-               }
 -              for (i = 0; i < nr_pages; ++i)
 -                      unlock_page(wdata->pages[i]);
++                      if (folio_test_writeback(folio) ||
++                          folio_test_fscache(folio)) {
++                              folio_unlock(folio);
++                              if (wbc->sync_mode == WB_SYNC_NONE)
++                                      goto skip_write;
  
-               if (folio_test_writeback(folio) ||
-                   folio_test_fscache(folio)) {
-                       folio_unlock(folio);
-                       if (wbc->sync_mode != WB_SYNC_NONE) {
 -              /* send failure -- clean up the mess */
 -              if (rc != 0) {
 -                      add_credits_and_wake_if(server, &wdata->credits, 0);
 -                      for (i = 0; i < nr_pages; ++i) {
 -                              if (is_retryable_error(rc))
 -                                      redirty_page_for_writepage(wbc,
 -                                                         wdata->pages[i]);
 -                              else
 -                                      SetPageError(wdata->pages[i]);
 -                              end_page_writeback(wdata->pages[i]);
 -                              put_page(wdata->pages[i]);
 +                              folio_wait_writeback(folio);
 +#ifdef CONFIG_CIFS_FSCACHE
 +                              folio_wait_fscache(folio);
 +#endif
-                       } else {
-                               start += folio_size(folio);
++                              goto redo_folio;
                        }
-                       folio_put(folio);
-                       if (wbc->sync_mode == WB_SYNC_NONE) {
-                               if (skips >= 5 || need_resched())
-                                       break;
-                               skips++;
-                       }
-                       continue;
 -                      if (!is_retryable_error(rc))
 -                              mapping_set_error(mapping, rc);
--              }
 -              kref_put(&wdata->refcount, cifs_writedata_release);
  
-               if (!folio_clear_dirty_for_io(folio))
-                       /* We hold the page lock - it should've been dirty. */
-                       WARN_ON(1);
 -              if (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN) {
 -                      index = saved_index;
 -                      continue;
 -              }
++                      if (!folio_clear_dirty_for_io(folio))
++                              /* We hold the page lock - it should've been dirty. */
++                              WARN_ON(1);
  
-               ret = cifs_write_back_from_locked_folio(mapping, wbc, folio, start, end);
-               folio_put(folio);
-               if (ret < 0)
 -              /* Return immediately if we received a signal during writing */
 -              if (is_interrupt_error(rc)) {
 -                      done = true;
 -                      break;
 -              }
++                      ret = cifs_write_back_from_locked_folio(mapping, wbc, folio, start, end);
++                      if (ret < 0)
++                              goto write_error;
+ 
 -              if (rc != 0 && saved_rc == 0)
 -                      saved_rc = rc;
++                      start += ret;
++                      continue;
+ 
 -              wbc->nr_to_write -= nr_pages;
 -              if (wbc->nr_to_write <= 0)
 -                      done = true;
++write_error:
++                      folio_batch_release(&fbatch);
++                      *_next = start;
 +                      return ret;
  
-               start += ret;
 -              index = next;
 -      }
++skip_write:
++                      /*
++                       * Too many skipped writes, or need to reschedule?
++                       * Treat it as a write error without an error code.
++                       */
++                      if (skips >= 5 || need_resched()) {
++                              ret = 0;
++                              goto write_error;
++                      }
+ 
 -      if (!scanned && !done) {
 -              /*
 -               * We hit the last page and there is more work to be done: wrap
 -               * back to the start of the file
 -               */
 -              scanned = true;
 -              index = 0;
 -              goto retry;
 -      }
++                      /* Otherwise, just skip that folio and go on to the next */
++                      skips++;
++                      start += folio_size(folio);
++                      continue;
++              }
+ 
 -      if (saved_rc != 0)
 -              rc = saved_rc;
++              folio_batch_release(&fbatch);
 +              cond_resched();
 +      } while (wbc->nr_to_write > 0);
  
 -      if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
 -              mapping->writeback_index = index;
 +      *_next = start;
 +      return 0;
 +}
  
 -      if (cfile)
 -              cifsFileInfo_put(cfile);
 -      free_xid(xid);
 -      /* Indication to update ctime and mtime as close is deferred */
 -      set_bit(CIFS_INO_MODIFIED_ATTR, &CIFS_I(inode)->flags);
 -      return rc;
 +/*
 + * Write some of the pending data back to the server
 + */
 +static int cifs_writepages(struct address_space *mapping,
 +                         struct writeback_control *wbc)
 +{
 +      loff_t start, next;
 +      int ret;
 +
 +      /* We have to be careful as we can end up racing with setattr()
 +       * truncating the pagecache since the caller doesn't take a lock here
 +       * to prevent it.
 +       */
 +
 +      if (wbc->range_cyclic) {
 +              start = mapping->writeback_index * PAGE_SIZE;
 +              ret = cifs_writepages_region(mapping, wbc, start, LLONG_MAX, &next);
 +              if (ret == 0) {
 +                      mapping->writeback_index = next / PAGE_SIZE;
 +                      if (start > 0 && wbc->nr_to_write > 0) {
 +                              ret = cifs_writepages_region(mapping, wbc, 0,
 +                                                           start, &next);
 +                              if (ret == 0)
 +                                      mapping->writeback_index =
 +                                              next / PAGE_SIZE;
 +                      }
 +              }
 +      } else if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) {
 +              ret = cifs_writepages_region(mapping, wbc, 0, LLONG_MAX, &next);
 +              if (wbc->nr_to_write > 0 && ret == 0)
 +                      mapping->writeback_index = next / PAGE_SIZE;
 +      } else {
 +              ret = cifs_writepages_region(mapping, wbc,
 +                                           wbc->range_start, wbc->range_end, &next);
 +      }
 +
 +      return ret;
  }
  
  static int
 
        brelse(bd->bd_bh);
  }
  
- static int __gfs2_writepage(struct page *page, struct writeback_control *wbc,
++static int __gfs2_writepage(struct folio *folio, struct writeback_control *wbc,
 +                     void *data)
 +{
 +      struct address_space *mapping = data;
-       int ret = mapping->a_ops->writepage(page, wbc);
++      int ret = mapping->a_ops->writepage(&folio->page, wbc);
 +      mapping_set_error(mapping, ret);
 +      return ret;
 +}
 +
  /**
   * gfs2_ail1_start_one - Start I/O on a transaction
   * @sdp: The superblock
 
        return ret;
  }
  
- static int nfs_writepages_callback(struct page *page,
+ static int nfs_writepages_callback(struct folio *folio,
 -              struct writeback_control *wbc, void *data)
 +                                 struct writeback_control *wbc, void *data)
  {
-       struct folio *folio = page_folio(page);
        int ret;
  
 -      ret = nfs_do_writepage(&folio->page, wbc, data);
 +      ret = nfs_do_writepage(folio, wbc, data);
        if (ret != AOP_WRITEPAGE_ACTIVATE)
-               unlock_page(page);
+               folio_unlock(folio);
        return ret;
  }
  
 
        }
  }
  
- static int udf_adinicb_writepage(struct page *page,
++static int udf_adinicb_writepage(struct folio *folio,
 +                               struct writeback_control *wbc, void *data)
 +{
++      struct page *page = &folio->page;
 +      struct inode *inode = page->mapping->host;
 +      struct udf_inode_info *iinfo = UDF_I(inode);
 +
 +      BUG_ON(!PageLocked(page));
 +      memcpy_to_page(page, 0, iinfo->i_data + iinfo->i_lenEAttr,
 +                     i_size_read(inode));
 +      unlock_page(page);
 +      mark_inode_dirty(inode);
 +
 +      return 0;
 +}
 +
  static int udf_writepages(struct address_space *mapping,
 -                      struct writeback_control *wbc)
 +                        struct writeback_control *wbc)
  {
 -      return mpage_writepages(mapping, wbc, udf_get_block);
 +      struct inode *inode = mapping->host;
 +      struct udf_inode_info *iinfo = UDF_I(inode);
 +
 +      if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB)
 +              return mpage_writepages(mapping, wbc, udf_get_block_wb);
 +      return write_cache_pages(mapping, wbc, udf_adinicb_writepage, NULL);
 +}
 +
 +static void udf_adinicb_readpage(struct page *page)
 +{
 +      struct inode *inode = page->mapping->host;
 +      char *kaddr;
 +      struct udf_inode_info *iinfo = UDF_I(inode);
 +      loff_t isize = i_size_read(inode);
 +
 +      kaddr = kmap_local_page(page);
 +      memcpy(kaddr, iinfo->i_data + iinfo->i_lenEAttr, isize);
 +      memset(kaddr + isize, 0, PAGE_SIZE - isize);
 +      flush_dcache_page(page);
 +      SetPageUptodate(page);
 +      kunmap_local(kaddr);
  }
  
  static int udf_read_folio(struct file *file, struct folio *folio)
 
  int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size);
  void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size);
  
- extern struct static_key_false memcg_kmem_enabled_key;
 +extern struct static_key_false memcg_bpf_enabled_key;
 +static inline bool memcg_bpf_enabled(void)
 +{
 +      return static_branch_likely(&memcg_bpf_enabled_key);
 +}
 +
+ extern struct static_key_false memcg_kmem_online_key;
  
- static inline bool memcg_kmem_enabled(void)
+ static inline bool memcg_kmem_online(void)
  {
-       return static_branch_likely(&memcg_kmem_enabled_key);
+       return static_branch_likely(&memcg_kmem_online_key);
  }
  
  static inline int memcg_kmem_charge_page(struct page *page, gfp_t gfp,
        return NULL;
  }
  
- static inline bool memcg_kmem_enabled(void)
 +static inline bool memcg_bpf_enabled(void)
 +{
 +      return false;
 +}
 +
+ static inline bool memcg_kmem_online(void)
  {
        return false;
  }
 
                 * &struct mm_struct is freed.
                 */
                atomic_t mm_count;
 -
 +#ifdef CONFIG_SCHED_MM_CID
 +              /**
 +               * @cid_lock: Protect cid bitmap updates vs lookups.
 +               *
 +               * Prevent situations where updates to the cid bitmap happen
 +               * concurrently with lookups. Those can lead to situations
 +               * where a lookup cannot find a free bit simply because it was
 +               * unlucky enough to load, non-atomically, bitmap words as they
 +               * were being concurrently updated by the updaters.
 +               */
 +              raw_spinlock_t cid_lock;
 +#endif
  #ifdef CONFIG_MMU
-               atomic_long_t pgtables_bytes;   /* PTE page table pages */
+               atomic_long_t pgtables_bytes;   /* size of all page tables */
  #endif
                int map_count;                  /* number of VMAs */
  
  static inline void vma_iter_init(struct vma_iterator *vmi,
                struct mm_struct *mm, unsigned long addr)
  {
-       vmi->mas.tree = &mm->mm_mt;
-       vmi->mas.index = addr;
-       vmi->mas.node = MAS_START;
+       mas_init(&vmi->mas, &mm->mm_mt, addr);
  }
  
 +#ifdef CONFIG_SCHED_MM_CID
 +/* Accessor for struct mm_struct's cidmask. */
 +static inline cpumask_t *mm_cidmask(struct mm_struct *mm)
 +{
 +      unsigned long cid_bitmap = (unsigned long)mm;
 +
 +      cid_bitmap += offsetof(struct mm_struct, cpu_bitmap);
 +      /* Skip cpu_bitmap */
 +      cid_bitmap += cpumask_size();
 +      return (struct cpumask *)cid_bitmap;
 +}
 +
 +static inline void mm_init_cid(struct mm_struct *mm)
 +{
 +      raw_spin_lock_init(&mm->cid_lock);
 +      cpumask_clear(mm_cidmask(mm));
 +}
 +
 +static inline unsigned int mm_cid_size(void)
 +{
 +      return cpumask_size();
 +}
 +#else /* CONFIG_SCHED_MM_CID */
 +static inline void mm_init_cid(struct mm_struct *mm) { }
 +static inline unsigned int mm_cid_size(void)
 +{
 +      return 0;
 +}
 +#endif /* CONFIG_SCHED_MM_CID */
 +
  struct mmu_gather;
  extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm);
  extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm);
 
        pmde = mk_huge_pmd(new, READ_ONCE(vma->vm_page_prot));
        if (pmd_swp_soft_dirty(*pvmw->pmd))
                pmde = pmd_mksoft_dirty(pmde);
 -      if (is_writable_migration_entry(entry))
 -              pmde = maybe_pmd_mkwrite(pmde, vma);
        if (pmd_swp_uffd_wp(*pvmw->pmd))
-               pmde = pmd_wrprotect(pmd_mkuffd_wp(pmde));
+               pmde = pmd_mkuffd_wp(pmde);
        if (!is_migration_entry_young(entry))
                pmde = pmd_mkold(pmde);
        /* NOTE: this may contain setting soft-dirty on some archs */
 
   * conditional to this static branch, we'll have to allow modules that does
   * kmem_cache_alloc and the such to see this symbol as well
   */
- DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
- EXPORT_SYMBOL(memcg_kmem_enabled_key);
+ DEFINE_STATIC_KEY_FALSE(memcg_kmem_online_key);
+ EXPORT_SYMBOL(memcg_kmem_online_key);
 +
 +DEFINE_STATIC_KEY_FALSE(memcg_bpf_enabled_key);
 +EXPORT_SYMBOL(memcg_bpf_enabled_key);
  #endif
  
  /**
 
        stat->attributes_mask |= (STATX_ATTR_APPEND |
                        STATX_ATTR_IMMUTABLE |
                        STATX_ATTR_NODUMP);
 -      generic_fillattr(&init_user_ns, inode, stat);
 +      generic_fillattr(idmap, inode, stat);
  
-       if (shmem_is_huge(NULL, inode, 0, false))
+       if (shmem_is_huge(inode, 0, false, NULL, 0))
                stat->blksize = HPAGE_PMD_SIZE;
  
        if (request_mask & STATX_BTIME) {