return info;
 }
 
+static void arm_smmu_make_nested_cd_table_ste(
+       struct arm_smmu_ste *target, struct arm_smmu_master *master,
+       struct arm_smmu_nested_domain *nested_domain, bool ats_enabled)
+{
+       arm_smmu_make_s2_domain_ste(
+               target, master, nested_domain->vsmmu->s2_parent, ats_enabled);
+
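+       /*
+        * Overlay the userspace-controlled S1 fields from the vSTE on top of
+        * the S2 STE and force the config to nested (S1 followed by S2).
+        */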
+       target->data[0] = cpu_to_le64(STRTAB_STE_0_V |
+                                     FIELD_PREP(STRTAB_STE_0_CFG,
+                                                STRTAB_STE_0_CFG_NESTED));
+       target->data[0] |= nested_domain->ste[0] &
+                          ~cpu_to_le64(STRTAB_STE_0_CFG);
+       target->data[1] |= nested_domain->ste[1];
+}
+
+/*
+ * Create a physical STE from the virtual STE that userspace provided when it
+ * created the nested domain. Using the vSTE, userspace can request:
+ * - Non-valid STE
+ * - Abort STE
+ * - Bypass STE (install the S2, no CD table)
+ * - CD table STE (install the S2 and the userspace CD table)
+ */
+static void arm_smmu_make_nested_domain_ste(
+       struct arm_smmu_ste *target, struct arm_smmu_master *master,
+       struct arm_smmu_nested_domain *nested_domain, bool ats_enabled)
+{
+       unsigned int cfg =
+               FIELD_GET(STRTAB_STE_0_CFG, le64_to_cpu(nested_domain->ste[0]));
+
+       /*
+        * Userspace can request a non-valid STE through the nesting interface.
+        * We relay that into an abort physical STE with the intention that
+        * C_BAD_STE for this SID can be generated to userspace.
+        */
+       if (!(nested_domain->ste[0] & cpu_to_le64(STRTAB_STE_0_V)))
+               cfg = STRTAB_STE_0_CFG_ABORT;
+
+       switch (cfg) {
+       case STRTAB_STE_0_CFG_S1_TRANS:
+               arm_smmu_make_nested_cd_table_ste(target, master, nested_domain,
+                                                 ats_enabled);
+               break;
+       case STRTAB_STE_0_CFG_BYPASS:
+               arm_smmu_make_s2_domain_ste(target, master,
+                                           nested_domain->vsmmu->s2_parent,
+                                           ats_enabled);
+               break;
+       case STRTAB_STE_0_CFG_ABORT:
+       default:
+               arm_smmu_make_abort_ste(target);
+               break;
+       }
+}
+
+static int arm_smmu_attach_dev_nested(struct iommu_domain *domain,
+                                     struct device *dev)
+{
+       struct arm_smmu_nested_domain *nested_domain =
+               to_smmu_nested_domain(domain);
+       struct arm_smmu_master *master = dev_iommu_priv_get(dev);
+       struct arm_smmu_attach_state state = {
+               .master = master,
+               .old_domain = iommu_get_domain_for_dev(dev),
+               .ssid = IOMMU_NO_PASID,
+               /* Currently ATC invalidation is not supported */
+               .disable_ats = true,
+       };
+       struct arm_smmu_ste ste;
+       int ret;
+
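+       /* The device must be behind the SMMU this vSMMU was created against */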
+       if (nested_domain->vsmmu->smmu != master->smmu)
+               return -EINVAL;
+       if (arm_smmu_ssids_in_use(&master->cd_table))
+               return -EBUSY;
+
+       mutex_lock(&arm_smmu_asid_lock);
+       ret = arm_smmu_attach_prepare(&state, domain);
+       if (ret) {
+               mutex_unlock(&arm_smmu_asid_lock);
+               return ret;
+       }
+
+       arm_smmu_make_nested_domain_ste(&ste, master, nested_domain,
+                                       state.ats_enabled);
+       arm_smmu_install_ste_for_dev(master, &ste);
+       arm_smmu_attach_commit(&state);
+       mutex_unlock(&arm_smmu_asid_lock);
+       return 0;
+}
+
+static void arm_smmu_domain_nested_free(struct iommu_domain *domain)
+{
+       kfree(to_smmu_nested_domain(domain));
+}
+
+static const struct iommu_domain_ops arm_smmu_nested_ops = {
+       .attach_dev = arm_smmu_attach_dev_nested,
+       .free = arm_smmu_domain_nested_free,
+};
+
+static int arm_smmu_validate_vste(struct iommu_hwpt_arm_smmuv3 *arg)
+{
+       unsigned int cfg;
+
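+       /* A non-valid vSTE carries no other information, zero it out */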
+       if (!(arg->ste[0] & cpu_to_le64(STRTAB_STE_0_V))) {
+               memset(arg->ste, 0, sizeof(arg->ste));
+               return 0;
+       }
+
+       /* EIO is reserved for invalid STE data. */
+       if ((arg->ste[0] & ~STRTAB_STE_0_NESTING_ALLOWED) ||
+           (arg->ste[1] & ~STRTAB_STE_1_NESTING_ALLOWED))
+               return -EIO;
+
+       cfg = FIELD_GET(STRTAB_STE_0_CFG, le64_to_cpu(arg->ste[0]));
+       if (cfg != STRTAB_STE_0_CFG_ABORT && cfg != STRTAB_STE_0_CFG_BYPASS &&
+           cfg != STRTAB_STE_0_CFG_S1_TRANS)
+               return -EIO;
+       return 0;
+}
+
+static struct iommu_domain *
+arm_vsmmu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags,
+                             const struct iommu_user_data *user_data)
+{
+       struct arm_vsmmu *vsmmu = container_of(viommu, struct arm_vsmmu, core);
+       const u32 SUPPORTED_FLAGS = IOMMU_HWPT_FAULT_ID_VALID;
+       struct arm_smmu_nested_domain *nested_domain;
+       struct iommu_hwpt_arm_smmuv3 arg;
+       int ret;
+
+       /*
+        * Faults delivered to the nested domain are faults that originate
+        * from the S1 in the domain. The core code will match all PASIDs when
+        * delivering the fault due to user_pasid_table.
+        */
+       if (flags & ~SUPPORTED_FLAGS)
+               return ERR_PTR(-EOPNOTSUPP);
+
+       ret = iommu_copy_struct_from_user(&arg, user_data,
+                                         IOMMU_HWPT_DATA_ARM_SMMUV3, ste);
+       if (ret)
+               return ERR_PTR(ret);
+
+       ret = arm_smmu_validate_vste(&arg);
+       if (ret)
+               return ERR_PTR(ret);
+
+       nested_domain = kzalloc(sizeof(*nested_domain), GFP_KERNEL_ACCOUNT);
+       if (!nested_domain)
+               return ERR_PTR(-ENOMEM);
+
+       nested_domain->domain.type = IOMMU_DOMAIN_NESTED;
+       nested_domain->domain.ops = &arm_smmu_nested_ops;
+       nested_domain->vsmmu = vsmmu;
+       nested_domain->ste[0] = arg.ste[0];
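+       /* EATS is set by the kernel from the attach state, not by the vSTE */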
+       nested_domain->ste[1] = arg.ste[1] & ~cpu_to_le64(STRTAB_STE_1_EATS);
+
+       return &nested_domain->domain;
+}
+
 static const struct iommufd_viommu_ops arm_vsmmu_ops = {
+       .alloc_domain_nested = arm_vsmmu_alloc_domain_nested,
 };
 
 struct iommufd_viommu *arm_vsmmu_alloc(struct device *dev,
 
        case CMDQ_OP_TLBI_NH_ASID:
                cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_ASID, ent->tlbi.asid);
                fallthrough;
+       case CMDQ_OP_TLBI_NH_ALL:
        case CMDQ_OP_TLBI_S12_VMALL:
                cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_VMID, ent->tlbi.vmid);
                break;
        }
        __arm_smmu_tlb_inv_range(&cmd, iova, size, granule, smmu_domain);
 
+       if (smmu_domain->nest_parent) {
+               /*
+                * When the S2 domain changes, all the nested S1 ASIDs have to
+                * be flushed too.
+                */
+               cmd.opcode = CMDQ_OP_TLBI_NH_ALL;
+               arm_smmu_cmdq_issue_cmd_with_sync(smmu_domain->smmu, &cmd);
+       }
+
        /*
         * Unfortunately, this can't be leaf-only since we may have
         * zapped an entire table.
        if ((domain->type & __IOMMU_DOMAIN_PAGING) ||
            domain->type == IOMMU_DOMAIN_SVA)
                return to_smmu_domain(domain);
+       if (domain->type == IOMMU_DOMAIN_NESTED)
+               return to_smmu_nested_domain(domain)->vsmmu->s2_parent;
        return NULL;
 }
 
                 * enabled if we have arm_smmu_domain, those always have page
                 * tables.
                 */
-               state->ats_enabled = arm_smmu_ats_supported(master);
+               state->ats_enabled = !state->disable_ats &&
+                                    arm_smmu_ats_supported(master);
        }
 
        if (smmu_domain) {
                        goto err_free;
                }
                smmu_domain->stage = ARM_SMMU_DOMAIN_S2;
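+       /* Mark so range invalidation also flushes the nested S1 ASIDs */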
+               smmu_domain->nest_parent = true;
        }
 
        smmu_domain->domain.type = IOMMU_DOMAIN_UNMANAGED;
        .page_response          = arm_smmu_page_response,
        .def_domain_type        = arm_smmu_def_domain_type,
        .viommu_alloc           = arm_vsmmu_alloc,
+       .user_pasid_table       = 1,
        .pgsize_bitmap          = -1UL, /* Restricted during device attach */
        .owner                  = THIS_MODULE,
        .default_domain_ops = &(const struct iommu_domain_ops) {
 
 #define STRTAB_STE_0_CFG_BYPASS                4
 #define STRTAB_STE_0_CFG_S1_TRANS      5
 #define STRTAB_STE_0_CFG_S2_TRANS      6
+#define STRTAB_STE_0_CFG_NESTED                7
 
 #define STRTAB_STE_0_S1FMT             GENMASK_ULL(5, 4)
 #define STRTAB_STE_0_S1FMT_LINEAR      0
 
 #define STRTAB_STE_3_S2TTB_MASK                GENMASK_ULL(51, 4)
 
+/* These bits can be controlled by userspace for STRTAB_STE_0_CFG_NESTED */
+#define STRTAB_STE_0_NESTING_ALLOWED                                         \
+       cpu_to_le64(STRTAB_STE_0_V | STRTAB_STE_0_CFG | STRTAB_STE_0_S1FMT | \
+                   STRTAB_STE_0_S1CTXPTR_MASK | STRTAB_STE_0_S1CDMAX)
+#define STRTAB_STE_1_NESTING_ALLOWED                            \
+       cpu_to_le64(STRTAB_STE_1_S1DSS | STRTAB_STE_1_S1CIR |   \
+                   STRTAB_STE_1_S1COR | STRTAB_STE_1_S1CSH |   \
+                   STRTAB_STE_1_S1STALLD)
+
 /*
  * Context descriptors.
  *
                        };
                } cfgi;
 
+               #define CMDQ_OP_TLBI_NH_ALL     0x10
                #define CMDQ_OP_TLBI_NH_ASID    0x11
                #define CMDQ_OP_TLBI_NH_VA      0x12
                #define CMDQ_OP_TLBI_EL2_ALL    0x20
        struct list_head                devices;
        spinlock_t                      devices_lock;
        bool                            enforce_cache_coherency : 1;
+       bool                            nest_parent : 1;
 
        struct mmu_notifier             mmu_notifier;
 };
 
+struct arm_smmu_nested_domain {
+       struct iommu_domain domain;
+       struct arm_vsmmu *vsmmu;
+
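+       /* The vSTE provided by userspace, validated at allocation time */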
+       __le64 ste[2];
+};
+
 /* The following are exposed for testing purposes. */
 struct arm_smmu_entry_writer_ops;
 struct arm_smmu_entry_writer {
        return container_of(dom, struct arm_smmu_domain, domain);
 }
 
+static inline struct arm_smmu_nested_domain *
+to_smmu_nested_domain(struct iommu_domain *dom)
+{
+       return container_of(dom, struct arm_smmu_nested_domain, domain);
+}
+
 extern struct xarray arm_smmu_asid_xa;
 extern struct mutex arm_smmu_asid_lock;
 
        struct iommu_domain *old_domain;
        struct arm_smmu_master *master;
        bool cd_needs_ats;
+       bool disable_ats;
        ioasid_t ssid;
        /* Resulting state */
        bool ats_enabled;