#include <linux/export.h>
 #include <linux/pci.h>
 #include <linux/memblock.h>
+#include <linux/iommu.h>
 
 #include <asm/iommu.h>
 #include <asm/pnv-pci.h>
        return pe;
 }
 
-static long pnv_npu_set_window(struct pnv_ioda_pe *npe,
+long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num,
                struct iommu_table *tbl)
 {
        struct pnv_phb *phb = npe->phb;
        pnv_pci_ioda2_tce_invalidate_entire(phb, false);
 
        /* Add the table to the list so its TCE cache will get invalidated */
-       pnv_pci_link_table_and_group(phb->hose->node, 0,
+       pnv_pci_link_table_and_group(phb->hose->node, num,
                        tbl, &npe->table_group);
 
        return 0;
 }
 
-static long pnv_npu_unset_window(struct pnv_ioda_pe *npe)
+long pnv_npu_unset_window(struct pnv_ioda_pe *npe, int num)
 {
        struct pnv_phb *phb = npe->phb;
        int64_t rc;
        }
        pnv_pci_ioda2_tce_invalidate_entire(phb, false);
 
-       pnv_pci_unlink_table_and_group(npe->table_group.tables[0],
+       pnv_pci_unlink_table_and_group(npe->table_group.tables[num],
                        &npe->table_group);
 
        return 0;
        if (!gpe)
                return;
 
-       rc = pnv_npu_set_window(npe, gpe->table_group.tables[0]);
+       rc = pnv_npu_set_window(npe, 0, gpe->table_group.tables[0]);
 
        /*
         * We don't initialise npu_pe->tce32_table as we always use
        if (phb->type != PNV_PHB_NPU || !npe->pdev)
                return -EINVAL;
 
-       rc = pnv_npu_unset_window(npe);
+       rc = pnv_npu_unset_window(npe, 0);
        if (rc != OPAL_SUCCESS)
                return rc;
 
                }
        }
 }
+
+/* Switch ownership from platform code to external user (e.g. VFIO) */
+/*
+ * Switch ownership from platform code to external user (e.g. VFIO):
+ * tear down whichever DMA setup (32bit window or bypass) the kernel
+ * had programmed into the NPU's single TVE.
+ */
+void pnv_npu_take_ownership(struct pnv_ioda_pe *npe)
+{
+       struct pnv_phb *phb = npe->phb;
+       int64_t rc;
+
+       /*
+        * Note: NPU has just a single TVE in the hardware which means that
+        * while used by the kernel, it can have either 32bit window or
+        * DMA bypass but never both. So we deconfigure 32bit window only
+        * if it was enabled at the moment of ownership change.
+        */
+       if (npe->table_group.tables[0]) {
+               /* 32bit window was active - unlinking it is sufficient */
+               pnv_npu_unset_window(npe, 0);
+               return;
+       }
+
+       /* Disable bypass */
+       rc = opal_pci_map_pe_dma_window_real(phb->opal_id,
+                       npe->pe_number, npe->pe_number,
+                       0 /* bypass base */, 0);
+       if (rc) {
+               pe_err(npe, "Failed to disable bypass, err %lld\n", rc);
+               return;
+       }
+       /* Flush the whole TCE cache so no stale translations survive */
+       pnv_pci_ioda2_tce_invalidate_entire(npe->phb, false);
+}
+
+/*
+ * Looks up the GPU PE associated with @npe and adds every NPU device on
+ * this PHB's bus that maps to the same GPU device into the GPU's IOMMU
+ * group, so NPU and GPU are managed together.
+ *
+ * Returns the GPU PE, or NULL when no associated GPU device/PE exists.
+ */
+struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe)
+{
+       struct pnv_phb *phb = npe->phb;
+       struct pci_bus *pbus = phb->hose->bus;
+       struct pci_dev *npdev, *gpdev = NULL, *gptmp;
+       struct pnv_ioda_pe *gpe = get_gpu_pci_dev_and_pe(npe, &gpdev);
+
+       if (!gpe || !gpdev)
+               return NULL;
+
+       list_for_each_entry(npdev, &pbus->devices, bus_list) {
+               gptmp = pnv_pci_get_gpu_dev(npdev);
+
+               /* Only pick up NPU devices backing the same GPU */
+               if (gptmp != gpdev)
+                       continue;
+
+               pe_info(gpe, "Attached NPU %s\n", dev_name(&npdev->dev));
+               iommu_group_add_device(gpe->table_group.group, &npdev->dev);
+       }
+
+       return gpe;
+}
 
        .take_ownership = pnv_ioda2_take_ownership,
        .release_ownership = pnv_ioda2_release_ownership,
 };
+
+/*
+ * iommu_group_for_each_dev() callback: finds the first device in the
+ * group that sits on an NPU PHB and stores its PE via @opaque
+ * (a struct pnv_ioda_pe **).
+ *
+ * Returns 1 to stop the iteration once an NPU PE is found, 0 to keep
+ * scanning.
+ */
+static int gpe_table_group_to_npe_cb(struct device *dev, void *opaque)
+{
+       struct pci_controller *hose;
+       struct pnv_phb *phb;
+       struct pnv_ioda_pe **ptmppe = opaque;
+       struct pci_dev *pdev = container_of(dev, struct pci_dev, dev);
+       struct pci_dn *pdn = pci_get_pdn(pdev);
+
+       /* Skip devices without a valid PE assignment */
+       if (!pdn || pdn->pe_number == IODA_INVALID_PE)
+               return 0;
+
+       hose = pci_bus_to_host(pdev->bus);
+       phb = hose->private_data;
+       /* Only devices behind an NPU PHB are of interest */
+       if (phb->type != PNV_PHB_NPU)
+               return 0;
+
+       *ptmppe = &phb->ioda.pe_array[pdn->pe_number];
+
+       return 1;
+}
+
+/*
+ * This returns PE of associated NPU.
+ * This assumes that NPU is in the same IOMMU group with GPU and there is
+ * no other PEs.
+ */
+static struct pnv_ioda_pe *gpe_table_group_to_npe(
+               struct iommu_table_group *table_group)
+{
+       struct pnv_ioda_pe *npe = NULL;
+       int ret = iommu_group_for_each_dev(table_group->group, &npe,
+                       gpe_table_group_to_npe_cb);
+
+       /* A GPU group using these ops must always contain an NPU device */
+       BUG_ON(!ret || !npe);
+
+       return npe;
+}
+
+/*
+ * Programs a DMA window on the GPU PE and then mirrors the same table
+ * into the associated NPU. If the NPU update fails, the GPU window is
+ * rolled back so both sides stay consistent.
+ */
+static long pnv_pci_ioda2_npu_set_window(struct iommu_table_group *table_group,
+               int num, struct iommu_table *tbl)
+{
+       long ret = pnv_pci_ioda2_set_window(table_group, num, tbl);
+
+       if (ret)
+               return ret;
+
+       ret = pnv_npu_set_window(gpe_table_group_to_npe(table_group), num, tbl);
+       if (ret)
+               pnv_pci_ioda2_unset_window(table_group, num);
+
+       return ret;
+}
+
+/*
+ * Removes a DMA window from the GPU PE first and, on success, from the
+ * associated NPU as well.
+ */
+static long pnv_pci_ioda2_npu_unset_window(
+               struct iommu_table_group *table_group,
+               int num)
+{
+       long ret = pnv_pci_ioda2_unset_window(table_group, num);
+
+       if (ret)
+               return ret;
+
+       return pnv_npu_unset_window(gpe_table_group_to_npe(table_group), num);
+}
+
+/* Hand both the NPU and the GPU PE over to the external user */
+static void pnv_ioda2_npu_take_ownership(struct iommu_table_group *table_group)
+{
+       /*
+        * Detach NPU first as pnv_ioda2_take_ownership() will destroy
+        * the iommu_table if 32bit DMA is enabled.
+        */
+       pnv_npu_take_ownership(gpe_table_group_to_npe(table_group));
+       pnv_ioda2_take_ownership(table_group);
+}
+
+/*
+ * table_group ops for GPU PEs with an attached NPU: window changes and
+ * ownership transfer are applied to the NPU as well as the GPU PHB.
+ */
+static struct iommu_table_group_ops pnv_pci_ioda2_npu_ops = {
+       .get_table_size = pnv_pci_ioda2_get_table_size,
+       .create_table = pnv_pci_ioda2_create_table,
+       .set_window = pnv_pci_ioda2_npu_set_window,
+       .unset_window = pnv_pci_ioda2_npu_unset_window,
+       .take_ownership = pnv_ioda2_npu_take_ownership,
+       .release_ownership = pnv_ioda2_release_ownership,
+};
+
+/*
+ * Walks all discovered PHBs, attaches NPU devices to their GPUs' IOMMU
+ * groups and switches those GPU PEs to the NPU-aware table_group ops.
+ */
+static void pnv_pci_ioda_setup_iommu_api(void)
+{
+       struct pci_controller *hose, *tmp;
+       struct pnv_phb *phb;
+       struct pnv_ioda_pe *pe, *gpe;
+
+       /*
+        * Now we have all PHBs discovered, time to add NPU devices to
+        * the corresponding IOMMU groups.
+        */
+       list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
+               phb = hose->private_data;
+
+               if (phb->type != PNV_PHB_NPU)
+                       continue;
+
+               list_for_each_entry(pe, &phb->ioda.pe_list, list) {
+                       gpe = pnv_pci_npu_setup_iommu(pe);
+                       if (gpe)
+                               gpe->table_group.ops = &pnv_pci_ioda2_npu_ops;
+               }
+       }
+}
+#else /* !CONFIG_IOMMU_API */
+/* No-op stub when the kernel is built without IOMMU API support */
+static void pnv_pci_ioda_setup_iommu_api(void) { }
 #endif
 
 static void pnv_pci_ioda_setup_opal_tce_kill(struct pnv_phb *phb)
                phb = hose->private_data;
                phb->initialized = 1;
        }
+
+       pnv_pci_ioda_setup_iommu_api();
 }
 
 static void pnv_pci_ioda_create_dbgfs(void)