/*
- * Copyright (c) 2006, Intel Corporation.
+ * Copyright © 2006-2014 Intel Corporation.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms and conditions of the GNU General Public License,
  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  * more details.
  *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
- *
- * Copyright (C) 2006-2008 Intel Corporation
- * Author: Ashok Raj <ashok.raj@intel.com>
- * Author: Shaohua Li <shaohua.li@intel.com>
- * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
- * Author: Fenghua Yu <fenghua.yu@intel.com>
+ * Authors: David Woodhouse <dwmw2@infradead.org>,
+ *          Ashok Raj <ashok.raj@intel.com>,
+ *          Shaohua Li <shaohua.li@intel.com>,
+ *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
+ *          Fenghua Yu <fenghua.yu@intel.com>
  */
 
 #include <linux/init.h>
        int next;
        struct iova *iova[HIGH_WATER_MARK];
        struct dmar_domain *domain[HIGH_WATER_MARK];
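+       /* page-table pages to free once the deferred IOTLB flush has run */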
+       struct page *freelist[HIGH_WATER_MARK];
 };
 
 static struct deferred_flush_tables *deferred_flush;
        }
 }
 
+/*
+ * When a page at a given level is being unlinked from its parent, we don't
+ * need to *modify* it at all. All we need to do is make a list of all the
+ * pages which can be freed just as soon as we've flushed the IOTLB and we
+ * know the hardware page-walk will no longer touch them.
+ *
+ * The 'pte' argument is the *parent* PTE, pointing to the page that is to
+ * be freed.
+ */
+static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
+                                           int level, struct dma_pte *pte,
+                                           struct page *freelist)
+{
+       struct page *pg;
+
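+       /* Push the child page table onto the freelist via page->freelist */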
+       pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
+       pg->freelist = freelist;
+       freelist = pg;
+
+       if (level == 1)
+               return freelist;
+
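+       /* Recursively collect the lower-level tables this page points to */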
+       pte = page_address(pg);
+       do {
+               if (dma_pte_present(pte) && !dma_pte_superpage(pte))
+                       freelist = dma_pte_list_pagetables(domain, level - 1,
+                                                          pte, freelist);
+       } while (!first_pte_in_page(++pte));
+
+       return freelist;
+}
+
+static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
+                                       struct dma_pte *pte, unsigned long pfn,
+                                       unsigned long start_pfn,
+                                       unsigned long last_pfn,
+                                       struct page *freelist)
+{
+       struct dma_pte *first_pte = NULL, *last_pte = NULL;
+
+       pfn = max(start_pfn, pfn);
+       pte = &pte[pfn_level_offset(pfn, level)];
+
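+       /* Walk the PTEs of this table covering [start_pfn, last_pfn] */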
+       do {
+               unsigned long level_pfn;
+
+               if (!dma_pte_present(pte))
+                       goto next;
+
+               level_pfn = pfn & level_mask(level);
+
+               /* If range covers entire pagetable, free it */
+               if (start_pfn <= level_pfn &&
+                   last_pfn >= level_pfn + level_size(level) - 1) {
+                       /* These subordinate page tables are going away entirely. Don't
+                          bother to clear them; we're just going to *free* them. */
+                       if (level > 1 && !dma_pte_superpage(pte))
+                               freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
+
+                       dma_clear_pte(pte);
+                       if (!first_pte)
+                               first_pte = pte;
+                       last_pte = pte;
+               } else if (level > 1) {
+                       /* Recurse down into a level that isn't *entirely* obsolete */
+                       freelist = dma_pte_clear_level(domain, level - 1,
+                                                      phys_to_virt(dma_pte_addr(pte)),
+                                                      level_pfn, start_pfn, last_pfn,
+                                                      freelist);
+               }
+next:
+               pfn += level_size(level);
+       } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
+
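+       /* Make the cleared PTEs visible to the hardware page-walk */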
+       if (first_pte)
+               domain_flush_cache(domain, first_pte,
+                                  (void *)++last_pte - (void *)first_pte);
+
+       return freelist;
+}
+
+/*
+ * We can't just free the pages because the IOMMU may still be walking
+ * the page tables, and may have cached the intermediate levels. The
+ * pages can only be freed after the IOTLB flush has been done.
+ */
+static struct page *domain_unmap(struct dmar_domain *domain,
+                                 unsigned long start_pfn,
+                                 unsigned long last_pfn)
+{
+       int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
+       struct page *freelist = NULL;
+
+       BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
+       BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
+       BUG_ON(start_pfn > last_pfn);
+
+       /* we don't need a lock here; nobody else touches the iova range */
+       freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
+                                      domain->pgd, 0, start_pfn, last_pfn, NULL);
+
+       /* free pgd */
+       if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
+               struct page *pgd_page = virt_to_page(domain->pgd);
+               pgd_page->freelist = freelist;
+               freelist = pgd_page;
+
+               domain->pgd = NULL;
+       }
+
+       return freelist;
+}
+
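+/* Free each page-table page on a list built by domain_unmap() */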
+static void dma_free_pagelist(struct page *freelist)
+{
+       struct page *pg;
+
+       while ((pg = freelist)) {
+               freelist = pg->freelist;
+               free_pgtable_page(page_address(pg));
+       }
+}
+
 /* iommu handling */
 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
 {
                break;
        case DMA_TLB_PSI_FLUSH:
                val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
-               /* Note: always flush non-leaf currently */
+               /* IH bit is passed in as part of address */
                val_iva = size_order | addr;
                break;
        default:
 }
 
 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
-                                 unsigned long pfn, unsigned int pages, int map)
+                                 unsigned long pfn, unsigned int pages, int ih, int map)
 {
        unsigned int mask = ilog2(__roundup_pow_of_two(pages));
        uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
 
        BUG_ON(pages == 0);
 
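+       /* The Invalidation Hint (IH) is bit 6 of the invalidation address */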
+       if (ih)
+               ih = 1 << 6;
        /*
         * Fallback to domain selective flush if no PSI support or the size is
         * too big.
                iommu->flush.flush_iotlb(iommu, did, 0, 0,
                                                DMA_TLB_DSI_FLUSH);
        else
-               iommu->flush.flush_iotlb(iommu, did, addr, mask,
+               iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
                                                DMA_TLB_PSI_FLUSH);
 
        /*
 {
        struct dmar_drhd_unit *drhd;
        struct intel_iommu *iommu;
+       struct page *freelist = NULL;
 
        /* Domain 0 is reserved, so dont process it */
        if (!domain)
        /* destroy iovas */
        put_iova_domain(&domain->iovad);
 
-       /* clear ptes */
-       dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
-
-       /* free page tables */
-       dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
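+       /* Clear all PTEs and collect the page-table pages for freeing */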
+       freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
 
        /* clear attached or cached domains */
        rcu_read_lock();
                        iommu_detach_domain(domain, iommu);
        rcu_read_unlock();
 
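+       /* The domain is detached from every IOMMU, so the page tables can go */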
+       dma_free_pagelist(freelist);
+
        free_domain_mem(domain);
 }
 
 
        /* it's a non-present to present mapping. Only flush if caching mode */
        if (cap_caching_mode(iommu->cap))
-               iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
+               iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 0, 1);
        else
                iommu_flush_write_buffer(iommu);
 
                        /* On real hardware multiple invalidations are expensive */
                        if (cap_caching_mode(iommu->cap))
                                iommu_flush_iotlb_psi(iommu, domain->id,
-                               iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
+                                       iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1,
+                                       !deferred_flush[i].freelist[j], 0);
                        else {
                                mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
                                iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
                                                (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
                        }
                        __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
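+                       /* The IOTLB is flushed; page tables can be freed now */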
+                       if (deferred_flush[i].freelist[j])
+                               dma_free_pagelist(deferred_flush[i].freelist[j]);
                }
                deferred_flush[i].next = 0;
        }
        spin_unlock_irqrestore(&async_umap_flush_lock, flags);
 }
 
-static void add_unmap(struct dmar_domain *dom, struct iova *iova)
+static void add_unmap(struct dmar_domain *dom, struct iova *iova, struct page *freelist)
 {
        unsigned long flags;
        int next, iommu_id;
        next = deferred_flush[iommu_id].next;
        deferred_flush[iommu_id].domain[next] = dom;
        deferred_flush[iommu_id].iova[next] = iova;
+       deferred_flush[iommu_id].freelist[next] = freelist;
        deferred_flush[iommu_id].next++;
 
        if (!timer_on) {
        unsigned long start_pfn, last_pfn;
        struct iova *iova;
        struct intel_iommu *iommu;
+       struct page *freelist;
 
        if (iommu_no_mapping(dev))
                return;
        pr_debug("Device %s unmapping: pfn %lx-%lx\n",
                 pci_name(pdev), start_pfn, last_pfn);
 
-       /*  clear the whole page */
-       dma_pte_clear_range(domain, start_pfn, last_pfn);
-
-       /* free page tables */
-       dma_pte_free_pagetable(domain, start_pfn, last_pfn);
+       freelist = domain_unmap(domain, start_pfn, last_pfn);
 
        if (intel_iommu_strict) {
                iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
-                                     last_pfn - start_pfn + 1, 0);
+                                     last_pfn - start_pfn + 1, !freelist, 0);
                /* free iova */
                __free_iova(&domain->iovad, iova);
+               dma_free_pagelist(freelist);
        } else {
-               add_unmap(domain, iova);
+               add_unmap(domain, iova, freelist);
                /*
                 * queue up the release of the unmap to save the 1/6th of the
                 * cpu used up by the iotlb flush operation...
        unsigned long start_pfn, last_pfn;
        struct iova *iova;
        struct intel_iommu *iommu;
+       struct page *freelist;
 
        if (iommu_no_mapping(hwdev))
                return;
        start_pfn = mm_to_dma_pfn(iova->pfn_lo);
        last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
 
-       /*  clear the whole page */
-       dma_pte_clear_range(domain, start_pfn, last_pfn);
-
-       /* free page tables */
-       dma_pte_free_pagetable(domain, start_pfn, last_pfn);
+       freelist = domain_unmap(domain, start_pfn, last_pfn);
 
        if (intel_iommu_strict) {
                iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
-                                     last_pfn - start_pfn + 1, 0);
+                                     last_pfn - start_pfn + 1, !freelist, 0);
                /* free iova */
                __free_iova(&domain->iovad, iova);
+               dma_free_pagelist(freelist);
        } else {
-               add_unmap(domain, iova);
+               add_unmap(domain, iova, freelist);
                /*
                 * queue up the release of the unmap to save the 1/6th of the
                 * cpu used up by the iotlb flush operation...
 
        /* it's a non-present to present mapping. Only flush if caching mode */
        if (cap_caching_mode(iommu->cap))
-               iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
+               iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 0, 1);
        else
                iommu_flush_write_buffer(iommu);
 
                        struct iova *iova;
                        struct dmar_drhd_unit *drhd;
                        struct intel_iommu *iommu;
+                       struct page *freelist;
 
                        iova = find_iova(&si_domain->iovad, start_vpfn);
                        if (iova == NULL) {
                                return NOTIFY_BAD;
                        }
 
+                       freelist = domain_unmap(si_domain, iova->pfn_lo,
+                                              iova->pfn_hi);
+
                        rcu_read_lock();
                        for_each_active_iommu(iommu, drhd)
                                iommu_flush_iotlb_psi(iommu, si_domain->id,
                                        iova->pfn_lo,
-                                       iova->pfn_hi - iova->pfn_lo + 1, 0);
+                                       iova->pfn_hi - iova->pfn_lo + 1,
+                                       !freelist, 0);
                        rcu_read_unlock();
-                       dma_pte_clear_range(si_domain, iova->pfn_lo,
-                                           iova->pfn_hi);
-                       dma_pte_free_pagetable(si_domain, iova->pfn_lo,
-                                              iova->pfn_hi);
+                       dma_free_pagelist(freelist);
 
                        start_vpfn = iova->pfn_hi + 1;
                        free_iova_mem(iova);
 }
 
 static size_t intel_iommu_unmap(struct iommu_domain *domain,
-                            unsigned long iova, size_t size)
+                               unsigned long iova, size_t size)
 {
        struct dmar_domain *dmar_domain = domain->priv;
-       int level = 0;
+       struct page *freelist = NULL;
+       struct intel_iommu *iommu;
+       unsigned long start_pfn, last_pfn;
+       unsigned int npages;
+       int iommu_id, num, ndomains, level = 0;
 
        /* Cope with horrid API which requires us to unmap more than the
           size argument if it happens to be a large-page mapping. */
        if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
                size = VTD_PAGE_SIZE << level_to_offset_bits(level);
 
-       dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
-                           (iova + size - 1) >> VTD_PAGE_SHIFT);
+       start_pfn = iova >> VTD_PAGE_SHIFT;
+       last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
+
+       freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
+
+       npages = last_pfn - start_pfn + 1;
+
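+       /* Flush the IOTLB on every IOMMU this domain is attached to */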
+       for_each_set_bit(iommu_id, dmar_domain->iommu_bmp, g_num_of_iommus) {
+               iommu = g_iommus[iommu_id];
+
+               /* find the domain id dmar_domain occupies on this iommu */
+               ndomains = cap_ndoms(iommu->cap);
+               for_each_set_bit(num, iommu->domain_ids, ndomains) {
+                       if (iommu->domains[num] == dmar_domain)
+                               iommu_flush_iotlb_psi(iommu, num, start_pfn,
+                                                    npages, !freelist, 0);
+               }
+       }
+
+       dma_free_pagelist(freelist);
 
        if (dmar_domain->max_addr == iova + size)
                dmar_domain->max_addr = iova;