 #define ARM64_HAS_CACHE_DIC                    28
 #define ARM64_HW_DBM                           29
 #define ARM64_SSBD                             30
+#define ARM64_HAS_STAGE2_FWB                   31
 
-#define ARM64_NCAPS                            31
+#define ARM64_NCAPS                            32
 
 #endif /* __ASM_CPUCAPS_H */
 
 #include <asm/types.h>
 
 /* Hyp Configuration Register (HCR) bits */
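+/* Stage-2 Forced Write-Back (ARMv8.4-S2FWB) enable */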
+#define HCR_FWB                (UL(1) << 46)
 #define HCR_TEA                (UL(1) << 37)
 #define HCR_TERR       (UL(1) << 36)
 #define HCR_TLOR       (UL(1) << 35)
 
                /* trap error record accesses */
                vcpu->arch.hcr_el2 |= HCR_TERR;
        }
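+       /* Enable stage-2 Forced Write-Back so the guest always sees cacheable memory */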
+       if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
+               vcpu->arch.hcr_el2 |= HCR_FWB;
 
        if (test_bit(KVM_ARM_VCPU_EL1_32BIT, vcpu->arch.features))
                vcpu->arch.hcr_el2 &= ~HCR_RW;
 
 {
        void *va = page_address(pfn_to_page(pfn));
 
+       /*
+        * With FWB, we ensure that the guest always accesses memory using
+        * cacheable attributes, and we don't have to clean to PoC when
+        * faulting in pages. Furthermore, FWB implies IDC, so cleaning to
+        * PoU is not required either in this case.
+        */
+       if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
+               return;
+
        kvm_flush_dcache_to_poc(va, size);
 }
 
 
 static inline void __kvm_flush_dcache_pte(pte_t pte)
 {
-       struct page *page = pte_page(pte);
-       kvm_flush_dcache_to_poc(page_address(page), PAGE_SIZE);
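+       /* With FWB, stage-2 enforces cacheability, so no clean to PoC is needed */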
+       if (!cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) {
+               struct page *page = pte_page(pte);
+               kvm_flush_dcache_to_poc(page_address(page), PAGE_SIZE);
+       }
 }
 
 static inline void __kvm_flush_dcache_pmd(pmd_t pmd)
 {
-       struct page *page = pmd_page(pmd);
-       kvm_flush_dcache_to_poc(page_address(page), PMD_SIZE);
+       if (!cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) {
+               struct page *page = pmd_page(pmd);
+               kvm_flush_dcache_to_poc(page_address(page), PMD_SIZE);
+       }
 }
 
 static inline void __kvm_flush_dcache_pud(pud_t pud)
 {
-       struct page *page = pud_page(pud);
-       kvm_flush_dcache_to_poc(page_address(page), PUD_SIZE);
+       if (!cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) {
+               struct page *page = pud_page(pud);
+               kvm_flush_dcache_to_poc(page_address(page), PUD_SIZE);
+       }
 }
 
 #define kvm_virt_to_phys(x)            __pa_symbol(x)
 
 #define MT_S2_NORMAL           0xf
 #define MT_S2_DEVICE_nGnRE     0x1
 
+/*
+ * Memory types for Stage-2 translation when ID_AA64MMFR2_EL1.FWB is 0001
+ * Stage-2 enforces Normal-WB and Device-nGnRE
+ */
+#define MT_S2_FWB_NORMAL       6
+#define MT_S2_FWB_DEVICE_nGnRE 1
+
 #ifdef CONFIG_ARM64_4K_PAGES
 #define IOREMAP_MAX_ORDER      (PUD_SHIFT)
 #else
 
 #define PAGE_HYP_RO            __pgprot(_HYP_PAGE_DEFAULT | PTE_HYP | PTE_RDONLY | PTE_HYP_XN)
 #define PAGE_HYP_DEVICE                __pgprot(PROT_DEVICE_nGnRE | PTE_HYP)
 
-#define PAGE_S2                        __pgprot(_PROT_DEFAULT | PTE_S2_MEMATTR(MT_S2_NORMAL) | PTE_S2_RDONLY | PTE_S2_XN)
-#define PAGE_S2_DEVICE         __pgprot(_PROT_DEFAULT | PTE_S2_MEMATTR(MT_S2_DEVICE_nGnRE) | PTE_S2_RDONLY | PTE_S2_XN)
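+/* Select the stage-2 memory attribute encoding at runtime, based on FWB support */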
+#define PAGE_S2_MEMATTR(attr)                                          \
+       ({                                                              \
+               u64 __val;                                              \
+               if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))          \
+                       __val = PTE_S2_MEMATTR(MT_S2_FWB_ ## attr);     \
+               else                                                    \
+                       __val = PTE_S2_MEMATTR(MT_S2_ ## attr);         \
+               __val;                                                  \
+        })
+
+#define PAGE_S2                        __pgprot(_PROT_DEFAULT | PAGE_S2_MEMATTR(NORMAL) | PTE_S2_RDONLY | PTE_S2_XN)
+#define PAGE_S2_DEVICE         __pgprot(_PROT_DEFAULT | PAGE_S2_MEMATTR(DEVICE_nGnRE) | PTE_S2_RDONLY | PTE_S2_XN)
 
 #define PAGE_NONE              __pgprot(((_PAGE_DEFAULT) & ~PTE_VALID) | PTE_PROT_NONE | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN)
 #define PAGE_SHARED            __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_UXN | PTE_WRITE)
 
 #define ID_AA64MMFR1_VMIDBITS_16       2
 
 /* id_aa64mmfr2 */
+#define ID_AA64MMFR2_FWB_SHIFT         40
 #define ID_AA64MMFR2_AT_SHIFT          32
 #define ID_AA64MMFR2_LVA_SHIFT         16
 #define ID_AA64MMFR2_IESB_SHIFT                12
 
 };
 
 static const struct arm64_ftr_bits ftr_id_aa64mmfr2[] = {
+       ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_FWB_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_AT_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_LVA_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_IESB_SHIFT, 4, 0),
 }
 #endif
 
+static void cpu_has_fwb(const struct arm64_cpu_capabilities *__unused)
+{
+       u64 val = read_sysreg_s(SYS_CLIDR_EL1);
+
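+       /* FWB implies IDC, so no cache level may require cleaning to the PoU */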
+       /* Check that CLIDR_EL1.LOU{U,IS} are both 0 */
+       WARN_ON(val & (7 << 27 | 7 << 21));
+}
+
 static const struct arm64_cpu_capabilities arm64_features[] = {
        {
                .desc = "GIC system register CPU interface",
                .type = ARM64_CPUCAP_SYSTEM_FEATURE,
                .matches = has_cache_dic,
        },
+       {
+               .desc = "Stage-2 Force Write-Back",
+               .type = ARM64_CPUCAP_SYSTEM_FEATURE,
+               .capability = ARM64_HAS_STAGE2_FWB,
+               .sys_reg = SYS_ID_AA64MMFR2_EL1,
+               .sign = FTR_UNSIGNED,
+               .field_pos = ID_AA64MMFR2_FWB_SHIFT,
+               .min_field_value = 1,
+               .matches = has_cpuid_feature,
+               .cpu_enable = cpu_has_fwb,
+       },
 #ifdef CONFIG_ARM64_HW_AFDBM
        {
                /*
 
  * This is why right after unmapping a page/section and invalidating
  * the corresponding TLBs, we call kvm_flush_dcache_p*() to make sure
  * the IO subsystem will never hit in the cache.
+ *
+ * This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as
+ * we then fully enforce cacheability of RAM, no matter what the guest
+ * does.
  */
 static void unmap_stage2_ptes(struct kvm *kvm, pmd_t *pmd,
                       phys_addr_t addr, phys_addr_t end)