*/
 #define EX_R3          EX_DAR
 
-/* Macros for annotating the expected destination of (h)rfid */
+/*
+ * Macros for annotating the expected destination of (h)rfid
+ *
+ * The nop instructions allow us to insert one or more instructions to flush the
+ * L1-D cache when returning to userspace or a guest.
+ */
+#define RFI_FLUSH_SLOT                                                 \
+       RFI_FLUSH_FIXUP_SECTION;                                        \
+       nop;                                                            \
+       nop;                                                            \
+       nop
 
 #define RFI_TO_KERNEL                                                  \
        rfid
 
 #define RFI_TO_USER                                                    \
-       rfid
+       RFI_FLUSH_SLOT;                                                 \
+       rfid;                                                           \
+       b       rfi_flush_fallback
 
 #define RFI_TO_USER_OR_KERNEL                                          \
-       rfid
+       RFI_FLUSH_SLOT;                                                 \
+       rfid;                                                           \
+       b       rfi_flush_fallback
 
 #define RFI_TO_GUEST                                                   \
-       rfid
+       RFI_FLUSH_SLOT;                                                 \
+       rfid;                                                           \
+       b       rfi_flush_fallback
 
 #define HRFI_TO_KERNEL                                                 \
        hrfid
 
 #define HRFI_TO_USER                                                   \
-       hrfid
+       RFI_FLUSH_SLOT;                                                 \
+       hrfid;                                                          \
+       b       hrfi_flush_fallback
 
 #define HRFI_TO_USER_OR_KERNEL                                         \
-       hrfid
+       RFI_FLUSH_SLOT;                                                 \
+       hrfid;                                                          \
+       b       hrfi_flush_fallback
 
 #define HRFI_TO_GUEST                                                  \
-       hrfid
+       RFI_FLUSH_SLOT;                                                 \
+       hrfid;                                                          \
+       b       hrfi_flush_fallback
 
 #define HRFI_TO_UNKNOWN                                                        \
-       hrfid
+       RFI_FLUSH_SLOT;                                                 \
+       hrfid;                                                          \
+       b       hrfi_flush_fallback
 
 #ifdef CONFIG_RELOCATABLE
 #define __EXCEPTION_RELON_PROLOG_PSERIES_1(label, h)                   \
 
        FTR_ENTRY_OFFSET label##1b-label##3b;           \
        .popsection;
 
+#define RFI_FLUSH_FIXUP_SECTION                                \
+951:                                                   \
+       .pushsection __rfi_flush_fixup,"a";             \
+       .align 2;                                       \
+952:                                                   \
+       FTR_ENTRY_OFFSET 951b-952b;                     \
+       .popsection;
+
+
 #ifndef __ASSEMBLY__
+#include <linux/types.h>
+
+extern long __start___rfi_flush_fixup, __stop___rfi_flush_fixup;
+
 void apply_feature_fixups(void);
 void setup_feature_keys(void);
 #endif
 
        struct sibling_subcore_state *sibling_subcore_state;
 #endif
 #endif
+#ifdef CONFIG_PPC_BOOK3S_64
+       /*
+        * rfi fallback flush must be in its own cacheline to prevent
+        * other paca data leaking into the L1d
+        */
+       u64 exrfi[EX_SIZE] __aligned(0x80);
+       void *rfi_flush_fallback_area;
+       u64 l1d_flush_congruence;
+       u64 l1d_flush_sets;
+#endif
 };
 
 extern void copy_mm_to_paca(struct mm_struct *mm);
 
 static inline void pseries_little_endian_exceptions(void) {}
 #endif /* CONFIG_PPC_PSERIES */
 
+void rfi_flush_enable(bool enable);
+
+/* These are bit flags */
+enum l1d_flush_type {
+       L1D_FLUSH_NONE          = 0x1,
+       L1D_FLUSH_FALLBACK      = 0x2,
+       L1D_FLUSH_ORI           = 0x4,
+       L1D_FLUSH_MTTRIG        = 0x8,
+};
+
+void __init setup_rfi_flush(enum l1d_flush_type, bool enable);
+void do_rfi_flush_fixups(enum l1d_flush_type types);
+
 #endif /* !__ASSEMBLY__ */
 
 #endif /* _ASM_POWERPC_SETUP_H */
 
        OFFSET(PACA_NMI_EMERG_SP, paca_struct, nmi_emergency_sp);
        OFFSET(PACA_IN_MCE, paca_struct, in_mce);
        OFFSET(PACA_IN_NMI, paca_struct, in_nmi);
+       OFFSET(PACA_RFI_FLUSH_FALLBACK_AREA, paca_struct, rfi_flush_fallback_area);
+       OFFSET(PACA_EXRFI, paca_struct, exrfi);
+       OFFSET(PACA_L1D_FLUSH_CONGRUENCE, paca_struct, l1d_flush_congruence);
+       OFFSET(PACA_L1D_FLUSH_SETS, paca_struct, l1d_flush_sets);
+
 #endif
        OFFSET(PACAHWCPUID, paca_struct, hw_cpu_id);
        OFFSET(PACAKEXECSTATE, paca_struct, kexec_state);
 
        b       .;                                      \
        MASKED_DEC_HANDLER(_H)
 
+TRAMP_REAL_BEGIN(rfi_flush_fallback)
+       SET_SCRATCH0(r13);
+       GET_PACA(r13);
+       std     r9,PACA_EXRFI+EX_R9(r13)
+       std     r10,PACA_EXRFI+EX_R10(r13)
+       std     r11,PACA_EXRFI+EX_R11(r13)
+       std     r12,PACA_EXRFI+EX_R12(r13)
+       std     r8,PACA_EXRFI+EX_R13(r13)
+       mfctr   r9
+       ld      r10,PACA_RFI_FLUSH_FALLBACK_AREA(r13)
+       ld      r11,PACA_L1D_FLUSH_SETS(r13)
+       ld      r12,PACA_L1D_FLUSH_CONGRUENCE(r13)
+       /*
+        * The load adresses are at staggered offsets within cachelines,
+        * which suits some pipelines better (on others it should not
+        * hurt).
+        */
+       addi    r12,r12,8
+       mtctr   r11
+       DCBT_STOP_ALL_STREAM_IDS(r11) /* Stop prefetch streams */
+
+       /* order ld/st prior to dcbt stop all streams with flushing */
+       sync
+1:     li      r8,0
+       .rept   8 /* 8-way set associative */
+       ldx     r11,r10,r8
+       add     r8,r8,r12
+       xor     r11,r11,r11     // Ensure r11 is 0 even if fallback area is not
+       add     r8,r8,r11       // Add 0, this creates a dependency on the ldx
+       .endr
+       addi    r10,r10,128 /* 128 byte cache line */
+       bdnz    1b
+
+       mtctr   r9
+       ld      r9,PACA_EXRFI+EX_R9(r13)
+       ld      r10,PACA_EXRFI+EX_R10(r13)
+       ld      r11,PACA_EXRFI+EX_R11(r13)
+       ld      r12,PACA_EXRFI+EX_R12(r13)
+       ld      r8,PACA_EXRFI+EX_R13(r13)
+       GET_SCRATCH0(r13);
+       rfid
+
+TRAMP_REAL_BEGIN(hrfi_flush_fallback)
+       SET_SCRATCH0(r13);
+       GET_PACA(r13);
+       std     r9,PACA_EXRFI+EX_R9(r13)
+       std     r10,PACA_EXRFI+EX_R10(r13)
+       std     r11,PACA_EXRFI+EX_R11(r13)
+       std     r12,PACA_EXRFI+EX_R12(r13)
+       std     r8,PACA_EXRFI+EX_R13(r13)
+       mfctr   r9
+       ld      r10,PACA_RFI_FLUSH_FALLBACK_AREA(r13)
+       ld      r11,PACA_L1D_FLUSH_SETS(r13)
+       ld      r12,PACA_L1D_FLUSH_CONGRUENCE(r13)
+       /*
+        * The load adresses are at staggered offsets within cachelines,
+        * which suits some pipelines better (on others it should not
+        * hurt).
+        */
+       addi    r12,r12,8
+       mtctr   r11
+       DCBT_STOP_ALL_STREAM_IDS(r11) /* Stop prefetch streams */
+
+       /* order ld/st prior to dcbt stop all streams with flushing */
+       sync
+1:     li      r8,0
+       .rept   8 /* 8-way set associative */
+       ldx     r11,r10,r8
+       add     r8,r8,r12
+       xor     r11,r11,r11     // Ensure r11 is 0 even if fallback area is not
+       add     r8,r8,r11       // Add 0, this creates a dependency on the ldx
+       .endr
+       addi    r10,r10,128 /* 128 byte cache line */
+       bdnz    1b
+
+       mtctr   r9
+       ld      r9,PACA_EXRFI+EX_R9(r13)
+       ld      r10,PACA_EXRFI+EX_R10(r13)
+       ld      r11,PACA_EXRFI+EX_R11(r13)
+       ld      r12,PACA_EXRFI+EX_R12(r13)
+       ld      r8,PACA_EXRFI+EX_R13(r13)
+       GET_SCRATCH0(r13);
+       hrfid
+
 /*
  * Real mode exceptions actually use this too, but alternate
  * instruction code patches (which end up in the common .text area)
 
        return 0;
 }
 early_initcall(disable_hardlockup_detector);
+
+#ifdef CONFIG_PPC_BOOK3S_64
+static enum l1d_flush_type enabled_flush_types;
+static void *l1d_flush_fallback_area;
+bool rfi_flush;
+
+static void do_nothing(void *unused)
+{
+       /*
+        * We don't need to do the flush explicitly, just enter+exit kernel is
+        * sufficient, the RFI exit handlers will do the right thing.
+        */
+}
+
+void rfi_flush_enable(bool enable)
+{
+       if (rfi_flush == enable)
+               return;
+
+       if (enable) {
+               do_rfi_flush_fixups(enabled_flush_types);
+               on_each_cpu(do_nothing, NULL, 1);
+       } else
+               do_rfi_flush_fixups(L1D_FLUSH_NONE);
+
+       rfi_flush = enable;
+}
+
+static void init_fallback_flush(void)
+{
+       u64 l1d_size, limit;
+       int cpu;
+
+       l1d_size = ppc64_caches.l1d.size;
+       limit = min(safe_stack_limit(), ppc64_rma_size);
+
+       /*
+        * Align to L1d size, and size it at 2x L1d size, to catch possible
+        * hardware prefetch runoff. We don't have a recipe for load patterns to
+        * reliably avoid the prefetcher.
+        */
+       l1d_flush_fallback_area = __va(memblock_alloc_base(l1d_size * 2, l1d_size, limit));
+       memset(l1d_flush_fallback_area, 0, l1d_size * 2);
+
+       for_each_possible_cpu(cpu) {
+               /*
+                * The fallback flush is currently coded for 8-way
+                * associativity. Different associativity is possible, but it
+                * will be treated as 8-way and may not evict the lines as
+                * effectively.
+                *
+                * 128 byte lines are mandatory.
+                */
+               u64 c = l1d_size / 8;
+
+               paca[cpu].rfi_flush_fallback_area = l1d_flush_fallback_area;
+               paca[cpu].l1d_flush_congruence = c;
+               paca[cpu].l1d_flush_sets = c / 128;
+       }
+}
+
+void __init setup_rfi_flush(enum l1d_flush_type types, bool enable)
+{
+       if (types & L1D_FLUSH_FALLBACK) {
+               pr_info("rfi-flush: Using fallback displacement flush\n");
+               init_fallback_flush();
+       }
+
+       if (types & L1D_FLUSH_ORI)
+               pr_info("rfi-flush: Using ori type flush\n");
+
+       if (types & L1D_FLUSH_MTTRIG)
+               pr_info("rfi-flush: Using mttrig type flush\n");
+
+       enabled_flush_types = types;
+
+       rfi_flush_enable(enable);
+}
+#endif /* CONFIG_PPC_BOOK3S_64 */
 
        /* Read-only data */
        RO_DATA(PAGE_SIZE)
 
+#ifdef CONFIG_PPC64
+       . = ALIGN(8);
+       __rfi_flush_fixup : AT(ADDR(__rfi_flush_fixup) - LOAD_OFFSET) {
+               __start___rfi_flush_fixup = .;
+               *(__rfi_flush_fixup)
+               __stop___rfi_flush_fixup = .;
+       }
+#endif
+
        EXCEPTION_TABLE(0)
 
        NOTES :kernel :notes
 
        }
 }
 
+#ifdef CONFIG_PPC_BOOK3S_64
+void do_rfi_flush_fixups(enum l1d_flush_type types)
+{
+       unsigned int instrs[3], *dest;
+       long *start, *end;
+       int i;
+
+       start = PTRRELOC(&__start___rfi_flush_fixup),
+       end = PTRRELOC(&__stop___rfi_flush_fixup);
+
+       instrs[0] = 0x60000000; /* nop */
+       instrs[1] = 0x60000000; /* nop */
+       instrs[2] = 0x60000000; /* nop */
+
+       if (types & L1D_FLUSH_FALLBACK)
+               /* b .+16 to fallback flush */
+               instrs[0] = 0x48000010;
+
+       i = 0;
+       if (types & L1D_FLUSH_ORI) {
+               instrs[i++] = 0x63ff0000; /* ori 31,31,0 speculation barrier */
+               instrs[i++] = 0x63de0000; /* ori 30,30,0 L1d flush*/
+       }
+
+       if (types & L1D_FLUSH_MTTRIG)
+               instrs[i++] = 0x7c12dba6; /* mtspr TRIG2,r0 (SPR #882) */
+
+       for (i = 0; start < end; start++, i++) {
+               dest = (void *)start + *start;
+
+               pr_devel("patching dest %lx\n", (unsigned long)dest);
+
+               patch_instruction(dest, instrs[0]);
+               patch_instruction(dest + 1, instrs[1]);
+               patch_instruction(dest + 2, instrs[2]);
+       }
+
+       printk(KERN_DEBUG "rfi-flush: patched %d locations\n", i);
+}
+#endif /* CONFIG_PPC_BOOK3S_64 */
+
 void do_lwsync_fixups(unsigned long value, void *fixup_start, void *fixup_end)
 {
        long *start, *end;