PVOP_VCALL1(pv_mmu_ops.write_cr3, x);
 }
 
-static inline unsigned long read_cr4(void)
+static inline unsigned long __read_cr4(void)
 {
        return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr4);
 }
-static inline unsigned long read_cr4_safe(void)
+static inline unsigned long __read_cr4_safe(void)
 {
        return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr4_safe);
 }
 
-static inline void write_cr4(unsigned long x)
+static inline void __write_cr4(unsigned long x)
 {
        PVOP_VCALL1(pv_cpu_ops.write_cr4, x);
 }
 
        native_write_cr3(x);
 }
 
-static inline unsigned long read_cr4(void)
+static inline unsigned long __read_cr4(void)
 {
        return native_read_cr4();
 }
 
-static inline unsigned long read_cr4_safe(void)
+static inline unsigned long __read_cr4_safe(void)
 {
        return native_read_cr4_safe();
 }
 
-static inline void write_cr4(unsigned long x)
+static inline void __write_cr4(unsigned long x)
 {
        native_write_cr4(x);
 }
 
 #define __flush_tlb_single(addr) __native_flush_tlb_single(addr)
 #endif
 
+struct tlb_state {
+#ifdef CONFIG_SMP
+       struct mm_struct *active_mm;
+       int state;
+#endif
+
+       /*
+        * Access to this CR4 shadow and to H/W CR4 is protected by
+        * disabling interrupts when modifying either one.
+        */
+       unsigned long cr4;
+};
+DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);
+
+/* Initialize cr4 shadow for this CPU. */
+static inline void cr4_init_shadow(void)
+{
+       this_cpu_write(cpu_tlbstate.cr4, __read_cr4());
+}
+
 /* Set in this cpu's CR4. */
 static inline void cr4_set_bits(unsigned long mask)
 {
        unsigned long cr4;
 
-       cr4 = read_cr4();
-       cr4 |= mask;
-       write_cr4(cr4);
+       cr4 = this_cpu_read(cpu_tlbstate.cr4);
+       if ((cr4 | mask) != cr4) {
+               cr4 |= mask;
+               this_cpu_write(cpu_tlbstate.cr4, cr4);
+               __write_cr4(cr4);
+       }
 }
 
 /* Clear in this cpu's CR4. */
 {
        unsigned long cr4;
 
-       cr4 = read_cr4();
-       cr4 &= ~mask;
-       write_cr4(cr4);
+       cr4 = this_cpu_read(cpu_tlbstate.cr4);
+       if ((cr4 & ~mask) != cr4) {
+               cr4 &= ~mask;
+               this_cpu_write(cpu_tlbstate.cr4, cr4);
+               __write_cr4(cr4);
+       }
+}
+
+/* Read the CR4 shadow. */
+static inline unsigned long cr4_read_shadow(void)
+{
+       return this_cpu_read(cpu_tlbstate.cr4);
 }
 
 /*
 {
        unsigned long cr4;
 
-       cr4 = native_read_cr4();
+       cr4 = this_cpu_read(cpu_tlbstate.cr4);
        /* clear PGE */
        native_write_cr4(cr4 & ~X86_CR4_PGE);
        /* write old PGE again and flush TLBs */
 #define TLBSTATE_OK    1
 #define TLBSTATE_LAZY  2
 
-struct tlb_state {
-       struct mm_struct *active_mm;
-       int state;
-};
-DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);
-
 static inline void reset_lazy_tlbstate(void)
 {
        this_cpu_write(cpu_tlbstate.state, 0);
 
 
 static inline int cpu_vmx_enabled(void)
 {
-       return read_cr4() & X86_CR4_VMXE;
+       return __read_cr4() & X86_CR4_VMXE;
 }
 
 /** Disable VMX if it is enabled on the current CPU
 
 
        header->pmode_cr0 = read_cr0();
        if (__this_cpu_read(cpu_info.cpuid_level) >= 0) {
-               header->pmode_cr4 = read_cr4();
+               header->pmode_cr4 = __read_cr4();
                header->pmode_behavior |= (1 << WAKEUP_BEHAVIOR_RESTORE_CR4);
        }
        if (!rdmsr_safe(MSR_IA32_MISC_ENABLE,
 
 #include <asm/archrandom.h>
 #include <asm/hypervisor.h>
 #include <asm/processor.h>
+#include <asm/tlbflush.h>
 #include <asm/debugreg.h>
 #include <asm/sections.h>
 #include <asm/vsyscall.h>
 
        wait_for_master_cpu(cpu);
 
+       /*
+        * Initialize the CR4 shadow before doing anything that could
+        * try to read it.
+        */
+       cr4_init_shadow();
+
        /*
         * Load microcode on this cpu if a valid microcode is available.
         * This is early microcode loading procedure.
 
 
        /*  Save value of CR4 and clear Page Global Enable (bit 7)  */
        if (cpu_has_pge) {
-               cr4 = read_cr4();
-               write_cr4(cr4 & ~X86_CR4_PGE);
+               cr4 = __read_cr4();
+               __write_cr4(cr4 & ~X86_CR4_PGE);
        }
 
        /*
 
        /* Restore value of CR4 */
        if (cpu_has_pge)
-               write_cr4(cr4);
+               __write_cr4(cr4);
 }
 
 static void cyrix_set_arr(unsigned int reg, unsigned long base,
 
 
        /* Save value of CR4 and clear Page Global Enable (bit 7) */
        if (cpu_has_pge) {
-               cr4 = read_cr4();
-               write_cr4(cr4 & ~X86_CR4_PGE);
+               cr4 = __read_cr4();
+               __write_cr4(cr4 & ~X86_CR4_PGE);
        }
 
        /* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */
 
        /* Restore value of CR4 */
        if (cpu_has_pge)
-               write_cr4(cr4);
+               __write_cr4(cr4);
        raw_spin_unlock(&set_atomicity_lock);
 }
 
 
 
 asmlinkage __visible void __init i386_start_kernel(void)
 {
+       cr4_init_shadow();
        sanitize_boot_params(&boot_params);
 
        /* Call the subarch specific early setup function */
 
                                (__START_KERNEL & PGDIR_MASK)));
        BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);
 
+       cr4_init_shadow();
+
        /* Kill off the identity-map trampoline */
        reset_early_page_tables();
 
 
        cr0 = read_cr0();
        cr2 = read_cr2();
        cr3 = read_cr3();
-       cr4 = read_cr4_safe();
+       cr4 = __read_cr4_safe();
        printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n",
                        cr0, cr2, cr3, cr4);
 
 
        cr0 = read_cr0();
        cr2 = read_cr2();
        cr3 = read_cr3();
-       cr4 = read_cr4();
+       cr4 = __read_cr4();
 
        printk(KERN_DEFAULT "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
               fs, fsindex, gs, gsindex, shadowgs);
 
 
        if (boot_cpu_data.cpuid_level >= 0) {
                /* A CPU has %cr4 if and only if it has CPUID */
-               mmu_cr4_features = read_cr4();
+               mmu_cr4_features = __read_cr4();
                if (trampoline_cr4_features)
                        *trampoline_cr4_features = mmu_cr4_features;
        }
 
 
 static int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
-       unsigned long host_cr4_mce = read_cr4() & X86_CR4_MCE;
+       unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE;
        unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4;
 
        if (cr4 & X86_CR4_VMXE)
 
        u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
        u64 old, test_bits;
 
-       if (read_cr4() & X86_CR4_VMXE)
+       if (cr4_read_shadow() & X86_CR4_VMXE)
                return -EBUSY;
 
        INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
        vmcs_writel(HOST_CR3, read_cr3());  /* 22.2.3  FIXME: shadow tables */
 
        /* Save the most likely value for this task's CR4 in the VMCS. */
-       cr4 = read_cr4();
+       cr4 = cr4_read_shadow();
        vmcs_writel(HOST_CR4, cr4);                     /* 22.2.3, 22.2.5 */
        vmx->host_state.vmcs_host_cr4 = cr4;
 
        if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
                vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
 
-       cr4 = read_cr4();
+       cr4 = cr4_read_shadow();
        if (unlikely(cr4 != vmx->host_state.vmcs_host_cr4)) {
                vmcs_writel(HOST_CR4, cr4);
                vmx->host_state.vmcs_host_cr4 = cr4;
 
                        printk(nx_warning, from_kuid(&init_user_ns, current_uid()));
                if (pte && pte_present(*pte) && pte_exec(*pte) &&
                                (pgd_flags(*pgd) & _PAGE_USER) &&
-                               (read_cr4() & X86_CR4_SMEP))
+                               (__read_cr4() & X86_CR4_SMEP))
                        printk(smep_warning, from_kuid(&init_user_ns, current_uid()));
        }
 
 
        free_area_init_nodes(max_zone_pfns);
 }
 
+DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
+#ifdef CONFIG_SMP
+       .active_mm = &init_mm,
+       .state = 0,
+#endif
+       .cr4 = ~0UL,    /* fail hard if we screw up cr4 shadow initialization */
+};
+EXPORT_SYMBOL_GPL(cpu_tlbstate);
+
 void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache)
 {
        /* entry 0 MUST be WB (hardwired to speed up translations) */
 
 #include <asm/uv/uv.h>
 #include <linux/debugfs.h>
 
-DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)
-                       = { &init_mm, 0, };
-
 /*
  *     Smarter SMP flushing macros.
  *             c/o Linus Torvalds.
 
        ctxt->cr0 = read_cr0();
        ctxt->cr2 = read_cr2();
        ctxt->cr3 = read_cr3();
-#ifdef CONFIG_X86_32
-       ctxt->cr4 = read_cr4_safe();
-#else
-/* CONFIG_X86_64 */
-       ctxt->cr4 = read_cr4();
+       ctxt->cr4 = __read_cr4_safe();
+#ifdef CONFIG_X86_64
        ctxt->cr8 = read_cr8();
 #endif
        ctxt->misc_enable_saved = !rdmsrl_safe(MSR_IA32_MISC_ENABLE,
        /* cr4 was introduced in the Pentium CPU */
 #ifdef CONFIG_X86_32
        if (ctxt->cr4)
-               write_cr4(ctxt->cr4);
+               __write_cr4(ctxt->cr4);
 #else
 /* CONFIG X86_64 */
        wrmsrl(MSR_EFER, ctxt->efer);
        write_cr8(ctxt->cr8);
-       write_cr4(ctxt->cr4);
+       __write_cr4(ctxt->cr4);
 #endif
        write_cr3(ctxt->cr3);
        write_cr2(ctxt->cr2);
 
 
        trampoline_header->start = (u64) secondary_startup_64;
        trampoline_cr4_features = &trampoline_header->cr4;
-       *trampoline_cr4_features = read_cr4();
+       *trampoline_cr4_features = __read_cr4();
 
        trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
        trampoline_pgd[0] = init_level4_pgt[pgd_index(__PAGE_OFFSET)].pgd;