__ctl_load(reg, cr, cr);
 }
 
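+/* Test and set the vector enablement control in CR0.46 (implemented in entry.S) */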
+void __ctl_set_vx(void);
+
 void smp_ctl_set_bit(int cr, int bit);
 void smp_ctl_clear_bit(int cr, int bit);
 
 
        };
 };
 
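+/* Save FP/VX register contents into the fpu structure and set CIF_FPU (entry.S) */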
+void save_fpu_regs(struct fpu *fpu);
+
 #define is_vx_fpu(fpu) (!!((fpu)->flags & FPU_USE_VX))
 #define is_vx_task(tsk) (!!((tsk)->thread.fpu.flags & FPU_USE_VX))
 
+/* VX array structure for address operand constraints in inline assemblies */
+struct vx_array { __vector128 _[__NUM_VXRS]; };
+
 static inline int test_fp_ctl(u32 fpc)
 {
        u32 orig_fpc;
        return rc;
 }
 
-static inline void save_fp_ctl(u32 *fpc)
-{
-       asm volatile(
-               "       stfpc   %0\n"
-               : "+Q" (*fpc));
-}
-
-static inline int restore_fp_ctl(u32 *fpc)
-{
-       int rc;
-
-       asm volatile(
-               "       lfpc    %1\n"
-               "0:     la      %0,0\n"
-               "1:\n"
-               : "=d" (rc) : "Q" (*fpc), "0" (-EINVAL));
-       return rc;
-}
-
-static inline void save_fp_regs(freg_t *fprs)
-{
-       asm volatile("std 0,%0" : "=Q" (fprs[0]));
-       asm volatile("std 2,%0" : "=Q" (fprs[2]));
-       asm volatile("std 4,%0" : "=Q" (fprs[4]));
-       asm volatile("std 6,%0" : "=Q" (fprs[6]));
-       asm volatile("std 1,%0" : "=Q" (fprs[1]));
-       asm volatile("std 3,%0" : "=Q" (fprs[3]));
-       asm volatile("std 5,%0" : "=Q" (fprs[5]));
-       asm volatile("std 7,%0" : "=Q" (fprs[7]));
-       asm volatile("std 8,%0" : "=Q" (fprs[8]));
-       asm volatile("std 9,%0" : "=Q" (fprs[9]));
-       asm volatile("std 10,%0" : "=Q" (fprs[10]));
-       asm volatile("std 11,%0" : "=Q" (fprs[11]));
-       asm volatile("std 12,%0" : "=Q" (fprs[12]));
-       asm volatile("std 13,%0" : "=Q" (fprs[13]));
-       asm volatile("std 14,%0" : "=Q" (fprs[14]));
-       asm volatile("std 15,%0" : "=Q" (fprs[15]));
-}
-
-static inline void restore_fp_regs(freg_t *fprs)
-{
-       asm volatile("ld 0,%0" : : "Q" (fprs[0]));
-       asm volatile("ld 2,%0" : : "Q" (fprs[2]));
-       asm volatile("ld 4,%0" : : "Q" (fprs[4]));
-       asm volatile("ld 6,%0" : : "Q" (fprs[6]));
-       asm volatile("ld 1,%0" : : "Q" (fprs[1]));
-       asm volatile("ld 3,%0" : : "Q" (fprs[3]));
-       asm volatile("ld 5,%0" : : "Q" (fprs[5]));
-       asm volatile("ld 7,%0" : : "Q" (fprs[7]));
-       asm volatile("ld 8,%0" : : "Q" (fprs[8]));
-       asm volatile("ld 9,%0" : : "Q" (fprs[9]));
-       asm volatile("ld 10,%0" : : "Q" (fprs[10]));
-       asm volatile("ld 11,%0" : : "Q" (fprs[11]));
-       asm volatile("ld 12,%0" : : "Q" (fprs[12]));
-       asm volatile("ld 13,%0" : : "Q" (fprs[13]));
-       asm volatile("ld 14,%0" : : "Q" (fprs[14]));
-       asm volatile("ld 15,%0" : : "Q" (fprs[15]));
-}
-
-static inline void save_vx_regs(__vector128 *vxrs)
-{
-       typedef struct { __vector128 _[__NUM_VXRS]; } addrtype;
-
-       asm volatile(
-               "       la      1,%0\n"
-               "       .word   0xe70f,0x1000,0x003e\n" /* vstm 0,15,0(1) */
-               "       .word   0xe70f,0x1100,0x0c3e\n" /* vstm 16,31,256(1) */
-               : "=Q" (*(addrtype *) vxrs) : : "1");
-}
-
 static inline void save_vx_regs_safe(__vector128 *vxrs)
 {
        unsigned long cr0, flags;
        __ctl_store(cr0, 0, 0);
        __ctl_set_bit(0, 17);
        __ctl_set_bit(0, 18);
-       save_vx_regs(vxrs);
-       __ctl_load(cr0, 0, 0);
-       arch_local_irq_restore(flags);
-}
-
-static inline void restore_vx_regs(__vector128 *vxrs)
-{
-       typedef struct { __vector128 _[__NUM_VXRS]; } addrtype;
-
        asm volatile(
                "       la      1,%0\n"
-               "       .word   0xe70f,0x1000,0x0036\n" /* vlm 0,15,0(1) */
-               "       .word   0xe70f,0x1100,0x0c36\n" /* vlm 16,31,256(1) */
-               : : "Q" (*(addrtype *) vxrs) : "1");
+               "       .word   0xe70f,0x1000,0x003e\n" /* vstm 0,15,0(1) */
+               "       .word   0xe70f,0x1100,0x0c3e\n" /* vstm 16,31,256(1) */
+               : "=Q" (*(struct vx_array *) vxrs) : : "1");
+       __ctl_load(cr0, 0, 0);
+       arch_local_irq_restore(flags);
 }
 
 static inline void convert_vx_to_fp(freg_t *fprs, __vector128 *vxrs)
                       sizeof(fpregs->fprs));
 }
 
-static inline void save_fpu_regs(struct fpu *fpu)
-{
-       save_fp_ctl(&fpu->fpc);
-       if (is_vx_fpu(fpu))
-               save_vx_regs(fpu->vxrs);
-       else
-               save_fp_regs(fpu->fprs);
-}
-
-static inline void restore_fpu_regs(struct fpu *fpu)
-{
-       restore_fp_ctl(&fpu->fpc);
-       if (is_vx_fpu(fpu))
-               restore_vx_regs(fpu->vxrs);
-       else
-               restore_fp_regs(fpu->fprs);
-}
-
 #endif
 
 #endif /* _ASM_S390_FPU_INTERNAL_H */
 
 #include <linux/kvm.h>
 #include <asm/debug.h>
 #include <asm/cpu.h>
+#include <asm/fpu-internal.h>
 #include <asm/isc.h>
 
 #define KVM_MAX_VCPUS 64
 
 struct kvm_vcpu_arch {
        struct kvm_s390_sie_block *sie_block;
-       s390_fp_regs      host_fpregs;
        unsigned int      host_acrs[NUM_ACRS];
-       s390_fp_regs      guest_fpregs;
-       struct kvm_s390_vregs   *host_vregs;
+       struct fpu        host_fpregs;
+       struct fpu        guest_fpregs;
        struct kvm_s390_local_interrupt local_int;
        struct hrtimer    ckc_timer;
        struct kvm_s390_pgm_info pgm;
 
 #define CIF_MCCK_PENDING       0       /* machine check handling is pending */
 #define CIF_ASCE               1       /* user asce needs fixup / uaccess */
 #define CIF_NOHZ_DELAY         2       /* delay HZ disable for a tick */
+#define CIF_FPU                        3       /* restore FP and VX registers */
 
 #define _CIF_MCCK_PENDING      (1<<CIF_MCCK_PENDING)
 #define _CIF_ASCE              (1<<CIF_ASCE)
 #define _CIF_NOHZ_DELAY                (1<<CIF_NOHZ_DELAY)
+#define _CIF_FPU               (1<<CIF_FPU)
 
 #ifndef __ASSEMBLY__
 
 
        }                                                               \
        if (next->mm) {                                                 \
                update_cr_regs(next);                                   \
-               restore_fpu_regs(&next->thread.fpu);                    \
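+               /* lazy restore of FP/VX register contents at kernel exit */ \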
+               set_cpu_flag(CIF_FPU);                                  \
                restore_access_regs(&next->thread.acrs[0]);             \
                restore_ri_cb(next->thread.ri_cb, prev->thread.ri_cb);  \
        }                                                               \
 
        DEFINE(__TASK_pid, offsetof(struct task_struct, pid));
        BLANK();
        DEFINE(__THREAD_ksp, offsetof(struct thread_struct, ksp));
+       DEFINE(__THREAD_fpu, offsetof(struct task_struct, thread.fpu));
        DEFINE(__THREAD_per_cause, offsetof(struct thread_struct, per_event.cause));
        DEFINE(__THREAD_per_address, offsetof(struct thread_struct, per_event.address));
        DEFINE(__THREAD_per_paid, offsetof(struct thread_struct, per_event.paid));
        DEFINE(__THREAD_trap_tdb, offsetof(struct thread_struct, trap_tdb));
        BLANK();
+       DEFINE(__FPU_fpc, offsetof(struct fpu, fpc));
+       DEFINE(__FPU_flags, offsetof(struct fpu, flags));
+       DEFINE(__FPU_regs, offsetof(struct fpu, regs));
+       BLANK();
        DEFINE(__TI_task, offsetof(struct thread_info, task));
        DEFINE(__TI_flags, offsetof(struct thread_info, flags));
        DEFINE(__TI_sysc_table, offsetof(struct thread_info, sys_call_table));
 
 static void load_sigregs(void)
 {
        restore_access_regs(current->thread.acrs);
-       restore_fpu_regs(&current->thread.fpu);
 }
 
 static int save_sigregs32(struct pt_regs *regs, _sigregs32 __user *sregs)
        if (__copy_from_user(&set.sig, &frame->sc.oldmask, _SIGMASK_COPY_SIZE32))
                goto badframe;
        set_current_blocked(&set);
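+       /* save_fpu_regs() sets CIF_FPU: the FP/VX state restored from the
+        * signal frame below is then loaded lazily on return to user space */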
+       save_fpu_regs(&current->thread.fpu);
        if (restore_sigregs32(regs, &frame->sregs))
                goto badframe;
        if (restore_sigregs_ext32(regs, &frame->sregs_ext))
        set_current_blocked(&set);
        if (compat_restore_altstack(&frame->uc.uc_stack))
                goto badframe;
+       save_fpu_regs(&current->thread.fpu);
        if (restore_sigregs32(regs, &frame->uc.uc_mcontext))
                goto badframe;
        if (restore_sigregs_ext32(regs, &frame->uc.uc_mcontext_ext))
 
 #include <asm/page.h>
 #include <asm/sigp.h>
 #include <asm/irq.h>
+#include <asm/fpu-internal.h>
+#include <asm/vx-insn.h>
 
 __PT_R0      = __PT_GPRS
 __PT_R1      = __PT_GPRS + 8
                   _TIF_UPROBE)
 _TIF_TRACE     = (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SECCOMP | \
                   _TIF_SYSCALL_TRACEPOINT)
-_CIF_WORK      = (_CIF_MCCK_PENDING | _CIF_ASCE)
+_CIF_WORK      = (_CIF_MCCK_PENDING | _CIF_ASCE | _CIF_FPU)
 _PIF_WORK      = (_PIF_PER_TRAP)
 
-#define BASED(name) name-system_call(%r13)
+#define BASED(name) name-cleanup_critical(%r13)
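+# %r13 now holds the address of cleanup_critical, loaded at each kernel entry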
 
        .macro  TRACE_IRQS_ON
 #ifdef CONFIG_TRACE_IRQFLAGS
        jo      .Lsysc_sigpending
        tm      __TI_flags+7(%r12),_TIF_NOTIFY_RESUME
        jo      .Lsysc_notify_resume
+       tm      __LC_CPU_FLAGS+7,_CIF_FPU
+       jo      .Lsysc_vxrs
        tm      __LC_CPU_FLAGS+7,_CIF_ASCE
        jo      .Lsysc_uaccess
        j       .Lsysc_return           # beware of critical section cleanup
        lctlg   %c1,%c1,__LC_USER_ASCE          # load primary asce
        j       .Lsysc_return
 
+#
+# CIF_FPU is set, restore floating-point controls and FP/VX registers.
+#
+.Lsysc_vxrs:
+       larl    %r14,.Lsysc_return
+       jg      load_fpu_regs
+
 #
 # _TIF_SIGPENDING is set, call do_signal
 #
        stmg    %r8,%r15,__LC_SAVE_AREA_SYNC
        lg      %r10,__LC_LAST_BREAK
        lg      %r12,__LC_THREAD_INFO
-       larl    %r13,system_call
+       larl    %r13,cleanup_critical
        lmg     %r8,%r9,__LC_PGM_OLD_PSW
        HANDLE_SIE_INTERCEPT %r14,1
        tmhh    %r8,0x0001              # test problem state bit
        stmg    %r8,%r15,__LC_SAVE_AREA_ASYNC
        lg      %r10,__LC_LAST_BREAK
        lg      %r12,__LC_THREAD_INFO
-       larl    %r13,system_call
+       larl    %r13,cleanup_critical
        lmg     %r8,%r9,__LC_IO_OLD_PSW
        HANDLE_SIE_INTERCEPT %r14,2
        SWITCH_ASYNC __LC_SAVE_AREA_ASYNC,__LC_ASYNC_STACK,STACK_SHIFT
        jo      .Lio_sigpending
        tm      __TI_flags+7(%r12),_TIF_NOTIFY_RESUME
        jo      .Lio_notify_resume
+       tm      __LC_CPU_FLAGS+7,_CIF_FPU
+       jo      .Lio_vxrs
        tm      __LC_CPU_FLAGS+7,_CIF_ASCE
        jo      .Lio_uaccess
        j       .Lio_return             # beware of critical section cleanup
        lctlg   %c1,%c1,__LC_USER_ASCE          # load primary asce
        j       .Lio_return
 
+#
+# CIF_FPU is set, restore floating-point controls and FP/VX registers.
+#
+.Lio_vxrs:
+       larl    %r14,.Lio_return
+       jg      load_fpu_regs
+
 #
 # _TIF_NEED_RESCHED is set, call schedule
 #
        stmg    %r8,%r15,__LC_SAVE_AREA_ASYNC
        lg      %r10,__LC_LAST_BREAK
        lg      %r12,__LC_THREAD_INFO
-       larl    %r13,system_call
+       larl    %r13,cleanup_critical
        lmg     %r8,%r9,__LC_EXT_OLD_PSW
        HANDLE_SIE_INTERCEPT %r14,3
        SWITCH_ASYNC __LC_SAVE_AREA_ASYNC,__LC_ASYNC_STACK,STACK_SHIFT
        br      %r14
 .Lpsw_idle_end:
 
+/* Store floating-point controls and floating-point or vector extension
+ * registers.  A critical section cleanup assures that the registers are
+ * stored even if interrupted for some other work.  The register %r2
+ * designates a struct fpu to store the register contents.  If the specified
+ * structure does not contain a register save area, the register store is
+ * omitted (see also the comments in arch_dup_task_struct()).
+ *
+ * The CIF_FPU flag is set in any case; it triggers a lazy restore of the
+ * register contents at system call or io return.
+ */
+ENTRY(save_fpu_regs)
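+       # If CIF_FPU is set, the fpu save area already holds the current state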
+       tm      __LC_CPU_FLAGS+7,_CIF_FPU
+       bor     %r14
+       stfpc   __FPU_fpc(%r2)
+.Lsave_fpu_regs_fpc_end:
+       lg      %r3,__FPU_regs(%r2)
+       ltgr    %r3,%r3
+       jz      .Lsave_fpu_regs_done      # no save area -> set CIF_FPU
+       tm      __FPU_flags+3(%r2),FPU_USE_VX
+       jz      .Lsave_fpu_regs_fp        # no -> store FP regs
+.Lsave_fpu_regs_vx_low:
+       VSTM    %v0,%v15,0,%r3            # vstm 0,15,0(3)
+.Lsave_fpu_regs_vx_high:
+       VSTM    %v16,%v31,256,%r3         # vstm 16,31,256(3)
+       j       .Lsave_fpu_regs_done      # -> set CIF_FPU flag
+.Lsave_fpu_regs_fp:
+       std     0,0(%r3)
+       std     1,8(%r3)
+       std     2,16(%r3)
+       std     3,24(%r3)
+       std     4,32(%r3)
+       std     5,40(%r3)
+       std     6,48(%r3)
+       std     7,56(%r3)
+       std     8,64(%r3)
+       std     9,72(%r3)
+       std     10,80(%r3)
+       std     11,88(%r3)
+       std     12,96(%r3)
+       std     13,104(%r3)
+       std     14,112(%r3)
+       std     15,120(%r3)
+.Lsave_fpu_regs_done:
+       oi      __LC_CPU_FLAGS+7,_CIF_FPU
+       br      %r14
+.Lsave_fpu_regs_end:
+
+/* Load floating-point controls and floating-point or vector extension
+ * registers.  A critical section cleanup assures that the register contents
+ * are loaded even if interrupted for some other work.  Depending on the
+ * saved FP/VX state, the vector-enablement control, CR0.46, is either set
+ * or cleared.
+ *
+ * There are special calling conventions to fit into sysc and io return work:
+ *     %r12:   __LC_THREAD_INFO
+ *     %r15:   <kernel stack>
+ * The function uses %r4 and __SF_EMPTY+32(%r15) as scratch.
+ */
+load_fpu_regs:
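+       # If CIF_FPU is not set, the registers are already loaded; just return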
+       tm      __LC_CPU_FLAGS+7,_CIF_FPU
+       bnor    %r14
+       lg      %r4,__TI_task(%r12)
+       la      %r4,__THREAD_fpu(%r4)
+       lfpc    __FPU_fpc(%r4)
+       stctg   %c0,%c0,__SF_EMPTY+32(%r15)     # store CR0
+       tm      __FPU_flags+3(%r4),FPU_USE_VX   # VX-enabled task ?
+       lg      %r4,__FPU_regs(%r4)             # %r4 <- reg save area
+       jz      .Lload_fpu_regs_fp_ctl          # -> no VX, load FP regs
+.Lload_fpu_regs_vx_ctl:
+       tm      __SF_EMPTY+32+5(%r15),2         # test VX control
+       jo      .Lload_fpu_regs_vx
+       oi      __SF_EMPTY+32+5(%r15),2         # set VX control
+       lctlg   %c0,%c0,__SF_EMPTY+32(%r15)
+.Lload_fpu_regs_vx:
+       VLM     %v0,%v15,0,%r4
+.Lload_fpu_regs_vx_high:
+       VLM     %v16,%v31,256,%r4
+       j       .Lload_fpu_regs_done
+.Lload_fpu_regs_fp_ctl:
+       tm      __SF_EMPTY+32+5(%r15),2         # test VX control
+       jz      .Lload_fpu_regs_fp
+       ni      __SF_EMPTY+32+5(%r15),253       # clear VX control
+       lctlg   %c0,%c0,__SF_EMPTY+32(%r15)
+.Lload_fpu_regs_fp:
+       ld      0,0(%r4)
+       ld      1,8(%r4)
+       ld      2,16(%r4)
+       ld      3,24(%r4)
+       ld      4,32(%r4)
+       ld      5,40(%r4)
+       ld      6,48(%r4)
+       ld      7,56(%r4)
+       ld      8,64(%r4)
+       ld      9,72(%r4)
+       ld      10,80(%r4)
+       ld      11,88(%r4)
+       ld      12,96(%r4)
+       ld      13,104(%r4)
+       ld      14,112(%r4)
+       ld      15,120(%r4)
+.Lload_fpu_regs_done:
+       ni      __LC_CPU_FLAGS+7,255-_CIF_FPU
+       br      %r14
+.Lload_fpu_regs_end:
+
+/* Test and set the vector enablement control in CR0.46 */
+ENTRY(__ctl_set_vx)
+       stctg   %c0,%c0,__SF_EMPTY(%r15)
+       tm      __SF_EMPTY+5(%r15),2
+       bor     %r14
+       oi      __SF_EMPTY+5(%r15),2
+       lctlg   %c0,%c0,__SF_EMPTY(%r15)
+       br      %r14
+.L__ctl_set_vx_end:
+
 .L__critical_end:
 
 /*
        lmg     %r0,%r15,__LC_GPREGS_SAVE_AREA-4095(%r1)# revalidate gprs
        lg      %r10,__LC_LAST_BREAK
        lg      %r12,__LC_THREAD_INFO
-       larl    %r13,system_call
+       larl    %r13,cleanup_critical
        lmg     %r8,%r9,__LC_MCK_OLD_PSW
        HANDLE_SIE_INTERCEPT %r14,4
        tm      __LC_MCCK_CODE,0x80     # system damage?
        .quad   .Lio_done
        .quad   psw_idle
        .quad   .Lpsw_idle_end
+       .quad   save_fpu_regs
+       .quad   .Lsave_fpu_regs_end
+       .quad   load_fpu_regs
+       .quad   .Lload_fpu_regs_end
+       .quad   __ctl_set_vx
+       .quad   .L__ctl_set_vx_end
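+       # boundary addresses used by cleanup_critical to identify the interrupted code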
 
 cleanup_critical:
        clg     %r9,BASED(.Lcleanup_table)      # system_call
        jl      0f
        clg     %r9,BASED(.Lcleanup_table+72)   # .Lpsw_idle_end
        jl      .Lcleanup_idle
+       clg     %r9,BASED(.Lcleanup_table+80)   # save_fpu_regs
+       jl      0f
+       clg     %r9,BASED(.Lcleanup_table+88)   # .Lsave_fpu_regs_end
+       jl      .Lcleanup_save_fpu_regs
+       clg     %r9,BASED(.Lcleanup_table+96)   # load_fpu_regs
+       jl      0f
+       clg     %r9,BASED(.Lcleanup_table+104)  # .Lload_fpu_regs_end
+       jl      .Lcleanup_load_fpu_regs
+       clg     %r9,BASED(.Lcleanup_table+112)  # __ctl_set_vx
+       jl      0f
+       clg     %r9,BASED(.Lcleanup_table+120)  # .L__ctl_set_vx_end
+       jl      .Lcleanup___ctl_set_vx
 0:     br      %r14
 
 
 .Lcleanup_idle_insn:
        .quad   .Lpsw_idle_lpsw
 
+.Lcleanup_save_fpu_regs:
+       tm      __LC_CPU_FLAGS+7,_CIF_FPU
+       bor     %r14
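+       # %r9 holds the address at which save_fpu_regs was interrupted;
+       # compare it against the labels (from the end backwards) and redo
+       # the remaining steps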
+       clg     %r9,BASED(.Lcleanup_save_fpu_regs_done)
+       jhe     5f
+       clg     %r9,BASED(.Lcleanup_save_fpu_regs_fp)
+       jhe     4f
+       clg     %r9,BASED(.Lcleanup_save_fpu_regs_vx_high)
+       jhe     3f
+       clg     %r9,BASED(.Lcleanup_save_fpu_regs_vx_low)
+       jhe     2f
+       clg     %r9,BASED(.Lcleanup_save_fpu_fpc_end)
+       jhe     1f
+0:     # Store floating-point controls
+       stfpc   __FPU_fpc(%r2)
+1:     # Load register save area and check if VX is active
+       lg      %r3,__FPU_regs(%r2)
+       ltgr    %r3,%r3
+       jz      5f                        # no save area -> set CIF_FPU
+       tm      __FPU_flags+3(%r2),FPU_USE_VX
+       jz      4f                        # no VX -> store FP regs
+2:     # Store vector registers (V0-V15)
+       VSTM    %v0,%v15,0,%r3            # vstm 0,15,0(3)
+3:     # Store vector registers (V16-V31)
+       VSTM    %v16,%v31,256,%r3         # vstm 16,31,256(3)
+       j       5f                        # -> done, set CIF_FPU flag
+4:     # Store floating-point registers
+       std     0,0(%r3)
+       std     1,8(%r3)
+       std     2,16(%r3)
+       std     3,24(%r3)
+       std     4,32(%r3)
+       std     5,40(%r3)
+       std     6,48(%r3)
+       std     7,56(%r3)
+       std     8,64(%r3)
+       std     9,72(%r3)
+       std     10,80(%r3)
+       std     11,88(%r3)
+       std     12,96(%r3)
+       std     13,104(%r3)
+       std     14,112(%r3)
+       std     15,120(%r3)
+5:     # Set CIF_FPU flag
+       oi      __LC_CPU_FLAGS+7,_CIF_FPU
+       lg      %r9,48(%r11)            # return from save_fpu_regs
+       br      %r14
+.Lcleanup_save_fpu_fpc_end:
+       .quad   .Lsave_fpu_regs_fpc_end
+.Lcleanup_save_fpu_regs_vx_low:
+       .quad   .Lsave_fpu_regs_vx_low
+.Lcleanup_save_fpu_regs_vx_high:
+       .quad   .Lsave_fpu_regs_vx_high
+.Lcleanup_save_fpu_regs_fp:
+       .quad   .Lsave_fpu_regs_fp
+.Lcleanup_save_fpu_regs_done:
+       .quad   .Lsave_fpu_regs_done
+
+.Lcleanup_load_fpu_regs:
+       tm      __LC_CPU_FLAGS+7,_CIF_FPU
+       bnor    %r14
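+       # %r9 holds the address at which load_fpu_regs was interrupted;
+       # compare it against the labels (from the end backwards) and redo
+       # the remaining steps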
+       clg     %r9,BASED(.Lcleanup_load_fpu_regs_done)
+       jhe     1f
+       clg     %r9,BASED(.Lcleanup_load_fpu_regs_fp)
+       jhe     2f
+       clg     %r9,BASED(.Lcleanup_load_fpu_regs_fp_ctl)
+       jhe     3f
+       clg     %r9,BASED(.Lcleanup_load_fpu_regs_vx_high)
+       jhe     4f
+       clg     %r9,BASED(.Lcleanup_load_fpu_regs_vx)
+       jhe     5f
+       clg     %r9,BASED(.Lcleanup_load_fpu_regs_vx_ctl)
+       jhe     6f
+       lg      %r4,__TI_task(%r12)
+       la      %r4,__THREAD_fpu(%r4)
+       lfpc    __FPU_fpc(%r4)
+       tm      __FPU_flags+3(%r4),FPU_USE_VX   # VX-enabled task ?
+       lg      %r4,__FPU_regs(%r4)             # %r4 <- reg save area
+       jz      3f                              # -> no VX, load FP regs
+6:     # Set VX-enablement control
+       stctg   %c0,%c0,__SF_EMPTY+32(%r15)     # store CR0
+       tm      __SF_EMPTY+32+5(%r15),2         # test VX control
+       jo      5f
+       oi      __SF_EMPTY+32+5(%r15),2         # set VX control
+       lctlg   %c0,%c0,__SF_EMPTY+32(%r15)
+5:     # Load V0..V15 registers
+       VLM     %v0,%v15,0,%r4
+4:     # Load V16..V31 registers
+       VLM     %v16,%v31,256,%r4
+       j       1f
+3:     # Clear VX-enablement control for FP
+       stctg   %c0,%c0,__SF_EMPTY+32(%r15)     # store CR0
+       tm      __SF_EMPTY+32+5(%r15),2         # test VX control
+       jz      2f
+       ni      __SF_EMPTY+32+5(%r15),253       # clear VX control
+       lctlg   %c0,%c0,__SF_EMPTY+32(%r15)
+2:     # Load floating-point registers
+       ld      0,0(%r4)
+       ld      1,8(%r4)
+       ld      2,16(%r4)
+       ld      3,24(%r4)
+       ld      4,32(%r4)
+       ld      5,40(%r4)
+       ld      6,48(%r4)
+       ld      7,56(%r4)
+       ld      8,64(%r4)
+       ld      9,72(%r4)
+       ld      10,80(%r4)
+       ld      11,88(%r4)
+       ld      12,96(%r4)
+       ld      13,104(%r4)
+       ld      14,112(%r4)
+       ld      15,120(%r4)
+1:     # Clear CIF_FPU bit
+       ni      __LC_CPU_FLAGS+7,255-_CIF_FPU
+       lg      %r9,48(%r11)            # return from load_fpu_regs
+       br      %r14
+.Lcleanup_load_fpu_regs_vx_ctl:
+       .quad   .Lload_fpu_regs_vx_ctl
+.Lcleanup_load_fpu_regs_vx:
+       .quad   .Lload_fpu_regs_vx
+.Lcleanup_load_fpu_regs_vx_high:
+       .quad   .Lload_fpu_regs_vx_high
+.Lcleanup_load_fpu_regs_fp_ctl:
+       .quad   .Lload_fpu_regs_fp_ctl
+.Lcleanup_load_fpu_regs_fp:
+       .quad   .Lload_fpu_regs_fp
+.Lcleanup_load_fpu_regs_done:
+       .quad   .Lload_fpu_regs_done
+
+.Lcleanup___ctl_set_vx:
+       stctg   %c0,%c0,__SF_EMPTY(%r15)
+       tm      __SF_EMPTY+5(%r15),2
+       bor     %r14
+       oi      __SF_EMPTY+5(%r15),2
+       lctlg   %c0,%c0,__SF_EMPTY(%r15)
+       lg      %r9,48(%r11)            # return from __ctl_set_vx
+       br      %r14
+
 /*
  * Integer constants
  */
        stg     %r2,__SF_EMPTY(%r15)            # save control block pointer
        stg     %r3,__SF_EMPTY+8(%r15)          # save guest register save area
        xc      __SF_EMPTY+16(16,%r15),__SF_EMPTY+16(%r15) # host id & reason
+       tm      __LC_CPU_FLAGS+7,_CIF_FPU       # load guest fp/vx registers ?
+       jno     .Lsie_load_guest_gprs
+       lg      %r12,__LC_THREAD_INFO           # load_fpu_regs needs thread_info in %r12
+       brasl   %r14,load_fpu_regs              # load guest fp/vx regs
+.Lsie_load_guest_gprs:
        lmg     %r0,%r13,0(%r3)                 # load guest gprs 0-13
        lg      %r14,__LC_GMAP                  # get gmap pointer
        ltgr    %r14,%r14
        oi      __SIE_PROG0C+3(%r14),1          # we are going into SIE now
        tm      __SIE_PROG20+3(%r14),3          # last exit...
        jnz     .Lsie_done
+       tm      __LC_CPU_FLAGS+7,_CIF_FPU
+       jo      .Lsie_done                      # exit if fp/vx regs changed
        LPP     __SF_EMPTY(%r15)                # set guest id
        sie     0(%r14)
 .Lsie_done:
 
                cr0.val = S390_lowcore.cregs_save_area[0];
                cr0.afp = cr0.vx = 1;
                __ctl_load(cr0.val, 0, 0);
-               restore_vx_regs((__vector128 *)
-                               &S390_lowcore.vector_save_area);
+               asm volatile(
+                       "       la      1,%0\n"
+                       "       .word   0xe70f,0x1000,0x0036\n" /* vlm 0,15,0(1) */
+                       "       .word   0xe70f,0x1100,0x0c36\n" /* vlm 16,31,256(1) */
+                       : : "Q" (*(struct vx_array *)
+                                &S390_lowcore.vector_save_area) : "1");
                __ctl_load(S390_lowcore.cregs_save_area[0], 0, 0);
        }
        /* Revalidate access registers */
 
        *dst = *src;
 
        /* Set up a new floating-point register save area */
+       dst->thread.fpu.fpc = 0;
+       dst->thread.fpu.flags = 0;      /* Always start with VX disabled */
        dst->thread.fpu.fprs = kzalloc(sizeof(freg_t) * __NUM_FPRS,
                                       GFP_KERNEL|__GFP_REPEAT);
        if (!dst->thread.fpu.fprs)
                return -ENOMEM;
 
-       /* Save the fpu registers to new thread structure. */
-       save_fp_ctl(&dst->thread.fpu.fpc);
-       save_fp_regs(dst->thread.fpu.fprs);
-       dst->thread.fpu.flags = 0;     /* Always start with VX disabled */
-
+       /*
+        * Save the floating-point or vector register state of the current
+        * task.  The state is not saved for early kernel threads, for example,
+        * the init_task, which do not have an allocated save area.
+        * The CIF_FPU flag is set in any case to lazily clear or restore a
+        * saved state when switching to a different task or returning to
+        * user space.
+        */
+       save_fpu_regs(&current->thread.fpu);
+       dst->thread.fpu.fpc = current->thread.fpu.fpc;
+       if (is_vx_task(current))
+               convert_vx_to_fp(dst->thread.fpu.fprs,
+                                current->thread.fpu.vxrs);
+       else
+               memcpy(dst->thread.fpu.fprs, current->thread.fpu.fprs,
+                      sizeof(freg_t) * __NUM_FPRS);
        return 0;
 }
 
  */
 int dump_fpu (struct pt_regs * regs, s390_fp_regs *fpregs)
 {
-       save_fp_ctl(&fpregs->fpc);
-       save_fp_regs(fpregs->fprs);
+       save_fpu_regs(&current->thread.fpu);
+       fpregs->fpc = current->thread.fpu.fpc;
+       fpregs->pad = 0;
+       if (is_vx_task(current))
+               convert_vx_to_fp((freg_t *)&fpregs->fprs,
+                                current->thread.fpu.vxrs);
+       else
+               memcpy(&fpregs->fprs, current->thread.fpu.fprs,
+                      sizeof(fpregs->fprs));
        return 1;
 }
 EXPORT_SYMBOL(dump_fpu);
 
        struct per_regs old, new;
 
        /* Take care of the enable/disable of transactional execution. */
-       if (MACHINE_HAS_TE || MACHINE_HAS_VX) {
+       if (MACHINE_HAS_TE) {
                unsigned long cr, cr_new;
 
                __ctl_store(cr, 0, 0);
-               cr_new = cr;
-               if (MACHINE_HAS_TE) {
-                       /* Set or clear transaction execution TXC bit 8. */
-                       cr_new |= (1UL << 55);
-                       if (task->thread.per_flags & PER_FLAG_NO_TE)
-                               cr_new &= ~(1UL << 55);
-               }
-               if (MACHINE_HAS_VX) {
-                       /* Enable/disable of vector extension */
-                       cr_new &= ~(1UL << 17);
-                       if (task->thread.fpu.vxrs)
-                               cr_new |= (1UL << 17);
-               }
+               /* Set or clear transaction execution TXC bit 8. */
+               cr_new = cr | (1UL << 55);
+               if (task->thread.per_flags & PER_FLAG_NO_TE)
+                       cr_new &= ~(1UL << 55);
                if (cr_new != cr)
                        __ctl_load(cr_new, 0, 0);
-               if (MACHINE_HAS_TE) {
-                       /* Set/clear transaction execution TDC bits 62/63. */
-                       __ctl_store(cr, 2, 2);
-                       cr_new = cr & ~3UL;
-                       if (task->thread.per_flags & PER_FLAG_TE_ABORT_RAND) {
-                               if (task->thread.per_flags &
-                                   PER_FLAG_TE_ABORT_RAND_TEND)
-                                       cr_new |= 1UL;
-                               else
-                                       cr_new |= 2UL;
-                       }
-                       if (cr_new != cr)
-                               __ctl_load(cr_new, 2, 2);
+               /* Set or clear transaction execution TDC bits 62 and 63. */
+               __ctl_store(cr, 2, 2);
+               cr_new = cr & ~3UL;
+               if (task->thread.per_flags & PER_FLAG_TE_ABORT_RAND) {
+                       if (task->thread.per_flags & PER_FLAG_TE_ABORT_RAND_TEND)
+                               cr_new |= 1UL;
+                       else
+                               cr_new |= 2UL;
                }
+               if (cr_new != cr)
+                       __ctl_load(cr_new, 2, 2);
        }
        /* Copy user specified PER registers */
        new.control = thread->per_user.control;
        else
                memcpy(target->thread.fpu.fprs, &fprs, sizeof(fprs));
 
-       if (target == current)
-               restore_fpu_regs(&target->thread.fpu);
-
        return rc;
 }
 
                save_fpu_regs(&target->thread.fpu);
 
        rc = user_regset_copyin(&pos, &count, &kbuf, &ubuf, vxrs, 0, -1);
-       if (rc == 0) {
+       if (rc == 0)
                for (i = 0; i < __NUM_VXRS_LOW; i++)
                        *((__u64 *)(target->thread.fpu.vxrs + i) + 1) = vxrs[i];
-               if (target == current)
-                       restore_fpu_regs(&target->thread.fpu);
-       }
 
        return rc;
 }
 
        rc = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
                                target->thread.fpu.vxrs + __NUM_VXRS_LOW, 0, -1);
-       if (rc == 0 && target == current)
-               restore_vx_regs(target->thread.fpu.vxrs);
-
        return rc;
 }
 
 
 #include <linux/module.h>
 #include <linux/kvm_host.h>
+#include <asm/fpu-internal.h>
 #include <asm/ftrace.h>
 
 #ifdef CONFIG_FUNCTION_TRACER
 #if IS_ENABLED(CONFIG_KVM)
 EXPORT_SYMBOL(sie64a);
 EXPORT_SYMBOL(sie_exit);
+EXPORT_SYMBOL(save_fpu_regs);
+EXPORT_SYMBOL(__ctl_set_vx);
 #endif
 EXPORT_SYMBOL(memcpy);
 EXPORT_SYMBOL(memset);
 
 static void load_sigregs(void)
 {
        restore_access_regs(current->thread.acrs);
-       restore_fpu_regs(&current->thread.fpu);
 }
 
 /* Returns non-zero on fault. */
        if (__copy_from_user(&set.sig, &frame->sc.oldmask, _SIGMASK_COPY_SIZE))
                goto badframe;
        set_current_blocked(&set);
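+       /* save_fpu_regs() sets CIF_FPU: the FP/VX state restored from the
+        * signal frame below is then loaded lazily on return to user space */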
+       save_fpu_regs(&current->thread.fpu);
        if (restore_sigregs(regs, &frame->sregs))
                goto badframe;
        if (restore_sigregs_ext(regs, &frame->sregs_ext))
        set_current_blocked(&set);
        if (restore_altstack(&frame->uc.uc_stack))
                goto badframe;
+       save_fpu_regs(&current->thread.fpu);
        if (restore_sigregs(regs, &frame->uc.uc_mcontext))
                goto badframe;
        if (restore_sigregs_ext(regs, &frame->uc.uc_mcontext_ext))
 
 DO_ERROR_INFO(transaction_exception, SIGILL, ILL_ILLOPN,
              "transaction constraint exception")
 
-static inline void do_fp_trap(struct pt_regs *regs, int fpc)
+static inline void do_fp_trap(struct pt_regs *regs, __u32 fpc)
 {
        int si_code = 0;
        /* FPC[2] is Data Exception Code */
                return -ENOMEM;
        preempt_disable();
        if (tsk == current)
-               save_fp_regs(tsk->thread.fpu.fprs);
+               save_fpu_regs(&tsk->thread.fpu);
        /* Copy the 16 floating point registers */
        convert_fp_to_vx(vxrs, tsk->thread.fpu.fprs);
        fprs = tsk->thread.fpu.fprs;
        tsk->thread.fpu.vxrs = vxrs;
        tsk->thread.fpu.flags |= FPU_USE_VX;
        kfree(fprs);
-       if (tsk == current) {
-               __ctl_set_bit(0, 17);
-               restore_vx_regs(vxrs);
-       }
        preempt_enable();
        return 0;
 }
        }
 
        /* get vector interrupt code from fpc */
-       asm volatile("stfpc %0" : "=Q" (current->thread.fpu.fpc));
+       save_fpu_regs(&current->thread.fpu);
        vic = (current->thread.fpu.fpc & 0xf00) >> 8;
        switch (vic) {
        case 1: /* invalid vector operation */
 
        location = get_trap_ip(regs);
 
-       asm volatile("stfpc %0" : "=Q" (current->thread.fpu.fpc));
+       save_fpu_regs(&current->thread.fpu);
        /* Check for vector register enablement */
        if (MACHINE_HAS_VX && !is_vx_task(current) &&
            (current->thread.fpu.fpc & FPC_DXC_MASK) == 0xfe00) {
 
        return 0;
 }
 
+/*
+ * Backs up the current FP/VX register save area to the given
+ * destination.  Used to switch between different register save
+ * areas.
+ */
+static inline void save_fpu_to(struct fpu *dst)
+{
+       dst->fpc = current->thread.fpu.fpc;
+       dst->flags = current->thread.fpu.flags;
+       dst->regs = current->thread.fpu.regs;
+}
+
+/*
+ * Switches the FP/VX register save area from which to lazily
+ * restore the register contents.
+ */
+static inline void load_fpu_from(struct fpu *from)
+{
+       current->thread.fpu.fpc = from->fpc;
+       current->thread.fpu.flags = from->flags;
+       current->thread.fpu.regs = from->regs;
+}
+
 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
-       __u32 fpc;
+       /* Save host register state */
+       save_fpu_regs(&current->thread.fpu);
+       save_fpu_to(&vcpu->arch.host_fpregs);
 
-       save_fp_ctl(&vcpu->arch.host_fpregs.fpc);
-       if (test_kvm_facility(vcpu->kvm, 129))
-               save_vx_regs((__vector128 *)&vcpu->arch.host_vregs->vrs);
-       else
-               save_fp_regs(vcpu->arch.host_fpregs.fprs);
-       save_access_regs(vcpu->arch.host_acrs);
        if (test_kvm_facility(vcpu->kvm, 129)) {
-               fpc = vcpu->run->s.regs.fpc;
-               restore_vx_regs((__vector128 *)&vcpu->run->s.regs.vrs);
-       } else {
-               fpc = vcpu->arch.guest_fpregs.fpc;
-               restore_fp_regs(vcpu->arch.guest_fpregs.fprs);
-       }
-       if (test_fp_ctl(fpc))
+               current->thread.fpu.fpc = vcpu->run->s.regs.fpc;
+               current->thread.fpu.flags = FPU_USE_VX;
+               /*
+                * Use the register save area in the SIE-control block
+                * for register restore and save in kvm_arch_vcpu_put()
+                */
+               current->thread.fpu.vxrs =
+                       (__vector128 *)&vcpu->run->s.regs.vrs;
+               /* Always enable the vector extension for KVM */
+               __ctl_set_vx();
+       } else
+               load_fpu_from(&vcpu->arch.guest_fpregs);
+
+       if (test_fp_ctl(current->thread.fpu.fpc))
                /* User space provided an invalid FPC, let's clear it */
-               fpc = 0;
-       restore_fp_ctl(&fpc);
+               current->thread.fpu.fpc = 0;
+
+       save_access_regs(vcpu->arch.host_acrs);
        restore_access_regs(vcpu->run->s.regs.acrs);
        gmap_enable(vcpu->arch.gmap);
        atomic_set_mask(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags);
 {
        atomic_clear_mask(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags);
        gmap_disable(vcpu->arch.gmap);
-       if (test_kvm_facility(vcpu->kvm, 129)) {
-               save_fp_ctl(&vcpu->run->s.regs.fpc);
-               save_vx_regs((__vector128 *)&vcpu->run->s.regs.vrs);
-       } else {
-               save_fp_ctl(&vcpu->arch.guest_fpregs.fpc);
-               save_fp_regs(vcpu->arch.guest_fpregs.fprs);
-       }
-       save_access_regs(vcpu->run->s.regs.acrs);
-       restore_fp_ctl(&vcpu->arch.host_fpregs.fpc);
+
+       save_fpu_regs(&current->thread.fpu);
+
        if (test_kvm_facility(vcpu->kvm, 129))
-               restore_vx_regs((__vector128 *)&vcpu->arch.host_vregs->vrs);
+               /*
+                * kvm_arch_vcpu_load() pointed the register save area at
+                * vcpu->run->s.regs.vrs, so the vector registers are already
+                * saved there.  Only the floating-point control must be
+                * copied.
+                */
+               vcpu->run->s.regs.fpc = current->thread.fpu.fpc;
        else
-               restore_fp_regs(vcpu->arch.host_fpregs.fprs);
+               save_fpu_to(&vcpu->arch.guest_fpregs);
+       load_fpu_from(&vcpu->arch.host_fpregs);
+
+       save_access_regs(vcpu->run->s.regs.acrs);
        restore_access_regs(vcpu->arch.host_acrs);
 }
 
 
        vcpu->arch.sie_block = &sie_page->sie_block;
        vcpu->arch.sie_block->itdba = (unsigned long) &sie_page->itdb;
-       vcpu->arch.host_vregs = &sie_page->vregs;
 
        vcpu->arch.sie_block->icpua = id;
        if (!kvm_is_ucontrol(kvm)) {
        vcpu->arch.local_int.wq = &vcpu->wq;
        vcpu->arch.local_int.cpuflags = &vcpu->arch.sie_block->cpuflags;
 
+       /*
+        * Allocate a save area for floating-point registers.  If the vector
+        * extension is available, register contents are saved in the SIE
+        * control block.  The allocated save area is still required in
+        * particular places, for example, in kvm_s390_vcpu_store_status().
+        */
+       vcpu->arch.guest_fpregs.fprs = kzalloc(sizeof(freg_t) * __NUM_FPRS,
+                                              GFP_KERNEL);
+       if (!vcpu->arch.guest_fpregs.fprs) {
+               rc = -ENOMEM;
+               goto out_free_sie_block;
+       }
+
        rc = kvm_vcpu_init(vcpu, kvm, id);
        if (rc)
                goto out_free_sie_block;
 {
        if (test_fp_ctl(fpu->fpc))
                return -EINVAL;
-       memcpy(&vcpu->arch.guest_fpregs.fprs, &fpu->fprs, sizeof(fpu->fprs));
+       memcpy(vcpu->arch.guest_fpregs.fprs, &fpu->fprs, sizeof(fpu->fprs));
        vcpu->arch.guest_fpregs.fpc = fpu->fpc;
-       restore_fp_ctl(&vcpu->arch.guest_fpregs.fpc);
-       restore_fp_regs(vcpu->arch.guest_fpregs.fprs);
+       save_fpu_regs(&current->thread.fpu);
+       load_fpu_from(&vcpu->arch.guest_fpregs);
        return 0;
 }
 
 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 {
-       memcpy(&fpu->fprs, &vcpu->arch.guest_fpregs.fprs, sizeof(fpu->fprs));
+       memcpy(&fpu->fprs, vcpu->arch.guest_fpregs.fprs, sizeof(fpu->fprs));
        fpu->fpc = vcpu->arch.guest_fpregs.fpc;
        return 0;
 }
         * copying in vcpu load/put. Let's update our copies before we save
         * it into the save area.
         */
-       save_fp_ctl(&vcpu->arch.guest_fpregs.fpc);
-       save_fp_regs(vcpu->arch.guest_fpregs.fprs);
+       save_fpu_regs(&current->thread.fpu);
+       if (test_kvm_facility(vcpu->kvm, 129)) {
+               /*
+                * If the vector extension is available, the vector registers
+                * which overlaps with floating-point registers are saved in
+                * the SIE-control block.  Hence, extract the floating-point
+                * registers and the FPC value and store them in the
+                * guest_fpregs structure.
+                */
+               WARN_ON(!is_vx_task(current));    /* XXX remove later */
+               vcpu->arch.guest_fpregs.fpc = current->thread.fpu.fpc;
+               convert_vx_to_fp(vcpu->arch.guest_fpregs.fprs,
+                                current->thread.fpu.vxrs);
+       } else
+               save_fpu_to(&vcpu->arch.guest_fpregs);
        save_access_regs(vcpu->run->s.regs.acrs);
 
        return kvm_s390_store_status_unloaded(vcpu, addr);
 
        /*
         * The guest VXRS are in the host VXRS due to the lazy
-        * copying in vcpu load/put. Let's update our copies before we save
-        * it into the save area.
+        * copying in vcpu load/put. We can simply call save_fpu_regs()
+        * to save the current register state because we are in the
+        * middle of a load/put cycle.
+        *
+        * Let's update our copies before we save them into the save area.
         */
-       save_vx_regs((__vector128 *)&vcpu->run->s.regs.vrs);
+       save_fpu_regs(&current->thread.fpu);
 
        return kvm_s390_store_adtl_status_unloaded(vcpu, addr);
 }