www.infradead.org Git - users/dwmw2/linux.git/commitdiff
KVM: x86/xen: Fix runstate updates to be atomic when preempting vCPU
author David Woodhouse <dwmw@amazon.co.uk>
Sat, 23 Oct 2021 16:41:05 +0000 (16:41 +0000)
committer David Woodhouse <dwmw@amazon.co.uk>
Wed, 9 Feb 2022 20:20:22 +0000 (20:20 +0000)
There are circumstances when kvm_xen_update_runstate_guest() should not
sleep because it ends up being called from __schedule() when the vCPU
is preempted:

[  222.830825]  kvm_xen_update_runstate_guest+0x24/0x100
[  222.830878]  kvm_arch_vcpu_put+0x14c/0x200
[  222.830920]  kvm_sched_out+0x30/0x40
[  222.830960]  __schedule+0x55c/0x9f0

To handle this, make it use the same trick as __kvm_xen_has_interrupt(),
of using the hva from the gfn_to_hva_cache directly. Then it can use
pagefault_disable() around the accesses and just bail out if the page
is absent (which is unlikely).

After first looking at this, there followed a long path of discovery
which culminated in removing the existing gfn_to_pfn_cache and replacing
it with something more fit for purpose in 5.17. A future patch can
convert the runstate code to use that, but this simpler fix can be
applied more easily to the older stable kernels.

Fixes: 30b5c851af79 ("KVM: x86/xen: Add support for vCPU runstate information")
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Cc: stable@vger.kernel.org
arch/x86/kvm/xen.c

index bad57535fad0825608c91e9cdd590c9fc0040a42..39b319f428bc03c4ad30d5b0df3d1a18d920f79a 100644 (file)
@@ -133,36 +133,60 @@ static void kvm_xen_update_runstate(struct kvm_vcpu *v, int state)
 void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
 {
        struct kvm_vcpu_xen *vx = &v->arch.xen;
+       struct gfn_to_hva_cache *ghc = &vx->runstate_cache;
+       struct kvm_memslots *slots = kvm_memslots(v->kvm);
+       bool atomic = (state == RUNSTATE_runnable);
        uint64_t state_entry_time;
-       unsigned int offset;
+       int __user *user_state;
+       uint64_t __user *user_times;
 
        kvm_xen_update_runstate(v, state);
 
        if (!vx->runstate_set)
                return;
 
-       BUILD_BUG_ON(sizeof(struct compat_vcpu_runstate_info) != 0x2c);
+       if (unlikely(slots->generation != ghc->generation || kvm_is_error_hva(ghc->hva)) &&
+           kvm_gfn_to_hva_cache_init(v->kvm, ghc, ghc->gpa, ghc->len))
+               return;
+
+       /* We made sure it fits in a single page */
+       BUG_ON(!ghc->memslot);
+
+       if (atomic)
+               pagefault_disable();
 
-       offset = offsetof(struct compat_vcpu_runstate_info, state_entry_time);
-#ifdef CONFIG_X86_64
        /*
-        * The only difference is alignment of uint64_t in 32-bit.
-        * So the first field 'state' is accessed directly using
-        * offsetof() (where its offset happens to be zero), while the
-        * remaining fields which are all uint64_t, start at 'offset'
-        * which we tweak here by adding 4.
+        * The only difference between 32-bit and 64-bit versions of the
+        * runstate struct is the alignment of uint64_t in 32-bit, which
+        * means that the 64-bit version has an additional 4 bytes of
+        * padding after the first field 'state'.
+        *
+        * So we use 'int __user *user_state' to point to the state field,
+        * and 'uint64_t __user *user_times' for runstate_entry_time. So
+        * the actual array of time[] in each state starts at user_times[1].
         */
+       BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) != 0);
+       BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state) != 0);
+       user_state = (int __user *)ghc->hva;
+
+       BUILD_BUG_ON(sizeof(struct compat_vcpu_runstate_info) != 0x2c);
+
+       user_times = (uint64_t __user *)(ghc->hva +
+                                        offsetof(struct compat_vcpu_runstate_info,
+                                                 state_entry_time));
+#ifdef CONFIG_X86_64
        BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) !=
                     offsetof(struct compat_vcpu_runstate_info, state_entry_time) + 4);
        BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, time) !=
                     offsetof(struct compat_vcpu_runstate_info, time) + 4);
 
        if (v->kvm->arch.xen.long_mode)
-               offset = offsetof(struct vcpu_runstate_info, state_entry_time);
+               user_times = (uint64_t __user *)(ghc->hva +
+                                                offsetof(struct vcpu_runstate_info,
+                                                         state_entry_time));
 #endif
        /*
-        * First write the updated state_entry_time at the appropriate
-        * location determined by 'offset'.
+        * First write the updated state_entry_time to the guest area.
         */
        state_entry_time = vx->runstate_entry_time;
        state_entry_time |= XEN_RUNSTATE_UPDATE;
@@ -172,28 +196,21 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
        BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state_entry_time) !=
                     sizeof(state_entry_time));
 
-       if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache,
-                                         &state_entry_time, offset,
-                                         sizeof(state_entry_time)))
-               return;
+       if (__put_user(state_entry_time, user_times))
+               goto out;
        smp_wmb();
 
        /*
         * Next, write the new runstate. This is in the *same* place
         * for 32-bit and 64-bit guests, asserted here for paranoia.
         */
-       BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) !=
-                    offsetof(struct compat_vcpu_runstate_info, state));
        BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state) !=
                     sizeof(vx->current_runstate));
        BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state) !=
                     sizeof(vx->current_runstate));
 
-       if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache,
-                                         &vx->current_runstate,
-                                         offsetof(struct vcpu_runstate_info, state),
-                                         sizeof(vx->current_runstate)))
-               return;
+       if (__put_user(vx->current_runstate, user_state))
+               goto out;
 
        /*
         * Write the actual runstate times immediately after the
@@ -208,24 +225,23 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
        BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) !=
                     sizeof(vx->runstate_times));
 
-       if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache,
-                                         &vx->runstate_times[0],
-                                         offset + sizeof(u64),
-                                         sizeof(vx->runstate_times)))
-               return;
-
+       if (__copy_to_user(user_times + 1, vx->runstate_times, sizeof(vx->runstate_times)))
+               goto out;
        smp_wmb();
 
        /*
         * Finally, clear the XEN_RUNSTATE_UPDATE bit in the guest's
         * runstate_entry_time field.
         */
-
        state_entry_time &= ~XEN_RUNSTATE_UPDATE;
-       if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache,
-                                         &state_entry_time, offset,
-                                         sizeof(state_entry_time)))
-               return;
+       __put_user(state_entry_time, user_times);
+       smp_wmb();
+
+ out:
+       mark_page_dirty_in_slot(v->kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT);
+
+       if (atomic)
+               pagefault_enable();
 }
 
 int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
@@ -443,6 +459,12 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
                        break;
                }
 
+               /* It must fit within a single page */
+               if ((data->u.gpa & ~PAGE_MASK) + sizeof(struct vcpu_info) > PAGE_SIZE) {
+                       r = -EINVAL;
+                       break;
+               }
+
                r = kvm_gfn_to_hva_cache_init(vcpu->kvm,
                                              &vcpu->arch.xen.vcpu_info_cache,
                                              data->u.gpa,
@@ -460,6 +482,12 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
                        break;
                }
 
+               /* It must fit within a single page */
+               if ((data->u.gpa & ~PAGE_MASK) + sizeof(struct pvclock_vcpu_time_info) > PAGE_SIZE) {
+                       r = -EINVAL;
+                       break;
+               }
+
                r = kvm_gfn_to_hva_cache_init(vcpu->kvm,
                                              &vcpu->arch.xen.vcpu_time_info_cache,
                                              data->u.gpa,
@@ -481,6 +509,12 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
                        break;
                }
 
+               /* It must fit within a single page */
+               if ((data->u.gpa & ~PAGE_MASK) + sizeof(struct vcpu_runstate_info) > PAGE_SIZE) {
+                       r = -EINVAL;
+                       break;
+               }
+
                r = kvm_gfn_to_hva_cache_init(vcpu->kvm,
                                              &vcpu->arch.xen.runstate_cache,
                                              data->u.gpa,