KVM: x86/xen: Cope with lack of alignment constraints on runstate area
author David Woodhouse <dwmw@amazon.co.uk>
Fri, 18 Nov 2022 14:32:38 +0000 (14:32 +0000)
committer David Woodhouse <dwmw@amazon.co.uk>
Fri, 18 Nov 2022 15:03:52 +0000 (15:03 +0000)
The guest runstate area can be arbitrarily byte-aligned. In fact, even
when a sane 32-bit guest aligns the overall structure nicely, the 64-bit
fields in the structure end up unaligned because the 32-bit ABI only
aligns them to 32 bits.

So setting the ->state_entry_time field to something|XEN_RUNSTATE_UPDATE
is buggy, because if it's unaligned then we can't update the whole field
atomically; the low bytes might be observable before the _UPDATE bit is.
Xen actually updates the *byte* containing that top bit, on its own. KVM
should do the same.
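
Concretely, on little-endian x86 the XEN_RUNSTATE_UPDATE flag (bit 63) lives
entirely in the last byte of state_entry_time, so it can be set and cleared
with plain byte stores around the bulk update. A minimal sketch of that
pattern ('khva' and 'entry_time' are stand-ins for the mapped field and its
new value, not names from this patch):

    u8 *update_bit = (u8 *)khva + sizeof(u64) - 1;  /* top byte of the field */

    *update_bit = (entry_time | XEN_RUNSTATE_UPDATE) >> 56;  /* set bit 63 */
    smp_wmb();      /* the flag must be visible before the data */

    /* ... write state, the full state_entry_time and the time[] array ... */

    smp_wmb();      /* the data must be visible before the clear */
    *update_bit = entry_time >> 56;                 /* clear bit 63 again */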

In addition, we cannot assume that the runstate area fits within a single
page. One option might be to make the gfn_to_pfn cache cope with regions
that cross a page — but getting a contiguous virtual kernel mapping of a
discontiguous set of IOMEM pages is a distinctly non-trivial exercise,
and it seems this is the *only* current use case for the GPC which would
benefit from it.

An earlier version of the runstate code did use a gfn_to_hva cache for
this purpose, but it still had the single-page restriction because it
used the uhva directly: it needs to be able to do the update atomically
when the vCPU is being scheduled out, so it used pagefault_disable()
around the accesses rather than kvm_write_guest_cached(), which has a
fallback path.

So... use a pair of GPCs for the first and potential second page covering
the runstate area. We can get away with locking both at once because
nothing else takes more than one GPC lock at a time, so we can invent
a trivial ordering rule.
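
The resulting locking shape is just "gpc1, then gpc2 if it exists", released
in reverse order. A simplified sketch of the shape only, omitting the
check/refresh retry loops:

    read_lock_irqsave(&gpc1->lock, flags);  /* first page: always taken */
    if (user_len2)
        read_lock(&gpc2->lock);             /* second page: only if it exists */

    /* ... validate both caches, then write out the runstate data ... */

    if (user_len2)
        read_unlock(&gpc2->lock);           /* unlock in reverse order */
    read_unlock_irqrestore(&gpc1->lock, flags);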

Keep the trivial fast path for the common case where it's all in the
same page, but fix it to use a byte access for the XEN_RUNSTATE_UPDATE
bit. And in the cross-page case, build the structure locally on the
stack and then copy it over with two memcpy() calls, again handling
the XEN_RUNSTATE_UPDATE bit through a single byte access.
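
In rough terms the cross-page copy boils down to a split determined by how
much of the area fits in the first page (simplified sketch; the real code
copies from rs_state, which for a compat guest starts 4 bytes into the
on-stack struct to compensate for the 64-bit padding):

    size_t user_len1 = PAGE_SIZE - (gpa & ~PAGE_MASK);  /* bytes in first page */
    size_t user_len2 = user_len - user_len1;            /* remainder in second */

    /* 'rs' was built on the stack with XEN_RUNSTATE_UPDATE already set */
    memcpy(gpc1->khva, &rs, user_len1);
    memcpy(gpc2->khva, (u8 *)&rs + user_len1, user_len2);
    smp_wmb();
    /* ... then clear the update bit through its single byte, as above ... */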

And add some test cases. (TBD)

My brain hurts.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
arch/x86/include/asm/kvm_host.h
arch/x86/kvm/xen.c

index 81114a376c4ee7041845f181f657f47ceb6ce8f1..3fc08f416aa35d7f685e6dcee7dacfa3eea19309 100644
@@ -647,6 +647,7 @@ struct kvm_vcpu_xen {
        struct gfn_to_pfn_cache vcpu_info_cache;
        struct gfn_to_pfn_cache vcpu_time_info_cache;
        struct gfn_to_pfn_cache runstate_cache;
+       struct gfn_to_pfn_cache runstate2_cache;
        u64 last_steal;
        u64 runstate_entry_time;
        u64 runstate_times[4];
index 4b8e9628fbf57396b91cab1b61d773b2ed9bdcfb..3e705d871c010bd3d66479988c6b931ed78cf251 100644
@@ -201,46 +201,25 @@ static void kvm_xen_update_runstate(struct kvm_vcpu *v, int state)
 void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
 {
        struct kvm_vcpu_xen *vx = &v->arch.xen;
-       struct gfn_to_pfn_cache *gpc = &vx->runstate_cache;
-       uint64_t *user_times;
+       struct gfn_to_pfn_cache *gpc1 = &vx->runstate_cache;
+       struct gfn_to_pfn_cache *gpc2 = &vx->runstate2_cache;
        unsigned long flags;
-       size_t user_len;
-       int *user_state;
+       size_t user_len, user_len1, user_len2;
+       size_t times_ofs;
+       u8 *update_bit;
+       struct vcpu_runstate_info rs;
+       int *rs_state = &rs.state;
 
        kvm_xen_update_runstate(v, state);
 
-       if (!vx->runstate_cache.active)
+       if (!gpc1->active)
                return;
 
-       if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode)
-               user_len = sizeof(struct vcpu_runstate_info);
-       else
-               user_len = sizeof(struct compat_vcpu_runstate_info);
-
-       read_lock_irqsave(&gpc->lock, flags);
-       while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa,
-                                          user_len)) {
-               read_unlock_irqrestore(&gpc->lock, flags);
-
-               /* When invoked from kvm_sched_out() we cannot sleep */
-               if (state == RUNSTATE_runnable)
-                       return;
-
-               if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa, user_len))
-                       return;
-
-               read_lock_irqsave(&gpc->lock, flags);
-       }
-
        /*
         * The only difference between 32-bit and 64-bit versions of the
         * runstate struct is the alignment of uint64_t in 32-bit, which
         * means that the 64-bit version has an additional 4 bytes of
         * padding after the first field 'state'.
-        *
-        * So we use 'int __user *user_state' to point to the state field,
-        * and 'uint64_t __user *user_times' for runstate_entry_time. So
-        * the actual array of time[] in each state starts at user_times[1].
         */
        BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) != 0);
        BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state) != 0);
@@ -252,66 +231,208 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
                     offsetof(struct compat_vcpu_runstate_info, time) + 4);
 #endif
 
-       user_state = gpc->khva;
+       if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) {
+               user_len = sizeof(struct vcpu_runstate_info);
+               times_ofs = offsetof(struct vcpu_runstate_info,
+                                    state_entry_time);
+       } else {
+               user_len = sizeof(struct compat_vcpu_runstate_info);
+               times_ofs = offsetof(struct compat_vcpu_runstate_info,
+                                    state_entry_time);
+               rs_state++;
+       }
 
-       if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode)
-               user_times = gpc->khva + offsetof(struct vcpu_runstate_info,
-                                                 state_entry_time);
-       else
-               user_times = gpc->khva + offsetof(struct compat_vcpu_runstate_info,
-                                                 state_entry_time);
+       /*
+        * There are basically no alignment constraints. The guest can set it
+        * up so it crosses from one page to the next, and at arbitrary byte
+        * alignment (and the 32-bit ABI doesn't align the 64-bit integers
+        * anyway, even if the overall struct had been 64-bit aligned).
+        */
+       if ((gpc1->gpa & ~PAGE_MASK) + user_len >= PAGE_SIZE) {
+               user_len1 = PAGE_SIZE - (gpc1->gpa & ~PAGE_MASK);
+               user_len2 = user_len - user_len1;
+       } else {
+               user_len1 = user_len;
+               user_len2 = 0;
+       }
+       BUG_ON(user_len1 + user_len2 != user_len);
 
+ retry:
        /*
-        * First write the updated state_entry_time at the appropriate
-        * location determined by 'offset'.
+        * Attempt to obtain the GPC lock on *both* (if there are two)
+        * gfn_to_pfn caches that cover the region.
         */
-       BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state_entry_time) !=
-                    sizeof(user_times[0]));
-       BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state_entry_time) !=
-                    sizeof(user_times[0]));
+       read_lock_irqsave(&gpc1->lock, flags);
+       while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc1, gpc1->gpa, user_len1)) {
+               read_unlock_irqrestore(&gpc1->lock, flags);
 
-       user_times[0] = vx->runstate_entry_time | XEN_RUNSTATE_UPDATE;
-       smp_wmb();
+               /* When invoked from kvm_sched_out() we cannot sleep */
+               if (state == RUNSTATE_runnable)
+                       return;
+
+               if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc1, gpc1->gpa, user_len1))
+                       return;
+
+               read_lock_irqsave(&gpc1->lock, flags);
+       }
+
+       /*
+        * The common case is that it all fits on a page and we can
+        * just do it the simple way.
+        */
+       if (likely(!user_len2)) {
+               /*
+                * We use 'int *user_state' to point to the state field, and
+                * 'u64 *user_times' for runstate_entry_time. So the actual
+                * array of time[] in each state starts at user_times[1].
+                */
+               int *user_state = gpc1->khva;
+               u64 *user_times = gpc1->khva + times_ofs;
+
+               /*
+                * The XEN_RUNSTATE_UPDATE bit is the top bit of the state_entry_time
+                * field. We need to set it (and write-barrier) before writing to
+                * the rest of the structure, and clear it last. Just as Xen does, we
+                * address the single *byte* in which it resides because it might be
+                * in a different cache line to the rest of the 64-bit word, due to
+                * the (lack of) alignment constraints.
+                */
+               BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state_entry_time) !=
+                            sizeof(uint64_t));
+               BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state_entry_time) !=
+                            sizeof(uint64_t));
+               BUILD_BUG_ON((XEN_RUNSTATE_UPDATE >> 56) != 0x80);
+
+               update_bit = ((u8 *)(&user_times[1])) - 1;
+               *update_bit = (vx->runstate_entry_time | XEN_RUNSTATE_UPDATE) >> 56;
+               smp_wmb();
+
+               /*
+                * Next, write the new runstate. This is in the *same* place
+                * for 32-bit and 64-bit guests, asserted here for paranoia.
+                */
+               BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) !=
+                            offsetof(struct compat_vcpu_runstate_info, state));
+               BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state) !=
+                            sizeof(vx->current_runstate));
+               BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state) !=
+                            sizeof(vx->current_runstate));
+               *user_state = vx->current_runstate;
+
+               /*
+                * Then the actual runstate_entry_time (with the UPDATE bit
+                * still set).
+                */
+               *user_times = vx->runstate_entry_time | XEN_RUNSTATE_UPDATE;
+
+               /*
+                * Write the actual runstate times immediately after the
+                * runstate_entry_time.
+                */
+               BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) !=
+                            offsetof(struct vcpu_runstate_info, time) - sizeof(u64));
+               BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state_entry_time) !=
+                            offsetof(struct compat_vcpu_runstate_info, time) - sizeof(u64));
+               BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) !=
+                            sizeof_field(struct compat_vcpu_runstate_info, time));
+               BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) !=
+                            sizeof(vx->runstate_times));
+               memcpy(user_times + 1, vx->runstate_times, sizeof(vx->runstate_times));
+
+               smp_wmb();
+
+               /*
+                * Finally, clear the 'updating' bit. Don't use &= here because
+                * the compiler may not realise that update_bit and user_times
+                * point to the same place. That's a classic pointer-aliasing
+                * problem.
+                */
+               *update_bit = vx->runstate_entry_time >> 56;
+               smp_wmb();
+
+               goto done_1;
+       }
 
        /*
-        * Next, write the new runstate. This is in the *same* place
-        * for 32-bit and 64-bit guests, asserted here for paranoia.
+        * The painful code path. It's split across two pages and we need to
+        * hold and validate both GPCs simultaneously. Thankfully we can get
+        * away with declaring a lock ordering GPC1 > GPC2 because nothing
+        * else takes more than one of them at a time.
         */
-       BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) !=
-                    offsetof(struct compat_vcpu_runstate_info, state));
-       BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state) !=
-                    sizeof(vx->current_runstate));
-       BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state) !=
-                    sizeof(vx->current_runstate));
+       read_lock(&gpc2->lock);
+
+       if (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc2, gpc2->gpa, user_len2)) {
+               read_unlock(&gpc2->lock);
+               read_unlock_irqrestore(&gpc1->lock, flags);
 
-       *user_state = vx->current_runstate;
+               /* When invoked from kvm_sched_out() we cannot sleep */
+               if (state == RUNSTATE_runnable)
+                       return;
+
+               /*
+                * Use kvm_gpc_activate() here because if the runstate
+                * area was configured in 32-bit mode and only extends
+                * to the second page now because the guest changed to
+                * 64-bit mode, the second GPC won't have been set up.
+                */
+               if (kvm_gpc_activate(v->kvm, gpc2, NULL, KVM_HOST_USES_PFN,
+                                    gpc1->gpa + user_len1, user_len2))
+                       return;
+
+               /*
+                * We dropped the lock on GPC1 so we have to go all the way
+                * back and revalidate that too.
+                */
+               goto retry;
+       }
 
        /*
-        * Write the actual runstate times immediately after the
-        * runstate_entry_time.
+        * Work out where the byte containing the XEN_RUNSTATE_UPDATE bit is.
         */
-       BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) !=
-                    offsetof(struct vcpu_runstate_info, time) - sizeof(u64));
-       BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state_entry_time) !=
-                    offsetof(struct compat_vcpu_runstate_info, time) - sizeof(u64));
-       BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) !=
-                    sizeof_field(struct compat_vcpu_runstate_info, time));
-       BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) !=
-                    sizeof(vx->runstate_times));
-
-       memcpy(user_times + 1, vx->runstate_times, sizeof(vx->runstate_times));
+       if (user_len1 >= times_ofs + sizeof(uint64_t))
+               update_bit = ((u8 *)gpc1->khva) + times_ofs + sizeof(u64) - 1;
+       else
+               update_bit = ((u8 *)gpc2->khva) + times_ofs + sizeof(u64) - 1 -
+                       user_len1;
+
+
+       /* Create a structure on our stack with everything in the right place.
+        * The rs_state pointer points to the start of it, which in the case
+        * of a compat guest on a 64-bit host is the 32-bit field that the
+        * compiler thinks is padding.
+        */
+       *rs_state = vx->current_runstate;
+#ifdef CONFIG_X86_64
+       /* Don't leak kernel memory through the padding in the 64-bit struct */
+       if (rs_state == &rs.state)
+               rs_state[1] = 0;
+#endif
+       rs.state_entry_time = vx->runstate_entry_time | XEN_RUNSTATE_UPDATE;
+       memcpy(rs.time, vx->runstate_times, sizeof(vx->runstate_times));
+
+       *update_bit = rs.state_entry_time >> 56;
        smp_wmb();
 
        /*
-        * Finally, clear the XEN_RUNSTATE_UPDATE bit in the guest's
-        * runstate_entry_time field.
+        * Having constructed the structure, copy it into the first and
+        * second pages as appropriate using user_len1 and user_len2.
         */
-       user_times[0] &= ~XEN_RUNSTATE_UPDATE;
+       memcpy(gpc1->khva, rs_state, user_len1);
+       memcpy(gpc2->khva, ((u8 *)rs_state) + user_len1, user_len2);
        smp_wmb();
 
-       read_unlock_irqrestore(&gpc->lock, flags);
+       /*
+        * Finally, clear the XEN_RUNSTATE_UPDATE bit.
+        */
+       *update_bit = vx->runstate_entry_time >> 56;
+       smp_wmb();
 
-       mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT);
+       read_unlock(&gpc2->lock);
+       mark_page_dirty_in_slot(v->kvm, gpc2->memslot, gpc2->gpa >> PAGE_SHIFT);
+
+ done_1:
+       read_unlock_irqrestore(&gpc1->lock, flags);
+       mark_page_dirty_in_slot(v->kvm, gpc1->memslot, gpc1->gpa >> PAGE_SHIFT);
 }
 
 static void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *v)
@@ -584,23 +705,50 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
                        kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
                break;
 
-       case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR:
+       case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR: {
+               size_t sz, sz1, sz2;
+
                if (!sched_info_on()) {
                        r = -EOPNOTSUPP;
                        break;
                }
                if (data->u.gpa == GPA_INVALID) {
+                       r = 0;
+               deactivate_out:
                        kvm_gpc_deactivate(vcpu->kvm,
                                           &vcpu->arch.xen.runstate_cache);
-                       r = 0;
+               deactivate2_out:
+                       kvm_gpc_deactivate(vcpu->kvm,
+                                          &vcpu->arch.xen.runstate2_cache);
                        break;
                }
 
+               if (IS_ENABLED(CONFIG_64BIT) && vcpu->kvm->arch.xen.long_mode)
+                       sz = sizeof(struct vcpu_runstate_info);
+               else
+                       sz = sizeof(struct compat_vcpu_runstate_info);
+
+               /* How much fits in the (first) page? */
+               sz1 = PAGE_SIZE - (data->u.gpa & ~PAGE_MASK);
                r = kvm_gpc_activate(vcpu->kvm, &vcpu->arch.xen.runstate_cache,
-                                    NULL, KVM_HOST_USES_PFN, data->u.gpa,
-                                    sizeof(struct vcpu_runstate_info));
-               break;
+                                    NULL, KVM_HOST_USES_PFN, data->u.gpa, sz1);
+               if (r)
+                       goto deactivate_out;
+
+               if (sz1 >= sz)
+                       goto deactivate2_out; /* We've mapped all we need */
+
+               /* Map the second page */
+               sz2 = sz - sz1;
+               BUG_ON((data->u.gpa + sz1) & ~PAGE_MASK);
+               r = kvm_gpc_activate(vcpu->kvm, &vcpu->arch.xen.runstate2_cache,
+                                            NULL, KVM_HOST_USES_PFN,
+                                            data->u.gpa + sz1, sz2);
+               if (r)
+                       goto deactivate_out;
 
+               break;
+       }
        case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT:
                if (!sched_info_on()) {
                        r = -EOPNOTSUPP;
@@ -1834,6 +1982,7 @@ void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu)
        timer_setup(&vcpu->arch.xen.poll_timer, cancel_evtchn_poll, 0);
 
        kvm_gpc_init(&vcpu->arch.xen.runstate_cache);
+       kvm_gpc_init(&vcpu->arch.xen.runstate2_cache);
        kvm_gpc_init(&vcpu->arch.xen.vcpu_info_cache);
        kvm_gpc_init(&vcpu->arch.xen.vcpu_time_info_cache);
 }
@@ -1844,6 +1993,7 @@ void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu)
                kvm_xen_stop_timer(vcpu);
 
        kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.runstate_cache);
+       kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.runstate2_cache);
        kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.vcpu_info_cache);
        kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.vcpu_time_info_cache);