www.infradead.org Git - users/dwmw2/linux.git/commitdiff
KVM: x86: Use fast path for Xen timer delivery
author David Woodhouse <dwmw@amazon.co.uk>
Fri, 22 Sep 2023 09:24:42 +0000 (09:24 +0000)
committer David Woodhouse <dwmw@amazon.co.uk>
Mon, 2 Oct 2023 10:15:36 +0000 (11:15 +0100)
Most of the time there's no need to kick the vCPU and deliver the timer
event through kvm_xen_inject_timer_irqs(). Use kvm_xen_set_evtchn_fast()
directly from the timer callback, and only fall back to the slow path
when it's necessary to do so.

This gives a significant improvement in timer latency testing (using
nanosleep() for various periods and then measuring the actual time
elapsed).

However, there was a reason¹ the fast path was dropped when this support
was first added. The current code holds vcpu->mutex for all operations
on the vcpu->arch.xen.timer_expires field, and the fast path introduces a
potential race condition. Avoid that race by ensuring the hrtimer is
(temporarily) cancelled before making changes in kvm_xen_start_timer(),
and also when reading the values out for KVM_XEN_VCPU_ATTR_TYPE_TIMER.

¹ https://lore.kernel.org/kvm/846caa99-2e42-4443-1070-84e49d2f11d2@redhat.com/

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
arch/x86/kvm/xen.c

index 40edf4d1974c530336e9f9044fd3b18b18ea8de3..75586da134b38b0bb6280ff32ee6ae71051b39c7 100644 (file)
@@ -134,9 +134,23 @@ static enum hrtimer_restart xen_timer_callback(struct hrtimer *timer)
 {
        struct kvm_vcpu *vcpu = container_of(timer, struct kvm_vcpu,
                                             arch.xen.timer);
+       struct kvm_xen_evtchn e;
+       int rc;
+
        if (atomic_read(&vcpu->arch.xen.timer_pending))
                return HRTIMER_NORESTART;
 
+       e.vcpu_id = vcpu->vcpu_id;
+       e.vcpu_idx = vcpu->vcpu_idx;
+       e.port = vcpu->arch.xen.timer_virq;
+       e.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
+
+       rc = kvm_xen_set_evtchn_fast(&e, vcpu->kvm);
+       if (rc != -EWOULDBLOCK) {
+               vcpu->arch.xen.timer_expires = 0;
+               return HRTIMER_NORESTART;
+       }
+
        atomic_inc(&vcpu->arch.xen.timer_pending);
        kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
        kvm_vcpu_kick(vcpu);
@@ -146,6 +160,14 @@ static enum hrtimer_restart xen_timer_callback(struct hrtimer *timer)
 
 static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs, s64 delta_ns)
 {
+       /*
+        * Avoid races with the old timer firing. Checking timer_expires
+        * to avoid calling hrtimer_cancel() will only have false positives
+        * so is fine.
+        */
+       if (vcpu->arch.xen.timer_expires)
+               hrtimer_cancel(&vcpu->arch.xen.timer);
+
        atomic_set(&vcpu->arch.xen.timer_pending, 0);
        vcpu->arch.xen.timer_expires = guest_abs;
 
@@ -1019,9 +1041,36 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
                break;
 
        case KVM_XEN_VCPU_ATTR_TYPE_TIMER:
+               /*
+                * Ensure a consistent snapshot of state is captured, with a
+                * timer either being pending, or the event channel delivered
+                * to the corresponding bit in the shared_info. Not still
+                * lurking in the timer_pending flag for deferred delivery.
+                * Purely as an optimisation, if the timer_expires field is
+                * zero, that means the timer isn't active (or even in the
+                * timer_pending flag) and there is no need to cancel it.
+                */
+               if (vcpu->arch.xen.timer_expires) {
+                       hrtimer_cancel(&vcpu->arch.xen.timer);
+                       kvm_xen_inject_timer_irqs(vcpu);
+               }
+
                data->u.timer.port = vcpu->arch.xen.timer_virq;
                data->u.timer.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
                data->u.timer.expires_ns = vcpu->arch.xen.timer_expires;
+
+               /*
+                * The hrtimer may trigger and raise the IRQ immediately,
+                * while the returned state causes it to be set up and
+                * raised again on the destination system after migration.
+                * That's fine, as the guest won't even have had a chance
+                * to run and handle the interrupt. Asserting an already
+                * pending event channel is idempotent.
+                */
+               if (vcpu->arch.xen.timer_expires)
+                       hrtimer_start_expires(&vcpu->arch.xen.timer,
+                                             HRTIMER_MODE_ABS_HARD);
+
                r = 0;
                break;