KVM: x86: Refine calculation of guest wall clock to use a single TSC read
author	David Woodhouse <dwmw@amazon.co.uk>
	Sun, 1 Oct 2023 17:06:31 +0000 (18:06 +0100)
committer	David Woodhouse <dwmw@amazon.co.uk>
	Mon, 2 Oct 2023 10:15:36 +0000 (11:15 +0100)
When populating the guest's PV wall clock information, KVM currently does
a simple 'ktime_get_real_ns() - get_kvmclock_ns(kvm)'. This is an antipattern
which should be avoided; when working with the relationship between two
clocks, it's never correct to obtain one of them "now" and then the other
at a slightly different "now" after an unspecified period of preemption
(which might not even be under the control of the kernel, if this is an
L1 hosting an L2 guest under nested virtualization).
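
As a userspace analogue (not the KVM code itself), the same pitfall can be
reproduced by computing the offset between CLOCK_REALTIME and CLOCK_MONOTONIC
with two separate reads; the delay between the individual reads appears
directly as error in the computed offset:

    /* Sketch of the two-read antipattern: each "offset" mixes two
     * different instants, so any preemption between the individual
     * clock reads leaks into the result.  Build: cc -O2 offset.c
     */
    #include <stdio.h>
    #include <stdint.h>
    #include <time.h>

    static int64_t clock_ns(clockid_t id)
    {
            struct timespec ts;

            clock_gettime(id, &ts);
            return (int64_t)ts.tv_sec * 1000000000 + ts.tv_nsec;
    }

    int main(void)
    {
            int64_t off1 = clock_ns(CLOCK_REALTIME) - clock_ns(CLOCK_MONOTONIC);
            int64_t off2 = clock_ns(CLOCK_REALTIME) - clock_ns(CLOCK_MONOTONIC);

            /* In a correct world off1 == off2; in practice they differ
             * by however long we were delayed between the reads. */
            printf("two 'identical' offsets disagree by %lld ns\n",
                   (long long)(off2 - off1));
            return 0;
    }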

Add a kvm_get_wall_clock_epoch() function to return the guest wall clock
epoch in nanoseconds using the same method as __get_kvmclock(): use
kvm_get_walltime_and_clockread() to calculate both the wall clock and KVM
clock time from a *single* TSC reading.
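
The conversion done by __pvclock_read_cycles() amounts to scaling a TSC
delta to nanoseconds and adding the system time. A simplified model of that
calculation (the real helper keeps a wider intermediate product; this sketch
assumes the shifted delta times the multiplier fits in 64 bits):

    #include <stdint.h>

    /* ns = system_time + ((tsc - tsc_timestamp) << tsc_shift)
     *                     * tsc_to_system_mul / 2^32
     */
    static uint64_t pvclock_ns(uint64_t tsc, uint64_t tsc_timestamp,
                               uint64_t system_time,
                               uint32_t tsc_to_system_mul, int8_t tsc_shift)
    {
            uint64_t delta = tsc - tsc_timestamp;

            if (tsc_shift < 0)
                    delta >>= -tsc_shift;
            else
                    delta <<= tsc_shift;

            return system_time + ((delta * tsc_to_system_mul) >> 32);
    }

With both the wall clock and the KVM clock anchored to the same host_tsc
sample, the epoch becomes a single subtraction,
wall_ns(host_tsc) - pvclock_ns(host_tsc, ...), and the result no longer
depends on when, or for how long, the calculating task is preempted.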

The condition using get_cpu_tsc_khz() is equivalent to the check in
__get_kvmclock(), which separately tests for the CONSTANT_TSC feature or
the per-CPU cpu_tsc_khz; that is precisely what get_cpu_tsc_khz() does
anyway.
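
For reference, get_cpu_tsc_khz() reduces to roughly the following
(paraphrased from its description above, not part of this patch):

    static inline unsigned long get_cpu_tsc_khz(void)
    {
            if (static_cpu_has(X86_FEATURE_CONSTANT_TSC))
                    return tsc_khz;         /* one global rate for all CPUs */
            else
                    return __this_cpu_read(cpu_tsc_khz); /* 0 if not usable */
    }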

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
arch/x86/kvm/x86.c
arch/x86/kvm/x86.h
arch/x86/kvm/xen.c

index 9f18b06bbda66bf411bb492fa4ab4c5fd9e90080..856b12875bcbe58c38676f5b156078b9276048ce 100644 (file)
@@ -2331,14 +2331,9 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_o
        if (kvm_write_guest(kvm, wall_clock, &version, sizeof(version)))
                return;
 
-       /*
-        * The guest calculates current wall clock time by adding
-        * system time (updated by kvm_guest_time_update below) to the
-        * wall clock specified here.  We do the reverse here.
-        */
-       wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm);
+       wall_nsec = kvm_get_wall_clock_epoch(kvm);
 
-       wc.nsec = do_div(wall_nsec, 1000000000);
+       wc.nsec = do_div(wall_nsec, NSEC_PER_SEC);
        wc.sec = (u32)wall_nsec; /* overflow in 2106 guest time */
        wc.version = version;
 
@@ -3241,6 +3236,57 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
        return 0;
 }
 
+uint64_t kvm_get_wall_clock_epoch(struct kvm *kvm)
+{
+       /*
+        * The guest calculates current wall clock time by adding
+        * system time (updated by kvm_guest_time_update below) to the
+        * wall clock specified here.  We do the reverse here.
+        */
+#ifdef CONFIG_X86_64
+       struct pvclock_vcpu_time_info hv_clock;
+       struct kvm_arch *ka = &kvm->arch;
+       unsigned long seq, local_tsc_khz = 0;
+       struct timespec64 ts;
+       uint64_t host_tsc;
+
+       do {
+               seq = read_seqcount_begin(&ka->pvclock_sc);
+
+               if (!ka->use_master_clock)
+                       break;
+
+               /* It all has to happen on the same CPU */
+               get_cpu();
+
+               local_tsc_khz = get_cpu_tsc_khz();
+
+               if (local_tsc_khz &&
+                   !kvm_get_walltime_and_clockread(&ts, &host_tsc))
+                       local_tsc_khz = 0; /* Fall back to old method */
+
+               hv_clock.tsc_timestamp = ka->master_cycle_now;
+               hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
+
+               put_cpu();
+       } while (read_seqcount_retry(&ka->pvclock_sc, seq));
+
+       /*
+        * If the conditions were right, and obtaining the wallclock+TSC was
+        * successful, calculate the KVM clock at the corresponding time and
+        * subtract one from the other to get the epoch in nanoseconds.
+        */
+       if (local_tsc_khz) {
+               kvm_get_time_scale(NSEC_PER_SEC, local_tsc_khz * 1000LL,
+                                  &hv_clock.tsc_shift,
+                                  &hv_clock.tsc_to_system_mul);
+               return ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec -
+                       __pvclock_read_cycles(&hv_clock, host_tsc);
+       }
+#endif
+       return ktime_get_real_ns() - get_kvmclock_ns(kvm);
+}
+
 /*
  * kvmclock updates which are isolated to a given vcpu, such as
  * vcpu->cpu migration, should not allow system_timestamp from
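
The read side of the new function follows the standard seqcount reader
pattern: snapshot the master clock parameters inside the loop, and retry if
a writer updated them in the meantime. The general shape of that pattern
(illustrative, not the patch itself):

    unsigned long seq;

    do {
            seq = read_seqcount_begin(&sc);
            /* ... copy the protected fields into locals ... */
    } while (read_seqcount_retry(&sc, seq));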
index 1e7be1f6ab299d78a76e76385db159dee679220b..ed1a69942347101408b660c4ffae9997062401bf 100644 (file)
@@ -290,6 +290,8 @@ static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk)
        return !(kvm->arch.disabled_quirks & quirk);
 }
 
+uint64_t kvm_get_wall_clock_epoch(struct kvm *kvm);
+
 void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
 
 u64 get_kvmclock_ns(struct kvm *kvm);
index 75586da134b38b0bb6280ff32ee6ae71051b39c7..6bab715be428d1df12c0a4036b4d5a6eeaadf423 100644 (file)
@@ -59,7 +59,7 @@ static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
                 * This code mirrors kvm_write_wall_clock() except that it writes
                 * directly through the pfn cache and doesn't mark the page dirty.
                 */
-               wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm);
+               wall_nsec = kvm_get_wall_clock_epoch(kvm);
 
                /* It could be invalid again already, so we need to check */
                read_lock_irq(&gpc->lock);
@@ -98,7 +98,7 @@ static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
        wc_version = wc->version = (wc->version + 1) | 1;
        smp_wmb();
 
-       wc->nsec = do_div(wall_nsec,  1000000000);
+       wc->nsec = do_div(wall_nsec, NSEC_PER_SEC);
        wc->sec = (u32)wall_nsec;
        *wc_sec_hi = wall_nsec >> 32;
        smp_wmb();
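
On the guest side the relationship runs in reverse: current wall clock time
is the published epoch plus the guest's current kvmclock reading. A sketch
using illustrative names (wc_sec_hi/wc_sec/wc_nsec are the fields written
above; kvmclock_ns stands for whatever kvmclock accessor the guest uses):

    #include <stdint.h>

    static uint64_t guest_wall_clock_ns(uint32_t wc_sec_hi, uint32_t wc_sec,
                                        uint32_t wc_nsec, uint64_t kvmclock_ns)
    {
            /* Xen guests get a 64-bit seconds value split across
             * wc_sec and wc_sec_hi; the legacy KVM wall clock only
             * has the low 32 bits (hence "overflow in 2106"). */
            uint64_t sec = ((uint64_t)wc_sec_hi << 32) | wc_sec;

            return sec * 1000000000ULL + wc_nsec + kvmclock_ns;
    }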