From: David Woodhouse Date: Mon, 29 Jul 2024 14:02:45 +0000 (+0000) Subject: Update to non-RFC v3 posting X-Git-Url: https://www.infradead.org/git/?a=commitdiff_plain;h=204998614d65b4f49440ae8f12ce858030c1bd0d;p=users%2Fdwmw2%2Fvmclock.git Update to non-RFC v3 posting Signed-off-by: David Woodhouse --- diff --git a/ptp_vmclock.c b/ptp_vmclock.c index adf6ef3..58a6ba2 100644 --- a/ptp_vmclock.c +++ b/ptp_vmclock.c @@ -17,6 +17,7 @@ #include #include #include +#include #include "vmclock-abi.h" @@ -27,7 +28,7 @@ #include #endif -#ifdef CONFIG_KVM_GUESTx +#if defined(CONFIG_KVM_GUEST) && LINUX_VERSION_CODE >= KERNEL_VERSION(6,9,0) #define SUPPORT_KVMCLOCK #endif @@ -48,6 +49,13 @@ struct vmclock_state { #define VMCLOCK_MAX_WAIT ms_to_ktime(100) +/* Require at least the flags field to be present. All else can be optional. */ +#define VMCLOCK_MIN_SIZE offsetof(struct vmclock_abi, pad) + +#define VMCLOCK_FIELD_PRESENT(_c, _f) \ + (_c)->size >= (offsetof(struct vmclock_abi, _f) + \ + sizeof((_c)->_f)) + /* * Multiply a 64-bit count by a 64-bit tick 'period' in units of seconds >> 64 * and add the fractional second part of the reference time. @@ -58,9 +66,9 @@ struct vmclock_state { * If __int128 isn't available, perform the calculation 32 bits at a time to * avoid overflow. */ -static inline uint64_t mul_u64_u64_shr_add_u64(uint64_t *res_hi, uint64_t delta, - uint64_t period, uint8_t shift, - uint64_t frac_sec) +static uint64_t mul_u64_u64_shr_add_u64(uint64_t *res_hi, uint64_t delta, + uint64_t period, uint8_t shift, + uint64_t frac_sec) { unsigned __int128 res = (unsigned __int128)delta * period; @@ -70,7 +78,7 @@ static inline uint64_t mul_u64_u64_shr_add_u64(uint64_t *res_hi, uint64_t delta, return (uint64_t)res; } -static inline bool tai_adjust(struct vmclock_abi *clk, uint64_t *sec) +static bool tai_adjust(struct vmclock_abi *clk, uint64_t *sec) { if (likely(clk->time_type == VMCLOCK_TIME_UTC)) return true; @@ -104,6 +112,11 @@ static int vmclock_get_crosststamp(struct vmclock_state *st, while (1) { seq = st->clk->seq_count & ~1ULL; + + /* + * This pairs with a write barrier in the hypervisor + * which populates this structure. + */ virt_rmb(); if (st->clk->clock_status == VMCLOCK_STATUS_UNRELIABLE) @@ -131,8 +144,9 @@ static int vmclock_get_crosststamp(struct vmclock_state *st, cycle = get_cycles(); ptp_read_system_postts(sts); } - } else + } else { cycle = get_cycles(); + } delta = cycle - st->clk->counter_value; @@ -156,7 +170,9 @@ static int vmclock_get_crosststamp(struct vmclock_state *st, if (system_counter) { system_counter->cycles = cycle; - //system_counter->cs_id = st->cs_id; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(6,9,0) + system_counter->cs_id = st->cs_id; +#endif } if (sts) { @@ -235,7 +251,6 @@ static int ptp_vmclock_get_time_fn(ktime_t *device_time, return ret; } - static int ptp_vmclock_getcrosststamp(struct ptp_clock_info *ptp, struct system_device_crosststamp *xtstamp) { @@ -313,6 +328,37 @@ static const struct ptp_clock_info ptp_vmclock_info = { .getcrosststamp = ptp_vmclock_getcrosststamp, }; +static struct ptp_clock *vmclock_ptp_register(struct device *dev, + struct vmclock_state *st) +{ + enum clocksource_ids cs_id; + + if (IS_ENABLED(CONFIG_ARM64) && + st->clk->counter_id == VMCLOCK_COUNTER_ARM_VCNT) { + /* Can we check it's the virtual counter? */ + cs_id = CSID_ARM_ARCH_COUNTER; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(6,9,0) + } else if (IS_ENABLED(CONFIG_X86) && + st->clk->counter_id == VMCLOCK_COUNTER_X86_TSC) { + cs_id = CSID_X86_TSC; +#endif + } else { + return NULL; + } + + /* Only UTC, or TAI with offset */ + if (!tai_adjust(st->clk, NULL)) { + dev_info(dev, "vmclock does not provide unambiguous UTC\n"); + return NULL; + } + + st->sys_cs_id = st->cs_id = cs_id; + st->ptp_clock_info = ptp_vmclock_info; + strscpy(st->ptp_clock_info.name, st->name, sizeof(st->ptp_clock_info.name)); + + return ptp_clock_register(&st->ptp_clock_info, dev); +} + static int vmclock_miscdev_mmap(struct file *fp, struct vm_area_struct *vma) { struct vmclock_state *st = container_of(fp->private_data, @@ -420,12 +466,20 @@ static int vmclock_probe_acpi(struct device *dev, struct vmclock_state *st) struct acpi_device *adev = ACPI_COMPANION(dev); acpi_status status; - status = acpi_walk_resources(adev->handle, METHOD_NAME__CRS, - vmclock_acpi_resources, st); - if (ACPI_FAILURE(status) || resource_type(&st->res) != IORESOURCE_MEM) { - dev_err(dev, "failed to get resources\n"); - return -ENODEV; - } + /* + * This should never happen as this function is only called when + * has_acpi_companion(dev) is true, but the logic is sufficiently + * complex that Coverity can't see the tautology. + */ + if (!adev) + return -ENODEV; + + status = acpi_walk_resources(adev->handle, METHOD_NAME__CRS, + vmclock_acpi_resources, st); + if (ACPI_FAILURE(status) || resource_type(&st->res) != IORESOURCE_MEM) { + dev_err(dev, "failed to get resources\n"); + return -ENODEV; + } return 0; } @@ -457,6 +511,12 @@ static int vmclock_probe(struct platform_device *pdev) goto out; } + if (resource_size(&st->res) < VMCLOCK_MIN_SIZE) { + dev_info(dev, "Region too small (0x%llx)\n", + resource_size(&st->res)); + ret = -EINVAL; + goto out; + } st->clk = devm_memremap(dev, st->res.start, resource_size(&st->res), MEMREMAP_WB | MEMREMAP_DEC); if (IS_ERR(st->clk)) { @@ -467,7 +527,7 @@ static int vmclock_probe(struct platform_device *pdev) } if (st->clk->magic != VMCLOCK_MAGIC || - st->clk->size < sizeof(*st->clk) || + st->clk->size > resource_size(&st->res) || st->clk->version != 1) { dev_info(dev, "vmclock magic fields invalid\n"); ret = -EINVAL; @@ -479,7 +539,7 @@ static int vmclock_probe(struct platform_device *pdev) goto out; st->index = ret; - ret = devm_add_action_or_reset(&pdev->dev, vmclock_put_idx, st); + ret = devm_add_action_or_reset(&pdev->dev, vmclock_put_idx, st); if (ret) goto out; @@ -489,7 +549,12 @@ static int vmclock_probe(struct platform_device *pdev) goto out; } - /* If the structure is big enough, it can be mapped to userspace */ + /* + * If the structure is big enough, it can be mapped to userspace. + * Theoretically a guest OS even using larger pages could still + * use 4KiB PTEs to map smaller MMIO regions like this, but let's + * cross that bridge if/when we come to it. + */ if (st->clk->size >= PAGE_SIZE) { st->miscdev.minor = MISC_DYNAMIC_MINOR; st->miscdev.fops = &vmclock_miscdev_fops; @@ -501,37 +566,18 @@ static int vmclock_probe(struct platform_device *pdev) } /* If there is valid clock information, register a PTP clock */ - if (IS_ENABLED(CONFIG_ARM64) && - st->clk->counter_id == VMCLOCK_COUNTER_ARM_VCNT) { - /* Can we check it's the virtual counter? */ - st->cs_id = CSID_ARM_ARCH_COUNTER; -#if 0 - } else if (IS_ENABLED(CONFIG_X86) && - st->clk->counter_id == VMCLOCK_COUNTER_X86_TSC) { - st->cs_id = CSID_X86_TSC; -#endif - } - - /* Only UTC, or TAI with offset */ - if (!tai_adjust(st->clk, NULL)) { - dev_info(dev, "vmclock does not provide unambiguous UTC\n"); - st->cs_id = CSID_GENERIC; - } - - if (st->cs_id) { - st->sys_cs_id = st->cs_id; - - st->ptp_clock_info = ptp_vmclock_info; - strscpy(st->ptp_clock_info.name, st->name, sizeof(st->ptp_clock_info.name)); - st->ptp_clock = ptp_clock_register(&st->ptp_clock_info, dev); - + if (VMCLOCK_FIELD_PRESENT(st->clk, time_frac_sec)) { + /* Can return a silent NULL, or an error. */ + st->ptp_clock = vmclock_ptp_register(dev, st); if (IS_ERR(st->ptp_clock)) { ret = PTR_ERR(st->ptp_clock); st->ptp_clock = NULL; vmclock_remove(pdev); goto out; } - } else if (!st->miscdev.minor) { + } + + if (!st->miscdev.minor && !st->ptp_clock) { /* Neither miscdev nor PTP registered */ dev_info(dev, "vmclock: Neither miscdev nor PTP available; not registering\n"); ret = -ENODEV; @@ -550,6 +596,7 @@ static int vmclock_probe(struct platform_device *pdev) } static const struct acpi_device_id vmclock_acpi_ids[] = { + { "AMZNC10C", 0 }, { "VMCLOCK", 0 }, {} }; diff --git a/vmclock-abi.h b/vmclock-abi.h index 3bde10d..19cbf85 100644 --- a/vmclock-abi.h +++ b/vmclock-abi.h @@ -144,10 +144,9 @@ struct vmclock_abi { int16_t tai_offset_sec; uint8_t leap_indicator; /* - * This field is based on the the VIRTIO_RTC_LEAP_xxx values as - * defined in the current draft of virtio-rtc, but since smearing - * cannot be used with the shared memory device, some values are - * not used. + * This field is based on the VIRTIO_RTC_LEAP_xxx values as defined + * in the current draft of virtio-rtc, but since smearing cannot be + * used with the shared memory device, some values are not used. * * The _POST_POS and _POST_NEG values allow the guest to perform * its own smearing during the day or so after a leap second when @@ -168,8 +167,8 @@ struct vmclock_abi { */ uint64_t counter_value; /* - * Counter frequency, and error margin. The unit of these fields is - * seconds >> (64 + counter_period_shift) + * Counter period, and error margin of same. The unit of these + * fields is 1/2^(64 + counter_period_shift) of a second. */ uint64_t counter_period_frac_sec; uint64_t counter_period_esterror_rate_frac_sec; @@ -179,7 +178,7 @@ struct vmclock_abi { * Time according to time_type field above. */ uint64_t time_sec; /* Seconds since time_type epoch */ - uint64_t time_frac_sec; /* (seconds >> 64) */ + uint64_t time_frac_sec; /* Units of 1/2^64 of a second */ uint64_t time_esterror_nanosec; uint64_t time_maxerror_nanosec; };