From 1201f226c863b7da739f7420ddba818cedf372fc Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Tue, 10 Dec 2024 17:32:58 -0800
Subject: [PATCH 01/16] KVM: x86: Cache CPUID.0xD XSTATE offsets+sizes during
 module init

Snapshot the output of CPUID.0xD.[1..n] during kvm.ko initialization to
avoid the overhead of CPUID at runtime. The offset, size, and metadata
for CPUID.0xD.[1..n] sub-leaves do not depend on XCR0 or XSS values,
i.e. are constant for a given CPU, and thus can be cached during module
load.

On Intel's Emerald Rapids, CPUID is *wildly* expensive, to the point
where recomputing XSAVE offsets and sizes results in a 4x increase in
latency of nested VM-Enter and VM-Exit (nested transitions can trigger
xstate_required_size() multiple times per transition), relative to using
cached values. The issue is easily visible by running `perf top` while
triggering nested transitions: kvm_update_cpuid_runtime() shows up at a
whopping 50%.

As measured via RDTSC from L2 (using KVM-Unit-Test's CPUID VM-Exit test
and a slightly modified L1 KVM to handle CPUID in the fastpath), a
nested roundtrip to emulate CPUID on Skylake (SKX), Icelake (ICX), and
Emerald Rapids (EMR) takes:

  SKX 11650
  ICX 22350
  EMR 28850

Using cached values, the latency drops to:

  SKX  6850
  ICX  9000
  EMR  7900

The underlying issue is that CPUID itself is slow on ICX, and comically
slow on EMR. The problem is exacerbated on CPUs which support XSAVES
and/or XSAVEC, as KVM invokes xstate_required_size() twice on each
runtime CPUID update, and because there are more supported XSAVE
features (CPUID for supported XSAVE feature sub-leaves is significantly
slower).

 SKX:
  CPUID.0xD.2  = 348 cycles
  CPUID.0xD.3  = 400 cycles
  CPUID.0xD.4  = 276 cycles
  CPUID.0xD.5  = 236 cycles

 EMR:
  CPUID.0xD.2  = 1138 cycles
  CPUID.0xD.3  = 1362 cycles
  CPUID.0xD.4  = 1068 cycles
  CPUID.0xD.5  = 910 cycles
  CPUID.0xD.6  = 914 cycles
  CPUID.0xD.7  = 1350 cycles
  CPUID.0xD.8  = 734 cycles
  CPUID.0xD.9  = 766 cycles
  CPUID.0xD.10 = 732 cycles
  CPUID.0xD.11 = 718 cycles
  CPUID.0xD.12 = 734 cycles
  CPUID.0xD.13 = 1700 cycles
  CPUID.0xD.14 = 1126 cycles
  CPUID.0xD.15 = 898 cycles
  CPUID.0xD.16 = 716 cycles
  CPUID.0xD.17 = 748 cycles
  CPUID.0xD.18 = 776 cycles

Note, updating runtime CPUID information multiple times per nested
transition is itself a flaw, especially since CPUID is a mandatory
intercept on both Intel and AMD. E.g. KVM doesn't need to ensure
emulated CPUID state is up-to-date while running L2. That flaw will be
fixed in a future patch, as deferring runtime CPUID updates is more
subtle than it appears at first glance, the benefits aren't super
critical to have once the XSAVE issue is resolved, and caching CPUID
output is desirable even if KVM's updates are deferred.
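To make the cached computation concrete, here is a minimal user-space
sketch of the same math (illustrative only; it assumes GCC/clang's
<cpuid.h> and open-codes equivalents of the kernel's ALIGN() and max()):

	#include <cpuid.h>
	#include <stdint.h>
	#include <stdio.h>

	#define XSTATE_MAX_LEAF 19	/* mirrors XFEATURE_MAX in current kernels */

	static struct { uint32_t eax, ebx, ecx; } xs_cache[XSTATE_MAX_LEAF];

	/* One-time snapshot, analogous to kvm_init_xstate_sizes(). */
	static void snapshot_xstate_leaves(void)
	{
		for (int i = 2; i < XSTATE_MAX_LEAF; i++) {	/* 2 == XFEATURE_YMM */
			uint32_t edx;

			__cpuid_count(0xD, i, xs_cache[i].eax, xs_cache[i].ebx,
				      xs_cache[i].ecx, edx);
		}
	}

	/* Recompute the XSAVE area size from the cache; no CPUID at runtime. */
	static uint32_t xstate_size(uint64_t xstate_bv, int compacted)
	{
		uint32_t ret = 512 + 64;	/* legacy FXSAVE region + XSAVE header */

		for (int bit = 2; bit < XSTATE_MAX_LEAF; bit++) {
			uint32_t offset;

			if (!(xstate_bv & (1ull << bit)))
				continue;
			/* ECX[1]: feature is 64-byte aligned in compacted format */
			if (compacted)
				offset = (xs_cache[bit].ecx & 0x2) ? ((ret + 63) & ~63u) : ret;
			else
				offset = xs_cache[bit].ebx;	/* architectural offset */
			if (offset + xs_cache[bit].eax > ret)
				ret = offset + xs_cache[bit].eax;
		}
		return ret;
	}

	int main(void)
	{
		snapshot_xstate_leaves();
		/* x87 + SSE + AVX (XCR0 = 0x7), standard and compacted formats */
		printf("std: %u compacted: %u\n",
		       xstate_size(0x7, 0), xstate_size(0x7, 1));
		return 0;
	}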
Cc: Jim Mattson
Cc: stable@vger.kernel.org
Signed-off-by: Sean Christopherson
Message-ID: <20241211013302.1347853-2-seanjc@google.com>
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/cpuid.c | 31 ++++++++++++++++++++++++++-----
 arch/x86/kvm/cpuid.h |  1 +
 arch/x86/kvm/x86.c   |  2 ++
 3 files changed, 29 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 097bdc022d0f..ae0b438a2c99 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -36,6 +36,26 @@
 u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly;
 EXPORT_SYMBOL_GPL(kvm_cpu_caps);
 
+struct cpuid_xstate_sizes {
+	u32 eax;
+	u32 ebx;
+	u32 ecx;
+};
+
+static struct cpuid_xstate_sizes xstate_sizes[XFEATURE_MAX] __ro_after_init;
+
+void __init kvm_init_xstate_sizes(void)
+{
+	u32 ign;
+	int i;
+
+	for (i = XFEATURE_YMM; i < ARRAY_SIZE(xstate_sizes); i++) {
+		struct cpuid_xstate_sizes *xs = &xstate_sizes[i];
+
+		cpuid_count(0xD, i, &xs->eax, &xs->ebx, &xs->ecx, &ign);
+	}
+}
+
 u32 xstate_required_size(u64 xstate_bv, bool compacted)
 {
 	int feature_bit = 0;
@@ -44,14 +64,15 @@ u32 xstate_required_size(u64 xstate_bv, bool compacted)
 	xstate_bv &= XFEATURE_MASK_EXTEND;
 	while (xstate_bv) {
 		if (xstate_bv & 0x1) {
-			u32 eax, ebx, ecx, edx, offset;
-			cpuid_count(0xD, feature_bit, &eax, &ebx, &ecx, &edx);
+			struct cpuid_xstate_sizes *xs = &xstate_sizes[feature_bit];
+			u32 offset;
+
 			/* ECX[1]: 64B alignment in compacted form */
 			if (compacted)
-				offset = (ecx & 0x2) ? ALIGN(ret, 64) : ret;
+				offset = (xs->ecx & 0x2) ? ALIGN(ret, 64) : ret;
 			else
-				offset = ebx;
-			ret = max(ret, offset + eax);
+				offset = xs->ebx;
+			ret = max(ret, offset + xs->eax);
 		}
 
 		xstate_bv >>= 1;
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index c8dc66eddefd..f16a7b2c2adc 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -31,6 +31,7 @@ int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
 bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
 	       u32 *ecx, u32 *edx, bool exact_only);
 
+void __init kvm_init_xstate_sizes(void);
 u32 xstate_required_size(u64 xstate_bv, bool compacted);
 
 int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2e713480933a..c8160baf3838 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -13997,6 +13997,8 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_rmp_fault);
 
 static int __init kvm_x86_init(void)
 {
+	kvm_init_xstate_sizes();
+
 	kvm_mmu_x86_module_init();
 	mitigate_smt_rsb &= boot_cpu_has_bug(X86_BUG_SMT_RSB) && cpu_smt_possible();
 	return 0;
-- 
2.51.0

From 824927e88456331c7a999fdf5d9d27923b619590 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky
Date: Tue, 3 Dec 2024 14:37:15 +0200
Subject: [PATCH 02/16] ARC: build: Try to guess GCC variant of cross compiler
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

The ARC GCC compiler is packaged starting from Fedora 39 [1], and the
GCC variant of the cross-compile tools uses the prefix arc-linux-gnu-
rather than arc-linux-. This causes the CROSS_COMPILE variable to be
left unset.

This change allows builds without needing to supply a CROSS_COMPILE
argument when the distro package is used.
Before this change:

  $ make -j 128 ARCH=arc W=1 drivers/infiniband/hw/mlx4/
  gcc: warning: ‘-mcpu=’ is deprecated; use ‘-mtune=’ or ‘-march=’ instead
  gcc: error: unrecognized command-line option ‘-mmedium-calls’
  gcc: error: unrecognized command-line option ‘-mlock’
  gcc: error: unrecognized command-line option ‘-munaligned-access’

[1] https://packages.fedoraproject.org/pkgs/cross-gcc/gcc-arc-linux-gnu/index.html

Signed-off-by: Leon Romanovsky
Signed-off-by: Vineet Gupta
---
 arch/arc/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arc/Makefile b/arch/arc/Makefile
index 2390dd042e36..fb98478ed1ab 100644
--- a/arch/arc/Makefile
+++ b/arch/arc/Makefile
@@ -6,7 +6,7 @@
 KBUILD_DEFCONFIG := haps_hs_smp_defconfig
 
 ifeq ($(CROSS_COMPILE),)
-CROSS_COMPILE := $(call cc-cross-prefix, arc-linux- arceb-linux-)
+CROSS_COMPILE := $(call cc-cross-prefix, arc-linux- arceb-linux- arc-linux-gnu-)
 endif
 
 cflags-y += -fno-common -pipe -fno-builtin -mmedium-calls -D__linux__
-- 
2.51.0

From c00d738e1673ab801e1577e4e3c780ccf88b1a5b Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi
Date: Fri, 13 Dec 2024 14:19:27 -0800
Subject: [PATCH 03/16] bpf: Revert "bpf: Mark raw_tp arguments with
 PTR_MAYBE_NULL"

This patch reverts commit cb4158ce8ec8 ("bpf: Mark raw_tp arguments with
PTR_MAYBE_NULL"). The patch was well-intended and meant as a stop-gap
fixing branch prediction when the pointer may actually be NULL at
runtime. Eventually, it was supposed to be replaced by an automated
script or compiler pass detecting possibly NULL arguments and marking
them accordingly.

However, it caused two main issues observed for production programs and
failed to preserve backwards compatibility. First, programs relied on
the verifier not exploring the == NULL branch when the pointer is not
NULL, so they started failing with a 'dereference of scalar' error.
Next, allowing raw_tp arguments to be modified surfaced the verifier
warning against a non-zero reg->off when PTR_MAYBE_NULL is set.

More information, context, and discussion on both problems are
available in [0]. Overall, this approach had several shortcomings, the
fixes would further complicate the verifier's logic, and the entire
masking scheme would have to be removed eventually anyway. Hence,
revert the patch in preparation for a better fix that avoids these
issues and will replace this commit.
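To illustrate the first regression class, consider a hedged sketch (the
attach point and field are stand-ins, not taken from the linked reports;
it assumes a vmlinux.h-based BPF build environment):

	#include "vmlinux.h"
	#include <bpf/bpf_helpers.h>
	#include <bpf/bpf_tracing.h>

	char _license[] SEC("license") = "GPL";

	SEC("tp_btf/kfree_skb")		/* illustrative attach point */
	int BPF_PROG(watch_drops, struct sk_buff *skb)
	{
		int i = 0;

		i += skb->len;	/* deref written before the check in source ... */
		/* ... but the compiler may emit the check first, leaving the
		 * deref on a path where the verifier, once skb carries
		 * PTR_MAYBE_NULL, has concluded skb == NULL and downgraded
		 * the register to a scalar.
		 */
		if (!skb)
			i += 2;
		return i;
	}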
[0]: https://lore.kernel.org/bpf/20241206161053.809580-1-memxor@gmail.com Reported-by: Manu Bretelle Fixes: cb4158ce8ec8 ("bpf: Mark raw_tp arguments with PTR_MAYBE_NULL") Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20241213221929.3495062-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 6 -- kernel/bpf/btf.c | 5 +- kernel/bpf/verifier.c | 79 ++----------------- .../bpf/progs/test_tp_btf_nullable.c | 6 +- 4 files changed, 9 insertions(+), 87 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 805040813f5d..6e63dd3443b9 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -3514,10 +3514,4 @@ static inline bool bpf_is_subprog(const struct bpf_prog *prog) return prog->aux->func_idx != 0; } -static inline bool bpf_prog_is_raw_tp(const struct bpf_prog *prog) -{ - return prog->type == BPF_PROG_TYPE_TRACING && - prog->expected_attach_type == BPF_TRACE_RAW_TP; -} - #endif /* _LINUX_BPF_H */ diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index a63a03582f02..c4aa304028ce 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6594,10 +6594,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, if (prog_args_trusted(prog)) info->reg_type |= PTR_TRUSTED; - /* Raw tracepoint arguments always get marked as maybe NULL */ - if (bpf_prog_is_raw_tp(prog)) - info->reg_type |= PTR_MAYBE_NULL; - else if (btf_param_match_suffix(btf, &args[arg], "__nullable")) + if (btf_param_match_suffix(btf, &args[arg], "__nullable")) info->reg_type |= PTR_MAYBE_NULL; if (tgt_prog) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5e541339b2f6..f7f892a52a37 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -420,25 +420,6 @@ static struct btf_record *reg_btf_record(const struct bpf_reg_state *reg) return rec; } -static bool mask_raw_tp_reg_cond(const struct bpf_verifier_env *env, struct bpf_reg_state *reg) { - return reg->type == (PTR_TO_BTF_ID | PTR_TRUSTED | PTR_MAYBE_NULL) && - bpf_prog_is_raw_tp(env->prog) && !reg->ref_obj_id; -} - -static bool mask_raw_tp_reg(const struct bpf_verifier_env *env, struct bpf_reg_state *reg) -{ - if (!mask_raw_tp_reg_cond(env, reg)) - return false; - reg->type &= ~PTR_MAYBE_NULL; - return true; -} - -static void unmask_raw_tp_reg(struct bpf_reg_state *reg, bool result) -{ - if (result) - reg->type |= PTR_MAYBE_NULL; -} - static bool subprog_is_global(const struct bpf_verifier_env *env, int subprog) { struct bpf_func_info_aux *aux = env->prog->aux->func_info_aux; @@ -6801,7 +6782,6 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, const char *field_name = NULL; enum bpf_type_flag flag = 0; u32 btf_id = 0; - bool mask; int ret; if (!env->allow_ptr_leaks) { @@ -6873,21 +6853,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, if (ret < 0) return ret; - /* For raw_tp progs, we allow dereference of PTR_MAYBE_NULL - * trusted PTR_TO_BTF_ID, these are the ones that are possibly - * arguments to the raw_tp. Since internal checks in for trusted - * reg in check_ptr_to_btf_access would consider PTR_MAYBE_NULL - * modifier as problematic, mask it out temporarily for the - * check. Don't apply this to pointers with ref_obj_id > 0, as - * those won't be raw_tp args. 
- * - * We may end up applying this relaxation to other trusted - * PTR_TO_BTF_ID with maybe null flag, since we cannot - * distinguish PTR_MAYBE_NULL tagged for arguments vs normal - * tagging, but that should expand allowed behavior, and not - * cause regression for existing behavior. - */ - mask = mask_raw_tp_reg(env, reg); + if (ret != PTR_TO_BTF_ID) { /* just mark; */ @@ -6948,13 +6914,8 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, clear_trusted_flags(&flag); } - if (atype == BPF_READ && value_regno >= 0) { + if (atype == BPF_READ && value_regno >= 0) mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id, flag); - /* We've assigned a new type to regno, so don't undo masking. */ - if (regno == value_regno) - mask = false; - } - unmask_raw_tp_reg(reg, mask); return 0; } @@ -7329,7 +7290,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); } else if (base_type(reg->type) == PTR_TO_BTF_ID && - (mask_raw_tp_reg_cond(env, reg) || !type_may_be_null(reg->type))) { + !type_may_be_null(reg->type)) { err = check_ptr_to_btf_access(env, regs, regno, off, size, t, value_regno); } else if (reg->type == CONST_PTR_TO_MAP) { @@ -9032,7 +8993,6 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, enum bpf_reg_type type = reg->type; u32 *arg_btf_id = NULL; int err = 0; - bool mask; if (arg_type == ARG_DONTCARE) return 0; @@ -9073,11 +9033,11 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, base_type(arg_type) == ARG_PTR_TO_SPIN_LOCK) arg_btf_id = fn->arg_btf_id[arg]; - mask = mask_raw_tp_reg(env, reg); err = check_reg_type(env, regno, arg_type, arg_btf_id, meta); + if (err) + return err; - err = err ?: check_func_arg_reg_off(env, reg, regno, arg_type); - unmask_raw_tp_reg(reg, mask); + err = check_func_arg_reg_off(env, reg, regno, arg_type); if (err) return err; @@ -9872,17 +9832,14 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, return ret; } else if (base_type(arg->arg_type) == ARG_PTR_TO_BTF_ID) { struct bpf_call_arg_meta meta; - bool mask; int err; if (register_is_null(reg) && type_may_be_null(arg->arg_type)) continue; memset(&meta, 0, sizeof(meta)); /* leave func_id as zero */ - mask = mask_raw_tp_reg(env, reg); err = check_reg_type(env, regno, arg->arg_type, &arg->btf_id, &meta); err = err ?: check_func_arg_reg_off(env, reg, regno, arg->arg_type); - unmask_raw_tp_reg(reg, mask); if (err) return err; } else { @@ -12205,7 +12162,6 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ enum bpf_arg_type arg_type = ARG_DONTCARE; u32 regno = i + 1, ref_id, type_size; bool is_ret_buf_sz = false; - bool mask = false; int kf_arg_type; t = btf_type_skip_modifiers(btf, args[i].type, NULL); @@ -12264,15 +12220,12 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return -EINVAL; } - mask = mask_raw_tp_reg(env, reg); if ((is_kfunc_trusted_args(meta) || is_kfunc_rcu(meta)) && (register_is_null(reg) || type_may_be_null(reg->type)) && !is_kfunc_arg_nullable(meta->btf, &args[i])) { verbose(env, "Possibly NULL pointer passed to trusted arg%d\n", i); - unmask_raw_tp_reg(reg, mask); return -EACCES; } - unmask_raw_tp_reg(reg, mask); if (reg->ref_obj_id) { if (is_kfunc_release(meta) && meta->ref_obj_id) { @@ -12330,24 +12283,16 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if 
(!is_kfunc_trusted_args(meta) && !is_kfunc_rcu(meta)) break; - /* Allow passing maybe NULL raw_tp arguments to - * kfuncs for compatibility. Don't apply this to - * arguments with ref_obj_id > 0. - */ - mask = mask_raw_tp_reg(env, reg); if (!is_trusted_reg(reg)) { if (!is_kfunc_rcu(meta)) { verbose(env, "R%d must be referenced or trusted\n", regno); - unmask_raw_tp_reg(reg, mask); return -EINVAL; } if (!is_rcu_reg(reg)) { verbose(env, "R%d must be a rcu pointer\n", regno); - unmask_raw_tp_reg(reg, mask); return -EINVAL; } } - unmask_raw_tp_reg(reg, mask); fallthrough; case KF_ARG_PTR_TO_CTX: case KF_ARG_PTR_TO_DYNPTR: @@ -12370,9 +12315,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (is_kfunc_release(meta) && reg->ref_obj_id) arg_type |= OBJ_RELEASE; - mask = mask_raw_tp_reg(env, reg); ret = check_func_arg_reg_off(env, reg, regno, arg_type); - unmask_raw_tp_reg(reg, mask); if (ret < 0) return ret; @@ -12549,7 +12492,6 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ ref_tname = btf_name_by_offset(btf, ref_t->name_off); fallthrough; case KF_ARG_PTR_TO_BTF_ID: - mask = mask_raw_tp_reg(env, reg); /* Only base_type is checked, further checks are done here */ if ((base_type(reg->type) != PTR_TO_BTF_ID || (bpf_type_has_unsafe_modifiers(reg->type) && !is_rcu_reg(reg))) && @@ -12558,11 +12500,9 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ verbose(env, "expected %s or socket\n", reg_type_str(env, base_type(reg->type) | (type_flag(reg->type) & BPF_REG_TRUSTED_MODIFIERS))); - unmask_raw_tp_reg(reg, mask); return -EINVAL; } ret = process_kf_arg_ptr_to_btf_id(env, reg, ref_t, ref_tname, ref_id, meta, i); - unmask_raw_tp_reg(reg, mask); if (ret < 0) return ret; break; @@ -13535,7 +13475,7 @@ static int sanitize_check_bounds(struct bpf_verifier_env *env, */ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, struct bpf_insn *insn, - struct bpf_reg_state *ptr_reg, + const struct bpf_reg_state *ptr_reg, const struct bpf_reg_state *off_reg) { struct bpf_verifier_state *vstate = env->cur_state; @@ -13549,7 +13489,6 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, struct bpf_sanitize_info info = {}; u8 opcode = BPF_OP(insn->code); u32 dst = insn->dst_reg; - bool mask; int ret; dst_reg = ®s[dst]; @@ -13576,14 +13515,11 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, return -EACCES; } - mask = mask_raw_tp_reg(env, ptr_reg); if (ptr_reg->type & PTR_MAYBE_NULL) { verbose(env, "R%d pointer arithmetic on %s prohibited, null-check it first\n", dst, reg_type_str(env, ptr_reg->type)); - unmask_raw_tp_reg(ptr_reg, mask); return -EACCES; } - unmask_raw_tp_reg(ptr_reg, mask); switch (base_type(ptr_reg->type)) { case PTR_TO_CTX: @@ -20126,7 +20062,6 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) * for this case. 
*/ case PTR_TO_BTF_ID | MEM_ALLOC | PTR_UNTRUSTED: - case PTR_TO_BTF_ID | PTR_TRUSTED | PTR_MAYBE_NULL: if (type == BPF_READ) { if (BPF_MODE(insn->code) == BPF_MEM) insn->code = BPF_LDX | BPF_PROBE_MEM | diff --git a/tools/testing/selftests/bpf/progs/test_tp_btf_nullable.c b/tools/testing/selftests/bpf/progs/test_tp_btf_nullable.c index 5aaf2b065f86..bba3e37f749b 100644 --- a/tools/testing/selftests/bpf/progs/test_tp_btf_nullable.c +++ b/tools/testing/selftests/bpf/progs/test_tp_btf_nullable.c @@ -7,11 +7,7 @@ #include "bpf_misc.h" SEC("tp_btf/bpf_testmod_test_nullable_bare") -/* This used to be a failure test, but raw_tp nullable arguments can now - * directly be dereferenced, whether they have nullable annotation or not, - * and don't need to be explicitly checked. - */ -__success +__failure __msg("R1 invalid mem access 'trusted_ptr_or_null_'") int BPF_PROG(handle_tp_btf_nullable_bare1, struct bpf_testmod_test_read_ctx *nullable_ctx) { return nullable_ctx->len; -- 2.51.0 From 838a10bd2ebfe11a60dd67687533a7cfc220cc86 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 13 Dec 2024 14:19:28 -0800 Subject: [PATCH 04/16] bpf: Augment raw_tp arguments with PTR_MAYBE_NULL Arguments to a raw tracepoint are tagged as trusted, which carries the semantics that the pointer will be non-NULL. However, in certain cases, a raw tracepoint argument may end up being NULL. More context about this issue is available in [0]. Thus, there is a discrepancy between the reality, that raw_tp arguments can actually be NULL, and the verifier's knowledge, that they are never NULL, causing explicit NULL check branch to be dead code eliminated. A previous attempt [1], i.e. the second fixed commit, was made to simulate symbolic execution as if in most accesses, the argument is a non-NULL raw_tp, except for conditional jumps. This tried to suppress branch prediction while preserving compatibility, but surfaced issues with production programs that were difficult to solve without increasing verifier complexity. A more complete discussion of issues and fixes is available at [2]. Fix this by maintaining an explicit list of tracepoints where the arguments are known to be NULL, and mark the positional arguments as PTR_MAYBE_NULL. Additionally, capture the tracepoints where arguments are known to be ERR_PTR, and mark these arguments as scalar values to prevent potential dereference. Each hex digit is used to encode NULL-ness (0x1) or ERR_PTR-ness (0x2), shifted by the zero-indexed argument number x 4. This can be represented as follows: 1st arg: 0x1 2nd arg: 0x10 3rd arg: 0x100 ... and so on (likewise for ERR_PTR case). In the future, an automated pass will be used to produce such a list, or insert __nullable annotations automatically for tracepoints. Each compilation unit will be analyzed and results will be collated to find whether a tracepoint pointer is definitely not null, maybe null, or an unknown state where verifier conservatively marks it PTR_MAYBE_NULL. A proof of concept of this tool from Eduard is available at [3]. Note that in case we don't find a specification in the raw_tp_null_args array and the tracepoint belongs to a kernel module, we will conservatively mark the arguments as PTR_MAYBE_NULL. This is because unlike for in-tree modules, out-of-tree module tracepoints may pass NULL freely to the tracepoint. We don't protect against such tracepoints passing ERR_PTR (which is uncommon anyway), lest we mark all such arguments as SCALAR_VALUE. 
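For reference, the hex-digit encoding above can be decoded mechanically;
a minimal sketch (the helper names are mine, not part of the patch):

	#include <assert.h>
	#include <stdbool.h>
	#include <stdint.h>

	/* Each argument owns one hex digit of the mask. */
	static bool arg_may_be_null(uint64_t mask, int arg)
	{
		return mask & (0x1ull << (arg * 4));	/* low bit of the digit */
	}

	static bool arg_may_be_err_ptr(uint64_t mask, int arg)
	{
		return mask & (0x2ull << (arg * 4));	/* second bit of the digit */
	}

	int main(void)
	{
		/* Examples from the table below: "tcp_send_reset" (0x11) has
		 * NULL-able args 0 and 1; "cachefiles_lookup" (0x1 | 0x200)
		 * has a NULL-able arg 0 and an ERR_PTR-able arg 2.
		 */
		assert(arg_may_be_null(0x11, 0) && arg_may_be_null(0x11, 1));
		assert(arg_may_be_err_ptr(0x1 | 0x200, 2));
		return 0;
	}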
While we are at it, let's adjust the test raw_tp_null to not dereference skb->mark, as that won't be allowed anymore, and make it more robust by using inline assembly to test the dead code elimination behavior, which should still stay the same. [0]: https://lore.kernel.org/bpf/ZrCZS6nisraEqehw@jlelli-thinkpadt14gen4.remote.csb [1]: https://lore.kernel.org/all/20241104171959.2938862-1-memxor@gmail.com [2]: https://lore.kernel.org/bpf/20241206161053.809580-1-memxor@gmail.com [3]: https://github.com/eddyz87/llvm-project/tree/nullness-for-tracepoint-params Reported-by: Juri Lelli # original bug Reported-by: Manu Bretelle # bugs in masking fix Fixes: 3f00c5239344 ("bpf: Allow trusted pointers to be passed to KF_TRUSTED_ARGS kfuncs") Fixes: cb4158ce8ec8 ("bpf: Mark raw_tp arguments with PTR_MAYBE_NULL") Reviewed-by: Eduard Zingerman Co-developed-by: Jiri Olsa Signed-off-by: Jiri Olsa Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20241213221929.3495062-3-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 138 ++++++++++++++++++ .../testing/selftests/bpf/progs/raw_tp_null.c | 19 ++- 2 files changed, 147 insertions(+), 10 deletions(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index c4aa304028ce..e5a5f023cedd 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6439,6 +6439,101 @@ int btf_ctx_arg_offset(const struct btf *btf, const struct btf_type *func_proto, return off; } +struct bpf_raw_tp_null_args { + const char *func; + u64 mask; +}; + +static const struct bpf_raw_tp_null_args raw_tp_null_args[] = { + /* sched */ + { "sched_pi_setprio", 0x10 }, + /* ... from sched_numa_pair_template event class */ + { "sched_stick_numa", 0x100 }, + { "sched_swap_numa", 0x100 }, + /* afs */ + { "afs_make_fs_call", 0x10 }, + { "afs_make_fs_calli", 0x10 }, + { "afs_make_fs_call1", 0x10 }, + { "afs_make_fs_call2", 0x10 }, + { "afs_protocol_error", 0x1 }, + { "afs_flock_ev", 0x10 }, + /* cachefiles */ + { "cachefiles_lookup", 0x1 | 0x200 }, + { "cachefiles_unlink", 0x1 }, + { "cachefiles_rename", 0x1 }, + { "cachefiles_prep_read", 0x1 }, + { "cachefiles_mark_active", 0x1 }, + { "cachefiles_mark_failed", 0x1 }, + { "cachefiles_mark_inactive", 0x1 }, + { "cachefiles_vfs_error", 0x1 }, + { "cachefiles_io_error", 0x1 }, + { "cachefiles_ondemand_open", 0x1 }, + { "cachefiles_ondemand_copen", 0x1 }, + { "cachefiles_ondemand_close", 0x1 }, + { "cachefiles_ondemand_read", 0x1 }, + { "cachefiles_ondemand_cread", 0x1 }, + { "cachefiles_ondemand_fd_write", 0x1 }, + { "cachefiles_ondemand_fd_release", 0x1 }, + /* ext4, from ext4__mballoc event class */ + { "ext4_mballoc_discard", 0x10 }, + { "ext4_mballoc_free", 0x10 }, + /* fib */ + { "fib_table_lookup", 0x100 }, + /* filelock */ + /* ... from filelock_lock event class */ + { "posix_lock_inode", 0x10 }, + { "fcntl_setlk", 0x10 }, + { "locks_remove_posix", 0x10 }, + { "flock_lock_inode", 0x10 }, + /* ... from filelock_lease event class */ + { "break_lease_noblock", 0x10 }, + { "break_lease_block", 0x10 }, + { "break_lease_unblock", 0x10 }, + { "generic_delete_lease", 0x10 }, + { "time_out_leases", 0x10 }, + /* host1x */ + { "host1x_cdma_push_gather", 0x10000 }, + /* huge_memory */ + { "mm_khugepaged_scan_pmd", 0x10 }, + { "mm_collapse_huge_page_isolate", 0x1 }, + { "mm_khugepaged_scan_file", 0x10 }, + { "mm_khugepaged_collapse_file", 0x10 }, + /* kmem */ + { "mm_page_alloc", 0x1 }, + { "mm_page_pcpu_drain", 0x1 }, + /* ..
from mm_page event class */ + { "mm_page_alloc_zone_locked", 0x1 }, + /* netfs */ + { "netfs_failure", 0x10 }, + /* power */ + { "device_pm_callback_start", 0x10 }, + /* qdisc */ + { "qdisc_dequeue", 0x1000 }, + /* rxrpc */ + { "rxrpc_recvdata", 0x1 }, + { "rxrpc_resend", 0x10 }, + /* sunrpc */ + { "xs_stream_read_data", 0x1 }, + /* ... from xprt_cong_event event class */ + { "xprt_reserve_cong", 0x10 }, + { "xprt_release_cong", 0x10 }, + { "xprt_get_cong", 0x10 }, + { "xprt_put_cong", 0x10 }, + /* tcp */ + { "tcp_send_reset", 0x11 }, + /* tegra_apb_dma */ + { "tegra_dma_tx_status", 0x100 }, + /* timer_migration */ + { "tmigr_update_events", 0x1 }, + /* writeback, from writeback_folio_template event class */ + { "writeback_dirty_folio", 0x10 }, + { "folio_wait_writeback", 0x10 }, + /* rdma */ + { "mr_integ_alloc", 0x2000 }, + /* bpf_testmod */ + { "bpf_testmod_test_read", 0x0 }, +}; + bool btf_ctx_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) @@ -6449,6 +6544,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, const char *tname = prog->aux->attach_func_name; struct bpf_verifier_log *log = info->log; const struct btf_param *args; + bool ptr_err_raw_tp = false; const char *tag_value; u32 nr_args, arg; int i, ret; @@ -6597,6 +6693,39 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, if (btf_param_match_suffix(btf, &args[arg], "__nullable")) info->reg_type |= PTR_MAYBE_NULL; + if (prog->expected_attach_type == BPF_TRACE_RAW_TP) { + struct btf *btf = prog->aux->attach_btf; + const struct btf_type *t; + const char *tname; + + /* BTF lookups cannot fail, return false on error */ + t = btf_type_by_id(btf, prog->aux->attach_btf_id); + if (!t) + return false; + tname = btf_name_by_offset(btf, t->name_off); + if (!tname) + return false; + /* Checked by bpf_check_attach_target */ + tname += sizeof("btf_trace_") - 1; + for (i = 0; i < ARRAY_SIZE(raw_tp_null_args); i++) { + /* Is this a func with potential NULL args? */ + if (strcmp(tname, raw_tp_null_args[i].func)) + continue; + if (raw_tp_null_args[i].mask & (0x1 << (arg * 4))) + info->reg_type |= PTR_MAYBE_NULL; + /* Is the current arg IS_ERR? */ + if (raw_tp_null_args[i].mask & (0x2 << (arg * 4))) + ptr_err_raw_tp = true; + break; + } + /* If we don't know NULL-ness specification and the tracepoint + * is coming from a loadable module, be conservative and mark + * argument as PTR_MAYBE_NULL. + */ + if (i == ARRAY_SIZE(raw_tp_null_args) && btf_is_module(btf)) + info->reg_type |= PTR_MAYBE_NULL; + } + if (tgt_prog) { enum bpf_prog_type tgt_type; @@ -6641,6 +6770,15 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, bpf_log(log, "func '%s' arg%d has btf_id %d type %s '%s'\n", tname, arg, info->btf_id, btf_type_str(t), __btf_name_by_offset(btf, t->name_off)); + + /* Perform all checks on the validity of type for this argument, but if + * we know it can be IS_ERR at runtime, scrub pointer type and mark as + * scalar. 
+ */ + if (ptr_err_raw_tp) { + bpf_log(log, "marking pointer arg%d as scalar as it may encode error", arg); + info->reg_type = SCALAR_VALUE; + } return true; } EXPORT_SYMBOL_GPL(btf_ctx_access); diff --git a/tools/testing/selftests/bpf/progs/raw_tp_null.c b/tools/testing/selftests/bpf/progs/raw_tp_null.c index 457f34c151e3..5927054b6dd9 100644 --- a/tools/testing/selftests/bpf/progs/raw_tp_null.c +++ b/tools/testing/selftests/bpf/progs/raw_tp_null.c @@ -3,6 +3,7 @@ #include #include +#include "bpf_misc.h" char _license[] SEC("license") = "GPL"; @@ -17,16 +18,14 @@ int BPF_PROG(test_raw_tp_null, struct sk_buff *skb) if (task->pid != tid) return 0; - i = i + skb->mark + 1; - /* The compiler may move the NULL check before this deref, which causes - * the load to fail as deref of scalar. Prevent that by using a barrier. + /* If dead code elimination kicks in, the increment +=2 will be + * removed. For raw_tp programs attaching to tracepoints in kernel + * modules, we mark input arguments as PTR_MAYBE_NULL, so branch + * prediction should never kick in. */ - barrier(); - /* If dead code elimination kicks in, the increment below will - * be removed. For raw_tp programs, we mark input arguments as - * PTR_MAYBE_NULL, so branch prediction should never kick in. - */ - if (!skb) - i += 2; + asm volatile ("%[i] += 1; if %[ctx] != 0 goto +1; %[i] += 2;" + : [i]"+r"(i) + : [ctx]"r"(skb) + : "memory"); return 0; } -- 2.51.0 From 0da1955b5bd2af3a1c3d13916df06e34ffa6df3d Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 13 Dec 2024 14:19:29 -0800 Subject: [PATCH 05/16] selftests/bpf: Add tests for raw_tp NULL args Add tests to ensure that arguments are correctly marked based on their specified positions, and whether they get marked correctly as maybe null. For modules, all tracepoint parameters should be marked PTR_MAYBE_NULL by default. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20241213221929.3495062-4-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/raw_tp_null.c | 3 +++ .../selftests/bpf/progs/raw_tp_null_fail.c | 24 +++++++++++++++++++ 2 files changed, 27 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/raw_tp_null_fail.c diff --git a/tools/testing/selftests/bpf/prog_tests/raw_tp_null.c b/tools/testing/selftests/bpf/prog_tests/raw_tp_null.c index 6fa19449297e..43676a9922dc 100644 --- a/tools/testing/selftests/bpf/prog_tests/raw_tp_null.c +++ b/tools/testing/selftests/bpf/prog_tests/raw_tp_null.c @@ -3,11 +3,14 @@ #include #include "raw_tp_null.skel.h" +#include "raw_tp_null_fail.skel.h" void test_raw_tp_null(void) { struct raw_tp_null *skel; + RUN_TESTS(raw_tp_null_fail); + skel = raw_tp_null__open_and_load(); if (!ASSERT_OK_PTR(skel, "raw_tp_null__open_and_load")) return; diff --git a/tools/testing/selftests/bpf/progs/raw_tp_null_fail.c b/tools/testing/selftests/bpf/progs/raw_tp_null_fail.c new file mode 100644 index 000000000000..38d669957bf1 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/raw_tp_null_fail.c @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
*/ + +#include +#include +#include "bpf_misc.h" + +char _license[] SEC("license") = "GPL"; + +/* Ensure module parameter has PTR_MAYBE_NULL */ +SEC("tp_btf/bpf_testmod_test_raw_tp_null") +__failure __msg("R1 invalid mem access 'trusted_ptr_or_null_'") +int test_raw_tp_null_bpf_testmod_test_raw_tp_null_arg_1(void *ctx) { + asm volatile("r1 = *(u64 *)(r1 +0); r1 = *(u64 *)(r1 +0);" ::: __clobber_all); + return 0; +} + +/* Check NULL marking */ +SEC("tp_btf/sched_pi_setprio") +__failure __msg("R1 invalid mem access 'trusted_ptr_or_null_'") +int test_raw_tp_null_sched_pi_setprio_arg_2(void *ctx) { + asm volatile("r1 = *(u64 *)(r1 +8); r1 = *(u64 *)(r1 +0);" ::: __clobber_all); + return 0; +} -- 2.51.0 From c83508da5620ef89232cb614fb9e02dfdfef2b8f Mon Sep 17 00:00:00 2001 From: Priya Bala Govindasamy Date: Fri, 13 Dec 2024 17:58:58 -0800 Subject: [PATCH 06/16] bpf: Avoid deadlock caused by nested kprobe and fentry bpf programs BPF program types like kprobe and fentry can cause deadlocks in certain situations. If a function takes a lock and one of these bpf programs is hooked to some point in the function's critical section, and if the bpf program tries to call the same function and take the same lock it will lead to deadlock. These situations have been reported in the following bug reports. In percpu_freelist - Link: https://lore.kernel.org/bpf/CAADnVQLAHwsa+2C6j9+UC6ScrDaN9Fjqv1WjB1pP9AzJLhKuLQ@mail.gmail.com/T/ Link: https://lore.kernel.org/bpf/CAPPBnEYm+9zduStsZaDnq93q1jPLqO-PiKX9jy0MuL8LCXmCrQ@mail.gmail.com/T/ In bpf_lru_list - Link: https://lore.kernel.org/bpf/CAPPBnEajj+DMfiR_WRWU5=6A7KKULdB5Rob_NJopFLWF+i9gCA@mail.gmail.com/T/ Link: https://lore.kernel.org/bpf/CAPPBnEZQDVN6VqnQXvVqGoB+ukOtHGZ9b9U0OLJJYvRoSsMY_g@mail.gmail.com/T/ Link: https://lore.kernel.org/bpf/CAPPBnEaCB1rFAYU7Wf8UxqcqOWKmRPU1Nuzk3_oLk6qXR7LBOA@mail.gmail.com/T/ Similar bugs have been reported by syzbot. In queue_stack_maps - Link: https://lore.kernel.org/lkml/0000000000004c3fc90615f37756@google.com/ Link: https://lore.kernel.org/all/20240418230932.2689-1-hdanton@sina.com/T/ In lpm_trie - Link: https://lore.kernel.org/linux-kernel/00000000000035168a061a47fa38@google.com/T/ In ringbuf - Link: https://lore.kernel.org/bpf/20240313121345.2292-1-hdanton@sina.com/T/ Prevent kprobe and fentry bpf programs from attaching to these critical sections by removing CC_FLAGS_FTRACE for percpu_freelist.o, bpf_lru_list.o, queue_stack_maps.o, lpm_trie.o, ringbuf.o files. The bugs reported by syzbot are due to tracepoint bpf programs being called in the critical sections. This patch does not aim to fix deadlocks caused by tracepoint programs. However, it does prevent deadlocks from occurring in similar situations due to kprobe and fentry programs. 
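The reported deadlocks share one shape, sketched below as a self-contained
user-space analogy (illustrative only: a pthread spinlock stands in for the
kernel's raw spinlock, and a plain callback stands in for the attached
kprobe/fentry program):

	#include <pthread.h>
	#include <stdio.h>

	static pthread_spinlock_t lock;
	static void probe_hook(void);	/* stands in for a kprobe/fentry prog */

	static void push(int from_hook)
	{
		pthread_spin_lock(&lock);	/* 2nd acquisition never returns */
		if (!from_hook)
			probe_hook();	/* "probe" fires inside the critical section */
		pthread_spin_unlock(&lock);
	}

	static void probe_hook(void)
	{
		/* Like a BPF program calling bpf_map_update_elem(), which needs
		 * the same freelist lock it interrupted: AA deadlock on one CPU.
		 */
		push(1);
	}

	int main(void)
	{
		pthread_spin_init(&lock, PTHREAD_PROCESS_PRIVATE);
		push(0);		/* deadlocks: lock taken twice on one thread */
		printf("never reached\n");
		return 0;
	}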
Signed-off-by: Priya Bala Govindasamy Link: https://lore.kernel.org/r/CAPPBnEZpjGnsuA26Mf9kYibSaGLm=oF6=12L21X1GEQdqjLnzQ@mail.gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/Makefile | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 9762bdddf1de..410028633621 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -53,3 +53,9 @@ obj-$(CONFIG_BPF_SYSCALL) += relo_core.o obj-$(CONFIG_BPF_SYSCALL) += btf_iter.o obj-$(CONFIG_BPF_SYSCALL) += btf_relocate.o obj-$(CONFIG_BPF_SYSCALL) += kmem_cache_iter.o + +CFLAGS_REMOVE_percpu_freelist.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_bpf_lru_list.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_queue_stack_maps.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_lpm_trie.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_ringbuf.o = $(CC_FLAGS_FTRACE) -- 2.51.0 From 78d4f34e2115b517bcbfe7ec0d018bbbb6f9b0b8 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 15 Dec 2024 15:58:23 -0800 Subject: [PATCH 07/16] Linux 6.13-rc3 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 64c594bd7ad0..e5b8a8832c0c 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 13 SUBLEVEL = 0 -EXTRAVERSION = -rc2 +EXTRAVERSION = -rc3 NAME = Baby Opossum Posse # *DOCUMENTATION* -- 2.51.0 From 2589dbd72797a4163dd998b05c4663ff98bd0771 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 20 Dec 2024 11:33:05 +0000 Subject: [PATCH 08/16] KVM: arm64: Consolidate allowed and restricted VM feature checks The definitions for features allowed and allowed with restrictions for protected guests, which are based on feature registers, were defined and checked for separately, even though they are handled in the same way. This could result in missing checks for certain features, e.g., pointer authentication, causing traps for allowed features. Consolidate the definitions into one. Use that new definition to construct the guest view of the feature registers for consistency. Signed-off-by: Fuad Tabba Link: https://lore.kernel.org/r/20241216105057.579031-2-tabba@google.com Signed-off-by: Marc Zyngier --- .../arm64/kvm/hyp/include/nvhe/fixed_config.h | 55 +++++++------------ arch/arm64/kvm/hyp/nvhe/pkvm.c | 8 +-- arch/arm64/kvm/hyp/nvhe/sys_regs.c | 6 +- 3 files changed, 26 insertions(+), 43 deletions(-) diff --git a/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h b/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h index f957890c7e38..d1e59b88ff66 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h +++ b/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h @@ -14,11 +14,8 @@ * guest virtual machines, depending on the mode KVM is running in and on the * type of guest that is running. * - * The ALLOW masks represent a bitmask of feature fields that are allowed - * without any restrictions as long as they are supported by the system. - * - * The RESTRICT_UNSIGNED masks, if present, represent unsigned fields for - * features that are restricted to support at most the specified feature. + * Each field in the masks represents the highest supported *unsigned* value for + * the feature, if supported by the system. * * If a feature field is not present in either, than it is not supported. 
* @@ -34,16 +31,7 @@ * - Floating-point and Advanced SIMD * - Data Independent Timing * - Spectre/Meltdown Mitigation - */ -#define PVM_ID_AA64PFR0_ALLOW (\ - ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_FP) | \ - ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AdvSIMD) | \ - ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_DIT) | \ - ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2) | \ - ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3) \ - ) - -/* + * * Restrict to the following *unsigned* features for protected VMs: * - AArch64 guests only (no support for AArch32 guests): * AArch32 adds complexity in trap handling, emulation, condition codes, @@ -51,7 +39,12 @@ * - RAS (v1) * Supported by KVM */ -#define PVM_ID_AA64PFR0_RESTRICT_UNSIGNED (\ +#define PVM_ID_AA64PFR0_ALLOW (\ + ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_FP) | \ + ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AdvSIMD) | \ + ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_DIT) | \ + ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2) | \ + ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3) | \ SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL0, IMP) | \ SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL1, IMP) | \ SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL2, IMP) | \ @@ -77,20 +70,16 @@ * - Distinction between Secure and Non-secure Memory * - Mixed-endian at EL0 only * - Non-context synchronizing exception entry and exit + * + * Restrict to the following *unsigned* features for protected VMs: + * - 40-bit IPA + * - 16-bit ASID */ #define PVM_ID_AA64MMFR0_ALLOW (\ ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_BIGEND) | \ ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_SNSMEM) | \ ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_BIGENDEL0) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_EXS) \ - ) - -/* - * Restrict to the following *unsigned* features for protected VMs: - * - 40-bit IPA - * - 16-bit ASID - */ -#define PVM_ID_AA64MMFR0_RESTRICT_UNSIGNED (\ + ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_EXS) | \ FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_PARANGE), ID_AA64MMFR0_EL1_PARANGE_40) | \ FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_ASIDBITS), ID_AA64MMFR0_EL1_ASIDBITS_16) \ ) @@ -185,15 +174,6 @@ ) /* Restrict pointer authentication to the basic version. 
*/ -#define PVM_ID_AA64ISAR1_RESTRICT_UNSIGNED (\ - FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_APA), ID_AA64ISAR1_EL1_APA_PAuth) | \ - FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_API), ID_AA64ISAR1_EL1_API_PAuth) \ - ) - -#define PVM_ID_AA64ISAR2_RESTRICT_UNSIGNED (\ - FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_APA3), ID_AA64ISAR2_EL1_APA3_PAuth) \ - ) - #define PVM_ID_AA64ISAR1_ALLOW (\ ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_DPB) | \ ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_JSCVT) | \ @@ -206,13 +186,16 @@ ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_SPECRES) | \ ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_BF16) | \ ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_DGH) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_I8MM) \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_I8MM) | \ + FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_APA), ID_AA64ISAR1_EL1_APA_PAuth) | \ + FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_API), ID_AA64ISAR1_EL1_API_PAuth) \ ) #define PVM_ID_AA64ISAR2_ALLOW (\ ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_ATS1A)| \ ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_GPA3) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_MOPS) \ + ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_MOPS) | \ + FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_APA3), ID_AA64ISAR2_EL1_APA3_PAuth) \ ) u64 pvm_read_id_reg(const struct kvm_vcpu *vcpu, u32 id); diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 071993c16de8..42b4d6e3b15c 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -36,9 +36,9 @@ static void pvm_init_traps_aa64pfr0(struct kvm_vcpu *vcpu) /* Protected KVM does not support AArch32 guests. */ BUILD_BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL0), - PVM_ID_AA64PFR0_RESTRICT_UNSIGNED) != ID_AA64PFR0_EL1_EL0_IMP); + PVM_ID_AA64PFR0_ALLOW) != ID_AA64PFR0_EL1_EL0_IMP); BUILD_BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL1), - PVM_ID_AA64PFR0_RESTRICT_UNSIGNED) != ID_AA64PFR0_EL1_EL1_IMP); + PVM_ID_AA64PFR0_ALLOW) != ID_AA64PFR0_EL1_EL1_IMP); /* * Linux guests assume support for floating-point and Advanced SIMD. 
Do @@ -362,8 +362,8 @@ static void pkvm_init_features_from_host(struct pkvm_hyp_vm *hyp_vm, const struc if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_SVE), PVM_ID_AA64PFR0_ALLOW)) set_bit(KVM_ARM_VCPU_SVE, allowed_features); - if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_API), PVM_ID_AA64ISAR1_RESTRICT_UNSIGNED) && - FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_APA), PVM_ID_AA64ISAR1_RESTRICT_UNSIGNED)) + if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_API), PVM_ID_AA64ISAR1_ALLOW) && + FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_APA), PVM_ID_AA64ISAR1_ALLOW)) set_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, allowed_features); if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPI), PVM_ID_AA64ISAR1_ALLOW) && diff --git a/arch/arm64/kvm/hyp/nvhe/sys_regs.c b/arch/arm64/kvm/hyp/nvhe/sys_regs.c index 2860548d4250..59fb2f056177 100644 --- a/arch/arm64/kvm/hyp/nvhe/sys_regs.c +++ b/arch/arm64/kvm/hyp/nvhe/sys_regs.c @@ -89,7 +89,7 @@ static u64 get_pvm_id_aa64pfr0(const struct kvm_vcpu *vcpu) u64 allow_mask = PVM_ID_AA64PFR0_ALLOW; set_mask |= get_restricted_features_unsigned(id_aa64pfr0_el1_sys_val, - PVM_ID_AA64PFR0_RESTRICT_UNSIGNED); + PVM_ID_AA64PFR0_ALLOW); return (id_aa64pfr0_el1_sys_val & allow_mask) | set_mask; } @@ -189,7 +189,7 @@ static u64 get_pvm_id_aa64mmfr0(const struct kvm_vcpu *vcpu) u64 set_mask; set_mask = get_restricted_features_unsigned(id_aa64mmfr0_el1_sys_val, - PVM_ID_AA64MMFR0_RESTRICT_UNSIGNED); + PVM_ID_AA64MMFR0_ALLOW); return (id_aa64mmfr0_el1_sys_val & PVM_ID_AA64MMFR0_ALLOW) | set_mask; } @@ -276,7 +276,7 @@ static bool pvm_access_id_aarch32(struct kvm_vcpu *vcpu, * of AArch32 feature id registers. */ BUILD_BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL1), - PVM_ID_AA64PFR0_RESTRICT_UNSIGNED) > ID_AA64PFR0_EL1_EL1_IMP); + PVM_ID_AA64PFR0_ALLOW) > ID_AA64PFR0_EL1_EL1_IMP); return pvm_access_raz_wi(vcpu, p, r); } -- 2.51.0 From f50758260bfff393f2a800469b37c45a7ef50376 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Fri, 20 Dec 2024 11:33:05 +0000 Subject: [PATCH 09/16] KVM: arm64: Group setting traps for protected VMs by control register Group setting protected VM traps by control register rather than feature id register, since some trap values (e.g., PAuth), depend on more than one feature id register. Signed-off-by: Fuad Tabba Link: https://lore.kernel.org/r/20241216105057.579031-3-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/nvhe/pkvm.c | 317 +++++++++++++++------------------ 1 file changed, 144 insertions(+), 173 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 42b4d6e3b15c..3af5fca64f67 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -23,233 +23,204 @@ unsigned int kvm_arm_vmid_bits; unsigned int kvm_host_sve_max_vl; -/* - * Set trap register values based on features in ID_AA64PFR0. - */ -static void pvm_init_traps_aa64pfr0(struct kvm_vcpu *vcpu) +static void pkvm_vcpu_reset_hcr(struct kvm_vcpu *vcpu) { - const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64PFR0_EL1); - u64 hcr_set = HCR_RW; - u64 hcr_clear = 0; - u64 cptr_set = 0; - u64 cptr_clear = 0; - - /* Protected KVM does not support AArch32 guests. */ - BUILD_BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL0), - PVM_ID_AA64PFR0_ALLOW) != ID_AA64PFR0_EL1_EL0_IMP); - BUILD_BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL1), - PVM_ID_AA64PFR0_ALLOW) != ID_AA64PFR0_EL1_EL1_IMP); - - /* - * Linux guests assume support for floating-point and Advanced SIMD. 
Do - * not change the trapping behavior for these from the KVM default. - */ - BUILD_BUG_ON(!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_FP), - PVM_ID_AA64PFR0_ALLOW)); - BUILD_BUG_ON(!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AdvSIMD), - PVM_ID_AA64PFR0_ALLOW)); + vcpu->arch.hcr_el2 = HCR_GUEST_FLAGS; if (has_hvhe()) - hcr_set |= HCR_E2H; + vcpu->arch.hcr_el2 |= HCR_E2H; - /* Trap RAS unless all current versions are supported */ - if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_RAS), feature_ids) < - ID_AA64PFR0_EL1_RAS_V1P1) { - hcr_set |= HCR_TERR | HCR_TEA; - hcr_clear |= HCR_FIEN; + if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN)) { + /* route synchronous external abort exceptions to EL2 */ + vcpu->arch.hcr_el2 |= HCR_TEA; + /* trap error record accesses */ + vcpu->arch.hcr_el2 |= HCR_TERR; } - /* Trap AMU */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AMU), feature_ids)) { - hcr_clear |= HCR_AMVOFFEN; - cptr_set |= CPTR_EL2_TAM; - } + if (cpus_have_final_cap(ARM64_HAS_STAGE2_FWB)) + vcpu->arch.hcr_el2 |= HCR_FWB; - /* Trap SVE */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_SVE), feature_ids)) { - if (has_hvhe()) - cptr_clear |= CPACR_ELx_ZEN; - else - cptr_set |= CPTR_EL2_TZ; - } + if (cpus_have_final_cap(ARM64_HAS_EVT) && + !cpus_have_final_cap(ARM64_MISMATCHED_CACHE_TYPE)) + vcpu->arch.hcr_el2 |= HCR_TID4; + else + vcpu->arch.hcr_el2 |= HCR_TID2; - vcpu->arch.hcr_el2 |= hcr_set; - vcpu->arch.hcr_el2 &= ~hcr_clear; - vcpu->arch.cptr_el2 |= cptr_set; - vcpu->arch.cptr_el2 &= ~cptr_clear; + if (vcpu_has_ptrauth(vcpu)) + vcpu->arch.hcr_el2 |= (HCR_API | HCR_APK); } -/* - * Set trap register values based on features in ID_AA64PFR1. - */ -static void pvm_init_traps_aa64pfr1(struct kvm_vcpu *vcpu) +static void pvm_init_traps_hcr(struct kvm_vcpu *vcpu) { - const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64PFR1_EL1); - u64 hcr_set = 0; - u64 hcr_clear = 0; + const u64 id_aa64pfr0 = pvm_read_id_reg(vcpu, SYS_ID_AA64PFR0_EL1); + const u64 id_aa64pfr1 = pvm_read_id_reg(vcpu, SYS_ID_AA64PFR1_EL1); + const u64 id_aa64mmfr1 = pvm_read_id_reg(vcpu, SYS_ID_AA64MMFR1_EL1); + u64 val = vcpu->arch.hcr_el2; + + /* No support for AArch32. */ + val |= HCR_RW; + + if (has_hvhe()) + val |= HCR_E2H; + + /* + * Always trap: + * - Feature id registers: to control features exposed to guests + * - Implementation-defined features + */ + val |= HCR_TACR | HCR_TIDCP | HCR_TID3 | HCR_TID1; + + /* Trap RAS unless all current versions are supported */ + if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_RAS), id_aa64pfr0) < + ID_AA64PFR0_EL1_RAS_V1P1) { + val |= HCR_TERR | HCR_TEA; + val &= ~(HCR_FIEN); + } + + /* Trap AMU */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AMU), id_aa64pfr0)) + val &= ~(HCR_AMVOFFEN); /* Memory Tagging: Trap and Treat as Untagged if not supported. */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE), feature_ids)) { - hcr_set |= HCR_TID5; - hcr_clear |= HCR_DCT | HCR_ATA; + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE), id_aa64pfr1)) { + val |= HCR_TID5; + val &= ~(HCR_DCT | HCR_ATA); } - vcpu->arch.hcr_el2 |= hcr_set; - vcpu->arch.hcr_el2 &= ~hcr_clear; + /* Trap LOR */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_LO), id_aa64mmfr1)) + val |= HCR_TLOR; + + vcpu->arch.hcr_el2 = val; } -/* - * Set trap register values based on features in ID_AA64DFR0. 
- */ -static void pvm_init_traps_aa64dfr0(struct kvm_vcpu *vcpu) +static void pvm_init_traps_cptr(struct kvm_vcpu *vcpu) { - const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64DFR0_EL1); - u64 mdcr_set = 0; - u64 mdcr_clear = 0; - u64 cptr_set = 0; + const u64 id_aa64pfr0 = pvm_read_id_reg(vcpu, SYS_ID_AA64PFR0_EL1); + const u64 id_aa64pfr1 = pvm_read_id_reg(vcpu, SYS_ID_AA64PFR1_EL1); + const u64 id_aa64dfr0 = pvm_read_id_reg(vcpu, SYS_ID_AA64DFR0_EL1); + u64 val = vcpu->arch.cptr_el2; - /* Trap/constrain PMU */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), feature_ids)) { - mdcr_set |= MDCR_EL2_TPM | MDCR_EL2_TPMCR; - mdcr_clear |= MDCR_EL2_HPME | MDCR_EL2_MTPME | - MDCR_EL2_HPMN_MASK; + if (!has_hvhe()) { + val |= CPTR_NVHE_EL2_RES1; + val &= ~(CPTR_NVHE_EL2_RES0); } - /* Trap Debug */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DebugVer), feature_ids)) - mdcr_set |= MDCR_EL2_TDRA | MDCR_EL2_TDA | MDCR_EL2_TDE; - - /* Trap OS Double Lock */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DoubleLock), feature_ids)) - mdcr_set |= MDCR_EL2_TDOSA; + /* Trap AMU */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AMU), id_aa64pfr0)) + val |= CPTR_EL2_TAM; - /* Trap SPE */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMSVer), feature_ids)) { - mdcr_set |= MDCR_EL2_TPMS; - mdcr_clear |= MDCR_EL2_E2PB_MASK; + /* Trap SVE */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_SVE), id_aa64pfr0)) { + if (has_hvhe()) + val &= ~(CPACR_ELx_ZEN); + else + val |= CPTR_EL2_TZ; } - /* Trap Trace Filter */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_TraceFilt), feature_ids)) - mdcr_set |= MDCR_EL2_TTRF; + /* No SME support in KVM. */ + BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_SME), id_aa64pfr1)); + if (has_hvhe()) + val &= ~(CPACR_ELx_SMEN); + else + val |= CPTR_EL2_TSM; /* Trap Trace */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_TraceVer), feature_ids)) { + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_TraceVer), id_aa64dfr0)) { if (has_hvhe()) - cptr_set |= CPACR_EL1_TTA; + val |= CPACR_EL1_TTA; else - cptr_set |= CPTR_EL2_TTA; + val |= CPTR_EL2_TTA; } - /* Trap External Trace */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_ExtTrcBuff), feature_ids)) - mdcr_clear |= MDCR_EL2_E2TB_MASK; - - vcpu->arch.mdcr_el2 |= mdcr_set; - vcpu->arch.mdcr_el2 &= ~mdcr_clear; - vcpu->arch.cptr_el2 |= cptr_set; -} - -/* - * Set trap register values based on features in ID_AA64MMFR0. - */ -static void pvm_init_traps_aa64mmfr0(struct kvm_vcpu *vcpu) -{ - const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64MMFR0_EL1); - u64 mdcr_set = 0; - - /* Trap Debug Communications Channel registers */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_FGT), feature_ids)) - mdcr_set |= MDCR_EL2_TDCC; - - vcpu->arch.mdcr_el2 |= mdcr_set; + vcpu->arch.cptr_el2 = val; } -/* - * Set trap register values based on features in ID_AA64MMFR1. - */ -static void pvm_init_traps_aa64mmfr1(struct kvm_vcpu *vcpu) -{ - const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64MMFR1_EL1); - u64 hcr_set = 0; - - /* Trap LOR */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_LO), feature_ids)) - hcr_set |= HCR_TLOR; - - vcpu->arch.hcr_el2 |= hcr_set; -} - -/* - * Set baseline trap register values. 
- */ -static void pvm_init_trap_regs(struct kvm_vcpu *vcpu) +static void pvm_init_traps_mdcr(struct kvm_vcpu *vcpu) { - const u64 hcr_trap_feat_regs = HCR_TID3; - const u64 hcr_trap_impdef = HCR_TACR | HCR_TIDCP | HCR_TID1; - - /* - * Always trap: - * - Feature id registers: to control features exposed to guests - * - Implementation-defined features - */ - vcpu->arch.hcr_el2 |= hcr_trap_feat_regs | hcr_trap_impdef; + const u64 id_aa64dfr0 = pvm_read_id_reg(vcpu, SYS_ID_AA64DFR0_EL1); + const u64 id_aa64mmfr0 = pvm_read_id_reg(vcpu, SYS_ID_AA64MMFR0_EL1); + u64 val = vcpu->arch.mdcr_el2; - /* Clear res0 and set res1 bits to trap potential new features. */ - vcpu->arch.hcr_el2 &= ~(HCR_RES0); - vcpu->arch.mdcr_el2 &= ~(MDCR_EL2_RES0); - if (!has_hvhe()) { - vcpu->arch.cptr_el2 |= CPTR_NVHE_EL2_RES1; - vcpu->arch.cptr_el2 &= ~(CPTR_NVHE_EL2_RES0); + /* Trap/constrain PMU */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), id_aa64dfr0)) { + val |= MDCR_EL2_TPM | MDCR_EL2_TPMCR; + val &= ~(MDCR_EL2_HPME | MDCR_EL2_MTPME | MDCR_EL2_HPMN_MASK); } -} -static void pkvm_vcpu_reset_hcr(struct kvm_vcpu *vcpu) -{ - vcpu->arch.hcr_el2 = HCR_GUEST_FLAGS; + /* Trap Debug */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DebugVer), id_aa64dfr0)) + val |= MDCR_EL2_TDRA | MDCR_EL2_TDA; - if (has_hvhe()) - vcpu->arch.hcr_el2 |= HCR_E2H; + /* Trap OS Double Lock */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DoubleLock), id_aa64dfr0)) + val |= MDCR_EL2_TDOSA; - if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN)) { - /* route synchronous external abort exceptions to EL2 */ - vcpu->arch.hcr_el2 |= HCR_TEA; - /* trap error record accesses */ - vcpu->arch.hcr_el2 |= HCR_TERR; + /* Trap SPE */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMSVer), id_aa64dfr0)) { + val |= MDCR_EL2_TPMS; + val &= ~MDCR_EL2_E2PB_MASK; } - if (cpus_have_final_cap(ARM64_HAS_STAGE2_FWB)) - vcpu->arch.hcr_el2 |= HCR_FWB; + /* Trap Trace Filter */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_TraceFilt), id_aa64dfr0)) + val |= MDCR_EL2_TTRF; - if (cpus_have_final_cap(ARM64_HAS_EVT) && - !cpus_have_final_cap(ARM64_MISMATCHED_CACHE_TYPE)) - vcpu->arch.hcr_el2 |= HCR_TID4; - else - vcpu->arch.hcr_el2 |= HCR_TID2; + /* Trap External Trace */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_ExtTrcBuff), id_aa64dfr0)) + val |= MDCR_EL2_E2TB_MASK; - if (vcpu_has_ptrauth(vcpu)) - vcpu->arch.hcr_el2 |= (HCR_API | HCR_APK); + /* Trap Debug Communications Channel registers */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_FGT), id_aa64mmfr0)) + val |= MDCR_EL2_TDCC; + + vcpu->arch.mdcr_el2 = val; } /* * Initialize trap register values in protected mode. */ -static void pkvm_vcpu_init_traps(struct kvm_vcpu *vcpu) +static void pkvm_vcpu_init_traps(struct pkvm_hyp_vcpu *hyp_vcpu) { + struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu; + vcpu->arch.cptr_el2 = kvm_get_reset_cptr_el2(vcpu); vcpu->arch.mdcr_el2 = 0; pkvm_vcpu_reset_hcr(vcpu); - if ((!vcpu_is_protected(vcpu))) + if ((!pkvm_hyp_vcpu_is_protected(hyp_vcpu))) return; - pvm_init_trap_regs(vcpu); - pvm_init_traps_aa64pfr0(vcpu); - pvm_init_traps_aa64pfr1(vcpu); - pvm_init_traps_aa64dfr0(vcpu); - pvm_init_traps_aa64mmfr0(vcpu); - pvm_init_traps_aa64mmfr1(vcpu); + /* + * PAuth is allowed if supported by the system and the vcpu. + * Properly checking for PAuth requires checking various fields in + * ID_AA64ISAR1_EL1 and ID_AA64ISAR2_EL1. The way that fixed config + * is controlled now in pKVM does not easily allow that. 
This will + * change later to follow the changes upstream wrt fixed configuration + * and nested virt. + */ + BUILD_BUG_ON(!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPI), + PVM_ID_AA64ISAR1_ALLOW)); + + /* Protected KVM does not support AArch32 guests. */ + BUILD_BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL0), + PVM_ID_AA64PFR0_ALLOW) != ID_AA64PFR0_EL1_EL0_IMP); + BUILD_BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL1), + PVM_ID_AA64PFR0_ALLOW) != ID_AA64PFR0_EL1_EL1_IMP); + + /* + * Linux guests assume support for floating-point and Advanced SIMD. Do + * not change the trapping behavior for these from the KVM default. + */ + BUILD_BUG_ON(!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_FP), + PVM_ID_AA64PFR0_ALLOW)); + BUILD_BUG_ON(!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AdvSIMD), + PVM_ID_AA64PFR0_ALLOW)); + + pvm_init_traps_hcr(vcpu); + pvm_init_traps_cptr(vcpu); + pvm_init_traps_mdcr(vcpu); } /* @@ -448,7 +419,7 @@ static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu, pkvm_vcpu_init_sve(hyp_vcpu, host_vcpu); pkvm_vcpu_init_ptrauth(hyp_vcpu); - pkvm_vcpu_init_traps(&hyp_vcpu->vcpu); + pkvm_vcpu_init_traps(hyp_vcpu); done: if (ret) unpin_host_vcpu(host_vcpu); -- 2.51.0 From 1fea164ccf19750c5bea688afd9122eb84eb3a72 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 16 Dec 2024 10:50:43 +0000 Subject: [PATCH 10/16] KVM: arm64: Move checking protected vcpu features to a separate function At the moment, checks for supported vcpu features for protected VMs are build-time bugs. In the following patch, they will become runtime checks based on the vcpu's features registers. Therefore, consolidate them into one function that would return an error if it encounters an unsupported feature. Signed-off-by: Fuad Tabba Link: https://lore.kernel.org/r/20241216105057.579031-4-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/nvhe/pkvm.c | 45 ++++++++++++++++++++++++---------- 1 file changed, 32 insertions(+), 13 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 3af5fca64f67..6a39f68563ae 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -178,20 +178,11 @@ static void pvm_init_traps_mdcr(struct kvm_vcpu *vcpu) } /* - * Initialize trap register values in protected mode. + * Check that cpu features that are neither trapped nor supported are not + * enabled for protected VMs. */ -static void pkvm_vcpu_init_traps(struct pkvm_hyp_vcpu *hyp_vcpu) +static int pkvm_check_pvm_cpu_features(struct kvm_vcpu *vcpu) { - struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu; - - vcpu->arch.cptr_el2 = kvm_get_reset_cptr_el2(vcpu); - vcpu->arch.mdcr_el2 = 0; - - pkvm_vcpu_reset_hcr(vcpu); - - if ((!pkvm_hyp_vcpu_is_protected(hyp_vcpu))) - return; - /* * PAuth is allowed if supported by the system and the vcpu. * Properly checking for PAuth requires checking various fields in @@ -218,9 +209,34 @@ static void pkvm_vcpu_init_traps(struct pkvm_hyp_vcpu *hyp_vcpu) BUILD_BUG_ON(!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AdvSIMD), PVM_ID_AA64PFR0_ALLOW)); + return 0; +} + +/* + * Initialize trap register values in protected mode. 
+ */ +static int pkvm_vcpu_init_traps(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu; + int ret; + + vcpu->arch.cptr_el2 = kvm_get_reset_cptr_el2(vcpu); + vcpu->arch.mdcr_el2 = 0; + + pkvm_vcpu_reset_hcr(vcpu); + + if ((!pkvm_hyp_vcpu_is_protected(hyp_vcpu))) + return 0; + + ret = pkvm_check_pvm_cpu_features(vcpu); + if (ret) + return ret; + pvm_init_traps_hcr(vcpu); pvm_init_traps_cptr(vcpu); pvm_init_traps_mdcr(vcpu); + + return 0; } /* @@ -417,9 +433,12 @@ static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu, hyp_vcpu->vcpu.arch.cflags = READ_ONCE(host_vcpu->arch.cflags); hyp_vcpu->vcpu.arch.mp_state.mp_state = KVM_MP_STATE_STOPPED; + ret = pkvm_vcpu_init_traps(hyp_vcpu); + if (ret) + goto done; + pkvm_vcpu_init_sve(hyp_vcpu, host_vcpu); pkvm_vcpu_init_ptrauth(hyp_vcpu); - pkvm_vcpu_init_traps(hyp_vcpu); done: if (ret) unpin_host_vcpu(host_vcpu); -- 2.51.0 From 27f5cf8ad5224033a711aef3fde90b60c9a8d7d5 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 16 Dec 2024 10:50:44 +0000 Subject: [PATCH 11/16] KVM: arm64: Remove KVM_ARM_VCPU_POWER_OFF from protected VMs allowed features in pKVM The hypervisor is responsible for the power state of protected VMs in pKVM. Therefore, remove KVM_ARM_VCPU_POWER_OFF from the list of allowed features for protected VMs. Signed-off-by: Fuad Tabba Link: https://lore.kernel.org/r/20241216105057.579031-5-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/nvhe/pkvm.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 6a39f68563ae..d7ca7e9ccea2 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -331,10 +331,8 @@ static void pkvm_init_features_from_host(struct pkvm_hyp_vm *hyp_vm, const struc /* * For protected VMs, always allow: - * - CPU starting in poweroff state * - PSCI v0.2 */ - set_bit(KVM_ARM_VCPU_POWER_OFF, allowed_features); set_bit(KVM_ARM_VCPU_PSCI_0_2, allowed_features); /* -- 2.51.0 From a3163dca4817e9a30b154a14c793641e39a00592 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 16 Dec 2024 10:50:45 +0000 Subject: [PATCH 12/16] KVM: arm64: Use KVM extension checks for allowed protected VM capabilities Use KVM extension checks as the source for determining which capabilities are allowed for protected VMs. KVM extension checks are the natural place for this, since they are also the interface exposed to users. Signed-off-by: Fuad Tabba Link: https://lore.kernel.org/r/20241216105057.579031-6-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_pkvm.h | 25 +++++++++++++++++++++++++ arch/arm64/kvm/arm.c | 29 ++--------------------------- arch/arm64/kvm/hyp/nvhe/pkvm.c | 24 ++++++------------------ 3 files changed, 33 insertions(+), 45 deletions(-) diff --git a/arch/arm64/include/asm/kvm_pkvm.h b/arch/arm64/include/asm/kvm_pkvm.h index cd56acd9a842..400f7cef1e81 100644 --- a/arch/arm64/include/asm/kvm_pkvm.h +++ b/arch/arm64/include/asm/kvm_pkvm.h @@ -20,6 +20,31 @@ int pkvm_init_host_vm(struct kvm *kvm); int pkvm_create_hyp_vm(struct kvm *kvm); void pkvm_destroy_hyp_vm(struct kvm *kvm); +/* + * This functions as an allow-list of protected VM capabilities. + * Features not explicitly allowed by this function are denied.
+ */ +static inline bool kvm_pvm_ext_allowed(long ext) +{ + switch (ext) { + case KVM_CAP_IRQCHIP: + case KVM_CAP_ARM_PSCI: + case KVM_CAP_ARM_PSCI_0_2: + case KVM_CAP_NR_VCPUS: + case KVM_CAP_MAX_VCPUS: + case KVM_CAP_MAX_VCPU_ID: + case KVM_CAP_MSI_DEVID: + case KVM_CAP_ARM_VM_IPA_SIZE: + case KVM_CAP_ARM_PMU_V3: + case KVM_CAP_ARM_SVE: + case KVM_CAP_ARM_PTRAUTH_ADDRESS: + case KVM_CAP_ARM_PTRAUTH_GENERIC: + return true; + default: + return false; + } +} + extern struct memblock_region kvm_nvhe_sym(hyp_memory)[]; extern unsigned int kvm_nvhe_sym(hyp_memblock_nr); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index a102c3aebdbc..b295218cdc24 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -80,31 +80,6 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE; } -/* - * This functions as an allow-list of protected VM capabilities. - * Features not explicitly allowed by this function are denied. - */ -static bool pkvm_ext_allowed(struct kvm *kvm, long ext) -{ - switch (ext) { - case KVM_CAP_IRQCHIP: - case KVM_CAP_ARM_PSCI: - case KVM_CAP_ARM_PSCI_0_2: - case KVM_CAP_NR_VCPUS: - case KVM_CAP_MAX_VCPUS: - case KVM_CAP_MAX_VCPU_ID: - case KVM_CAP_MSI_DEVID: - case KVM_CAP_ARM_VM_IPA_SIZE: - case KVM_CAP_ARM_PMU_V3: - case KVM_CAP_ARM_SVE: - case KVM_CAP_ARM_PTRAUTH_ADDRESS: - case KVM_CAP_ARM_PTRAUTH_GENERIC: - return true; - default: - return false; - } -} - int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) { @@ -113,7 +88,7 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, if (cap->flags) return -EINVAL; - if (kvm_vm_is_protected(kvm) && !pkvm_ext_allowed(kvm, cap->cap)) + if (kvm_vm_is_protected(kvm) && !kvm_pvm_ext_allowed(cap->cap)) return -EINVAL; switch (cap->cap) { @@ -311,7 +286,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) { int r; - if (kvm && kvm_vm_is_protected(kvm) && !pkvm_ext_allowed(kvm, ext)) + if (kvm && kvm_vm_is_protected(kvm) && !kvm_pvm_ext_allowed(ext)) return 0; switch (ext) { diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index d7ca7e9ccea2..c39d4e92dd3c 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -329,32 +329,20 @@ static void pkvm_init_features_from_host(struct pkvm_hyp_vm *hyp_vm, const struc bitmap_zero(allowed_features, KVM_VCPU_MAX_FEATURES); - /* - * For protected VMs, always allow: - * - PSCI v0.2 - */ set_bit(KVM_ARM_VCPU_PSCI_0_2, allowed_features); - /* - * Check if remaining features are allowed: - * - Performance Monitoring - * - Scalable Vectors - * - Pointer Authentication - */ - if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), PVM_ID_AA64DFR0_ALLOW)) + if (kvm_pvm_ext_allowed(KVM_CAP_ARM_PMU_V3)) set_bit(KVM_ARM_VCPU_PMU_V3, allowed_features); - if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_SVE), PVM_ID_AA64PFR0_ALLOW)) - set_bit(KVM_ARM_VCPU_SVE, allowed_features); - - if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_API), PVM_ID_AA64ISAR1_ALLOW) && - FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_APA), PVM_ID_AA64ISAR1_ALLOW)) + if (kvm_pvm_ext_allowed(KVM_CAP_ARM_PTRAUTH_ADDRESS)) set_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, allowed_features); - if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPI), PVM_ID_AA64ISAR1_ALLOW) && - FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPA), PVM_ID_AA64ISAR1_ALLOW)) + if (kvm_pvm_ext_allowed(KVM_CAP_ARM_PTRAUTH_GENERIC)) set_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, allowed_features); + if 
(kvm_pvm_ext_allowed(KVM_CAP_ARM_SVE)) + set_bit(KVM_ARM_VCPU_SVE, allowed_features); + bitmap_and(kvm->arch.vcpu_features, host_kvm->arch.vcpu_features, allowed_features, KVM_VCPU_MAX_FEATURES); } -- 2.51.0 From 7ba5b8f80475e48b486f095ee9fb67dc9f9d02df Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 16 Dec 2024 10:50:46 +0000 Subject: [PATCH 13/16] KVM: arm64: Initialize feature id registers for protected VMs The hypervisor maintains the state of protected VMs. Initialize the values for feature ID registers for protected VMs, to be used when setting traps and when advertising features to protected VMs. Signed-off-by: Fuad Tabba Link: https://lore.kernel.org/r/20241216105057.579031-7-tabba@google.com Signed-off-by: Marc Zyngier --- .../arm64/kvm/hyp/include/nvhe/fixed_config.h | 1 + arch/arm64/kvm/hyp/include/nvhe/pkvm.h | 2 + arch/arm64/kvm/hyp/nvhe/pkvm.c | 10 +++-- arch/arm64/kvm/hyp/nvhe/sys_regs.c | 45 +++++++++++++++++-- 4 files changed, 52 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h b/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h index d1e59b88ff66..69e26d1a0ebe 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h +++ b/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h @@ -201,6 +201,7 @@ u64 pvm_read_id_reg(const struct kvm_vcpu *vcpu, u32 id); bool kvm_handle_pvm_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code); bool kvm_handle_pvm_restricted(struct kvm_vcpu *vcpu, u64 *exit_code); +void kvm_init_pvm_id_regs(struct kvm_vcpu *vcpu); int kvm_check_pvm_sysreg_table(void); #endif /* __ARM64_KVM_FIXED_CONFIG_H__ */ diff --git a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h index 24a9a8330d19..698bc20ab80b 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h +++ b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h @@ -47,6 +47,8 @@ struct pkvm_hyp_vm { struct pkvm_hyp_vcpu *vcpus[]; }; +extern hyp_spinlock_t vm_table_lock; + static inline struct pkvm_hyp_vm * pkvm_hyp_vcpu_to_hyp_vm(struct pkvm_hyp_vcpu *hyp_vcpu) { diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index c39d4e92dd3c..c7958183e40d 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -257,10 +257,10 @@ static pkvm_handle_t idx_to_vm_handle(unsigned int idx) /* * Spinlock for protecting state related to the VM table. Protects writes - * to 'vm_table' and 'nr_table_entries' as well as reads and writes to - * 'last_hyp_vcpu_lookup'. + * to 'vm_table', 'nr_table_entries', and other per-vm state on initialization. + * Also protects reads and writes to 'last_hyp_vcpu_lookup'. */ -static DEFINE_HYP_SPINLOCK(vm_table_lock); +DEFINE_HYP_SPINLOCK(vm_table_lock); /* * The table of VM entries for protected VMs in hyp. 
@@ -381,6 +381,7 @@ static void init_pkvm_hyp_vm(struct kvm *host_kvm, struct pkvm_hyp_vm *hyp_vm, hyp_vm->kvm.created_vcpus = nr_vcpus; hyp_vm->kvm.arch.mmu.vtcr = host_mmu.arch.mmu.vtcr; hyp_vm->kvm.arch.pkvm.enabled = READ_ONCE(host_kvm->arch.pkvm.enabled); + hyp_vm->kvm.arch.flags = 0; pkvm_init_features_from_host(hyp_vm, host_kvm); } @@ -419,6 +420,9 @@ static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu, hyp_vcpu->vcpu.arch.cflags = READ_ONCE(host_vcpu->arch.cflags); hyp_vcpu->vcpu.arch.mp_state.mp_state = KVM_MP_STATE_STOPPED; + if (pkvm_hyp_vcpu_is_protected(hyp_vcpu)) + kvm_init_pvm_id_regs(&hyp_vcpu->vcpu); + ret = pkvm_vcpu_init_traps(hyp_vcpu); if (ret) goto done; diff --git a/arch/arm64/kvm/hyp/nvhe/sys_regs.c b/arch/arm64/kvm/hyp/nvhe/sys_regs.c index 59fb2f056177..2aea44c911bd 100644 --- a/arch/arm64/kvm/hyp/nvhe/sys_regs.c +++ b/arch/arm64/kvm/hyp/nvhe/sys_regs.c @@ -12,6 +12,7 @@ #include <hyp/adjust_pc.h> #include <nvhe/fixed_config.h> +#include <nvhe/pkvm.h> #include "../../sys_regs.h" @@ -204,8 +205,7 @@ static u64 get_pvm_id_aa64mmfr2(const struct kvm_vcpu *vcpu) return id_aa64mmfr2_el1_sys_val & PVM_ID_AA64MMFR2_ALLOW; } -/* Read a sanitized cpufeature ID register by its encoding */ -u64 pvm_read_id_reg(const struct kvm_vcpu *vcpu, u32 id) +static u64 pvm_calc_id_reg(const struct kvm_vcpu *vcpu, u32 id) { switch (id) { case SYS_ID_AA64PFR0_EL1: @@ -240,10 +240,25 @@ u64 pvm_read_id_reg(const struct kvm_vcpu *vcpu, u32 id) } } +/* Read a sanitized cpufeature ID register by its encoding */ +u64 pvm_read_id_reg(const struct kvm_vcpu *vcpu, u32 id) +{ + return pvm_calc_id_reg(vcpu, id); +} + static u64 read_id_reg(const struct kvm_vcpu *vcpu, struct sys_reg_desc const *r) { - return pvm_read_id_reg(vcpu, reg_to_encoding(r)); + struct kvm *kvm = vcpu->kvm; + u32 reg = reg_to_encoding(r); + + if (WARN_ON_ONCE(!test_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags))) + return 0; + + if (reg >= sys_reg(3, 0, 0, 1, 0) && reg <= sys_reg(3, 0, 0, 7, 7)) + return kvm->arch.id_regs[IDREG_IDX(reg)]; + + return 0; } /* Handler to RAZ/WI sysregs */ @@ -448,6 +463,30 @@ static const struct sys_reg_desc pvm_sys_reg_descs[] = { /* Performance Monitoring Registers are restricted. */ }; +/* + * Initializes feature registers for protected VMs. + */ +void kvm_init_pvm_id_regs(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_arch *ka = &kvm->arch; + u32 r; + + hyp_assert_lock_held(&vm_table_lock); + + if (test_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags)) + return; + + /* + * Initialize only AArch64 id registers since AArch32 isn't supported + * for protected VMs. + */ + for (r = sys_reg(3, 0, 0, 4, 0); r <= sys_reg(3, 0, 0, 7, 7); r += sys_reg(0, 0, 0, 0, 1)) + ka->id_regs[IDREG_IDX(r)] = pvm_calc_id_reg(vcpu, r); + + set_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags); +} + /* * Checks that the sysreg table is unique and in-order. * -- 2.51.0 From 9df9186f8df513dc9bf9f95f68525c7ebc941bcd Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 16 Dec 2024 10:50:47 +0000 Subject: [PATCH 14/16] KVM: arm64: Fix RAS trapping in pKVM for protected VMs Trap RAS in pKVM if not supported at all for protected VMs. The RAS version doesn't matter in this case.
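As a concrete illustration of the behavioral change, consider the predicate in isolation (a standalone sketch, not kernel code; ID_AA64PFR0_EL1.RAS encodings per the Arm ARM: 0 = not implemented, 1 = RAS v1, 2 = RAS v1p1):

  #include <stdbool.h>
  #include <stdio.h>

  /* Old predicate: trap unless the newest RAS version is implemented. */
  static bool old_trap_ras(unsigned int ras) { return ras < 2; }

  /* New predicate: trap only if RAS is not implemented at all. */
  static bool new_trap_ras(unsigned int ras) { return ras == 0; }

  int main(void)
  {
          unsigned int ras;

          for (ras = 0; ras <= 2; ras++)
                  printf("RAS=%u: old traps=%d, new traps=%d\n",
                         ras, old_trap_ras(ras), new_trap_ras(ras));
          return 0;
  }

On a RAS v1 system the old check needlessly trapped error record accesses even though RAS is exposed to the guest; the new check traps them only when RAS is absent.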
Fixes: 2a0c343386ae ("KVM: arm64: Initialize trap registers for protected VMs") Signed-off-by: Fuad Tabba Link: https://lore.kernel.org/r/20241216105057.579031-8-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/nvhe/pkvm.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index c7958183e40d..6eddd29b264b 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -70,9 +70,8 @@ static void pvm_init_traps_hcr(struct kvm_vcpu *vcpu) */ val |= HCR_TACR | HCR_TIDCP | HCR_TID3 | HCR_TID1; - /* Trap RAS unless all current versions are supported */ - if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_RAS), id_aa64pfr0) < - ID_AA64PFR0_EL1_RAS_V1P1) { + /* Trap RAS */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_RAS), id_aa64pfr0)) { val |= HCR_TERR | HCR_TEA; val &= ~(HCR_FIEN); } -- 2.51.0 From 0401f7e76d707741d2562f4988743cc5daf445e4 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 16 Dec 2024 10:50:48 +0000 Subject: [PATCH 15/16] KVM: arm64: Set protected VM traps based on its view of feature registers Now that the VM's feature id registers are initialized with the values of the supported features, use those values to determine which traps to set using kvm_has_feat(). Signed-off-by: Fuad Tabba Link: https://lore.kernel.org/r/20241216105057.579031-9-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/nvhe/pkvm.c | 84 +++++++++++------------------- arch/arm64/kvm/hyp/nvhe/sys_regs.c | 7 --- 2 files changed, 30 insertions(+), 61 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 6eddd29b264b..23afc63cac55 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -52,9 +52,7 @@ static void pkvm_vcpu_reset_hcr(struct kvm_vcpu *vcpu) static void pvm_init_traps_hcr(struct kvm_vcpu *vcpu) { - const u64 id_aa64pfr0 = pvm_read_id_reg(vcpu, SYS_ID_AA64PFR0_EL1); - const u64 id_aa64pfr1 = pvm_read_id_reg(vcpu, SYS_ID_AA64PFR1_EL1); - const u64 id_aa64mmfr1 = pvm_read_id_reg(vcpu, SYS_ID_AA64MMFR1_EL1); + struct kvm *kvm = vcpu->kvm; u64 val = vcpu->arch.hcr_el2; /* No support for AArch32. */ @@ -70,24 +68,20 @@ static void pvm_init_traps_hcr(struct kvm_vcpu *vcpu) */ val |= HCR_TACR | HCR_TIDCP | HCR_TID3 | HCR_TID1; - /* Trap RAS */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_RAS), id_aa64pfr0)) { + if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, RAS, IMP)) { val |= HCR_TERR | HCR_TEA; val &= ~(HCR_FIEN); } - /* Trap AMU */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AMU), id_aa64pfr0)) + if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, AMU, IMP)) val &= ~(HCR_AMVOFFEN); - /* Memory Tagging: Trap and Treat as Untagged if not supported.
*/ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE), id_aa64pfr1)) { + if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, MTE, IMP)) { val |= HCR_TID5; val &= ~(HCR_DCT | HCR_ATA); } - /* Trap LOR */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_LO), id_aa64mmfr1)) + if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, LO, IMP)) val |= HCR_TLOR; vcpu->arch.hcr_el2 = val; @@ -95,9 +89,7 @@ static void pvm_init_traps_hcr(struct kvm_vcpu *vcpu) static void pvm_init_traps_cptr(struct kvm_vcpu *vcpu) { - const u64 id_aa64pfr0 = pvm_read_id_reg(vcpu, SYS_ID_AA64PFR0_EL1); - const u64 id_aa64pfr1 = pvm_read_id_reg(vcpu, SYS_ID_AA64PFR1_EL1); - const u64 id_aa64dfr0 = pvm_read_id_reg(vcpu, SYS_ID_AA64DFR0_EL1); + struct kvm *kvm = vcpu->kvm; u64 val = vcpu->arch.cptr_el2; if (!has_hvhe()) { @@ -105,12 +97,11 @@ static void pvm_init_traps_cptr(struct kvm_vcpu *vcpu) val &= ~(CPTR_NVHE_EL2_RES0); } - /* Trap AMU */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AMU), id_aa64pfr0)) + if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, AMU, IMP)) val |= CPTR_EL2_TAM; - /* Trap SVE */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_SVE), id_aa64pfr0)) { + /* SVE can be disabled by userspace even if supported. */ + if (!vcpu_has_sve(vcpu)) { if (has_hvhe()) val &= ~(CPACR_ELx_ZEN); else @@ -118,14 +109,13 @@ static void pvm_init_traps_cptr(struct kvm_vcpu *vcpu) } /* No SME support in KVM. */ - BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_SME), id_aa64pfr1)); + BUG_ON(kvm_has_feat(kvm, ID_AA64PFR1_EL1, SME, IMP)); if (has_hvhe()) val &= ~(CPACR_ELx_SMEN); else val |= CPTR_EL2_TSM; - /* Trap Trace */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_TraceVer), id_aa64dfr0)) { + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceVer, IMP)) { if (has_hvhe()) val |= CPACR_EL1_TTA; else @@ -137,40 +127,33 @@ static void pvm_init_traps_cptr(struct kvm_vcpu *vcpu) static void pvm_init_traps_mdcr(struct kvm_vcpu *vcpu) { - const u64 id_aa64dfr0 = pvm_read_id_reg(vcpu, SYS_ID_AA64DFR0_EL1); - const u64 id_aa64mmfr0 = pvm_read_id_reg(vcpu, SYS_ID_AA64MMFR0_EL1); + struct kvm *kvm = vcpu->kvm; u64 val = vcpu->arch.mdcr_el2; - /* Trap/constrain PMU */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), id_aa64dfr0)) { + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMUVer, IMP)) { val |= MDCR_EL2_TPM | MDCR_EL2_TPMCR; val &= ~(MDCR_EL2_HPME | MDCR_EL2_MTPME | MDCR_EL2_HPMN_MASK); } - /* Trap Debug */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DebugVer), id_aa64dfr0)) + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, DebugVer, IMP)) val |= MDCR_EL2_TDRA | MDCR_EL2_TDA; - /* Trap OS Double Lock */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DoubleLock), id_aa64dfr0)) + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, DoubleLock, IMP)) val |= MDCR_EL2_TDOSA; - /* Trap SPE */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMSVer), id_aa64dfr0)) { + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMSVer, IMP)) { val |= MDCR_EL2_TPMS; val &= ~MDCR_EL2_E2PB_MASK; } - /* Trap Trace Filter */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_TraceFilt), id_aa64dfr0)) + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceFilt, IMP)) val |= MDCR_EL2_TTRF; - /* Trap External Trace */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_ExtTrcBuff), id_aa64dfr0)) + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, ExtTrcBuff, IMP)) val |= MDCR_EL2_E2TB_MASK; /* Trap Debug Communications Channel registers */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_FGT), id_aa64mmfr0)) + if (!kvm_has_feat(kvm, ID_AA64MMFR0_EL1, FGT, IMP)) val |= 
MDCR_EL2_TDCC; vcpu->arch.mdcr_el2 = val; @@ -182,31 +165,24 @@ static void pvm_init_traps_mdcr(struct kvm_vcpu *vcpu) */ static int pkvm_check_pvm_cpu_features(struct kvm_vcpu *vcpu) { - /* - * PAuth is allowed if supported by the system and the vcpu. - * Properly checking for PAuth requires checking various fields in - * ID_AA64ISAR1_EL1 and ID_AA64ISAR2_EL1. The way that fixed config - * is controlled now in pKVM does not easily allow that. This will - * change later to follow the changes upstream wrt fixed configuration - * and nested virt. - */ - BUILD_BUG_ON(!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPI), - PVM_ID_AA64ISAR1_ALLOW)); + struct kvm *kvm = vcpu->kvm; /* Protected KVM does not support AArch32 guests. */ - BUILD_BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL0), - PVM_ID_AA64PFR0_ALLOW) != ID_AA64PFR0_EL1_EL0_IMP); - BUILD_BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL1), - PVM_ID_AA64PFR0_ALLOW) != ID_AA64PFR0_EL1_EL1_IMP); + if (kvm_has_feat(kvm, ID_AA64PFR0_EL1, EL0, AARCH32) || + kvm_has_feat(kvm, ID_AA64PFR0_EL1, EL1, AARCH32)) + return -EINVAL; /* * Linux guests assume support for floating-point and Advanced SIMD. Do * not change the trapping behavior for these from the KVM default. */ - BUILD_BUG_ON(!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_FP), - PVM_ID_AA64PFR0_ALLOW)); - BUILD_BUG_ON(!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AdvSIMD), - PVM_ID_AA64PFR0_ALLOW)); + if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, FP, IMP) || + !kvm_has_feat(kvm, ID_AA64PFR0_EL1, AdvSIMD, IMP)) + return -EINVAL; + + /* No SME support in KVM right now. Check to catch if it changes. */ + if (kvm_has_feat(kvm, ID_AA64PFR1_EL1, SME, IMP)) + return -EINVAL; return 0; } diff --git a/arch/arm64/kvm/hyp/nvhe/sys_regs.c b/arch/arm64/kvm/hyp/nvhe/sys_regs.c index 2aea44c911bd..398563d3a266 100644 --- a/arch/arm64/kvm/hyp/nvhe/sys_regs.c +++ b/arch/arm64/kvm/hyp/nvhe/sys_regs.c @@ -286,13 +286,6 @@ static bool pvm_access_id_aarch32(struct kvm_vcpu *vcpu, return false; } - /* - * No support for AArch32 guests, therefore, pKVM has no sanitized copy - * of AArch32 feature id registers. - */ - BUILD_BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL1), - PVM_ID_AA64PFR0_ALLOW) > ID_AA64PFR0_EL1_EL1_IMP); - return pvm_access_raz_wi(vcpu, p, r); } -- 2.51.0 From 3d7ff00700d1a4d0c8f092f2c1bf67553a8c7c4c Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 16 Dec 2024 10:50:49 +0000 Subject: [PATCH 16/16] KVM: arm64: Rework specifying restricted features for protected VMs The existing code didn't properly distinguish between signed and unsigned features, and was difficult to read and to maintain. Rework it using the same method used in other parts of KVM when handling vcpu features. 
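For illustration, the signed-field pitfall can be shown in isolation (a standalone sketch, not the kernel helper; in a 4-bit signed ID register field, 0xf encodes -1, i.e. "not implemented"):

  #include <stdio.h>

  int main(void)
  {
          unsigned int sys = 0xf; /* signed field: -1, not implemented */
          unsigned int cap = 0x1; /* highest value allowed for the VM */

          /* Wrong: an unsigned min() keeps 0x1, advertising a feature
           * that the system reports as not implemented. */
          unsigned int u = sys < cap ? sys : cap;

          /* Right: sign-extend both 4-bit fields, take the signed min. */
          int ssys = (sys & 0x8) ? (int)sys - 16 : (int)sys;
          int scap = (cap & 0x8) ? (int)cap - 16 : (int)cap;
          unsigned int s = (unsigned int)(ssys < scap ? ssys : scap) & 0xf;

          printf("unsigned clamp: %#x, signed clamp: %#x\n", u, s);
          return 0;
  }

The reworked get_restricted_features() below accounts for this by checking each field's signedness before choosing between min() and max() of the system value and the allowed maximum.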
Signed-off-by: Fuad Tabba Link: https://lore.kernel.org/r/20241216105057.579031-10-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 1 + .../arm64/kvm/hyp/include/nvhe/fixed_config.h | 1 - arch/arm64/kvm/hyp/nvhe/sys_regs.c | 357 +++++++++--------- 3 files changed, 189 insertions(+), 170 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index e18e9244d17a..d3230a22846f 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -1422,6 +1422,7 @@ static inline bool __vcpu_has_feature(const struct kvm_arch *ka, int feature) return test_bit(feature, ka->vcpu_features); } +#define kvm_vcpu_has_feature(k, f) __vcpu_has_feature(&(k)->arch, (f)) #define vcpu_has_feature(v, f) __vcpu_has_feature(&(v)->kvm->arch, (f)) #define kvm_vcpu_initialized(v) vcpu_get_flag(vcpu, VCPU_INITIALIZED) diff --git a/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h b/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h index 69e26d1a0ebe..37a6d2434e47 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h +++ b/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h @@ -198,7 +198,6 @@ FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_APA3), ID_AA64ISAR2_EL1_APA3_PAuth) \ ) -u64 pvm_read_id_reg(const struct kvm_vcpu *vcpu, u32 id); bool kvm_handle_pvm_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code); bool kvm_handle_pvm_restricted(struct kvm_vcpu *vcpu, u64 *exit_code); void kvm_init_pvm_id_regs(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/kvm/hyp/nvhe/sys_regs.c b/arch/arm64/kvm/hyp/nvhe/sys_regs.c index 398563d3a266..934d7c4b04b4 100644 --- a/arch/arm64/kvm/hyp/nvhe/sys_regs.c +++ b/arch/arm64/kvm/hyp/nvhe/sys_regs.c @@ -29,221 +29,240 @@ u64 id_aa64mmfr1_el1_sys_val; u64 id_aa64mmfr2_el1_sys_val; u64 id_aa64smfr0_el1_sys_val; -/* - * Inject an unknown/undefined exception to an AArch64 guest while most of its - * sysregs are live. - */ -static void inject_undef64(struct kvm_vcpu *vcpu) -{ - u64 esr = (ESR_ELx_EC_UNKNOWN << ESR_ELx_EC_SHIFT); - - *vcpu_pc(vcpu) = read_sysreg_el2(SYS_ELR); - *vcpu_cpsr(vcpu) = read_sysreg_el2(SYS_SPSR); - - kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SYNC); - - __kvm_adjust_pc(vcpu); - - write_sysreg_el1(esr, SYS_ESR); - write_sysreg_el1(read_sysreg_el2(SYS_ELR), SYS_ELR); - write_sysreg_el2(*vcpu_pc(vcpu), SYS_ELR); - write_sysreg_el2(*vcpu_cpsr(vcpu), SYS_SPSR); -} - -/* - * Returns the restricted features values of the feature register based on the - * limitations in restrict_fields. - * A feature id field value of 0b0000 does not impose any restrictions. - * Note: Use only for unsigned feature field values. - */ -static u64 get_restricted_features_unsigned(u64 sys_reg_val, - u64 restrict_fields) -{ - u64 value = 0UL; - u64 mask = GENMASK_ULL(ARM64_FEATURE_FIELD_BITS - 1, 0); +struct pvm_ftr_bits { + bool sign; + u8 shift; + u8 width; + u8 max_val; + bool (*vm_supported)(const struct kvm *kvm); +}; - /* - * According to the Arm Architecture Reference Manual, feature fields - * use increasing values to indicate increases in functionality. - * Iterate over the restricted feature fields and calculate the minimum - * unsigned value between the one supported by the system, and what the - * value is being restricted to. 
- */ - while (sys_reg_val && restrict_fields) { - value |= min(sys_reg_val & mask, restrict_fields & mask); - sys_reg_val &= ~mask; - restrict_fields &= ~mask; - mask <<= ARM64_FEATURE_FIELD_BITS; +#define __MAX_FEAT_FUNC(id, fld, max, func, sgn) \ + { \ + .sign = sgn, \ + .shift = id##_##fld##_SHIFT, \ + .width = id##_##fld##_WIDTH, \ + .max_val = id##_##fld##_##max, \ + .vm_supported = func, \ } - return value; -} - -/* - * Functions that return the value of feature id registers for protected VMs - * based on allowed features, system features, and KVM support. - */ - -static u64 get_pvm_id_aa64pfr0(const struct kvm_vcpu *vcpu) -{ - u64 set_mask = 0; - u64 allow_mask = PVM_ID_AA64PFR0_ALLOW; - - set_mask |= get_restricted_features_unsigned(id_aa64pfr0_el1_sys_val, - PVM_ID_AA64PFR0_ALLOW); +#define MAX_FEAT_FUNC(id, fld, max, func) \ + __MAX_FEAT_FUNC(id, fld, max, func, id##_##fld##_SIGNED) - return (id_aa64pfr0_el1_sys_val & allow_mask) | set_mask; -} - -static u64 get_pvm_id_aa64pfr1(const struct kvm_vcpu *vcpu) -{ - const struct kvm *kvm = (const struct kvm *)kern_hyp_va(vcpu->kvm); - u64 allow_mask = PVM_ID_AA64PFR1_ALLOW; +#define MAX_FEAT(id, fld, max) \ + MAX_FEAT_FUNC(id, fld, max, NULL) - if (!kvm_has_mte(kvm)) - allow_mask &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE); +#define MAX_FEAT_ENUM(id, fld, max) \ + __MAX_FEAT_FUNC(id, fld, max, NULL, false) - return id_aa64pfr1_el1_sys_val & allow_mask; -} +#define FEAT_END { .width = 0, } -static u64 get_pvm_id_aa64zfr0(const struct kvm_vcpu *vcpu) +static bool vm_has_ptrauth(const struct kvm *kvm) { - /* - * No support for Scalable Vectors, therefore, hyp has no sanitized - * copy of the feature id register. - */ - BUILD_BUG_ON(PVM_ID_AA64ZFR0_ALLOW != 0ULL); - return 0; -} - -static u64 get_pvm_id_aa64dfr0(const struct kvm_vcpu *vcpu) -{ - /* - * No support for debug, including breakpoints, and watchpoints, - * therefore, pKVM has no sanitized copy of the feature id register. - */ - BUILD_BUG_ON(PVM_ID_AA64DFR0_ALLOW != 0ULL); - return 0; -} - -static u64 get_pvm_id_aa64dfr1(const struct kvm_vcpu *vcpu) -{ - /* - * No support for debug, therefore, hyp has no sanitized copy of the - * feature id register. - */ - BUILD_BUG_ON(PVM_ID_AA64DFR1_ALLOW != 0ULL); - return 0; -} + if (!IS_ENABLED(CONFIG_ARM64_PTR_AUTH)) + return false; -static u64 get_pvm_id_aa64afr0(const struct kvm_vcpu *vcpu) -{ - /* - * No support for implementation defined features, therefore, hyp has no - * sanitized copy of the feature id register. - */ - BUILD_BUG_ON(PVM_ID_AA64AFR0_ALLOW != 0ULL); - return 0; + return (cpus_have_final_cap(ARM64_HAS_ADDRESS_AUTH) || + cpus_have_final_cap(ARM64_HAS_GENERIC_AUTH)) && + kvm_vcpu_has_feature(kvm, KVM_ARM_VCPU_PTRAUTH_GENERIC); } -static u64 get_pvm_id_aa64afr1(const struct kvm_vcpu *vcpu) +static bool vm_has_sve(const struct kvm *kvm) { - /* - * No support for implementation defined features, therefore, hyp has no - * sanitized copy of the feature id register. - */ - BUILD_BUG_ON(PVM_ID_AA64AFR1_ALLOW != 0ULL); - return 0; + return system_supports_sve() && kvm_vcpu_has_feature(kvm, KVM_ARM_VCPU_SVE); } -static u64 get_pvm_id_aa64isar0(const struct kvm_vcpu *vcpu) -{ - return id_aa64isar0_el1_sys_val & PVM_ID_AA64ISAR0_ALLOW; -} +/* + * Definitions for features to be allowed or restricted for protected guests. + * + * Each field in the masks represents the highest supported value for the + * feature. If a feature field is not present, it is not supported. 
Moreover, + * these are used to generate the guest's view of the feature registers. + * + * The approach for protected VMs is to at least support features that are: + * - Needed by common Linux distributions (e.g., floating point) + * - Trivial to support, e.g., supporting the feature does not introduce or + * require tracking of additional state in KVM + * - Cannot be trapped or prevent the guest from using anyway + */ -static u64 get_pvm_id_aa64isar1(const struct kvm_vcpu *vcpu) -{ - u64 allow_mask = PVM_ID_AA64ISAR1_ALLOW; +static const struct pvm_ftr_bits pvmid_aa64pfr0[] = { + MAX_FEAT(ID_AA64PFR0_EL1, EL0, IMP), + MAX_FEAT(ID_AA64PFR0_EL1, EL1, IMP), + MAX_FEAT(ID_AA64PFR0_EL1, EL2, IMP), + MAX_FEAT(ID_AA64PFR0_EL1, EL3, IMP), + MAX_FEAT(ID_AA64PFR0_EL1, FP, FP16), + MAX_FEAT(ID_AA64PFR0_EL1, AdvSIMD, FP16), + MAX_FEAT(ID_AA64PFR0_EL1, GIC, IMP), + MAX_FEAT_FUNC(ID_AA64PFR0_EL1, SVE, IMP, vm_has_sve), + MAX_FEAT(ID_AA64PFR0_EL1, RAS, IMP), + MAX_FEAT(ID_AA64PFR0_EL1, DIT, IMP), + MAX_FEAT(ID_AA64PFR0_EL1, CSV2, IMP), + MAX_FEAT(ID_AA64PFR0_EL1, CSV3, IMP), + FEAT_END +}; - if (!vcpu_has_ptrauth(vcpu)) - allow_mask &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_APA) | - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_API) | - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPA) | - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPI)); +static const struct pvm_ftr_bits pvmid_aa64pfr1[] = { + MAX_FEAT(ID_AA64PFR1_EL1, BT, IMP), + MAX_FEAT(ID_AA64PFR1_EL1, SSBS, SSBS2), + MAX_FEAT_ENUM(ID_AA64PFR1_EL1, MTE_frac, NI), + FEAT_END +}; - return id_aa64isar1_el1_sys_val & allow_mask; -} +static const struct pvm_ftr_bits pvmid_aa64mmfr0[] = { + MAX_FEAT_ENUM(ID_AA64MMFR0_EL1, PARANGE, 40), + MAX_FEAT_ENUM(ID_AA64MMFR0_EL1, ASIDBITS, 16), + MAX_FEAT(ID_AA64MMFR0_EL1, BIGEND, IMP), + MAX_FEAT(ID_AA64MMFR0_EL1, SNSMEM, IMP), + MAX_FEAT(ID_AA64MMFR0_EL1, BIGENDEL0, IMP), + MAX_FEAT(ID_AA64MMFR0_EL1, EXS, IMP), + FEAT_END +}; -static u64 get_pvm_id_aa64isar2(const struct kvm_vcpu *vcpu) -{ - u64 allow_mask = PVM_ID_AA64ISAR2_ALLOW; +static const struct pvm_ftr_bits pvmid_aa64mmfr1[] = { + MAX_FEAT(ID_AA64MMFR1_EL1, HAFDBS, DBM), + MAX_FEAT_ENUM(ID_AA64MMFR1_EL1, VMIDBits, 16), + MAX_FEAT(ID_AA64MMFR1_EL1, HPDS, HPDS2), + MAX_FEAT(ID_AA64MMFR1_EL1, PAN, PAN3), + MAX_FEAT(ID_AA64MMFR1_EL1, SpecSEI, IMP), + MAX_FEAT(ID_AA64MMFR1_EL1, ETS, IMP), + MAX_FEAT(ID_AA64MMFR1_EL1, CMOW, IMP), + FEAT_END +}; - if (!vcpu_has_ptrauth(vcpu)) - allow_mask &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_APA3) | - ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_GPA3)); +static const struct pvm_ftr_bits pvmid_aa64mmfr2[] = { + MAX_FEAT(ID_AA64MMFR2_EL1, CnP, IMP), + MAX_FEAT(ID_AA64MMFR2_EL1, UAO, IMP), + MAX_FEAT(ID_AA64MMFR2_EL1, IESB, IMP), + MAX_FEAT(ID_AA64MMFR2_EL1, AT, IMP), + MAX_FEAT_ENUM(ID_AA64MMFR2_EL1, IDS, 0x18), + MAX_FEAT(ID_AA64MMFR2_EL1, TTL, IMP), + MAX_FEAT(ID_AA64MMFR2_EL1, BBM, 2), + MAX_FEAT(ID_AA64MMFR2_EL1, E0PD, IMP), + FEAT_END +}; - return id_aa64isar2_el1_sys_val & allow_mask; -} +static const struct pvm_ftr_bits pvmid_aa64isar1[] = { + MAX_FEAT(ID_AA64ISAR1_EL1, DPB, DPB2), + MAX_FEAT_FUNC(ID_AA64ISAR1_EL1, APA, PAuth, vm_has_ptrauth), + MAX_FEAT_FUNC(ID_AA64ISAR1_EL1, API, PAuth, vm_has_ptrauth), + MAX_FEAT(ID_AA64ISAR1_EL1, JSCVT, IMP), + MAX_FEAT(ID_AA64ISAR1_EL1, FCMA, IMP), + MAX_FEAT(ID_AA64ISAR1_EL1, LRCPC, LRCPC3), + MAX_FEAT(ID_AA64ISAR1_EL1, GPA, IMP), + MAX_FEAT(ID_AA64ISAR1_EL1, GPI, IMP), + MAX_FEAT(ID_AA64ISAR1_EL1, FRINTTS, IMP), + MAX_FEAT(ID_AA64ISAR1_EL1, SB, IMP), + MAX_FEAT(ID_AA64ISAR1_EL1, SPECRES, COSP_RCTX), + 
MAX_FEAT(ID_AA64ISAR1_EL1, BF16, EBF16), + MAX_FEAT(ID_AA64ISAR1_EL1, DGH, IMP), + MAX_FEAT(ID_AA64ISAR1_EL1, I8MM, IMP), + FEAT_END +}; -static u64 get_pvm_id_aa64mmfr0(const struct kvm_vcpu *vcpu) -{ - u64 set_mask; +static const struct pvm_ftr_bits pvmid_aa64isar2[] = { + MAX_FEAT_FUNC(ID_AA64ISAR2_EL1, GPA3, IMP, vm_has_ptrauth), + MAX_FEAT_FUNC(ID_AA64ISAR2_EL1, APA3, PAuth, vm_has_ptrauth), + MAX_FEAT(ID_AA64ISAR2_EL1, ATS1A, IMP), + FEAT_END +}; - set_mask = get_restricted_features_unsigned(id_aa64mmfr0_el1_sys_val, - PVM_ID_AA64MMFR0_ALLOW); +/* + * None of the features in ID_AA64DFR0_EL1 nor ID_AA64MMFR4_EL1 are supported. + * However, both have Not-Implemented values that are non-zero. Define them + * so they can be used when getting the value of these registers. + */ +#define ID_AA64DFR0_EL1_NONZERO_NI \ +( \ + SYS_FIELD_PREP_ENUM(ID_AA64DFR0_EL1, DoubleLock, NI) | \ + SYS_FIELD_PREP_ENUM(ID_AA64DFR0_EL1, MTPMU, NI) \ +) - return (id_aa64mmfr0_el1_sys_val & PVM_ID_AA64MMFR0_ALLOW) | set_mask; -} +#define ID_AA64MMFR4_EL1_NONZERO_NI \ + SYS_FIELD_PREP_ENUM(ID_AA64MMFR4_EL1, E2H0, NI) -static u64 get_pvm_id_aa64mmfr1(const struct kvm_vcpu *vcpu) +/* + * Returns the value of the feature registers based on the system register + * value, the vcpu support for the relevant features, and the additional + * restrictions for protected VMs. + */ +static u64 get_restricted_features(const struct kvm_vcpu *vcpu, + u64 sys_reg_val, + const struct pvm_ftr_bits restrictions[]) { - return id_aa64mmfr1_el1_sys_val & PVM_ID_AA64MMFR1_ALLOW; -} + u64 val = 0UL; + int i; + + for (i = 0; restrictions[i].width != 0; i++) { + bool (*vm_supported)(const struct kvm *) = restrictions[i].vm_supported; + bool sign = restrictions[i].sign; + int shift = restrictions[i].shift; + int width = restrictions[i].width; + u64 min_signed = (1UL << width) - 1UL; + u64 sign_bit = 1UL << (width - 1); + u64 mask = GENMASK_ULL(width + shift - 1, shift); + u64 sys_val = (sys_reg_val & mask) >> shift; + u64 pvm_max = restrictions[i].max_val; + + if (vm_supported && !vm_supported(vcpu->kvm)) + val |= (sign ?
min_signed : 0) << shift; + else if (sign && (sys_val >= sign_bit || pvm_max >= sign_bit)) + val |= max(sys_val, pvm_max) << shift; + else + val |= min(sys_val, pvm_max) << shift; + } -static u64 get_pvm_id_aa64mmfr2(const struct kvm_vcpu *vcpu) -{ - return id_aa64mmfr2_el1_sys_val & PVM_ID_AA64MMFR2_ALLOW; + return val; } static u64 pvm_calc_id_reg(const struct kvm_vcpu *vcpu, u32 id) { switch (id) { case SYS_ID_AA64PFR0_EL1: - return get_pvm_id_aa64pfr0(vcpu); + return get_restricted_features(vcpu, id_aa64pfr0_el1_sys_val, pvmid_aa64pfr0); case SYS_ID_AA64PFR1_EL1: - return get_pvm_id_aa64pfr1(vcpu); - case SYS_ID_AA64ZFR0_EL1: - return get_pvm_id_aa64zfr0(vcpu); - case SYS_ID_AA64DFR0_EL1: - return get_pvm_id_aa64dfr0(vcpu); - case SYS_ID_AA64DFR1_EL1: - return get_pvm_id_aa64dfr1(vcpu); - case SYS_ID_AA64AFR0_EL1: - return get_pvm_id_aa64afr0(vcpu); - case SYS_ID_AA64AFR1_EL1: - return get_pvm_id_aa64afr1(vcpu); + return get_restricted_features(vcpu, id_aa64pfr1_el1_sys_val, pvmid_aa64pfr1); case SYS_ID_AA64ISAR0_EL1: - return get_pvm_id_aa64isar0(vcpu); + return id_aa64isar0_el1_sys_val; case SYS_ID_AA64ISAR1_EL1: - return get_pvm_id_aa64isar1(vcpu); + return get_restricted_features(vcpu, id_aa64isar1_el1_sys_val, pvmid_aa64isar1); case SYS_ID_AA64ISAR2_EL1: - return get_pvm_id_aa64isar2(vcpu); + return get_restricted_features(vcpu, id_aa64isar2_el1_sys_val, pvmid_aa64isar2); case SYS_ID_AA64MMFR0_EL1: - return get_pvm_id_aa64mmfr0(vcpu); + return get_restricted_features(vcpu, id_aa64mmfr0_el1_sys_val, pvmid_aa64mmfr0); case SYS_ID_AA64MMFR1_EL1: - return get_pvm_id_aa64mmfr1(vcpu); + return get_restricted_features(vcpu, id_aa64mmfr1_el1_sys_val, pvmid_aa64mmfr1); case SYS_ID_AA64MMFR2_EL1: - return get_pvm_id_aa64mmfr2(vcpu); + return get_restricted_features(vcpu, id_aa64mmfr2_el1_sys_val, pvmid_aa64mmfr2); + case SYS_ID_AA64DFR0_EL1: + return ID_AA64DFR0_EL1_NONZERO_NI; + case SYS_ID_AA64MMFR4_EL1: + return ID_AA64MMFR4_EL1_NONZERO_NI; default: /* Unhandled ID register, RAZ */ return 0; } } -/* Read a sanitized cpufeature ID register by its encoding */ -u64 pvm_read_id_reg(const struct kvm_vcpu *vcpu, u32 id) +/* + * Inject an unknown/undefined exception to an AArch64 guest while most of its + * sysregs are live. + */ +static void inject_undef64(struct kvm_vcpu *vcpu) { - return pvm_calc_id_reg(vcpu, id); + u64 esr = (ESR_ELx_EC_UNKNOWN << ESR_ELx_EC_SHIFT); + + *vcpu_pc(vcpu) = read_sysreg_el2(SYS_ELR); + *vcpu_cpsr(vcpu) = read_sysreg_el2(SYS_SPSR); + + kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SYNC); + + __kvm_adjust_pc(vcpu); + + write_sysreg_el1(esr, SYS_ESR); + write_sysreg_el1(read_sysreg_el2(SYS_ELR), SYS_ELR); + write_sysreg_el2(*vcpu_pc(vcpu), SYS_ELR); + write_sysreg_el2(*vcpu_cpsr(vcpu), SYS_SPSR); } static u64 read_id_reg(const struct kvm_vcpu *vcpu, -- 2.51.0
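The table-driven clamping above can be modeled in isolation. The sketch below (hypothetical names; unsigned fields only, without the signed handling and vm_supported() hook of the real helper) shows how a FEAT_END-terminated table composes a register value field by field:

  #include <stdint.h>
  #include <stdio.h>

  struct ftr_bits {
          uint8_t shift;
          uint8_t width;
          uint8_t max_val; /* highest value allowed for the VM */
  };

  static uint64_t compose_reg(uint64_t sys_val, const struct ftr_bits *tbl)
  {
          uint64_t val = 0;
          int i;

          for (i = 0; tbl[i].width; i++) {
                  uint64_t mask = ((1ULL << tbl[i].width) - 1) << tbl[i].shift;
                  uint64_t f = (sys_val & mask) >> tbl[i].shift;

                  /* Unsigned fields: a lower value means less functionality. */
                  if (f > tbl[i].max_val)
                          f = tbl[i].max_val;
                  val |= f << tbl[i].shift;
          }
          /* Fields without a table entry read as zero (RAZ). */
          return val;
  }

  int main(void)
  {
          /* Two 4-bit fields at shifts 0 and 4, capped at 1 and 2. */
          const struct ftr_bits tbl[] = { {0, 4, 1}, {4, 4, 2}, {0, 0, 0} };

          /* The system reports 0x3 and 0x1; the VM sees 0x1 and 0x1. */
          printf("%#llx\n", (unsigned long long)compose_reg(0x13, tbl));
          return 0;
  }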