KVM: arm64: nv: Handle shadow stage 2 page faults
author	Marc Zyngier <maz@kernel.org>	Fri, 14 Jun 2024 14:45:39 +0000 (15:45 +0100)
committer	Oliver Upton <oliver.upton@linux.dev>	Wed, 19 Jun 2024 08:13:49 +0000 (08:13 +0000)
If we are faulting on a shadow stage 2 translation, we first walk the
guest hypervisor's stage 2 page table to see if it has a mapping. If
not, we inject a stage 2 page fault to the virtual EL2. Otherwise, we
create a mapping in the shadow stage 2 page table.
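
In pseudo-code, the flow added to kvm_handle_guest_abort() below boils
down to this (a simplified sketch; locking, the permission-fault check
and the error paths are omitted):

    ret = kvm_walk_nested_s2(vcpu, fault_ipa, &nested_trans);
    if (ret) {
            /* No mapping in the guest hypervisor's S2: reflect it */
            kvm_inject_s2_fault(vcpu, kvm_s2_trans_esr(&nested_trans));
            goto out_unlock;
    }
    /* A mapping exists: build the shadow S2 entry via user_mem_abort() */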

Note that we have to deal with two IPAs when we get a shadow stage 2
page fault. One is the address we faulted on, and is in the L2 guest
phys space. The other is from the guest stage-2 page table walk, and is
in the L1 guest phys space.  To differentiate them, we rename variables
so that fault_ipa is used for the former and ipa is used for the latter.
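
Concretely, with the helpers this patch introduces (sketch only):

    /* The L2 guest physical address we faulted on */
    fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
    /* The L1 guest physical address backing it, per the guest's S2 */
    ipa = kvm_s2_trans_output(&nested_trans);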

When mapping a page in a shadow stage-2, special care must be taken not
to be more permissive than the guest is.
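
For instance, if the guest's own stage-2 maps a page read-only, the
shadow entry must drop write permission even when the host side would
allow it, as in this excerpt from user_mem_abort() below:

    writable &= kvm_s2_trans_writable(nested);
    if (!kvm_s2_trans_readable(nested))
            prot &= ~KVM_PGTABLE_PROT_R;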

Co-developed-by: Christoffer Dall <christoffer.dall@linaro.org>
Co-developed-by: Jintack Lim <jintack.lim@linaro.org>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
Signed-off-by: Jintack Lim <jintack.lim@linaro.org>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20240614144552.2773592-4-maz@kernel.org
Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
arch/arm64/include/asm/kvm_nested.h
arch/arm64/kvm/mmu.c
arch/arm64/kvm/nested.c

diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h
index 5404b7b843cff6b203ee70440d699b48f3e72ad2..82e0484ca26bfeebc664c4e97d5bf31febf2f2d1 100644
--- a/arch/arm64/include/asm/kvm_nested.h
+++ b/arch/arm64/include/asm/kvm_nested.h
@@ -78,8 +78,41 @@ struct kvm_s2_trans {
        u64 upper_attr;
 };
 
+static inline phys_addr_t kvm_s2_trans_output(struct kvm_s2_trans *trans)
+{
+       return trans->output;
+}
+
+static inline unsigned long kvm_s2_trans_size(struct kvm_s2_trans *trans)
+{
+       return trans->block_size;
+}
+
+static inline u32 kvm_s2_trans_esr(struct kvm_s2_trans *trans)
+{
+       return trans->esr;
+}
+
+static inline bool kvm_s2_trans_readable(struct kvm_s2_trans *trans)
+{
+       return trans->readable;
+}
+
+static inline bool kvm_s2_trans_writable(struct kvm_s2_trans *trans)
+{
+       return trans->writable;
+}
+
+static inline bool kvm_s2_trans_executable(struct kvm_s2_trans *trans)
+{
+       return !(trans->upper_attr & BIT(54));
+}
+
 extern int kvm_walk_nested_s2(struct kvm_vcpu *vcpu, phys_addr_t gipa,
                              struct kvm_s2_trans *result);
+extern int kvm_s2_handle_perm_fault(struct kvm_vcpu *vcpu,
+                                   struct kvm_s2_trans *trans);
+extern int kvm_inject_s2_fault(struct kvm_vcpu *vcpu, u64 esr_el2);
 
 int kvm_init_nv_sysregs(struct kvm *kvm);
 
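A note on kvm_s2_trans_executable() above: bit 54 of the upper
attributes is the stage-2 XN bit, so a clear bit means the guest's S2
permits execution. A hypothetical caller clamping execute permission
would look like this (a sketch, not part of this patch):

    if (!kvm_s2_trans_executable(nested))
            prot &= ~KVM_PGTABLE_PROT_X;
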
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 8984b7c213e19b1df1373ea3d757d0584a4edaac..5aed2e9d380d0813db675a0992683db201a76570 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1407,6 +1407,7 @@ static bool kvm_vma_mte_allowed(struct vm_area_struct *vma)
 }
 
 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
+                         struct kvm_s2_trans *nested,
                          struct kvm_memory_slot *memslot, unsigned long hva,
                          bool fault_is_perm)
 {
@@ -1415,6 +1416,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
        bool exec_fault, mte_allowed;
        bool device = false, vfio_allow_any_uc = false;
        unsigned long mmu_seq;
+       phys_addr_t ipa = fault_ipa;
        struct kvm *kvm = vcpu->kvm;
        struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
        struct vm_area_struct *vma;
@@ -1498,10 +1500,38 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
        }
 
        vma_pagesize = 1UL << vma_shift;
+
+       if (nested) {
+               unsigned long max_map_size;
+
+               max_map_size = force_pte ? PAGE_SIZE : PUD_SIZE;
+
+               ipa = kvm_s2_trans_output(nested);
+
+               /*
+                * If we're about to create a shadow stage 2 entry, then we
+                * can only create a block mapping if the guest stage 2 page
+                * table uses at least as big a mapping.
+                */
+               max_map_size = min(kvm_s2_trans_size(nested), max_map_size);
+
+               /*
+                * Be careful that if the mapping size falls between
+                * two host sizes, take the smallest of the two.
+                */
+               if (max_map_size >= PMD_SIZE && max_map_size < PUD_SIZE)
+                       max_map_size = PMD_SIZE;
+               else if (max_map_size >= PAGE_SIZE && max_map_size < PMD_SIZE)
+                       max_map_size = PAGE_SIZE;
+
+               force_pte = (max_map_size == PAGE_SIZE);
+               vma_pagesize = min(vma_pagesize, (long)max_map_size);
+       }
+
        if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE)
                fault_ipa &= ~(vma_pagesize - 1);
 
-       gfn = fault_ipa >> PAGE_SHIFT;
+       gfn = ipa >> PAGE_SHIFT;
        mte_allowed = kvm_vma_mte_allowed(vma);
 
        vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;
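
To illustrate the clamping above with hypothetical sizes (host on 4K
pages, so PMD_SIZE = 2MB and PUD_SIZE = 1GB; guest S2 mapping the IPA
with a 32MB level-2 block under a 16K granule):

    max_map_size = min(32MB, PUD_SIZE) = 32MB
    PMD_SIZE <= 32MB < PUD_SIZE  =>  max_map_size = PMD_SIZE

so the shadow stage-2 maps at most a 2MB block for this fault.
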
@@ -1552,6 +1582,17 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
        if (exec_fault && device)
                return -ENOEXEC;
 
+       /*
+        * Potentially reduce shadow S2 permissions to match the guest's own
+        * S2. For exec faults, we'd only reach this point if the guest
+        * actually allowed it (see kvm_s2_handle_perm_fault).
+        */
+       if (nested) {
+               writable &= kvm_s2_trans_writable(nested);
+               if (!kvm_s2_trans_readable(nested))
+                       prot &= ~KVM_PGTABLE_PROT_R;
+       }
+
        read_lock(&kvm->mmu_lock);
        pgt = vcpu->arch.hw_mmu->pgt;
        if (mmu_invalidate_retry(kvm, mmu_seq)) {
@@ -1598,7 +1639,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
                        prot |= KVM_PGTABLE_PROT_NORMAL_NC;
                else
                        prot |= KVM_PGTABLE_PROT_DEVICE;
-       } else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC)) {
+       } else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC) &&
+                  (!nested || kvm_s2_trans_executable(nested))) {
                prot |= KVM_PGTABLE_PROT_X;
        }
 
@@ -1658,8 +1700,10 @@ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
  */
 int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
 {
+       struct kvm_s2_trans nested_trans, *nested = NULL;
        unsigned long esr;
-       phys_addr_t fault_ipa;
+       phys_addr_t fault_ipa; /* The address we faulted on */
+       phys_addr_t ipa; /* Always the IPA in the L1 guest phys space */
        struct kvm_memory_slot *memslot;
        unsigned long hva;
        bool is_iabt, write_fault, writable;
@@ -1668,7 +1712,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
 
        esr = kvm_vcpu_get_esr(vcpu);
 
-       fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
+       ipa = fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
        is_iabt = kvm_vcpu_trap_is_iabt(vcpu);
 
        if (esr_fsc_is_translation_fault(esr)) {
@@ -1718,7 +1762,42 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
 
        idx = srcu_read_lock(&vcpu->kvm->srcu);
 
-       gfn = fault_ipa >> PAGE_SHIFT;
+       /*
+        * We may have faulted on a shadow stage 2 page table if we are
+        * running a nested guest.  In this case, we have to resolve the L2
+        * IPA to the L1 IPA first, before knowing what kind of memory should
+        * back the L1 IPA.
+        *
+        * If the shadow stage 2 page table walk faults, then we simply inject
+        * this to the guest and carry on.
+        *
+        * If there are no shadow S2 PTs because S2 is disabled, there is
+        * nothing to walk and we treat it as a 1:1 before going through the
+        * canonical translation.
+        */
+       if (kvm_is_nested_s2_mmu(vcpu->kvm, vcpu->arch.hw_mmu) &&
+           vcpu->arch.hw_mmu->nested_stage2_enabled) {
+               u32 esr;
+
+               ret = kvm_walk_nested_s2(vcpu, fault_ipa, &nested_trans);
+               if (ret) {
+                       esr = kvm_s2_trans_esr(&nested_trans);
+                       kvm_inject_s2_fault(vcpu, esr);
+                       goto out_unlock;
+               }
+
+               ret = kvm_s2_handle_perm_fault(vcpu, &nested_trans);
+               if (ret) {
+                       esr = kvm_s2_trans_esr(&nested_trans);
+                       kvm_inject_s2_fault(vcpu, esr);
+                       goto out_unlock;
+               }
+
+               ipa = kvm_s2_trans_output(&nested_trans);
+               nested = &nested_trans;
+       }
+
+       gfn = ipa >> PAGE_SHIFT;
        memslot = gfn_to_memslot(vcpu->kvm, gfn);
        hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
        write_fault = kvm_is_write_fault(vcpu);
@@ -1762,13 +1841,13 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
                 * faulting VA. This is always 12 bits, irrespective
                 * of the page size.
                 */
-               fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1);
-               ret = io_mem_abort(vcpu, fault_ipa);
+               ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0);
+               ret = io_mem_abort(vcpu, ipa);
                goto out_unlock;
        }
 
        /* Userspace should not be able to register out-of-bounds IPAs */
-       VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->arch.hw_mmu));
+       VM_BUG_ON(ipa >= kvm_phys_size(vcpu->arch.hw_mmu));
 
        if (esr_fsc_is_access_flag_fault(esr)) {
                handle_access_fault(vcpu, fault_ipa);
@@ -1776,7 +1855,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
                goto out_unlock;
        }
 
-       ret = user_mem_abort(vcpu, fault_ipa, memslot, hva,
+       ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, hva,
                             esr_fsc_is_permission_fault(esr));
        if (ret == 0)
                ret = 1;
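
As an aside on the MMIO path above: HFAR_EL2 holds the faulting VA, and
its low 12 bits give the offset within the page irrespective of the page
size, hence GENMASK(11, 0). For a hypothetical page-aligned L1 IPA of
0x80000000 and a fault at page offset 0xa34:

    ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0);  /* 0x80000a34 */
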
diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
index 8c325d0a5636a4335e2cc423117af415ae054254..2a8821674b10d21cbba7f7f3b9f887b916ec77d4 100644
--- a/arch/arm64/kvm/nested.c
+++ b/arch/arm64/kvm/nested.c
@@ -121,6 +121,15 @@ static u32 compute_fsc(int level, u32 fsc)
        return fsc | (level & 0x3);
 }
 
+static int esr_s2_fault(struct kvm_vcpu *vcpu, int level, u32 fsc)
+{
+       u32 esr;
+
+       esr = kvm_vcpu_get_esr(vcpu) & ~ESR_ELx_FSC;
+       esr |= compute_fsc(level, fsc);
+       return esr;
+}
+
 static int get_ia_size(struct s2_walk_info *wi)
 {
        return 64 - wi->t0sz;
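
As an example of the ESR manipulation above: ESR_ELx_FSC_PERM is 0x0C,
so for a guest walk that stopped at level 3,

    esr = esr_s2_fault(vcpu, 3, ESR_ELx_FSC_PERM);

yields an ESR whose FSC field is 0x0C | 3 = 0x0F, the architectural
"permission fault, level 3" encoding, with every other field of the
original ESR_EL2 preserved.
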
@@ -482,6 +491,42 @@ void kvm_vcpu_put_hw_mmu(struct kvm_vcpu *vcpu)
        }
 }
 
+/*
+ * Returns non-zero if permission fault is handled by injecting it to the next
+ * level hypervisor.
+ */
+int kvm_s2_handle_perm_fault(struct kvm_vcpu *vcpu, struct kvm_s2_trans *trans)
+{
+       bool forward_fault = false;
+
+       trans->esr = 0;
+
+       if (!kvm_vcpu_trap_is_permission_fault(vcpu))
+               return 0;
+
+       if (kvm_vcpu_trap_is_iabt(vcpu)) {
+               forward_fault = !kvm_s2_trans_executable(trans);
+       } else {
+               bool write_fault = kvm_is_write_fault(vcpu);
+
+               forward_fault = ((write_fault && !trans->writable) ||
+                                (!write_fault && !trans->readable));
+       }
+
+       if (forward_fault)
+               trans->esr = esr_s2_fault(vcpu, trans->level, ESR_ELx_FSC_PERM);
+
+       return forward_fault;
+}
+
+int kvm_inject_s2_fault(struct kvm_vcpu *vcpu, u64 esr_el2)
+{
+       vcpu_write_sys_reg(vcpu, vcpu->arch.fault.far_el2, FAR_EL2);
+       vcpu_write_sys_reg(vcpu, vcpu->arch.fault.hpfar_el2, HPFAR_EL2);
+
+       return kvm_inject_nested_sync(vcpu, esr_el2);
+}
+
 void kvm_arch_flush_shadow_all(struct kvm *kvm)
 {
        int i;
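
Putting the two helpers together, a worked example (hypothetical
scenario): the L2 guest executes from a page that L1's stage-2 marks
non-executable. The resulting instruction abort reaches
kvm_handle_guest_abort(), kvm_s2_handle_perm_fault() sees an iabt with
!kvm_s2_trans_executable() and encodes a permission fault at the guest
walk level, and kvm_inject_s2_fault() loads FAR_EL2/HPFAR_EL2 and
delivers a nested sync exception to the virtual EL2:

    if (kvm_s2_handle_perm_fault(vcpu, &nested_trans)) {
            kvm_inject_s2_fault(vcpu, kvm_s2_trans_esr(&nested_trans));
            goto out_unlock;
    }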