From a2a3374c47c428c0edb0bbc693638d4783f81e31 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Fri, 10 Jan 2025 23:16:31 +0100 Subject: [PATCH 01/16] sched_ext: idle: Refresh idle masks during idle-to-idle transitions With the consolidation of put_prev_task/set_next_task(), see commit 436f3eed5c69 ("sched: Combine the last put_prev_task() and the first set_next_task()"), we are now skipping the transition between these two functions when the previous and the next tasks are the same. As a result, the scx idle state of a CPU is updated only when transitioning to or from the idle thread. While this is generally correct, it can lead to uneven and inefficient core utilization in certain scenarios [1]. A typical scenario involves proactive wake-ups: scx_bpf_pick_idle_cpu() selects and marks an idle CPU as busy, followed by a wake-up via scx_bpf_kick_cpu(), without dispatching any tasks. In this case, the CPU continues running the idle thread, returns to idle, but remains marked as busy, preventing it from being selected again as an idle CPU (until a task eventually runs on it and releases the CPU). For example, running a workload that uses 20% of each CPU, combined with an scx scheduler using proactive wake-ups, results in the following core utilization: CPU 0: 25.7% CPU 1: 29.3% CPU 2: 26.5% CPU 3: 25.5% CPU 4: 0.0% CPU 5: 25.5% CPU 6: 0.0% CPU 7: 10.5% To address this, refresh the idle state also in pick_task_idle(), during idle-to-idle transitions, but only trigger ops.update_idle() on actual state changes to prevent unnecessary updates to the scx scheduler and maintain balanced state transitions. With this change in place, the core utilization in the previous example becomes the following: CPU 0: 18.8% CPU 1: 19.4% CPU 2: 18.0% CPU 3: 18.7% CPU 4: 19.3% CPU 5: 18.9% CPU 6: 18.7% CPU 7: 19.3% [1] https://github.com/sched-ext/scx/pull/1139 Fixes: 7c65ae81ea86 ("sched_ext: Don't call put_prev_task_scx() before picking the next task") Signed-off-by: Andrea Righi Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 61 ++++++++++++++++++++++++++++++++++++++------- kernel/sched/ext.h | 8 +++--- kernel/sched/idle.c | 5 ++-- 3 files changed, 59 insertions(+), 15 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 68150e110451..19813b387ef9 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3590,16 +3590,8 @@ static void reset_idle_masks(void) cpumask_copy(idle_masks.smt, cpu_online_mask); } -void __scx_update_idle(struct rq *rq, bool idle) +static void update_builtin_idle(int cpu, bool idle) { - int cpu = cpu_of(rq); - - if (SCX_HAS_OP(update_idle) && !scx_rq_bypassing(rq)) { - SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle); - if (!static_branch_unlikely(&scx_builtin_idle_enabled)) - return; - } - if (idle) cpumask_set_cpu(cpu, idle_masks.cpu); else @@ -3626,6 +3618,57 @@ void __scx_update_idle(struct rq *rq, bool idle) #endif } +/* + * Update the idle state of a CPU to @idle. + * + * If @do_notify is true, ops.update_idle() is invoked to notify the scx + * scheduler of an actual idle state transition (idle to busy or vice + * versa). If @do_notify is false, only the idle state in the idle masks is + * refreshed without invoking ops.update_idle(). + * + * This distinction is necessary, because an idle CPU can be "reserved" and + * awakened via scx_bpf_pick_idle_cpu() + scx_bpf_kick_cpu(), marking it as + * busy even if no tasks are dispatched. In this case, the CPU may return + * to idle without a true state transition. Refreshing the idle masks + * without invoking ops.update_idle() ensures accurate idle state tracking + * while avoiding unnecessary updates and maintaining balanced state + * transitions. + */ +void __scx_update_idle(struct rq *rq, bool idle, bool do_notify) +{ + int cpu = cpu_of(rq); + + lockdep_assert_rq_held(rq); + + /* + * Trigger ops.update_idle() only when transitioning from a task to + * the idle thread and vice versa. + * + * Idle transitions are indicated by do_notify being set to true, + * managed by put_prev_task_idle()/set_next_task_idle(). + */ + if (SCX_HAS_OP(update_idle) && do_notify && !scx_rq_bypassing(rq)) + SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle); + + /* + * Update the idle masks: + * - for real idle transitions (do_notify == true) + * - for idle-to-idle transitions (indicated by the previous task + * being the idle thread, managed by pick_task_idle()) + * + * Skip updating idle masks if the previous task is not the idle + * thread, since set_next_task_idle() has already handled it when + * transitioning from a task to the idle thread (calling this + * function with do_notify == true). + * + * In this way we can avoid updating the idle masks twice, + * unnecessarily. + */ + if (static_branch_likely(&scx_builtin_idle_enabled)) + if (do_notify || is_idle_task(rq->curr)) + update_builtin_idle(cpu, idle); +} + static void handle_hotplug(struct rq *rq, bool online) { int cpu = cpu_of(rq); diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index b1675bb59fc4..4d022d17ac7d 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -57,15 +57,15 @@ static inline void init_sched_ext_class(void) {} #endif /* CONFIG_SCHED_CLASS_EXT */ #if defined(CONFIG_SCHED_CLASS_EXT) && defined(CONFIG_SMP) -void __scx_update_idle(struct rq *rq, bool idle); +void __scx_update_idle(struct rq *rq, bool idle, bool do_notify); -static inline void scx_update_idle(struct rq *rq, bool idle) +static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify) { if (scx_enabled()) - __scx_update_idle(rq, idle); + __scx_update_idle(rq, idle, do_notify); } #else -static inline void scx_update_idle(struct rq *rq, bool idle) {} +static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify) {} #endif #ifdef CONFIG_CGROUP_SCHED diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 621696269584..2c85c86b455f 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -452,19 +452,20 @@ static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags) static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct task_struct *next) { dl_server_update_idle_time(rq, prev); - scx_update_idle(rq, false); + scx_update_idle(rq, false, true); } static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first) { update_idle_core(rq); - scx_update_idle(rq, true); + scx_update_idle(rq, true, true); schedstat_inc(rq->sched_goidle); next->se.exec_start = rq_clock_task(rq); } struct task_struct *pick_task_idle(struct rq *rq) { + scx_update_idle(rq, true, false); return rq->idle; } -- 2.50.1 From 20b1aa912316ffb7fbb5f407f17c330f2a22ddff Mon Sep 17 00:00:00 2001 From: Meetakshi Setiya Date: Wed, 8 Jan 2025 05:10:34 -0500 Subject: [PATCH 02/16] smb: client: sync the root session and superblock context passwords before automounting In some cases, when password2 becomes the working password, the client swaps the two password fields in the root session struct, but not in the smb3_fs_context struct in cifs_sb. DFS automounts inherit fs context from their parent mounts. Therefore, they might end up getting the passwords in the stale order. The automount should succeed, because the mount function will end up retrying with the actual password anyway. But to reduce these unnecessary session setup retries for automounts, we can sync the parent context's passwords with the root session's passwords before duplicating it to the child's fs context. Cc: stable@vger.kernel.org Signed-off-by: Meetakshi Setiya Reviewed-by: Shyam Prasad N Acked-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/namespace.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/fs/smb/client/namespace.c b/fs/smb/client/namespace.c index 0f788031b740..e3f9213131c4 100644 --- a/fs/smb/client/namespace.c +++ b/fs/smb/client/namespace.c @@ -196,11 +196,28 @@ static struct vfsmount *cifs_do_automount(struct path *path) struct smb3_fs_context tmp; char *full_path; struct vfsmount *mnt; + struct cifs_sb_info *mntpt_sb; + struct cifs_ses *ses; if (IS_ROOT(mntpt)) return ERR_PTR(-ESTALE); - cur_ctx = CIFS_SB(mntpt->d_sb)->ctx; + mntpt_sb = CIFS_SB(mntpt->d_sb); + ses = cifs_sb_master_tcon(mntpt_sb)->ses; + cur_ctx = mntpt_sb->ctx; + + /* + * At this point, the root session should be in the mntpt sb. We should + * bring the sb context passwords in sync with the root session's + * passwords. This would help prevent unnecessary retries and password + * swaps for automounts. + */ + mutex_lock(&ses->session_mutex); + rc = smb3_sync_session_ctx_passwords(mntpt_sb, ses); + mutex_unlock(&ses->session_mutex); + + if (rc) + return ERR_PTR(rc); fc = fs_context_for_submount(path->mnt->mnt_sb->s_type, mntpt); if (IS_ERR(fc)) -- 2.50.1 From 77a903cd8e5a91d120ee014c8f8eae74d6c5d0f6 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Sat, 11 Jan 2025 10:57:38 +1100 Subject: [PATCH 03/16] MAINTAINERS: powerpc: Update my status Maddy is taking over the day-to-day maintenance of powerpc. I will still be around to help, and as a backup. Re-order the main POWERPC list to put Maddy first to reflect that. KVM/powerpc patches will be handled by Maddy via the powerpc tree with review from Nick, so replace myself with Maddy there. Remove myself from BPF, leaving Hari & Christophe as maintainers. Signed-off-by: Michael Ellerman Signed-off-by: Linus Torvalds --- MAINTAINERS | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index dba82fb32d04..a87ddad78e26 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4128,7 +4128,6 @@ S: Odd Fixes F: drivers/net/ethernet/netronome/nfp/bpf/ BPF JIT for POWERPC (32-BIT AND 64-BIT) -M: Michael Ellerman M: Hari Bathini M: Christophe Leroy R: Naveen N Rao @@ -12629,7 +12628,7 @@ F: arch/mips/include/uapi/asm/kvm* F: arch/mips/kvm/ KERNEL VIRTUAL MACHINE FOR POWERPC (KVM/powerpc) -M: Michael Ellerman +M: Madhavan Srinivasan R: Nicholas Piggin L: linuxppc-dev@lists.ozlabs.org L: kvm@vger.kernel.org @@ -13208,11 +13207,11 @@ X: drivers/macintosh/adb-iop.c X: drivers/macintosh/via-macii.c LINUX FOR POWERPC (32-BIT AND 64-BIT) +M: Madhavan Srinivasan M: Michael Ellerman R: Nicholas Piggin R: Christophe Leroy R: Naveen N Rao -M: Madhavan Srinivasan L: linuxppc-dev@lists.ozlabs.org S: Supported W: https://github.com/linuxppc/wiki/wiki -- 2.50.1 From 87ecfdbc699cc95fac73291b52650283ddcf929d Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sun, 12 Jan 2025 10:34:44 +0100 Subject: [PATCH 04/16] KVM: e500: always restore irqs If find_linux_pte fails, IRQs will not be restored. This is unlikely to happen in practice since it would have been reported as hanging hosts, but it should of course be fixed anyway. Cc: stable@vger.kernel.org Reported-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/e500_mmu_host.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c index e5a145b578a4..6824e8139801 100644 --- a/arch/powerpc/kvm/e500_mmu_host.c +++ b/arch/powerpc/kvm/e500_mmu_host.c @@ -479,7 +479,6 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, if (pte_present(pte)) { wimg = (pte_val(pte) >> PTE_WIMGE_SHIFT) & MAS2_WIMGE_MASK; - local_irq_restore(flags); } else { local_irq_restore(flags); pr_err_ratelimited("%s: pte not present: gfn %lx,pfn %lx\n", @@ -488,8 +487,9 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, goto out; } } - writable = kvmppc_e500_ref_setup(ref, gtlbe, pfn, wimg); + local_irq_restore(flags); + writable = kvmppc_e500_ref_setup(ref, gtlbe, pfn, wimg); kvmppc_e500_setup_stlbe(&vcpu_e500->vcpu, gtlbe, tsize, ref, gvaddr, stlbe); -- 2.50.1 From e97fbb43fb1b2e909dfd726204af3cdcb971517e Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 8 Jan 2025 16:19:28 +0100 Subject: [PATCH 05/16] KVM: e500: use shadow TLB entry as witness for writability kvmppc_e500_ref_setup is returning whether the guest TLB entry is writable, which is than passed to kvm_release_faultin_page. This makes little sense for two reasons: first, because the function sets up the private data for the page and the return value feels like it has been bolted on the side; second, because what really matters is whether the _shadow_ TLB entry is writable. If it is not writable, the page can be released as non-dirty. Shift from using tlbe_is_writable(gtlbe) to doing the same check on the shadow TLB entry. Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/e500_mmu_host.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c index 6824e8139801..c266c02f120f 100644 --- a/arch/powerpc/kvm/e500_mmu_host.c +++ b/arch/powerpc/kvm/e500_mmu_host.c @@ -242,7 +242,7 @@ static inline int tlbe_is_writable(struct kvm_book3e_206_tlb_entry *tlbe) return tlbe->mas7_3 & (MAS3_SW|MAS3_UW); } -static inline bool kvmppc_e500_ref_setup(struct tlbe_ref *ref, +static inline void kvmppc_e500_ref_setup(struct tlbe_ref *ref, struct kvm_book3e_206_tlb_entry *gtlbe, kvm_pfn_t pfn, unsigned int wimg) { @@ -251,8 +251,6 @@ static inline bool kvmppc_e500_ref_setup(struct tlbe_ref *ref, /* Use guest supplied MAS2_G and MAS2_E */ ref->flags |= (gtlbe->mas2 & MAS2_ATTRIB_MASK) | wimg; - - return tlbe_is_writable(gtlbe); } static inline void kvmppc_e500_ref_release(struct tlbe_ref *ref) @@ -489,9 +487,10 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, } local_irq_restore(flags); - writable = kvmppc_e500_ref_setup(ref, gtlbe, pfn, wimg); + kvmppc_e500_ref_setup(ref, gtlbe, pfn, wimg); kvmppc_e500_setup_stlbe(&vcpu_e500->vcpu, gtlbe, tsize, ref, gvaddr, stlbe); + writable = tlbe_is_writable(stlbe); /* Clear i-cache for new pages */ kvmppc_mmu_flush_icache(pfn); -- 2.50.1 From f2104bf22f0475a08a0feeee40d8e45ce38ed5a1 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 8 Jan 2025 16:21:38 +0100 Subject: [PATCH 06/16] KVM: e500: track host-writability of pages Add the possibility of marking a page so that the UW and SW bits are force-cleared. This is stored in the private info so that it persists across multiple calls to kvmppc_e500_setup_stlbe. Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/e500.h | 2 ++ arch/powerpc/kvm/e500_mmu_host.c | 15 +++++++++++---- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kvm/e500.h b/arch/powerpc/kvm/e500.h index 6d0d329cbb35..f9acf866c709 100644 --- a/arch/powerpc/kvm/e500.h +++ b/arch/powerpc/kvm/e500.h @@ -34,6 +34,8 @@ enum vcpu_ftr { #define E500_TLB_BITMAP (1 << 30) /* TLB1 entry is mapped by host TLB0 */ #define E500_TLB_TLB0 (1 << 29) +/* entry is writable on the host */ +#define E500_TLB_WRITABLE (1 << 28) /* bits [6-5] MAS2_X1 and MAS2_X0 and [4-0] bits for WIMGE */ #define E500_TLB_MAS2_ATTR (0x7f) diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c index c266c02f120f..b1be39639d4a 100644 --- a/arch/powerpc/kvm/e500_mmu_host.c +++ b/arch/powerpc/kvm/e500_mmu_host.c @@ -45,11 +45,14 @@ static inline unsigned int tlb1_max_shadow_size(void) return host_tlb_params[1].entries - tlbcam_index - 1; } -static inline u32 e500_shadow_mas3_attrib(u32 mas3, int usermode) +static inline u32 e500_shadow_mas3_attrib(u32 mas3, bool writable, int usermode) { /* Mask off reserved bits. */ mas3 &= MAS3_ATTRIB_MASK; + if (!writable) + mas3 &= ~(MAS3_UW|MAS3_SW); + #ifndef CONFIG_KVM_BOOKE_HV if (!usermode) { /* Guest is in supervisor mode, @@ -244,10 +247,13 @@ static inline int tlbe_is_writable(struct kvm_book3e_206_tlb_entry *tlbe) static inline void kvmppc_e500_ref_setup(struct tlbe_ref *ref, struct kvm_book3e_206_tlb_entry *gtlbe, - kvm_pfn_t pfn, unsigned int wimg) + kvm_pfn_t pfn, unsigned int wimg, + bool writable) { ref->pfn = pfn; ref->flags = E500_TLB_VALID; + if (writable) + ref->flags |= E500_TLB_WRITABLE; /* Use guest supplied MAS2_G and MAS2_E */ ref->flags |= (gtlbe->mas2 & MAS2_ATTRIB_MASK) | wimg; @@ -303,6 +309,7 @@ static void kvmppc_e500_setup_stlbe( { kvm_pfn_t pfn = ref->pfn; u32 pr = vcpu->arch.shared->msr & MSR_PR; + bool writable = !!(ref->flags & E500_TLB_WRITABLE); BUG_ON(!(ref->flags & E500_TLB_VALID)); @@ -310,7 +317,7 @@ static void kvmppc_e500_setup_stlbe( stlbe->mas1 = MAS1_TSIZE(tsize) | get_tlb_sts(gtlbe) | MAS1_VALID; stlbe->mas2 = (gvaddr & MAS2_EPN) | (ref->flags & E500_TLB_MAS2_ATTR); stlbe->mas7_3 = ((u64)pfn << PAGE_SHIFT) | - e500_shadow_mas3_attrib(gtlbe->mas7_3, pr); + e500_shadow_mas3_attrib(gtlbe->mas7_3, writable, pr); } static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, @@ -487,7 +494,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, } local_irq_restore(flags); - kvmppc_e500_ref_setup(ref, gtlbe, pfn, wimg); + kvmppc_e500_ref_setup(ref, gtlbe, pfn, wimg, true); kvmppc_e500_setup_stlbe(&vcpu_e500->vcpu, gtlbe, tsize, ref, gvaddr, stlbe); writable = tlbe_is_writable(stlbe); -- 2.50.1 From 03b755b2aa48d242440cbfbd365e153b4b20fe54 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 8 Jan 2025 16:14:55 +0100 Subject: [PATCH 07/16] KVM: e500: map readonly host pages for read The new __kvm_faultin_pfn() function is upset by the fact that e500 KVM ignores host page permissions - __kvm_faultin requires a "writable" outgoing argument, but e500 KVM is nonchalantly passing NULL. If the host page permissions do not include writability, the shadow TLB entry is forcibly mapped read-only. Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/e500_mmu_host.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c index b1be39639d4a..b38679e5821b 100644 --- a/arch/powerpc/kvm/e500_mmu_host.c +++ b/arch/powerpc/kvm/e500_mmu_host.c @@ -374,6 +374,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, unsigned long slot_start, slot_end; pfnmap = 1; + writable = vma->vm_flags & VM_WRITE; start = vma->vm_pgoff; end = start + @@ -449,7 +450,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, if (likely(!pfnmap)) { tsize_pages = 1UL << (tsize + 10 - PAGE_SHIFT); - pfn = __kvm_faultin_pfn(slot, gfn, FOLL_WRITE, NULL, &page); + pfn = __kvm_faultin_pfn(slot, gfn, FOLL_WRITE, &writable, &page); if (is_error_noslot_pfn(pfn)) { if (printk_ratelimit()) pr_err("%s: real page not found for gfn %lx\n", @@ -494,7 +495,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, } local_irq_restore(flags); - kvmppc_e500_ref_setup(ref, gtlbe, pfn, wimg, true); + kvmppc_e500_ref_setup(ref, gtlbe, pfn, wimg, writable); kvmppc_e500_setup_stlbe(&vcpu_e500->vcpu, gtlbe, tsize, ref, gvaddr, stlbe); writable = tlbe_is_writable(stlbe); -- 2.50.1 From 55f4db79c4d94d4bb757f7a31a7f14de22fe517d Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 8 Jan 2025 16:49:50 +0100 Subject: [PATCH 08/16] KVM: e500: perform hugepage check after looking up the PFN e500 KVM tries to bypass __kvm_faultin_pfn() in order to map VM_PFNMAP VMAs as huge pages. This is a Bad Idea because VM_PFNMAP VMAs could become noncontiguous as a result of callsto remap_pfn_range(). Instead, use the already existing host PTE lookup to retrieve a valid host-side mapping level after __kvm_faultin_pfn() has returned. Then find the largest size that will satisfy the guest's request while staying within a single host PTE. Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/e500_mmu_host.c | 178 ++++++++++++------------------- 1 file changed, 69 insertions(+), 109 deletions(-) diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c index b38679e5821b..06caf8bbbe2b 100644 --- a/arch/powerpc/kvm/e500_mmu_host.c +++ b/arch/powerpc/kvm/e500_mmu_host.c @@ -326,15 +326,14 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, struct tlbe_ref *ref) { struct kvm_memory_slot *slot; - unsigned long pfn = 0; /* silence GCC warning */ + unsigned int psize; + unsigned long pfn; struct page *page = NULL; unsigned long hva; - int pfnmap = 0; int tsize = BOOK3E_PAGESZ_4K; int ret = 0; unsigned long mmu_seq; struct kvm *kvm = vcpu_e500->vcpu.kvm; - unsigned long tsize_pages = 0; pte_t *ptep; unsigned int wimg = 0; pgd_t *pgdir; @@ -356,111 +355,12 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, slot = gfn_to_memslot(vcpu_e500->vcpu.kvm, gfn); hva = gfn_to_hva_memslot(slot, gfn); - if (tlbsel == 1) { - struct vm_area_struct *vma; - mmap_read_lock(kvm->mm); - - vma = find_vma(kvm->mm, hva); - if (vma && hva >= vma->vm_start && - (vma->vm_flags & VM_PFNMAP)) { - /* - * This VMA is a physically contiguous region (e.g. - * /dev/mem) that bypasses normal Linux page - * management. Find the overlap between the - * vma and the memslot. - */ - - unsigned long start, end; - unsigned long slot_start, slot_end; - - pfnmap = 1; - writable = vma->vm_flags & VM_WRITE; - - start = vma->vm_pgoff; - end = start + - vma_pages(vma); - - pfn = start + ((hva - vma->vm_start) >> PAGE_SHIFT); - - slot_start = pfn - (gfn - slot->base_gfn); - slot_end = slot_start + slot->npages; - - if (start < slot_start) - start = slot_start; - if (end > slot_end) - end = slot_end; - - tsize = (gtlbe->mas1 & MAS1_TSIZE_MASK) >> - MAS1_TSIZE_SHIFT; - - /* - * e500 doesn't implement the lowest tsize bit, - * or 1K pages. - */ - tsize = max(BOOK3E_PAGESZ_4K, tsize & ~1); - - /* - * Now find the largest tsize (up to what the guest - * requested) that will cover gfn, stay within the - * range, and for which gfn and pfn are mutually - * aligned. - */ - - for (; tsize > BOOK3E_PAGESZ_4K; tsize -= 2) { - unsigned long gfn_start, gfn_end; - tsize_pages = 1UL << (tsize - 2); - - gfn_start = gfn & ~(tsize_pages - 1); - gfn_end = gfn_start + tsize_pages; - - if (gfn_start + pfn - gfn < start) - continue; - if (gfn_end + pfn - gfn > end) - continue; - if ((gfn & (tsize_pages - 1)) != - (pfn & (tsize_pages - 1))) - continue; - - gvaddr &= ~((tsize_pages << PAGE_SHIFT) - 1); - pfn &= ~(tsize_pages - 1); - break; - } - } else if (vma && hva >= vma->vm_start && - is_vm_hugetlb_page(vma)) { - unsigned long psize = vma_kernel_pagesize(vma); - - tsize = (gtlbe->mas1 & MAS1_TSIZE_MASK) >> - MAS1_TSIZE_SHIFT; - - /* - * Take the largest page size that satisfies both host - * and guest mapping - */ - tsize = min(__ilog2(psize) - 10, tsize); - - /* - * e500 doesn't implement the lowest tsize bit, - * or 1K pages. - */ - tsize = max(BOOK3E_PAGESZ_4K, tsize & ~1); - } - - mmap_read_unlock(kvm->mm); - } - - if (likely(!pfnmap)) { - tsize_pages = 1UL << (tsize + 10 - PAGE_SHIFT); - pfn = __kvm_faultin_pfn(slot, gfn, FOLL_WRITE, &writable, &page); - if (is_error_noslot_pfn(pfn)) { - if (printk_ratelimit()) - pr_err("%s: real page not found for gfn %lx\n", - __func__, (long)gfn); - return -EINVAL; - } - - /* Align guest and physical address to page map boundaries */ - pfn &= ~(tsize_pages - 1); - gvaddr &= ~((tsize_pages << PAGE_SHIFT) - 1); + pfn = __kvm_faultin_pfn(slot, gfn, FOLL_WRITE, &writable, &page); + if (is_error_noslot_pfn(pfn)) { + if (printk_ratelimit()) + pr_err("%s: real page not found for gfn %lx\n", + __func__, (long)gfn); + return -EINVAL; } spin_lock(&kvm->mmu_lock); @@ -478,7 +378,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, * can't run hence pfn won't change. */ local_irq_save(flags); - ptep = find_linux_pte(pgdir, hva, NULL, NULL); + ptep = find_linux_pte(pgdir, hva, NULL, &psize); if (ptep) { pte_t pte = READ_ONCE(*ptep); @@ -495,6 +395,66 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, } local_irq_restore(flags); + if (psize && tlbsel == 1) { + unsigned long psize_pages, tsize_pages; + unsigned long start, end; + unsigned long slot_start, slot_end; + + psize_pages = 1UL << (psize - PAGE_SHIFT); + start = pfn & ~(psize_pages - 1); + end = start + psize_pages; + + slot_start = pfn - (gfn - slot->base_gfn); + slot_end = slot_start + slot->npages; + + if (start < slot_start) + start = slot_start; + if (end > slot_end) + end = slot_end; + + tsize = (gtlbe->mas1 & MAS1_TSIZE_MASK) >> + MAS1_TSIZE_SHIFT; + + /* + * Any page size that doesn't satisfy the host mapping + * will fail the start and end tests. + */ + tsize = min(psize - PAGE_SHIFT + BOOK3E_PAGESZ_4K, tsize); + + /* + * e500 doesn't implement the lowest tsize bit, + * or 1K pages. + */ + tsize = max(BOOK3E_PAGESZ_4K, tsize & ~1); + + /* + * Now find the largest tsize (up to what the guest + * requested) that will cover gfn, stay within the + * range, and for which gfn and pfn are mutually + * aligned. + */ + + for (; tsize > BOOK3E_PAGESZ_4K; tsize -= 2) { + unsigned long gfn_start, gfn_end; + tsize_pages = 1UL << (tsize - 2); + + gfn_start = gfn & ~(tsize_pages - 1); + gfn_end = gfn_start + tsize_pages; + + if (gfn_start + pfn - gfn < start) + continue; + if (gfn_end + pfn - gfn > end) + continue; + if ((gfn & (tsize_pages - 1)) != + (pfn & (tsize_pages - 1))) + continue; + + gvaddr &= ~((tsize_pages << PAGE_SHIFT) - 1); + pfn &= ~(tsize_pages - 1); + break; + } + } + kvmppc_e500_ref_setup(ref, gtlbe, pfn, wimg, writable); kvmppc_e500_setup_stlbe(&vcpu_e500->vcpu, gtlbe, tsize, ref, gvaddr, stlbe); -- 2.50.1 From 5bc55a333a2f7316b58edc7573e8e893f7acb532 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 12 Jan 2025 14:37:56 -0800 Subject: [PATCH 09/16] Linux 6.13-rc7 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 7904d5d88088..e20a62ad397f 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 13 SUBLEVEL = 0 -EXTRAVERSION = -rc6 +EXTRAVERSION = -rc7 NAME = Baby Opossum Posse # *DOCUMENTATION* -- 2.50.1 From a78c5c2269e39544fab4140105f55fca4502e915 Mon Sep 17 00:00:00 2001 From: Ethan Carter Edwards Date: Sun, 22 Dec 2024 21:23:56 -0500 Subject: [PATCH 10/16] fs: erofs: xattr.c change kzalloc to kcalloc Refactor xattr.c to use kcalloc instead of kzalloc when multiplying allocation size by count. This refactor prevents unintentional memory overflows. Discovered by checkpatch.pl. Signed-off-by: Ethan Carter Edwards Link: https://lore.kernel.org/r/i3CLJhMELKzBJr3DaRyv-hP_4m-3Twx0sgBWXW6naZlMtHrIeWr93xOFshX8qZHDrJeSjHMTiUOh8JmBZ9v0AB-S1lIYM_d-vasSRlsF_s4=@ethancedwards.com Reviewed-by: Chao Yu Signed-off-by: Gao Xiang --- fs/erofs/xattr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index a90d7d649739..7940241d9355 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -478,7 +478,7 @@ int erofs_xattr_prefixes_init(struct super_block *sb) if (!sbi->xattr_prefix_count) return 0; - pfs = kzalloc(sbi->xattr_prefix_count * sizeof(*pfs), GFP_KERNEL); + pfs = kcalloc(sbi->xattr_prefix_count, sizeof(*pfs), GFP_KERNEL); if (!pfs) return -ENOMEM; -- 2.50.1 From 54ab25d03eab652a321c3d1566edc2b050d83f2f Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Thu, 12 Dec 2024 10:39:48 +0800 Subject: [PATCH 11/16] erofs: micro-optimize superblock checksum Just verify the remaining unknown on-disk data instead of allocating a temporary buffer for the whole superblock and zeroing out the checksum field since .magic(EROFS_SUPER_MAGIC_V1) is verified and .checksum(0) is fixed. Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20241212023948.1143038-1-hsiangkao@linux.alibaba.com --- fs/erofs/erofs_fs.h | 3 ++- fs/erofs/super.c | 30 +++++++++++------------------- 2 files changed, 13 insertions(+), 20 deletions(-) diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index c8f2ae845bd2..199395ed1c1f 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -9,6 +9,7 @@ #ifndef __EROFS_FS_H #define __EROFS_FS_H +/* to allow for x86 boot sectors and other oddities. */ #define EROFS_SUPER_OFFSET 1024 #define EROFS_FEATURE_COMPAT_SB_CHKSUM 0x00000001 @@ -54,7 +55,7 @@ struct erofs_deviceslot { /* erofs on-disk super block (currently 128 bytes) */ struct erofs_super_block { __le32 magic; /* file system magic number */ - __le32 checksum; /* crc32c(super_block) */ + __le32 checksum; /* crc32c to avoid unexpected on-disk overlap */ __le32 feature_compat; __u8 blkszbits; /* filesystem block size in bit shift */ __u8 sb_extslots; /* superblock size = 128 + sb_extslots * 16 */ diff --git a/fs/erofs/super.c b/fs/erofs/super.c index f5956474bfde..1fc5623c3a4d 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -39,29 +39,21 @@ void _erofs_printk(struct super_block *sb, const char *fmt, ...) static int erofs_superblock_csum_verify(struct super_block *sb, void *sbdata) { - size_t len = 1 << EROFS_SB(sb)->blkszbits; - struct erofs_super_block *dsb; - u32 expected_crc, crc; + struct erofs_super_block *dsb = sbdata + EROFS_SUPER_OFFSET; + u32 len = 1 << EROFS_SB(sb)->blkszbits, crc; if (len > EROFS_SUPER_OFFSET) len -= EROFS_SUPER_OFFSET; + len -= offsetof(struct erofs_super_block, checksum) + + sizeof(dsb->checksum); - dsb = kmemdup(sbdata + EROFS_SUPER_OFFSET, len, GFP_KERNEL); - if (!dsb) - return -ENOMEM; - - expected_crc = le32_to_cpu(dsb->checksum); - dsb->checksum = 0; - /* to allow for x86 boot sectors and other oddities. */ - crc = crc32c(~0, dsb, len); - kfree(dsb); - - if (crc != expected_crc) { - erofs_err(sb, "invalid checksum 0x%08x, 0x%08x expected", - crc, expected_crc); - return -EBADMSG; - } - return 0; + /* skip .magic(pre-verified) and .checksum(0) fields */ + crc = crc32c(0x5045B54A, (&dsb->checksum) + 1, len); + if (crc == le32_to_cpu(dsb->checksum)) + return 0; + erofs_err(sb, "invalid checksum 0x%08x, 0x%08x expected", + crc, le32_to_cpu(dsb->checksum)); + return -EBADMSG; } static void erofs_inode_init_once(void *ptr) -- 2.50.1 From 9f74ae8c9ac97a79f9d45c92bd8ac8598e17f21f Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Tue, 7 Jan 2025 16:28:25 +0800 Subject: [PATCH 12/16] erofs: shorten bvecs[] for file-backed mounts BIO_MAX_VECS is too large for __GFP_NOFAIL allocation. We could use a mempool (since BIOs can always proceed), but it seems overly complicated for now. Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20250107082825.74242-1-hsiangkao@linux.alibaba.com --- fs/erofs/fileio.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/erofs/fileio.c b/fs/erofs/fileio.c index 33f8539dda4a..0ffd1c63beeb 100644 --- a/fs/erofs/fileio.c +++ b/fs/erofs/fileio.c @@ -6,7 +6,7 @@ #include struct erofs_fileio_rq { - struct bio_vec bvecs[BIO_MAX_VECS]; + struct bio_vec bvecs[16]; struct bio bio; struct kiocb iocb; struct super_block *sb; @@ -68,7 +68,7 @@ static struct erofs_fileio_rq *erofs_fileio_rq_alloc(struct erofs_map_dev *mdev) struct erofs_fileio_rq *rq = kzalloc(sizeof(*rq), GFP_KERNEL | __GFP_NOFAIL); - bio_init(&rq->bio, NULL, rq->bvecs, BIO_MAX_VECS, REQ_OP_READ); + bio_init(&rq->bio, NULL, rq->bvecs, ARRAY_SIZE(rq->bvecs), REQ_OP_READ); rq->iocb.ki_filp = mdev->m_dif->file; rq->sb = mdev->m_sb; return rq; -- 2.50.1 From db902986dee453bfb5835cbc8efa67154ab34caf Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Tue, 14 Jan 2025 12:00:58 +0800 Subject: [PATCH 13/16] erofs: fix potential return value overflow of z_erofs_shrink_scan() z_erofs_shrink_scan() could return small numbers due to the mistyped `freed`. Although I don't think it has any visible impact. Fixes: 3883a79abd02 ("staging: erofs: introduce VLE decompression support") Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20250114040058.459981-1-hsiangkao@linux.alibaba.com --- fs/erofs/zdata.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 19ef4ff2a134..f416b73e0ca3 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -927,8 +927,7 @@ unsigned long z_erofs_shrink_scan(struct erofs_sb_info *sbi, unsigned long nr_shrink) { struct z_erofs_pcluster *pcl; - unsigned int freed = 0; - unsigned long index; + unsigned long index, freed = 0; xa_lock(&sbi->managed_pslots); xa_for_each(&sbi->managed_pslots, index, pcl) { -- 2.50.1 From 2a810ea79cd7a6d5f134ea69ca2ba726e600cbc4 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Tue, 14 Jan 2025 11:44:26 +0800 Subject: [PATCH 14/16] erofs: simplify z_erofs_load_compact_lcluster() - Get rid of unpack_compacted_index() and fold it into z_erofs_load_compact_lcluster(); - Avoid a goto. Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20250114034429.431408-1-hsiangkao@linux.alibaba.com --- fs/erofs/zmap.c | 89 ++++++++++++++++++++----------------------------- 1 file changed, 36 insertions(+), 53 deletions(-) diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 4535f2f0a014..b9e35089c9b8 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -97,17 +97,48 @@ static int get_compacted_la_distance(unsigned int lobits, return d1; } -static int unpack_compacted_index(struct z_erofs_maprecorder *m, - unsigned int amortizedshift, - erofs_off_t pos, bool lookahead) +static int z_erofs_load_compact_lcluster(struct z_erofs_maprecorder *m, + unsigned long lcn, bool lookahead) { - struct erofs_inode *const vi = EROFS_I(m->inode); + struct inode *const inode = m->inode; + struct erofs_inode *const vi = EROFS_I(inode); + const erofs_off_t ebase = sizeof(struct z_erofs_map_header) + + ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, 8); const unsigned int lclusterbits = vi->z_logical_clusterbits; + const unsigned int totalidx = erofs_iblks(inode); + unsigned int compacted_4b_initial, compacted_2b, amortizedshift; unsigned int vcnt, lo, lobits, encodebits, nblk, bytes; - bool big_pcluster; + bool big_pcluster = vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1; + erofs_off_t pos; u8 *in, type; int i; + if (lcn >= totalidx || lclusterbits > 14) + return -EINVAL; + + m->lcn = lcn; + /* used to align to 32-byte (compacted_2b) alignment */ + compacted_4b_initial = ((32 - ebase % 32) / 4) & 7; + compacted_2b = 0; + if ((vi->z_advise & Z_EROFS_ADVISE_COMPACTED_2B) && + compacted_4b_initial < totalidx) + compacted_2b = rounddown(totalidx - compacted_4b_initial, 16); + + pos = ebase; + amortizedshift = 2; /* compact_4b */ + if (lcn >= compacted_4b_initial) { + pos += compacted_4b_initial * 4; + lcn -= compacted_4b_initial; + if (lcn < compacted_2b) { + amortizedshift = 1; + } else { + pos += compacted_2b * 2; + lcn -= compacted_2b; + } + } + pos += lcn * (1 << amortizedshift); + + /* figure out the lcluster count in this pack */ if (1 << amortizedshift == 4 && lclusterbits <= 14) vcnt = 2; else if (1 << amortizedshift == 2 && lclusterbits <= 12) @@ -122,7 +153,6 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, /* it doesn't equal to round_up(..) */ m->nextpackoff = round_down(pos, vcnt << amortizedshift) + (vcnt << amortizedshift); - big_pcluster = vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1; lobits = max(lclusterbits, ilog2(Z_EROFS_LI_D0_CBLKCNT) + 1U); encodebits = ((vcnt << amortizedshift) - sizeof(__le32)) * 8 / vcnt; bytes = pos & ((vcnt << amortizedshift) - 1); @@ -207,53 +237,6 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, return 0; } -static int z_erofs_load_compact_lcluster(struct z_erofs_maprecorder *m, - unsigned long lcn, bool lookahead) -{ - struct inode *const inode = m->inode; - struct erofs_inode *const vi = EROFS_I(inode); - const erofs_off_t ebase = sizeof(struct z_erofs_map_header) + - ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, 8); - unsigned int totalidx = erofs_iblks(inode); - unsigned int compacted_4b_initial, compacted_2b; - unsigned int amortizedshift; - erofs_off_t pos; - - if (lcn >= totalidx || vi->z_logical_clusterbits > 14) - return -EINVAL; - - m->lcn = lcn; - /* used to align to 32-byte (compacted_2b) alignment */ - compacted_4b_initial = (32 - ebase % 32) / 4; - if (compacted_4b_initial == 32 / 4) - compacted_4b_initial = 0; - - if ((vi->z_advise & Z_EROFS_ADVISE_COMPACTED_2B) && - compacted_4b_initial < totalidx) - compacted_2b = rounddown(totalidx - compacted_4b_initial, 16); - else - compacted_2b = 0; - - pos = ebase; - if (lcn < compacted_4b_initial) { - amortizedshift = 2; - goto out; - } - pos += compacted_4b_initial * 4; - lcn -= compacted_4b_initial; - - if (lcn < compacted_2b) { - amortizedshift = 1; - goto out; - } - pos += compacted_2b * 2; - lcn -= compacted_2b; - amortizedshift = 2; -out: - pos += lcn * (1 << amortizedshift); - return unpack_compacted_index(m, amortizedshift, pos, lookahead); -} - static int z_erofs_load_lcluster_from_disk(struct z_erofs_maprecorder *m, unsigned int lcn, bool lookahead) { -- 2.50.1 From 5514d8478b8ef3f0ba1b77beaa65f05c12825143 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Tue, 14 Jan 2025 11:44:27 +0800 Subject: [PATCH 15/16] erofs: get rid of `z_erofs_next_pcluster_t` It was originally intended for tagged pointer reservation. Now all encoded data can be represented uniformally with `struct z_erofs_pcluster` as described in commit bf1aa03980f4 ("erofs: sunset `struct erofs_workgroup`"), let's drop it too. Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20250114034429.431408-2-hsiangkao@linux.alibaba.com --- fs/erofs/zdata.c | 80 ++++++++++++++++++------------------------------ 1 file changed, 29 insertions(+), 51 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index f416b73e0ca3..057d68eaa5d2 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -12,12 +12,6 @@ #define Z_EROFS_PCLUSTER_MAX_PAGES (Z_EROFS_PCLUSTER_MAX_SIZE / PAGE_SIZE) #define Z_EROFS_INLINE_BVECS 2 -/* - * let's leave a type here in case of introducing - * another tagged pointer later. - */ -typedef void *z_erofs_next_pcluster_t; - struct z_erofs_bvec { struct page *page; int offset; @@ -48,7 +42,7 @@ struct z_erofs_pcluster { struct lockref lockref; /* A: point to next chained pcluster or TAILs */ - z_erofs_next_pcluster_t next; + struct z_erofs_pcluster *next; /* I: start block address of this pcluster */ erofs_off_t index; @@ -94,12 +88,11 @@ struct z_erofs_pcluster { /* the end of a chain of pclusters */ #define Z_EROFS_PCLUSTER_TAIL ((void *) 0x700 + POISON_POINTER_DELTA) -#define Z_EROFS_PCLUSTER_NIL (NULL) struct z_erofs_decompressqueue { struct super_block *sb; + struct z_erofs_pcluster *head; atomic_t pending_bios; - z_erofs_next_pcluster_t head; union { struct completion done; @@ -493,8 +486,7 @@ struct z_erofs_decompress_frontend { struct page *pagepool; struct page *candidate_bvpage; - struct z_erofs_pcluster *pcl; - z_erofs_next_pcluster_t owned_head; + struct z_erofs_pcluster *pcl, *head; enum z_erofs_pclustermode mode; erofs_off_t headoffset; @@ -504,7 +496,7 @@ struct z_erofs_decompress_frontend { }; #define DECOMPRESS_FRONTEND_INIT(__i) { \ - .inode = __i, .owned_head = Z_EROFS_PCLUSTER_TAIL, \ + .inode = __i, .head = Z_EROFS_PCLUSTER_TAIL, \ .mode = Z_EROFS_PCLUSTER_FOLLOWED } static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe) @@ -752,9 +744,7 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) pcl->algorithmformat = map->m_algorithmformat; pcl->length = 0; pcl->partial = true; - - /* new pclusters should be claimed as type 1, primary and followed */ - pcl->next = fe->owned_head; + pcl->next = fe->head; pcl->pageofs_out = map->m_la & ~PAGE_MASK; fe->mode = Z_EROFS_PCLUSTER_FOLLOWED; @@ -790,8 +780,7 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) goto err_out; } } - fe->owned_head = &pcl->next; - fe->pcl = pcl; + fe->head = fe->pcl = pcl; return 0; err_out: @@ -810,7 +799,7 @@ static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe) DBG_BUGON(fe->pcl); /* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous pcluster */ - DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL); + DBG_BUGON(!fe->head); if (!(map->m_flags & EROFS_MAP_META)) { while (1) { @@ -838,10 +827,9 @@ static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe) if (ret == -EEXIST) { mutex_lock(&fe->pcl->lock); /* check if this pcluster hasn't been linked into any chain. */ - if (cmpxchg(&fe->pcl->next, Z_EROFS_PCLUSTER_NIL, - fe->owned_head) == Z_EROFS_PCLUSTER_NIL) { + if (!cmpxchg(&fe->pcl->next, NULL, fe->head)) { /* .. so it can be attached to our submission chain */ - fe->owned_head = &fe->pcl->next; + fe->head = fe->pcl; fe->mode = Z_EROFS_PCLUSTER_FOLLOWED; } else { /* otherwise, it belongs to an inflight chain */ fe->mode = Z_EROFS_PCLUSTER_INFLIGHT; @@ -1393,7 +1381,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, pcl->vcnt = 0; /* pcluster lock MUST be taken before the following line */ - WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_NIL); + WRITE_ONCE(pcl->next, NULL); mutex_unlock(&pcl->lock); if (z_erofs_is_inline_pcluster(pcl)) @@ -1411,16 +1399,14 @@ static int z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io, .pagepool = pagepool, .decompressed_secondary_bvecs = LIST_HEAD_INIT(be.decompressed_secondary_bvecs), + .pcl = io->head, }; - z_erofs_next_pcluster_t owned = io->head; + struct z_erofs_pcluster *next; int err = io->eio ? -EIO : 0; - while (owned != Z_EROFS_PCLUSTER_TAIL) { - DBG_BUGON(owned == Z_EROFS_PCLUSTER_NIL); - - be.pcl = container_of(owned, struct z_erofs_pcluster, next); - owned = READ_ONCE(be.pcl->next); - + for (; be.pcl != Z_EROFS_PCLUSTER_TAIL; be.pcl = next) { + DBG_BUGON(!be.pcl); + next = READ_ONCE(be.pcl->next); err = z_erofs_decompress_pcluster(&be, err) ?: err; } return err; @@ -1630,18 +1616,13 @@ enum { NR_JOBQUEUES, }; -static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl, - z_erofs_next_pcluster_t qtail[], - z_erofs_next_pcluster_t owned_head) +static void z_erofs_move_to_bypass_queue(struct z_erofs_pcluster *pcl, + struct z_erofs_pcluster *next, + struct z_erofs_pcluster **qtail[]) { - z_erofs_next_pcluster_t *const submit_qtail = qtail[JQ_SUBMIT]; - z_erofs_next_pcluster_t *const bypass_qtail = qtail[JQ_BYPASS]; - WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_TAIL); - - WRITE_ONCE(*submit_qtail, owned_head); - WRITE_ONCE(*bypass_qtail, &pcl->next); - + WRITE_ONCE(*qtail[JQ_SUBMIT], next); + WRITE_ONCE(*qtail[JQ_BYPASS], pcl); qtail[JQ_BYPASS] = &pcl->next; } @@ -1676,9 +1657,9 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, { struct super_block *sb = f->inode->i_sb; struct address_space *mc = MNGD_MAPPING(EROFS_SB(sb)); - z_erofs_next_pcluster_t qtail[NR_JOBQUEUES]; + struct z_erofs_pcluster **qtail[NR_JOBQUEUES]; struct z_erofs_decompressqueue *q[NR_JOBQUEUES]; - z_erofs_next_pcluster_t owned_head = f->owned_head; + struct z_erofs_pcluster *pcl, *next; /* bio is NULL initially, so no need to initialize last_{index,bdev} */ erofs_off_t last_pa; unsigned int nr_bios = 0; @@ -1694,22 +1675,19 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, qtail[JQ_SUBMIT] = &q[JQ_SUBMIT]->head; /* by default, all need io submission */ - q[JQ_SUBMIT]->head = owned_head; + q[JQ_SUBMIT]->head = next = f->head; do { struct erofs_map_dev mdev; - struct z_erofs_pcluster *pcl; erofs_off_t cur, end; struct bio_vec bvec; unsigned int i = 0; bool bypass = true; - DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_NIL); - pcl = container_of(owned_head, struct z_erofs_pcluster, next); - owned_head = READ_ONCE(pcl->next); - + pcl = next; + next = READ_ONCE(pcl->next); if (z_erofs_is_inline_pcluster(pcl)) { - move_to_bypass_jobqueue(pcl, qtail, owned_head); + z_erofs_move_to_bypass_queue(pcl, next, qtail); continue; } @@ -1781,8 +1759,8 @@ drain_io: if (!bypass) qtail[JQ_SUBMIT] = &pcl->next; else - move_to_bypass_jobqueue(pcl, qtail, owned_head); - } while (owned_head != Z_EROFS_PCLUSTER_TAIL); + z_erofs_move_to_bypass_queue(pcl, next, qtail); + } while (next != Z_EROFS_PCLUSTER_TAIL); if (bio) { if (erofs_is_fileio_mode(EROFS_SB(sb))) @@ -1814,7 +1792,7 @@ static int z_erofs_runqueue(struct z_erofs_decompress_frontend *f, bool force_fg = z_erofs_is_sync_decompress(sbi, ra_folios); int err; - if (f->owned_head == Z_EROFS_PCLUSTER_TAIL) + if (f->head == Z_EROFS_PCLUSTER_TAIL) return 0; z_erofs_submit_queue(f, io, &force_fg, !!ra_folios); -- 2.50.1 From 6f435e94a19ad25b372bc61443afd0839b8a521c Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Tue, 14 Jan 2025 11:44:28 +0800 Subject: [PATCH 16/16] erofs: tidy up zdata.c All small code style adjustments, no logic changes: - z_erofs_decompress_frontend => z_erofs_frontend; - z_erofs_decompress_backend => z_erofs_backend; - Use Z_EROFS_DEFINE_FRONTEND() to replace DECOMPRESS_FRONTEND_INIT(); - `nr_folios` should be `nrpages` in z_erofs_readahead(); - Refine in-line comments. Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20250114034429.431408-3-hsiangkao@linux.alibaba.com --- fs/erofs/zdata.c | 111 ++++++++++++++++++----------------------------- 1 file changed, 43 insertions(+), 68 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 057d68eaa5d2..72c1d5d27c3a 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -455,31 +455,25 @@ err_decompressor: } enum z_erofs_pclustermode { + /* It has previously been linked into another processing chain */ Z_EROFS_PCLUSTER_INFLIGHT, /* - * a weak form of Z_EROFS_PCLUSTER_FOLLOWED, the difference is that it - * could be dispatched into bypass queue later due to uptodated managed - * pages. All related online pages cannot be reused for inplace I/O (or - * bvpage) since it can be directly decoded without I/O submission. + * A weaker form of Z_EROFS_PCLUSTER_FOLLOWED; the difference is that it + * may be dispatched to the bypass queue later due to uptodated managed + * folios. All file-backed folios related to this pcluster cannot be + * reused for in-place I/O (or bvpage) since the pcluster may be decoded + * in a separate queue (and thus out of order). */ Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE, /* - * The pcluster was just linked to a decompression chain by us. It can - * also be linked with the remaining pclusters, which means if the - * processing page is the tail page of a pcluster, this pcluster can - * safely use the whole page (since the previous pcluster is within the - * same chain) for in-place I/O, as illustrated below: - * ___________________________________________________ - * | tail (partial) page | head (partial) page | - * | (of the current pcl) | (of the previous pcl) | - * |___PCLUSTER_FOLLOWED___|_____PCLUSTER_FOLLOWED_____| - * - * [ (*) the page above can be used as inplace I/O. ] + * The pcluster has just been linked to our processing chain. + * File-backed folios (except for the head page) related to it can be + * used for in-place I/O (or bvpage). */ Z_EROFS_PCLUSTER_FOLLOWED, }; -struct z_erofs_decompress_frontend { +struct z_erofs_frontend { struct inode *const inode; struct erofs_map_blocks map; struct z_erofs_bvec_iter biter; @@ -495,11 +489,11 @@ struct z_erofs_decompress_frontend { unsigned int icur; }; -#define DECOMPRESS_FRONTEND_INIT(__i) { \ - .inode = __i, .head = Z_EROFS_PCLUSTER_TAIL, \ - .mode = Z_EROFS_PCLUSTER_FOLLOWED } +#define Z_EROFS_DEFINE_FRONTEND(fe, i, ho) struct z_erofs_frontend fe = { \ + .inode = i, .head = Z_EROFS_PCLUSTER_TAIL, \ + .mode = Z_EROFS_PCLUSTER_FOLLOWED, .headoffset = ho } -static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe) +static bool z_erofs_should_alloc_cache(struct z_erofs_frontend *fe) { unsigned int cachestrategy = EROFS_I_SB(fe->inode)->opt.cache_strategy; @@ -516,7 +510,7 @@ static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe) return false; } -static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe) +static void z_erofs_bind_cache(struct z_erofs_frontend *fe) { struct address_space *mc = MNGD_MAPPING(EROFS_I_SB(fe->inode)); struct z_erofs_pcluster *pcl = fe->pcl; @@ -673,7 +667,7 @@ int erofs_init_managed_cache(struct super_block *sb) } /* callers must be with pcluster lock held */ -static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe, +static int z_erofs_attach_page(struct z_erofs_frontend *fe, struct z_erofs_bvec *bvec, bool exclusive) { struct z_erofs_pcluster *pcl = fe->pcl; @@ -719,7 +713,7 @@ static bool z_erofs_get_pcluster(struct z_erofs_pcluster *pcl) return true; } -static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) +static int z_erofs_register_pcluster(struct z_erofs_frontend *fe) { struct erofs_map_blocks *map = &fe->map; struct super_block *sb = fe->inode->i_sb; @@ -789,7 +783,7 @@ err_out: return err; } -static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe) +static int z_erofs_pcluster_begin(struct z_erofs_frontend *fe) { struct erofs_map_blocks *map = &fe->map; struct super_block *sb = fe->inode->i_sb; @@ -862,14 +856,9 @@ static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe) return 0; } -/* - * keep in mind that no referenced pclusters will be freed - * only after a RCU grace period. - */ static void z_erofs_rcu_callback(struct rcu_head *head) { - z_erofs_free_pcluster(container_of(head, - struct z_erofs_pcluster, rcu)); + z_erofs_free_pcluster(container_of(head, struct z_erofs_pcluster, rcu)); } static bool __erofs_try_to_release_pcluster(struct erofs_sb_info *sbi, @@ -911,8 +900,7 @@ static bool erofs_try_to_release_pcluster(struct erofs_sb_info *sbi, return free; } -unsigned long z_erofs_shrink_scan(struct erofs_sb_info *sbi, - unsigned long nr_shrink) +unsigned long z_erofs_shrink_scan(struct erofs_sb_info *sbi, unsigned long nr) { struct z_erofs_pcluster *pcl; unsigned long index, freed = 0; @@ -925,7 +913,7 @@ unsigned long z_erofs_shrink_scan(struct erofs_sb_info *sbi, xa_unlock(&sbi->managed_pslots); ++freed; - if (!--nr_shrink) + if (!--nr) return freed; xa_lock(&sbi->managed_pslots); } @@ -954,7 +942,7 @@ static void z_erofs_put_pcluster(struct erofs_sb_info *sbi, call_rcu(&pcl->rcu, z_erofs_rcu_callback); } -static void z_erofs_pcluster_end(struct z_erofs_decompress_frontend *fe) +static void z_erofs_pcluster_end(struct z_erofs_frontend *fe) { struct z_erofs_pcluster *pcl = fe->pcl; @@ -967,13 +955,9 @@ static void z_erofs_pcluster_end(struct z_erofs_decompress_frontend *fe) if (fe->candidate_bvpage) fe->candidate_bvpage = NULL; - /* - * if all pending pages are added, don't hold its reference - * any longer if the pcluster isn't hosted by ourselves. - */ + /* Drop refcount if it doesn't belong to our processing chain */ if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE) z_erofs_put_pcluster(EROFS_I_SB(fe->inode), pcl, false); - fe->pcl = NULL; } @@ -1002,7 +986,7 @@ static int z_erofs_read_fragment(struct super_block *sb, struct folio *folio, return 0; } -static int z_erofs_scan_folio(struct z_erofs_decompress_frontend *f, +static int z_erofs_scan_folio(struct z_erofs_frontend *f, struct folio *folio, bool ra) { struct inode *const inode = f->inode; @@ -1117,7 +1101,7 @@ static bool z_erofs_page_is_invalidated(struct page *page) return !page_folio(page)->mapping && !z_erofs_is_shortlived_page(page); } -struct z_erofs_decompress_backend { +struct z_erofs_backend { struct page *onstack_pages[Z_EROFS_ONSTACK_PAGES]; struct super_block *sb; struct z_erofs_pcluster *pcl; @@ -1137,7 +1121,7 @@ struct z_erofs_bvec_item { struct list_head list; }; -static void z_erofs_do_decompressed_bvec(struct z_erofs_decompress_backend *be, +static void z_erofs_do_decompressed_bvec(struct z_erofs_backend *be, struct z_erofs_bvec *bvec) { struct z_erofs_bvec_item *item; @@ -1160,8 +1144,7 @@ static void z_erofs_do_decompressed_bvec(struct z_erofs_decompress_backend *be, list_add(&item->list, &be->decompressed_secondary_bvecs); } -static void z_erofs_fill_other_copies(struct z_erofs_decompress_backend *be, - int err) +static void z_erofs_fill_other_copies(struct z_erofs_backend *be, int err) { unsigned int off0 = be->pcl->pageofs_out; struct list_head *p, *n; @@ -1202,7 +1185,7 @@ static void z_erofs_fill_other_copies(struct z_erofs_decompress_backend *be, } } -static void z_erofs_parse_out_bvecs(struct z_erofs_decompress_backend *be) +static void z_erofs_parse_out_bvecs(struct z_erofs_backend *be) { struct z_erofs_pcluster *pcl = be->pcl; struct z_erofs_bvec_iter biter; @@ -1227,8 +1210,7 @@ static void z_erofs_parse_out_bvecs(struct z_erofs_decompress_backend *be) z_erofs_put_shortlivedpage(be->pagepool, old_bvpage); } -static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend *be, - bool *overlapped) +static int z_erofs_parse_in_bvecs(struct z_erofs_backend *be, bool *overlapped) { struct z_erofs_pcluster *pcl = be->pcl; unsigned int pclusterpages = z_erofs_pclusterpages(pcl); @@ -1263,8 +1245,7 @@ static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend *be, return err; } -static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, - int err) +static int z_erofs_decompress_pcluster(struct z_erofs_backend *be, int err) { struct erofs_sb_info *const sbi = EROFS_SB(be->sb); struct z_erofs_pcluster *pcl = be->pcl; @@ -1394,7 +1375,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, static int z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io, struct page **pagepool) { - struct z_erofs_decompress_backend be = { + struct z_erofs_backend be = { .sb = io->sb, .pagepool = pagepool, .decompressed_secondary_bvecs = @@ -1472,7 +1453,7 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, } static void z_erofs_fill_bio_vec(struct bio_vec *bvec, - struct z_erofs_decompress_frontend *f, + struct z_erofs_frontend *f, struct z_erofs_pcluster *pcl, unsigned int nr, struct address_space *mc) @@ -1651,7 +1632,7 @@ static void z_erofs_endio(struct bio *bio) bio_put(bio); } -static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, +static void z_erofs_submit_queue(struct z_erofs_frontend *f, struct z_erofs_decompressqueue *fgq, bool *force_fg, bool readahead) { @@ -1784,17 +1765,16 @@ drain_io: z_erofs_decompress_kickoff(q[JQ_SUBMIT], nr_bios); } -static int z_erofs_runqueue(struct z_erofs_decompress_frontend *f, - unsigned int ra_folios) +static int z_erofs_runqueue(struct z_erofs_frontend *f, unsigned int rapages) { struct z_erofs_decompressqueue io[NR_JOBQUEUES]; struct erofs_sb_info *sbi = EROFS_I_SB(f->inode); - bool force_fg = z_erofs_is_sync_decompress(sbi, ra_folios); + bool force_fg = z_erofs_is_sync_decompress(sbi, rapages); int err; if (f->head == Z_EROFS_PCLUSTER_TAIL) return 0; - z_erofs_submit_queue(f, io, &force_fg, !!ra_folios); + z_erofs_submit_queue(f, io, &force_fg, !!rapages); /* handle bypass queue (no i/o pclusters) immediately */ err = z_erofs_decompress_queue(&io[JQ_BYPASS], &f->pagepool); @@ -1812,7 +1792,7 @@ static int z_erofs_runqueue(struct z_erofs_decompress_frontend *f, * Since partial uptodate is still unimplemented for now, we have to use * approximate readmore strategies as a start. */ -static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f, +static void z_erofs_pcluster_readmore(struct z_erofs_frontend *f, struct readahead_control *rac, bool backmost) { struct inode *inode = f->inode; @@ -1867,12 +1847,10 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f, static int z_erofs_read_folio(struct file *file, struct folio *folio) { struct inode *const inode = folio->mapping->host; - struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); + Z_EROFS_DEFINE_FRONTEND(f, inode, folio_pos(folio)); int err; trace_erofs_read_folio(folio, false); - f.headoffset = (erofs_off_t)folio->index << PAGE_SHIFT; - z_erofs_pcluster_readmore(&f, NULL, true); err = z_erofs_scan_folio(&f, folio, false); z_erofs_pcluster_readmore(&f, NULL, false); @@ -1892,17 +1870,14 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio) static void z_erofs_readahead(struct readahead_control *rac) { struct inode *const inode = rac->mapping->host; - struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); + Z_EROFS_DEFINE_FRONTEND(f, inode, readahead_pos(rac)); struct folio *head = NULL, *folio; - unsigned int nr_folios; + unsigned int nrpages = readahead_count(rac); int err; - f.headoffset = readahead_pos(rac); - z_erofs_pcluster_readmore(&f, rac, true); - nr_folios = readahead_count(rac); - trace_erofs_readpages(inode, readahead_index(rac), nr_folios, false); - + nrpages = readahead_count(rac); + trace_erofs_readpages(inode, readahead_index(rac), nrpages, false); while ((folio = readahead_folio(rac))) { folio->private = head; head = folio; @@ -1921,7 +1896,7 @@ static void z_erofs_readahead(struct readahead_control *rac) z_erofs_pcluster_readmore(&f, rac, false); z_erofs_pcluster_end(&f); - (void)z_erofs_runqueue(&f, nr_folios); + (void)z_erofs_runqueue(&f, nrpages); erofs_put_metabuf(&f.map.buf); erofs_release_pages(&f.pagepool); } -- 2.50.1