NFP_CSR_ACT_LM_ADDR0);
        }
 
-       /* The following steps are performed:
+       /* Two cases for jumping to the callee:
+        *
+        * - If callee uses and needs to save R6~R9 then:
         *     1. Put the start offset of the callee into imm_b(). This will
         *        require a fixup step, as we do not necessarily know this
         *        address yet.
         *        register ret_reg().
         *     3. (After defer slots are consumed) Jump to the subroutine that
         *        pushes the registers to the stack.
-        * The subroutine acts as a trampoline, and returns to the address in
-        * imm_b(), i.e. jumps to the callee.
+        *   The subroutine acts as a trampoline, and returns to the address in
+        *   imm_b(), i.e. jumps to the callee.
+        *
+        * - If callee does not need to save R6~R9 then just load the return
+        *   address to the caller into ret_reg(), and jump to the callee
+        *   directly.
         *
         * Using ret_reg() to pass the return address to the callee is set here
         * as a convention. The callee can then push this address onto its
         *   execution of the callee, we will not have to push the return
         *   address to the stack for leaf functions.
         */
-       ret_tgt = nfp_prog_current_offset(nfp_prog) + 3;
-       emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 2,
-                    RELO_BR_GO_CALL_PUSH_REGS);
-       offset_br = nfp_prog_current_offset(nfp_prog);
-       wrp_immed_relo(nfp_prog, imm_b(nfp_prog), 0, RELO_IMMED_REL);
+       if (!meta->jmp_dst) {
+               pr_err("BUG: BPF-to-BPF call has no destination recorded\n");
+               return -ELOOP;
+       }
+       if (nfp_prog->subprog[meta->jmp_dst->subprog_idx].needs_reg_push) {
+               ret_tgt = nfp_prog_current_offset(nfp_prog) + 3;
+               emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 2,
+                            RELO_BR_GO_CALL_PUSH_REGS);
+               offset_br = nfp_prog_current_offset(nfp_prog);
+               wrp_immed_relo(nfp_prog, imm_b(nfp_prog), 0, RELO_IMMED_REL);
+       } else {
+               ret_tgt = nfp_prog_current_offset(nfp_prog) + 2;
+               emit_br(nfp_prog, BR_UNC, meta->n + 1 + meta->insn.imm, 1);
+               offset_br = nfp_prog_current_offset(nfp_prog);
+       }
        wrp_immed_relo(nfp_prog, ret_reg(nfp_prog), ret_tgt, RELO_IMMED_REL);
 
        if (!nfp_prog_confirm_current_offset(nfp_prog, ret_tgt))
 static int
 nfp_subprog_epilogue(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
-       /* Pop R6~R9 to the stack via related subroutine.
-        * Pop return address for BPF-to-BPF call from the stack and load it
-        * into ret_reg() before we jump. This means that the subroutine does
-        * not come back here, we make it jump back to the subprogram caller
-        * directly!
-        */
-       emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 1,
-                    RELO_BR_GO_CALL_POP_REGS);
-       wrp_mov(nfp_prog, ret_reg(nfp_prog), reg_lm(0, 0));
+       if (nfp_prog->subprog[meta->subprog_idx].needs_reg_push) {
+               /* Pop R6~R9 from the stack via the related subroutine.
+                * The wrp_mov() emitted right after the branch loads the
+                * return address to the caller into ret_reg() (NOTE: it
+                * presumably sits in the branch's single defer slot, given
+                * the count of 1 passed to emit_br_relo() - confirm).
+                * This means that the subroutine does not come back here, we
+                * make it jump back to the subprogram caller directly!
+                */
+               emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 1,
+                            RELO_BR_GO_CALL_POP_REGS);
+               /* Pop return address from the stack. */
+               wrp_mov(nfp_prog, ret_reg(nfp_prog), reg_lm(0, 0));
+       } else {
+               /* Pop return address from the stack. */
+               wrp_mov(nfp_prog, ret_reg(nfp_prog), reg_lm(0, 0));
+               /* Jump back to the caller directly: this subprogram is not
+                * flagged needs_reg_push, so there are no callee-saved
+                * registers to restore first.
+                */
+               emit_rtn(nfp_prog, ret_reg(nfp_prog), 0);
+       }
 
        return 0;
 }
                        return -ELOOP;
                }
 
-               if (is_mbpf_pseudo_call(meta)) {
+               if (is_mbpf_pseudo_call(meta) &&
+                   nfp_prog->subprog[jmp_dst->subprog_idx].needs_reg_push) {
                        err = nfp_fixup_immed_relo(nfp_prog, meta,
                                                   jmp_dst, br_idx);
                        if (err)
        emit_ld_field(nfp_prog, reg_a(0), 0xc, reg_b(2), SHF_SC_L_SHF, 16);
 }
 
+/* Tell whether any subprogram other than the one at index 0 has its
+ * needs_reg_push flag set. Index 0 is deliberately skipped by the loop
+ * (NOTE: presumably because it is the main program, which is never the
+ * target of a BPF-to-BPF call - confirm).
+ *
+ * Returns true as soon as one such subprogram is found, false otherwise.
+ */
+static bool nfp_prog_needs_callee_reg_save(struct nfp_prog *nfp_prog)
+{
+       unsigned int idx;
+
+       for (idx = 1; idx < nfp_prog->subprog_cnt; idx++)
+               if (nfp_prog->subprog[idx].needs_reg_push)
+                       return true;
+
+       return false;
+}
+
 static void nfp_push_callee_registers(struct nfp_prog *nfp_prog)
 {
        u8 reg;
                WARN_ON(1);
        }
 
-       if (nfp_prog->subprog_cnt == 1)
+       if (!nfp_prog_needs_callee_reg_save(nfp_prog))
                return;
 
        nfp_push_callee_registers(nfp_prog);
                                      nfp_prog->tgt_abort + bv->start_off);
                        break;
                case RELO_BR_GO_CALL_PUSH_REGS:
+                       if (!nfp_prog->tgt_call_push_regs) {
+                               pr_err("BUG: failed to detect subprogram registers needs\n");
+                               err = -EINVAL;
+                               goto err_free_prog;
+                       }
                        off = nfp_prog->tgt_call_push_regs + bv->start_off;
                        br_set_offset(&prog[i], off);
                        break;
                case RELO_BR_GO_CALL_POP_REGS:
+                       if (!nfp_prog->tgt_call_pop_regs) {
+                               pr_err("BUG: failed to detect subprogram registers needs\n");
+                               err = -EINVAL;
+                               goto err_free_prog;
+                       }
                        off = nfp_prog->tgt_call_pop_regs + bv->start_off;
                        br_set_offset(&prog[i], off);
                        break;
 
 }
 
 static int
-nfp_assign_subprog_idx(struct bpf_verifier_env *env, struct nfp_prog *nfp_prog)
+nfp_assign_subprog_idx_and_regs(struct bpf_verifier_env *env,
+                               struct nfp_prog *nfp_prog)
 {
        struct nfp_insn_meta *meta;
        int index = 0;
                if (nfp_is_subprog_start(meta))
                        index++;
                meta->subprog_idx = index;
+
+               if (meta->insn.dst_reg >= BPF_REG_6 &&
+                   meta->insn.dst_reg <= BPF_REG_9)
+                       nfp_prog->subprog[index].needs_reg_push = 1;
        }
 
        if (index + 1 != nfp_prog->subprog_cnt) {
        if (!nfp_prog->subprog)
                return -ENOMEM;
 
-       nfp_assign_subprog_idx(env, nfp_prog);
+       nfp_assign_subprog_idx_and_regs(env, nfp_prog);
 
        info = env->subprog_info;
        for (i = 0; i < nfp_prog->subprog_cnt; i++) {
 
                /* Account for size of return address. */
                nfp_prog->subprog[i].stack_depth += REG_WIDTH;
-               /* Account for size of saved registers. */
-               nfp_prog->subprog[i].stack_depth += BPF_REG_SIZE * 4;
+               /* Account for size of saved registers, if necessary. */
+               if (nfp_prog->subprog[i].needs_reg_push)
+                       nfp_prog->subprog[i].stack_depth += BPF_REG_SIZE * 4;
        }
 
        nn = netdev_priv(env->prog->aux->offload->netdev);