void *orig_call)
 {
        int ret, i, nr_args = m->nr_args;
-       int stack_size = nr_args * 8;
+       int regs_off, ip_off, stack_size = nr_args * 8;
        struct bpf_tramp_progs *fentry = &tprogs[BPF_TRAMP_FENTRY];
        struct bpf_tramp_progs *fexit = &tprogs[BPF_TRAMP_FEXIT];
        struct bpf_tramp_progs *fmod_ret = &tprogs[BPF_TRAMP_MODIFY_RETURN];
        if (!is_valid_bpf_tramp_flags(flags))
                return -EINVAL;
 
+       /* Generated trampoline stack layout:
+        *
+        * RBP + 8         [ return address  ]
+        * RBP + 0         [ RBP             ]
+        *
+        * RBP - 8         [ return value    ]  BPF_TRAMP_F_CALL_ORIG or
+        *                                      BPF_TRAMP_F_RET_FENTRY_RET flags
+        *
+        *                 [ reg_argN        ]  always
+        *                 [ ...             ]
+        * RBP - regs_off  [ reg_arg1        ]  program's ctx pointer
+        *
+        * RBP - ip_off    [ traced function ]  BPF_TRAMP_F_IP_ARG flag
+        */
+
        /* room for return value of orig_call or fentry prog */
        save_ret = flags & (BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_RET_FENTRY_RET);
        if (save_ret)
                stack_size += 8;
 
+       regs_off = stack_size;
+
        if (flags & BPF_TRAMP_F_IP_ARG)
                stack_size += 8; /* room for IP address argument */
 
+       ip_off = stack_size;
+
        if (flags & BPF_TRAMP_F_SKIP_FRAME)
                /* skip patched call instruction and point orig_call to actual
                 * body of the kernel function.
                /* Store IP address of the traced function:
                 * mov rax, QWORD PTR [rbp + 8]
                 * sub rax, X86_PATCH_SIZE
-                * mov QWORD PTR [rbp - stack_size], rax
+                * mov QWORD PTR [rbp - ip_off], rax
                 */
                emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, 8);
                EMIT4(0x48, 0x83, 0xe8, X86_PATCH_SIZE);
-               emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -stack_size);
-
-               /* Continue with stack_size for regs storage, stack will
-                * be correctly restored with 'leave' instruction.
-                */
-               stack_size -= 8;
+               emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -ip_off);
        }
 
-       save_regs(m, &prog, nr_args, stack_size);
+       save_regs(m, &prog, nr_args, regs_off);
 
        if (flags & BPF_TRAMP_F_CALL_ORIG) {
                /* arg1: mov rdi, im */
        }
 
        if (fentry->nr_progs)
-               if (invoke_bpf(m, &prog, fentry, stack_size,
+               if (invoke_bpf(m, &prog, fentry, regs_off,
                               flags & BPF_TRAMP_F_RET_FENTRY_RET))
                        return -EINVAL;
 
                if (!branches)
                        return -ENOMEM;
 
-               if (invoke_bpf_mod_ret(m, &prog, fmod_ret, stack_size,
+               if (invoke_bpf_mod_ret(m, &prog, fmod_ret, regs_off,
                                       branches)) {
                        ret = -EINVAL;
                        goto cleanup;
        }
 
        if (flags & BPF_TRAMP_F_CALL_ORIG) {
-               restore_regs(m, &prog, nr_args, stack_size);
+               restore_regs(m, &prog, nr_args, regs_off);
 
                /* call original function */
                if (emit_call(&prog, orig_call, prog)) {
        }
 
        if (fexit->nr_progs)
-               if (invoke_bpf(m, &prog, fexit, stack_size, false)) {
+               if (invoke_bpf(m, &prog, fexit, regs_off, false)) {
                        ret = -EINVAL;
                        goto cleanup;
                }
 
        if (flags & BPF_TRAMP_F_RESTORE_REGS)
-               restore_regs(m, &prog, nr_args, stack_size);
+               restore_regs(m, &prog, nr_args, regs_off);
 
        /* This needs to be done regardless. If there were fmod_ret programs,
         * the return value is only updated on the stack and still needs to be