#ifdef CONFIG_FSL_EMB_PERF_EVENT
 #include <asm/perf_event_fsl_emb.h>
 #endif
+
+#ifdef CONFIG_PERF_EVENTS
+#include <asm/ptrace.h>
+#include <asm/reg.h>
+
+/*
+ * Fill in a minimal pt_regs snapshot for the current location:
+ * NIP from the caller-supplied ip, r1 (stack pointer) from the live
+ * stack, and the current MSR read via mfmsr.
+ */
+#define perf_arch_fetch_caller_regs(regs, __ip)                        \
+       do {                                                    \
+               (regs)->nip = (__ip);                           \
+               (regs)->gpr[1] = *(unsigned long *)__get_SP();  \
+               asm volatile("mfmsr %0" : "=r" ((regs)->msr));  \
+       } while (0)
+#endif
 
 _GLOBAL(__restore_cpu_power7)
        /* place holder */
        blr
-
-/*
- * Get a minimal set of registers for our caller's nth caller.
- * r3 = regs pointer, r5 = n.
- *
- * We only get R1 (stack pointer), NIP (next instruction pointer)
- * and LR (link register).  These are all we can get in the
- * general case without doing complicated stack unwinding, but
- * fortunately they are enough to do a stack backtrace, which
- * is all we need them for.
- */
-_GLOBAL(perf_arch_fetch_caller_regs)
-       mr      r6,r1
-       cmpwi   r5,0
-       mflr    r4
-       ble     2f
-       mtctr   r5
-1:     PPC_LL  r6,0(r6)
-       bdnz    1b
-       PPC_LL  r4,PPC_LR_STKOFF(r6)
-2:     PPC_LL  r7,0(r6)
-       PPC_LL  r7,PPC_LR_STKOFF(r7)
-       PPC_STL r6,GPR1-STACK_FRAME_OVERHEAD(r3)
-       PPC_STL r4,_NIP-STACK_FRAME_OVERHEAD(r3)
-       PPC_STL r7,_LINK-STACK_FRAME_OVERHEAD(r3)
-       blr
 
 #define        PERF_EVENT_INDEX_OFFSET 0
 
 #ifdef CONFIG_PERF_EVENTS
+#include <asm/ptrace.h>
+
 extern void init_hw_perf_events(void);
+
+extern void
+__perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip);
+
+/*
+ * Thin wrapper; skip=1 drops the wrapper's own frame. No trailing
+ * semicolon: the call site supplies it, and a semicolon here would
+ * produce an empty statement that breaks un-braced if/else bodies.
+ */
+#define perf_arch_fetch_caller_regs(regs, ip)  \
+       __perf_arch_fetch_caller_regs(regs, ip, 1)
 #else
 static inline void init_hw_perf_events(void)   { }
 #endif
 
        .size           stack_trace_flush,.-stack_trace_flush
 
 #ifdef CONFIG_PERF_EVENTS
-       .globl          perf_arch_fetch_caller_regs
-       .type           perf_arch_fetch_caller_regs,#function
-perf_arch_fetch_caller_regs:
+       .globl          __perf_arch_fetch_caller_regs
+       .type           __perf_arch_fetch_caller_regs,#function
+__perf_arch_fetch_caller_regs:
        /* We always read the %pstate into %o5 since we will use
         * that to construct a fake %tstate to store into the regs.
         */
 
 extern unsigned long perf_misc_flags(struct pt_regs *regs);
 #define perf_misc_flags(regs)  perf_misc_flags(regs)
 
+#include <asm/stacktrace.h>
+
+/*
+ * We abuse bit 3 from flags to pass exact information, see perf_misc_flags
+ * and the comment with PERF_EFLAGS_EXACT.
+ */
+#define perf_arch_fetch_caller_regs(regs, __ip)       do {     \
+       (regs)->ip = (__ip);                                    \
+       (regs)->bp = caller_frame_pointer();                    \
+       (regs)->cs = __KERNEL_CS;                               \
+       /* clear flags: exactness is signalled via PERF_EFLAGS_EXACT */ \
+       (regs)->flags = 0;                                      \
+} while (0)
+
 #else
 static inline void init_hw_perf_events(void)           { }
 static inline void perf_events_lapic_init(void)        { }
 
     u32 return_address;
 };
 
-static inline unsigned long rewind_frame_pointer(int n)
+static inline unsigned long caller_frame_pointer(void)
 {
        struct stack_frame *frame;
 
        get_bp(frame);
 
 #ifdef CONFIG_FRAME_POINTER
-       while (n--) {
-               if (probe_kernel_address(&frame->next_frame, frame))
-                       break;
-       }
+       frame = frame->next_frame;
 #endif
 
        return (unsigned long)frame;
 
        return entry;
 }
 
-void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip)
-{
-       regs->ip = ip;
-       /*
-        * perf_arch_fetch_caller_regs adds another call, we need to increment
-        * the skip level
-        */
-       regs->bp = rewind_frame_pointer(skip + 1);
-       regs->cs = __KERNEL_CS;
-       /*
-        * We abuse bit 3 to pass exact information, see perf_misc_flags
-        * and the comment with PERF_EFLAGS_EXACT.
-        */
-       regs->flags = 0;
-}
-
 unsigned long perf_instruction_pointer(struct pt_regs *regs)
 {
        unsigned long ip;
 
 
 extern void __perf_sw_event(u32, u64, int, struct pt_regs *, u64);
 
-extern void
-perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip);
+#ifndef perf_arch_fetch_caller_regs
+/*
+ * Empty fallback for architectures that do not provide a
+ * perf_arch_fetch_caller_regs macro in their asm/perf_event.h.
+ * Note: the parameter type is struct pt_regs (not "struct regs",
+ * which does not exist and would make the fallback uncallable).
+ */
+static inline void
+perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip) { }
+#endif
 
 /*
  * Take a snapshot of the regs. Skip ip and frame pointer to
  * - bp for callchains
  * - eflags, for future purposes, just in case
  */
-static inline void perf_fetch_caller_regs(struct pt_regs *regs, int skip)
+static inline void perf_fetch_caller_regs(struct pt_regs *regs)
 {
-       unsigned long ip;
-
        memset(regs, 0, sizeof(*regs));
 
-       switch (skip) {
-       case 1 :
-               ip = CALLER_ADDR0;
-               break;
-       case 2 :
-               ip = CALLER_ADDR1;
-               break;
-       case 3 :
-               ip = CALLER_ADDR2;
-               break;
-       case 4:
-               ip = CALLER_ADDR3;
-               break;
-       /* No need to support further for now */
-       default:
-               ip = 0;
-       }
-
-       return perf_arch_fetch_caller_regs(regs, ip, skip);
+       perf_arch_fetch_caller_regs(regs, CALLER_ADDR0);
 }
 
 static inline void
                struct pt_regs hot_regs;
 
                if (!regs) {
-                       perf_fetch_caller_regs(&hot_regs, 1);
+                       perf_fetch_caller_regs(&hot_regs);
                        regs = &hot_regs;
                }
                __perf_sw_event(event_id, nr, nmi, regs, addr);
 
        int __data_size;                                                \
        int rctx;                                                       \
                                                                        \
-       perf_fetch_caller_regs(&__regs, 1);                             \
+       perf_fetch_caller_regs(&__regs);                                \
                                                                        \
        __data_size = ftrace_get_offsets_##call(&__data_offsets, args); \
        __entry_size = ALIGN(__data_size + sizeof(*entry) + sizeof(u32),\
 
        return NULL;
 }
 
-__weak
-void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip)
-{
-}
-
 
 /*
  * We assume there is only KVM supporting the callbacks.
 
 #include <linux/kprobes.h>
 #include "trace.h"
 
-EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs);
-
 static char *perf_trace_buf[4];
 
 /*