#include <linux/ring_buffer.h>
 #include <linux/trace_seq.h>
 #include <linux/percpu.h>
+#include <linux/hardirq.h>
 
 struct trace_array;
 struct tracer;
        void                    (*profile_disable)(void);
 };
 
+#define FTRACE_MAX_PROFILE_SIZE        2048
+
+extern char                    *trace_profile_buf;
+extern char                    *trace_profile_buf_nmi;
+
 #define MAX_FILTER_PRED                32
 #define MAX_FILTER_STR_VAL     256     /* Should handle KSYM_SYMBOL_LEN */
 
 
  *     struct ftrace_raw_##call *entry;
  *     u64 __addr = 0, __count = 1;
  *     unsigned long irq_flags;
+ *     struct trace_entry *ent;
  *     int __entry_size;
  *     int __data_size;
+ *     char *raw_data;
+ *     int __cpu;
  *     int pc;
  *
- *     local_save_flags(irq_flags);
  *     pc = preempt_count();
  *
  *     __data_size = ftrace_get_offsets_<call>(&__data_offsets, args);
  *                          sizeof(u64));
  *     __entry_size -= sizeof(u32);
  *
- *     do {
- *             char raw_data[__entry_size]; <- allocate our sample in the stack
- *             struct trace_entry *ent;
+ *     // Protect the non-NMI buffer. Disabling interrupts also
+ *     // serves as the RCU read-side critical section.
+ *     local_irq_save(irq_flags);
+ *     __cpu = smp_processor_id();
+ *
+ *     if (in_nmi())
+ *             raw_data = rcu_dereference(trace_profile_buf_nmi);
+ *     else
+ *             raw_data = rcu_dereference(trace_profile_buf);
+ *
+ *     if (!raw_data)
+ *             goto end;
  *
- *             zero dead bytes from alignment to avoid stack leak to userspace:
+ *     raw_data = per_cpu_ptr(raw_data, __cpu);
  *
- *             *(u64 *)(&raw_data[__entry_size - sizeof(u64)]) = 0ULL;
- *             entry = (struct ftrace_raw_<call> *)raw_data;
- *             ent = &entry->ent;
- *             tracing_generic_entry_update(ent, irq_flags, pc);
- *             ent->type = event_call->id;
+ *     // Zero dead bytes from alignment to avoid leaking stack to userspace:
+ *     *(u64 *)(&raw_data[__entry_size - sizeof(u64)]) = 0ULL;
+ *     entry = (struct ftrace_raw_<call> *)raw_data;
+ *     ent = &entry->ent;
+ *     tracing_generic_entry_update(ent, irq_flags, pc);
+ *     ent->type = event_call->id;
  *
- *             <tstruct> <- do some jobs with dynamic arrays
+ *     <tstruct> <- initialize dynamic array fields
  *
- *             <assign>  <- affect our values
+ *     <assign>  <- assign our values
  *
- *             perf_tpcounter_event(event_call->id, __addr, __count, entry,
- *                          __entry_size);  <- submit them to perf counter
- *     } while (0);
+ *     perf_tpcounter_event(event_call->id, __addr, __count, entry,
+ *                  __entry_size);  <- submit them to perf counter
  *
  * }
  */
        struct ftrace_raw_##call *entry;                                \
        u64 __addr = 0, __count = 1;                                    \
        unsigned long irq_flags;                                        \
+       struct trace_entry *ent;                                        \
        int __entry_size;                                               \
        int __data_size;                                                \
+       char *raw_data;                                                 \
+       int __cpu;                                                      \
        int pc;                                                         \
                                                                        \
-       local_save_flags(irq_flags);                                    \
        pc = preempt_count();                                           \
                                                                        \
        __data_size = ftrace_get_offsets_##call(&__data_offsets, args); \
                             sizeof(u64));                              \
        __entry_size -= sizeof(u32);                                    \
                                                                        \
-       do {                                                            \
-               char raw_data[__entry_size];                            \
-               struct trace_entry *ent;                                \
+       if (WARN_ONCE(__entry_size > FTRACE_MAX_PROFILE_SIZE,           \
+                     "profile buffer not large enough"))               \
+               return;                                                 \
+                                                                       \
+       local_irq_save(irq_flags);                                      \
+       __cpu = smp_processor_id();                                     \
                                                                        \
-               *(u64 *)(&raw_data[__entry_size - sizeof(u64)]) = 0ULL; \
-               entry = (struct ftrace_raw_##call *)raw_data;           \
-               ent = &entry->ent;                                      \
-               tracing_generic_entry_update(ent, irq_flags, pc);       \
-               ent->type = event_call->id;                             \
+       if (in_nmi())                                                   \
+               raw_data = rcu_dereference(trace_profile_buf_nmi);      \
+       else                                                            \
+               raw_data = rcu_dereference(trace_profile_buf);          \
                                                                        \
-               tstruct                                                 \
+       if (!raw_data)                                                  \
+               goto end;                                               \
                                                                        \
-               { assign; }                                             \
+       raw_data = per_cpu_ptr(raw_data, __cpu);                        \
                                                                        \
-               perf_tpcounter_event(event_call->id, __addr, __count, entry,\
+       *(u64 *)(&raw_data[__entry_size - sizeof(u64)]) = 0ULL;         \
+       entry = (struct ftrace_raw_##call *)raw_data;                   \
+       ent = &entry->ent;                                              \
+       tracing_generic_entry_update(ent, irq_flags, pc);               \
+       ent->type = event_call->id;                                     \
+                                                                       \
+       tstruct                                                         \
+                                                                       \
+       { assign; }                                                     \
+                                                                       \
+       perf_tpcounter_event(event_call->id, __addr, __count, entry,    \
                             __entry_size);                             \
-       } while (0);                                                    \
+                                                                       \
+end:                                                                   \
+       local_irq_restore(irq_flags);                                   \
                                                                        \
 }
 
 
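Every generated probe performs the same buffer lookup. A minimal sketch of
that pattern, using the globals declared above (get_profile_buf() is a
hypothetical helper for illustration, not part of the patch):

	#include <linux/hardirq.h>
	#include <linux/percpu.h>
	#include <linux/rcupdate.h>

	/*
	 * The caller must have interrupts disabled: that pins the cpu for
	 * per_cpu_ptr() and, because the update side uses
	 * synchronize_sched(), also acts as the RCU read-side lock.
	 */
	static char *get_profile_buf(void)
	{
		char *buf;

		if (in_nmi())
			buf = rcu_dereference(trace_profile_buf_nmi);
		else
			buf = rcu_dereference(trace_profile_buf);
		if (!buf)
			return NULL;

		return per_cpu_ptr(buf, smp_processor_id());
	}
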
 #include <linux/module.h>
 #include "trace.h"
 
+/*
+ * alloc_percpu() takes a type, not a size, so define a dummy
+ * type whose size matches the desired buffer size (sketch below)
+ */
+typedef struct { char buf[FTRACE_MAX_PROFILE_SIZE]; } profile_buf_t;
+
+char           *trace_profile_buf;
+char           *trace_profile_buf_nmi;
+
+/* Count the events in use (per event id, not per instance) */
+static int     total_profile_count;
+
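The dummy type works because alloc_percpu() derives both size and alignment
from its type argument; in <linux/percpu.h> it is roughly

	#define alloc_percpu(type) \
		(typeof(type) *)__alloc_percpu(sizeof(type), __alignof__(type))

A usage sketch (hypothetical helper names, illustration only):

	static profile_buf_t *pbuf;

	static int profile_buf_get(void)
	{
		pbuf = alloc_percpu(profile_buf_t); /* 2KB for each possible cpu */
		return pbuf ? 0 : -ENOMEM;
	}

	static void profile_buf_put(void)
	{
		free_percpu(pbuf); /* percpu memory pairs with free_percpu(), not kfree() */
	}
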
 static int ftrace_profile_enable_event(struct ftrace_event_call *event)
 {
+       char *buf;
+       int ret = -ENOMEM;
+
        if (atomic_inc_return(&event->profile_count))
                return 0;
 
-       return event->profile_enable();
+       if (!total_profile_count) {
+               buf = (char *)alloc_percpu(profile_buf_t);
+               if (!buf)
+                       goto fail_buf;
+
+               rcu_assign_pointer(trace_profile_buf, buf);
+
+               buf = (char *)alloc_percpu(profile_buf_t);
+               if (!buf)
+                       goto fail_buf_nmi;
+
+               rcu_assign_pointer(trace_profile_buf_nmi, buf);
+       }
+
+       ret = event->profile_enable();
+       if (!ret) {
+               total_profile_count++;
+               return 0;
+       }
+
+fail_buf_nmi:
+       if (!total_profile_count) {
+               free_percpu(trace_profile_buf_nmi);
+               free_percpu(trace_profile_buf);
+               trace_profile_buf_nmi = NULL;
+               trace_profile_buf = NULL;
+       }
+fail_buf:
+       atomic_dec(&event->profile_count);
+
+       return ret;
 }
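A worked trace of the refcounting above, assuming profile_count is
initialized to -1 (which the atomic idioms in the enable and disable
paths imply):

	enable  #1: atomic_inc_return()     ->  0        => allocate buffers, profile_enable()
	enable  #2: atomic_inc_return()     ->  1        => refcount bump only, return 0
	disable #1: atomic_add_negative(-1) ->  0, false => keep profiling
	disable #2: atomic_add_negative(-1) -> -1, true  => profile_disable(), release buffers
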
 
 int ftrace_profile_enable(int event_id)
 
 static void ftrace_profile_disable_event(struct ftrace_event_call *event)
 {
+       char *buf, *nmi_buf;
+
        if (!atomic_add_negative(-1, &event->profile_count))
                return;
 
        event->profile_disable();
+
+       if (!--total_profile_count) {
+               buf = trace_profile_buf;
+               rcu_assign_pointer(trace_profile_buf, NULL);
+
+               nmi_buf = trace_profile_buf_nmi;
+               rcu_assign_pointer(trace_profile_buf_nmi, NULL);
+
+               /*
+                * Ensure all profiling callers have finished before
+                * releasing the buffers
+                */
+               synchronize_sched();
+
+               free_percpu(buf);
+               free_percpu(nmi_buf);
+       }
 }
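The teardown above is the classic RCU publish/retire sequence: because every
reader runs with interrupts disabled, synchronize_sched() is sufficient to
wait out all in-flight probes (NMI handlers too, since an NMI completes
before its cpu can pass through a quiescent state). Schematically, as a
sketch of the two sides:

	/*
	 * Writer (last disabler):
	 *	buf = trace_profile_buf;
	 *	rcu_assign_pointer(trace_profile_buf, NULL);
	 *	synchronize_sched();	// waits out every irq-off reader
	 *	free_percpu(buf);
	 *
	 * Reader (any probe):
	 *	local_irq_save(flags);	// read-side "lock" for sched-RCU
	 *	raw = rcu_dereference(trace_profile_buf);
	 *	if (raw)
	 *		... use per_cpu_ptr(raw, smp_processor_id()) ...
	 *	local_irq_restore(flags);
	 */
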
 
 void ftrace_profile_disable(int event_id)
 
 
 static void prof_syscall_enter(struct pt_regs *regs, long id)
 {
-       struct syscall_trace_enter *rec;
        struct syscall_metadata *sys_data;
+       struct syscall_trace_enter *rec;
+       unsigned long flags;
+       char *raw_data;
        int syscall_nr;
        int size;
+       int cpu;
 
        syscall_nr = syscall_get_nr(current, regs);
        if (!test_bit(syscall_nr, enabled_prof_enter_syscalls))
        size = ALIGN(size + sizeof(u32), sizeof(u64));
        size -= sizeof(u32);
 
-       do {
-               char raw_data[size];
+       if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
+                     "profile buffer not large enough"))
+               return;
+
+       /* Protect the per-cpu buffer; this also begins the RCU read side */
+       local_irq_save(flags);
 
-               /* zero the dead bytes from align to not leak stack to user */
-               *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
+       cpu = smp_processor_id();
+
+       if (in_nmi())
+               raw_data = rcu_dereference(trace_profile_buf_nmi);
+       else
+               raw_data = rcu_dereference(trace_profile_buf);
+
+       if (!raw_data)
+               goto end;
 
-               rec = (struct syscall_trace_enter *) raw_data;
-               tracing_generic_entry_update(&rec->ent, 0, 0);
-               rec->ent.type = sys_data->enter_id;
-               rec->nr = syscall_nr;
-               syscall_get_arguments(current, regs, 0, sys_data->nb_args,
-                                      (unsigned long *)&rec->args);
-               perf_tpcounter_event(sys_data->enter_id, 0, 1, rec, size);
-       } while(0);
+       raw_data = per_cpu_ptr(raw_data, cpu);
+
+       /* zero the dead alignment bytes to avoid leaking stack data to userspace */
+       *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
+
+       rec = (struct syscall_trace_enter *) raw_data;
+       tracing_generic_entry_update(&rec->ent, 0, 0);
+       rec->ent.type = sys_data->enter_id;
+       rec->nr = syscall_nr;
+       syscall_get_arguments(current, regs, 0, sys_data->nb_args,
+                              (unsigned long *)&rec->args);
+       perf_tpcounter_event(sys_data->enter_id, 0, 1, rec, size);
+
+end:
+       local_irq_restore(flags);
 }
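The size rounding above exists because perf prepends a u32 size header to
each raw sample, and the header plus record must stay u64-aligned. A worked
example with an illustrative 42-byte raw record:

	size = ALIGN(42 + sizeof(u32), sizeof(u64));	/* ALIGN(46, 8) = 48 */
	size -= sizeof(u32);				/* 48 - 4 = 44       */

The record is thus padded from 42 to 44 bytes so that the 4-byte header plus
record totals 48, and the *(u64 *) store above clears the two padding bytes
so uninitialized data never reaches userspace.
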
 
 int reg_prof_syscall_enter(char *name)
 static void prof_syscall_exit(struct pt_regs *regs, long ret)
 {
        struct syscall_metadata *sys_data;
-       struct syscall_trace_exit rec;
+       struct syscall_trace_exit *rec;
+       unsigned long flags;
        int syscall_nr;
+       char *raw_data;
+       int size;
+       int cpu;
 
        syscall_nr = syscall_get_nr(current, regs);
        if (!test_bit(syscall_nr, enabled_prof_exit_syscalls))
        if (!sys_data)
                return;
 
-       tracing_generic_entry_update(&rec.ent, 0, 0);
-       rec.ent.type = sys_data->exit_id;
-       rec.nr = syscall_nr;
-       rec.ret = syscall_get_return_value(current, regs);
+       /* We could probably compute this size at build time */
+       size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
+       size -= sizeof(u32);
 
-       perf_tpcounter_event(sys_data->exit_id, 0, 1, &rec, sizeof(rec));
+       /*
+        * Impossible today, but stay paranoid about future growth.
+        * Could this check be moved out of the runtime path?
+        */
+       if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
+                     "exit event has grown above profile buffer size"))
+               return;
+
+       /* Protect the per-cpu buffer; this also begins the RCU read side */
+       local_irq_save(flags);
+       cpu = smp_processor_id();
+
+       if (in_nmi())
+               raw_data = rcu_dereference(trace_profile_buf_nmi);
+       else
+               raw_data = rcu_dereference(trace_profile_buf);
+
+       if (!raw_data)
+               goto end;
+
+       raw_data = per_cpu_ptr(raw_data, cpu);
+
+       /* zero the dead alignment bytes to avoid leaking stack data to userspace */
+       *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
+
+       rec = (struct syscall_trace_exit *)raw_data;
+
+       tracing_generic_entry_update(&rec->ent, 0, 0);
+       rec->ent.type = sys_data->exit_id;
+       rec->nr = syscall_nr;
+       rec->ret = syscall_get_return_value(current, regs);
+
+       perf_tpcounter_event(sys_data->exit_id, 0, 1, rec, size);
+
+end:
+       local_irq_restore(flags);
 }
 
 int reg_prof_syscall_exit(char *name)