}
 
 #ifdef CONFIG_STACKTRACE
+
+#define FTRACE_STACK_MAX_ENTRIES (PAGE_SIZE / sizeof(unsigned long))
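+/* e.g. with 4 KiB pages and 8-byte longs: 4096 / 8 = 512 entries per CPU */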
+struct ftrace_stack {
+       unsigned long           calls[FTRACE_STACK_MAX_ENTRIES];
+};
+
+static DEFINE_PER_CPU(struct ftrace_stack, ftrace_stack);
+static DEFINE_PER_CPU(int, ftrace_stack_reserve);
+
 static void __ftrace_trace_stack(struct ring_buffer *buffer,
                                 unsigned long flags,
                                 int skip, int pc, struct pt_regs *regs)
        struct ring_buffer_event *event;
        struct stack_entry *entry;
        struct stack_trace trace;
+       int use_stack;
+       int size = FTRACE_STACK_ENTRIES;
+
+       trace.nr_entries        = 0;
+       trace.skip              = skip;
+
+       /*
+        * Since events can happen in NMIs there's no safe way to
+        * use the per cpu ftrace_stacks. We reserve it and if an interrupt
+        * or NMI comes in, it will just have to use the default
+        * FTRACE_STACK_ENTRIES.
+        */
+       preempt_disable_notrace();
+
+       use_stack = ++__get_cpu_var(ftrace_stack_reserve);
+       /*
+        * We don't need any atomic variables, just a barrier.
+        * If an interrupt comes in, we don't care, because it would
+        * have exited and put the counter back to what we want.
+        * We just need a barrier to keep gcc from moving things
+        * around.
+        */
+       barrier();
+       if (use_stack == 1) {
+               trace.entries           = &__get_cpu_var(ftrace_stack).calls[0];
+               trace.max_entries       = FTRACE_STACK_MAX_ENTRIES;
+
+               if (regs)
+                       save_stack_trace_regs(regs, &trace);
+               else
+                       save_stack_trace(&trace);
+
+               if (trace.nr_entries > size)
+                       size = trace.nr_entries;
+       } else {
+               /* From now on, use_stack is a boolean */
+               use_stack = 0;
+       }
+
+       size *= sizeof(unsigned long);
 
        event = trace_buffer_lock_reserve(buffer, TRACE_STACK,
-                                         sizeof(*entry), flags, pc);
+                                         sizeof(*entry) + size, flags, pc);
        if (!event)
-               return;
-       entry   = ring_buffer_event_data(event);
-       memset(&entry->caller, 0, sizeof(entry->caller));
+               goto out;
+       entry = ring_buffer_event_data(event);
 
-       trace.nr_entries        = 0;
-       trace.max_entries       = FTRACE_STACK_ENTRIES;
-       trace.skip              = skip;
-       trace.entries           = entry->caller;
+       memset(&entry->caller, 0, size);
+
+       if (use_stack)
+               memcpy(&entry->caller, trace.entries,
+                      trace.nr_entries * sizeof(unsigned long));
+       else {
+               trace.max_entries       = FTRACE_STACK_ENTRIES;
+               trace.entries           = entry->caller;
+               if (regs)
+                       save_stack_trace_regs(regs, &trace);
+               else
+                       save_stack_trace(&trace);
+       }
+
+       entry->size = trace.nr_entries;
 
-       if (regs)
-               save_stack_trace_regs(regs, &trace);
-       else
-               save_stack_trace(&trace);
        if (!filter_check_discard(call, entry, buffer, event))
                ring_buffer_unlock_commit(buffer, event);
+
+ out:
+       /* Again, don't let gcc optimize things here */
+       barrier();
+       __get_cpu_var(ftrace_stack_reserve)--;
+       preempt_enable_notrace();
+
 }
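
For reference, a minimal userspace sketch of the reservation trick used above, assuming nothing beyond standard C: a nesting counter plus a compiler barrier decides whether the caller gets the large scratch buffer or must fall back to a small one, the same way a nested interrupt or NMI falls back to the FTRACE_STACK_ENTRIES-sized trace. The names here (scratch_reserve, scratch_buf, record_stack) are illustrative only and are not part of the patch.

#include <stdio.h>

#define SCRATCH_ENTRIES         1024
#define FALLBACK_ENTRIES        8

static unsigned long scratch_buf[SCRATCH_ENTRIES];
static int scratch_reserve;

static void record_stack(int depth)
{
        unsigned long fallback[FALLBACK_ENTRIES];
        unsigned long *buf;
        int max, level;

        /* Claim the scratch buffer for this nesting level... */
        level = ++scratch_reserve;
        /*
         * ...and keep the compiler from moving the claim around,
         * the moral equivalent of barrier() in the patch.
         */
        __asm__ __volatile__("" ::: "memory");

        if (level == 1) {               /* outermost user: large buffer */
                buf = scratch_buf;
                max = SCRATCH_ENTRIES;
        } else {                        /* nested user: small fallback */
                buf = fallback;
                max = FALLBACK_ENTRIES;
        }
        buf[0] = 0;                     /* stand-in for saving a trace */
        printf("nesting level %d: %d-entry buffer\n", level, max);

        if (depth)                      /* simulate a nested event (an NMI) */
                record_stack(depth - 1);

        __asm__ __volatile__("" ::: "memory");
        scratch_reserve--;              /* release the reservation */
}

int main(void)
{
        record_stack(1);
        return 0;
}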
 
 void ftrace_trace_stack_regs(struct ring_buffer *buffer, unsigned long flags,
 
        ftrace_enable_cpu();
 
-       return event ? ring_buffer_event_data(event) : NULL;
+       if (event) {
+               iter->ent_size = ring_buffer_event_length(event);
+               return ring_buffer_event_data(event);
+       }
+       iter->ent_size = 0;
+       return NULL;
 }
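
Likewise, a self-contained sketch of the read side, assuming a simplified layout of just a size field followed by a flexible caller[] array (the real stack_entry also embeds a trace_entry header): the end pointer is derived from the recorded payload size instead of a fixed FTRACE_STACK_ENTRIES bound, with ULONG_MAX doubling as the end-of-trace marker. demo_stack_entry and the addresses are made up for the demo; only the walking logic mirrors the printer change below.

#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct demo_stack_entry {
        int             size;           /* entries actually saved */
        unsigned long   caller[];       /* flexible array, sized at reserve time */
};

int main(void)
{
        unsigned long calls[] = { 0x1111, 0x2222, 0x3333, ULONG_MAX };
        size_t ent_size = sizeof(struct demo_stack_entry) + sizeof(calls);
        struct demo_stack_entry *ent = malloc(ent_size);
        unsigned long *p, *end;

        ent->size = 3;
        memcpy(ent->caller, calls, sizeof(calls));

        /* The bound comes from the payload size, not a compile-time array size. */
        end = (unsigned long *)((char *)ent + ent_size);
        for (p = ent->caller; p < end && *p != ULONG_MAX; p++)
                printf(" => %#lx\n", *p);

        free(ent);
        return 0;
}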
 
 static struct trace_entry *
 
 {
        struct stack_entry *field;
        struct trace_seq *s = &iter->seq;
-       int i;
+       unsigned long *p;
+       unsigned long *end;
 
        trace_assign_type(field, iter->ent);
+       end = (unsigned long *)((long)iter->ent + iter->ent_size);
 
        if (!trace_seq_puts(s, "<stack trace>\n"))
                goto partial;
-       for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
-               if (!field->caller[i] || (field->caller[i] == ULONG_MAX))
-                       break;
+
+       for (p = field->caller; p && p < end && *p != ULONG_MAX; p++) {
                if (!trace_seq_puts(s, " => "))
                        goto partial;
 
-               if (!seq_print_ip_sym(s, field->caller[i], flags))
+               if (!seq_print_ip_sym(s, *p, flags))
                        goto partial;
                if (!trace_seq_puts(s, "\n"))
                        goto partial;