void (*func)(void *info);
        void *info;
        int wait;
-       cpumask_t pending;
-       cpumask_t waitmask;
+       cpumask_t *waitmask;
 };
 
 static struct blackfin_flush_data smp_flush_data;
 static DEFINE_SPINLOCK(stop_lock);
 
 struct ipi_message {
-       struct list_head list;
        unsigned long type;
        struct smp_call_struct call_struct;
 };
 
+/* A magic number - stress testing shows this depth is safe for common cases */
+#define BFIN_IPI_MSGQ_LEN 5
+
+/* Simple FIFO buffer; overflowing it leads to a panic */
 struct ipi_message_queue {
-       struct list_head head;
        spinlock_t lock;
        unsigned long count;
+       unsigned long head; /* head of the queue */
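+       /* fixed ring of slots; producers write at (head + count) % BFIN_IPI_MSGQ_LEN */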
+       struct ipi_message ipi_message[BFIN_IPI_MSGQ_LEN];
 };
 
 static DEFINE_PER_CPU(struct ipi_message_queue, ipi_msg_queue);
        func = msg->call_struct.func;
        info = msg->call_struct.info;
        wait = msg->call_struct.wait;
-       cpu_clear(cpu, msg->call_struct.pending);
        func(info);
        if (wait) {
 #ifdef __ARCH_SYNC_CORE_DCACHE
                /*
                 * Invalidate D cache in case shared data was changed by
                 * other processors to ensure cache coherence.
                 */
                resync_core_dcache();
 #endif
-               cpu_clear(cpu, msg->call_struct.waitmask);
-       } else
-               kfree(msg);
+               cpu_clear(cpu, *msg->call_struct.waitmask);
+       }
 }
 
-static irqreturn_t ipi_handler(int irq, void *dev_instance)
+/* Use IRQ_SUPPLE_0 to request a reschedule.
+ * When returning from the interrupt to user space,
+ * there is a chance to reschedule. */
+static irqreturn_t ipi_handler_int0(int irq, void *dev_instance)
+{
+       unsigned int cpu = smp_processor_id();
+
+       platform_clear_ipi(cpu, IRQ_SUPPLE_0);
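+       /* nothing else to do; rescheduling is handled on the interrupt return path */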
+       return IRQ_HANDLED;
+}
+
+static irqreturn_t ipi_handler_int1(int irq, void *dev_instance)
 {
        struct ipi_message *msg;
        struct ipi_message_queue *msg_queue;
        unsigned int cpu = smp_processor_id();
+       unsigned long flags;
 
-       platform_clear_ipi(cpu);
+       platform_clear_ipi(cpu, IRQ_SUPPLE_1);
 
        msg_queue = &__get_cpu_var(ipi_msg_queue);
-       msg_queue->count++;
 
-       spin_lock(&msg_queue->lock);
-       while (!list_empty(&msg_queue->head)) {
-               msg = list_entry(msg_queue->head.next, typeof(*msg), list);
-               list_del(&msg->list);
+       spin_lock_irqsave(&msg_queue->lock, flags);
+
+       while (msg_queue->count) {
+               msg = &msg_queue->ipi_message[msg_queue->head];
                switch (msg->type) {
-               case BFIN_IPI_RESCHEDULE:
-                       /* That's the easiest one; leave it to
-                        * return_from_int. */
-                       kfree(msg);
-                       break;
                case BFIN_IPI_CALL_FUNC:
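+                       /* drop the queue lock across the callback; func may take locks or send IPIs */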
-                       spin_unlock(&msg_queue->lock);
+                       spin_unlock_irqrestore(&msg_queue->lock, flags);
                        ipi_call_function(cpu, msg);
-                       spin_lock(&msg_queue->lock);
+                       spin_lock_irqsave(&msg_queue->lock, flags);
                        break;
                case BFIN_IPI_CPU_STOP:
-                       spin_unlock(&msg_queue->lock);
+                       spin_unlock_irqrestore(&msg_queue->lock, flags);
                        ipi_cpu_stop(cpu);
-                       spin_lock(&msg_queue->lock);
-                       kfree(msg);
+                       spin_lock_irqsave(&msg_queue->lock, flags);
                        break;
                default:
                        printk(KERN_CRIT "CPU%u: Unknown IPI message 0x%lx\n",
                               cpu, msg->type);
-                       kfree(msg);
                        break;
                }
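+               /* consume the slot: advance head with wrap-around */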
+               msg_queue->head++;
+               msg_queue->head %= BFIN_IPI_MSGQ_LEN;
+               msg_queue->count--;
        }
-       spin_unlock(&msg_queue->lock);
+       spin_unlock_irqrestore(&msg_queue->lock, flags);
        return IRQ_HANDLED;
 }
 
        struct ipi_message_queue *msg_queue;
        for_each_possible_cpu(cpu) {
                msg_queue = &per_cpu(ipi_msg_queue, cpu);
-               INIT_LIST_HEAD(&msg_queue->head);
                spin_lock_init(&msg_queue->lock);
                msg_queue->count = 0;
+               msg_queue->head = 0;
        }
 }
 
-int smp_call_function(void (*func)(void *info), void *info, int wait)
+static inline void smp_send_message(cpumask_t callmap, unsigned long type,
+                                       void (*func)(void *info), void *info, int wait)
 {
        unsigned int cpu;
-       cpumask_t callmap;
-       unsigned long flags;
        struct ipi_message_queue *msg_queue;
        struct ipi_message *msg;
-
-       callmap = cpu_online_map;
-       cpu_clear(smp_processor_id(), callmap);
-       if (cpus_empty(callmap))
-               return 0;
-
-       msg = kmalloc(sizeof(*msg), GFP_ATOMIC);
-       if (!msg)
-               return -ENOMEM;
-       INIT_LIST_HEAD(&msg->list);
-       msg->call_struct.func = func;
-       msg->call_struct.info = info;
-       msg->call_struct.wait = wait;
-       msg->call_struct.pending = callmap;
-       msg->call_struct.waitmask = callmap;
-       msg->type = BFIN_IPI_CALL_FUNC;
+       unsigned long flags, next_msg;
+       cpumask_t waitmask = callmap; /* waitmask is shared by all cpus */
 
        for_each_cpu_mask(cpu, callmap) {
                msg_queue = &per_cpu(ipi_msg_queue, cpu);
                spin_lock_irqsave(&msg_queue->lock, flags);
-               list_add_tail(&msg->list, &msg_queue->head);
+               if (msg_queue->count < BFIN_IPI_MSGQ_LEN) {
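+                       /* produce at the tail slot, (head + count) modulo the ring length */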
+                       next_msg = (msg_queue->head + msg_queue->count)
+                                       % BFIN_IPI_MSGQ_LEN;
+                       msg = &msg_queue->ipi_message[next_msg];
+                       msg->type = type;
+                       if (type == BFIN_IPI_CALL_FUNC) {
+                               msg->call_struct.func = func;
+                               msg->call_struct.info = info;
+                               msg->call_struct.wait = wait;
+                               msg->call_struct.waitmask = &waitmask;
+                       }
+                       msg_queue->count++;
+               } else {
+                       panic("IPI message queue overflow\n");
+               }
                spin_unlock_irqrestore(&msg_queue->lock, flags);
-               platform_send_ipi_cpu(cpu);
+               platform_send_ipi_cpu(cpu, IRQ_SUPPLE_1);
        }
+
        if (wait) {
-               while (!cpus_empty(msg->call_struct.waitmask))
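+               /* busy-wait until every target cpu has cleared its bit in waitmask */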
+               while (!cpus_empty(waitmask))
                        blackfin_dcache_invalidate_range(
-                               (unsigned long)(&msg->call_struct.waitmask),
-                               (unsigned long)(&msg->call_struct.waitmask));
+                               (unsigned long)(&waitmask),
+                               (unsigned long)(&waitmask));
 #ifdef __ARCH_SYNC_CORE_DCACHE
                /*
                 * Invalidate D cache in case shared data was changed by
                 * other processors to ensure cache coherence.
                 */
                resync_core_dcache();
 #endif
-               kfree(msg);
        }
+}
+
+int smp_call_function(void (*func)(void *info), void *info, int wait)
+{
+       cpumask_t callmap;
+
+       callmap = cpu_online_map;
+       cpu_clear(smp_processor_id(), callmap);
+       if (cpus_empty(callmap))
+               return 0;
+
+       smp_send_message(callmap, BFIN_IPI_CALL_FUNC, func, info, wait);
+
        return 0;
 }
 EXPORT_SYMBOL_GPL(smp_call_function);
 {
        unsigned int cpu = cpuid;
        cpumask_t callmap;
-       unsigned long flags;
-       struct ipi_message_queue *msg_queue;
-       struct ipi_message *msg;
 
        if (cpu_is_offline(cpu))
                return 0;
        cpus_clear(callmap);
        cpu_set(cpu, callmap);
 
-       msg = kmalloc(sizeof(*msg), GFP_ATOMIC);
-       if (!msg)
-               return -ENOMEM;
-       INIT_LIST_HEAD(&msg->list);
-       msg->call_struct.func = func;
-       msg->call_struct.info = info;
-       msg->call_struct.wait = wait;
-       msg->call_struct.pending = callmap;
-       msg->call_struct.waitmask = callmap;
-       msg->type = BFIN_IPI_CALL_FUNC;
-
-       msg_queue = &per_cpu(ipi_msg_queue, cpu);
-       spin_lock_irqsave(&msg_queue->lock, flags);
-       list_add_tail(&msg->list, &msg_queue->head);
-       spin_unlock_irqrestore(&msg_queue->lock, flags);
-       platform_send_ipi_cpu(cpu);
+       smp_send_message(callmap, BFIN_IPI_CALL_FUNC, func, info, wait);
 
-       if (wait) {
-               while (!cpus_empty(msg->call_struct.waitmask))
-                       blackfin_dcache_invalidate_range(
-                               (unsigned long)(&msg->call_struct.waitmask),
-                               (unsigned long)(&msg->call_struct.waitmask));
-#ifdef __ARCH_SYNC_CORE_DCACHE
-               /*
-                * Invalidate D cache in case shared data was changed by
-                * other processors to ensure cache coherence.
-                */
-               resync_core_dcache();
-#endif
-               kfree(msg);
-       }
        return 0;
 }
 EXPORT_SYMBOL_GPL(smp_call_function_single);
 
 void smp_send_reschedule(int cpu)
 {
-       unsigned long flags;
-       struct ipi_message_queue *msg_queue;
-       struct ipi_message *msg;
-
+       /* Simply trigger an IPI; the actual reschedule happens on return from interrupt. */
        if (cpu_is_offline(cpu))
                return;
-
-       msg = kzalloc(sizeof(*msg), GFP_ATOMIC);
-       if (!msg)
-               return;
-       INIT_LIST_HEAD(&msg->list);
-       msg->type = BFIN_IPI_RESCHEDULE;
-
-       msg_queue = &per_cpu(ipi_msg_queue, cpu);
-       spin_lock_irqsave(&msg_queue->lock, flags);
-       list_add_tail(&msg->list, &msg_queue->head);
-       spin_unlock_irqrestore(&msg_queue->lock, flags);
-       platform_send_ipi_cpu(cpu);
+       platform_send_ipi_cpu(cpu, IRQ_SUPPLE_0);
 
        return;
 }
 
 void smp_send_stop(void)
 {
-       unsigned int cpu;
        cpumask_t callmap;
-       unsigned long flags;
-       struct ipi_message_queue *msg_queue;
-       struct ipi_message *msg;
 
        callmap = cpu_online_map;
        cpu_clear(smp_processor_id(), callmap);
        if (cpus_empty(callmap))
                return;
 
-       msg = kzalloc(sizeof(*msg), GFP_ATOMIC);
-       if (!msg)
-               return;
-       INIT_LIST_HEAD(&msg->list);
-       msg->type = BFIN_IPI_CPU_STOP;
+       smp_send_message(callmap, BFIN_IPI_CPU_STOP, NULL, NULL, 0);
 
-       for_each_cpu_mask(cpu, callmap) {
-               msg_queue = &per_cpu(ipi_msg_queue, cpu);
-               spin_lock_irqsave(&msg_queue->lock, flags);
-               list_add_tail(&msg->list, &msg_queue->head);
-               spin_unlock_irqrestore(&msg_queue->lock, flags);
-               platform_send_ipi_cpu(cpu);
-       }
        return;
 }
 
 {
        platform_prepare_cpus(max_cpus);
        ipi_queue_init();
-       platform_request_ipi(ipi_handler);
+       platform_request_ipi(IRQ_SUPPLE_0, ipi_handler_int0);
+       platform_request_ipi(IRQ_SUPPLE_1, ipi_handler_int1);
 }
 
 void __init smp_cpus_done(unsigned int max_cpus)