unsigned int       send_only;
 };
 
+/*
+ * (Re)arm the multicast join work item, priv->mcast_task, on priv->wq.
+ *
+ * Does nothing unless IPOIB_MCAST_RUN is set in priv->flags.  Otherwise
+ * any pending instance of the work is cancelled and a new one is queued:
+ *
+ *   delay && mcast  - double mcast->backoff (capped at
+ *                     IPOIB_MAX_BACKOFF_SECONDS), set mcast->delay_until
+ *                     that many seconds past the current jiffies, and
+ *                     queue the work with no delay
+ *   delay && !mcast - queue the work to run after HZ jiffies (1 second)
+ *   !delay          - queue the work with no delay
+ *
+ * This should be called with the mcast_mutex held.
+ * NOTE(review): the restart path touched by this patch makes the call
+ * with only priv->lock visibly held - confirm which lock is required.
+ */
+static void __ipoib_mcast_schedule_join_thread(struct ipoib_dev_priv *priv,
+                                              struct ipoib_mcast *mcast,
+                                              bool delay)
+{
+       unsigned long wait = 0;
+
+       if (!test_bit(IPOIB_MCAST_RUN, &priv->flags))
+               return;
+
+       /*
+        * Something is about to be queued, so whatever is currently
+        * scheduled is superseded: cancel it first
+        */
+       cancel_delayed_work(&priv->mcast_task);
+
+       if (delay) {
+               if (mcast) {
+                       /*
+                        * A join on this group failed.  Grow its backoff
+                        * and record when it may next be tried, but still
+                        * restart the task immediately: the join task
+                        * clears out all entries without delays and then
+                        * schedules itself to run again when the earliest
+                        * delay expires
+                        */
+                       mcast->backoff *= 2;
+                       if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS)
+                               mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS;
+                       mcast->delay_until = jiffies + (mcast->backoff * HZ);
+               } else {
+                       /*
+                        * Special case of retrying after a failure to
+                        * allocate the broadcast multicast group: wait
+                        * 1 second and try again
+                        */
+                       wait = HZ;
+               }
+       }
+
+       queue_delayed_work(priv->wq, &priv->mcast_task, wait);
+}
+
 static void ipoib_mcast_free(struct ipoib_mcast *mcast)
 {
        struct net_device *dev = mcast->dev;
 
        mcast->dev = dev;
        mcast->created = jiffies;
+       mcast->delay_until = jiffies;
        mcast->backoff = 1;
 
        INIT_LIST_HEAD(&mcast->list);
 {
        struct ipoib_mcast *mcast = multicast->context;
        struct net_device *dev = mcast->dev;
+       struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+       /*
+        * We have to take the mutex to force ipoib_mcast_sendonly_join
+        * to return from ib_sa_join_multicast and set mcast->mc to a
+        * valid value.  Otherwise we race with ourselves: we might
+        * fail here and clear mcast->mc, only to have
+        * ib_sa_join_multicast return a valid value afterwards,
+        * resulting in mismatched joins and leaves and a deadlock.
+        */
+       mutex_lock(&mcast_mutex);
 
        /* We trap for port events ourselves. */
-       if (status == -ENETRESET)
-               return 0;
+       if (status == -ENETRESET) {
+               status = 0;
+               goto out;
+       }
 
        if (!status)
                status = ipoib_mcast_join_finish(mcast, &multicast->rec);
 
        if (status) {
                if (mcast->logcount++ < 20)
-                       ipoib_dbg_mcast(netdev_priv(dev), "multicast join failed for %pI6, status %d\n",
+                       ipoib_dbg_mcast(netdev_priv(dev), "sendonly multicast "
+                                       "join failed for %pI6, status %d\n",
                                        mcast->mcmember.mgid.raw, status);
 
                /* Flush out any queued packets */
                        dev_kfree_skb_any(skb_dequeue(&mcast->pkt_queue));
                }
                netif_tx_unlock_bh(dev);
-
-               /* Clear the busy flag so we try again */
-               status = test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY,
-                                           &mcast->flags);
+               __ipoib_mcast_schedule_join_thread(priv, mcast, 1);
+       } else {
+               mcast->backoff = 1;
+               mcast->delay_until = jiffies;
+               __ipoib_mcast_schedule_join_thread(priv, NULL, 0);
        }
+out:
+       clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
+       if (status)
+               mcast->mc = NULL;
+       complete(&mcast->done);
+       mutex_unlock(&mcast_mutex);
        return status;
 }
 
        int ret = 0;
 
        if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) {
-               ipoib_dbg_mcast(priv, "device shutting down, no multicast joins\n");
+               ipoib_dbg_mcast(priv, "device shutting down, no sendonly "
+                               "multicast joins\n");
+               clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
+               complete(&mcast->done);
                return -ENODEV;
        }
 
-       if (test_and_set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) {
-               ipoib_dbg_mcast(priv, "multicast entry busy, skipping\n");
-               return -EBUSY;
-       }
-
        rec.mgid     = mcast->mcmember.mgid;
        rec.port_gid = priv->local_gid;
        rec.pkey     = cpu_to_be16(priv->pkey);
 
+       mutex_lock(&mcast_mutex);
        mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca,
                                         priv->port, &rec,
                                         IB_SA_MCMEMBER_REC_MGID        |
        if (IS_ERR(mcast->mc)) {
                ret = PTR_ERR(mcast->mc);
                clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
-               ipoib_warn(priv, "ib_sa_join_multicast failed (ret = %d)\n",
-                          ret);
+               ipoib_warn(priv, "ib_sa_join_multicast for sendonly join "
+                          "failed (ret = %d)\n", ret);
+               complete(&mcast->done);
        } else {
-               ipoib_dbg_mcast(priv, "no multicast record for %pI6, starting join\n",
-                               mcast->mcmember.mgid.raw);
+               ipoib_dbg_mcast(priv, "no multicast record for %pI6, starting "
+                               "sendonly join\n", mcast->mcmember.mgid.raw);
        }
+       mutex_unlock(&mcast_mutex);
 
        return ret;
 }
        ipoib_dbg_mcast(priv, "join completion for %pI6 (status %d)\n",
                        mcast->mcmember.mgid.raw, status);
 
+       /*
+        * We have to take the mutex to force ipoib_mcast_join to
+        * return from ib_sa_join_multicast and set mcast->mc to a
+        * valid value.  Otherwise we race with ourselves: we might
+        * fail here and clear mcast->mc, only to have
+        * ib_sa_join_multicast return a valid value afterwards,
+        * resulting in mismatched joins and leaves and a deadlock.
+        */
+       mutex_lock(&mcast_mutex);
+
        /* We trap for port events ourselves. */
        if (status == -ENETRESET) {
                status = 0;
 
        if (!status) {
                mcast->backoff = 1;
-               mutex_lock(&mcast_mutex);
-               if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
-                       queue_delayed_work(priv->wq, &priv->mcast_task, 0);
-               mutex_unlock(&mcast_mutex);
+               mcast->delay_until = jiffies;
+               __ipoib_mcast_schedule_join_thread(priv, NULL, 0);
 
                /*
                 * Defer carrier on work to priv->wq to avoid a
                 */
                if (mcast == priv->broadcast)
                        queue_work(priv->wq, &priv->carrier_on_task);
-
-               status = 0;
-               goto out;
-       }
-
-       if (mcast->logcount++ < 20) {
-               if (status == -ETIMEDOUT || status == -EAGAIN) {
-                       ipoib_dbg_mcast(priv, "multicast join failed for %pI6, status %d\n",
-                                       mcast->mcmember.mgid.raw, status);
-               } else {
-                       ipoib_warn(priv, "multicast join failed for %pI6, status %d\n",
-                                  mcast->mcmember.mgid.raw, status);
+       } else {
+               if (mcast->logcount++ < 20) {
+                       if (status == -ETIMEDOUT || status == -EAGAIN) {
+                               ipoib_dbg_mcast(priv, "multicast join failed for %pI6, status %d\n",
+                                               mcast->mcmember.mgid.raw, status);
+                       } else {
+                               ipoib_warn(priv, "multicast join failed for %pI6, status %d\n",
+                                          mcast->mcmember.mgid.raw, status);
+                       }
                }
-       }
-
-       mcast->backoff *= 2;
-       if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS)
-               mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS;
 
-       /* Clear the busy flag so we try again */
-       status = test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
-
-       mutex_lock(&mcast_mutex);
-       spin_lock_irq(&priv->lock);
-       if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
-               queue_delayed_work(priv->wq, &priv->mcast_task,
-                                  mcast->backoff * HZ);
-       spin_unlock_irq(&priv->lock);
-       mutex_unlock(&mcast_mutex);
+               /* Requeue this join task with a backoff delay */
+               __ipoib_mcast_schedule_join_thread(priv, mcast, 1);
+       }
 out:
+       clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
+       if (status)
+               mcast->mc = NULL;
        complete(&mcast->done);
+       mutex_unlock(&mcast_mutex);
        return status;
 }
 
                rec.hop_limit     = priv->broadcast->mcmember.hop_limit;
        }
 
-       set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
-       init_completion(&mcast->done);
-       set_bit(IPOIB_MCAST_JOIN_STARTED, &mcast->flags);
-
+       mutex_lock(&mcast_mutex);
        mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, priv->port,
                                         &rec, comp_mask, GFP_KERNEL,
                                         ipoib_mcast_join_complete, mcast);
        if (IS_ERR(mcast->mc)) {
                clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
-               complete(&mcast->done);
                ret = PTR_ERR(mcast->mc);
                ipoib_warn(priv, "ib_sa_join_multicast failed, status %d\n", ret);
-
-               mcast->backoff *= 2;
-               if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS)
-                       mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS;
-
-               mutex_lock(&mcast_mutex);
-               if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
-                       queue_delayed_work(priv->wq, &priv->mcast_task,
-                                          mcast->backoff * HZ);
-               mutex_unlock(&mcast_mutex);
+               __ipoib_mcast_schedule_join_thread(priv, mcast, 1);
+               complete(&mcast->done);
        }
+       mutex_unlock(&mcast_mutex);
 }
 
 void ipoib_mcast_join_task(struct work_struct *work)
                container_of(work, struct ipoib_dev_priv, mcast_task.work);
        struct net_device *dev = priv->dev;
        struct ib_port_attr port_attr;
+       unsigned long delay_until = 0;
+       struct ipoib_mcast *mcast = NULL;
+       int create = 1;
 
        if (!test_bit(IPOIB_MCAST_RUN, &priv->flags))
                return;
        else
                memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid));
 
+       /*
+        * We have to hold the mutex to keep from racing with the join
+        * completion threads on setting flags on mcasts, and we have
+        * to hold the priv->lock because dev_flush will remove entries
+        * out from underneath us, so at a minimum we need the lock
+        * through the time that we do the for_each loop of the mcast
+        * list or else dev_flush can make us oops.
+        */
+       mutex_lock(&mcast_mutex);
+       spin_lock_irq(&priv->lock);
+       if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags))
+               goto out;
+
        if (!priv->broadcast) {
                struct ipoib_mcast *broadcast;
 
-               if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags))
-                       return;
-
-               broadcast = ipoib_mcast_alloc(dev, 1);
+               broadcast = ipoib_mcast_alloc(dev, 0);
                if (!broadcast) {
                        ipoib_warn(priv, "failed to allocate broadcast group\n");
-                       mutex_lock(&mcast_mutex);
-                       if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
-                               queue_delayed_work(priv->wq, &priv->mcast_task,
-                                                  HZ);
-                       mutex_unlock(&mcast_mutex);
-                       return;
+                       /*
+                        * Restart us after a 1 second delay to retry
+                        * creating our broadcast group and attaching to
+                        * it.  Until this succeeds, this ipoib dev is
+                        * completely stalled (multicast wise).
+                        */
+                       __ipoib_mcast_schedule_join_thread(priv, NULL, 1);
+                       goto out;
                }
 
-               spin_lock_irq(&priv->lock);
                memcpy(broadcast->mcmember.mgid.raw, priv->dev->broadcast + 4,
                       sizeof (union ib_gid));
                priv->broadcast = broadcast;
 
                __ipoib_mcast_add(dev, priv->broadcast);
-               spin_unlock_irq(&priv->lock);
        }
 
        if (!test_bit(IPOIB_MCAST_FLAG_ATTACHED, &priv->broadcast->flags)) {
-               if (!test_bit(IPOIB_MCAST_FLAG_BUSY, &priv->broadcast->flags))
-                       ipoib_mcast_join(dev, priv->broadcast, 0);
-               return;
+               if (IS_ERR_OR_NULL(priv->broadcast->mc) &&
+                   !test_bit(IPOIB_MCAST_FLAG_BUSY, &priv->broadcast->flags)) {
+                       mcast = priv->broadcast;
+                       create = 0;
+                       if (mcast->backoff > 1 &&
+                           time_before(jiffies, mcast->delay_until)) {
+                               delay_until = mcast->delay_until;
+                               mcast = NULL;
+                       }
+               }
+               goto out;
        }
 
-       while (1) {
-               struct ipoib_mcast *mcast = NULL;
-
-               spin_lock_irq(&priv->lock);
-               list_for_each_entry(mcast, &priv->multicast_list, list) {
-                       if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)
-                           && !test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)
-                           && !test_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) {
+       /*
+        * We'll never get here until the broadcast group is both allocated
+        * and attached
+        */
+       list_for_each_entry(mcast, &priv->multicast_list, list) {
+               if (IS_ERR_OR_NULL(mcast->mc) &&
+                   !test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags) &&
+                   !test_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) {
+                       if (mcast->backoff == 1 ||
+                           time_after_eq(jiffies, mcast->delay_until))
                                /* Found the next unjoined group */
                                break;
-                       }
+                       else if (!delay_until ||
+                                time_before(mcast->delay_until, delay_until))
+                               delay_until = mcast->delay_until;
                }
-               spin_unlock_irq(&priv->lock);
-
-               if (&mcast->list == &priv->multicast_list) {
-                       /* All done */
-                       break;
-               }
-
-               ipoib_mcast_join(dev, mcast, 1);
-               return;
        }
 
-       ipoib_dbg_mcast(priv, "successfully joined all multicast groups\n");
+       if (&mcast->list == &priv->multicast_list) {
+               /*
+                * All done, unless we have delayed work from
+                * backoff retransmissions, but we will get
+                * restarted when the time is right, so we are
+                * done for now
+                */
+               mcast = NULL;
+               ipoib_dbg_mcast(priv, "successfully joined all "
+                               "multicast groups\n");
+       }
 
-       clear_bit(IPOIB_MCAST_RUN, &priv->flags);
+out:
+       if (mcast) {
+               init_completion(&mcast->done);
+               set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
+       }
+       spin_unlock_irq(&priv->lock);
+       mutex_unlock(&mcast_mutex);
+       if (mcast) {
+               if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags))
+                       ipoib_mcast_sendonly_join(mcast);
+               else
+                       ipoib_mcast_join(dev, mcast, create);
+       }
+       if (delay_until)
+               queue_delayed_work(priv->wq, &priv->mcast_task,
+                                  delay_until - jiffies);
 }
 
 int ipoib_mcast_start_thread(struct net_device *dev)
        ipoib_dbg_mcast(priv, "starting multicast thread\n");
 
        mutex_lock(&mcast_mutex);
-       if (!test_and_set_bit(IPOIB_MCAST_RUN, &priv->flags))
-               queue_delayed_work(priv->wq, &priv->mcast_task, 0);
+       set_bit(IPOIB_MCAST_RUN, &priv->flags);
+       __ipoib_mcast_schedule_join_thread(priv, NULL, 0);
        mutex_unlock(&mcast_mutex);
 
        return 0;
        int ret = 0;
 
        if (test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags))
+               ipoib_warn(priv, "ipoib_mcast_leave on an in-flight join\n");
+
+       if (!IS_ERR_OR_NULL(mcast->mc))
                ib_sa_free_multicast(mcast->mc);
+       else
+               ipoib_dbg(priv, "ipoib_mcast_leave with mcast->mc invalid\n");
 
        if (test_and_clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) {
                ipoib_dbg_mcast(priv, "leaving MGID %pI6\n",
                                      be16_to_cpu(mcast->mcmember.mlid));
                if (ret)
                        ipoib_warn(priv, "ib_detach_mcast failed (result = %d)\n", ret);
-       }
+       } else if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags))
+               ipoib_dbg(priv, "leaving with no mcmember but not a "
+                         "SENDONLY join\n");
 
        return 0;
 }
                memcpy(mcast->mcmember.mgid.raw, mgid, sizeof (union ib_gid));
                __ipoib_mcast_add(dev, mcast);
                list_add_tail(&mcast->list, &priv->multicast_list);
+               __ipoib_mcast_schedule_join_thread(priv, NULL, 0);
        }
 
        if (!mcast->ah) {
                        ++dev->stats.tx_dropped;
                        dev_kfree_skb_any(skb);
                }
-
-               if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags))
-                       ipoib_dbg_mcast(priv, "no address vector, "
-                                       "but multicast join already started\n");
-               else if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags))
-                       ipoib_mcast_sendonly_join(mcast);
-
                /*
                 * If lookup completes between here and out:, don't
                 * want to send packet twice.
 
        spin_unlock_irqrestore(&priv->lock, flags);
 
-       /* seperate between the wait to the leave*/
+       /*
+        * make sure the in-flight joins have finished before we attempt
+        * to leave
+        */
        list_for_each_entry_safe(mcast, tmcast, &remove_list, list)
-               if (test_bit(IPOIB_MCAST_JOIN_STARTED, &mcast->flags))
+               if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags))
                        wait_for_completion(&mcast->done);
 
        list_for_each_entry_safe(mcast, tmcast, &remove_list, list) {
        unsigned long flags;
        struct ib_sa_mcmember_rec rec;
 
-       ipoib_dbg_mcast(priv, "restarting multicast task\n");
+       if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags))
+               /*
+                * Shortcut: on shutdown, flush is called next, so
+                * just let it do all the work.
+                */
+               return;
 
-       /*
-        * We're running on the priv->wq right now, so we can't call
-        * mcast_stop_thread as it wants to flush the wq and that
-        * will deadlock.  We don't actually *need* to stop the
-        * thread here anyway, so just clear the run flag, cancel
-        * any delayed work, do our work, remove the old entries,
-        * then restart the thread.
-        */
-       mutex_lock(&mcast_mutex);
-       clear_bit(IPOIB_MCAST_RUN, &priv->flags);
-       cancel_delayed_work(&priv->mcast_task);
-       mutex_unlock(&mcast_mutex);
+       ipoib_dbg_mcast(priv, "restarting multicast task\n");
 
        local_irq_save(flags);
        netif_addr_lock(dev);
        netif_addr_unlock(dev);
        local_irq_restore(flags);
 
-       /* We have to cancel outside of the spinlock */
+       /*
+        * make sure the in-flight joins have finished before we attempt
+        * to leave
+        */
+       list_for_each_entry_safe(mcast, tmcast, &remove_list, list)
+               if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags))
+                       wait_for_completion(&mcast->done);
+
        list_for_each_entry_safe(mcast, tmcast, &remove_list, list) {
                ipoib_mcast_leave(mcast->dev, mcast);
                ipoib_mcast_free(mcast);
        }
 
-       if (test_bit(IPOIB_FLAG_OPER_UP, &priv->flags))
-               ipoib_mcast_start_thread(dev);
+       /*
+        * Double check that we are still up
+        */
+       if (test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) {
+               spin_lock_irqsave(&priv->lock, flags);
+               __ipoib_mcast_schedule_join_thread(priv, NULL, 0);
+               spin_unlock_irqrestore(&priv->lock, flags);
+       }
 }
 
 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG