seg->qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
        seg->start_addr = 0;
 
-       err = mlx5_core_create_mkey(&dev->mdev, &mr, in, sizeof(*in));
+       err = mlx5_core_create_mkey(&dev->mdev, &mr, in, sizeof(*in),
+                                   NULL, NULL, NULL);
        if (err) {
                mlx5_ib_warn(dev, "failed to create mkey, %d\n", err);
                goto err_in;
 
        int                     npages;
        struct completion       done;
        enum ib_wc_status       status;
+       struct mlx5_ib_dev     *dev;
+       struct mlx5_create_mkey_mbox_out out;
+       unsigned long           start;
 };
 
 struct mlx5_ib_fast_reg_page_list {
        struct mlx5_ib_dev     *dev;
        struct work_struct      work;
        struct delayed_work     dwork;
+       int                     pending;
 };
 
 struct mlx5_mr_cache {
        spinlock_t                      mr_lock;
        struct mlx5_ib_resources        devr;
        struct mlx5_mr_cache            cache;
+       struct timer_list               delay_timer;
+       int                             fill_delay;
 };
 
 static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq)
 
 #include <linux/random.h>
 #include <linux/debugfs.h>
 #include <linux/export.h>
+#include <linux/delay.h>
 #include <rdma/ib_umem.h>
 #include "mlx5_ib.h"
 
 enum {
        DEF_CACHE_SIZE  = 10,
+       MAX_PENDING_REG_MR = 8,
 };
 
 enum {
                return order - cache->ent[0].order;
 }
 
+static void reg_mr_callback(int status, void *context)
+{
+       struct mlx5_ib_mr *mr = context;
+       struct mlx5_ib_dev *dev = mr->dev;
+       struct mlx5_mr_cache *cache = &dev->cache;
+       int c = order2idx(dev, mr->order);
+       struct mlx5_cache_ent *ent = &cache->ent[c];
+       u8 key;
+       unsigned long delta = jiffies - mr->start;
+       unsigned long index;
+       unsigned long flags;
+
+       index = find_last_bit(&delta, 8 * sizeof(delta));
+       if (index == 64)
+               index = 0;
+
+       spin_lock_irqsave(&ent->lock, flags);
+       ent->pending--;
+       spin_unlock_irqrestore(&ent->lock, flags);
+       if (status) {
+               mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status);
+               kfree(mr);
+               dev->fill_delay = 1;
+               mod_timer(&dev->delay_timer, jiffies + HZ);
+               return;
+       }
+
+       if (mr->out.hdr.status) {
+               mlx5_ib_warn(dev, "failed - status %d, syndorme 0x%x\n",
+                            mr->out.hdr.status,
+                            be32_to_cpu(mr->out.hdr.syndrome));
+               kfree(mr);
+               dev->fill_delay = 1;
+               mod_timer(&dev->delay_timer, jiffies + HZ);
+               return;
+       }
+
+       spin_lock_irqsave(&dev->mdev.priv.mkey_lock, flags);
+       key = dev->mdev.priv.mkey_key++;
+       spin_unlock_irqrestore(&dev->mdev.priv.mkey_lock, flags);
+       mr->mmr.key = mlx5_idx_to_mkey(be32_to_cpu(mr->out.mkey) & 0xffffff) | key;
+
+       cache->last_add = jiffies;
+
+       spin_lock_irqsave(&ent->lock, flags);
+       list_add_tail(&mr->list, &ent->head);
+       ent->cur++;
+       ent->size++;
+       spin_unlock_irqrestore(&ent->lock, flags);
+}
+
 static int add_keys(struct mlx5_ib_dev *dev, int c, int num)
 {
        struct mlx5_mr_cache *cache = &dev->cache;
                return -ENOMEM;
 
        for (i = 0; i < num; i++) {
+               if (ent->pending >= MAX_PENDING_REG_MR) {
+                       err = -EAGAIN;
+                       break;
+               }
+
                mr = kzalloc(sizeof(*mr), GFP_KERNEL);
                if (!mr) {
                        err = -ENOMEM;
-                       goto out;
+                       break;
                }
                mr->order = ent->order;
                mr->umred = 1;
+               mr->dev = dev;
                in->seg.status = 1 << 6;
                in->seg.xlt_oct_size = cpu_to_be32((npages + 1) / 2);
                in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
                in->seg.flags = MLX5_ACCESS_MODE_MTT | MLX5_PERM_UMR_EN;
                in->seg.log2_page_size = 12;
 
+               spin_lock_irq(&ent->lock);
+               ent->pending++;
+               spin_unlock_irq(&ent->lock);
+               mr->start = jiffies;
                err = mlx5_core_create_mkey(&dev->mdev, &mr->mmr, in,
-                                           sizeof(*in));
+                                           sizeof(*in), reg_mr_callback,
+                                           mr, &mr->out);
                if (err) {
                        mlx5_ib_warn(dev, "create mkey failed %d\n", err);
                        kfree(mr);
-                       goto out;
+                       break;
                }
-               cache->last_add = jiffies;
-
-               spin_lock(&ent->lock);
-               list_add_tail(&mr->list, &ent->head);
-               ent->cur++;
-               ent->size++;
-               spin_unlock(&ent->lock);
        }
 
-out:
        kfree(in);
        return err;
 }
        int i;
 
        for (i = 0; i < num; i++) {
-               spin_lock(&ent->lock);
+               spin_lock_irq(&ent->lock);
                if (list_empty(&ent->head)) {
-                       spin_unlock(&ent->lock);
+                       spin_unlock_irq(&ent->lock);
                        return;
                }
                mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
                list_del(&mr->list);
                ent->cur--;
                ent->size--;
-               spin_unlock(&ent->lock);
+               spin_unlock_irq(&ent->lock);
                err = mlx5_core_destroy_mkey(&dev->mdev, &mr->mmr);
                if (err)
                        mlx5_ib_warn(dev, "failed destroy mkey\n");
                return -EINVAL;
 
        if (var > ent->size) {
-               err = add_keys(dev, c, var - ent->size);
-               if (err)
-                       return err;
+               do {
+                       err = add_keys(dev, c, var - ent->size);
+                       if (err && err != -EAGAIN)
+                               return err;
+
+                       usleep_range(3000, 5000);
+               } while (err);
        } else if (var < ent->size) {
                remove_keys(dev, c, ent->size - var);
        }
        struct mlx5_ib_dev *dev = ent->dev;
        struct mlx5_mr_cache *cache = &dev->cache;
        int i = order2idx(dev, ent->order);
+       int err;
 
        if (cache->stopped)
                return;
 
        ent = &dev->cache.ent[i];
-       if (ent->cur < 2 * ent->limit) {
-               add_keys(dev, i, 1);
-               if (ent->cur < 2 * ent->limit)
-                       queue_work(cache->wq, &ent->work);
+       if (ent->cur < 2 * ent->limit && !dev->fill_delay) {
+               err = add_keys(dev, i, 1);
+               if (ent->cur < 2 * ent->limit) {
+                       if (err == -EAGAIN) {
+                               mlx5_ib_dbg(dev, "returned eagain, order %d\n",
+                                           i + 2);
+                               queue_delayed_work(cache->wq, &ent->dwork,
+                                                  msecs_to_jiffies(3));
+                       } else if (err) {
+                               mlx5_ib_warn(dev, "command failed order %d, err %d\n",
+                                            i + 2, err);
+                               queue_delayed_work(cache->wq, &ent->dwork,
+                                                  msecs_to_jiffies(1000));
+                       } else {
+                               queue_work(cache->wq, &ent->work);
+                       }
+               }
        } else if (ent->cur > 2 * ent->limit) {
                if (!someone_adding(cache) &&
-                   time_after(jiffies, cache->last_add + 60 * HZ)) {
+                   time_after(jiffies, cache->last_add + 300 * HZ)) {
                        remove_keys(dev, i, 1);
                        if (ent->cur > ent->limit)
                                queue_work(cache->wq, &ent->work);
                } else {
-                       queue_delayed_work(cache->wq, &ent->dwork, 60 * HZ);
+                       queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ);
                }
        }
 }
 
                mlx5_ib_dbg(dev, "order %d, cache index %d\n", ent->order, i);
 
-               spin_lock(&ent->lock);
+               spin_lock_irq(&ent->lock);
                if (!list_empty(&ent->head)) {
                        mr = list_first_entry(&ent->head, struct mlx5_ib_mr,
                                              list);
                        list_del(&mr->list);
                        ent->cur--;
-                       spin_unlock(&ent->lock);
+                       spin_unlock_irq(&ent->lock);
                        if (ent->cur < ent->limit)
                                queue_work(cache->wq, &ent->work);
                        break;
                }
-               spin_unlock(&ent->lock);
+               spin_unlock_irq(&ent->lock);
 
                queue_work(cache->wq, &ent->work);
 
                return;
        }
        ent = &cache->ent[c];
-       spin_lock(&ent->lock);
+       spin_lock_irq(&ent->lock);
        list_add_tail(&mr->list, &ent->head);
        ent->cur++;
        if (ent->cur > 2 * ent->limit)
                shrink = 1;
-       spin_unlock(&ent->lock);
+       spin_unlock_irq(&ent->lock);
 
        if (shrink)
                queue_work(cache->wq, &ent->work);
 
        cancel_delayed_work(&ent->dwork);
        while (1) {
-               spin_lock(&ent->lock);
+               spin_lock_irq(&ent->lock);
                if (list_empty(&ent->head)) {
-                       spin_unlock(&ent->lock);
+                       spin_unlock_irq(&ent->lock);
                        return;
                }
                mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
                list_del(&mr->list);
                ent->cur--;
                ent->size--;
-               spin_unlock(&ent->lock);
+               spin_unlock_irq(&ent->lock);
                err = mlx5_core_destroy_mkey(&dev->mdev, &mr->mmr);
                if (err)
                        mlx5_ib_warn(dev, "failed destroy mkey\n");
        debugfs_remove_recursive(dev->cache.root);
 }
 
+static void delay_time_func(unsigned long ctx)
+{
+       struct mlx5_ib_dev *dev = (struct mlx5_ib_dev *)ctx;
+
+       dev->fill_delay = 0;
+}
+
 int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
 {
        struct mlx5_mr_cache *cache = &dev->cache;
                return -ENOMEM;
        }
 
+       setup_timer(&dev->delay_timer, delay_time_func, (unsigned long)dev);
        for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
                INIT_LIST_HEAD(&cache->ent[i].head);
                spin_lock_init(&cache->ent[i].lock);
                clean_keys(dev, i);
 
        destroy_workqueue(dev->cache.wq);
+       del_timer_sync(&dev->delay_timer);
 
        return 0;
 }
        seg->qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
        seg->start_addr = 0;
 
-       err = mlx5_core_create_mkey(mdev, &mr->mmr, in, sizeof(*in));
+       err = mlx5_core_create_mkey(mdev, &mr->mmr, in, sizeof(*in), NULL, NULL,
+                                   NULL);
        if (err)
                goto err_in;
 
        int err;
        int i;
 
-       for (i = 0; i < 10; i++) {
+       for (i = 0; i < 1; i++) {
                mr = alloc_cached_mr(dev, order);
                if (mr)
                        break;
 
                err = add_keys(dev, order2idx(dev, order), 1);
-               if (err) {
-                       mlx5_ib_warn(dev, "add_keys failed\n");
+               if (err && err != -EAGAIN) {
+                       mlx5_ib_warn(dev, "add_keys failed, err %d\n", err);
                        break;
                }
        }
        in->seg.xlt_oct_size = cpu_to_be32(get_octo_len(virt_addr, length, 1 << page_shift));
        in->seg.log2_page_size = page_shift;
        in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
-       in->xlat_oct_act_size = cpu_to_be32(get_octo_len(virt_addr, length, 1 << page_shift));
-       err = mlx5_core_create_mkey(&dev->mdev, &mr->mmr, in, inlen);
+       in->xlat_oct_act_size = cpu_to_be32(get_octo_len(virt_addr, length,
+                                                        1 << page_shift));
+       err = mlx5_core_create_mkey(&dev->mdev, &mr->mmr, in, inlen, NULL,
+                                   NULL, NULL);
        if (err) {
                mlx5_ib_warn(dev, "create mkey failed\n");
                goto err_2;
         * TBD not needed - issue 197292 */
        in->seg.log2_page_size = PAGE_SHIFT;
 
-       err = mlx5_core_create_mkey(&dev->mdev, &mr->mmr, in, sizeof(*in));
+       err = mlx5_core_create_mkey(&dev->mdev, &mr->mmr, in, sizeof(*in), NULL,
+                                   NULL, NULL);
        kfree(in);
        if (err)
                goto err_free;
 
                        MLX5_MKEY_MASK_PD               |
                        MLX5_MKEY_MASK_LR               |
                        MLX5_MKEY_MASK_LW               |
+                       MLX5_MKEY_MASK_KEY              |
                        MLX5_MKEY_MASK_RR               |
                        MLX5_MKEY_MASK_RW               |
                        MLX5_MKEY_MASK_A                |
        seg->start_addr = cpu_to_be64(wr->wr.fast_reg.iova_start);
        seg->len = cpu_to_be64(wr->wr.fast_reg.length);
        seg->log2_page_size = wr->wr.fast_reg.page_shift;
-       seg->qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
+       seg->qpn_mkey7_0 = cpu_to_be32(0xffffff00 |
+                                      mlx5_mkey_variant(wr->wr.fast_reg.rkey));
 }
 
 static void set_frwr_pages(struct mlx5_wqe_data_seg *dseg,
 
 static struct mlx5_cmd_work_ent *alloc_cmd(struct mlx5_cmd *cmd,
                                           struct mlx5_cmd_msg *in,
                                           struct mlx5_cmd_msg *out,
+                                          void *uout, int uout_size,
                                           mlx5_cmd_cbk_t cbk,
                                           void *context, int page_queue)
 {
 
        ent->in         = in;
        ent->out        = out;
+       ent->uout       = uout;
+       ent->uout_size  = uout_size;
        ent->callback   = cbk;
        ent->context    = context;
        ent->cmd        = cmd;
        ent->lay = lay;
        memset(lay, 0, sizeof(*lay));
        memcpy(lay->in, ent->in->first.data, sizeof(lay->in));
+       ent->op = be32_to_cpu(lay->in[0]) >> 16;
        if (ent->in->next)
                lay->in_ptr = cpu_to_be64(ent->in->next->dma);
        lay->inlen = cpu_to_be32(ent->in->len);
  *    2. page queue commands do not support asynchrous completion
  */
 static int mlx5_cmd_invoke(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *in,
-                          struct mlx5_cmd_msg *out, mlx5_cmd_cbk_t callback,
+                          struct mlx5_cmd_msg *out, void *uout, int uout_size,
+                          mlx5_cmd_cbk_t callback,
                           void *context, int page_queue, u8 *status)
 {
        struct mlx5_cmd *cmd = &dev->cmd;
        if (callback && page_queue)
                return -EINVAL;
 
-       ent = alloc_cmd(cmd, in, out, callback, context, page_queue);
+       ent = alloc_cmd(cmd, in, out, uout, uout_size, callback, context,
+                       page_queue);
        if (IS_ERR(ent))
                return PTR_ERR(ent);
 
                op = be16_to_cpu(((struct mlx5_inbox_hdr *)in->first.data)->opcode);
                if (op < ARRAY_SIZE(cmd->stats)) {
                        stats = &cmd->stats[op];
-                       spin_lock(&stats->lock);
+                       spin_lock_irq(&stats->lock);
                        stats->sum += ds;
                        ++stats->n;
-                       spin_unlock(&stats->lock);
+                       spin_unlock_irq(&stats->lock);
                }
                mlx5_core_dbg_mask(dev, 1 << MLX5_CMD_TIME,
                                   "fw exec time for %s is %lld nsec\n",
        int n;
        int i;
 
-       msg = kzalloc(sizeof(*msg), GFP_KERNEL);
+       msg = kzalloc(sizeof(*msg), flags);
        if (!msg)
                return ERR_PTR(-ENOMEM);
 
                up(&cmd->sem);
 }
 
+static void free_msg(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *msg)
+{
+       unsigned long flags;
+
+       if (msg->cache) {
+               spin_lock_irqsave(&msg->cache->lock, flags);
+               list_add_tail(&msg->list, &msg->cache->head);
+               spin_unlock_irqrestore(&msg->cache->lock, flags);
+       } else {
+               mlx5_free_cmd_msg(dev, msg);
+       }
+}
+
 void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, unsigned long vector)
 {
        struct mlx5_cmd *cmd = &dev->cmd;
        void *context;
        int err;
        int i;
+       ktime_t t1, t2, delta;
+       s64 ds;
+       struct mlx5_cmd_stats *stats;
+       unsigned long flags;
 
        for (i = 0; i < (1 << cmd->log_sz); i++) {
                if (test_bit(i, &vector)) {
                        }
                        free_ent(cmd, ent->idx);
                        if (ent->callback) {
+                               t1 = timespec_to_ktime(ent->ts1);
+                               t2 = timespec_to_ktime(ent->ts2);
+                               delta = ktime_sub(t2, t1);
+                               ds = ktime_to_ns(delta);
+                               if (ent->op < ARRAY_SIZE(cmd->stats)) {
+                                       stats = &cmd->stats[ent->op];
+                                       spin_lock_irqsave(&stats->lock, flags);
+                                       stats->sum += ds;
+                                       ++stats->n;
+                                       spin_unlock_irqrestore(&stats->lock, flags);
+                               }
+
                                callback = ent->callback;
                                context = ent->context;
                                err = ent->ret;
+                               if (!err)
+                                       err = mlx5_copy_from_msg(ent->uout,
+                                                                ent->out,
+                                                                ent->uout_size);
+
+                               mlx5_free_cmd_msg(dev, ent->out);
+                               free_msg(dev, ent->in);
+
                                free_cmd(ent);
                                callback(err, context);
                        } else {
        return status ? -1 : 0; /* TBD more meaningful codes */
 }
 
-static struct mlx5_cmd_msg *alloc_msg(struct mlx5_core_dev *dev, int in_size)
+static struct mlx5_cmd_msg *alloc_msg(struct mlx5_core_dev *dev, int in_size,
+                                     gfp_t gfp)
 {
        struct mlx5_cmd_msg *msg = ERR_PTR(-ENOMEM);
        struct mlx5_cmd *cmd = &dev->cmd;
                ent = &cmd->cache.med;
 
        if (ent) {
-               spin_lock(&ent->lock);
+               spin_lock_irq(&ent->lock);
                if (!list_empty(&ent->head)) {
                        msg = list_entry(ent->head.next, typeof(*msg), list);
                        /* For cached lists, we must explicitly state what is
                        msg->len = in_size;
                        list_del(&msg->list);
                }
-               spin_unlock(&ent->lock);
+               spin_unlock_irq(&ent->lock);
        }
 
        if (IS_ERR(msg))
-               msg = mlx5_alloc_cmd_msg(dev, GFP_KERNEL, in_size);
+               msg = mlx5_alloc_cmd_msg(dev, gfp, in_size);
 
        return msg;
 }
 
-static void free_msg(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *msg)
-{
-       if (msg->cache) {
-               spin_lock(&msg->cache->lock);
-               list_add_tail(&msg->list, &msg->cache->head);
-               spin_unlock(&msg->cache->lock);
-       } else {
-               mlx5_free_cmd_msg(dev, msg);
-       }
-}
-
 static int is_manage_pages(struct mlx5_inbox_hdr *in)
 {
        return be16_to_cpu(in->opcode) == MLX5_CMD_OP_MANAGE_PAGES;
 }
 
-int mlx5_cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out,
-                 int out_size)
+static int cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out,
+                   int out_size, mlx5_cmd_cbk_t callback, void *context)
 {
        struct mlx5_cmd_msg *inb;
        struct mlx5_cmd_msg *outb;
        int pages_queue;
+       gfp_t gfp;
        int err;
        u8 status = 0;
 
        pages_queue = is_manage_pages(in);
+       gfp = callback ? GFP_ATOMIC : GFP_KERNEL;
 
-       inb = alloc_msg(dev, in_size);
+       inb = alloc_msg(dev, in_size, gfp);
        if (IS_ERR(inb)) {
                err = PTR_ERR(inb);
                return err;
                goto out_in;
        }
 
-       outb = mlx5_alloc_cmd_msg(dev, GFP_KERNEL, out_size);
+       outb = mlx5_alloc_cmd_msg(dev, gfp, out_size);
        if (IS_ERR(outb)) {
                err = PTR_ERR(outb);
                goto out_in;
        }
 
-       err = mlx5_cmd_invoke(dev, inb, outb, NULL, NULL, pages_queue, &status);
+       err = mlx5_cmd_invoke(dev, inb, outb, out, out_size, callback, context,
+                             pages_queue, &status);
        if (err)
                goto out_out;
 
        err = mlx5_copy_from_msg(out, outb, out_size);
 
 out_out:
-       mlx5_free_cmd_msg(dev, outb);
+       if (!callback)
+               mlx5_free_cmd_msg(dev, outb);
 
 out_in:
-       free_msg(dev, inb);
+       if (!callback)
+               free_msg(dev, inb);
        return err;
 }
+
+int mlx5_cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out,
+                 int out_size)
+{
+       return cmd_exec(dev, in, in_size, out, out_size, NULL, NULL);
+}
 EXPORT_SYMBOL(mlx5_cmd_exec);
 
+int mlx5_cmd_exec_cb(struct mlx5_core_dev *dev, void *in, int in_size,
+                    void *out, int out_size, mlx5_cmd_cbk_t callback,
+                    void *context)
+{
+       return cmd_exec(dev, in, in_size, out, out_size, callback, context);
+}
+EXPORT_SYMBOL(mlx5_cmd_exec_cb);
+
 static void destroy_msg_cache(struct mlx5_core_dev *dev)
 {
        struct mlx5_cmd *cmd = &dev->cmd;
 
                return 0;
 
        stats = filp->private_data;
-       spin_lock(&stats->lock);
+       spin_lock_irq(&stats->lock);
        if (stats->n)
                field = div64_u64(stats->sum, stats->n);
-       spin_unlock(&stats->lock);
+       spin_unlock_irq(&stats->lock);
        ret = snprintf(tbuf, sizeof(tbuf), "%llu\n", field);
        if (ret > 0) {
                if (copy_to_user(buf, tbuf, ret))
        struct mlx5_cmd_stats *stats;
 
        stats = filp->private_data;
-       spin_lock(&stats->lock);
+       spin_lock_irq(&stats->lock);
        stats->sum = 0;
        stats->n = 0;
-       spin_unlock(&stats->lock);
+       spin_unlock_irq(&stats->lock);
 
        *pos += count;
 
 
 #include "mlx5_core.h"
 
 int mlx5_core_create_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr,
-                         struct mlx5_create_mkey_mbox_in *in, int inlen)
+                         struct mlx5_create_mkey_mbox_in *in, int inlen,
+                         mlx5_cmd_cbk_t callback, void *context,
+                         struct mlx5_create_mkey_mbox_out *out)
 {
-       struct mlx5_create_mkey_mbox_out out;
+       struct mlx5_create_mkey_mbox_out lout;
        int err;
        u8 key;
 
-       memset(&out, 0, sizeof(out));
-       spin_lock(&dev->priv.mkey_lock);
+       memset(&lout, 0, sizeof(lout));
+       spin_lock_irq(&dev->priv.mkey_lock);
        key = dev->priv.mkey_key++;
-       spin_unlock(&dev->priv.mkey_lock);
+       spin_unlock_irq(&dev->priv.mkey_lock);
        in->seg.qpn_mkey7_0 |= cpu_to_be32(key);
        in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_CREATE_MKEY);
-       err = mlx5_cmd_exec(dev, in, inlen, &out, sizeof(out));
+       if (callback) {
+               err = mlx5_cmd_exec_cb(dev, in, inlen, out, sizeof(*out),
+                                      callback, context);
+               return err;
+       } else {
+               err = mlx5_cmd_exec(dev, in, inlen, &lout, sizeof(lout));
+       }
+
        if (err) {
                mlx5_core_dbg(dev, "cmd exec faile %d\n", err);
                return err;
        }
 
-       if (out.hdr.status) {
-               mlx5_core_dbg(dev, "status %d\n", out.hdr.status);
-               return mlx5_cmd_status_to_err(&out.hdr);
+       if (lout.hdr.status) {
+               mlx5_core_dbg(dev, "status %d\n", lout.hdr.status);
+               return mlx5_cmd_status_to_err(&lout.hdr);
        }
 
-       mr->key = mlx5_idx_to_mkey(be32_to_cpu(out.mkey) & 0xffffff) | key;
-       mlx5_core_dbg(dev, "out 0x%x, key 0x%x, mkey 0x%x\n", be32_to_cpu(out.mkey), key, mr->key);
+       mr->key = mlx5_idx_to_mkey(be32_to_cpu(lout.mkey) & 0xffffff) | key;
+       mlx5_core_dbg(dev, "out 0x%x, key 0x%x, mkey 0x%x\n",
+                     be32_to_cpu(lout.mkey), key, mr->key);
 
        return err;
 }
 
 struct mlx5_cmd_work_ent {
        struct mlx5_cmd_msg    *in;
        struct mlx5_cmd_msg    *out;
+       void                   *uout;
+       int                     uout_size;
        mlx5_cmd_cbk_t          callback;
        void                   *context;
-       int idx;
+       int                     idx;
        struct completion       done;
        struct mlx5_cmd        *cmd;
        struct work_struct      work;
        u8                      token;
        struct timespec         ts1;
        struct timespec         ts2;
+       u16                     op;
 };
 
 struct mlx5_pas {
 int mlx5_cmd_status_to_err(struct mlx5_outbox_hdr *hdr);
 int mlx5_cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out,
                  int out_size);
+int mlx5_cmd_exec_cb(struct mlx5_core_dev *dev, void *in, int in_size,
+                    void *out, int out_size, mlx5_cmd_cbk_t callback,
+                    void *context);
 int mlx5_cmd_alloc_uar(struct mlx5_core_dev *dev, u32 *uarn);
 int mlx5_cmd_free_uar(struct mlx5_core_dev *dev, u32 uarn);
 int mlx5_alloc_uuars(struct mlx5_core_dev *dev, struct mlx5_uuar_info *uuari);
 int mlx5_core_arm_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
                      u16 lwm, int is_srq);
 int mlx5_core_create_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr,
-                         struct mlx5_create_mkey_mbox_in *in, int inlen);
+                         struct mlx5_create_mkey_mbox_in *in, int inlen,
+                         mlx5_cmd_cbk_t callback, void *context,
+                         struct mlx5_create_mkey_mbox_out *out);
 int mlx5_core_destroy_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr);
 int mlx5_core_query_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr,
                         struct mlx5_query_mkey_mbox_out *out, int outlen);
        return mkey_idx << 8;
 }
 
+static inline u8 mlx5_mkey_variant(u32 mkey)
+{
+       return mkey & 0xff;
+}
+
 enum {
        MLX5_PROF_MASK_QP_SIZE          = (u64)1 << 0,
        MLX5_PROF_MASK_MR_CACHE         = (u64)1 << 1,