        struct mutex tx_lock;
        struct request *pending;
        int sent;
+       bool dead;
+       int fallback_index;
 };
 
 #define NBD_TIMEDOUT                   0
 
 struct nbd_cmd {
        struct nbd_device *nbd;
+       int index;
        struct completion send_complete;
 };
 
        return "invalid";
 }
 
+static void nbd_mark_nsock_dead(struct nbd_sock *nsock)
+{
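+       /*
+        * Callers hold nsock->tx_lock.  Shut the socket down the first time
+        * it is marked dead and forget any partially sent request so the
+        * stale send state is not reused.
+        */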
+       if (!nsock->dead)
+               kernel_sock_shutdown(nsock->sock, SHUT_RDWR);
+       nsock->dead = true;
+       nsock->pending = NULL;
+       nsock->sent = 0;
+}
+
 static int nbd_size_clear(struct nbd_device *nbd, struct block_device *bdev)
 {
        if (bdev->bd_openers <= 1)
        struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
        struct nbd_device *nbd = cmd->nbd;
 
-       dev_err(nbd_to_dev(nbd), "Connection timed out, shutting down connection\n");
+       if (nbd->num_connections > 1) {
+               dev_err_ratelimited(nbd_to_dev(nbd),
+                                   "Connection timed out, retrying\n");
+               mutex_lock(&nbd->config_lock);
+               /*
+                * Hooray, we have more connections: requeue this IO and the
+                * submit path will put it on a live connection.
+                */
+               if (nbd->socks && nbd->num_connections > 1) {
+                       if (cmd->index < nbd->num_connections) {
+                               struct nbd_sock *nsock =
+                                       nbd->socks[cmd->index];
+                               mutex_lock(&nsock->tx_lock);
+                               nbd_mark_nsock_dead(nsock);
+                               mutex_unlock(&nsock->tx_lock);
+                       }
+                       mutex_unlock(&nbd->config_lock);
+                       blk_mq_requeue_request(req, true);
+                       return BLK_EH_NOT_HANDLED;
+               }
+               mutex_unlock(&nbd->config_lock);
+       } else {
+               dev_err_ratelimited(nbd_to_dev(nbd),
+                                   "Connection timed out\n");
+       }
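+       /*
+        * Single connection, or requeueing was not possible: flag the
+        * timeout and fail the request below.
+        */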
        set_bit(NBD_TIMEDOUT, &nbd->runtime_flags);
        req->errors = -EIO;
 
                }
                iov_iter_advance(&from, sent);
        }
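+       /* Record the connection used so the timeout handler can find it. */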
+       cmd->index = index;
        request.type = htonl(type);
        if (type != NBD_CMD_FLUSH) {
                request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9);
                }
                dev_err_ratelimited(disk_to_dev(nbd->disk),
                        "Send control failed (result %d)\n", result);
-               return -EIO;
+               return -EAGAIN;
        }
 send_pages:
        if (type != NBD_CMD_WRITE)
                                dev_err(disk_to_dev(nbd->disk),
                                        "Send data failed (result %d)\n",
                                        result);
-                               return -EIO;
+                               return -EAGAIN;
                        }
                        /*
                         * The completion might already have come in,
        return 0;
 }
 
+static int nbd_disconnected(struct nbd_device *nbd)
+{
+       return test_bit(NBD_DISCONNECTED, &nbd->runtime_flags) ||
+               test_bit(NBD_DISCONNECT_REQUESTED, &nbd->runtime_flags);
+}
+
 /* NULL returned = something went wrong, inform userspace */
 static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index)
 {
        iov_iter_kvec(&to, READ | ITER_KVEC, &iov, 1, sizeof(reply));
        result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL, NULL);
        if (result <= 0) {
-               if (!test_bit(NBD_DISCONNECTED, &nbd->runtime_flags) &&
-                   !test_bit(NBD_DISCONNECT_REQUESTED, &nbd->runtime_flags))
+               if (!nbd_disconnected(nbd))
                        dev_err(disk_to_dev(nbd->disk),
                                "Receive control failed (result %d)\n", result);
                return ERR_PTR(result);
                        if (result <= 0) {
                                dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n",
                                        result);
-                               req->errors = -EIO;
-                               return cmd;
+                               /*
+                                * If we've disconnected, or we only have one
+                                * connection, we need to complete this
+                                * request here; otherwise error out and let
+                                * the timeout path resubmit it on another
+                                * connection.
+                                */
+                               if (nbd_disconnected(nbd) ||
+                                   nbd->num_connections <= 1) {
+                                       req->errors = -EIO;
+                                       return cmd;
+                               }
+                               return ERR_PTR(-EIO);
                        }
                        dev_dbg(nbd_to_dev(nbd), "request %p: got %d bytes data\n",
                                cmd, bvec.bv_len);
        while (1) {
                cmd = nbd_read_stat(nbd, args->index);
                if (IS_ERR(cmd)) {
+                       struct nbd_sock *nsock = nbd->socks[args->index];
+
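+                       /*
+                        * The receive side failed on this socket; mark it
+                        * dead so the submit path stops using it.
+                        */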
+                       mutex_lock(&nsock->tx_lock);
+                       nbd_mark_nsock_dead(nsock);
+                       mutex_unlock(&nsock->tx_lock);
                        ret = PTR_ERR(cmd);
                        break;
                }
 
                nbd_end_request(cmd);
        }
-
-       /*
-        * We got an error, shut everybody down if this wasn't the result of a
-        * disconnect request.
-        */
-       if (ret && !test_bit(NBD_DISCONNECT_REQUESTED, &nbd->runtime_flags))
-               sock_shutdown(nbd);
        atomic_dec(&nbd->recv_threads);
        wake_up(&nbd->recv_wq);
 }
        dev_dbg(disk_to_dev(nbd->disk), "queue cleared\n");
 }
 
+static int find_fallback(struct nbd_device *nbd, int index)
+{
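+       /*
+        * The socket at @index has been marked dead; pick another live
+        * connection to retry on.  Returns the new index, or a negative
+        * value if no usable connection remains.
+        */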
+       int new_index = -1;
+       struct nbd_sock *nsock = nbd->socks[index];
+       int fallback = nsock->fallback_index;
+
+       if (test_bit(NBD_DISCONNECTED, &nbd->runtime_flags))
+               return new_index;
+
+       if (nbd->num_connections <= 1) {
+               dev_err_ratelimited(disk_to_dev(nbd->disk),
+                                   "Attempted send on invalid socket\n");
+               return new_index;
+       }
+
+       if (fallback >= 0 && fallback < nbd->num_connections &&
+           !nbd->socks[fallback]->dead)
+               return fallback;
+
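+       /*
+        * The cached fallback is missing or dead; scan for another live
+        * socket and remember it for the next caller.
+        */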
+       if (nsock->fallback_index < 0 ||
+           nsock->fallback_index >= nbd->num_connections ||
+           nbd->socks[nsock->fallback_index]->dead) {
+               int i;
+               for (i = 0; i < nbd->num_connections; i++) {
+                       if (i == index)
+                               continue;
+                       if (!nbd->socks[i]->dead) {
+                               new_index = i;
+                               break;
+                       }
+               }
+               nsock->fallback_index = new_index;
+               if (new_index < 0) {
+                       dev_err_ratelimited(disk_to_dev(nbd->disk),
+                                           "Dead connection, failed to find a fallback\n");
+                       return new_index;
+               }
+       }
+       new_index = nsock->fallback_index;
+       return new_index;
+}
 
 static int nbd_handle_cmd(struct nbd_cmd *cmd, int index)
 {
                                    "Attempted send on invalid socket\n");
                return -EINVAL;
        }
-
-       if (test_bit(NBD_DISCONNECTED, &nbd->runtime_flags)) {
-               dev_err_ratelimited(disk_to_dev(nbd->disk),
-                                   "Attempted send on closed socket\n");
-               return -EINVAL;
-       }
-
        req->errors = 0;
-
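+       /*
+        * We come back here if the socket we picked turns out to be dead
+        * or a send fails with -EAGAIN, so the request can be retried on
+        * a fallback connection.
+        */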
+again:
        nsock = nbd->socks[index];
        mutex_lock(&nsock->tx_lock);
-       if (unlikely(!nsock->sock)) {
+       if (nsock->dead) {
+               index = find_fallback(nbd, index);
                mutex_unlock(&nsock->tx_lock);
-               dev_err_ratelimited(disk_to_dev(nbd->disk),
-                                   "Attempted send on closed socket\n");
-               return -EINVAL;
+               if (index < 0)
+                       return -EIO;
+               goto again;
        }
 
        /* Handle the case that we have a pending request that was partially
                ret = 0;
                goto out;
        }
+       /*
+        * Some failures are related to the link going down, so anything that
+        * returns EAGAIN can be retried on a different socket.
+        */
        ret = nbd_send_cmd(nbd, cmd, index);
+       if (ret == -EAGAIN) {
+               dev_err_ratelimited(disk_to_dev(nbd->disk),
+                                   "Request send failed, trying another connection\n");
+               nbd_mark_nsock_dead(nsock);
+               mutex_unlock(&nsock->tx_lock);
+               goto again;
+       }
 out:
        mutex_unlock(&nsock->tx_lock);
        return ret;
 
        nbd->socks = socks;
 
+       nsock->fallback_index = -1;
+       nsock->dead = false;
        mutex_init(&nsock->tx_lock);
        nsock->sock = sock;
        nsock->pending = NULL;