--- /dev/null
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2025, Christoph Hellwig.
+ * Copyright (c) 2025, Western Digital Corporation or its affiliates.
+ *
+ * Zoned Loop Device driver - exports a zoned block device using one file per
+ * zone as backing storage.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/blk-mq.h>
+#include <linux/blkzoned.h>
+#include <linux/pagemap.h>
+#include <linux/miscdevice.h>
+#include <linux/falloc.h>
+#include <linux/mutex.h>
+#include <linux/parser.h>
+#include <linux/seq_file.h>
+
+/*
+ * Options for adding (and removing) a device.
+ *
+ * Every option except ZLOOP_OPT_ERR is a distinct bit, so several options can
+ * be OR-ed together into a single value. ZLOOP_OPT_ERR (0) is the value
+ * carried by the terminating entry of the zloop_opt_tokens table.
+ */
+enum {
+       ZLOOP_OPT_ERR                   = 0,
+       ZLOOP_OPT_ID                    = (1 << 0),
+       ZLOOP_OPT_CAPACITY              = (1 << 1),
+       ZLOOP_OPT_ZONE_SIZE             = (1 << 2),
+       ZLOOP_OPT_ZONE_CAPACITY         = (1 << 3),
+       ZLOOP_OPT_NR_CONV_ZONES         = (1 << 4),
+       ZLOOP_OPT_BASE_DIR              = (1 << 5),
+       ZLOOP_OPT_NR_QUEUES             = (1 << 6),
+       ZLOOP_OPT_QUEUE_DEPTH           = (1 << 7),
+       ZLOOP_OPT_BUFFERED_IO           = (1 << 8),
+};
+
+/*
+ * Match patterns for the option strings, one entry per ZLOOP_OPT_* value.
+ * "buffered_io" is the only option that takes no argument. The table ends
+ * with the ZLOOP_OPT_ERR entry, which has a NULL pattern.
+ */
+static const match_table_t zloop_opt_tokens = {
+       { ZLOOP_OPT_ID,                 "id=%d" },
+       { ZLOOP_OPT_CAPACITY,           "capacity_mb=%u"        },
+       { ZLOOP_OPT_ZONE_SIZE,          "zone_size_mb=%u"       },
+       { ZLOOP_OPT_ZONE_CAPACITY,      "zone_capacity_mb=%u"   },
+       { ZLOOP_OPT_NR_CONV_ZONES,      "conv_zones=%u"         },
+       { ZLOOP_OPT_BASE_DIR,           "base_dir=%s"           },
+       { ZLOOP_OPT_NR_QUEUES,          "nr_queues=%u"          },
+       { ZLOOP_OPT_QUEUE_DEPTH,        "queue_depth=%u"        },
+       { ZLOOP_OPT_BUFFERED_IO,        "buffered_io"           },
+       { ZLOOP_OPT_ERR,                NULL                    }
+};
+
+/*
+ * Default values for the "add" operation.
+ *
+ * ZLOOP_DEF_ID is negative: zloop_ctl_add() only requests a specific ID when
+ * the ID is >= 0. ZLOOP_DEF_ZONE_SIZE is 256 MiB converted to sectors with
+ * SECTOR_SHIFT.
+ */
+#define ZLOOP_DEF_ID                   -1
+#define ZLOOP_DEF_ZONE_SIZE            ((256ULL * SZ_1M) >> SECTOR_SHIFT)
+#define ZLOOP_DEF_NR_ZONES             64
+#define ZLOOP_DEF_NR_CONV_ZONES                8
+#define ZLOOP_DEF_BASE_DIR             "/var/local/zloop"
+#define ZLOOP_DEF_NR_QUEUES            1
+#define ZLOOP_DEF_QUEUE_DEPTH          128
+#define ZLOOP_DEF_BUFFERED_IO          false
+
+/* Arbitrary limit on the zone size (16GB), expressed in MB. */
+#define ZLOOP_MAX_ZONE_SIZE_MB         16384
+
+/*
+ * Parameters of an "add" (or "remove") operation, as consumed by
+ * zloop_ctl_add().
+ *
+ * @mask:          presumably the OR of the ZLOOP_OPT_* bits that were given;
+ *                 the code filling it is not in this part of the file
+ * @id:            device ID; a value >= 0 requests that specific ID, a
+ *                 negative value lets the IDR pick one
+ * @capacity:      device capacity in sectors
+ * @zone_size:     zone size in sectors (also used as the queue chunk_sectors)
+ * @zone_capacity: zone capacity in sectors; 0 means "same as the zone size"
+ * @nr_conv_zones: number of conventional zones, must be lower than the total
+ *                 number of zones
+ * @base_dir:      base directory of the zone files; NULL selects
+ *                 ZLOOP_DEF_BASE_DIR
+ * @nr_queues:     number of hardware queues of the tag set
+ * @queue_depth:   queue depth of the tag set
+ * @buffered_io:   if false, zone files are opened with O_DIRECT
+ */
+struct zloop_options {
+       unsigned int            mask;
+       int                     id;
+       sector_t                capacity;
+       sector_t                zone_size;
+       sector_t                zone_capacity;
+       unsigned int            nr_conv_zones;
+       char                    *base_dir;
+       unsigned int            nr_queues;
+       unsigned int            queue_depth;
+       bool                    buffered_io;
+};
+
+/*
+ * Device states.
+ *
+ * A device starts in Zlo_creating when it is allocated by zloop_ctl_add() and
+ * becomes Zlo_live once its disk has been added. zloop_open() only succeeds
+ * for a Zlo_live device and zloop_queue_rq() fails all requests of a
+ * Zlo_deleting device.
+ */
+enum {
+       Zlo_creating = 0,
+       Zlo_live,
+       Zlo_deleting,
+};
+
+/*
+ * Bit numbers for the flags field of struct zloop_zone (used with set_bit(),
+ * test_bit() and friends).
+ *
+ * ZLOOP_ZONE_CONV:      the zone is a conventional zone.
+ * ZLOOP_ZONE_SEQ_ERROR: the state of a sequential zone may not match its
+ *                       backing file (failed write, truncate or stat); the
+ *                       zone is resynchronized with zloop_update_seq_zone()
+ *                       or by a successful reset/finish.
+ */
+enum zloop_zone_flags {
+       ZLOOP_ZONE_CONV = 0,
+       ZLOOP_ZONE_SEQ_ERROR,
+};
+
+/*
+ * Per-zone state.
+ *
+ * @file:         backing file of the zone ("cnv-NNNNNN" or "seq-NNNNNN" in the
+ *                device data directory)
+ * @flags:        ZLOOP_ZONE_* bits
+ * @lock:         taken around updates and reads of @cond and @wp
+ * @cond:         zone condition; BLK_ZONE_COND_NOT_WP for conventional zones
+ * @start:        first sector of the zone
+ * @wp:           write pointer position (sector); U64_MAX for conventional
+ *                zones
+ * @old_gfp_mask: gfp mask applied to the file mapping by zloop_free_disk()
+ */
+struct zloop_zone {
+       struct file             *file;
+
+       unsigned long           flags;
+       struct mutex            lock;
+       enum blk_zone_cond      cond;
+       sector_t                start;
+       sector_t                wp;
+
+       gfp_t                   old_gfp_mask;
+};
+
+/*
+ * A zoned loop device.
+ *
+ * @id:            ID allocated from zloop_index_idr; also used for the disk
+ *                 name ("zloop<id>") and the data directory name
+ * @state:         one of the Zlo_* device states
+ * @tag_set:       blk-mq tag set of the device
+ * @disk:          the gendisk exposed to the block layer
+ * @workqueue:     workqueue executing the commands queued by zloop_queue_rq()
+ * @buffered_io:   if false, zone file I/O is issued with IOCB_DIRECT
+ * @base_dir:      allocated copy of the base directory path
+ * @data_dir:      open handle on "<base_dir>/<id>"; its superblock is the one
+ *                 synced on REQ_OP_FLUSH
+ * @zone_shift:    ilog2() of the zone size in sectors
+ * @zone_size:     zone size in sectors
+ * @zone_capacity: zone capacity in sectors
+ * @nr_zones:      total number of zones
+ * @nr_conv_zones: number of conventional zones; they are the first zones of
+ *                 the device
+ * @block_size:    block size in bytes, used as both the logical and physical
+ *                 block size of the disk
+ * @zones:         per-zone state, @nr_zones entries
+ */
+struct zloop_device {
+       unsigned int            id;
+       unsigned int            state;
+
+       struct blk_mq_tag_set   tag_set;
+       struct gendisk          *disk;
+
+       struct workqueue_struct *workqueue;
+       bool                    buffered_io;
+
+       const char              *base_dir;
+       struct file             *data_dir;
+
+       unsigned int            zone_shift;
+       sector_t                zone_size;
+       sector_t                zone_capacity;
+       unsigned int            nr_zones;
+       unsigned int            nr_conv_zones;
+       unsigned int            block_size;
+
+       struct zloop_zone       zones[] __counted_by(nr_zones);
+};
+
+/*
+ * Per-request driver data (the tag set cmd_size is sizeof(struct zloop_cmd)).
+ *
+ * @work:       work item queued on the device workqueue
+ * @ref:        reference count set to 2 by zloop_rw(); the request is
+ *              completed when it drops to 0 in zloop_put_cmd()
+ * @sector:     start sector of the I/O; for zone append, the write pointer
+ *              position the data was written at
+ * @nr_sectors: number of sectors of the I/O
+ * @ret:        result of the command: a negative errno on failure, otherwise
+ *              the value returned by the file I/O or zone operation
+ * @iocb:       kiocb used to issue the zone file I/O
+ * @bvec:       bio_vec array allocated for multi-bio requests, freed by
+ *              zloop_put_cmd()
+ */
+struct zloop_cmd {
+       struct work_struct      work;
+       atomic_t                ref;
+       sector_t                sector;
+       sector_t                nr_sectors;
+       long                    ret;
+       struct kiocb            iocb;
+       struct bio_vec          *bvec;
+};
+
+/*
+ * zloop_index_idr maps device IDs to their struct zloop_device.
+ * zloop_ctl_mutex is held around ID allocation and around device state
+ * changes and checks.
+ */
+static DEFINE_IDR(zloop_index_idr);
+static DEFINE_MUTEX(zloop_ctl_mutex);
+
+/* Return the number of the zone containing the start sector of @rq. */
+static unsigned int rq_zone_no(struct request *rq)
+{
+       const struct zloop_device *dev = rq->q->queuedata;
+       sector_t pos = blk_rq_pos(rq);
+
+       return pos >> dev->zone_shift;
+}
+
+/*
+ * Resynchronize the condition and write pointer of sequential zone @zone_no
+ * with the current size of its backing file. The caller must hold the zone
+ * lock.
+ *
+ * Returns 0 on success, the vfs_getattr() error if the file size cannot be
+ * read (the zone is then flagged with ZLOOP_ZONE_SEQ_ERROR), or -EINVAL if the
+ * file is larger than the zone capacity or not aligned to the block size.
+ */
+static int zloop_update_seq_zone(struct zloop_device *zlo, unsigned int zone_no)
+{
+       struct zloop_zone *zone = &zlo->zones[zone_no];
+       sector_t nr_sects;
+       struct kstat st;
+       int err;
+
+       lockdep_assert_held(&zone->lock);
+
+       err = vfs_getattr(&zone->file->f_path, &st, STATX_SIZE, 0);
+       if (err < 0) {
+               pr_err("Failed to get zone %u file stat (err=%d)\n",
+                      zone_no, err);
+               set_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
+               return err;
+       }
+       nr_sects = st.size >> SECTOR_SHIFT;
+
+       if (nr_sects > zlo->zone_capacity) {
+               pr_err("Zone %u file too large (%llu sectors > %llu)\n",
+                      zone_no, nr_sects, zlo->zone_capacity);
+               return -EINVAL;
+       }
+
+       if (nr_sects & ((zlo->block_size >> SECTOR_SHIFT) - 1)) {
+               pr_err("Zone %u file size not aligned to block size %u\n",
+                      zone_no, zlo->block_size);
+               return -EINVAL;
+       }
+
+       /* An empty file is an empty zone. */
+       if (!nr_sects) {
+               zone->cond = BLK_ZONE_COND_EMPTY;
+               zone->wp = zone->start;
+               return 0;
+       }
+
+       /* A file filling the zone capacity is a full zone. */
+       if (nr_sects == zlo->zone_capacity) {
+               zone->cond = BLK_ZONE_COND_FULL;
+               zone->wp = zone->start + zlo->zone_size;
+               return 0;
+       }
+
+       /* Anything in between is a partially written, closed zone. */
+       zone->cond = BLK_ZONE_COND_CLOSED;
+       zone->wp = zone->start + nr_sects;
+
+       return 0;
+}
+
+/*
+ * Explicitly open zone @zone_no. Conventional zones, full zones and zones in
+ * any other unexpected condition fail with -EIO. If the zone is flagged with
+ * ZLOOP_ZONE_SEQ_ERROR, its state is first refreshed from the backing file.
+ */
+static int zloop_open_zone(struct zloop_device *zlo, unsigned int zone_no)
+{
+       struct zloop_zone *zone = &zlo->zones[zone_no];
+       int err = 0;
+
+       if (test_bit(ZLOOP_ZONE_CONV, &zone->flags))
+               return -EIO;
+
+       mutex_lock(&zone->lock);
+
+       if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags))
+               err = zloop_update_seq_zone(zlo, zone_no);
+
+       if (!err) {
+               if (zone->cond == BLK_ZONE_COND_EMPTY ||
+                   zone->cond == BLK_ZONE_COND_CLOSED ||
+                   zone->cond == BLK_ZONE_COND_IMP_OPEN)
+                       zone->cond = BLK_ZONE_COND_EXP_OPEN;
+               else if (zone->cond != BLK_ZONE_COND_EXP_OPEN)
+                       err = -EIO;
+       }
+
+       mutex_unlock(&zone->lock);
+
+       return err;
+}
+
+/*
+ * Close zone @zone_no. An open zone becomes empty if nothing was written to
+ * it and closed otherwise; an already closed zone is left untouched.
+ * Conventional, empty and full zones (and any other condition) fail with
+ * -EIO. If the zone is flagged with ZLOOP_ZONE_SEQ_ERROR, its state is first
+ * refreshed from the backing file.
+ */
+static int zloop_close_zone(struct zloop_device *zlo, unsigned int zone_no)
+{
+       struct zloop_zone *zone = &zlo->zones[zone_no];
+       int err = 0;
+
+       if (test_bit(ZLOOP_ZONE_CONV, &zone->flags))
+               return -EIO;
+
+       mutex_lock(&zone->lock);
+
+       if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags))
+               err = zloop_update_seq_zone(zlo, zone_no);
+
+       if (!err) {
+               if (zone->cond == BLK_ZONE_COND_IMP_OPEN ||
+                   zone->cond == BLK_ZONE_COND_EXP_OPEN) {
+                       if (zone->wp == zone->start)
+                               zone->cond = BLK_ZONE_COND_EMPTY;
+                       else
+                               zone->cond = BLK_ZONE_COND_CLOSED;
+               } else if (zone->cond != BLK_ZONE_COND_CLOSED) {
+                       err = -EIO;
+               }
+       }
+
+       mutex_unlock(&zone->lock);
+
+       return err;
+}
+
+/*
+ * Reset sequential zone @zone_no by truncating its backing file to 0. Nothing
+ * is done for a zone that is already empty and has no pending error. On
+ * truncate failure the zone is flagged with ZLOOP_ZONE_SEQ_ERROR and -EIO is
+ * returned. Conventional zones fail with -EIO.
+ */
+static int zloop_reset_zone(struct zloop_device *zlo, unsigned int zone_no)
+{
+       struct zloop_zone *zone = &zlo->zones[zone_no];
+       int err = 0;
+
+       if (test_bit(ZLOOP_ZONE_CONV, &zone->flags))
+               return -EIO;
+
+       mutex_lock(&zone->lock);
+
+       if (test_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags) ||
+           zone->cond != BLK_ZONE_COND_EMPTY) {
+               if (vfs_truncate(&zone->file->f_path, 0) == 0) {
+                       zone->cond = BLK_ZONE_COND_EMPTY;
+                       zone->wp = zone->start;
+                       clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
+               } else {
+                       set_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
+                       err = -EIO;
+               }
+       }
+
+       mutex_unlock(&zone->lock);
+
+       return err;
+}
+
+/*
+ * Reset all sequential zones of the device, in zone order, stopping at and
+ * returning the first error.
+ */
+static int zloop_reset_all_zones(struct zloop_device *zlo)
+{
+       unsigned int zone_no = zlo->nr_conv_zones;
+
+       while (zone_no < zlo->nr_zones) {
+               int err = zloop_reset_zone(zlo, zone_no);
+
+               if (err)
+                       return err;
+               zone_no++;
+       }
+
+       return 0;
+}
+
+/*
+ * Finish sequential zone @zone_no: size its backing file to the zone capacity
+ * and mark the zone full. Nothing is done for a zone that is already full and
+ * has no pending error. On truncate failure the zone is flagged with
+ * ZLOOP_ZONE_SEQ_ERROR and -EIO is returned. Conventional zones fail with
+ * -EIO.
+ */
+static int zloop_finish_zone(struct zloop_device *zlo, unsigned int zone_no)
+{
+       struct zloop_zone *zone = &zlo->zones[zone_no];
+       int ret = 0;
+
+       if (test_bit(ZLOOP_ZONE_CONV, &zone->flags))
+               return -EIO;
+
+       mutex_lock(&zone->lock);
+
+       if (!test_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags) &&
+           zone->cond == BLK_ZONE_COND_FULL)
+               goto unlock;
+
+       /*
+        * The file of a full zone must be exactly zone_capacity sectors:
+        * zloop_update_seq_zone() rejects any file larger than the zone
+        * capacity, so sizing the file to zone_size would make the zone
+        * impossible to resynchronize or restore when the zone capacity is
+        * smaller than the zone size.
+        */
+       if (vfs_truncate(&zone->file->f_path,
+                        zlo->zone_capacity << SECTOR_SHIFT)) {
+               set_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
+               ret = -EIO;
+               goto unlock;
+       }
+
+       zone->cond = BLK_ZONE_COND_FULL;
+       /* Same write pointer as zloop_update_seq_zone() sets for a full zone. */
+       zone->wp = zone->start + zlo->zone_size;
+       clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
+
+unlock:
+       mutex_unlock(&zone->lock);
+
+       return ret;
+}
+
+/*
+ * Drop one reference on @cmd. When the last reference goes away, free the
+ * bio_vec array of the command and complete its request, unless a fake
+ * timeout is being injected for the queue.
+ */
+static void zloop_put_cmd(struct zloop_cmd *cmd)
+{
+       struct request *rq;
+
+       if (atomic_dec_and_test(&cmd->ref)) {
+               rq = blk_mq_rq_from_pdu(cmd);
+
+               kfree(cmd->bvec);
+               cmd->bvec = NULL;
+
+               if (unlikely(blk_should_fake_timeout(rq->q)))
+                       return;
+               blk_mq_complete_request(rq);
+       }
+}
+
+/*
+ * Completion callback of the zone file I/O: record the result in the command
+ * embedding @iocb and drop one reference on that command.
+ */
+static void zloop_rw_complete(struct kiocb *iocb, long ret)
+{
+       struct zloop_cmd *zcmd = container_of(iocb, struct zloop_cmd, iocb);
+
+       zcmd->ret = ret;
+       zloop_put_cmd(zcmd);
+}
+
+/*
+ * Execute a read, write or zone append request against the backing file of
+ * the zone targeted by the request.
+ *
+ * The command starts with two references: one is dropped at the end of this
+ * function and the other by zloop_rw_complete(), which is either called by the
+ * file I/O completion or directly from here when the I/O was not queued
+ * (any result other than -EIOCBQUEUED, including early errors).
+ *
+ * For writes to sequential zones, the zone lock is held from the write pointer
+ * check until the I/O has been submitted.
+ */
+static void zloop_rw(struct zloop_cmd *cmd)
+{
+       struct request *rq = blk_mq_rq_from_pdu(cmd);
+       struct zloop_device *zlo = rq->q->queuedata;
+       unsigned int zone_no = rq_zone_no(rq);
+       sector_t sector = blk_rq_pos(rq);
+       sector_t nr_sectors = blk_rq_sectors(rq);
+       bool is_append = req_op(rq) == REQ_OP_ZONE_APPEND;
+       bool is_write = req_op(rq) == REQ_OP_WRITE || is_append;
+       int rw = is_write ? ITER_SOURCE : ITER_DEST;
+       struct req_iterator rq_iter;
+       struct zloop_zone *zone;
+       struct iov_iter iter;
+       struct bio_vec tmp;
+       sector_t zone_end;
+       int nr_bvec = 0;
+       int ret;
+
+       atomic_set(&cmd->ref, 2);
+       cmd->sector = sector;
+       cmd->nr_sectors = nr_sectors;
+       cmd->ret = 0;
+
+       /* We should never get an I/O beyond the device capacity. */
+       if (WARN_ON_ONCE(zone_no >= zlo->nr_zones)) {
+               ret = -EIO;
+               goto out;
+       }
+       zone = &zlo->zones[zone_no];
+       zone_end = zone->start + zlo->zone_capacity;
+
+       /*
+        * The block layer should never send requests that are not fully
+        * contained within the zone.
+        */
+       if (WARN_ON_ONCE(sector + nr_sectors > zone->start + zlo->zone_size)) {
+               ret = -EIO;
+               goto out;
+       }
+
+       /* Refresh the zone state from its file after a previous error. */
+       if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) {
+               mutex_lock(&zone->lock);
+               ret = zloop_update_seq_zone(zlo, zone_no);
+               mutex_unlock(&zone->lock);
+               if (ret)
+                       goto out;
+       }
+
+       if (!test_bit(ZLOOP_ZONE_CONV, &zone->flags) && is_write) {
+               mutex_lock(&zone->lock);
+
+               /* Zone append writes land at the current write pointer. */
+               if (is_append) {
+                       sector = zone->wp;
+                       cmd->sector = sector;
+               }
+
+               /*
+                * Write operations must be aligned to the write pointer and
+                * fully contained within the zone capacity.
+                */
+               if (sector != zone->wp || zone->wp + nr_sectors > zone_end) {
+                       pr_err("Zone %u: unaligned write: sect %llu, wp %llu\n",
+                              zone_no, sector, zone->wp);
+                       ret = -EIO;
+                       goto unlock;
+               }
+
+               /* Implicitly open the target zone. */
+               if (zone->cond == BLK_ZONE_COND_CLOSED ||
+                   zone->cond == BLK_ZONE_COND_EMPTY)
+                       zone->cond = BLK_ZONE_COND_IMP_OPEN;
+
+               /*
+                * Advance the write pointer of sequential zones. If the write
+                * fails, the wp position will be corrected when the next I/O
+                * completes.
+                */
+               zone->wp += nr_sectors;
+               if (zone->wp == zone_end)
+                       zone->cond = BLK_ZONE_COND_FULL;
+       }
+
+       /* Count the bvecs of the request to size the I/O iterator. */
+       rq_for_each_bvec(tmp, rq, rq_iter)
+               nr_bvec++;
+
+       if (rq->bio != rq->biotail) {
+               struct bio_vec *bvec;
+
+               cmd->bvec = kmalloc_array(nr_bvec, sizeof(*cmd->bvec), GFP_NOIO);
+               if (!cmd->bvec) {
+                       ret = -EIO;
+                       goto unlock;
+               }
+
+               /*
+                * The bios of the request may be started from the middle of
+                * the 'bvec' because of bio splitting, so we can't directly
+                * copy bio->bi_io_vec to new bvec. The rq_for_each_bvec
+                * API will take care of all details for us.
+                */
+               bvec = cmd->bvec;
+               rq_for_each_bvec(tmp, rq, rq_iter) {
+                       *bvec = tmp;
+                       bvec++;
+               }
+               iov_iter_bvec(&iter, rw, cmd->bvec, nr_bvec, blk_rq_bytes(rq));
+       } else {
+               /*
+                * Same here, this bio may be started from the middle of the
+                * 'bvec' because of bio splitting, so offset from the bvec
+                * must be passed to iov iterator
+                */
+               iov_iter_bvec(&iter, rw,
+                       __bvec_iter_bvec(rq->bio->bi_io_vec, rq->bio->bi_iter),
+                                       nr_bvec, blk_rq_bytes(rq));
+               iter.iov_offset = rq->bio->bi_iter.bi_bvec_done;
+       }
+
+       /* The file offset is the offset of the I/O within the zone. */
+       cmd->iocb.ki_pos = (sector - zone->start) << SECTOR_SHIFT;
+       cmd->iocb.ki_filp = zone->file;
+       cmd->iocb.ki_complete = zloop_rw_complete;
+       if (!zlo->buffered_io)
+               cmd->iocb.ki_flags = IOCB_DIRECT;
+       cmd->iocb.ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0);
+
+       if (rw == ITER_SOURCE)
+               ret = zone->file->f_op->write_iter(&cmd->iocb, &iter);
+       else
+               ret = zone->file->f_op->read_iter(&cmd->iocb, &iter);
+unlock:
+       /* Same condition as the one under which the zone lock was taken. */
+       if (!test_bit(ZLOOP_ZONE_CONV, &zone->flags) && is_write)
+               mutex_unlock(&zone->lock);
+out:
+       /* The I/O was not queued: report its result ourselves. */
+       if (ret != -EIOCBQUEUED)
+               zloop_rw_complete(&cmd->iocb, ret);
+       zloop_put_cmd(cmd);
+}
+
+/*
+ * Execute the command of a request. Reads, writes and zone appends are handed
+ * to zloop_rw(), which takes care of the request completion. All other
+ * operations are executed synchronously and completed here.
+ */
+static void zloop_handle_cmd(struct zloop_cmd *cmd)
+{
+       struct request *rq = blk_mq_rq_from_pdu(cmd);
+       struct zloop_device *zlo = rq->q->queuedata;
+       long ret;
+
+       switch (req_op(rq)) {
+       case REQ_OP_READ:
+       case REQ_OP_WRITE:
+       case REQ_OP_ZONE_APPEND:
+               /*
+                * zloop_rw() always executes asynchronously or completes
+                * directly.
+                */
+               zloop_rw(cmd);
+               return;
+       case REQ_OP_FLUSH:
+               /*
+                * Sync the entire FS containing the zone files instead of
+                * walking all files
+                */
+               ret = sync_filesystem(file_inode(zlo->data_dir)->i_sb);
+               break;
+       case REQ_OP_ZONE_OPEN:
+               ret = zloop_open_zone(zlo, rq_zone_no(rq));
+               break;
+       case REQ_OP_ZONE_CLOSE:
+               ret = zloop_close_zone(zlo, rq_zone_no(rq));
+               break;
+       case REQ_OP_ZONE_FINISH:
+               ret = zloop_finish_zone(zlo, rq_zone_no(rq));
+               break;
+       case REQ_OP_ZONE_RESET:
+               ret = zloop_reset_zone(zlo, rq_zone_no(rq));
+               break;
+       case REQ_OP_ZONE_RESET_ALL:
+               ret = zloop_reset_all_zones(zlo);
+               break;
+       default:
+               WARN_ON_ONCE(1);
+               pr_err("Unsupported operation %d\n", req_op(rq));
+               ret = -EOPNOTSUPP;
+               break;
+       }
+
+       cmd->ret = ret;
+       blk_mq_complete_request(rq);
+}
+
+/*
+ * Work function of a command: run the command with PF_LOCAL_THROTTLE and
+ * PF_MEMALLOC_NOIO set on the current task, then put the task flags back to
+ * their previous value.
+ */
+static void zloop_cmd_workfn(struct work_struct *work)
+{
+       int saved_flags = current->flags;
+
+       current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO;
+       zloop_handle_cmd(container_of(work, struct zloop_cmd, work));
+       current->flags = saved_flags;
+}
+
+/*
+ * blk-mq ->complete handler: post-process the result of the command and end
+ * the request. Short reads are zero-filled, partial writes are turned into
+ * -EIO, failed writes to a sequential zone flag the zone with
+ * ZLOOP_ZONE_SEQ_ERROR and successful zone appends report the written sector
+ * through the request.
+ */
+static void zloop_complete_rq(struct request *rq)
+{
+       struct zloop_cmd *cmd = blk_mq_rq_to_pdu(rq);
+       struct zloop_device *zlo = rq->q->queuedata;
+       unsigned int zone_no = cmd->sector >> zlo->zone_shift;
+       struct zloop_zone *zone = &zlo->zones[zone_no];
+       bool is_append = req_op(rq) == REQ_OP_ZONE_APPEND;
+       struct bio *bio;
+
+       switch (req_op(rq)) {
+       case REQ_OP_READ:
+               if (cmd->ret < 0) {
+                       pr_err("Zone %u: failed read sector %llu, %llu sectors\n",
+                              zone_no, cmd->sector, cmd->nr_sectors);
+               } else if (cmd->ret != blk_rq_bytes(rq)) {
+                       /* Short read: zero-fill the bios of the request. */
+                       __rq_for_each_bio(bio, rq)
+                               zero_fill_bio(bio);
+               }
+               break;
+       case REQ_OP_WRITE:
+       case REQ_OP_ZONE_APPEND:
+               if (cmd->ret < 0) {
+                       pr_err("Zone %u: failed %swrite sector %llu, %llu sectors\n",
+                              zone_no, is_append ? "append " : "",
+                              cmd->sector, cmd->nr_sectors);
+               } else if (cmd->ret != blk_rq_bytes(rq)) {
+                       pr_err("Zone %u: partial write %ld/%u B\n",
+                              zone_no, cmd->ret, blk_rq_bytes(rq));
+                       cmd->ret = -EIO;
+               }
+
+               if (cmd->ret < 0 && !test_bit(ZLOOP_ZONE_CONV, &zone->flags)) {
+                       /*
+                        * A write to a sequential zone file failed: mark the
+                        * zone as having an error. This will be corrected and
+                        * cleared when the next IO is submitted.
+                        */
+                       set_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
+                       break;
+               }
+
+               /* Tell the issuer where the appended data was written. */
+               if (is_append)
+                       rq->__sector = cmd->sector;
+               break;
+       default:
+               break;
+       }
+
+       blk_mq_end_request(rq, cmd->ret < 0 ?
+                          errno_to_blk_status(cmd->ret) : BLK_STS_OK);
+}
+
+/*
+ * blk-mq ->queue_rq handler: start the request and hand its command over to
+ * the device workqueue. Requests for a device being deleted fail with
+ * BLK_STS_IOERR.
+ */
+static blk_status_t zloop_queue_rq(struct blk_mq_hw_ctx *hctx,
+                                  const struct blk_mq_queue_data *bd)
+{
+       struct request *rq = bd->rq;
+       struct zloop_device *zlo = rq->q->queuedata;
+       struct zloop_cmd *cmd;
+
+       if (zlo->state == Zlo_deleting)
+               return BLK_STS_IOERR;
+
+       blk_mq_start_request(rq);
+
+       cmd = blk_mq_rq_to_pdu(rq);
+       INIT_WORK(&cmd->work, zloop_cmd_workfn);
+       queue_work(zlo->workqueue, &cmd->work);
+
+       return BLK_STS_OK;
+}
+
+/*
+ * blk-mq operations: requests are queued to the device workqueue by
+ * zloop_queue_rq() and ended by zloop_complete_rq().
+ */
+static const struct blk_mq_ops zloop_mq_ops = {
+       .queue_rq       = zloop_queue_rq,
+       .complete       = zloop_complete_rq,
+};
+
+/*
+ * Block device open: only allowed for a live device. Returns -ENXIO for a
+ * device in any other state, or the mutex_lock_killable() error if taking
+ * the control mutex was interrupted.
+ */
+static int zloop_open(struct gendisk *disk, blk_mode_t mode)
+{
+       struct zloop_device *zlo = disk->private_data;
+       int err;
+
+       err = mutex_lock_killable(&zloop_ctl_mutex);
+       if (err)
+               return err;
+
+       err = zlo->state == Zlo_live ? 0 : -ENXIO;
+       mutex_unlock(&zloop_ctl_mutex);
+
+       return err;
+}
+
+/*
+ * Report up to @nr_zones zones starting with the zone containing @sector,
+ * calling @cb for each of them. A zone flagged with ZLOOP_ZONE_SEQ_ERROR is
+ * refreshed from its backing file before being reported.
+ *
+ * Returns the number of zones reported, 0 if @sector is beyond the last zone,
+ * or the first error returned by the zone refresh or by @cb.
+ */
+static int zloop_report_zones(struct gendisk *disk, sector_t sector,
+               unsigned int nr_zones, report_zones_cb cb, void *data)
+{
+       struct zloop_device *zlo = disk->private_data;
+       unsigned int first = disk_zone_no(disk, sector);
+       struct blk_zone blkz = {};
+       unsigned int idx;
+       int err;
+
+       if (first >= zlo->nr_zones)
+               return 0;
+       nr_zones = min(nr_zones, zlo->nr_zones - first);
+
+       for (idx = 0; idx < nr_zones; idx++) {
+               struct zloop_zone *zone = &zlo->zones[first + idx];
+               bool is_conv;
+
+               mutex_lock(&zone->lock);
+
+               if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) {
+                       err = zloop_update_seq_zone(zlo, first + idx);
+                       if (err) {
+                               mutex_unlock(&zone->lock);
+                               return err;
+                       }
+               }
+
+               is_conv = test_bit(ZLOOP_ZONE_CONV, &zone->flags);
+
+               blkz.start = zone->start;
+               blkz.len = zlo->zone_size;
+               blkz.wp = zone->wp;
+               blkz.cond = zone->cond;
+               blkz.type = is_conv ? BLK_ZONE_TYPE_CONVENTIONAL :
+                                     BLK_ZONE_TYPE_SEQWRITE_REQ;
+               /* Conventional zones are usable up to the zone size. */
+               blkz.capacity = is_conv ? zlo->zone_size : zlo->zone_capacity;
+
+               mutex_unlock(&zone->lock);
+
+               /* The callback runs on the snapshot, without the zone lock. */
+               err = cb(&blkz, idx, data);
+               if (err)
+                       return err;
+       }
+
+       return nr_zones;
+}
+
+/*
+ * ->free_disk handler: release everything owned by the device. For each zone,
+ * the saved gfp mask is put back on the file mapping and the file is
+ * released; then the data directory, the workqueue, the base directory string
+ * and the device itself are freed.
+ */
+static void zloop_free_disk(struct gendisk *disk)
+{
+       struct zloop_device *zlo = disk->private_data;
+       unsigned int zone_no = 0;
+
+       while (zone_no < zlo->nr_zones) {
+               struct zloop_zone *zone = &zlo->zones[zone_no];
+
+               mapping_set_gfp_mask(zone->file->f_mapping,
+                               zone->old_gfp_mask);
+               fput(zone->file);
+               zone_no++;
+       }
+
+       fput(zlo->data_dir);
+       destroy_workqueue(zlo->workqueue);
+       kfree(zlo->base_dir);
+       kvfree(zlo);
+}
+
+/* Block device operations of a zloop disk. */
+static const struct block_device_operations zloop_fops = {
+       .owner                  = THIS_MODULE,
+       .open                   = zloop_open,
+       .report_zones           = zloop_report_zones,
+       .free_disk              = zloop_free_disk,
+};
+
+/*
+ * Open the file whose path is built from the printf-style format @fmt and its
+ * arguments, with the open flags @oflags and creation mode @mode.
+ *
+ * Returns the opened file, the filp_open() error pointer, or
+ * ERR_PTR(-ENOMEM) if the path could not be allocated.
+ */
+__printf(3, 4)
+static struct file *zloop_filp_open_fmt(int oflags, umode_t mode,
+               const char *fmt, ...)
+{
+       struct file *file = ERR_PTR(-ENOMEM);
+       va_list args;
+       char *path;
+
+       va_start(args, fmt);
+       path = kvasprintf(GFP_KERNEL, fmt, args);
+       va_end(args);
+
+       if (path) {
+               file = filp_open(path, oflags, mode);
+               kfree(path);
+       }
+
+       return file;
+}
+
+/*
+ * Determine the device block size from the file system holding the file of
+ * @zone and store it in @zlo->block_size.
+ *
+ * Returns 0 on success or -EINVAL if the zone capacity is not aligned to the
+ * block size.
+ */
+static int zloop_get_block_size(struct zloop_device *zlo,
+                               struct zloop_zone *zone)
+{
+       struct block_device *sb_bdev = zone->file->f_mapping->host->i_sb->s_bdev;
+       struct kstat st;
+
+       /*
+        * If the FS block size is lower than or equal to 4K, use that as the
+        * device block size. Otherwise, fallback to the FS direct IO alignment
+        * constraint if that is provided, and to the FS underlying device
+        * physical block size if the direct IO alignment is unknown.
+        *
+        * A zero direct IO offset alignment means that direct IO is not
+        * supported on the file: treat it as unknown rather than ending up
+        * with a block size of 0.
+        */
+       if (file_inode(zone->file)->i_sb->s_blocksize <= SZ_4K)
+               zlo->block_size = file_inode(zone->file)->i_sb->s_blocksize;
+       else if (!vfs_getattr(&zone->file->f_path, &st, STATX_DIOALIGN, 0) &&
+                (st.result_mask & STATX_DIOALIGN) && st.dio_offset_align)
+               zlo->block_size = st.dio_offset_align;
+       else if (sb_bdev)
+               zlo->block_size = bdev_physical_block_size(sb_bdev);
+       else
+               zlo->block_size = SECTOR_SIZE;
+
+       if (zlo->zone_capacity & ((zlo->block_size >> SECTOR_SHIFT) - 1)) {
+               pr_err("Zone capacity is not aligned to block size %u\n",
+                      zlo->block_size);
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+/*
+ * Initialize zone @zone_no: open its backing file and set up the zone state.
+ *
+ * The file is created if missing unless @restore is true, and is opened with
+ * O_DIRECT unless buffered I/O was requested in @opts. The first
+ * @zlo->nr_conv_zones zones are conventional zones backed by a "cnv-" file
+ * sized to the zone size; all other zones are sequential zones backed by a
+ * "seq-" file whose size defines the zone condition and write pointer. The
+ * device block size is determined from the first zone file opened.
+ *
+ * Returns 0 on success and a negative error code otherwise. On error, the
+ * zone file, if already opened, is left in @zone->file for the caller to
+ * release.
+ */
+static int zloop_init_zone(struct zloop_device *zlo, struct zloop_options *opts,
+                          unsigned int zone_no, bool restore)
+{
+       struct zloop_zone *zone = &zlo->zones[zone_no];
+       int oflags = O_RDWR;
+       struct kstat stat;
+       sector_t file_sectors;
+       int ret;
+
+       mutex_init(&zone->lock);
+       zone->start = (sector_t)zone_no << zlo->zone_shift;
+
+       if (!restore)
+               oflags |= O_CREAT;
+
+       if (!opts->buffered_io)
+               oflags |= O_DIRECT;
+
+       if (zone_no < zlo->nr_conv_zones) {
+               /* Conventional zone file. */
+               set_bit(ZLOOP_ZONE_CONV, &zone->flags);
+               zone->cond = BLK_ZONE_COND_NOT_WP;
+               zone->wp = U64_MAX;
+
+               zone->file = zloop_filp_open_fmt(oflags, 0600, "%s/%u/cnv-%06u",
+                                       zlo->base_dir, zlo->id, zone_no);
+               if (IS_ERR(zone->file)) {
+                       pr_err("Failed to open zone %u file %s/%u/cnv-%06u (err=%ld)\n",
+                              zone_no, zlo->base_dir, zlo->id, zone_no,
+                              PTR_ERR(zone->file));
+                       return PTR_ERR(zone->file);
+               }
+
+               /*
+                * Remember the mapping gfp mask: zloop_free_disk() puts it
+                * back on the mapping when the file is released.
+                */
+               zone->old_gfp_mask = mapping_gfp_mask(zone->file->f_mapping);
+
+               if (!zlo->block_size) {
+                       ret = zloop_get_block_size(zlo, zone);
+                       if (ret)
+                               return ret;
+               }
+
+               ret = vfs_getattr(&zone->file->f_path, &stat, STATX_SIZE, 0);
+               if (ret < 0) {
+                       pr_err("Failed to get zone %u file stat\n", zone_no);
+                       return ret;
+               }
+               file_sectors = stat.size >> SECTOR_SHIFT;
+
+               /* A restored conventional zone file must be fully sized. */
+               if (restore && file_sectors != zlo->zone_size) {
+                       pr_err("Invalid conventional zone %u file size (%llu sectors != %llu)\n",
+                              zone_no, file_sectors, zlo->zone_size);
+                       return -EINVAL;
+               }
+
+               ret = vfs_truncate(&zone->file->f_path,
+                                  zlo->zone_size << SECTOR_SHIFT);
+               if (ret < 0) {
+                       pr_err("Failed to truncate zone %u file (err=%d)\n",
+                              zone_no, ret);
+                       return ret;
+               }
+
+               return 0;
+       }
+
+       /* Sequential zone file. */
+       zone->file = zloop_filp_open_fmt(oflags, 0600, "%s/%u/seq-%06u",
+                                        zlo->base_dir, zlo->id, zone_no);
+       if (IS_ERR(zone->file)) {
+               pr_err("Failed to open zone %u file %s/%u/seq-%06u (err=%ld)\n",
+                      zone_no, zlo->base_dir, zlo->id, zone_no,
+                      PTR_ERR(zone->file));
+               return PTR_ERR(zone->file);
+       }
+
+       /* See the conventional zone case above. */
+       zone->old_gfp_mask = mapping_gfp_mask(zone->file->f_mapping);
+
+       if (!zlo->block_size) {
+               ret = zloop_get_block_size(zlo, zone);
+               if (ret)
+                       return ret;
+       }
+
+       /* Derive the zone condition and write pointer from the file size. */
+       mutex_lock(&zone->lock);
+       ret = zloop_update_seq_zone(zlo, zone_no);
+       mutex_unlock(&zone->lock);
+
+       return ret;
+}
+
+/*
+ * Check if the device already has backing files: return true if the file of
+ * zone 0 can be opened read-only, either as a conventional ("cnv-") or as a
+ * sequential ("seq-") zone file.
+ */
+static bool zloop_dev_exists(struct zloop_device *zlo)
+{
+       struct file *cnv_file, *seq_file;
+       bool found = false;
+
+       cnv_file = zloop_filp_open_fmt(O_RDONLY, 0600, "%s/%u/cnv-%06u",
+                                      zlo->base_dir, zlo->id, 0);
+       seq_file = zloop_filp_open_fmt(O_RDONLY, 0600, "%s/%u/seq-%06u",
+                                      zlo->base_dir, zlo->id, 0);
+
+       if (!IS_ERR(cnv_file)) {
+               found = true;
+               fput(cnv_file);
+       }
+       if (!IS_ERR(seq_file)) {
+               found = true;
+               fput(seq_file);
+       }
+
+       return found;
+}
+
+static int zloop_ctl_add(struct zloop_options *opts)
+{
+       struct queue_limits lim = {
+               .max_hw_sectors         = SZ_1M >> SECTOR_SHIFT,
+               .max_hw_zone_append_sectors = SZ_1M >> SECTOR_SHIFT,
+               .chunk_sectors          = opts->zone_size,
+               .features               = BLK_FEAT_ZONED,
+       };
+       unsigned int nr_zones, i, j;
+       struct zloop_device *zlo;
+       int ret = -EINVAL;
+       bool restore;
+
+       __module_get(THIS_MODULE);
+
+       nr_zones = opts->capacity >> ilog2(opts->zone_size);
+       if (opts->nr_conv_zones >= nr_zones) {
+               pr_err("Invalid number of conventional zones %u\n",
+                      opts->nr_conv_zones);
+               goto out;
+       }
+
+       zlo = kvzalloc(struct_size(zlo, zones, nr_zones), GFP_KERNEL);
+       if (!zlo) {
+               ret = -ENOMEM;
+               goto out;
+       }
+       zlo->state = Zlo_creating;
+
+       ret = mutex_lock_killable(&zloop_ctl_mutex);
+       if (ret)
+               goto out_free_dev;
+
+       /* Allocate id, if @opts->id >= 0, we're requesting that specific id */
+       if (opts->id >= 0) {
+               ret = idr_alloc(&zloop_index_idr, zlo,
+                                 opts->id, opts->id + 1, GFP_KERNEL);
+               if (ret == -ENOSPC)
+                       ret = -EEXIST;
+       } else {
+               ret = idr_alloc(&zloop_index_idr, zlo, 0, 0, GFP_KERNEL);
+       }
+       mutex_unlock(&zloop_ctl_mutex);
+       if (ret < 0)
+               goto out_free_dev;
+
+       zlo->id = ret;
+       zlo->zone_shift = ilog2(opts->zone_size);
+       zlo->zone_size = opts->zone_size;
+       if (opts->zone_capacity)
+               zlo->zone_capacity = opts->zone_capacity;
+       else
+               zlo->zone_capacity = zlo->zone_size;
+       zlo->nr_zones = nr_zones;
+       zlo->nr_conv_zones = opts->nr_conv_zones;
+       zlo->buffered_io = opts->buffered_io;
+
+       zlo->workqueue = alloc_workqueue("zloop%d", WQ_UNBOUND | WQ_FREEZABLE,
+                               opts->nr_queues * opts->queue_depth, zlo->id);
+       if (!zlo->workqueue) {
+               ret = -ENOMEM;
+               goto out_free_idr;
+       }
+
+       if (opts->base_dir)
+               zlo->base_dir = kstrdup(opts->base_dir, GFP_KERNEL);
+       else
+               zlo->base_dir = kstrdup(ZLOOP_DEF_BASE_DIR, GFP_KERNEL);
+       if (!zlo->base_dir) {
+               ret = -ENOMEM;
+               goto out_destroy_workqueue;
+       }
+
+       zlo->data_dir = zloop_filp_open_fmt(O_RDONLY | O_DIRECTORY, 0, "%s/%u",
+                                           zlo->base_dir, zlo->id);
+       if (IS_ERR(zlo->data_dir)) {
+               ret = PTR_ERR(zlo->data_dir);
+               pr_warn("Failed to open directory %s/%u (err=%d)\n",
+                       zlo->base_dir, zlo->id, ret);
+               goto out_free_base_dir;
+       }
+
+       /*
+        * If we already have zone files, we are restoring a device created by a
+        * previous add operation. In this case, zloop_init_zone() will check
+        * that the zone files are consistent with the zone configuration given.
+        */
+       restore = zloop_dev_exists(zlo);
+       for (i = 0; i < nr_zones; i++) {
+               ret = zloop_init_zone(zlo, opts, i, restore);
+               if (ret)
+                       goto out_close_files;
+       }
+
+       lim.physical_block_size = zlo->block_size;
+       lim.logical_block_size = zlo->block_size;
+
+       zlo->tag_set.ops = &zloop_mq_ops;
+       zlo->tag_set.nr_hw_queues = opts->nr_queues;
+       zlo->tag_set.queue_depth = opts->queue_depth;
+       zlo->tag_set.numa_node = NUMA_NO_NODE;
+       zlo->tag_set.cmd_size = sizeof(struct zloop_cmd);
+       zlo->tag_set.driver_data = zlo;
+
+       ret = blk_mq_alloc_tag_set(&zlo->tag_set);
+       if (ret) {
+               pr_err("blk_mq_alloc_tag_set failed (err=%d)\n", ret);
+               goto out_close_files;
+       }
+
+       zlo->disk = blk_mq_alloc_disk(&zlo->tag_set, &lim, zlo);
+       if (IS_ERR(zlo->disk)) {
+               pr_err("blk_mq_alloc_disk failed (err=%d)\n", ret);
+               ret = PTR_ERR(zlo->disk);
+               goto out_cleanup_tags;
+       }
+       zlo->disk->flags = GENHD_FL_NO_PART;
+       zlo->disk->fops = &zloop_fops;
+       zlo->disk->private_data = zlo;
+       sprintf(zlo->disk->disk_name, "zloop%d", zlo->id);
+       set_capacity(zlo->disk, (u64)lim.chunk_sectors * zlo->nr_zones);
+
+       ret = blk_revalidate_disk_zones(zlo->disk);
+       if (ret)
+               goto out_cleanup_disk;
+
+       ret = add_disk(zlo->disk);
+       if (ret) {
+               pr_err("add_disk failed (err=%d)\n", ret);
+               goto out_cleanup_disk;
+       }
+
+       mutex_lock(&zloop_ctl_mutex);
+       zlo->state = Zlo_live;
+       mutex_unlock(&zloop_ctl_mutex);
+
+       pr_info("Added device %d: %u zones of %llu MB, %u B block size\n",
+               zlo->id, zlo->nr_zones,
+               ((sector_t)zlo->zone_size << SECTOR_SHIFT) >> 20,
+               zlo->block_size);
+
+       return 0;
+
+out_cleanup_disk:
+       put_disk(zlo->disk);
+out_cleanup_tags:
+       blk_mq_free_tag_set(&zlo->tag_set);
+out_close_files:
+       for (j = 0; j < i; j++) {
+               struct zloop_zone *zone = &zlo->zones[j];
+
+               if (!IS_ERR_OR_NULL(zone->file))
+                       fput(zone->file);
+       }
+       fput(zlo->data_dir);
+out_free_base_dir:
+       kfree(zlo->base_dir);
+out_destroy_workqueue:
+       destroy_workqueue(zlo->workqueue);
+out_free_idr:
+       mutex_lock(&zloop_ctl_mutex);
+       idr_remove(&zloop_index_idr, zlo->id);
+       mutex_unlock(&zloop_ctl_mutex);
+out_free_dev:
+       kvfree(zlo);
+out:
+       module_put(THIS_MODULE);
+       if (ret == -ENOENT)
+               ret = -EINVAL;
+       return ret;
+}
+
+/*
+ * Handle the "remove" control operation: take down the device whose ID is
+ * given in @opts.
+ *
+ * Returns 0 on success, -EINVAL if no ID was given or if the device is
+ * already being deleted, -ENODEV if no device has that ID or if the device is
+ * still being created, or the error from mutex_lock_killable().
+ */
+static int zloop_ctl_remove(struct zloop_options *opts)
+{
+       struct zloop_device *dev;
+       int err;
+
+       if (!(opts->mask & ZLOOP_OPT_ID)) {
+               pr_err("No ID specified\n");
+               return -EINVAL;
+       }
+
+       err = mutex_lock_killable(&zloop_ctl_mutex);
+       if (err)
+               return err;
+
+       dev = idr_find(&zloop_index_idr, opts->id);
+       if (!dev || dev->state == Zlo_creating) {
+               err = -ENODEV;
+               goto unlock;
+       }
+       if (dev->state == Zlo_deleting) {
+               err = -EINVAL;
+               goto unlock;
+       }
+
+       /* Drop the ID and flag the device while the control mutex is held. */
+       idr_remove(&zloop_index_idr, dev->id);
+       dev->state = Zlo_deleting;
+
+unlock:
+       mutex_unlock(&zloop_ctl_mutex);
+       if (err)
+               return err;
+
+       del_gendisk(dev->disk);
+       put_disk(dev->disk);
+       /*
+        * NOTE(review): if the put_disk() above drops the last reference and
+        * the disk release path (not visible here) frees the device
+        * structure, the tag set access below touches freed memory - confirm.
+        */
+       blk_mq_free_tag_set(&dev->tag_set);
+
+       pr_info("Removed device %d\n", opts->id);
+
+       /* Release the module reference held on behalf of this device. */
+       module_put(THIS_MODULE);
+
+       return 0;
+}
+
+/*
+ * Parse the option string @buf, a list of options separated by commas or
+ * newlines, into @opts.
+ *
+ * The fields assigned at the top of the function are loaded with their
+ * default values first, so a NULL @buf yields the defaults. Fields not
+ * assigned there (zone_capacity, base_dir) keep whatever the caller stored in
+ * them unless the matching option is given. Only light value checks are done.
+ *
+ * Returns 0 on success, -ENOMEM if an allocation fails, and -EINVAL for an
+ * unknown option, a missing or invalid value, a capacity that is not larger
+ * than the zone size, or a zone capacity larger than the zone size.
+ */
+static int zloop_parse_options(struct zloop_options *opts, const char *buf)
+{
+       substring_t args[MAX_OPT_ARGS];
+       char *options, *rest, *opt, *dir;
+       unsigned int token, val;
+       int ret;
+
+       /* Load the defaults; options found in @buf override them below. */
+       opts->mask = 0;
+       opts->id = ZLOOP_DEF_ID;
+       opts->capacity = ZLOOP_DEF_ZONE_SIZE * ZLOOP_DEF_NR_ZONES;
+       opts->zone_size = ZLOOP_DEF_ZONE_SIZE;
+       opts->nr_conv_zones = ZLOOP_DEF_NR_CONV_ZONES;
+       opts->nr_queues = ZLOOP_DEF_NR_QUEUES;
+       opts->queue_depth = ZLOOP_DEF_QUEUE_DEPTH;
+       opts->buffered_io = ZLOOP_DEF_BUFFERED_IO;
+
+       if (!buf)
+               return 0;
+
+       /* Ignore any white space in front of the first option. */
+       while (isspace(*buf))
+               buf++;
+
+       /* strsep() cuts the string it walks, so work on a private copy. */
+       options = kstrdup(buf, GFP_KERNEL);
+       if (!options)
+               return -ENOMEM;
+       rest = options;
+
+       /*
+        * Every "goto out" below reports -EINVAL, except where ret is set to
+        * something else right before the jump.
+        */
+       ret = -EINVAL;
+       for (opt = strsep(&rest, ",\n"); opt; opt = strsep(&rest, ",\n")) {
+               /* Skip empty fields. */
+               if (*opt == '\0')
+                       continue;
+
+               token = match_token(opt, zloop_opt_tokens, args);
+               opts->mask |= token;
+
+               switch (token) {
+               case ZLOOP_OPT_ID:
+                       if (match_int(args, &opts->id))
+                               goto out;
+                       break;
+               case ZLOOP_OPT_CAPACITY:
+                       if (match_uint(args, &val))
+                               goto out;
+                       if (val == 0) {
+                               pr_err("Invalid capacity\n");
+                               goto out;
+                       }
+                       /* Given in MB, stored in 512 B sectors. */
+                       opts->capacity =
+                               ((sector_t)val * SZ_1M) >> SECTOR_SHIFT;
+                       break;
+               case ZLOOP_OPT_ZONE_SIZE:
+                       if (match_uint(args, &val))
+                               goto out;
+                       if (!val || val > ZLOOP_MAX_ZONE_SIZE_MB ||
+                           !is_power_of_2(val)) {
+                               pr_err("Invalid zone size %u\n", val);
+                               goto out;
+                       }
+                       opts->zone_size =
+                               ((sector_t)val * SZ_1M) >> SECTOR_SHIFT;
+                       break;
+               case ZLOOP_OPT_ZONE_CAPACITY:
+                       if (match_uint(args, &val))
+                               goto out;
+                       if (val == 0) {
+                               pr_err("Invalid zone capacity\n");
+                               goto out;
+                       }
+                       opts->zone_capacity =
+                               ((sector_t)val * SZ_1M) >> SECTOR_SHIFT;
+                       break;
+               case ZLOOP_OPT_NR_CONV_ZONES:
+                       if (match_uint(args, &val))
+                               goto out;
+                       opts->nr_conv_zones = val;
+                       break;
+               case ZLOOP_OPT_BASE_DIR:
+                       dir = match_strdup(args);
+                       if (!dir) {
+                               ret = -ENOMEM;
+                               goto out;
+                       }
+                       /* A repeated base_dir= replaces the earlier one. */
+                       kfree(opts->base_dir);
+                       opts->base_dir = dir;
+                       break;
+               case ZLOOP_OPT_NR_QUEUES:
+                       if (match_uint(args, &val))
+                               goto out;
+                       if (val == 0) {
+                               pr_err("Invalid number of queues\n");
+                               goto out;
+                       }
+                       /* Never use more queues than there are online CPUs. */
+                       opts->nr_queues = min(val, num_online_cpus());
+                       break;
+               case ZLOOP_OPT_QUEUE_DEPTH:
+                       if (match_uint(args, &val))
+                               goto out;
+                       if (val == 0) {
+                               pr_err("Invalid queue depth\n");
+                               goto out;
+                       }
+                       opts->queue_depth = val;
+                       break;
+               case ZLOOP_OPT_BUFFERED_IO:
+                       opts->buffered_io = true;
+                       break;
+               case ZLOOP_OPT_ERR:
+               default:
+                       pr_warn("unknown parameter or missing value '%s'\n",
+                               opt);
+                       goto out;
+               }
+       }
+
+       /* Cross-field checks; ret still holds -EINVAL at this point. */
+       if (opts->capacity <= opts->zone_size) {
+               pr_err("Invalid capacity\n");
+               goto out;
+       }
+       if (opts->zone_capacity > opts->zone_size) {
+               pr_err("Invalid zone capacity\n");
+               goto out;
+       }
+
+       ret = 0;
+out:
+       kfree(options);
+       return ret;
+}
+
+/* Codes of the operations accepted by the control device. */
+enum {
+       ZLOOP_CTL_ADD,
+       ZLOOP_CTL_REMOVE,
+};
+
+/*
+ * Control operations: each entry pairs an operation code with the keyword
+ * that selects it. The last entry, with a NULL name, marks the end of the
+ * table.
+ */
+static struct zloop_ctl_op {
+       int             code;
+       const char      *name;
+} zloop_ctl_ops[] = {
+       { .code = ZLOOP_CTL_ADD,        .name = "add" },
+       { .code = ZLOOP_CTL_REMOVE,     .name = "remove" },
+       { .code = -1,                   .name = NULL },
+};
+
+/*
+ * Write handler of the control device. The data written starts with the
+ * keyword of one of the operations of zloop_ctl_ops[]; anything after the
+ * keyword is handed to zloop_parse_options() as the option string.
+ *
+ * Writes larger than PAGE_SIZE are rejected with -ENOMEM. Returns @count on
+ * success, or the negative error code of the step that failed.
+ */
+static ssize_t zloop_ctl_write(struct file *file, const char __user *ubuf,
+                              size_t count, loff_t *pos)
+{
+       struct zloop_options opts = { };
+       struct zloop_ctl_op *op;
+       const char *buf, *opts_buf = NULL;
+       size_t name_len;
+       int ret;
+
+       if (count > PAGE_SIZE)
+               return -ENOMEM;
+
+       /* NUL-terminated kernel copy of the user data. */
+       buf = memdup_user_nul(ubuf, count);
+       if (IS_ERR(buf))
+               return PTR_ERR(buf);
+
+       /* Look for the operation whose keyword starts the buffer. */
+       for (op = zloop_ctl_ops; op->name; op++) {
+               if (strncmp(buf, op->name, strlen(op->name)) == 0)
+                       break;
+       }
+       if (!op->name) {
+               /* Reached the end-of-table entry: nothing matched. */
+               pr_err("Invalid operation\n");
+               ret = -EINVAL;
+               goto out;
+       }
+
+       /* Whatever follows the keyword, if anything, is the option string. */
+       name_len = strlen(op->name);
+       if (count > name_len)
+               opts_buf = buf + name_len;
+
+       ret = zloop_parse_options(&opts, opts_buf);
+       if (ret) {
+               pr_err("Failed to parse options\n");
+               goto out;
+       }
+
+       if (op->code == ZLOOP_CTL_ADD) {
+               ret = zloop_ctl_add(&opts);
+       } else if (op->code == ZLOOP_CTL_REMOVE) {
+               ret = zloop_ctl_remove(&opts);
+       } else {
+               pr_err("Invalid operation\n");
+               ret = -EINVAL;
+       }
+
+out:
+       /* base_dir may have been allocated by the option parser. */
+       kfree(opts.base_dir);
+       kfree(buf);
+       return ret ? ret : count;
+}
+
+/*
+ * seq_file show callback of the control device. Prints one line per control
+ * operation: the operation keyword followed by the option patterns listed
+ * for it.
+ */
+static int zloop_ctl_show(struct seq_file *seq_file, void *private)
+{
+       int i;
+
+       /* "add" line: all patterns of the option table, comma separated. */
+       seq_printf(seq_file, "%s ", zloop_ctl_ops[0].name);
+       for (i = 0; i < ARRAY_SIZE(zloop_opt_tokens); i++) {
+               const char *pattern = zloop_opt_tokens[i].pattern;
+
+               /* A NULL pattern marks the end of the option table. */
+               if (!pattern)
+                       break;
+               if (i > 0)
+                       seq_putc(seq_file, ',');
+               seq_puts(seq_file, pattern);
+       }
+       seq_putc(seq_file, '\n');
+
+       /* "remove" line: listed with the id option only. */
+       seq_puts(seq_file, zloop_ctl_ops[1].name);
+       seq_puts(seq_file, " id=%d\n");
+
+       return 0;
+}
+
+/*
+ * Open handler of the control device: hand the file to single_open() with
+ * zloop_ctl_show() as the show callback and no private data.
+ */
+static int zloop_ctl_open(struct inode *inode, struct file *file)
+{
+       /*
+        * NOTE(review): presumably cleared because single_open() installs its
+        * own private data on the file - confirm.
+        */
+       file->private_data = NULL;
+       return single_open(file, zloop_ctl_show, NULL);
+}
+
+/*
+ * Release handler of the control device: counterpart of zloop_ctl_open(),
+ * passes the file to single_release() and returns its result.
+ */
+static int zloop_ctl_release(struct inode *inode, struct file *file)
+{
+       return single_release(inode, file);
+}
+
+/*
+ * File operations of the control device: reads go through seq_file and
+ * zloop_ctl_show(), writes carry the control operations.
+ */
+static const struct file_operations zloop_ctl_fops = {
+       .owner          = THIS_MODULE,
+       .open           = zloop_ctl_open,
+       .read           = seq_read,
+       .write          = zloop_ctl_write,
+       .release        = zloop_ctl_release,
+};
+
+/* The "zloop-control" misc device, registered with a dynamic minor. */
+static struct miscdevice zloop_misc = {
+       .name           = "zloop-control",
+       .minor          = MISC_DYNAMIC_MINOR,
+       .fops           = &zloop_ctl_fops,
+};
+
+/*
+ * Module init: register the control misc device. Returns 0 on success or
+ * the error returned by misc_register().
+ */
+static int __init zloop_init(void)
+{
+       int err = misc_register(&zloop_misc);
+
+       if (!err) {
+               pr_info("Module loaded\n");
+               return 0;
+       }
+
+       pr_err("Failed to register misc device: %d\n", err);
+       return err;
+}
+
+/*
+ * Module exit: unregister the control misc device, then destroy the IDR used
+ * to allocate device IDs.
+ */
+static void __exit zloop_exit(void)
+{
+       misc_deregister(&zloop_misc);
+       idr_destroy(&zloop_index_idr);
+}
+
+/* Module entry and exit points. */
+module_init(zloop_init);
+module_exit(zloop_exit);
+
+/* Module metadata. */
+MODULE_DESCRIPTION("Zoned loopback device");
+MODULE_LICENSE("GPL");