write_same_max_bytes is 0, write same is not supported
                by the device.
 
+What:          /sys/block/<disk>/queue/write_zeroes_max_bytes
+Date:          November 2016
+Contact:       Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
+Description:
+               Devices that support the write zeroes operation can zero
+               out a range of contiguous blocks with a single request
+               that carries no data payload, which can be used to
+               optimize writing zeroes to such devices.
+               write_zeroes_max_bytes indicates how many bytes can be
+               written in a single write zeroes command. If
+               write_zeroes_max_bytes is 0, write zeroes is not
+               supported by the device.
+
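As an aside (not part of the patch): the attribute reads back a single
decimal byte count, so a userspace check can be as small as the sketch
below; "sda" is a placeholder disk name.

	#include <stdio.h>

	int main(void)
	{
		unsigned long long max_bytes;
		FILE *f = fopen("/sys/block/sda/queue/write_zeroes_max_bytes", "r");

		if (!f || fscanf(f, "%llu", &max_bytes) != 1)
			return 1;
		fclose(f);

		printf("write zeroes %s (up to %llu bytes per command)\n",
		       max_bytes ? "supported" : "not supported", max_bytes);
		return 0;
	}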
 What:          /sys/block/<disk>/queue/zoned
 Date:          September 2016
 Contact:       Damien Le Moal <damien.lemoal@hgst.com>
 
        switch (bio_op(bio)) {
        case REQ_OP_DISCARD:
        case REQ_OP_SECURE_ERASE:
+       case REQ_OP_WRITE_ZEROES:
                break;
        case REQ_OP_WRITE_SAME:
                bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0];
 
                if (!bdev_is_zoned(bio->bi_bdev))
                        goto not_supported;
                break;
+       case REQ_OP_WRITE_ZEROES:
+               if (!bdev_write_zeroes_sectors(bio->bi_bdev))
+                       goto not_supported;
+               break;
        default:
                break;
        }
 
 }
 EXPORT_SYMBOL(blkdev_issue_write_same);
 
+/**
+ * __blkdev_issue_write_zeroes - generate a number of WRITE ZEROES bios
+ * @bdev:      blockdev to issue write zeroes against
+ * @sector:    start sector
+ * @nr_sects:  number of sectors to write
+ * @gfp_mask:  memory allocation flags (for bio_alloc)
+ * @biop:      pointer to anchor bio
+ *
+ * Description:
+ *  Generate and issue a number of REQ_OP_WRITE_ZEROES bios to zero the
+ *  given block range; the bios carry no data payload.
+ */
+static int __blkdev_issue_write_zeroes(struct block_device *bdev,
+               sector_t sector, sector_t nr_sects, gfp_t gfp_mask,
+               struct bio **biop)
+{
+       struct bio *bio = *biop;
+       unsigned int max_write_zeroes_sectors;
+       struct request_queue *q = bdev_get_queue(bdev);
+
+       if (!q)
+               return -ENXIO;
+
+       /* max_write_zeroes_sectors << 9 must not overflow bi_size */
+       max_write_zeroes_sectors = bdev_write_zeroes_sectors(bdev);
+
+       if (max_write_zeroes_sectors == 0)
+               return -EOPNOTSUPP;
+
+       while (nr_sects) {
+               bio = next_bio(bio, 0, gfp_mask);
+               bio->bi_iter.bi_sector = sector;
+               bio->bi_bdev = bdev;
+               bio_set_op_attrs(bio, REQ_OP_WRITE_ZEROES, 0);
+
+               if (nr_sects > max_write_zeroes_sectors) {
+                       bio->bi_iter.bi_size = max_write_zeroes_sectors << 9;
+                       nr_sects -= max_write_zeroes_sectors;
+                       sector += max_write_zeroes_sectors;
+               } else {
+                       bio->bi_iter.bi_size = nr_sects << 9;
+                       nr_sects = 0;
+               }
+               cond_resched();
+       }
+
+       *biop = bio;
+       return 0;
+}
+
 /**
 * __blkdev_issue_zeroout - generate a number of zero-filled write bios
  * @bdev:      blockdev to issue
                        goto out;
        }
 
+       ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp_mask,
+                       biop);
+       if (ret == 0 || (ret && ret != -EOPNOTSUPP))
+               goto out;
+
        ret = __blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask,
                        ZERO_PAGE(0), biop);
        if (ret == 0 || (ret && ret != -EOPNOTSUPP))
  *  the discard request fail, if the discard flag is not set, or if
  *  discard_zeroes_data is not supported, this function will resort to
  *  zeroing the blocks manually, thus provisioning (allocating,
- *  anchoring) them. If the block device supports the WRITE SAME command
- *  blkdev_issue_zeroout() will use it to optimize the process of
+ *  anchoring) them. If the block device supports the WRITE ZEROES or WRITE
+ *  SAME command, blkdev_issue_zeroout() will use it to optimize the process of
  *  clearing the block range. Otherwise the zeroing will be performed
  *  using regular WRITE calls.
  */
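As a hedged usage sketch (not in the patch itself): a caller that wants a
range cleared simply calls blkdev_issue_zeroout() and lets the fallback
chain above pick the cheapest mechanism. clear_metadata_area() and its
sector values are hypothetical, and this assumes the prototype in this
tree, whose trailing bool requests a discard-based attempt first.

	#include <linux/blkdev.h>

	/* Hypothetical helper: zero a fixed region of the device. */
	static int clear_metadata_area(struct block_device *bdev)
	{
		sector_t start = 2048;	/* example start, in 512-byte sectors */
		sector_t nr = 1024;	/* example length */

		/* false: skip discard; tries WRITE ZEROES, then WRITE SAME,
		 * then plain zero-filled WRITEs. */
		return blkdev_issue_zeroout(bdev, start, nr, GFP_KERNEL, false);
	}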
 
        case REQ_OP_SECURE_ERASE:
                split = blk_bio_discard_split(q, *bio, bs, &nsegs);
                break;
+       case REQ_OP_WRITE_ZEROES:
+               split = NULL;
+               nsegs = (*bio)->bi_phys_segments;
+               break;
        case REQ_OP_WRITE_SAME:
                split = blk_bio_write_same_split(q, *bio, bs, &nsegs);
                break;
         * This should probably be returning 0, but blk_add_request_payload()
         * (Christoph!!!!)
         */
-       if (bio_op(bio) == REQ_OP_DISCARD || bio_op(bio) == REQ_OP_SECURE_ERASE)
-               return 1;
-
-       if (bio_op(bio) == REQ_OP_WRITE_SAME)
+       switch (bio_op(bio)) {
+       case REQ_OP_DISCARD:
+       case REQ_OP_SECURE_ERASE:
+       case REQ_OP_WRITE_SAME:
+       case REQ_OP_WRITE_ZEROES:
                return 1;
+       default:
+               break;
+       }
 
        fbio = bio;
        cluster = blk_queue_cluster(q);
        switch (bio_op(bio)) {
        case REQ_OP_DISCARD:
        case REQ_OP_SECURE_ERASE:
+       case REQ_OP_WRITE_ZEROES:
                /*
                 * This is a hack - drivers should be neither modifying the
                 * biovec, nor relying on bi_vcnt - but because of
 
        lim->max_dev_sectors = 0;
        lim->chunk_sectors = 0;
        lim->max_write_same_sectors = 0;
+       lim->max_write_zeroes_sectors = 0;
        lim->max_discard_sectors = 0;
        lim->max_hw_discard_sectors = 0;
        lim->discard_granularity = 0;
        lim->max_sectors = UINT_MAX;
        lim->max_dev_sectors = UINT_MAX;
        lim->max_write_same_sectors = UINT_MAX;
+       lim->max_write_zeroes_sectors = UINT_MAX;
 }
 EXPORT_SYMBOL(blk_set_stacking_limits);
 
 }
 EXPORT_SYMBOL(blk_queue_max_write_same_sectors);
 
+/**
+ * blk_queue_max_write_zeroes_sectors - set max sectors for a single
+ *                                      write zeroes
+ * @q:  the request queue for the device
+ * @max_write_zeroes_sectors: maximum number of sectors to write per command
+ **/
+void blk_queue_max_write_zeroes_sectors(struct request_queue *q,
+               unsigned int max_write_zeroes_sectors)
+{
+       q->limits.max_write_zeroes_sectors = max_write_zeroes_sectors;
+}
+EXPORT_SYMBOL(blk_queue_max_write_zeroes_sectors);
+
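For illustration (hypothetical driver code, not part of the patch): a
driver that can zero up to 8 MB per command would advertise the limit
while setting up its queue, expressed in sectors rather than bytes.

	#include <linux/blkdev.h>

	static void my_driver_init_queue_limits(struct request_queue *q)
	{
		/* 8 MB expressed as 512-byte sectors: 8 * 1024 * 1024 / 512 */
		blk_queue_max_write_zeroes_sectors(q, 16384);
	}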
 /**
  * blk_queue_max_segments - set max hw segments for a request for this queue
  * @q:  the request queue for the device
        t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors);
        t->max_write_same_sectors = min(t->max_write_same_sectors,
                                        b->max_write_same_sectors);
+       t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors,
+                                       b->max_write_zeroes_sectors);
        t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn);
 
        t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask,
 
                (unsigned long long)q->limits.max_write_same_sectors << 9);
 }
 
+static ssize_t queue_write_zeroes_max_show(struct request_queue *q, char *page)
+{
+       return sprintf(page, "%llu\n",
+               (unsigned long long)q->limits.max_write_zeroes_sectors << 9);
+}
 
 static ssize_t
 queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
        .show = queue_write_same_max_show,
 };
 
+static struct queue_sysfs_entry queue_write_zeroes_max_entry = {
+       .attr = {.name = "write_zeroes_max_bytes", .mode = S_IRUGO },
+       .show = queue_write_zeroes_max_show,
+};
+
 static struct queue_sysfs_entry queue_nonrot_entry = {
        .attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR },
        .show = queue_show_nonrot,
        &queue_discard_max_hw_entry.attr,
        &queue_discard_zeroes_data_entry.attr,
        &queue_write_same_max_entry.attr,
+       &queue_write_zeroes_max_entry.attr,
        &queue_nonrot_entry.attr,
        &queue_zoned_entry.attr,
        &queue_nomerges_entry.attr,
 
        const int op = bio_op(bio);
 
        /*
-        * If not a WRITE (or a discard), do nothing
+        * If not a WRITE (or a discard or write zeroes), do nothing
         */
-       if (!(op == REQ_OP_WRITE || op == REQ_OP_DISCARD))
+       if (!(op == REQ_OP_WRITE || op == REQ_OP_DISCARD ||
+                               op == REQ_OP_WRITE_ZEROES))
                return false;
 
        /*
 
        if (bio &&
            bio->bi_iter.bi_size &&
            bio_op(bio) != REQ_OP_DISCARD &&
-           bio_op(bio) != REQ_OP_SECURE_ERASE)
+           bio_op(bio) != REQ_OP_SECURE_ERASE &&
+           bio_op(bio) != REQ_OP_WRITE_ZEROES)
                return true;
 
        return false;
 {
        return bio_op(bio) == REQ_OP_DISCARD ||
               bio_op(bio) == REQ_OP_SECURE_ERASE ||
-              bio_op(bio) == REQ_OP_WRITE_SAME;
+              bio_op(bio) == REQ_OP_WRITE_SAME ||
+              bio_op(bio) == REQ_OP_WRITE_ZEROES;
 }
 
 static inline bool bio_mergeable(struct bio *bio)
        struct bvec_iter iter;
 
        /*
-        * We special case discard/write same, because they interpret bi_size
-        * differently:
+        * We special case discard/write same/write zeroes, because they
+        * interpret bi_size differently:
         */
 
-       if (bio_op(bio) == REQ_OP_DISCARD)
-               return 1;
-
-       if (bio_op(bio) == REQ_OP_SECURE_ERASE)
-               return 1;
-
-       if (bio_op(bio) == REQ_OP_WRITE_SAME)
+       switch (bio_op(bio)) {
+       case REQ_OP_DISCARD:
+       case REQ_OP_SECURE_ERASE:
+       case REQ_OP_WRITE_SAME:
+       case REQ_OP_WRITE_ZEROES:
                return 1;
+       default:
+               break;
+       }
 
        bio_for_each_segment(bv, bio, iter)
                segs++;
 
        REQ_OP_ZONE_RESET       = 6,
        /* write the same sector many times */
        REQ_OP_WRITE_SAME       = 7,
+       /* write zeroes to a range of sectors */
+       REQ_OP_WRITE_ZEROES     = 8,
 
        REQ_OP_LAST,
 };
 
        unsigned int            max_discard_sectors;
        unsigned int            max_hw_discard_sectors;
        unsigned int            max_write_same_sectors;
+       unsigned int            max_write_zeroes_sectors;
        unsigned int            discard_granularity;
        unsigned int            discard_alignment;
 
        if (req_op(rq) == REQ_OP_FLUSH)
                return false;
 
+       if (req_op(rq) == REQ_OP_WRITE_ZEROES)
+               return false;
+
        if (rq->cmd_flags & REQ_NOMERGE_FLAGS)
                return false;
        if (rq->rq_flags & RQF_NOMERGE_FLAGS)
        if (unlikely(op == REQ_OP_WRITE_SAME))
                return q->limits.max_write_same_sectors;
 
+       if (unlikely(op == REQ_OP_WRITE_ZEROES))
+               return q->limits.max_write_zeroes_sectors;
+
        return q->limits.max_sectors;
 }
 
                unsigned int max_discard_sectors);
 extern void blk_queue_max_write_same_sectors(struct request_queue *q,
                unsigned int max_write_same_sectors);
+extern void blk_queue_max_write_zeroes_sectors(struct request_queue *q,
+               unsigned int max_write_zeroes_sectors);
 extern void blk_queue_logical_block_size(struct request_queue *, unsigned short);
 extern void blk_queue_physical_block_size(struct request_queue *, unsigned int);
 extern void blk_queue_alignment_offset(struct request_queue *q,
        return 0;
 }
 
+static inline unsigned int bdev_write_zeroes_sectors(struct block_device *bdev)
+{
+       struct request_queue *q = bdev_get_queue(bdev);
+
+       if (q)
+               return q->limits.max_write_zeroes_sectors;
+
+       return 0;
+}
+
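Illustrative only: submitters can gate REQ_OP_WRITE_ZEROES on this helper,
mirroring the -EOPNOTSUPP check in __blkdev_issue_write_zeroes();
my_zeroing_supported() is a hypothetical wrapper.

	static inline bool my_zeroing_supported(struct block_device *bdev)
	{
		return bdev_write_zeroes_sectors(bdev) != 0;
	}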
 static inline enum blk_zoned_model bdev_zoned_model(struct block_device *bdev)
 {
        struct request_queue *q = bdev_get_queue(bdev);