struct v3d_bo *bo = to_v3d_bo(job->base.bo[0]);
        struct v3d_bo *indirect = to_v3d_bo(indirect_csd->indirect);
        struct drm_v3d_submit_csd *args = &indirect_csd->job->args;
-       u32 *wg_counts;
+       struct v3d_dev *v3d = job->base.v3d;
+       u32 num_batches, *wg_counts;
 
        v3d_get_bo_vaddr(bo);
        v3d_get_bo_vaddr(indirect);
        args->cfg[0] = wg_counts[0] << V3D_CSD_CFG012_WG_COUNT_SHIFT;
        args->cfg[1] = wg_counts[1] << V3D_CSD_CFG012_WG_COUNT_SHIFT;
        args->cfg[2] = wg_counts[2] << V3D_CSD_CFG012_WG_COUNT_SHIFT;
-       args->cfg[4] = DIV_ROUND_UP(indirect_csd->wg_size, 16) *
-                      (wg_counts[0] * wg_counts[1] * wg_counts[2]) - 1;
+
+       num_batches = DIV_ROUND_UP(indirect_csd->wg_size, 16) *
+                     (wg_counts[0] * wg_counts[1] * wg_counts[2]);
+
+       /* V3D 7.1.6 and later don't subtract 1 from the number of batches */
+       if (v3d->ver < 71 || (v3d->ver == 71 && v3d->rev < 6))
+               args->cfg[4] = num_batches - 1;
+       else
+               args->cfg[4] = num_batches;
+
+       WARN_ON(args->cfg[4] == ~0);
 
        for (int i = 0; i < 3; i++) {
                /* 0xffffffff indicates that the uniform rewrite is not needed */