/*
  * Writeback
  */
-#define AMDGPU_MAX_WB 128      /* Reserve at most 128 WB slots for amdgpu-owned rings. */
+#define AMDGPU_MAX_WB 256      /* Reserve at most 256 WB slots for amdgpu-owned rings. */
 
 struct amdgpu_wb {
        struct amdgpu_bo        *wb_obj;
 
 
        spin_lock_init(&kiq->ring_lock);
 
-       r = amdgpu_device_wb_get(adev, &kiq->reg_val_offs);
-       if (r)
-               return r;
-
        ring->adev = NULL;
        ring->ring_obj = NULL;
        ring->use_doorbell = true;
 
 void amdgpu_gfx_kiq_free_ring(struct amdgpu_ring *ring)
 {
-       amdgpu_device_wb_free(ring->adev, ring->adev->gfx.kiq.reg_val_offs);
        amdgpu_ring_fini(ring);
 }
 
 {
        signed long r, cnt = 0;
        unsigned long flags;
-       uint32_t seq;
+       uint32_t seq, reg_val_offs = 0, value = 0;
        struct amdgpu_kiq *kiq = &adev->gfx.kiq;
        struct amdgpu_ring *ring = &kiq->ring;
 
        BUG_ON(!ring->funcs->emit_rreg);
 
        spin_lock_irqsave(&kiq->ring_lock, flags);
+       if (amdgpu_device_wb_get(adev, &reg_val_offs)) {
+               spin_unlock_irqrestore(&kiq->ring_lock, flags);
+               pr_err("critical bug! too many kiq readers\n");
+               goto failed_kiq_read;
+       }
        amdgpu_ring_alloc(ring, 32);
-       amdgpu_ring_emit_rreg(ring, reg);
+       amdgpu_ring_emit_rreg(ring, reg, reg_val_offs);
        amdgpu_fence_emit_polling(ring, &seq);
        amdgpu_ring_commit(ring);
        spin_unlock_irqrestore(&kiq->ring_lock, flags);
        if (cnt > MAX_KIQ_REG_TRY)
                goto failed_kiq_read;
 
-       return adev->wb.wb[kiq->reg_val_offs];
+       mb();
+       value = adev->wb.wb[reg_val_offs];
+       amdgpu_device_wb_free(adev, reg_val_offs);
+       return value;
 
 failed_kiq_read:
        pr_err("failed to read reg:%x\n", reg);
 
        struct amdgpu_ring      ring;
        struct amdgpu_irq_src   irq;
        const struct kiq_pm4_funcs *pmf;
-       uint32_t                        reg_val_offs;
 };
 
 /*
 
        void (*end_use)(struct amdgpu_ring *ring);
        void (*emit_switch_buffer) (struct amdgpu_ring *ring);
        void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t flags);
-       void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg);
+       void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg,
+                         uint32_t reg_val_offs);
        void (*emit_wreg)(struct amdgpu_ring *ring, uint32_t reg, uint32_t val);
        void (*emit_reg_wait)(struct amdgpu_ring *ring, uint32_t reg,
                              uint32_t val, uint32_t mask);
 #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))
 #define amdgpu_ring_emit_switch_buffer(r) (r)->funcs->emit_switch_buffer((r))
 #define amdgpu_ring_emit_cntxcntl(r, d) (r)->funcs->emit_cntxcntl((r), (d))
-#define amdgpu_ring_emit_rreg(r, d) (r)->funcs->emit_rreg((r), (d))
+#define amdgpu_ring_emit_rreg(r, d, o) (r)->funcs->emit_rreg((r), (d), (o))
 #define amdgpu_ring_emit_wreg(r, d, v) (r)->funcs->emit_wreg((r), (d), (v))
 #define amdgpu_ring_emit_reg_wait(r, d, v, m) (r)->funcs->emit_reg_wait((r), (d), (v), (m))
 #define amdgpu_ring_emit_reg_write_reg_wait(r, d0, d1, v, m) (r)->funcs->emit_reg_write_reg_wait((r), (d0), (d1), (v), (m))
 
        amdgpu_ring_write(ring, FRAME_CMD(start ? 0 : 1)); /* frame_end */
 }
 
-static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
+static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
+                                    uint32_t reg_val_offs)
 {
        struct amdgpu_device *adev = ring->adev;
-       struct amdgpu_kiq *kiq = &adev->gfx.kiq;
 
        amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
        amdgpu_ring_write(ring, 0 |     /* src: register*/
        amdgpu_ring_write(ring, reg);
        amdgpu_ring_write(ring, 0);
        amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
-                               kiq->reg_val_offs * 4));
+                               reg_val_offs * 4));
        amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
-                               kiq->reg_val_offs * 4));
+                               reg_val_offs * 4));
 }
 
 static void gfx_v10_0_ring_emit_wreg(struct amdgpu_ring *ring, uint32_t reg,
 
                ring->ring[offset] = (ring->ring_size >> 2) - offset + cur;
 }
 
-static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
+static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
+                                   uint32_t reg_val_offs)
 {
        struct amdgpu_device *adev = ring->adev;
-       struct amdgpu_kiq *kiq = &adev->gfx.kiq;
 
        amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
        amdgpu_ring_write(ring, 0 |     /* src: register*/
        amdgpu_ring_write(ring, reg);
        amdgpu_ring_write(ring, 0);
        amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
-                               kiq->reg_val_offs * 4));
+                               reg_val_offs * 4));
        amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
-                               kiq->reg_val_offs * 4));
+                               reg_val_offs * 4));
 }
 
 static void gfx_v8_0_ring_emit_wreg(struct amdgpu_ring *ring, uint32_t reg,
 
 {
        signed long r, cnt = 0;
        unsigned long flags;
-       uint32_t seq;
+       uint32_t seq, reg_val_offs = 0;
+       uint64_t value = 0;
        struct amdgpu_kiq *kiq = &adev->gfx.kiq;
        struct amdgpu_ring *ring = &kiq->ring;
 
        BUG_ON(!ring->funcs->emit_rreg);
 
        spin_lock_irqsave(&kiq->ring_lock, flags);
+       if (amdgpu_device_wb_get(adev, &reg_val_offs)) {
+               spin_unlock_irqrestore(&kiq->ring_lock, flags);
+               pr_err("critical bug! too many kiq readers\n");
+               goto failed_kiq_read;
+       }
        amdgpu_ring_alloc(ring, 32);
        amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
        amdgpu_ring_write(ring, 9 |     /* src: register*/
        amdgpu_ring_write(ring, 0);
        amdgpu_ring_write(ring, 0);
        amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
-                               kiq->reg_val_offs * 4));
+                               reg_val_offs * 4));
        amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
-                               kiq->reg_val_offs * 4));
+                               reg_val_offs * 4));
        amdgpu_fence_emit_polling(ring, &seq);
        amdgpu_ring_commit(ring);
        spin_unlock_irqrestore(&kiq->ring_lock, flags);
        if (cnt > MAX_KIQ_REG_TRY)
                goto failed_kiq_read;
 
-       return (uint64_t)adev->wb.wb[kiq->reg_val_offs] |
-               (uint64_t)adev->wb.wb[kiq->reg_val_offs + 1 ] << 32ULL;
+       mb();
+       value = (uint64_t)adev->wb.wb[reg_val_offs] |
+               (uint64_t)adev->wb.wb[reg_val_offs + 1 ] << 32ULL;
+       amdgpu_device_wb_free(adev, reg_val_offs);
+       return value;
 
 failed_kiq_read:
        pr_err("failed to read gpu clock\n");
                ring->ring[offset] = (ring->ring_size>>2) - offset + cur;
 }
 
-static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
+static void gfx_v9_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg,
+                                   uint32_t reg_val_offs)
 {
        struct amdgpu_device *adev = ring->adev;
-       struct amdgpu_kiq *kiq = &adev->gfx.kiq;
 
        amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
        amdgpu_ring_write(ring, 0 |     /* src: register*/
        amdgpu_ring_write(ring, reg);
        amdgpu_ring_write(ring, 0);
        amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
-                               kiq->reg_val_offs * 4));
+                               reg_val_offs * 4));
        amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
-                               kiq->reg_val_offs * 4));
+                               reg_val_offs * 4));
 }
 
 static void gfx_v9_0_ring_emit_wreg(struct amdgpu_ring *ring, uint32_t reg,