adev->gfx.ngg.gds_reserve_size = ALIGN(5 * 4, 0x40);
        adev->gds.mem.total_size -= adev->gfx.ngg.gds_reserve_size;
        adev->gds.mem.gfx_partition_size -= adev->gfx.ngg.gds_reserve_size;
-       adev->gfx.ngg.gds_reserve_addr = SOC15_REG_OFFSET(GC, 0, mmGDS_VMID0_BASE);
-       adev->gfx.ngg.gds_reserve_addr += adev->gds.mem.gfx_partition_size;
+       adev->gfx.ngg.gds_reserve_addr = RREG32_SOC15(GC, 0, mmGDS_VMID0_BASE);
+       adev->gfx.ngg.gds_reserve_addr += RREG32_SOC15(GC, 0, mmGDS_VMID0_SIZE);
 
        /* Primitive Buffer */
        r = gfx_v9_0_ngg_create_buf(adev, &adev->gfx.ngg.buf[NGG_PRIM],
 
        amdgpu_ring_write(ring, PACKET3(PACKET3_DMA_DATA, 5));
        amdgpu_ring_write(ring, (PACKET3_DMA_DATA_CP_SYNC |
+                               PACKET3_DMA_DATA_DST_SEL(1) |
                                PACKET3_DMA_DATA_SRC_SEL(2)));
        amdgpu_ring_write(ring, 0);
        amdgpu_ring_write(ring, 0);
        amdgpu_ring_write(ring, adev->gfx.ngg.gds_reserve_addr);
        amdgpu_ring_write(ring, 0);
-       amdgpu_ring_write(ring, adev->gfx.ngg.gds_reserve_size);
-
+       amdgpu_ring_write(ring, PACKET3_DMA_DATA_CMD_RAW_WAIT |
+                               adev->gfx.ngg.gds_reserve_size);
 
        gfx_v9_0_write_data_to_reg(ring, 0, false,
                                   SOC15_REG_OFFSET(GC, 0, mmGDS_VMID0_SIZE), 0);