Compute IBs need a cache flush/invalidate before execution too, so add an
emit_mem_sync ring callback for the compute rings (ACQUIRE_MEM on gfx7+,
SURFACE_SYNC on gfx6) and account for it in emit_frame_size.
v2: split out version bump
v3: squash in emit frame count fixes
Signed-off-by: Marek Olšák <marek.olsak@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
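
The new hook is invoked from the IB scheduling path. The caller side is not
part of this excerpt; roughly, in amdgpu_ib_schedule() before the IBs are
emitted (the exact placement is an assumption here):

	/* Flush/invalidate shader caches before the IBs run, if the
	 * ring provides a mem_sync implementation. */
	if (job && ring->funcs->emit_mem_sync)
		ring->funcs->emit_mem_sync(ring);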
 #              define PACKET3_DMA_DATA_CMD_SAIC    (1 << 28)
 #              define PACKET3_DMA_DATA_CMD_DAIC    (1 << 29)
 #              define PACKET3_DMA_DATA_CMD_RAW_WAIT  (1 << 30)
-#define        PACKET3_AQUIRE_MEM                              0x58
+#define        PACKET3_ACQUIRE_MEM                             0x58
 #define        PACKET3_REWIND                                  0x59
 #define        PACKET3_LOAD_UCONFIG_REG                        0x5E
 #define        PACKET3_LOAD_SH_REG                             0x5F
 
                SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
                SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
                2 + /* gfx_v10_0_ring_emit_vm_flush */
-               8 + 8 + 8, /* gfx_v10_0_ring_emit_fence x3 for user fence, vm fence */
+               8 + 8 + 8 + /* gfx_v10_0_ring_emit_fence x3 for user fence, vm fence */
+               8, /* gfx_v10_0_emit_mem_sync */
        .emit_ib_size = 7, /* gfx_v10_0_ring_emit_ib_compute */
        .emit_ib = gfx_v10_0_ring_emit_ib_compute,
        .emit_fence = gfx_v10_0_ring_emit_fence,
        .emit_wreg = gfx_v10_0_ring_emit_wreg,
        .emit_reg_wait = gfx_v10_0_ring_emit_reg_wait,
        .emit_reg_write_reg_wait = gfx_v10_0_ring_emit_reg_write_reg_wait,
+       .emit_mem_sync = gfx_v10_0_emit_mem_sync,
 };
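
The gfx10 callback referenced above is also outside this excerpt. A sketch
matching the 8-dword budget, assuming the PACKET3_ACQUIRE_MEM_GCR_CNTL_*
helpers from nvd.h (on gfx10 the flush/invalidate controls move out of
CP_COHER_CNTL into a trailing GCR_CNTL dword, hence 8 dwords instead of 7):

static void gfx_v10_0_emit_mem_sync(struct amdgpu_ring *ring)
{
	const unsigned int gcr_cntl =
			PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_INV(1) |
			PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_WB(1) |
			PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_INV(1) |
			PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_WB(1) |
			PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_INV(1) |
			PACKET3_ACQUIRE_MEM_GCR_CNTL_GLV_INV(1) |
			PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_INV(1) |
			PACKET3_ACQUIRE_MEM_GCR_CNTL_GLI_INV(1);

	/* ACQUIRE_MEM over the full address range: write back and
	 * invalidate all GPU caches via GCR_CNTL. */
	amdgpu_ring_write(ring, PACKET3(PACKET3_ACQUIRE_MEM, 6));
	amdgpu_ring_write(ring, 0);		/* CP_COHER_CNTL */
	amdgpu_ring_write(ring, 0xffffffff);	/* CP_COHER_SIZE */
	amdgpu_ring_write(ring, 0xffffff);	/* CP_COHER_SIZE_HI */
	amdgpu_ring_write(ring, 0);		/* CP_COHER_BASE */
	amdgpu_ring_write(ring, 0);		/* CP_COHER_BASE_HI */
	amdgpu_ring_write(ring, 0x0000000A);	/* POLL_INTERVAL */
	amdgpu_ring_write(ring, gcr_cntl);	/* GCR_CNTL */
}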
 
 static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_kiq = {
 
                5 + 5 + /* hdp flush / invalidate */
                7 + /* gfx_v6_0_ring_emit_pipeline_sync */
                SI_FLUSH_GPU_TLB_NUM_WREG * 5 + 7 + /* gfx_v6_0_ring_emit_vm_flush */
-               14 + 14 + 14, /* gfx_v6_0_ring_emit_fence x3 for user fence, vm fence */
+               14 + 14 + 14 + /* gfx_v6_0_ring_emit_fence x3 for user fence, vm fence */
+               5, /* SURFACE_SYNC */
        .emit_ib_size = 6, /* gfx_v6_0_ring_emit_ib */
        .emit_ib = gfx_v6_0_ring_emit_ib,
        .emit_fence = gfx_v6_0_ring_emit_fence,
        .test_ib = gfx_v6_0_ring_test_ib,
        .insert_nop = amdgpu_ring_insert_nop,
        .emit_wreg = gfx_v6_0_ring_emit_wreg,
+       .emit_mem_sync = gfx_v6_0_emit_mem_sync,
 };
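
gfx_v6_0_emit_mem_sync is likewise elided here. A sketch matching the
5-dword SURFACE_SYNC budget above: SI predates ACQUIRE_MEM, so the older
SURFACE_SYNC packet is used, with the *_ACTION_ENA bits from sid.h:

static void gfx_v6_0_emit_mem_sync(struct amdgpu_ring *ring)
{
	amdgpu_ring_write(ring, PACKET3(PACKET3_SURFACE_SYNC, 3));
	amdgpu_ring_write(ring, PACKET3_TCL1_ACTION_ENA |
			  PACKET3_TC_ACTION_ENA |
			  PACKET3_SH_KCACHE_ACTION_ENA |
			  PACKET3_SH_ICACHE_ACTION_ENA);  /* CP_COHER_CNTL */
	amdgpu_ring_write(ring, 0xffffffff);	/* CP_COHER_SIZE */
	amdgpu_ring_write(ring, 0);		/* CP_COHER_BASE */
	amdgpu_ring_write(ring, 0x0000000A);	/* POLL_INTERVAL */
}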
 
 static void gfx_v6_0_set_ring_funcs(struct amdgpu_device *adev)
 
        amdgpu_ring_write(ring, 0x0000000A); /* poll interval */
 }
 
+static void gfx_v7_0_emit_mem_sync_compute(struct amdgpu_ring *ring)
+{
+       amdgpu_ring_write(ring, PACKET3(PACKET3_ACQUIRE_MEM, 5));
+       amdgpu_ring_write(ring, PACKET3_TCL1_ACTION_ENA |
+                         PACKET3_TC_ACTION_ENA |
+                         PACKET3_SH_KCACHE_ACTION_ENA |
+                         PACKET3_SH_ICACHE_ACTION_ENA);  /* CP_COHER_CNTL */
+       amdgpu_ring_write(ring, 0xffffffff);    /* CP_COHER_SIZE */
+       amdgpu_ring_write(ring, 0xff);          /* CP_COHER_SIZE_HI */
+       amdgpu_ring_write(ring, 0);             /* CP_COHER_BASE */
+       amdgpu_ring_write(ring, 0);             /* CP_COHER_BASE_HI */
+       amdgpu_ring_write(ring, 0x0000000A);    /* poll interval */
+}
+
 static const struct amd_ip_funcs gfx_v7_0_ip_funcs = {
        .name = "gfx_v7_0",
        .early_init = gfx_v7_0_early_init,
                5 + /* hdp invalidate */
                7 + /* gfx_v7_0_ring_emit_pipeline_sync */
                CIK_FLUSH_GPU_TLB_NUM_WREG * 5 + 7 + /* gfx_v7_0_ring_emit_vm_flush */
-               7 + 7 + 7, /* gfx_v7_0_ring_emit_fence_compute x3 for user fence, vm fence */
+               7 + 7 + 7 + /* gfx_v7_0_ring_emit_fence_compute x3 for user fence, vm fence */
+               7, /* gfx_v7_0_emit_mem_sync_compute */
        .emit_ib_size = 7, /* gfx_v7_0_ring_emit_ib_compute */
        .emit_ib = gfx_v7_0_ring_emit_ib_compute,
        .emit_fence = gfx_v7_0_ring_emit_fence_compute,
        .insert_nop = amdgpu_ring_insert_nop,
        .pad_ib = amdgpu_ring_generic_pad_ib,
        .emit_wreg = gfx_v7_0_ring_emit_wreg,
+       .emit_mem_sync = gfx_v7_0_emit_mem_sync_compute,
 };
 
 static void gfx_v7_0_set_ring_funcs(struct amdgpu_device *adev)
 
        amdgpu_ring_write(ring, 0x0000000A); /* poll interval */
 }
 
+static void gfx_v8_0_emit_mem_sync_compute(struct amdgpu_ring *ring)
+{
+       amdgpu_ring_write(ring, PACKET3(PACKET3_ACQUIRE_MEM, 5));
+       amdgpu_ring_write(ring, PACKET3_TCL1_ACTION_ENA |
+                         PACKET3_TC_ACTION_ENA |
+                         PACKET3_SH_KCACHE_ACTION_ENA |
+                         PACKET3_SH_ICACHE_ACTION_ENA |
+                         PACKET3_TC_WB_ACTION_ENA);  /* CP_COHER_CNTL */
+       amdgpu_ring_write(ring, 0xffffffff);    /* CP_COHER_SIZE */
+       amdgpu_ring_write(ring, 0xff);          /* CP_COHER_SIZE_HI */
+       amdgpu_ring_write(ring, 0);             /* CP_COHER_BASE */
+       amdgpu_ring_write(ring, 0);             /* CP_COHER_BASE_HI */
+       amdgpu_ring_write(ring, 0x0000000A);    /* poll interval */
+}
+
 static const struct amd_ip_funcs gfx_v8_0_ip_funcs = {
        .name = "gfx_v8_0",
        .early_init = gfx_v8_0_early_init,
                5 + /* hdp_invalidate */
                7 + /* gfx_v8_0_ring_emit_pipeline_sync */
                VI_FLUSH_GPU_TLB_NUM_WREG * 5 + 7 + /* gfx_v8_0_ring_emit_vm_flush */
-               7 + 7 + 7, /* gfx_v8_0_ring_emit_fence_compute x3 for user fence, vm fence */
+               7 + 7 + 7 + /* gfx_v8_0_ring_emit_fence_compute x3 for user fence, vm fence */
+               7, /* gfx_v8_0_emit_mem_sync_compute */
        .emit_ib_size = 7, /* gfx_v8_0_ring_emit_ib_compute */
        .emit_ib = gfx_v8_0_ring_emit_ib_compute,
        .emit_fence = gfx_v8_0_ring_emit_fence_compute,
        .insert_nop = amdgpu_ring_insert_nop,
        .pad_ib = amdgpu_ring_generic_pad_ib,
        .emit_wreg = gfx_v8_0_ring_emit_wreg,
+       .emit_mem_sync = gfx_v8_0_emit_mem_sync_compute,
 };
 
 static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_kiq = {
 
                SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
                SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
                2 + /* gfx_v9_0_ring_emit_vm_flush */
-               8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
+               8 + 8 + 8 + /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
+               7, /* gfx_v9_0_emit_mem_sync */
        .emit_ib_size = 7, /* gfx_v9_0_ring_emit_ib_compute */
        .emit_ib = gfx_v9_0_ring_emit_ib_compute,
        .emit_fence = gfx_v9_0_ring_emit_fence,
        .emit_wreg = gfx_v9_0_ring_emit_wreg,
        .emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
        .emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
+       .emit_mem_sync = gfx_v9_0_emit_mem_sync,
 };
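
The gfx9 callback is not shown in this excerpt either. A sketch mirroring
the gfx8 version above, assuming parametrized
PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_* helpers in soc15d.h (7 dwords, matching
the frame-size accounting):

static void gfx_v9_0_emit_mem_sync(struct amdgpu_ring *ring)
{
	const unsigned int cp_coher_cntl =
		PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_SH_ICACHE_ACTION_ENA(1) |
		PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_SH_KCACHE_ACTION_ENA(1) |
		PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_TC_ACTION_ENA(1) |
		PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_TCL1_ACTION_ENA(1) |
		PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_TC_WB_ACTION_ENA(1);

	/* ACQUIRE_MEM over the full address range: write back and
	 * invalidate TC/TCL1 and the shader I$/K$. */
	amdgpu_ring_write(ring, PACKET3(PACKET3_ACQUIRE_MEM, 5));
	amdgpu_ring_write(ring, cp_coher_cntl);	/* CP_COHER_CNTL */
	amdgpu_ring_write(ring, 0xffffffff);	/* CP_COHER_SIZE */
	amdgpu_ring_write(ring, 0xffffff);	/* CP_COHER_SIZE_HI */
	amdgpu_ring_write(ring, 0);		/* CP_COHER_BASE */
	amdgpu_ring_write(ring, 0);		/* CP_COHER_BASE_HI */
	amdgpu_ring_write(ring, 0x0000000A);	/* POLL_INTERVAL */
}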
 
 static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_kiq = {
 
 #              define PACKET3_DMA_DATA_CMD_SAIC    (1 << 28)
 #              define PACKET3_DMA_DATA_CMD_DAIC    (1 << 29)
 #              define PACKET3_DMA_DATA_CMD_RAW_WAIT  (1 << 30)
-#define        PACKET3_AQUIRE_MEM                              0x58
+#define        PACKET3_ACQUIRE_MEM                             0x58
 #define        PACKET3_REWIND                                  0x59
 #define        PACKET3_LOAD_UCONFIG_REG                        0x5E
 #define        PACKET3_LOAD_SH_REG                             0x5F