int nr_cx_debugbus;
 
        struct msm_gpu_state_bo *gmu_log;
+       struct msm_gpu_state_bo *gmu_hfi;
+
+       s32 hfi_queue_history[2][HFI_HISTORY_SZ];
 
        struct list_head objs;
 };
        return snapshot;
 }
 
+/*
+ * Copy each GMU HFI queue's read/write-index history ring into the GPU
+ * state snapshot, reordered so the snapshot holds entries oldest-first.
+ * Called while taking a crash/state snapshot; reads the live queues but
+ * does not modify them.
+ */
+static void a6xx_snapshot_gmu_hfi_history(struct msm_gpu *gpu,
+                                         struct a6xx_gpu_state *a6xx_state)
+{
+       struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
+       struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu);
+       struct a6xx_gmu *gmu = &a6xx_gpu->gmu;
+       unsigned i, j;
+
+       /* Snapshot array must have one row per HFI queue */
+       BUILD_BUG_ON(ARRAY_SIZE(gmu->queues) != ARRAY_SIZE(a6xx_state->hfi_queue_history));
+
+       for (i = 0; i < ARRAY_SIZE(gmu->queues); i++) {
+               struct a6xx_hfi_queue *queue = &gmu->queues[i];
+               for (j = 0; j < HFI_HISTORY_SZ; j++) {
+                       /*
+                        * history_idx is the next slot to be written, i.e.
+                        * the oldest entry; walk the ring from there so the
+                        * snapshot comes out in chronological order.
+                        * Unused slots still hold the 0xff fill (-1).
+                        */
+                       unsigned idx = (j + queue->history_idx) % HFI_HISTORY_SZ;
+                       a6xx_state->hfi_queue_history[i][j] = queue->history[idx];
+               }
+       }
+}
+
 #define A6XX_GBIF_REGLIST_SIZE   1
 static void a6xx_get_registers(struct msm_gpu *gpu,
                struct a6xx_gpu_state *a6xx_state,
        a6xx_get_gmu_registers(gpu, a6xx_state);
 
        a6xx_state->gmu_log = a6xx_snapshot_gmu_bo(a6xx_state, &a6xx_gpu->gmu.log);
+       a6xx_state->gmu_hfi = a6xx_snapshot_gmu_bo(a6xx_state, &a6xx_gpu->gmu.hfi);
+
+       a6xx_snapshot_gmu_hfi_history(gpu, a6xx_state);
 
        /* If GX isn't on the rest of the data isn't going to be accessible */
        if (!a6xx_gmu_gx_is_on(&a6xx_gpu->gmu))
        if (a6xx_state->gmu_log)
                kvfree(a6xx_state->gmu_log->data);
 
+       if (a6xx_state->gmu_hfi)
+               kvfree(a6xx_state->gmu_hfi->data);
+
        list_for_each_entry_safe(obj, tmp, &a6xx_state->objs, node)
                kfree(obj);
 
                struct msm_gpu_state_bo *gmu_log = a6xx_state->gmu_log;
 
                drm_printf(p, "    iova: 0x%016llx\n", gmu_log->iova);
-               drm_printf(p, "    size: %d\n", gmu_log->size);
+               drm_printf(p, "    size: %zu\n", gmu_log->size);
                adreno_show_object(p, &gmu_log->data, gmu_log->size,
                                &gmu_log->encoded);
        }
 
+       drm_puts(p, "gmu-hfi:\n");
+       if (a6xx_state->gmu_hfi) {
+               struct msm_gpu_state_bo *gmu_hfi = a6xx_state->gmu_hfi;
+               unsigned i, j;
+
+               drm_printf(p, "    iova: 0x%016llx\n", gmu_hfi->iova);
+               drm_printf(p, "    size: %zu\n", gmu_hfi->size);
+               for (i = 0; i < ARRAY_SIZE(a6xx_state->hfi_queue_history); i++) {
+                       drm_printf(p, "    queue-history[%u]:", i);
+                       for (j = 0; j < HFI_HISTORY_SZ; j++) {
+                               drm_printf(p, " %d", a6xx_state->hfi_queue_history[i][j]);
+                       }
+                       drm_printf(p, "\n");
+               }
+               adreno_show_object(p, &gmu_hfi->data, gmu_hfi->size,
+                               &gmu_hfi->encoded);
+       }
+
        drm_puts(p, "registers:\n");
        for (i = 0; i < a6xx_state->nr_registers; i++) {
                struct a6xx_gpu_state_obj *obj = &a6xx_state->registers[i];
 
 
        hdr = queue->data[index];
 
+       queue->history[(queue->history_idx++) % HFI_HISTORY_SZ] = index;
+
        /*
         * If we are to assume that the GMU firmware is in fact a rational actor
         * and is programmed to not send us a larger response than we expect
                return -ENOSPC;
        }
 
+       queue->history[(queue->history_idx++) % HFI_HISTORY_SZ] = index;
+
        for (i = 0; i < dwords; i++) {
                queue->data[index] = data[i];
                index = (index + 1) % header->size;
 
                queue->header->read_index = 0;
                queue->header->write_index = 0;
+
+               memset(&queue->history, 0xff, sizeof(queue->history));
+               queue->history_idx = 0;
        }
 }
 
        queue->data = virt;
        atomic_set(&queue->seqnum, 0);
 
+       memset(&queue->history, 0xff, sizeof(queue->history));
+       queue->history_idx = 0;
+
        /* Set up the shared memory header */
        header->iova = iova;
        header->type =  10 << 8 | id;