#include "xe_query.h"
 
 #include <linux/nospec.h>
+#include <linux/sched/clock.h>
 
 #include <drm/ttm/ttm_placement.h>
 #include <drm/xe_drm.h>
 
+#include "regs/xe_engine_regs.h"
 #include "xe_bo.h"
 #include "xe_device.h"
 #include "xe_exec_queue.h"
 #include "xe_gt.h"
 #include "xe_guc_hwconfig.h"
 #include "xe_macros.h"
+#include "xe_mmio.h"
 #include "xe_ttm_vram_mgr.h"
 
 static const u16 xe_to_user_engine_class[] = {
        [XE_ENGINE_CLASS_COMPUTE] = DRM_XE_ENGINE_CLASS_COMPUTE,
 };
 
+static const enum xe_engine_class user_to_xe_engine_class[] = {
+       [DRM_XE_ENGINE_CLASS_RENDER] = XE_ENGINE_CLASS_RENDER,
+       [DRM_XE_ENGINE_CLASS_COPY] = XE_ENGINE_CLASS_COPY,
+       [DRM_XE_ENGINE_CLASS_VIDEO_DECODE] = XE_ENGINE_CLASS_VIDEO_DECODE,
+       [DRM_XE_ENGINE_CLASS_VIDEO_ENHANCE] = XE_ENGINE_CLASS_VIDEO_ENHANCE,
+       [DRM_XE_ENGINE_CLASS_COMPUTE] = XE_ENGINE_CLASS_COMPUTE,
+};
+
 static size_t calc_hw_engine_info_size(struct xe_device *xe)
 {
        struct xe_hw_engine *hwe;
        return i * sizeof(struct drm_xe_engine_class_instance);
 }
 
+typedef u64 (*__ktime_func_t)(void);
+static __ktime_func_t __clock_id_to_func(clockid_t clk_id)
+{
+       /*
+        * Use the same logic as the perf subsystem to allow the user to
+        * select the reference clock id to be used for timestamps.
+        */
+       switch (clk_id) {
+       case CLOCK_MONOTONIC:
+               return &ktime_get_ns;
+       case CLOCK_MONOTONIC_RAW:
+               return &ktime_get_raw_ns;
+       case CLOCK_REALTIME:
+               return &ktime_get_real_ns;
+       case CLOCK_BOOTTIME:
+               return &ktime_get_boottime_ns;
+       case CLOCK_TAI:
+               return &ktime_get_clocktai_ns;
+       default:
+               return NULL;
+       }
+}
+
+static void
+__read_timestamps(struct xe_gt *gt,
+                 struct xe_reg lower_reg,
+                 struct xe_reg upper_reg,
+                 u64 *engine_ts,
+                 u64 *cpu_ts,
+                 u64 *cpu_delta,
+                 __ktime_func_t cpu_clock)
+{
+       u32 upper, lower, old_upper, loop = 0;
+
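+       /*
+        * Read the upper dword, sample the CPU clocks around the lower dword
+        * read, then re-read the upper dword and retry a couple of times if
+        * it changed, so the combined 64-bit value is not torn by a rollover
+        * of the lower 32 bits.
+        */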
+       upper = xe_mmio_read32(gt, upper_reg);
+       do {
+               *cpu_delta = local_clock();
+               *cpu_ts = cpu_clock();
+               lower = xe_mmio_read32(gt, lower_reg);
+               *cpu_delta = local_clock() - *cpu_delta;
+               old_upper = upper;
+               upper = xe_mmio_read32(gt, upper_reg);
+       } while (upper != old_upper && loop++ < 2);
+
+       *engine_ts = (u64)upper << 32 | lower;
+}
+
+static int
+query_engine_cycles(struct xe_device *xe,
+                   struct drm_xe_device_query *query)
+{
+       struct drm_xe_query_engine_cycles __user *query_ptr;
+       struct drm_xe_engine_class_instance *eci;
+       struct drm_xe_query_engine_cycles resp;
+       size_t size = sizeof(resp);
+       __ktime_func_t cpu_clock;
+       struct xe_hw_engine *hwe;
+       struct xe_gt *gt;
+
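+       /* A zero size is a request for the required struct size */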
+       if (query->size == 0) {
+               query->size = size;
+               return 0;
+       } else if (XE_IOCTL_DBG(xe, query->size != size)) {
+               return -EINVAL;
+       }
+
+       query_ptr = u64_to_user_ptr(query->data);
+       if (copy_from_user(&resp, query_ptr, size))
+               return -EFAULT;
+
+       cpu_clock = __clock_id_to_func(resp.clockid);
+       if (!cpu_clock)
+               return -EINVAL;
+
+       eci = &resp.eci;
+       if (eci->gt_id >= XE_MAX_GT_PER_TILE)
+               return -EINVAL;
+
+       gt = xe_device_get_gt(xe, eci->gt_id);
+       if (!gt)
+               return -EINVAL;
+
+       if (eci->engine_class >= ARRAY_SIZE(user_to_xe_engine_class))
+               return -EINVAL;
+
+       hwe = xe_gt_hw_engine(gt, user_to_xe_engine_class[eci->engine_class],
+                             eci->engine_instance, true);
+       if (!hwe)
+               return -EINVAL;
+
+       resp.engine_frequency = gt->info.clock_freq;
+
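+       /* Resume the device and hold forcewake while the registers are read */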
+       xe_device_mem_access_get(xe);
+       xe_force_wake_get(gt_to_fw(gt), XE_FORCEWAKE_ALL);
+
+       __read_timestamps(gt,
+                         RING_TIMESTAMP(hwe->mmio_base),
+                         RING_TIMESTAMP_UDW(hwe->mmio_base),
+                         &resp.engine_cycles,
+                         &resp.cpu_timestamp,
+                         &resp.cpu_delta,
+                         cpu_clock);
+
+       xe_force_wake_put(gt_to_fw(gt), XE_FORCEWAKE_ALL);
+       xe_device_mem_access_put(xe);
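+
+       /* RING_TIMESTAMP is a 36-bit free-running counter */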
+       resp.width = 36;
+
+       /* Only write to the output fields of the user query */
+       if (put_user(resp.engine_frequency, &query_ptr->engine_frequency))
+               return -EFAULT;
+
+       if (put_user(resp.cpu_timestamp, &query_ptr->cpu_timestamp))
+               return -EFAULT;
+
+       if (put_user(resp.cpu_delta, &query_ptr->cpu_delta))
+               return -EFAULT;
+
+       if (put_user(resp.engine_cycles, &query_ptr->engine_cycles))
+               return -EFAULT;
+
+       if (put_user(resp.width, &query_ptr->width))
+               return -EFAULT;
+
+       return 0;
+}
+
 static int query_engines(struct xe_device *xe,
                         struct drm_xe_device_query *query)
 {
        query_gts,
        query_hwconfig,
        query_gt_topology,
+       query_engine_cycles,
 };
 
 int xe_query_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 
 #define DRM_IOCTL_XE_WAIT_USER_FENCE           DRM_IOWR(DRM_COMMAND_BASE + DRM_XE_WAIT_USER_FENCE, struct drm_xe_wait_user_fence)
 #define DRM_IOCTL_XE_VM_MADVISE                         DRM_IOW(DRM_COMMAND_BASE + DRM_XE_VM_MADVISE, struct drm_xe_vm_madvise)
 
+/** struct drm_xe_engine_class_instance - instance of an engine class */
+struct drm_xe_engine_class_instance {
+#define DRM_XE_ENGINE_CLASS_RENDER             0
+#define DRM_XE_ENGINE_CLASS_COPY               1
+#define DRM_XE_ENGINE_CLASS_VIDEO_DECODE       2
+#define DRM_XE_ENGINE_CLASS_VIDEO_ENHANCE      3
+#define DRM_XE_ENGINE_CLASS_COMPUTE            4
+       /*
+        * Kernel only class (not actual hardware engine class). Used for
+        * creating ordered queues of VM bind operations.
+        */
+#define DRM_XE_ENGINE_CLASS_VM_BIND            5
+       __u16 engine_class;
+
+       __u16 engine_instance;
+       __u16 gt_id;
+       __u16 rsvd;
+};
+
 /**
  * enum drm_xe_memory_class - Supported memory classes.
  */
        __u64 reserved[6];
 };
 
+/**
+ * struct drm_xe_query_engine_cycles - correlate CPU and GPU timestamps
+ *
+ * If a query is made with a struct drm_xe_device_query where .query is equal to
+ * DRM_XE_DEVICE_QUERY_ENGINE_CYCLES, then the reply uses struct drm_xe_query_engine_cycles
+ * in .data. struct drm_xe_query_engine_cycles is allocated by the user and
+ * .data points to this allocated structure.
+ *
+ * The query returns the engine cycles and the engine frequency, which can
+ * be used to calculate the engine timestamp. In addition, the query returns
+ * a set of CPU timestamps that indicate when the command streamer cycle
+ * count was captured.
+ */
+struct drm_xe_query_engine_cycles {
+       /**
+        * @eci: This is input by the user and is the engine for which command
+        * streamer cycles are queried.
+        */
+       struct drm_xe_engine_class_instance eci;
+
+       /**
+        * @clockid: This is input by the user and is the reference clock id for
+        * the CPU timestamp. For definitions, see clock_gettime(2) and
+        * perf_event_open(2). Supported clock ids are CLOCK_MONOTONIC,
+        * CLOCK_MONOTONIC_RAW, CLOCK_REALTIME, CLOCK_BOOTTIME, CLOCK_TAI.
+        */
+       __s32 clockid;
+
+       /** @width: Width of the engine cycle counter in bits. */
+       __u32 width;
+
+       /**
+        * @engine_cycles: Engine cycles as read from the engine's timestamp
+        * register at offset 0x358 from its MMIO base.
+        */
+       __u64 engine_cycles;
+
+       /** @engine_frequency: Frequency of the engine cycles in Hz. */
+       __u64 engine_frequency;
+
+       /**
+        * @cpu_timestamp: CPU timestamp in ns. The timestamp is captured using
+        * the reference clockid set by the user, just before reading the
+        * engine_cycles register.
+        */
+       __u64 cpu_timestamp;
+
+       /**
+        * @cpu_delta: Time delta in ns captured around reading the lower dword
+        * of the engine_cycles register.
+        */
+       __u64 cpu_delta;
+};
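+
+/*
+ * Illustrative sketch of how userspace might consume this query. It assumes
+ * an open DRM file descriptor "fd" and the DRM_IOCTL_XE_DEVICE_QUERY ioctl
+ * number defined elsewhere in this header; those names are assumptions for
+ * the example, not additions to the interface.
+ *
+ *     struct drm_xe_query_engine_cycles cycles = {
+ *             .eci = {
+ *                     .engine_class = DRM_XE_ENGINE_CLASS_RENDER,
+ *                     .engine_instance = 0,
+ *                     .gt_id = 0,
+ *             },
+ *             .clockid = CLOCK_MONOTONIC,
+ *     };
+ *     struct drm_xe_device_query query = {
+ *             .query = DRM_XE_DEVICE_QUERY_ENGINE_CYCLES,
+ *             .size = sizeof(cycles),
+ *             .data = (uintptr_t)&cycles,
+ *     };
+ *
+ *     if (ioctl(fd, DRM_IOCTL_XE_DEVICE_QUERY, &query) == 0) {
+ *             uint64_t mask = cycles.width < 64 ?
+ *                             (1ull << cycles.width) - 1 : ~0ull;
+ *             double gpu_ns = (double)(cycles.engine_cycles & mask) *
+ *                             1e9 / cycles.engine_frequency;
+ *             // gpu_ns lines up with cycles.cpu_timestamp (CLOCK_MONOTONIC
+ *             // here), to within roughly cycles.cpu_delta nanoseconds.
+ *     }
+ */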
+
 /**
  * struct drm_xe_query_mem_usage - describe memory regions and usage
  *
        /** @extensions: Pointer to the first extension struct, if any */
        __u64 extensions;
 
-#define DRM_XE_DEVICE_QUERY_ENGINES    0
-#define DRM_XE_DEVICE_QUERY_MEM_USAGE  1
-#define DRM_XE_DEVICE_QUERY_CONFIG     2
-#define DRM_XE_DEVICE_QUERY_GTS                3
-#define DRM_XE_DEVICE_QUERY_HWCONFIG   4
-#define DRM_XE_DEVICE_QUERY_GT_TOPOLOGY        5
+#define DRM_XE_DEVICE_QUERY_ENGINES            0
+#define DRM_XE_DEVICE_QUERY_MEM_USAGE          1
+#define DRM_XE_DEVICE_QUERY_CONFIG             2
+#define DRM_XE_DEVICE_QUERY_GTS                        3
+#define DRM_XE_DEVICE_QUERY_HWCONFIG           4
+#define DRM_XE_DEVICE_QUERY_GT_TOPOLOGY                5
+#define DRM_XE_DEVICE_QUERY_ENGINE_CYCLES      6
        /** @query: The type of data to query */
        __u32 query;
 
        __u64 reserved[2];
 };
 
-/** struct drm_xe_engine_class_instance - instance of an engine class */
-struct drm_xe_engine_class_instance {
-#define DRM_XE_ENGINE_CLASS_RENDER             0
-#define DRM_XE_ENGINE_CLASS_COPY               1
-#define DRM_XE_ENGINE_CLASS_VIDEO_DECODE       2
-#define DRM_XE_ENGINE_CLASS_VIDEO_ENHANCE      3
-#define DRM_XE_ENGINE_CLASS_COMPUTE            4
-       /*
-        * Kernel only class (not actual hardware engine class). Used for
-        * creating ordered queues of VM bind operations.
-        */
-#define DRM_XE_ENGINE_CLASS_VM_BIND            5
-       __u16 engine_class;
-
-       __u16 engine_instance;
-       __u16 gt_id;
-};
-
 struct drm_xe_exec_queue_create {
 #define XE_EXEC_QUEUE_EXTENSION_SET_PROPERTY               0
        /** @extensions: Pointer to the first extension struct, if any */