DECLARE_EVENT_CLASS(cgroup_rstat,
 
-       TP_PROTO(struct cgroup *cgrp, int cpu_in_loop, bool contended),
+       TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
 
-       TP_ARGS(cgrp, cpu_in_loop, contended),
+       TP_ARGS(cgrp, cpu, contended),
 
        TP_STRUCT__entry(
                __field(        int,            root                    )
                __field(        int,            level                   )
                __field(        u64,            id                      )
-               __field(        int,            cpu_in_loop             )
+               __field(        int,            cpu                     )
                __field(        bool,           contended               )
        ),
 
                __entry->root = cgrp->root->hierarchy_id;
                __entry->id = cgroup_id(cgrp);
                __entry->level = cgrp->level;
-               __entry->cpu_in_loop = cpu_in_loop;
+               __entry->cpu = cpu;
                __entry->contended = contended;
        ),
 
-       TP_printk("root=%d id=%llu level=%d cpu_in_loop=%d lock contended:%d",
+       TP_printk("root=%d id=%llu level=%d cpu=%d lock contended:%d",
                  __entry->root, __entry->id, __entry->level,
-                 __entry->cpu_in_loop, __entry->contended)
+                 __entry->cpu, __entry->contended)
 );
 
+/* Related to global: cgroup_rstat_lock */
 DEFINE_EVENT(cgroup_rstat, cgroup_rstat_lock_contended,
 
        TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
        TP_ARGS(cgrp, cpu, contended)
 );
 
+/* Related to per CPU: cgroup_rstat_cpu_lock */
+DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_lock_contended,
+
+       TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+
+       TP_ARGS(cgrp, cpu, contended)
+);
+
+DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_lock_contended_fastpath,
+
+       TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+
+       TP_ARGS(cgrp, cpu, contended)
+);
+
+DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_locked,
+
+       TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+
+       TP_ARGS(cgrp, cpu, contended)
+);
+
+DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_locked_fastpath,
+
+       TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+
+       TP_ARGS(cgrp, cpu, contended)
+);
+
+DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_unlock,
+
+       TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+
+       TP_ARGS(cgrp, cpu, contended)
+);
+
+DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_unlock_fastpath,
+
+       TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+
+       TP_ARGS(cgrp, cpu, contended)
+);
+
 #endif /* _TRACE_CGROUP_H */
 
 /* This part must be outside protection */
 
        return per_cpu_ptr(cgrp->rstat_cpu, cpu);
 }
 
+/*
+ * Helper functions for rstat per CPU lock (cgroup_rstat_cpu_lock).
+ *
+ * This makes it easier to diagnose locking issues and contention in
+ * production environments. The parameter @fast_path determines which
+ * tracepoints are emitted, allowing us to diagnose "flush"-related
+ * operations without handling the high-frequency fast-path "update" events.
+ */
+static __always_inline
+unsigned long _cgroup_rstat_cpu_lock(raw_spinlock_t *cpu_lock, int cpu,
+                                    struct cgroup *cgrp, const bool fast_path)
+{
+       unsigned long flags;
+       bool contended;
+
+       /*
+        * The _irqsave() is needed because cgroup_rstat_lock is
+        * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
+        * this lock with the _irq() suffix only disables interrupts on
+        * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
+        * interrupts on both configurations. The _irqsave() ensures
+        * that interrupts are always disabled and later restored.
+        */
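+       /* Trylock first so contention can be traced before blocking below. */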
+       contended = !raw_spin_trylock_irqsave(cpu_lock, flags);
+       if (contended) {
+               if (fast_path)
+                       trace_cgroup_rstat_cpu_lock_contended_fastpath(cgrp, cpu, contended);
+               else
+                       trace_cgroup_rstat_cpu_lock_contended(cgrp, cpu, contended);
+
+               raw_spin_lock_irqsave(cpu_lock, flags);
+       }
+
+       if (fast_path)
+               trace_cgroup_rstat_cpu_locked_fastpath(cgrp, cpu, contended);
+       else
+               trace_cgroup_rstat_cpu_locked(cgrp, cpu, contended);
+
+       return flags;
+}
+
+static __always_inline
+void _cgroup_rstat_cpu_unlock(raw_spinlock_t *cpu_lock, int cpu,
+                             struct cgroup *cgrp, unsigned long flags,
+                             const bool fast_path)
+{
+       if (fast_path)
+               trace_cgroup_rstat_cpu_unlock_fastpath(cgrp, cpu, false);
+       else
+               trace_cgroup_rstat_cpu_unlock(cgrp, cpu, false);
+
+       raw_spin_unlock_irqrestore(cpu_lock, flags);
+}
+
 /**
  * cgroup_rstat_updated - keep track of updated rstat_cpu
  * @cgrp: target cgroup
        if (data_race(cgroup_rstat_cpu(cgrp, cpu)->updated_next))
                return;
 
-       raw_spin_lock_irqsave(cpu_lock, flags);
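+       /* Hot update path: use the *_fastpath tracepoints. */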
+       flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, cgrp, true);
 
        /* put @cgrp and all ancestors on the corresponding updated lists */
        while (true) {
                cgrp = parent;
        }
 
-       raw_spin_unlock_irqrestore(cpu_lock, flags);
+       _cgroup_rstat_cpu_unlock(cpu_lock, cpu, cgrp, flags, true);
 }
 
 /**
        struct cgroup *head = NULL, *parent, *child;
        unsigned long flags;
 
-       /*
-        * The _irqsave() is needed because cgroup_rstat_lock is
-        * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
-        * this lock with the _irq() suffix only disables interrupts on
-        * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
-        * interrupts on both configurations. The _irqsave() ensures
-        * that interrupts are always disabled and later restored.
-        */
-       raw_spin_lock_irqsave(cpu_lock, flags);
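+       /* Flush side: the regular (non-fastpath) tracepoints fire here. */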
+       flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, root, false);
 
        /* Return NULL if this subtree is not on-list */
        if (!rstatc->updated_next)
        if (child != root)
                head = cgroup_rstat_push_children(head, child, cpu);
 unlock_ret:
-       raw_spin_unlock_irqrestore(cpu_lock, flags);
+       _cgroup_rstat_cpu_unlock(cpu_lock, cpu, root, flags, false);
        return head;
 }
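
For anyone wanting to exercise the new events from kernel code rather than
through tracefs, here is a minimal sketch of an in-kernel consumer attaching
to the slow-path contention event. It is illustrative only and not part of
this patch: it assumes a context where <trace/events/cgroup.h> is reachable
and the tracepoint can actually be attached to from that context; the probe
and init/exit names below are made up.

#include <linux/module.h>
#include <linux/cgroup.h>
#include <trace/events/cgroup.h>

/* Probe signature: void *data first, then the TP_PROTO() arguments. */
static void probe_rstat_cpu_lock_contended(void *data, struct cgroup *cgrp,
                                           int cpu, bool contended)
{
        pr_info("rstat cpu_lock contended: cgroup id=%llu cpu=%d\n",
                cgroup_id(cgrp), cpu);
}

static int __init rstat_probe_init(void)
{
        /* register_trace_<event>() is generated for each DEFINE_EVENT() above. */
        return register_trace_cgroup_rstat_cpu_lock_contended(
                        probe_rstat_cpu_lock_contended, NULL);
}

static void __exit rstat_probe_exit(void)
{
        unregister_trace_cgroup_rstat_cpu_lock_contended(
                        probe_rstat_cpu_lock_contended, NULL);
        tracepoint_synchronize_unregister();
}

module_init(rstat_probe_init);
module_exit(rstat_probe_exit);
MODULE_LICENSE("GPL");

The point of the @fast_path split is exactly this kind of use: the flush-side
events can be enabled on their own, without also paying for the per-update
*_fastpath events.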