extern int perf_num_counters(void);
 extern const char *perf_pmu_name(void);
-extern void __perf_event_task_sched_in(struct task_struct *task);
-extern void __perf_event_task_sched_out(struct task_struct *task, struct task_struct *next);
+extern void __perf_event_task_sched_in(struct task_struct *prev,
+                                      struct task_struct *task);
+extern void __perf_event_task_sched_out(struct task_struct *prev,
+                                       struct task_struct *next);
 extern int perf_event_init_task(struct task_struct *child);
 extern void perf_event_exit_task(struct task_struct *child);
 extern void perf_event_free_task(struct task_struct *task);
 
 extern struct jump_label_key perf_sched_events;
 
-static inline void perf_event_task_sched_in(struct task_struct *task)
+static inline void perf_event_task_sched_in(struct task_struct *prev,
+                                           struct task_struct *task)
 {
        if (static_branch(&perf_sched_events))
-               __perf_event_task_sched_in(task);
+               __perf_event_task_sched_in(prev, task);
 }
 
-static inline void perf_event_task_sched_out(struct task_struct *task, struct task_struct *next)
+static inline void perf_event_task_sched_out(struct task_struct *prev,
+                                            struct task_struct *next)
 {
        perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, NULL, 0);
 
-       __perf_event_task_sched_out(task, next);
+       if (static_branch(&perf_sched_events))
+               __perf_event_task_sched_out(prev, next);
 }
 
 extern void perf_event_mmap(struct vm_area_struct *vma);
 extern void perf_event_task_tick(void);
 #else
 static inline void
-perf_event_task_sched_in(struct task_struct *task)                     { }
+perf_event_task_sched_in(struct task_struct *prev,
+                        struct task_struct *task)                      { }
 static inline void
-perf_event_task_sched_out(struct task_struct *task,
-                           struct task_struct *next)                   { }
+perf_event_task_sched_out(struct task_struct *prev,
+                         struct task_struct *next)                     { }
 static inline int perf_event_init_task(struct task_struct *child)      { return 0; }
 static inline void perf_event_exit_task(struct task_struct *child)     { }
 static inline void perf_event_free_task(struct task_struct *task)      { }
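
/*
 * A minimal userspace sketch of the static_branch(&perf_sched_events)
 * guard used in the wrappers above, assuming a plain boolean stands in
 * for the jump label. In the kernel the branch site is patched at
 * runtime, so the disabled case costs roughly a no-op;
 * __builtin_expect() below only approximates that "usually disabled"
 * hint. Illustrative only, not kernel code.
 */
#include <stdbool.h>
#include <stdio.h>

static bool perf_sched_events_on;      /* stand-in for the jump label */

static void __sched_in_slowpath(int prev, int next)
{
        printf("perf sched-in: task %d -> task %d\n", prev, next);
}

static inline void sched_in(int prev, int next)
{
        /* guard the slow path; near-free when no events exist */
        if (__builtin_expect(perf_sched_events_on, 0))
                __sched_in_slowpath(prev, next);
}

int main(void)
{
        sched_in(1, 2);                /* guard closed: call elided */
        perf_sched_events_on = true;
        sched_in(1, 2);                /* guard open: slow path runs */
        return 0;
}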
 
        local_irq_restore(flags);
 }
 
-static inline void perf_cgroup_sched_out(struct task_struct *task)
+static inline void perf_cgroup_sched_out(struct task_struct *task,
+                                        struct task_struct *next)
 {
-       perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
+       struct perf_cgroup *cgrp1;
+       struct perf_cgroup *cgrp2 = NULL;
+
+       /*
+        * we come here when we know perf_cgroup_events > 0
+        */
+       cgrp1 = perf_cgroup_from_task(task);
+
+       /*
+        * next is NULL when called from perf_event_enable_on_exec(),
+        * which systematically causes a cgroup switch
+        */
+       if (next)
+               cgrp2 = perf_cgroup_from_task(next);
+
+       /*
+        * only schedule out current cgroup events if we know
+        * that we are switching to a different cgroup. Otherwise,
+        * do not touch the cgroup events.
+        */
+       if (cgrp1 != cgrp2)
+               perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
 }
 
-static inline void perf_cgroup_sched_in(struct task_struct *task)
+static inline void perf_cgroup_sched_in(struct task_struct *prev,
+                                       struct task_struct *task)
 {
-       perf_cgroup_switch(task, PERF_CGROUP_SWIN);
+       struct perf_cgroup *cgrp1;
+       struct perf_cgroup *cgrp2 = NULL;
+
+       /*
+        * we come here when we know perf_cgroup_events > 0
+        */
+       cgrp1 = perf_cgroup_from_task(task);
+
+       /* prev can never be NULL */
+       cgrp2 = perf_cgroup_from_task(prev);
+
+       /*
+        * only need to schedule in cgroup events if we are switching
+        * to a different cgroup during this ctxsw. Otherwise the cgroup
+        * events were never scheduled out and are still active.
+        */
+       if (cgrp1 != cgrp2)
+               perf_cgroup_switch(task, PERF_CGROUP_SWIN);
 }
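
/*
 * A self-contained sketch of the compare-before-switch optimization
 * above, with the perf specifics stubbed out: cgroup_of() and
 * expensive_switch() are hypothetical stand-ins, not kernel APIs.
 * The point is that the costly switch only runs when the outgoing
 * and incoming tasks belong to different cgroups, and that a NULL
 * "next" (the enable_on_exec path) always forces it.
 */
#include <stddef.h>
#include <stdio.h>

struct task { int cgroup_id; };

static int cgroup_of(const struct task *t)
{
        return t->cgroup_id;
}

static void expensive_switch(const struct task *t)
{
        printf("switching cgroup events (cgroup %d)\n", t->cgroup_id);
}

static void cgroup_sched_out(const struct task *task,
                             const struct task *next)
{
        int cgrp1 = cgroup_of(task);
        int cgrp2 = -1;                /* no match: forces a switch */

        if (next)
                cgrp2 = cgroup_of(next);

        if (cgrp1 != cgrp2)
                expensive_switch(task);
}

int main(void)
{
        struct task a = { .cgroup_id = 1 };
        struct task b = { .cgroup_id = 1 };
        struct task c = { .cgroup_id = 2 };

        cgroup_sched_out(&a, &b);      /* same cgroup: switch skipped */
        cgroup_sched_out(&a, &c);      /* different cgroup: switch runs */
        cgroup_sched_out(&a, NULL);    /* NULL next: forced switch */
        return 0;
}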
 
 static inline int perf_cgroup_connect(int fd, struct perf_event *event,
 {
 }
 
-static inline void perf_cgroup_sched_out(struct task_struct *task)
+static inline void perf_cgroup_sched_out(struct task_struct *task,
+                                        struct task_struct *next)
 {
 }
 
-static inline void perf_cgroup_sched_in(struct task_struct *task)
+static inline void perf_cgroup_sched_in(struct task_struct *prev,
+                                       struct task_struct *task)
 {
 }
 
         * cgroup events are system-wide mode only
         */
        if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
-               perf_cgroup_sched_out(task);
+               perf_cgroup_sched_out(task, next);
 }
 
 static void task_ctx_sched_out(struct perf_event_context *ctx)
  * accessing the event control register. If an NMI hits, then it will
  * keep the event running.
  */
-void __perf_event_task_sched_in(struct task_struct *task)
+void __perf_event_task_sched_in(struct task_struct *prev,
+                               struct task_struct *task)
 {
        struct perf_event_context *ctx;
        int ctxn;
         * cgroup events are system-wide mode only
         */
        if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
-               perf_cgroup_sched_in(task);
+               perf_cgroup_sched_in(prev, task);
 }
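
/*
 * A sketch of the per-CPU gating seen in __perf_event_task_sched_in():
 * cgroup switching is attempted only when this CPU has at least one
 * active cgroup event. Single-threaded userspace model, so a plain
 * counter replaces atomic_read(&__get_cpu_var(perf_cgroup_events));
 * all names below are illustrative.
 */
#include <stdio.h>

static int cgroup_events;              /* per-CPU counter in the kernel */

static void cgroup_sched_in(int prev, int task)
{
        printf("cgroup sched-in: %d -> %d\n", prev, task);
}

static void task_sched_in(int prev, int task)
{
        /* ... per-task context scheduling elided ... */

        /* cgroup work only if this CPU has active cgroup events */
        if (cgroup_events)
                cgroup_sched_in(prev, task);
}

int main(void)
{
        task_sched_in(1, 2);           /* no cgroup events: gate closed */
        cgroup_events = 1;
        task_sched_in(1, 2);           /* gate open: cgroup path runs */
        return 0;
}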
 
 static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
         * ctxswin cgroup events which are already scheduled
         * in.
         */
-       perf_cgroup_sched_out(current);
+       perf_cgroup_sched_out(current, NULL);
 
        raw_spin_lock(&ctx->lock);
        task_ctx_sched_out(ctx);
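
/*
 * Note on the NULL argument above: with next == NULL, cgrp2 stays NULL
 * in perf_cgroup_sched_out(), so the cgrp1 != cgrp2 test is always true
 * (see the sketch earlier) and the enable_on_exec path unconditionally
 * switches cgroup events out before rescheduling the context.
 */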