 static atomic_t perf_sched_count;
 
 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
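+/* Per-CPU count of pmu::sched_task() users, checked on the context-switch path. */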
+static DEFINE_PER_CPU(int, perf_sched_cb_usages);
 static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
 
 static atomic_t nr_mmap_events __read_mostly;
        }
 }
 
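+/* Per-CPU list of cpu contexts with at least one perf_sched_cb_inc() user. */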
+static DEFINE_PER_CPU(struct list_head, sched_cb_list);
+
 void perf_sched_cb_dec(struct pmu *pmu)
 {
        struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
 
-       --cpuctx->sched_cb_usage;
+       this_cpu_dec(perf_sched_cb_usages);
+
+       if (!--cpuctx->sched_cb_usage)
+               list_del(&cpuctx->sched_cb_entry);
 }
 
 
 void perf_sched_cb_inc(struct pmu *pmu)
 {
        struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
 
-       cpuctx->sched_cb_usage++;
+       if (!cpuctx->sched_cb_usage++)
+               list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
+
+       this_cpu_inc(perf_sched_cb_usages);
 }
 
 /*
        perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
 }
 
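+/*
+ * Run pmu::sched_task() for every cpu context on this CPU's sched_cb_list.
+ * Contexts that have a task context are skipped here; those are handled
+ * from perf_event_context_sched_in/out() instead.
+ */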
+static void perf_pmu_sched_task(struct task_struct *prev,
+                               struct task_struct *next,
+                               bool sched_in)
+{
+       struct perf_cpu_context *cpuctx;
+
+       if (prev == next)
+               return;
+
+       list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
+               /* will be handled in perf_event_context_sched_in/out */
+               if (cpuctx->task_ctx)
+                       continue;
+
+               __perf_pmu_sched_task(cpuctx, sched_in);
+       }
+}
+
 static void perf_event_switch(struct task_struct *task,
                              struct task_struct *next_prev, bool sched_in);
 
 void __perf_event_task_sched_out(struct task_struct *task,
                                  struct task_struct *next)
 {
        int ctxn;
 
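+       /* Run pmu::sched_task() for cpu contexts that have no task context (sched out). */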
+       if (__this_cpu_read(perf_sched_cb_usages))
+               perf_pmu_sched_task(task, next, false);
+
        if (atomic_read(&nr_switch_events))
                perf_event_switch(task, next, false);
 
 
        if (atomic_read(&nr_switch_events))
                perf_event_switch(task, prev, true);
+
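+       /* Run pmu::sched_task() for cpu contexts that have no task context (sched in). */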
+       if (__this_cpu_read(perf_sched_cb_usages))
+               perf_pmu_sched_task(prev, task, true);
 }
 
 static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
        if (event->parent)
                return;
 
-       if (event->attach_state & PERF_ATTACH_TASK)
+       if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
                dec = true;
        if (event->attr.mmap || event->attr.mmap_data)
                atomic_dec(&nr_mmap_events);
        if (event->parent)
                return;
 
-       if (event->attach_state & PERF_ATTACH_TASK)
+       if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
                inc = true;
        if (event->attr.mmap || event->attr.mmap_data)
                atomic_inc(&nr_mmap_events);
 #ifdef CONFIG_CGROUP_PERF
                INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
 #endif
+               INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
        }
 }
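
For reference, a minimal sketch (not part of the patch) of how a PMU driver would be expected to use these hooks: it registers a sched_task() callback in its struct pmu, marks its events with PERF_ATTACH_SCHED_CB, and brackets their lifetime on the CPU with perf_sched_cb_inc()/perf_sched_cb_dec(), so that the perf_pmu_sched_task() path above invokes the callback even when only per-CPU (no task context) events are active. The foo_pmu_* names and the flush step are illustrative assumptions, not code from this patch.

#include <linux/perf_event.h>

/* Hypothetical driver sketch: flush a per-CPU hardware buffer on context switch. */
static void foo_pmu_sched_task(struct perf_event_context *ctx, bool sched_in)
{
        if (!sched_in) {
                /* e.g. drain the hardware buffer into the perf ring buffer */
        }
}

static int foo_pmu_add(struct perf_event *event, int flags)
{
        /* Keep the context-switch hooks armed even for per-CPU events. */
        event->attach_state |= PERF_ATTACH_SCHED_CB;
        perf_sched_cb_inc(event->pmu);
        return 0;
}

static void foo_pmu_del(struct perf_event *event, int flags)
{
        perf_sched_cb_dec(event->pmu);
}

static struct pmu foo_pmu = {
        .add            = foo_pmu_add,
        .del            = foo_pmu_del,
        .sched_task     = foo_pmu_sched_task,
};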