return data.ret;
 }
 
-static inline struct perf_cpu_context *
-__get_cpu_context(struct perf_event_context *ctx)
-{
-       return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
-}
-
 static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
                          struct perf_event_context *ctx)
 {
        return READ_ONCE(event->owner) == TASK_TOMBSTONE;
 }
 
+static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
+
+/*
+ * Return the task_ctx pointer stored in this CPU's perf_cpu_context.
+ * Asserts (through lockdep) that IRQs are disabled before reading it.
+ */
+struct perf_event_context *perf_cpu_task_ctx(void)
+{
+       struct perf_cpu_context *cpuctx;
+
+       lockdep_assert_irqs_disabled();
+
+       cpuctx = this_cpu_ptr(&perf_cpu_context);
+       return cpuctx->task_ctx;
+}
+
 /*
  * On task ctx scheduling...
  *
        struct event_function_struct *efs = info;
        struct perf_event *event = efs->event;
        struct perf_event_context *ctx = event->ctx;
-       struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+       struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct perf_event_context *task_ctx = cpuctx->task_ctx;
        int ret = 0;
 
 static void event_function_local(struct perf_event *event, event_f func, void *data)
 {
        struct perf_event_context *ctx = event->ctx;
-       struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+       struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct task_struct *task = READ_ONCE(ctx->task);
        struct perf_event_context *task_ctx = NULL;
 
 static atomic_t perf_sched_count;
 
 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
-static DEFINE_PER_CPU(int, perf_sched_cb_usages);
 static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
 
 static atomic_t nr_mmap_events __read_mostly;
        WRITE_ONCE(perf_sample_allowed_ns, tmp);
 }
 
-static bool perf_rotate_context(struct perf_cpu_context *cpuctx);
+static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc);
 
 int perf_proc_update_handler(struct ctl_table *table, int write,
                void *buffer, size_t *lenp, loff_t *ppos)
 
 static atomic64_t perf_event_id;
 
-static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
-                             enum event_type_t event_type);
-
-static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
-                            enum event_type_t event_type);
-
 static void update_context_time(struct perf_event_context *ctx);
 static u64 perf_event_time(struct perf_event *event);
 
        ___p;                                                           \
 })
 
+/*
+ * Call perf_pmu_disable() on the PMU of every perf_event_pmu_context
+ * linked on @ctx->pmu_ctx_list.
+ */
+static void perf_ctx_disable(struct perf_event_context *ctx)
+{
+       struct perf_event_pmu_context *epc;
+
+       list_for_each_entry(epc, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+               struct pmu *pmu = epc->pmu;
+
+               perf_pmu_disable(pmu);
+       }
+}
+
+/*
+ * Call perf_pmu_enable() on the PMU of every perf_event_pmu_context
+ * linked on @ctx->pmu_ctx_list.
+ */
+static void perf_ctx_enable(struct perf_event_context *ctx)
+{
+       struct perf_event_pmu_context *epc;
+
+       list_for_each_entry(epc, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+               struct pmu *pmu = epc->pmu;
+
+               perf_pmu_enable(pmu);
+       }
+}
+
+static void ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type);
+static void ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type);
+
 #ifdef CONFIG_CGROUP_PERF
 
 static inline bool
 perf_cgroup_match(struct perf_event *event)
 {
-       struct perf_event_context *ctx = event->ctx;
-       struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+       struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
 
        /* @event doesn't care about cgroup */
        if (!event->cgrp)
        }
 }
 
-static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
-
 /*
  * reschedule events based on the cgroup constraint of task.
  */
 static void perf_cgroup_switch(struct task_struct *task)
 {
+       struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct perf_cgroup *cgrp;
-       struct perf_cpu_context *cpuctx, *tmp;
-       struct list_head *list;
-       unsigned long flags;
-
-       /*
-        * Disable interrupts and preemption to avoid this CPU's
-        * cgrp_cpuctx_entry to change under us.
-        */
-       local_irq_save(flags);
 
        cgrp = perf_cgroup_from_task(task, NULL);
 
-       list = this_cpu_ptr(&cgrp_cpuctx_list);
-       list_for_each_entry_safe(cpuctx, tmp, list, cgrp_cpuctx_entry) {
-               WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
-               if (READ_ONCE(cpuctx->cgrp) == cgrp)
-                       continue;
-
-               perf_ctx_lock(cpuctx, cpuctx->task_ctx);
-               perf_pmu_disable(cpuctx->ctx.pmu);
+       WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
+       if (READ_ONCE(cpuctx->cgrp) == cgrp)
+               return; /* cpuctx already tracks @task's cgroup: nothing to do */
 
-               cpu_ctx_sched_out(cpuctx, EVENT_ALL);
-               /*
-                * must not be done before ctxswout due
-                * to update_cgrp_time_from_cpuctx() in
-                * ctx_sched_out()
-                */
-               cpuctx->cgrp = cgrp;
-               /*
-                * set cgrp before ctxsw in to allow
-                * perf_cgroup_set_timestamp() in ctx_sched_in()
-                * to not have to pass task around
-                */
-               cpu_ctx_sched_in(cpuctx, EVENT_ALL);
+       perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+       perf_ctx_disable(&cpuctx->ctx);
 
-               perf_pmu_enable(cpuctx->ctx.pmu);
-               perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
-       }
+       ctx_sched_out(&cpuctx->ctx, EVENT_ALL);
+       /*
+        * cpuctx->cgrp must not be updated before the sched-out above,
+        * due to update_cgrp_time_from_cpuctx() being called from
+        * ctx_sched_out().
+        */
+       cpuctx->cgrp = cgrp;
+       /*
+        * Set cgrp before the sched-in below so that
+        * perf_cgroup_set_timestamp() in ctx_sched_in() does not need
+        * the task to be passed around.
+        */
+       ctx_sched_in(&cpuctx->ctx, EVENT_ALL);
 
-       local_irq_restore(flags);
+       perf_ctx_enable(&cpuctx->ctx);
+       perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
 }
 
 static int perf_cgroup_ensure_storage(struct perf_event *event,
                heap_size++;
 
        for_each_possible_cpu(cpu) {
-               cpuctx = per_cpu_ptr(event->pmu->pmu_cpu_context, cpu);
+               cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
                if (heap_size <= cpuctx->heap_size)
                        continue;
 
                return;
 
        cpuctx->cgrp = perf_cgroup_from_task(current, ctx);
-       list_add(&cpuctx->cgrp_cpuctx_entry,
-                       per_cpu_ptr(&cgrp_cpuctx_list, event->cpu));
 }
 
 static inline void
                return;
 
        cpuctx->cgrp = NULL;
-       list_del(&cpuctx->cgrp_cpuctx_entry);
 }
 
 #else /* !CONFIG_CGROUP_PERF */
  */
 static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
 {
-       struct perf_cpu_context *cpuctx;
+       struct perf_cpu_pmu_context *cpc;
        bool rotations;
 
        lockdep_assert_irqs_disabled();
 
-       cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
-       rotations = perf_rotate_context(cpuctx);
+       cpc = container_of(hr, struct perf_cpu_pmu_context, hrtimer); /* timer is embedded in the perf_cpu_pmu_context */
+       rotations = perf_rotate_context(cpc);
 
-       raw_spin_lock(&cpuctx->hrtimer_lock);
+       raw_spin_lock(&cpc->hrtimer_lock);
        if (rotations)
-               hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
+               hrtimer_forward_now(hr, cpc->hrtimer_interval);
        else
-               cpuctx->hrtimer_active = 0;
-       raw_spin_unlock(&cpuctx->hrtimer_lock);
+               cpc->hrtimer_active = 0; /* no rotation reported: timer is not restarted below */
+       raw_spin_unlock(&cpc->hrtimer_lock);
 
        return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
 }
 
-static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
+static void __perf_mux_hrtimer_init(struct perf_cpu_pmu_context *cpc, int cpu)
 {
-       struct hrtimer *timer = &cpuctx->hrtimer;
-       struct pmu *pmu = cpuctx->ctx.pmu;
+       struct hrtimer *timer = &cpc->hrtimer;
+       struct pmu *pmu = cpc->epc.pmu;
        u64 interval;
 
-       /* no multiplexing needed for SW PMU */
-       if (pmu->task_ctx_nr == perf_sw_context)
-               return;
-
        /*
         * check default is sane, if not set then force to
         * default interval (1/tick)
        if (interval < 1)
                interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
 
-       cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
+       cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
 
-       raw_spin_lock_init(&cpuctx->hrtimer_lock);
+       raw_spin_lock_init(&cpc->hrtimer_lock);
        hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
        timer->function = perf_mux_hrtimer_handler;
 }
 
-static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
+static int perf_mux_hrtimer_restart(struct perf_cpu_pmu_context *cpc)
 {
-       struct hrtimer *timer = &cpuctx->hrtimer;
-       struct pmu *pmu = cpuctx->ctx.pmu;
+       struct hrtimer *timer = &cpc->hrtimer;
        unsigned long flags;
 
-       /* not for SW PMU */
-       if (pmu->task_ctx_nr == perf_sw_context)
-               return 0;
-
-       raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
-       if (!cpuctx->hrtimer_active) {
-               cpuctx->hrtimer_active = 1;
-               hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
+       raw_spin_lock_irqsave(&cpc->hrtimer_lock, flags);
+       if (!cpc->hrtimer_active) { /* only arm the timer when it is not already marked active */
+               cpc->hrtimer_active = 1;
+               hrtimer_forward_now(timer, cpc->hrtimer_interval);
                hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
        }
-       raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);
+       raw_spin_unlock_irqrestore(&cpc->hrtimer_lock, flags);
 
        return 0;
 }
                pmu->pmu_enable(pmu);
 }
 
-static DEFINE_PER_CPU(struct list_head, active_ctx_list);
-
-/*
- * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and
- * perf_event_task_tick() are fully serialized because they're strictly cpu
- * affine and perf_event_ctx{activate,deactivate} are called with IRQs
- * disabled, while perf_event_task_tick is called from IRQ context.
- */
-static void perf_event_ctx_activate(struct perf_event_context *ctx)
-{
-       struct list_head *head = this_cpu_ptr(&active_ctx_list);
-
-       lockdep_assert_irqs_disabled();
-
-       WARN_ON(!list_empty(&ctx->active_ctx_list));
-
-       list_add(&ctx->active_ctx_list, head);
-}
-
-static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
+static void perf_assert_pmu_disabled(struct pmu *pmu)
 {
-       lockdep_assert_irqs_disabled();
-
-       WARN_ON(list_empty(&ctx->active_ctx_list));
-
-       list_del_init(&ctx->active_ctx_list);
+       WARN_ON_ONCE(*this_cpu_ptr(pmu->pmu_disable_count) == 0); /* warn once if this CPU's disable count for @pmu is zero */
 }
 
 static void get_ctx(struct perf_event_context *ctx)
        struct perf_event_context *ctx;
 
        ctx = container_of(head, struct perf_event_context, rcu_head);
-       free_task_ctx_data(ctx->pmu, ctx->task_ctx_data);
        kfree(ctx);
 }
 
  * the context could get moved to another task.
  */
 static struct perf_event_context *
-perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
+perf_lock_task_context(struct task_struct *task, unsigned long *flags)
 {
        struct perf_event_context *ctx;
 
         */
        local_irq_save(*flags);
        rcu_read_lock();
-       ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
+       ctx = rcu_dereference(task->perf_event_ctxp);
        if (ctx) {
                /*
                 * If this context is a clone of another, it might
                 * can't get swapped on us any more.
                 */
                raw_spin_lock(&ctx->lock);
-               if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
+               if (ctx != rcu_dereference(task->perf_event_ctxp)) {
                        raw_spin_unlock(&ctx->lock);
                        rcu_read_unlock();
                        local_irq_restore(*flags);
  * reference count so that the context can't get freed.
  */
 static struct perf_event_context *
-perf_pin_task_context(struct task_struct *task, int ctxn)
+perf_pin_task_context(struct task_struct *task)
 {
        struct perf_event_context *ctx;
        unsigned long flags;
 
-       ctx = perf_lock_task_context(task, ctxn, &flags);
+       ctx = perf_lock_task_context(task, &flags);
        if (ctx) {
                ++ctx->pin_count;
                raw_spin_unlock_irqrestore(&ctx->lock, flags);
  * which provides ordering when rotating groups for the same CPU.
  */
 static __always_inline int
-perf_event_groups_cmp(const int left_cpu, const struct cgroup *left_cgroup,
-                     const u64 left_group_index, const struct perf_event *right)
+perf_event_groups_cmp(const int left_cpu, const struct pmu *left_pmu,
+                     const struct cgroup *left_cgroup, const u64 left_group_index,
+                     const struct perf_event *right)
 {
        if (left_cpu < right->cpu)
                return -1;
        if (left_cpu > right->cpu)
                return 1;
 
+       if (left_pmu) {
+               if (left_pmu < right->pmu_ctx->pmu)
+                       return -1;
+               if (left_pmu > right->pmu_ctx->pmu)
+                       return 1;
+       }
+
 #ifdef CONFIG_CGROUP_PERF
        {
                const struct cgroup *right_cgroup = event_cgroup(right);
 static inline bool __group_less(struct rb_node *a, const struct rb_node *b)
 {
        struct perf_event *e = __node_2_pe(a);
-       return perf_event_groups_cmp(e->cpu, event_cgroup(e), e->group_index,
-                                    __node_2_pe(b)) < 0;
+       return perf_event_groups_cmp(e->cpu, e->pmu_ctx->pmu, event_cgroup(e),
+                                    e->group_index, __node_2_pe(b)) < 0; /* full key: cpu, pmu, cgroup, group_index */
 }
 
 struct __group_key {
        int cpu;
+       struct pmu *pmu; /* NULL makes perf_event_groups_cmp() skip the PMU comparison */
        struct cgroup *cgroup;
 };
 
        const struct __group_key *a = key;
        const struct perf_event *b = __node_2_pe(node);
 
-       /* partial/subtree match: @cpu, @cgroup; ignore: @group_index */
-       return perf_event_groups_cmp(a->cpu, a->cgroup, b->group_index, b);
+       /* partial/subtree match: @cpu, @pmu, @cgroup; ignore: @group_index */
+       return perf_event_groups_cmp(a->cpu, a->pmu, a->cgroup, b->group_index, b);
+}
+
+/*
+ * Key comparator that matches on the key's cpu and pmu only: the node's
+ * own cgroup and group_index are fed back in as the left-hand values so
+ * that they are not meant to influence the result.
+ */
+static inline int
+__group_cmp_ignore_cgroup(const void *key, const struct rb_node *node)
+{
+       const struct perf_event *right = __node_2_pe(node);
+       const struct __group_key *left = key;
+
+       /* partial/subtree match: @cpu, @pmu; ignore: @cgroup, @group_index */
+       return perf_event_groups_cmp(left->cpu, left->pmu,
+                                    event_cgroup(right), right->group_index,
+                                    right);
+}
 
 /*
- * Insert @event into @groups' tree; using {@event->cpu, ++@groups->index} for
- * key (see perf_event_groups_less). This places it last inside the CPU
- * subtree.
+ * Insert @event into @groups' tree; using
+ *   {@event->cpu, @event->pmu_ctx->pmu, event_cgroup(@event), ++@groups->index}
+ * as key. This places it last inside the {cpu,pmu,cgroup} subtree.
  */
 static void
 perf_event_groups_insert(struct perf_event_groups *groups,
 }
 
 /*
- * Get the leftmost event in the cpu/cgroup subtree.
+ * Get the leftmost event in the {cpu,pmu,cgroup} subtree.
  */
 static struct perf_event *
 perf_event_groups_first(struct perf_event_groups *groups, int cpu,
-                       struct cgroup *cgrp)
+                       struct pmu *pmu, struct cgroup *cgrp)
 {
        struct __group_key key = {
                .cpu = cpu,
+               .pmu = pmu,
                .cgroup = cgrp,
        };
        struct rb_node *node;
        return NULL;
 }
 
-/*
- * Like rb_entry_next_safe() for the @cpu subtree.
- */
 static struct perf_event *
-perf_event_groups_next(struct perf_event *event)
+perf_event_groups_next(struct perf_event *event, struct pmu *pmu)
 {
        struct __group_key key = {
                .cpu = event->cpu,
+               .pmu = pmu,
                .cgroup = event_cgroup(event),
        };
        struct rb_node *next;
        return NULL;
 }
 
+/*
+ * Iterate @event over @groups: start at
+ * perf_event_groups_first(@groups, @cpu, @pmu, NULL) and advance with
+ * perf_event_groups_next(@event, @pmu) until it yields NULL.
+ */
+#define perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu)         \
+       for (event = perf_event_groups_first(groups, cpu, pmu, NULL);   \
+            event;                                                     \
+            event = perf_event_groups_next(event, pmu))
+
 /*
  * Iterate through the whole groups tree.
  */
                perf_cgroup_event_enable(event, ctx);
 
        ctx->generation++;
+       event->pmu_ctx->nr_events++;
 }
 
 /*
        lockdep_assert_held(&event->ctx->lock);
 
        /*
-        * We can have double attach due to group movement in perf_event_open.
+        * We can have double attach due to group movement (move_group) in
+        * perf_event_open().
         */
        if (event->attach_state & PERF_ATTACH_GROUP)
                return;
        }
 
        ctx->generation++;
+       event->pmu_ctx->nr_events--;
 }
 
 static int
 
 static void put_event(struct perf_event *event);
 static void event_sched_out(struct perf_event *event,
-                           struct perf_cpu_context *cpuctx,
                            struct perf_event_context *ctx);
 
 static void perf_put_aux_event(struct perf_event *event)
 {
        struct perf_event_context *ctx = event->ctx;
-       struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
        struct perf_event *iter;
 
        /*
                 * state so that we don't try to schedule it again. Note
                 * that perf_event_enable() will clear the ERROR status.
                 */
-               event_sched_out(iter, cpuctx, ctx);
+               event_sched_out(iter, ctx);
                perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
        }
 }
 
 static inline struct list_head *get_event_list(struct perf_event *event)
 {
-       struct perf_event_context *ctx = event->ctx;
-       return event->attr.pinned ? &ctx->pinned_active : &ctx->flexible_active;
+       return event->attr.pinned ? &event->pmu_ctx->pinned_active :
+                                   &event->pmu_ctx->flexible_active;
 }
 
 /*
  */
 static inline void perf_remove_sibling_event(struct perf_event *event)
 {
-       struct perf_event_context *ctx = event->ctx;
-       struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
-
-       event_sched_out(event, cpuctx, ctx);
+       event_sched_out(event, event->ctx);
        perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
 }
 
        return event->state == PERF_EVENT_STATE_DEAD;
 }
 
-static inline int __pmu_filter_match(struct perf_event *event)
-{
-       struct pmu *pmu = event->pmu;
-       return pmu->filter_match ? pmu->filter_match(event) : 1;
-}
-
-/*
- * Check whether we should attempt to schedule an event group based on
- * PMU-specific filtering. An event group can consist of HW and SW events,
- * potentially with a SW leader, so we must check all the filters, to
- * determine whether a group is schedulable:
- */
-static inline int pmu_filter_match(struct perf_event *event)
-{
-       struct perf_event *sibling;
-       unsigned long flags;
-       int ret = 1;
-
-       if (!__pmu_filter_match(event))
-               return 0;
-
-       local_irq_save(flags);
-       for_each_sibling_event(sibling, event) {
-               if (!__pmu_filter_match(sibling)) {
-                       ret = 0;
-                       break;
-               }
-       }
-       local_irq_restore(flags);
-
-       return ret;
-}
-
 static inline int
 event_filter_match(struct perf_event *event)
 {
        return (event->cpu == -1 || event->cpu == smp_processor_id()) &&
-              perf_cgroup_match(event) && pmu_filter_match(event);
+              perf_cgroup_match(event); /* the pmu_filter_match() check is dropped here */
 }
 
 static void
-event_sched_out(struct perf_event *event,
-                 struct perf_cpu_context *cpuctx,
-                 struct perf_event_context *ctx)
+event_sched_out(struct perf_event *event, struct perf_event_context *ctx)
 {
+       struct perf_event_pmu_context *epc = event->pmu_ctx;
+       struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context);
        enum perf_event_state state = PERF_EVENT_STATE_INACTIVE;
 
+       /*
+        * XXX: cpc serialization; probably covered by IRQs being disabled
+        * on this CPU.
+        */
+
        WARN_ON_ONCE(event->ctx != ctx);
        lockdep_assert_held(&ctx->lock);
 
        perf_event_set_state(event, state);
 
        if (!is_software_event(event))
-               cpuctx->active_oncpu--;
-       if (!--ctx->nr_active)
-               perf_event_ctx_deactivate(ctx);
+               cpc->active_oncpu--;
        if (event->attr.freq && event->attr.sample_freq)
                ctx->nr_freq--;
-       if (event->attr.exclusive || !cpuctx->active_oncpu)
-               cpuctx->exclusive = 0;
+       if (event->attr.exclusive || !cpc->active_oncpu)
+               cpc->exclusive = 0;
 
        perf_pmu_enable(event->pmu);
 }
 
 static void
-group_sched_out(struct perf_event *group_event,
-               struct perf_cpu_context *cpuctx,
-               struct perf_event_context *ctx)
+group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx)
 {
        struct perf_event *event;
 
        if (group_event->state != PERF_EVENT_STATE_ACTIVE)
                return;
 
-       perf_pmu_disable(ctx->pmu);
+       perf_assert_pmu_disabled(group_event->pmu_ctx->pmu); /* PMU disable/enable is no longer done here, only asserted */
 
-       event_sched_out(group_event, cpuctx, ctx);
+       event_sched_out(group_event, ctx);
 
        /*
         * Schedule out siblings (if any):
         */
        for_each_sibling_event(event, group_event)
-               event_sched_out(event, cpuctx, ctx);
-
-       perf_pmu_enable(ctx->pmu);
+               event_sched_out(event, ctx);
 }
 
 #define DETACH_GROUP   0x01UL
                           struct perf_event_context *ctx,
                           void *info)
 {
+       struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx;
        unsigned long flags = (unsigned long)info;
 
        if (ctx->is_active & EVENT_TIME) {
                update_cgrp_time_from_cpuctx(cpuctx, false);
        }
 
-       event_sched_out(event, cpuctx, ctx);
+       event_sched_out(event, ctx);
        if (flags & DETACH_GROUP)
                perf_group_detach(event);
        if (flags & DETACH_CHILD)
                perf_child_detach(event);
        list_del_event(event, ctx);
 
+       if (!pmu_ctx->nr_events) {
+               pmu_ctx->rotate_necessary = 0;
+
+               if (ctx->task && ctx->is_active) {
+                       struct perf_cpu_pmu_context *cpc;
+
+                       cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context);
+                       WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx);
+                       cpc->task_epc = NULL;
+               }
+       }
+
        if (!ctx->nr_events && ctx->is_active) {
                if (ctx == &cpuctx->ctx)
                        update_cgrp_time_from_cpuctx(cpuctx, true);
 
                ctx->is_active = 0;
-               ctx->rotate_necessary = 0;
                if (ctx->task) {
                        WARN_ON_ONCE(cpuctx->task_ctx != ctx);
                        cpuctx->task_ctx = NULL;
         * event_function_call() user.
         */
        raw_spin_lock_irq(&ctx->lock);
-       /*
-        * Cgroup events are per-cpu events, and must IPI because of
-        * cgrp_cpuctx_list.
-        */
-       if (!ctx->is_active && !is_cgroup_event(event)) {
-               __perf_remove_from_context(event, __get_cpu_context(ctx),
+       if (!ctx->is_active) {
+               __perf_remove_from_context(event, this_cpu_ptr(&perf_cpu_context),
                                           ctx, (void *)flags);
                raw_spin_unlock_irq(&ctx->lock);
                return;
                update_cgrp_time_from_event(event);
        }
 
+       perf_pmu_disable(event->pmu_ctx->pmu);
+
        if (event == event->group_leader)
-               group_sched_out(event, cpuctx, ctx);
+               group_sched_out(event, ctx);
        else
-               event_sched_out(event, cpuctx, ctx);
+               event_sched_out(event, ctx);
 
        perf_event_set_state(event, PERF_EVENT_STATE_OFF);
        perf_cgroup_event_disable(event, ctx);
+
+       perf_pmu_enable(event->pmu_ctx->pmu);
 }
 
 /*
 static void perf_log_itrace_start(struct perf_event *event);
 
 static int
-event_sched_in(struct perf_event *event,
-                struct perf_cpu_context *cpuctx,
-                struct perf_event_context *ctx)
+event_sched_in(struct perf_event *event, struct perf_event_context *ctx)
 {
+       struct perf_event_pmu_context *epc = event->pmu_ctx;
+       struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context);
        int ret = 0;
 
        WARN_ON_ONCE(event->ctx != ctx);
        }
 
        if (!is_software_event(event))
-               cpuctx->active_oncpu++;
-       if (!ctx->nr_active++)
-               perf_event_ctx_activate(ctx);
+               cpc->active_oncpu++;
        if (event->attr.freq && event->attr.sample_freq)
                ctx->nr_freq++;
 
        if (event->attr.exclusive)
-               cpuctx->exclusive = 1;
+               cpc->exclusive = 1;
 
 out:
        perf_pmu_enable(event->pmu);
 }
 
 static int
-group_sched_in(struct perf_event *group_event,
-              struct perf_cpu_context *cpuctx,
-              struct perf_event_context *ctx)
+group_sched_in(struct perf_event *group_event, struct perf_event_context *ctx)
 {
        struct perf_event *event, *partial_group = NULL;
-       struct pmu *pmu = ctx->pmu;
+       struct pmu *pmu = group_event->pmu_ctx->pmu;
 
        if (group_event->state == PERF_EVENT_STATE_OFF)
                return 0;
 
        pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
 
-       if (event_sched_in(group_event, cpuctx, ctx))
+       if (event_sched_in(group_event, ctx))
                goto error;
 
        /*
         * Schedule in siblings as one group (if any):
         */
        for_each_sibling_event(event, group_event) {
-               if (event_sched_in(event, cpuctx, ctx)) {
+               if (event_sched_in(event, ctx)) {
                        partial_group = event;
                        goto group_error;
                }
                if (event == partial_group)
                        break;
 
-               event_sched_out(event, cpuctx, ctx);
+               event_sched_out(event, ctx);
        }
-       event_sched_out(group_event, cpuctx, ctx);
+       event_sched_out(group_event, ctx);
 
 error:
        pmu->cancel_txn(pmu);
 /*
  * Work out whether we can put this event group on the CPU now.
  */
-static int group_can_go_on(struct perf_event *event,
-                          struct perf_cpu_context *cpuctx,
-                          int can_add_hw)
+static int group_can_go_on(struct perf_event *event, int can_add_hw)
 {
+       struct perf_event_pmu_context *epc = event->pmu_ctx;
+       struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context);
+
        /*
         * Groups consisting entirely of software events can always go on.
         */
         * If an exclusive group is already on, no other hardware
         * events can go on.
         */
-       if (cpuctx->exclusive)
+       if (cpc->exclusive)
                return 0;
        /*
         * If this group is exclusive and there are already
        perf_group_attach(event);
 }
 
-static void ctx_sched_out(struct perf_event_context *ctx,
-                         struct perf_cpu_context *cpuctx,
-                         enum event_type_t event_type);
-static void
-ctx_sched_in(struct perf_event_context *ctx,
-            struct perf_cpu_context *cpuctx,
-            enum event_type_t event_type);
-
-static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
-                              struct perf_event_context *ctx,
-                              enum event_type_t event_type)
+static void task_ctx_sched_out(struct perf_event_context *ctx,
+                               enum event_type_t event_type)
 {
+       struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); /* looked up locally instead of being passed in */
+
        if (!cpuctx->task_ctx)
                return;
 
        if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
                return;
 
-       ctx_sched_out(ctx, cpuctx, event_type);
+       ctx_sched_out(ctx, event_type);
 }
 
 static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
                                struct perf_event_context *ctx)
 {
-       cpu_ctx_sched_in(cpuctx, EVENT_PINNED);
+       ctx_sched_in(&cpuctx->ctx, EVENT_PINNED);
        if (ctx)
-               ctx_sched_in(ctx, cpuctx, EVENT_PINNED);
-       cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
+                ctx_sched_in(ctx, EVENT_PINNED);
+       ctx_sched_in(&cpuctx->ctx, EVENT_FLEXIBLE);
        if (ctx)
-               ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
+                ctx_sched_in(ctx, EVENT_FLEXIBLE);
 }
 
 /*
  * event_type is a bit mask of the types of events involved. For CPU events,
  * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
  */
+/*
+ * XXX: ctx_resched() reschedules the entire perf_event_context when a new
+ * event is added to the context or an existing event in it is enabled. We
+ * can probably optimize this by rescheduling only the affected pmu_ctx.
+ */
 static void ctx_resched(struct perf_cpu_context *cpuctx,
                        struct perf_event_context *task_ctx,
                        enum event_type_t event_type)
 {
-       enum event_type_t ctx_event_type;
        bool cpu_event = !!(event_type & EVENT_CPU);
 
        /*
        if (event_type & EVENT_PINNED)
                event_type |= EVENT_FLEXIBLE;
 
-       ctx_event_type = event_type & EVENT_ALL;
+       event_type &= EVENT_ALL;
 
-       perf_pmu_disable(cpuctx->ctx.pmu);
-       if (task_ctx)
-               task_ctx_sched_out(cpuctx, task_ctx, event_type);
+       perf_ctx_disable(&cpuctx->ctx);
+       if (task_ctx) {
+               perf_ctx_disable(task_ctx);
+               task_ctx_sched_out(task_ctx, event_type);
+       }
 
        /*
         * Decide which cpu ctx groups to schedule out based on the types
         *  - otherwise, do nothing more.
         */
        if (cpu_event)
-               cpu_ctx_sched_out(cpuctx, ctx_event_type);
-       else if (ctx_event_type & EVENT_PINNED)
-               cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+               ctx_sched_out(&cpuctx->ctx, event_type);
+       else if (event_type & EVENT_PINNED)
+               ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
 
        perf_event_sched_in(cpuctx, task_ctx);
-       perf_pmu_enable(cpuctx->ctx.pmu);
+
+       perf_ctx_enable(&cpuctx->ctx);
+       if (task_ctx)
+               perf_ctx_enable(task_ctx);
 }
 
 void perf_pmu_resched(struct pmu *pmu)
 {
-       struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+       struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct perf_event_context *task_ctx = cpuctx->task_ctx;
 
        perf_ctx_lock(cpuctx, task_ctx);
 {
        struct perf_event *event = info;
        struct perf_event_context *ctx = event->ctx;
-       struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+       struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct perf_event_context *task_ctx = cpuctx->task_ctx;
        bool reprogram = true;
        int ret = 0;
 #endif
 
        if (reprogram) {
-               ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+               ctx_sched_out(ctx, EVENT_TIME);
                add_event_to_ctx(event, ctx);
                ctx_resched(cpuctx, task_ctx, get_event_type(event));
        } else {
        WARN_ON_ONCE(!exclusive_event_installable(event, ctx));
 
        if (event->cpu != -1)
-               event->cpu = cpu;
+               WARN_ON_ONCE(event->cpu != cpu);
 
        /*
         * Ensures that if we can observe event->ctx, both the event and ctx
         * perf_event_attr::disabled events will not run and can be initialized
         * without IPI. Except when this is the first event for the context, in
         * that case we need the magic of the IPI to set ctx->is_active.
-        * Similarly, cgroup events for the context also needs the IPI to
-        * manipulate the cgrp_cpuctx_list.
         *
         * The IOC_ENABLE that is sure to follow the creation of a disabled
         * event will issue the IPI and reprogram the hardware.
                return;
 
        if (ctx->is_active)
-               ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+               ctx_sched_out(ctx, EVENT_TIME);
 
        perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
        perf_cgroup_event_enable(event, ctx);
                return;
 
        if (!event_filter_match(event)) {
-               ctx_sched_in(ctx, cpuctx, EVENT_TIME);
+               ctx_sched_in(ctx, EVENT_TIME);
                return;
        }
 
         * then don't put it on unless the group is on.
         */
        if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
-               ctx_sched_in(ctx, cpuctx, EVENT_TIME);
+               ctx_sched_in(ctx, EVENT_TIME);
                return;
        }
 
        return err;
 }
 
-static void ctx_sched_out(struct perf_event_context *ctx,
-                         struct perf_cpu_context *cpuctx,
-                         enum event_type_t event_type)
+/*
+ * Schedule out the active groups of a single PMU context.
+ *
+ * @pmu_ctx:    the per-PMU context whose active lists are walked
+ * @event_type: mask of EVENT_PINNED / EVENT_FLEXIBLE selecting which active
+ *              lists are drained; 0 means no group is scheduled out
+ *
+ * For a task context that is not active (ctx->task set, ctx->is_active
+ * clear) the task_epc link in this CPU's per-PMU context is cleared first;
+ * this happens even when @event_type is 0.
+ */
+static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx,
+                               enum event_type_t event_type)
 {
+       struct perf_event_context *ctx = pmu_ctx->ctx;
         struct perf_event *event, *tmp;
+       struct pmu *pmu = pmu_ctx->pmu;
+
+       if (ctx->task && !ctx->is_active) {
+               struct perf_cpu_pmu_context *cpc;
+
+               cpc = this_cpu_ptr(pmu->cpu_pmu_context);
+               WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx);
+               cpc->task_epc = NULL;
+       }
+
+       if (!event_type)
+               return;
+
+       /* The PMU stays disabled while the selected groups are scheduled out. */
+       perf_pmu_disable(pmu);
+       if (event_type & EVENT_PINNED) {
+               list_for_each_entry_safe(event, tmp,
+                                        &pmu_ctx->pinned_active,
+                                        active_list)
+                       group_sched_out(event, ctx);
+       }
+
+       if (event_type & EVENT_FLEXIBLE) {
+               list_for_each_entry_safe(event, tmp,
+                                        &pmu_ctx->flexible_active,
+                                        active_list)
+                       group_sched_out(event, ctx);
+               /*
+                * Since we cleared EVENT_FLEXIBLE, also clear
+                * rotate_necessary, it will be reset by
+                * ctx_flexible_sched_in() when needed.
+                */
+               pmu_ctx->rotate_necessary = 0;
+       }
+       perf_pmu_enable(pmu);
+}
+
+static void
+ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
+{
+       struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+       struct perf_event_pmu_context *pmu_ctx;
        int is_active = ctx->is_active;
 
        lockdep_assert_held(&ctx->lock);
 
        is_active ^= ctx->is_active; /* changed bits */
 
-       if (!ctx->nr_active || !(is_active & EVENT_ALL))
-               return;
-
-       perf_pmu_disable(ctx->pmu);
-       if (is_active & EVENT_PINNED) {
-               list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list)
-                       group_sched_out(event, cpuctx, ctx);
-       }
-
-       if (is_active & EVENT_FLEXIBLE) {
-               list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list)
-                       group_sched_out(event, cpuctx, ctx);
-
-               /*
-                * Since we cleared EVENT_FLEXIBLE, also clear
-                * rotate_necessary, is will be reset by
-                * ctx_flexible_sched_in() when needed.
-                */
-               ctx->rotate_necessary = 0;
-       }
-       perf_pmu_enable(ctx->pmu);
+       list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry)
+               __pmu_ctx_sched_out(pmu_ctx, is_active);
 }
 
 /*
        }
 }
 
-static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
-                                        struct task_struct *next)
+/*
+ * Walk two lists in lock-step: @pos1 iterates @head1 and @pos2 iterates
+ * @head2, both linked through @member. The walk ends as soon as either
+ * cursor reaches its list head, so only as many pairs are visited as the
+ * shorter list has entries.
+ */
+#define double_list_for_each_entry(pos1, pos2, head1, head2, member)   \
+       for (pos1 = list_first_entry(head1, typeof(*pos1), member),     \
+            pos2 = list_first_entry(head2, typeof(*pos2), member);     \
+            !list_entry_is_head(pos1, head1, member) &&                \
+            !list_entry_is_head(pos2, head2, member);                  \
+            pos1 = list_next_entry(pos1, member),                      \
+            pos2 = list_next_entry(pos2, member))
+
+/*
+ * Exchange the per-PMU task_ctx_data between @prev_ctx and @next_ctx.
+ *
+ * The pmu_ctx_list of both contexts is walked pairwise. Nothing is done
+ * when @prev_ctx->nr_task_data is zero. A pair whose PMUs differ triggers
+ * a one-time warning and is skipped. For each matching pair the PMU's
+ * ->swap_task_ctx() hook is used when provided, otherwise the
+ * task_ctx_data pointers are swapped directly.
+ */
+static void perf_event_swap_task_ctx_data(struct perf_event_context *prev_ctx,
+                                         struct perf_event_context *next_ctx)
+{
+       struct perf_event_pmu_context *prev_epc, *next_epc;
+
+       if (!prev_ctx->nr_task_data)
+               return;
+
+       double_list_for_each_entry(prev_epc, next_epc,
+                                  &prev_ctx->pmu_ctx_list, &next_ctx->pmu_ctx_list,
+                                  pmu_ctx_entry) {
+
+               if (WARN_ON_ONCE(prev_epc->pmu != next_epc->pmu))
+                       continue;
+
+               /*
+                * PMU specific parts of task perf context can require
+                * additional synchronization. As an example of such
+                * synchronization see implementation details of Intel
+                * LBR call stack data profiling;
+                */
+               if (prev_epc->pmu->swap_task_ctx)
+                       prev_epc->pmu->swap_task_ctx(prev_epc, next_epc);
+               else
+                       swap(prev_epc->task_ctx_data, next_epc->task_ctx_data);
+       }
+}
+
+/*
+ * Invoke the ->sched_task() callback for every PMU context on @ctx's
+ * pmu_ctx_list, passing @sched_in through unchanged.
+ *
+ * A PMU context is skipped when this CPU's sched_cb_usage count for its
+ * PMU is zero, or when the PMU does not implement ->sched_task().
+ */
+static void perf_ctx_sched_task_cb(struct perf_event_context *ctx, bool sched_in)
+{
+       struct perf_event_pmu_context *pmu_ctx;
+       struct perf_cpu_pmu_context *cpc;
+
+       list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+               cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context);
+
+               if (cpc->sched_cb_usage && pmu_ctx->pmu->sched_task)
+                       pmu_ctx->pmu->sched_task(pmu_ctx, sched_in);
+       }
+}
+
+static void
+perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
 {
-       struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
+       struct perf_event_context *ctx = task->perf_event_ctxp;
        struct perf_event_context *next_ctx;
        struct perf_event_context *parent, *next_parent;
-       struct perf_cpu_context *cpuctx;
        int do_switch = 1;
-       struct pmu *pmu;
 
        if (likely(!ctx))
                return;
 
-       pmu = ctx->pmu;
-       cpuctx = __get_cpu_context(ctx);
-       if (!cpuctx->task_ctx)
-               return;
-
        rcu_read_lock();
-       next_ctx = next->perf_event_ctxp[ctxn];
+       next_ctx = rcu_dereference(next->perf_event_ctxp);
        if (!next_ctx)
                goto unlock;
 
                raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
                if (context_equiv(ctx, next_ctx)) {
 
-                       perf_pmu_disable(pmu);
+                       perf_ctx_disable(ctx);
 
                        /* PMIs are disabled; ctx->nr_pending is stable. */
                        if (local_read(&ctx->nr_pending) ||
                        WRITE_ONCE(ctx->task, next);
                        WRITE_ONCE(next_ctx->task, task);
 
-                       if (cpuctx->sched_cb_usage && pmu->sched_task)
-                               pmu->sched_task(ctx, false);
-
-                       /*
-                        * PMU specific parts of task perf context can require
-                        * additional synchronization. As an example of such
-                        * synchronization see implementation details of Intel
-                        * LBR call stack data profiling;
-                        */
-                       if (pmu->swap_task_ctx)
-                               pmu->swap_task_ctx(ctx, next_ctx);
-                       else
-                               swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
+                       perf_ctx_sched_task_cb(ctx, false);
+                       perf_event_swap_task_ctx_data(ctx, next_ctx);
 
-                       perf_pmu_enable(pmu);
+                       perf_ctx_enable(ctx);
 
                        /*
                         * RCU_INIT_POINTER here is safe because we've not
                         * since those values are always verified under
                         * ctx->lock which we're now holding.
                         */
-                       RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx);
-                       RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx);
+                       RCU_INIT_POINTER(task->perf_event_ctxp, next_ctx);
+                       RCU_INIT_POINTER(next->perf_event_ctxp, ctx);
 
                        do_switch = 0;
 
 
        if (do_switch) {
                raw_spin_lock(&ctx->lock);
-               perf_pmu_disable(pmu);
+               perf_ctx_disable(ctx);
 
 inside_switch:
-               if (cpuctx->sched_cb_usage && pmu->sched_task)
-                       pmu->sched_task(ctx, false);
-               task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
+               perf_ctx_sched_task_cb(ctx, false);
+               task_ctx_sched_out(ctx, EVENT_ALL);
 
-               perf_pmu_enable(pmu);
+               perf_ctx_enable(ctx);
                raw_spin_unlock(&ctx->lock);
        }
 }
 
 static DEFINE_PER_CPU(struct list_head, sched_cb_list);
+static DEFINE_PER_CPU(int, perf_sched_cb_usages);
 
 void perf_sched_cb_dec(struct pmu *pmu)
 {
-       struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+       struct perf_cpu_pmu_context *cpc = this_cpu_ptr(pmu->cpu_pmu_context);
 
        this_cpu_dec(perf_sched_cb_usages);
+       barrier();
 
-       if (!--cpuctx->sched_cb_usage)
-               list_del(&cpuctx->sched_cb_entry);
+       if (!--cpc->sched_cb_usage)
+               list_del(&cpc->sched_cb_entry);
 }
 
 
 void perf_sched_cb_inc(struct pmu *pmu)
 {
-       struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+       struct perf_cpu_pmu_context *cpc = this_cpu_ptr(pmu->cpu_pmu_context);
 
-       if (!cpuctx->sched_cb_usage++)
-               list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
+       if (!cpc->sched_cb_usage++)
+               list_add(&cpc->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
 
+       barrier();
        this_cpu_inc(perf_sched_cb_usages);
 }
 
  * PEBS requires this to provide PID/TID information. This requires we flush
  * all queued PEBS records before we context switch to a new task.
  */
-static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in)
+static void __perf_pmu_sched_task(struct perf_cpu_pmu_context *cpc, bool sched_in)
 {
+       struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct pmu *pmu;
 
-       pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */
+       pmu = cpc->epc.pmu;
 
+       /* software PMUs will not have sched_task */
        if (WARN_ON_ONCE(!pmu->sched_task))
                return;
 
        perf_ctx_lock(cpuctx, cpuctx->task_ctx);
        perf_pmu_disable(pmu);
 
-       pmu->sched_task(cpuctx->task_ctx, sched_in);
+       pmu->sched_task(cpc->task_epc, sched_in);
 
        perf_pmu_enable(pmu);
        perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
                                struct task_struct *next,
                                bool sched_in)
 {
-       struct perf_cpu_context *cpuctx;
+       struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+       struct perf_cpu_pmu_context *cpc;
 
-       if (prev == next)
+       /* cpuctx->task_ctx will be handled in perf_event_context_sched_in/out */
+       if (prev == next || cpuctx->task_ctx)
                return;
 
-       list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
-               /* will be handled in perf_event_context_sched_in/out */
-               if (cpuctx->task_ctx)
-                       continue;
-
-               __perf_pmu_sched_task(cpuctx, sched_in);
-       }
+       list_for_each_entry(cpc, this_cpu_ptr(&sched_cb_list), sched_cb_entry)
+               __perf_pmu_sched_task(cpc, sched_in);
 }
 
 static void perf_event_switch(struct task_struct *task,
                              struct task_struct *next_prev, bool sched_in);
 
-#define for_each_task_context_nr(ctxn)                                 \
-       for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
-
 /*
  * Called from scheduler to remove the events of the current task,
  * with interrupts disabled.
 void __perf_event_task_sched_out(struct task_struct *task,
                                 struct task_struct *next)
 {
-       int ctxn;
-
        if (__this_cpu_read(perf_sched_cb_usages))
                perf_pmu_sched_task(task, next, false);
 
        if (atomic_read(&nr_switch_events))
                perf_event_switch(task, next, false);
 
-       for_each_task_context_nr(ctxn)
-               perf_event_context_sched_out(task, ctxn, next);
+       perf_event_context_sched_out(task, next);
 
        /*
         * if cgroup events exist on this CPU, then we need
                perf_cgroup_switch(next);
 }
 
-/*
- * Called with IRQs disabled
- */
-static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
-                             enum event_type_t event_type)
-{
-       ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
-}
-
 static bool perf_less_group_idx(const void *l, const void *r)
 {
        const struct perf_event *le = *(const struct perf_event **)l;
        }
 }
 
-static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
+/*
+ * Record @pmu_ctx as the task_epc of this CPU's per-PMU context.
+ *
+ * Does nothing when @pmu_ctx belongs to a context without a task
+ * (ctx->task is NULL). Warns once if a different task_epc is already
+ * linked, then overwrites it.
+ */
+static void __link_epc(struct perf_event_pmu_context *pmu_ctx)
+{
+       struct perf_cpu_pmu_context *cpc;
+
+       if (!pmu_ctx->ctx->task)
+               return;
+
+       cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context);
+       WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx);
+       cpc->task_epc = pmu_ctx;
+}
+
+static noinline int visit_groups_merge(struct perf_event_context *ctx,
                                struct perf_event_groups *groups, int cpu,
+                               struct pmu *pmu,
                                int (*func)(struct perf_event *, void *),
                                void *data)
 {
 #ifdef CONFIG_CGROUP_PERF
        struct cgroup_subsys_state *css = NULL;
 #endif
+       struct perf_cpu_context *cpuctx = NULL;
        /* Space for per CPU and/or any CPU event iterators. */
        struct perf_event *itrs[2];
        struct min_heap event_heap;
        struct perf_event **evt;
        int ret;
 
-       if (cpuctx) {
+       if (pmu->filter && pmu->filter(pmu, cpu))
+               return 0;
+
+       if (!ctx->task) {
+               cpuctx = this_cpu_ptr(&perf_cpu_context);
                event_heap = (struct min_heap){
                        .data = cpuctx->heap,
                        .nr = 0,
                        .size = ARRAY_SIZE(itrs),
                };
                /* Events not within a CPU context may be on any CPU. */
-               __heap_add(&event_heap, perf_event_groups_first(groups, -1, NULL));
+               __heap_add(&event_heap, perf_event_groups_first(groups, -1, pmu, NULL));
        }
        evt = event_heap.data;
 
-       __heap_add(&event_heap, perf_event_groups_first(groups, cpu, NULL));
+       __heap_add(&event_heap, perf_event_groups_first(groups, cpu, pmu, NULL));
 
 #ifdef CONFIG_CGROUP_PERF
        for (; css; css = css->parent)
-               __heap_add(&event_heap, perf_event_groups_first(groups, cpu, css->cgroup));
+               __heap_add(&event_heap, perf_event_groups_first(groups, cpu, pmu, css->cgroup));
 #endif
 
+       if (event_heap.nr) {
+               __link_epc((*evt)->pmu_ctx);
+               perf_assert_pmu_disabled((*evt)->pmu_ctx->pmu);
+       }
+
        min_heapify_all(&event_heap, &perf_min_heap);
 
        while (event_heap.nr) {
                if (ret)
                        return ret;
 
-               *evt = perf_event_groups_next(*evt);
+               *evt = perf_event_groups_next(*evt, pmu);
                if (*evt)
                        min_heapify(&event_heap, 0, &perf_min_heap);
                else
 static int merge_sched_in(struct perf_event *event, void *data)
 {
        struct perf_event_context *ctx = event->ctx;
-       struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
        int *can_add_hw = data;
 
        if (event->state <= PERF_EVENT_STATE_OFF)
        if (!event_filter_match(event))
                return 0;
 
-       if (group_can_go_on(event, cpuctx, *can_add_hw)) {
-               if (!group_sched_in(event, cpuctx, ctx))
+       if (group_can_go_on(event, *can_add_hw)) {
+               if (!group_sched_in(event, ctx))
                        list_add_tail(&event->active_list, get_event_list(event));
        }
 
                        perf_cgroup_event_disable(event, ctx);
                        perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
                } else {
-                       ctx->rotate_necessary = 1;
-                       perf_mux_hrtimer_restart(cpuctx);
+                       struct perf_cpu_pmu_context *cpc;
+
+                       event->pmu_ctx->rotate_necessary = 1;
+                       cpc = this_cpu_ptr(event->pmu_ctx->pmu->cpu_pmu_context);
+                       perf_mux_hrtimer_restart(cpc);
                        group_update_userpage(event);
                }
        }
        return 0;
 }
 
-static void
-ctx_pinned_sched_in(struct perf_event_context *ctx,
-                   struct perf_cpu_context *cpuctx)
+static void ctx_pinned_sched_in(struct perf_event_context *ctx, struct pmu *pmu)
 {
+       struct perf_event_pmu_context *pmu_ctx;
        int can_add_hw = 1;
 
-       if (ctx != &cpuctx->ctx)
-               cpuctx = NULL;
-
-       visit_groups_merge(cpuctx, &ctx->pinned_groups,
-                          smp_processor_id(),
-                          merge_sched_in, &can_add_hw);
+       if (pmu) {
+               visit_groups_merge(ctx, &ctx->pinned_groups,
+                                  smp_processor_id(), pmu,
+                                  merge_sched_in, &can_add_hw);
+       } else {
+               list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+                       can_add_hw = 1;
+                       visit_groups_merge(ctx, &ctx->pinned_groups,
+                                          smp_processor_id(), pmu_ctx->pmu,
+                                          merge_sched_in, &can_add_hw);
+               }
+       }
 }
 
-static void
-ctx_flexible_sched_in(struct perf_event_context *ctx,
-                     struct perf_cpu_context *cpuctx)
+static void ctx_flexible_sched_in(struct perf_event_context *ctx, struct pmu *pmu)
 {
+       struct perf_event_pmu_context *pmu_ctx;
        int can_add_hw = 1;
 
-       if (ctx != &cpuctx->ctx)
-               cpuctx = NULL;
+       if (pmu) {
+               visit_groups_merge(ctx, &ctx->flexible_groups,
+                                  smp_processor_id(), pmu,
+                                  merge_sched_in, &can_add_hw);
+       } else {
+               list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+                       can_add_hw = 1;
+                       visit_groups_merge(ctx, &ctx->flexible_groups,
+                                          smp_processor_id(), pmu_ctx->pmu,
+                                          merge_sched_in, &can_add_hw);
+               }
+       }
+}
 
-       visit_groups_merge(cpuctx, &ctx->flexible_groups,
-                          smp_processor_id(),
-                          merge_sched_in, &can_add_hw);
+/*
+ * Schedule in the flexible groups of @ctx for @pmu. This only forwards to
+ * ctx_flexible_sched_in(); pinned groups are not handled here.
+ */
+static void __pmu_ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu)
+{
+       ctx_flexible_sched_in(ctx, pmu);
 }
 
 static void
-ctx_sched_in(struct perf_event_context *ctx,
-            struct perf_cpu_context *cpuctx,
-            enum event_type_t event_type)
+ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
 {
+       struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        int is_active = ctx->is_active;
 
        lockdep_assert_held(&ctx->lock);
         * in order to give them the best chance of going on.
         */
        if (is_active & EVENT_PINNED)
-               ctx_pinned_sched_in(ctx, cpuctx);
+               ctx_pinned_sched_in(ctx, NULL);
 
        /* Then walk through the lower prio flexible groups */
        if (is_active & EVENT_FLEXIBLE)
-               ctx_flexible_sched_in(ctx, cpuctx);
+               ctx_flexible_sched_in(ctx, NULL);
 }
 
-static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
-                            enum event_type_t event_type)
+static void perf_event_context_sched_in(struct task_struct *task)
 {
-       struct perf_event_context *ctx = &cpuctx->ctx;
-
-       ctx_sched_in(ctx, cpuctx, event_type);
-}
+       struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+       struct perf_event_context *ctx;
 
-static void perf_event_context_sched_in(struct perf_event_context *ctx,
-                                       struct task_struct *task)
-{
-       struct perf_cpu_context *cpuctx;
-       struct pmu *pmu;
+       rcu_read_lock();
+       ctx = rcu_dereference(task->perf_event_ctxp);
+       if (!ctx)
+               goto rcu_unlock;
 
-       cpuctx = __get_cpu_context(ctx);
+       if (cpuctx->task_ctx == ctx) {
+               perf_ctx_lock(cpuctx, ctx);
+               perf_ctx_disable(ctx);
 
-       /*
-        * HACK: for HETEROGENEOUS the task context might have switched to a
-        * different PMU, force (re)set the context,
-        */
-       pmu = ctx->pmu = cpuctx->ctx.pmu;
+               perf_ctx_sched_task_cb(ctx, true);
 
-       if (cpuctx->task_ctx == ctx) {
-               if (cpuctx->sched_cb_usage)
-                       __perf_pmu_sched_task(cpuctx, true);
-               return;
+               perf_ctx_enable(ctx);
+               perf_ctx_unlock(cpuctx, ctx);
+               goto rcu_unlock;
        }
 
        perf_ctx_lock(cpuctx, ctx);
        if (!ctx->nr_events)
                goto unlock;
 
-       perf_pmu_disable(pmu);
+       perf_ctx_disable(ctx);
        /*
         * We want to keep the following priority order:
         * cpu pinned (that don't need to move), task pinned,
         * However, if task's ctx is not carrying any pinned
         * events, no need to flip the cpuctx's events around.
         */
-       if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
-               cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+       if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) {
+               perf_ctx_disable(&cpuctx->ctx);
+               ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
+       }
+
        perf_event_sched_in(cpuctx, ctx);
 
-       if (cpuctx->sched_cb_usage && pmu->sched_task)
-               pmu->sched_task(cpuctx->task_ctx, true);
+       perf_ctx_sched_task_cb(cpuctx->task_ctx, true);
 
-       perf_pmu_enable(pmu);
+       if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
+               perf_ctx_enable(&cpuctx->ctx);
+
+       perf_ctx_enable(ctx);
 
 unlock:
        perf_ctx_unlock(cpuctx, ctx);
+rcu_unlock:
+       rcu_read_unlock();
 }
 
 /*
 void __perf_event_task_sched_in(struct task_struct *prev,
                                struct task_struct *task)
 {
-       struct perf_event_context *ctx;
-       int ctxn;
-
-       for_each_task_context_nr(ctxn) {
-               ctx = task->perf_event_ctxp[ctxn];
-               if (likely(!ctx))
-                       continue;
-
-               perf_event_context_sched_in(ctx, task);
-       }
+       perf_event_context_sched_in(task);
 
        if (atomic_read(&nr_switch_events))
                perf_event_switch(task, prev, true);
  * events. At the same time, make sure, having freq events does not change
  * the rate of unthrottling as that would introduce bias.
  */
-static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
-                                          int needs_unthr)
+static void
+perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
 {
        struct perf_event *event;
        struct hw_perf_event *hwc;
         * - context have events in frequency mode (needs freq adjust)
         * - there are events to unthrottle on this cpu
         */
-       if (!(ctx->nr_freq || needs_unthr))
+       if (!(ctx->nr_freq || unthrottle))
                return;
 
        raw_spin_lock(&ctx->lock);
-       perf_pmu_disable(ctx->pmu);
 
        list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
                if (event->state != PERF_EVENT_STATE_ACTIVE)
                        continue;
 
+               // XXX use visit thingy to avoid the -1,cpu match
                if (!event_filter_match(event))
                        continue;
 
                perf_pmu_enable(event->pmu);
        }
 
-       perf_pmu_enable(ctx->pmu);
        raw_spin_unlock(&ctx->lock);
 }
 
 
 /* pick an event from the flexible_groups to rotate */
 static inline struct perf_event *
-ctx_event_to_rotate(struct perf_event_context *ctx)
+ctx_event_to_rotate(struct perf_event_pmu_context *pmu_ctx)
 {
        struct perf_event *event;
+       struct rb_node *node;
+       struct rb_root *tree;
+       struct __group_key key = {
+               .pmu = pmu_ctx->pmu,
+       };
 
        /* pick the first active flexible event */
-       event = list_first_entry_or_null(&ctx->flexible_active,
+       event = list_first_entry_or_null(&pmu_ctx->flexible_active,
                                         struct perf_event, active_list);
+       if (event)
+               goto out;
 
        /* if no active flexible event, pick the first event */
-       if (!event) {
-               event = rb_entry_safe(rb_first(&ctx->flexible_groups.tree),
-                                     typeof(*event), group_node);
+       tree = &pmu_ctx->ctx->flexible_groups.tree;
+
+       if (!pmu_ctx->ctx->task) {
+               key.cpu = smp_processor_id();
+
+               node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup);
+               if (node)
+                       event = __node_2_pe(node);
+               goto out;
        }
 
-       /*
+       key.cpu = -1;
+       node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup);
+       if (node) {
+               event = __node_2_pe(node);
+               goto out;
+       }
+
+       key.cpu = smp_processor_id();
+       node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup);
+       if (node)
+               event = __node_2_pe(node);
+
+out:
+       /*
         * Unconditionally clear rotate_necessary; if ctx_flexible_sched_in()
         * finds there are unschedulable events, it will set it again.
         */
-       ctx->rotate_necessary = 0;
+       pmu_ctx->rotate_necessary = 0;
 
        return event;
 }
 
-static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
+static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc)
 {
+       struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+       struct perf_event_pmu_context *cpu_epc, *task_epc = NULL;
        struct perf_event *cpu_event = NULL, *task_event = NULL;
        struct perf_event_context *task_ctx = NULL;
        int cpu_rotate, task_rotate;
+       struct pmu *pmu;
 
        /*
         * Since we run this from IRQ context, nobody can install new
         * events, thus the event count values are stable.
         */
 
-       cpu_rotate = cpuctx->ctx.rotate_necessary;
+       cpu_epc = &cpc->epc;
+       pmu = cpu_epc->pmu;
+       task_epc = cpc->task_epc;
+
+       cpu_rotate = cpu_epc->rotate_necessary;
        task_ctx = cpuctx->task_ctx;
-       task_rotate = task_ctx ? task_ctx->rotate_necessary : 0;
+       task_rotate = task_epc ? task_epc->rotate_necessary : 0;
 
        if (!(cpu_rotate || task_rotate))
                return false;
 
        perf_ctx_lock(cpuctx, cpuctx->task_ctx);
-       perf_pmu_disable(cpuctx->ctx.pmu);
+       perf_pmu_disable(pmu);
 
        if (task_rotate)
-               task_event = ctx_event_to_rotate(task_ctx);
+               task_event = ctx_event_to_rotate(task_epc);
        if (cpu_rotate)
-               cpu_event = ctx_event_to_rotate(&cpuctx->ctx);
+               cpu_event = ctx_event_to_rotate(cpu_epc);
 
        /*
         * As per the order given at ctx_resched() first 'pop' task flexible
         * and then, if needed CPU flexible.
         */
-       if (task_event || (task_ctx && cpu_event))
-               ctx_sched_out(task_ctx, cpuctx, EVENT_FLEXIBLE);
-       if (cpu_event)
-               cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+       if (task_event || (task_epc && cpu_event)) {
+               update_context_time(task_epc->ctx);
+               __pmu_ctx_sched_out(task_epc, EVENT_FLEXIBLE);
+       }
 
-       if (task_event)
-               rotate_ctx(task_ctx, task_event);
-       if (cpu_event)
+       if (cpu_event) {
+               update_context_time(&cpuctx->ctx);
+               __pmu_ctx_sched_out(cpu_epc, EVENT_FLEXIBLE);
                rotate_ctx(&cpuctx->ctx, cpu_event);
+               __pmu_ctx_sched_in(&cpuctx->ctx, pmu);
+       }
 
-       perf_event_sched_in(cpuctx, task_ctx);
+       if (task_event)
+               rotate_ctx(task_epc->ctx, task_event);
+
+       if (task_event || (task_epc && cpu_event))
+               __pmu_ctx_sched_in(task_epc->ctx, pmu);
 
-       perf_pmu_enable(cpuctx->ctx.pmu);
+       perf_pmu_enable(pmu);
        perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
 
        return true;
 
 void perf_event_task_tick(void)
 {
-       struct list_head *head = this_cpu_ptr(&active_ctx_list);
-       struct perf_event_context *ctx, *tmp;
+       struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+       struct perf_event_context *ctx;
        int throttled;
 
        lockdep_assert_irqs_disabled();
        throttled = __this_cpu_xchg(perf_throttled_count, 0);
        tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
 
-       list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
-               perf_adjust_freq_unthr_context(ctx, throttled);
+       perf_adjust_freq_unthr_context(&cpuctx->ctx, !!throttled);
+
+       rcu_read_lock();
+       ctx = rcu_dereference(current->perf_event_ctxp);
+       if (ctx)
+               perf_adjust_freq_unthr_context(ctx, !!throttled);
+       rcu_read_unlock();
 }
 
 static int event_enable_on_exec(struct perf_event *event,
  * Enable all of a task's events that have been marked enable-on-exec.
  * This expects task == current.
  */
-static void perf_event_enable_on_exec(int ctxn)
+static void perf_event_enable_on_exec(struct perf_event_context *ctx)
 {
-       struct perf_event_context *ctx, *clone_ctx = NULL;
+       struct perf_event_context *clone_ctx = NULL;
        enum event_type_t event_type = 0;
        struct perf_cpu_context *cpuctx;
        struct perf_event *event;
        int enabled = 0;
 
        local_irq_save(flags);
-       ctx = current->perf_event_ctxp[ctxn];
-       if (!ctx || !ctx->nr_events)
+       if (WARN_ON_ONCE(current->perf_event_ctxp != ctx))
+               goto out;
+
+       if (!ctx->nr_events)
                goto out;
 
-       cpuctx = __get_cpu_context(ctx);
+       cpuctx = this_cpu_ptr(&perf_cpu_context);
        perf_ctx_lock(cpuctx, ctx);
-       ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+       ctx_sched_out(ctx, EVENT_TIME);
+
        list_for_each_entry(event, &ctx->event_list, event_entry) {
                enabled |= event_enable_on_exec(event, ctx);
                event_type |= get_event_type(event);
                clone_ctx = unclone_ctx(ctx);
                ctx_resched(cpuctx, ctx, event_type);
        } else {
-               ctx_sched_in(ctx, cpuctx, EVENT_TIME);
+               ctx_sched_in(ctx, EVENT_TIME);
        }
        perf_ctx_unlock(cpuctx, ctx);
 
  * Removes all events from the current task that have been marked
  * remove-on-exec, and feeds their values back to parent events.
  */
-static void perf_event_remove_on_exec(int ctxn)
+static void perf_event_remove_on_exec(struct perf_event_context *ctx)
 {
-       struct perf_event_context *ctx, *clone_ctx = NULL;
+       struct perf_event_context *clone_ctx = NULL;
        struct perf_event *event, *next;
        unsigned long flags;
        bool modified = false;
 
-       ctx = perf_pin_task_context(current, ctxn);
-       if (!ctx)
-               return;
-
        mutex_lock(&ctx->mutex);
 
        if (WARN_ON_ONCE(ctx->task != current))
        raw_spin_lock_irqsave(&ctx->lock, flags);
        if (modified)
                clone_ctx = unclone_ctx(ctx);
-       --ctx->pin_count;
        raw_spin_unlock_irqrestore(&ctx->lock, flags);
 
 unlock:
        mutex_unlock(&ctx->mutex);
 
-       put_ctx(ctx);
        if (clone_ctx)
                put_ctx(clone_ctx);
 }
        struct perf_read_data *data = info;
        struct perf_event *sub, *event = data->event;
        struct perf_event_context *ctx = event->ctx;
-       struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+       struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct pmu *pmu = event->pmu;
 
        /*
 {
        raw_spin_lock_init(&ctx->lock);
        mutex_init(&ctx->mutex);
-       INIT_LIST_HEAD(&ctx->active_ctx_list);
+       INIT_LIST_HEAD(&ctx->pmu_ctx_list);
        perf_event_groups_init(&ctx->pinned_groups);
        perf_event_groups_init(&ctx->flexible_groups);
        INIT_LIST_HEAD(&ctx->event_list);
-       INIT_LIST_HEAD(&ctx->pinned_active);
-       INIT_LIST_HEAD(&ctx->flexible_active);
        refcount_set(&ctx->refcount, 1);
 }
 
+/*
+ * Bring a perf_event_pmu_context into its initial state for @pmu: one
+ * reference held, not linked on any pmu_ctx_list, and with empty pinned and
+ * flexible active lists. epc->ctx is not touched here; callers set it when
+ * they attach the epc to a perf_event_context.
+ */
+static void
+__perf_init_event_pmu_context(struct perf_event_pmu_context *epc, struct pmu *pmu)
+{
+       atomic_set(&epc->refcount, 1);
+
+       INIT_LIST_HEAD(&epc->flexible_active);
+       INIT_LIST_HEAD(&epc->pinned_active);
+       INIT_LIST_HEAD(&epc->pmu_ctx_entry);
+
+       epc->pmu = pmu;
+}
+
 static struct perf_event_context *
-alloc_perf_context(struct pmu *pmu, struct task_struct *task)
+alloc_perf_context(struct task_struct *task)
 {
        struct perf_event_context *ctx;
 
        __perf_event_init_context(ctx);
        if (task)
                ctx->task = get_task_struct(task);
-       ctx->pmu = pmu;
 
        return ctx;
 }
  * Returns a matching context with refcount and pincount.
  */
 static struct perf_event_context *
-find_get_context(struct pmu *pmu, struct task_struct *task,
-               struct perf_event *event)
+find_get_context(struct task_struct *task, struct perf_event *event)
 {
        struct perf_event_context *ctx, *clone_ctx = NULL;
        struct perf_cpu_context *cpuctx;
-       void *task_ctx_data = NULL;
        unsigned long flags;
-       int ctxn, err;
-       int cpu = event->cpu;
+       int err;
 
        if (!task) {
                /* Must be root to operate on a CPU event: */
                if (err)
                        return ERR_PTR(err);
 
-               cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+               cpuctx = per_cpu_ptr(&perf_cpu_context, event->cpu);
                ctx = &cpuctx->ctx;
                get_ctx(ctx);
                raw_spin_lock_irqsave(&ctx->lock, flags);
        }
 
        err = -EINVAL;
-       ctxn = pmu->task_ctx_nr;
-       if (ctxn < 0)
-               goto errout;
-
-       if (event->attach_state & PERF_ATTACH_TASK_DATA) {
-               task_ctx_data = alloc_task_ctx_data(pmu);
-               if (!task_ctx_data) {
-                       err = -ENOMEM;
-                       goto errout;
-               }
-       }
-
 retry:
-       ctx = perf_lock_task_context(task, ctxn, &flags);
+       ctx = perf_lock_task_context(task, &flags);
        if (ctx) {
                clone_ctx = unclone_ctx(ctx);
                ++ctx->pin_count;
 
-               if (task_ctx_data && !ctx->task_ctx_data) {
-                       ctx->task_ctx_data = task_ctx_data;
-                       task_ctx_data = NULL;
-               }
                raw_spin_unlock_irqrestore(&ctx->lock, flags);
 
                if (clone_ctx)
                        put_ctx(clone_ctx);
        } else {
-               ctx = alloc_perf_context(pmu, task);
+               ctx = alloc_perf_context(task);
                err = -ENOMEM;
                if (!ctx)
                        goto errout;
 
-               if (task_ctx_data) {
-                       ctx->task_ctx_data = task_ctx_data;
-                       task_ctx_data = NULL;
-               }
-
                err = 0;
                mutex_lock(&task->perf_event_mutex);
                /*
                 */
                if (task->flags & PF_EXITING)
                        err = -ESRCH;
-               else if (task->perf_event_ctxp[ctxn])
+               else if (task->perf_event_ctxp)
                        err = -EAGAIN;
                else {
                        get_ctx(ctx);
                        ++ctx->pin_count;
-                       rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
+                       rcu_assign_pointer(task->perf_event_ctxp, ctx);
                }
                mutex_unlock(&task->perf_event_mutex);
 
                }
        }
 
-       free_task_ctx_data(pmu, task_ctx_data);
        return ctx;
 
 errout:
-       free_task_ctx_data(pmu, task_ctx_data);
        return ERR_PTR(err);
 }
 
+/*
+ * Find the perf_event_pmu_context for @pmu inside @ctx, creating and linking
+ * one if @ctx does not have it yet, and take a reference on it for @event.
+ *
+ * For a CPU context (!ctx->task) the epc is the one embedded in @pmu's
+ * per-CPU perf_cpu_pmu_context for event->cpu. For a task context the epc is
+ * looked up on ctx->pmu_ctx_list and allocated on a miss.
+ *
+ * Returns the epc, or ERR_PTR(-ENOMEM) if an allocation fails.
+ */
+static struct perf_event_pmu_context *
+find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx,
+                    struct perf_event *event)
+{
+       struct perf_event_pmu_context *new = NULL, *epc;
+       void *task_ctx_data = NULL;
+
+       if (!ctx->task) {
+               struct perf_cpu_pmu_context *cpc;
+
+               cpc = per_cpu_ptr(pmu->cpu_pmu_context, event->cpu);
+               epc = &cpc->epc;
+
+               /*
+                * NOTE(review): epc->ctx is tested, and the refcount bumped in
+                * the else branch, without ctx->lock, while put_pmu_ctx()
+                * clears epc->ctx under that lock. Confirm that every caller
+                * is otherwise serialized against the final put.
+                */
+               if (!epc->ctx) {
+                       /* First user: (re)arm the embedded epc and link it. */
+                       atomic_set(&epc->refcount, 1);
+                       epc->embedded = 1;
+                       raw_spin_lock_irq(&ctx->lock);
+                       list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list);
+                       epc->ctx = ctx;
+                       raw_spin_unlock_irq(&ctx->lock);
+               } else {
+                       WARN_ON_ONCE(epc->ctx != ctx);
+                       atomic_inc(&epc->refcount);
+               }
+
+               return epc;
+       }
+
+       /*
+        * Allocate everything that might be needed up front; the lookup below
+        * runs under ctx->lock with interrupts disabled. Whatever turns out to
+        * be unused is freed again after the lock is dropped.
+        */
+       new = kzalloc(sizeof(*epc), GFP_KERNEL);
+       if (!new)
+               return ERR_PTR(-ENOMEM);
+
+       if (event->attach_state & PERF_ATTACH_TASK_DATA) {
+               task_ctx_data = alloc_task_ctx_data(pmu);
+               if (!task_ctx_data) {
+                       kfree(new);
+                       return ERR_PTR(-ENOMEM);
+               }
+       }
+
+       __perf_init_event_pmu_context(new, pmu);
+
+       /*
+        * XXX
+        *
+        * lockdep_assert_held(&ctx->mutex);
+        *
+        * can't because perf_event_init_task() doesn't actually hold the
+        * child_ctx->mutex.
+        */
+
+       raw_spin_lock_irq(&ctx->lock);
+       list_for_each_entry(epc, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+               if (epc->pmu == pmu) {
+                       /* Already present: share it, 'new' is freed below. */
+                       WARN_ON_ONCE(epc->ctx != ctx);
+                       atomic_inc(&epc->refcount);
+                       goto found_epc;
+               }
+       }
+
+       /* Not found: hand ownership of 'new' (refcount 1) over to the list. */
+       epc = new;
+       new = NULL;
+
+       list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list);
+       epc->ctx = ctx;
+
+found_epc:
+       /* Attach the PMU's task data only if this epc does not have any yet. */
+       if (task_ctx_data && !epc->task_ctx_data) {
+               epc->task_ctx_data = task_ctx_data;
+               task_ctx_data = NULL;
+               ctx->nr_task_data++;
+       }
+       raw_spin_unlock_irq(&ctx->lock);
+
+       /* Both pointers are NULL here if they were consumed above. */
+       free_task_ctx_data(pmu, task_ctx_data);
+       kfree(new);
+
+       return epc;
+}
+
+/*
+ * Take an additional reference on @epc. The caller is expected to already
+ * hold one: finding the count at zero triggers a one-time warning and leaves
+ * the count untouched.
+ */
+static void get_pmu_ctx(struct perf_event_pmu_context *epc)
+{
+       WARN_ON_ONCE(!atomic_inc_not_zero(&epc->refcount));
+}
+
+/*
+ * RCU callback queued by put_pmu_ctx(): release a perf_event_pmu_context
+ * together with the task data attached to it.
+ *
+ * NOTE(review): task_ctx_data is obtained through alloc_task_ctx_data() but
+ * released with plain kfree() here rather than free_task_ctx_data(); confirm
+ * that the two are interchangeable for every PMU.
+ */
+static void free_epc_rcu(struct rcu_head *head)
+{
+       struct perf_event_pmu_context *epc;
+
+       epc = container_of(head, struct perf_event_pmu_context, rcu_head);
+
+       kfree(epc->task_ctx_data);
+       kfree(epc);
+}
+
+/*
+ * Drop one reference on @epc. The final put unlinks the epc from its
+ * perf_event_context and, unless the epc is embedded in per-CPU storage,
+ * frees it after an RCU grace period.
+ *
+ * The 1 -> 0 transition is made under ctx->lock. find_get_pmu_context()
+ * walks ctx->pmu_ctx_list under that lock and takes its reference with a
+ * plain atomic_inc(); if the count could reach zero outside the lock, such a
+ * lookup could revive an epc that is about to be unlinked and freed.
+ */
+static void put_pmu_ctx(struct perf_event_pmu_context *epc)
+{
+       /* Stable while we hold a reference: only the final put clears it. */
+       struct perf_event_context *ctx = epc->ctx;
+       unsigned long flags;
+
+       if (ctx) {
+               /*
+                * XXX
+                *
+                * lockdep_assert_held(&ctx->mutex);
+                *
+                * can't because of the call-site in _free_event()/put_event()
+                * which isn't always called under ctx->mutex.
+                */
+
+               /* Fast path: not the last reference, no lock needed. */
+               if (atomic_add_unless(&epc->refcount, -1, 1))
+                       return;
+
+               /* Possibly the last reference: decide under ctx->lock. */
+               raw_spin_lock_irqsave(&ctx->lock, flags);
+               if (!atomic_dec_and_test(&epc->refcount)) {
+                       raw_spin_unlock_irqrestore(&ctx->lock, flags);
+                       return;
+               }
+
+               WARN_ON_ONCE(list_empty(&epc->pmu_ctx_entry));
+               list_del_init(&epc->pmu_ctx_entry);
+               epc->ctx = NULL;
+               raw_spin_unlock_irqrestore(&ctx->lock, flags);
+       } else if (!atomic_dec_and_test(&epc->refcount)) {
+               /* Not linked into any context, so nothing can look it up. */
+               return;
+       }
+
+       WARN_ON_ONCE(!list_empty(&epc->pinned_active));
+       WARN_ON_ONCE(!list_empty(&epc->flexible_active));
+
+       /* Embedded epcs live in per-CPU storage and are never freed here. */
+       if (epc->embedded)
+               return;
+
+       call_rcu(&epc->rcu_head, free_epc_rcu);
+}
+
 static void perf_event_free_filter(struct perf_event *event);
 
 static void free_event_rcu(struct rcu_head *head)
 {
-       struct perf_event *event;
+       struct perf_event *event = container_of(head, typeof(*event), rcu_head);
 
-       event = container_of(head, struct perf_event, rcu_head);
        if (event->ns)
                put_pid_ns(event->ns);
        perf_event_free_filter(event);
  *
  *  1) cpu-wide events in the presence of per-task events,
  *  2) per-task events in the presence of cpu-wide events,
- *  3) two matching events on the same context.
+ *  3) two matching events on the same perf_event_context.
  *
  * The former two cases are handled in the allocation path (perf_event_alloc(),
  * _free_event()), the latter -- before the first perf_install_in_context().
        if (event->hw.target)
                put_task_struct(event->hw.target);
 
+       if (event->pmu_ctx)
+               put_pmu_ctx(event->pmu_ctx);
+
        /*
         * perf_event_free_task() relies on put_ctx() being 'last', in particular
         * all task references must be cleaned up.
        LIST_HEAD(free_list);
 
        /*
-        * If we got here through err_file: fput(event_file); we will not have
-        * attached to a context yet.
+        * If we got here through err_alloc: free_event(event); we will not
+        * have attached to a context yet.
         */
        if (!ctx) {
                WARN_ON_ONCE(event->attach_state &
 
        active = (event->state == PERF_EVENT_STATE_ACTIVE);
        if (active) {
-               perf_pmu_disable(ctx->pmu);
+               perf_pmu_disable(event->pmu);
                /*
                 * We could be throttled; unthrottle now to avoid the tick
                 * trying to unthrottle while we already re-started the event.
 
        if (active) {
                event->pmu->start(event, PERF_EF_RELOAD);
-               perf_pmu_enable(ctx->pmu);
+               perf_pmu_enable(event->pmu);
        }
 }
 
               struct perf_event_context *task_ctx)
 {
        struct perf_event_context *ctx;
-       int ctxn;
 
        rcu_read_lock();
        preempt_disable();
 
        perf_iterate_sb_cpu(output, data);
 
-       for_each_task_context_nr(ctxn) {
-               ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
-               if (ctx)
-                       perf_iterate_ctx(ctx, output, data, false);
-       }
+       ctx = rcu_dereference(current->perf_event_ctxp);
+       if (ctx)
+               perf_iterate_ctx(ctx, output, data, false);
 done:
        preempt_enable();
        rcu_read_unlock();
 void perf_event_exec(void)
 {
        struct perf_event_context *ctx;
-       int ctxn;
 
-       for_each_task_context_nr(ctxn) {
-               perf_event_enable_on_exec(ctxn);
-               perf_event_remove_on_exec(ctxn);
+       ctx = perf_pin_task_context(current);
+       if (!ctx)
+               return;
+
+       perf_event_enable_on_exec(ctx);
+       perf_event_remove_on_exec(ctx);
+       perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL, true);
 
-               rcu_read_lock();
-               ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
-               if (ctx) {
-                       perf_iterate_ctx(ctx, perf_event_addr_filters_exec,
-                                        NULL, true);
-               }
-               rcu_read_unlock();
-       }
+       perf_unpin_context(ctx);
+       put_ctx(ctx);
 }
 
 struct remote_output {
 static int __perf_pmu_output_stop(void *info)
 {
        struct perf_event *event = info;
-       struct pmu *pmu = event->ctx->pmu;
-       struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+       struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct remote_output ro = {
                .rb     = event->rb,
        };
 static void perf_addr_filters_adjust(struct vm_area_struct *vma)
 {
        struct perf_event_context *ctx;
-       int ctxn;
 
        /*
         * Data tracing isn't supported yet and as such there is no need
                return;
 
        rcu_read_lock();
-       for_each_task_context_nr(ctxn) {
-               ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
-               if (!ctx)
-                       continue;
-
+       ctx = rcu_dereference(current->perf_event_ctxp);
+       if (ctx)
                perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true);
-       }
        rcu_read_unlock();
 }
 
                struct trace_entry *entry = record;
 
                rcu_read_lock();
-               ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
+               ctx = rcu_dereference(task->perf_event_ctxp);
                if (!ctx)
                        goto unlock;
 
+               // XXX iterate groups instead, we should be able to
+               // find the subtree for the perf_tracepoint pmu and CPU.
+
                list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
                        if (event->cpu != smp_processor_id())
                                continue;
        return 0;
 }
 
-/*
- * Ensures all contexts with the same task_ctx_nr have the same
- * pmu_cpu_context too.
- */
-static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
-{
-       struct pmu *pmu;
-
-       if (ctxn < 0)
-               return NULL;
-
-       list_for_each_entry(pmu, &pmus, entry) {
-               if (pmu->task_ctx_nr == ctxn)
-                       return pmu->pmu_cpu_context;
-       }
-
-       return NULL;
-}
-
 static void free_pmu_context(struct pmu *pmu)
 {
-       /*
-        * Static contexts such as perf_sw_context have a global lifetime
-        * and may be shared between different PMUs. Avoid freeing them
-        * when a single PMU is going away.
-        */
-       if (pmu->task_ctx_nr > perf_invalid_context)
-               return;
-
-       free_percpu(pmu->pmu_cpu_context);
+       /* Release this PMU's per-CPU perf_cpu_pmu_context storage. */
+       free_percpu(pmu->cpu_pmu_context);
 }
 
 /*
        /* update all cpuctx for this PMU */
        cpus_read_lock();
        for_each_online_cpu(cpu) {
-               struct perf_cpu_context *cpuctx;
-               cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
-               cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
+               struct perf_cpu_pmu_context *cpc;
+               cpc = per_cpu_ptr(pmu->cpu_pmu_context, cpu);
+               cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
 
                cpu_function_call(cpu,
-                       (remote_function_f)perf_mux_hrtimer_restart, cpuctx);
+                       (remote_function_f)perf_mux_hrtimer_restart, cpc);
        }
        cpus_read_unlock();
        mutex_unlock(&mux_interval_mutex);
        }
 
 skip_type:
-       if (pmu->task_ctx_nr == perf_hw_context) {
-               static int hw_context_taken = 0;
-
-               /*
-                * Other than systems with heterogeneous CPUs, it never makes
-                * sense for two PMUs to share perf_hw_context. PMUs which are
-                * uncore must use perf_invalid_context.
-                */
-               if (WARN_ON_ONCE(hw_context_taken &&
-                   !(pmu->capabilities & PERF_PMU_CAP_HETEROGENEOUS_CPUS)))
-                       pmu->task_ctx_nr = perf_invalid_context;
-
-               hw_context_taken = 1;
-       }
-
-       pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
-       if (pmu->pmu_cpu_context)
-               goto got_cpu_context;
-
        ret = -ENOMEM;
-       pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
-       if (!pmu->pmu_cpu_context)
+       pmu->cpu_pmu_context = alloc_percpu(struct perf_cpu_pmu_context);
+       if (!pmu->cpu_pmu_context)
                goto free_dev;
 
        for_each_possible_cpu(cpu) {
-               struct perf_cpu_context *cpuctx;
-
-               cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
-               __perf_event_init_context(&cpuctx->ctx);
-               lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
-               lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
-               cpuctx->ctx.pmu = pmu;
-               cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
-
-               __perf_mux_hrtimer_init(cpuctx, cpu);
+               struct perf_cpu_pmu_context *cpc;
 
-               cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default);
-               cpuctx->heap = cpuctx->heap_default;
+               cpc = per_cpu_ptr(pmu->cpu_pmu_context, cpu);
+               __perf_init_event_pmu_context(&cpc->epc, pmu);
+               __perf_mux_hrtimer_init(cpc, cpu);
        }
 
-got_cpu_context:
        if (!pmu->start_txn) {
                if (pmu->pmu_enable) {
                        /*
        }
 
        /*
-        * Disallow uncore-cgroup events, they don't make sense as the cgroup will
-        * be different on other CPUs in the uncore mask.
+        * Disallow uncore-task events. Similarly, disallow uncore-cgroup
+        * events (they don't make sense as the cgroup will be different
+        * on other CPUs in the uncore mask).
         */
-       if (pmu->task_ctx_nr == perf_invalid_context && cgroup_fd != -1) {
+       if (pmu->task_ctx_nr == perf_invalid_context && (task || cgroup_fd != -1)) {
                err = -EINVAL;
                goto err_pmu;
        }
        return 0;
 }
 
-/*
- * Variation on perf_event_ctx_lock_nested(), except we take two context
- * mutexes.
- */
-static struct perf_event_context *
-__perf_event_ctx_lock_double(struct perf_event *group_leader,
-                            struct perf_event_context *ctx)
-{
-       struct perf_event_context *gctx;
-
-again:
-       rcu_read_lock();
-       gctx = READ_ONCE(group_leader->ctx);
-       if (!refcount_inc_not_zero(&gctx->refcount)) {
-               rcu_read_unlock();
-               goto again;
-       }
-       rcu_read_unlock();
-
-       mutex_lock_double(&gctx->mutex, &ctx->mutex);
-
-       if (group_leader->ctx != gctx) {
-               mutex_unlock(&ctx->mutex);
-               mutex_unlock(&gctx->mutex);
-               put_ctx(gctx);
-               goto again;
-       }
-
-       return gctx;
-}
-
 static bool
 perf_check_permission(struct perf_event_attr *attr, struct task_struct *task)
 {
                pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
 {
        struct perf_event *group_leader = NULL, *output_event = NULL;
+       struct perf_event_pmu_context *pmu_ctx;
        struct perf_event *event, *sibling;
        struct perf_event_attr attr;
-       struct perf_event_context *ctx, *gctx;
+       struct perf_event_context *ctx;
        struct file *event_file = NULL;
        struct fd group = {NULL, 0};
        struct task_struct *task = NULL;
        if (pmu->task_ctx_nr == perf_sw_context)
                event->event_caps |= PERF_EV_CAP_SOFTWARE;
 
-       if (group_leader) {
-               if (is_software_event(event) &&
-                   !in_software_context(group_leader)) {
-                       /*
-                        * If the event is a sw event, but the group_leader
-                        * is on hw context.
-                        *
-                        * Allow the addition of software events to hw
-                        * groups, this is safe because software events
-                        * never fail to schedule.
-                        */
-                       pmu = group_leader->ctx->pmu;
-               } else if (!is_software_event(event) &&
-                          is_software_event(group_leader) &&
-                          (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
-                       /*
-                        * In case the group is a pure software group, and we
-                        * try to add a hardware event, move the whole group to
-                        * the hardware context.
-                        */
-                       move_group = 1;
-               }
+       if (task) {
+               err = down_read_interruptible(&task->signal->exec_update_lock);
+               if (err)
+                       goto err_alloc;
+
+               /*
+                * We must hold exec_update_lock across this and any potential
+                * perf_install_in_context() call for this new event to
+                * serialize against exec() altering our credentials (and the
+                * perf_event_exit_task() that could imply).
+                */
+               err = -EACCES;
+               if (!perf_check_permission(&attr, task))
+                       goto err_cred;
        }
 
        /*
         * Get the target context (task or percpu):
         */
-       ctx = find_get_context(pmu, task, event);
+       ctx = find_get_context(task, event);
        if (IS_ERR(ctx)) {
                err = PTR_ERR(ctx);
-               goto err_alloc;
+               goto err_cred;
+       }
+
+       mutex_lock(&ctx->mutex);
+
+       if (ctx->task == TASK_TOMBSTONE) {
+               err = -ESRCH;
+               goto err_locked;
+       }
+
+       if (!task) {
+               /*
+                * Check if the @cpu we're creating an event for is online.
+                *
+                * We use the perf_cpu_context::ctx::mutex to serialize against
+                * the hotplug notifiers. See perf_event_{init,exit}_cpu().
+                */
+               struct perf_cpu_context *cpuctx = per_cpu_ptr(&perf_cpu_context, event->cpu);
+
+               if (!cpuctx->online) {
+                       err = -ENODEV;
+                       goto err_locked;
+               }
        }
 
-       /*
-        * Look up the group leader (we will attach this event to it):
-        */
        if (group_leader) {
                err = -EINVAL;
 
                 * becoming part of another group-sibling):
                 */
                if (group_leader->group_leader != group_leader)
-                       goto err_context;
+                       goto err_locked;
 
                /* All events in a group should have the same clock */
                if (group_leader->clock != event->clock)
-                       goto err_context;
+                       goto err_locked;
 
                /*
                 * Make sure we're both events for the same CPU;
                 * you can never concurrently schedule them anyhow.
                 */
                if (group_leader->cpu != event->cpu)
-                       goto err_context;
+                       goto err_locked;
 
                /*
-                * Make sure we're both on the same task, or both
-                * per-CPU events.
+                * Make sure we're both on the same context; either task or cpu.
                 */
-               if (group_leader->ctx->task != ctx->task)
-                       goto err_context;
-
-               /*
-                * Do not allow to attach to a group in a different task
-                * or CPU context. If we're moving SW events, we'll fix
-                * this up later, so allow that.
-                *
-                * Racy, not holding group_leader->ctx->mutex, see comment with
-                * perf_event_ctx_lock().
-                */
-               if (!move_group && group_leader->ctx != ctx)
-                       goto err_context;
+               if (group_leader->ctx != ctx)
+                       goto err_locked;
 
                /*
                 * Only a group leader can be exclusive or pinned
                 */
                if (attr.exclusive || attr.pinned)
-                       goto err_context;
-       }
-
-       if (output_event) {
-               err = perf_event_set_output(event, output_event);
-               if (err)
-                       goto err_context;
-       }
-
-       event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
-                                       f_flags);
-       if (IS_ERR(event_file)) {
-               err = PTR_ERR(event_file);
-               event_file = NULL;
-               goto err_context;
-       }
-
-       if (task) {
-               err = down_read_interruptible(&task->signal->exec_update_lock);
-               if (err)
-                       goto err_file;
-
-               /*
-                * We must hold exec_update_lock across this and any potential
-                * perf_install_in_context() call for this new event to
-                * serialize against exec() altering our credentials (and the
-                * perf_event_exit_task() that could imply).
-                */
-               err = -EACCES;
-               if (!perf_check_permission(&attr, task))
-                       goto err_cred;
-       }
-
-       if (move_group) {
-               gctx = __perf_event_ctx_lock_double(group_leader, ctx);
-
-               if (gctx->task == TASK_TOMBSTONE) {
-                       err = -ESRCH;
                        goto err_locked;
-               }
 
-               /*
-                * Check if we raced against another sys_perf_event_open() call
-                * moving the software group underneath us.
-                */
-               if (!(group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
+               if (is_software_event(event) &&
+                   !in_software_context(group_leader)) {
                        /*
-                        * If someone moved the group out from under us, check
-                        * if this new event wound up on the same ctx, if so
-                        * its the regular !move_group case, otherwise fail.
+                        * If the event is a sw event, but the group_leader
+                        * is on hw context.
+                        *
+                        * Allow the addition of software events to hw
+                        * groups, this is safe because software events
+                        * never fail to schedule.
+                        *
+                        * Note the comment that goes with struct
+                        * perf_event_pmu_context.
                         */
-                       if (gctx != ctx) {
-                               err = -EINVAL;
-                               goto err_locked;
-                       } else {
-                               perf_event_ctx_unlock(group_leader, gctx);
-                               move_group = 0;
-                               goto not_move_group;
-                       }
-               }
-
-               /*
-                * Failure to create exclusive events returns -EBUSY.
-                */
-               err = -EBUSY;
-               if (!exclusive_event_installable(group_leader, ctx))
-                       goto err_locked;
-
-               for_each_sibling_event(sibling, group_leader) {
-                       if (!exclusive_event_installable(sibling, ctx))
-                               goto err_locked;
-               }
-       } else {
-               mutex_lock(&ctx->mutex);
-
-               /*
-                * Now that we hold ctx->lock, (re)validate group_leader->ctx == ctx,
-                * see the group_leader && !move_group test earlier.
-                */
-               if (group_leader && group_leader->ctx != ctx) {
-                       err = -EINVAL;
-                       goto err_locked;
+                       pmu = group_leader->pmu_ctx->pmu;
+               } else if (!is_software_event(event) &&
+                       is_software_event(group_leader) &&
+                       (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
+                       /*
+                        * In case the group is a pure software group, and we
+                        * try to add a hardware event, move the whole group to
+                        * the hardware context.
+                        */
+                       move_group = 1;
                }
        }
-not_move_group:
 
-       if (ctx->task == TASK_TOMBSTONE) {
-               err = -ESRCH;
+       /*
+        * Now that we're certain of the pmu; find the pmu_ctx.
+        */
+       pmu_ctx = find_get_pmu_context(pmu, ctx, event);
+       if (IS_ERR(pmu_ctx)) {
+               err = PTR_ERR(pmu_ctx);
                goto err_locked;
        }
+       event->pmu_ctx = pmu_ctx;
 
-       if (!perf_event_validate_size(event)) {
-               err = -E2BIG;
-               goto err_locked;
+       if (output_event) {
+               err = perf_event_set_output(event, output_event);
+               if (err)
+                       goto err_context;
        }
 
-       if (!task) {
-               /*
-                * Check if the @cpu we're creating an event for is online.
-                *
-                * We use the perf_cpu_context::ctx::mutex to serialize against
-                * the hotplug notifiers. See perf_event_{init,exit}_cpu().
-                */
-               struct perf_cpu_context *cpuctx =
-                       container_of(ctx, struct perf_cpu_context, ctx);
-
-               if (!cpuctx->online) {
-                       err = -ENODEV;
-                       goto err_locked;
-               }
+       if (!perf_event_validate_size(event)) {
+               err = -E2BIG;
+               goto err_context;
        }
 
        if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) {
                err = -EINVAL;
-               goto err_locked;
+               goto err_context;
        }
 
        /*
         */
        if (!exclusive_event_installable(event, ctx)) {
                err = -EBUSY;
-               goto err_locked;
+               goto err_context;
        }
 
        WARN_ON_ONCE(ctx->parent_ctx);
 
+       event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, f_flags);
+       if (IS_ERR(event_file)) {
+               err = PTR_ERR(event_file);
+               event_file = NULL;
+               goto err_context;
+       }
+
        /*
         * This is the point on no return; we cannot fail hereafter. This is
         * where we start modifying current state.
         */
 
        if (move_group) {
-               /*
-                * See perf_event_ctx_lock() for comments on the details
-                * of swizzling perf_event::ctx.
-                */
                perf_remove_from_context(group_leader, 0);
-               put_ctx(gctx);
+               put_pmu_ctx(group_leader->pmu_ctx);
 
                for_each_sibling_event(sibling, group_leader) {
                        perf_remove_from_context(sibling, 0);
-                       put_ctx(gctx);
+                       put_pmu_ctx(sibling->pmu_ctx);
                }
 
-               /*
-                * Wait for everybody to stop referencing the events through
-                * the old lists, before installing it on new lists.
-                */
-               synchronize_rcu();
-
                /*
                 * Install the group siblings before the group leader.
                 *
                 * reachable through the group lists.
                 */
                for_each_sibling_event(sibling, group_leader) {
+                       sibling->pmu_ctx = pmu_ctx;
+                       get_pmu_ctx(pmu_ctx);
                        perf_event__state_init(sibling);
                        perf_install_in_context(ctx, sibling, sibling->cpu);
-                       get_ctx(ctx);
                }
 
                /*
                 * event. What we want here is event in the initial
                 * startup state, ready to be add into new context.
                 */
+               group_leader->pmu_ctx = pmu_ctx;
+               get_pmu_ctx(pmu_ctx);
                perf_event__state_init(group_leader);
                perf_install_in_context(ctx, group_leader, group_leader->cpu);
-               get_ctx(ctx);
        }
 
        /*
        perf_install_in_context(ctx, event, event->cpu);
        perf_unpin_context(ctx);
 
-       if (move_group)
-               perf_event_ctx_unlock(group_leader, gctx);
        mutex_unlock(&ctx->mutex);
 
        if (task) {
        fd_install(event_fd, event_file);
        return event_fd;
 
+err_context:
+       /* event->pmu_ctx freed by free_event() */
 err_locked:
-       if (move_group)
-               perf_event_ctx_unlock(group_leader, gctx);
        mutex_unlock(&ctx->mutex);
+       perf_unpin_context(ctx);
+       put_ctx(ctx);
 err_cred:
        if (task)
                up_read(&task->signal->exec_update_lock);
-err_file:
-       fput(event_file);
-err_context:
-       perf_unpin_context(ctx);
-       put_ctx(ctx);
 err_alloc:
-       /*
-        * If event_file is set, the fput() above will have called ->release()
-        * and that will take care of freeing the event.
-        */
-       if (!event_file)
-               free_event(event);
+       free_event(event);
 err_task:
        if (task)
                put_task_struct(task);
                                 perf_overflow_handler_t overflow_handler,
                                 void *context)
 {
+       struct perf_event_pmu_context *pmu_ctx;
        struct perf_event_context *ctx;
        struct perf_event *event;
+       struct pmu *pmu;
        int err;
 
        /*
 
        /* Mark owner so we could distinguish it from user events. */
        event->owner = TASK_TOMBSTONE;
+       pmu = event->pmu;
+
+       if (pmu->task_ctx_nr == perf_sw_context)
+               event->event_caps |= PERF_EV_CAP_SOFTWARE;
 
        /*
         * Get the target context (task or percpu):
         */
-       ctx = find_get_context(event->pmu, task, event);
+       ctx = find_get_context(task, event);
        if (IS_ERR(ctx)) {
                err = PTR_ERR(ctx);
-               goto err_free;
+               goto err_alloc;
        }
 
        WARN_ON_ONCE(ctx->parent_ctx);
                goto err_unlock;
        }
 
+       pmu_ctx = find_get_pmu_context(pmu, ctx, event);
+       if (IS_ERR(pmu_ctx)) {
+               err = PTR_ERR(pmu_ctx);
+               goto err_unlock;
+       }
+       event->pmu_ctx = pmu_ctx;
+
        if (!task) {
                /*
                 * Check if the @cpu we're creating an event for is online.
                        container_of(ctx, struct perf_cpu_context, ctx);
                if (!cpuctx->online) {
                        err = -ENODEV;
-                       goto err_unlock;
+                       goto err_pmu_ctx;
                }
        }
 
        if (!exclusive_event_installable(event, ctx)) {
                err = -EBUSY;
-               goto err_unlock;
+               goto err_pmu_ctx;
        }
 
        perf_install_in_context(ctx, event, event->cpu);
 
        return event;
 
+err_pmu_ctx:
+       put_pmu_ctx(pmu_ctx);
+       /*
+        * This unwind falls through to free_event(), which (per the
+        * err_context note in the syscall open path) also releases
+        * event->pmu_ctx.  Clear the pointer so the reference dropped just
+        * above is not dropped a second time.  free_event() is already
+        * reached with ->pmu_ctx never assigned when find_get_context()
+        * fails, so an unset pointer is presumably tolerated there.
+        */
+       event->pmu_ctx = NULL;
 err_unlock:
        mutex_unlock(&ctx->mutex);
        perf_unpin_context(ctx);
        put_ctx(ctx);
-err_free:
+err_alloc:
        free_event(event);
 err:
        return ERR_PTR(err);
 }
 EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
 
-void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
+/*
+ * Detach every group of @pmu on @cpu found in @groups and queue it for
+ * migration.
+ *
+ * For each group leader yielded by perf_event_groups_for_cpu_pmu(), and
+ * then for each of its siblings: remove the event from its context, undo
+ * its per-CPU accounting for @cpu, drop its pmu context reference and add
+ * it to @events through ->migrate_entry.
+ *
+ * @ctx is not referenced by the body.
+ */
+static void __perf_pmu_remove(struct perf_event_context *ctx,
+                             int cpu, struct pmu *pmu,
+                             struct perf_event_groups *groups,
+                             struct list_head *events)
 {
-       struct perf_event_context *src_ctx;
-       struct perf_event_context *dst_ctx;
-       struct perf_event *event, *tmp;
-       LIST_HEAD(events);
-
-       src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
-       dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
+       struct perf_event *event, *sibling;
 
-       /*
-        * See perf_event_ctx_lock() for comments on the details
-        * of swizzling perf_event::ctx.
-        */
-       mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
-       list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
-                                event_entry) {
+       perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu) {
                perf_remove_from_context(event, 0);
-               unaccount_event_cpu(event, src_cpu);
-               put_ctx(src_ctx);
-               list_add(&event->migrate_entry, &events);
+               unaccount_event_cpu(event, cpu);
+               put_pmu_ctx(event->pmu_ctx);
+               list_add(&event->migrate_entry, events);
+
+               for_each_sibling_event(sibling, event) {
+                       perf_remove_from_context(sibling, 0);
+                       unaccount_event_cpu(sibling, cpu);
+                       put_pmu_ctx(sibling->pmu_ctx);
+                       list_add(&sibling->migrate_entry, events);
+               }
        }
+}
 
-       /*
-        * Wait for the events to quiesce before re-instating them.
-        */
-       synchronize_rcu();
+/*
+ * Re-home a single migrated @event onto @cpu within @ctx.
+ *
+ * Records the new CPU on the event, looks up (and takes a reference on)
+ * the pmu context of @pmu in @ctx and attaches it to the event, then
+ * accounts the event against @cpu and installs it in @ctx.
+ *
+ * NOTE(review): the find_get_pmu_context() result is stored unchecked,
+ * while other callers in this file test it with IS_ERR(); confirm that it
+ * cannot fail for the contexts passed in here.
+ */
+static void __perf_pmu_install_event(struct pmu *pmu,
+                                    struct perf_event_context *ctx,
+                                    int cpu, struct perf_event *event)
+{
+       struct perf_event_pmu_context *epc;
+
+       event->cpu = cpu;
+       epc = find_get_pmu_context(pmu, ctx, event);
+       event->pmu_ctx = epc;
+
+       /* Any state at or above OFF is reset to INACTIVE before install. */
+       if (event->state >= PERF_EVENT_STATE_OFF)
+               event->state = PERF_EVENT_STATE_INACTIVE;
+       account_event_cpu(event, cpu);
+       perf_install_in_context(ctx, event, cpu);
+}
+
+static void __perf_pmu_install(struct perf_event_context *ctx,
+                              int cpu, struct pmu *pmu, struct list_head *events)
+{
+       struct perf_event *event, *tmp;
 
        /*
         * Re-instate events in 2 passes.
         * leader will enable its siblings, even if those are still on the old
         * context.
         */
-       list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
+       list_for_each_entry_safe(event, tmp, events, migrate_entry) {
                if (event->group_leader == event)
                        continue;
 
                list_del(&event->migrate_entry);
-               if (event->state >= PERF_EVENT_STATE_OFF)
-                       event->state = PERF_EVENT_STATE_INACTIVE;
-               account_event_cpu(event, dst_cpu);
-               perf_install_in_context(dst_ctx, event, dst_cpu);
-               get_ctx(dst_ctx);
+               __perf_pmu_install_event(pmu, ctx, cpu, event);
        }
 
        /*
         * Once all the siblings are setup properly, install the group leaders
         * to make it go.
         */
-       list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
+       list_for_each_entry_safe(event, tmp, events, migrate_entry) {
                list_del(&event->migrate_entry);
-               if (event->state >= PERF_EVENT_STATE_OFF)
-                       event->state = PERF_EVENT_STATE_INACTIVE;
-               account_event_cpu(event, dst_cpu);
-               perf_install_in_context(dst_ctx, event, dst_cpu);
-               get_ctx(dst_ctx);
+               __perf_pmu_install_event(pmu, ctx, cpu, event);
        }
+}
+
+/*
+ * Move the events of @pmu from the per-CPU context of @src_cpu to that of
+ * @dst_cpu.
+ *
+ * With both context mutexes held: detach the matching pinned and flexible
+ * groups from the source context onto a local list, wait for an RCU grace
+ * period, then install the collected events on the destination context.
+ */
+void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
+{
+       struct perf_event_context *src_ctx, *dst_ctx;
+       LIST_HEAD(events);
+
+       src_ctx = &per_cpu_ptr(&perf_cpu_context, src_cpu)->ctx;
+       dst_ctx = &per_cpu_ptr(&perf_cpu_context, dst_cpu)->ctx;
+
+       /*
+        * See perf_event_ctx_lock() for comments on the details
+        * of swizzling perf_event::ctx.
+        */
+       mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
+
+       __perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->pinned_groups, &events);
+       __perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->flexible_groups, &events);
+
+       /*
+        * Wait for the events to quiesce before re-instating them.
+        */
+       synchronize_rcu();
+
+       __perf_pmu_install(dst_ctx, dst_cpu, pmu, &events);
+
        mutex_unlock(&dst_ctx->mutex);
        mutex_unlock(&src_ctx->mutex);
 }
        perf_event_wakeup(event);
 }
 
-static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
+static void perf_event_exit_task_context(struct task_struct *child)
 {
        struct perf_event_context *child_ctx, *clone_ctx = NULL;
        struct perf_event *child_event, *next;
 
        WARN_ON_ONCE(child != current);
 
-       child_ctx = perf_pin_task_context(child, ctxn);
+       child_ctx = perf_pin_task_context(child);
        if (!child_ctx)
                return;
 
         * in.
         */
        raw_spin_lock_irq(&child_ctx->lock);
-       task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx, EVENT_ALL);
+       task_ctx_sched_out(child_ctx, EVENT_ALL);
 
        /*
         * Now that the context is inactive, destroy the task <-> ctx relation
         * and mark the context dead.
         */
-       RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL);
+       RCU_INIT_POINTER(child->perf_event_ctxp, NULL);
        put_ctx(child_ctx); /* cannot be last */
        WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE);
        put_task_struct(current); /* cannot be last */
 void perf_event_exit_task(struct task_struct *child)
 {
        struct perf_event *event, *tmp;
-       int ctxn;
 
        mutex_lock(&child->perf_event_mutex);
        list_for_each_entry_safe(event, tmp, &child->perf_event_list,
        }
        mutex_unlock(&child->perf_event_mutex);
 
-       for_each_task_context_nr(ctxn)
-               perf_event_exit_task_context(child, ctxn);
+       perf_event_exit_task_context(child);
 
        /*
         * The perf_event_exit_task_context calls perf_event_task
 {
        struct perf_event_context *ctx;
        struct perf_event *event, *tmp;
-       int ctxn;
 
-       for_each_task_context_nr(ctxn) {
-               ctx = task->perf_event_ctxp[ctxn];
-               if (!ctx)
-                       continue;
+       ctx = rcu_access_pointer(task->perf_event_ctxp);
+       if (!ctx)
+               return;
 
-               mutex_lock(&ctx->mutex);
-               raw_spin_lock_irq(&ctx->lock);
-               /*
-                * Destroy the task <-> ctx relation and mark the context dead.
-                *
-                * This is important because even though the task hasn't been
-                * exposed yet the context has been (through child_list).
-                */
-               RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], NULL);
-               WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
-               put_task_struct(task); /* cannot be last */
-               raw_spin_unlock_irq(&ctx->lock);
+       mutex_lock(&ctx->mutex);
+       raw_spin_lock_irq(&ctx->lock);
+       /*
+        * Destroy the task <-> ctx relation and mark the context dead.
+        *
+        * This is important because even though the task hasn't been
+        * exposed yet the context has been (through child_list).
+        */
+       RCU_INIT_POINTER(task->perf_event_ctxp, NULL);
+       WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
+       put_task_struct(task); /* cannot be last */
+       raw_spin_unlock_irq(&ctx->lock);
 
-               list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry)
-                       perf_free_event(event, ctx);
 
-               mutex_unlock(&ctx->mutex);
+       list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry)
+               perf_free_event(event, ctx);
 
-               /*
-                * perf_event_release_kernel() could've stolen some of our
-                * child events and still have them on its free_list. In that
-                * case we must wait for these events to have been freed (in
-                * particular all their references to this task must've been
-                * dropped).
-                *
-                * Without this copy_process() will unconditionally free this
-                * task (irrespective of its reference count) and
-                * _free_event()'s put_task_struct(event->hw.target) will be a
-                * use-after-free.
-                *
-                * Wait for all events to drop their context reference.
-                */
-               wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1);
-               put_ctx(ctx); /* must be last */
-       }
+       mutex_unlock(&ctx->mutex);
+
+       /*
+        * perf_event_release_kernel() could've stolen some of our
+        * child events and still have them on its free_list. In that
+        * case we must wait for these events to have been freed (in
+        * particular all their references to this task must've been
+        * dropped).
+        *
+        * Without this copy_process() will unconditionally free this
+        * task (irrespective of its reference count) and
+        * _free_event()'s put_task_struct(event->hw.target) will be a
+        * use-after-free.
+        *
+        * Wait for all events to drop their context reference.
+        */
+       wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1);
+       put_ctx(ctx); /* must be last */
 }
 
 void perf_event_delayed_put(struct task_struct *task)
 {
-       int ctxn;
-
-       for_each_task_context_nr(ctxn)
-               WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
+       /* Warn (once) if @task still has a perf event context attached. */
+       WARN_ON_ONCE(task->perf_event_ctxp);
 }
 
 struct file *perf_event_get(unsigned int fd)
              struct perf_event_context *child_ctx)
 {
        enum perf_event_state parent_state = parent_event->state;
+       struct perf_event_pmu_context *pmu_ctx;
        struct perf_event *child_event;
        unsigned long flags;
 
        if (IS_ERR(child_event))
                return child_event;
 
-
-       if ((child_event->attach_state & PERF_ATTACH_TASK_DATA) &&
-           !child_ctx->task_ctx_data) {
-               struct pmu *pmu = child_event->pmu;
-
-               child_ctx->task_ctx_data = alloc_task_ctx_data(pmu);
-               if (!child_ctx->task_ctx_data) {
-                       free_event(child_event);
-                       return ERR_PTR(-ENOMEM);
-               }
+       /*
+        * find_get_pmu_context() reports failure through an ERR_PTR() (its
+        * other callers in this file test it with IS_ERR()), so a NULL test
+        * would let the error pointer be stored in ->pmu_ctx.  Propagate the
+        * error the same way a failed child_event is propagated above.
+        */
+       pmu_ctx = find_get_pmu_context(child_event->pmu, child_ctx, child_event);
+       if (IS_ERR(pmu_ctx)) {
+               free_event(child_event);
+               return ERR_CAST(pmu_ctx);
        }
+       child_event->pmu_ctx = pmu_ctx;
 
        /*
         * is_orphaned_event() and list_add_tail(&parent_event->child_list)
 static int
 inherit_task_group(struct perf_event *event, struct task_struct *parent,
                   struct perf_event_context *parent_ctx,
-                  struct task_struct *child, int ctxn,
+                  struct task_struct *child,
                   u64 clone_flags, int *inherited_all)
 {
-       int ret;
        struct perf_event_context *child_ctx;
+       int ret;
 
        if (!event->attr.inherit ||
            (event->attr.inherit_thread && !(clone_flags & CLONE_THREAD)) ||
                return 0;
        }
 
-       child_ctx = child->perf_event_ctxp[ctxn];
+       child_ctx = child->perf_event_ctxp;
        if (!child_ctx) {
                /*
                 * This is executed from the parent task context, so
                 * First allocate and initialize a context for the
                 * child.
                 */
-               child_ctx = alloc_perf_context(parent_ctx->pmu, child);
+               child_ctx = alloc_perf_context(child);
                if (!child_ctx)
                        return -ENOMEM;
 
-               child->perf_event_ctxp[ctxn] = child_ctx;
+               child->perf_event_ctxp = child_ctx;
        }
 
-       ret = inherit_group(event, parent, parent_ctx,
-                           child, child_ctx);
-
+       ret = inherit_group(event, parent, parent_ctx, child, child_ctx);
        if (ret)
                *inherited_all = 0;
 
 /*
  * Initialize the perf_event context in task_struct
  */
-static int perf_event_init_context(struct task_struct *child, int ctxn,
-                                  u64 clone_flags)
+static int perf_event_init_context(struct task_struct *child, u64 clone_flags)
 {
        struct perf_event_context *child_ctx, *parent_ctx;
        struct perf_event_context *cloned_ctx;
        unsigned long flags;
        int ret = 0;
 
-       if (likely(!parent->perf_event_ctxp[ctxn]))
+       if (likely(!parent->perf_event_ctxp))
                return 0;
 
        /*
         * If the parent's context is a clone, pin it so it won't get
         * swapped under us.
         */
-       parent_ctx = perf_pin_task_context(parent, ctxn);
+       parent_ctx = perf_pin_task_context(parent);
        if (!parent_ctx)
                return 0;
 
         */
        perf_event_groups_for_each(event, &parent_ctx->pinned_groups) {
                ret = inherit_task_group(event, parent, parent_ctx,
-                                        child, ctxn, clone_flags,
-                                        &inherited_all);
+                                        child, clone_flags, &inherited_all);
                if (ret)
                        goto out_unlock;
        }
 
        perf_event_groups_for_each(event, &parent_ctx->flexible_groups) {
                ret = inherit_task_group(event, parent, parent_ctx,
-                                        child, ctxn, clone_flags,
-                                        &inherited_all);
+                                        child, clone_flags, &inherited_all);
                if (ret)
                        goto out_unlock;
        }
        raw_spin_lock_irqsave(&parent_ctx->lock, flags);
        parent_ctx->rotate_disable = 0;
 
-       child_ctx = child->perf_event_ctxp[ctxn];
+       child_ctx = child->perf_event_ctxp;
 
        if (child_ctx && inherited_all) {
                /*
  */
 int perf_event_init_task(struct task_struct *child, u64 clone_flags)
 {
-       int ctxn, ret;
+       int ret;
 
-       memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
+       child->perf_event_ctxp = NULL;
        mutex_init(&child->perf_event_mutex);
        INIT_LIST_HEAD(&child->perf_event_list);
 
-       for_each_task_context_nr(ctxn) {
-               ret = perf_event_init_context(child, ctxn, clone_flags);
-               if (ret) {
-                       perf_event_free_task(child);
-                       return ret;
-               }
+       ret = perf_event_init_context(child, clone_flags);
+       if (ret) {
+               perf_event_free_task(child);
+               return ret;
        }
 
        return 0;
 static void __init perf_event_init_all_cpus(void)
 {
        struct swevent_htable *swhash;
+       struct perf_cpu_context *cpuctx;
        int cpu;
 
        zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL);
        for_each_possible_cpu(cpu) {
                swhash = &per_cpu(swevent_htable, cpu);
                mutex_init(&swhash->hlist_mutex);
-               INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
 
                INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
                raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));
 
-#ifdef CONFIG_CGROUP_PERF
-               INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
-#endif
                INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
+
+               cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
+               __perf_event_init_context(&cpuctx->ctx);
+               lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
+               lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
+               cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
+               cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default);
+               cpuctx->heap = cpuctx->heap_default;
        }
 }
 
 #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
 static void __perf_event_exit_context(void *__info)
 {
+       struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct perf_event_context *ctx = __info;
-       struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
        struct perf_event *event;
 
        raw_spin_lock(&ctx->lock);
-       ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+       ctx_sched_out(ctx, EVENT_TIME);
        list_for_each_entry(event, &ctx->event_list, event_entry)
                __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
        raw_spin_unlock(&ctx->lock);
 {
        struct perf_cpu_context *cpuctx;
        struct perf_event_context *ctx;
-       struct pmu *pmu;
 
+       // XXX simplify cpuctx->online
        mutex_lock(&pmus_lock);
-       list_for_each_entry(pmu, &pmus, entry) {
-               cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
-               ctx = &cpuctx->ctx;
+       cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
+       ctx = &cpuctx->ctx;
 
-               mutex_lock(&ctx->mutex);
-               smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
-               cpuctx->online = 0;
-               mutex_unlock(&ctx->mutex);
-       }
+       mutex_lock(&ctx->mutex);
+       smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
+       cpuctx->online = 0;
+       mutex_unlock(&ctx->mutex);
        cpumask_clear_cpu(cpu, perf_online_mask);
        mutex_unlock(&pmus_lock);
 }
 {
        struct perf_cpu_context *cpuctx;
        struct perf_event_context *ctx;
-       struct pmu *pmu;
 
        perf_swevent_init_cpu(cpu);
 
        mutex_lock(&pmus_lock);
        cpumask_set_cpu(cpu, perf_online_mask);
-       list_for_each_entry(pmu, &pmus, entry) {
-               cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
-               ctx = &cpuctx->ctx;
+       cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
+       ctx = &cpuctx->ctx;
 
-               mutex_lock(&ctx->mutex);
-               cpuctx->online = 1;
-               mutex_unlock(&ctx->mutex);
-       }
+       mutex_lock(&ctx->mutex);
+       cpuctx->online = 1;
+       mutex_unlock(&ctx->mutex);
        mutex_unlock(&pmus_lock);
 
        return 0;
 static int __perf_cgroup_move(void *info)
 {
        struct task_struct *task = info;
-       rcu_read_lock();
-       perf_cgroup_switch(task);
-       rcu_read_unlock();
+
+       /*
+        * With preemption disabled so the per-CPU counter read and the
+        * switch happen on the same CPU, only switch when this CPU's
+        * perf_cgroup_events count is non-zero.  Always returns 0.
+        */
+       preempt_disable();
+       if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
+               perf_cgroup_switch(task);
+       preempt_enable();
+
        return 0;
 }