 #define PERF_PMU_CAP_NO_EXCLUDE                        0x80
 #define PERF_PMU_CAP_AUX_OUTPUT                        0x100
 
+struct perf_output_handle;
+
 /**
  * struct pmu - generic performance monitoring unit
  */
         */
        void (*free_aux)                (void *aux); /* optional */
 
+       /*
+        * Take a snapshot of the AUX buffer without touching the event
+        * state, so that it can preempt the ->start()/->stop() callbacks
+        * without interfering with their logic. Called in PMI context.
+        *
+        * Returns the size of AUX data copied to the output handle.
+        *
+        * Optional.
+        */
+       long (*snapshot_aux)            (struct perf_event *event,
+                                        struct perf_output_handle *handle,
+                                        unsigned long size);
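+       /*
+        * Implementation sketch (hypothetical PMU driver, not part of
+        * this patch): pause the hardware trace, work out the window of
+        * new data and copy it out with perf_output_copy_aux():
+        *
+        *      static long foo_snapshot_aux(struct perf_event *event,
+        *                                   struct perf_output_handle *handle,
+        *                                   unsigned long size)
+        *      {
+        *              struct foo_ctx *foo = this_cpu_ptr(&foo_pmu_ctx);
+        *              unsigned long from, to = foo->head;
+        *              long ret;
+        *
+        *              from = to < size ? 0 : to - size;
+        *              foo_pause_trace(foo);
+        *              ret = perf_output_copy_aux(&foo->handle, handle,
+        *                                         from, to);
+        *              foo_resume_trace(foo);
+        *              return ret;
+        *      }
+        */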
+
        /*
         * Validate address range filters: make sure the HW supports the
         * requested configuration and number of filters; return 0 if the
                u32     reserved;
        }                               cpu_entry;
        struct perf_callchain_entry     *callchain;
+       u64                             aux_size;
 
        /*
         * regs_user may point to task_pt_regs or to regs_user_copy, depending
                             const void *buf, unsigned int len);
 extern unsigned int perf_output_skip(struct perf_output_handle *handle,
                                     unsigned int len);
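+/*
+ * Copy the [from, to) range of @aux_handle's AUX buffer into the sample
+ * record behind @handle, wrapping around the end of the AUX buffer as
+ * needed; meant to be called from a PMU's ->snapshot_aux() callback.
+ */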
+extern long perf_output_copy_aux(struct perf_output_handle *aux_handle,
+                                struct perf_output_handle *handle,
+                                unsigned long from, unsigned long to);
 extern int perf_swevent_get_recursion_context(void);
 extern void perf_swevent_put_recursion_context(int rctx);
 extern u64 perf_swevent_set_period(struct perf_event *event);
 
        }
 }
 
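+/*
+ * An event needs an AUX event reference if it either writes its trace
+ * data into another event's AUX buffer (aux_output) or pulls AUX data
+ * from another event into its own samples (aux_sample_size).
+ *
+ * Userspace sketch (hypothetical values): set attr.sample_type |=
+ * PERF_SAMPLE_AUX and attr.aux_sample_size = 4096 on a sampling event
+ * opened in a group whose leader is an AUX event on a PMU that
+ * implements ->snapshot_aux().
+ */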
+static bool perf_need_aux_event(struct perf_event *event)
+{
+       return !!event->attr.aux_output || !!event->attr.aux_sample_size;
+}
+
 static int perf_get_aux_event(struct perf_event *event,
                              struct perf_event *group_leader)
 {
        if (!group_leader)
                return 0;
 
-       if (!perf_aux_output_match(event, group_leader))
+       /*
+        * aux_output and aux_sample_size are mutually exclusive.
+        */
+       if (event->attr.aux_output && event->attr.aux_sample_size)
+               return 0;
+
+       if (event->attr.aux_output &&
+           !perf_aux_output_match(event, group_leader))
+               return 0;
+
+       if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux)
                return 0;
 
        if (!atomic_long_inc_not_zero(&group_leader->refcount))
        }
 }
 
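+/*
+ * Work out how much of the sampler's AUX data can go into this sample:
+ * the sampler must be ACTIVE on the local CPU and must not itself be in
+ * the middle of writing out a sample (rb->aux_in_sampling). The result
+ * is stored in data->aux_size for perf_aux_sample_output() to consume.
+ */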
+static unsigned long perf_prepare_sample_aux(struct perf_event *event,
+                                             struct perf_sample_data *data,
+                                             size_t size)
+{
+       struct perf_event *sampler = event->aux_event;
+       struct ring_buffer *rb;
+
+       data->aux_size = 0;
+
+       if (!sampler)
+               goto out;
+
+       if (WARN_ON_ONCE(READ_ONCE(sampler->state) != PERF_EVENT_STATE_ACTIVE))
+               goto out;
+
+       if (WARN_ON_ONCE(READ_ONCE(sampler->oncpu) != smp_processor_id()))
+               goto out;
+
+       rb = ring_buffer_get(sampler->parent ? sampler->parent : sampler);
+       if (!rb)
+               goto out;
+
+       /*
+        * If this is an NMI hit inside sampling code, don't take
+        * the sample. See also perf_aux_sample_output().
+        */
+       if (READ_ONCE(rb->aux_in_sampling)) {
+               data->aux_size = 0;
+       } else {
+               size = min_t(size_t, size, perf_aux_size(rb));
+               data->aux_size = ALIGN(size, sizeof(u64));
+       }
+       ring_buffer_put(rb);
+
+out:
+       return data->aux_size;
+}
+
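+/*
+ * Snapshot the sampler's AUX buffer into @handle with IRQs disabled and
+ * rb->aux_in_sampling set, so that an NMI hitting this window backs off
+ * in perf_prepare_sample_aux() instead of snapshotting the same buffer.
+ */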
+long perf_pmu_snapshot_aux(struct ring_buffer *rb,
+                          struct perf_event *event,
+                          struct perf_output_handle *handle,
+                          unsigned long size)
+{
+       unsigned long flags;
+       long ret;
+
+       /*
+        * Normal ->start()/->stop() callbacks run in IRQ mode in scheduler
+        * paths. If we start calling them in NMI context, they may race with
+        * the IRQ ones by, for example, re-starting an event that's just
+        * been stopped, which is why we're using a separate callback that
+        * doesn't change the event state.
+        *
+        * IRQs need to be disabled to prevent IPIs from racing with us.
+        */
+       local_irq_save(flags);
+       /*
+        * Guard against NMI hits inside the critical section;
+        * see also perf_prepare_sample_aux().
+        */
+       WRITE_ONCE(rb->aux_in_sampling, 1);
+       barrier();
+
+       ret = event->pmu->snapshot_aux(event, handle, size);
+
+       barrier();
+       WRITE_ONCE(rb->aux_in_sampling, 0);
+       local_irq_restore(flags);
+
+       return ret;
+}
+
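+/*
+ * Write the actual AUX data into an already-open sample record: snapshot
+ * up to data->aux_size bytes from the sampler's AUX buffer and zero-pad
+ * the copy to the u64-aligned size from perf_prepare_sample_aux().
+ */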
+static void perf_aux_sample_output(struct perf_event *event,
+                                  struct perf_output_handle *handle,
+                                  struct perf_sample_data *data)
+{
+       struct perf_event *sampler = event->aux_event;
+       unsigned long pad;
+       struct ring_buffer *rb;
+       long size;
+
+       if (WARN_ON_ONCE(!sampler || !data->aux_size))
+               return;
+
+       rb = ring_buffer_get(sampler->parent ? sampler->parent : sampler);
+       if (!rb)
+               return;
+
+       size = perf_pmu_snapshot_aux(rb, sampler, handle, data->aux_size);
+
+       /*
+        * An error here means that perf_output_copy() failed (returned a
+        * non-zero surplus that it didn't copy), which in its current
+        * enlightened implementation is not possible. If that changes, we'd
+        * like to know.
+        */
+       if (WARN_ON_ONCE(size < 0))
+               goto out_put;
+
+       /*
+        * The pad comes from ALIGN()ing data->aux_size up to u64 in
+        * perf_prepare_sample_aux(), so should not be more than that.
+        */
+       pad = data->aux_size - size;
+       if (WARN_ON_ONCE(pad >= sizeof(u64)))
+               pad = 8;
+
+       if (pad) {
+               u64 zero = 0;
+               perf_output_copy(handle, &zero, pad);
+       }
+
+out_put:
+       ring_buffer_put(rb);
+}
+
 static void __perf_event_header__init_id(struct perf_event_header *header,
                                         struct perf_sample_data *data,
                                         struct perf_event *event)
        if (sample_type & PERF_SAMPLE_PHYS_ADDR)
                perf_output_put(handle, data->phys_addr);
 
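+       /*
+        * A PERF_SAMPLE_AUX record is a u64 size followed by that many
+        * bytes of raw AUX data, zero-padded up to a u64 boundary.
+        */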
+       if (sample_type & PERF_SAMPLE_AUX) {
+               perf_output_put(handle, data->aux_size);
+
+               if (data->aux_size)
+                       perf_aux_sample_output(event, handle, data);
+       }
+
        if (!event->attr.watermark) {
                int wakeup_events = event->attr.wakeup_events;
 
 
        if (sample_type & PERF_SAMPLE_PHYS_ADDR)
                data->phys_addr = perf_virt_to_phys(data->addr);
+
+       if (sample_type & PERF_SAMPLE_AUX) {
+               u64 size;
+
+               header->size += sizeof(u64); /* size */
+
+               /*
+                * Given the 16-bit nature of header::size, an AUX sample can
+                * easily overflow it, what with all the preceding sample bits.
+                * Make sure this doesn't happen by using up to U16_MAX bytes
+                * per sample in total (rounded down to an 8-byte boundary).
+                */
+               size = min_t(size_t, U16_MAX - header->size,
+                            event->attr.aux_sample_size);
+               size = rounddown(size, 8);
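+               /*
+                * Example (hypothetical numbers): aux_sample_size of 64k
+                * with header->size at 136 bytes here caps the AUX data
+                * at rounddown(65535 - 136, 8) = 65392 bytes; the call
+                * below may shrink it further to the AUX buffer size.
+                */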
+               size = perf_prepare_sample_aux(event, data, size);
+
+               WARN_ON_ONCE(size + header->size > U16_MAX);
+               header->size += size;
+       }
+       /*
+        * If you're adding more sample types here, you likely need to do
+        * something about the overflowing header::size, like repurpose the
+        * lowest 3 bits of size, which should always be zero at the moment.
+        * This raises a more important question: do we really need 512k sized
+        * samples, and why? Good argumentation is in order for whatever you
+        * do here next.
+        */
+       WARN_ON_ONCE(header->size & 7);
 }
 
 static __always_inline int
 
        attr->size = size;
 
-       if (attr->__reserved_1 || attr->__reserved_2)
+       if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
                return -EINVAL;
 
        if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
                }
        }
 
-       if (event->attr.aux_output && !perf_get_aux_event(event, group_leader))
+       if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader))
                goto err_locked;
 
        /*