struct pt_regs regs;
        struct perf_sf_sde_regs *sde_regs;
        struct perf_sample_data data;
-       struct perf_raw_record raw;
+       struct perf_raw_record raw = {
+               .frag = {
+                       .size = sfr->size,
+                       .data = sfr,
+               },
+       };
 
        /* Setup perf sample */
        perf_sample_data_init(&data, 0, event->hw.last_period);
-       raw.size = sfr->size;
-       raw.data = sfr;
        data.raw = &raw;
 
        /* Setup pt_regs to look like an CPU-measurement external interrupt
 
        }
 
        if (event->attr.sample_type & PERF_SAMPLE_RAW) {
-               raw.size = sizeof(u32) + ibs_data.size;
-               raw.data = ibs_data.data;
+               raw = (struct perf_raw_record){
+                       .frag = {
+                               .size = sizeof(u32) + ibs_data.size,
+                               .data = ibs_data.data,
+                       },
+               };
                data.raw = &raw;
        }
 
 
        bool                        contexts_maxed;
 };
 
+/*
+ * Signature of a custom copy routine that a raw fragment can supply through
+ * perf_raw_frag::copy.  When a fragment has one set, the sample output path
+ * hands it to __output_custom() instead of using the default copy.
+ * NOTE(review): the meaning of the return value is defined by the shared
+ * output-copy body, which is not visible here - confirm before relying on it.
+ */
+typedef unsigned long (*perf_copy_f)(void *dst, const void *src,
+                                    unsigned long len);
+
+/*
+ * One fragment of a raw sample payload.  Fragments form a singly linked
+ * chain; the sample code follows ->next from fragment to fragment until
+ * perf_raw_frag_last() reports the final one.
+ */
+struct perf_raw_frag {
+       /*
+        * Link to the following fragment or, on the last fragment, the
+        * number of padding bytes to skip after the payload.  Both share
+        * the same storage; perf_raw_frag_last() tells them apart by value.
+        */
+       union {
+               struct perf_raw_frag    *next;
+               unsigned long           pad;
+       };
+       /* Optional custom copy routine; when NULL the default copy is used. */
+       perf_copy_f                     copy;
+       /* Start of this fragment's data. */
+       void                            *data;
+       /* Length of this fragment's data in bytes. */
+       u32                             size;
+} __packed;
+
 struct perf_raw_record {
+       /* First fragment of the payload; further ones are chained via frag.next. */
+       struct perf_raw_frag            frag;
+       /*
+        * Size value emitted in front of the payload.  It is filled in while
+        * the sample size is accounted: the sum of all fragment sizes plus the
+        * padding needed so that this u32 and the payload together end on a
+        * u64 boundary.  Callers only need to set up the fragment(s).
+        */
        u32                             size;
-       void                            *data;
 };
 
 /*
 static inline void perf_restore_debug_store(void)                      { }
 #endif
 
+/*
+ * Report whether @frag terminates its fragment chain.
+ *
+ * ->pad shares storage with ->next, so the decision is made on the raw
+ * value: anything below sizeof(u64) is read as a padding count, everything
+ * else is taken to be a pointer to the next fragment.  A padding count is
+ * always that small because it is what remains after rounding the payload
+ * up to a u64 boundary, and a fragment whose link was never set reads as 0.
+ * NOTE(review): relies on no valid fragment address being numerically
+ * smaller than sizeof(u64).
+ */
+static __always_inline bool perf_raw_frag_last(const struct perf_raw_frag *frag)
+{
+       return frag->pad < sizeof(u64);
+}
+
 #define perf_output_put(handle, x) perf_output_copy((handle), &(x), sizeof(x))
 
 /*
 
        }
 
        if (sample_type & PERF_SAMPLE_RAW) {
-               if (data->raw) {
-                       u32 raw_size = data->raw->size;
-                       u32 real_size = round_up(raw_size + sizeof(u32),
-                                                sizeof(u64)) - sizeof(u32);
-                       u64 zero = 0;
-
-                       perf_output_put(handle, real_size);
-                       __output_copy(handle, data->raw->data, raw_size);
-                       if (real_size - raw_size)
-                               __output_copy(handle, &zero, real_size - raw_size);
+               struct perf_raw_record *raw = data->raw;
+
+               if (raw) {
+                       struct perf_raw_frag *frag = &raw->frag;
+
+                       perf_output_put(handle, raw->size);
+                       do {
+                               if (frag->copy) {
+                                       __output_custom(handle, frag->copy,
+                                                       frag->data, frag->size);
+                               } else {
+                                       __output_copy(handle, frag->data,
+                                                     frag->size);
+                               }
+                               if (perf_raw_frag_last(frag))
+                                       break;
+                               frag = frag->next;
+                       } while (1);
+                       if (frag->pad)
+                               __output_skip(handle, NULL, frag->pad);
                } else {
                        struct {
                                u32     size;
        }
 
        if (sample_type & PERF_SAMPLE_RAW) {
-               int size = sizeof(u32);
-
-               if (data->raw)
-                       size += data->raw->size;
-               else
-                       size += sizeof(u32);
+               struct perf_raw_record *raw = data->raw;
+               int size;
+
+               if (raw) {
+                       struct perf_raw_frag *frag = &raw->frag;
+                       u32 sum = 0;
+
+                       do {
+                               sum += frag->size;
+                               if (perf_raw_frag_last(frag))
+                                       break;
+                               frag = frag->next;
+                       } while (1);
+
+                       size = round_up(sum + sizeof(u32), sizeof(u64));
+                       raw->size = size - sizeof(u32);
+                       frag->pad = raw->size - sum;
+               } else {
+                       size = sizeof(u64);
+               }
 
-               header->size += round_up(size, sizeof(u64));
+               header->size += size;
        }
 
        if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
 static int perf_tp_filter_match(struct perf_event *event,
                                struct perf_sample_data *data)
 {
-       void *record = data->raw->data;
+       void *record = data->raw->frag.data;
 
        /* only top level events have filters set */
        if (event->parent)
        struct perf_event *event;
 
        struct perf_raw_record raw = {
-               .size = entry_size,
-               .data = record,
+               .frag = {
+                       .size = entry_size,
+                       .data = record,
+               },
        };
 
        perf_sample_data_init(&data, 0, 0);
 
        return rb->aux_nr_pages << PAGE_SHIFT;
 }
 
-#define DEFINE_OUTPUT_COPY(func_name, memcpy_func)                     \
-static inline unsigned long                                            \
-func_name(struct perf_output_handle *handle,                           \
-         const void *buf, unsigned long len)                           \
+#define __DEFINE_OUTPUT_COPY_BODY(memcpy_func)                         \
 {                                                                      \
        unsigned long size, written;                                    \
                                                                        \
        return len;                                                     \
 }
 
+/*
+ * Define a static inline output routine called @func_name that takes
+ * (handle, buf, len) and returns unsigned long.  Its body comes from
+ * __DEFINE_OUTPUT_COPY_BODY() instantiated with @memcpy_func, so the copy
+ * function is fixed at the point where the routine is defined.
+ */
+#define DEFINE_OUTPUT_COPY(func_name, memcpy_func)                     \
+static inline unsigned long                                            \
+func_name(struct perf_output_handle *handle,                           \
+         const void *buf, unsigned long len)                           \
+__DEFINE_OUTPUT_COPY_BODY(memcpy_func)
+
+/*
+ * Output routine sharing its body with the DEFINE_OUTPUT_COPY() routines,
+ * except that the copy function is passed in by the caller at run time via
+ * @copy_func instead of being fixed when the routine is defined.  The body
+ * macro is expanded with the parameter name, so each copy step goes through
+ * the supplied function pointer.
+ */
+static inline unsigned long
+__output_custom(struct perf_output_handle *handle, perf_copy_f copy_func,
+               const void *buf, unsigned long len)
+__DEFINE_OUTPUT_COPY_BODY(copy_func)
+
 static inline unsigned long
 memcpy_common(void *dst, const void *src, unsigned long n)
 {
 
        struct bpf_event_entry *ee;
        struct perf_event *event;
        struct perf_raw_record raw = {
-               .size = size,
-               .data = data,
+               .frag = {
+                       .size = size,
+                       .data = data,
+               },
        };
 
        if (unlikely(flags & ~(BPF_F_INDEX_MASK)))