Currently perf_event_release_kernel() iterates the child events and attempts to
tear them down. However, it removes them from the child_list using list_move(),
notably skipping the state management done by perf_child_detach().
Crucially, it fails to clear PERF_ATTACH_CHILD, which opens the door for a
concurrent perf_remove_from_context() to race.
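Fix this by passing DETACH_CHILD to perf_remove_from_context(), so that the
child is taken off the child_list through perf_child_detach() rather than by
an open-coded list_move().
This way child_list management stays fully serialized using child_mutex.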
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Ravi Bangoria <ravi.bangoria@amd.com>
Link: https://lkml.kernel.org/r/20250307193305.486326750@infradead.org
        if (WARN_ON_ONCE(!parent_event))
                return;
 
+       /*
+        * Can't check this from an IPI, the holder is likely another CPU.
+        *
        lockdep_assert_held(&parent_event->child_mutex);
+        */
 
        sync_child_event(event);
        list_del_init(&event->child_list);
                tmp = list_first_entry_or_null(&event->child_list,
                                               struct perf_event, child_list);
                if (tmp == child) {
-                       perf_remove_from_context(child, DETACH_GROUP);
-                       list_move(&child->child_list, &free_list);
+                       perf_remove_from_context(child, DETACH_GROUP | DETACH_CHILD);
+                       list_add(&child->child_list, &free_list);
                } else {
                        var = &ctx->refcount;
                }