syzbot wrote:
> ========================================================
> WARNING: possible irq lock inversion dependency detected
> 5.6.0-syzkaller #0 Not tainted
> --------------------------------------------------------
> swapper/1/0 just changed the state of lock:
> 
ffffffff898090d8 (tasklist_lock){.+.?}-{2:2}, at: send_sigurg+0x9f/0x320 fs/fcntl.c:840
> but this lock took another, SOFTIRQ-unsafe lock in the past:
>  (&pid->wait_pidfd){+.+.}-{2:2}
>
>
> and interrupts could create inverse lock ordering between them.
>
>
> other info that might help us debug this:
>  Possible interrupt unsafe locking scenario:
>
>        CPU0                    CPU1
>        ----                    ----
>   lock(&pid->wait_pidfd);
>                                local_irq_disable();
>                                lock(tasklist_lock);
>                                lock(&pid->wait_pidfd);
>   <Interrupt>
>     lock(tasklist_lock);
>
>  *** DEADLOCK ***
>
> 4 locks held by swapper/1/0:
The problem is that because wait_pidfd.lock is taken under the tasklist
lock.  It must always be taken with irqs disabled as tasklist_lock can be
taken from interrupt context and if wait_pidfd.lock was already taken this
would create a lock order inversion.
Oleg suggested just disabling irqs where I have added extra calls to
wait_pidfd.lock.  That should be safe and I think the code will eventually
do that.  It was rightly pointed out by Christian that sharing the
wait_pidfd.lock was a premature optimization.
It is also true that my pre-merge window testing was insufficient.  So
remove the premature optimization and give struct pid a dedicated lock of
it's own for struct pid things.  I have verified that lockdep sees all 3
paths where we take the new pid->lock and lockdep does not complain.
It is my current day dream that one day pid->lock can be used to guard the
task lists as well and then the tasklist_lock won't need to be held to
deliver signals.  That will require taking pid->lock with irqs disabled.
Acked-by: Christian Brauner <christian.brauner@ubuntu.com>
Link: https://lore.kernel.org/lkml/00000000000011d66805a25cd73f@google.com/
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Christian Brauner <christian.brauner@ubuntu.com>
Reported-by: syzbot+343f75cdeea091340956@syzkaller.appspotmail.com
Reported-by: syzbot+832aabf700bc3ec920b9@syzkaller.appspotmail.com
Reported-by: syzbot+f675f964019f884dbd0f@syzkaller.appspotmail.com
Reported-by: syzbot+a9fb1457d720a55d6dc5@syzkaller.appspotmail.com
Fixes: 7bc3e6e55acf ("proc: Use a list of inodes to flush from proc")
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
        struct pid *pid = ei->pid;
 
        if (S_ISDIR(ei->vfs_inode.i_mode)) {
-               spin_lock(&pid->wait_pidfd.lock);
+               spin_lock(&pid->lock);
                hlist_del_init_rcu(&ei->sibling_inodes);
-               spin_unlock(&pid->wait_pidfd.lock);
+               spin_unlock(&pid->lock);
        }
 
        put_pid(pid);
        /* Let the pid remember us for quick removal */
        ei->pid = pid;
        if (S_ISDIR(mode)) {
-               spin_lock(&pid->wait_pidfd.lock);
+               spin_lock(&pid->lock);
                hlist_add_head_rcu(&ei->sibling_inodes, &pid->inodes);
-               spin_unlock(&pid->wait_pidfd.lock);
+               spin_unlock(&pid->lock);
        }
 
        task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
 
 void proc_flush_pid(struct pid *pid)
 {
-       proc_invalidate_siblings_dcache(&pid->inodes, &pid->wait_pidfd.lock);
+       proc_invalidate_siblings_dcache(&pid->inodes, &pid->lock);
        put_pid(pid);
 }
 
 
 {
        refcount_t count;
        unsigned int level;
+       spinlock_t lock;
        /* lists of tasks that use this pid */
        struct hlist_head tasks[PIDTYPE_MAX];
        struct hlist_head inodes;
 
 
        get_pid_ns(ns);
        refcount_set(&pid->count, 1);
+       spin_lock_init(&pid->lock);
        for (type = 0; type < PIDTYPE_MAX; ++type)
                INIT_HLIST_HEAD(&pid->tasks[type]);