syzbot reported a race around qdisc->qdisc_sleeping [1].
It is time we add proper annotations to reads and writes to/from
qdisc->qdisc_sleeping.
[1]
BUG: KCSAN: data-race in dev_graft_qdisc / qdisc_lookup_rcu
read to 0xffff8881286fc618 of 8 bytes by task 6928 on cpu 1:
qdisc_lookup_rcu+0x192/0x2c0 net/sched/sch_api.c:331
__tcf_qdisc_find+0x74/0x3c0 net/sched/cls_api.c:1174
tc_get_tfilter+0x18f/0x990 net/sched/cls_api.c:2547
rtnetlink_rcv_msg+0x7af/0x8c0 net/core/rtnetlink.c:6386
netlink_rcv_skb+0x126/0x220 net/netlink/af_netlink.c:2546
rtnetlink_rcv+0x1c/0x20 net/core/rtnetlink.c:6413
netlink_unicast_kernel net/netlink/af_netlink.c:1339 [inline]
netlink_unicast+0x56f/0x640 net/netlink/af_netlink.c:1365
netlink_sendmsg+0x665/0x770 net/netlink/af_netlink.c:1913
sock_sendmsg_nosec net/socket.c:724 [inline]
sock_sendmsg net/socket.c:747 [inline]
____sys_sendmsg+0x375/0x4c0 net/socket.c:2503
___sys_sendmsg net/socket.c:2557 [inline]
__sys_sendmsg+0x1e3/0x270 net/socket.c:2586
__do_sys_sendmsg net/socket.c:2595 [inline]
__se_sys_sendmsg net/socket.c:2593 [inline]
__x64_sys_sendmsg+0x46/0x50 net/socket.c:2593
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x63/0xcd
write to 0xffff8881286fc618 of 8 bytes by task 6912 on cpu 0:
dev_graft_qdisc+0x4f/0x80 net/sched/sch_generic.c:1115
qdisc_graft+0x7d0/0xb60 net/sched/sch_api.c:1103
tc_modify_qdisc+0x712/0xf10 net/sched/sch_api.c:1693
rtnetlink_rcv_msg+0x807/0x8c0 net/core/rtnetlink.c:6395
netlink_rcv_skb+0x126/0x220 net/netlink/af_netlink.c:2546
rtnetlink_rcv+0x1c/0x20 net/core/rtnetlink.c:6413
netlink_unicast_kernel net/netlink/af_netlink.c:1339 [inline]
netlink_unicast+0x56f/0x640 net/netlink/af_netlink.c:1365
netlink_sendmsg+0x665/0x770 net/netlink/af_netlink.c:1913
sock_sendmsg_nosec net/socket.c:724 [inline]
sock_sendmsg net/socket.c:747 [inline]
____sys_sendmsg+0x375/0x4c0 net/socket.c:2503
___sys_sendmsg net/socket.c:2557 [inline]
__sys_sendmsg+0x1e3/0x270 net/socket.c:2586
__do_sys_sendmsg net/socket.c:2595 [inline]
__se_sys_sendmsg net/socket.c:2593 [inline]
__x64_sys_sendmsg+0x46/0x50 net/socket.c:2593
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x63/0xcd
Reported by Kernel Concurrency Sanitizer on:
CPU: 0 PID: 6912 Comm: syz-executor.5 Not tainted 6.4.0-rc3-syzkaller-00190-g0d85b27b0cc6 #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 05/16/2023
Fixes: 3a7d0d07a386 ("net: sched: extend Qdisc with rcu")
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Vlad Buslov <vladbu@nvidia.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
        netdevice_tracker       dev_tracker;
 
        struct Qdisc __rcu      *qdisc;
-       struct Qdisc            *qdisc_sleeping;
+       struct Qdisc __rcu      *qdisc_sleeping;
 #ifdef CONFIG_SYSFS
        struct kobject          kobj;
 #endif
 
 
 static inline struct Qdisc *qdisc_root_sleeping(const struct Qdisc *qdisc)
 {
-       return qdisc->dev_queue->qdisc_sleeping;
+       return rcu_dereference_rtnl(qdisc->dev_queue->qdisc_sleeping);
 }
 
 static inline spinlock_t *qdisc_root_sleeping_lock(const struct Qdisc *qdisc)
 
        for (i = 0; i < dev->num_tx_queues; i++) {
                struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
-               if (rcu_access_pointer(txq->qdisc) != txq->qdisc_sleeping)
+
+               if (rcu_access_pointer(txq->qdisc) !=
+                   rcu_access_pointer(txq->qdisc_sleeping))
                        return true;
        }
        return false;
 
                return NULL;
        netdev_init_one_queue(dev, queue, NULL);
        RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
-       queue->qdisc_sleeping = &noop_qdisc;
+       RCU_INIT_POINTER(queue->qdisc_sleeping, &noop_qdisc);
        rcu_assign_pointer(dev->ingress_queue, queue);
 #endif
        return queue;
 
 
        if (dev_ingress_queue(dev))
                q = qdisc_match_from_root(
-                       dev_ingress_queue(dev)->qdisc_sleeping,
+                       rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping),
                        handle);
 out:
        return q;
 
        nq = dev_ingress_queue_rcu(dev);
        if (nq)
-               q = qdisc_match_from_root(nq->qdisc_sleeping, handle);
+               q = qdisc_match_from_root(rcu_dereference(nq->qdisc_sleeping),
+                                         handle);
 out:
        return q;
 }
 void qdisc_watchdog_schedule_range_ns(struct qdisc_watchdog *wd, u64 expires,
                                      u64 delta_ns)
 {
-       if (test_bit(__QDISC_STATE_DEACTIVATED,
-                    &qdisc_root_sleeping(wd->qdisc)->state))
+       bool deactivated;
+
+       rcu_read_lock();
+       deactivated = test_bit(__QDISC_STATE_DEACTIVATED,
+                              &qdisc_root_sleeping(wd->qdisc)->state);
+       rcu_read_unlock();
+       if (deactivated)
                return;
 
        if (hrtimer_is_queued(&wd->timer)) {
                                }
                                q = qdisc_leaf(p, clid);
                        } else if (dev_ingress_queue(dev)) {
-                               q = dev_ingress_queue(dev)->qdisc_sleeping;
+                               q = rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping);
                        }
                } else {
                        q = rtnl_dereference(dev->qdisc);
                                }
                                q = qdisc_leaf(p, clid);
                        } else if (dev_ingress_queue_create(dev)) {
-                               q = dev_ingress_queue(dev)->qdisc_sleeping;
+                               q = rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping);
                        }
                } else {
                        q = rtnl_dereference(dev->qdisc);
 
                dev_queue = dev_ingress_queue(dev);
                if (dev_queue &&
-                   tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
-                                      &q_idx, s_q_idx, false,
+                   tc_dump_qdisc_root(rtnl_dereference(dev_queue->qdisc_sleeping),
+                                      skb, cb, &q_idx, s_q_idx, false,
                                       tca[TCA_DUMP_INVISIBLE]) < 0)
                        goto done;
 
 
        dev_queue = dev_ingress_queue(dev);
        if (dev_queue &&
-           tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
-                               &t, s_t, false) < 0)
+           tc_dump_tclass_root(rtnl_dereference(dev_queue->qdisc_sleeping),
+                               skb, tcm, cb, &t, s_t, false) < 0)
                goto done;
 
 done:
 
        spinlock_t *root_lock; /* to lock qdisc for probability calculations */
        u32 idx;
 
+       rcu_read_lock();
        root_lock = qdisc_lock(qdisc_root_sleeping(sch));
        spin_lock(root_lock);
 
                mod_timer(&q->adapt_timer, jiffies + q->p_params.tupdate);
 
        spin_unlock(root_lock);
+       rcu_read_unlock();
 }
 
 static int fq_pie_init(struct Qdisc *sch, struct nlattr *opt,
 
 
 static struct netdev_queue noop_netdev_queue = {
        RCU_POINTER_INITIALIZER(qdisc, &noop_qdisc),
-       .qdisc_sleeping =       &noop_qdisc,
+       RCU_POINTER_INITIALIZER(qdisc_sleeping, &noop_qdisc),
 };
 
 struct Qdisc noop_qdisc = {
 struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
                              struct Qdisc *qdisc)
 {
-       struct Qdisc *oqdisc = dev_queue->qdisc_sleeping;
+       struct Qdisc *oqdisc = rtnl_dereference(dev_queue->qdisc_sleeping);
        spinlock_t *root_lock;
 
        root_lock = qdisc_lock(oqdisc);
        /* ... and graft new one */
        if (qdisc == NULL)
                qdisc = &noop_qdisc;
-       dev_queue->qdisc_sleeping = qdisc;
+       rcu_assign_pointer(dev_queue->qdisc_sleeping, qdisc);
        rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc);
 
        spin_unlock_bh(root_lock);
                                     struct netdev_queue *dev_queue,
                                     void *_qdisc_default)
 {
-       struct Qdisc *qdisc = dev_queue->qdisc_sleeping;
+       struct Qdisc *qdisc = rtnl_dereference(dev_queue->qdisc_sleeping);
        struct Qdisc *qdisc_default = _qdisc_default;
 
        if (qdisc) {
                rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
-               dev_queue->qdisc_sleeping = qdisc_default;
+               rcu_assign_pointer(dev_queue->qdisc_sleeping, qdisc_default);
 
                qdisc_put(qdisc);
        }
 
        if (!netif_is_multiqueue(dev))
                qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
-       dev_queue->qdisc_sleeping = qdisc;
+       rcu_assign_pointer(dev_queue->qdisc_sleeping, qdisc);
 }
 
 static void attach_default_qdiscs(struct net_device *dev)
        if (!netif_is_multiqueue(dev) ||
            dev->priv_flags & IFF_NO_QUEUE) {
                netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL);
-               qdisc = txq->qdisc_sleeping;
+               qdisc = rtnl_dereference(txq->qdisc_sleeping);
                rcu_assign_pointer(dev->qdisc, qdisc);
                qdisc_refcount_inc(qdisc);
        } else {
                netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc);
                dev->priv_flags |= IFF_NO_QUEUE;
                netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL);
-               qdisc = txq->qdisc_sleeping;
+               qdisc = rtnl_dereference(txq->qdisc_sleeping);
                rcu_assign_pointer(dev->qdisc, qdisc);
                qdisc_refcount_inc(qdisc);
                dev->priv_flags ^= IFF_NO_QUEUE;
                                 struct netdev_queue *dev_queue,
                                 void *_need_watchdog)
 {
-       struct Qdisc *new_qdisc = dev_queue->qdisc_sleeping;
+       struct Qdisc *new_qdisc = rtnl_dereference(dev_queue->qdisc_sleeping);
        int *need_watchdog_p = _need_watchdog;
 
        if (!(new_qdisc->flags & TCQ_F_BUILTIN))
        struct Qdisc *qdisc;
        bool nolock;
 
-       qdisc = dev_queue->qdisc_sleeping;
+       qdisc = rtnl_dereference(dev_queue->qdisc_sleeping);
        if (!qdisc)
                return;
 
                int val;
 
                dev_queue = netdev_get_tx_queue(dev, i);
-               q = dev_queue->qdisc_sleeping;
+               q = rtnl_dereference(dev_queue->qdisc_sleeping);
 
                root_lock = qdisc_lock(q);
                spin_lock_bh(root_lock);
 static int qdisc_change_tx_queue_len(struct net_device *dev,
                                     struct netdev_queue *dev_queue)
 {
-       struct Qdisc *qdisc = dev_queue->qdisc_sleeping;
+       struct Qdisc *qdisc = rtnl_dereference(dev_queue->qdisc_sleeping);
        const struct Qdisc_ops *ops = qdisc->ops;
 
        if (ops->change_tx_queue_len)
        unsigned int i;
 
        for (i = new_real_tx; i < dev->real_num_tx_queues; i++) {
-               qdisc = netdev_get_tx_queue(dev, i)->qdisc_sleeping;
+               qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc_sleeping);
                /* Only update the default qdiscs we created,
                 * qdiscs with handles are always hashed.
                 */
                        qdisc_hash_del(qdisc);
        }
        for (i = dev->real_num_tx_queues; i < new_real_tx; i++) {
-               qdisc = netdev_get_tx_queue(dev, i)->qdisc_sleeping;
+               qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc_sleeping);
                if (qdisc != &noop_qdisc && !qdisc->handle)
                        qdisc_hash_add(qdisc, false);
        }
        struct Qdisc *qdisc = _qdisc;
 
        rcu_assign_pointer(dev_queue->qdisc, qdisc);
-       dev_queue->qdisc_sleeping = qdisc;
+       rcu_assign_pointer(dev_queue->qdisc_sleeping, qdisc);
 }
 
 void dev_init_scheduler(struct net_device *dev)
 
         * qdisc totals are added at end.
         */
        for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
-               qdisc = netdev_get_tx_queue(dev, ntx)->qdisc_sleeping;
+               qdisc = rtnl_dereference(netdev_get_tx_queue(dev, ntx)->qdisc_sleeping);
                spin_lock_bh(qdisc_lock(qdisc));
 
                gnet_stats_add_basic(&sch->bstats, qdisc->cpu_bstats,
 {
        struct netdev_queue *dev_queue = mq_queue_get(sch, cl);
 
-       return dev_queue->qdisc_sleeping;
+       return rtnl_dereference(dev_queue->qdisc_sleeping);
 }
 
 static unsigned long mq_find(struct Qdisc *sch, u32 classid)
 
        tcm->tcm_parent = TC_H_ROOT;
        tcm->tcm_handle |= TC_H_MIN(cl);
-       tcm->tcm_info = dev_queue->qdisc_sleeping->handle;
+       tcm->tcm_info = rtnl_dereference(dev_queue->qdisc_sleeping)->handle;
        return 0;
 }
 
 {
        struct netdev_queue *dev_queue = mq_queue_get(sch, cl);
 
-       sch = dev_queue->qdisc_sleeping;
+       sch = rtnl_dereference(dev_queue->qdisc_sleeping);
        if (gnet_stats_copy_basic(d, sch->cpu_bstats, &sch->bstats, true) < 0 ||
            qdisc_qstats_copy(d, sch) < 0)
                return -1;
 
         * qdisc totals are added at end.
         */
        for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
-               qdisc = netdev_get_tx_queue(dev, ntx)->qdisc_sleeping;
+               qdisc = rtnl_dereference(netdev_get_tx_queue(dev, ntx)->qdisc_sleeping);
                spin_lock_bh(qdisc_lock(qdisc));
 
                gnet_stats_add_basic(&sch->bstats, qdisc->cpu_bstats,
        if (!dev_queue)
                return NULL;
 
-       return dev_queue->qdisc_sleeping;
+       return rtnl_dereference(dev_queue->qdisc_sleeping);
 }
 
 static unsigned long mqprio_find(struct Qdisc *sch, u32 classid)
                tcm->tcm_parent = (tc < 0) ? 0 :
                        TC_H_MAKE(TC_H_MAJ(sch->handle),
                                  TC_H_MIN(tc + TC_H_MIN_PRIORITY));
-               tcm->tcm_info = dev_queue->qdisc_sleeping->handle;
+               tcm->tcm_info = rtnl_dereference(dev_queue->qdisc_sleeping)->handle;
        } else {
                tcm->tcm_parent = TC_H_ROOT;
                tcm->tcm_info = 0;
        } else {
                struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl);
 
-               sch = dev_queue->qdisc_sleeping;
+               sch = rtnl_dereference(dev_queue->qdisc_sleeping);
                if (gnet_stats_copy_basic(d, sch->cpu_bstats,
                                          &sch->bstats, true) < 0 ||
                    qdisc_qstats_copy(d, sch) < 0)
 
 {
        struct pie_sched_data *q = from_timer(q, t, adapt_timer);
        struct Qdisc *sch = q->sch;
-       spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
+       spinlock_t *root_lock;
 
+       rcu_read_lock();
+       root_lock = qdisc_lock(qdisc_root_sleeping(sch));
        spin_lock(root_lock);
        pie_calculate_probability(&q->params, &q->vars, sch->qstats.backlog);
 
        if (q->params.tupdate)
                mod_timer(&q->adapt_timer, jiffies + q->params.tupdate);
        spin_unlock(root_lock);
+       rcu_read_unlock();
 }
 
 static int pie_init(struct Qdisc *sch, struct nlattr *opt,
 
 {
        struct red_sched_data *q = from_timer(q, t, adapt_timer);
        struct Qdisc *sch = q->sch;
-       spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
+       spinlock_t *root_lock;
 
+       rcu_read_lock();
+       root_lock = qdisc_lock(qdisc_root_sleeping(sch));
        spin_lock(root_lock);
        red_adaptative_algo(&q->parms, &q->vars);
        mod_timer(&q->adapt_timer, jiffies + HZ/2);
        spin_unlock(root_lock);
+       rcu_read_unlock();
 }
 
 static int red_init(struct Qdisc *sch, struct nlattr *opt,
 
 {
        struct sfq_sched_data *q = from_timer(q, t, perturb_timer);
        struct Qdisc *sch = q->sch;
-       spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
+       spinlock_t *root_lock;
        siphash_key_t nkey;
 
        get_random_bytes(&nkey, sizeof(nkey));
+       rcu_read_lock();
+       root_lock = qdisc_lock(qdisc_root_sleeping(sch));
        spin_lock(root_lock);
        q->perturbation = nkey;
        if (!q->filter_list && q->tail)
 
        if (q->perturb_period)
                mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
+       rcu_read_unlock();
 }
 
 static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
 
        if (!dev_queue)
                return NULL;
 
-       return dev_queue->qdisc_sleeping;
+       return rtnl_dereference(dev_queue->qdisc_sleeping);
 }
 
 static unsigned long taprio_find(struct Qdisc *sch, u32 classid)
 
        tcm->tcm_parent = TC_H_ROOT;
        tcm->tcm_handle |= TC_H_MIN(cl);
-       tcm->tcm_info = dev_queue->qdisc_sleeping->handle;
+       tcm->tcm_info = rtnl_dereference(dev_queue->qdisc_sleeping)->handle;
 
        return 0;
 }
 {
        struct netdev_queue *dev_queue = taprio_queue_get(sch, cl);
 
-       sch = dev_queue->qdisc_sleeping;
+       sch = rtnl_dereference(dev_queue->qdisc_sleeping);
        if (gnet_stats_copy_basic(d, NULL, &sch->bstats, true) < 0 ||
            qdisc_qstats_copy(d, sch) < 0)
                return -1;
 
                struct net_device *slave = qdisc_dev(q);
                struct netdev_queue *slave_txq = netdev_get_tx_queue(slave, 0);
 
-               if (slave_txq->qdisc_sleeping != q)
+               if (rcu_access_pointer(slave_txq->qdisc_sleeping) != q)
                        continue;
                if (netif_xmit_stopped(netdev_get_tx_queue(slave, subq)) ||
                    !netif_running(slave)) {