From df008598b3a00be02a8051fde89ca0fbc416bd55 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 3 Apr 2025 14:10:24 -0700 Subject: [PATCH 01/16] sch_drr: make drr_qlen_notify() idempotent drr_qlen_notify() always deletes the DRR class from its active list with list_del(), therefore, it is not idempotent and not friendly to its callers, like fq_codel_dequeue(). Let's make it idempotent to ease qdisc_tree_reduce_backlog() callers' life. Also change other list_del()'s to list_del_init() just to be extra safe. Reported-by: Gerrard Tai Signed-off-by: Cong Wang Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250403211033.166059-3-xiyou.wangcong@gmail.com Acked-by: Jamal Hadi Salim Signed-off-by: Paolo Abeni --- net/sched/sch_drr.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index c69b999fae17..e0a81d313aa7 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -105,6 +105,7 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid, return -ENOBUFS; gnet_stats_basic_sync_init(&cl->bstats); + INIT_LIST_HEAD(&cl->alist); cl->common.classid = classid; cl->quantum = quantum; cl->qdisc = qdisc_create_dflt(sch->dev_queue, @@ -229,7 +230,7 @@ static void drr_qlen_notify(struct Qdisc *csh, unsigned long arg) { struct drr_class *cl = (struct drr_class *)arg; - list_del(&cl->alist); + list_del_init(&cl->alist); } static int drr_dump_class(struct Qdisc *sch, unsigned long arg, @@ -390,7 +391,7 @@ static struct sk_buff *drr_dequeue(struct Qdisc *sch) if (unlikely(skb == NULL)) goto out; if (cl->qdisc->q.qlen == 0) - list_del(&cl->alist); + list_del_init(&cl->alist); bstats_update(&cl->bstats, skb); qdisc_bstats_update(sch, skb); @@ -431,7 +432,7 @@ static void drr_reset_qdisc(struct Qdisc *sch) for (i = 0; i < q->clhash.hashsize; i++) { hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) { if (cl->qdisc->q.qlen) - list_del(&cl->alist); + list_del_init(&cl->alist); qdisc_reset(cl->qdisc); } } -- 2.50.1 From 51eb3b65544c9efd6a1026889ee5fb5aa62da3bb Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 3 Apr 2025 14:10:25 -0700 Subject: [PATCH 02/16] sch_hfsc: make hfsc_qlen_notify() idempotent hfsc_qlen_notify() is not idempotent either and not friendly to its callers, like fq_codel_dequeue(). Let's make it idempotent to ease qdisc_tree_reduce_backlog() callers' life: 1. update_vf() decreases cl->cl_nactive, so we can check whether it is non-zero before calling it. 2. eltree_remove() always removes RB node cl->el_node, but we can use RB_EMPTY_NODE() + RB_CLEAR_NODE() to make it safe. Reported-by: Gerrard Tai Signed-off-by: Cong Wang Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250403211033.166059-4-xiyou.wangcong@gmail.com Acked-by: Jamal Hadi Salim Signed-off-by: Paolo Abeni --- net/sched/sch_hfsc.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index c287bf8423b4..ce5045eea065 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -203,7 +203,10 @@ eltree_insert(struct hfsc_class *cl) static inline void eltree_remove(struct hfsc_class *cl) { - rb_erase(&cl->el_node, &cl->sched->eligible); + if (!RB_EMPTY_NODE(&cl->el_node)) { + rb_erase(&cl->el_node, &cl->sched->eligible); + RB_CLEAR_NODE(&cl->el_node); + } } static inline void @@ -1220,7 +1223,8 @@ hfsc_qlen_notify(struct Qdisc *sch, unsigned long arg) /* vttree is now handled in update_vf() so that update_vf(cl, 0, 0) * needs to be called explicitly to remove a class from vttree. */ - update_vf(cl, 0, 0); + if (cl->cl_nactive) + update_vf(cl, 0, 0); if (cl->cl_flags & HFSC_RSC) eltree_remove(cl); } -- 2.50.1 From 55f9eca4bfe30a15d8656f915922e8c98b7f0728 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 3 Apr 2025 14:10:26 -0700 Subject: [PATCH 03/16] sch_qfq: make qfq_qlen_notify() idempotent qfq_qlen_notify() always deletes its class from its active list with list_del_init() _and_ calls qfq_deactivate_agg() when the whole list becomes empty. To make it idempotent, just skip everything when it is not in the active list. Also change other list_del()'s to list_del_init() just to be extra safe. Reported-by: Gerrard Tai Signed-off-by: Cong Wang Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250403211033.166059-5-xiyou.wangcong@gmail.com Acked-by: Jamal Hadi Salim Signed-off-by: Paolo Abeni --- net/sched/sch_qfq.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 2cfbc977fe6d..687a932eb9b2 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -347,7 +347,7 @@ static void qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl) struct qfq_aggregate *agg = cl->agg; - list_del(&cl->alist); /* remove from RR queue of the aggregate */ + list_del_init(&cl->alist); /* remove from RR queue of the aggregate */ if (list_empty(&agg->active)) /* agg is now inactive */ qfq_deactivate_agg(q, agg); } @@ -474,6 +474,7 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, gnet_stats_basic_sync_init(&cl->bstats); cl->common.classid = classid; cl->deficit = lmax; + INIT_LIST_HEAD(&cl->alist); cl->qdisc = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid, NULL); @@ -982,7 +983,7 @@ static struct sk_buff *agg_dequeue(struct qfq_aggregate *agg, cl->deficit -= (int) len; if (cl->qdisc->q.qlen == 0) /* no more packets, remove from list */ - list_del(&cl->alist); + list_del_init(&cl->alist); else if (cl->deficit < qdisc_pkt_len(cl->qdisc->ops->peek(cl->qdisc))) { cl->deficit += agg->lmax; list_move_tail(&cl->alist, &agg->active); @@ -1415,6 +1416,8 @@ static void qfq_qlen_notify(struct Qdisc *sch, unsigned long arg) struct qfq_sched *q = qdisc_priv(sch); struct qfq_class *cl = (struct qfq_class *)arg; + if (list_empty(&cl->alist)) + return; qfq_deactivate_class(q, cl); } -- 2.50.1 From a7a15f39c682ac4268624da2abdb9114bdde96d5 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 3 Apr 2025 14:10:27 -0700 Subject: [PATCH 04/16] sch_ets: make est_qlen_notify() idempotent est_qlen_notify() deletes its class from its active list with list_del() when qlen is 0, therefore, it is not idempotent and not friendly to its callers, like fq_codel_dequeue(). Let's make it idempotent to ease qdisc_tree_reduce_backlog() callers' life. Also change other list_del()'s to list_del_init() just to be extra safe. Reported-by: Gerrard Tai Signed-off-by: Cong Wang Link: https://patch.msgid.link/20250403211033.166059-6-xiyou.wangcong@gmail.com Acked-by: Jamal Hadi Salim Signed-off-by: Paolo Abeni --- net/sched/sch_ets.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/sched/sch_ets.c b/net/sched/sch_ets.c index 516038a44163..c3bdeb14185b 100644 --- a/net/sched/sch_ets.c +++ b/net/sched/sch_ets.c @@ -293,7 +293,7 @@ static void ets_class_qlen_notify(struct Qdisc *sch, unsigned long arg) * to remove them. */ if (!ets_class_is_strict(q, cl) && sch->q.qlen) - list_del(&cl->alist); + list_del_init(&cl->alist); } static int ets_class_dump(struct Qdisc *sch, unsigned long arg, @@ -488,7 +488,7 @@ static struct sk_buff *ets_qdisc_dequeue(struct Qdisc *sch) if (unlikely(!skb)) goto out; if (cl->qdisc->q.qlen == 0) - list_del(&cl->alist); + list_del_init(&cl->alist); return ets_qdisc_dequeue_skb(sch, skb); } @@ -657,7 +657,7 @@ static int ets_qdisc_change(struct Qdisc *sch, struct nlattr *opt, } for (i = q->nbands; i < oldbands; i++) { if (i >= q->nstrict && q->classes[i].qdisc->q.qlen) - list_del(&q->classes[i].alist); + list_del_init(&q->classes[i].alist); qdisc_tree_flush_backlog(q->classes[i].qdisc); } WRITE_ONCE(q->nstrict, nstrict); @@ -713,7 +713,7 @@ static void ets_qdisc_reset(struct Qdisc *sch) for (band = q->nstrict; band < q->nbands; band++) { if (q->classes[band].qdisc->q.qlen) - list_del(&q->classes[band].alist); + list_del_init(&q->classes[band].alist); } for (band = 0; band < q->nbands; band++) qdisc_reset(q->classes[band].qdisc); -- 2.50.1 From 342debc12183b51773b3345ba267e9263bdfaaef Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 3 Apr 2025 14:16:31 -0700 Subject: [PATCH 05/16] codel: remove sch->q.qlen check before qdisc_tree_reduce_backlog() After making all ->qlen_notify() callbacks idempotent, now it is safe to remove the check of qlen!=0 from both fq_codel_dequeue() and codel_qdisc_dequeue(). Reported-by: Gerrard Tai Fixes: 4b549a2ef4be ("fq_codel: Fair Queue Codel AQM") Fixes: 76e3cc126bb2 ("codel: Controlled Delay AQM") Signed-off-by: Cong Wang Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250403211636.166257-1-xiyou.wangcong@gmail.com Acked-by: Jamal Hadi Salim Signed-off-by: Paolo Abeni --- net/sched/sch_codel.c | 5 +---- net/sched/sch_fq_codel.c | 6 ++---- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c index 81189d02fee7..12dd71139da3 100644 --- a/net/sched/sch_codel.c +++ b/net/sched/sch_codel.c @@ -65,10 +65,7 @@ static struct sk_buff *codel_qdisc_dequeue(struct Qdisc *sch) &q->stats, qdisc_pkt_len, codel_get_enqueue_time, drop_func, dequeue_func); - /* We cant call qdisc_tree_reduce_backlog() if our qlen is 0, - * or HTB crashes. Defer it for next round. - */ - if (q->stats.drop_count && sch->q.qlen) { + if (q->stats.drop_count) { qdisc_tree_reduce_backlog(sch, q->stats.drop_count, q->stats.drop_len); q->stats.drop_count = 0; q->stats.drop_len = 0; diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 799f5397ad4c..6c9029f71e88 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -315,10 +315,8 @@ begin: } qdisc_bstats_update(sch, skb); flow->deficit -= qdisc_pkt_len(skb); - /* We cant call qdisc_tree_reduce_backlog() if our qlen is 0, - * or HTB crashes. Defer it for next round. - */ - if (q->cstats.drop_count && sch->q.qlen) { + + if (q->cstats.drop_count) { qdisc_tree_reduce_backlog(sch, q->cstats.drop_count, q->cstats.drop_len); q->cstats.drop_count = 0; -- 2.50.1 From cbe9588b12d058cbb16735fd468c82ec4b3d1256 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 3 Apr 2025 14:16:32 -0700 Subject: [PATCH 06/16] selftests/tc-testing: Add a test case for FQ_CODEL with HTB parent Add a test case for FQ_CODEL with HTB parent to verify packet drop behavior when the queue becomes empty. This helps ensure proper notification mechanisms between qdiscs. Note this is best-effort, it is hard to play with those parameters perfectly to always trigger ->qlen_notify(). Cc: Pedro Tammela Signed-off-by: Cong Wang Reviewed-by: Victor Nogueira Link: https://patch.msgid.link/20250403211636.166257-2-xiyou.wangcong@gmail.com Acked-by: Jamal Hadi Salim Signed-off-by: Paolo Abeni --- .../tc-testing/tc-tests/infra/qdiscs.json | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json index 25454fd95537..545966b6adc6 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json +++ b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json @@ -158,5 +158,36 @@ "$TC qdisc del dev $DUMMY handle 1: root", "$IP addr del 10.10.10.10/24 dev $DUMMY || true" ] + }, + { + "id": "a4bb", + "name": "Test FQ_CODEL with HTB parent - force packet drop with empty queue", + "category": [ + "qdisc", + "fq_codel", + "htb" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$IP link set dev $DUMMY up || true", + "$IP addr add 10.10.10.10/24 dev $DUMMY || true", + "$TC qdisc add dev $DUMMY handle 1: root htb default 10", + "$TC class add dev $DUMMY parent 1: classid 1:10 htb rate 1kbit", + "$TC qdisc add dev $DUMMY parent 1:10 handle 10: fq_codel memory_limit 1 flows 1 target 0.1ms interval 1ms", + "$TC filter add dev $DUMMY parent 1: protocol ip prio 1 u32 match ip protocol 1 0xff flowid 1:10", + "ping -c 5 -f -I $DUMMY 10.10.10.1 > /dev/null || true", + "sleep 0.1" + ], + "cmdUnderTest": "$TC -s qdisc show dev $DUMMY", + "expExitCode": "0", + "verifyCmd": "$TC -s qdisc show dev $DUMMY | grep -A 5 'qdisc fq_codel'", + "matchPattern": "dropped [1-9][0-9]*", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1: root", + "$IP addr del 10.10.10.10/24 dev $DUMMY || true" + ] } ] -- 2.50.1 From 4cb1837ac5375b9b271cb83b2a43a3f942f4c36e Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 3 Apr 2025 14:16:33 -0700 Subject: [PATCH 07/16] selftests/tc-testing: Add a test case for FQ_CODEL with QFQ parent Add a test case for FQ_CODEL with QFQ parent to verify packet drop behavior when the queue becomes empty. This helps ensure proper notification mechanisms between qdiscs. Note this is best-effort, it is hard to play with those parameters perfectly to always trigger ->qlen_notify(). Cc: Pedro Tammela Signed-off-by: Cong Wang Reviewed-by: Victor Nogueira Link: https://patch.msgid.link/20250403211636.166257-3-xiyou.wangcong@gmail.com Acked-by: Jamal Hadi Salim Signed-off-by: Paolo Abeni --- .../tc-testing/tc-tests/infra/qdiscs.json | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json index 545966b6adc6..695522b00a3c 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json +++ b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json @@ -189,5 +189,36 @@ "$TC qdisc del dev $DUMMY handle 1: root", "$IP addr del 10.10.10.10/24 dev $DUMMY || true" ] + }, + { + "id": "a4be", + "name": "Test FQ_CODEL with QFQ parent - force packet drop with empty queue", + "category": [ + "qdisc", + "fq_codel", + "qfq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$IP link set dev $DUMMY up || true", + "$IP addr add 10.10.10.10/24 dev $DUMMY || true", + "$TC qdisc add dev $DUMMY handle 1: root qfq", + "$TC class add dev $DUMMY parent 1: classid 1:10 qfq weight 1 maxpkt 1000", + "$TC qdisc add dev $DUMMY parent 1:10 handle 10: fq_codel memory_limit 1 flows 1 target 0.1ms interval 1ms", + "$TC filter add dev $DUMMY parent 1: protocol ip prio 1 u32 match ip protocol 1 0xff flowid 1:10", + "ping -c 10 -s 1000 -f -I $DUMMY 10.10.10.1 > /dev/null || true", + "sleep 0.1" + ], + "cmdUnderTest": "$TC -s qdisc show dev $DUMMY", + "expExitCode": "0", + "verifyCmd": "$TC -s qdisc show dev $DUMMY | grep -A 5 'qdisc fq_codel'", + "matchPattern": "dropped [1-9][0-9]*", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1: root", + "$IP addr del 10.10.10.10/24 dev $DUMMY || true" + ] } ] -- 2.50.1 From 72b05c1bf7ea799bfce1164d6605b27f060191ac Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 3 Apr 2025 14:16:34 -0700 Subject: [PATCH 08/16] selftests/tc-testing: Add a test case for FQ_CODEL with HFSC parent Add a test case for FQ_CODEL with HFSC parent to verify packet drop behavior when the queue becomes empty. This helps ensure proper notification mechanisms between qdiscs. Note this is best-effort, it is hard to play with those parameters perfectly to always trigger ->qlen_notify(). Cc: Pedro Tammela Signed-off-by: Cong Wang Reviewed-by: Victor Nogueira Link: https://patch.msgid.link/20250403211636.166257-4-xiyou.wangcong@gmail.com Acked-by: Jamal Hadi Salim Signed-off-by: Paolo Abeni --- .../tc-testing/tc-tests/infra/qdiscs.json | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json index 695522b00a3c..0347b207fe6d 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json +++ b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json @@ -220,5 +220,36 @@ "$TC qdisc del dev $DUMMY handle 1: root", "$IP addr del 10.10.10.10/24 dev $DUMMY || true" ] + }, + { + "id": "a4bf", + "name": "Test FQ_CODEL with HFSC parent - force packet drop with empty queue", + "category": [ + "qdisc", + "fq_codel", + "hfsc" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$IP link set dev $DUMMY up || true", + "$IP addr add 10.10.10.10/24 dev $DUMMY || true", + "$TC qdisc add dev $DUMMY handle 1: root hfsc default 10", + "$TC class add dev $DUMMY parent 1: classid 1:10 hfsc sc rate 1kbit ul rate 1kbit", + "$TC qdisc add dev $DUMMY parent 1:10 handle 10: fq_codel memory_limit 1 flows 1 target 0.1ms interval 1ms", + "$TC filter add dev $DUMMY parent 1: protocol ip prio 1 u32 match ip protocol 1 0xff flowid 1:10", + "ping -c 5 -f -I $DUMMY 10.10.10.1 > /dev/null || true", + "sleep 0.1" + ], + "cmdUnderTest": "$TC -s qdisc show dev $DUMMY", + "expExitCode": "0", + "verifyCmd": "$TC -s qdisc show dev $DUMMY | grep -A 5 'qdisc fq_codel'", + "matchPattern": "dropped [1-9][0-9]*", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1: root", + "$IP addr del 10.10.10.10/24 dev $DUMMY || true" + ] } ] -- 2.50.1 From 0d5c27ecb60c6cc4e394035aa04696d7eb39f072 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 3 Apr 2025 14:16:35 -0700 Subject: [PATCH 09/16] selftests/tc-testing: Add a test case for FQ_CODEL with DRR parent Add a test case for FQ_CODEL with DRR parent to verify packet drop behavior when the queue becomes empty. This helps ensure proper notification mechanisms between qdiscs. Note this is best-effort, it is hard to play with those parameters perfectly to always trigger ->qlen_notify(). Cc: Pedro Tammela Signed-off-by: Cong Wang Reviewed-by: Victor Nogueira Link: https://patch.msgid.link/20250403211636.166257-5-xiyou.wangcong@gmail.com Acked-by: Jamal Hadi Salim Signed-off-by: Paolo Abeni --- .../tc-testing/tc-tests/infra/qdiscs.json | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json index 0347b207fe6d..4a45fedad876 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json +++ b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json @@ -251,5 +251,36 @@ "$TC qdisc del dev $DUMMY handle 1: root", "$IP addr del 10.10.10.10/24 dev $DUMMY || true" ] + }, + { + "id": "a4c0", + "name": "Test FQ_CODEL with DRR parent - force packet drop with empty queue", + "category": [ + "qdisc", + "fq_codel", + "drr" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$IP link set dev $DUMMY up || true", + "$IP addr add 10.10.10.10/24 dev $DUMMY || true", + "$TC qdisc add dev $DUMMY handle 1: root drr", + "$TC class add dev $DUMMY parent 1: classid 1:10 drr quantum 1500", + "$TC qdisc add dev $DUMMY parent 1:10 handle 10: fq_codel memory_limit 1 flows 1 target 0.1ms interval 1ms", + "$TC filter add dev $DUMMY parent 1: protocol ip prio 1 u32 match ip protocol 1 0xff flowid 1:10", + "ping -c 5 -f -I $DUMMY 10.10.10.1 > /dev/null || true", + "sleep 0.1" + ], + "cmdUnderTest": "$TC -s qdisc show dev $DUMMY", + "expExitCode": "0", + "verifyCmd": "$TC -s qdisc show dev $DUMMY | grep -A 5 'qdisc fq_codel'", + "matchPattern": "dropped [1-9][0-9]*", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1: root", + "$IP addr del 10.10.10.10/24 dev $DUMMY || true" + ] } ] -- 2.50.1 From ce94507f5fe04eb7fe1eecfe32a2b29233341ff0 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 3 Apr 2025 14:16:36 -0700 Subject: [PATCH 10/16] selftests/tc-testing: Add a test case for FQ_CODEL with ETS parent Add a test case for FQ_CODEL with ETS parent to verify packet drop behavior when the queue becomes empty. This helps ensure proper notification mechanisms between qdiscs. Note this is best-effort, it is hard to play with those parameters perfectly to always trigger ->qlen_notify(). Cc: Pedro Tammela Signed-off-by: Cong Wang Reviewed-by: Victor Nogueira Link: https://patch.msgid.link/20250403211636.166257-6-xiyou.wangcong@gmail.com Acked-by: Jamal Hadi Salim Signed-off-by: Paolo Abeni --- .../tc-testing/tc-tests/infra/qdiscs.json | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json index 4a45fedad876..d4ea9cd845a3 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json +++ b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json @@ -282,5 +282,36 @@ "$TC qdisc del dev $DUMMY handle 1: root", "$IP addr del 10.10.10.10/24 dev $DUMMY || true" ] + }, + { + "id": "a4c1", + "name": "Test FQ_CODEL with ETS parent - force packet drop with empty queue", + "category": [ + "qdisc", + "fq_codel", + "ets" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$IP link set dev $DUMMY up || true", + "$IP addr add 10.10.10.10/24 dev $DUMMY || true", + "$TC qdisc add dev $DUMMY handle 1: root ets bands 2 strict 1", + "$TC class change dev $DUMMY parent 1: classid 1:1 ets", + "$TC qdisc add dev $DUMMY parent 1:1 handle 10: fq_codel memory_limit 1 flows 1 target 0.1ms interval 1ms", + "$TC filter add dev $DUMMY parent 1: protocol ip prio 1 u32 match ip protocol 1 0xff flowid 1:1", + "ping -c 5 -f -I $DUMMY 10.10.10.1 > /dev/null || true", + "sleep 0.1" + ], + "cmdUnderTest": "$TC -s qdisc show dev $DUMMY", + "expExitCode": "0", + "verifyCmd": "$TC -s qdisc show dev $DUMMY | grep -A 5 'qdisc fq_codel'", + "matchPattern": "dropped [1-9][0-9]*", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1: root", + "$IP addr del 10.10.10.10/24 dev $DUMMY || true" + ] } ] -- 2.50.1 From f1a69a940de58b16e8249dff26f74c8cc59b32be Mon Sep 17 00:00:00 2001 From: =?utf8?q?Ricardo=20Ca=C3=B1uelo=20Navarro?= Date: Fri, 4 Apr 2025 16:53:21 +0200 Subject: [PATCH 11/16] sctp: detect and prevent references to a freed transport in sendmsg MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit sctp_sendmsg() re-uses associations and transports when possible by doing a lookup based on the socket endpoint and the message destination address, and then sctp_sendmsg_to_asoc() sets the selected transport in all the message chunks to be sent. There's a possible race condition if another thread triggers the removal of that selected transport, for instance, by explicitly unbinding an address with setsockopt(SCTP_SOCKOPT_BINDX_REM), after the chunks have been set up and before the message is sent. This can happen if the send buffer is full, during the period when the sender thread temporarily releases the socket lock in sctp_wait_for_sndbuf(). This causes the access to the transport data in sctp_outq_select_transport(), when the association outqueue is flushed, to result in a use-after-free read. This change avoids this scenario by having sctp_transport_free() signal the freeing of the transport, tagging it as "dead". In order to do this, the patch restores the "dead" bit in struct sctp_transport, which was removed in commit 47faa1e4c50e ("sctp: remove the dead field of sctp_transport"). Then, in the scenario where the sender thread has released the socket lock in sctp_wait_for_sndbuf(), the bit is checked again after re-acquiring the socket lock to detect the deletion. This is done while holding a reference to the transport to prevent it from being freed in the process. If the transport was deleted while the socket lock was relinquished, sctp_sendmsg_to_asoc() will return -EAGAIN to let userspace retry the send. The bug was found by a private syzbot instance (see the error report [1] and the C reproducer that triggers it [2]). Link: https://people.igalia.com/rcn/kernel_logs/20250402__KASAN_slab-use-after-free_Read_in_sctp_outq_select_transport.txt [1] Link: https://people.igalia.com/rcn/kernel_logs/20250402__KASAN_slab-use-after-free_Read_in_sctp_outq_select_transport__repro.c [2] Cc: stable@vger.kernel.org Fixes: df132eff4638 ("sctp: clear the transport of some out_chunk_list chunks in sctp_assoc_rm_peer") Suggested-by: Xin Long Signed-off-by: Ricardo Cañuelo Navarro Acked-by: Xin Long Link: https://patch.msgid.link/20250404-kasan_slab-use-after-free_read_in_sctp_outq_select_transport__20250404-v1-1-5ce4a0b78ef2@igalia.com Signed-off-by: Paolo Abeni --- include/net/sctp/structs.h | 3 ++- net/sctp/socket.c | 22 ++++++++++++++-------- net/sctp/transport.c | 2 ++ 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 31248cfdfb23..dcd288fa1bb6 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -775,6 +775,7 @@ struct sctp_transport { /* Reference counting. */ refcount_t refcnt; + __u32 dead:1, /* RTO-Pending : A flag used to track if one of the DATA * chunks sent to this address is currently being * used to compute a RTT. If this flag is 0, @@ -784,7 +785,7 @@ struct sctp_transport { * calculation completes (i.e. the DATA chunk * is SACK'd) clear this flag. */ - __u32 rto_pending:1, + rto_pending:1, /* * hb_sent : a flag that signals that we have a pending diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 36ee34f483d7..53725ee7ba06 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -72,8 +72,9 @@ /* Forward declarations for internal helper functions. */ static bool sctp_writeable(const struct sock *sk); static void sctp_wfree(struct sk_buff *skb); -static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p, - size_t msg_len); +static int sctp_wait_for_sndbuf(struct sctp_association *asoc, + struct sctp_transport *transport, + long *timeo_p, size_t msg_len); static int sctp_wait_for_packet(struct sock *sk, int *err, long *timeo_p); static int sctp_wait_for_connect(struct sctp_association *, long *timeo_p); static int sctp_wait_for_accept(struct sock *sk, long timeo); @@ -1828,7 +1829,7 @@ static int sctp_sendmsg_to_asoc(struct sctp_association *asoc, if (sctp_wspace(asoc) <= 0 || !sk_wmem_schedule(sk, msg_len)) { timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); - err = sctp_wait_for_sndbuf(asoc, &timeo, msg_len); + err = sctp_wait_for_sndbuf(asoc, transport, &timeo, msg_len); if (err) goto err; if (unlikely(sinfo->sinfo_stream >= asoc->stream.outcnt)) { @@ -9214,8 +9215,9 @@ void sctp_sock_rfree(struct sk_buff *skb) /* Helper function to wait for space in the sndbuf. */ -static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p, - size_t msg_len) +static int sctp_wait_for_sndbuf(struct sctp_association *asoc, + struct sctp_transport *transport, + long *timeo_p, size_t msg_len) { struct sock *sk = asoc->base.sk; long current_timeo = *timeo_p; @@ -9225,7 +9227,9 @@ static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p, pr_debug("%s: asoc:%p, timeo:%ld, msg_len:%zu\n", __func__, asoc, *timeo_p, msg_len); - /* Increment the association's refcnt. */ + /* Increment the transport and association's refcnt. */ + if (transport) + sctp_transport_hold(transport); sctp_association_hold(asoc); /* Wait on the association specific sndbuf space. */ @@ -9234,7 +9238,7 @@ static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p, TASK_INTERRUPTIBLE); if (asoc->base.dead) goto do_dead; - if (!*timeo_p) + if ((!*timeo_p) || (transport && transport->dead)) goto do_nonblock; if (sk->sk_err || asoc->state >= SCTP_STATE_SHUTDOWN_PENDING) goto do_error; @@ -9259,7 +9263,9 @@ static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p, out: finish_wait(&asoc->wait, &wait); - /* Release the association's refcnt. */ + /* Release the transport and association's refcnt. */ + if (transport) + sctp_transport_put(transport); sctp_association_put(asoc); return err; diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 2abe45af98e7..31eca29b6cfb 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -117,6 +117,8 @@ fail: */ void sctp_transport_free(struct sctp_transport *transport) { + transport->dead = 1; + /* Try to delete the heartbeat timer. */ if (del_timer(&transport->hb_timer)) sctp_transport_put(transport); -- 2.50.1 From 5071a1e606b30c0c11278d3c6620cd6a24724cf6 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 4 Apr 2025 11:03:33 -0700 Subject: [PATCH 12/16] net: tls: explicitly disallow disconnect syzbot discovered that it can disconnect a TLS socket and then run into all sort of unexpected corner cases. I have a vague recollection of Eric pointing this out to us a long time ago. Supporting disconnect is really hard, for one thing if offload is enabled we'd need to wait for all packets to be _acked_. Disconnect is not commonly used, disallow it. The immediate problem syzbot run into is the warning in the strp, but that's just the easiest bug to trigger: WARNING: CPU: 0 PID: 5834 at net/tls/tls_strp.c:486 tls_strp_msg_load+0x72e/0xa80 net/tls/tls_strp.c:486 RIP: 0010:tls_strp_msg_load+0x72e/0xa80 net/tls/tls_strp.c:486 Call Trace: tls_rx_rec_wait+0x280/0xa60 net/tls/tls_sw.c:1363 tls_sw_recvmsg+0x85c/0x1c30 net/tls/tls_sw.c:2043 inet6_recvmsg+0x2c9/0x730 net/ipv6/af_inet6.c:678 sock_recvmsg_nosec net/socket.c:1023 [inline] sock_recvmsg+0x109/0x280 net/socket.c:1045 __sys_recvfrom+0x202/0x380 net/socket.c:2237 Fixes: 3c4d7559159b ("tls: kernel TLS support") Reported-by: syzbot+b4cd76826045a1eb93c1@syzkaller.appspotmail.com Signed-off-by: Jakub Kicinski Reviewed-by: Eric Dumazet Reviewed-by: Sabrina Dubroca Link: https://patch.msgid.link/20250404180334.3224206-1-kuba@kernel.org Signed-off-by: Paolo Abeni --- net/tls/tls_main.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index cb86b0bf9a53..a3ccb3135e51 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -852,6 +852,11 @@ static int tls_setsockopt(struct sock *sk, int level, int optname, return do_tls_setsockopt(sk, optname, optval, optlen); } +static int tls_disconnect(struct sock *sk, int flags) +{ + return -EOPNOTSUPP; +} + struct tls_context *tls_ctx_create(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); @@ -947,6 +952,7 @@ static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG], prot[TLS_BASE][TLS_BASE] = *base; prot[TLS_BASE][TLS_BASE].setsockopt = tls_setsockopt; prot[TLS_BASE][TLS_BASE].getsockopt = tls_getsockopt; + prot[TLS_BASE][TLS_BASE].disconnect = tls_disconnect; prot[TLS_BASE][TLS_BASE].close = tls_sk_proto_close; prot[TLS_SW][TLS_BASE] = prot[TLS_BASE][TLS_BASE]; -- 2.50.1 From a1328a671e1c93a3513c286a05ff0abe6698d891 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 4 Apr 2025 11:03:34 -0700 Subject: [PATCH 13/16] selftests: tls: check that disconnect does nothing "Inspired" by syzbot test, pre-queue some data, disconnect() and try to receive(). This used to trigger a warning in TLS's strp. Now we expect the disconnect() to have almost no effect. Link: https://lore.kernel.org/67e6be74.050a0220.2f068f.007e.GAE@google.com Signed-off-by: Jakub Kicinski Reviewed-by: Sabrina Dubroca Link: https://patch.msgid.link/20250404180334.3224206-2-kuba@kernel.org Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/tls.c | 36 +++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c index 9a85f93c33d8..5ded3b3a7538 100644 --- a/tools/testing/selftests/net/tls.c +++ b/tools/testing/selftests/net/tls.c @@ -1753,6 +1753,42 @@ TEST_F(tls_basic, rekey_tx) EXPECT_EQ(memcmp(buf, test_str, send_len), 0); } +TEST_F(tls_basic, disconnect) +{ + char const *test_str = "test_message"; + int send_len = strlen(test_str) + 1; + struct tls_crypto_info_keys key; + struct sockaddr_in addr; + char buf[20]; + int ret; + + if (self->notls) + return; + + tls_crypto_info_init(TLS_1_3_VERSION, TLS_CIPHER_AES_GCM_128, + &key, 0); + + ret = setsockopt(self->fd, SOL_TLS, TLS_TX, &key, key.len); + ASSERT_EQ(ret, 0); + + /* Pre-queue the data so that setsockopt parses it but doesn't + * dequeue it from the TCP socket. recvmsg would dequeue. + */ + EXPECT_EQ(send(self->fd, test_str, send_len, 0), send_len); + + ret = setsockopt(self->cfd, SOL_TLS, TLS_RX, &key, key.len); + ASSERT_EQ(ret, 0); + + addr.sin_family = AF_UNSPEC; + addr.sin_addr.s_addr = htonl(INADDR_ANY); + addr.sin_port = 0; + ret = connect(self->cfd, &addr, sizeof(addr)); + EXPECT_EQ(ret, -1); + EXPECT_EQ(errno, EOPNOTSUPP); + + EXPECT_EQ(recv(self->cfd, buf, send_len, 0), send_len); +} + TEST_F(tls, rekey) { char const *test_str_1 = "test_message_before_rekey"; -- 2.50.1 From b7db94734e785e380b0db0f9295e07024f4d42a0 Mon Sep 17 00:00:00 2001 From: Hariprasad Kelam Date: Mon, 7 Apr 2025 12:33:41 +0530 Subject: [PATCH 14/16] octeontx2-pf: qos: fix VF root node parent queue index The current code configures the Physical Function (PF) root node at TL1 and the Virtual Function (VF) root node at TL2. This ensure at any given point of time PF traffic gets more priority. PF root node TL1 / \ TL2 TL2 VF root node / \ TL3 TL3 / \ TL4 TL4 / \ SMQ SMQ Due to a bug in the current code, the TL2 parent queue index on the VF interface is not being configured, leading to 'SMQ Flush' errors Fixes: 5e6808b4c68d ("octeontx2-pf: Add support for HTB offload") Signed-off-by: Hariprasad Kelam Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250407070341.2765426-1-hkelam@marvell.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/marvell/octeontx2/nic/qos.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/qos.c b/drivers/net/ethernet/marvell/octeontx2/nic/qos.c index 0f844c14485a..35acc07bd964 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/qos.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/qos.c @@ -165,6 +165,11 @@ static void __otx2_qos_txschq_cfg(struct otx2_nic *pfvf, otx2_config_sched_shaping(pfvf, node, cfg, &num_regs); } else if (level == NIX_TXSCH_LVL_TL2) { + /* configure parent txschq */ + cfg->reg[num_regs] = NIX_AF_TL2X_PARENT(node->schq); + cfg->regval[num_regs] = (u64)hw->tx_link << 16; + num_regs++; + /* configure link cfg */ if (level == pfvf->qos.link_cfg_lvl) { cfg->reg[num_regs] = NIX_AF_TL3_TL2X_LINKX_CFG(node->schq, hw->tx_link); -- 2.50.1 From 13e7d7240a43d8ea528c12ae5a912be1ff7fa29b Mon Sep 17 00:00:00 2001 From: Jiawen Wu Date: Mon, 7 Apr 2025 18:33:22 +0800 Subject: [PATCH 15/16] net: libwx: Fix the wrong Rx descriptor field WX_RXD_IPV6EX was incorrectly defined in Rx ring descriptor. In fact, this field stores the 802.1ad ID from which the packet was received. The wrong definition caused the statistics rx_csum_offload_errors to fail to grow when receiving the 802.1ad packet with incorrect checksum. Fixes: ef4f3c19f912 ("net: wangxun: libwx add rx offload functions") Signed-off-by: Jiawen Wu Reviewed-by: Michal Kubiak Link: https://patch.msgid.link/20250407103322.273241-1-jiawenwu@trustnetic.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/wangxun/libwx/wx_lib.c | 3 ++- drivers/net/ethernet/wangxun/libwx/wx_type.h | 3 +-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/wangxun/libwx/wx_lib.c b/drivers/net/ethernet/wangxun/libwx/wx_lib.c index 00b0b318df27..6ebefa31ece1 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_lib.c +++ b/drivers/net/ethernet/wangxun/libwx/wx_lib.c @@ -546,7 +546,8 @@ static void wx_rx_checksum(struct wx_ring *ring, return; /* Hardware can't guarantee csum if IPv6 Dest Header found */ - if (dptype.prot != WX_DEC_PTYPE_PROT_SCTP && WX_RXD_IPV6EX(rx_desc)) + if (dptype.prot != WX_DEC_PTYPE_PROT_SCTP && + wx_test_staterr(rx_desc, WX_RXD_STAT_IPV6EX)) return; /* if L4 checksum error */ diff --git a/drivers/net/ethernet/wangxun/libwx/wx_type.h b/drivers/net/ethernet/wangxun/libwx/wx_type.h index 5b230ecbbabb..4c545b2aa997 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_type.h +++ b/drivers/net/ethernet/wangxun/libwx/wx_type.h @@ -513,6 +513,7 @@ enum WX_MSCA_CMD_value { #define WX_RXD_STAT_L4CS BIT(7) /* L4 xsum calculated */ #define WX_RXD_STAT_IPCS BIT(8) /* IP xsum calculated */ #define WX_RXD_STAT_OUTERIPCS BIT(10) /* Cloud IP xsum calculated*/ +#define WX_RXD_STAT_IPV6EX BIT(12) /* IPv6 Dest Header */ #define WX_RXD_STAT_TS BIT(14) /* IEEE1588 Time Stamp */ #define WX_RXD_ERR_OUTERIPER BIT(26) /* CRC IP Header error */ @@ -589,8 +590,6 @@ enum wx_l2_ptypes { #define WX_RXD_PKTTYPE(_rxd) \ ((le32_to_cpu((_rxd)->wb.lower.lo_dword.data) >> 9) & 0xFF) -#define WX_RXD_IPV6EX(_rxd) \ - ((le32_to_cpu((_rxd)->wb.lower.lo_dword.data) >> 6) & 0x1) /*********************** Transmit Descriptor Config Masks ****************/ #define WX_TXD_STAT_DD BIT(0) /* Descriptor Done */ #define WX_TXD_DTYP_DATA 0 /* Adv Data Descriptor */ -- 2.50.1 From 369609fc6272c2f6ad666ba4fd913f3baf32908f Mon Sep 17 00:00:00 2001 From: =?utf8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Mon, 7 Apr 2025 12:55:34 +0200 Subject: [PATCH 16/16] tc: Ensure we have enough buffer space when sending filter netlink notifications MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit The tfilter_notify() and tfilter_del_notify() functions assume that NLMSG_GOODSIZE is always enough to dump the filter chain. This is not always the case, which can lead to silent notify failures (because the return code of tfilter_notify() is not always checked). In particular, this can lead to NLM_F_ECHO not being honoured even though an action succeeds, which forces userspace to create workarounds[0]. Fix this by increasing the message size if dumping the filter chain into the allocated skb fails. Use the size of the incoming skb as a size hint if set, so we can start at a larger value when appropriate. To trigger this, run the following commands: # ip link add type veth # tc qdisc replace dev veth0 root handle 1: fq_codel # tc -echo filter add dev veth0 parent 1: u32 match u32 0 0 $(for i in $(seq 32); do echo action pedit munge ip dport set 22; done) Before this fix, tc just returns: Not a filter(cmd 2) After the fix, we get the correct echo: added filter dev veth0 parent 1: protocol all pref 49152 u32 chain 0 fh 800::800 order 2048 key ht 800 bkt 0 terminal flowid not_in_hw match 00000000/00000000 at 0 action order 1: pedit action pass keys 1 index 1 ref 1 bind 1 key #0 at 20: val 00000016 mask ffff0000 [repeated 32 times] [0] https://github.com/openvswitch/ovs/commit/106ef21860c935e5e0017a88bf42b94025c4e511 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: Frode Nordahl Closes: https://bugs.launchpad.net/ubuntu/+source/openvswitch/+bug/2018500 Signed-off-by: Toke Høiland-Jørgensen Reviewed-by: Jiri Pirko Link: https://patch.msgid.link/20250407105542.16601-1-toke@redhat.com Signed-off-by: Paolo Abeni --- net/sched/cls_api.c | 66 ++++++++++++++++++++++++++++++--------------- 1 file changed, 45 insertions(+), 21 deletions(-) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 4f648af8cfaa..ecec0a1e1c1a 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -2057,6 +2057,7 @@ static int tcf_fill_node(struct net *net, struct sk_buff *skb, struct tcmsg *tcm; struct nlmsghdr *nlh; unsigned char *b = skb_tail_pointer(skb); + int ret = -EMSGSIZE; nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags); if (!nlh) @@ -2101,11 +2102,45 @@ static int tcf_fill_node(struct net *net, struct sk_buff *skb, return skb->len; +cls_op_not_supp: + ret = -EOPNOTSUPP; out_nlmsg_trim: nla_put_failure: -cls_op_not_supp: nlmsg_trim(skb, b); - return -1; + return ret; +} + +static struct sk_buff *tfilter_notify_prep(struct net *net, + struct sk_buff *oskb, + struct nlmsghdr *n, + struct tcf_proto *tp, + struct tcf_block *block, + struct Qdisc *q, u32 parent, + void *fh, int event, + u32 portid, bool rtnl_held, + struct netlink_ext_ack *extack) +{ + unsigned int size = oskb ? max(NLMSG_GOODSIZE, oskb->len) : NLMSG_GOODSIZE; + struct sk_buff *skb; + int ret; + +retry: + skb = alloc_skb(size, GFP_KERNEL); + if (!skb) + return ERR_PTR(-ENOBUFS); + + ret = tcf_fill_node(net, skb, tp, block, q, parent, fh, portid, + n->nlmsg_seq, n->nlmsg_flags, event, false, + rtnl_held, extack); + if (ret <= 0) { + kfree_skb(skb); + if (ret == -EMSGSIZE) { + size += NLMSG_GOODSIZE; + goto retry; + } + return ERR_PTR(-EINVAL); + } + return skb; } static int tfilter_notify(struct net *net, struct sk_buff *oskb, @@ -2121,16 +2156,10 @@ static int tfilter_notify(struct net *net, struct sk_buff *oskb, if (!unicast && !rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC)) return 0; - skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); - if (!skb) - return -ENOBUFS; - - if (tcf_fill_node(net, skb, tp, block, q, parent, fh, portid, - n->nlmsg_seq, n->nlmsg_flags, event, - false, rtnl_held, extack) <= 0) { - kfree_skb(skb); - return -EINVAL; - } + skb = tfilter_notify_prep(net, oskb, n, tp, block, q, parent, fh, event, + portid, rtnl_held, extack); + if (IS_ERR(skb)) + return PTR_ERR(skb); if (unicast) err = rtnl_unicast(skb, net, portid); @@ -2153,16 +2182,11 @@ static int tfilter_del_notify(struct net *net, struct sk_buff *oskb, if (!rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC)) return tp->ops->delete(tp, fh, last, rtnl_held, extack); - skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); - if (!skb) - return -ENOBUFS; - - if (tcf_fill_node(net, skb, tp, block, q, parent, fh, portid, - n->nlmsg_seq, n->nlmsg_flags, RTM_DELTFILTER, - false, rtnl_held, extack) <= 0) { + skb = tfilter_notify_prep(net, oskb, n, tp, block, q, parent, fh, + RTM_DELTFILTER, portid, rtnl_held, extack); + if (IS_ERR(skb)) { NL_SET_ERR_MSG(extack, "Failed to build del event notification"); - kfree_skb(skb); - return -EINVAL; + return PTR_ERR(skb); } err = tp->ops->delete(tp, fh, last, rtnl_held, extack); -- 2.50.1