From 625bb8a9e100d5e4d268d947911fe7b3fbcbb12c Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sun, 13 Oct 2024 21:38:30 +0100 Subject: [PATCH 01/16] cxgb4: Remove unused cxgb4_l2t_alloc_switching cxgb4_l2t_alloc_switching() has been unused since it was added in commit f7502659cec8 ("cxgb4: Add API to alloc l2t entry; also update existing ones") Remove it. Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Kalesh AP Link: https://patch.msgid.link/20241013203831.88051-6-linux@treblig.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/chelsio/cxgb4/l2t.c | 19 ------------------- drivers/net/ethernet/chelsio/cxgb4/l2t.h | 2 -- 2 files changed, 21 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/l2t.c b/drivers/net/ethernet/chelsio/cxgb4/l2t.c index 1e5f5b1a22a6..c02b4e9c06b2 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/l2t.c +++ b/drivers/net/ethernet/chelsio/cxgb4/l2t.c @@ -608,25 +608,6 @@ struct l2t_entry *t4_l2t_alloc_switching(struct adapter *adap, u16 vlan, return e; } -/** - * cxgb4_l2t_alloc_switching - Allocates an L2T entry for switch filters - * @dev: net_device pointer - * @vlan: VLAN Id - * @port: Associated port - * @dmac: Destination MAC address to add to L2T - * Returns pointer to the allocated l2t entry - * - * Allocates an L2T entry for use by switching rule of a filter - */ -struct l2t_entry *cxgb4_l2t_alloc_switching(struct net_device *dev, u16 vlan, - u8 port, u8 *dmac) -{ - struct adapter *adap = netdev2adap(dev); - - return t4_l2t_alloc_switching(adap, vlan, port, dmac); -} -EXPORT_SYMBOL(cxgb4_l2t_alloc_switching); - struct l2t_data *t4_init_l2t(unsigned int l2t_start, unsigned int l2t_end) { unsigned int l2t_size; diff --git a/drivers/net/ethernet/chelsio/cxgb4/l2t.h b/drivers/net/ethernet/chelsio/cxgb4/l2t.h index 340fecb28a13..8aad7e9dee6d 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/l2t.h +++ b/drivers/net/ethernet/chelsio/cxgb4/l2t.h @@ -115,8 +115,6 @@ struct l2t_entry *cxgb4_l2t_get(struct l2t_data *d, struct neighbour *neigh, unsigned int priority); u64 cxgb4_select_ntuple(struct net_device *dev, const struct l2t_entry *l2t); -struct l2t_entry *cxgb4_l2t_alloc_switching(struct net_device *dev, u16 vlan, - u8 port, u8 *dmac); void t4_l2t_update(struct adapter *adap, struct neighbour *neigh); struct l2t_entry *t4_l2t_alloc_switching(struct adapter *adap, u16 vlan, u8 port, u8 *dmac); -- 2.50.1 From 73929750f2361a24e6956441e29f67ec58ecec8e Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sun, 13 Oct 2024 21:38:31 +0100 Subject: [PATCH 02/16] cxgb4: Remove unused t4_free_ofld_rxqs t4_free_ofld_rxqs() has been unused since commit 0fbc81b3ad51 ("chcr/cxgb4i/cxgbit/RDMA/cxgb4: Allocate resources dynamically for all cxgb4 ULD's") Remove it. Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Kalesh AP Link: https://patch.msgid.link/20241013203831.88051-7-linux@treblig.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/chelsio/cxgb4/cxgb4.h | 1 - drivers/net/ethernet/chelsio/cxgb4/sge.c | 16 ---------------- 2 files changed, 17 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h index 1c302dfd6503..75bd69ff61a8 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h @@ -1608,7 +1608,6 @@ void t4_os_portmod_changed(struct adapter *adap, int port_id); void t4_os_link_changed(struct adapter *adap, int port_id, int link_stat); void t4_free_sge_resources(struct adapter *adap); -void t4_free_ofld_rxqs(struct adapter *adap, int n, struct sge_ofld_rxq *q); irq_handler_t t4_intr_handler(struct adapter *adap); netdev_tx_t t4_start_xmit(struct sk_buff *skb, struct net_device *dev); int cxgb4_selftest_lb_pkt(struct net_device *netdev); diff --git a/drivers/net/ethernet/chelsio/cxgb4/sge.c b/drivers/net/ethernet/chelsio/cxgb4/sge.c index de52bcb884c4..a7d76a8ed050 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/sge.c +++ b/drivers/net/ethernet/chelsio/cxgb4/sge.c @@ -4874,22 +4874,6 @@ void free_rspq_fl(struct adapter *adap, struct sge_rspq *rq, } } -/** - * t4_free_ofld_rxqs - free a block of consecutive Rx queues - * @adap: the adapter - * @n: number of queues - * @q: pointer to first queue - * - * Release the resources of a consecutive block of offload Rx queues. - */ -void t4_free_ofld_rxqs(struct adapter *adap, int n, struct sge_ofld_rxq *q) -{ - for ( ; n; n--, q++) - if (q->rspq.desc) - free_rspq_fl(adap, &q->rspq, - q->fl.size ? &q->fl : NULL); -} - void t4_sge_free_ethofld_txq(struct adapter *adap, struct sge_eohw_txq *txq) { if (txq->q.desc) { -- 2.50.1 From 068f3b34c5c2be5fe7923a9966c1c16f992a2f9c Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sun, 13 Oct 2024 02:29:46 +0100 Subject: [PATCH 03/16] net: cxgb3: Remove stid deadcode cxgb3_alloc_stid() and cxgb3_free_stid() have been unused since commit 30e0f6cf5acb ("RDMA/iw_cxgb3: Remove the iw_cxgb3 module from kernel") Remove them. Signed-off-by: Dr. David Alan Gilbert Link: https://patch.msgid.link/20241013012946.284721-1-linux@treblig.org Signed-off-by: Jakub Kicinski --- .../ethernet/chelsio/cxgb3/cxgb3_offload.c | 39 ------------------- .../ethernet/chelsio/cxgb3/cxgb3_offload.h | 3 -- 2 files changed, 42 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_offload.c b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_offload.c index 89256b866840..5a9f6925e1fa 100644 --- a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_offload.c +++ b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_offload.c @@ -515,23 +515,6 @@ void *cxgb3_free_atid(struct t3cdev *tdev, int atid) EXPORT_SYMBOL(cxgb3_free_atid); -/* - * Free a server TID and return it to the free pool. - */ -void cxgb3_free_stid(struct t3cdev *tdev, int stid) -{ - struct tid_info *t = &(T3C_DATA(tdev))->tid_maps; - union listen_entry *p = stid2entry(t, stid); - - spin_lock_bh(&t->stid_lock); - p->next = t->sfree; - t->sfree = p; - t->stids_in_use--; - spin_unlock_bh(&t->stid_lock); -} - -EXPORT_SYMBOL(cxgb3_free_stid); - void cxgb3_insert_tid(struct t3cdev *tdev, struct cxgb3_client *client, void *ctx, unsigned int tid) { @@ -671,28 +654,6 @@ int cxgb3_alloc_atid(struct t3cdev *tdev, struct cxgb3_client *client, EXPORT_SYMBOL(cxgb3_alloc_atid); -int cxgb3_alloc_stid(struct t3cdev *tdev, struct cxgb3_client *client, - void *ctx) -{ - int stid = -1; - struct tid_info *t = &(T3C_DATA(tdev))->tid_maps; - - spin_lock_bh(&t->stid_lock); - if (t->sfree) { - union listen_entry *p = t->sfree; - - stid = (p - t->stid_tab) + t->stid_base; - t->sfree = p->next; - p->t3c_tid.ctx = ctx; - p->t3c_tid.client = client; - t->stids_in_use++; - } - spin_unlock_bh(&t->stid_lock); - return stid; -} - -EXPORT_SYMBOL(cxgb3_alloc_stid); - /* Get the t3cdev associated with a net_device */ struct t3cdev *dev2t3cdev(struct net_device *dev) { diff --git a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_offload.h b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_offload.h index 929c298115ca..7419824f9926 100644 --- a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_offload.h +++ b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_offload.h @@ -95,10 +95,7 @@ struct cxgb3_client { */ int cxgb3_alloc_atid(struct t3cdev *dev, struct cxgb3_client *client, void *ctx); -int cxgb3_alloc_stid(struct t3cdev *dev, struct cxgb3_client *client, - void *ctx); void *cxgb3_free_atid(struct t3cdev *dev, int atid); -void cxgb3_free_stid(struct t3cdev *dev, int stid); void cxgb3_insert_tid(struct t3cdev *dev, struct cxgb3_client *client, void *ctx, unsigned int tid); void cxgb3_queue_tid_release(struct t3cdev *dev, unsigned int tid); -- 2.50.1 From 95b3120a485f77a9bb8060bf3398311e3dcb6c65 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 14 Oct 2024 16:52:16 -0700 Subject: [PATCH 04/16] neighbour: Remove NEIGH_DN_TABLE. Since commit 1202cdd66531 ("Remove DECnet support from kernel"), NEIGH_DN_TABLE is no longer used. MPLS has implicit dependency on it in nla_put_via(), but nla_get_via() does not support DECnet. Let's remove NEIGH_DN_TABLE. Now, neigh_tables[] has only 2 elements and no extra iteration for DECnet in many places. Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20241014235216.10785-1-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/neighbour.h | 1 - net/mpls/af_mpls.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/include/net/neighbour.h b/include/net/neighbour.h index a44f262a7384..3887ed9e5026 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -239,7 +239,6 @@ struct neigh_table { enum { NEIGH_ARP_TABLE = 0, NEIGH_ND_TABLE = 1, - NEIGH_DN_TABLE = 2, NEIGH_NR_TABLES, NEIGH_LINK_TABLE = NEIGH_NR_TABLES /* Pseudo table for neigh_xmit */ }; diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index df62638b6498..a0573847bc55 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -1664,7 +1664,7 @@ static int nla_put_via(struct sk_buff *skb, u8 table, const void *addr, int alen) { static const int table_to_family[NEIGH_NR_TABLES + 1] = { - AF_INET, AF_INET6, AF_DECnet, AF_PACKET, + AF_INET, AF_INET6, AF_PACKET, }; struct nlattr *nla; struct rtvia *via; -- 2.50.1 From 397006ba5d918f9b74e734867e8fddbc36dc2282 Mon Sep 17 00:00:00 2001 From: Elena Salomatkina Date: Sun, 13 Oct 2024 15:45:29 +0300 Subject: [PATCH 05/16] net/sched: cbs: Fix integer overflow in cbs_set_port_rate() The subsequent calculation of port_rate = speed * 1000 * BYTES_PER_KBIT, where the BYTES_PER_KBIT is of type LL, may cause an overflow. At least when speed = SPEED_20000, the expression to the left of port_rate will be greater than INT_MAX. Found by Linux Verification Center (linuxtesting.org) with SVACE. Signed-off-by: Elena Salomatkina Link: https://patch.msgid.link/20241013124529.1043-1-esalomatkina@ispras.ru Signed-off-by: Jakub Kicinski --- net/sched/sch_cbs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c index 939425da1895..8c9a0400c862 100644 --- a/net/sched/sch_cbs.c +++ b/net/sched/sch_cbs.c @@ -310,7 +310,7 @@ static void cbs_set_port_rate(struct net_device *dev, struct cbs_sched_data *q) { struct ethtool_link_ksettings ecmd; int speed = SPEED_10; - int port_rate; + s64 port_rate; int err; err = __ethtool_get_link_ksettings(dev, &ecmd); -- 2.50.1 From 46f2a11cb82b657fd15bab1c47821b635e03838b Mon Sep 17 00:00:00 2001 From: Ignat Korchagin Date: Mon, 14 Oct 2024 16:38:00 +0100 Subject: [PATCH 06/16] af_packet: avoid erroring out after sock_init_data() in packet_create() After sock_init_data() the allocated sk object is attached to the provided sock object. On error, packet_create() frees the sk object leaving the dangling pointer in the sock object on return. Some other code may try to use this pointer and cause use-after-free. Suggested-by: Eric Dumazet Signed-off-by: Ignat Korchagin Reviewed-by: Kuniyuki Iwashima Reviewed-by: Willem de Bruijn Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241014153808.51894-2-ignat@cloudflare.com Signed-off-by: Jakub Kicinski --- net/packet/af_packet.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 2ff4b251842d..886c0dd47b66 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -3422,17 +3422,17 @@ static int packet_create(struct net *net, struct socket *sock, int protocol, if (sock->type == SOCK_PACKET) sock->ops = &packet_ops_spkt; + po = pkt_sk(sk); + err = packet_alloc_pending(po); + if (err) + goto out_sk_free; + sock_init_data(sock, sk); - po = pkt_sk(sk); init_completion(&po->skb_completion); sk->sk_family = PF_PACKET; po->num = proto; - err = packet_alloc_pending(po); - if (err) - goto out2; - packet_cached_dev_reset(po); sk->sk_destruct = packet_sock_destruct; @@ -3464,7 +3464,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol, sock_prot_inuse_add(net, &packet_proto, 1); return 0; -out2: +out_sk_free: sk_free(sk); out: return err; -- 2.50.1 From 7c4f78cdb8e7501e9f92d291a7d956591bf73be9 Mon Sep 17 00:00:00 2001 From: Ignat Korchagin Date: Mon, 14 Oct 2024 16:38:01 +0100 Subject: [PATCH 07/16] Bluetooth: L2CAP: do not leave dangling sk pointer on error in l2cap_sock_create() bt_sock_alloc() allocates the sk object and attaches it to the provided sock object. On error l2cap_sock_alloc() frees the sk object, but the dangling pointer is still attached to the sock object, which may create use-after-free in other code. Signed-off-by: Ignat Korchagin Reviewed-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241014153808.51894-3-ignat@cloudflare.com Signed-off-by: Jakub Kicinski --- net/bluetooth/l2cap_sock.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index ba437c6f6ee5..18e89e764f3b 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -1886,6 +1886,7 @@ static struct sock *l2cap_sock_alloc(struct net *net, struct socket *sock, chan = l2cap_chan_create(); if (!chan) { sk_free(sk); + sock->sk = NULL; return NULL; } -- 2.50.1 From 3945c799f12b8d1f49a3b48369ca494d981ac465 Mon Sep 17 00:00:00 2001 From: Ignat Korchagin Date: Mon, 14 Oct 2024 16:38:02 +0100 Subject: [PATCH 08/16] Bluetooth: RFCOMM: avoid leaving dangling sk pointer in rfcomm_sock_alloc() bt_sock_alloc() attaches allocated sk object to the provided sock object. If rfcomm_dlc_alloc() fails, we release the sk object, but leave the dangling pointer in the sock object, which may cause use-after-free. Fix this by swapping calls to bt_sock_alloc() and rfcomm_dlc_alloc(). Signed-off-by: Ignat Korchagin Reviewed-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241014153808.51894-4-ignat@cloudflare.com Signed-off-by: Jakub Kicinski --- net/bluetooth/rfcomm/sock.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index f48250e3f2e1..355e1a1698f5 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -274,13 +274,13 @@ static struct sock *rfcomm_sock_alloc(struct net *net, struct socket *sock, struct rfcomm_dlc *d; struct sock *sk; - sk = bt_sock_alloc(net, sock, &rfcomm_proto, proto, prio, kern); - if (!sk) + d = rfcomm_dlc_alloc(prio); + if (!d) return NULL; - d = rfcomm_dlc_alloc(prio); - if (!d) { - sk_free(sk); + sk = bt_sock_alloc(net, sock, &rfcomm_proto, proto, prio, kern); + if (!sk) { + rfcomm_dlc_free(d); return NULL; } -- 2.50.1 From 811a7ca7320c062e15d0f5b171fe6ad8592d1434 Mon Sep 17 00:00:00 2001 From: Ignat Korchagin Date: Mon, 14 Oct 2024 16:38:03 +0100 Subject: [PATCH 09/16] net: af_can: do not leave a dangling sk pointer in can_create() On error can_create() frees the allocated sk object, but sock_init_data() has already attached it to the provided sock object. This will leave a dangling sk pointer in the sock object and may cause use-after-free later. Signed-off-by: Ignat Korchagin Reviewed-by: Vincent Mailhol Reviewed-by: Kuniyuki Iwashima Reviewed-by: Marc Kleine-Budde Link: https://patch.msgid.link/20241014153808.51894-5-ignat@cloudflare.com Signed-off-by: Jakub Kicinski --- net/can/af_can.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/can/af_can.c b/net/can/af_can.c index 707576eeeb58..01f3fbb3b67d 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -171,6 +171,7 @@ static int can_create(struct net *net, struct socket *sock, int protocol, /* release sk on errors */ sock_orphan(sk); sock_put(sk); + sock->sk = NULL; } errout: -- 2.50.1 From b4fcd63f6ef79c73cafae8cf4a114def5fc3d80d Mon Sep 17 00:00:00 2001 From: Ignat Korchagin Date: Mon, 14 Oct 2024 16:38:04 +0100 Subject: [PATCH 10/16] net: ieee802154: do not leave a dangling sk pointer in ieee802154_create() sock_init_data() attaches the allocated sk object to the provided sock object. If ieee802154_create() fails later, the allocated sk object is freed, but the dangling pointer remains in the provided sock object, which may allow use-after-free. Clear the sk pointer in the sock object on error. Signed-off-by: Ignat Korchagin Reviewed-by: Miquel Raynal Reviewed-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241014153808.51894-6-ignat@cloudflare.com Signed-off-by: Jakub Kicinski --- net/ieee802154/socket.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c index 990a83455dcf..18d267921bb5 100644 --- a/net/ieee802154/socket.c +++ b/net/ieee802154/socket.c @@ -1043,19 +1043,21 @@ static int ieee802154_create(struct net *net, struct socket *sock, if (sk->sk_prot->hash) { rc = sk->sk_prot->hash(sk); - if (rc) { - sk_common_release(sk); - goto out; - } + if (rc) + goto out_sk_release; } if (sk->sk_prot->init) { rc = sk->sk_prot->init(sk); if (rc) - sk_common_release(sk); + goto out_sk_release; } out: return rc; +out_sk_release: + sk_common_release(sk); + sock->sk = NULL; + goto out; } static const struct net_proto_family ieee802154_family_ops = { -- 2.50.1 From 9365fa510c6f82e3aa550a09d0c5c6b44dbc78ff Mon Sep 17 00:00:00 2001 From: Ignat Korchagin Date: Mon, 14 Oct 2024 16:38:05 +0100 Subject: [PATCH 11/16] net: inet: do not leave a dangling sk pointer in inet_create() sock_init_data() attaches the allocated sk object to the provided sock object. If inet_create() fails later, the sk object is freed, but the sock object retains the dangling pointer, which may create use-after-free later. Clear the sk pointer in the sock object on error. Signed-off-by: Ignat Korchagin Reviewed-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241014153808.51894-7-ignat@cloudflare.com Signed-off-by: Jakub Kicinski --- net/ipv4/af_inet.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index b24d74616637..8095e82de808 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -376,32 +376,30 @@ lookup_protocol: inet->inet_sport = htons(inet->inet_num); /* Add to protocol hash chains. */ err = sk->sk_prot->hash(sk); - if (err) { - sk_common_release(sk); - goto out; - } + if (err) + goto out_sk_release; } if (sk->sk_prot->init) { err = sk->sk_prot->init(sk); - if (err) { - sk_common_release(sk); - goto out; - } + if (err) + goto out_sk_release; } if (!kern) { err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk); - if (err) { - sk_common_release(sk); - goto out; - } + if (err) + goto out_sk_release; } out: return err; out_rcu_unlock: rcu_read_unlock(); goto out; +out_sk_release: + sk_common_release(sk); + sock->sk = NULL; + goto out; } -- 2.50.1 From 9df99c395d0f55fb444ef39f4d6f194ca437d884 Mon Sep 17 00:00:00 2001 From: Ignat Korchagin Date: Mon, 14 Oct 2024 16:38:06 +0100 Subject: [PATCH 12/16] net: inet6: do not leave a dangling sk pointer in inet6_create() sock_init_data() attaches the allocated sk pointer to the provided sock object. If inet6_create() fails later, the sk object is released, but the sock object retains the dangling sk pointer, which may cause use-after-free later. Clear the sock sk pointer on error. Signed-off-by: Ignat Korchagin Reviewed-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241014153808.51894-8-ignat@cloudflare.com Signed-off-by: Jakub Kicinski --- net/ipv6/af_inet6.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index ba69b86f1c7d..f60ec8b0f8ea 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -252,31 +252,29 @@ lookup_protocol: */ inet->inet_sport = htons(inet->inet_num); err = sk->sk_prot->hash(sk); - if (err) { - sk_common_release(sk); - goto out; - } + if (err) + goto out_sk_release; } if (sk->sk_prot->init) { err = sk->sk_prot->init(sk); - if (err) { - sk_common_release(sk); - goto out; - } + if (err) + goto out_sk_release; } if (!kern) { err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk); - if (err) { - sk_common_release(sk); - goto out; - } + if (err) + goto out_sk_release; } out: return err; out_rcu_unlock: rcu_read_unlock(); goto out; +out_sk_release: + sk_common_release(sk); + sock->sk = NULL; + goto out; } static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, -- 2.50.1 From 48156296a08c615a6baae514096c4b2e543d1157 Mon Sep 17 00:00:00 2001 From: Ignat Korchagin Date: Mon, 14 Oct 2024 16:38:07 +0100 Subject: [PATCH 13/16] net: warn, if pf->create does not clear sock->sk on error All pf->create implementations have been fixed now to clear sock->sk on error, when they deallocate the allocated sk object. Put a warning in place to make sure we don't break this promise in the future. Suggested-by: Kuniyuki Iwashima Signed-off-by: Ignat Korchagin Reviewed-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241014153808.51894-9-ignat@cloudflare.com Signed-off-by: Jakub Kicinski --- net/socket.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/socket.c b/net/socket.c index 24b404299015..9a8e4452b9b2 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1576,9 +1576,9 @@ int __sock_create(struct net *net, int family, int type, int protocol, err = pf->create(net, sock, protocol, kern); if (err < 0) { /* ->create should release the allocated sock->sk object on error - * but it may leave the dangling pointer + * and make sure sock->sk is set to NULL to avoid use-after-free */ - sock->sk = NULL; + DEBUG_NET_WARN_ON_ONCE(sock->sk); goto out_module_put; } -- 2.50.1 From 18429e6e0c2ad26250862a786964d8c73400d9a0 Mon Sep 17 00:00:00 2001 From: Ignat Korchagin Date: Mon, 14 Oct 2024 16:38:08 +0100 Subject: [PATCH 14/16] Revert "net: do not leave a dangling sk pointer, when socket creation fails" This reverts commit 6cd4a78d962bebbaf8beb7d2ead3f34120e3f7b2. inet/inet6->create() implementations have been fixed to explicitly NULL the allocated sk object on error. A warning was put in place to make sure any future changes will not leave a dangling pointer in pf->create() implementations. So this code is now redundant. Suggested-by: Kuniyuki Iwashima Signed-off-by: Ignat Korchagin Reviewed-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241014153808.51894-10-ignat@cloudflare.com Signed-off-by: Jakub Kicinski --- net/core/sock.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/net/core/sock.c b/net/core/sock.c index f8c0d4eda888..756f8e8e0ac7 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3827,9 +3827,6 @@ void sk_common_release(struct sock *sk) sk->sk_prot->unhash(sk); - if (sk->sk_socket) - sk->sk_socket->sk = NULL; - /* * In this point socket cannot receive new packets, but it is possible * that some packets are in flight because some CPU runs receiver and -- 2.50.1 From 93c68f1275f9e21ccfed9ee292aedb11c3f6241b Mon Sep 17 00:00:00 2001 From: Harshitha Ramamurthy Date: Mon, 14 Oct 2024 13:21:06 -0700 Subject: [PATCH 15/16] gve: move DQO rx buffer management related code to a new file In preparation for the upcoming page pool adoption for DQO raw addressing mode, move RX buffer management code to a new file. In the follow on patches, page pool code will be added to this file. No functional change, just movement of code. Reviewed-by: Praveen Kaligineedi Reviewed-by: Shailend Chand Reviewed-by: Willem de Bruijn Signed-off-by: Harshitha Ramamurthy Reviewed-by: Jacob Keller Link: https://patch.msgid.link/20241014202108.1051963-2-pkaligineedi@google.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/google/gve/Makefile | 3 +- drivers/net/ethernet/google/gve/gve.h | 18 ++ .../ethernet/google/gve/gve_buffer_mgmt_dqo.c | 230 ++++++++++++++++++ drivers/net/ethernet/google/gve/gve_rx_dqo.c | 225 ----------------- 4 files changed, 250 insertions(+), 226 deletions(-) create mode 100644 drivers/net/ethernet/google/gve/gve_buffer_mgmt_dqo.c diff --git a/drivers/net/ethernet/google/gve/Makefile b/drivers/net/ethernet/google/gve/Makefile index 9ed07080b38a..4520f1c07a63 100644 --- a/drivers/net/ethernet/google/gve/Makefile +++ b/drivers/net/ethernet/google/gve/Makefile @@ -1,4 +1,5 @@ # Makefile for the Google virtual Ethernet (gve) driver obj-$(CONFIG_GVE) += gve.o -gve-objs := gve_main.o gve_tx.o gve_tx_dqo.o gve_rx.o gve_rx_dqo.o gve_ethtool.o gve_adminq.o gve_utils.o gve_flow_rule.o +gve-objs := gve_main.o gve_tx.o gve_tx_dqo.o gve_rx.o gve_rx_dqo.o gve_ethtool.o gve_adminq.o gve_utils.o gve_flow_rule.o \ + gve_buffer_mgmt_dqo.o diff --git a/drivers/net/ethernet/google/gve/gve.h b/drivers/net/ethernet/google/gve/gve.h index 301fa1ea4f51..bd684c7d996a 100644 --- a/drivers/net/ethernet/google/gve/gve.h +++ b/drivers/net/ethernet/google/gve/gve.h @@ -1162,6 +1162,24 @@ void gve_rx_stop_ring_gqi(struct gve_priv *priv, int idx); u16 gve_get_pkt_buf_size(const struct gve_priv *priv, bool enable_hplit); bool gve_header_split_supported(const struct gve_priv *priv); int gve_set_hsplit_config(struct gve_priv *priv, u8 tcp_data_split); +/* rx buffer handling */ +int gve_buf_ref_cnt(struct gve_rx_buf_state_dqo *bs); +void gve_free_page_dqo(struct gve_priv *priv, struct gve_rx_buf_state_dqo *bs, + bool free_page); +struct gve_rx_buf_state_dqo *gve_alloc_buf_state(struct gve_rx_ring *rx); +bool gve_buf_state_is_allocated(struct gve_rx_ring *rx, + struct gve_rx_buf_state_dqo *buf_state); +void gve_free_buf_state(struct gve_rx_ring *rx, + struct gve_rx_buf_state_dqo *buf_state); +struct gve_rx_buf_state_dqo *gve_dequeue_buf_state(struct gve_rx_ring *rx, + struct gve_index_list *list); +void gve_enqueue_buf_state(struct gve_rx_ring *rx, struct gve_index_list *list, + struct gve_rx_buf_state_dqo *buf_state); +struct gve_rx_buf_state_dqo *gve_get_recycled_buf_state(struct gve_rx_ring *rx); +int gve_alloc_page_dqo(struct gve_rx_ring *rx, + struct gve_rx_buf_state_dqo *buf_state); +void gve_try_recycle_buf(struct gve_priv *priv, struct gve_rx_ring *rx, + struct gve_rx_buf_state_dqo *buf_state); /* Reset */ void gve_schedule_reset(struct gve_priv *priv); int gve_reset(struct gve_priv *priv, bool attempt_teardown); diff --git a/drivers/net/ethernet/google/gve/gve_buffer_mgmt_dqo.c b/drivers/net/ethernet/google/gve/gve_buffer_mgmt_dqo.c new file mode 100644 index 000000000000..8e50f0e4bb2e --- /dev/null +++ b/drivers/net/ethernet/google/gve/gve_buffer_mgmt_dqo.c @@ -0,0 +1,230 @@ +// SPDX-License-Identifier: (GPL-2.0 OR MIT) +/* Google virtual Ethernet (gve) driver + * + * Copyright (C) 2015-2024 Google, Inc. + */ + +#include "gve.h" +#include "gve_utils.h" + +int gve_buf_ref_cnt(struct gve_rx_buf_state_dqo *bs) +{ + return page_count(bs->page_info.page) - bs->page_info.pagecnt_bias; +} + +void gve_free_page_dqo(struct gve_priv *priv, struct gve_rx_buf_state_dqo *bs, + bool free_page) +{ + page_ref_sub(bs->page_info.page, bs->page_info.pagecnt_bias - 1); + if (free_page) + gve_free_page(&priv->pdev->dev, bs->page_info.page, bs->addr, + DMA_FROM_DEVICE); + bs->page_info.page = NULL; +} + +struct gve_rx_buf_state_dqo *gve_alloc_buf_state(struct gve_rx_ring *rx) +{ + struct gve_rx_buf_state_dqo *buf_state; + s16 buffer_id; + + buffer_id = rx->dqo.free_buf_states; + if (unlikely(buffer_id == -1)) + return NULL; + + buf_state = &rx->dqo.buf_states[buffer_id]; + + /* Remove buf_state from free list */ + rx->dqo.free_buf_states = buf_state->next; + + /* Point buf_state to itself to mark it as allocated */ + buf_state->next = buffer_id; + + return buf_state; +} + +bool gve_buf_state_is_allocated(struct gve_rx_ring *rx, + struct gve_rx_buf_state_dqo *buf_state) +{ + s16 buffer_id = buf_state - rx->dqo.buf_states; + + return buf_state->next == buffer_id; +} + +void gve_free_buf_state(struct gve_rx_ring *rx, + struct gve_rx_buf_state_dqo *buf_state) +{ + s16 buffer_id = buf_state - rx->dqo.buf_states; + + buf_state->next = rx->dqo.free_buf_states; + rx->dqo.free_buf_states = buffer_id; +} + +struct gve_rx_buf_state_dqo *gve_dequeue_buf_state(struct gve_rx_ring *rx, + struct gve_index_list *list) +{ + struct gve_rx_buf_state_dqo *buf_state; + s16 buffer_id; + + buffer_id = list->head; + if (unlikely(buffer_id == -1)) + return NULL; + + buf_state = &rx->dqo.buf_states[buffer_id]; + + /* Remove buf_state from list */ + list->head = buf_state->next; + if (buf_state->next == -1) + list->tail = -1; + + /* Point buf_state to itself to mark it as allocated */ + buf_state->next = buffer_id; + + return buf_state; +} + +void gve_enqueue_buf_state(struct gve_rx_ring *rx, struct gve_index_list *list, + struct gve_rx_buf_state_dqo *buf_state) +{ + s16 buffer_id = buf_state - rx->dqo.buf_states; + + buf_state->next = -1; + + if (list->head == -1) { + list->head = buffer_id; + list->tail = buffer_id; + } else { + int tail = list->tail; + + rx->dqo.buf_states[tail].next = buffer_id; + list->tail = buffer_id; + } +} + +struct gve_rx_buf_state_dqo *gve_get_recycled_buf_state(struct gve_rx_ring *rx) +{ + struct gve_rx_buf_state_dqo *buf_state; + int i; + + /* Recycled buf states are immediately usable. */ + buf_state = gve_dequeue_buf_state(rx, &rx->dqo.recycled_buf_states); + if (likely(buf_state)) + return buf_state; + + if (unlikely(rx->dqo.used_buf_states.head == -1)) + return NULL; + + /* Used buf states are only usable when ref count reaches 0, which means + * no SKBs refer to them. + * + * Search a limited number before giving up. + */ + for (i = 0; i < 5; i++) { + buf_state = gve_dequeue_buf_state(rx, &rx->dqo.used_buf_states); + if (gve_buf_ref_cnt(buf_state) == 0) { + rx->dqo.used_buf_states_cnt--; + return buf_state; + } + + gve_enqueue_buf_state(rx, &rx->dqo.used_buf_states, buf_state); + } + + /* For QPL, we cannot allocate any new buffers and must + * wait for the existing ones to be available. + */ + if (rx->dqo.qpl) + return NULL; + + /* If there are no free buf states discard an entry from + * `used_buf_states` so it can be used. + */ + if (unlikely(rx->dqo.free_buf_states == -1)) { + buf_state = gve_dequeue_buf_state(rx, &rx->dqo.used_buf_states); + if (gve_buf_ref_cnt(buf_state) == 0) + return buf_state; + + gve_free_page_dqo(rx->gve, buf_state, true); + gve_free_buf_state(rx, buf_state); + } + + return NULL; +} + +int gve_alloc_page_dqo(struct gve_rx_ring *rx, + struct gve_rx_buf_state_dqo *buf_state) +{ + struct gve_priv *priv = rx->gve; + u32 idx; + + if (!rx->dqo.qpl) { + int err; + + err = gve_alloc_page(priv, &priv->pdev->dev, + &buf_state->page_info.page, + &buf_state->addr, + DMA_FROM_DEVICE, GFP_ATOMIC); + if (err) + return err; + } else { + idx = rx->dqo.next_qpl_page_idx; + if (idx >= gve_get_rx_pages_per_qpl_dqo(priv->rx_desc_cnt)) { + net_err_ratelimited("%s: Out of QPL pages\n", + priv->dev->name); + return -ENOMEM; + } + buf_state->page_info.page = rx->dqo.qpl->pages[idx]; + buf_state->addr = rx->dqo.qpl->page_buses[idx]; + rx->dqo.next_qpl_page_idx++; + } + buf_state->page_info.page_offset = 0; + buf_state->page_info.page_address = + page_address(buf_state->page_info.page); + buf_state->last_single_ref_offset = 0; + + /* The page already has 1 ref. */ + page_ref_add(buf_state->page_info.page, INT_MAX - 1); + buf_state->page_info.pagecnt_bias = INT_MAX; + + return 0; +} + +void gve_try_recycle_buf(struct gve_priv *priv, struct gve_rx_ring *rx, + struct gve_rx_buf_state_dqo *buf_state) +{ + const u16 data_buffer_size = priv->data_buffer_size_dqo; + int pagecount; + + /* Can't reuse if we only fit one buffer per page */ + if (data_buffer_size * 2 > PAGE_SIZE) + goto mark_used; + + pagecount = gve_buf_ref_cnt(buf_state); + + /* Record the offset when we have a single remaining reference. + * + * When this happens, we know all of the other offsets of the page are + * usable. + */ + if (pagecount == 1) { + buf_state->last_single_ref_offset = + buf_state->page_info.page_offset; + } + + /* Use the next buffer sized chunk in the page. */ + buf_state->page_info.page_offset += data_buffer_size; + buf_state->page_info.page_offset &= (PAGE_SIZE - 1); + + /* If we wrap around to the same offset without ever dropping to 1 + * reference, then we don't know if this offset was ever freed. + */ + if (buf_state->page_info.page_offset == + buf_state->last_single_ref_offset) { + goto mark_used; + } + + gve_enqueue_buf_state(rx, &rx->dqo.recycled_buf_states, buf_state); + return; + +mark_used: + gve_enqueue_buf_state(rx, &rx->dqo.used_buf_states, buf_state); + rx->dqo.used_buf_states_cnt++; +} diff --git a/drivers/net/ethernet/google/gve/gve_rx_dqo.c b/drivers/net/ethernet/google/gve/gve_rx_dqo.c index 1154c1d8f66f..b343be2fb118 100644 --- a/drivers/net/ethernet/google/gve/gve_rx_dqo.c +++ b/drivers/net/ethernet/google/gve/gve_rx_dqo.c @@ -16,189 +16,6 @@ #include #include -static int gve_buf_ref_cnt(struct gve_rx_buf_state_dqo *bs) -{ - return page_count(bs->page_info.page) - bs->page_info.pagecnt_bias; -} - -static void gve_free_page_dqo(struct gve_priv *priv, - struct gve_rx_buf_state_dqo *bs, - bool free_page) -{ - page_ref_sub(bs->page_info.page, bs->page_info.pagecnt_bias - 1); - if (free_page) - gve_free_page(&priv->pdev->dev, bs->page_info.page, bs->addr, - DMA_FROM_DEVICE); - bs->page_info.page = NULL; -} - -static struct gve_rx_buf_state_dqo *gve_alloc_buf_state(struct gve_rx_ring *rx) -{ - struct gve_rx_buf_state_dqo *buf_state; - s16 buffer_id; - - buffer_id = rx->dqo.free_buf_states; - if (unlikely(buffer_id == -1)) - return NULL; - - buf_state = &rx->dqo.buf_states[buffer_id]; - - /* Remove buf_state from free list */ - rx->dqo.free_buf_states = buf_state->next; - - /* Point buf_state to itself to mark it as allocated */ - buf_state->next = buffer_id; - - return buf_state; -} - -static bool gve_buf_state_is_allocated(struct gve_rx_ring *rx, - struct gve_rx_buf_state_dqo *buf_state) -{ - s16 buffer_id = buf_state - rx->dqo.buf_states; - - return buf_state->next == buffer_id; -} - -static void gve_free_buf_state(struct gve_rx_ring *rx, - struct gve_rx_buf_state_dqo *buf_state) -{ - s16 buffer_id = buf_state - rx->dqo.buf_states; - - buf_state->next = rx->dqo.free_buf_states; - rx->dqo.free_buf_states = buffer_id; -} - -static struct gve_rx_buf_state_dqo * -gve_dequeue_buf_state(struct gve_rx_ring *rx, struct gve_index_list *list) -{ - struct gve_rx_buf_state_dqo *buf_state; - s16 buffer_id; - - buffer_id = list->head; - if (unlikely(buffer_id == -1)) - return NULL; - - buf_state = &rx->dqo.buf_states[buffer_id]; - - /* Remove buf_state from list */ - list->head = buf_state->next; - if (buf_state->next == -1) - list->tail = -1; - - /* Point buf_state to itself to mark it as allocated */ - buf_state->next = buffer_id; - - return buf_state; -} - -static void gve_enqueue_buf_state(struct gve_rx_ring *rx, - struct gve_index_list *list, - struct gve_rx_buf_state_dqo *buf_state) -{ - s16 buffer_id = buf_state - rx->dqo.buf_states; - - buf_state->next = -1; - - if (list->head == -1) { - list->head = buffer_id; - list->tail = buffer_id; - } else { - int tail = list->tail; - - rx->dqo.buf_states[tail].next = buffer_id; - list->tail = buffer_id; - } -} - -static struct gve_rx_buf_state_dqo * -gve_get_recycled_buf_state(struct gve_rx_ring *rx) -{ - struct gve_rx_buf_state_dqo *buf_state; - int i; - - /* Recycled buf states are immediately usable. */ - buf_state = gve_dequeue_buf_state(rx, &rx->dqo.recycled_buf_states); - if (likely(buf_state)) - return buf_state; - - if (unlikely(rx->dqo.used_buf_states.head == -1)) - return NULL; - - /* Used buf states are only usable when ref count reaches 0, which means - * no SKBs refer to them. - * - * Search a limited number before giving up. - */ - for (i = 0; i < 5; i++) { - buf_state = gve_dequeue_buf_state(rx, &rx->dqo.used_buf_states); - if (gve_buf_ref_cnt(buf_state) == 0) { - rx->dqo.used_buf_states_cnt--; - return buf_state; - } - - gve_enqueue_buf_state(rx, &rx->dqo.used_buf_states, buf_state); - } - - /* For QPL, we cannot allocate any new buffers and must - * wait for the existing ones to be available. - */ - if (rx->dqo.qpl) - return NULL; - - /* If there are no free buf states discard an entry from - * `used_buf_states` so it can be used. - */ - if (unlikely(rx->dqo.free_buf_states == -1)) { - buf_state = gve_dequeue_buf_state(rx, &rx->dqo.used_buf_states); - if (gve_buf_ref_cnt(buf_state) == 0) - return buf_state; - - gve_free_page_dqo(rx->gve, buf_state, true); - gve_free_buf_state(rx, buf_state); - } - - return NULL; -} - -static int gve_alloc_page_dqo(struct gve_rx_ring *rx, - struct gve_rx_buf_state_dqo *buf_state) -{ - struct gve_priv *priv = rx->gve; - u32 idx; - - if (!rx->dqo.qpl) { - int err; - - err = gve_alloc_page(priv, &priv->pdev->dev, - &buf_state->page_info.page, - &buf_state->addr, - DMA_FROM_DEVICE, GFP_ATOMIC); - if (err) - return err; - } else { - idx = rx->dqo.next_qpl_page_idx; - if (idx >= gve_get_rx_pages_per_qpl_dqo(priv->rx_desc_cnt)) { - net_err_ratelimited("%s: Out of QPL pages\n", - priv->dev->name); - return -ENOMEM; - } - buf_state->page_info.page = rx->dqo.qpl->pages[idx]; - buf_state->addr = rx->dqo.qpl->page_buses[idx]; - rx->dqo.next_qpl_page_idx++; - } - buf_state->page_info.page_offset = 0; - buf_state->page_info.page_address = - page_address(buf_state->page_info.page); - buf_state->last_single_ref_offset = 0; - - /* The page already has 1 ref. */ - page_ref_add(buf_state->page_info.page, INT_MAX - 1); - buf_state->page_info.pagecnt_bias = INT_MAX; - - return 0; -} - static void gve_rx_free_hdr_bufs(struct gve_priv *priv, struct gve_rx_ring *rx) { struct device *hdev = &priv->pdev->dev; @@ -557,48 +374,6 @@ void gve_rx_post_buffers_dqo(struct gve_rx_ring *rx) rx->fill_cnt += num_posted; } -static void gve_try_recycle_buf(struct gve_priv *priv, struct gve_rx_ring *rx, - struct gve_rx_buf_state_dqo *buf_state) -{ - const u16 data_buffer_size = priv->data_buffer_size_dqo; - int pagecount; - - /* Can't reuse if we only fit one buffer per page */ - if (data_buffer_size * 2 > PAGE_SIZE) - goto mark_used; - - pagecount = gve_buf_ref_cnt(buf_state); - - /* Record the offset when we have a single remaining reference. - * - * When this happens, we know all of the other offsets of the page are - * usable. - */ - if (pagecount == 1) { - buf_state->last_single_ref_offset = - buf_state->page_info.page_offset; - } - - /* Use the next buffer sized chunk in the page. */ - buf_state->page_info.page_offset += data_buffer_size; - buf_state->page_info.page_offset &= (PAGE_SIZE - 1); - - /* If we wrap around to the same offset without ever dropping to 1 - * reference, then we don't know if this offset was ever freed. - */ - if (buf_state->page_info.page_offset == - buf_state->last_single_ref_offset) { - goto mark_used; - } - - gve_enqueue_buf_state(rx, &rx->dqo.recycled_buf_states, buf_state); - return; - -mark_used: - gve_enqueue_buf_state(rx, &rx->dqo.used_buf_states, buf_state); - rx->dqo.used_buf_states_cnt++; -} - static void gve_rx_skb_csum(struct sk_buff *skb, const struct gve_rx_compl_desc_dqo *desc, struct gve_ptype ptype) -- 2.50.1 From ebdfae0d377b487eabb739c55a13a2ab29f21f36 Mon Sep 17 00:00:00 2001 From: Harshitha Ramamurthy Date: Mon, 14 Oct 2024 13:21:07 -0700 Subject: [PATCH 16/16] gve: adopt page pool for DQ RDA mode For DQ queue format in raw DMA addressing(RDA) mode, implement page pool recycling of buffers by leveraging a few helper functions. DQ QPL mode will continue to use the exisiting recycling logic. This is because in QPL mode, the pages come from a constant set of pages that the driver pre-allocates and registers with the device. Reviewed-by: Praveen Kaligineedi Reviewed-by: Shailend Chand Reviewed-by: Willem de Bruijn Signed-off-by: Harshitha Ramamurthy Reviewed-by: Jacob Keller Link: https://patch.msgid.link/20241014202108.1051963-3-pkaligineedi@google.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/google/Kconfig | 1 + drivers/net/ethernet/google/gve/gve.h | 22 ++- .../ethernet/google/gve/gve_buffer_mgmt_dqo.c | 180 +++++++++++++----- drivers/net/ethernet/google/gve/gve_rx_dqo.c | 89 ++++----- 4 files changed, 198 insertions(+), 94 deletions(-) diff --git a/drivers/net/ethernet/google/Kconfig b/drivers/net/ethernet/google/Kconfig index 8641a00f8e63..564862a57124 100644 --- a/drivers/net/ethernet/google/Kconfig +++ b/drivers/net/ethernet/google/Kconfig @@ -18,6 +18,7 @@ if NET_VENDOR_GOOGLE config GVE tristate "Google Virtual NIC (gVNIC) support" depends on (PCI_MSI && (X86 || CPU_LITTLE_ENDIAN)) + select PAGE_POOL help This driver supports Google Virtual NIC (gVNIC)" diff --git a/drivers/net/ethernet/google/gve/gve.h b/drivers/net/ethernet/google/gve/gve.h index bd684c7d996a..dd92949bb214 100644 --- a/drivers/net/ethernet/google/gve/gve.h +++ b/drivers/net/ethernet/google/gve/gve.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #include "gve_desc.h" @@ -60,6 +61,8 @@ #define GVE_DEFAULT_RX_BUFFER_OFFSET 2048 +#define GVE_PAGE_POOL_SIZE_MULTIPLIER 4 + #define GVE_FLOW_RULES_CACHE_SIZE \ (GVE_ADMINQ_BUFFER_SIZE / sizeof(struct gve_adminq_queried_flow_rule)) #define GVE_FLOW_RULE_IDS_CACHE_SIZE \ @@ -102,6 +105,7 @@ struct gve_rx_slot_page_info { struct page *page; void *page_address; u32 page_offset; /* offset to write to in page */ + unsigned int buf_size; int pagecnt_bias; /* expected pagecnt if only the driver has a ref */ u16 pad; /* adjustment for rx padding */ u8 can_flip; /* tracks if the networking stack is using the page */ @@ -273,6 +277,8 @@ struct gve_rx_ring { /* Address info of the buffers for header-split */ struct gve_header_buf hdr_bufs; + + struct page_pool *page_pool; } dqo; }; @@ -1176,10 +1182,22 @@ struct gve_rx_buf_state_dqo *gve_dequeue_buf_state(struct gve_rx_ring *rx, void gve_enqueue_buf_state(struct gve_rx_ring *rx, struct gve_index_list *list, struct gve_rx_buf_state_dqo *buf_state); struct gve_rx_buf_state_dqo *gve_get_recycled_buf_state(struct gve_rx_ring *rx); -int gve_alloc_page_dqo(struct gve_rx_ring *rx, - struct gve_rx_buf_state_dqo *buf_state); void gve_try_recycle_buf(struct gve_priv *priv, struct gve_rx_ring *rx, struct gve_rx_buf_state_dqo *buf_state); +void gve_free_to_page_pool(struct gve_rx_ring *rx, + struct gve_rx_buf_state_dqo *buf_state, + bool allow_direct); +int gve_alloc_qpl_page_dqo(struct gve_rx_ring *rx, + struct gve_rx_buf_state_dqo *buf_state); +void gve_free_qpl_page_dqo(struct gve_rx_buf_state_dqo *buf_state); +void gve_reuse_buffer(struct gve_rx_ring *rx, + struct gve_rx_buf_state_dqo *buf_state); +void gve_free_buffer(struct gve_rx_ring *rx, + struct gve_rx_buf_state_dqo *buf_state); +int gve_alloc_buffer(struct gve_rx_ring *rx, struct gve_rx_desc_dqo *desc); +struct page_pool *gve_rx_create_page_pool(struct gve_priv *priv, + struct gve_rx_ring *rx); + /* Reset */ void gve_schedule_reset(struct gve_priv *priv); int gve_reset(struct gve_priv *priv, bool attempt_teardown); diff --git a/drivers/net/ethernet/google/gve/gve_buffer_mgmt_dqo.c b/drivers/net/ethernet/google/gve/gve_buffer_mgmt_dqo.c index 8e50f0e4bb2e..05bf1f80a79c 100644 --- a/drivers/net/ethernet/google/gve/gve_buffer_mgmt_dqo.c +++ b/drivers/net/ethernet/google/gve/gve_buffer_mgmt_dqo.c @@ -12,16 +12,6 @@ int gve_buf_ref_cnt(struct gve_rx_buf_state_dqo *bs) return page_count(bs->page_info.page) - bs->page_info.pagecnt_bias; } -void gve_free_page_dqo(struct gve_priv *priv, struct gve_rx_buf_state_dqo *bs, - bool free_page) -{ - page_ref_sub(bs->page_info.page, bs->page_info.pagecnt_bias - 1); - if (free_page) - gve_free_page(&priv->pdev->dev, bs->page_info.page, bs->addr, - DMA_FROM_DEVICE); - bs->page_info.page = NULL; -} - struct gve_rx_buf_state_dqo *gve_alloc_buf_state(struct gve_rx_ring *rx) { struct gve_rx_buf_state_dqo *buf_state; @@ -128,56 +118,28 @@ struct gve_rx_buf_state_dqo *gve_get_recycled_buf_state(struct gve_rx_ring *rx) gve_enqueue_buf_state(rx, &rx->dqo.used_buf_states, buf_state); } - /* For QPL, we cannot allocate any new buffers and must - * wait for the existing ones to be available. - */ - if (rx->dqo.qpl) - return NULL; - - /* If there are no free buf states discard an entry from - * `used_buf_states` so it can be used. - */ - if (unlikely(rx->dqo.free_buf_states == -1)) { - buf_state = gve_dequeue_buf_state(rx, &rx->dqo.used_buf_states); - if (gve_buf_ref_cnt(buf_state) == 0) - return buf_state; - - gve_free_page_dqo(rx->gve, buf_state, true); - gve_free_buf_state(rx, buf_state); - } - return NULL; } -int gve_alloc_page_dqo(struct gve_rx_ring *rx, - struct gve_rx_buf_state_dqo *buf_state) +int gve_alloc_qpl_page_dqo(struct gve_rx_ring *rx, + struct gve_rx_buf_state_dqo *buf_state) { struct gve_priv *priv = rx->gve; u32 idx; - if (!rx->dqo.qpl) { - int err; - - err = gve_alloc_page(priv, &priv->pdev->dev, - &buf_state->page_info.page, - &buf_state->addr, - DMA_FROM_DEVICE, GFP_ATOMIC); - if (err) - return err; - } else { - idx = rx->dqo.next_qpl_page_idx; - if (idx >= gve_get_rx_pages_per_qpl_dqo(priv->rx_desc_cnt)) { - net_err_ratelimited("%s: Out of QPL pages\n", - priv->dev->name); - return -ENOMEM; - } - buf_state->page_info.page = rx->dqo.qpl->pages[idx]; - buf_state->addr = rx->dqo.qpl->page_buses[idx]; - rx->dqo.next_qpl_page_idx++; + idx = rx->dqo.next_qpl_page_idx; + if (idx >= gve_get_rx_pages_per_qpl_dqo(priv->rx_desc_cnt)) { + net_err_ratelimited("%s: Out of QPL pages\n", + priv->dev->name); + return -ENOMEM; } + buf_state->page_info.page = rx->dqo.qpl->pages[idx]; + buf_state->addr = rx->dqo.qpl->page_buses[idx]; + rx->dqo.next_qpl_page_idx++; buf_state->page_info.page_offset = 0; buf_state->page_info.page_address = page_address(buf_state->page_info.page); + buf_state->page_info.buf_size = priv->data_buffer_size_dqo; buf_state->last_single_ref_offset = 0; /* The page already has 1 ref. */ @@ -187,6 +149,16 @@ int gve_alloc_page_dqo(struct gve_rx_ring *rx, return 0; } +void gve_free_qpl_page_dqo(struct gve_rx_buf_state_dqo *buf_state) +{ + if (!buf_state->page_info.page) + return; + + page_ref_sub(buf_state->page_info.page, + buf_state->page_info.pagecnt_bias - 1); + buf_state->page_info.page = NULL; +} + void gve_try_recycle_buf(struct gve_priv *priv, struct gve_rx_ring *rx, struct gve_rx_buf_state_dqo *buf_state) { @@ -228,3 +200,113 @@ mark_used: gve_enqueue_buf_state(rx, &rx->dqo.used_buf_states, buf_state); rx->dqo.used_buf_states_cnt++; } + +void gve_free_to_page_pool(struct gve_rx_ring *rx, + struct gve_rx_buf_state_dqo *buf_state, + bool allow_direct) +{ + struct page *page = buf_state->page_info.page; + + if (!page) + return; + + page_pool_put_page(page->pp, page, buf_state->page_info.buf_size, + allow_direct); + buf_state->page_info.page = NULL; +} + +static int gve_alloc_from_page_pool(struct gve_rx_ring *rx, + struct gve_rx_buf_state_dqo *buf_state) +{ + struct gve_priv *priv = rx->gve; + struct page *page; + + buf_state->page_info.buf_size = priv->data_buffer_size_dqo; + page = page_pool_alloc(rx->dqo.page_pool, + &buf_state->page_info.page_offset, + &buf_state->page_info.buf_size, GFP_ATOMIC); + + if (!page) + return -ENOMEM; + + buf_state->page_info.page = page; + buf_state->page_info.page_address = page_address(page); + buf_state->addr = page_pool_get_dma_addr(page); + + return 0; +} + +struct page_pool *gve_rx_create_page_pool(struct gve_priv *priv, + struct gve_rx_ring *rx) +{ + u32 ntfy_id = gve_rx_idx_to_ntfy(priv, rx->q_num); + struct page_pool_params pp = { + .flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV, + .order = 0, + .pool_size = GVE_PAGE_POOL_SIZE_MULTIPLIER * priv->rx_desc_cnt, + .dev = &priv->pdev->dev, + .netdev = priv->dev, + .napi = &priv->ntfy_blocks[ntfy_id].napi, + .max_len = PAGE_SIZE, + .dma_dir = DMA_FROM_DEVICE, + }; + + return page_pool_create(&pp); +} + +void gve_free_buffer(struct gve_rx_ring *rx, + struct gve_rx_buf_state_dqo *buf_state) +{ + if (rx->dqo.page_pool) { + gve_free_to_page_pool(rx, buf_state, true); + gve_free_buf_state(rx, buf_state); + } else { + gve_enqueue_buf_state(rx, &rx->dqo.recycled_buf_states, + buf_state); + } +} + +void gve_reuse_buffer(struct gve_rx_ring *rx, + struct gve_rx_buf_state_dqo *buf_state) +{ + if (rx->dqo.page_pool) { + buf_state->page_info.page = NULL; + gve_free_buf_state(rx, buf_state); + } else { + gve_dec_pagecnt_bias(&buf_state->page_info); + gve_try_recycle_buf(rx->gve, rx, buf_state); + } +} + +int gve_alloc_buffer(struct gve_rx_ring *rx, struct gve_rx_desc_dqo *desc) +{ + struct gve_rx_buf_state_dqo *buf_state; + + if (rx->dqo.page_pool) { + buf_state = gve_alloc_buf_state(rx); + if (WARN_ON_ONCE(!buf_state)) + return -ENOMEM; + + if (gve_alloc_from_page_pool(rx, buf_state)) + goto free_buf_state; + } else { + buf_state = gve_get_recycled_buf_state(rx); + if (unlikely(!buf_state)) { + buf_state = gve_alloc_buf_state(rx); + if (unlikely(!buf_state)) + return -ENOMEM; + + if (unlikely(gve_alloc_qpl_page_dqo(rx, buf_state))) + goto free_buf_state; + } + } + desc->buf_id = cpu_to_le16(buf_state - rx->dqo.buf_states); + desc->buf_addr = cpu_to_le64(buf_state->addr + + buf_state->page_info.page_offset); + + return 0; + +free_buf_state: + gve_free_buf_state(rx, buf_state); + return -ENOMEM; +} diff --git a/drivers/net/ethernet/google/gve/gve_rx_dqo.c b/drivers/net/ethernet/google/gve/gve_rx_dqo.c index b343be2fb118..8ac0047f1ada 100644 --- a/drivers/net/ethernet/google/gve/gve_rx_dqo.c +++ b/drivers/net/ethernet/google/gve/gve_rx_dqo.c @@ -95,8 +95,10 @@ static void gve_rx_reset_ring_dqo(struct gve_priv *priv, int idx) for (i = 0; i < rx->dqo.num_buf_states; i++) { struct gve_rx_buf_state_dqo *bs = &rx->dqo.buf_states[i]; - if (bs->page_info.page) - gve_free_page_dqo(priv, bs, !rx->dqo.qpl); + if (rx->dqo.page_pool) + gve_free_to_page_pool(rx, bs, false); + else + gve_free_qpl_page_dqo(bs); } } @@ -138,9 +140,11 @@ void gve_rx_free_ring_dqo(struct gve_priv *priv, struct gve_rx_ring *rx, for (i = 0; i < rx->dqo.num_buf_states; i++) { struct gve_rx_buf_state_dqo *bs = &rx->dqo.buf_states[i]; - /* Only free page for RDA. QPL pages are freed in gve_main. */ - if (bs->page_info.page) - gve_free_page_dqo(priv, bs, !rx->dqo.qpl); + + if (rx->dqo.page_pool) + gve_free_to_page_pool(rx, bs, false); + else + gve_free_qpl_page_dqo(bs); } if (rx->dqo.qpl) { @@ -167,6 +171,11 @@ void gve_rx_free_ring_dqo(struct gve_priv *priv, struct gve_rx_ring *rx, kvfree(rx->dqo.buf_states); rx->dqo.buf_states = NULL; + if (rx->dqo.page_pool) { + page_pool_destroy(rx->dqo.page_pool); + rx->dqo.page_pool = NULL; + } + gve_rx_free_hdr_bufs(priv, rx); netif_dbg(priv, drv, priv->dev, "freed rx ring %d\n", idx); @@ -199,6 +208,7 @@ int gve_rx_alloc_ring_dqo(struct gve_priv *priv, int idx) { struct device *hdev = &priv->pdev->dev; + struct page_pool *pool; int qpl_page_cnt; size_t size; u32 qpl_id; @@ -212,8 +222,7 @@ int gve_rx_alloc_ring_dqo(struct gve_priv *priv, rx->gve = priv; rx->q_num = idx; - rx->dqo.num_buf_states = cfg->raw_addressing ? - min_t(s16, S16_MAX, buffer_queue_slots * 4) : + rx->dqo.num_buf_states = cfg->raw_addressing ? buffer_queue_slots : gve_get_rx_pages_per_qpl_dqo(cfg->ring_size); rx->dqo.buf_states = kvcalloc(rx->dqo.num_buf_states, sizeof(rx->dqo.buf_states[0]), @@ -241,7 +250,13 @@ int gve_rx_alloc_ring_dqo(struct gve_priv *priv, if (!rx->dqo.bufq.desc_ring) goto err; - if (!cfg->raw_addressing) { + if (cfg->raw_addressing) { + pool = gve_rx_create_page_pool(priv, rx); + if (IS_ERR(pool)) + goto err; + + rx->dqo.page_pool = pool; + } else { qpl_id = gve_get_rx_qpl_id(cfg->qcfg_tx, rx->q_num); qpl_page_cnt = gve_get_rx_pages_per_qpl_dqo(cfg->ring_size); @@ -338,26 +353,14 @@ void gve_rx_post_buffers_dqo(struct gve_rx_ring *rx) num_avail_slots = min_t(u32, num_avail_slots, complq->num_free_slots); while (num_posted < num_avail_slots) { struct gve_rx_desc_dqo *desc = &bufq->desc_ring[bufq->tail]; - struct gve_rx_buf_state_dqo *buf_state; - - buf_state = gve_get_recycled_buf_state(rx); - if (unlikely(!buf_state)) { - buf_state = gve_alloc_buf_state(rx); - if (unlikely(!buf_state)) - break; - - if (unlikely(gve_alloc_page_dqo(rx, buf_state))) { - u64_stats_update_begin(&rx->statss); - rx->rx_buf_alloc_fail++; - u64_stats_update_end(&rx->statss); - gve_free_buf_state(rx, buf_state); - break; - } + + if (unlikely(gve_alloc_buffer(rx, desc))) { + u64_stats_update_begin(&rx->statss); + rx->rx_buf_alloc_fail++; + u64_stats_update_end(&rx->statss); + break; } - desc->buf_id = cpu_to_le16(buf_state - rx->dqo.buf_states); - desc->buf_addr = cpu_to_le64(buf_state->addr + - buf_state->page_info.page_offset); if (rx->dqo.hdr_bufs.data) desc->header_buf_addr = cpu_to_le64(rx->dqo.hdr_bufs.addr + @@ -488,6 +491,9 @@ static int gve_rx_append_frags(struct napi_struct *napi, if (!skb) return -1; + if (rx->dqo.page_pool) + skb_mark_for_recycle(skb); + if (rx->ctx.skb_tail == rx->ctx.skb_head) skb_shinfo(rx->ctx.skb_head)->frag_list = skb; else @@ -498,7 +504,7 @@ static int gve_rx_append_frags(struct napi_struct *napi, if (rx->ctx.skb_tail != rx->ctx.skb_head) { rx->ctx.skb_head->len += buf_len; rx->ctx.skb_head->data_len += buf_len; - rx->ctx.skb_head->truesize += priv->data_buffer_size_dqo; + rx->ctx.skb_head->truesize += buf_state->page_info.buf_size; } /* Trigger ondemand page allocation if we are running low on buffers */ @@ -508,13 +514,8 @@ static int gve_rx_append_frags(struct napi_struct *napi, skb_add_rx_frag(rx->ctx.skb_tail, num_frags, buf_state->page_info.page, buf_state->page_info.page_offset, - buf_len, priv->data_buffer_size_dqo); - gve_dec_pagecnt_bias(&buf_state->page_info); - - /* Advances buffer page-offset if page is partially used. - * Marks buffer as used if page is full. - */ - gve_try_recycle_buf(priv, rx, buf_state); + buf_len, buf_state->page_info.buf_size); + gve_reuse_buffer(rx, buf_state); return 0; } @@ -548,8 +549,7 @@ static int gve_rx_dqo(struct napi_struct *napi, struct gve_rx_ring *rx, } if (unlikely(compl_desc->rx_error)) { - gve_enqueue_buf_state(rx, &rx->dqo.recycled_buf_states, - buf_state); + gve_free_buffer(rx, buf_state); return -EINVAL; } @@ -573,6 +573,9 @@ static int gve_rx_dqo(struct napi_struct *napi, struct gve_rx_ring *rx, if (unlikely(!rx->ctx.skb_head)) goto error; rx->ctx.skb_tail = rx->ctx.skb_head; + + if (rx->dqo.page_pool) + skb_mark_for_recycle(rx->ctx.skb_head); } else { unsplit = 1; } @@ -609,8 +612,7 @@ static int gve_rx_dqo(struct napi_struct *napi, struct gve_rx_ring *rx, rx->rx_copybreak_pkt++; u64_stats_update_end(&rx->statss); - gve_enqueue_buf_state(rx, &rx->dqo.recycled_buf_states, - buf_state); + gve_free_buffer(rx, buf_state); return 0; } @@ -625,16 +627,17 @@ static int gve_rx_dqo(struct napi_struct *napi, struct gve_rx_ring *rx, return 0; } + if (rx->dqo.page_pool) + skb_mark_for_recycle(rx->ctx.skb_head); + skb_add_rx_frag(rx->ctx.skb_head, 0, buf_state->page_info.page, buf_state->page_info.page_offset, buf_len, - priv->data_buffer_size_dqo); - gve_dec_pagecnt_bias(&buf_state->page_info); - - gve_try_recycle_buf(priv, rx, buf_state); + buf_state->page_info.buf_size); + gve_reuse_buffer(rx, buf_state); return 0; error: - gve_enqueue_buf_state(rx, &rx->dqo.recycled_buf_states, buf_state); + gve_free_buffer(rx, buf_state); return -ENOMEM; } -- 2.50.1