From c45211c2369734d1b03c75165988878d16867040 Mon Sep 17 00:00:00 2001 From: Bharat Bhushan Date: Wed, 4 Dec 2024 11:26:56 +0530 Subject: [PATCH 01/16] cn10k-ipsec: Add SA add/del support for outb ipsec crypto offload This patch adds support to add and delete Security Association (SA) xfrm ops. Hardware maintains SA context in memory allocated by software. Each SA context is 128 byte aligned and size of each context is multiple of 128-byte. Add support for transport and tunnel ipsec mode, ESP protocol, aead aes-gcm-icv16, key size 128/192/256-bits with 32bit salt. Signed-off-by: Bharat Bhushan Signed-off-by: David S. Miller --- .../marvell/octeontx2/nic/cn10k_ipsec.c | 409 ++++++++++++++++++ .../marvell/octeontx2/nic/cn10k_ipsec.h | 114 +++++ 2 files changed, 523 insertions(+) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.c b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.c index e09ce42075c7..106a241625dc 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.c @@ -375,6 +375,385 @@ static int cn10k_outb_cpt_clean(struct otx2_nic *pf) return ret; } +static void cn10k_cpt_inst_flush(struct otx2_nic *pf, struct cpt_inst_s *inst, + u64 size) +{ + struct otx2_lmt_info *lmt_info; + u64 val = 0, tar_addr = 0; + + lmt_info = per_cpu_ptr(pf->hw.lmt_info, smp_processor_id()); + /* FIXME: val[0:10] LMT_ID. + * [12:15] no of LMTST - 1 in the burst. + * [19:63] data size of each LMTST in the burst except first. + */ + val = (lmt_info->lmt_id & 0x7FF); + /* Target address for LMTST flush tells HW how many 128bit + * words are present. + * tar_addr[6:4] size of first LMTST - 1 in units of 128b. + */ + tar_addr |= pf->ipsec.io_addr | (((size / 16) - 1) & 0x7) << 4; + dma_wmb(); + memcpy((u64 *)lmt_info->lmt_addr, inst, size); + cn10k_lmt_flush(val, tar_addr); +} + +static int cn10k_wait_for_cpt_respose(struct otx2_nic *pf, + struct cpt_res_s *res) +{ + unsigned long timeout = jiffies + msecs_to_jiffies(100); + u64 *completion_ptr = (u64 *)res; + + do { + if (time_after(jiffies, timeout)) { + netdev_err(pf->netdev, "CPT response timeout\n"); + return -EBUSY; + } + } while ((READ_ONCE(*completion_ptr) & CN10K_CPT_COMP_E_MASK) == + CN10K_CPT_COMP_E_NOTDONE); + + if (!(res->compcode == CN10K_CPT_COMP_E_GOOD || + res->compcode == CN10K_CPT_COMP_E_WARN) || res->uc_compcode) { + netdev_err(pf->netdev, "compcode=%x doneint=%x\n", + res->compcode, res->doneint); + netdev_err(pf->netdev, "uc_compcode=%x uc_info=%llx esn=%llx\n", + res->uc_compcode, (u64)res->uc_info, res->esn); + } + return 0; +} + +static int cn10k_outb_write_sa(struct otx2_nic *pf, struct qmem *sa_info) +{ + dma_addr_t res_iova, dptr_iova, sa_iova; + struct cn10k_tx_sa_s *sa_dptr; + struct cpt_inst_s inst = {}; + struct cpt_res_s *res; + u32 sa_size, off; + u64 *sptr, *dptr; + u64 reg_val; + int ret; + + sa_iova = sa_info->iova; + if (!sa_iova) + return -EINVAL; + + res = dma_alloc_coherent(pf->dev, sizeof(struct cpt_res_s), + &res_iova, GFP_ATOMIC); + if (!res) + return -ENOMEM; + + sa_size = sizeof(struct cn10k_tx_sa_s); + sa_dptr = dma_alloc_coherent(pf->dev, sa_size, &dptr_iova, GFP_ATOMIC); + if (!sa_dptr) { + dma_free_coherent(pf->dev, sizeof(struct cpt_res_s), res, + res_iova); + return -ENOMEM; + } + + sptr = (__force u64 *)sa_info->base; + dptr = (__force u64 *)sa_dptr; + for (off = 0; off < (sa_size / 8); off++) + *(dptr + off) = (__force u64)cpu_to_be64(*(sptr + off)); + + res->compcode = CN10K_CPT_COMP_E_NOTDONE; + inst.res_addr = res_iova; + inst.dptr = (u64)dptr_iova; + inst.param2 = sa_size >> 3; + inst.dlen = sa_size; + inst.opcode_major = CN10K_IPSEC_MAJOR_OP_WRITE_SA; + inst.opcode_minor = CN10K_IPSEC_MINOR_OP_WRITE_SA; + inst.cptr = sa_iova; + inst.ctx_val = 1; + inst.egrp = CN10K_DEF_CPT_IPSEC_EGRP; + + /* Check if CPT-LF available */ + if (!cn10k_cpt_device_set_inuse(pf)) { + ret = -ENODEV; + goto free_mem; + } + + cn10k_cpt_inst_flush(pf, &inst, sizeof(struct cpt_inst_s)); + dma_wmb(); + ret = cn10k_wait_for_cpt_respose(pf, res); + if (ret) + goto set_available; + + /* Trigger CTX flush to write dirty data back to DRAM */ + reg_val = FIELD_PREP(CPT_LF_CTX_FLUSH, sa_iova >> 7); + otx2_write64(pf, CN10K_CPT_LF_CTX_FLUSH, reg_val); + +set_available: + cn10k_cpt_device_set_available(pf); +free_mem: + dma_free_coherent(pf->dev, sa_size, sa_dptr, dptr_iova); + dma_free_coherent(pf->dev, sizeof(struct cpt_res_s), res, res_iova); + return ret; +} + +static int cn10k_ipsec_get_hw_ctx_offset(void) +{ + /* Offset on Hardware-context offset in word */ + return (offsetof(struct cn10k_tx_sa_s, hw_ctx) / sizeof(u64)) & 0x7F; +} + +static int cn10k_ipsec_get_ctx_push_size(void) +{ + /* Context push size is round up and in multiple of 8 Byte */ + return (roundup(offsetof(struct cn10k_tx_sa_s, hw_ctx), 8) / 8) & 0x7F; +} + +static int cn10k_ipsec_get_aes_key_len(int key_len) +{ + /* key_len is aes key length in bytes */ + switch (key_len) { + case 16: + return CN10K_IPSEC_SA_AES_KEY_LEN_128; + case 24: + return CN10K_IPSEC_SA_AES_KEY_LEN_192; + default: + return CN10K_IPSEC_SA_AES_KEY_LEN_256; + } +} + +static void cn10k_outb_prepare_sa(struct xfrm_state *x, + struct cn10k_tx_sa_s *sa_entry) +{ + int key_len = (x->aead->alg_key_len + 7) / 8; + struct net_device *netdev = x->xso.dev; + u8 *key = x->aead->alg_key; + struct otx2_nic *pf; + u32 *tmp_salt; + u64 *tmp_key; + int idx; + + memset(sa_entry, 0, sizeof(struct cn10k_tx_sa_s)); + + /* context size, 128 Byte aligned up */ + pf = netdev_priv(netdev); + sa_entry->ctx_size = (pf->ipsec.sa_size / OTX2_ALIGN) & 0xF; + sa_entry->hw_ctx_off = cn10k_ipsec_get_hw_ctx_offset(); + sa_entry->ctx_push_size = cn10k_ipsec_get_ctx_push_size(); + + /* Ucode to skip two words of CPT_CTX_HW_S */ + sa_entry->ctx_hdr_size = 1; + + /* Allow Atomic operation (AOP) */ + sa_entry->aop_valid = 1; + + /* Outbound, ESP TRANSPORT/TUNNEL Mode, AES-GCM with */ + sa_entry->sa_dir = CN10K_IPSEC_SA_DIR_OUTB; + sa_entry->ipsec_protocol = CN10K_IPSEC_SA_IPSEC_PROTO_ESP; + sa_entry->enc_type = CN10K_IPSEC_SA_ENCAP_TYPE_AES_GCM; + sa_entry->iv_src = CN10K_IPSEC_SA_IV_SRC_PACKET; + if (x->props.mode == XFRM_MODE_TUNNEL) + sa_entry->ipsec_mode = CN10K_IPSEC_SA_IPSEC_MODE_TUNNEL; + else + sa_entry->ipsec_mode = CN10K_IPSEC_SA_IPSEC_MODE_TRANSPORT; + + /* Last 4 bytes are salt */ + key_len -= 4; + sa_entry->aes_key_len = cn10k_ipsec_get_aes_key_len(key_len); + memcpy(sa_entry->cipher_key, key, key_len); + tmp_key = (u64 *)sa_entry->cipher_key; + + for (idx = 0; idx < key_len / 8; idx++) + tmp_key[idx] = (__force u64)cpu_to_be64(tmp_key[idx]); + + memcpy(&sa_entry->iv_gcm_salt, key + key_len, 4); + tmp_salt = (u32 *)&sa_entry->iv_gcm_salt; + *tmp_salt = (__force u32)cpu_to_be32(*tmp_salt); + + /* Write SA context data to memory before enabling */ + wmb(); + + /* Enable SA */ + sa_entry->sa_valid = 1; +} + +static int cn10k_ipsec_validate_state(struct xfrm_state *x, + struct netlink_ext_ack *extack) +{ + if (x->props.aalgo != SADB_AALG_NONE) { + NL_SET_ERR_MSG_MOD(extack, + "Cannot offload authenticated xfrm states"); + return -EINVAL; + } + if (x->props.ealgo != SADB_X_EALG_AES_GCM_ICV16) { + NL_SET_ERR_MSG_MOD(extack, + "Only AES-GCM-ICV16 xfrm state may be offloaded"); + return -EINVAL; + } + if (x->props.calgo != SADB_X_CALG_NONE) { + NL_SET_ERR_MSG_MOD(extack, + "Cannot offload compressed xfrm states"); + return -EINVAL; + } + if (x->props.flags & XFRM_STATE_ESN) { + NL_SET_ERR_MSG_MOD(extack, "Cannot offload ESN xfrm states"); + return -EINVAL; + } + if (x->props.family != AF_INET && x->props.family != AF_INET6) { + NL_SET_ERR_MSG_MOD(extack, + "Only IPv4/v6 xfrm states may be offloaded"); + return -EINVAL; + } + if (x->xso.type != XFRM_DEV_OFFLOAD_CRYPTO) { + NL_SET_ERR_MSG_MOD(extack, + "Cannot offload other than crypto-mode"); + return -EINVAL; + } + if (x->props.mode != XFRM_MODE_TRANSPORT && + x->props.mode != XFRM_MODE_TUNNEL) { + NL_SET_ERR_MSG_MOD(extack, + "Only tunnel/transport xfrm states may be offloaded"); + return -EINVAL; + } + if (x->id.proto != IPPROTO_ESP) { + NL_SET_ERR_MSG_MOD(extack, + "Only ESP xfrm state may be offloaded"); + return -EINVAL; + } + if (x->encap) { + NL_SET_ERR_MSG_MOD(extack, + "Encapsulated xfrm state may not be offloaded"); + return -EINVAL; + } + if (!x->aead) { + NL_SET_ERR_MSG_MOD(extack, + "Cannot offload xfrm states without aead"); + return -EINVAL; + } + + if (x->aead->alg_icv_len != 128) { + NL_SET_ERR_MSG_MOD(extack, + "Cannot offload xfrm states with AEAD ICV length other than 128bit"); + return -EINVAL; + } + if (x->aead->alg_key_len != 128 + 32 && + x->aead->alg_key_len != 192 + 32 && + x->aead->alg_key_len != 256 + 32) { + NL_SET_ERR_MSG_MOD(extack, + "Cannot offload xfrm states with AEAD key length other than 128/192/256bit"); + return -EINVAL; + } + if (x->tfcpad) { + NL_SET_ERR_MSG_MOD(extack, + "Cannot offload xfrm states with tfc padding"); + return -EINVAL; + } + if (!x->geniv) { + NL_SET_ERR_MSG_MOD(extack, + "Cannot offload xfrm states without geniv"); + return -EINVAL; + } + if (strcmp(x->geniv, "seqiv")) { + NL_SET_ERR_MSG_MOD(extack, + "Cannot offload xfrm states with geniv other than seqiv"); + return -EINVAL; + } + return 0; +} + +static int cn10k_ipsec_inb_add_state(struct xfrm_state *x, + struct netlink_ext_ack *extack) +{ + NL_SET_ERR_MSG_MOD(extack, "xfrm inbound offload not supported"); + return -EOPNOTSUPP; +} + +static int cn10k_ipsec_outb_add_state(struct xfrm_state *x, + struct netlink_ext_ack *extack) +{ + struct net_device *netdev = x->xso.dev; + struct cn10k_tx_sa_s *sa_entry; + struct qmem *sa_info; + struct otx2_nic *pf; + int err; + + err = cn10k_ipsec_validate_state(x, extack); + if (err) + return err; + + pf = netdev_priv(netdev); + + err = qmem_alloc(pf->dev, &sa_info, pf->ipsec.sa_size, OTX2_ALIGN); + if (err) + return err; + + sa_entry = (struct cn10k_tx_sa_s *)sa_info->base; + cn10k_outb_prepare_sa(x, sa_entry); + + err = cn10k_outb_write_sa(pf, sa_info); + if (err) { + NL_SET_ERR_MSG_MOD(extack, "Error writing outbound SA"); + qmem_free(pf->dev, sa_info); + return err; + } + + x->xso.offload_handle = (unsigned long)sa_info; + pf->ipsec.outb_sa_count++; + return 0; +} + +static int cn10k_ipsec_add_state(struct xfrm_state *x, + struct netlink_ext_ack *extack) +{ + if (x->xso.dir == XFRM_DEV_OFFLOAD_IN) + return cn10k_ipsec_inb_add_state(x, extack); + else + return cn10k_ipsec_outb_add_state(x, extack); +} + +static void cn10k_ipsec_del_state(struct xfrm_state *x) +{ + struct net_device *netdev = x->xso.dev; + struct cn10k_tx_sa_s *sa_entry; + struct qmem *sa_info; + struct otx2_nic *pf; + int err; + + if (x->xso.dir == XFRM_DEV_OFFLOAD_IN) + return; + + pf = netdev_priv(netdev); + + sa_info = (struct qmem *)x->xso.offload_handle; + sa_entry = (struct cn10k_tx_sa_s *)sa_info->base; + memset(sa_entry, 0, sizeof(struct cn10k_tx_sa_s)); + /* Disable SA in CPT h/w */ + sa_entry->ctx_push_size = cn10k_ipsec_get_ctx_push_size(); + sa_entry->ctx_size = (pf->ipsec.sa_size / OTX2_ALIGN) & 0xF; + sa_entry->aop_valid = 1; + + err = cn10k_outb_write_sa(pf, sa_info); + if (err) + netdev_err(netdev, "Error (%d) deleting SA\n", err); + + x->xso.offload_handle = 0; + qmem_free(pf->dev, sa_info); + + /* If no more SA's then update netdev feature for potential change + * in NETIF_F_HW_ESP. + */ + if (!--pf->ipsec.outb_sa_count) + queue_work(pf->ipsec.sa_workq, &pf->ipsec.sa_work); +} + +static const struct xfrmdev_ops cn10k_ipsec_xfrmdev_ops = { + .xdo_dev_state_add = cn10k_ipsec_add_state, + .xdo_dev_state_delete = cn10k_ipsec_del_state, +}; + +static void cn10k_ipsec_sa_wq_handler(struct work_struct *work) +{ + struct cn10k_ipsec *ipsec = container_of(work, struct cn10k_ipsec, + sa_work); + struct otx2_nic *pf = container_of(ipsec, struct otx2_nic, ipsec); + + rtnl_lock(); + netdev_update_features(pf->netdev); + rtnl_unlock(); +} + int cn10k_ipsec_ethtool_init(struct net_device *netdev, bool enable) { struct otx2_nic *pf = netdev_priv(netdev); @@ -387,16 +766,41 @@ int cn10k_ipsec_ethtool_init(struct net_device *netdev, bool enable) if (enable) return cn10k_outb_cpt_init(netdev); + /* Don't do CPT cleanup if SA installed */ + if (pf->ipsec.outb_sa_count) { + netdev_err(pf->netdev, "SA installed on this device\n"); + return -EBUSY; + } + return cn10k_outb_cpt_clean(pf); } int cn10k_ipsec_init(struct net_device *netdev) { struct otx2_nic *pf = netdev_priv(netdev); + u32 sa_size; if (!is_dev_support_ipsec_offload(pf->pdev)) return 0; + /* Each SA entry size is 128 Byte round up in size */ + sa_size = sizeof(struct cn10k_tx_sa_s) % OTX2_ALIGN ? + (sizeof(struct cn10k_tx_sa_s) / OTX2_ALIGN + 1) * + OTX2_ALIGN : sizeof(struct cn10k_tx_sa_s); + pf->ipsec.sa_size = sa_size; + + INIT_WORK(&pf->ipsec.sa_work, cn10k_ipsec_sa_wq_handler); + pf->ipsec.sa_workq = alloc_workqueue("cn10k_ipsec_sa_workq", 0, 0); + if (!pf->ipsec.sa_workq) { + netdev_err(pf->netdev, "SA alloc workqueue failed\n"); + return -ENOMEM; + } + + /* Set xfrm device ops + * NETIF_F_HW_ESP is not set as ipsec setup is not yet complete. + */ + netdev->xfrmdev_ops = &cn10k_ipsec_xfrmdev_ops; + cn10k_cpt_device_set_unavailable(pf); return 0; } @@ -410,6 +814,11 @@ void cn10k_ipsec_clean(struct otx2_nic *pf) if (!(pf->flags & OTX2_FLAG_IPSEC_OFFLOAD_ENABLED)) return; + if (pf->ipsec.sa_workq) { + destroy_workqueue(pf->ipsec.sa_workq); + pf->ipsec.sa_workq = NULL; + } + cn10k_outb_cpt_clean(pf); } EXPORT_SYMBOL(cn10k_ipsec_clean); diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.h b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.h index f3eb5aee4b9d..5ac4de4ae974 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.h +++ b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.h @@ -50,6 +50,20 @@ #define CN10K_CPT_LF_NQX(a) (CPT_LFBASE | 0x400 | (a) << 3) #define CN10K_CPT_LF_CTX_FLUSH (CPT_LFBASE | 0x510) +/* IPSEC Instruction opcodes */ +#define CN10K_IPSEC_MAJOR_OP_WRITE_SA 0x01UL +#define CN10K_IPSEC_MINOR_OP_WRITE_SA 0x09UL + +enum cn10k_cpt_comp_e { + CN10K_CPT_COMP_E_NOTDONE = 0x00, + CN10K_CPT_COMP_E_GOOD = 0x01, + CN10K_CPT_COMP_E_FAULT = 0x02, + CN10K_CPT_COMP_E_HWERR = 0x04, + CN10K_CPT_COMP_E_INSTERR = 0x05, + CN10K_CPT_COMP_E_WARN = 0x06, + CN10K_CPT_COMP_E_MASK = 0x3F +}; + struct cn10k_cpt_inst_queue { u8 *vaddr; u8 *real_vaddr; @@ -69,6 +83,103 @@ struct cn10k_ipsec { u64 io_addr; atomic_t cpt_state; struct cn10k_cpt_inst_queue iq; + + /* SA info */ + u32 sa_size; + u32 outb_sa_count; + struct work_struct sa_work; + struct workqueue_struct *sa_workq; +}; + +/* CN10K IPSEC Security Association (SA) */ +/* SA direction */ +#define CN10K_IPSEC_SA_DIR_INB 0 +#define CN10K_IPSEC_SA_DIR_OUTB 1 +/* SA protocol */ +#define CN10K_IPSEC_SA_IPSEC_PROTO_AH 0 +#define CN10K_IPSEC_SA_IPSEC_PROTO_ESP 1 +/* SA Encryption Type */ +#define CN10K_IPSEC_SA_ENCAP_TYPE_AES_GCM 5 +/* SA IPSEC mode Transport/Tunnel */ +#define CN10K_IPSEC_SA_IPSEC_MODE_TRANSPORT 0 +#define CN10K_IPSEC_SA_IPSEC_MODE_TUNNEL 1 +/* SA AES Key Length */ +#define CN10K_IPSEC_SA_AES_KEY_LEN_128 1 +#define CN10K_IPSEC_SA_AES_KEY_LEN_192 2 +#define CN10K_IPSEC_SA_AES_KEY_LEN_256 3 +/* IV Source */ +#define CN10K_IPSEC_SA_IV_SRC_COUNTER 0 +#define CN10K_IPSEC_SA_IV_SRC_PACKET 3 + +struct cn10k_tx_sa_s { + u64 esn_en : 1; /* W0 */ + u64 rsvd_w0_1_8 : 8; + u64 hw_ctx_off : 7; + u64 ctx_id : 16; + u64 rsvd_w0_32_47 : 16; + u64 ctx_push_size : 7; + u64 rsvd_w0_55 : 1; + u64 ctx_hdr_size : 2; + u64 aop_valid : 1; + u64 rsvd_w0_59 : 1; + u64 ctx_size : 4; + u64 w1; /* W1 */ + u64 sa_valid : 1; /* W2 */ + u64 sa_dir : 1; + u64 rsvd_w2_2_3 : 2; + u64 ipsec_mode : 1; + u64 ipsec_protocol : 1; + u64 aes_key_len : 2; + u64 enc_type : 3; + u64 rsvd_w2_11_19 : 9; + u64 iv_src : 2; + u64 rsvd_w2_22_31 : 10; + u64 rsvd_w2_32_63 : 32; + u64 w3; /* W3 */ + u8 cipher_key[32]; /* W4 - W7 */ + u32 rsvd_w8_0_31; /* W8 : IV */ + u32 iv_gcm_salt; + u64 rsvd_w9_w30[22]; /* W9 - W30 */ + u64 hw_ctx[6]; /* W31 - W36 */ +}; + +/* CPT Instruction Structure */ +struct cpt_inst_s { + u64 nixtxl : 3; /* W0 */ + u64 doneint : 1; + u64 rsvd_w0_4_15 : 12; + u64 dat_offset : 8; + u64 ext_param1 : 8; + u64 nixtx_offset : 20; + u64 rsvd_w0_52_63 : 12; + u64 res_addr; /* W1 */ + u64 tag : 32; /* W2 */ + u64 tt : 2; + u64 grp : 10; + u64 rsvd_w2_44_47 : 4; + u64 rvu_pf_func : 16; + u64 qord : 1; /* W3 */ + u64 rsvd_w3_1_2 : 2; + u64 wqe_ptr : 61; + u64 dlen : 16; /* W4 */ + u64 param2 : 16; + u64 param1 : 16; + u64 opcode_major : 8; + u64 opcode_minor : 8; + u64 dptr; /* W5 */ + u64 rptr; /* W6 */ + u64 cptr : 60; /* W7 */ + u64 ctx_val : 1; + u64 egrp : 3; +}; + +/* CPT Instruction Result Structure */ +struct cpt_res_s { + u64 compcode : 7; /* W0 */ + u64 doneint : 1; + u64 uc_compcode : 8; + u64 uc_info : 48; + u64 esn; /* W1 */ }; /* CPT LF_INPROG Register */ @@ -86,6 +197,9 @@ struct cn10k_ipsec { /* CPT LF_Q_SIZE Register */ #define CPT_LF_Q_SIZE_DIV40 GENMASK_ULL(14, 0) +/* CPT LF CTX Flush Register */ +#define CPT_LF_CTX_FLUSH GENMASK_ULL(45, 0) + #ifdef CONFIG_XFRM_OFFLOAD int cn10k_ipsec_init(struct net_device *netdev); void cn10k_ipsec_clean(struct otx2_nic *pf); -- 2.51.0 From 6a77a158848a8c68930df27b8840660db8531222 Mon Sep 17 00:00:00 2001 From: Bharat Bhushan Date: Wed, 4 Dec 2024 11:26:57 +0530 Subject: [PATCH 02/16] cn10k-ipsec: Process outbound ipsec crypto offload Prepare and submit crypto hardware (CPT) instruction for outbound ipsec crypto offload. The CPT instruction have authentication offset, IV offset and encapsulation offset in input packet. Also provide SA context pointer which have details about algo, keys, salt etc. Crypto hardware encrypt, authenticate and provide the ESP packet to networking hardware. Signed-off-by: Bharat Bhushan Signed-off-by: David S. Miller --- .../marvell/octeontx2/nic/cn10k_ipsec.c | 219 ++++++++++++++++++ .../marvell/octeontx2/nic/cn10k_ipsec.h | 42 ++++ .../marvell/octeontx2/nic/otx2_common.c | 23 ++ .../marvell/octeontx2/nic/otx2_common.h | 3 + .../ethernet/marvell/octeontx2/nic/otx2_pf.c | 2 + .../marvell/octeontx2/nic/otx2_txrx.c | 32 ++- .../marvell/octeontx2/nic/otx2_txrx.h | 3 + 7 files changed, 321 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.c b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.c index 106a241625dc..9a9b06f4c2cc 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.c @@ -7,10 +7,15 @@ #include #include #include +#include +#include #include "otx2_common.h" +#include "otx2_struct.h" #include "cn10k_ipsec.h" +DEFINE_STATIC_KEY_FALSE(cn10k_ipsec_sa_enabled); + static bool is_dev_support_ipsec_offload(struct pci_dev *pdev) { return is_dev_cn10ka_b0(pdev) || is_dev_cn10kb(pdev); @@ -690,6 +695,9 @@ static int cn10k_ipsec_outb_add_state(struct xfrm_state *x, } x->xso.offload_handle = (unsigned long)sa_info; + /* Enable static branch when first SA setup */ + if (!pf->ipsec.outb_sa_count) + static_branch_enable(&cn10k_ipsec_sa_enabled); pf->ipsec.outb_sa_count++; return 0; } @@ -749,6 +757,8 @@ static void cn10k_ipsec_sa_wq_handler(struct work_struct *work) sa_work); struct otx2_nic *pf = container_of(ipsec, struct otx2_nic, ipsec); + /* Disable static branch when no more SA enabled */ + static_branch_disable(&cn10k_ipsec_sa_enabled); rtnl_lock(); netdev_update_features(pf->netdev); rtnl_unlock(); @@ -822,3 +832,212 @@ void cn10k_ipsec_clean(struct otx2_nic *pf) cn10k_outb_cpt_clean(pf); } EXPORT_SYMBOL(cn10k_ipsec_clean); + +static u16 cn10k_ipsec_get_ip_data_len(struct xfrm_state *x, + struct sk_buff *skb) +{ + struct ipv6hdr *ipv6h; + struct iphdr *iph; + u8 *src; + + src = (u8 *)skb->data + ETH_HLEN; + + if (x->props.family == AF_INET) { + iph = (struct iphdr *)src; + return ntohs(iph->tot_len); + } + + ipv6h = (struct ipv6hdr *)src; + return ntohs(ipv6h->payload_len) + sizeof(struct ipv6hdr); +} + +/* Prepare CPT and NIX SQE scatter/gather subdescriptor structure. + * SG of NIX and CPT are same in size. + * Layout of a NIX SQE and CPT SG entry: + * ----------------------------- + * | CPT Scatter Gather | + * | (SQE SIZE) | + * | | + * ----------------------------- + * | NIX SQE | + * | (SQE SIZE) | + * | | + * ----------------------------- + */ +bool otx2_sqe_add_sg_ipsec(struct otx2_nic *pfvf, struct otx2_snd_queue *sq, + struct sk_buff *skb, int num_segs, int *offset) +{ + struct cpt_sg_s *cpt_sg = NULL; + struct nix_sqe_sg_s *sg = NULL; + u64 dma_addr, *iova = NULL; + u64 *cpt_iova = NULL; + u16 *sg_lens = NULL; + int seg, len; + + sq->sg[sq->head].num_segs = 0; + cpt_sg = (struct cpt_sg_s *)(sq->sqe_base - sq->sqe_size); + + for (seg = 0; seg < num_segs; seg++) { + if ((seg % MAX_SEGS_PER_SG) == 0) { + sg = (struct nix_sqe_sg_s *)(sq->sqe_base + *offset); + sg->ld_type = NIX_SEND_LDTYPE_LDD; + sg->subdc = NIX_SUBDC_SG; + sg->segs = 0; + sg_lens = (void *)sg; + iova = (void *)sg + sizeof(*sg); + /* Next subdc always starts at a 16byte boundary. + * So if sg->segs is whether 2 or 3, offset += 16bytes. + */ + if ((num_segs - seg) >= (MAX_SEGS_PER_SG - 1)) + *offset += sizeof(*sg) + (3 * sizeof(u64)); + else + *offset += sizeof(*sg) + sizeof(u64); + + cpt_sg += (seg / MAX_SEGS_PER_SG) * 4; + cpt_iova = (void *)cpt_sg + sizeof(*cpt_sg); + } + dma_addr = otx2_dma_map_skb_frag(pfvf, skb, seg, &len); + if (dma_mapping_error(pfvf->dev, dma_addr)) + return false; + + sg_lens[seg % MAX_SEGS_PER_SG] = len; + sg->segs++; + *iova++ = dma_addr; + *cpt_iova++ = dma_addr; + + /* Save DMA mapping info for later unmapping */ + sq->sg[sq->head].dma_addr[seg] = dma_addr; + sq->sg[sq->head].size[seg] = len; + sq->sg[sq->head].num_segs++; + + *cpt_sg = *(struct cpt_sg_s *)sg; + cpt_sg->rsvd_63_50 = 0; + } + + sq->sg[sq->head].skb = (u64)skb; + return true; +} + +static u16 cn10k_ipsec_get_param1(u8 iv_offset) +{ + u16 param1_val; + + /* Set Crypto mode, disable L3/L4 checksum */ + param1_val = CN10K_IPSEC_INST_PARAM1_DIS_L4_CSUM | + CN10K_IPSEC_INST_PARAM1_DIS_L3_CSUM; + param1_val |= (u16)iv_offset << CN10K_IPSEC_INST_PARAM1_IV_OFFSET_SHIFT; + return param1_val; +} + +bool cn10k_ipsec_transmit(struct otx2_nic *pf, struct netdev_queue *txq, + struct otx2_snd_queue *sq, struct sk_buff *skb, + int num_segs, int size) +{ + struct cpt_inst_s inst; + struct cpt_res_s *res; + struct xfrm_state *x; + struct qmem *sa_info; + dma_addr_t dptr_iova; + struct sec_path *sp; + u8 encap_offset; + u8 auth_offset; + u8 gthr_size; + u8 iv_offset; + u16 dlen; + + /* Check for IPSEC offload enabled */ + if (!(pf->flags & OTX2_FLAG_IPSEC_OFFLOAD_ENABLED)) + goto drop; + + sp = skb_sec_path(skb); + if (unlikely(!sp->len)) + goto drop; + + x = xfrm_input_state(skb); + if (unlikely(!x)) + goto drop; + + if (x->props.mode != XFRM_MODE_TRANSPORT && + x->props.mode != XFRM_MODE_TUNNEL) + goto drop; + + dlen = cn10k_ipsec_get_ip_data_len(x, skb); + if (dlen == 0 && netif_msg_tx_err(pf)) { + netdev_err(pf->netdev, "Invalid IP header, ip-length zero\n"); + goto drop; + } + + /* Check for valid SA context */ + sa_info = (struct qmem *)x->xso.offload_handle; + if (!sa_info) + goto drop; + + memset(&inst, 0, sizeof(struct cpt_inst_s)); + + /* Get authentication offset */ + if (x->props.family == AF_INET) + auth_offset = sizeof(struct iphdr); + else + auth_offset = sizeof(struct ipv6hdr); + + /* IV offset is after ESP header */ + iv_offset = auth_offset + sizeof(struct ip_esp_hdr); + /* Encap will start after IV */ + encap_offset = iv_offset + GCM_RFC4106_IV_SIZE; + + /* CPT Instruction word-1 */ + res = (struct cpt_res_s *)(sq->cpt_resp->base + (64 * sq->head)); + res->compcode = 0; + inst.res_addr = sq->cpt_resp->iova + (64 * sq->head); + + /* CPT Instruction word-2 */ + inst.rvu_pf_func = pf->pcifunc; + + /* CPT Instruction word-3: + * Set QORD to force CPT_RES_S write completion + */ + inst.qord = 1; + + /* CPT Instruction word-4 */ + /* inst.dlen should not include ICV length */ + inst.dlen = dlen + ETH_HLEN - (x->aead->alg_icv_len / 8); + inst.opcode_major = CN10K_IPSEC_MAJOR_OP_OUTB_IPSEC; + inst.param1 = cn10k_ipsec_get_param1(iv_offset); + + inst.param2 = encap_offset << + CN10K_IPSEC_INST_PARAM2_ENC_DATA_OFFSET_SHIFT; + inst.param2 |= (u16)auth_offset << + CN10K_IPSEC_INST_PARAM2_AUTH_DATA_OFFSET_SHIFT; + + /* CPT Instruction word-5 */ + gthr_size = num_segs / MAX_SEGS_PER_SG; + gthr_size = (num_segs % MAX_SEGS_PER_SG) ? gthr_size + 1 : gthr_size; + + gthr_size &= 0xF; + dptr_iova = (sq->sqe_ring->iova + (sq->head * (sq->sqe_size * 2))); + inst.dptr = dptr_iova | ((u64)gthr_size << 60); + + /* CPT Instruction word-6 */ + inst.rptr = inst.dptr; + + /* CPT Instruction word-7 */ + inst.cptr = sa_info->iova; + inst.ctx_val = 1; + inst.egrp = CN10K_DEF_CPT_IPSEC_EGRP; + + /* CPT Instruction word-0 */ + inst.nixtxl = (size / 16) - 1; + inst.dat_offset = ETH_HLEN; + inst.nixtx_offset = sq->sqe_size; + + netdev_tx_sent_queue(txq, skb->len); + + /* Finally Flush the CPT instruction */ + sq->head++; + sq->head &= (sq->sqe_cnt - 1); + cn10k_cpt_inst_flush(pf, &inst, sizeof(struct cpt_inst_s)); + return true; +drop: + dev_kfree_skb_any(skb); + return false; +} diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.h b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.h index 5ac4de4ae974..9965df0faa3e 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.h +++ b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.h @@ -9,6 +9,8 @@ #include +DECLARE_STATIC_KEY_FALSE(cn10k_ipsec_sa_enabled); + /* CPT instruction size in bytes */ #define CN10K_CPT_INST_SIZE 64 @@ -53,6 +55,7 @@ /* IPSEC Instruction opcodes */ #define CN10K_IPSEC_MAJOR_OP_WRITE_SA 0x01UL #define CN10K_IPSEC_MINOR_OP_WRITE_SA 0x09UL +#define CN10K_IPSEC_MAJOR_OP_OUTB_IPSEC 0x2AUL enum cn10k_cpt_comp_e { CN10K_CPT_COMP_E_NOTDONE = 0x00, @@ -143,6 +146,16 @@ struct cn10k_tx_sa_s { u64 hw_ctx[6]; /* W31 - W36 */ }; +/* CPT instruction parameter-1 */ +#define CN10K_IPSEC_INST_PARAM1_DIS_L4_CSUM 0x1 +#define CN10K_IPSEC_INST_PARAM1_DIS_L3_CSUM 0x2 +#define CN10K_IPSEC_INST_PARAM1_CRYPTO_MODE 0x20 +#define CN10K_IPSEC_INST_PARAM1_IV_OFFSET_SHIFT 8 + +/* CPT instruction parameter-2 */ +#define CN10K_IPSEC_INST_PARAM2_ENC_DATA_OFFSET_SHIFT 0 +#define CN10K_IPSEC_INST_PARAM2_AUTH_DATA_OFFSET_SHIFT 8 + /* CPT Instruction Structure */ struct cpt_inst_s { u64 nixtxl : 3; /* W0 */ @@ -182,6 +195,15 @@ struct cpt_res_s { u64 esn; /* W1 */ }; +/* CPT SG structure */ +struct cpt_sg_s { + u64 seg1_size : 16; + u64 seg2_size : 16; + u64 seg3_size : 16; + u64 segs : 2; + u64 rsvd_63_50 : 14; +}; + /* CPT LF_INPROG Register */ #define CPT_LF_INPROG_INFLIGHT GENMASK_ULL(8, 0) #define CPT_LF_INPROG_GRB_CNT GENMASK_ULL(39, 32) @@ -204,6 +226,11 @@ struct cpt_res_s { int cn10k_ipsec_init(struct net_device *netdev); void cn10k_ipsec_clean(struct otx2_nic *pf); int cn10k_ipsec_ethtool_init(struct net_device *netdev, bool enable); +bool otx2_sqe_add_sg_ipsec(struct otx2_nic *pfvf, struct otx2_snd_queue *sq, + struct sk_buff *skb, int num_segs, int *offset); +bool cn10k_ipsec_transmit(struct otx2_nic *pf, struct netdev_queue *txq, + struct otx2_snd_queue *sq, struct sk_buff *skb, + int num_segs, int size); #else static inline __maybe_unused int cn10k_ipsec_init(struct net_device *netdev) { @@ -219,5 +246,20 @@ int cn10k_ipsec_ethtool_init(struct net_device *netdev, bool enable) { return 0; } + +static inline bool __maybe_unused +otx2_sqe_add_sg_ipsec(struct otx2_nic *pfvf, struct otx2_snd_queue *sq, + struct sk_buff *skb, int num_segs, int *offset) +{ + return true; +} + +static inline bool __maybe_unused +cn10k_ipsec_transmit(struct otx2_nic *pf, struct netdev_queue *txq, + struct otx2_snd_queue *sq, struct sk_buff *skb, + int num_segs, int size) +{ + return true; +} #endif #endif // CN10K_IPSEC_H diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c index 4c8774899eaf..bf56888e7fe7 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c @@ -970,6 +970,29 @@ int otx2_sq_init(struct otx2_nic *pfvf, u16 qidx, u16 sqb_aura) if (err) return err; + /* Allocate memory for NIX SQE (which includes NIX SG) and CPT SG. + * SG of NIX and CPT are same in size. Allocate memory for CPT SG + * same as NIX SQE for base address alignment. + * Layout of a NIX SQE and CPT SG entry: + * ----------------------------- + * | CPT Scatter Gather | + * | (SQE SIZE) | + * | | + * ----------------------------- + * | NIX SQE | + * | (SQE SIZE) | + * | | + * ----------------------------- + */ + err = qmem_alloc(pfvf->dev, &sq->sqe_ring, qset->sqe_cnt, + sq->sqe_size * 2); + if (err) + return err; + + err = qmem_alloc(pfvf->dev, &sq->cpt_resp, qset->sqe_cnt, 64); + if (err) + return err; + if (qidx < pfvf->hw.tx_queues) { err = qmem_alloc(pfvf->dev, &sq->tso_hdrs, qset->sqe_cnt, TSO_HEADER_SIZE); diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h index 5e2da67d58bb..44d737a0dd09 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h @@ -57,6 +57,9 @@ #define NIX_PF_PFC_PRIO_MAX 8 #endif +/* Number of segments per SG structure */ +#define MAX_SEGS_PER_SG 3 + enum arua_mapped_qtypes { AURA_NIX_RQ, AURA_NIX_SQ, diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c index 2f652035d854..e1dde93e8af8 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c @@ -1485,6 +1485,8 @@ static void otx2_free_sq_res(struct otx2_nic *pf) if (!sq->sqe) continue; qmem_free(pf->dev, sq->sqe); + qmem_free(pf->dev, sq->sqe_ring); + qmem_free(pf->dev, sq->cpt_resp); qmem_free(pf->dev, sq->tso_hdrs); kfree(sq->sg); kfree(sq->sqb_ptrs); diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c index a49041e55c33..4e0133d1d892 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "otx2_reg.h" #include "otx2_common.h" @@ -32,6 +33,17 @@ static bool otx2_xdp_rcv_pkt_handler(struct otx2_nic *pfvf, struct otx2_cq_queue *cq, bool *need_xdp_flush); +static void otx2_sq_set_sqe_base(struct otx2_snd_queue *sq, + struct sk_buff *skb) +{ + if (static_branch_unlikely(&cn10k_ipsec_sa_enabled) && + (xfrm_offload(skb))) + sq->sqe_base = sq->sqe_ring->base + sq->sqe_size + + (sq->head * (sq->sqe_size * 2)); + else + sq->sqe_base = sq->sqe->base; +} + static int otx2_nix_cq_op_status(struct otx2_nic *pfvf, struct otx2_cq_queue *cq) { @@ -593,7 +605,6 @@ void otx2_sqe_flush(void *dev, struct otx2_snd_queue *sq, sq->head &= (sq->sqe_cnt - 1); } -#define MAX_SEGS_PER_SG 3 /* Add SQE scatter/gather subdescriptor structure */ static bool otx2_sqe_add_sg(struct otx2_nic *pfvf, struct otx2_snd_queue *sq, struct sk_buff *skb, int num_segs, int *offset) @@ -1129,6 +1140,7 @@ bool otx2_sq_append_skb(void *dev, struct netdev_queue *txq, int offset, num_segs, free_desc; struct nix_sqe_hdr_s *sqe_hdr; struct otx2_nic *pfvf = dev; + bool ret; /* Check if there is enough room between producer * and consumer index. @@ -1145,6 +1157,7 @@ bool otx2_sq_append_skb(void *dev, struct netdev_queue *txq, /* If SKB doesn't fit in a single SQE, linearize it. * TODO: Consider adding JUMP descriptor instead. */ + if (unlikely(num_segs > OTX2_MAX_FRAGS_IN_SQE)) { if (__skb_linearize(skb)) { dev_kfree_skb_any(skb); @@ -1164,6 +1177,9 @@ bool otx2_sq_append_skb(void *dev, struct netdev_queue *txq, return true; } + /* Set sqe base address */ + otx2_sq_set_sqe_base(sq, skb); + /* Set SQE's SEND_HDR. * Do not clear the first 64bit as it contains constant info. */ @@ -1176,7 +1192,13 @@ bool otx2_sq_append_skb(void *dev, struct netdev_queue *txq, otx2_sqe_add_ext(pfvf, sq, skb, &offset); /* Add SG subdesc with data frags */ - if (!otx2_sqe_add_sg(pfvf, sq, skb, num_segs, &offset)) { + if (static_branch_unlikely(&cn10k_ipsec_sa_enabled) && + (xfrm_offload(skb))) + ret = otx2_sqe_add_sg_ipsec(pfvf, sq, skb, num_segs, &offset); + else + ret = otx2_sqe_add_sg(pfvf, sq, skb, num_segs, &offset); + + if (!ret) { otx2_dma_unmap_skb_frags(pfvf, &sq->sg[sq->head]); return false; } @@ -1185,11 +1207,15 @@ bool otx2_sq_append_skb(void *dev, struct netdev_queue *txq, sqe_hdr->sizem1 = (offset / 16) - 1; + if (static_branch_unlikely(&cn10k_ipsec_sa_enabled) && + (xfrm_offload(skb))) + return cn10k_ipsec_transmit(pfvf, txq, sq, skb, num_segs, + offset); + netdev_tx_sent_queue(txq, skb->len); /* Flush SQE to HW */ pfvf->hw_ops->sqe_flush(pfvf, sq, offset, qidx); - return true; } EXPORT_SYMBOL(otx2_sq_append_skb); diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.h b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.h index e1db5f961877..d23810963fdb 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.h +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.h @@ -101,6 +101,9 @@ struct otx2_snd_queue { struct queue_stats stats; u16 sqb_count; u64 *sqb_ptrs; + /* SQE ring and CPT response queue for Inline IPSEC */ + struct qmem *sqe_ring; + struct qmem *cpt_resp; } ____cacheline_aligned_in_smp; enum cq_type { -- 2.51.0 From 32188be805d052a91b999a723fd93698d83a7fa5 Mon Sep 17 00:00:00 2001 From: Bharat Bhushan Date: Wed, 4 Dec 2024 11:26:58 +0530 Subject: [PATCH 03/16] cn10k-ipsec: Allow ipsec crypto offload for skb with SA Allow to use hardware offload for outbound ipsec crypto mode if security association (SA) is set for a given skb. Signed-off-by: Bharat Bhushan Signed-off-by: David S. Miller --- .../ethernet/marvell/octeontx2/nic/cn10k_ipsec.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.c b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.c index 9a9b06f4c2cc..e9bf4632695e 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.c @@ -746,9 +746,24 @@ static void cn10k_ipsec_del_state(struct xfrm_state *x) queue_work(pf->ipsec.sa_workq, &pf->ipsec.sa_work); } +static bool cn10k_ipsec_offload_ok(struct sk_buff *skb, struct xfrm_state *x) +{ + if (x->props.family == AF_INET) { + /* Offload with IPv4 options is not supported yet */ + if (ip_hdr(skb)->ihl > 5) + return false; + } else { + /* Offload with IPv6 extension headers is not support yet */ + if (ipv6_ext_hdr(ipv6_hdr(skb)->nexthdr)) + return false; + } + return true; +} + static const struct xfrmdev_ops cn10k_ipsec_xfrmdev_ops = { .xdo_dev_state_add = cn10k_ipsec_add_state, .xdo_dev_state_delete = cn10k_ipsec_del_state, + .xdo_dev_offload_ok = cn10k_ipsec_offload_ok, }; static void cn10k_ipsec_sa_wq_handler(struct work_struct *work) -- 2.51.0 From b3ae3dc3a30f3de78c0c3675ea980639b9ba212c Mon Sep 17 00:00:00 2001 From: Bharat Bhushan Date: Wed, 4 Dec 2024 11:26:59 +0530 Subject: [PATCH 04/16] cn10k-ipsec: Enable outbound ipsec crypto offload Hardware is initialized and netdev transmit flow is hooked up for outbound ipsec crypto offload, so finally enable ipsec offload. Signed-off-by: Bharat Bhushan Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.c b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.c index e9bf4632695e..c333e04daad3 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.c @@ -821,10 +821,10 @@ int cn10k_ipsec_init(struct net_device *netdev) return -ENOMEM; } - /* Set xfrm device ops - * NETIF_F_HW_ESP is not set as ipsec setup is not yet complete. - */ + /* Set xfrm device ops */ netdev->xfrmdev_ops = &cn10k_ipsec_xfrmdev_ops; + netdev->hw_features |= NETIF_F_HW_ESP; + netdev->hw_enc_features |= NETIF_F_HW_ESP; cn10k_cpt_device_set_unavailable(pf); return 0; -- 2.51.0 From d1fd972914239996dbd15c5142d7f6e09d95a002 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:46:29 +0000 Subject: [PATCH 05/16] ktime: Add us_to_ktime() Add a us_to_ktime() helper to go with ms_to_ktime() and ns_to_ktime(). Signed-off-by: David Howells cc: Thomas Gleixner cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20241204074710.990092-2-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- include/linux/ktime.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/ktime.h b/include/linux/ktime.h index 3a4e723eae0f..383ed9985802 100644 --- a/include/linux/ktime.h +++ b/include/linux/ktime.h @@ -222,6 +222,11 @@ static inline ktime_t ns_to_ktime(u64 ns) return ns; } +static inline ktime_t us_to_ktime(u64 us) +{ + return us * NSEC_PER_USEC; +} + static inline ktime_t ms_to_ktime(u64 ms) { return ms * NSEC_PER_MSEC; -- 2.51.0 From 0e56ebde245e4799ce74d38419426f2a80d39950 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:46:30 +0000 Subject: [PATCH 06/16] rxrpc: Fix handling of received connection abort Fix the handling of a connection abort that we've received. Though the abort is at the connection level, it needs propagating to the calls on that connection. Whilst the propagation bit is performed, the calls aren't then woken up to go and process their termination, and as no further input is forthcoming, they just hang. Also add some tracing for the logging of connection aborts. Fixes: 248f219cb8bc ("rxrpc: Rewrite the data and ack handling code") Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20241204074710.990092-3-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- include/trace/events/rxrpc.h | 25 +++++++++++++++++++++++++ net/rxrpc/conn_event.c | 12 ++++++++---- 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index d03e0bd8c028..27c23873c881 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -117,6 +117,7 @@ #define rxrpc_call_poke_traces \ EM(rxrpc_call_poke_abort, "Abort") \ EM(rxrpc_call_poke_complete, "Compl") \ + EM(rxrpc_call_poke_conn_abort, "Conn-abort") \ EM(rxrpc_call_poke_error, "Error") \ EM(rxrpc_call_poke_idle, "Idle") \ EM(rxrpc_call_poke_set_timeout, "Set-timo") \ @@ -282,6 +283,7 @@ EM(rxrpc_call_see_activate_client, "SEE act-clnt") \ EM(rxrpc_call_see_connect_failed, "SEE con-fail") \ EM(rxrpc_call_see_connected, "SEE connect ") \ + EM(rxrpc_call_see_conn_abort, "SEE conn-abt") \ EM(rxrpc_call_see_disconnected, "SEE disconn ") \ EM(rxrpc_call_see_distribute_error, "SEE dist-err") \ EM(rxrpc_call_see_input, "SEE input ") \ @@ -981,6 +983,29 @@ TRACE_EVENT(rxrpc_rx_abort, __entry->abort_code) ); +TRACE_EVENT(rxrpc_rx_conn_abort, + TP_PROTO(const struct rxrpc_connection *conn, const struct sk_buff *skb), + + TP_ARGS(conn, skb), + + TP_STRUCT__entry( + __field(unsigned int, conn) + __field(rxrpc_serial_t, serial) + __field(u32, abort_code) + ), + + TP_fast_assign( + __entry->conn = conn->debug_id; + __entry->serial = rxrpc_skb(skb)->hdr.serial; + __entry->abort_code = skb->priority; + ), + + TP_printk("C=%08x ABORT %08x ac=%d", + __entry->conn, + __entry->serial, + __entry->abort_code) + ); + TRACE_EVENT(rxrpc_rx_challenge, TP_PROTO(struct rxrpc_connection *conn, rxrpc_serial_t serial, u32 version, u32 nonce, u32 min_level), diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 598b4ee389fc..2a1396cd892f 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -63,11 +63,12 @@ int rxrpc_abort_conn(struct rxrpc_connection *conn, struct sk_buff *skb, /* * Mark a connection as being remotely aborted. */ -static bool rxrpc_input_conn_abort(struct rxrpc_connection *conn, +static void rxrpc_input_conn_abort(struct rxrpc_connection *conn, struct sk_buff *skb) { - return rxrpc_set_conn_aborted(conn, skb, skb->priority, -ECONNABORTED, - RXRPC_CALL_REMOTELY_ABORTED); + trace_rxrpc_rx_conn_abort(conn, skb); + rxrpc_set_conn_aborted(conn, skb, skb->priority, -ECONNABORTED, + RXRPC_CALL_REMOTELY_ABORTED); } /* @@ -202,11 +203,14 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn) for (i = 0; i < RXRPC_MAXCALLS; i++) { call = conn->channels[i].call; - if (call) + if (call) { + rxrpc_see_call(call, rxrpc_call_see_conn_abort); rxrpc_set_call_completion(call, conn->completion, conn->abort_code, conn->error); + rxrpc_poke_call(call, rxrpc_call_poke_conn_abort); + } } _leave(""); -- 2.51.0 From 29e03ec757292e55fa0f7efa051c84ddc4f3e668 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:46:31 +0000 Subject: [PATCH 07/16] rxrpc: Use umin() and umax() rather than min_t()/max_t() where possible Use umin() and umax() rather than min_t()/max_t() where the type specified is an unsigned type. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20241204074710.990092-4-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- net/rxrpc/call_event.c | 5 ++--- net/rxrpc/call_object.c | 4 ++-- net/rxrpc/conn_client.c | 2 +- net/rxrpc/input.c | 13 +++++-------- net/rxrpc/insecure.c | 2 +- net/rxrpc/io_thread.c | 2 +- net/rxrpc/output.c | 2 +- net/rxrpc/rtt.c | 6 +++--- net/rxrpc/rxkad.c | 6 +++--- net/rxrpc/rxperf.c | 2 +- net/rxrpc/sendmsg.c | 2 +- 11 files changed, 21 insertions(+), 25 deletions(-) diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 7bbb68504766..c4754cc9b8d4 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -233,8 +233,7 @@ static void rxrpc_close_tx_phase(struct rxrpc_call *call) static bool rxrpc_tx_window_has_space(struct rxrpc_call *call) { - unsigned int winsize = min_t(unsigned int, call->tx_winsize, - call->cong_cwnd + call->cong_extra); + unsigned int winsize = umin(call->tx_winsize, call->cong_cwnd + call->cong_extra); rxrpc_seq_t window = call->acks_hard_ack, wtop = window + winsize; rxrpc_seq_t tx_top = call->tx_top; int space; @@ -467,7 +466,7 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) } else { unsigned long nowj = jiffies, delayj, nextj; - delayj = max(nsecs_to_jiffies(delay), 1); + delayj = umax(nsecs_to_jiffies(delay), 1); nextj = nowj + delayj; if (time_before(nextj, call->timer.expires) || !timer_pending(&call->timer)) { diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index f9e983a12c14..0df647d1d3a2 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -220,9 +220,9 @@ static struct rxrpc_call *rxrpc_alloc_client_call(struct rxrpc_sock *rx, __set_bit(RXRPC_CALL_EXCLUSIVE, &call->flags); if (p->timeouts.normal) - call->next_rx_timo = min(p->timeouts.normal, 1); + call->next_rx_timo = umin(p->timeouts.normal, 1); if (p->timeouts.idle) - call->next_req_timo = min(p->timeouts.idle, 1); + call->next_req_timo = umin(p->timeouts.idle, 1); if (p->timeouts.hard) call->hard_timo = p->timeouts.hard; diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index bb11e8289d6d..86fb18bcd188 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -231,7 +231,7 @@ static bool rxrpc_may_reuse_conn(struct rxrpc_connection *conn) distance = id - id_cursor; if (distance < 0) distance = -distance; - limit = max_t(unsigned long, atomic_read(&rxnet->nr_conns) * 4, 1024); + limit = umax(atomic_read(&rxnet->nr_conns) * 4, 1024); if (distance > limit) goto mark_dont_reuse; diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 16d49a861dbb..49e35be7dc13 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -44,8 +44,7 @@ static void rxrpc_congestion_management(struct rxrpc_call *call, if (test_and_clear_bit(RXRPC_CALL_RETRANS_TIMEOUT, &call->flags)) { summary->retrans_timeo = true; - call->cong_ssthresh = max_t(unsigned int, - summary->flight_size / 2, 2); + call->cong_ssthresh = umax(summary->flight_size / 2, 2); cwnd = 1; if (cwnd >= call->cong_ssthresh && call->cong_mode == RXRPC_CALL_SLOW_START) { @@ -113,8 +112,7 @@ static void rxrpc_congestion_management(struct rxrpc_call *call, change = rxrpc_cong_begin_retransmission; call->cong_mode = RXRPC_CALL_FAST_RETRANSMIT; - call->cong_ssthresh = max_t(unsigned int, - summary->flight_size / 2, 2); + call->cong_ssthresh = umax(summary->flight_size / 2, 2); cwnd = call->cong_ssthresh + 3; call->cong_extra = 0; call->cong_dup_acks = 0; @@ -206,9 +204,8 @@ void rxrpc_congestion_degrade(struct rxrpc_call *call) rxrpc_inc_stat(call->rxnet, stat_tx_data_cwnd_reset); call->tx_last_sent = now; call->cong_mode = RXRPC_CALL_SLOW_START; - call->cong_ssthresh = max_t(unsigned int, call->cong_ssthresh, - call->cong_cwnd * 3 / 4); - call->cong_cwnd = max_t(unsigned int, call->cong_cwnd / 2, RXRPC_MIN_CWND); + call->cong_ssthresh = umax(call->cong_ssthresh, call->cong_cwnd * 3 / 4); + call->cong_cwnd = umax(call->cong_cwnd / 2, RXRPC_MIN_CWND); } /* @@ -709,7 +706,7 @@ static void rxrpc_input_ack_trailer(struct rxrpc_call *call, struct sk_buff *skb call->tx_winsize = rwind; } - mtu = min(ntohl(trailer->maxMTU), ntohl(trailer->ifMTU)); + mtu = umin(ntohl(trailer->maxMTU), ntohl(trailer->ifMTU)); peer = call->peer; if (mtu < peer->maxdata) { diff --git a/net/rxrpc/insecure.c b/net/rxrpc/insecure.c index 6716c021a532..751eb621021d 100644 --- a/net/rxrpc/insecure.c +++ b/net/rxrpc/insecure.c @@ -19,7 +19,7 @@ static int none_init_connection_security(struct rxrpc_connection *conn, */ static struct rxrpc_txbuf *none_alloc_txbuf(struct rxrpc_call *call, size_t remain, gfp_t gfp) { - return rxrpc_alloc_data_txbuf(call, min_t(size_t, remain, RXRPC_JUMBO_DATALEN), 1, gfp); + return rxrpc_alloc_data_txbuf(call, umin(remain, RXRPC_JUMBO_DATALEN), 1, gfp); } static int none_secure_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) diff --git a/net/rxrpc/io_thread.c b/net/rxrpc/io_thread.c index 07c74c77d802..7af5adf53b25 100644 --- a/net/rxrpc/io_thread.c +++ b/net/rxrpc/io_thread.c @@ -558,7 +558,7 @@ int rxrpc_io_thread(void *data) } timeout = nsecs_to_jiffies(delay_ns); - timeout = max(timeout, 1UL); + timeout = umax(timeout, 1); schedule_timeout(timeout); __set_current_state(TASK_RUNNING); continue; diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 5ea9601efd05..85112ea31a39 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -118,7 +118,7 @@ static void rxrpc_fill_out_ack(struct rxrpc_call *call, txb->kvec[1].iov_len = ack->nAcks; wrap = RXRPC_SACK_SIZE - sack; - to = min_t(unsigned int, ack->nAcks, RXRPC_SACK_SIZE); + to = umin(ack->nAcks, RXRPC_SACK_SIZE); if (sack + ack->nAcks <= RXRPC_SACK_SIZE) { memcpy(sackp, call->ackr_sack_table + sack, ack->nAcks); diff --git a/net/rxrpc/rtt.c b/net/rxrpc/rtt.c index cdab7b7d08a0..6dc51486b5a6 100644 --- a/net/rxrpc/rtt.c +++ b/net/rxrpc/rtt.c @@ -27,7 +27,7 @@ static u32 __rxrpc_set_rto(const struct rxrpc_peer *peer) static u32 rxrpc_bound_rto(u32 rto) { - return min(rto, RXRPC_RTO_MAX); + return umin(rto, RXRPC_RTO_MAX); } /* @@ -91,11 +91,11 @@ static void rxrpc_rtt_estimator(struct rxrpc_peer *peer, long sample_rtt_us) /* no previous measure. */ srtt = m << 3; /* take the measured time to be rtt */ peer->mdev_us = m << 1; /* make sure rto = 3*rtt */ - peer->rttvar_us = max(peer->mdev_us, rxrpc_rto_min_us(peer)); + peer->rttvar_us = umax(peer->mdev_us, rxrpc_rto_min_us(peer)); peer->mdev_max_us = peer->rttvar_us; } - peer->srtt_us = max(1U, srtt); + peer->srtt_us = umax(srtt, 1); } /* diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 48a1475e6b06..e3194d73dd84 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -150,11 +150,11 @@ static struct rxrpc_txbuf *rxkad_alloc_txbuf(struct rxrpc_call *call, size_t rem struct rxrpc_txbuf *txb; size_t shdr, space; - remain = min(remain, 65535 - sizeof(struct rxrpc_wire_header)); + remain = umin(remain, 65535 - sizeof(struct rxrpc_wire_header)); switch (call->conn->security_level) { default: - space = min_t(size_t, remain, RXRPC_JUMBO_DATALEN); + space = umin(remain, RXRPC_JUMBO_DATALEN); return rxrpc_alloc_data_txbuf(call, space, 1, gfp); case RXRPC_SECURITY_AUTH: shdr = sizeof(struct rxkad_level1_hdr); @@ -164,7 +164,7 @@ static struct rxrpc_txbuf *rxkad_alloc_txbuf(struct rxrpc_call *call, size_t rem break; } - space = min_t(size_t, round_down(RXRPC_JUMBO_DATALEN, RXKAD_ALIGN), remain + shdr); + space = umin(round_down(RXRPC_JUMBO_DATALEN, RXKAD_ALIGN), remain + shdr); space = round_up(space, RXKAD_ALIGN); txb = rxrpc_alloc_data_txbuf(call, space, RXKAD_ALIGN, gfp); diff --git a/net/rxrpc/rxperf.c b/net/rxrpc/rxperf.c index 085e7892d310..7ef93407be83 100644 --- a/net/rxrpc/rxperf.c +++ b/net/rxrpc/rxperf.c @@ -503,7 +503,7 @@ static int rxperf_process_call(struct rxperf_call *call) reply_len + sizeof(rxperf_magic_cookie)); while (reply_len > 0) { - len = min_t(size_t, reply_len, PAGE_SIZE); + len = umin(reply_len, PAGE_SIZE); bvec_set_page(&bv, ZERO_PAGE(0), len, 0); iov_iter_bvec(&msg.msg_iter, WRITE, &bv, 1, len); msg.msg_flags = MSG_MORE; diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 6abb8eec1b2b..b04afb5df241 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -360,7 +360,7 @@ reload: /* append next segment of data to the current buffer */ if (msg_data_left(msg) > 0) { - size_t copy = min_t(size_t, txb->space, msg_data_left(msg)); + size_t copy = umin(txb->space, msg_data_left(msg)); _debug("add %zu", copy); if (!copy_from_iter_full(txb->kvec[0].iov_base + txb->offset, -- 2.51.0 From efa95c32352b2ac7ff09d680144e22c0f25244cb Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:46:32 +0000 Subject: [PATCH 08/16] rxrpc: Clean up Tx header flags generation handling Clean up the generation of the header flags when building packet headers for transmission: (1) Assemble the flags in a local variable rather than in the txb->flags. (2) Do the flags masking and JUMBO-PACKET setting in one bit of code for both the main header and the jumbo headers. (3) Generate the REQUEST-ACK flag afresh each time. There's a possibility we might want to do jumbo retransmission packets in future. (4) Pass the local flags variable to the rxrpc_tx_data tracepoint rather than the combination of the txb flags and the wire header flags (the latter belong only to the first subpacket). Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20241204074710.990092-5-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- include/trace/events/rxrpc.h | 1 - net/rxrpc/ar-internal.h | 2 +- net/rxrpc/output.c | 18 ++++++++++++------ net/rxrpc/proc.c | 3 +-- 4 files changed, 14 insertions(+), 10 deletions(-) diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 27c23873c881..62064f63d6eb 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -452,7 +452,6 @@ #define rxrpc_req_ack_traces \ EM(rxrpc_reqack_ack_lost, "ACK-LOST ") \ - EM(rxrpc_reqack_already_on, "ALREADY-ON") \ EM(rxrpc_reqack_more_rtt, "MORE-RTT ") \ EM(rxrpc_reqack_no_srv_last, "NO-SRVLAST") \ EM(rxrpc_reqack_old_rtt, "OLD-RTT ") \ diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index d0fd37bdcfe9..fcdfbc1d5aaf 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -110,7 +110,7 @@ struct rxrpc_net { atomic_t stat_tx_acks[256]; atomic_t stat_rx_acks[256]; - atomic_t stat_why_req_ack[8]; + atomic_t stat_why_req_ack[7]; atomic_t stat_io_loop; }; diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 85112ea31a39..50d5f2a02458 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -330,6 +330,8 @@ static void rxrpc_prepare_data_subpacket(struct rxrpc_call *call, struct rxrpc_t struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base; enum rxrpc_req_ack_trace why; struct rxrpc_connection *conn = call->conn; + bool last; + u8 flags; _enter("%x,{%d}", txb->seq, txb->len); @@ -339,6 +341,10 @@ static void rxrpc_prepare_data_subpacket(struct rxrpc_call *call, struct rxrpc_t txb->seq == 1) whdr->userStatus = RXRPC_USERSTATUS_SERVICE_UPGRADE; + txb->flags &= ~RXRPC_REQUEST_ACK; + flags = txb->flags & RXRPC_TXBUF_WIRE_FLAGS; + last = txb->flags & RXRPC_LAST_PACKET; + /* If our RTT cache needs working on, request an ACK. Also request * ACKs if a DATA packet appears to have been lost. * @@ -346,9 +352,7 @@ static void rxrpc_prepare_data_subpacket(struct rxrpc_call *call, struct rxrpc_t * service call, lest OpenAFS incorrectly send us an ACK with some * soft-ACKs in it and then never follow up with a proper hard ACK. */ - if (txb->flags & RXRPC_REQUEST_ACK) - why = rxrpc_reqack_already_on; - else if ((txb->flags & RXRPC_LAST_PACKET) && rxrpc_sending_to_client(txb)) + if (last && rxrpc_sending_to_client(txb)) why = rxrpc_reqack_no_srv_last; else if (test_and_clear_bit(RXRPC_CALL_EV_ACK_LOST, &call->events)) why = rxrpc_reqack_ack_lost; @@ -367,15 +371,17 @@ static void rxrpc_prepare_data_subpacket(struct rxrpc_call *call, struct rxrpc_t rxrpc_inc_stat(call->rxnet, stat_why_req_ack[why]); trace_rxrpc_req_ack(call->debug_id, txb->seq, why); - if (why != rxrpc_reqack_no_srv_last) + if (why != rxrpc_reqack_no_srv_last) { txb->flags |= RXRPC_REQUEST_ACK; + flags |= RXRPC_REQUEST_ACK; + } dont_set_request_ack: - whdr->flags = txb->flags & RXRPC_TXBUF_WIRE_FLAGS; + whdr->flags = flags; whdr->serial = htonl(txb->serial); whdr->cksum = txb->cksum; - trace_rxrpc_tx_data(call, txb->seq, txb->serial, txb->flags, false); + trace_rxrpc_tx_data(call, txb->seq, txb->serial, flags, false); } /* diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index 263a2251e3d2..3b7e34dd4385 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -519,9 +519,8 @@ int rxrpc_stats_show(struct seq_file *seq, void *v) atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_DELAY]), atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_IDLE])); seq_printf(seq, - "Why-Req-A: acklost=%u already=%u mrtt=%u ortt=%u\n", + "Why-Req-A: acklost=%u mrtt=%u ortt=%u\n", atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_ack_lost]), - atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_already_on]), atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_more_rtt]), atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_old_rtt])); seq_printf(seq, -- 2.51.0 From cbe0d89095c31afcede96e4ce9cd58c4bed62d63 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:46:33 +0000 Subject: [PATCH 09/16] rxrpc: Don't set the MORE-PACKETS rxrpc wire header flag The MORE-PACKETS rxrpc header flag hasn't actually been looked at by anything since 1988 and not all implementations generate it. Change rxrpc so that it doesn't set MORE-PACKETS at all rather than setting it inconsistently. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20241204074710.990092-6-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- net/rxrpc/sendmsg.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index b04afb5df241..546abb463c3f 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -385,9 +385,6 @@ reload: (msg_data_left(msg) == 0 && !more)) { if (msg_data_left(msg) == 0 && !more) txb->flags |= RXRPC_LAST_PACKET; - else if (call->tx_top - call->acks_hard_ack < - call->tx_winsize) - txb->flags |= RXRPC_MORE_PACKETS; ret = call->security->secure_packet(call, txb); if (ret < 0) -- 2.51.0 From ff992adbc470c86d2dcb66f5ed837fbb3c1a561e Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:46:34 +0000 Subject: [PATCH 10/16] rxrpc: Show stats counter for received reason-0 ACKs In /proc/net/rxrpc/stats, show the stats counter for received ACKs that have the reason code set to 0 as some implementations do this. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20241204074710.990092-7-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- net/rxrpc/proc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index 3b7e34dd4385..cdf32f0d8e0e 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -508,7 +508,7 @@ int rxrpc_stats_show(struct seq_file *seq, void *v) atomic_read(&rxnet->stat_tx_acks[RXRPC_ACK_DELAY]), atomic_read(&rxnet->stat_tx_acks[RXRPC_ACK_IDLE])); seq_printf(seq, - "Ack-Rx : req=%u dup=%u oos=%u exw=%u nos=%u png=%u prs=%u dly=%u idl=%u\n", + "Ack-Rx : req=%u dup=%u oos=%u exw=%u nos=%u png=%u prs=%u dly=%u idl=%u z=%u\n", atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_REQUESTED]), atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_DUPLICATE]), atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_OUT_OF_SEQUENCE]), @@ -517,7 +517,8 @@ int rxrpc_stats_show(struct seq_file *seq, void *v) atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_PING]), atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_PING_RESPONSE]), atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_DELAY]), - atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_IDLE])); + atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_IDLE]), + atomic_read(&rxnet->stat_rx_acks[0])); seq_printf(seq, "Why-Req-A: acklost=%u mrtt=%u ortt=%u\n", atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_ack_lost]), -- 2.51.0 From 8b5823ea437624b53ecf084b6dd582760f110394 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:46:35 +0000 Subject: [PATCH 11/16] rxrpc: Request an ACK on impending Tx stall Set the REQUEST-ACK flag on the DATA packet we're about to send if we're about to stall transmission because the app layer isn't keeping up supplying us with data to transmit. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20241204074710.990092-8-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- include/trace/events/rxrpc.h | 1 + net/rxrpc/ar-internal.h | 2 +- net/rxrpc/output.c | 7 ++++++- net/rxrpc/proc.c | 5 +++-- 4 files changed, 11 insertions(+), 4 deletions(-) diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 62064f63d6eb..d86b5f07d292 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -452,6 +452,7 @@ #define rxrpc_req_ack_traces \ EM(rxrpc_reqack_ack_lost, "ACK-LOST ") \ + EM(rxrpc_reqack_app_stall, "APP-STALL ") \ EM(rxrpc_reqack_more_rtt, "MORE-RTT ") \ EM(rxrpc_reqack_no_srv_last, "NO-SRVLAST") \ EM(rxrpc_reqack_old_rtt, "OLD-RTT ") \ diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index fcdfbc1d5aaf..d0fd37bdcfe9 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -110,7 +110,7 @@ struct rxrpc_net { atomic_t stat_tx_acks[256]; atomic_t stat_rx_acks[256]; - atomic_t stat_why_req_ack[7]; + atomic_t stat_why_req_ack[8]; atomic_t stat_io_loop; }; diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 50d5f2a02458..b93a5d50be3e 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -330,7 +330,7 @@ static void rxrpc_prepare_data_subpacket(struct rxrpc_call *call, struct rxrpc_t struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base; enum rxrpc_req_ack_trace why; struct rxrpc_connection *conn = call->conn; - bool last; + bool last, more; u8 flags; _enter("%x,{%d}", txb->seq, txb->len); @@ -345,6 +345,9 @@ static void rxrpc_prepare_data_subpacket(struct rxrpc_call *call, struct rxrpc_t flags = txb->flags & RXRPC_TXBUF_WIRE_FLAGS; last = txb->flags & RXRPC_LAST_PACKET; + more = (!list_is_last(&txb->call_link, &call->tx_buffer) || + !list_empty(&call->tx_sendmsg)); + /* If our RTT cache needs working on, request an ACK. Also request * ACKs if a DATA packet appears to have been lost. * @@ -366,6 +369,8 @@ static void rxrpc_prepare_data_subpacket(struct rxrpc_call *call, struct rxrpc_t why = rxrpc_reqack_more_rtt; else if (ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), ktime_get_real())) why = rxrpc_reqack_old_rtt; + else if (!last && !more) + why = rxrpc_reqack_app_stall; else goto dont_set_request_ack; diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index cdf32f0d8e0e..ce4d48bdfbe9 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -520,10 +520,11 @@ int rxrpc_stats_show(struct seq_file *seq, void *v) atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_IDLE]), atomic_read(&rxnet->stat_rx_acks[0])); seq_printf(seq, - "Why-Req-A: acklost=%u mrtt=%u ortt=%u\n", + "Why-Req-A: acklost=%u mrtt=%u ortt=%u stall=%u\n", atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_ack_lost]), atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_more_rtt]), - atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_old_rtt])); + atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_old_rtt]), + atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_app_stall])); seq_printf(seq, "Why-Req-A: nolast=%u retx=%u slows=%u smtxw=%u\n", atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_no_srv_last]), -- 2.51.0 From 420f8af502877a34dd371a7c8b6b943594487ebb Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:46:36 +0000 Subject: [PATCH 12/16] rxrpc: Use a large kvec[] in rxrpc_local rather than every rxrpc_txbuf Use a single large kvec[] in the rxrpc_local struct rather than one in every rxrpc_txbuf struct to build large packets to save on memory. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20241204074710.990092-9-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- net/rxrpc/ar-internal.h | 6 ++++++ net/rxrpc/output.c | 45 ++++++++++++++++++++++++++++++----------- 2 files changed, 39 insertions(+), 12 deletions(-) diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index d0fd37bdcfe9..ab8e565cb20b 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -320,6 +320,12 @@ struct rxrpc_local { struct list_head new_client_calls; /* Newly created client calls need connection */ spinlock_t client_call_lock; /* Lock for ->new_client_calls */ struct sockaddr_rxrpc srx; /* local address */ + /* Provide a kvec table sufficiently large to manage either a DATA + * packet with a maximum set of jumbo subpackets or a PING ACK padded + * out to 64K with zeropages for PMTUD. + */ + struct kvec kvec[RXRPC_MAX_NR_JUMBO > 3 + 16 ? + RXRPC_MAX_NR_JUMBO : 3 + 16]; }; /* diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index b93a5d50be3e..f8bb5250e849 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -175,9 +175,11 @@ no_slot: /* * Transmit an ACK packet. */ -static void rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) +static void rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb, + int nr_kv) { - struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base; + struct kvec *kv = call->local->kvec; + struct rxrpc_wire_header *whdr = kv[0].iov_base; struct rxrpc_connection *conn; struct rxrpc_ackpacket *ack = (struct rxrpc_ackpacket *)(whdr + 1); struct msghdr msg; @@ -206,8 +208,9 @@ static void rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t rxrpc_inc_stat(call->rxnet, stat_tx_ack_send); - iov_iter_kvec(&msg.msg_iter, WRITE, txb->kvec, txb->nr_kvec, txb->len); + iov_iter_kvec(&msg.msg_iter, WRITE, kv, nr_kv, txb->len); rxrpc_local_dont_fragment(conn->local, false); + ret = do_udp_sendmsg(conn->local->socket, &msg, txb->len); call->peer->last_tx_at = ktime_get_seconds(); if (ret < 0) { @@ -233,6 +236,8 @@ void rxrpc_send_ACK(struct rxrpc_call *call, u8 ack_reason, rxrpc_serial_t serial, enum rxrpc_propose_ack_trace why) { struct rxrpc_txbuf *txb; + struct kvec *kv = call->local->kvec; + int nr_kv; if (test_bit(RXRPC_CALL_DISCONNECTED, &call->flags)) return; @@ -248,12 +253,19 @@ void rxrpc_send_ACK(struct rxrpc_call *call, u8 ack_reason, txb->ack_why = why; rxrpc_fill_out_ack(call, txb, ack_reason, serial); + + nr_kv = txb->nr_kvec; + kv[0] = txb->kvec[0]; + kv[1] = txb->kvec[1]; + kv[2] = txb->kvec[2]; + // TODO: Extend a path MTU probe ACK + call->ackr_nr_unacked = 0; atomic_set(&call->ackr_nr_consumed, 0); clear_bit(RXRPC_CALL_RX_IS_IDLE, &call->flags); trace_rxrpc_send_ack(call, why, ack_reason, serial); - rxrpc_send_ack_packet(call, txb); + rxrpc_send_ack_packet(call, txb, nr_kv); rxrpc_put_txbuf(txb, rxrpc_txbuf_put_ack_tx); } @@ -324,12 +336,15 @@ int rxrpc_send_abort_packet(struct rxrpc_call *call) /* * Prepare a (sub)packet for transmission. */ -static void rxrpc_prepare_data_subpacket(struct rxrpc_call *call, struct rxrpc_txbuf *txb, - rxrpc_serial_t serial) +static size_t rxrpc_prepare_data_subpacket(struct rxrpc_call *call, struct rxrpc_txbuf *txb, + rxrpc_serial_t serial, + int subpkt) { struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base; enum rxrpc_req_ack_trace why; struct rxrpc_connection *conn = call->conn; + struct kvec *kv = &call->local->kvec[subpkt]; + size_t len = txb->len; bool last, more; u8 flags; @@ -385,8 +400,13 @@ dont_set_request_ack: whdr->flags = flags; whdr->serial = htonl(txb->serial); whdr->cksum = txb->cksum; + whdr->serviceId = htons(conn->service_id); + kv->iov_base = whdr; + // TODO: Convert into a jumbo header for tail subpackets trace_rxrpc_tx_data(call, txb->seq, txb->serial, flags, false); + kv->iov_len = len; + return len; } /* @@ -395,13 +415,15 @@ dont_set_request_ack: static size_t rxrpc_prepare_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) { rxrpc_serial_t serial; + size_t len = 0; /* Each transmission of a Tx packet needs a new serial number */ serial = rxrpc_get_next_serial(call->conn); - rxrpc_prepare_data_subpacket(call, txb, serial); + len += rxrpc_prepare_data_subpacket(call, txb, serial, 0); + // TODO: Loop around adding tail subpackets - return txb->len; + return len; } /* @@ -442,7 +464,6 @@ static void rxrpc_tstamp_data_packets(struct rxrpc_call *call, struct rxrpc_txbu */ static int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) { - struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base; struct rxrpc_connection *conn = call->conn; enum rxrpc_tx_point frag; struct msghdr msg; @@ -463,7 +484,7 @@ static int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t } } - iov_iter_kvec(&msg.msg_iter, WRITE, txb->kvec, txb->nr_kvec, len); + iov_iter_kvec(&msg.msg_iter, WRITE, call->local->kvec, 1, len); msg.msg_name = &call->peer->srx.transport; msg.msg_namelen = call->peer->srx.transport_len; @@ -480,7 +501,7 @@ static int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t /* send the packet with the don't fragment bit set if we currently * think it's small enough */ - if (txb->len >= call->peer->maxdata) { + if (len >= sizeof(struct rxrpc_wire_header) + call->peer->maxdata) { rxrpc_local_dont_fragment(conn->local, false); frag = rxrpc_tx_point_call_data_frag; } else { @@ -503,7 +524,7 @@ retry: rxrpc_inc_stat(call->rxnet, stat_tx_data_send_fail); trace_rxrpc_tx_fail(call->debug_id, txb->serial, ret, frag); } else { - trace_rxrpc_tx_packet(call->debug_id, whdr, frag); + trace_rxrpc_tx_packet(call->debug_id, call->local->kvec[0].iov_base, frag); } rxrpc_tx_backoff(call, ret); -- 2.51.0 From eeaedc5449d9fccf2b56e844a018df9d3720d59e Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:46:37 +0000 Subject: [PATCH 13/16] rxrpc: Implement path-MTU probing using padded PING ACKs (RFC8899) Implement path-MTU probing (along the lines of RFC8899) by padding some of the PING ACKs we send. PING ACKs get their own individual responses quite apart from the acking of data (though, as ACKs, they fulfil that role also). The probing concentrates on packet sizes that correspond how many subpackets can be stuffed inside a jumbo packet as jumbo DATA packets are just aggregations of individual DATA packets and can be split easily for retransmission purposes. If we want to perform probing, we advertise this by setting the maximum number of jumbo subpackets to 0 in the ack trailer when we send an ACK and see if the peer is also advertising the service. This is interpreted by non-supporting Rx stacks as an indication that jumbo packets aren't supported. The MTU sizes advertised in the ACK trailer AF_RXRPC transmits are pegged at a maximum of 1444 unless pmtud is supported by both sides. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20241204074710.990092-10-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- include/trace/events/rxrpc.h | 124 +++++++++++++++++++++++++++++++++++ net/rxrpc/ar-internal.h | 25 +++++-- net/rxrpc/call_event.c | 5 ++ net/rxrpc/conn_event.c | 17 +++-- net/rxrpc/conn_object.c | 6 ++ net/rxrpc/input.c | 26 +++++--- net/rxrpc/io_thread.c | 6 ++ net/rxrpc/misc.c | 4 +- net/rxrpc/output.c | 67 +++++++++++++++---- net/rxrpc/peer_event.c | 104 +++++++++++++++++++++++++++-- net/rxrpc/peer_object.c | 24 +++++-- net/rxrpc/proc.c | 9 +-- net/rxrpc/protocol.h | 13 ++-- net/rxrpc/sysctl.c | 6 +- net/rxrpc/txbuf.c | 3 +- 15 files changed, 382 insertions(+), 57 deletions(-) diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index d86b5f07d292..9dcadad88e76 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -364,6 +364,7 @@ EM(rxrpc_propose_ack_ping_for_lost_ack, "LostAck") \ EM(rxrpc_propose_ack_ping_for_lost_reply, "LostRpl") \ EM(rxrpc_propose_ack_ping_for_0_retrans, "0-Retrn") \ + EM(rxrpc_propose_ack_ping_for_mtu_probe, "MTUProb") \ EM(rxrpc_propose_ack_ping_for_old_rtt, "OldRtt ") \ EM(rxrpc_propose_ack_ping_for_params, "Params ") \ EM(rxrpc_propose_ack_ping_for_rtt, "Rtt ") \ @@ -478,6 +479,11 @@ EM(rxrpc_txbuf_see_send_more, "SEE SEND+ ") \ E_(rxrpc_txbuf_see_unacked, "SEE UNACKED") +#define rxrpc_pmtud_reduce_traces \ + EM(rxrpc_pmtud_reduce_ack, "Ack ") \ + EM(rxrpc_pmtud_reduce_icmp, "Icmp ") \ + E_(rxrpc_pmtud_reduce_route, "Route") + /* * Generate enums for tracing information. */ @@ -498,6 +504,7 @@ enum rxrpc_congest_change { rxrpc_congest_changes } __mode(byte); enum rxrpc_conn_trace { rxrpc_conn_traces } __mode(byte); enum rxrpc_local_trace { rxrpc_local_traces } __mode(byte); enum rxrpc_peer_trace { rxrpc_peer_traces } __mode(byte); +enum rxrpc_pmtud_reduce_trace { rxrpc_pmtud_reduce_traces } __mode(byte); enum rxrpc_propose_ack_outcome { rxrpc_propose_ack_outcomes } __mode(byte); enum rxrpc_propose_ack_trace { rxrpc_propose_ack_traces } __mode(byte); enum rxrpc_receive_trace { rxrpc_receive_traces } __mode(byte); @@ -534,6 +541,7 @@ rxrpc_congest_changes; rxrpc_congest_modes; rxrpc_conn_traces; rxrpc_local_traces; +rxrpc_pmtud_reduce_traces; rxrpc_propose_ack_traces; rxrpc_receive_traces; rxrpc_recvmsg_traces; @@ -2040,6 +2048,122 @@ TRACE_EVENT(rxrpc_sack, __entry->sack) ); +TRACE_EVENT(rxrpc_pmtud_tx, + TP_PROTO(struct rxrpc_call *call), + + TP_ARGS(call), + + TP_STRUCT__entry( + __field(unsigned int, peer_debug_id) + __field(unsigned int, call_debug_id) + __field(rxrpc_serial_t, ping_serial) + __field(unsigned short, pmtud_trial) + __field(unsigned short, pmtud_good) + __field(unsigned short, pmtud_bad) + ), + + TP_fast_assign( + __entry->peer_debug_id = call->peer->debug_id; + __entry->call_debug_id = call->debug_id; + __entry->ping_serial = call->conn->pmtud_probe; + __entry->pmtud_trial = call->peer->pmtud_trial; + __entry->pmtud_good = call->peer->pmtud_good; + __entry->pmtud_bad = call->peer->pmtud_bad; + ), + + TP_printk("P=%08x c=%08x pr=%08x %u-%u-%u", + __entry->peer_debug_id, + __entry->call_debug_id, + __entry->ping_serial, + __entry->pmtud_good, + __entry->pmtud_trial, + __entry->pmtud_bad) + ); + +TRACE_EVENT(rxrpc_pmtud_rx, + TP_PROTO(struct rxrpc_connection *conn, rxrpc_serial_t resp_serial), + + TP_ARGS(conn, resp_serial), + + TP_STRUCT__entry( + __field(unsigned int, peer_debug_id) + __field(unsigned int, call_debug_id) + __field(rxrpc_serial_t, ping_serial) + __field(rxrpc_serial_t, resp_serial) + __field(unsigned short, max_data) + __field(u8, jumbo_max) + ), + + TP_fast_assign( + __entry->peer_debug_id = conn->peer->debug_id; + __entry->call_debug_id = conn->pmtud_call; + __entry->ping_serial = conn->pmtud_probe; + __entry->resp_serial = resp_serial; + __entry->max_data = conn->peer->max_data; + __entry->jumbo_max = conn->peer->pmtud_jumbo; + ), + + TP_printk("P=%08x c=%08x pr=%08x rr=%08x max=%u jm=%u", + __entry->peer_debug_id, + __entry->call_debug_id, + __entry->ping_serial, + __entry->resp_serial, + __entry->max_data, + __entry->jumbo_max) + ); + +TRACE_EVENT(rxrpc_pmtud_lost, + TP_PROTO(struct rxrpc_connection *conn, rxrpc_serial_t resp_serial), + + TP_ARGS(conn, resp_serial), + + TP_STRUCT__entry( + __field(unsigned int, peer_debug_id) + __field(unsigned int, call_debug_id) + __field(rxrpc_serial_t, ping_serial) + __field(rxrpc_serial_t, resp_serial) + ), + + TP_fast_assign( + __entry->peer_debug_id = conn->peer->debug_id; + __entry->call_debug_id = conn->pmtud_call; + __entry->ping_serial = conn->pmtud_probe; + __entry->resp_serial = resp_serial; + ), + + TP_printk("P=%08x c=%08x pr=%08x rr=%08x", + __entry->peer_debug_id, + __entry->call_debug_id, + __entry->ping_serial, + __entry->resp_serial) + ); + +TRACE_EVENT(rxrpc_pmtud_reduce, + TP_PROTO(struct rxrpc_peer *peer, rxrpc_serial_t serial, + unsigned int max_data, enum rxrpc_pmtud_reduce_trace reason), + + TP_ARGS(peer, serial, max_data, reason), + + TP_STRUCT__entry( + __field(unsigned int, peer_debug_id) + __field(rxrpc_serial_t, serial) + __field(unsigned int, max_data) + __field(enum rxrpc_pmtud_reduce_trace, reason) + ), + + TP_fast_assign( + __entry->peer_debug_id = peer->debug_id; + __entry->serial = serial; + __entry->max_data = max_data; + __entry->reason = reason; + ), + + TP_printk("P=%08x %s r=%08x m=%u", + __entry->peer_debug_id, + __print_symbolic(__entry->reason, rxrpc_pmtud_reduce_traces), + __entry->serial, __entry->max_data) + ); + #undef EM #undef E_ diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index ab8e565cb20b..69e6f4b20bad 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -344,13 +344,25 @@ struct rxrpc_peer { time64_t last_tx_at; /* Last time packet sent here */ seqlock_t service_conn_lock; spinlock_t lock; /* access lock */ - unsigned int if_mtu; /* interface MTU for this peer */ - unsigned int mtu; /* network MTU for this peer */ - unsigned int maxdata; /* data size (MTU - hdrsize) */ - unsigned short hdrsize; /* header size (IP + UDP + RxRPC) */ int debug_id; /* debug ID for printks */ struct sockaddr_rxrpc srx; /* remote address */ + /* Path MTU discovery [RFC8899] */ + unsigned int pmtud_trial; /* Current MTU probe size */ + unsigned int pmtud_good; /* Largest working MTU probe we've tried */ + unsigned int pmtud_bad; /* Smallest non-working MTU probe we've tried */ + bool pmtud_lost; /* T if MTU probe was lost */ + bool pmtud_probing; /* T if we have an active probe outstanding */ + bool pmtud_pending; /* T if a call to this peer should send a probe */ + u8 pmtud_jumbo; /* Max jumbo packets for the MTU */ + bool ackr_adv_pmtud; /* T if the peer advertises path-MTU */ + unsigned int ackr_max_data; /* Maximum data advertised by peer */ + seqcount_t mtu_lock; /* Lockless MTU access management */ + unsigned int if_mtu; /* Local interface MTU (- hdrsize) for this peer */ + unsigned int max_data; /* Maximum packet data capacity for this peer */ + unsigned short hdrsize; /* header size (IP + UDP + RxRPC) */ + unsigned short tx_seg_max; /* Maximum number of transmissable segments */ + /* calculated RTT cache */ #define RXRPC_RTT_CACHE_SIZE 32 spinlock_t rtt_input_lock; /* RTT lock for input routine */ @@ -531,6 +543,8 @@ struct rxrpc_connection { int debug_id; /* debug ID for printks */ rxrpc_serial_t tx_serial; /* Outgoing packet serial number counter */ unsigned int hi_serial; /* highest serial number received */ + rxrpc_serial_t pmtud_probe; /* Serial of MTU probe (or 0) */ + unsigned int pmtud_call; /* ID of call used for probe */ u32 service_id; /* Service ID, possibly upgraded */ u32 security_level; /* Security level selected */ u8 security_ix; /* security type */ @@ -1155,6 +1169,7 @@ static inline struct rxrpc_net *rxrpc_net(struct net *net) */ void rxrpc_send_ACK(struct rxrpc_call *call, u8 ack_reason, rxrpc_serial_t serial, enum rxrpc_propose_ack_trace why); +void rxrpc_send_probe_for_pmtud(struct rxrpc_call *call); int rxrpc_send_abort_packet(struct rxrpc_call *); void rxrpc_send_conn_abort(struct rxrpc_connection *conn); void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb); @@ -1166,6 +1181,8 @@ void rxrpc_transmit_one(struct rxrpc_call *call, struct rxrpc_txbuf *txb); */ void rxrpc_input_error(struct rxrpc_local *, struct sk_buff *); void rxrpc_peer_keepalive_worker(struct work_struct *); +void rxrpc_input_probe_for_pmtud(struct rxrpc_connection *conn, rxrpc_serial_t acked_serial, + bool sendmsg_fail); /* * peer_object.c diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index c4754cc9b8d4..1d889b6f0366 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -483,6 +483,11 @@ out: rxrpc_disconnect_call(call); if (call->security) call->security->free_call_crypto(call); + } else { + if (skb && + call->peer->ackr_adv_pmtud && + call->peer->pmtud_pending) + rxrpc_send_probe_for_pmtud(call); } if (call->acks_hard_ack != call->tx_bottom) rxrpc_shrink_call_tx_buffer(call); diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 2a1396cd892f..f6c02cc44d98 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -92,7 +92,7 @@ void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, struct rxrpc_acktrailer trailer; size_t len; int ret, ioc; - u32 serial, mtu, call_id, padding; + u32 serial, max_mtu, if_mtu, call_id, padding; _enter("%d", conn->debug_id); @@ -150,8 +150,13 @@ void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, break; case RXRPC_PACKET_TYPE_ACK: - mtu = conn->peer->if_mtu; - mtu -= conn->peer->hdrsize; + if_mtu = conn->peer->if_mtu - conn->peer->hdrsize; + if (conn->peer->ackr_adv_pmtud) { + max_mtu = umax(conn->peer->max_data, rxrpc_rx_mtu); + } else { + if_mtu = umin(1444, if_mtu); + max_mtu = if_mtu; + } pkt.ack.bufferSpace = 0; pkt.ack.maxSkew = htons(skb ? skb->priority : 0); pkt.ack.firstPacket = htonl(chan->last_seq + 1); @@ -159,10 +164,10 @@ void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, pkt.ack.serial = htonl(skb ? sp->hdr.serial : 0); pkt.ack.reason = skb ? RXRPC_ACK_DUPLICATE : RXRPC_ACK_IDLE; pkt.ack.nAcks = 0; - trailer.maxMTU = htonl(rxrpc_rx_mtu); - trailer.ifMTU = htonl(mtu); + trailer.maxMTU = htonl(max_mtu); + trailer.ifMTU = htonl(if_mtu); trailer.rwind = htonl(rxrpc_rx_window_size); - trailer.jumbo_max = htonl(rxrpc_rx_jumbo_max); + trailer.jumbo_max = 0; pkt.whdr.flags |= RXRPC_SLOW_START_OK; padding = 0; iov[0].iov_len += sizeof(pkt.ack); diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 694c4df7a1a3..b0627398311b 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -321,6 +321,12 @@ static void rxrpc_clean_up_connection(struct work_struct *work) list_del_init(&conn->proc_link); write_unlock(&rxnet->conn_lock); + if (conn->pmtud_probe) { + trace_rxrpc_pmtud_lost(conn, 0); + conn->peer->pmtud_probing = false; + conn->peer->pmtud_pending = true; + } + rxrpc_purge_queue(&conn->rx_queue); rxrpc_kill_client_conn(conn); diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 49e35be7dc13..fd08d813ef29 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -692,8 +692,8 @@ static void rxrpc_input_ack_trailer(struct rxrpc_call *call, struct sk_buff *skb struct rxrpc_acktrailer *trailer) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - struct rxrpc_peer *peer; - unsigned int mtu; + struct rxrpc_peer *peer = call->peer; + unsigned int max_data; bool wake = false; u32 rwind = ntohl(trailer->rwind); @@ -706,14 +706,22 @@ static void rxrpc_input_ack_trailer(struct rxrpc_call *call, struct sk_buff *skb call->tx_winsize = rwind; } - mtu = umin(ntohl(trailer->maxMTU), ntohl(trailer->ifMTU)); + if (trailer->jumbo_max == 0) { + /* The peer says it supports pmtu discovery */ + peer->ackr_adv_pmtud = true; + } else { + peer->ackr_adv_pmtud = false; + } + + max_data = ntohl(trailer->maxMTU); + peer->ackr_max_data = max_data; - peer = call->peer; - if (mtu < peer->maxdata) { - spin_lock(&peer->lock); - peer->maxdata = mtu; - peer->mtu = mtu + peer->hdrsize; - spin_unlock(&peer->lock); + if (max_data < peer->max_data) { + trace_rxrpc_pmtud_reduce(peer, sp->hdr.serial, max_data, + rxrpc_pmtud_reduce_ack); + write_seqcount_begin(&peer->mtu_lock); + peer->max_data = max_data; + write_seqcount_end(&peer->mtu_lock); } if (wake) diff --git a/net/rxrpc/io_thread.c b/net/rxrpc/io_thread.c index 7af5adf53b25..bd6d4f5e97b4 100644 --- a/net/rxrpc/io_thread.c +++ b/net/rxrpc/io_thread.c @@ -364,6 +364,12 @@ static int rxrpc_input_packet_on_conn(struct rxrpc_connection *conn, if (sp->hdr.callNumber == 0) return rxrpc_input_conn_packet(conn, skb); + /* Deal with path MTU discovery probing. */ + if (sp->hdr.type == RXRPC_PACKET_TYPE_ACK && + conn->pmtud_probe && + after_eq(sp->ack.acked_serial, conn->pmtud_probe)) + rxrpc_input_probe_for_pmtud(conn, sp->ack.acked_serial, false); + /* Call-bound packets are routed by connection channel. */ channel = sp->hdr.cid & RXRPC_CHANNELMASK; chan = &conn->channels[channel]; diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index 657cf35089a6..8fcc8139d771 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -46,13 +46,13 @@ unsigned int rxrpc_rx_window_size = 255; * Maximum Rx MTU size. This indicates to the sender the size of jumbo packet * made by gluing normal packets together that we're willing to handle. */ -unsigned int rxrpc_rx_mtu = 5692; +unsigned int rxrpc_rx_mtu = RXRPC_JUMBO(46); /* * The maximum number of fragments in a received jumbo packet that we tell the * sender that we're willing to handle. */ -unsigned int rxrpc_rx_jumbo_max = 4; +unsigned int rxrpc_rx_jumbo_max = 46; #ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY /* diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index f8bb5250e849..a91be871ad96 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -82,10 +82,9 @@ static void rxrpc_fill_out_ack(struct rxrpc_call *call, struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base; struct rxrpc_acktrailer *trailer = txb->kvec[2].iov_base + 3; struct rxrpc_ackpacket *ack = (struct rxrpc_ackpacket *)(whdr + 1); - unsigned int qsize, sack, wrap, to; + unsigned int qsize, sack, wrap, to, max_mtu, if_mtu; rxrpc_seq_t window, wtop; int rsize; - u32 mtu, jmax; u8 *filler = txb->kvec[2].iov_base; u8 *sackp = txb->kvec[1].iov_base; @@ -132,16 +131,22 @@ static void rxrpc_fill_out_ack(struct rxrpc_call *call, ack->reason = RXRPC_ACK_IDLE; } - mtu = call->peer->if_mtu; - mtu -= call->peer->hdrsize; - jmax = rxrpc_rx_jumbo_max; qsize = (window - 1) - call->rx_consumed; rsize = max_t(int, call->rx_winsize - qsize, 0); txb->ack_rwind = rsize; - trailer->maxMTU = htonl(rxrpc_rx_mtu); - trailer->ifMTU = htonl(mtu); + + if_mtu = call->peer->if_mtu - call->peer->hdrsize; + if (call->peer->ackr_adv_pmtud) { + max_mtu = umax(call->peer->max_data, rxrpc_rx_mtu); + } else { + if_mtu = umin(if_mtu, 1444); + max_mtu = if_mtu; + } + + trailer->maxMTU = htonl(max_mtu); + trailer->ifMTU = htonl(if_mtu); trailer->rwind = htonl(rsize); - trailer->jumbo_max = htonl(jmax); + trailer->jumbo_max = 0; /* Advertise pmtu discovery */ } /* @@ -176,7 +181,7 @@ no_slot: * Transmit an ACK packet. */ static void rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb, - int nr_kv) + int nr_kv, enum rxrpc_propose_ack_trace why) { struct kvec *kv = call->local->kvec; struct rxrpc_wire_header *whdr = kv[0].iov_base; @@ -209,13 +214,16 @@ static void rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t rxrpc_inc_stat(call->rxnet, stat_tx_ack_send); iov_iter_kvec(&msg.msg_iter, WRITE, kv, nr_kv, txb->len); - rxrpc_local_dont_fragment(conn->local, false); + rxrpc_local_dont_fragment(conn->local, why == rxrpc_propose_ack_ping_for_mtu_probe); ret = do_udp_sendmsg(conn->local->socket, &msg, txb->len); call->peer->last_tx_at = ktime_get_seconds(); if (ret < 0) { trace_rxrpc_tx_fail(call->debug_id, txb->serial, ret, rxrpc_tx_point_call_ack); + if (why == rxrpc_propose_ack_ping_for_mtu_probe && + ret == -EMSGSIZE) + rxrpc_input_probe_for_pmtud(conn, txb->serial, true); } else { trace_rxrpc_tx_packet(call->debug_id, whdr, rxrpc_tx_point_call_ack); @@ -225,6 +233,13 @@ static void rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t if (txb->flags & RXRPC_REQUEST_ACK) call->peer->rtt_last_req = now; rxrpc_set_keepalive(call, now); + if (why == rxrpc_propose_ack_ping_for_mtu_probe) { + call->peer->pmtud_pending = false; + call->peer->pmtud_probing = true; + call->conn->pmtud_probe = txb->serial; + call->conn->pmtud_call = call->debug_id; + trace_rxrpc_pmtud_tx(call); + } } rxrpc_tx_backoff(call, ret); } @@ -254,21 +269,45 @@ void rxrpc_send_ACK(struct rxrpc_call *call, u8 ack_reason, rxrpc_fill_out_ack(call, txb, ack_reason, serial); + /* Extend a path MTU probe ACK. */ nr_kv = txb->nr_kvec; kv[0] = txb->kvec[0]; kv[1] = txb->kvec[1]; kv[2] = txb->kvec[2]; - // TODO: Extend a path MTU probe ACK + if (why == rxrpc_propose_ack_ping_for_mtu_probe) { + size_t probe_mtu = call->peer->pmtud_trial + sizeof(struct rxrpc_wire_header); + + if (txb->len > probe_mtu) + goto skip; + while (txb->len < probe_mtu) { + size_t part = umin(probe_mtu - txb->len, PAGE_SIZE); + + kv[nr_kv].iov_base = page_address(ZERO_PAGE(0)); + kv[nr_kv].iov_len = part; + txb->len += part; + nr_kv++; + } + } call->ackr_nr_unacked = 0; atomic_set(&call->ackr_nr_consumed, 0); clear_bit(RXRPC_CALL_RX_IS_IDLE, &call->flags); trace_rxrpc_send_ack(call, why, ack_reason, serial); - rxrpc_send_ack_packet(call, txb, nr_kv); + rxrpc_send_ack_packet(call, txb, nr_kv, why); +skip: rxrpc_put_txbuf(txb, rxrpc_txbuf_put_ack_tx); } +/* + * Send an ACK probe for path MTU discovery. + */ +void rxrpc_send_probe_for_pmtud(struct rxrpc_call *call) +{ + rxrpc_send_ACK(call, RXRPC_ACK_PING, 0, + rxrpc_propose_ack_ping_for_mtu_probe); +} + /* * Send an ABORT call packet. */ @@ -501,7 +540,7 @@ static int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t /* send the packet with the don't fragment bit set if we currently * think it's small enough */ - if (len >= sizeof(struct rxrpc_wire_header) + call->peer->maxdata) { + if (len >= sizeof(struct rxrpc_wire_header) + call->peer->max_data) { rxrpc_local_dont_fragment(conn->local, false); frag = rxrpc_tx_point_call_data_frag; } else { @@ -548,7 +587,7 @@ done: RX_USER_ABORT, ret); } - _leave(" = %d [%u]", ret, call->peer->maxdata); + _leave(" = %d [%u]", ret, call->peer->max_data); return ret; } diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index 552ba84a255c..8fc9464a960c 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -102,6 +102,8 @@ static struct rxrpc_peer *rxrpc_lookup_peer_local_rcu(struct rxrpc_local *local, */ static void rxrpc_adjust_mtu(struct rxrpc_peer *peer, unsigned int mtu) { + unsigned int max_data; + /* wind down the local interface MTU */ if (mtu > 0 && peer->if_mtu == 65535 && mtu < peer->if_mtu) peer->if_mtu = mtu; @@ -120,11 +122,17 @@ static void rxrpc_adjust_mtu(struct rxrpc_peer *peer, unsigned int mtu) } } - if (mtu < peer->mtu) { - spin_lock(&peer->lock); - peer->mtu = mtu; - peer->maxdata = peer->mtu - peer->hdrsize; - spin_unlock(&peer->lock); + max_data = max_t(int, mtu - peer->hdrsize, 500); + if (max_data < peer->max_data) { + if (peer->pmtud_good > max_data) + peer->pmtud_good = max_data; + if (peer->pmtud_bad > max_data + 1) + peer->pmtud_bad = max_data + 1; + + trace_rxrpc_pmtud_reduce(peer, 0, max_data, rxrpc_pmtud_reduce_icmp); + write_seqcount_begin(&peer->mtu_lock); + peer->max_data = max_data; + write_seqcount_end(&peer->mtu_lock); } } @@ -347,3 +355,89 @@ void rxrpc_peer_keepalive_worker(struct work_struct *work) _leave(""); } + +/* + * Do path MTU probing. + */ +void rxrpc_input_probe_for_pmtud(struct rxrpc_connection *conn, rxrpc_serial_t acked_serial, + bool sendmsg_fail) +{ + struct rxrpc_peer *peer = conn->peer; + unsigned int max_data = peer->max_data; + int good, trial, bad, jumbo; + + good = peer->pmtud_good; + trial = peer->pmtud_trial; + bad = peer->pmtud_bad; + if (good >= bad - 1) { + conn->pmtud_probe = 0; + peer->pmtud_lost = false; + return; + } + + if (!peer->pmtud_probing) + goto send_probe; + + if (sendmsg_fail || after(acked_serial, conn->pmtud_probe)) { + /* Retry a lost probe. */ + if (!peer->pmtud_lost) { + trace_rxrpc_pmtud_lost(conn, acked_serial); + conn->pmtud_probe = 0; + peer->pmtud_lost = true; + goto send_probe; + } + + /* The probed size didn't seem to get through. */ + bad = trial; + peer->pmtud_bad = bad; + if (bad <= max_data) + max_data = bad - 1; + } else { + /* It did get through. */ + good = trial; + peer->pmtud_good = good; + if (good > max_data) + max_data = good; + } + + max_data = umin(max_data, peer->ackr_max_data); + if (max_data != peer->max_data) { + preempt_disable(); + write_seqcount_begin(&peer->mtu_lock); + peer->max_data = max_data; + write_seqcount_end(&peer->mtu_lock); + preempt_enable(); + } + + jumbo = max_data + sizeof(struct rxrpc_jumbo_header); + jumbo /= RXRPC_JUMBO_SUBPKTLEN; + peer->pmtud_jumbo = jumbo; + + trace_rxrpc_pmtud_rx(conn, acked_serial); + conn->pmtud_probe = 0; + peer->pmtud_lost = false; + + if (good < RXRPC_JUMBO(2) && bad > RXRPC_JUMBO(2)) + trial = RXRPC_JUMBO(2); + else if (good < RXRPC_JUMBO(4) && bad > RXRPC_JUMBO(4)) + trial = RXRPC_JUMBO(4); + else if (good < RXRPC_JUMBO(3) && bad > RXRPC_JUMBO(3)) + trial = RXRPC_JUMBO(3); + else if (good < RXRPC_JUMBO(6) && bad > RXRPC_JUMBO(6)) + trial = RXRPC_JUMBO(6); + else if (good < RXRPC_JUMBO(5) && bad > RXRPC_JUMBO(5)) + trial = RXRPC_JUMBO(5); + else if (good < RXRPC_JUMBO(8) && bad > RXRPC_JUMBO(8)) + trial = RXRPC_JUMBO(8); + else if (good < RXRPC_JUMBO(7) && bad > RXRPC_JUMBO(7)) + trial = RXRPC_JUMBO(7); + else + trial = (good + bad) / 2; + peer->pmtud_trial = trial; + + if (good >= bad) + return; + +send_probe: + peer->pmtud_pending = true; +} diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index 49dcda67a0d5..80ef6f06d512 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -162,6 +162,11 @@ static void rxrpc_assess_MTU_size(struct rxrpc_local *local, #endif peer->if_mtu = 1500; + if (peer->max_data < peer->if_mtu - peer->hdrsize) { + trace_rxrpc_pmtud_reduce(peer, 0, peer->if_mtu - peer->hdrsize, + rxrpc_pmtud_reduce_route); + peer->max_data = peer->if_mtu - peer->hdrsize; + } memset(&fl, 0, sizeof(fl)); switch (peer->srx.transport.family) { @@ -199,8 +204,16 @@ static void rxrpc_assess_MTU_size(struct rxrpc_local *local, } peer->if_mtu = dst_mtu(dst); + peer->hdrsize += dst->header_len + dst->trailer_len; + peer->tx_seg_max = dst->dev->gso_max_segs; dst_release(dst); + peer->max_data = umin(RXRPC_JUMBO(1), peer->if_mtu - peer->hdrsize); + peer->pmtud_good = 500; + peer->pmtud_bad = peer->if_mtu - peer->hdrsize + 1; + peer->pmtud_trial = umin(peer->max_data, peer->pmtud_bad - 1); + peer->pmtud_pending = true; + _leave(" [if_mtu %u]", peer->if_mtu); } @@ -223,6 +236,7 @@ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp, seqlock_init(&peer->service_conn_lock); spin_lock_init(&peer->lock); spin_lock_init(&peer->rtt_input_lock); + seqcount_init(&peer->mtu_lock); peer->debug_id = atomic_inc_return(&rxrpc_debug_id); rxrpc_peer_init_rtt(peer); @@ -242,9 +256,7 @@ static void rxrpc_init_peer(struct rxrpc_local *local, struct rxrpc_peer *peer, unsigned long hash_key) { peer->hash_key = hash_key; - rxrpc_assess_MTU_size(local, peer); - peer->mtu = peer->if_mtu; - peer->rtt_last_req = ktime_get_real(); + switch (peer->srx.transport.family) { case AF_INET: @@ -268,7 +280,11 @@ static void rxrpc_init_peer(struct rxrpc_local *local, struct rxrpc_peer *peer, } peer->hdrsize += sizeof(struct rxrpc_wire_header); - peer->maxdata = peer->mtu - peer->hdrsize; + peer->max_data = peer->if_mtu - peer->hdrsize; + + rxrpc_assess_MTU_size(local, peer); + + peer->rtt_last_req = ktime_get_real(); } /* diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index ce4d48bdfbe9..44722c226064 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -283,9 +283,7 @@ static int rxrpc_peer_seq_show(struct seq_file *seq, void *v) if (v == SEQ_START_TOKEN) { seq_puts(seq, - "Proto Local " - " Remote " - " Use SST MTU LastUse RTT RTO\n" + "Proto Local Remote Use SST Maxd LastUse RTT RTO\n" ); return 0; } @@ -298,13 +296,12 @@ static int rxrpc_peer_seq_show(struct seq_file *seq, void *v) now = ktime_get_seconds(); seq_printf(seq, - "UDP %-47.47s %-47.47s %3u" - " %3u %5u %6llus %8u %8u\n", + "UDP %-47.47s %-47.47s %3u %4u %5u %6llus %8u %8u\n", lbuff, rbuff, refcount_read(&peer->ref), peer->cong_ssthresh, - peer->mtu, + peer->max_data, now - peer->last_tx_at, peer->srtt_us >> 3, peer->rto_us); diff --git a/net/rxrpc/protocol.h b/net/rxrpc/protocol.h index 4fe6b4d20ada..42f70e4636f8 100644 --- a/net/rxrpc/protocol.h +++ b/net/rxrpc/protocol.h @@ -92,11 +92,16 @@ struct rxrpc_jumbo_header { /* * The maximum number of subpackets that can possibly fit in a UDP packet is: * - * ((max_IP - IP_hdr - UDP_hdr) / RXRPC_JUMBO_SUBPKTLEN) + 1 - * = ((65535 - 28 - 28) / 1416) + 1 - * = 46 non-terminal packets and 1 terminal packet. + * (max_UDP - wirehdr + jumbohdr) / (jumbohdr + 1412) + * = ((65535 - 28 + 4) / 1416) + * = 45 non-terminal packets and 1 terminal packet. */ -#define RXRPC_MAX_NR_JUMBO 47 +#define RXRPC_MAX_NR_JUMBO 46 + +/* Size of a jumbo packet with N subpackets, excluding UDP+IP */ +#define RXRPC_JUMBO(N) ((int)sizeof(struct rxrpc_wire_header) + \ + RXRPC_JUMBO_DATALEN + \ + ((N) - 1) * RXRPC_JUMBO_SUBPKTLEN) /*****************************************************************************/ /* diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c index 9bf9a1f6e4cb..46a20cf4c402 100644 --- a/net/rxrpc/sysctl.c +++ b/net/rxrpc/sysctl.c @@ -11,6 +11,8 @@ #include "ar-internal.h" static struct ctl_table_header *rxrpc_sysctl_reg_table; +static const unsigned int rxrpc_rx_mtu_min = 500; +static const unsigned int rxrpc_jumbo_max = RXRPC_MAX_NR_JUMBO; static const unsigned int four = 4; static const unsigned int max_backlog = RXRPC_BACKLOG_MAX - 1; static const unsigned int n_65535 = 65535; @@ -115,7 +117,7 @@ static struct ctl_table rxrpc_sysctl_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = (void *)SYSCTL_ONE, + .extra1 = (void *)&rxrpc_rx_mtu_min, .extra2 = (void *)&n_65535, }, { @@ -125,7 +127,7 @@ static struct ctl_table rxrpc_sysctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = (void *)SYSCTL_ONE, - .extra2 = (void *)&four, + .extra2 = (void *)&rxrpc_jumbo_max, }, }; diff --git a/net/rxrpc/txbuf.c b/net/rxrpc/txbuf.c index c3913d8a50d3..2a4291617d40 100644 --- a/net/rxrpc/txbuf.c +++ b/net/rxrpc/txbuf.c @@ -179,7 +179,8 @@ static void rxrpc_free_txbuf(struct rxrpc_txbuf *txb) trace_rxrpc_txbuf(txb->debug_id, txb->call_debug_id, txb->seq, 0, rxrpc_txbuf_free); for (i = 0; i < txb->nr_kvec; i++) - if (txb->kvec[i].iov_base) + if (txb->kvec[i].iov_base && + !is_zero_pfn(page_to_pfn(virt_to_page(txb->kvec[i].iov_base)))) page_frag_free(txb->kvec[i].iov_base); kfree(txb); atomic_dec(&rxrpc_nr_txbuf); -- 2.51.0 From 3d2bdf73cea57d7f6bf314aa1c948af11af94980 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:46:38 +0000 Subject: [PATCH 14/16] rxrpc: Separate the packet length from the data length in rxrpc_txbuf Separate the packet length from the data length (txb->len) stored in the rxrpc_txbuf to make security calculations easier. Also store the allocation size as that's an upper bound on the size of the security wrapper and change a number of fields to unsigned short as the amount of data can't exceed the capacity of a UDP packet. Also, whilst we're at it, use kzalloc() for txbufs. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20241204074710.990092-11-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- net/rxrpc/ar-internal.h | 8 +++++--- net/rxrpc/insecure.c | 1 + net/rxrpc/output.c | 7 ++++--- net/rxrpc/rxkad.c | 44 ++++++++++++++++++++++------------------- net/rxrpc/sendmsg.c | 1 - net/rxrpc/txbuf.c | 7 ++----- 6 files changed, 36 insertions(+), 32 deletions(-) diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 69e6f4b20bad..a5c0bc917641 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -821,9 +821,11 @@ struct rxrpc_txbuf { rxrpc_serial_t serial; /* Last serial number transmitted with */ unsigned int call_debug_id; unsigned int debug_id; - unsigned int len; /* Amount of data in buffer */ - unsigned int space; /* Remaining data space */ - unsigned int offset; /* Offset of fill point */ + unsigned short len; /* Amount of data in buffer */ + unsigned short space; /* Remaining data space */ + unsigned short offset; /* Offset of fill point */ + unsigned short pkt_len; /* Size of packet content */ + unsigned short alloc_size; /* Amount of bufferage allocated */ unsigned int flags; #define RXRPC_TXBUF_WIRE_FLAGS 0xff /* The wire protocol flags */ #define RXRPC_TXBUF_RESENT 0x100 /* Set if has been resent */ diff --git a/net/rxrpc/insecure.c b/net/rxrpc/insecure.c index 751eb621021d..d665f486be5f 100644 --- a/net/rxrpc/insecure.c +++ b/net/rxrpc/insecure.c @@ -24,6 +24,7 @@ static struct rxrpc_txbuf *none_alloc_txbuf(struct rxrpc_call *call, size_t rema static int none_secure_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) { + txb->pkt_len = txb->len; return 0; } diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index a91be871ad96..df9af4ad4260 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -383,11 +383,11 @@ static size_t rxrpc_prepare_data_subpacket(struct rxrpc_call *call, struct rxrpc enum rxrpc_req_ack_trace why; struct rxrpc_connection *conn = call->conn; struct kvec *kv = &call->local->kvec[subpkt]; - size_t len = txb->len; + size_t len = txb->pkt_len; bool last, more; u8 flags; - _enter("%x,{%d}", txb->seq, txb->len); + _enter("%x,%zd", txb->seq, len); txb->serial = serial; @@ -441,6 +441,7 @@ dont_set_request_ack: whdr->cksum = txb->cksum; whdr->serviceId = htons(conn->service_id); kv->iov_base = whdr; + len += sizeof(*whdr); // TODO: Convert into a jumbo header for tail subpackets trace_rxrpc_tx_data(call, txb->seq, txb->serial, flags, false); @@ -509,7 +510,7 @@ static int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t size_t len; int ret; - _enter("%x,{%d}", txb->seq, txb->len); + _enter("%x,{%d}", txb->seq, txb->pkt_len); len = rxrpc_prepare_data_packet(call, txb); diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index e3194d73dd84..755897fab626 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -148,14 +148,14 @@ error: static struct rxrpc_txbuf *rxkad_alloc_txbuf(struct rxrpc_call *call, size_t remain, gfp_t gfp) { struct rxrpc_txbuf *txb; - size_t shdr, space; + size_t shdr, alloc, limit, part; remain = umin(remain, 65535 - sizeof(struct rxrpc_wire_header)); switch (call->conn->security_level) { default: - space = umin(remain, RXRPC_JUMBO_DATALEN); - return rxrpc_alloc_data_txbuf(call, space, 1, gfp); + alloc = umin(remain, RXRPC_JUMBO_DATALEN); + return rxrpc_alloc_data_txbuf(call, alloc, 1, gfp); case RXRPC_SECURITY_AUTH: shdr = sizeof(struct rxkad_level1_hdr); break; @@ -164,15 +164,21 @@ static struct rxrpc_txbuf *rxkad_alloc_txbuf(struct rxrpc_call *call, size_t rem break; } - space = umin(round_down(RXRPC_JUMBO_DATALEN, RXKAD_ALIGN), remain + shdr); - space = round_up(space, RXKAD_ALIGN); + limit = round_down(RXRPC_JUMBO_DATALEN, RXKAD_ALIGN) - shdr; + if (remain < limit) { + part = remain; + alloc = round_up(shdr + part, RXKAD_ALIGN); + } else { + part = limit; + alloc = RXRPC_JUMBO_DATALEN; + } - txb = rxrpc_alloc_data_txbuf(call, space, RXKAD_ALIGN, gfp); + txb = rxrpc_alloc_data_txbuf(call, alloc, RXKAD_ALIGN, gfp); if (!txb) return NULL; txb->offset += shdr; - txb->space -= shdr; + txb->space = part; return txb; } @@ -263,13 +269,13 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call, check = txb->seq ^ call->call_id; hdr->data_size = htonl((u32)check << 16 | txb->len); - txb->len += sizeof(struct rxkad_level1_hdr); - pad = txb->len; + txb->pkt_len = sizeof(struct rxkad_level1_hdr) + txb->len; + pad = txb->pkt_len; pad = RXKAD_ALIGN - pad; pad &= RXKAD_ALIGN - 1; if (pad) { memset(txb->kvec[0].iov_base + txb->offset, 0, pad); - txb->len += pad; + txb->pkt_len += pad; } /* start the encryption afresh */ @@ -298,7 +304,7 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call, struct rxkad_level2_hdr *rxkhdr = (void *)(whdr + 1); struct rxrpc_crypt iv; struct scatterlist sg; - size_t pad; + size_t content, pad; u16 check; int ret; @@ -309,23 +315,20 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call, rxkhdr->data_size = htonl(txb->len | (u32)check << 16); rxkhdr->checksum = 0; - txb->len += sizeof(struct rxkad_level2_hdr); - pad = txb->len; - pad = RXKAD_ALIGN - pad; - pad &= RXKAD_ALIGN - 1; - if (pad) { + content = sizeof(struct rxkad_level2_hdr) + txb->len; + txb->pkt_len = round_up(content, RXKAD_ALIGN); + pad = txb->pkt_len - content; + if (pad) memset(txb->kvec[0].iov_base + txb->offset, 0, pad); - txb->len += pad; - } /* encrypt from the session key */ token = call->conn->key->payload.data[0]; memcpy(&iv, token->kad->session_key, sizeof(iv)); - sg_init_one(&sg, rxkhdr, txb->len); + sg_init_one(&sg, rxkhdr, txb->pkt_len); skcipher_request_set_sync_tfm(req, call->conn->rxkad.cipher); skcipher_request_set_callback(req, 0, NULL, NULL); - skcipher_request_set_crypt(req, &sg, &sg, txb->len, iv.x); + skcipher_request_set_crypt(req, &sg, &sg, txb->pkt_len, iv.x); ret = crypto_skcipher_encrypt(req); skcipher_request_zero(req); return ret; @@ -384,6 +387,7 @@ static int rxkad_secure_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) switch (call->conn->security_level) { case RXRPC_SECURITY_PLAIN: + txb->pkt_len = txb->len; ret = 0; break; case RXRPC_SECURITY_AUTH: diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 546abb463c3f..786c1fb1369a 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -391,7 +391,6 @@ reload: goto out; txb->kvec[0].iov_len += txb->len; - txb->len = txb->kvec[0].iov_len; rxrpc_queue_packet(rx, call, txb, notify_end_tx); txb = NULL; } diff --git a/net/rxrpc/txbuf.c b/net/rxrpc/txbuf.c index 2a4291617d40..8b7c854ed3d7 100644 --- a/net/rxrpc/txbuf.c +++ b/net/rxrpc/txbuf.c @@ -24,7 +24,7 @@ struct rxrpc_txbuf *rxrpc_alloc_data_txbuf(struct rxrpc_call *call, size_t data_ size_t total, hoff; void *buf; - txb = kmalloc(sizeof(*txb), gfp); + txb = kzalloc(sizeof(*txb), gfp); if (!txb) return NULL; @@ -49,14 +49,11 @@ struct rxrpc_txbuf *rxrpc_alloc_data_txbuf(struct rxrpc_call *call, size_t data_ txb->last_sent = KTIME_MIN; txb->call_debug_id = call->debug_id; txb->debug_id = atomic_inc_return(&rxrpc_txbuf_debug_ids); + txb->alloc_size = data_size; txb->space = data_size; - txb->len = 0; txb->offset = sizeof(*whdr); txb->flags = call->conn->out_clientflag; - txb->ack_why = 0; txb->seq = call->tx_prepared + 1; - txb->serial = 0; - txb->cksum = 0; txb->nr_kvec = 1; txb->kvec[0].iov_base = whdr; txb->kvec[0].iov_len = sizeof(*whdr); -- 2.51.0 From b7313009c2e56d6e8bffd3d21c1a3a67a9149e2e Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:46:39 +0000 Subject: [PATCH 15/16] rxrpc: Prepare to be able to send jumbo DATA packets Prepare to be able to send jumbo DATA packets if the we decide to, but don't enable that yet. This will allow larger chunks of data to be sent without reducing the retryability as the subpackets in a jumbo packet can also be retransmitted individually. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20241204074710.990092-12-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- net/rxrpc/ar-internal.h | 18 +++++++++- net/rxrpc/call_event.c | 48 ++++++++++++++----------- net/rxrpc/input.c | 36 +++++++++++-------- net/rxrpc/insecure.c | 2 ++ net/rxrpc/output.c | 80 ++++++++++++++++++++++++++++------------- net/rxrpc/rxkad.c | 13 +++++++ 6 files changed, 137 insertions(+), 60 deletions(-) diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index a5c0bc917641..4386b2e6cca5 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -832,6 +832,7 @@ struct rxrpc_txbuf { __be16 cksum; /* Checksum to go in header */ unsigned short ack_rwind; /* ACK receive window */ u8 /*enum rxrpc_propose_ack_trace*/ ack_why; /* If ack, why */ + bool jumboable; /* Can be non-terminal jumbo subpacket */ u8 nr_kvec; /* Amount of kvec[] used */ struct kvec kvec[3]; }; @@ -862,6 +863,21 @@ static inline rxrpc_serial_t rxrpc_get_next_serial(struct rxrpc_connection *conn return serial; } +/* + * Allocate the next serial n numbers on a connection. 0 must be skipped. + */ +static inline rxrpc_serial_t rxrpc_get_next_serials(struct rxrpc_connection *conn, + unsigned int n) +{ + rxrpc_serial_t serial; + + serial = conn->tx_serial; + if (serial + n <= n) + serial = 1; + conn->tx_serial = serial + n; + return serial; +} + /* * af_rxrpc.c */ @@ -1176,7 +1192,7 @@ int rxrpc_send_abort_packet(struct rxrpc_call *); void rxrpc_send_conn_abort(struct rxrpc_connection *conn); void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb); void rxrpc_send_keepalive(struct rxrpc_peer *); -void rxrpc_transmit_one(struct rxrpc_call *call, struct rxrpc_txbuf *txb); +void rxrpc_transmit_data(struct rxrpc_call *call, struct rxrpc_txbuf *txb, int n); /* * peer_event.c diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 1d889b6f0366..3379adfaaf65 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -124,7 +124,7 @@ void rxrpc_resend(struct rxrpc_call *call, struct sk_buff *ack_skb) ktime_sub(resend_at, now)); txb->flags |= RXRPC_TXBUF_RESENT; - rxrpc_transmit_one(call, txb); + rxrpc_transmit_data(call, txb, 1); did_send = true; now = ktime_get_real(); @@ -164,7 +164,7 @@ void rxrpc_resend(struct rxrpc_call *call, struct sk_buff *ack_skb) unacked = true; txb->flags |= RXRPC_TXBUF_RESENT; - rxrpc_transmit_one(call, txb); + rxrpc_transmit_data(call, txb, 1); did_send = true; rxrpc_inc_stat(call->rxnet, stat_tx_data_retrans); now = ktime_get_real(); @@ -231,15 +231,12 @@ static void rxrpc_close_tx_phase(struct rxrpc_call *call) } } -static bool rxrpc_tx_window_has_space(struct rxrpc_call *call) +static unsigned int rxrpc_tx_window_space(struct rxrpc_call *call) { - unsigned int winsize = umin(call->tx_winsize, call->cong_cwnd + call->cong_extra); - rxrpc_seq_t window = call->acks_hard_ack, wtop = window + winsize; - rxrpc_seq_t tx_top = call->tx_top; - int space; + int winsize = umin(call->tx_winsize, call->cong_cwnd + call->cong_extra); + int in_flight = call->tx_top - call->acks_hard_ack; - space = wtop - tx_top; - return space > 0; + return max(winsize - in_flight, 0); } /* @@ -247,7 +244,7 @@ static bool rxrpc_tx_window_has_space(struct rxrpc_call *call) */ static void rxrpc_decant_prepared_tx(struct rxrpc_call *call) { - struct rxrpc_txbuf *txb; + int space = rxrpc_tx_window_space(call); if (!test_bit(RXRPC_CALL_EXPOSED, &call->flags)) { if (list_empty(&call->tx_sendmsg)) @@ -255,22 +252,33 @@ static void rxrpc_decant_prepared_tx(struct rxrpc_call *call) rxrpc_expose_client_call(call); } - while ((txb = list_first_entry_or_null(&call->tx_sendmsg, - struct rxrpc_txbuf, call_link))) { + while (space > 0) { + struct rxrpc_txbuf *head = NULL, *txb; + int count = 0, limit = min(space, 1); + + if (list_empty(&call->tx_sendmsg)) + break; + spin_lock(&call->tx_lock); - list_del(&txb->call_link); + do { + txb = list_first_entry(&call->tx_sendmsg, + struct rxrpc_txbuf, call_link); + if (!head) + head = txb; + list_move_tail(&txb->call_link, &call->tx_buffer); + count++; + if (!txb->jumboable) + break; + } while (count < limit && !list_empty(&call->tx_sendmsg)); + spin_unlock(&call->tx_lock); call->tx_top = txb->seq; - list_add_tail(&txb->call_link, &call->tx_buffer); - if (txb->flags & RXRPC_LAST_PACKET) rxrpc_close_tx_phase(call); - rxrpc_transmit_one(call, txb); - - if (!rxrpc_tx_window_has_space(call)) - break; + space -= count; + rxrpc_transmit_data(call, head, count); } } @@ -285,7 +293,7 @@ static void rxrpc_transmit_some_data(struct rxrpc_call *call) case RXRPC_CALL_SERVER_SEND_REPLY: case RXRPC_CALL_CLIENT_SEND_REQUEST: - if (!rxrpc_tx_window_has_space(call)) + if (!rxrpc_tx_window_space(call)) return; if (list_empty(&call->tx_sendmsg)) { rxrpc_inc_stat(call->rxnet, stat_tx_data_underflow); diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index fd08d813ef29..8398fa10ee8d 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -693,9 +693,12 @@ static void rxrpc_input_ack_trailer(struct rxrpc_call *call, struct sk_buff *skb { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct rxrpc_peer *peer = call->peer; - unsigned int max_data; + unsigned int max_data, capacity; bool wake = false; - u32 rwind = ntohl(trailer->rwind); + u32 max_mtu = ntohl(trailer->maxMTU); + //u32 if_mtu = ntohl(trailer->ifMTU); + u32 rwind = ntohl(trailer->rwind); + u32 jumbo_max = ntohl(trailer->jumbo_max); if (rwind > RXRPC_TX_MAX_WINDOW) rwind = RXRPC_TX_MAX_WINDOW; @@ -706,24 +709,29 @@ static void rxrpc_input_ack_trailer(struct rxrpc_call *call, struct sk_buff *skb call->tx_winsize = rwind; } - if (trailer->jumbo_max == 0) { - /* The peer says it supports pmtu discovery */ - peer->ackr_adv_pmtud = true; - } else { - peer->ackr_adv_pmtud = false; - } - - max_data = ntohl(trailer->maxMTU); - peer->ackr_max_data = max_data; + max_mtu = clamp(max_mtu, 500, 65535); + peer->ackr_max_data = max_mtu; - if (max_data < peer->max_data) { - trace_rxrpc_pmtud_reduce(peer, sp->hdr.serial, max_data, + if (max_mtu < peer->max_data) { + trace_rxrpc_pmtud_reduce(peer, sp->hdr.serial, max_mtu, rxrpc_pmtud_reduce_ack); write_seqcount_begin(&peer->mtu_lock); - peer->max_data = max_data; + peer->max_data = max_mtu; write_seqcount_end(&peer->mtu_lock); } + max_data = umin(max_mtu, peer->max_data); + capacity = max_data; + capacity += sizeof(struct rxrpc_jumbo_header); /* First subpacket has main hdr, not jumbo */ + capacity /= sizeof(struct rxrpc_jumbo_header) + RXRPC_JUMBO_DATALEN; + + if (jumbo_max == 0) { + /* The peer says it supports pmtu discovery */ + peer->ackr_adv_pmtud = true; + } else { + peer->ackr_adv_pmtud = false; + } + if (wake) wake_up(&call->waitq); } diff --git a/net/rxrpc/insecure.c b/net/rxrpc/insecure.c index d665f486be5f..e068f9b79d02 100644 --- a/net/rxrpc/insecure.c +++ b/net/rxrpc/insecure.c @@ -25,6 +25,8 @@ static struct rxrpc_txbuf *none_alloc_txbuf(struct rxrpc_call *call, size_t rema static int none_secure_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) { txb->pkt_len = txb->len; + if (txb->len == RXRPC_JUMBO_DATALEN) + txb->jumboable = true; return 0; } diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index df9af4ad4260..aededdd474d7 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -377,9 +377,10 @@ int rxrpc_send_abort_packet(struct rxrpc_call *call) */ static size_t rxrpc_prepare_data_subpacket(struct rxrpc_call *call, struct rxrpc_txbuf *txb, rxrpc_serial_t serial, - int subpkt) + int subpkt, int nr_subpkts) { struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base; + struct rxrpc_jumbo_header *jumbo = (void *)(whdr + 1) - sizeof(*jumbo); enum rxrpc_req_ack_trace why; struct rxrpc_connection *conn = call->conn; struct kvec *kv = &call->local->kvec[subpkt]; @@ -399,6 +400,11 @@ static size_t rxrpc_prepare_data_subpacket(struct rxrpc_call *call, struct rxrpc flags = txb->flags & RXRPC_TXBUF_WIRE_FLAGS; last = txb->flags & RXRPC_LAST_PACKET; + if (subpkt < nr_subpkts - 1) { + len = RXRPC_JUMBO_DATALEN; + goto dont_set_request_ack; + } + more = (!list_is_last(&txb->call_link, &call->tx_buffer) || !list_empty(&call->tx_sendmsg)); @@ -436,13 +442,25 @@ static size_t rxrpc_prepare_data_subpacket(struct rxrpc_call *call, struct rxrpc } dont_set_request_ack: - whdr->flags = flags; - whdr->serial = htonl(txb->serial); - whdr->cksum = txb->cksum; - whdr->serviceId = htons(conn->service_id); - kv->iov_base = whdr; - len += sizeof(*whdr); - // TODO: Convert into a jumbo header for tail subpackets + /* The jumbo header overlays the wire header in the txbuf. */ + if (subpkt < nr_subpkts - 1) + flags |= RXRPC_JUMBO_PACKET; + else + flags &= ~RXRPC_JUMBO_PACKET; + if (subpkt == 0) { + whdr->flags = flags; + whdr->serial = htonl(txb->serial); + whdr->cksum = txb->cksum; + whdr->serviceId = htons(conn->service_id); + kv->iov_base = whdr; + len += sizeof(*whdr); + } else { + jumbo->flags = flags; + jumbo->pad = 0; + jumbo->cksum = txb->cksum; + kv->iov_base = jumbo; + len += sizeof(*jumbo); + } trace_rxrpc_tx_data(call, txb->seq, txb->serial, flags, false); kv->iov_len = len; @@ -450,18 +468,22 @@ dont_set_request_ack: } /* - * Prepare a packet for transmission. + * Prepare a (jumbo) packet for transmission. */ -static size_t rxrpc_prepare_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) +static size_t rxrpc_prepare_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *head, int n) { + struct rxrpc_txbuf *txb = head; rxrpc_serial_t serial; size_t len = 0; /* Each transmission of a Tx packet needs a new serial number */ - serial = rxrpc_get_next_serial(call->conn); + serial = rxrpc_get_next_serials(call->conn, n); - len += rxrpc_prepare_data_subpacket(call, txb, serial, 0); - // TODO: Loop around adding tail subpackets + for (int i = 0; i < n; i++) { + len += rxrpc_prepare_data_subpacket(call, txb, serial, i, n); + serial++; + txb = list_next_entry(txb, call_link); + } return len; } @@ -469,16 +491,24 @@ static size_t rxrpc_prepare_data_packet(struct rxrpc_call *call, struct rxrpc_tx /* * Set timeouts after transmitting a packet. */ -static void rxrpc_tstamp_data_packets(struct rxrpc_call *call, struct rxrpc_txbuf *txb) +static void rxrpc_tstamp_data_packets(struct rxrpc_call *call, struct rxrpc_txbuf *txb, int n) { + rxrpc_serial_t serial; ktime_t now = ktime_get_real(); bool ack_requested = txb->flags & RXRPC_REQUEST_ACK; + int i; call->tx_last_sent = now; - txb->last_sent = now; + + for (i = 0; i < n; i++) { + txb->last_sent = now; + ack_requested |= txb->flags & RXRPC_REQUEST_ACK; + serial = txb->serial; + txb = list_next_entry(txb, call_link); + } if (ack_requested) { - rxrpc_begin_rtt_probe(call, txb->serial, now, rxrpc_rtt_tx_data); + rxrpc_begin_rtt_probe(call, serial, now, rxrpc_rtt_tx_data); call->peer->rtt_last_req = now; if (call->peer->rtt_count > 1) { @@ -502,7 +532,7 @@ static void rxrpc_tstamp_data_packets(struct rxrpc_call *call, struct rxrpc_txbu /* * send a packet through the transport endpoint */ -static int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) +static int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb, int n) { struct rxrpc_connection *conn = call->conn; enum rxrpc_tx_point frag; @@ -512,7 +542,7 @@ static int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t _enter("%x,{%d}", txb->seq, txb->pkt_len); - len = rxrpc_prepare_data_packet(call, txb); + len = rxrpc_prepare_data_packet(call, txb, n); if (IS_ENABLED(CONFIG_AF_RXRPC_INJECT_LOSS)) { static int lose; @@ -524,7 +554,7 @@ static int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t } } - iov_iter_kvec(&msg.msg_iter, WRITE, call->local->kvec, 1, len); + iov_iter_kvec(&msg.msg_iter, WRITE, call->local->kvec, n, len); msg.msg_name = &call->peer->srx.transport; msg.msg_namelen = call->peer->srx.transport_len; @@ -537,7 +567,7 @@ static int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t * yet. */ if (txb->seq == call->tx_transmitted + 1) - call->tx_transmitted = txb->seq; + call->tx_transmitted = txb->seq + n - 1; /* send the packet with the don't fragment bit set if we currently * think it's small enough */ @@ -568,7 +598,7 @@ retry: } rxrpc_tx_backoff(call, ret); - if (ret == -EMSGSIZE && frag == rxrpc_tx_point_call_data_frag) { + if (ret == -EMSGSIZE && frag == rxrpc_tx_point_call_data_nofrag) { rxrpc_local_dont_fragment(conn->local, false); frag = rxrpc_tx_point_call_data_frag; goto retry; @@ -576,7 +606,7 @@ retry: done: if (ret >= 0) { - rxrpc_tstamp_data_packets(call, txb); + rxrpc_tstamp_data_packets(call, txb, n); } else { /* Cancel the call if the initial transmission fails, * particularly if that's due to network routing issues that @@ -776,13 +806,13 @@ static inline void rxrpc_instant_resend(struct rxrpc_call *call, } /* - * Transmit one packet. + * Transmit a packet, possibly gluing several subpackets together. */ -void rxrpc_transmit_one(struct rxrpc_call *call, struct rxrpc_txbuf *txb) +void rxrpc_transmit_data(struct rxrpc_call *call, struct rxrpc_txbuf *txb, int n) { int ret; - ret = rxrpc_send_data_packet(call, txb); + ret = rxrpc_send_data_packet(call, txb, n); if (ret < 0) { switch (ret) { case -ENETUNREACH: diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 755897fab626..62b09d23ec08 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -392,15 +392,28 @@ static int rxkad_secure_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) break; case RXRPC_SECURITY_AUTH: ret = rxkad_secure_packet_auth(call, txb, req); + if (txb->alloc_size == RXRPC_JUMBO_DATALEN) + txb->jumboable = true; break; case RXRPC_SECURITY_ENCRYPT: ret = rxkad_secure_packet_encrypt(call, txb, req); + if (txb->alloc_size == RXRPC_JUMBO_DATALEN) + txb->jumboable = true; break; default: ret = -EPERM; break; } + /* Clear excess space in the packet */ + if (txb->pkt_len < txb->alloc_size) { + struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base; + size_t gap = txb->alloc_size - txb->pkt_len; + void *p = whdr + 1; + + memset(p + txb->pkt_len, 0, gap); + } + skcipher_request_free(req); _leave(" = %d [set %x]", ret, y); return ret; -- 2.51.0 From 149d002bee706f51772bd320cda90c922844bb0e Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:46:40 +0000 Subject: [PATCH 16/16] rxrpc: Add a tracepoint to show variables pertinent to jumbo packet size Add a tracepoint to be called right before packets are transmitted for the first time that shows variable values that are pertinent to how many subpackets will be added to a jumbo DATA packet. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20241204074710.990092-13-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- include/trace/events/rxrpc.h | 41 ++++++++++++++++++++++++++++++++++++ net/rxrpc/call_event.c | 2 ++ 2 files changed, 43 insertions(+) diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 9dcadad88e76..71f07e726a90 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -903,6 +903,47 @@ TRACE_EVENT(rxrpc_txqueue, __entry->tx_winsize) ); +TRACE_EVENT(rxrpc_transmit, + TP_PROTO(struct rxrpc_call *call, int space), + + TP_ARGS(call, space), + + TP_STRUCT__entry( + __field(unsigned int, call) + __field(rxrpc_seq_t, seq) + __field(u16, space) + __field(u16, tx_winsize) + __field(u16, cong_cwnd) + __field(u16, cong_extra) + __field(u16, in_flight) + __field(u16, prepared) + __field(u16, pmtud_jumbo) + ), + + TP_fast_assign( + __entry->call = call->debug_id; + __entry->seq = call->tx_bottom; + __entry->space = space; + __entry->tx_winsize = call->tx_winsize; + __entry->cong_cwnd = call->cong_cwnd; + __entry->cong_extra = call->cong_extra; + __entry->prepared = call->tx_prepared - call->tx_bottom; + __entry->in_flight = call->tx_top - call->acks_hard_ack; + __entry->pmtud_jumbo = call->peer->pmtud_jumbo; + ), + + TP_printk("c=%08x q=%08x sp=%u tw=%u cw=%u+%u pr=%u if=%u pj=%u", + __entry->call, + __entry->seq, + __entry->space, + __entry->tx_winsize, + __entry->cong_cwnd, + __entry->cong_extra, + __entry->prepared, + __entry->in_flight, + __entry->pmtud_jumbo) + ); + TRACE_EVENT(rxrpc_rx_data, TP_PROTO(unsigned int call, rxrpc_seq_t seq, rxrpc_serial_t serial, u8 flags), diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 3379adfaaf65..1f716f09d441 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -259,6 +259,8 @@ static void rxrpc_decant_prepared_tx(struct rxrpc_call *call) if (list_empty(&call->tx_sendmsg)) break; + trace_rxrpc_transmit(call, space); + spin_lock(&call->tx_lock); do { txb = list_first_entry(&call->tx_sendmsg, -- 2.51.0