]> www.infradead.org Git - users/hch/misc.git/commitdiff
net: af_packet: Use hrtimer to do the retire operation
authorXin Zhao <jackzxcui1989@163.com>
Mon, 8 Sep 2025 10:45:49 +0000 (18:45 +0800)
committerJakub Kicinski <kuba@kernel.org>
Fri, 12 Sep 2025 01:40:06 +0000 (18:40 -0700)
In a system with high real-time requirements, the timeout mechanism of
ordinary timers with jiffies granularity is insufficient to meet the
demands for real-time performance. Meanwhile, the optimization of CPU
usage with af_packet is quite significant. Use hrtimer instead of timer
to help compensate for the shortcomings in real-time performance.
In HZ=100 or HZ=250 system, the update of TP_STATUS_USER is not real-time
enough, with fluctuations reaching over 8ms (on a system with HZ=250).
This is unacceptable in some high real-time systems that require timely
processing of network packets. By replacing it with hrtimer, if a timeout
of 2ms is set, the update of TP_STATUS_USER can be stabilized to within
3 ms.

Delete delete_blk_timer field, because hrtimer_cancel will check and wait
until the timer callback return and ensure never enter callback again.

Simplify the logic related to setting timeout, only update the hrtimer
expire time within the hrtimer callback, no longer update the expire time
in prb_open_block which is called by tpacket_rcv or timer callback.
Reasons why NOT update hrtimer in prb_open_block:
1) It will increase complexity to distinguish the two caller scenario.
2) hrtimer_cancel and hrtimer_start need to be called if you want to update
TMO of an already enqueued hrtimer, leading to complex shutdown logic.

One side effect of NOT update hrtimer when called by tpacket_rcv is that
a newly opened block triggered by tpacket_rcv may be retired earlier than
expected. On the other hand, if timeout is updated in prb_open_block, the
frequent reception of network packets that leads to prb_open_block being
called may cause hrtimer to be removed and enqueued repeatedly.

The retire hrtimer expiration is unconditional and periodic. If there are
numerous packet sockets on the system, please set an appropriate timeout
to avoid frequent enqueueing of hrtimers.

Reviewed-by: Willem de Bruijn <willemdebruijn.kernel@gmail.com>
Reviewed-by: Jason Xing <kerneljasonxing@gmail.com>
Link: https://lore.kernel.org/all/20250831100822.1238795-1-jackzxcui1989@163.com/
Signed-off-by: Xin Zhao <jackzxcui1989@163.com>
Link: https://patch.msgid.link/20250908104549.204412-3-jackzxcui1989@163.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
net/packet/af_packet.c
net/packet/diag.c
net/packet/internal.h

index 230cb87646152b5daf3cb49ecfa2e1feee94aa5b..173e6edda08f8c431c0777bb9764b4fb37d9c65c 100644 (file)
@@ -203,8 +203,7 @@ static void prb_retire_current_block(struct tpacket_kbdq_core *,
 static int prb_queue_frozen(struct tpacket_kbdq_core *);
 static void prb_open_block(struct tpacket_kbdq_core *,
                struct tpacket_block_desc *);
-static void prb_retire_rx_blk_timer_expired(struct timer_list *);
-static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
+static enum hrtimer_restart prb_retire_rx_blk_timer_expired(struct hrtimer *);
 static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
 static void prb_clear_rxhash(struct tpacket_kbdq_core *,
                struct tpacket3_hdr *);
@@ -579,33 +578,13 @@ static __be16 vlan_get_protocol_dgram(const struct sk_buff *skb)
        return proto;
 }
 
-static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
-{
-       timer_delete_sync(&pkc->retire_blk_timer);
-}
-
 static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
                struct sk_buff_head *rb_queue)
 {
        struct tpacket_kbdq_core *pkc;
 
        pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
-
-       spin_lock_bh(&rb_queue->lock);
-       pkc->delete_blk_timer = 1;
-       spin_unlock_bh(&rb_queue->lock);
-
-       prb_del_retire_blk_timer(pkc);
-}
-
-static void prb_setup_retire_blk_timer(struct packet_sock *po)
-{
-       struct tpacket_kbdq_core *pkc;
-
-       pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
-       timer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired,
-                   0);
-       pkc->retire_blk_timer.expires = jiffies;
+       hrtimer_cancel(&pkc->retire_blk_timer);
 }
 
 static int prb_calc_retire_blk_tmo(struct packet_sock *po,
@@ -671,53 +650,34 @@ static void init_prb_bdqc(struct packet_sock *po,
        p1->version = po->tp_version;
        po->stats.stats3.tp_freeze_q_cnt = 0;
        if (req_u->req3.tp_retire_blk_tov)
-               p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
+               p1->interval_ktime = ms_to_ktime(req_u->req3.tp_retire_blk_tov);
        else
-               p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
-                                               req_u->req3.tp_block_size);
-       p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
+               p1->interval_ktime = ms_to_ktime(prb_calc_retire_blk_tmo(po,
+                                                req_u->req3.tp_block_size));
        p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
        rwlock_init(&p1->blk_fill_in_prog_lock);
 
        p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
        prb_init_ft_ops(p1, req_u);
-       prb_setup_retire_blk_timer(po);
+       hrtimer_setup(&p1->retire_blk_timer, prb_retire_rx_blk_timer_expired,
+                     CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
+       hrtimer_start(&p1->retire_blk_timer, p1->interval_ktime,
+                     HRTIMER_MODE_REL_SOFT);
        prb_open_block(p1, pbd);
 }
 
-/*  Do NOT update the last_blk_num first.
- *  Assumes sk_buff_head lock is held.
- */
-static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
-{
-       mod_timer(&pkc->retire_blk_timer,
-                       jiffies + pkc->tov_in_jiffies);
-}
-
 /*
- * Timer logic:
- * 1) We refresh the timer only when we open a block.
- *    By doing this we don't waste cycles refreshing the timer
- *       on packet-by-packet basis.
- *
  * With a 1MB block-size, on a 1Gbps line, it will take
  * i) ~8 ms to fill a block + ii) memcpy etc.
  * In this cut we are not accounting for the memcpy time.
  *
- * So, if the user sets the 'tmo' to 10ms then the timer
- * will never fire while the block is still getting filled
- * (which is what we want). However, the user could choose
- * to close a block early and that's fine.
- *
- * But when the timer does fire, we check whether or not to refresh it.
  * Since the tmo granularity is in msecs, it is not too expensive
  * to refresh the timer, lets say every '8' msecs.
  * Either the user can set the 'tmo' or we can derive it based on
  * a) line-speed and b) block-size.
  * prb_calc_retire_blk_tmo() calculates the tmo.
- *
  */
-static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
+static enum hrtimer_restart prb_retire_rx_blk_timer_expired(struct hrtimer *t)
 {
        struct packet_sock *po =
                timer_container_of(po, t, rx_ring.prb_bdqc.retire_blk_timer);
@@ -730,9 +690,6 @@ static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
        frozen = prb_queue_frozen(pkc);
        pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
 
-       if (unlikely(pkc->delete_blk_timer))
-               goto out;
-
        /* We only need to plug the race when the block is partially filled.
         * tpacket_rcv:
         *              lock(); increment BLOCK_NUM_PKTS; unlock()
@@ -749,26 +706,16 @@ static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
        }
 
        if (!frozen) {
-               if (!BLOCK_NUM_PKTS(pbd)) {
-                       /* An empty block. Just refresh the timer. */
-                       goto refresh_timer;
+               if (BLOCK_NUM_PKTS(pbd)) {
+                       /* Not an empty block. Need retire the block. */
+                       prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
+                       prb_dispatch_next_block(pkc, po);
                }
-               prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
-               if (!prb_dispatch_next_block(pkc, po))
-                       goto refresh_timer;
-               else
-                       goto out;
        } else {
                /* Case 1. Queue was frozen because user-space was
                 * lagging behind.
                 */
-               if (prb_curr_blk_in_use(pbd)) {
-                       /*
-                        * Ok, user-space is still behind.
-                        * So just refresh the timer.
-                        */
-                       goto refresh_timer;
-               } else {
+               if (!prb_curr_blk_in_use(pbd)) {
                        /* Case 2. queue was frozen,user-space caught up,
                         * now the link went idle && the timer fired.
                         * We don't have a block to close.So we open this
@@ -777,15 +724,12 @@ static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
                         * Thawing/timer-refresh is a side effect.
                         */
                        prb_open_block(pkc, pbd);
-                       goto out;
                }
        }
 
-refresh_timer:
-       _prb_refresh_rx_retire_blk_timer(pkc);
-
-out:
+       hrtimer_forward_now(&pkc->retire_blk_timer, pkc->interval_ktime);
        spin_unlock(&po->sk.sk_receive_queue.lock);
+       return HRTIMER_RESTART;
 }
 
 static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
@@ -879,11 +823,18 @@ static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
 }
 
 /*
- * Side effect of opening a block:
+ * prb_open_block is called by tpacket_rcv or timer callback.
  *
- * 1) prb_queue is thawed.
- * 2) retire_blk_timer is refreshed.
+ * Reasons why NOT update hrtimer in prb_open_block:
+ * 1) It will increase complexity to distinguish the two caller scenario.
+ * 2) hrtimer_cancel and hrtimer_start need to be called if you want to update
+ * TMO of an already enqueued hrtimer, leading to complex shutdown logic.
  *
+ * One side effect of NOT update hrtimer when called by tpacket_rcv is that
+ * a newly opened block triggered by tpacket_rcv may be retired earlier than
+ * expected. On the other hand, if timeout is updated in prb_open_block, the
+ * frequent reception of network packets that leads to prb_open_block being
+ * called may cause hrtimer to be removed and enqueued repeatedly.
  */
 static void prb_open_block(struct tpacket_kbdq_core *pkc1,
        struct tpacket_block_desc *pbd1)
@@ -917,7 +868,6 @@ static void prb_open_block(struct tpacket_kbdq_core *pkc1,
        pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
 
        prb_thaw_queue(pkc1);
-       _prb_refresh_rx_retire_blk_timer(pkc1);
 
        smp_wmb();
 }
index 6ce1dcc284d92021ca7b53b9a0fd5626918ef8aa..c8f43e0c1925fab8ef6c39de3547dcd6f7389b81 100644 (file)
@@ -83,7 +83,7 @@ static int pdiag_put_ring(struct packet_ring_buffer *ring, int ver, int nl_type,
        pdr.pdr_frame_nr = ring->frame_max + 1;
 
        if (ver > TPACKET_V2) {
-               pdr.pdr_retire_tmo = ring->prb_bdqc.retire_blk_tov;
+               pdr.pdr_retire_tmo = ktime_to_ms(ring->prb_bdqc.interval_ktime);
                pdr.pdr_sizeof_priv = ring->prb_bdqc.blk_sizeof_priv;
                pdr.pdr_features = ring->prb_bdqc.feature_req_word;
        } else {
index d367b9f93a73a09e346086071df7978c1fd73d2c..b76e645cd78d1a8d679facb1108549fb44d1a969 100644 (file)
@@ -20,10 +20,11 @@ struct tpacket_kbdq_core {
        unsigned int    feature_req_word;
        unsigned int    hdrlen;
        unsigned char   reset_pending_on_curr_blk;
-       unsigned char   delete_blk_timer;
        unsigned short  kactive_blk_num;
        unsigned short  blk_sizeof_priv;
 
+       unsigned short  version;
+
        char            *pkblk_start;
        char            *pkblk_end;
        int             kblk_size;
@@ -32,6 +33,7 @@ struct tpacket_kbdq_core {
        uint64_t        knxt_seq_num;
        char            *prev;
        char            *nxt_offset;
+
        struct sk_buff  *skb;
 
        rwlock_t        blk_fill_in_prog_lock;
@@ -39,12 +41,10 @@ struct tpacket_kbdq_core {
        /* Default is set to 8ms */
 #define DEFAULT_PRB_RETIRE_TOV (8)
 
-       unsigned short  retire_blk_tov;
-       unsigned short  version;
-       unsigned long   tov_in_jiffies;
+       ktime_t         interval_ktime;
 
        /* timer to retire an outstanding block */
-       struct timer_list retire_blk_timer;
+       struct hrtimer  retire_blk_timer;
 };
 
 struct pgv {