bpf: Defer work in bpf_timer_cancel_and_free

author Kumar Kartikeya Dwivedi <memxor@gmail.com>

Tue, 9 Jul 2024 18:54:39 +0000 (18:54 +0000)

committer Alexei Starovoitov <ast@kernel.org>

Wed, 10 Jul 2024 22:59:44 +0000 (15:59 -0700)
author Kumar Kartikeya Dwivedi <memxor@gmail.com>
Tue, 9 Jul 2024 18:54:39 +0000 (18:54 +0000)
committer Alexei Starovoitov <ast@kernel.org>
Wed, 10 Jul 2024 22:59:44 +0000 (15:59 -0700)
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c

index 22e779ca50d5ad2b99179f9d8a477b2d142b322e..3243c83ef3e39f1cff0f4d42694eeaf374502280 100644 (file)
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1084,7 +1084,10 @@ struct bpf_async_cb {
         struct bpf_prog *prog;
         void __rcu *callback_fn;
         void *value;
-       struct rcu_head rcu;
+       union {
+               struct rcu_head rcu;
+               struct work_struct delete_work;
+       };
         u64 flags;
  };
  
@@ -1220,6 +1223,21 @@ static void bpf_wq_delete_work(struct work_struct *work)
         kfree_rcu(w, cb.rcu);
  }
  
+static void bpf_timer_delete_work(struct work_struct *work)
+{
+       struct bpf_hrtimer *t = container_of(work, struct bpf_hrtimer, cb.delete_work);
+
+       /* Cancel the timer and wait for callback to complete if it was running.
+        * If hrtimer_cancel() can be safely called it's safe to call
+        * kfree_rcu(t) right after for both preallocated and non-preallocated
+        * maps.  The async->cb = NULL was already done and no code path can see
+        * address 't' anymore. Timer if armed for existing bpf_hrtimer before
+        * bpf_timer_cancel_and_free will have been cancelled.
+        */
+       hrtimer_cancel(&t->timer);
+       kfree_rcu(t, cb.rcu);
+}
+
  static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags,
                             enum bpf_async_type type)
  {
@@ -1264,6 +1282,7 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
                 t = (struct bpf_hrtimer *)cb;
  
                 atomic_set(&t->cancelling, 0);
+               INIT_WORK(&t->cb.delete_work, bpf_timer_delete_work);
                 hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT);
                 t->timer.function = bpf_timer_cb;
                 cb->value = (void *)async - map->record->timer_off;
@@ -1544,25 +1563,39 @@ void bpf_timer_cancel_and_free(void *val)
  
         if (!t)
                 return;
-       /* Cancel the timer and wait for callback to complete if it was running.
-        * If hrtimer_cancel() can be safely called it's safe to call kfree(t)
-        * right after for both preallocated and non-preallocated maps.
-        * The async->cb = NULL was already done and no code path can
-        * see address 't' anymore.
-        *
-        * Check that bpf_map_delete/update_elem() wasn't called from timer
-        * callback_fn. In such case don't call hrtimer_cancel() (since it will
-        * deadlock) and don't call hrtimer_try_to_cancel() (since it will just
-        * return -1). Though callback_fn is still running on this cpu it's
+       /* We check that bpf_map_delete/update_elem() was called from timer
+        * callback_fn. In such case we don't call hrtimer_cancel() (since it
+        * will deadlock) and don't call hrtimer_try_to_cancel() (since it will
+        * just return -1). Though callback_fn is still running on this cpu it's
          * safe to do kfree(t) because bpf_timer_cb() read everything it needed
          * from 't'. The bpf subprog callback_fn won't be able to access 't',
          * since async->cb = NULL was already done. The timer will be
          * effectively cancelled because bpf_timer_cb() will return
          * HRTIMER_NORESTART.
+        *
+        * However, it is possible the timer callback_fn calling us armed the
+        * timer _before_ calling us, such that failing to cancel it here will
+        * cause it to possibly use struct hrtimer after freeing bpf_hrtimer.
+        * Therefore, we _need_ to cancel any outstanding timers before we do
+        * kfree_rcu, even though no more timers can be armed.
+        *
+        * Moreover, we need to schedule work even if timer does not belong to
+        * the calling callback_fn, as on two different CPUs, we can end up in a
+        * situation where both sides run in parallel, try to cancel one
+        * another, and we end up waiting on both sides in hrtimer_cancel
+        * without making forward progress, since timer1 depends on time2
+        * callback to finish, and vice versa.
+        *
+        *  CPU 1 (timer1_cb)                   CPU 2 (timer2_cb)
+        *  bpf_timer_cancel_and_free(timer2)   bpf_timer_cancel_and_free(timer1)
+        *
+        * To avoid these issues, punt to workqueue context when we are in a
+        * timer callback.
          */
-       if (this_cpu_read(hrtimer_running) != t)
-               hrtimer_cancel(&t->timer);
-       kfree_rcu(t, cb.rcu);
+       if (this_cpu_read(hrtimer_running))
+               queue_work(system_unbound_wq, &t->cb.delete_work);
+       else
+               bpf_timer_delete_work(&t->cb.delete_work);
  }
  
  /* This function is called by map_delete/update_elem for individual element and
author	Kumar Kartikeya Dwivedi <memxor@gmail.com>
	Tue, 9 Jul 2024 18:54:39 +0000 (18:54 +0000)
committer	Alexei Starovoitov <ast@kernel.org>
	Wed, 10 Jul 2024 22:59:44 +0000 (15:59 -0700)