unsigned long                   vaddr;          /* Page(s) of instruction slots */
 };
 
+static void uprobe_warn(struct task_struct *t, const char *msg)
+{
+       pr_warn("uprobe: %s:%d failed to %s\n", current->comm, current->pid, msg);
+}
+
 /*
  * valid_vma: Verify if the specified vma is an executable vma
  * Relax restrictions while unregistering: vm_flags might have
                        *(uprobe_opcode_t *)&auprobe->insn);
 }
 
+/* uprobe should have guaranteed positive refcount */
 static struct uprobe *get_uprobe(struct uprobe *uprobe)
 {
        refcount_inc(&uprobe->ref);
        return uprobe;
 }
 
+/*
+ * uprobe should have guaranteed lifetime, which can be ensured by any of:
+ *   - caller already has refcount taken (and wants an extra one);
+ *   - uprobe is RCU protected and won't be freed until after grace period;
+ *   - we are holding uprobes_treelock (for read or write, doesn't matter).
+ */
+static struct uprobe *try_get_uprobe(struct uprobe *uprobe)
+{
+       if (refcount_inc_not_zero(&uprobe->ref))
+               return uprobe;
+       return NULL;
+}
+
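+/* uprobe is part of uprobes_tree, i.e., discoverable by lookups */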
+static inline bool uprobe_is_active(struct uprobe *uprobe)
+{
+       return !RB_EMPTY_NODE(&uprobe->rb_node);
+}
+
 static void put_uprobe(struct uprobe *uprobe)
 {
-       if (refcount_dec_and_test(&uprobe->ref)) {
-               /*
-                * If application munmap(exec_vma) before uprobe_unregister()
-                * gets called, we don't get a chance to remove uprobe from
-                * delayed_uprobe_list from remove_breakpoint(). Do it here.
-                */
-               mutex_lock(&delayed_uprobe_lock);
-               delayed_uprobe_remove(uprobe, NULL);
-               mutex_unlock(&delayed_uprobe_lock);
-               kfree(uprobe);
-       }
+       if (!refcount_dec_and_test(&uprobe->ref))
+               return;
+
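+       /*
+        * Refcount reached zero: if the uprobe is still in uprobes_tree,
+        * erase it so that no new lookups can find it. It might already
+        * have been erased by a racing __insert_uprobe() (see the retry
+        * loop there), hence the uprobe_is_active() check.
+        */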
+       write_lock(&uprobes_treelock);
+
+       if (uprobe_is_active(uprobe))
+               rb_erase(&uprobe->rb_node, &uprobes_tree);
+
+       write_unlock(&uprobes_treelock);
+
+       /*
+        * If application munmap(exec_vma) before uprobe_unregister()
+        * gets called, we don't get a chance to remove uprobe from
+        * delayed_uprobe_list from remove_breakpoint(). Do it here.
+        */
+       mutex_lock(&delayed_uprobe_lock);
+       delayed_uprobe_remove(uprobe, NULL);
+       mutex_unlock(&delayed_uprobe_lock);
+
+       kfree(uprobe);
 }
 
 static __always_inline
        struct rb_node *node = rb_find(&key, &uprobes_tree, __uprobe_cmp_key);
 
        if (node)
-               return get_uprobe(__node_2_uprobe(node));
+               return try_get_uprobe(__node_2_uprobe(node));
 
        return NULL;
 }
        return uprobe;
 }
 
+/*
+ * Attempt to insert a new uprobe into uprobes_tree.
+ *
+ * If a uprobe already exists for the given inode+offset, we just increment
+ * the refcount of the previously existing uprobe.
+ *
+ * If not, the provided new uprobe instance is inserted into the tree (with
+ * an assumed initial refcount of 1).
+ *
+ * In any case, we return the uprobe instance that ends up being in
+ * uprobes_tree. The caller has to clean up the new uprobe instance if it
+ * ended up not being inserted into the tree.
+ *
+ * We assume that uprobes_treelock is held for writing.
+ */
 static struct uprobe *__insert_uprobe(struct uprobe *uprobe)
 {
        struct rb_node *node;
-
+again:
        node = rb_find_add(&uprobe->rb_node, &uprobes_tree, __uprobe_cmp);
-       if (node)
-               return get_uprobe(__node_2_uprobe(node));
+       if (node) {
+               struct uprobe *u = __node_2_uprobe(node);
 
-       /* get access + creation ref */
-       refcount_set(&uprobe->ref, 2);
-       return NULL;
+               if (!try_get_uprobe(u)) {
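+                       /*
+                        * The matching uprobe's refcount already dropped to
+                        * zero: it is being freed concurrently, but a racing
+                        * put_uprobe() hasn't erased it from the tree yet.
+                        * Erase it ourselves and retry the insertion.
+                        */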
+                       rb_erase(node, &uprobes_tree);
+                       RB_CLEAR_NODE(&u->rb_node);
+                       goto again;
+               }
+
+               return u;
+       }
+
+       return uprobe;
 }
 
 /*
- * Acquire uprobes_treelock.
- * Matching uprobe already exists in rbtree;
- *     increment (access refcount) and return the matching uprobe.
- *
- * No matching uprobe; insert the uprobe in rb_tree;
- *     get a double refcount (access + creation) and return NULL.
+ * Acquire uprobes_treelock and insert uprobe into uprobes_tree
+ * (or reuse existing one, see __insert_uprobe() comments above).
  */
 static struct uprobe *insert_uprobe(struct uprobe *uprobe)
 {
        uprobe->ref_ctr_offset = ref_ctr_offset;
        init_rwsem(&uprobe->register_rwsem);
        init_rwsem(&uprobe->consumer_rwsem);
+       RB_CLEAR_NODE(&uprobe->rb_node);        /* not (yet) in uprobes_tree */
+       refcount_set(&uprobe->ref, 1);          /* initial ref, see __insert_uprobe() */
 
        /* add to uprobes_tree, sorted on inode:offset */
        cur_uprobe = insert_uprobe(uprobe);
        /* a uprobe exists for this inode:offset combination */
-       if (cur_uprobe) {
+       if (cur_uprobe != uprobe) {
                if (cur_uprobe->ref_ctr_offset != uprobe->ref_ctr_offset) {
                        ref_ctr_mismatch_warn(cur_uprobe, uprobe);
                        put_uprobe(cur_uprobe);
        return set_orig_insn(&uprobe->arch, mm, vaddr);
 }
 
-static inline bool uprobe_is_active(struct uprobe *uprobe)
-{
-       return !RB_EMPTY_NODE(&uprobe->rb_node);
-}
-/*
- * There could be threads that have already hit the breakpoint. They
- * will recheck the current insn and restart if find_uprobe() fails.
- * See find_active_uprobe().
- */
-static void delete_uprobe(struct uprobe *uprobe)
-{
-       if (WARN_ON(!uprobe_is_active(uprobe)))
-               return;
-
-       write_lock(&uprobes_treelock);
-       rb_erase(&uprobe->rb_node, &uprobes_tree);
-       write_unlock(&uprobes_treelock);
-       RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */
-}
-
 struct map_info {
        struct map_info *next;
        struct mm_struct *mm;
        int err;
 
        down_write(&uprobe->register_rwsem);
-       if (WARN_ON(!consumer_del(uprobe, uc)))
+       if (WARN_ON(!consumer_del(uprobe, uc))) {
                err = -ENOENT;
-       else
+       } else {
                err = register_for_each_vma(uprobe, NULL);
-
-       /* TODO : cant unregister? schedule a worker thread */
-       if (!err) {
-               if (!uprobe->consumers)
-                       delete_uprobe(uprobe);
-               else
-                       err = -EBUSY;
+               /* TODO: can't unregister? schedule a worker thread */
+               if (unlikely(err))
+                       uprobe_warn(current, "unregister, leaking uprobe");
        }
        up_write(&uprobe->register_rwsem);
 
        if (!IS_ALIGNED(ref_ctr_offset, sizeof(short)))
                return ERR_PTR(-EINVAL);
 
- retry:
        uprobe = alloc_uprobe(inode, offset, ref_ctr_offset);
        if (IS_ERR(uprobe))
                return uprobe;
 
-       /*
-        * We can race with uprobe_unregister()->delete_uprobe().
-        * Check uprobe_is_active() and retry if it is false.
-        */
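+       /*
+        * alloc_uprobe() returned an owned reference: on success it is
+        * handed to the caller together with the returned uprobe; on
+        * failure it is dropped through uprobe_unregister() below.
+        */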
        down_write(&uprobe->register_rwsem);
-       ret = -EAGAIN;
-       if (likely(uprobe_is_active(uprobe))) {
-               consumer_add(uprobe, uc);
-               ret = register_for_each_vma(uprobe, uc);
-       }
+       consumer_add(uprobe, uc);
+       ret = register_for_each_vma(uprobe, uc);
        up_write(&uprobe->register_rwsem);
-       put_uprobe(uprobe);
 
        if (ret) {
-               if (unlikely(ret == -EAGAIN))
-                       goto retry;
                uprobe_unregister(uprobe, uc);
                return ERR_PTR(ret);
        }
                        u = rb_entry(t, struct uprobe, rb_node);
                        if (u->inode != inode || u->offset < min)
                                break;
-                       list_add(&u->pending_list, head);
-                       get_uprobe(u);
+                       /* if uprobe went away, it's safe to ignore it */
+                       if (try_get_uprobe(u))
+                               list_add(&u->pending_list, head);
                }
                for (t = n; (t = rb_next(t)); ) {
                        u = rb_entry(t, struct uprobe, rb_node);
                        if (u->inode != inode || u->offset > max)
                                break;
-                       list_add(&u->pending_list, head);
-                       get_uprobe(u);
+                       /* if uprobe went away, it's safe to ignore it */
+                       if (try_get_uprobe(u))
+                               list_add(&u->pending_list, head);
                }
        }
        read_unlock(&uprobes_treelock);
                        return -ENOMEM;
 
                *n = *o;
+               /*
+                * uprobe's refcnt has to be positive at this point, kept by
+                * utask->return_instances items; return_instances can't be
+                * removed right now, as the task is blocked due to duping; so
+                * get_uprobe() is safe to use here.
+                */
                get_uprobe(n->uprobe);
                n->next = NULL;
 
        return 0;
 }
 
-static void uprobe_warn(struct task_struct *t, const char *msg)
-{
-       pr_warn("uprobe: %s:%d failed to %s\n",
-                       current->comm, current->pid, msg);
-}
-
 static void dup_xol_work(struct callback_head *work)
 {
        if (current->flags & PF_EXITING)
                }
                orig_ret_vaddr = utask->return_instances->orig_ret_vaddr;
        }
-
+       /*
+        * uprobe's refcnt is positive, held by the caller, so it's safe to
+        * unconditionally bump it one more time here.
+        */
        ri->uprobe = get_uprobe(uprobe);
        ri->func = instruction_pointer(regs);
        ri->stack = user_stack_pointer(regs);