#include <linux/hw_breakpoint.h>
 
+#include <linux/atomic.h>
 #include <linux/bug.h>
 #include <linux/cpu.h>
 #include <linux/export.h>
 #include <linux/kernel.h>
 #include <linux/mutex.h>
 #include <linux/notifier.h>
+#include <linux/percpu-rwsem.h>
 #include <linux/percpu.h>
 #include <linux/rhashtable.h>
 #include <linux/sched.h>
        unsigned int    cpu_pinned;
        /* tsk_pinned[n] is the number of tasks having n+1 breakpoints */
 #ifdef hw_breakpoint_slots
-       unsigned int    tsk_pinned[hw_breakpoint_slots(0)];
+       atomic_t        tsk_pinned[hw_breakpoint_slots(0)];
 #else
-       unsigned int    *tsk_pinned;
+       atomic_t        *tsk_pinned;
 #endif
 };
 
 
 static bool constraints_initialized __ro_after_init;
 
-/* Serialize accesses to the above constraints */
-static DEFINE_MUTEX(nr_bp_mutex);
+/*
+ * Synchronizes accesses to the per-CPU constraints; the locking rules are:
+ *
+ *  1. Atomic updates to bp_cpuinfo::tsk_pinned only require a held read-lock
+ *     (because the counters are atomic_t, no updates are lost).
+ *
+ *  2. Holding a write-lock is required for computations that require a
+ *     stable snapshot of all bp_cpuinfo::tsk_pinned.
+ *
+ *  3. In all other cases, non-atomic accesses require the appropriately held
+ *     lock (read-lock for read-only accesses; write-lock for reads/writes).
+ */
+DEFINE_STATIC_PERCPU_RWSEM(bp_cpuinfo_sem);
+
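+/*
+ * Illustrative sketch only (not part of this interface; cpu/type/idx/max are
+ * placeholder values): per rule 1, a task-bound update may adjust its
+ * tsk_pinned bucket while holding the read-lock, e.g.:
+ *
+ *	percpu_down_read(&bp_cpuinfo_sem);
+ *	atomic_inc(&get_bp_info(cpu, type)->tsk_pinned[idx]);
+ *	percpu_up_read(&bp_cpuinfo_sem);
+ *
+ * whereas, per rule 2, a stable snapshot (e.g. via max_task_bp_pinned())
+ * requires the write-lock:
+ *
+ *	percpu_down_write(&bp_cpuinfo_sem);
+ *	max = max_task_bp_pinned(cpu, type);
+ *	percpu_up_write(&bp_cpuinfo_sem);
+ */
+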
+/*
+ * Return the mutex serializing accesses to a task's breakpoint list in
+ * task_bps_ht. Since rhltable synchronizes concurrent insertions/deletions,
+ * independent tasks may insert/delete concurrently; therefore, a per-task
+ * mutex is sufficient.
+ *
+ * Uses task_struct::perf_event_mutex to avoid extending task_struct with a
+ * hw_breakpoint-only mutex that may be used only infrequently. The caveat is
+ * that hw_breakpoint may then contend with per-task perf event list
+ * management. The assumption is that perf use cases involving hw_breakpoints
+ * are very unlikely to result in unnecessary contention.
+ */
+static inline struct mutex *get_task_bps_mutex(struct perf_event *bp)
+{
+       struct task_struct *tsk = bp->hw.target;
+
+       return tsk ? &tsk->perf_event_mutex : NULL;
+}
+
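+/*
+ * Acquire the locks needed to modify @bp's constraints: task-bound breakpoints
+ * take the per-task mutex plus bp_cpuinfo_sem as a reader (their tsk_pinned
+ * updates are atomic), while CPU-bound breakpoints take bp_cpuinfo_sem as a
+ * writer (cpu_pinned is updated non-atomically, and computing constraints
+ * needs a stable snapshot of tsk_pinned; see the rules above). Returns the
+ * per-task mutex (or NULL) to pass to bp_constraints_unlock().
+ */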
+static struct mutex *bp_constraints_lock(struct perf_event *bp)
+{
+       struct mutex *tsk_mtx = get_task_bps_mutex(bp);
+
+       if (tsk_mtx) {
+               mutex_lock(tsk_mtx);
+               percpu_down_read(&bp_cpuinfo_sem);
+       } else {
+               percpu_down_write(&bp_cpuinfo_sem);
+       }
+
+       return tsk_mtx;
+}
+
+static void bp_constraints_unlock(struct mutex *tsk_mtx)
+{
+       if (tsk_mtx) {
+               percpu_up_read(&bp_cpuinfo_sem);
+               mutex_unlock(tsk_mtx);
+       } else {
+               percpu_up_write(&bp_cpuinfo_sem);
+       }
+}
+
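+/*
+ * Check whether the constraints locks for @bp appear to be held; used by the
+ * debugger paths (dbg_reserve_bp_slot()/dbg_release_bp_slot() below), which
+ * bail out rather than block when a lock is already held.
+ */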
+static bool bp_constraints_is_locked(struct perf_event *bp)
+{
+       struct mutex *tsk_mtx = get_task_bps_mutex(bp);
+
+       return percpu_is_write_locked(&bp_cpuinfo_sem) ||
+              (tsk_mtx ? mutex_is_locked(tsk_mtx) :
+                         percpu_is_read_locked(&bp_cpuinfo_sem));
+}
+
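+/* Assert that the locks taken by bp_constraints_lock(@bp) are held. */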
+static inline void assert_bp_constraints_lock_held(struct perf_event *bp)
+{
+       struct mutex *tsk_mtx = get_task_bps_mutex(bp);
+
+       if (tsk_mtx)
+               lockdep_assert_held(tsk_mtx);
+       lockdep_assert_held(&bp_cpuinfo_sem);
+}
 
 #ifdef hw_breakpoint_slots
 /*
                for (i = 0; i < TYPE_MAX; i++) {
                        struct bp_cpuinfo *info = get_bp_info(cpu, i);
 
-                       info->tsk_pinned = kcalloc(__nr_bp_slots[i], sizeof(int), GFP_KERNEL);
+                       info->tsk_pinned = kcalloc(__nr_bp_slots[i], sizeof(atomic_t), GFP_KERNEL);
                        if (!info->tsk_pinned)
                                goto err;
                }
  */
 static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
 {
-       unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned;
+       atomic_t *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned;
        int i;
 
+       /*
+        * The caller must have acquired bp_cpuinfo_sem as a writer, which
+        * ensures there are no concurrent writers to tsk_pinned in
+        * toggle_bp_task_slot() and we therefore observe a stable snapshot.
+        */
+       lockdep_assert_held_write(&bp_cpuinfo_sem);
+
        for (i = hw_breakpoint_slots_cached(type) - 1; i >= 0; i--) {
-               if (tsk_pinned[i] > 0)
+               ASSERT_EXCLUSIVE_WRITER(tsk_pinned[i]); /* Catch unexpected writers. */
+               if (atomic_read(&tsk_pinned[i]) > 0)
                        return i + 1;
        }
 
        struct perf_event *iter;
        int count = 0;
 
+       /*
+        * We need a stable snapshot of the per-task breakpoint list.
+        */
+       assert_bp_constraints_lock_held(bp);
+
        rcu_read_lock();
        head = rhltable_lookup(&task_bps_ht, &bp->hw.target, task_bps_ht_params);
        if (!head)
 static void toggle_bp_task_slot(struct perf_event *bp, int cpu,
                                enum bp_type_idx type, int weight)
 {
-       unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned;
+       atomic_t *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned;
        int old_idx, new_idx;
 
+       /*
+        * If bp->hw.target, this path only modifies tsk_pinned but never
+        * reads it. We can permit concurrent updates as long as there are no
+        * other uses: having acquired bp_cpuinfo_sem as a reader allows
+        * concurrent updates here. Uses of tsk_pinned will require acquiring
+        * bp_cpuinfo_sem as a writer to stabilize tsk_pinned's value.
+        */
+       lockdep_assert_held_read(&bp_cpuinfo_sem);
+
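+       /*
+        * Worked example (illustrative values): if this task previously had
+        * one breakpoint of this type on @cpu and @weight == 1, then
+        * old_idx == 0 and new_idx == 1: the "1 breakpoint" bucket
+        * tsk_pinned[0] is decremented and the "2 breakpoints" bucket
+        * tsk_pinned[1] is incremented (tsk_pinned[n] counts tasks with
+        * n+1 breakpoints).
+        */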
        old_idx = task_bp_pinned(cpu, bp, type) - 1;
        new_idx = old_idx + weight;
 
        if (old_idx >= 0)
-               tsk_pinned[old_idx]--;
+               atomic_dec(&tsk_pinned[old_idx]);
        if (new_idx >= 0)
-               tsk_pinned[new_idx]++;
+               atomic_inc(&tsk_pinned[new_idx]);
 }
 
 /*
 
        /* Pinned counter cpu profiling */
        if (!bp->hw.target) {
+               lockdep_assert_held_write(&bp_cpuinfo_sem);
                get_bp_info(bp->cpu, type)->cpu_pinned += weight;
                return 0;
        }
        for_each_cpu(cpu, cpumask)
                toggle_bp_task_slot(bp, cpu, type, weight);
 
+       /*
+        * Readers want a stable snapshot of the per-task breakpoint list.
+        */
+       assert_bp_constraints_lock_held(bp);
+
        if (enable)
                return rhltable_insert(&task_bps_ht, &bp->hw.bp_list, task_bps_ht_params);
        else
 
 int reserve_bp_slot(struct perf_event *bp)
 {
-       int ret;
-
-       mutex_lock(&nr_bp_mutex);
-
-       ret = __reserve_bp_slot(bp, bp->attr.bp_type);
-
-       mutex_unlock(&nr_bp_mutex);
+       struct mutex *mtx = bp_constraints_lock(bp);
+       int ret = __reserve_bp_slot(bp, bp->attr.bp_type);
 
+       bp_constraints_unlock(mtx);
        return ret;
 }
 
 
 void release_bp_slot(struct perf_event *bp)
 {
-       mutex_lock(&nr_bp_mutex);
+       struct mutex *mtx = bp_constraints_lock(bp);
 
        arch_unregister_hw_breakpoint(bp);
        __release_bp_slot(bp, bp->attr.bp_type);
-
-       mutex_unlock(&nr_bp_mutex);
+       bp_constraints_unlock(mtx);
 }
 
 static int __modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type)
 
 static int modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type)
 {
-       int ret;
+       struct mutex *mtx = bp_constraints_lock(bp);
+       int ret = __modify_bp_slot(bp, old_type, new_type);
 
-       mutex_lock(&nr_bp_mutex);
-       ret = __modify_bp_slot(bp, old_type, new_type);
-       mutex_unlock(&nr_bp_mutex);
+       bp_constraints_unlock(mtx);
        return ret;
 }
 
  */
 int dbg_reserve_bp_slot(struct perf_event *bp)
 {
-       if (mutex_is_locked(&nr_bp_mutex))
+       int ret;
+
+       if (bp_constraints_is_locked(bp))
                return -1;
 
-       return __reserve_bp_slot(bp, bp->attr.bp_type);
+       /* Locks aren't held; disable lockdep assert checking. */
+       lockdep_off();
+       ret = __reserve_bp_slot(bp, bp->attr.bp_type);
+       lockdep_on();
+
+       return ret;
 }
 
 int dbg_release_bp_slot(struct perf_event *bp)
 {
-       if (mutex_is_locked(&nr_bp_mutex))
+       if (bp_constraints_is_locked(bp))
                return -1;
 
+       /* Locks aren't held; disable lockdep assert checking. */
+       lockdep_off();
        __release_bp_slot(bp, bp->attr.bp_type);
+       lockdep_on();
 
        return 0;
 }
                                return true;
 
                        for (int slot = 0; slot < hw_breakpoint_slots_cached(type); ++slot) {
-                               if (info->tsk_pinned[slot])
+                               if (atomic_read(&info->tsk_pinned[slot]))
                                        return true;
                        }
                }