return -EPERM;
        }
 
-       if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1)
+       if (flags != MEMBARRIER_FLAG_SYNC_CORE &&
+           (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1))
                return 0;
 
        /*
 
                if (cpu_id >= nr_cpu_ids || !cpu_online(cpu_id))
                        goto out;
-               if (cpu_id == raw_smp_processor_id())
-                       goto out;
                rcu_read_lock();
                p = rcu_dereference(cpu_rq(cpu_id)->curr);
                if (!p || p->mm != mm) {
                for_each_online_cpu(cpu) {
                        struct task_struct *p;
 
-                       /*
-                        * Skipping the current CPU is OK even through we can be
-                        * migrated at any point. The current CPU, at the point
-                        * where we read raw_smp_processor_id(), is ensured to
-                        * be in program order with respect to the caller
-                        * thread. Therefore, we can skip this CPU from the
-                        * iteration.
-                        */
-                       if (cpu == raw_smp_processor_id())
-                               continue;
                        p = rcu_dereference(cpu_rq(cpu)->curr);
                        if (p && p->mm == mm)
                                __cpumask_set_cpu(cpu, tmpmask);
                rcu_read_unlock();
        }
 
-       preempt_disable();
-       if (cpu_id >= 0)
+       if (cpu_id >= 0) {
+               /*
+                * smp_call_function_single() will call ipi_func() if cpu_id
+                * is the calling CPU.
+                */
                smp_call_function_single(cpu_id, ipi_func, NULL, 1);
-       else
-               smp_call_function_many(tmpmask, ipi_func, NULL, 1);
-       preempt_enable();
+       } else {
+               /*
+                * For regular membarrier, we can save a few cycles by
+                * skipping the current cpu -- we're about to do smp_mb()
+                * below, and if we migrate to a different cpu, this cpu
+                * and the new cpu will execute a full barrier in the
+                * scheduler.
+                *
+                * For SYNC_CORE, we do need a barrier on the current cpu --
+                * otherwise, if we are migrated and replaced by a different
+                * task in the same mm just before, during, or after
+                * membarrier, we will end up with some thread in the mm
+                * running without a core sync.
+                *
+                * For RSEQ, don't rseq_preempt() the caller.  User code
+                * is not supposed to issue syscalls at all from inside an
+                * rseq critical section.
+                */
+               if (flags != MEMBARRIER_FLAG_SYNC_CORE) {
+                       preempt_disable();
+                       smp_call_function_many(tmpmask, ipi_func, NULL, true);
+                       preempt_enable();
+               } else {
+                       on_each_cpu_mask(tmpmask, ipi_func, NULL, true);
+               }
+       }
 
 out:
        if (cpu_id < 0)