/* execve succeeded */
        current->fs->in_exec = 0;
        current->in_execve = 0;
+       membarrier_execve(current);
        acct_update_integrals(current);
        task_numa_free(current);
        free_bprm(bprm);
 
        unsigned long flags; /* Must use atomic bitops to access the bits */
 
        struct core_state *core_state; /* coredumping support */
+#ifdef CONFIG_MEMBARRIER
+       atomic_t membarrier_state;
+#endif
 #ifdef CONFIG_AIO
        spinlock_t                      ioctx_lock;
        struct kioctx_table __rcu       *ioctx_table;
 
        current->flags = (current->flags & ~PF_MEMALLOC) | flags;
 }
 
+#ifdef CONFIG_MEMBARRIER
+enum {
+       MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY        = (1U << 0),
+       MEMBARRIER_STATE_SWITCH_MM                      = (1U << 1),
+};
+
+static inline void membarrier_execve(struct task_struct *t)
+{
+       atomic_set(&t->mm->membarrier_state, 0);
+}
+#else
+static inline void membarrier_execve(struct task_struct *t)
+{
+}
+#endif
+
 #endif /* _LINUX_SCHED_MM_H */
 
  *                          (non-running threads are de facto in such a
  *                          state). This only covers threads from the
  *                          same processes as the caller thread. This
- *                          command returns 0. The "expedited" commands
- *                          complete faster than the non-expedited ones,
- *                          they never block, but have the downside of
- *                          causing extra overhead.
+ *                          command returns 0 on success. The
+ *                          "expedited" commands complete faster than
+ *                          the non-expedited ones, they never block,
+ *                          but have the downside of causing extra
+ *                          overhead. A process needs to register its
+ *                          intent to use the private expedited command
+ *                          prior to using it, otherwise this command
+ *                          returns -EPERM.
+ * @MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
+ *                          Register the process intent to use
+ *                          MEMBARRIER_CMD_PRIVATE_EXPEDITED. Always
+ *                          returns 0.
  *
  * Command to be passed to the membarrier system call. The commands need to
  * be a single bit each, except for MEMBARRIER_CMD_QUERY which is assigned to
  * the value 0.
  */
 enum membarrier_cmd {
-       MEMBARRIER_CMD_QUERY                    = 0,
-       MEMBARRIER_CMD_SHARED                   = (1 << 0),
+       MEMBARRIER_CMD_QUERY                            = 0,
+       MEMBARRIER_CMD_SHARED                           = (1 << 0),
        /* reserved for MEMBARRIER_CMD_SHARED_EXPEDITED (1 << 1) */
        /* reserved for MEMBARRIER_CMD_PRIVATE (1 << 2) */
-       MEMBARRIER_CMD_PRIVATE_EXPEDITED        = (1 << 3),
+       MEMBARRIER_CMD_PRIVATE_EXPEDITED                = (1 << 3),
+       MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED       = (1 << 4),
 };
 
 #endif /* _UAPI_LINUX_MEMBARRIER_H */
 
 #include <linux/membarrier.h>
 #include <linux/tick.h>
 #include <linux/cpumask.h>
+#include <linux/atomic.h>
 
 #include "sched.h"     /* for cpu_rq(). */
 
  * except MEMBARRIER_CMD_QUERY.
  */
 #define MEMBARRIER_CMD_BITMASK \
-       (MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED)
+       (MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED       \
+       | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED)
 
 static void ipi_mb(void *info)
 {
        smp_mb();       /* IPIs should be serializing but paranoid. */
 }
 
-static void membarrier_private_expedited(void)
+static int membarrier_private_expedited(void)
 {
        int cpu;
        bool fallback = false;
        cpumask_var_t tmpmask;
 
+       if (!(atomic_read(¤t->mm->membarrier_state)
+                       & MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
+               return -EPERM;
+
        if (num_online_cpus() == 1)
-               return;
+               return 0;
 
        /*
         * Matches memory barriers around rq->curr modification in
         * rq->curr modification in scheduler.
         */
        smp_mb();       /* exit from system call is not a mb */
+       return 0;
+}
+
+static void membarrier_register_private_expedited(void)
+{
+       struct task_struct *p = current;
+       struct mm_struct *mm = p->mm;
+
+       /*
+        * We need to consider threads belonging to different thread
+        * groups, which use the same mm. (CLONE_VM but not
+        * CLONE_THREAD).
+        */
+       if (atomic_read(&mm->membarrier_state)
+                       & MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY)
+               return;
+       atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
+                       &mm->membarrier_state);
 }
 
 /**
                        synchronize_sched();
                return 0;
        case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
-               membarrier_private_expedited();
+               return membarrier_private_expedited();
+       case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
+               membarrier_register_private_expedited();
                return 0;
        default:
                return -EINVAL;