comm           :  1, /* include comm data     */
                                freq           :  1, /* use freq, not period  */
                                inherit_stat   :  1, /* per task counts       */
+                               enable_on_exec :  1, /* next exec enables     */
 
-                               __reserved_1   : 52;
+                               __reserved_1   : 51;
 
        __u32                   wakeup_events;  /* wakeup every n events */
        __u32                   __reserved_2;
 
                perf_counter_task_sched_in(curr, cpu);
 }
 
+/*
+ * Enable all of a task's counters that have been marked enable-on-exec.
+ * This expects task == current.
+ *
+ * Runs with local interrupts disabled from entry to exit.  The task's
+ * context is first handed to __perf_counter_task_sched_out(); then,
+ * under ctx->lock, every counter with attr.enable_on_exec set whose
+ * state is still below PERF_COUNTER_STATE_INACTIVE is moved to
+ * PERF_COUNTER_STATE_INACTIVE; finally perf_counter_task_sched_in()
+ * is called for the task on the current CPU.
+ *
+ * attr.enable_on_exec is cleared on every counter it is found on,
+ * whether or not that counter's state had to change, so the flag is
+ * consumed the first time this function sees it.
+ *
+ * Does nothing if the task has no counter context or the context
+ * holds no counters.
+ */
+static void perf_counter_enable_on_exec(struct task_struct *task)
+{
+       struct perf_counter_context *ctx;
+       struct perf_counter *counter;
+       unsigned long flags;
+       int enabled = 0;        /* becomes 1 once any counter's state is changed */
+
+       local_irq_save(flags);
+       ctx = task->perf_counter_ctxp;
+       if (!ctx || !ctx->nr_counters)
+               goto out;       /* no context or empty context: nothing to enable */
+
+       __perf_counter_task_sched_out(ctx);     /* called before ctx->lock is taken */
+
+       spin_lock(&ctx->lock);
+
+       list_for_each_entry(counter, &ctx->counter_list, list_entry) {
+               if (!counter->attr.enable_on_exec)
+                       continue;
+               counter->attr.enable_on_exec = 0;       /* one-shot: cleared even if no state change follows */
+               if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
+                       continue;       /* state already at or above INACTIVE: leave it alone */
+               counter->state = PERF_COUNTER_STATE_INACTIVE;
+               counter->tstamp_enabled =
+                       ctx->time - counter->total_time_enabled;        /* NOTE(review): presumably backdates the stamp so enabled time resumes from the accumulated total - confirm */
+               enabled = 1;
+       }
+
+       /*
+        * Unclone this context if we enabled any counter.
+        * Uncloning here means dropping our reference on parent_ctx via
+        * put_ctx() and clearing the pointer, both done under ctx->lock.
+        */
+       if (enabled && ctx->parent_ctx) {
+               put_ctx(ctx->parent_ctx);
+               ctx->parent_ctx = NULL;
+       }
+
+       spin_unlock(&ctx->lock);
+
+       perf_counter_task_sched_in(task, smp_processor_id());   /* reached only when a non-empty context was found */
+ out:
+       local_irq_restore(flags);
+}
+
 /*
  * Cross CPU call to read the hardware counter
  */
 {
        struct perf_comm_event comm_event;
 
+       if (task->perf_counter_ctxp)
+               perf_counter_enable_on_exec(task);
+
        if (!atomic_read(&nr_comm_counters))
                return;
 
 
                                        fd[cpu][counter], strerror(errno));
                }
        } else {
-               attr->inherit   = inherit;
-               attr->disabled  = 1;
+               attr->inherit        = inherit;
+               attr->disabled       = 1;
+               attr->enable_on_exec = 1;
 
                fd[0][counter] = sys_perf_counter_open(attr, pid, -1, -1, 0);
                if (fd[0][counter] < 0 && verbose)
         * Enable counters and exec the command:
         */
        t0 = rdclock();
-       prctl(PR_TASK_PERF_COUNTERS_ENABLE);
 
        close(go_pipe[1]);
        wait(&status);