SEC("perf_event")
 int bpf_prog1(struct bpf_perf_event_data *ctx)
 {
+       char time_fmt1[] = "Time Enabled: %llu, Time Running: %llu";
+       char time_fmt2[] = "Get Time Failed, ErrCode: %d";
        char fmt[] = "CPU-%d period %lld ip %llx";
        u32 cpu = bpf_get_smp_processor_id();
+       struct bpf_perf_event_value value_buf;
        struct key_t key;
        u64 *val, one = 1;
+       int ret;
 
        if (ctx->sample_period < 10000)
                /* ignore warmup */
                return 0;
        }
 
+       ret = bpf_perf_prog_read_value(ctx, (void *)&value_buf, sizeof(struct bpf_perf_event_value));
+       if (!ret)
+         bpf_trace_printk(time_fmt1, sizeof(time_fmt1), value_buf.enabled, value_buf.running);
+       else
+         bpf_trace_printk(time_fmt2, sizeof(time_fmt2), ret);
+
        val = bpf_map_lookup_elem(&counts, &key);
        if (val)
                (*val)++;
 
        int *pmu_fd = malloc(nr_cpus * sizeof(int));
        int i, error = 0;
 
+       /* system wide perf event, no need to inherit */
+       attr->inherit = 0;
+
        /* open perf_event on all cpus */
        for (i = 0; i < nr_cpus; i++) {
                pmu_fd[i] = sys_perf_event_open(attr, -1, i, -1, 0);
 {
        int pmu_fd;
 
+       /* per task perf event, enable inherit so the "dd ..." command can be traced properly.
+        * Enabling inherit will cause the bpf_perf_prog_read_value helper to fail.
+        */
+       attr->inherit = 1;
+
        /* open task bound event */
        pmu_fd = sys_perf_event_open(attr, 0, -1, -1, 0);
        if (pmu_fd < 0) {
                .freq = 1,
                .type = PERF_TYPE_HARDWARE,
                .config = PERF_COUNT_HW_CPU_CYCLES,
-               .inherit = 1,
        };
        struct perf_event_attr attr_type_sw = {
                .sample_freq = SAMPLE_FREQ,
                .freq = 1,
                .type = PERF_TYPE_SOFTWARE,
                .config = PERF_COUNT_SW_CPU_CLOCK,
-               .inherit = 1,
        };
        struct perf_event_attr attr_hw_cache_l1d = {
                .sample_freq = SAMPLE_FREQ,
                        PERF_COUNT_HW_CACHE_L1D |
                        (PERF_COUNT_HW_CACHE_OP_READ << 8) |
                        (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16),
-               .inherit = 1,
        };
        struct perf_event_attr attr_hw_cache_branch_miss = {
                .sample_freq = SAMPLE_FREQ,
                        PERF_COUNT_HW_CACHE_BPU |
                        (PERF_COUNT_HW_CACHE_OP_READ << 8) |
                        (PERF_COUNT_HW_CACHE_RESULT_MISS << 16),
-               .inherit = 1,
        };
        struct perf_event_attr attr_type_raw = {
                .sample_freq = SAMPLE_FREQ,
                .type = PERF_TYPE_RAW,
                /* Intel Instruction Retired */
                .config = 0xc0,
-               .inherit = 1,
        };
 
        printf("Test HW_CPU_CYCLES\n");
 
 static int (*bpf_perf_event_read_value)(void *map, unsigned long long flags,
                                        void *buf, unsigned int buf_size) =
        (void *) BPF_FUNC_perf_event_read_value;
+static int (*bpf_perf_prog_read_value)(void *ctx, void *buf,
+                                      unsigned int buf_size) =
+       (void *) BPF_FUNC_perf_prog_read_value;
 
 
 /* llvm builtin functions that eBPF C program may use to