                update_stats(&runtime_itlb_cache_stats[0], count[0]);
 }
 
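+/*
+ * Clear the per-package mask so that the next read pass again
+ * treats the first CPU it sees on each package as unseen.
+ */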
+static void zero_per_pkg(struct perf_evsel *counter)
+{
+       if (counter->per_pkg_mask)
+               memset(counter->per_pkg_mask, 0, MAX_NR_CPUS);
+}
+
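+/*
+ * A per-pkg event reports the same value for every CPU on the
+ * package, so its value should enter the aggregation only once
+ * per package: *skip is set for every CPU after the first one
+ * read on a given package.
+ */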
+static int check_per_pkg(struct perf_evsel *counter, int cpu, bool *skip)
+{
+       unsigned long *mask = counter->per_pkg_mask;
+       struct cpu_map *cpus = perf_evsel__cpus(counter);
+       int s;
+
+       *skip = false;
+
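+       /* Only per-pkg events with a real CPU map need filtering. */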
+       if (!counter->per_pkg)
+               return 0;
+
+       if (cpu_map__empty(cpus))
+               return 0;
+
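+       /* Allocate the package mask lazily, on the first per-pkg read. */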
+       if (!mask) {
+               mask = zalloc(MAX_NR_CPUS);
+               if (!mask)
+                       return -ENOMEM;
+
+               counter->per_pkg_mask = mask;
+       }
+
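+       /* The socket id from the CPU topology identifies the package. */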
+       s = cpu_map__get_socket(cpus, cpu);
+       if (s < 0)
+               return -1;
+
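+       /* First reader of this package sets the bit; later ones skip. */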
+       *skip = test_and_set_bit(s, mask) == 1;
+       return 0;
+}
+
 static int read_cb(struct perf_evsel *evsel, int cpu, int thread __maybe_unused,
                   struct perf_counts_values *count)
 {
        struct perf_counts_values *aggr = &evsel->counts->aggr;
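+       /* Zero-filled counts substituted for reads that must be skipped. */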
+       static struct perf_counts_values zero;
+       bool skip = false;
+
+       if (check_per_pkg(evsel, cpu, &skip)) {
+               pr_err("failed to read per-pkg counter\n");
+               return -1;
+       }
+
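+       /* A skipped CPU contributes zeroes, leaving the aggregate intact. */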
+       if (skip)
+               count = &zero;
 
        switch (aggr_mode) {
        case AGGR_CORE:
        if (counter->system_wide)
                nthreads = 1;
 
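+       /* Reset the package mask before each fresh read of this counter. */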
+       if (counter->per_pkg)
+               zero_per_pkg(counter);
+
        for (thread = 0; thread < nthreads; thread++) {
                for (cpu = 0; cpu < ncpus; cpu++) {
                        if (perf_evsel__read_cb(counter, cpu, thread, read_cb))