return ret;
 }
 
+struct xt_percpu_counter_alloc_state { /* cursor into the percpu block that rule counters are carved from */
+       unsigned int off; /* byte offset of the next free counter slot inside mem */
+       const char __percpu *mem; /* current block; NULL means a new block is allocated on next use */
+};
 
-bool xt_percpu_counter_alloc(struct xt_counters *counters);
+bool xt_percpu_counter_alloc(struct xt_percpu_counter_alloc_state *state,
+                            struct xt_counters *counter);
 void xt_percpu_counter_free(struct xt_counters *cnt);
 
 static inline struct xt_counters *
 
 }
 
 static inline int
-find_check_entry(struct arpt_entry *e, const char *name, unsigned int size)
+find_check_entry(struct arpt_entry *e, const char *name, unsigned int size,
+                struct xt_percpu_counter_alloc_state *alloc_state)
 {
        struct xt_entry_target *t;
        struct xt_target *target;
        int ret;
 
-       if (!xt_percpu_counter_alloc(&e->counters))
+       if (!xt_percpu_counter_alloc(alloc_state, &e->counters))
                return -ENOMEM;
 
        t = arpt_get_target(e);
 static int translate_table(struct xt_table_info *newinfo, void *entry0,
                           const struct arpt_replace *repl)
 {
+       struct xt_percpu_counter_alloc_state alloc_state = { 0 };
        struct arpt_entry *iter;
        unsigned int *offsets;
        unsigned int i;
        /* Finally, each sanity check must pass */
        i = 0;
        xt_entry_foreach(iter, entry0, newinfo->size) {
-               ret = find_check_entry(iter, repl->name, repl->size);
+               ret = find_check_entry(iter, repl->name, repl->size,
+                                      &alloc_state);
                if (ret != 0)
                        break;
                ++i;
 
 
 static int
 find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
-                unsigned int size)
+                unsigned int size,
+                struct xt_percpu_counter_alloc_state *alloc_state)
 {
        struct xt_entry_target *t;
        struct xt_target *target;
        struct xt_mtchk_param mtpar;
        struct xt_entry_match *ematch;
 
-       if (!xt_percpu_counter_alloc(&e->counters))
+       if (!xt_percpu_counter_alloc(alloc_state, &e->counters))
                return -ENOMEM;
 
        j = 0;
 translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
                const struct ipt_replace *repl)
 {
+       struct xt_percpu_counter_alloc_state alloc_state = { 0 };
        struct ipt_entry *iter;
        unsigned int *offsets;
        unsigned int i;
        /* Finally, each sanity check must pass */
        i = 0;
        xt_entry_foreach(iter, entry0, newinfo->size) {
-               ret = find_check_entry(iter, net, repl->name, repl->size);
+               ret = find_check_entry(iter, net, repl->name, repl->size,
+                                      &alloc_state);
                if (ret != 0)
                        break;
                ++i;
 
 
 static int
 find_check_entry(struct ip6t_entry *e, struct net *net, const char *name,
-                unsigned int size)
+                unsigned int size,
+                struct xt_percpu_counter_alloc_state *alloc_state)
 {
        struct xt_entry_target *t;
        struct xt_target *target;
        struct xt_mtchk_param mtpar;
        struct xt_entry_match *ematch;
 
-       if (!xt_percpu_counter_alloc(&e->counters))
+       if (!xt_percpu_counter_alloc(alloc_state, &e->counters))
                return -ENOMEM;
 
        j = 0;
 translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
                const struct ip6t_replace *repl)
 {
+       struct xt_percpu_counter_alloc_state alloc_state = { 0 };
        struct ip6t_entry *iter;
        unsigned int *offsets;
        unsigned int i;
        /* Finally, each sanity check must pass */
        i = 0;
        xt_entry_foreach(iter, entry0, newinfo->size) {
-               ret = find_check_entry(iter, net, repl->name, repl->size);
+               ret = find_check_entry(iter, net, repl->name, repl->size,
+                                      &alloc_state);
                if (ret != 0)
                        break;
                ++i;
 
 MODULE_DESCRIPTION("{ip,ip6,arp,eb}_tables backend module");
 
 #define SMP_ALIGN(x) (((x) + SMP_CACHE_BYTES-1) & ~(SMP_CACHE_BYTES-1))
+#define XT_PCPU_BLOCK_SIZE 4096 /* size and alignment of one percpu counter block; the free path tests this alignment */
 
 struct compat_delta {
        unsigned int offset; /* offset in kernel */
 /**
  * xt_percpu_counter_alloc - allocate x_tables rule counter
  *
+ * @state: pointer to xt_percpu allocation state
  * @counter: pointer to counter struct inside the ip(6)/arpt_entry struct
  *
  * On SMP, the packet counter [ ip(6)t_entry->counters.pcnt ] will then
  * Rule evaluation needs to use xt_get_this_cpu_counter() helper
  * to fetch the real percpu counter.
  *
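+ * To speed up allocation and improve data locality, counters are not
+ * allocated one by one: a 4kb percpu block (XT_PCPU_BLOCK_SIZE) is
+ * allocated and consecutive counters are handed out from it.
+ *
+ * xt_percpu_counter_alloc_state contains the base address of the
+ * current block and the offset of the next free counter within it.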
+ *
  * returns false on error.
  */
-bool xt_percpu_counter_alloc(struct xt_counters *counter)
+bool xt_percpu_counter_alloc(struct xt_percpu_counter_alloc_state *state,
+                            struct xt_counters *counter)
 {
-       void __percpu *res;
+       BUILD_BUG_ON(XT_PCPU_BLOCK_SIZE < (sizeof(*counter) * 2)); /* a block must hold at least two counters */
 
        if (nr_cpu_ids <= 1)
                return true;
 
-       res = __alloc_percpu(sizeof(struct xt_counters),
-                            sizeof(struct xt_counters));
-       if (!res)
-               return false;
-
-       counter->pcnt = (__force unsigned long)res;
+       if (!state->mem) { /* no block in progress: start a new one */
+               state->mem = __alloc_percpu(XT_PCPU_BLOCK_SIZE,
+                                           XT_PCPU_BLOCK_SIZE); /* same value passed for both arguments, as before this change */
+               if (!state->mem)
+                       return false; /* allocation failed; state is left unchanged */
+       }
+       counter->pcnt = (__force unsigned long)(state->mem + state->off); /* hand out the next free slot of the block */
+       state->off += sizeof(*counter); /* advance to the following slot */
+       if (state->off > (XT_PCPU_BLOCK_SIZE - sizeof(*counter))) { /* no room left for another counter */
+               state->mem = NULL; /* next call allocates a fresh block */
+               state->off = 0;
+       }
        return true;
 }
 EXPORT_SYMBOL_GPL(xt_percpu_counter_alloc);
 {
        unsigned long pcnt = counters->pcnt;
 
-       if (nr_cpu_ids > 1)
+       if (nr_cpu_ids > 1 && (pcnt & (XT_PCPU_BLOCK_SIZE - 1)) == 0)
                free_percpu((void __percpu *)pcnt);
 }
 EXPORT_SYMBOL_GPL(xt_percpu_counter_free);