}
 }
 
+/* A load/store pair that forms a memory copy should look like the following:
+ *
+ *   ld_width R, [addr_src + offset_src]
+ *   st_width [addr_dest + offset_dest], R
+ *
+ * The destination register of the load and the source register of the
+ * store must be the same, and the load and store must also operate at
+ * the same width. If either addr_src or addr_dest is the stack pointer,
+ * we don't do the CPP optimization, as the stack is modelled by
+ * registers on NFP.
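+ *
+ * For example (register numbers illustrative), the pair below does not
+ * match: the widths differ and the store source is not the load
+ * destination.
+ *
+ *   ld_32 r0, [r1 + 0]
+ *   st_16 [r2 + 0], r3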
+ */
+static bool
+curr_pair_is_memcpy(struct nfp_insn_meta *ld_meta,
+                   struct nfp_insn_meta *st_meta)
+{
+       struct bpf_insn *ld = &ld_meta->insn;
+       struct bpf_insn *st = &st_meta->insn;
+
+       if (!is_mbpf_load(ld_meta) || !is_mbpf_store(st_meta))
+               return false;
+
+       if (ld_meta->ptr.type != PTR_TO_PACKET)
+               return false;
+
+       if (st_meta->ptr.type != PTR_TO_PACKET)
+               return false;
+
+       if (BPF_SIZE(ld->code) != BPF_SIZE(st->code))
+               return false;
+
+       if (ld->dst_reg != st->src_reg)
+               return false;
+
+       /* There is a jump to the store insn of this pair. */
+       if (st_meta->flags & FLAG_INSN_IS_JUMP_DST)
+               return false;
+
+       return true;
+}
+
+/* Currently, we only support chaining load/store pairs if:
+ *
+ *  - Their address base registers are the same.
+ *  - Their address offsets advance in the same direction by the access width.
+ *  - They operate at the same memory width.
+ *  - There is no jump into the middle of them.
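+ *
+ * For example (register numbers illustrative), the two pairs below
+ * chain: same base registers, same 4-byte width, and offsets ascending
+ * by the access width:
+ *
+ *   ld_32 r0, [r1 + 0]
+ *   st_32 [r2 + 0], r0
+ *   ld_32 r0, [r1 + 4]
+ *   st_32 [r2 + 4], r0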
+ */
+static bool
+curr_pair_chain_with_previous(struct nfp_insn_meta *ld_meta,
+                             struct nfp_insn_meta *st_meta,
+                             struct bpf_insn *prev_ld,
+                             struct bpf_insn *prev_st)
+{
+       u8 prev_size, curr_size, prev_ld_base, prev_st_base, prev_ld_dst;
+       struct bpf_insn *ld = &ld_meta->insn;
+       struct bpf_insn *st = &st_meta->insn;
+       s16 prev_ld_off, prev_st_off;
+
+       /* This is the first pair of a potential chain. */
+       if (!prev_ld)
+               return true;
+
+       prev_size = BPF_LDST_BYTES(prev_ld);
+       curr_size = BPF_LDST_BYTES(ld);
+       prev_ld_base = prev_ld->src_reg;
+       prev_st_base = prev_st->dst_reg;
+       prev_ld_dst = prev_ld->dst_reg;
+       prev_ld_off = prev_ld->off;
+       prev_st_off = prev_st->off;
+
+       if (ld->dst_reg != prev_ld_dst)
+               return false;
+
+       if (ld->src_reg != prev_ld_base || st->dst_reg != prev_st_base)
+               return false;
+
+       if (curr_size != prev_size)
+               return false;
+
+       /* There is a jump to the head of this pair. */
+       if (ld_meta->flags & FLAG_INSN_IS_JUMP_DST)
+               return false;
+
+       /* Both in ascending order. */
+       if (prev_ld_off + prev_size == ld->off &&
+           prev_st_off + prev_size == st->off)
+               return true;
+
+       /* Both in descending order. */
+       if (ld->off + curr_size == prev_ld_off &&
+           st->off + curr_size == prev_st_off)
+               return true;
+
+       return false;
+}
+
+/* Return TRUE if a cross memory access happens, i.e. the store area
+ * overlaps with the load area such that a later load might read back a
+ * value written by a previous store; in that case we can't treat the
+ * sequence as a memory copy.
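+ *
+ * For example (offsets already canonicalized against the same base and
+ * pointer ID), the ascending sequence below crosses: the second load
+ * reads the bytes the first store has just written.
+ *
+ *   ld_32 r0, [pkt + 0]
+ *   st_32 [pkt + 4], r0
+ *   ld_32 r0, [pkt + 4]
+ *   st_32 [pkt + 8], r0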
+ */
+static bool
+cross_mem_access(struct bpf_insn *ld, struct nfp_insn_meta *head_ld_meta,
+                struct nfp_insn_meta *head_st_meta)
+{
+       s16 head_ld_off, head_st_off, ld_off;
+
+       /* Different pointer types do not overlap. */
+       if (head_ld_meta->ptr.type != head_st_meta->ptr.type)
+               return false;
+
+       /* Both load and store are PTR_TO_PACKET, check ID info. */
+       if (head_ld_meta->ptr.id != head_st_meta->ptr.id)
+               return true;
+
+       /* Canonicalize the offsets: express all of them relative to the
+        * original base register.
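+        *
+        * For example (values illustrative), if the verifier tracked a
+        * fixed offset of 14 on the pointer (ptr.off == 14), an insn
+        * offset of 4 becomes 18 against the original pointer.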
+        */
+       head_ld_off = head_ld_meta->insn.off + head_ld_meta->ptr.off;
+       head_st_off = head_st_meta->insn.off + head_st_meta->ptr.off;
+       ld_off = ld->off + head_ld_meta->ptr.off;
+
+       /* Ascending order cross. */
+       if (ld_off > head_ld_off &&
+           head_ld_off < head_st_off && ld_off >= head_st_off)
+               return true;
+
+       /* Descending order cross. */
+       if (ld_off < head_ld_off &&
+           head_ld_off > head_st_off && ld_off <= head_st_off)
+               return true;
+
+       return false;
+}
+
+/* This pass tries to identify the following instruction sequences.
+ *
+ *   load R, [regA + offA]
+ *   store [regB + offB], R
+ *   load R, [regA + offA + const_imm_A]
+ *   store [regB + offB + const_imm_A], R
+ *   load R, [regA + offA + 2 * const_imm_A]
+ *   store [regB + offB + 2 * const_imm_A], R
+ *   ...
+ *
+ * The above sequence is typically generated by the compiler when
+ * lowering memcpy. NFP prefers using CPP instructions to accelerate it.
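+ *
+ * When a chain of two or more pairs is found, the head load records the
+ * accumulated copy length in ldst_gather_len (negative for a descending
+ * copy) and links the head store via paired_st; the head store and all
+ * following pairs are marked skip so that code generation can emit a
+ * single CPP memory copy instead.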
+ */
+static void nfp_bpf_opt_ldst_gather(struct nfp_prog *nfp_prog)
+{
+       struct nfp_insn_meta *head_ld_meta = NULL;
+       struct nfp_insn_meta *head_st_meta = NULL;
+       struct nfp_insn_meta *meta1, *meta2;
+       struct bpf_insn *prev_ld = NULL;
+       struct bpf_insn *prev_st = NULL;
+       u8 count = 0;
+
+       nfp_for_each_insn_walk2(nfp_prog, meta1, meta2) {
+               struct bpf_insn *ld = &meta1->insn;
+               struct bpf_insn *st = &meta2->insn;
+
+               /* Reset record status if any of the following is true:
+                *   - The current insn pair is not load/store.
+                *   - The load/store pair doesn't chain with the previous
+                *     one.
+                *   - The chained load/store pair crosses with a previous
+                *     pair.
+                *   - The chained load/store pair has a total memory copy
+                *     size beyond 128 bytes, which is the maximum length
+                *     a single NFP CPP command can transfer.
+                */
+               if (!curr_pair_is_memcpy(meta1, meta2) ||
+                   !curr_pair_chain_with_previous(meta1, meta2, prev_ld,
+                                                  prev_st) ||
+                   (head_ld_meta && (cross_mem_access(ld, head_ld_meta,
+                                                      head_st_meta) ||
+                                     head_ld_meta->ldst_gather_len >= 128))) {
+                       if (!count)
+                               continue;
+
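+                       /* A chain needs at least two pairs to be worth a
+                        * CPP copy; a lone pair keeps a zero gather length
+                        * and is compiled as a normal load/store.
+                        */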
+                       if (count > 1) {
+                               s16 prev_ld_off = prev_ld->off;
+                               s16 prev_st_off = prev_st->off;
+                               s16 head_ld_off = head_ld_meta->insn.off;
+
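+                               /* Descending chain: the last pair has the
+                                * lowest offsets, so rebase the head pair
+                                * on it and negate the gather length to
+                                * record the copy direction.
+                                */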
+                               if (prev_ld_off < head_ld_off) {
+                                       head_ld_meta->insn.off = prev_ld_off;
+                                       head_st_meta->insn.off = prev_st_off;
+                                       head_ld_meta->ldst_gather_len =
+                                               -head_ld_meta->ldst_gather_len;
+                               }
+
+                               head_ld_meta->paired_st = &head_st_meta->insn;
+                               head_st_meta->skip = true;
+                       } else {
+                               head_ld_meta->ldst_gather_len = 0;
+                       }
+
+                       /* If the chain is ended by a load/store pair, then
+                        * it could serve as the new head of the next chain.
+                        */
+                       if (curr_pair_is_memcpy(meta1, meta2)) {
+                               head_ld_meta = meta1;
+                               head_st_meta = meta2;
+                               head_ld_meta->ldst_gather_len =
+                                       BPF_LDST_BYTES(ld);
+                               meta1 = nfp_meta_next(meta1);
+                               meta2 = nfp_meta_next(meta2);
+                               prev_ld = ld;
+                               prev_st = st;
+                               count = 1;
+                       } else {
+                               head_ld_meta = NULL;
+                               head_st_meta = NULL;
+                               prev_ld = NULL;
+                               prev_st = NULL;
+                               count = 0;
+                       }
+
+                       continue;
+               }
+
+               if (!head_ld_meta) {
+                       head_ld_meta = meta1;
+                       head_st_meta = meta2;
+               } else {
+                       meta1->skip = true;
+                       meta2->skip = true;
+               }
+
+               head_ld_meta->ldst_gather_len += BPF_LDST_BYTES(ld);
+               meta1 = nfp_meta_next(meta1);
+               meta2 = nfp_meta_next(meta2);
+               prev_ld = ld;
+               prev_st = st;
+               count++;
+       }
+}
+
 static int nfp_bpf_optimize(struct nfp_prog *nfp_prog)
 {
        nfp_bpf_opt_reg_init(nfp_prog);
 
        nfp_bpf_opt_ld_mask(nfp_prog);
        nfp_bpf_opt_ld_shift(nfp_prog);
+       nfp_bpf_opt_ldst_gather(nfp_prog);
 
        return 0;
 }