return -EFAULT;
 
        switch (flags & BPF_F_HDR_FIELD_MASK) {
+       case 0:
+               if (unlikely(from != 0))
+                       return -EINVAL;
+
+               inet_proto_csum_replace_by_diff(ptr, skb, to, is_pseudo);
+               break;
        case 2:
                inet_proto_csum_replace2(ptr, skb, from, to, is_pseudo);
                break;
        .arg5_type      = ARG_ANYTHING,
 };
 
+/* Per-CPU scratch area for bpf_csum_diff(): stages the complemented
+ * "from" words followed by the "to" words before handing them to
+ * csum_partial(). 128 * sizeof(__be32) = 512 bytes; bpf_csum_diff()
+ * rejects any request whose combined size exceeds this buffer.
+ */
+struct bpf_csum_scratchpad {
+       __be32 diff[128];
+};
+
+static DEFINE_PER_CPU(struct bpf_csum_scratchpad, bpf_csum_sp);
+
+/* bpf_csum_diff() - compute a checksum over/between two buffers.
+ * @r1:        pointer to the "from" buffer (old data); unused when
+ *             from_size == 0
+ * @from_size: length of the "from" buffer in bytes, multiple of 4
+ * @r3:        pointer to the "to" buffer (new data); unused when
+ *             to_size == 0
+ * @to_size:   length of the "to" buffer in bytes, multiple of 4
+ * @seed:      checksum value the result is folded into
+ *
+ * Returns the csum_partial() sum over the staged words, or -EINVAL if
+ * either size is not 4-byte aligned or the combined size exceeds the
+ * 512-byte per-CPU scratch buffer.
+ */
+static u64 bpf_csum_diff(u64 r1, u64 from_size, u64 r3, u64 to_size, u64 seed)
+{
+       struct bpf_csum_scratchpad *sp = this_cpu_ptr(&bpf_csum_sp);
+       u64 diff_size = from_size + to_size;
+       __be32 *from = (__be32 *) (long) r1;
+       __be32 *to   = (__be32 *) (long) r3;
+       int i, j = 0;
+
+       /* This is quite flexible, some examples:
+        *
+        * from_size == 0, to_size > 0,  seed := csum --> pushing data
+        * from_size > 0,  to_size == 0, seed := csum --> pulling data
+        * from_size > 0,  to_size > 0,  seed := 0    --> diffing data
+        *
+        * Even for diffing, from_size and to_size don't need to be equal.
+        */
+       if (unlikely(((from_size | to_size) & (sizeof(__be32) - 1)) ||
+                    diff_size > sizeof(sp->diff)))
+               return -EINVAL;
+
+       /* One's complement of the old words cancels them out of the
+        * running sum, so summing ~from and to yields the delta.
+        */
+       for (i = 0; i < from_size / sizeof(__be32); i++, j++)
+               sp->diff[j] = ~from[i];
+       for (i = 0; i <   to_size / sizeof(__be32); i++, j++)
+               sp->diff[j] = to[i];
+
+       return csum_partial(sp->diff, diff_size, seed);
+}
+
+/* Verifier contract for bpf_csum_diff(): both buffer arguments must
+ * point into the program's stack, and either size may be zero
+ * (ARG_CONST_STACK_SIZE_OR_ZERO); the seed is an arbitrary scalar.
+ * Usable by non-GPL programs.
+ */
+const struct bpf_func_proto bpf_csum_diff_proto = {
+       .func           = bpf_csum_diff,
+       .gpl_only       = false,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_PTR_TO_STACK,
+       .arg2_type      = ARG_CONST_STACK_SIZE_OR_ZERO,
+       .arg3_type      = ARG_PTR_TO_STACK,
+       .arg4_type      = ARG_CONST_STACK_SIZE_OR_ZERO,
+       .arg5_type      = ARG_ANYTHING,
+};
+
 static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5)
 {
        struct sk_buff *skb = (struct sk_buff *) (long) r1, *skb2;
                return &bpf_skb_store_bytes_proto;
        case BPF_FUNC_skb_load_bytes:
                return &bpf_skb_load_bytes_proto;
+       case BPF_FUNC_csum_diff:
+               return &bpf_csum_diff_proto;
        case BPF_FUNC_l3_csum_replace:
                return &bpf_l3_csum_replace_proto;
        case BPF_FUNC_l4_csum_replace: