bpf: Add bloom filter map implementation

author Joanne Koong <joannekoong@fb.com>

Wed, 27 Oct 2021 23:45:00 +0000 (16:45 -0700)

committer Alexei Starovoitov <ast@kernel.org>

Thu, 28 Oct 2021 20:22:49 +0000 (13:22 -0700)
author Joanne Koong <joannekoong@fb.com>
Wed, 27 Oct 2021 23:45:00 +0000 (16:45 -0700)
committer Alexei Starovoitov <ast@kernel.org>
Thu, 28 Oct 2021 20:22:49 +0000 (13:22 -0700)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h

index 31421c74ba0811dcec9766359ff89e941535cefa..50105e0b8fcc0df950e28d9170b8d8b2109f164c 100644 (file)
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -169,6 +169,7 @@ struct bpf_map {
         u32 value_size;
         u32 max_entries;
         u32 map_flags;
+       u64 map_extra; /* any per-map-type extra fields */
         int spin_lock_off; /* >=0 valid offset, <0 error */
         int timer_off; /* >=0 valid offset, <0 error */
         u32 id;
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h

index 9c81724e4b9850657ac484c9f5322b2fba905429..c4424ac2fa029d7651ec668f4e4f6c24dfd4a092 100644 (file)
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -125,6 +125,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_STACK, stack_map_ops)
  BPF_MAP_TYPE(BPF_MAP_TYPE_STRUCT_OPS, bpf_struct_ops_map_ops)
  #endif
  BPF_MAP_TYPE(BPF_MAP_TYPE_RINGBUF, ringbuf_map_ops)
+BPF_MAP_TYPE(BPF_MAP_TYPE_BLOOM_FILTER, bloom_filter_map_ops)
  
  BPF_LINK_TYPE(BPF_LINK_TYPE_RAW_TRACEPOINT, raw_tracepoint)
  BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h

index c108200378834c4b0278419a63d45defbf8cc822..8bead4aa3ad04dcfd9f554669ddbaaad49e830ec 100644 (file)
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -906,6 +906,7 @@ enum bpf_map_type {
         BPF_MAP_TYPE_RINGBUF,
         BPF_MAP_TYPE_INODE_STORAGE,
         BPF_MAP_TYPE_TASK_STORAGE,
+       BPF_MAP_TYPE_BLOOM_FILTER,
  };
  
  /* Note that tracing related programs such as
@@ -1274,6 +1275,13 @@ union bpf_attr {
                                                    * struct stored as the
                                                    * map value
                                                    */
+               /* Any per-map-type extra fields
+                *
+                * BPF_MAP_TYPE_BLOOM_FILTER - the lowest 4 bits indicate the
+                * number of hash functions (if 0, the bloom filter will default
+                * to using 5 hash functions).
+                */
+               __u64   map_extra;
         };
  
         struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
@@ -5638,6 +5646,7 @@ struct bpf_map_info {
         __u32 btf_id;
         __u32 btf_key_type_id;
         __u32 btf_value_type_id;
+       __u64 map_extra;
  } __attribute__((aligned(8)));
  
  struct bpf_btf_info {
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile

index 7f33098ca63fb54ebc8eb9090b6fe1e33b513e8c..cf6ca339f3cd8d713d6379c73ba20bf08610c427 100644 (file)
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -7,7 +7,7 @@ endif
  CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy)
  
  obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o
-obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
+obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o
  obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
  obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o
  obj-${CONFIG_BPF_LSM}    += bpf_inode_storage.o
diff --git a/kernel/bpf/bloom_filter.c b/kernel/bpf/bloom_filter.c

new file mode 100644 (file)

index 0000000..7c50232
--- /dev/null
+++ b/kernel/bpf/bloom_filter.c
@@ -0,0 +1,195 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Facebook */
+
+#include <linux/bitmap.h>
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/err.h>
+#include <linux/jhash.h>
+#include <linux/random.h>
+
+#define BLOOM_CREATE_FLAG_MASK \
+       (BPF_F_NUMA_NODE | BPF_F_ZERO_SEED | BPF_F_ACCESS_MASK)
+
+struct bpf_bloom_filter {
+       struct bpf_map map;
+       u32 bitset_mask;
+       u32 hash_seed;
+       /* If the size of the values in the bloom filter is u32 aligned,
+        * then it is more performant to use jhash2 as the underlying hash
+        * function, else we use jhash. This tracks the number of u32s
+        * in an u32-aligned value size. If the value size is not u32 aligned,
+        * this will be 0.
+        */
+       u32 aligned_u32_count;
+       u32 nr_hash_funcs;
+       unsigned long bitset[];
+};
+
+static u32 hash(struct bpf_bloom_filter *bloom, void *value,
+               u32 value_size, u32 index)
+{
+       u32 h;
+
+       if (bloom->aligned_u32_count)
+               h = jhash2(value, bloom->aligned_u32_count,
+                          bloom->hash_seed + index);
+       else
+               h = jhash(value, value_size, bloom->hash_seed + index);
+
+       return h & bloom->bitset_mask;
+}
+
+static int peek_elem(struct bpf_map *map, void *value)
+{
+       struct bpf_bloom_filter *bloom =
+               container_of(map, struct bpf_bloom_filter, map);
+       u32 i, h;
+
+       for (i = 0; i < bloom->nr_hash_funcs; i++) {
+               h = hash(bloom, value, map->value_size, i);
+               if (!test_bit(h, bloom->bitset))
+                       return -ENOENT;
+       }
+
+       return 0;
+}
+
+static int push_elem(struct bpf_map *map, void *value, u64 flags)
+{
+       struct bpf_bloom_filter *bloom =
+               container_of(map, struct bpf_bloom_filter, map);
+       u32 i, h;
+
+       if (flags != BPF_ANY)
+               return -EINVAL;
+
+       for (i = 0; i < bloom->nr_hash_funcs; i++) {
+               h = hash(bloom, value, map->value_size, i);
+               set_bit(h, bloom->bitset);
+       }
+
+       return 0;
+}
+
+static int pop_elem(struct bpf_map *map, void *value)
+{
+       return -EOPNOTSUPP;
+}
+
+static struct bpf_map *map_alloc(union bpf_attr *attr)
+{
+       u32 bitset_bytes, bitset_mask, nr_hash_funcs, nr_bits;
+       int numa_node = bpf_map_attr_numa_node(attr);
+       struct bpf_bloom_filter *bloom;
+
+       if (!bpf_capable())
+               return ERR_PTR(-EPERM);
+
+       if (attr->key_size != 0 || attr->value_size == 0 ||
+           attr->max_entries == 0 ||
+           attr->map_flags & ~BLOOM_CREATE_FLAG_MASK ||
+           !bpf_map_flags_access_ok(attr->map_flags) ||
+           (attr->map_extra & ~0xF))
+               return ERR_PTR(-EINVAL);
+
+       /* The lower 4 bits of map_extra specify the number of hash functions */
+       nr_hash_funcs = attr->map_extra & 0xF;
+       if (nr_hash_funcs == 0)
+               /* Default to using 5 hash functions if unspecified */
+               nr_hash_funcs = 5;
+
+       /* For the bloom filter, the optimal bit array size that minimizes the
+        * false positive probability is n * k / ln(2) where n is the number of
+        * expected entries in the bloom filter and k is the number of hash
+        * functions. We use 7 / 5 to approximate 1 / ln(2).
+        *
+        * We round this up to the nearest power of two to enable more efficient
+        * hashing using bitmasks. The bitmask will be the bit array size - 1.
+        *
+        * If this overflows a u32, the bit array size will have 2^32 (4
+        * GB) bits.
+        */
+       if (check_mul_overflow(attr->max_entries, nr_hash_funcs, &nr_bits) ||
+           check_mul_overflow(nr_bits / 5, (u32)7, &nr_bits) ||
+           nr_bits > (1UL << 31)) {
+               /* The bit array size is 2^32 bits but to avoid overflowing the
+                * u32, we use U32_MAX, which will round up to the equivalent
+                * number of bytes
+                */
+               bitset_bytes = BITS_TO_BYTES(U32_MAX);
+               bitset_mask = U32_MAX;
+       } else {
+               if (nr_bits <= BITS_PER_LONG)
+                       nr_bits = BITS_PER_LONG;
+               else
+                       nr_bits = roundup_pow_of_two(nr_bits);
+               bitset_bytes = BITS_TO_BYTES(nr_bits);
+               bitset_mask = nr_bits - 1;
+       }
+
+       bitset_bytes = roundup(bitset_bytes, sizeof(unsigned long));
+       bloom = bpf_map_area_alloc(sizeof(*bloom) + bitset_bytes, numa_node);
+
+       if (!bloom)
+               return ERR_PTR(-ENOMEM);
+
+       bpf_map_init_from_attr(&bloom->map, attr);
+
+       bloom->nr_hash_funcs = nr_hash_funcs;
+       bloom->bitset_mask = bitset_mask;
+
+       /* Check whether the value size is u32-aligned */
+       if ((attr->value_size & (sizeof(u32) - 1)) == 0)
+               bloom->aligned_u32_count =
+                       attr->value_size / sizeof(u32);
+
+       if (!(attr->map_flags & BPF_F_ZERO_SEED))
+               bloom->hash_seed = get_random_int();
+
+       return &bloom->map;
+}
+
+static void map_free(struct bpf_map *map)
+{
+       struct bpf_bloom_filter *bloom =
+               container_of(map, struct bpf_bloom_filter, map);
+
+       bpf_map_area_free(bloom);
+}
+
+static void *lookup_elem(struct bpf_map *map, void *key)
+{
+       /* The eBPF program should use map_peek_elem instead */
+       return ERR_PTR(-EINVAL);
+}
+
+static int update_elem(struct bpf_map *map, void *key,
+                      void *value, u64 flags)
+{
+       /* The eBPF program should use map_push_elem instead */
+       return -EINVAL;
+}
+
+static int check_btf(const struct bpf_map *map, const struct btf *btf,
+                    const struct btf_type *key_type,
+                    const struct btf_type *value_type)
+{
+       /* Bloom filter maps are keyless */
+       return btf_type_is_void(key_type) ? 0 : -EINVAL;
+}
+
+static int bpf_bloom_btf_id;
+const struct bpf_map_ops bloom_filter_map_ops = {
+       .map_meta_equal = bpf_map_meta_equal,
+       .map_alloc = map_alloc,
+       .map_free = map_free,
+       .map_push_elem = push_elem,
+       .map_peek_elem = peek_elem,
+       .map_pop_elem = pop_elem,
+       .map_lookup_elem = lookup_elem,
+       .map_update_elem = update_elem,
+       .map_check_btf = check_btf,
+       .map_btf_name = "bpf_bloom_filter",
+       .map_btf_id = &bpf_bloom_btf_id,
+};
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c

index 3e1c024ce3ed14673e5021ae6e7226c02b0393fa..f7c2c6354add944917da1baabcb11a10dfb0bf9f 100644 (file)
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -199,7 +199,8 @@ static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key,
                 err = bpf_fd_reuseport_array_update_elem(map, key, value,
                                                          flags);
         } else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
-                  map->map_type == BPF_MAP_TYPE_STACK) {
+                  map->map_type == BPF_MAP_TYPE_STACK ||
+                  map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) {
                 err = map->ops->map_push_elem(map, value, flags);
         } else {
                 rcu_read_lock();
@@ -238,7 +239,8 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
         } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
                 err = bpf_fd_reuseport_array_lookup_elem(map, key, value);
         } else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
-                  map->map_type == BPF_MAP_TYPE_STACK) {
+                  map->map_type == BPF_MAP_TYPE_STACK ||
+                  map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) {
                 err = map->ops->map_peek_elem(map, value);
         } else if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
                 /* struct_ops map requires directly updating "value" */
@@ -348,6 +350,7 @@ void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr)
         map->max_entries = attr->max_entries;
         map->map_flags = bpf_map_flags_retain_permanent(attr->map_flags);
         map->numa_node = bpf_map_attr_numa_node(attr);
+       map->map_extra = attr->map_extra;
  }
  
  static int bpf_map_alloc_id(struct bpf_map *map)
@@ -553,6 +556,7 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
                    "value_size:\t%u\n"
                    "max_entries:\t%u\n"
                    "map_flags:\t%#x\n"
+                  "map_extra:\t%#llx\n"
                    "memlock:\t%lu\n"
                    "map_id:\t%u\n"
                    "frozen:\t%u\n",
@@ -561,6 +565,7 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
                    map->value_size,
                    map->max_entries,
                    map->map_flags,
+                  (unsigned long long)map->map_extra,
                    bpf_map_memory_footprint(map),
                    map->id,
                    READ_ONCE(map->frozen));
@@ -810,7 +815,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
         return ret;
  }
  
-#define BPF_MAP_CREATE_LAST_FIELD btf_vmlinux_value_type_id
+#define BPF_MAP_CREATE_LAST_FIELD map_extra
  /* called via syscall */
  static int map_create(union bpf_attr *attr)
  {
@@ -831,6 +836,10 @@ static int map_create(union bpf_attr *attr)
                 return -EINVAL;
         }
  
+       if (attr->map_type != BPF_MAP_TYPE_BLOOM_FILTER &&
+           attr->map_extra != 0)
+               return -EINVAL;
+
         f_flags = bpf_get_file_flag(attr->map_flags);
         if (f_flags < 0)
                 return f_flags;
@@ -1080,6 +1089,14 @@ static int map_lookup_elem(union bpf_attr *attr)
         if (!value)
                 goto free_key;
  
+       if (map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) {
+               if (copy_from_user(value, uvalue, value_size))
+                       err = -EFAULT;
+               else
+                       err = bpf_map_copy_value(map, key, value, attr->flags);
+               goto free_value;
+       }
+
         err = bpf_map_copy_value(map, key, value, attr->flags);
         if (err)
                 goto free_value;
@@ -3881,6 +3898,7 @@ static int bpf_map_get_info_by_fd(struct file *file,
         info.value_size = map->value_size;
         info.max_entries = map->max_entries;
         info.map_flags = map->map_flags;
+       info.map_extra = map->map_extra;
         memcpy(info.name, map->name, sizeof(map->name));
  
         if (map->btf) {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c

index c6616e3258038a33b0ec30c46994ecbf5dd14fcf..3c8aa7df1773461a4b29ea81aa5d527685e55e54 100644 (file)
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5002,7 +5002,10 @@ static int resolve_map_arg_type(struct bpf_verifier_env *env,
                         return -EINVAL;
                 }
                 break;
-
+       case BPF_MAP_TYPE_BLOOM_FILTER:
+               if (meta->func_id == BPF_FUNC_map_peek_elem)
+                       *arg_type = ARG_PTR_TO_MAP_VALUE;
+               break;
         default:
                 break;
         }
@@ -5577,6 +5580,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
                     func_id != BPF_FUNC_task_storage_delete)
                         goto error;
                 break;
+       case BPF_MAP_TYPE_BLOOM_FILTER:
+               if (func_id != BPF_FUNC_map_peek_elem &&
+                   func_id != BPF_FUNC_map_push_elem)
+                       goto error;
+               break;
         default:
                 break;
         }
@@ -5644,13 +5652,18 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
                     map->map_type != BPF_MAP_TYPE_SOCKHASH)
                         goto error;
                 break;
-       case BPF_FUNC_map_peek_elem:
         case BPF_FUNC_map_pop_elem:
-       case BPF_FUNC_map_push_elem:
                 if (map->map_type != BPF_MAP_TYPE_QUEUE &&
                     map->map_type != BPF_MAP_TYPE_STACK)
                         goto error;
                 break;
+       case BPF_FUNC_map_peek_elem:
+       case BPF_FUNC_map_push_elem:
+               if (map->map_type != BPF_MAP_TYPE_QUEUE &&
+                   map->map_type != BPF_MAP_TYPE_STACK &&
+                   map->map_type != BPF_MAP_TYPE_BLOOM_FILTER)
+                       goto error;
+               break;
         case BPF_FUNC_sk_storage_get:
         case BPF_FUNC_sk_storage_delete:
                 if (map->map_type != BPF_MAP_TYPE_SK_STORAGE)
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h

index c108200378834c4b0278419a63d45defbf8cc822..8bead4aa3ad04dcfd9f554669ddbaaad49e830ec 100644 (file)
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -906,6 +906,7 @@ enum bpf_map_type {
         BPF_MAP_TYPE_RINGBUF,
         BPF_MAP_TYPE_INODE_STORAGE,
         BPF_MAP_TYPE_TASK_STORAGE,
+       BPF_MAP_TYPE_BLOOM_FILTER,
  };
  
  /* Note that tracing related programs such as
@@ -1274,6 +1275,13 @@ union bpf_attr {
                                                    * struct stored as the
                                                    * map value
                                                    */
+               /* Any per-map-type extra fields
+                *
+                * BPF_MAP_TYPE_BLOOM_FILTER - the lowest 4 bits indicate the
+                * number of hash functions (if 0, the bloom filter will default
+                * to using 5 hash functions).
+                */
+               __u64   map_extra;
         };
  
         struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
@@ -5638,6 +5646,7 @@ struct bpf_map_info {
         __u32 btf_id;
         __u32 btf_key_type_id;
         __u32 btf_value_type_id;
+       __u64 map_extra;
  } __attribute__((aligned(8)));
  
  struct bpf_btf_info {
author	Joanne Koong <joannekoong@fb.com>
	Wed, 27 Oct 2021 23:45:00 +0000 (16:45 -0700)
committer	Alexei Starovoitov <ast@kernel.org>
	Thu, 28 Oct 2021 20:22:49 +0000 (13:22 -0700)
include/linux/bpf.h		patch \| blob \| history
include/linux/bpf_types.h		patch \| blob \| history
include/uapi/linux/bpf.h		patch \| blob \| history
kernel/bpf/Makefile		patch \| blob \| history
kernel/bpf/bloom_filter.c	[new file with mode: 0644]	patch \| blob
kernel/bpf/syscall.c		patch \| blob \| history
kernel/bpf/verifier.c		patch \| blob \| history
tools/include/uapi/linux/bpf.h		patch \| blob \| history