www.infradead.org Git - users/hch/nvme-cli.git/commitdiff
bpf program support
author Christoph Hellwig <hch@lst.de>
Wed, 12 Aug 2020 15:41:39 +0000 (17:41 +0200)
committer Christoph Hellwig <hch@lst.de>
Thu, 12 Nov 2020 10:13:16 +0000 (11:13 +0100)
Makefile
linux/nvme.h
plugins/bpf/bpf.c [new file with mode: 0644]
plugins/bpf/bpf.h [new file with mode: 0644]
plugins/bpf/nvme-bpf-defs.h [new file with mode: 0644]

index 65afdbe02a49eda3382c8b4cf1978ff7c35a3f28..e4561b6aa343fe46eddcc36e355ddae4fffef19d 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -37,6 +37,9 @@ ifeq ($(HAVE_SYSTEMD),0)
        override CFLAGS += -DHAVE_SYSTEMD
 endif
 
+# should be conditional and maybe limited to the bpf plugin
+override LDFLAGS += -lelf
+
 RPMBUILD = rpmbuild
 TAR = tar
 RM = rm -f
@@ -81,7 +84,8 @@ PLUGIN_OBJS :=                                        \
        plugins/dera/dera-nvme.o                \
        plugins/scaleflux/sfx-nvme.o            \
        plugins/transcend/transcend-nvme.o      \
-       plugins/zns/zns.o
+       plugins/zns/zns.o                       \
+       plugins/bpf/bpf.o
 
 nvme: nvme.c nvme.h $(OBJS) $(PLUGIN_OBJS) $(UTIL_OBJS) NVME-VERSION-FILE
        $(QUIET_CC)$(CC) $(CPPFLAGS) $(CFLAGS) $(INC) $< -o $(NVME) $(OBJS) $(PLUGIN_OBJS) $(UTIL_OBJS) $(LDFLAGS)
index de2502929ebb67b06f92d6a46e69d446723536a4..fd00fe59af5ced11cc359d4b4d5fdd16763fb2e1 100644 (file)
@@ -1017,6 +1017,11 @@ enum nvme_admin_opcode {
        nvme_admin_security_recv        = 0x82,
        nvme_admin_sanitize_nvm         = 0x84,
        nvme_admin_get_lba_status       = 0x86,
+
+       /* non-standard experimental opcodes for eBPF programs */
+       nvme_admin_create_map           = 0x60,
+       nvme_admin_create_program       = 0x61,
+       nvme_admin_lookup_elem          = 0x62,
 };
 
 enum {
diff --git a/plugins/bpf/bpf.c b/plugins/bpf/bpf.c
new file mode 100644 (file)
index 0000000..92f6994
--- /dev/null
@@ -0,0 +1,574 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Load eBPF code from an object file into a NVMe controller.
+ *
+ * Partially based on samples/bpf/bpf_load.c from the Linux kernel tree.
+ */
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <string.h>
+#include <stdlib.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <gelf.h>
+#include <libelf.h>
+#include <bcc/libbpf.h>
+#include <linux/nvme_ioctl.h>
+
+#include "nvme.h"
+#include "nvme-ioctl.h"
+#include "nvme-print.h"
+#include "nvme-bpf-defs.h"
+
+#define CREATE_CMD
+#include "bpf.h"
+
+static const char dash[100] = {[0 ... 99] = '-'};
+
+#define MAX_MAPS 32
+
+struct bpf_load_map_def { /* map definition as copied out of the ELF "maps" section */
+       unsigned int type; /* sent in cdw10 by create_map() */
+       unsigned int key_size; /* sent in cdw11 by create_map() */
+       unsigned int value_size; /* sent in cdw12 by create_map() */
+       unsigned int max_entries; /* sent in cdw13 by create_map() */
+       unsigned int map_flags; /* copied from the ELF file, not sent by create_map() */
+       unsigned int inner_map_idx; /* copied from the ELF file, not sent by create_map() */
+       unsigned int numa_node; /* copied from the ELF file, not sent by create_map() */
+};
+
+struct bpf_map_data { /* one map found in the loaded ELF object */
+       int id; /* value returned by create_map(); reset to -1 when a "maps" section is found */
+       char *name; /* strdup() of the map's symbol name */
+       size_t elf_offset; /* symbol st_value: offset of the definition in the maps section data */
+       struct bpf_load_map_def def; /* definition copied from the maps section */
+};
+static int map_id[MAX_MAPS];
+static struct bpf_map_data map_data[MAX_MAPS];
+static int map_data_count;
+
+static bool processed_sec[128];
+
+/*
+ * Fetch section @i of @elf: its header into @shdr, its name into @shname
+ * and its data descriptor into @data.
+ *
+ * Returns 0 on success, otherwise a small positive code for the step that
+ * failed: 1 no such section, 2 header unreadable, 3 name lookup failed or
+ * section size is zero, 4 no data or a second data descriptor exists.
+ */
+static int get_sec(Elf * elf, int i, GElf_Ehdr * ehdr, char **shname,
+                  GElf_Shdr * shdr, Elf_Data ** data)
+{
+       Elf_Scn *section = elf_getscn(elf, i);
+
+       if (section == NULL)
+               return 1;
+       if (gelf_getshdr(section, shdr) != shdr)
+               return 2;
+
+       *shname = elf_strptr(elf, ehdr->e_shstrndx, shdr->sh_name);
+       if (*shname == NULL || shdr->sh_size == 0)
+               return 3;
+
+       *data = elf_getdata(section, NULL);
+       if (*data == NULL)
+               return 4;
+       /* callers expect exactly one data descriptor per section */
+       if (elf_getdata(section, *data) != NULL)
+               return 4;
+
+       return 0;
+}
+
+/*
+ * Apply the map relocations of one SHT_REL section to a program.
+ *
+ * @data:    contents of the relocation section
+ * @symbols: contents of the symbol table
+ * @shdr:    header of the relocation section
+ * @insn:    instructions of the program section being patched
+ * @maps:    maps recorded so far, including their offset in the maps section
+ * @nr_maps: number of valid entries in @maps
+ *
+ * Each relocation must point at a BPF_LD | BPF_IMM | BPF_DW instruction.
+ * Its src_reg is set to BPF_PSEUDO_MAP_FD and its imm to the id of the map
+ * whose elf_offset equals the relocation symbol's value.
+ *
+ * Returns 0 on success and 1 on the first relocation that cannot be applied.
+ */
+static int parse_relo_and_apply(Elf_Data * data, Elf_Data * symbols,
+                               GElf_Shdr * shdr, struct bpf_insn *insn,
+                               struct bpf_map_data *maps, int nr_maps)
+{
+       int i, nrels;
+
+       /* a zero entry size would make the division below trap */
+       if (!shdr->sh_entsize) {
+               printf("invalid relo section: zero entry size\n");
+               return 1;
+       }
+       nrels = shdr->sh_size / shdr->sh_entsize;
+
+       for (i = 0; i < nrels; i++) {
+               GElf_Sym sym;
+               GElf_Rel rel;
+               unsigned int insn_idx;
+               int map_idx;
+
+               /* on failure rel/sym are left unset, so do not use them */
+               if (!gelf_getrel(data, i, &rel)) {
+                       printf("failed to read relo entry %d\n", i);
+                       return 1;
+               }
+               if (!gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &sym)) {
+                       printf("failed to read symbol for relo entry %d\n", i);
+                       return 1;
+               }
+
+               /*
+                * NOTE: insn_idx is not range-checked here because the
+                * program length is not passed to this function.
+                */
+               insn_idx = rel.r_offset / sizeof(struct bpf_insn);
+               if (insn[insn_idx].code != (BPF_LD | BPF_IMM | BPF_DW)) {
+                       printf("invalid relo for insn[%u].code 0x%x\n",
+                              insn_idx, insn[insn_idx].code);
+                       return 1;
+               }
+               insn[insn_idx].src_reg = BPF_PSEUDO_MAP_FD;
+
+               /* Match FD relocation against recorded map_data[] offset */
+               for (map_idx = 0; map_idx < nr_maps; map_idx++) {
+                       if (maps[map_idx].elf_offset == sym.st_value)
+                               break;
+               }
+               if (map_idx == nr_maps) {
+                       printf("invalid relo for insn[%u] no map_data match\n",
+                              insn_idx);
+                       return 1;
+               }
+               insn[insn_idx].imm = maps[map_idx].id;
+       }
+
+       return 0;
+}
+
+/* qsort() comparator: orders GElf_Sym entries by ascending st_value. */
+static int cmp_symbols(const void *l, const void *r)
+{
+       const GElf_Sym *lhs = l;
+       const GElf_Sym *rhs = r;
+
+       /* yields -1, 0 or 1 without any risk of subtraction overflow */
+       return (lhs->st_value > rhs->st_value) -
+              (lhs->st_value < rhs->st_value);
+}
+
+/*
+ * Read the map definitions stored in the ELF "maps" section.
+ *
+ * @maps:       output array with room for MAX_MAPS entries
+ * @maps_shndx: index of the maps section
+ * @elf:        ELF handle
+ * @symbols:    contents of the symbol table
+ * @strtabidx:  index of the string table holding the symbol names
+ *
+ * For every symbol that lives in the maps section, fills in the name
+ * (strdup()ed), the offset inside the section and the definition of the
+ * matching @maps entry.  Entries are ordered by ascending section offset.
+ *
+ * Returns the number of maps found, or a negative errno value:
+ * -EINVAL for missing/malformed input, -ENOMEM on allocation failure,
+ * -E2BIG if there are more than MAX_MAPS maps, -EFBIG if the ELF file uses
+ * a larger definition struct with non-zero unknown fields.
+ */
+static int load_elf_maps_section(struct bpf_map_data *maps, int maps_shndx,
+                                Elf * elf, Elf_Data * symbols, int strtabidx)
+{
+       int map_sz_elf, map_sz_copy;
+       bool validate_zero = false;
+       Elf_Data *data_maps = NULL;
+       int i, nr_syms, nr_maps = 0;
+       GElf_Sym *sym;
+       Elf_Scn *scn;
+
+       if (maps_shndx < 0)
+               return -EINVAL;
+       if (!symbols)
+               return -EINVAL;
+
+       /* Get data for maps section via elf index */
+       scn = elf_getscn(elf, maps_shndx);
+       if (scn)
+               data_maps = elf_getdata(scn, NULL);
+       if (!data_maps) {
+               printf("Failed to get Elf_Data from maps section %d\n",
+                      maps_shndx);
+               return -EINVAL;
+       }
+
+       /*
+        * For each map get the corresponding symbol table entry.  One spare
+        * slot is allocated because gelf_getsym() writes into sym[nr_maps]
+        * before we know whether the symbol belongs to the maps section.
+        */
+       sym = calloc(MAX_MAPS + 1, sizeof(GElf_Sym));
+       if (!sym)
+               return -ENOMEM;
+
+       nr_syms = symbols->d_size / sizeof(GElf_Sym);
+       for (i = 0; i < nr_syms; i++) {
+               if (!gelf_getsym(symbols, i, &sym[nr_maps]))
+                       continue;
+               if (sym[nr_maps].st_shndx != maps_shndx)
+                       continue;
+               /* Only increment if the symbol is in the maps section */
+               if (++nr_maps > MAX_MAPS) {
+                       printf("too many maps, at most %d are supported\n",
+                              MAX_MAPS);
+                       free(sym);
+                       return -E2BIG;
+               }
+       }
+
+       /* the per-map size below is computed by dividing by nr_maps */
+       if (nr_maps == 0) {
+               printf("maps section %d has no map symbols\n", maps_shndx);
+               free(sym);
+               return -EINVAL;
+       }
+
+       /* Align to map_id[] order, via sort on offset in sym.st_value */
+       qsort(sym, nr_maps, sizeof(GElf_Sym), cmp_symbols);
+
+       /* Keeping compatible with ELF maps section changes
+        * ------------------------------------------------
+        * The program size of struct bpf_load_map_def is known by loader
+        * code, but struct stored in ELF file can be different.
+        *
+        * Unfortunately sym[i].st_size is zero.  To calculate the
+        * struct size stored in the ELF file, assume all struct have
+        * the same size, and simply divide with number of map
+        * symbols.
+        */
+       map_sz_elf = data_maps->d_size / nr_maps;
+       map_sz_copy = sizeof(struct bpf_load_map_def);
+       if (map_sz_elf < map_sz_copy) {
+               /*
+                * Backward compat, loading older ELF file with
+                * smaller struct, keeping remaining bytes zero.
+                */
+               map_sz_copy = map_sz_elf;
+       } else if (map_sz_elf > map_sz_copy) {
+               /*
+                * Forward compat, loading newer ELF file with larger
+                * struct with unknown features. Assume zero means
+                * feature not used.  Thus, validate rest of struct
+                * data is zero.
+                */
+               validate_zero = true;
+       }
+
+       /* Memcpy relevant part of ELF maps data to loader maps */
+       for (i = 0; i < nr_maps; i++) {
+               const unsigned char *def, *addr, *end;
+               const char *map_name;
+               size_t offset;
+
+               map_name = elf_strptr(elf, strtabidx, sym[i].st_name);
+               if (!map_name) {
+                       printf("failed to get name of map %d\n", i);
+                       free(sym);
+                       return -EINVAL;
+               }
+               maps[i].name = strdup(map_name);
+               if (!maps[i].name) {
+                       /* printf() may clobber errno, so save it first */
+                       int err = errno;
+
+                       printf("strdup(%s): %s(%d)\n", map_name,
+                              strerror(err), err);
+                       free(sym);
+                       return -err;
+               }
+
+               /* Symbol value is offset into ELF maps section data area */
+               offset = sym[i].st_value;
+               if (offset > data_maps->d_size ||
+                   data_maps->d_size - offset < (size_t)map_sz_elf) {
+                       printf("map %s lies outside of the maps section\n",
+                              map_name);
+                       free(sym);
+                       return -EINVAL;
+               }
+               def = (const unsigned char *)data_maps->d_buf + offset;
+               maps[i].elf_offset = offset;
+               memset(&maps[i].def, 0, sizeof(struct bpf_load_map_def));
+               memcpy(&maps[i].def, def, map_sz_copy);
+
+               /* Verify no newer features were requested */
+               if (validate_zero) {
+                       addr = def + map_sz_copy;
+                       end = def + map_sz_elf;
+                       for (; addr < end; addr++) {
+                               if (*addr != 0) {
+                                       free(sym);
+                                       return -EFBIG;
+                               }
+                       }
+               }
+       }
+
+       free(sym);
+       return nr_maps;
+}
+
+/*
+ * Issue the create-map admin command for @map on @nvme_fd.
+ *
+ * Map type, key size, value size and max entries travel in cdw10..cdw13.
+ * Returns the command's result field on success, the negative passthru
+ * return value if submission failed, or -EIO if the passthru returned a
+ * positive value (which is printed in hex).
+ */
+static int create_map(int nvme_fd, struct bpf_map_data *map)
+{
+       struct nvme_passthru_cmd cmd = { 0 };
+       int status;
+
+       cmd.opcode = nvme_admin_create_map;
+       cmd.cdw10 = map->def.type;
+       cmd.cdw11 = map->def.key_size;
+       cmd.cdw12 = map->def.value_size;
+       cmd.cdw13 = map->def.max_entries;
+
+       status = nvme_submit_admin_passthru(nvme_fd, &cmd);
+       if (status == 0)
+               return cmd.result;
+
+       if (status < 0) {
+               perror("map create");
+               return status;
+       }
+
+       printf("map create: 0x%x\n", status);
+       return -EIO;
+}
+
+/*
+ * Create the first @nr_maps entries of @maps on the controller, storing
+ * each returned id both in map_id[] and in the entry itself.
+ *
+ * Stops at the first failure and returns -EINVAL; returns 0 otherwise.
+ */
+static int load_maps(int nvme_fd, struct bpf_map_data *maps, int nr_maps)
+{
+       int idx = 0;
+
+       while (idx < nr_maps) {
+               int id = create_map(nvme_fd, &maps[idx]);
+
+               map_id[idx] = id;
+               if (id < 0) {
+                       printf("failed to create map %d (%s): %d %s\n",
+                              idx, maps[idx].name, errno, strerror(errno));
+                       return -EINVAL;
+               }
+               maps[idx].id = id;
+               idx++;
+       }
+
+       return 0;
+}
+
+/*
+ * Send the @size bytes of eBPF instructions at @prog to the controller
+ * using the create-program admin command.
+ *
+ * cdw10 carries the instruction count minus one and cdw11 carries @slot.
+ * Returns 0 on success, the negative passthru return value if submission
+ * failed, or -EIO if the passthru returned a positive value (which is
+ * printed in hex).
+ */
+static int load_prog(int nvme_fd, struct bpf_insn *prog, int size,
+               __u32 slot)
+{
+       struct nvme_passthru_cmd cmd = { 0 };
+       int status;
+
+       cmd.opcode = nvme_admin_create_program;
+       cmd.addr = (uintptr_t) prog;
+       cmd.data_len = size;
+       cmd.cdw10 = (size / sizeof(struct bpf_insn)) - 1;
+       cmd.cdw11 = slot;
+
+       status = nvme_submit_admin_passthru(nvme_fd, &cmd);
+       if (status == 0)
+               return 0;
+
+       if (status < 0) {
+               perror("program");
+               return status;
+       }
+
+       printf("program: 0x%x\n", status);
+       return -EIO;
+}
+
+/*
+ * Load the eBPF object file open on @fd into the controller behind
+ * @nvme_fd.
+ *
+ * Works in three passes over the ELF sections:
+ *  1. find the "maps" section and the symbol table, then create the maps
+ *  2. apply SHT_REL relocations to executable SHT_PROGBITS sections so
+ *     that map references carry the ids of the created maps
+ *  3. send every remaining section whose name starts with "nvme" to the
+ *     controller as a program for @slot
+ *
+ * Returns the number of maps created (which may be 0) on success, or a
+ * negative value on failure.  The ELF handle is released on every path
+ * once it has been opened.
+ */
+static int nvme_bpf_load_program(int nvme_fd, int fd, __u32 slot)
+{
+       const int max_sec = sizeof(processed_sec) / sizeof(processed_sec[0]);
+       GElf_Ehdr ehdr;
+       GElf_Shdr shdr, shdr_prog;
+       Elf_Data *data_maps = NULL;
+       Elf_Data *symbols = NULL;
+       Elf_Data *data, *data_prog;
+       Elf *elf;
+       int maps_shndx = -1;
+       int strtabidx = -1;
+       int nr_maps = 0;
+       char *shname, *shname_prog;
+       int ret = -EINVAL;
+       int i;
+
+       if (elf_version(EV_CURRENT) == EV_NONE)
+               return -EINVAL;
+
+       elf = elf_begin(fd, ELF_C_READ, NULL);
+       if (!elf)
+               return -EINVAL;
+       if (gelf_getehdr(elf, &ehdr) != &ehdr)
+               goto out;
+
+       /* processed_sec[] is indexed by section number below */
+       if (ehdr.e_shnum > max_sec) {
+               printf("too many ELF sections (%d, max %d)\n",
+                      (int)ehdr.e_shnum, max_sec);
+               goto out;
+       }
+
+       /* scan over all elf sections to get map info */
+       for (i = 1; i < ehdr.e_shnum; i++) {
+               if (get_sec(elf, i, &ehdr, &shname, &shdr, &data))
+                       continue;
+
+               if (strcmp(shname, "maps") == 0) {
+                       int j;
+
+                       maps_shndx = i;
+                       data_maps = data;
+                       for (j = 0; j < MAX_MAPS; j++)
+                               map_data[j].id = -1;
+               } else if (shdr.sh_type == SHT_SYMTAB) {
+                       strtabidx = shdr.sh_link;
+                       symbols = data;
+               }
+       }
+
+       if (!symbols) {
+               printf("missing SHT_SYMTAB section\n");
+               goto out;
+       }
+
+       if (data_maps) {
+               nr_maps = load_elf_maps_section(map_data, maps_shndx,
+                                               elf, symbols, strtabidx);
+               if (nr_maps < 0) {
+                       printf("Error: Failed loading ELF maps (errno:%d):%s\n",
+                              nr_maps, strerror(-nr_maps));
+                       goto out;
+               }
+               if (load_maps(nvme_fd, map_data, nr_maps))
+                       goto out;
+               map_data_count = nr_maps;
+               processed_sec[maps_shndx] = true;
+       }
+
+       /* process all relo sections, and rewrite bpf insns for maps */
+       for (i = 1; i < ehdr.e_shnum; i++) {
+               struct bpf_insn *insns;
+
+               if (processed_sec[i])
+                       continue;
+
+               if (get_sec(elf, i, &ehdr, &shname, &shdr, &data))
+                       continue;
+
+               if (shdr.sh_type != SHT_REL)
+                       continue;
+
+               /* locate prog sec that need map fixup (relocations) */
+               if (get_sec(elf, shdr.sh_info, &ehdr, &shname_prog,
+                           &shdr_prog, &data_prog))
+                       continue;
+
+               if (shdr_prog.sh_type != SHT_PROGBITS ||
+                   !(shdr_prog.sh_flags & SHF_EXECINSTR))
+                       continue;
+
+               insns = (struct bpf_insn *)data_prog->d_buf;
+               processed_sec[i] = true;        /* relo section */
+
+               /*
+                * A relocation failure has already been reported by
+                * parse_relo_and_apply(); as before it does not abort the
+                * load, so the result is deliberately ignored.
+                */
+               (void)parse_relo_and_apply(data, symbols, &shdr, insns,
+                                          map_data, nr_maps);
+       }
+
+       for (i = 1; i < ehdr.e_shnum; i++) {
+               int err;
+
+               if (processed_sec[i])
+                       continue;
+
+               if (get_sec(elf, i, &ehdr, &shname, &shdr, &data))
+                       continue;
+
+               /* strncmp() stops at the terminator of shorter names */
+               if (strncmp(shname, "nvme", 4) != 0)
+                       continue;
+
+               err = load_prog(nvme_fd, data->d_buf, data->d_size, slot);
+               if (err != 0) {
+                       ret = err;
+                       goto out;
+               }
+       }
+
+       ret = nr_maps;
+out:
+       /* section data is no longer referenced; map names were strdup()ed */
+       elf_end(elf);
+       return ret;
+}
+
+/*
+ * Print a two-column "Name / ID" table for the first @nr_maps entries of
+ * map_data[] on stdout.
+ */
+static void nvme_bpf_show_maps(int nr_maps)
+{
+       const struct bpf_map_data *map = map_data;
+       int left;
+
+       printf("%-32s %-32s\n", "Name", "ID");
+       printf("%-.32s %-.32s\n", dash, dash);
+
+       for (left = nr_maps; left > 0; left--, map++)
+               printf("%-32s %-32u\n", map->name, map->id);
+}
+
+/*
+ * Print the first @nr_maps entries of map_data[] as a JSON object with a
+ * single "Maps" array; each element holds the map's "Name" and "ID".
+ */
+static void nvme_bpf_show_maps_json(int nr_maps)
+{
+       struct json_object *root = json_create_object();
+       struct json_array *list = json_create_array();
+       int idx = 0;
+
+       while (idx < nr_maps) {
+               struct json_object *entry = json_create_object();
+
+               json_object_add_value_string(entry, "Name",
+                                            map_data[idx].name);
+               json_object_add_value_int(entry, "ID", map_data[idx].id);
+               json_array_add_value_object(list, entry);
+               idx++;
+       }
+
+       json_object_add_value_array(root, "Maps", list);
+       json_print_object(root, NULL);
+       printf("\n");
+       json_free_object(root);
+}
+
+static const char *output_format_no_binary = "Output format: normal|json";
+
+/*
+ * "load-program" sub-command: load the eBPF object file named by
+ * --program into the controller and, if any maps were created, list them
+ * in the requested output format (normal or json).
+ *
+ * Returns 0 on success, errno if the device could not be opened, or a
+ * negative value on any other failure.
+ */
+static int load_program(int argc, char **argv, struct command *cmd,
+                       struct plugin *plugin)
+{
+       const char *desc = "Load an eBPF program into the NVMe controller";
+       const char *pname = "File name of the eBPF program";
+       const char *slot = "eBPF program slot";
+       enum nvme_print_flags flags;
+       int fd, pfd, ret = -EINVAL;
+       struct config {
+               char *output_format;
+               char *program;
+               __u32 slot;
+       };
+
+       struct config cfg = {
+               .output_format = "normal",
+       };
+
+       OPT_ARGS(opts) = {
+               OPT_FMT("output-format", 'o', &cfg.output_format,
+                       output_format_no_binary),
+               OPT_FILE("program", 'p', &cfg.program, pname),
+               OPT_UINT("slot", 's', &cfg.slot, slot),
+               OPT_END()
+       };
+
+       fd = parse_and_open(argc, argv, desc, opts);
+       if (fd < 0)
+               return errno;
+
+       /* cfg.program stays NULL when --program is not given */
+       if (!cfg.program || !strlen(cfg.program)) {
+               fprintf(stderr, "program argument is required\n");
+               goto close_fd;
+       }
+
+       flags = validate_output_format(cfg.output_format);
+       if (flags < 0) {
+               ret = flags;
+               goto close_fd;
+       }
+       if (flags != JSON && flags != NORMAL) {
+               fprintf(stderr, "Invalid output format\n");
+               goto close_fd;
+       }
+
+       pfd = open(cfg.program, O_RDONLY);
+       if (pfd < 0) {
+               perror("failed to open program");
+               goto close_fd;
+       }
+
+       /* a positive return value is the number of maps that were created */
+       ret = nvme_bpf_load_program(fd, pfd, cfg.slot);
+       if (ret > 0) {
+               if (flags & JSON)
+                       nvme_bpf_show_maps_json(ret);
+               else
+                       nvme_bpf_show_maps(ret);
+       } else if (ret < 0) {
+               fprintf(stderr, "failed to load bpf program: %d\n", ret);
+       }
+
+       close(pfd);
+close_fd:
+       close(fd);
+       return ret > 0 ? 0 : ret;
+}
+
+/*
+ * Issue the lookup-elem admin command for @key in map @map_id and print
+ * the returned value on stdout.
+ *
+ * The value buffer is a single unsigned long; its size is sent both as
+ * the data length and in cdw10.  Returns whatever
+ * nvme_submit_admin_passthru() returned; the value is printed only when
+ * that is 0.
+ */
+static int nvme_bpf_lookup_elem(int fd, uint32_t map_id, uint32_t key)
+{
+       struct nvme_passthru_cmd cmd = { };
+       /* zeroed so a short transfer cannot leave indeterminate bytes */
+       unsigned long val = 0; // XXX: should be variable size
+       int ret;
+
+       cmd.opcode = nvme_admin_lookup_elem;
+       cmd.addr = (uintptr_t)&val;
+       cmd.data_len = sizeof(unsigned long);
+       cmd.cdw10 = sizeof(unsigned long);
+       cmd.cdw11 = map_id;
+       cmd.cdw12 = key;
+
+       ret = nvme_submit_admin_passthru(fd, &cmd);
+       if (ret == 0)
+               printf("val: %lu\n", val);
+       return ret;
+}
+
+/*
+ * "lookup-elem" sub-command: look up --key in the map given by --map-id
+ * and print the value.
+ *
+ * Returns errno if the device could not be opened, otherwise the result
+ * of nvme_bpf_lookup_elem().  A positive result is reported through
+ * nvme_show_status(), a negative one on stderr.
+ */
+static int lookup_elem(int argc, char **argv, struct command *cmd,
+                       struct plugin *plugin)
+{
+       const char *desc = "Lookup an element in an eBPF map";
+       const char *map = "map identifier";
+       const char *key = "eBPF map key";
+       struct config {
+               uint32_t map_id;
+               uint32_t key;
+       };
+       struct config cfg = { 0 };
+       int fd, err;
+
+       OPT_ARGS(opts) = {
+               OPT_UINT("map-id", 'm', &cfg.map_id, map),
+               OPT_UINT("key", 'k', &cfg.key, key),
+               OPT_END()
+       };
+
+       fd = parse_and_open(argc, argv, desc, opts);
+       if (fd < 0)
+               return errno;
+
+       err = nvme_bpf_lookup_elem(fd, cfg.map_id, cfg.key);
+       if (err < 0)
+               fprintf(stderr, "failed to lookup bpf map element: %d\n", err);
+       else if (err > 0)
+               nvme_show_status(err);
+
+       close(fd);
+       return err;
+}
diff --git a/plugins/bpf/bpf.h b/plugins/bpf/bpf.h
new file mode 100644 (file)
index 0000000..5c44e3b
--- /dev/null
@@ -0,0 +1,20 @@
+#undef CMD_INC_FILE
+#define CMD_INC_FILE plugins/bpf/bpf
+
+#if !defined(BPF_NVME) || defined(CMD_HEADER_MULTI_READ)
+#define BPF_NVME
+
+#include "cmd.h"
+
+/* Registers the "bpf" plugin with its load-program and lookup-elem commands. */
+PLUGIN(NAME("bpf", "eBPF program support"),
+       COMMAND_LIST(
+               ENTRY("load-program", "Load an eBPF program", load_program)
+               ENTRY("lookup-elem", "Lookup element in an eBPF map",
+                       lookup_elem)
+       )
+);
+
+#endif
+
+#include "define_cmd.h"
+
diff --git a/plugins/bpf/nvme-bpf-defs.h b/plugins/bpf/nvme-bpf-defs.h
new file mode 100644 (file)
index 0000000..28ca11d
--- /dev/null
@@ -0,0 +1,22 @@
+/*
+ * Shared context for the host and the downloaded eBPF program.
+ */
+
+#define __section(NAME)  __attribute__((section(NAME), used))
+
+struct nvme_bpf_map_def { /* same five leading fields as struct bpf_load_map_def in bpf.c */
+       unsigned int type;
+       unsigned int key_size;
+       unsigned int value_size;
+       unsigned int max_entries;
+       unsigned int map_flags;
+};
+
+#define BPF_FUNC_nvme_read_data 5      /* BPF_FUNC_ktime_get_ns */
+#define BPF_FUNC_nvme_hello_world 6    /* BPF_FUNC_trace_printk */
+
+/* public context seen by the eBPF programs */
+struct nvme_bpf_ctx {
+       unsigned int size; /* NOTE(review): meaning and units are not visible here - confirm */
+       /* add things like LBA size or metadata types here?? */
+};