From d9466420f8cc55eb73db226f2c714afec27a2607 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Wed, 12 Aug 2020 17:41:39 +0200
Subject: [PATCH] bpf program support

---
Review notes: the blob ids in the "index" lines predate the review changes,
and the whitespace of the context lines is a best-effort reconstruction.

 Makefile                    |   6 +-
 linux/nvme.h                |   5 +
 plugins/bpf/bpf.c           | 723 ++++++++++++++++++++++++++++++++++++
 plugins/bpf/bpf.h           |  20 ++
 plugins/bpf/nvme-bpf-defs.h |  26 ++
 5 files changed, 779 insertions(+), 1 deletion(-)
 create mode 100644 plugins/bpf/bpf.c
 create mode 100644 plugins/bpf/bpf.h
 create mode 100644 plugins/bpf/nvme-bpf-defs.h

diff --git a/Makefile b/Makefile
index 65afdbe..e4561b6 100644
--- a/Makefile
+++ b/Makefile
@@ -37,6 +37,9 @@ ifeq ($(HAVE_SYSTEMD),0)
 	override CFLAGS += -DHAVE_SYSTEMD
 endif
 
+# should be conditional and maybe limited to the bpf plugin
+override LDFLAGS += -lelf
+
 RPMBUILD = rpmbuild
 TAR = tar
 RM = rm -f
@@ -81,7 +84,8 @@ PLUGIN_OBJS := \
 	plugins/dera/dera-nvme.o \
 	plugins/scaleflux/sfx-nvme.o \
 	plugins/transcend/transcend-nvme.o \
-	plugins/zns/zns.o
+	plugins/zns/zns.o \
+	plugins/bpf/bpf.o
 
 nvme: nvme.c nvme.h $(OBJS) $(PLUGIN_OBJS) $(UTIL_OBJS) NVME-VERSION-FILE
 	$(QUIET_CC)$(CC) $(CPPFLAGS) $(CFLAGS) $(INC) $< -o $(NVME) $(OBJS) $(PLUGIN_OBJS) $(UTIL_OBJS) $(LDFLAGS)
diff --git a/linux/nvme.h b/linux/nvme.h
index de25029..fd00fe5 100644
--- a/linux/nvme.h
+++ b/linux/nvme.h
@@ -1017,6 +1017,11 @@ enum nvme_admin_opcode {
 	nvme_admin_security_recv	= 0x82,
 	nvme_admin_sanitize_nvm		= 0x84,
 	nvme_admin_get_lba_status	= 0x86,
+
+	/* non-standard experimental opcodes for eBPF programs */
+	nvme_admin_create_map		= 0x60,
+	nvme_admin_create_program	= 0x61,
+	nvme_admin_lookup_elem		= 0x62,
 };
 
 enum {
diff --git a/plugins/bpf/bpf.c b/plugins/bpf/bpf.c
new file mode 100644
index 0000000..92f6994
--- /dev/null
+++ b/plugins/bpf/bpf.c
@@ -0,0 +1,723 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Load eBPF code from an object file into a NVMe controller.
+ *
+ * Partially based on samples/bpf/bpf_load.c from the Linux kernel tree.
+ */
+/*
+ * NOTE(review): the system header names below were reconstructed from the
+ * identifiers this file uses; confirm them against the original commit.
+ */
+#include <errno.h>
+#include <fcntl.h>
+#include <gelf.h>
+#include <libelf.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <linux/bpf.h>
+#include <linux/types.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include "nvme.h"
+#include "nvme-ioctl.h"
+#include "nvme-print.h"
+#include "nvme-bpf-defs.h"
+
+#define CREATE_CMD
+#include "bpf.h"
+
+static const char dash[100] = {[0 ... 99] = '-'};
+
+#define MAX_MAPS 32
+/* Number of entries in processed_sec[]; larger object files are rejected. */
+#define MAX_SECTIONS 128
+
+/* Map definition as read from the "maps" section of the object file. */
+struct bpf_load_map_def {
+	unsigned int type;
+	unsigned int key_size;
+	unsigned int value_size;
+	unsigned int max_entries;
+	unsigned int map_flags;
+	unsigned int inner_map_idx;
+	unsigned int numa_node;
+};
+
+/* Loader-side state for one map found in the object file. */
+struct bpf_map_data {
+	int id;			/* result of the create map command, -1 if unset */
+	char *name;		/* strdup()ed symbol name */
+	size_t elf_offset;	/* symbol offset inside the "maps" section */
+	struct bpf_load_map_def def;
+};
+static int map_id[MAX_MAPS];
+static struct bpf_map_data map_data[MAX_MAPS];
+static int map_data_count;
+
+/* Sections that were already consumed, indexed by ELF section index. */
+static bool processed_sec[MAX_SECTIONS];
+
+/*
+ * Fetch section i of elf: its header into *shdr, its name into *shname and
+ * its data into *data.
+ *
+ * Returns 0 on success.  Returns non-zero when the section does not exist,
+ * its header or name cannot be read, it is empty, or it does not consist of
+ * exactly one data buffer.
+ */
+static int get_sec(Elf *elf, int i, GElf_Ehdr *ehdr, char **shname,
+		   GElf_Shdr *shdr, Elf_Data **data)
+{
+	Elf_Scn *scn;
+
+	scn = elf_getscn(elf, i);
+	if (!scn)
+		return 1;
+
+	if (gelf_getshdr(scn, shdr) != shdr)
+		return 2;
+
+	*shname = elf_strptr(elf, ehdr->e_shstrndx, shdr->sh_name);
+	if (!*shname || !shdr->sh_size)
+		return 3;
+
+	*data = elf_getdata(scn, 0);
+	if (!*data || elf_getdata(scn, *data) != NULL)
+		return 4;
+
+	return 0;
+}
+
+/*
+ * Apply the map relocations of one SHT_REL section to the nr_insns
+ * instructions of the program section it refers to.
+ *
+ * Every relocated instruction must be a 64-bit immediate load.  Its source
+ * register is set to BPF_PSEUDO_MAP_FD and its immediate to the id of the
+ * map whose offset in the "maps" section equals the symbol value.
+ *
+ * Returns 0 on success and 1 on a malformed or unmatched relocation.
+ */
+static int parse_relo_and_apply(Elf_Data *data, Elf_Data *symbols,
+				GElf_Shdr *shdr, struct bpf_insn *insn,
+				size_t nr_insns, struct bpf_map_data *maps,
+				int nr_maps)
+{
+	int i, nrels;
+
+	/* a zero entry size would divide by zero below */
+	if (!shdr->sh_entsize) {
+		printf("invalid relo section: zero entry size\n");
+		return 1;
+	}
+	nrels = shdr->sh_size / shdr->sh_entsize;
+
+	for (i = 0; i < nrels; i++) {
+		GElf_Sym sym;
+		GElf_Rel rel;
+		size_t insn_idx;
+		bool match = false;
+		int map_idx;
+
+		if (!gelf_getrel(data, i, &rel) ||
+		    !gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &sym)) {
+			printf("failed to read relo %d\n", i);
+			return 1;
+		}
+
+		/* never patch outside of the program section */
+		if (rel.r_offset / sizeof(struct bpf_insn) >= nr_insns) {
+			printf("relo %d is outside of the program\n", i);
+			return 1;
+		}
+		insn_idx = rel.r_offset / sizeof(struct bpf_insn);
+
+		if (insn[insn_idx].code != (BPF_LD | BPF_IMM | BPF_DW)) {
+			printf("invalid relo for insn[%zu].code 0x%x\n",
+			       insn_idx, insn[insn_idx].code);
+			return 1;
+		}
+		insn[insn_idx].src_reg = BPF_PSEUDO_MAP_FD;
+
+		/* Match FD relocation against recorded map_data[] offset */
+		for (map_idx = 0; map_idx < nr_maps; map_idx++) {
+			if (maps[map_idx].elf_offset == sym.st_value) {
+				match = true;
+				break;
+			}
+		}
+		if (match) {
+			insn[insn_idx].imm = maps[map_idx].id;
+		} else {
+			printf("invalid relo for insn[%zu] no map_data match\n",
+			       insn_idx);
+			return 1;
+		}
+	}
+
+	return 0;
+}
+
+/* qsort() comparator ordering symbols by ascending st_value. */
+static int cmp_symbols(const void *l, const void *r)
+{
+	const GElf_Sym *lsym = (const GElf_Sym *)l;
+	const GElf_Sym *rsym = (const GElf_Sym *)r;
+
+	if (lsym->st_value < rsym->st_value)
+		return -1;
+	else if (lsym->st_value > rsym->st_value)
+		return 1;
+	else
+		return 0;
+}
+
+/*
+ * Fill maps[] from the "maps" section of the object file.
+ *
+ * maps_shndx is the index of the "maps" section, symbols the data of the
+ * symbol table and strtabidx the index of its string table.  One entry is
+ * recorded per symbol that lives in the "maps" section, ordered by offset.
+ *
+ * Returns the number of maps found (which may be 0) or a negative errno
+ * value.
+ */
+static int load_elf_maps_section(struct bpf_map_data *maps, int maps_shndx,
+				 Elf *elf, Elf_Data *symbols, int strtabidx)
+{
+	size_t map_sz_elf, map_sz_copy;
+	bool validate_zero = false;
+	Elf_Data *data_maps = NULL;
+	int i, nr_maps, nr_syms;
+	GElf_Sym *sym;
+	Elf_Scn *scn;
+
+	if (maps_shndx < 0)
+		return -EINVAL;
+	if (!symbols)
+		return -EINVAL;
+
+	/* Get data for maps section via elf index */
+	scn = elf_getscn(elf, maps_shndx);
+	if (scn)
+		data_maps = elf_getdata(scn, NULL);
+	if (!scn || !data_maps) {
+		printf("Failed to get Elf_Data from maps section %d\n",
+		       maps_shndx);
+		return -EINVAL;
+	}
+
+	/*
+	 * For each map get the corresponding symbol table entry.  The extra
+	 * slot at the end is scratch space for symbols that are not maps.
+	 */
+	sym = calloc(MAX_MAPS + 1, sizeof(GElf_Sym));
+	if (!sym)
+		return -ENOMEM;
+
+	nr_syms = symbols->d_size / sizeof(GElf_Sym);
+	for (i = 0, nr_maps = 0; i < nr_syms; i++) {
+		if (!gelf_getsym(symbols, i, &sym[nr_maps]))
+			continue;
+		if (sym[nr_maps].st_shndx != maps_shndx)
+			continue;
+		/* maps[] only has room for MAX_MAPS entries */
+		if (nr_maps == MAX_MAPS) {
+			printf("too many maps, at most %d are supported\n",
+			       MAX_MAPS);
+			free(sym);
+			return -E2BIG;
+		}
+		/* Only increment if the symbol is in the maps section */
+		nr_maps++;
+	}
+
+	/* nothing to load, and the size calculation below divides by it */
+	if (!nr_maps) {
+		free(sym);
+		return 0;
+	}
+
+	/* Align to map_id[] order, via sort on offset in sym.st_value */
+	qsort(sym, nr_maps, sizeof(GElf_Sym), cmp_symbols);
+
+	/*
+	 * Keeping compatible with ELF maps section changes
+	 * ------------------------------------------------
+	 * The program size of struct bpf_load_map_def is known by loader
+	 * code, but struct stored in ELF file can be different.
+	 *
+	 * Unfortunately sym[i].st_size is zero.  To calculate the
+	 * struct size stored in the ELF file, assume all struct have
+	 * the same size, and simply divide with number of map
+	 * symbols.
+	 */
+	map_sz_elf = data_maps->d_size / nr_maps;
+	map_sz_copy = sizeof(struct bpf_load_map_def);
+	if (map_sz_elf < map_sz_copy) {
+		/*
+		 * Backward compat, loading older ELF file with
+		 * smaller struct, keeping remaining bytes zero.
+		 */
+		map_sz_copy = map_sz_elf;
+	} else if (map_sz_elf > map_sz_copy) {
+		/*
+		 * Forward compat, loading newer ELF file with larger
+		 * struct with unknown features. Assume zero means
+		 * feature not used.  Thus, validate rest of struct
+		 * data is zero.
+		 */
+		validate_zero = true;
+	}
+
+	/* Memcpy relevant part of ELF maps data to loader maps */
+	for (i = 0; i < nr_maps; i++) {
+		unsigned char *def, *addr, *end;
+		const char *map_name;
+		size_t offset;
+
+		map_name = elf_strptr(elf, strtabidx, sym[i].st_name);
+		if (!map_name) {
+			printf("failed to look up the name of map %d\n", i);
+			free(sym);
+			return -EINVAL;
+		}
+		maps[i].name = strdup(map_name);
+		if (!maps[i].name) {
+			/* printf() and free() may clobber errno */
+			int err = errno;
+
+			printf("strdup(%s): %s(%d)\n", map_name,
+			       strerror(err), err);
+			free(sym);
+			return -err;
+		}
+
+		/* Symbol value is offset into ELF maps section data area */
+		offset = sym[i].st_value;
+		if (offset > data_maps->d_size ||
+		    data_maps->d_size - offset < map_sz_elf) {
+			printf("map %s is outside of the maps section\n",
+			       maps[i].name);
+			free(sym);
+			return -EINVAL;
+		}
+		def = (unsigned char *)data_maps->d_buf + offset;
+		maps[i].elf_offset = offset;
+		memset(&maps[i].def, 0, sizeof(struct bpf_load_map_def));
+		memcpy(&maps[i].def, def, map_sz_copy);
+
+		/* Verify no newer features were requested */
+		if (validate_zero) {
+			addr = def + map_sz_copy;
+			end = def + map_sz_elf;
+			for (; addr < end; addr++) {
+				if (*addr != 0) {
+					free(sym);
+					return -EFBIG;
+				}
+			}
+		}
+	}
+
+	free(sym);
+	return nr_maps;
+}
+
+/*
+ * Ask the controller to create the map described by map->def.
+ *
+ * Returns the map id taken from the command result, or a negative errno
+ * value: -EIO if the controller completed the command with a non-zero
+ * status, otherwise the errno left by the failed submission (this assumes
+ * nvme_submit_admin_passthru() sets errno, as the perror() call implies).
+ */
+static int create_map(int nvme_fd, struct bpf_map_data *map)
+{
+	struct nvme_passthru_cmd cmd = {
+		.opcode		= nvme_admin_create_map,
+		.cdw10		= map->def.type,
+		.cdw11		= map->def.key_size,
+		.cdw12		= map->def.value_size,
+		.cdw13		= map->def.max_entries,
+	};
+	int ret;
+
+	ret = nvme_submit_admin_passthru(nvme_fd, &cmd);
+	if (ret < 0) {
+		/* perror() may clobber errno, so pick it up first */
+		ret = errno ? -errno : -EIO;
+		perror("map create");
+		return ret;
+	}
+	if (ret > 0) {
+		printf("map create: 0x%x\n", ret);
+		return -EIO;
+	}
+
+	return cmd.result;
+}
+
+/*
+ * Create all nr_maps maps on the controller and record their ids in both
+ * map_id[] and maps[].id.
+ *
+ * Returns 0 on success and -EINVAL as soon as one map cannot be created.
+ */
+static int load_maps(int nvme_fd, struct bpf_map_data *maps, int nr_maps)
+{
+	int i;
+
+	for (i = 0; i < nr_maps; i++) {
+		map_id[i] = create_map(nvme_fd, &maps[i]);
+		if (map_id[i] < 0) {
+			/* create_map() reports the reason as its result */
+			printf("failed to create map %d (%s): %d %s\n",
+			       i, maps[i].name, -map_id[i],
+			       strerror(-map_id[i]));
+			return -EINVAL;
+		}
+		maps[i].id = map_id[i];
+	}
+
+	return 0;
+}
+
+/*
+ * Submit size bytes of eBPF instructions at prog as the program for slot.
+ *
+ * Returns 0 on success or a negative errno value: -EIO if the controller
+ * completed the command with a non-zero status, otherwise the errno left
+ * by the failed submission.
+ */
+static int load_prog(int nvme_fd, struct bpf_insn *prog, int size,
+		     __u32 slot)
+{
+	struct nvme_passthru_cmd cmd = {
+		.opcode		= nvme_admin_create_program,
+		.addr		= (uintptr_t) prog,
+		/* instruction count minus one */
+		.cdw10		= (size / sizeof(struct bpf_insn)) - 1,
+		.cdw11		= slot,
+		.data_len	= size,
+	};
+	int ret;
+
+	ret = nvme_submit_admin_passthru(nvme_fd, &cmd);
+	if (ret < 0) {
+		/* perror() may clobber errno, so pick it up first */
+		ret = errno ? -errno : -EIO;
+		perror("program");
+		return ret;
+	}
+	if (ret > 0) {
+		printf("program: 0x%x\n", ret);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+/*
+ * Load the eBPF object file open on fd into the controller behind nvme_fd.
+ *
+ * Creates the maps described by the "maps" section, patches the map
+ * references in the program sections and then submits every remaining
+ * section whose name starts with "nvme" as the program for slot.
+ *
+ * Returns the number of maps created (which may be 0) or a negative value
+ * on failure.
+ */
+static int nvme_bpf_load_program(int nvme_fd, int fd, __u32 slot)
+{
+	GElf_Ehdr ehdr;
+	GElf_Shdr shdr, shdr_prog;
+	Elf_Data *data_maps = NULL;
+	Elf_Data *symbols = NULL;
+	Elf_Data *data, *data_prog;
+	Elf *elf;
+	int maps_shndx = -1;
+	int strtabidx = -1;
+	int nr_maps = 0;
+	char *shname, *shname_prog;
+	int ret = -EINVAL;
+	int i;
+
+	if (elf_version(EV_CURRENT) == EV_NONE)
+		return -EINVAL;
+
+	elf = elf_begin(fd, ELF_C_READ, NULL);
+	if (!elf)
+		return -EINVAL;
+	if (gelf_getehdr(elf, &ehdr) != &ehdr)
+		goto out;
+
+	/* processed_sec[] is indexed by section index below */
+	if (ehdr.e_shnum > MAX_SECTIONS) {
+		printf("too many sections, at most %d are supported\n",
+		       MAX_SECTIONS);
+		goto out;
+	}
+
+	/* scan over all elf sections to get map info */
+	for (i = 1; i < ehdr.e_shnum; i++) {
+		if (get_sec(elf, i, &ehdr, &shname, &shdr, &data))
+			continue;
+
+		if (strcmp(shname, "maps") == 0) {
+			int j;
+
+			maps_shndx = i;
+			data_maps = data;
+			for (j = 0; j < MAX_MAPS; j++)
+				map_data[j].id = -1;
+		} else if (shdr.sh_type == SHT_SYMTAB) {
+			strtabidx = shdr.sh_link;
+			symbols = data;
+		}
+	}
+
+	if (!symbols) {
+		printf("missing SHT_SYMTAB section\n");
+		goto out;
+	}
+
+	if (data_maps) {
+		nr_maps = load_elf_maps_section(map_data, maps_shndx,
+						elf, symbols, strtabidx);
+		if (nr_maps < 0) {
+			printf("Error: Failed loading ELF maps (errno:%d):%s\n",
+			       nr_maps, strerror(-nr_maps));
+			goto out;
+		}
+		if (load_maps(nvme_fd, map_data, nr_maps))
+			goto out;
+		map_data_count = nr_maps;
+		processed_sec[maps_shndx] = true;
+	}
+
+	/* process all relo sections, and rewrite bpf insns for maps */
+	for (i = 1; i < ehdr.e_shnum; i++) {
+		if (processed_sec[i])
+			continue;
+
+		if (get_sec(elf, i, &ehdr, &shname, &shdr, &data))
+			continue;
+
+		if (shdr.sh_type == SHT_REL) {
+			struct bpf_insn *insns;
+
+			/* locate prog sec that need map fixup (relocations) */
+			if (get_sec(elf, shdr.sh_info, &ehdr, &shname_prog,
+				    &shdr_prog, &data_prog))
+				continue;
+
+			if (shdr_prog.sh_type != SHT_PROGBITS ||
+			    !(shdr_prog.sh_flags & SHF_EXECINSTR))
+				continue;
+
+			insns = (struct bpf_insn *)data_prog->d_buf;
+			processed_sec[i] = true; /* relo section */
+
+			if (parse_relo_and_apply(data, symbols, &shdr, insns,
+						 data_prog->d_size /
+						 sizeof(*insns),
+						 map_data, nr_maps))
+				continue;
+		}
+	}
+
+	for (i = 1; i < ehdr.e_shnum; i++) {
+		if (processed_sec[i])
+			continue;
+
+		if (get_sec(elf, i, &ehdr, &shname, &shdr, &data))
+			continue;
+
+		/* strncmp() stops at the end of names shorter than 4 bytes */
+		if (strncmp(shname, "nvme", 4) == 0) {
+			ret = load_prog(nvme_fd, data->d_buf, data->d_size,
+					slot);
+			if (ret != 0)
+				goto out;
+		}
+	}
+
+	ret = nr_maps;
+out:
+	/* also releases the section data obtained through elf above */
+	elf_end(elf);
+	return ret;
+}
+
+/* Print the name and id of the first nr_maps entries of map_data[]. */
+static void nvme_bpf_show_maps(int nr_maps)
+{
+	int i;
+
+	printf("%-32s %-32s\n", "Name", "ID");
+	printf("%-.32s %-.32s\n", dash, dash);
+
+	for (i = 0; i < nr_maps; i++)
+		printf("%-32s %-32d\n", map_data[i].name, map_data[i].id);
+}
+
+/* Print the first nr_maps entries of map_data[] as a JSON "Maps" array. */
+static void nvme_bpf_show_maps_json(int nr_maps)
+{
+	struct json_object *root = json_create_object();
+	struct json_array *maps;
+	int i;
+
+	maps = json_create_array();
+	for (i = 0; i < nr_maps; i++) {
+		struct bpf_map_data *map = &map_data[i];
+		struct json_object *map_attrs = json_create_object();
+
+		json_object_add_value_string(map_attrs, "Name", map->name);
+		json_object_add_value_int(map_attrs, "ID", map->id);
+		json_array_add_value_object(maps, map_attrs);
+	}
+	json_object_add_value_array(root, "Maps", maps);
+	json_print_object(root, NULL);
+	printf("\n");
+	json_free_object(root);
+}
+
+static const char *output_format_no_binary = "Output format: normal|json";
+
+/*
+ * "load-program" command: load the eBPF object file named by --program
+ * into the given slot and print the maps that were created for it.
+ */
+static int load_program(int argc, char **argv, struct command *cmd,
+			struct plugin *plugin)
+{
+	const char *desc = "Load an eBPF program into the NVMe controller";
+	const char *pname = "File name of the eBPF program";
+	const char *slot = "eBPF program slot";
+	enum nvme_print_flags flags;
+	int fd, pfd, ret = -EINVAL;
+	struct config {
+		char *output_format;
+		char *program;
+		__u32 slot;
+	};
+
+	struct config cfg = {
+		.output_format = "normal",
+	};
+
+	OPT_ARGS(opts) = {
+		OPT_FMT("output-format", 'o', &cfg.output_format,
+			output_format_no_binary),
+		OPT_FILE("program", 'p', &cfg.program, pname),
+		OPT_UINT("slot", 's', &cfg.slot, slot),
+		OPT_END()
+	};
+
+	fd = parse_and_open(argc, argv, desc, opts);
+	if (fd < 0)
+		return errno;
+
+	/* cfg.program is NULL unless the option parser stored a file name */
+	if (!cfg.program || !strlen(cfg.program)) {
+		fprintf(stderr, "program argument is required\n");
+		goto close_fd;
+	}
+
+	flags = validate_output_format(cfg.output_format);
+	if (flags < 0) {
+		ret = flags;
+		goto close_fd;
+	}
+	if (flags != JSON && flags != NORMAL) {
+		fprintf(stderr, "Invalid output format\n");
+		goto close_fd;
+	}
+
+	pfd = open(cfg.program, O_RDONLY);
+	if (pfd < 0) {
+		perror("failed to open program");
+		goto close_fd;
+	}
+
+	ret = nvme_bpf_load_program(fd, pfd, cfg.slot);
+	if (ret > 0) {
+		if (flags & JSON)
+			nvme_bpf_show_maps_json(ret);
+		else
+			nvme_bpf_show_maps(ret);
+	}
+	else if (ret < 0)
+		fprintf(stderr, "failed to load bpf program: %d\n", ret);
+
+	close(pfd);
+close_fd:
+	close(fd);
+	return ret > 0 ? 0 : ret;
+}
+
+/*
+ * Look up key in the map map_id on the controller and print the value.
+ *
+ * Returns the result of nvme_submit_admin_passthru().
+ */
+static int nvme_bpf_lookup_elem(int fd, uint32_t map_id, uint32_t key)
+{
+	struct nvme_passthru_cmd cmd = { };
+	unsigned long val = 0; // XXX: should be variable size
+	int ret;
+
+	cmd.opcode = nvme_admin_lookup_elem;
+	cmd.addr = (uintptr_t)&val;
+	cmd.data_len = sizeof(unsigned long);
+	cmd.cdw10 = sizeof(unsigned long);
+	cmd.cdw11 = map_id;
+	cmd.cdw12 = key;
+
+	ret = nvme_submit_admin_passthru(fd, &cmd);
+	if (ret == 0)
+		printf("val: %lu\n", val);
+	return ret;
+}
+
+/* "lookup-elem" command: print the value stored for --key in --map-id. */
+static int lookup_elem(int argc, char **argv, struct command *cmd,
+		       struct plugin *plugin)
+{
+	const char *desc = "Lookup an element in an eBPF map";
+	const char *map = "map identifier";
+	const char *key = "eBPF map key";
+	int fd, err = -1;
+	struct config {
+		uint32_t map_id;
+		uint32_t key;
+	};
+
+	struct config cfg = {
+	};
+
+	OPT_ARGS(opts) = {
+		OPT_UINT("map-id", 'm', &cfg.map_id, map),
+		OPT_UINT("key", 'k', &cfg.key, key),
+		OPT_END()
+	};
+
+	fd = parse_and_open(argc, argv, desc, opts);
+	if (fd < 0)
+		return errno;
+	err = nvme_bpf_lookup_elem(fd, cfg.map_id, cfg.key);
+	if (err > 0)
+		nvme_show_status(err);
+	else if (err)
+		fprintf(stderr, "failed to lookup bpf map element: %d\n", err);
+
+	close(fd);
+	return err;
+}
diff --git a/plugins/bpf/bpf.h b/plugins/bpf/bpf.h
new file mode 100644
index 0000000..5c44e3b
--- /dev/null
+++ b/plugins/bpf/bpf.h
@@ -0,0 +1,20 @@
+#undef CMD_INC_FILE
+#define CMD_INC_FILE plugins/bpf/bpf
+
+#if !defined(BPF_NVME) || defined(CMD_HEADER_MULTI_READ)
+#define BPF_NVME
+
+#include "cmd.h"
+
+PLUGIN(NAME("bpf", "eBPF program support"),
+	COMMAND_LIST(
+		ENTRY("load-program", "Load an eBPF program", load_program)
+		ENTRY("lookup-elem", "Lookup element in an eBPF map",
+		      lookup_elem)
+	)
+);
+
+#endif
+
+#include "define_cmd.h"
+
diff --git a/plugins/bpf/nvme-bpf-defs.h b/plugins/bpf/nvme-bpf-defs.h
new file mode 100644
index 0000000..28ca11d
--- /dev/null
+++ b/plugins/bpf/nvme-bpf-defs.h
@@ -0,0 +1,26 @@
+/*
+ * Shared context for the host and the downloaded eBPF program.
+ */
+#ifndef NVME_BPF_DEFS_H
+#define NVME_BPF_DEFS_H
+
+#define __section(NAME) __attribute__((section(NAME), used))
+
+struct nvme_bpf_map_def {
+	unsigned int type;
+	unsigned int key_size;
+	unsigned int value_size;
+	unsigned int max_entries;
+	unsigned int map_flags;
+};
+
+#define BPF_FUNC_nvme_read_data		5 /* BPF_FUNC_ktime_get_ns */
+#define BPF_FUNC_nvme_hello_world	6 /* BPF_FUNC_trace_printk */
+
+/* public context seen by the eBPF programs */
+struct nvme_bpf_ctx {
+	unsigned int size;
+	/* add things like LBA size or metadata types here?? */
+};
+
+#endif /* NVME_BPF_DEFS_H */
-- 
2.50.1