--- /dev/null
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Load eBPF code from an object file into a NVMe controller.
+ *
+ * Partially based on samples/bpf/bpf_load.c from the Linux kernel tree.
+ */
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <string.h>
+#include <stdlib.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <gelf.h>
+#include <libelf.h>
+#include <bcc/libbpf.h>
+#include <linux/nvme_ioctl.h>
+
+#include "nvme.h"
+#include "nvme-ioctl.h"
+#include "nvme-print.h"
+#include "nvme-bpf-defs.h"
+
+#define CREATE_CMD
+#include "bpf.h"
+
+static const char dash[100] = {[0 ... 99] = '-'}; /* 100 '-' characters and no NUL terminator; printed below only through a bounded "%.32s" */
+
+#define MAX_MAPS 32 /* capacity of the map_id[] and map_data[] tables */
+
+struct bpf_load_map_def { /* one map definition, filled by memcpy from the ELF maps section data */
+ unsigned int type; /* passed as cdw10 by create_map() */
+ unsigned int key_size; /* passed as cdw11 by create_map() */
+ unsigned int value_size; /* passed as cdw12 by create_map() */
+ unsigned int max_entries; /* passed as cdw13 by create_map() */
+ unsigned int map_flags; /* copied from the ELF data; not referenced by create_map() */
+ unsigned int inner_map_idx; /* copied from the ELF data; not referenced by create_map() */
+ unsigned int numa_node; /* copied from the ELF data; not referenced by create_map() */
+};
+
+struct bpf_map_data { /* loader-side record for one map found in the object file */
+ int id; /* value returned by create_map(); reset to -1 when a "maps" section is found */
+ char *name; /* strdup()'d copy of the map's symbol name */
+ size_t elf_offset; /* symbol st_value, used to match relocations against this map */
+ struct bpf_load_map_def def; /* definition copied from the ELF maps section */
+};
+static int map_id[MAX_MAPS]; /* filled by load_maps() with each create_map() result */
+static struct bpf_map_data map_data[MAX_MAPS]; /* maps parsed from the object file */
+static int map_data_count; /* set by nvme_bpf_load_program() to the number of maps loaded */
+
+static bool processed_sec[128]; /* indexed by ELF section number; true once that section has been handled */
+
+/*
+ * Fetch the header, name and data of ELF section number @i.
+ *
+ * Returns 0 when @shdr, *@shname and *@data have all been filled in.
+ * A non-zero return value identifies the step that failed:
+ *   1 - elf_getscn() found no such section
+ *   2 - the section header could not be read
+ *   3 - the section has no name or a zero size
+ *   4 - the section has no data, or a second data descriptor follows
+ */
+static int get_sec(Elf *elf, int i, GElf_Ehdr *ehdr, char **shname,
+		   GElf_Shdr *shdr, Elf_Data **data)
+{
+	Elf_Scn *section = elf_getscn(elf, i);
+	Elf_Data *first;
+	char *name;
+
+	if (section == NULL)
+		return 1;
+
+	if (gelf_getshdr(section, shdr) != shdr)
+		return 2;
+
+	/* The output is stored before it is validated, as callers may see it */
+	name = elf_strptr(elf, ehdr->e_shstrndx, shdr->sh_name);
+	*shname = name;
+	if (name == NULL || shdr->sh_size == 0)
+		return 3;
+
+	first = elf_getdata(section, NULL);
+	*data = first;
+	if (first == NULL)
+		return 4;
+	/* Only sections made of exactly one data descriptor are accepted */
+	if (elf_getdata(section, first) != NULL)
+		return 4;
+
+	return 0;
+}
+
+/*
+ * Apply the map relocations of one SHT_REL section to a BPF program.
+ *
+ * @data:    data of the relocation section
+ * @symbols: data of the symbol table the relocations refer to
+ * @shdr:    header of the relocation section (gives entry count)
+ * @insn:    instructions of the program section being patched
+ * @maps:    maps previously recorded by the loader
+ * @nr_maps: number of valid entries in @maps
+ *
+ * Every relocated instruction must be a 64-bit immediate load. Its
+ * src_reg is set to BPF_PSEUDO_MAP_FD and its imm to the id of the map
+ * whose elf_offset equals the relocation symbol's st_value.
+ *
+ * Returns 0 on success and 1 on the first malformed relocation.
+ */
+static int parse_relo_and_apply(Elf_Data *data, Elf_Data *symbols,
+				GElf_Shdr *shdr, struct bpf_insn *insn,
+				struct bpf_map_data *maps, int nr_maps)
+{
+	int i, nrels;
+
+	/* A zero entry size would otherwise be a division by zero */
+	if (!shdr->sh_entsize) {
+		printf("invalid relo section: zero entry size\n");
+		return 1;
+	}
+	nrels = shdr->sh_size / shdr->sh_entsize;
+
+	for (i = 0; i < nrels; i++) {
+		GElf_Sym sym;
+		GElf_Rel rel;
+		unsigned int insn_idx;
+		bool match = false;
+		int map_idx;
+
+		/* Do not continue with an uninitialised rel/sym on failure */
+		if (!gelf_getrel(data, i, &rel)) {
+			printf("failed to read relo entry %d\n", i);
+			return 1;
+		}
+
+		/*
+		 * NOTE(review): the instruction count is not passed in, so
+		 * insn_idx cannot be range-checked here.
+		 */
+		insn_idx = rel.r_offset / sizeof(struct bpf_insn);
+
+		if (!gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &sym)) {
+			printf("failed to read symbol for relo entry %d\n", i);
+			return 1;
+		}
+
+		if (insn[insn_idx].code != (BPF_LD | BPF_IMM | BPF_DW)) {
+			printf("invalid relo for insn[%u].code 0x%x\n",
+			       insn_idx, insn[insn_idx].code);
+			return 1;
+		}
+		insn[insn_idx].src_reg = BPF_PSEUDO_MAP_FD;
+
+		/* Match FD relocation against recorded map_data[] offset */
+		for (map_idx = 0; map_idx < nr_maps; map_idx++) {
+			if (maps[map_idx].elf_offset == sym.st_value) {
+				match = true;
+				break;
+			}
+		}
+		if (!match) {
+			printf("invalid relo for insn[%u] no map_data match\n",
+			       insn_idx);
+			return 1;
+		}
+		insn[insn_idx].imm = maps[map_idx].id;
+	}
+
+	return 0;
+}
+
+/*
+ * qsort() comparator ordering GElf_Sym entries by ascending st_value.
+ * Returns -1, 0 or 1.
+ */
+static int cmp_symbols(const void *l, const void *r)
+{
+	const GElf_Sym *lhs = l;
+	const GElf_Sym *rhs = r;
+
+	/* (a > b) - (a < b) yields the sign without risking overflow */
+	return (lhs->st_value > rhs->st_value) -
+	       (lhs->st_value < rhs->st_value);
+}
+
+/*
+ * Parse the ELF maps section into @maps.
+ *
+ * @maps:       output array with room for MAX_MAPS entries
+ * @maps_shndx: section index of the maps section
+ * @elf:        open ELF descriptor
+ * @symbols:    data of the symbol table
+ * @strtabidx:  section index of the string table holding symbol names
+ *
+ * For every symbol that lives in the maps section, the name is
+ * duplicated into maps[i].name, the symbol value is stored as
+ * maps[i].elf_offset and the definition is copied into maps[i].def.
+ * Entries are ordered by their offset inside the section.
+ *
+ * Returns the number of maps found, or a negative errno value:
+ * -EINVAL for malformed input, -ENOMEM on allocation failure, -E2BIG
+ * when more than MAX_MAPS maps are present, and -EFBIG when a larger
+ * on-disk definition carries non-zero bytes this loader does not know.
+ * Names duplicated before a failure are left in @maps.
+ */
+static int load_elf_maps_section(struct bpf_map_data *maps, int maps_shndx,
+				 Elf *elf, Elf_Data *symbols, int strtabidx)
+{
+	int map_sz_elf, map_sz_copy;
+	bool validate_zero = false;
+	Elf_Data *data_maps = NULL;
+	const unsigned char *base;
+	int i, nr_maps = 0;
+	size_t nr_syms;
+	GElf_Sym *sym;
+	Elf_Scn *scn;
+	int ret;
+
+	if (maps_shndx < 0)
+		return -EINVAL;
+	if (!symbols)
+		return -EINVAL;
+
+	/* Get data for maps section via elf index */
+	scn = elf_getscn(elf, maps_shndx);
+	if (scn)
+		data_maps = elf_getdata(scn, NULL);
+	if (!scn || !data_maps || !data_maps->d_buf) {
+		printf("Failed to get Elf_Data from maps section %d\n",
+		       maps_shndx);
+		return -EINVAL;
+	}
+	base = data_maps->d_buf;
+
+	/*
+	 * For each map get corresponding symbol table entry. The extra
+	 * slot at index MAX_MAPS is scratch space for the symbol being
+	 * inspected once the table is full.
+	 */
+	sym = calloc(MAX_MAPS + 1, sizeof(GElf_Sym));
+	if (!sym)
+		return -ENOMEM;
+
+	nr_syms = symbols->d_size / sizeof(GElf_Sym);
+	for (i = 0; (size_t)i < nr_syms; i++) {
+		if (!gelf_getsym(symbols, i, &sym[nr_maps]))
+			continue;
+		if (sym[nr_maps].st_shndx != maps_shndx)
+			continue;
+		/* One more map would overflow both sym[] and maps[] */
+		if (nr_maps == MAX_MAPS) {
+			printf("too many maps in ELF file (max %d)\n",
+			       MAX_MAPS);
+			ret = -E2BIG;
+			goto out;
+		}
+		/* Only increment if the symbol is in the maps section */
+		nr_maps++;
+	}
+
+	/* The size computation below divides by nr_maps */
+	if (nr_maps == 0) {
+		printf("no map symbols found in maps section %d\n",
+		       maps_shndx);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* Align to map_id[] order, via sort on offset in sym.st_value */
+	qsort(sym, nr_maps, sizeof(GElf_Sym), cmp_symbols);
+
+	/* Keeping compatible with ELF maps section changes
+	 * ------------------------------------------------
+	 * The program size of struct bpf_load_map_def is known by loader
+	 * code, but struct stored in ELF file can be different.
+	 *
+	 * Unfortunately sym[i].st_size is zero. To calculate the
+	 * struct size stored in the ELF file, assume all struct have
+	 * the same size, and simply divide with number of map
+	 * symbols.
+	 */
+	map_sz_elf = data_maps->d_size / nr_maps;
+	map_sz_copy = sizeof(struct bpf_load_map_def);
+	if (map_sz_elf < map_sz_copy) {
+		/*
+		 * Backward compat, loading older ELF file with
+		 * smaller struct, keeping remaining bytes zero.
+		 */
+		map_sz_copy = map_sz_elf;
+	} else if (map_sz_elf > map_sz_copy) {
+		/*
+		 * Forward compat, loading newer ELF file with larger
+		 * struct with unknown features. Assume zero means
+		 * feature not used. Thus, validate rest of struct
+		 * data is zero.
+		 */
+		validate_zero = true;
+	}
+
+	/* Memcpy relevant part of ELF maps data to loader maps */
+	for (i = 0; i < nr_maps; i++) {
+		const unsigned char *def, *addr, *end;
+		const char *map_name;
+		size_t offset;
+
+		/* Symbol value is offset into ELF maps section data area */
+		offset = sym[i].st_value;
+		if (offset > data_maps->d_size ||
+		    (size_t)map_sz_elf > data_maps->d_size - offset) {
+			printf("map symbol %d lies outside the maps section\n",
+			       i);
+			ret = -EINVAL;
+			goto out;
+		}
+
+		map_name = elf_strptr(elf, strtabidx, sym[i].st_name);
+		if (!map_name) {
+			printf("failed to get name of map %d\n", i);
+			ret = -EINVAL;
+			goto out;
+		}
+		maps[i].name = strdup(map_name);
+		if (!maps[i].name) {
+			/* Capture errno before printf() can change it */
+			ret = -errno;
+			printf("strdup(%s): %s(%d)\n", map_name,
+			       strerror(-ret), -ret);
+			goto out;
+		}
+
+		def = base + offset;
+		maps[i].elf_offset = offset;
+		memset(&maps[i].def, 0, sizeof(struct bpf_load_map_def));
+		memcpy(&maps[i].def, def, map_sz_copy);
+
+		/* Verify no newer features were requested */
+		if (validate_zero) {
+			addr = def + map_sz_copy;
+			end = def + map_sz_elf;
+			for (; addr < end; addr++) {
+				if (*addr != 0) {
+					ret = -EFBIG;
+					goto out;
+				}
+			}
+		}
+	}
+
+	ret = nr_maps;
+out:
+	free(sym);
+	return ret;
+}
+
+/*
+ * Ask the controller to create one map.
+ *
+ * The map type, key size, value size and entry count from @map->def are
+ * sent in cdw10..cdw13 of an nvme_admin_create_map admin command.
+ *
+ * Returns cmd.result on success, the negative value returned by
+ * nvme_submit_admin_passthru() when submission fails, or -EIO when the
+ * command completes with a non-zero status.
+ */
+static int create_map(int nvme_fd, struct bpf_map_data *map)
+{
+	const struct bpf_load_map_def *def = &map->def;
+	struct nvme_passthru_cmd cmd;
+	int status;
+
+	memset(&cmd, 0, sizeof(cmd));
+	cmd.opcode = nvme_admin_create_map;
+	cmd.cdw10 = def->type;
+	cmd.cdw11 = def->key_size;
+	cmd.cdw12 = def->value_size;
+	cmd.cdw13 = def->max_entries;
+
+	status = nvme_submit_admin_passthru(nvme_fd, &cmd);
+	if (status == 0)
+		return cmd.result;
+
+	if (status < 0) {
+		perror("map create");
+		return status;
+	}
+
+	/* Positive values are a status code reported for the command */
+	printf("map create: 0x%x\n", status);
+	return -EIO;
+}
+
+/*
+ * Create the first @nr_maps entries of @maps on the controller.
+ *
+ * Each create_map() result is stored in map_id[] and, when it is not
+ * negative, in the entry's id field. Stops at the first failure.
+ *
+ * Returns 0 on success, -EINVAL if any map could not be created.
+ */
+static int load_maps(int nvme_fd, struct bpf_map_data *maps, int nr_maps)
+{
+	int idx = 0;
+
+	while (idx < nr_maps) {
+		struct bpf_map_data *cur = &maps[idx];
+		int id = create_map(nvme_fd, cur);
+
+		map_id[idx] = id;
+		if (id < 0) {
+			printf("failed to create map %d (%s): %d %s\n",
+			       idx, cur->name, errno, strerror(errno));
+			return -EINVAL;
+		}
+		cur->id = id;
+		idx++;
+	}
+
+	return 0;
+}
+
+/*
+ * Send a BPF program to the controller.
+ *
+ * @nvme_fd: open NVMe device
+ * @prog:    instructions to transfer
+ * @size:    size of @prog in bytes
+ * @slot:    program slot, passed in cdw11
+ *
+ * cdw10 carries the instruction count minus one, so @size must cover at
+ * least one whole instruction; otherwise the subtraction would wrap.
+ *
+ * Returns 0 on success, -EINVAL for an unusable @size, the negative
+ * value returned by nvme_submit_admin_passthru() when submission fails,
+ * or -EIO when the command completes with a non-zero status.
+ */
+static int load_prog(int nvme_fd, struct bpf_insn *prog, int size,
+		     __u32 slot)
+{
+	struct nvme_passthru_cmd cmd = {
+		.opcode = nvme_admin_create_program,
+		.addr = (uintptr_t) prog,
+		.cdw11 = slot,
+	};
+	int ret;
+
+	if (size < (int)sizeof(struct bpf_insn) ||
+	    size % sizeof(struct bpf_insn)) {
+		printf("program: invalid size %d\n", size);
+		return -EINVAL;
+	}
+	cmd.cdw10 = (size / sizeof(struct bpf_insn)) - 1;
+	cmd.data_len = size;
+
+	ret = nvme_submit_admin_passthru(nvme_fd, &cmd);
+	if (ret < 0) {
+		perror("program");
+		return ret;
+	}
+	if (ret > 0) {
+		printf("program: 0x%x\n", ret);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+/*
+ * Load the eBPF object file open on @fd into the controller on @nvme_fd.
+ *
+ * The file is processed in three passes over its sections:
+ *   1. find the "maps" section and the symbol table, then create the
+ *      maps on the controller;
+ *   2. apply map relocations to the executable program sections;
+ *   3. send every remaining section whose name starts with "nvme" to
+ *      program slot @slot.
+ *
+ * Returns the number of maps created (possibly 0) on success, or a
+ * negative value on failure. The ELF descriptor is released on every
+ * path; @fd itself is left open for the caller to close.
+ */
+static int nvme_bpf_load_program(int nvme_fd, int fd, __u32 slot)
+{
+	const size_t max_sections =
+		sizeof(processed_sec) / sizeof(processed_sec[0]);
+	GElf_Ehdr ehdr;
+	GElf_Shdr shdr, shdr_prog;
+	Elf_Data *data_maps = NULL;
+	Elf_Data *symbols = NULL;
+	Elf_Data *data, *data_prog;
+	Elf *elf;
+	int maps_shndx = -1;
+	int strtabidx = -1;
+	int nr_maps = 0;
+	char *shname, *shname_prog;
+	int ret = -EINVAL;
+	int i;
+
+	if (elf_version(EV_CURRENT) == EV_NONE)
+		return -EINVAL;
+
+	elf = elf_begin(fd, ELF_C_READ, NULL);
+	if (!elf)
+		return -EINVAL;
+	if (gelf_getehdr(elf, &ehdr) != &ehdr)
+		goto out;
+
+	/* processed_sec[] is indexed by section number below */
+	if (ehdr.e_shnum > max_sections) {
+		printf("too many ELF sections: %u (max %zu)\n",
+		       (unsigned int)ehdr.e_shnum, max_sections);
+		goto out;
+	}
+
+	/* scan over all elf sections to get map info */
+	for (i = 1; i < ehdr.e_shnum; i++) {
+		if (get_sec(elf, i, &ehdr, &shname, &shdr, &data))
+			continue;
+
+		if (strcmp(shname, "maps") == 0) {
+			int j;
+
+			maps_shndx = i;
+			data_maps = data;
+			for (j = 0; j < MAX_MAPS; j++)
+				map_data[j].id = -1;
+		} else if (shdr.sh_type == SHT_SYMTAB) {
+			strtabidx = shdr.sh_link;
+			symbols = data;
+		}
+	}
+
+	if (!symbols) {
+		printf("missing SHT_SYMTAB section\n");
+		goto out;
+	}
+
+	if (data_maps) {
+		nr_maps = load_elf_maps_section(map_data, maps_shndx,
+						elf, symbols, strtabidx);
+		if (nr_maps < 0) {
+			printf("Error: Failed loading ELF maps (errno:%d):%s\n",
+			       nr_maps, strerror(-nr_maps));
+			goto out;
+		}
+		if (load_maps(nvme_fd, map_data, nr_maps))
+			goto out;
+		map_data_count = nr_maps;
+		processed_sec[maps_shndx] = true;
+	}
+
+	/* process all relo sections, and rewrite bpf insns for maps */
+	for (i = 1; i < ehdr.e_shnum; i++) {
+		if (processed_sec[i])
+			continue;
+
+		if (get_sec(elf, i, &ehdr, &shname, &shdr, &data))
+			continue;
+
+		if (shdr.sh_type == SHT_REL) {
+			struct bpf_insn *insns;
+
+			/* locate prog sec that need map fixup (relocations) */
+			if (get_sec(elf, shdr.sh_info, &ehdr, &shname_prog,
+				    &shdr_prog, &data_prog))
+				continue;
+
+			if (shdr_prog.sh_type != SHT_PROGBITS ||
+			    !(shdr_prog.sh_flags & SHF_EXECINSTR))
+				continue;
+
+			insns = (struct bpf_insn *)data_prog->d_buf;
+			processed_sec[i] = true; /* relo section */
+
+			/*
+			 * NOTE(review): a relocation failure is reported by
+			 * the callee but does not abort the load here.
+			 */
+			if (parse_relo_and_apply(data, symbols, &shdr, insns,
+						 map_data, nr_maps))
+				continue;
+		}
+	}
+
+	for (i = 1; i < ehdr.e_shnum; i++) {
+		if (processed_sec[i])
+			continue;
+
+		if (get_sec(elf, i, &ehdr, &shname, &shdr, &data))
+			continue;
+
+		/* strncmp() stops at the terminator of a short name */
+		if (strncmp(shname, "nvme", 4) == 0) {
+			ret = load_prog(nvme_fd, data->d_buf, data->d_size,
+					slot);
+			if (ret != 0)
+				goto out;
+		}
+	}
+
+	ret = nr_maps;
+out:
+	elf_end(elf);
+	return ret;
+}
+
+/*
+ * Print the first @nr_maps entries of map_data[] as a two-column
+ * "Name / ID" table on stdout.
+ */
+static void nvme_bpf_show_maps(int nr_maps)
+{
+	int row;
+
+	printf("%-32s %-32s\n", "Name", "ID");
+	/* dash[] has no terminator; the precision bounds what is read */
+	printf("%-.32s %-.32s\n", dash, dash);
+
+	for (row = 0; row < nr_maps; row++) {
+		const struct bpf_map_data *map = &map_data[row];
+
+		printf("%-32s %-32u\n", map->name, map->id);
+	}
+}
+
+/*
+ * Print the first @nr_maps entries of map_data[] as a JSON object with
+ * a "Maps" array whose elements carry "Name" and "ID".
+ */
+static void nvme_bpf_show_maps_json(int nr_maps)
+{
+	struct json_object *root = json_create_object();
+	struct json_array *list = json_create_array();
+	int idx = 0;
+
+	while (idx < nr_maps) {
+		struct json_object *entry = json_create_object();
+
+		json_object_add_value_string(entry, "Name",
+					     map_data[idx].name);
+		json_object_add_value_int(entry, "ID", map_data[idx].id);
+		json_array_add_value_object(list, entry);
+		idx++;
+	}
+
+	json_object_add_value_array(root, "Maps", list);
+	json_print_object(root, NULL);
+	printf("\n");
+	json_free_object(root);
+}
+
+static const char *output_format_no_binary = "Output format: normal|json"; /* description string given to OPT_FMT for --output-format */
+
+/*
+ * Command handler: load an eBPF object file into the NVMe controller.
+ *
+ * Options: --output-format/-o (normal or json), --program/-p (path of
+ * the object file, required) and --slot/-s (program slot).
+ *
+ * On success the created maps, if any, are printed in the selected
+ * format. Returns 0 on success, errno if the device could not be
+ * opened, or a negative value on any other failure.
+ */
+static int load_program(int argc, char **argv, struct command *cmd,
+			struct plugin *plugin)
+{
+	const char *desc = "Load an eBPF program into the NVMe controller";
+	const char *pname = "File name of the eBPF program";
+	const char *slot = "eBPF program slot";
+	enum nvme_print_flags flags;
+	int fd, pfd, ret = -EINVAL;
+	struct config {
+		char *output_format;
+		char *program;
+		__u32 slot;
+	};
+
+	struct config cfg = {
+		.output_format = "normal",
+	};
+
+	OPT_ARGS(opts) = {
+		OPT_FMT("output-format", 'o', &cfg.output_format,
+			output_format_no_binary),
+		OPT_FILE("program", 'p', &cfg.program, pname),
+		OPT_UINT("slot", 's', &cfg.slot, slot),
+		OPT_END()
+	};
+
+	fd = parse_and_open(argc, argv, desc, opts);
+	if (fd < 0)
+		return errno;
+
+	/* cfg.program is still NULL when --program was not given */
+	if (!cfg.program || !strlen(cfg.program)) {
+		fprintf(stderr, "program argument is required\n");
+		goto close_fd;
+	}
+
+	flags = validate_output_format(cfg.output_format);
+	if (flags < 0) {
+		ret = flags;
+		goto close_fd;
+	}
+	if (flags != JSON && flags != NORMAL) {
+		fprintf(stderr, "Invalid output format\n");
+		goto close_fd;
+	}
+
+	pfd = open(cfg.program, O_RDONLY);
+	if (pfd < 0) {
+		perror("failed to open program");
+		goto close_fd;
+	}
+
+	/* A positive result is the number of maps to report */
+	ret = nvme_bpf_load_program(fd, pfd, cfg.slot);
+	if (ret > 0) {
+		if (flags & JSON)
+			nvme_bpf_show_maps_json(ret);
+		else
+			nvme_bpf_show_maps(ret);
+	} else if (ret < 0) {
+		fprintf(stderr, "failed to load bpf program: %d\n", ret);
+	}
+
+	close(pfd);
+close_fd:
+	close(fd);
+	return ret > 0 ? 0 : ret;
+}
+
+/*
+ * Look up @key in map @map_id on the controller and print the value.
+ *
+ * Issues an nvme_admin_lookup_elem admin command with an unsigned long
+ * sized data buffer; the buffer size is also passed in cdw10, the map
+ * in cdw11 and the key in cdw12. The value is printed only when the
+ * command returns 0.
+ *
+ * Returns whatever nvme_submit_admin_passthru() returned.
+ */
+static int nvme_bpf_lookup_elem(int fd, uint32_t map_id, uint32_t key)
+{
+	unsigned long val; // XXX: should be variable size
+	struct nvme_passthru_cmd cmd = {
+		.opcode = nvme_admin_lookup_elem,
+		.addr = (uintptr_t)&val,
+		.data_len = sizeof(val),
+		.cdw10 = sizeof(val),
+		.cdw11 = map_id,
+		.cdw12 = key,
+	};
+	int status = nvme_submit_admin_passthru(fd, &cmd);
+
+	if (status != 0)
+		return status;
+
+	printf("val: %lu\n", val);
+	return 0;
+}
+
+/*
+ * Command handler: look up one element of an eBPF map.
+ *
+ * Options: --map-id/-m and --key/-k.
+ *
+ * Returns errno if the device could not be opened, otherwise the
+ * result of nvme_bpf_lookup_elem(). A positive result is shown through
+ * nvme_show_status(); a negative one is reported on stderr.
+ */
+static int lookup_elem(int argc, char **argv, struct command *cmd,
+		       struct plugin *plugin)
+{
+	const char *desc = "Lookup an element in an eBPF map";
+	const char *map = "map identifier";
+	const char *key = "eBPF map key";
+	struct config {
+		uint32_t map_id;
+		uint32_t key;
+	};
+	struct config cfg = {
+	};
+	int fd;
+	int err;
+
+	OPT_ARGS(opts) = {
+		OPT_UINT("map-id", 'm', &cfg.map_id, map),
+		OPT_UINT("key", 'k', &cfg.key, key),
+		OPT_END()
+	};
+
+	fd = parse_and_open(argc, argv, desc, opts);
+	if (fd < 0)
+		return errno;
+
+	err = nvme_bpf_lookup_elem(fd, cfg.map_id, cfg.key);
+	if (err < 0)
+		fprintf(stderr, "failed to lookup bpf map element: %d\n", err);
+	else if (err > 0)
+		nvme_show_status(err);
+
+	close(fd);
+	return err;
+}