#include "progs/test_cls_redirect.h"
 #include "test_cls_redirect.skel.h"
+#include "test_cls_redirect_subprogs.skel.h"
 
 #define ENCAP_IP INADDR_LOOPBACK
 #define ENCAP_PORT (1234)
 
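+/* "duration" is referenced by the CHECK() macros used below. */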
+static int duration = 0;
+
 struct addr_port {
        in_port_t port;
        union {
                        close(fds[i]);
 }
 
-void test_cls_redirect(void)
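+/* Shared body for both skeleton variants: sets up sockets and runs
+ * every test case against the given classifier program.
+ */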
+static void test_cls_redirect_common(struct bpf_program *prog)
 {
-       struct test_cls_redirect *skel = NULL;
        struct bpf_prog_test_run_attr tattr = {};
        int families[] = { AF_INET, AF_INET6 };
        struct sockaddr_storage ss;
        struct sockaddr *addr;
        socklen_t slen;
        int i, j, err;
-
        int servers[__NR_KIND][ARRAY_SIZE(families)] = {};
        int conns[__NR_KIND][ARRAY_SIZE(families)] = {};
        struct tuple tuples[__NR_KIND][ARRAY_SIZE(families)];
 
-       skel = test_cls_redirect__open();
-       if (CHECK_FAIL(!skel))
-               return;
-
-       skel->rodata->ENCAPSULATION_IP = htonl(ENCAP_IP);
-       skel->rodata->ENCAPSULATION_PORT = htons(ENCAP_PORT);
-
-       if (CHECK_FAIL(test_cls_redirect__load(skel)))
-               goto cleanup;
-
        addr = (struct sockaddr *)&ss;
        for (i = 0; i < ARRAY_SIZE(families); i++) {
                slen = prepare_addr(&ss, families[i]);
                        goto cleanup;
        }
 
-       tattr.prog_fd = bpf_program__fd(skel->progs.cls_redirect);
+       tattr.prog_fd = bpf_program__fd(prog);
        for (i = 0; i < ARRAY_SIZE(tests); i++) {
                struct test_cfg *test = &tests[i];
 
        }
 
 cleanup:
-       test_cls_redirect__destroy(skel);
        close_fds((int *)servers, sizeof(servers) / sizeof(servers[0][0]));
        close_fds((int *)conns, sizeof(conns) / sizeof(conns[0][0]));
 }
+
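+/* Variant with all helpers force-inlined into the entry program. */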
+static void test_cls_redirect_inlined(void)
+{
+       struct test_cls_redirect *skel;
+       int err;
+
+       skel = test_cls_redirect__open();
+       if (CHECK(!skel, "skel_open", "failed\n"))
+               return;
+
+       skel->rodata->ENCAPSULATION_IP = htonl(ENCAP_IP);
+       skel->rodata->ENCAPSULATION_PORT = htons(ENCAP_PORT);
+
+       err = test_cls_redirect__load(skel);
+       if (CHECK(err, "skel_load", "failed: %d\n", err))
+               goto cleanup;
+
+       test_cls_redirect_common(skel->progs.cls_redirect);
+
+cleanup:
+       test_cls_redirect__destroy(skel);
+}
+
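+/* Same tests, but against the variant built with SUBPROGS defined,
+ * i.e. with helpers kept as BPF-to-BPF subprogram calls.
+ */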
+static void test_cls_redirect_subprogs(void)
+{
+       struct test_cls_redirect_subprogs *skel;
+       int err;
+
+       skel = test_cls_redirect_subprogs__open();
+       if (CHECK(!skel, "skel_open", "failed\n"))
+               return;
+
+       skel->rodata->ENCAPSULATION_IP = htonl(ENCAP_IP);
+       skel->rodata->ENCAPSULATION_PORT = htons(ENCAP_PORT);
+
+       err = test_cls_redirect_subprogs__load(skel);
+       if (CHECK(err, "skel_load", "failed: %d\n", err))
+               goto cleanup;
+
+       test_cls_redirect_common(skel->progs.cls_redirect);
+
+cleanup:
+       test_cls_redirect_subprogs__destroy(skel);
+}
+
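+/* Entry point: exercise both program variants as separate subtests. */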
+void test_cls_redirect(void)
+{
+       if (test__start_subtest("cls_redirect_inlined"))
+               test_cls_redirect_inlined();
+       if (test__start_subtest("cls_redirect_subprogs"))
+               test_cls_redirect_subprogs();
+}
 
 
 #include "test_cls_redirect.h"
 
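+/* When SUBPROGS is defined, keep helpers as standalone (noinline) BPF
+ * subprograms; otherwise force-inline them into the entry program.
+ */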
+#ifdef SUBPROGS
+#define INLINING __noinline
+#else
+#define INLINING __always_inline
+#endif
+
 #define offsetofend(TYPE, MEMBER) \
        (offsetof(TYPE, MEMBER) + sizeof((((TYPE *)0)->MEMBER)))
 
        uint8_t *const tail;
 } buf_t;
 
-static size_t buf_off(const buf_t *buf)
+static __always_inline size_t buf_off(const buf_t *buf)
 {
        /* Clang seems to optimize constructs like
         *    a - b + c
        return off;
 }
 
-static bool buf_copy(buf_t *buf, void *dst, size_t len)
+static __always_inline bool buf_copy(buf_t *buf, void *dst, size_t len)
 {
        if (bpf_skb_load_bytes(buf->skb, buf_off(buf), dst, len)) {
                return false;
        return true;
 }
 
-static bool buf_skip(buf_t *buf, const size_t len)
+static __always_inline bool buf_skip(buf_t *buf, const size_t len)
 {
        /* Check whether off + len is valid in the non-linear part. */
        if (buf_off(buf) + len > buf->skb->len) {
  * If scratch is not NULL, the function will attempt to load non-linear
  * data via bpf_skb_load_bytes. On success, scratch is returned.
  */
-static void *buf_assign(buf_t *buf, const size_t len, void *scratch)
+static __always_inline void *buf_assign(buf_t *buf, const size_t len, void *scratch)
 {
        if (buf->head + len > buf->tail) {
                if (scratch == NULL) {
        return ptr;
 }
 
-static bool pkt_skip_ipv4_options(buf_t *buf, const struct iphdr *ipv4)
+static INLINING bool pkt_skip_ipv4_options(buf_t *buf, const struct iphdr *ipv4)
 {
        if (ipv4->ihl <= 5) {
                return true;
        return buf_skip(buf, (ipv4->ihl - 5) * 4);
 }
 
-static bool ipv4_is_fragment(const struct iphdr *ip)
+static INLINING bool ipv4_is_fragment(const struct iphdr *ip)
 {
        uint16_t frag_off = ip->frag_off & bpf_htons(IP_OFFSET_MASK);
        return (ip->frag_off & bpf_htons(IP_MF)) != 0 || frag_off > 0;
 }
 
-static struct iphdr *pkt_parse_ipv4(buf_t *pkt, struct iphdr *scratch)
+static __always_inline struct iphdr *pkt_parse_ipv4(buf_t *pkt, struct iphdr *scratch)
 {
        struct iphdr *ipv4 = buf_assign(pkt, sizeof(*ipv4), scratch);
        if (ipv4 == NULL) {
 }
 
 /* Parse the L4 ports from a packet, assuming a layout like TCP or UDP. */
-static bool pkt_parse_icmp_l4_ports(buf_t *pkt, flow_ports_t *ports)
+static INLINING bool pkt_parse_icmp_l4_ports(buf_t *pkt, flow_ports_t *ports)
 {
        if (!buf_copy(pkt, ports, sizeof(*ports))) {
                return false;
        return true;
 }
 
-static uint16_t pkt_checksum_fold(uint32_t csum)
+static INLINING uint16_t pkt_checksum_fold(uint32_t csum)
 {
        /* The highest reasonable value for an IPv4 header
         * checksum requires two folds, so we just do that always.
        return (uint16_t)~csum;
 }
 
-static void pkt_ipv4_checksum(struct iphdr *iph)
+static INLINING void pkt_ipv4_checksum(struct iphdr *iph)
 {
        iph->check = 0;
 
        iph->check = pkt_checksum_fold(acc);
 }
 
-static bool pkt_skip_ipv6_extension_headers(buf_t *pkt,
-                                           const struct ipv6hdr *ipv6,
-                                           uint8_t *upper_proto,
-                                           bool *is_fragment)
+static INLINING
+bool pkt_skip_ipv6_extension_headers(buf_t *pkt,
+                                    const struct ipv6hdr *ipv6,
+                                    uint8_t *upper_proto,
+                                    bool *is_fragment)
 {
        /* We understand five extension headers.
         * https://tools.ietf.org/html/rfc8200#section-4.1 states that all
  * scratch is allocated on the stack. However, this usage should be safe since
  * it's the caller's stack, after all.
  */
-static inline __attribute__((__always_inline__)) struct ipv6hdr *
+static __always_inline struct ipv6hdr *
 pkt_parse_ipv6(buf_t *pkt, struct ipv6hdr *scratch, uint8_t *proto,
               bool *is_fragment)
 {
 
 /* Global metrics, per CPU
  */
-struct bpf_map_def metrics_map SEC("maps") = {
-       .type = BPF_MAP_TYPE_PERCPU_ARRAY,
-       .key_size = sizeof(unsigned int),
-       .value_size = sizeof(metrics_t),
-       .max_entries = 1,
-};
-
-static metrics_t *get_global_metrics(void)
+struct {
+       __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+       __uint(max_entries, 1);
+       __type(key, unsigned int);
+       __type(value, metrics_t);
+} metrics_map SEC(".maps");
+
+static INLINING metrics_t *get_global_metrics(void)
 {
        uint64_t key = 0;
        return bpf_map_lookup_elem(&metrics_map, &key);
 }
 
-static ret_t accept_locally(struct __sk_buff *skb, encap_headers_t *encap)
+static INLINING ret_t accept_locally(struct __sk_buff *skb, encap_headers_t *encap)
 {
        const int payload_off =
                sizeof(*encap) +
        return bpf_redirect(skb->ifindex, BPF_F_INGRESS);
 }
 
-static ret_t forward_with_gre(struct __sk_buff *skb, encap_headers_t *encap,
-                             struct in_addr *next_hop, metrics_t *metrics)
+static INLINING ret_t forward_with_gre(struct __sk_buff *skb, encap_headers_t *encap,
+                                      struct in_addr *next_hop, metrics_t *metrics)
 {
        metrics->forwarded_packets_total_gre++;
 
        return bpf_redirect(skb->ifindex, 0);
 }
 
-static ret_t forward_to_next_hop(struct __sk_buff *skb, encap_headers_t *encap,
-                                struct in_addr *next_hop, metrics_t *metrics)
+static INLINING ret_t forward_to_next_hop(struct __sk_buff *skb, encap_headers_t *encap,
+                                         struct in_addr *next_hop, metrics_t *metrics)
 {
        /* swap L2 addresses */
        /* This assumes that packets are received from a router.
        return bpf_redirect(skb->ifindex, 0);
 }
 
-static ret_t skip_next_hops(buf_t *pkt, int n)
+static INLINING ret_t skip_next_hops(buf_t *pkt, int n)
 {
        switch (n) {
        case 1:
  * pkt is positioned just after the variable length GLB header
  * iff the call is successful.
  */
-static ret_t get_next_hop(buf_t *pkt, encap_headers_t *encap,
-                         struct in_addr *next_hop)
+static INLINING ret_t get_next_hop(buf_t *pkt, encap_headers_t *encap,
+                                  struct in_addr *next_hop)
 {
        if (encap->unigue.next_hop > encap->unigue.hop_count) {
                return TC_ACT_SHOT;
  * return value, and calling code works while still being "generic" to
  * IPv4 and IPv6.
  */
-static uint64_t fill_tuple(struct bpf_sock_tuple *tuple, void *iph,
-                          uint64_t iphlen, uint16_t sport, uint16_t dport)
+static INLINING uint64_t fill_tuple(struct bpf_sock_tuple *tuple, void *iph,
+                                   uint64_t iphlen, uint16_t sport, uint16_t dport)
 {
        switch (iphlen) {
        case sizeof(struct iphdr): {
        }
 }
 
-static verdict_t classify_tcp(struct __sk_buff *skb,
-                             struct bpf_sock_tuple *tuple, uint64_t tuplen,
-                             void *iph, struct tcphdr *tcp)
+static INLINING verdict_t classify_tcp(struct __sk_buff *skb,
+                                      struct bpf_sock_tuple *tuple, uint64_t tuplen,
+                                      void *iph, struct tcphdr *tcp)
 {
        struct bpf_sock *sk =
                bpf_skc_lookup_tcp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0);
        return UNKNOWN;
 }
 
-static verdict_t classify_udp(struct __sk_buff *skb,
-                             struct bpf_sock_tuple *tuple, uint64_t tuplen)
+static INLINING verdict_t classify_udp(struct __sk_buff *skb,
+                                      struct bpf_sock_tuple *tuple, uint64_t tuplen)
 {
        struct bpf_sock *sk =
                bpf_sk_lookup_udp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0);
        return UNKNOWN;
 }
 
-static verdict_t classify_icmp(struct __sk_buff *skb, uint8_t proto,
-                              struct bpf_sock_tuple *tuple, uint64_t tuplen,
-                              metrics_t *metrics)
+static INLINING verdict_t classify_icmp(struct __sk_buff *skb, uint8_t proto,
+                                       struct bpf_sock_tuple *tuple, uint64_t tuplen,
+                                       metrics_t *metrics)
 {
        switch (proto) {
        case IPPROTO_TCP:
        }
 }
 
-static verdict_t process_icmpv4(buf_t *pkt, metrics_t *metrics)
+static INLINING verdict_t process_icmpv4(buf_t *pkt, metrics_t *metrics)
 {
        struct icmphdr icmp;
        if (!buf_copy(pkt, &icmp, sizeof(icmp))) {
                             sizeof(tuple.ipv4), metrics);
 }
 
-static verdict_t process_icmpv6(buf_t *pkt, metrics_t *metrics)
+static INLINING verdict_t process_icmpv6(buf_t *pkt, metrics_t *metrics)
 {
        struct icmp6hdr icmp6;
        if (!buf_copy(pkt, &icmp6, sizeof(icmp6))) {
                             metrics);
 }
 
-static verdict_t process_tcp(buf_t *pkt, void *iph, uint64_t iphlen,
-                            metrics_t *metrics)
+static INLINING verdict_t process_tcp(buf_t *pkt, void *iph, uint64_t iphlen,
+                                     metrics_t *metrics)
 {
        metrics->l4_protocol_packets_total_tcp++;
 
        return classify_tcp(pkt->skb, &tuple, tuplen, iph, tcp);
 }
 
-static verdict_t process_udp(buf_t *pkt, void *iph, uint64_t iphlen,
-                            metrics_t *metrics)
+static INLINING verdict_t process_udp(buf_t *pkt, void *iph, uint64_t iphlen,
+                                     metrics_t *metrics)
 {
        metrics->l4_protocol_packets_total_udp++;
 
        return classify_udp(pkt->skb, &tuple, tuplen);
 }
 
-static verdict_t process_ipv4(buf_t *pkt, metrics_t *metrics)
+static INLINING verdict_t process_ipv4(buf_t *pkt, metrics_t *metrics)
 {
        metrics->l3_protocol_packets_total_ipv4++;
 
        }
 }
 
-static verdict_t process_ipv6(buf_t *pkt, metrics_t *metrics)
+static INLINING verdict_t process_ipv6(buf_t *pkt, metrics_t *metrics)
 {
        metrics->l3_protocol_packets_total_ipv6++;