Program type BPF_PROG_TYPE_STRUCT_OPS

v5.6

This program type allows for the implementation of certain kernel features in BPF.

Usage

The kernel uses the "struct ops" pattern in C to implement interfaces. The kernel defines a struct with function pointers as field types. An implementation can create an instance of this struct and provide pointers to its own functions that implement the signatures.
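
For illustration, a minimal interface of this shape might look like the following sketch (the struct and function names are made up for this example):

struct my_ops {
    int  (*init)(void *priv);
    void (*release)(void *priv);
};

static int my_init(void *priv)
{
    return 0;
}

static void my_release(void *priv)
{
}

/* An implementation fills in the function pointers with its own functions. */
static struct my_ops my_impl = {
    .init    = my_init,
    .release = my_release,
};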

This program type allows such struct implementations to be created with BPF, so locations in the kernel that allow it can use BPF to implement the functionality, for example the TCP congestion control algorithm.

Context

The context provided to each function is an array of 64-bit integers, representing the argument list of the function in the struct it is implementing. The BPF_PROG macro from libbpf can be used to write your program in a recognizable way; the macro takes care of unpacking the array into named variables of the provided types.
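
As a sketch, assuming a hypothetical struct_ops member with the signature void (*init)(struct sock *sk) (the section and function names below are illustrative), the same program can be written with and without the macro:

/* Without the macro: the raw context is an array of 64-bit values,
 * one per argument of the implemented function. */
SEC("struct_ops/example_init")
void example_init_raw(unsigned long long *ctx)
{
    struct sock *sk = (struct sock *)ctx[0];

    /* ... use sk ... */
}

/* With BPF_PROG: the array is unpacked into named, typed arguments. */
SEC("struct_ops/example_init")
void BPF_PROG(example_init, struct sock *sk)
{
    /* ... use sk ... */
}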

Attachment

User perspective

From an eBPF user perspective, all one has to do is create an instance of the struct to be used as the implementation; any function pointer fields you wish to use should point to eBPF programs. This struct must be placed in the .struct_ops section like so:

/* Copyright (c) 2019 Facebook */

SEC(".struct_ops")
struct tcp_congestion_ops dctcp_nouse = {
    .init       = (void *)dctcp_init,
    .set_state  = (void *)dctcp_state,
    .flags      = TCP_CONG_NEEDS_ECN,
    .name       = "bpf_dctcp_nouse",
};

The full file is turned into a light skeleton. After loading the light skeleton, the struct appears as a map in the skel->maps structure. This map is then passed to the bpf_map__attach_struct_ops function to attach it.
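
A rough user-space sketch of this flow, using the plain libbpf bpf_object API rather than a generated skeleton (the object file name and map name are assumptions based on the example below):

#include <bpf/libbpf.h>

int main(void)
{
    struct bpf_object *obj;
    struct bpf_map *map;
    struct bpf_link *link;

    obj = bpf_object__open_file("bpf_dctcp.bpf.o", NULL);
    if (!obj)
        return 1;

    if (bpf_object__load(obj))
        return 1;

    /* The struct_ops instance appears as a map named after the variable. */
    map = bpf_object__find_map_by_name(obj, "dctcp_nouse");
    if (!map)
        return 1;

    /* Registers the struct_ops implementation with the kernel. */
    link = bpf_map__attach_struct_ops(map);
    if (!link)
        return 1;

    /* ... later: bpf_link__destroy(link); bpf_object__close(obj); */
    return 0;
}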

Loader/kernel perspective

The loader locates the struct(s) in the .struct_ops section. It uses BTF to get the name and layout of the struct. The values of non-function fields are present in the .struct_ops section itself; for the function pointer fields that were set, there are ELF relocation entries pointing to the respective eBPF programs.

The loader figures out which eBPF programs are referenced and loads them. It then inspects the vmlinux BTF of the host to find the BTF type ID of the struct_ops struct. With that, the loader creates a BPF_MAP_TYPE_STRUCT_OPS map. The BTF type ID of the struct_ops struct in vmlinux BTF is provided via the btf_vmlinux_value_type_id attribute, and btf_value_type_id is set to the BTF ID of the program's own copy of the struct. The key of the map must be a 4-byte int and max_entries must be 1.
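
Roughly, the map creation looks like the following sketch. This is normally done inside libbpf; vmlinux_struct_btf_id and value_size are placeholders for values looked up from BTF beforehand, and the real loader sets additional attributes such as the BTF of the program itself:

LIBBPF_OPTS(bpf_map_create_opts, opts,
    .btf_vmlinux_value_type_id = vmlinux_struct_btf_id, /* struct_ops struct in vmlinux BTF */
);

int map_fd = bpf_map_create(BPF_MAP_TYPE_STRUCT_OPS, "dctcp_nouse",
                            sizeof(int), /* key must be a 4-byte int */
                            value_size,  /* size of the struct_ops value type */
                            1,           /* max_entries must be 1 */
                            &opts);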

The kernel has now allocated memory for one instance of the struct_ops struct and associated it with the call location. The next step for the loader is to write the only element of the map, element 0. The value is the instance of the struct as C would lay it out in memory, except that the function pointer fields are populated with the file descriptors of the eBPF programs; the kernel transforms these into the actual memory locations of the JIT-compiled eBPF programs. As soon as the update happens, the struct_ops programs are attached. The map holds the references to the programs, so they do not have to be pinned. As soon as element 0 of the map is deleted, or the map is cleaned up because it is no longer referenced, the struct_ops programs are detached. For this reason the map itself typically gets pinned.
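
Continuing the user-space sketch from above, pinning the map keeps the struct_ops attached after the loading process exits, and deleting element 0 detaches it again (the pin path is illustrative):

int zero = 0;

/* The attachment now outlives the loading process. */
bpf_map__pin(map, "/sys/fs/bpf/dctcp_nouse");

/* ... much later, detach by removing the only element ... */
bpf_map_delete_elem(bpf_map__fd(map), &zero);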

Example

DCTCP
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2019 Facebook */

/* WARNING: This implementation is not necessarily the same
 * as the tcp_dctcp.c.  The purpose is mainly for testing
 * the kernel BPF logic.
 */

#include <stddef.h>
#include <linux/bpf.h>
#include <linux/types.h>
#include <linux/stddef.h>
#include <linux/tcp.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include "bpf_tcp_helpers.h"

char _license[] SEC("license") = "GPL";

int stg_result = 0;

struct {
    __uint(type, BPF_MAP_TYPE_SK_STORAGE);
    __uint(map_flags, BPF_F_NO_PREALLOC);
    __type(key, int);
    __type(value, int);
} sk_stg_map SEC(".maps");

#define DCTCP_MAX_ALPHA 1024U

struct dctcp {
    __u32 old_delivered;
    __u32 old_delivered_ce;
    __u32 prior_rcv_nxt;
    __u32 dctcp_alpha;
    __u32 next_seq;
    __u32 ce_state;
    __u32 loss_cwnd;
};

static unsigned int dctcp_shift_g = 4; /* g = 1/2^4 */
static unsigned int dctcp_alpha_on_init = DCTCP_MAX_ALPHA;

static __always_inline void dctcp_reset(const struct tcp_sock *tp,
                    struct dctcp *ca)
{
    ca->next_seq = tp->snd_nxt;

    ca->old_delivered = tp->delivered;
    ca->old_delivered_ce = tp->delivered_ce;
}

SEC("struct_ops/dctcp_init")
void BPF_PROG(dctcp_init, struct sock *sk)
{
    const struct tcp_sock *tp = tcp_sk(sk);
    struct dctcp *ca = inet_csk_ca(sk);
    int *stg;

    ca->prior_rcv_nxt = tp->rcv_nxt;
    ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA);
    ca->loss_cwnd = 0;
    ca->ce_state = 0;

    stg = bpf_sk_storage_get(&sk_stg_map, (void *)tp, NULL, 0);
    if (stg) {
        stg_result = *stg;
        bpf_sk_storage_delete(&sk_stg_map, (void *)tp);
    }
    dctcp_reset(tp, ca);
}

SEC("struct_ops/dctcp_ssthresh")
__u32 BPF_PROG(dctcp_ssthresh, struct sock *sk)
{
    struct dctcp *ca = inet_csk_ca(sk);
    struct tcp_sock *tp = tcp_sk(sk);

    ca->loss_cwnd = tp->snd_cwnd;
    return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->dctcp_alpha) >> 11U), 2U);
}

SEC("struct_ops/dctcp_update_alpha")
void BPF_PROG(dctcp_update_alpha, struct sock *sk, __u32 flags)
{
    const struct tcp_sock *tp = tcp_sk(sk);
    struct dctcp *ca = inet_csk_ca(sk);

    /* Expired RTT */
    if (!before(tp->snd_una, ca->next_seq)) {
        __u32 delivered_ce = tp->delivered_ce - ca->old_delivered_ce;
        __u32 alpha = ca->dctcp_alpha;

        /* alpha = (1 - g) * alpha + g * F */

        alpha -= min_not_zero(alpha, alpha >> dctcp_shift_g);
        if (delivered_ce) {
            __u32 delivered = tp->delivered - ca->old_delivered;

            /* If dctcp_shift_g == 1, a 32bit value would overflow
             * after 8 M packets.
             */
            delivered_ce <<= (10 - dctcp_shift_g);
            delivered_ce /= max(1U, delivered);

            alpha = min(alpha + delivered_ce, DCTCP_MAX_ALPHA);
        }
        ca->dctcp_alpha = alpha;
        dctcp_reset(tp, ca);
    }
}

static __always_inline void dctcp_react_to_loss(struct sock *sk)
{
    struct dctcp *ca = inet_csk_ca(sk);
    struct tcp_sock *tp = tcp_sk(sk);

    ca->loss_cwnd = tp->snd_cwnd;
    tp->snd_ssthresh = max(tp->snd_cwnd >> 1U, 2U);
}

SEC("struct_ops/dctcp_state")
void BPF_PROG(dctcp_state, struct sock *sk, __u8 new_state)
{
    if (new_state == TCP_CA_Recovery &&
        new_state != BPF_CORE_READ_BITFIELD(inet_csk(sk), icsk_ca_state))
        dctcp_react_to_loss(sk);
    /* We handle RTO in dctcp_cwnd_event to ensure that we perform only
     * one loss-adjustment per RTT.
     */
}

static __always_inline void dctcp_ece_ack_cwr(struct sock *sk, __u32 ce_state)
{
    struct tcp_sock *tp = tcp_sk(sk);

    if (ce_state == 1)
        tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
    else
        tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
}

/* Minimal DCTCP CE state machine:
 *
 * S:   0 <- last pkt was non-CE
 *      1 <- last pkt was CE
 */
static __always_inline
void dctcp_ece_ack_update(struct sock *sk, enum tcp_ca_event evt,
            __u32 *prior_rcv_nxt, __u32 *ce_state)
{
    __u32 new_ce_state = (evt == CA_EVENT_ECN_IS_CE) ? 1 : 0;

    if (*ce_state != new_ce_state) {
        /* CE state has changed, force an immediate ACK to
         * reflect the new CE state. If an ACK was delayed,
         * send that first to reflect the prior CE state.
         */
        if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) {
            dctcp_ece_ack_cwr(sk, *ce_state);
            bpf_tcp_send_ack(sk, *prior_rcv_nxt);
        }
        inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
    }
    *prior_rcv_nxt = tcp_sk(sk)->rcv_nxt;
    *ce_state = new_ce_state;
    dctcp_ece_ack_cwr(sk, new_ce_state);
}

SEC("struct_ops/dctcp_cwnd_event")
void BPF_PROG(dctcp_cwnd_event, struct sock *sk, enum tcp_ca_event ev)
{
    struct dctcp *ca = inet_csk_ca(sk);

    switch (ev) {
    case CA_EVENT_ECN_IS_CE:
    case CA_EVENT_ECN_NO_CE:
        dctcp_ece_ack_update(sk, ev, &ca->prior_rcv_nxt, &ca->ce_state);
        break;
    case CA_EVENT_LOSS:
        dctcp_react_to_loss(sk);
        break;
    default:
        /* Don't care for the rest. */
        break;
    }
}

SEC("struct_ops/dctcp_cwnd_undo")
__u32 BPF_PROG(dctcp_cwnd_undo, struct sock *sk)
{
    const struct dctcp *ca = inet_csk_ca(sk);

    return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd);
}

extern void tcp_reno_cong_avoid(struct sock *sk, __u32 ack, __u32 acked) __ksym;

SEC("struct_ops/dctcp_reno_cong_avoid")
void BPF_PROG(dctcp_cong_avoid, struct sock *sk, __u32 ack, __u32 acked)
{
    tcp_reno_cong_avoid(sk, ack, acked);
}

SEC(".struct_ops")
struct tcp_congestion_ops dctcp_nouse = {
    .init       = (void *)dctcp_init,
    .set_state  = (void *)dctcp_state,
    .flags      = TCP_CONG_NEEDS_ECN,
    .name       = "bpf_dctcp_nouse",
};

SEC(".struct_ops")
struct tcp_congestion_ops dctcp = {
    .init       = (void *)dctcp_init,
    .in_ack_event   = (void *)dctcp_update_alpha,
    .cwnd_event = (void *)dctcp_cwnd_event,
    .ssthresh   = (void *)dctcp_ssthresh,
    .cong_avoid = (void *)dctcp_cong_avoid,
    .undo_cwnd  = (void *)dctcp_cwnd_undo,
    .set_state  = (void *)dctcp_state,
    .flags      = TCP_CONG_NEEDS_ECN,
    .name       = "bpf_dctcp",
};

Helper functions

Not all helper functions are available in all program types. These are the helper calls available for struct ops programs:

Supported helper functions

KFuncs

Supported kfuncs