Program type BPF_PROG_TYPE_RAW_TRACEPOINT
Raw tracepoint programs are similar to tracepoint programs, but the kernel does no pre-processing on the arguments and passes the raw arguments directly to the tracepoint program.
Usage
Raw tracepoint programs are typically put into an ELF section prefixed with raw_tp/
or in a raw_tracepoint
section. When loading as a BPF_PROG_TYPE_TRACING
program, the raw tracepoint is typically located in a section prefixed with tp_btf/
.
Raw tracepoints are attached to the same tracepoints as normal tracepoint programs. The main reason to prefer raw tracepoints over normal tracepoints is performance. For normal tracepoints, the kernel will cast or transform arguments even if the arguments are never used. By taking the raw arguments, the BPF program can do the casting or transformation only if the arguments are used, thereby making a more efficient tracepoint program.
Context
The context for raw tracepoint programs is a pointer to a struct bpf_raw_tracepoint_args
:
/* Context handed to a raw tracepoint program: the tracepoint's raw arguments. */
struct bpf_raw_tracepoint_args {
/* Raw tracepoint arguments as 64-bit values; how many there are depends on the tracepoint. */
__u64 args[0];
};
The args
array contains the raw arguments to the tracepoint. The number of arguments is determined by the tracepoint. The verifier will enforce that the number of arguments matches the number of arguments expected by the tracepoint. The BPF program can cast the u64 values to the expected types or use the bpf_probe_read
/bpf_probe_read_kernel
helper function to read the arguments.
Attachment
Raw tracepoints can be attached in two ways: the first is with a dedicated syscall, and the second is with the more generic BPF link syscall.
Syscall
The dedicated syscall BPF_RAW_TRACEPOINT_OPEN
can be used to attach the raw tracepoint. This requires the name
field to be set to a string containing the name of the tracepoint to which the user wishes to attach. The prog_fd
attribute field should be set to the file descriptor of the BPF program to attach.
Docs could be improved
This part of the docs is incomplete, contributions are very welcome
BPF link
A BPF link can also be used to attach a raw tracepoint program. To do so, the program must be loaded with the BPF_PROG_TYPE_TRACING
program type instead of the BPF_PROG_TYPE_RAW_TRACEPOINT
program type. The expected_attach_type
should be BPF_TRACE_RAW_TP
and the attach_btf_id
attribute set to the BTF ID of the tracepoint the program should be attached to.
After that, a link should be created via the link create syscall command, with the attach type set to BPF_TRACE_RAW_TP
.
Example
raw tracepoint
// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
// Copyright (c) 2021 Facebook
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
/*
 * Perf event array map. on_switch reads from it with
 * bpf_perf_event_read_value(), using the current CPU id as the index.
 */
struct {
__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
__uint(key_size, sizeof(__u32));
__uint(value_size, sizeof(int));
/* NOTE(review): presumably keeps entries installed after the process that set them up goes away — confirm */
__uint(map_flags, BPF_F_PRESERVE_ELEMS);
} events SEC(".maps");
/*
 * Per-CPU array with a single entry (key 0). on_switch stores the most
 * recent perf event reading here so the next invocation can diff against it.
 */
struct {
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
__uint(key_size, sizeof(__u32));
__uint(value_size, sizeof(struct bpf_perf_event_value));
__uint(max_entries, 1);
} prev_readings SEC(".maps");
/*
 * Per-CPU array with a single entry (key 0). on_switch writes the
 * difference between the latest and the previous perf event reading here.
 */
struct {
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
__uint(key_size, sizeof(__u32));
__uint(value_size, sizeof(struct bpf_perf_event_value));
__uint(max_entries, 1);
} diff_readings SEC(".maps");
/*
 * Raw tracepoint program attached to sched_switch.
 *
 * Reads the perf event value for the current CPU from `events`, writes the
 * difference against the previously saved reading into `diff_readings`, and
 * then saves the new reading into `prev_readings`.
 *
 * Always returns 0; if a map lookup or the perf event read fails, nothing
 * is updated.
 */
SEC("raw_tp/sched_switch")
int BPF_PROG(on_switch)
{
	struct bpf_perf_event_value *last, *delta;
	struct bpf_perf_event_value now;
	__u32 cpu = bpf_get_smp_processor_id();
	__u32 slot = 0;

	/* Both per-CPU arrays have exactly one entry, at index 0. */
	last = bpf_map_lookup_elem(&prev_readings, &slot);
	if (!last)
		return 0;

	delta = bpf_map_lookup_elem(&diff_readings, &slot);
	if (!delta)
		return 0;

	/* A non-zero result means the read failed; leave both maps untouched. */
	if (bpf_perf_event_read_value(&events, cpu, &now, sizeof(now)))
		return 0;

	delta->counter = now.counter - last->counter;
	delta->enabled = now.enabled - last->enabled;
	delta->running = now.running - last->running;

	/* The current reading becomes the baseline for the next switch. */
	*last = now;
	return 0;
}
/* License string emitted into the "license" ELF section. */
char LICENSE[] SEC("license") = "Dual BSD/GPL";
tracing program
// SPDX-License-Identifier: GPL-2.0
// Copyright (c) 2019 Facebook
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include "runqslower.h"
/* Value compared against task_struct::__state in handle__sched_switch. */
#define TASK_RUNNING 0
/*
 * Flags value passed to bpf_perf_event_output() in handle__sched_switch.
 * NOTE(review): presumably selects the perf buffer of the current CPU — confirm.
 */
#define BPF_F_CURRENT_CPU 0xffffffffULL
/* When non-zero, delays of min_us microseconds or less are not reported. */
const volatile __u64 min_us = 0;
/* When non-zero, only the task with this pid is tracked and reported. */
const volatile pid_t targ_pid = 0;
/*
 * Task storage map holding one u64 per task: the bpf_ktime_get_ns()
 * timestamp written by trace_enqueue and consumed by handle__sched_switch.
 */
struct {
__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
__uint(map_flags, BPF_F_NO_PREALLOC);
__type(key, int);
__type(value, u64);
} start SEC(".maps");
/*
 * Perf event array used by handle__sched_switch to emit struct runq_event
 * records through bpf_perf_event_output().
 */
struct {
__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
__uint(key_size, sizeof(u32));
__uint(value_size, sizeof(u32));
} events SEC(".maps");
/*
 * Record the enqueue timestamp of task @t in the `start` task storage map.
 *
 * Tasks with pid 0 are skipped, as is every task other than targ_pid when
 * targ_pid is non-zero. Always returns 0.
 */
__always_inline
static int trace_enqueue(struct task_struct *t)
{
	u32 tid = t->pid;
	u64 *stamp;

	if (!tid)
		return 0;
	if (targ_pid && targ_pid != tid)
		return 0;

	/* Create the per-task slot on first use. */
	stamp = bpf_task_storage_get(&start, t, 0,
				     BPF_LOCAL_STORAGE_GET_F_CREATE);
	if (stamp)
		*stamp = bpf_ktime_get_ns();

	return 0;
}
/*
 * tp_btf program for sched_wakeup: stamps the enqueue time of the task
 * passed as the tracepoint's first argument.
 */
SEC("tp_btf/sched_wakeup")
int handle__sched_wakeup(u64 *ctx)
{
	/* TP_PROTO(struct task_struct *p): the task is raw argument 0 */
	return trace_enqueue((struct task_struct *)ctx[0]);
}
/*
 * tp_btf program for sched_wakeup_new: stamps the enqueue time of the task
 * passed as the tracepoint's first argument.
 */
SEC("tp_btf/sched_wakeup_new")
int handle__sched_wakeup_new(u64 *ctx)
{
	/* TP_PROTO(struct task_struct *p): the task is raw argument 0 */
	return trace_enqueue((struct task_struct *)ctx[0]);
}
/*
 * tp_btf program for sched_switch.
 *
 * If the outgoing task is still in TASK_RUNNING state, it is treated as
 * re-enqueued and its timestamp is refreshed. For the incoming task, the
 * time since its recorded enqueue timestamp is computed in microseconds
 * and, unless filtered out by targ_pid or min_us, emitted as a
 * struct runq_event through the `events` perf event array. The stored
 * timestamp is deleted after an event has been emitted.
 *
 * Always returns 0.
 */
SEC("tp_btf/sched_switch")
int handle__sched_switch(u64 *ctx)
{
	/* TP_PROTO(bool preempt, struct task_struct *prev,
	 *	    struct task_struct *next)
	 */
	struct task_struct *prev = (struct task_struct *)ctx[1];
	struct task_struct *next = (struct task_struct *)ctx[2];
	struct runq_event event = {};
	u64 *tsp, delta_us;
	u32 pid;

	/* ivcsw: treat like an enqueue event and store timestamp */
	if (prev->__state == TASK_RUNNING)
		trace_enqueue(prev);

	pid = next->pid;

	/* For pid mismatch, save a bpf_task_storage_get */
	if (!pid || (targ_pid && targ_pid != pid))
		return 0;

	/* fetch timestamp and calculate delta */
	tsp = bpf_task_storage_get(&start, next, 0, 0);
	if (!tsp)
		return 0;   /* missed enqueue */

	/* nanoseconds -> microseconds */
	delta_us = (bpf_ktime_get_ns() - *tsp) / 1000;
	if (min_us && delta_us <= min_us)
		return 0;

	event.pid = pid;
	event.delta_us = delta_us;
	/*
	 * NOTE(review): bpf_get_current_comm() reports the comm of the
	 * current task; confirm whether that is `next` or still `prev` when
	 * this tracepoint fires, since event.pid is taken from `next`.
	 */
	bpf_get_current_comm(&event.task, sizeof(event.task));

	/* output */
	bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU,
			      &event, sizeof(event));

	/* The timestamp has been consumed; drop it so it is not reported twice. */
	bpf_task_storage_delete(&start, next);
	return 0;
}
/* License string emitted into the "license" ELF section. */
char LICENSE[] SEC("license") = "GPL";
Helper functions
Not all helper functions are available in all program types. These are the helper calls available for raw tracepoint programs:
Supported helper functions
- bpf_perf_event_output
- bpf_get_stackid
- bpf_get_stack
- bpf_map_lookup_elem
- bpf_map_update_elem
- bpf_map_delete_elem
- bpf_map_push_elem
- bpf_map_pop_elem
- bpf_map_peek_elem
- bpf_map_lookup_percpu_elem
- bpf_ktime_get_ns
- bpf_ktime_get_boot_ns
- bpf_tail_call
- bpf_get_current_pid_tgid
- bpf_get_current_task
- bpf_get_current_task_btf
- bpf_task_pt_regs
- bpf_get_current_uid_gid
- bpf_get_current_comm
- bpf_trace_printk
- bpf_get_smp_processor_id
- bpf_get_numa_node_id
- bpf_perf_event_read
- bpf_current_task_under_cgroup
- bpf_get_prandom_u32
- bpf_probe_write_user
- bpf_probe_read_user
- bpf_probe_read_kernel
- bpf_probe_read_user_str
- bpf_probe_read_kernel_str
- bpf_probe_read
- bpf_probe_read_str
- bpf_get_current_cgroup_id
- bpf_get_current_ancestor_cgroup_id
- bpf_send_signal
- bpf_send_signal_thread
- bpf_perf_event_read_value
- bpf_get_ns_current_pid_tgid
- bpf_ringbuf_output
- bpf_ringbuf_reserve
- bpf_ringbuf_submit
- bpf_ringbuf_discard
- bpf_ringbuf_query
- bpf_jiffies64
- bpf_get_task_stack
- bpf_copy_from_user
- bpf_copy_from_user_task
- bpf_snprintf_btf
- bpf_per_cpu_ptr
- bpf_this_cpu_ptr
- bpf_task_storage_get
- bpf_task_storage_delete
- bpf_for_each_map_elem
- bpf_snprintf
- bpf_get_func_ip
- bpf_get_branch_snapshot
- bpf_find_vma
- bpf_trace_vprintk
- bpf_map_lookup_elem
- bpf_map_update_elem
- bpf_map_delete_elem
- bpf_map_push_elem
- bpf_map_pop_elem
- bpf_map_peek_elem
- bpf_map_lookup_percpu_elem
- bpf_get_prandom_u32
- bpf_get_smp_processor_id
- bpf_get_numa_node_id
- bpf_tail_call
- bpf_ktime_get_ns
- bpf_ktime_get_boot_ns
- bpf_ringbuf_output
- bpf_ringbuf_reserve
- bpf_ringbuf_submit
- bpf_ringbuf_discard
- bpf_ringbuf_query
- bpf_for_each_map_elem
- bpf_loop
- bpf_strncmp
- bpf_spin_lock
- bpf_spin_unlock
- bpf_jiffies64
- bpf_per_cpu_ptr
- bpf_this_cpu_ptr
- bpf_timer_init
- bpf_timer_set_callback
- bpf_timer_start
- bpf_timer_cancel
- bpf_trace_printk
- bpf_get_current_task
- bpf_get_current_task_btf
- bpf_probe_read_user
- bpf_probe_read_kernel
- bpf_probe_read_user_str
- bpf_probe_read_kernel_str
- bpf_snprintf_btf
- bpf_snprintf
- bpf_task_pt_regs
- bpf_trace_vprintk
KFuncs
There are currently no kfuncs supported for this program type