sqlite-trace/src/sqlite_trace.bpf.c at main · Query-Doctor/sqlite-trace

550 lines (490 loc) · 16 KB
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>
#include "sqlite_trace.h"
char LICENSE[] SEC("license") = "Dual BSD/GPL";
/* Minimal CO-RE views of SQLite's structs; libbpf relocates field offsets by
 * NAME against the version-matched BTF. preserve_access_index makes every access
 * a relocation. */
/* View of SQLite's Pager. Only zFilename is declared: it is the last hop of the
 * database-path walk done in handle_step. */
struct Pager {
	char *zFilename;	/* user-space pointer; copied with bpf_probe_read_user_str */
} __attribute__((preserve_access_index));
/* View of SQLite's BtShared: the hop from a Btree's shared state to its Pager. */
struct BtShared {
	struct Pager *pPager;
} __attribute__((preserve_access_index));
/* View of SQLite's Btree: the hop from a Db slot to its BtShared. */
struct Btree {
	struct BtShared *pBt;
} __attribute__((preserve_access_index));
/* View of one element of sqlite3.aDb. handle_step reads pBt of element 0;
 * zDbSName is declared but not read by the code visible in this file. */
struct Db {
	char *zDbSName;
	struct Btree *pBt;
} __attribute__((preserve_access_index));
/* View of the connection object; aDb is the array of attached databases and
 * handle_step dereferences its first element. */
struct sqlite3 {
	struct Db *aDb;
} __attribute__((preserve_access_index));
/* MUST be tagged `sqlite3_value`, not `Mem`: SQLite has `typedef struct
 * sqlite3_value Mem`, so a view tagged `Mem` matches only the typedef in the
 * target BTF and field relocation fails. Omitted middle fields don't matter;
 * offsets relocate by name. */
struct sqlite3_value {
	/* Scalar payload. emit_bound_value does not relocate u.r/u.i; it reads
	 * the leading 8 bytes of the element directly. */
	union {
		double r;
		long long i;
	} u;
	char *z;		/* text/blob bytes in the traced process */
	int n;			/* byte length of z */
	unsigned short flags;	/* MEM_* type bits */
} __attribute__((preserve_access_index));
/* View of the prepared-statement object that handle_step receives as `stmt`.
 * Only the fields read by this program are declared. */
struct Vdbe {
	struct sqlite3 *db;		/* owning connection, start of the db-path walk */
	char *zSql;			/* SQL text; may be NULL (see the prepare hooks) */
	unsigned int btreeMask;		/* copied verbatim into the event */
	short nVar;			/* number of bound parameters */
	struct sqlite3_value *aVar;	/* parameter array, stepped by mem_stride bytes */
} __attribute__((preserve_access_index));
/* Bits tested in sqlite3_value.flags to classify a bound parameter. */
#define MEM_Null 0x0001
#define MEM_Str  0x0002
#define MEM_Int  0x0004
#define MEM_Real 0x0008
#define MEM_Blob 0x0010
/* Cap on params scanned per execution; bpf_loop verifies the body once so this
 * can be large. Set just above SQLITE_MAX_VARIABLE_NUMBER (32766), so the cap
 * itself never truncates a statement that stays within that limit. */
#define MAX_VARS_SCAN 32768
/* Real sizeof(struct sqlite3_value) from the loader's version-matched BTF: the
 * truncated view above has the wrong compile-time size, so we step aVar by this
 * many bytes per element instead of indexing. 0 = scan skipped. */
const volatile __u32 mem_stride = 0;
/* Non-zero makes var_scan_cb emit one bound_value record per scanned parameter
 * (at most MAX_VALS_EMIT per execution); with the default 0 only sizes are
 * summed. Presumably set by the loader before load, like mem_stride. */
const volatile __u8 capture_values = 0;
/* sqlite3_step result codes. handle_step_ret counts each SQLITE_ROW return as a
 * row and treats any other code as the end of the execution; SQLITE_DONE is not
 * referenced by the code visible in this file. */
#define SQLITE_ROW  100
#define SQLITE_DONE 101
/* PID/TGID as seen inside the task's own (container) pid namespace, read from
 * task_struct via CO-RE so it works at any namespace depth. numbers[level].nr at
 * the deepest level is the innermost namespace's view.
 * https://github.com/torvalds/linux/blob/master/include/linux/pid.h */
static __always_inline __u32 ns_pid_of(struct pid *pid)
{
	/* A NULL struct pid yields 0. */
	if (!pid)
		return 0;
	/* numbers[] holds one entry per namespace level; `level` indexes the
	 * deepest one. */
	unsigned int level = BPF_CORE_READ(pid, level);
	return BPF_CORE_READ(pid, numbers[level].nr);
}
/* Pack the namespace-local ids of `task` the same way bpf_get_current_pid_tgid
 * does: tgid in the upper 32 bits, pid (thread id) in the lower 32. Returns 0
 * for a NULL task. */
static __always_inline __u64 task_ns_pid_tgid(struct task_struct *task)
{
	if (!task)
		return 0;
	struct pid *tpid = BPF_CORE_READ(task, thread_pid);
	/* The tgid is the pid of the thread-group leader. */
	struct task_struct *leader = BPF_CORE_READ(task, group_leader);
	struct pid *gpid = BPF_CORE_READ(leader, thread_pid);
	__u64 ns_pid = ns_pid_of(tpid);
	__u64 ns_tgid = ns_pid_of(gpid);
	return (ns_tgid << 32) | ns_pid;
}
	__uint(type, BPF_MAP_TYPE_RINGBUF);
	__uint(max_entries, 256 * 1024);
} rb SEC(".maps");
/* State shared between handle_step and the bpf_loop callback that walks the
 * bound-parameter array. */
struct var_scan_ctx {
	char *base;		/* user address of aVar[0] */
	__u32 stride;		/* bytes per element (mem_stride) */
	__u32 nvar;		/* number of elements to visit */
	__u32 bound_bytes;	/* running total of parameter payload bytes */
	__u32 scanned;		/* index of the last visited element + 1 */
	__u64 stmt_ptr;		/* statement pointer stamped on emitted values */
	__u32 emitted;		/* bound_value records submitted so far */
};
/* Emit one captured value as a bound_value ring-buffer record.
 *
 * c     - scan context; supplies stmt_ptr and counts emitted records.
 * idx   - zero-based parameter index within aVar.
 * m     - user-space address of the sqlite3_value element.
 * flags - MEM_* bits already read from the element.
 * n     - byte length already read from the element (text/blob only).
 *
 * Text/blob payloads are truncated to MAX_VAL_BYTES; a faulting read leaves
 * len at 0. Nothing is emitted (and `emitted` is not bumped) when the ring
 * buffer is full. */
static __always_inline void emit_bound_value(struct var_scan_ctx *c, __u32 idx,
					     struct sqlite3_value *m,
					     unsigned short flags, int n)
{
	struct bound_value *bv = bpf_ringbuf_reserve(&rb, sizeof(*bv), 0);
	if (!bv)
		return;
	/* Reserved memory is not zeroed: initialise every header field. */
	bv->kind = REC_BOUND_VALUE;
	bv->stmt_ptr = c->stmt_ptr;
	bv->idx = idx;
	bv->full_len = 0;
	bv->len = 0;
	bv->truncated = 0;
	bv->scalar.i = 0;
	if (flags & MEM_Null) {
		bv->type = BOUND_NULL;
	} else if (flags & (MEM_Int | MEM_Real)) {
		/* The union is the first member at offset 0 and r/i both sit at offset 0,
		 * so the scalar is the first 8 bytes; the nested CO-RE path
		 * "sqlite3_value.u.r" fails to resolve and the offset is invariant anyway. */
		__u64 raw = 0;
		bpf_probe_read_user(&raw, sizeof(raw), m);
		bv->scalar.i = (__s64)raw;
		bv->type = (flags & MEM_Int) ? BOUND_INT : BOUND_REAL;
	} else if (flags & (MEM_Str | MEM_Blob)) {
		bv->type = (flags & MEM_Str) ? BOUND_TEXT : BOUND_BLOB;
		/* A negative length is treated as empty. */
		__u32 full = n > 0 ? (__u32)n : 0;
		bv->full_len = full;
		__u32 take = full > MAX_VAL_BYTES ? MAX_VAL_BYTES : full;
		bv->truncated = full > take;
		const char *z = BPF_CORE_READ_USER(m, z);
		if (z && take) {
			/* re-clamp so the verifier sees the copy size bounded by the buffer */
			if (take > MAX_VAL_BYTES)
				take = MAX_VAL_BYTES;
			if (bpf_probe_read_user(bv->data, take, z) == 0)
				bv->len = take;
		}
	} else {
		bv->type = BOUND_UNKNOWN;
	}
	bpf_ringbuf_submit(bv, 0);
	c->emitted++;
}
/* bpf_loop callback: account for (and optionally emit) bound parameter `i`.
 * bpf_loop verifies this body ONCE regardless of trip count.
 *
 * Returns 0 to continue with the next index, 1 to stop the loop once `i`
 * reaches the requested parameter count. */
static __u64 var_scan_cb(__u32 i, void *vctx)
{
	struct var_scan_ctx *c = vctx;
	if (i >= c->nvar)
		return 1;
	/* Step by the runtime stride: the truncated view's sizeof is wrong. */
	struct sqlite3_value *m = (struct sqlite3_value *)(c->base + (__u64)i * c->stride);
	unsigned short flags = BPF_CORE_READ_USER(m, flags);
	int n = BPF_CORE_READ_USER(m, n);
	__u32 sz = 0;
	if (flags & (MEM_Str | MEM_Blob))
		sz = n > 0 ? (__u32)n : 0;
	else if (flags & (MEM_Int | MEM_Real))
		sz = 8;	/* integer and real payloads are one 8-byte scalar */
	c->bound_bytes += sz;
	c->scanned = i + 1;
	if (capture_values && c->emitted < MAX_VALS_EMIT)
		emit_bound_value(c, i, m, flags, n);
	return 0;
}
/* User-stack storage: bpf_get_stackid() stores frames here keyed by id;
 * userspace reads them back by id. */
	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
	__uint(max_entries, 16384);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, MAX_STACK_DEPTH * sizeof(__u64));
} stackmap SEC(".maps");
/* Per-statement accumulator keyed by stmt pointer. sqlite3_step loops once per
 * SQLITE_ROW then a final non-ROW return; we accumulate across steps and emit one
 * sql_event when the loop terminates, so a query is reported once with its row
 * count. */
struct step_acc {
	__u32 rows;		/* SQLITE_ROW returns seen so far */
	__u32 btreeMask;	/* Vdbe.btreeMask captured at the first step */
	__u64 engine_ns;	/* sum of (return - entry): in-engine time only */
	__u64 start_boot_ns;	/* boot clock so userspace can map to CLOCK_REALTIME */
	__u64 zsql_ptr;		/* raw Vdbe.zSql pointer value (0 when NULL) */
	__s32 read_ret;		/* result of the SQL string read, 0 if none was done */
	__u32 sql_state;	/* SQLSTATE_* describing how sql[] was obtained */
	__s32 stack_id;		/* bpf_get_stackid() result for the first step */
	__u32 sql_bytes;	/* SQL length without the NUL, when read from zSql */
	__u32 bound_bytes;	/* total payload bytes of scanned parameters */
	__u32 nvar;		/* Vdbe.nVar, clamped at 0 */
	__u32 nvar_scanned;	/* clamped by MAX_VARS_SCAN */
	char sql[MAX_SQL];
	char db_path[MAX_DB_PATH];
};
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 4096);
	__type(key, __u64);
	__type(value, struct step_acc);
} accs SEC(".maps");
/* step_acc exceeds the 512-byte BPF stack, so it can't be a local: this per-CPU
 * zeroed template seeds new hash entries without putting one on the stack. */
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 1);
	__type(key, __u32);
	__type(value, struct step_acc);
} acc_zero SEC(".maps");
/* Correlate entry->return within one sqlite3_step: the uretprobe can't see the
 * entry arg, so stash stmt + entry timestamp per thread (one in-flight step per
 * thread); the timestamp lets the return probe add only this step's duration. */
struct inflight_step {
	__u64 stmt;	/* statement pointer passed to the step being timed */
	__u64 entry_ns;	/* bpf_ktime_get_ns() at step entry */
};
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 4096);
	__type(key, __u64);
	__type(value, struct inflight_step);
} inflight SEC(".maps");
/* SQL captured at prepare time, keyed by stmt pointer; recovers SQL when
 * stmt->zSql is NULL at step time (prepared without SAVESQL, sub-statements). */
struct prepared_sql {
	char sql[MAX_SQL];	/* NUL-terminated copy of the text given to prepare */
};
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 4096);
	__type(key, __u64);
	__type(value, struct prepared_sql);
} prepared SEC(".maps");
/* Zeroed per-CPU template, same oversized-value trick as acc_zero. */
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 1);
	__type(key, __u32);
	__type(value, struct prepared_sql);
} prepared_zero SEC(".maps");
/* Per-thread STACK for in-flight prepares: sqlite3_prepare RECURSES (compiling
 * CREATE TABLE prepares nested sqlite_master reads on the same thread before the
 * outer prepare returns), so a single slot would be clobbered. Entry pushes,
 * LIFO return pops, pairing each return with its own entry. Overflow drops that
 * statement's capture, never corrupts a sibling. */
/* Maximum nesting of prepare calls tracked per thread. */
#define PREP_STACK_DEPTH 8
/* Arguments of one in-flight prepare call, saved at entry for the return probe. */
struct inflight_prepare {
	__u64 zsql;	/* user address of the SQL text argument */
	__u64 ppstmt;	/* user address of the sqlite3_stmt** out-parameter */
};
/* Per-thread LIFO of in-flight prepare calls. */
struct prepare_stack {
	__u32 depth;	/* number of prepare calls currently open on the thread */
	struct inflight_prepare frames[PREP_STACK_DEPTH];
};
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 4096);
	__type(key, __u64);
	__type(value, struct prepare_stack);
} inflight_prepare SEC(".maps");
/* Zeroed template so a new per-thread stack is seeded off the BPF stack. */
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 1);
	__type(key, __u32);
	__type(value, struct prepare_stack);
} prepare_stack_zero SEC(".maps");
/* Push the arguments of a prepare call onto the calling thread's stack so the
 * return probe can pair them with the result.
 *
 * zSql   - user address of the SQL text.
 * ppStmt - user address of the statement out-parameter.
 *
 * Silently does nothing if the per-thread stack cannot be created. */
static __always_inline void stash_prepare(__u64 zSql, __u64 ppStmt)
{
	__u64 id = bpf_get_current_pid_tgid();
	struct prepare_stack *st = bpf_map_lookup_elem(&inflight_prepare, &id);
	if (!st) {
		/* First prepare on this thread: seed a zeroed stack. */
		__u32 zero = 0;
		struct prepare_stack *tmpl =
			bpf_map_lookup_elem(&prepare_stack_zero, &zero);
		if (!tmpl)
			return;
		bpf_map_update_elem(&inflight_prepare, &id, tmpl, BPF_ANY);
		st = bpf_map_lookup_elem(&inflight_prepare, &id);
		if (!st)
			return;
	}
	__u32 d = st->depth;
	/* Count the call even when its frame does not fit. The return probe
	 * decrements once per return, so an uncounted overflow would make it pop
	 * the parent's frame and mis-pair every later return on this thread. */
	st->depth = d + 1;
	if (d >= PREP_STACK_DEPTH)
		return;
	st->frames[d].zsql = zSql;
	st->frames[d].ppstmt = ppStmt;
}
/* sqlite3_prepare_v2(db, zSql, nByte, ppStmt, pzTail): ppStmt is arg4.
 * Entry probe: records zSql and ppStmt for handle_prepare_ret. Always
 * returns 0. */
SEC("uprobe")
int BPF_KPROBE(handle_prepare, void *db, const char *zSql, int nByte,
	       void *ppStmt)
{
	stash_prepare((__u64)zSql, (__u64)ppStmt);
	return 0;
}
/* sqlite3_prepare_v3(db, zSql, nByte, prepFlags, ppStmt, pzTail): prepFlags pushes
 * ppStmt to arg5. Used by modern callers incl. the sqlite3 CLI.
 * https://www.sqlite.org/c3ref/prepare.html
 * Entry probe: records zSql and ppStmt for handle_prepare_ret. Always
 * returns 0. */
SEC("uprobe")
int BPF_KPROBE(handle_prepare_v3, void *db, const char *zSql, int nByte,
	       unsigned int prepFlags, void *ppStmt)
{
	stash_prepare((__u64)zSql, (__u64)ppStmt);
	return 0;
}
/* Return probe for the prepare functions: pops the innermost in-flight prepare
 * of this thread and, when the call succeeded, stores a copy of its SQL text in
 * `prepared` keyed by the new statement pointer. Always returns 0. */
SEC("uretprobe")
int BPF_KRETPROBE(handle_prepare_ret, int rc)
{
	__u64 id = bpf_get_current_pid_tgid();
	struct prepare_stack *st = bpf_map_lookup_elem(&inflight_prepare, &id);
	if (!st || st->depth == 0)
		return 0;
	__u32 d = st->depth - 1;
	st->depth = d;
	/* Also bounds the index for the verifier; no frame is stored at or beyond
	 * PREP_STACK_DEPTH. */
	if (d >= PREP_STACK_DEPTH)
		return 0;
	__u64 zsql = st->frames[d].zsql;
	__u64 ppstmt = st->frames[d].ppstmt;
	/* Outermost prepare finished: drop the per-thread stack so exited threads
	 * do not pin hash slots forever. `st` is not touched after this point. */
	if (d == 0)
		bpf_map_delete_elem(&inflight_prepare, &id);
	if (rc != 0 || !zsql || !ppstmt)
		return 0;
	/* *ppStmt holds the prepared stmt pointer in the traced process: a user read. */
	__u64 stmt = 0;
	if (bpf_probe_read_user(&stmt, sizeof(stmt), (void *)ppstmt) != 0 || !stmt)
		return 0;
	__u32 zero = 0;
	struct prepared_sql *tmpl = bpf_map_lookup_elem(&prepared_zero, &zero);
	if (!tmpl)
		return 0;
	/* Insert a zeroed entry, then fill it in place: prepared_sql is too large
	 * for the BPF stack. */
	bpf_map_update_elem(&prepared, &stmt, tmpl, BPF_ANY);
	struct prepared_sql *ps = bpf_map_lookup_elem(&prepared, &stmt);
	if (!ps)
		return 0;
	bpf_probe_read_user_str(&ps->sql, sizeof(ps->sql), (void *)zsql);
	return 0;
}
static __always_inline void fill_app_context(struct sql_event *e)
	__u64 id = bpf_get_current_pid_tgid();
	e->tgid = id >> 32;
	e->pid = (__u32)id;
	bpf_get_current_comm(&e->comm, sizeof(e->comm));
	__u64 uid_gid = bpf_get_current_uid_gid();
	e->uid = (__u32)uid_gid;
	e->gid = (__u32)(uid_gid >> 32);
	e->cgroup_id = bpf_get_current_cgroup_id();
	struct task_struct *task = (struct task_struct *)bpf_get_current_task();
	__u64 ns_id = task_ns_pid_tgid(task);
	e->ns_pid = (__u32)ns_id;
	e->ns_tgid = (__u32)(ns_id >> 32);
/* Entry probe for sqlite3_step(stmt).
 *
 * Every call records (stmt, entry time) for the return probe. The first call of
 * an execution (no accumulator yet for this stmt) also creates the accumulator
 * and captures everything that is fixed for the execution: start time, user
 * stack, SQL text, bound-parameter sizes/values, btreeMask and the main
 * database path. Always returns 0. */
SEC("uprobe")
int BPF_KPROBE(handle_step, struct Vdbe *stmt)
{
	if (!stmt)
		return 0;
	__u64 id = bpf_get_current_pid_tgid();
	__u64 key = (__u64)stmt;
	struct inflight_step ifs = {
		.stmt = key,
		.entry_ns = bpf_ktime_get_ns(),
	};
	bpf_map_update_elem(&inflight, &id, &ifs, BPF_ANY);
	/* First step of this execution: create the accumulator once. */
	if (!bpf_map_lookup_elem(&accs, &key)) {
		__u32 zero = 0;
		struct step_acc *tmpl = bpf_map_lookup_elem(&acc_zero, &zero);
		if (!tmpl)
			return 0;
		/* Write fields through the in-map pointer so the oversized step_acc
		 * never lands on the stack. */
		bpf_map_update_elem(&accs, &key, tmpl, BPF_ANY);
		struct step_acc *acc = bpf_map_lookup_elem(&accs, &key);
		if (!acc)
			return 0;
		acc->rows = 0;
		acc->engine_ns = 0;
		/* boot clock counts wall-time across suspend, so it maps to CLOCK_REALTIME.
		 * https://docs.ebpf.io/linux/helper-function/bpf_ktime_get_boot_ns/ */
		acc->start_boot_ns = bpf_ktime_get_boot_ns();
		acc->stack_id = bpf_get_stackid(ctx, &stackmap, BPF_F_USER_STACK);
		/* _USER variant required: stmt points into the traced process's user
		 * memory, so the field load needs bpf_probe_read_user; plain BPF_CORE_READ
		 * uses bpf_probe_read_kernel and reads garbage. Offset still relocates. */
		const char *zsql = BPF_CORE_READ_USER(stmt, zSql);
		acc->zsql_ptr = (__u64)zsql;
		if (!zsql) {
			/* zSql not retained: recover the text from the prepare hook. */
			struct prepared_sql *ps = bpf_map_lookup_elem(&prepared, &key);
			if (ps) {
				bpf_probe_read_kernel(&acc->sql, sizeof(acc->sql), &ps->sql);
				acc->sql_state = SQLSTATE_FROM_PREPARE;
				acc->read_ret = 0;
			} else {
				acc->sql_state = SQLSTATE_NULLPTR;
				acc->read_ret = 0;
			}
		} else {
			long r = bpf_probe_read_user_str(&acc->sql, sizeof(acc->sql), zsql);
			acc->read_ret = (__s32)r;
			acc->sql_state = r < 0 ? SQLSTATE_READERR : SQLSTATE_OK;
		}
		/* read_user_str returns bytes copied INCLUDING NUL, so length is r-1; the
		 * prepare-recovered path used a kernel copy (r==0) so sql_bytes stays 0. */
		acc->sql_bytes = acc->read_ret > 0 ? (__u32)(acc->read_ret - 1) : 0;
		short nvar = BPF_CORE_READ_USER(stmt, nVar);
		struct sqlite3_value *aVar = BPF_CORE_READ_USER(stmt, aVar);
		acc->nvar = nvar > 0 ? (__u32)nvar : 0;
		acc->bound_bytes = 0;
		acc->nvar_scanned = 0;
		/* Scan parameters only when there are some and the loader supplied the
		 * real element size. */
		if (aVar && nvar > 0 && mem_stride) {
			__u32 trips = acc->nvar;
			if (trips > MAX_VARS_SCAN)
				trips = MAX_VARS_SCAN;
			struct var_scan_ctx vc = {
				.base = (char *)aVar,
				.stride = mem_stride,
				.nvar = trips,
				.bound_bytes = 0,
				.scanned = 0,
				.stmt_ptr = key,
				.emitted = 0,
			};
			bpf_loop(trips, var_scan_cb, &vc, 0);
			acc->bound_bytes = vc.bound_bytes;
			acc->nvar_scanned = vc.scanned;
		}
		/* btreeMask moved between versions (3.40: byte 200, 3.46: byte 204), so it
		 * doubles as a version discriminator: the right BTF reads the real bitmask,
		 * the wrong one reads a neighbouring field. */
		acc->btreeMask = BPF_CORE_READ_USER(stmt, btreeMask);
		/* Walk stmt->db->aDb[0].pBt->pBt->pPager->zFilename (aDb[0] = "main").
		 * Any broken link leaves db_path zero-seeded. */
		struct sqlite3 *db = BPF_CORE_READ_USER(stmt, db);
		if (db) {
			struct Db *aDb = BPF_CORE_READ_USER(db, aDb);
			if (aDb) {
				struct Btree *bt = BPF_CORE_READ_USER(aDb, pBt);
				const char *zf = NULL;
				if (bt)
					zf = BPF_CORE_READ_USER(bt, pBt, pPager, zFilename);
				if (zf)
					bpf_probe_read_user_str(&acc->db_path,
								sizeof(acc->db_path), zf);
			}
		}
	}
	return 0;
}
/* Return probe for sqlite3_step.
 *
 * Adds this step's duration to the statement's accumulator. SQLITE_ROW bumps
 * the row count and keeps accumulating. Any other result code ends the
 * execution: one sql_event is emitted (dropped if the ring buffer is full) and
 * the accumulator is deleted. Always returns 0. */
SEC("uretprobe")
int BPF_KRETPROBE(handle_step_ret, int rc)
{
	__u64 id = bpf_get_current_pid_tgid();
	struct inflight_step *ifs = bpf_map_lookup_elem(&inflight, &id);
	if (!ifs)
		return 0;
	/* Copy out before deleting: ifs points into the map entry. */
	__u64 key = ifs->stmt;
	__u64 entry_ns = ifs->entry_ns;
	bpf_map_delete_elem(&inflight, &id);
	struct step_acc *acc = bpf_map_lookup_elem(&accs, &key);
	if (!acc)
		return 0;
	acc->engine_ns += bpf_ktime_get_ns() - entry_ns;
	if (rc == SQLITE_ROW) {
		acc->rows++;
		return 0;
	}
	/* Non-ROW return ends the loop: emit one event, then drop the accumulator. */
	struct sql_event *e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0);
	if (e) {
		e->kind = REC_EVENT;
		fill_app_context(e);
		e->rows = acc->rows;
		e->rc = rc;
		e->btreeMask = acc->btreeMask;
		e->ns = acc->engine_ns;
		e->start_boot_ns = acc->start_boot_ns;
		e->stmt_ptr = key;
		e->zsql_ptr = acc->zsql_ptr;
		e->read_ret = acc->read_ret;
		e->sql_state = acc->sql_state;
		e->stack_id = acc->stack_id;
		e->sql_bytes = acc->sql_bytes;
		e->bound_bytes = acc->bound_bytes;
		e->nvar = acc->nvar;
		e->nvar_scanned = acc->nvar_scanned;
		/* acc is in map memory; copy via kernel read (BPF backend lacks a
		 * builtin memcpy of this size). */
		bpf_probe_read_kernel(&e->sql, sizeof(e->sql), &acc->sql);
		bpf_probe_read_kernel(&e->db_path, sizeof(e->db_path), &acc->db_path);
		bpf_ringbuf_submit(e, 0);
	}
	bpf_map_delete_elem(&accs, &key);
	return 0;
}
Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Uh oh!

FilesExpand file tree

sqlite_trace.bpf.c

Latest commit

History

sqlite_trace.bpf.c

File metadata and controls