-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsqlite_trace.bpf.c
More file actions
550 lines (490 loc) · 16 KB
/
Copy pathsqlite_trace.bpf.c
File metadata and controls
550 lines (490 loc) · 16 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>
#include "sqlite_trace.h"
char LICENSE[] SEC("license") = "Dual BSD/GPL";
/* Minimal CO-RE views of SQLite's structs; libbpf relocates field offsets by
* NAME against the version-matched BTF. preserve_access_index makes every access
* a relocation.
*
* Only the members this program dereferences are declared. Together they form
* the pointer chain handle_step walks to find the database file name:
* Vdbe.db -> sqlite3.aDb -> Db.pBt -> Btree.pBt -> BtShared.pPager ->
* Pager.zFilename. All of these pointers are addresses in the traced process. */
/* Pager view: zFilename is the last link of the chain, read as a C string into
* step_acc.db_path. */
struct Pager {
char *zFilename;
} __attribute__((preserve_access_index));
/* Shared b-tree view: only the pager pointer is followed. */
struct BtShared {
struct Pager *pPager;
} __attribute__((preserve_access_index));
/* Per-connection b-tree handle view: only the shared b-tree pointer is followed. */
struct Btree {
struct BtShared *pBt;
} __attribute__((preserve_access_index));
/* One attached-database slot. handle_step reads pBt of the first element only;
* zDbSName is declared here but not read anywhere in this file. */
struct Db {
char *zDbSName;
struct Btree *pBt;
} __attribute__((preserve_access_index));
/* Connection view: aDb points at the array of Db slots. */
struct sqlite3 {
struct Db *aDb;
} __attribute__((preserve_access_index));
/* MUST be tagged `sqlite3_value`, not `Mem`: SQLite has `typedef struct
* sqlite3_value Mem`, so a view tagged `Mem` matches only the typedef in the
* target BTF and field relocation fails. Omitted middle fields don't matter;
* offsets relocate by name.
*
* Because this view is truncated, its compile-time sizeof is not the real
* element size; array stepping uses mem_stride instead. */
struct sqlite3_value {
/* Scalar payload. emit_bound_value does not go through this member: it reads
* the first 8 bytes of the value directly. */
union {
double r;
long long i;
} u;
/* User-space pointer to the text/blob bytes. */
char *z;
/* Byte length used for text/blob values; non-positive values count as 0. */
int n;
/* MEM_* type bits tested by var_scan_cb and emit_bound_value. */
unsigned short flags;
} __attribute__((preserve_access_index));
/* CO-RE view of the prepared-statement object that handle_step receives as its
* argument. Every field is read with BPF_CORE_READ_USER because the object lives
* in the traced process's memory. */
struct Vdbe {
/* Owning connection; start of the walk to the database file name. */
struct sqlite3 *db;
/* SQL text pointer; may be NULL, in which case the prepare-time copy is used. */
char *zSql;
/* Copied verbatim into the emitted event. */
unsigned int btreeMask;
/* Number of bound-parameter slots; non-positive values are treated as 0. */
short nVar;
/* Array of nVar values, stepped by mem_stride bytes per element. */
struct sqlite3_value *aVar;
} __attribute__((preserve_access_index));
/* Type bits tested against sqlite3_value.flags to classify a bound parameter.
* NOTE(review): these are assumed to equal the MEM_* values of the traced SQLite
* build; they are hard-coded here rather than relocated - confirm per version. */
#define MEM_Null 0x0001
#define MEM_Str 0x0002
#define MEM_Int 0x0004
#define MEM_Real 0x0008
#define MEM_Blob 0x0010
/* Cap on params summed per execution; bpf_loop verifies the body once so this can
* be large. Set above SQLite's default SQLITE_MAX_VARIABLE_NUMBER (32766), so the
* cap does not truncate the parameter list of a default-built SQLite. */
#define MAX_VARS_SCAN 32768
/* Real sizeof(struct sqlite3_value) from the loader's version-matched BTF: the
* truncated view above has the wrong compile-time size, so we step aVar by this
* many bytes per element instead of indexing. 0 = scan skipped. */
const volatile __u32 mem_stride = 0;
/* Non-zero makes the parameter scan also emit one bound_value record per
* parameter (up to MAX_VALS_EMIT per execution); zero keeps only the byte and
* count totals. Read-only from the BPF side; presumably set by the loader
* before load, like mem_stride. */
const volatile __u8 capture_values = 0;
/* sqlite3_step result codes. Only SQLITE_ROW is tested in this file: it keeps
* the accumulator open, and ANY other return value (not just SQLITE_DONE) ends
* the execution and emits the event. SQLITE_DONE is defined but not referenced
* in this file. */
#define SQLITE_ROW 100
#define SQLITE_DONE 101
/* Namespace-local PID/TGID: the ids a task sees inside its own (container) pid
 * namespace. They are taken from struct pid through CO-RE reads, so the lookup
 * works at any nesting depth: numbers[] holds one entry per namespace level
 * and the entry at index `level` is the innermost namespace's view.
 * https://github.com/torvalds/linux/blob/master/include/linux/pid.h
 *
 * ns_pid_of returns the innermost-namespace number of `pid`, or 0 when `pid`
 * is NULL. */
static __always_inline __u32 ns_pid_of(struct pid *pid)
{
    unsigned int innermost;

    if (!pid)
        return 0;

    innermost = BPF_CORE_READ(pid, level);

    return BPF_CORE_READ(pid, numbers[innermost].nr);
}
/* Pack the namespace-local ids of `task` in the same layout as
 * bpf_get_current_pid_tgid(): thread-group id in the high 32 bits, thread id in
 * the low 32 bits. Returns 0 for a NULL task. The thread-group id is resolved
 * through the group leader's struct pid. */
static __always_inline __u64 task_ns_pid_tgid(struct task_struct *task)
{
    struct pid *thread_pid;
    struct pid *leader_pid;
    struct task_struct *group_leader;
    __u64 low, high;

    if (!task)
        return 0;

    thread_pid = BPF_CORE_READ(task, thread_pid);
    group_leader = BPF_CORE_READ(task, group_leader);
    leader_pid = BPF_CORE_READ(group_leader, thread_pid);

    low = ns_pid_of(thread_pid);
    high = ns_pid_of(leader_pid);

    return low | (high << 32);
}
/* Ring buffer to userspace (256 KiB). Two record types are written to it in
* this file: bound_value records from emit_bound_value and sql_event records
* from handle_step_ret; each starts with a `kind` field that tells them apart. */
struct {
__uint(type, BPF_MAP_TYPE_RINGBUF);
__uint(max_entries, 256 * 1024);
} rb SEC(".maps");
/* State shared between handle_step and the bpf_loop callback var_scan_cb while
* the bound-parameter array of one statement is scanned. It lives on
* handle_step's stack. */
struct var_scan_ctx {
/* User-space address of the first sqlite3_value (Vdbe.aVar). */
char *base;
/* Bytes between consecutive elements (mem_stride). */
__u32 stride;
/* Number of elements to visit; the callback stops at this index. */
__u32 nvar;
/* Running total of payload bytes across visited parameters. */
__u32 bound_bytes;
/* Index of the last visited parameter plus one. */
__u32 scanned;
/* Statement pointer stamped into every emitted bound_value record. */
__u64 stmt_ptr;
/* bound_value records submitted so far, compared against MAX_VALS_EMIT. */
__u32 emitted;
};
/* Publish one bound parameter as a bound_value record on the ring buffer.
 *
 * c:     scan state; supplies the statement pointer and counts emitted records.
 * idx:   zero-based position of the parameter in the statement's aVar array.
 * m:     user-space address of the parameter's sqlite3_value.
 * flags: MEM_* bits already read from the value.
 * n:     byte length already read from the value (text/blob only).
 *
 * Text and blob payloads are cut to MAX_VAL_BYTES; a payload read that faults
 * leaves len at 0. If the ring buffer has no room, nothing is emitted and the
 * emitted counter is left alone. */
static __always_inline void emit_bound_value(struct var_scan_ctx *c, __u32 idx,
                                             struct sqlite3_value *m,
                                             unsigned short flags, int n)
{
    struct bound_value *rec = bpf_ringbuf_reserve(&rb, sizeof(*rec), 0);

    if (!rec)
        return;

    /* Header and defaults; the type-specific branch overrides what applies. */
    rec->kind = REC_BOUND_VALUE;
    rec->stmt_ptr = c->stmt_ptr;
    rec->idx = idx;
    rec->full_len = 0;
    rec->len = 0;
    rec->truncated = 0;
    rec->scalar.i = 0;

    if (flags & MEM_Null) {
        rec->type = BOUND_NULL;
    } else if (flags & (MEM_Int | MEM_Real)) {
        /* The scalar union is the first member and both r and i start at
         * offset 0, so the scalar is simply the first 8 bytes of the value.
         * The nested CO-RE path "sqlite3_value.u.r" fails to resolve, and the
         * offset is invariant anyway, so read it raw. */
        __u64 bits = 0;

        bpf_probe_read_user(&bits, sizeof(bits), m);
        rec->scalar.i = (__s64)bits;
        if (flags & MEM_Int)
            rec->type = BOUND_INT;
        else
            rec->type = BOUND_REAL;
    } else if (flags & (MEM_Str | MEM_Blob)) {
        __u32 total = 0;
        __u32 want;
        const char *src;

        if (flags & MEM_Str)
            rec->type = BOUND_TEXT;
        else
            rec->type = BOUND_BLOB;

        if (n > 0)
            total = (__u32)n;
        rec->full_len = total;

        want = total;
        if (want > MAX_VAL_BYTES)
            want = MAX_VAL_BYTES;
        rec->truncated = total > want;

        src = BPF_CORE_READ_USER(m, z);
        if (src && want) {
            /* Clamp again right before the copy so the verifier sees the
             * size bounded by the destination buffer. */
            if (want > MAX_VAL_BYTES)
                want = MAX_VAL_BYTES;
            if (bpf_probe_read_user(rec->data, want, src) == 0)
                rec->len = want;
        }
    } else {
        rec->type = BOUND_UNKNOWN;
    }

    bpf_ringbuf_submit(rec, 0);
    c->emitted++;
}
/* bpf_loop callback: account for bound parameter `i` and optionally emit it.
 * bpf_loop verifies this body ONCE regardless of trip count.
 *
 * i:    zero-based parameter index supplied by bpf_loop.
 * vctx: struct var_scan_ctx owned by the caller.
 *
 * Payload size is the value's byte length for text/blob, 8 for integer/real
 * and 0 for anything else. Returns 0 to continue, 1 to stop once i reaches
 * the context's nvar. */
static __u64 var_scan_cb(__u32 i, void *vctx)
{
    struct var_scan_ctx *c = vctx;

    if (i >= c->nvar)
        return 1;

    /* Step by the real element size: the truncated view's sizeof is wrong. */
    struct sqlite3_value *m =
        (struct sqlite3_value *)(c->base + (__u64)i * c->stride);
    unsigned short flags = BPF_CORE_READ_USER(m, flags);
    int n = BPF_CORE_READ_USER(m, n);

    __u32 sz = 0;
    if (flags & (MEM_Str | MEM_Blob))
        sz = n > 0 ? (__u32)n : 0;
    else if (flags & (MEM_Int | MEM_Real))
        sz = 8;

    /* Saturate instead of wrapping: a few very large blobs can push the sum
     * past 32 bits, and a wrapped total would look like a small, valid one. */
    __u32 room = 0xffffffffU - c->bound_bytes;
    c->bound_bytes += sz < room ? sz : room;
    c->scanned = i + 1;

    if (capture_values && c->emitted < MAX_VALS_EMIT)
        emit_bound_value(c, i, m, flags, n);

    return 0;
}
/* User-stack storage: bpf_get_stackid() stores frames here keyed by id;
* userspace reads them back by id. handle_step fills it with BPF_F_USER_STACK
* when it creates an accumulator and keeps the returned value in
* step_acc.stack_id. Each entry holds up to MAX_STACK_DEPTH 64-bit frame
* addresses. */
struct {
__uint(type, BPF_MAP_TYPE_STACK_TRACE);
__uint(max_entries, 16384);
__uint(key_size, sizeof(__u32));
__uint(value_size, MAX_STACK_DEPTH * sizeof(__u64));
} stackmap SEC(".maps");
/* Per-statement accumulator keyed by stmt pointer. sqlite3_step loops once per
* SQLITE_ROW then a final non-ROW return; we accumulate across steps and emit one
* sql_event when the loop terminates, so a query is reported once with its row
* count. Everything except rows and engine_ns is captured once, when the
* accumulator is created on the first step. */
struct step_acc {
__u32 rows; /* number of SQLITE_ROW returns seen so far */
__u32 btreeMask; /* copy of Vdbe.btreeMask */
__u64 engine_ns; /* sum of (return - entry): in-engine time only */
__u64 start_boot_ns; /* boot clock so userspace can map to CLOCK_REALTIME */
__u64 zsql_ptr; /* raw Vdbe.zSql pointer value (0 when NULL) */
__s32 read_ret; /* bpf_probe_read_user_str result for zSql; 0 if not read */
__u32 sql_state; /* SQLSTATE_*: where the SQL text came from or why missing */
__s32 stack_id; /* bpf_get_stackid result for the user stack */
__u32 sql_bytes; /* SQL length without NUL when read from zSql, else 0 */
__u32 bound_bytes; /* payload bytes summed over scanned parameters */
__u32 nvar; /* Vdbe.nVar, negative values stored as 0 */
__u32 nvar_scanned; /* clamped by MAX_VARS_SCAN */
char sql[MAX_SQL]; /* SQL text, NUL-terminated, possibly truncated */
char db_path[MAX_DB_PATH]; /* main database file name; empty if walk failed */
};
/* Live accumulators, keyed by the statement pointer value. An entry is created
* by handle_step when none exists for the statement and removed by
* handle_step_ret on the first non-ROW return. */
struct {
__uint(type, BPF_MAP_TYPE_HASH);
__uint(max_entries, 4096);
__type(key, __u64);
__type(value, struct step_acc);
} accs SEC(".maps");
/* step_acc exceeds the 512-byte BPF stack, so it can't be a local: this per-CPU
* zeroed template seeds new hash entries without putting one on the stack.
* Nothing in this file writes to it, so it keeps its initial all-zero content. */
struct {
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
__uint(max_entries, 1);
__type(key, __u32);
__type(value, struct step_acc);
} acc_zero SEC(".maps");
/* Correlate entry->return within one sqlite3_step: the uretprobe can't see the
* entry arg, so stash stmt + entry timestamp per thread (one in-flight step per
* thread); the timestamp lets the return probe add only this step's duration. */
struct inflight_step {
__u64 stmt; /* statement pointer passed to the step entry probe */
__u64 entry_ns; /* bpf_ktime_get_ns() taken at entry */
};
/* Keyed by bpf_get_current_pid_tgid(). handle_step overwrites the calling
* thread's slot on every entry; handle_step_ret reads and deletes it. */
struct {
__uint(type, BPF_MAP_TYPE_HASH);
__uint(max_entries, 4096);
__type(key, __u64);
__type(value, struct inflight_step);
} inflight SEC(".maps");
/* SQL captured at prepare time, keyed by stmt pointer; recovers SQL when
* stmt->zSql is NULL at step time (prepared without SAVESQL, sub-statements). */
struct prepared_sql {
char sql[MAX_SQL]; /* NUL-terminated copy of the text passed to prepare */
};
/* Written by handle_prepare_ret after a successful prepare and looked up by
* handle_step only when Vdbe.zSql is NULL.
* NOTE(review): no sqlite3_finalize probe is visible in this file - confirm how
* entries for finalized (and possibly reused) statement addresses are aged out. */
struct {
__uint(type, BPF_MAP_TYPE_HASH);
__uint(max_entries, 4096);
__type(key, __u64);
__type(value, struct prepared_sql);
} prepared SEC(".maps");
/* Zeroed per-CPU template, same oversized-value trick as acc_zero: it seeds a
* new `prepared` entry without placing MAX_SQL bytes on the BPF stack. */
struct {
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
__uint(max_entries, 1);
__type(key, __u32);
__type(value, struct prepared_sql);
} prepared_zero SEC(".maps");
/* Per-thread STACK for in-flight prepares: sqlite3_prepare RECURSES (compiling
* CREATE TABLE prepares nested sqlite_master reads on the same thread before the
* outer prepare returns), so a single slot would be clobbered. Entry pushes,
* LIFO return pops, pairing each return with its own entry. A prepare nested
* deeper than PREP_STACK_DEPTH has no frame stored, so its SQL is not captured. */
#define PREP_STACK_DEPTH 8
/* One pending prepare: both members are user-space addresses saved at entry. */
struct inflight_prepare {
__u64 zsql; /* address of the SQL text argument */
__u64 ppstmt; /* address of the sqlite3_stmt* out-parameter */
};
/* Fixed-capacity stack; frames[0..depth-1] are the live entries. */
struct prepare_stack {
__u32 depth;
struct inflight_prepare frames[PREP_STACK_DEPTH];
};
/* One stack per thread, keyed by bpf_get_current_pid_tgid(). */
struct {
__uint(type, BPF_MAP_TYPE_HASH);
__uint(max_entries, 4096);
__type(key, __u64);
__type(value, struct prepare_stack);
} inflight_prepare SEC(".maps");
/* Zeroed template so a new per-thread stack is seeded off the BPF stack. */
struct {
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
__uint(max_entries, 1);
__type(key, __u32);
__type(value, struct prepare_stack);
} prepare_stack_zero SEC(".maps");
/* Push one in-flight prepare onto the calling thread's prepare stack so the
 * matching return probe can pair with it.
 *
 * zSql:   user-space address of the SQL text passed to the prepare call.
 * ppStmt: user-space address of the caller's sqlite3_stmt* out-parameter.
 *
 * depth counts EVERY entry, including one nested deeper than PREP_STACK_DEPTH
 * whose frame cannot be stored. handle_prepare_ret decrements depth once per
 * return and ignores levels >= PREP_STACK_DEPTH, so counting the overflowed
 * entry keeps each return paired with its own entry. If the increment were
 * skipped on overflow, the overflowed prepare's return would pop the frame
 * that belongs to the enclosing prepare.
 *
 * Silently does nothing when the per-thread stack cannot be created. */
static __always_inline void stash_prepare(__u64 zSql, __u64 ppStmt)
{
    __u64 id = bpf_get_current_pid_tgid();
    struct prepare_stack *st = bpf_map_lookup_elem(&inflight_prepare, &id);

    if (!st) {
        /* No stack for this thread yet: seed one from the zeroed template
         * (the struct is kept off the BPF stack) and fetch the in-map copy. */
        __u32 zero = 0;
        struct prepare_stack *tmpl =
            bpf_map_lookup_elem(&prepare_stack_zero, &zero);

        if (!tmpl)
            return;
        bpf_map_update_elem(&inflight_prepare, &id, tmpl, BPF_ANY);
        st = bpf_map_lookup_elem(&inflight_prepare, &id);
        if (!st)
            return;
    }

    __u32 d = st->depth;

    /* Count the entry first so the return probe always has a level to pop. */
    st->depth = d + 1;

    /* Too deep to record: this statement's SQL is simply not captured. */
    if (d >= PREP_STACK_DEPTH)
        return;

    st->frames[d].zsql = zSql;
    st->frames[d].ppstmt = ppStmt;
}
/* sqlite3_prepare_v2(db, zSql, nByte, ppStmt, pzTail): ppStmt is arg4.
* Entry probe: records the SQL text address and the out-parameter address for
* the matching return probe. db and nByte are unpacked but not used. The attach
* target is not named here (bare "uprobe" section), so it is chosen by the
* loader. Always returns 0. */
SEC("uprobe")
int BPF_KPROBE(handle_prepare, void *db, const char *zSql, int nByte,
void *ppStmt)
{
stash_prepare((__u64)zSql, (__u64)ppStmt);
return 0;
}
/* sqlite3_prepare_v3(db, zSql, nByte, prepFlags, ppStmt, pzTail): prepFlags pushes
* ppStmt to arg5. Used by modern callers incl. the sqlite3 CLI.
* https://www.sqlite.org/c3ref/prepare.html
* Entry probe with the same job as handle_prepare: it stashes the SQL text
* address and the out-parameter address; db, nByte and prepFlags are unused.
* Always returns 0. */
SEC("uprobe")
int BPF_KPROBE(handle_prepare_v3, void *db, const char *zSql, int nByte,
unsigned int prepFlags, void *ppStmt)
{
stash_prepare((__u64)zSql, (__u64)ppStmt);
return 0;
}
/* Return probe shared by the prepare entry probes: pops the calling thread's
 * innermost in-flight prepare and, when the prepare succeeded, stores the SQL
 * text in `prepared` under the new statement pointer.
 *
 * rc: return value of the prepare call; anything other than 0 is a failure
 *     and nothing is stored.
 *
 * The per-thread stack entry is deleted as soon as it becomes empty, so
 * `inflight_prepare` holds only threads that are inside a prepare right now
 * instead of one entry for every thread that ever prepared a statement.
 * Always returns 0. */
SEC("uretprobe")
int BPF_KRETPROBE(handle_prepare_ret, int rc)
{
    __u64 id = bpf_get_current_pid_tgid();
    struct prepare_stack *st = bpf_map_lookup_elem(&inflight_prepare, &id);

    if (!st || st->depth == 0)
        return 0;

    __u32 d = st->depth - 1;
    __u64 zsql = 0;
    __u64 ppstmt = 0;

    st->depth = d;

    /* Levels at or beyond the stack capacity never had a frame stored. */
    if (d < PREP_STACK_DEPTH) {
        zsql = st->frames[d].zsql;
        ppstmt = st->frames[d].ppstmt;
    }

    /* Outermost prepare finished: release the slot. The frame was copied
     * above and st is not touched again after the delete. */
    if (d == 0)
        bpf_map_delete_elem(&inflight_prepare, &id);

    if (rc != 0 || !zsql || !ppstmt)
        return 0;

    /* *ppStmt holds the prepared stmt pointer in the traced process: a user
     * read. A NULL statement means there is nothing to key the text by. */
    __u64 stmt = 0;

    if (bpf_probe_read_user(&stmt, sizeof(stmt), (void *)ppstmt) != 0 || !stmt)
        return 0;

    /* Seed the oversized value from the zeroed template, then fill the
     * in-map copy so it never lands on the BPF stack. */
    __u32 zero = 0;
    struct prepared_sql *tmpl = bpf_map_lookup_elem(&prepared_zero, &zero);

    if (!tmpl)
        return 0;
    bpf_map_update_elem(&prepared, &stmt, tmpl, BPF_ANY);

    struct prepared_sql *ps = bpf_map_lookup_elem(&prepared, &stmt);

    if (!ps)
        return 0;

    /* If the text cannot be read, drop the entry: an empty record would later
     * be reported as SQL recovered from prepare. */
    if (bpf_probe_read_user_str(&ps->sql, sizeof(ps->sql), (void *)zsql) < 0)
        bpf_map_delete_elem(&prepared, &stmt);

    return 0;
}
static __always_inline void fill_app_context(struct sql_event *e)
{
__u64 id = bpf_get_current_pid_tgid();
e->tgid = id >> 32;
e->pid = (__u32)id;
bpf_get_current_comm(&e->comm, sizeof(e->comm));
__u64 uid_gid = bpf_get_current_uid_gid();
e->uid = (__u32)uid_gid;
e->gid = (__u32)(uid_gid >> 32);
e->cgroup_id = bpf_get_current_cgroup_id();
struct task_struct *task = (struct task_struct *)bpf_get_current_task();
__u64 ns_id = task_ns_pid_tgid(task);
e->ns_pid = (__u32)ns_id;
e->ns_tgid = (__u32)(ns_id >> 32);
}
/* Entry probe for one step of a statement.
*
* Every call records {stmt, entry timestamp} for the calling thread so the
* return probe can find the statement and time this step. On the first step of
* an execution (no accumulator exists for the statement pointer) it also
* creates the accumulator and captures everything that is fixed for the
* execution: start time, user stack id, SQL text, bound-parameter totals (and
* optionally values), btreeMask and the main database file name.
*
* stmt: user-space address of the statement object; NULL is ignored.
* Always returns 0.
*
* NOTE(review): an accumulator is removed only by a non-ROW return in
* handle_step_ret, so one left behind by an execution that never got there is
* reused as-is by the next execution of the same statement pointer. */
SEC("uprobe")
int BPF_KPROBE(handle_step, struct Vdbe *stmt)
{
if (!stmt)
return 0;
__u64 id = bpf_get_current_pid_tgid();
__u64 key = (__u64)stmt;
struct inflight_step ifs = {
.stmt = key,
.entry_ns = bpf_ktime_get_ns(),
};
/* Overwrites any previous slot for this thread (one in-flight step each). */
bpf_map_update_elem(&inflight, &id, &ifs, BPF_ANY);
/* First step of this execution: create the accumulator once. */
if (!bpf_map_lookup_elem(&accs, &key)) {
__u32 zero = 0;
struct step_acc *tmpl = bpf_map_lookup_elem(&acc_zero, &zero);
if (!tmpl)
return 0;
/* Write fields through the in-map pointer so the oversized step_acc
* never lands on the stack. */
bpf_map_update_elem(&accs, &key, tmpl, BPF_ANY);
struct step_acc *acc = bpf_map_lookup_elem(&accs, &key);
if (!acc)
return 0;
acc->rows = 0;
acc->engine_ns = 0;
/* boot clock counts wall-time across suspend, so it maps to CLOCK_REALTIME.
* https://docs.ebpf.io/linux/helper-function/bpf_ktime_get_boot_ns/ */
acc->start_boot_ns = bpf_ktime_get_boot_ns();
/* Stored unchecked: a negative value is the helper's error code. */
acc->stack_id = bpf_get_stackid(ctx, &stackmap, BPF_F_USER_STACK);
/* _USER variant required: stmt points into the traced process's user
* memory, so the field load needs bpf_probe_read_user; plain BPF_CORE_READ
* uses bpf_probe_read_kernel and reads garbage. Offset still relocates. */
const char *zsql = BPF_CORE_READ_USER(stmt, zSql);
acc->zsql_ptr = (__u64)zsql;
if (!zsql) {
/* zSql not retained: recover the text from the prepare hook. */
struct prepared_sql *ps = bpf_map_lookup_elem(&prepared, &key);
if (ps) {
/* Map-to-map copy done with a helper; both buffers are MAX_SQL bytes. */
bpf_probe_read_kernel(&acc->sql, sizeof(acc->sql), &ps->sql);
acc->sql_state = SQLSTATE_FROM_PREPARE;
acc->read_ret = 0;
} else {
acc->sql_state = SQLSTATE_NULLPTR;
acc->read_ret = 0;
}
} else {
long r = bpf_probe_read_user_str(&acc->sql, sizeof(acc->sql), zsql);
acc->read_ret = (__s32)r;
acc->sql_state = r < 0 ? SQLSTATE_READERR : SQLSTATE_OK;
}
/* read_user_str returns bytes copied INCLUDING NUL, so length is r-1; the
* prepare-recovered path used a kernel copy (r==0) so sql_bytes stays 0. */
acc->sql_bytes = acc->read_ret > 0 ? (__u32)(acc->read_ret - 1) : 0;
short nvar = BPF_CORE_READ_USER(stmt, nVar);
struct sqlite3_value *aVar = BPF_CORE_READ_USER(stmt, aVar);
acc->nvar = nvar > 0 ? (__u32)nvar : 0;
acc->bound_bytes = 0;
acc->nvar_scanned = 0;
/* Scan only when there is an array, at least one parameter, and the loader
* supplied the real element size. */
if (aVar && nvar > 0 && mem_stride) {
__u32 trips = acc->nvar;
if (trips > MAX_VARS_SCAN)
trips = MAX_VARS_SCAN;
struct var_scan_ctx vc = {
.base = (char *)aVar,
.stride = mem_stride,
.nvar = trips,
.bound_bytes = 0,
.scanned = 0,
.stmt_ptr = key,
.emitted = 0,
};
bpf_loop(trips, var_scan_cb, &vc, 0);
acc->bound_bytes = vc.bound_bytes;
acc->nvar_scanned = vc.scanned;
}
/* btreeMask moved between versions (3.40: byte 200, 3.46: byte 204), so it
* doubles as a version discriminator: the right BTF reads the real bitmask,
* the wrong one reads a neighbouring field. */
acc->btreeMask = BPF_CORE_READ_USER(stmt, btreeMask);
/* Walk stmt->db->aDb[0].pBt->pBt->pPager->zFilename (aDb[0] = "main").
* Any broken link leaves db_path zero-seeded. */
struct sqlite3 *db = BPF_CORE_READ_USER(stmt, db);
if (db) {
struct Db *aDb = BPF_CORE_READ_USER(db, aDb);
if (aDb) {
struct Btree *bt = BPF_CORE_READ_USER(aDb, pBt);
const char *zf = NULL;
if (bt)
zf = BPF_CORE_READ_USER(bt, pBt, pPager, zFilename);
if (zf)
bpf_probe_read_user_str(&acc->db_path,
sizeof(acc->db_path), zf);
}
}
}
return 0;
}
/* Return probe for one step of a statement.
 *
 * Pairs with the entry probe through the calling thread's in-flight slot, adds
 * this step's duration to the statement's accumulator, and then either counts
 * a row (SQLITE_ROW) or, for any other return code, publishes one sql_event
 * and discards the accumulator. If the ring buffer is full the event is lost
 * but the accumulator is still discarded.
 *
 * rc: value returned by the step call.
 * Always returns 0. */
SEC("uretprobe")
int BPF_KRETPROBE(handle_step_ret, int rc)
{
    __u64 thread_key = bpf_get_current_pid_tgid();
    struct inflight_step *pending = bpf_map_lookup_elem(&inflight, &thread_key);

    if (!pending)
        return 0;

    /* Copy out of the slot before removing it. */
    __u64 stmt_key = pending->stmt;
    __u64 began_ns = pending->entry_ns;

    bpf_map_delete_elem(&inflight, &thread_key);

    struct step_acc *acc = bpf_map_lookup_elem(&accs, &stmt_key);

    if (!acc)
        return 0;

    acc->engine_ns += bpf_ktime_get_ns() - began_ns;

    /* More rows may follow: keep accumulating. */
    if (rc == SQLITE_ROW) {
        acc->rows++;
        return 0;
    }

    /* Non-ROW return ends the loop: emit one event, then drop the accumulator. */
    struct sql_event *ev = bpf_ringbuf_reserve(&rb, sizeof(*ev), 0);

    if (!ev)
        goto discard;

    ev->kind = REC_EVENT;
    fill_app_context(ev);

    /* Outcome of this execution. */
    ev->rc = rc;
    ev->rows = acc->rows;
    ev->ns = acc->engine_ns;
    ev->start_boot_ns = acc->start_boot_ns;

    /* Statement identity and what was captured on the first step. */
    ev->stmt_ptr = stmt_key;
    ev->zsql_ptr = acc->zsql_ptr;
    ev->btreeMask = acc->btreeMask;
    ev->stack_id = acc->stack_id;
    ev->sql_state = acc->sql_state;
    ev->read_ret = acc->read_ret;
    ev->sql_bytes = acc->sql_bytes;
    ev->nvar = acc->nvar;
    ev->nvar_scanned = acc->nvar_scanned;
    ev->bound_bytes = acc->bound_bytes;

    /* acc is in map memory; copy the buffers with a kernel read (the BPF
     * backend lacks a builtin memcpy of this size). */
    bpf_probe_read_kernel(&ev->sql, sizeof(ev->sql), &acc->sql);
    bpf_probe_read_kernel(&ev->db_path, sizeof(ev->db_path), &acc->db_path);

    bpf_ringbuf_submit(ev, 0);

discard:
    bpf_map_delete_elem(&accs, &stmt_key);
    return 0;
}