1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
|
// SPDX-License-Identifier: GPL-2.0
/*
* Augment the raw_syscalls tracepoints with the contents of the pointer arguments.
*
* Test it with:
*
* perf trace -e tools/perf/examples/bpf/augmented_raw_syscalls.c cat /etc/passwd > /dev/null
*
* This exactly matches what is marshalled into the raw_syscall:sys_enter
* payload expected by the 'perf trace' beautifiers.
*
* For now it just uses the existing tracepoint augmentation code in 'perf
* trace', in the next csets we'll hook up these with the sys_enter/sys_exit
* code that will combine entry/exit in a strace like way.
*/
#include <unistd.h>
#include <linux/limits.h>
#include <pid_filter.h>
/* bpf-output associated map */
bpf_map(__augmented_syscalls__, PERF_EVENT_ARRAY, int, u32, __NR_CPUS__);
struct syscall {
bool enabled;
};
bpf_map(syscalls, ARRAY, int, struct syscall, 512);
struct syscall_enter_args {
unsigned long long common_tp_fields;
long syscall_nr;
unsigned long args[6];
};
struct syscall_exit_args {
unsigned long long common_tp_fields;
long syscall_nr;
long ret;
};
struct augmented_filename {
unsigned int size;
int reserved;
char value[PATH_MAX];
};
/* syscalls where the first arg is a string */
#define SYS_OPEN 2
#define SYS_STAT 4
#define SYS_LSTAT 6
#define SYS_ACCESS 21
#define SYS_EXECVE 59
#define SYS_TRUNCATE 76
#define SYS_CHDIR 80
#define SYS_RENAME 82
#define SYS_MKDIR 83
#define SYS_RMDIR 84
#define SYS_CREAT 85
#define SYS_LINK 86
#define SYS_UNLINK 87
#define SYS_SYMLINK 88
#define SYS_READLINK 89
#define SYS_CHMOD 90
#define SYS_CHOWN 92
#define SYS_LCHOWN 94
#define SYS_MKNOD 133
#define SYS_STATFS 137
#define SYS_PIVOT_ROOT 155
#define SYS_CHROOT 161
#define SYS_ACCT 163
#define SYS_SWAPON 167
#define SYS_SWAPOFF 168
#define SYS_DELETE_MODULE 176
#define SYS_SETXATTR 188
#define SYS_LSETXATTR 189
#define SYS_GETXATTR 191
#define SYS_LGETXATTR 192
#define SYS_LISTXATTR 194
#define SYS_LLISTXATTR 195
#define SYS_REMOVEXATTR 197
#define SYS_LREMOVEXATTR 198
#define SYS_MQ_OPEN 240
#define SYS_MQ_UNLINK 241
#define SYS_ADD_KEY 248
#define SYS_REQUEST_KEY 249
#define SYS_SYMLINKAT 266
#define SYS_MEMFD_CREATE 319
/* syscalls where the second arg is a string */
#define SYS_PWRITE64 18
#define SYS_EXECVE 59
#define SYS_RENAME 82
#define SYS_QUOTACTL 179
#define SYS_FSETXATTR 190
#define SYS_FGETXATTR 193
#define SYS_FREMOVEXATTR 199
#define SYS_MQ_TIMEDSEND 242
#define SYS_REQUEST_KEY 249
#define SYS_INOTIFY_ADD_WATCH 254
#define SYS_OPENAT 257
#define SYS_MKDIRAT 258
#define SYS_MKNODAT 259
#define SYS_FCHOWNAT 260
#define SYS_FUTIMESAT 261
#define SYS_NEWFSTATAT 262
#define SYS_UNLINKAT 263
#define SYS_RENAMEAT 264
#define SYS_LINKAT 265
#define SYS_READLINKAT 267
#define SYS_FCHMODAT 268
#define SYS_FACCESSAT 269
#define SYS_UTIMENSAT 280
#define SYS_NAME_TO_HANDLE_AT 303
#define SYS_FINIT_MODULE 313
#define SYS_RENAMEAT2 316
#define SYS_EXECVEAT 322
#define SYS_STATX 332
#define SYS_MOVE_MOUNT 429
pid_filter(pids_filtered);
struct augmented_args_filename {
struct syscall_enter_args args;
struct augmented_filename filename;
};
bpf_map(augmented_filename_map, PERCPU_ARRAY, int, struct augmented_args_filename, 1);
SEC("raw_syscalls:sys_enter")
int sys_enter(struct syscall_enter_args *args)
{
struct augmented_args_filename *augmented_args;
unsigned int len = sizeof(*augmented_args);
const void *filename_arg = NULL;
struct syscall *syscall;
int key = 0;
augmented_args = bpf_map_lookup_elem(&augmented_filename_map, &key);
if (augmented_args == NULL)
return 1;
if (pid_filter__has(&pids_filtered, getpid()))
return 0;
probe_read(&augmented_args->args, sizeof(augmented_args->args), args);
syscall = bpf_map_lookup_elem(&syscalls, &augmented_args->args.syscall_nr);
if (syscall == NULL || !syscall->enabled)
return 0;
/*
* Yonghong and Edward Cree sayz:
*
* https://www.spinics.net/lists/netdev/msg531645.html
*
* >> R0=inv(id=0) R1=inv2 R6=ctx(id=0,off=0,imm=0) R7=inv64 R10=fp0,call_-1
* >> 10: (bf) r1 = r6
* >> 11: (07) r1 += 16
* >> 12: (05) goto pc+2
* >> 15: (79) r3 = *(u64 *)(r1 +0)
* >> dereference of modified ctx ptr R1 off=16 disallowed
* > Aha, we at least got a different error message this time.
* > And indeed llvm has done that optimisation, rather than the more obvious
* > 11: r3 = *(u64 *)(r1 +16)
* > because it wants to have lots of reads share a single insn. You may be able
* > to defeat that optimisation by adding compiler barriers, idk. Maybe someone
* > with llvm knowledge can figure out how to stop it (ideally, llvm would know
* > when it's generating for bpf backend and not do that). -O0? ¯\_(ツ)_/¯
*
* The optimization mostly likes below:
*
* br1:
* ...
* r1 += 16
* goto merge
* br2:
* ...
* r1 += 20
* goto merge
* merge:
* *(u64 *)(r1 + 0)
*
* The compiler tries to merge common loads. There is no easy way to
* stop this compiler optimization without turning off a lot of other
* optimizations. The easiest way is to add barriers:
*
* __asm__ __volatile__("": : :"memory")
*
* after the ctx memory access to prevent their down stream merging.
*/
/*
* This table of what args are strings will be provided by userspace,
* in the syscalls map, i.e. we will already have to do the lookup to
* see if this specific syscall is filtered, so we can as well get more
* info about what syscall args are strings or pointers, and how many
* bytes to copy, per arg, etc.
*
* For now hard code it, till we have all the basic mechanisms in place
* to automate everything and make the kernel part be completely driven
* by information obtained in userspace for each kernel version and
* processor architecture, making the kernel part the same no matter what
* kernel version or processor architecture it runs on.
*/
switch (augmented_args->args.syscall_nr) {
case SYS_ACCT:
case SYS_ADD_KEY:
case SYS_CHDIR:
case SYS_CHMOD:
case SYS_CHOWN:
case SYS_CHROOT:
case SYS_CREAT:
case SYS_DELETE_MODULE:
case SYS_EXECVE:
case SYS_GETXATTR:
case SYS_LCHOWN:
case SYS_LGETXATTR:
case SYS_LINK:
case SYS_LISTXATTR:
case SYS_LLISTXATTR:
case SYS_LREMOVEXATTR:
case SYS_LSETXATTR:
case SYS_LSTAT:
case SYS_MEMFD_CREATE:
case SYS_MKDIR:
case SYS_MKNOD:
case SYS_MQ_OPEN:
case SYS_MQ_UNLINK:
case SYS_PIVOT_ROOT:
case SYS_READLINK:
case SYS_REMOVEXATTR:
case SYS_RENAME:
case SYS_REQUEST_KEY:
case SYS_RMDIR:
case SYS_SETXATTR:
case SYS_STAT:
case SYS_STATFS:
case SYS_SWAPOFF:
case SYS_SWAPON:
case SYS_SYMLINK:
case SYS_SYMLINKAT:
case SYS_TRUNCATE:
case SYS_UNLINK:
case SYS_ACCESS:
case SYS_OPEN: filename_arg = (const void *)args->args[0];
__asm__ __volatile__("": : :"memory");
break;
case SYS_EXECVEAT:
case SYS_FACCESSAT:
case SYS_FCHMODAT:
case SYS_FCHOWNAT:
case SYS_FGETXATTR:
case SYS_FINIT_MODULE:
case SYS_FREMOVEXATTR:
case SYS_FSETXATTR:
case SYS_FUTIMESAT:
case SYS_INOTIFY_ADD_WATCH:
case SYS_LINKAT:
case SYS_MKDIRAT:
case SYS_MKNODAT:
// case SYS_MOVE_MOUNT:
// For now don't copy move_mount first string arg, as it has two and
// 'perf trace's syscall_arg__scnprintf_filename() will use the one
// copied here, the first, for both args, duplicating the first and
// ignoring the second.
//
// We need to copy both here and make syscall_arg__scnprintf_filename
// skip the first when reading the second, using the size of the first, etc.
// Shouldn't be difficult, but now its perf/urgent time, lets wait for
// the next devel window.
case SYS_MQ_TIMEDSEND:
case SYS_NAME_TO_HANDLE_AT:
case SYS_NEWFSTATAT:
case SYS_PWRITE64:
case SYS_QUOTACTL:
case SYS_READLINKAT:
case SYS_RENAMEAT:
case SYS_RENAMEAT2:
case SYS_STATX:
case SYS_UNLINKAT:
case SYS_UTIMENSAT:
case SYS_OPENAT: filename_arg = (const void *)args->args[1];
break;
}
if (filename_arg != NULL) {
augmented_args->filename.reserved = 0;
augmented_args->filename.size = probe_read_str(&augmented_args->filename.value,
sizeof(augmented_args->filename.value),
filename_arg);
if (augmented_args->filename.size < sizeof(augmented_args->filename.value)) {
len -= sizeof(augmented_args->filename.value) - augmented_args->filename.size;
len &= sizeof(augmented_args->filename.value) - 1;
}
} else {
len = sizeof(augmented_args->args);
}
/* If perf_event_output fails, return non-zero so that it gets recorded unaugmented */
return perf_event_output(args, &__augmented_syscalls__, BPF_F_CURRENT_CPU, augmented_args, len);
}
SEC("raw_syscalls:sys_exit")
int sys_exit(struct syscall_exit_args *args)
{
struct syscall_exit_args exit_args;
struct syscall *syscall;
if (pid_filter__has(&pids_filtered, getpid()))
return 0;
probe_read(&exit_args, sizeof(exit_args), args);
syscall = bpf_map_lookup_elem(&syscalls, &exit_args.syscall_nr);
if (syscall == NULL || !syscall->enabled)
return 0;
return 1;
}
license(GPL);
|