tools/perf/examples/bpf/augmented_raw_syscalls.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321

// SPDX-License-Identifier: GPL-2.0
/*
 * Augment the raw_syscalls tracepoints with the contents of the pointer arguments.
 *
 * Test it with:
 *
 * perf trace -e tools/perf/examples/bpf/augmented_raw_syscalls.c cat /etc/passwd > /dev/null
 *
 * This exactly matches what is marshalled into the raw_syscall:sys_enter
 * payload expected by the 'perf trace' beautifiers.
 *
 * For now it just uses the existing tracepoint augmentation code in 'perf
 * trace', in the next csets we'll hook up these with the sys_enter/sys_exit
 * code that will combine entry/exit in a strace like way.
 */

#include <unistd.h>
#include <linux/limits.h>
#include <pid_filter.h>

/* bpf-output associated map */
bpf_map(__augmented_syscalls__, PERF_EVENT_ARRAY, int, u32, __NR_CPUS__);

struct syscall {
	bool	enabled;
};

bpf_map(syscalls, ARRAY, int, struct syscall, 512);

struct syscall_enter_args {
	unsigned long long common_tp_fields;
	long		   syscall_nr;
	unsigned long	   args[6];
};

struct syscall_exit_args {
	unsigned long long common_tp_fields;
	long		   syscall_nr;
	long		   ret;
};

struct augmented_filename {
	unsigned int	size;
	int		reserved;
	char		value[PATH_MAX];
};

/* syscalls where the first arg is a string */
#define SYS_OPEN                 2
#define SYS_STAT                 4
#define SYS_LSTAT                6
#define SYS_ACCESS              21
#define SYS_EXECVE              59
#define SYS_TRUNCATE            76
#define SYS_CHDIR               80
#define SYS_RENAME              82
#define SYS_MKDIR               83
#define SYS_RMDIR               84
#define SYS_CREAT               85
#define SYS_LINK                86
#define SYS_UNLINK              87
#define SYS_SYMLINK             88
#define SYS_READLINK            89
#define SYS_CHMOD               90
#define SYS_CHOWN               92
#define SYS_LCHOWN              94
#define SYS_MKNOD              133
#define SYS_STATFS             137
#define SYS_PIVOT_ROOT         155
#define SYS_CHROOT             161
#define SYS_ACCT               163
#define SYS_SWAPON             167
#define SYS_SWAPOFF            168
#define SYS_DELETE_MODULE      176
#define SYS_SETXATTR           188
#define SYS_LSETXATTR          189
#define SYS_GETXATTR           191
#define SYS_LGETXATTR          192
#define SYS_LISTXATTR          194
#define SYS_LLISTXATTR         195
#define SYS_REMOVEXATTR        197
#define SYS_LREMOVEXATTR       198
#define SYS_MQ_OPEN            240
#define SYS_MQ_UNLINK          241
#define SYS_ADD_KEY            248
#define SYS_REQUEST_KEY        249
#define SYS_SYMLINKAT          266
#define SYS_MEMFD_CREATE       319

/* syscalls where the second arg is a string */

#define SYS_PWRITE64            18
#define SYS_EXECVE              59
#define SYS_RENAME              82
#define SYS_QUOTACTL           179
#define SYS_FSETXATTR          190
#define SYS_FGETXATTR          193
#define SYS_FREMOVEXATTR       199
#define SYS_MQ_TIMEDSEND       242
#define SYS_REQUEST_KEY        249
#define SYS_INOTIFY_ADD_WATCH  254
#define SYS_OPENAT             257
#define SYS_MKDIRAT            258
#define SYS_MKNODAT            259
#define SYS_FCHOWNAT           260
#define SYS_FUTIMESAT          261
#define SYS_NEWFSTATAT         262
#define SYS_UNLINKAT           263
#define SYS_RENAMEAT           264
#define SYS_LINKAT             265
#define SYS_READLINKAT         267
#define SYS_FCHMODAT           268
#define SYS_FACCESSAT          269
#define SYS_UTIMENSAT          280
#define SYS_NAME_TO_HANDLE_AT  303
#define SYS_FINIT_MODULE       313
#define SYS_RENAMEAT2          316
#define SYS_EXECVEAT           322
#define SYS_STATX              332
#define SYS_MOVE_MOUNT         429

pid_filter(pids_filtered);

struct augmented_args_filename {
       struct syscall_enter_args args;
       struct augmented_filename filename;
};

bpf_map(augmented_filename_map, PERCPU_ARRAY, int, struct augmented_args_filename, 1);

SEC("raw_syscalls:sys_enter")
int sys_enter(struct syscall_enter_args *args)
{
	struct augmented_args_filename *augmented_args;
	unsigned int len = sizeof(*augmented_args);
	const void *filename_arg = NULL;
	struct syscall *syscall;
	int key = 0;

        augmented_args = bpf_map_lookup_elem(&augmented_filename_map, &key);
        if (augmented_args == NULL)
                return 1;

	if (pid_filter__has(&pids_filtered, getpid()))
		return 0;

	probe_read(&augmented_args->args, sizeof(augmented_args->args), args);

	syscall = bpf_map_lookup_elem(&syscalls, &augmented_args->args.syscall_nr);
	if (syscall == NULL || !syscall->enabled)
		return 0;
	/*
	 * Yonghong and Edward Cree sayz:
	 *
	 * https://www.spinics.net/lists/netdev/msg531645.html
	 *
	 * >>   R0=inv(id=0) R1=inv2 R6=ctx(id=0,off=0,imm=0) R7=inv64 R10=fp0,call_-1
	 * >> 10: (bf) r1 = r6
	 * >> 11: (07) r1 += 16
	 * >> 12: (05) goto pc+2
	 * >> 15: (79) r3 = *(u64 *)(r1 +0)
	 * >> dereference of modified ctx ptr R1 off=16 disallowed
	 * > Aha, we at least got a different error message this time.
	 * > And indeed llvm has done that optimisation, rather than the more obvious
	 * > 11: r3 = *(u64 *)(r1 +16)
	 * > because it wants to have lots of reads share a single insn.  You may be able
	 * > to defeat that optimisation by adding compiler barriers, idk.  Maybe someone
	 * > with llvm knowledge can figure out how to stop it (ideally, llvm would know
	 * > when it's generating for bpf backend and not do that).  -O0?  ¯\_(ツ)_/¯
	 *
	 * The optimization mostly likes below:
	 *
	 *	br1:
	 * 	...
	 *	r1 += 16
	 *	goto merge
	 *	br2:
	 *	...
	 *	r1 += 20
	 *	goto merge
	 *	merge:
	 *	*(u64 *)(r1 + 0)
	 *
	 * The compiler tries to merge common loads. There is no easy way to
	 * stop this compiler optimization without turning off a lot of other
	 * optimizations. The easiest way is to add barriers:
	 *
	 * 	 __asm__ __volatile__("": : :"memory")
	 *
	 * 	 after the ctx memory access to prevent their down stream merging.
	 */
	/*
	 * This table of what args are strings will be provided by userspace,
	 * in the syscalls map, i.e. we will already have to do the lookup to
	 * see if this specific syscall is filtered, so we can as well get more
	 * info about what syscall args are strings or pointers, and how many
	 * bytes to copy, per arg, etc.
	 *
	 * For now hard code it, till we have all the basic mechanisms in place
	 * to automate everything and make the kernel part be completely driven
	 * by information obtained in userspace for each kernel version and
	 * processor architecture, making the kernel part the same no matter what
	 * kernel version or processor architecture it runs on.
	 */
	switch (augmented_args->args.syscall_nr) {
	case SYS_ACCT:
	case SYS_ADD_KEY:
	case SYS_CHDIR:
	case SYS_CHMOD:
	case SYS_CHOWN:
	case SYS_CHROOT:
	case SYS_CREAT:
	case SYS_DELETE_MODULE:
	case SYS_EXECVE:
	case SYS_GETXATTR:
	case SYS_LCHOWN:
	case SYS_LGETXATTR:
	case SYS_LINK:
	case SYS_LISTXATTR:
	case SYS_LLISTXATTR:
	case SYS_LREMOVEXATTR:
	case SYS_LSETXATTR:
	case SYS_LSTAT:
	case SYS_MEMFD_CREATE:
	case SYS_MKDIR:
	case SYS_MKNOD:
	case SYS_MQ_OPEN:
	case SYS_MQ_UNLINK:
	case SYS_PIVOT_ROOT:
	case SYS_READLINK:
	case SYS_REMOVEXATTR:
	case SYS_RENAME:
	case SYS_REQUEST_KEY:
	case SYS_RMDIR:
	case SYS_SETXATTR:
	case SYS_STAT:
	case SYS_STATFS:
	case SYS_SWAPOFF:
	case SYS_SWAPON:
	case SYS_SYMLINK:
	case SYS_SYMLINKAT:
	case SYS_TRUNCATE:
	case SYS_UNLINK:
	case SYS_ACCESS:
	case SYS_OPEN:	 filename_arg = (const void *)args->args[0];
			__asm__ __volatile__("": : :"memory");
			 break;
	case SYS_EXECVEAT:
	case SYS_FACCESSAT:
	case SYS_FCHMODAT:
	case SYS_FCHOWNAT:
	case SYS_FGETXATTR:
	case SYS_FINIT_MODULE:
	case SYS_FREMOVEXATTR:
	case SYS_FSETXATTR:
	case SYS_FUTIMESAT:
	case SYS_INOTIFY_ADD_WATCH:
	case SYS_LINKAT:
	case SYS_MKDIRAT:
	case SYS_MKNODAT:
	// case SYS_MOVE_MOUNT:
	// For now don't copy move_mount first string arg, as it has two and
	// 'perf trace's syscall_arg__scnprintf_filename() will use the one
	// copied here, the first, for both args, duplicating the first and
	// ignoring the second.
	//
	// We need to copy both here and make syscall_arg__scnprintf_filename
	// skip the first when reading the second, using the size of the first, etc.
	// Shouldn't be difficult, but now its perf/urgent time, lets wait for
	// the next devel window.
	case SYS_MQ_TIMEDSEND:
	case SYS_NAME_TO_HANDLE_AT:
	case SYS_NEWFSTATAT:
	case SYS_PWRITE64:
	case SYS_QUOTACTL:
	case SYS_READLINKAT:
	case SYS_RENAMEAT:
	case SYS_RENAMEAT2:
	case SYS_STATX:
	case SYS_UNLINKAT:
	case SYS_UTIMENSAT:
	case SYS_OPENAT: filename_arg = (const void *)args->args[1];
			 break;
	}

	if (filename_arg != NULL) {
		augmented_args->filename.reserved = 0;
		augmented_args->filename.size = probe_read_str(&augmented_args->filename.value,
							      sizeof(augmented_args->filename.value),
							      filename_arg);
		if (augmented_args->filename.size < sizeof(augmented_args->filename.value)) {
			len -= sizeof(augmented_args->filename.value) - augmented_args->filename.size;
			len &= sizeof(augmented_args->filename.value) - 1;
		}
	} else {
		len = sizeof(augmented_args->args);
	}

	/* If perf_event_output fails, return non-zero so that it gets recorded unaugmented */
	return perf_event_output(args, &__augmented_syscalls__, BPF_F_CURRENT_CPU, augmented_args, len);
}

SEC("raw_syscalls:sys_exit")
int sys_exit(struct syscall_exit_args *args)
{
	struct syscall_exit_args exit_args;
	struct syscall *syscall;

	if (pid_filter__has(&pids_filtered, getpid()))
		return 0;

	probe_read(&exit_args, sizeof(exit_args), args);

	syscall = bpf_map_lookup_elem(&syscalls, &exit_args.syscall_nr);
	if (syscall == NULL || !syscall->enabled)
		return 0;

	return 1;
}

license(GPL);