11 files changed, 322 insertions, 107 deletions
diff --git a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
index 0acbd74e8c76..b2f17cca014b 100644
--- a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
+++ b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
@@ -7,9 +7,14 @@
  */
 
 #include "vmlinux.h"
+#include "../trace_augment.h"
+
 #include <bpf/bpf_helpers.h>
 #include <linux/limits.h>
 
+#define PERF_ALIGN(x, a)        __PERF_ALIGN_MASK(x, (typeof(x))(a)-1) /* round x up to a multiple of a; the mask arithmetic is only correct when a is a power of two */
+#define __PERF_ALIGN_MASK(x, mask)      (((x)+(mask))&~(mask)) /* add the mask, then clear the low bits it covers */
+
 /**
  * is_power_of_2() - check if a value is a power of two
  * @n: the value to check
@@ -66,19 +71,6 @@ struct syscall_exit_args {
 	long		   ret;
 };
 
-struct augmented_arg {
-	unsigned int	size;
-	int		err;
-	char		value[PATH_MAX];
-};
-
-struct pids_filtered {
-	__uint(type, BPF_MAP_TYPE_HASH);
-	__type(key, pid_t);
-	__type(value, bool);
-	__uint(max_entries, 64);
-} pids_filtered SEC(".maps");
-
 /*
  * Desired design of maximum size and alignment (see RFC2553)
  */
@@ -105,17 +97,27 @@ struct sockaddr_storage {
 	};
 };
 
-struct augmented_args_payload {
-       struct syscall_enter_args args;
-       union {
-		struct {
-			struct augmented_arg arg, arg2;
-		};
+struct augmented_arg {
+	unsigned int	size;
+	int		err;
+	union {
+		char   value[PATH_MAX];
 		struct sockaddr_storage saddr;
-		char   __data[sizeof(struct augmented_arg)];
 	};
 };
 
+struct pids_filtered { /* hash map keyed by pid, consulted through pid_filter__has() */
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__type(key, pid_t);
+	__type(value, bool); /* pid_filter__has() only tests that an entry exists; it does not read this value */
+	__uint(max_entries, 64);
+} pids_filtered SEC(".maps");
+
+struct augmented_args_payload { /* record the per-syscall handlers hand to augmented__output() */
+	struct syscall_enter_args args; /* tracepoint record; carries syscall_nr and args[] */
+	struct augmented_arg arg, arg2; // We have to reserve space for two arguments (rename, etc)
+};
+
 // We need more tmp space than the BPF stack can give us
 struct augmented_args_tmp {
 	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
@@ -124,6 +126,25 @@ struct augmented_args_tmp {
 	__uint(max_entries, 1);
 } augmented_args_tmp SEC(".maps");
 
+struct beauty_map_enter { /* per-syscall description of which arguments augment_sys_enter() copies; presumably filled by user space - confirm */
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__type(key, int); /* syscall number */
+	__type(value, __u32[6]); /* one descriptor per argument, read back as a signed int: 0 = skip, 1 = string, other positive = struct size, -1..-6 = buffer */
+	__uint(max_entries, 512);
+} beauty_map_enter SEC(".maps");
+
+struct beauty_payload_enter { /* output record assembled by augment_sys_enter() */
+	struct syscall_enter_args args; /* copy of the sys_enter header */
+	struct augmented_arg aug_args[6]; /* room for six arguments; entries are written packed back to back, not at array-element offsets */
+};
+
+struct beauty_payload_enter_map { /* single-entry per-CPU array holding one struct beauty_payload_enter */
+	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+	__type(key, int); /* augment_sys_enter() always looks this up with key 0 */
+	__type(value, struct beauty_payload_enter);
+	__uint(max_entries, 1);
+} beauty_payload_enter_map SEC(".maps");
+
 static inline struct augmented_args_payload *augmented_args_payload(void)
 {
 	int key = 0;
@@ -136,6 +157,11 @@ static inline int augmented__output(void *ctx, struct augmented_args_payload *ar
 	return bpf_perf_event_output(ctx, &__augmented_syscalls__, BPF_F_CURRENT_CPU, args, len);
 }
 
+static inline int augmented__beauty_output(void *ctx, void *data, int len) /* emit len bytes of data on the same perf event map augmented__output() uses */
+{
+	return bpf_perf_event_output(ctx, &__augmented_syscalls__, BPF_F_CURRENT_CPU, data, len); /* the helper's return value is passed straight through */
+}
+
 static inline
 unsigned int augmented_arg__read_str(struct augmented_arg *augmented_arg, const void *arg, unsigned int arg_len)
 {
@@ -182,15 +208,17 @@ int sys_enter_connect(struct syscall_enter_args *args)
 	struct augmented_args_payload *augmented_args = augmented_args_payload();
 	const void *sockaddr_arg = (const void *)args->args[1];
 	unsigned int socklen = args->args[2];
-	unsigned int len = sizeof(augmented_args->args);
+	unsigned int len = sizeof(u64) + sizeof(augmented_args->args); // the size + err in all 'augmented_arg' structs
 
         if (augmented_args == NULL)
                 return 1; /* Failure: don't filter */
 
-	_Static_assert(is_power_of_2(sizeof(augmented_args->saddr)), "sizeof(augmented_args->saddr) needs to be a power of two");
-	socklen &= sizeof(augmented_args->saddr) - 1;
+	_Static_assert(is_power_of_2(sizeof(augmented_args->arg.saddr)), "sizeof(augmented_args->arg.saddr) needs to be a power of two");
+	socklen &= sizeof(augmented_args->arg.saddr) - 1;
 
-	bpf_probe_read_user(&augmented_args->saddr, socklen, sockaddr_arg);
+	bpf_probe_read_user(&augmented_args->arg.saddr, socklen, sockaddr_arg);
+	augmented_args->arg.size = socklen;
+	augmented_args->arg.err = 0;
 
 	return augmented__output(args, augmented_args, len + socklen);
 }
@@ -201,14 +229,14 @@ int sys_enter_sendto(struct syscall_enter_args *args)
 	struct augmented_args_payload *augmented_args = augmented_args_payload();
 	const void *sockaddr_arg = (const void *)args->args[4];
 	unsigned int socklen = args->args[5];
-	unsigned int len = sizeof(augmented_args->args);
+	unsigned int len = sizeof(u64) + sizeof(augmented_args->args); // the size + err in all 'augmented_arg' structs
 
         if (augmented_args == NULL)
                 return 1; /* Failure: don't filter */
 
-	socklen &= sizeof(augmented_args->saddr) - 1;
+	socklen &= sizeof(augmented_args->arg.saddr) - 1;
 
-	bpf_probe_read_user(&augmented_args->saddr, socklen, sockaddr_arg);
+	bpf_probe_read_user(&augmented_args->arg.saddr, socklen, sockaddr_arg);
 
 	return augmented__output(args, augmented_args, len + socklen);
 }
@@ -249,30 +277,50 @@ int sys_enter_rename(struct syscall_enter_args *args)
 	struct augmented_args_payload *augmented_args = augmented_args_payload();
 	const void *oldpath_arg = (const void *)args->args[0],
 		   *newpath_arg = (const void *)args->args[1];
-	unsigned int len = sizeof(augmented_args->args), oldpath_len;
+	unsigned int len = sizeof(augmented_args->args), oldpath_len, newpath_len;
 
         if (augmented_args == NULL)
                 return 1; /* Failure: don't filter */
 
+	len += 2 * sizeof(u64); // The overhead of size and err, just before the payload...
+
 	oldpath_len = augmented_arg__read_str(&augmented_args->arg, oldpath_arg, sizeof(augmented_args->arg.value));
-	len += oldpath_len + augmented_arg__read_str((void *)(&augmented_args->arg) + oldpath_len, newpath_arg, sizeof(augmented_args->arg.value));
+	augmented_args->arg.size = PERF_ALIGN(oldpath_len + 1, sizeof(u64));
+	len += augmented_args->arg.size;
+
+	struct augmented_arg *arg2 = (void *)&augmented_args->arg.value + augmented_args->arg.size;
+
+	newpath_len = augmented_arg__read_str(arg2, newpath_arg, sizeof(augmented_args->arg.value));
+	arg2->size = newpath_len;
+
+	len += newpath_len;
 
 	return augmented__output(args, augmented_args, len);
 }
 
-SEC("tp/syscalls/sys_enter_renameat")
-int sys_enter_renameat(struct syscall_enter_args *args)
+SEC("tp/syscalls/sys_enter_renameat2")
+int sys_enter_renameat2(struct syscall_enter_args *args)
 {
 	struct augmented_args_payload *augmented_args = augmented_args_payload();
 	const void *oldpath_arg = (const void *)args->args[1],
 		   *newpath_arg = (const void *)args->args[3];
-	unsigned int len = sizeof(augmented_args->args), oldpath_len;
+	unsigned int len = sizeof(augmented_args->args), oldpath_len, newpath_len;
 
         if (augmented_args == NULL)
                 return 1; /* Failure: don't filter */
 
+	len += 2 * sizeof(u64); // The overhead of size and err, just before the payload...
+
 	oldpath_len = augmented_arg__read_str(&augmented_args->arg, oldpath_arg, sizeof(augmented_args->arg.value));
-	len += oldpath_len + augmented_arg__read_str((void *)(&augmented_args->arg) + oldpath_len, newpath_arg, sizeof(augmented_args->arg.value));
+	augmented_args->arg.size = PERF_ALIGN(oldpath_len + 1, sizeof(u64));
+	len += augmented_args->arg.size;
+
+	struct augmented_arg *arg2 = (void *)&augmented_args->arg.value + augmented_args->arg.size;
+
+	newpath_len = augmented_arg__read_str(arg2, newpath_arg, sizeof(augmented_args->arg.value));
+	arg2->size = newpath_len;
+
+	len += newpath_len;
 
 	return augmented__output(args, augmented_args, len);
 }
@@ -293,26 +341,26 @@ int sys_enter_perf_event_open(struct syscall_enter_args *args)
 {
 	struct augmented_args_payload *augmented_args = augmented_args_payload();
 	const struct perf_event_attr_size *attr = (const struct perf_event_attr_size *)args->args[0], *attr_read;
-	unsigned int len = sizeof(augmented_args->args);
+	unsigned int len = sizeof(u64) + sizeof(augmented_args->args); // the size + err in all 'augmented_arg' structs
 
         if (augmented_args == NULL)
 		goto failure;
 
-	if (bpf_probe_read_user(&augmented_args->__data, sizeof(*attr), attr) < 0)
+	if (bpf_probe_read_user(&augmented_args->arg.value, sizeof(*attr), attr) < 0)
 		goto failure;
 
-	attr_read = (const struct perf_event_attr_size *)augmented_args->__data;
+	attr_read = (const struct perf_event_attr_size *)augmented_args->arg.value;
 
 	__u32 size = attr_read->size;
 
 	if (!size)
 		size = PERF_ATTR_SIZE_VER0;
 
-	if (size > sizeof(augmented_args->__data))
+	if (size > sizeof(augmented_args->arg.value))
                 goto failure;
 
 	// Now that we read attr->size and tested it against the size limits, read it completely
-	if (bpf_probe_read_user(&augmented_args->__data, size, attr) < 0)
+	if (bpf_probe_read_user(&augmented_args->arg.value, size, attr) < 0)
 		goto failure;
 
 	return augmented__output(args, augmented_args, len + size);
@@ -325,16 +373,16 @@ int sys_enter_clock_nanosleep(struct syscall_enter_args *args)
 {
 	struct augmented_args_payload *augmented_args = augmented_args_payload();
 	const void *rqtp_arg = (const void *)args->args[2];
-	unsigned int len = sizeof(augmented_args->args);
+	unsigned int len = sizeof(u64) + sizeof(augmented_args->args); // the size + err in all 'augmented_arg' structs
 	__u32 size = sizeof(struct timespec64);
 
         if (augmented_args == NULL)
 		goto failure;
 
-	if (size > sizeof(augmented_args->__data))
+	if (size > sizeof(augmented_args->arg.value))
                 goto failure;
 
-	bpf_probe_read_user(&augmented_args->__data, size, rqtp_arg);
+	bpf_probe_read_user(&augmented_args->arg.value, size, rqtp_arg);
 
 	return augmented__output(args, augmented_args, len + size);
 failure:
@@ -352,10 +400,10 @@ int sys_enter_nanosleep(struct syscall_enter_args *args)
         if (augmented_args == NULL)
 		goto failure;
 
-	if (size > sizeof(augmented_args->__data))
+	if (size > sizeof(augmented_args->arg.value))
                 goto failure;
 
-	bpf_probe_read_user(&augmented_args->__data, size, req_arg);
+	bpf_probe_read_user(&augmented_args->arg.value, size, req_arg);
 
 	return augmented__output(args, augmented_args, len + size);
 failure:
@@ -372,6 +420,91 @@ static bool pid_filter__has(struct pids_filtered *pids, pid_t pid)
 	return bpf_map_lookup_elem(pids, &pid) != NULL;
 }
 
+static int augment_sys_enter(void *ctx, struct syscall_enter_args *args) /* generic, beauty_map-driven augmentation of one sys_enter record */
+{
+	bool augmented, do_output = false;
+	int zero = 0, size, aug_size, index, output = 0,
+	    value_size = sizeof(struct augmented_arg) - offsetof(struct augmented_arg, value); /* bytes available after the size/err header */
+	unsigned int nr, *beauty_map;
+	struct beauty_payload_enter *payload;
+	void *arg, *payload_offset;
+	/* Returns 1 when nothing was emitted, otherwise the output helper's result; any non-zero value makes sys_enter() try bpf_tail_call(). */
+	/* fall back to the predefined tail call */
+	if (args == NULL)
+		return 1;
+
+	/* use syscall number to get beauty_map entry */
+	nr             = (__u32)args->syscall_nr;
+	beauty_map     = bpf_map_lookup_elem(&beauty_map_enter, &nr);
+
+	/* set up payload for output */
+	payload        = bpf_map_lookup_elem(&beauty_payload_enter_map, &zero);
+	payload_offset = (void *)&payload->aug_args; /* address computation only; payload is NULL-checked below before any access */
+
+	if (beauty_map == NULL || payload == NULL)
+		return 1;
+
+	/* copy the sys_enter header, which has the syscall_nr */
+	__builtin_memcpy(&payload->args, args, sizeof(struct syscall_enter_args));
+
+	/*
+	 * Determine the type of each argument and how many bytes to read from user space, using
+	 * the value in the beauty_map. The mapping from parameter type to beauty_map value, and
+	 * to the number of bytes eventually read, is (index of paired len is 0-based):
+	 *
+	 * string: 1			      -> size of string
+	 * struct: size of struct	      -> size of struct
+	 * buffer: -(index of paired len + 1) -> value of paired len (maximum: TRACE_AUG_MAX_BUF)
+	 */
+	for (int i = 0; i < 6; i++) {
+		arg = (void *)args->args[i];
+		augmented = false;
+		size = beauty_map[i]; /* stored as __u32, read back as a signed int so buffer entries come out negative */
+		aug_size = size; /* size of the augmented data read from user space */
+
+		if (size == 0 || arg == NULL) /* nothing to augment for this argument */
+			continue;
+
+		if (size == 1) { /* string */
+			aug_size = bpf_probe_read_user_str(((struct augmented_arg *)payload_offset)->value, value_size, arg);
+			/* minimum of 0 to pass the verifier */
+			if (aug_size < 0)
+				aug_size = 0;
+
+			augmented = true; /* a failed string read is still emitted, with size 0 */
+		} else if (size > 0 && size <= value_size) { /* struct */
+			if (!bpf_probe_read_user(((struct augmented_arg *)payload_offset)->value, size, arg))
+				augmented = true;
+		} else if (size < 0 && size >= -6) { /* buffer */
+			index = -(size + 1); /* -1 -> args[0] ... -6 -> args[5] */
+			aug_size = args->args[index];
+
+			if (aug_size > TRACE_AUG_MAX_BUF)
+				aug_size = TRACE_AUG_MAX_BUF;
+
+			if (aug_size > 0) {
+				if (!bpf_probe_read_user(((struct augmented_arg *)payload_offset)->value, aug_size, arg))
+					augmented = true;
+			}
+		}
+
+		/* write data to payload */
+		if (augmented) {
+			int written = offsetof(struct augmented_arg, value) + aug_size;
+
+			((struct augmented_arg *)payload_offset)->size = aug_size; /* NOTE(review): ->err is not written on this path */
+			output += written;
+			payload_offset += written; /* the next argument is packed right after this one */
+			do_output = true;
+		}
+	}
+
+	if (!do_output)
+		return 1;
+
+	return augmented__beauty_output(ctx, payload, sizeof(struct syscall_enter_args) + output);
+}
+
 SEC("tp/raw_syscalls/sys_enter")
 int sys_enter(struct syscall_enter_args *args)
 {
@@ -400,7 +533,8 @@ int sys_enter(struct syscall_enter_args *args)
 	 * "!raw_syscalls:unaugmented" that will just return 1 to return the
 	 * unaugmented tracepoint payload.
 	 */
-	bpf_tail_call(args, &syscalls_sys_enter, augmented_args->args.syscall_nr);
+	if (augment_sys_enter(args, &augmented_args->args))
+		bpf_tail_call(args, &syscalls_sys_enter, augmented_args->args.syscall_nr);
 
 	// If not found on the PROG_ARRAY syscalls map, then we're filtering it:
 	return 0;
diff --git a/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c b/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c
index 6a438e0102c5..57cab7647a9a 100644
--- a/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c
+++ b/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c
@@ -57,9 +57,9 @@ struct cgroup___old {
 
 const volatile __u32 num_events = 1;
 const volatile __u32 num_cpus = 1;
+const volatile int use_cgroup_v2 = 0;
 
 int enabled = 0;
-int use_cgroup_v2 = 0;
 int perf_subsys_id = -1;
 
 static inline __u64 get_cgroup_v1_ancestor_id(struct cgroup *cgrp, int level)
diff --git a/tools/perf/util/bpf_skel/func_latency.bpf.c b/tools/perf/util/bpf_skel/func_latency.bpf.c
index 9d01e3af7479..f613dc9cb123 100644
--- a/tools/perf/util/bpf_skel/func_latency.bpf.c
+++ b/tools/perf/util/bpf_skel/func_latency.bpf.c
@@ -37,9 +37,10 @@ struct {
 
 
 int enabled = 0;
-int has_cpu = 0;
-int has_task = 0;
-int use_nsec = 0;
+
+const volatile int has_cpu = 0;
+const volatile int has_task = 0;
+const volatile int use_nsec = 0;
 
 SEC("kprobe/func")
 int BPF_PROG(func_begin)
diff --git a/tools/perf/util/bpf_skel/kwork_top.bpf.c b/tools/perf/util/bpf_skel/kwork_top.bpf.c
index 84c15ccbab44..594da91965a2 100644
--- a/tools/perf/util/bpf_skel/kwork_top.bpf.c
+++ b/tools/perf/util/bpf_skel/kwork_top.bpf.c
@@ -84,7 +84,7 @@ struct {
 
 int enabled = 0;
 
-int has_cpu_filter = 0;
+const volatile int has_cpu_filter = 0;
 
 __u64 from_timestamp = 0;
 __u64 to_timestamp = 0;
diff --git a/tools/perf/util/bpf_skel/kwork_trace.bpf.c b/tools/perf/util/bpf_skel/kwork_trace.bpf.c
index 063c124e0999..cbd79bc4b330 100644
--- a/tools/perf/util/bpf_skel/kwork_trace.bpf.c
+++ b/tools/perf/util/bpf_skel/kwork_trace.bpf.c
@@ -68,8 +68,9 @@ struct {
 } perf_kwork_name_filter SEC(".maps");
 
 int enabled = 0;
-int has_cpu_filter = 0;
-int has_name_filter = 0;
+
+const volatile int has_cpu_filter = 0;
+const volatile int has_name_filter = 0;
 
 static __always_inline int local_strncmp(const char *s1,
 					 unsigned int sz, const char *s2)
diff --git a/tools/perf/util/bpf_skel/lock_contention.bpf.c b/tools/perf/util/bpf_skel/lock_contention.bpf.c
index d931a898c434..1069bda5d733 100644
--- a/tools/perf/util/bpf_skel/lock_contention.bpf.c
+++ b/tools/perf/util/bpf_skel/lock_contention.bpf.c
@@ -117,21 +117,22 @@ struct mm_struct___new {
 } __attribute__((preserve_access_index));
 
 /* control flags */
-int enabled;
-int has_cpu;
-int has_task;
-int has_type;
-int has_addr;
-int has_cgroup;
-int needs_callstack;
-int stack_skip;
-int lock_owner;
-
-int use_cgroup_v2;
-int perf_subsys_id = -1;
+const volatile int has_cpu;
+const volatile int has_task;
+const volatile int has_type;
+const volatile int has_addr;
+const volatile int has_cgroup;
+const volatile int needs_callstack;
+const volatile int stack_skip;
+const volatile int lock_owner;
+const volatile int use_cgroup_v2;
 
 /* determine the key of lock stat */
-int aggr_mode;
+const volatile int aggr_mode;
+
+int enabled;
+
+int perf_subsys_id = -1;
 
 __u64 end_ts;
 
@@ -323,8 +324,7 @@ static inline struct tstamp_data *get_tstamp_elem(__u32 flags)
 	struct tstamp_data *pelem;
 
 	/* Use per-cpu array map for spinlock and rwlock */
-	if (flags == (LCB_F_SPIN | LCB_F_READ) || flags == LCB_F_SPIN ||
-	    flags == (LCB_F_SPIN | LCB_F_WRITE)) {
+	if ((flags & (LCB_F_SPIN | LCB_F_MUTEX)) == LCB_F_SPIN) {
 		__u32 idx = 0;
 
 		pelem = bpf_map_lookup_elem(&tstamp_cpu, &idx);
@@ -439,11 +439,8 @@ int contention_end(u64 *ctx)
 
 	duration = bpf_ktime_get_ns() - pelem->timestamp;
 	if ((__s64)duration < 0) {
-		pelem->lock = 0;
-		if (need_delete)
-			bpf_map_delete_elem(&tstamp, &pid);
 		__sync_fetch_and_add(&time_fail, 1);
-		return 0;
+		goto out;
 	}
 
 	switch (aggr_mode) {
@@ -477,11 +474,8 @@ int contention_end(u64 *ctx)
 	data = bpf_map_lookup_elem(&lock_stat, &key);
 	if (!data) {
 		if (data_map_full) {
-			pelem->lock = 0;
-			if (need_delete)
-				bpf_map_delete_elem(&tstamp, &pid);
 			__sync_fetch_and_add(&data_fail, 1);
-			return 0;
+			goto out;
 		}
 
 		struct contention_data first = {
@@ -498,16 +492,20 @@ int contention_end(u64 *ctx)
 
 		err = bpf_map_update_elem(&lock_stat, &key, &first, BPF_NOEXIST);
 		if (err < 0) {
+			if (err == -EEXIST) {
+				/* it lost the race, try to get it again */
+				data = bpf_map_lookup_elem(&lock_stat, &key);
+				if (data != NULL)
+					goto found;
+			}
 			if (err == -E2BIG)
 				data_map_full = 1;
 			__sync_fetch_and_add(&data_fail, 1);
 		}
-		pelem->lock = 0;
-		if (need_delete)
-			bpf_map_delete_elem(&tstamp, &pid);
-		return 0;
+		goto out;
 	}
 
+found:
 	__sync_fetch_and_add(&data->total_time, duration);
 	__sync_fetch_and_add(&data->count, 1);
 
@@ -517,6 +515,7 @@ int contention_end(u64 *ctx)
 	if (data->min_time > duration)
 		data->min_time = duration;
 
+out:
 	pelem->lock = 0;
 	if (need_delete)
 		bpf_map_delete_elem(&tstamp, &pid);
diff --git a/tools/perf/util/bpf_skel/lock_data.h b/tools/perf/util/bpf_skel/lock_data.h
index 36af11faad03..de12892f992f 100644
--- a/tools/perf/util/bpf_skel/lock_data.h
+++ b/tools/perf/util/bpf_skel/lock_data.h
@@ -7,11 +7,11 @@ struct tstamp_data {
 	u64 timestamp;
 	u64 lock;
 	u32 flags;
-	u32 stack_id;
+	s32 stack_id;
 };
 
 struct contention_key {
-	u32 stack_id;
+	s32 stack_id;
 	u32 pid;
 	u64 lock_addr_or_cgroup;
 };
diff --git a/tools/perf/util/bpf_skel/off_cpu.bpf.c b/tools/perf/util/bpf_skel/off_cpu.bpf.c
index d877a0a9731f..c152116df72f 100644
--- a/tools/perf/util/bpf_skel/off_cpu.bpf.c
+++ b/tools/perf/util/bpf_skel/off_cpu.bpf.c
@@ -85,10 +85,11 @@ struct task_struct___old {
 } __attribute__((preserve_access_index));
 
 int enabled = 0;
-int has_cpu = 0;
-int has_task = 0;
-int has_cgroup = 0;
-int uses_tgid = 0;
+
+const volatile int has_cpu = 0;
+const volatile int has_task = 0;
+const volatile int has_cgroup = 0;
+const volatile int uses_tgid = 0;
 
 const volatile bool has_prev_state = false;
 const volatile bool needs_cgroup = false;
diff --git a/tools/perf/util/bpf_skel/sample-filter.h b/tools/perf/util/bpf_skel/sample-filter.h
index 350efa121026..683fec85e71e 100644
--- a/tools/perf/util/bpf_skel/sample-filter.h
+++ b/tools/perf/util/bpf_skel/sample-filter.h
@@ -1,7 +1,9 @@
 #ifndef PERF_UTIL_BPF_SKEL_SAMPLE_FILTER_H
 #define PERF_UTIL_BPF_SKEL_SAMPLE_FILTER_H
 
-#define MAX_FILTERS  64
+#define MAX_FILTERS   64
+#define MAX_IDX_HASH  (16 * 1024)
+#define MAX_EVT_HASH  (1024 * 1024)
 
 /* supported filter operations */
 enum perf_bpf_filter_op {
@@ -14,6 +16,7 @@ enum perf_bpf_filter_op {
 	PBF_OP_AND,
 	PBF_OP_GROUP_BEGIN,
 	PBF_OP_GROUP_END,
+	PBF_OP_DONE,
 };
 
 enum perf_bpf_filter_term {
@@ -42,7 +45,7 @@ enum perf_bpf_filter_term {
 	__PBF_UNUSED_TERM18	= PBF_TERM_SAMPLE_START + 18, /* SAMPLE_REGS_INTR = 1U << 18 */
 	PBF_TERM_PHYS_ADDR	= PBF_TERM_SAMPLE_START + 19, /* SAMPLE_PHYS_ADDR = 1U << 19 */
 	__PBF_UNUSED_TERM20	= PBF_TERM_SAMPLE_START + 20, /* SAMPLE_AUX = 1U << 20 */
-	__PBF_UNUSED_TERM21	= PBF_TERM_SAMPLE_START + 21, /* SAMPLE_CGROUP = 1U << 21 */
+	PBF_TERM_CGROUP		= PBF_TERM_SAMPLE_START + 21, /* SAMPLE_CGROUP = 1U << 21 */
 	PBF_TERM_DATA_PAGE_SIZE	= PBF_TERM_SAMPLE_START + 22, /* SAMPLE_DATA_PAGE_SIZE = 1U << 22 */
 	PBF_TERM_CODE_PAGE_SIZE	= PBF_TERM_SAMPLE_START + 23, /* SAMPLE_CODE_PAGE_SIZE = 1U << 23 */
 	PBF_TERM_WEIGHT_STRUCT	= PBF_TERM_SAMPLE_START + 24, /* SAMPLE_WEIGHT_STRUCT = 1U << 24 */
@@ -60,4 +63,10 @@ struct perf_bpf_filter_entry {
 	__u64 value;
 };
 
+struct idx_hash_key { /* key type of the idx_hash map in sample_filter.bpf.c */
+	__u64 evt_id; /* representative event id, taken from the event_hash map value */
+	__u32 tgid; /* upper 32 bits of bpf_get_current_pid_tgid() */
+	__u32 reserved; /* left zero by the BPF side; presumably explicit padding - confirm against the user-space writer */
+};
+
 #endif /* PERF_UTIL_BPF_SKEL_SAMPLE_FILTER_H */
diff --git a/tools/perf/util/bpf_skel/sample_filter.bpf.c b/tools/perf/util/bpf_skel/sample_filter.bpf.c
index f59985101973..b195e6efeb8b 100644
--- a/tools/perf/util/bpf_skel/sample_filter.bpf.c
+++ b/tools/perf/util/bpf_skel/sample_filter.bpf.c
@@ -9,13 +9,41 @@
 
 /* BPF map that will be filled by user space */
 struct filters {
-	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__uint(type, BPF_MAP_TYPE_HASH);
 	__type(key, int);
-	__type(value, struct perf_bpf_filter_entry);
-	__uint(max_entries, MAX_FILTERS);
+	__type(value, struct perf_bpf_filter_entry[MAX_FILTERS]);
+	__uint(max_entries, 1);
 } filters SEC(".maps");
 
-int dropped;
+/*
+ * An evsel has multiple instances for each CPU or task but we need a single
+ * id to be used as a key for the idx_hash.  This hashmap would translate the
+ * instance's ID to a representative ID.
+ */
+struct event_hash {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__type(key, __u64); /* id of the sampled event, or of its parent when it has one */
+	__type(value, __u64); /* id copied into idx_hash_key.evt_id */
+	__uint(max_entries, 1); /* NOTE(review): presumably resized by user space before load - confirm */
+} event_hash SEC(".maps");
+
+/* tgid/evtid to filter index */
+struct idx_hash { /* consulted only when use_idx_hash is set */
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__type(key, struct idx_hash_key);
+	__type(value, int); /* index used as the key for both the filters and dropped lookups */
+	__uint(max_entries, 1); /* NOTE(review): presumably resized by user space before load - confirm */
+} idx_hash SEC(".maps");
+
+/* filter index to number of dropped samples */
+struct lost_count {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__type(key, int); /* same index that perf_sample_filter() uses for the filters lookup */
+	__type(value, int); /* bumped with __sync_fetch_and_add() on the drop path */
+	__uint(max_entries, 1); /* NOTE(review): presumably resized by user space before load - confirm */
+} dropped SEC(".maps");
+
+volatile const int use_idx_hash;
 
 void *bpf_cast_to_kern_ctx(void *) __ksym;
 
@@ -65,6 +93,7 @@ static inline __u64 perf_get_sample(struct bpf_perf_event_data_kern *kctx,
 	BUILD_CHECK_SAMPLE(DATA_SRC);
 	BUILD_CHECK_SAMPLE(TRANSACTION);
 	BUILD_CHECK_SAMPLE(PHYS_ADDR);
+	BUILD_CHECK_SAMPLE(CGROUP);
 	BUILD_CHECK_SAMPLE(DATA_PAGE_SIZE);
 	BUILD_CHECK_SAMPLE(CODE_PAGE_SIZE);
 	BUILD_CHECK_SAMPLE(WEIGHT_STRUCT);
@@ -107,6 +136,8 @@ static inline __u64 perf_get_sample(struct bpf_perf_event_data_kern *kctx,
 		return kctx->data->weight.full;
 	case PBF_TERM_PHYS_ADDR:
 		return kctx->data->phys_addr;
+	case PBF_TERM_CGROUP:
+		return kctx->data->cgroup;
 	case PBF_TERM_CODE_PAGE_SIZE:
 		return kctx->data->code_page_size;
 	case PBF_TERM_DATA_PAGE_SIZE:
@@ -155,7 +186,6 @@ static inline __u64 perf_get_sample(struct bpf_perf_event_data_kern *kctx,
 	case __PBF_UNUSED_TERM16:
 	case __PBF_UNUSED_TERM18:
 	case __PBF_UNUSED_TERM20:
-	case __PBF_UNUSED_TERM21:
 	default:
 		break;
 	}
@@ -179,39 +209,66 @@ int perf_sample_filter(void *ctx)
 	__u64 sample_data;
 	int in_group = 0;
 	int group_result = 0;
-	int i;
+	int i, k;
+	int *losts;
 
 	kctx = bpf_cast_to_kern_ctx(ctx);
 
-	for (i = 0; i < MAX_FILTERS; i++) {
-		int key = i; /* needed for verifier :( */
+	k = 0;
 
-		entry = bpf_map_lookup_elem(&filters, &key);
-		if (entry == NULL)
-			break;
-		sample_data = perf_get_sample(kctx, entry);
+	if (use_idx_hash) {
+		struct idx_hash_key key = {
+			.tgid = bpf_get_current_pid_tgid() >> 32,
+		};
+		__u64 eid = kctx->event->id;
+		__u64 *key_id;
+		int *idx;
+
+		/* get primary_event_id */
+		if (kctx->event->parent)
+			eid = kctx->event->parent->id;
 
-		switch (entry->op) {
+		key_id = bpf_map_lookup_elem(&event_hash, &eid);
+		if (key_id == NULL)
+			goto drop;
+
+		key.evt_id = *key_id;
+
+		idx = bpf_map_lookup_elem(&idx_hash, &key);
+		if (idx)
+			k = *idx;
+		else
+			goto drop;
+	}
+
+	entry = bpf_map_lookup_elem(&filters, &k);
+	if (entry == NULL)
+		goto drop;
+
+	for (i = 0; i < MAX_FILTERS; i++) {
+		sample_data = perf_get_sample(kctx, &entry[i]);
+
+		switch (entry[i].op) {
 		case PBF_OP_EQ:
-			CHECK_RESULT(sample_data, ==, entry->value)
+			CHECK_RESULT(sample_data, ==, entry[i].value)
 			break;
 		case PBF_OP_NEQ:
-			CHECK_RESULT(sample_data, !=, entry->value)
+			CHECK_RESULT(sample_data, !=, entry[i].value)
 			break;
 		case PBF_OP_GT:
-			CHECK_RESULT(sample_data, >, entry->value)
+			CHECK_RESULT(sample_data, >, entry[i].value)
 			break;
 		case PBF_OP_GE:
-			CHECK_RESULT(sample_data, >=, entry->value)
+			CHECK_RESULT(sample_data, >=, entry[i].value)
 			break;
 		case PBF_OP_LT:
-			CHECK_RESULT(sample_data, <, entry->value)
+			CHECK_RESULT(sample_data, <, entry[i].value)
 			break;
 		case PBF_OP_LE:
-			CHECK_RESULT(sample_data, <=, entry->value)
+			CHECK_RESULT(sample_data, <=, entry[i].value)
 			break;
 		case PBF_OP_AND:
-			CHECK_RESULT(sample_data, &, entry->value)
+			CHECK_RESULT(sample_data, &, entry[i].value)
 			break;
 		case PBF_OP_GROUP_BEGIN:
 			in_group = 1;
@@ -222,13 +279,19 @@ int perf_sample_filter(void *ctx)
 				goto drop;
 			in_group = 0;
 			break;
+		case PBF_OP_DONE:
+			/* no failures so far, accept it */
+			return 1;
 		}
 	}
 	/* generate sample data */
 	return 1;
 
 drop:
-	__sync_fetch_and_add(&dropped, 1);
+	losts = bpf_map_lookup_elem(&dropped, &k);
+	if (losts != NULL)
+		__sync_fetch_and_add(losts, 1);
+
 	return 0;
 }
 
diff --git a/tools/perf/util/bpf_skel/vmlinux/vmlinux.h b/tools/perf/util/bpf_skel/vmlinux/vmlinux.h
index e9028235d771..4dcad7b682bd 100644
--- a/tools/perf/util/bpf_skel/vmlinux/vmlinux.h
+++ b/tools/perf/util/bpf_skel/vmlinux/vmlinux.h
@@ -15,6 +15,7 @@
 
 typedef __u8 u8;
 typedef __u32 u32;
+typedef __s32 s32;
 typedef __u64 u64;
 typedef __s64 s64;
 
@@ -170,10 +171,16 @@ struct perf_sample_data {
 		u32		 cpu;
 	} cpu_entry;
 	u64			 phys_addr;
+	u64			 cgroup;
 	u64			 data_page_size;
 	u64			 code_page_size;
 } __attribute__((__aligned__(64))) __attribute__((preserve_access_index));
 
+struct perf_event { /* declares the two fields perf_sample_filter() reads */
+	struct perf_event	*parent; /* when non-NULL, the filter uses parent->id instead of id */
+	u64			id;
+} __attribute__((preserve_access_index));
+
 struct bpf_perf_event_data_kern {
 	struct perf_sample_data *data;
 	struct perf_event	*event;