1 files changed, 72 insertions, 0 deletions
diff --git a/tools/perf/examples/bpf/augmented_raw_syscalls.c b/tools/perf/examples/bpf/augmented_raw_syscalls.c
index cde91c34b101..90a19336310b 100644
--- a/tools/perf/examples/bpf/augmented_raw_syscalls.c
+++ b/tools/perf/examples/bpf/augmented_raw_syscalls.c
@@ -37,15 +37,87 @@ struct syscall_exit_args {
 	long		   ret;
 };
 
+struct augmented_filename {	/* string payload placed right after the raw syscall args in sys_enter() */
+	unsigned int	size;		/* receives the return value of probe_read_str() */
+	int		reserved;	/* sys_enter() always writes 0 here */
+	char		value[256];	/* destination buffer for the copied filename string */
+};
+
+#define SYS_OPEN 2	/* syscall_nr whose filename pointer is read from args[0]; NOTE(review): 2/257 look like the x86_64 numbers - confirm for other arches */
+#define SYS_OPENAT 257	/* syscall_nr whose filename pointer is read from args[1] */
+
 SEC("raw_syscalls:sys_enter")
 int sys_enter(struct syscall_enter_args *args)
 {
 	struct {
 		struct syscall_enter_args args;
+		struct augmented_filename filename;	/* only filled in for SYS_OPEN/SYS_OPENAT */
 	} augmented_args;
 	unsigned int len = sizeof(augmented_args);
+	const void *filename_arg = NULL;	/* assigned only in the SYS_OPEN/SYS_OPENAT cases below */
 
 	probe_read(&augmented_args.args, sizeof(augmented_args.args), args);
+	/*
+	 * Yonghong and Edward Cree sayz:
+	 *
+	 * https://www.spinics.net/lists/netdev/msg531645.html
+	 *
+	 * >>   R0=inv(id=0) R1=inv2 R6=ctx(id=0,off=0,imm=0) R7=inv64 R10=fp0,call_-1
+	 * >> 10: (bf) r1 = r6
+	 * >> 11: (07) r1 += 16
+	 * >> 12: (05) goto pc+2
+	 * >> 15: (79) r3 = *(u64 *)(r1 +0)
+	 * >> dereference of modified ctx ptr R1 off=16 disallowed
+	 * > Aha, we at least got a different error message this time.
+	 * > And indeed llvm has done that optimisation, rather than the more obvious
+	 * > 11: r3 = *(u64 *)(r1 +16)
+	 * > because it wants to have lots of reads share a single insn.  You may be able
+	 * > to defeat that optimisation by adding compiler barriers, idk.  Maybe someone
+	 * > with llvm knowledge can figure out how to stop it (ideally, llvm would know
+	 * > when it's generating for bpf backend and not do that).  -O0?  ¯\_(ツ)_/¯
+	 *
+	 * The optimization mostly looks like the following:
+	 *
+	 *	br1:
+	 * 	...
+	 *	r1 += 16
+	 *	goto merge
+	 *	br2:
+	 *	...
+	 *	r1 += 20
+	 *	goto merge
+	 *	merge:
+	 *	*(u64 *)(r1 + 0)
+	 *
+	 * The compiler tries to merge common loads. There is no easy way to
+	 * stop this compiler optimization without turning off a lot of other
+	 * optimizations. The easiest way is to add a barrier:
+	 *
+	 * 	 __asm__ __volatile__("": : :"memory")
+	 *
+	 * after each ctx memory access, to prevent the loads from being merged downstream.
+	 */
+	switch (augmented_args.args.syscall_nr) {
+	case SYS_OPEN:	 filename_arg = (const void *)args->args[0];	/* filename pointer is the 1st argument */
+			__asm__ __volatile__("": : :"memory");	/* the barrier described in the comment above */
+			 break;
+	case SYS_OPENAT: filename_arg = (const void *)args->args[1];	/* filename pointer is the 2nd argument */
+			 break;	/* NOTE(review): no barrier after this ctx load, unlike SYS_OPEN - confirm intended */
+	}
+
+	if (filename_arg != NULL) {
+		augmented_args.filename.reserved = 0;	/* presumably so no uninitialized stack bytes are emitted - confirm */
+		augmented_args.filename.size = probe_read_str(&augmented_args.filename.value,
+							      sizeof(augmented_args.filename.value),
+							      filename_arg);	/* NOTE(review): a negative return stored in unsigned 'size' would fail the '<' test below - confirm against helper docs */
+		if (augmented_args.filename.size < sizeof(augmented_args.filename.value)) {	/* string did not fill value[] */
+			len -= sizeof(augmented_args.filename.value) - augmented_args.filename.size;	/* drop the unused tail of value[] from the emitted length */
+			len &= sizeof(augmented_args.filename.value) - 1;	/* NOTE(review): presumably bounds len for the verifier, but len also counts args and size/reserved, so values >= 256 would wrap - confirm */
+		}
+	} else {
+		len = sizeof(augmented_args.args);	/* no filename captured: emit only the raw syscall args */
+	}
+
 	perf_event_output(args, &__augmented_syscalls__, BPF_F_CURRENT_CPU, &augmented_args, len);
 	return 0;
 }