From 49b23575943c04b6711107cfd08ad2b3ae4e81f5 Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Wed, 3 Jun 2020 21:03:47 +0200 Subject: bpf: Fix unused-var without NETDEVICES A recent commit added new variables that are only used if CONFIG_NETDEVICES is set. A simple fix would be to only declare these variables if the same condition is valid, but Alexei suggested an even simpler solution: since CONFIG_NETDEVICES doesn't change anything in .h I think the best is to remove #ifdef CONFIG_NETDEVICES from net/core/filter.c and rely on sock_bindtoindex() returning ENOPROTOOPT in the extreme case of oddly configured kernels. Fixes: 70c58997c1e8 ("bpf: Allow SO_BINDTODEVICE opt in bpf_setsockopt") Suggested-by: Alexei Starovoitov Signed-off-by: Matthieu Baerts Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200603190347.2310320-1-matthieu.baerts@tessares.net --- net/core/filter.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index d01a244b5087..90d2eb77002f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4340,8 +4340,6 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, } break; case SO_BINDTODEVICE: - ret = -ENOPROTOOPT; -#ifdef CONFIG_NETDEVICES optlen = min_t(long, optlen, IFNAMSIZ - 1); strncpy(devname, optval, optlen); devname[optlen] = 0; @@ -4360,7 +4358,6 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, dev_put(dev); } ret = sock_bindtoindex(sk, ifindex, false); -#endif break; default: ret = -EINVAL; -- cgit v1.2.3 From e7ed83d6fa1a00d0f2ad0327e73d3ea9e7ea8de1 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 4 Jun 2020 11:54:36 +0300 Subject: bpf: Fix an error code in check_btf_func() This code returns success if the "info_aux" allocation fails, but it should return -ENOMEM. Fixes: 8c1b6e69dcc1 ("bpf: Compare BTF types of functions arguments with actual types") Signed-off-by: Dan Carpenter Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200604085436.GA943001@mwanda --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5c7bbaac81ef..34cde841ab68 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7552,7 +7552,7 @@ static int check_btf_func(struct bpf_verifier_env *env, const struct btf *btf; void __user *urecord; u32 prev_offset = 0; - int ret = 0; + int ret = -ENOMEM; nfuncs = attr->func_info_cnt; if (!nfuncs) -- cgit v1.2.3 From 1f2436229bf64ac040f2f5018df059c21fc5526a Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 7 Jun 2020 17:36:15 -0700 Subject: selftests/bpf: Fix ringbuf selftest sample counting nondeterminism Fix a test race in which the background poll can get either 5 or 6 samples, depending on the timing of the notification. Prevent this by open-coding sample triggering and forcing a notification for the very last sample only. Also switch to using atomic increments and exchanges for more obviously reliable counting and checking. Additionally, check expected processed sample counters for single-threaded use cases as well.
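For reference, the GCC atomic builtins the test now relies on can be exercised with a minimal stand-alone sketch (user-space illustration only, not part of the patch; helper names mirror the test's):

    #include <stdio.h>

    static int sample_cnt;

    /* __ATOMIC_SEQ_CST gives a single total order, so increments from
     * the background poll thread and the exchange-to-zero from the
     * main thread can never lose an update.
     */
    static void atomic_inc(int *cnt)
    {
        __atomic_add_fetch(cnt, 1, __ATOMIC_SEQ_CST);
    }

    static int atomic_xchg(int *cnt, int val)
    {
        /* stores val, returns the previous value */
        return __atomic_exchange_n(cnt, val, __ATOMIC_SEQ_CST);
    }

    int main(void)
    {
        atomic_inc(&sample_cnt);
        atomic_inc(&sample_cnt);
        printf("%d\n", atomic_xchg(&sample_cnt, 0)); /* prints 2 */
        printf("%d\n", sample_cnt);                  /* prints 0 */
        return 0;
    }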
Fixes: 9a5f25ad30e5 ("selftests/bpf: Fix sample_cnt shared between two threads") Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200608003615.3549991-1-andriin@fb.com --- tools/testing/selftests/bpf/prog_tests/ringbuf.c | 42 ++++++++++++++++++++---- 1 file changed, 35 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/ringbuf.c b/tools/testing/selftests/bpf/prog_tests/ringbuf.c index 2bba908dfa63..c1650548433c 100644 --- a/tools/testing/selftests/bpf/prog_tests/ringbuf.c +++ b/tools/testing/selftests/bpf/prog_tests/ringbuf.c @@ -25,13 +25,23 @@ struct sample { char comm[16]; }; -static volatile int sample_cnt; +static int sample_cnt; + +static void atomic_inc(int *cnt) +{ + __atomic_add_fetch(cnt, 1, __ATOMIC_SEQ_CST); +} + +static int atomic_xchg(int *cnt, int val) +{ + return __atomic_exchange_n(cnt, val, __ATOMIC_SEQ_CST); +} static int process_sample(void *ctx, void *data, size_t len) { struct sample *s = data; - sample_cnt++; + atomic_inc(&sample_cnt); switch (s->seq) { case 0: @@ -76,7 +86,7 @@ void test_ringbuf(void) const size_t rec_sz = BPF_RINGBUF_HDR_SZ + sizeof(struct sample); pthread_t thread; long bg_ret = -1; - int err; + int err, cnt; skel = test_ringbuf__open_and_load(); if (CHECK(!skel, "skel_open_load", "skeleton open&load failed\n")) @@ -116,11 +126,15 @@ void test_ringbuf(void) /* -EDONE is used as an indicator that we are done */ if (CHECK(err != -EDONE, "err_done", "done err: %d\n", err)) goto cleanup; + cnt = atomic_xchg(&sample_cnt, 0); + CHECK(cnt != 2, "cnt", "exp %d samples, got %d\n", 2, cnt); /* we expect extra polling to return nothing */ err = ring_buffer__poll(ringbuf, 0); if (CHECK(err != 0, "extra_samples", "poll result: %d\n", err)) goto cleanup; + cnt = atomic_xchg(&sample_cnt, 0); + CHECK(cnt != 0, "cnt", "exp %d samples, got %d\n", 0, cnt); CHECK(skel->bss->dropped != 0, "err_dropped", "exp %ld, got %ld\n", 0L, skel->bss->dropped); @@ -136,6 +150,8 @@ void test_ringbuf(void) 3L * rec_sz, skel->bss->cons_pos); err = ring_buffer__poll(ringbuf, -1); CHECK(err <= 0, "poll_err", "err %d\n", err); + cnt = atomic_xchg(&sample_cnt, 0); + CHECK(cnt != 2, "cnt", "exp %d samples, got %d\n", 2, cnt); /* start poll in background w/ long timeout */ err = pthread_create(&thread, NULL, poll_thread, (void *)(long)10000); @@ -164,6 +180,8 @@ void test_ringbuf(void) 2L, skel->bss->total); CHECK(skel->bss->discarded != 1, "err_discarded", "exp %ld, got %ld\n", 1L, skel->bss->discarded); + cnt = atomic_xchg(&sample_cnt, 0); + CHECK(cnt != 0, "cnt", "exp %d samples, got %d\n", 0, cnt); /* clear flags to return to "adaptive" notification mode */ skel->bss->flags = 0; @@ -178,10 +196,20 @@ void test_ringbuf(void) if (CHECK(err != EBUSY, "try_join", "err %d\n", err)) goto cleanup; + /* still no samples, because consumer is behind */ + cnt = atomic_xchg(&sample_cnt, 0); + CHECK(cnt != 0, "cnt", "exp %d samples, got %d\n", 0, cnt); + + skel->bss->dropped = 0; + skel->bss->total = 0; + skel->bss->discarded = 0; + + skel->bss->value = 333; + syscall(__NR_getpgid); /* now force notifications */ skel->bss->flags = BPF_RB_FORCE_WAKEUP; - sample_cnt = 0; - trigger_samples(); + skel->bss->value = 777; + syscall(__NR_getpgid); /* now we should get a pending notification */ usleep(50000); @@ -193,8 +221,8 @@ void test_ringbuf(void) goto cleanup; /* 3 rounds, 2 samples each */ - CHECK(sample_cnt != 6, "wrong_sample_cnt", - "expected to see %d samples, got %d\n", 6, sample_cnt); + cnt = 
atomic_xchg(&sample_cnt, 0); + CHECK(cnt != 6, "cnt", "exp %d samples, got %d\n", 6, cnt); /* BPF side did everything right */ CHECK(skel->bss->dropped != 0, "err_dropped", "exp %ld, got %ld\n", -- cgit v1.2.3 From 487082fb7bd2a32b66927d2b22e3a81b072b44f0 Mon Sep 17 00:00:00 2001 From: dihu Date: Fri, 5 Jun 2020 16:46:25 +0800 Subject: bpf/sockmap: Fix kernel panic at __tcp_bpf_recvmsg When a user application calls read() with the MSG_PEEK flag to read data from a bpf sockmap socket, a kernel panic happens at __tcp_bpf_recvmsg+0x12c/0x350. With MSG_PEEK set, the sk_msg is not removed from the ingress_msg queue after its data is read out. Because the code does not check whether the sk_msg is the last msg in the ingress_msg queue, the next sk_msg may be the head of the ingress_msg queue itself, whose sg page memory address is invalid. So it's necessary to add a check to prevent this problem. [20759.125457] BUG: kernel NULL pointer dereference, address: 0000000000000008 [20759.132118] CPU: 53 PID: 51378 Comm: envoy Tainted: G E 5.4.32 #1 [20759.140890] Hardware name: Inspur SA5212M4/YZMB-00370-109, BIOS 4.1.12 06/18/2017 [20759.149734] RIP: 0010:copy_page_to_iter+0xad/0x300 [20759.270877] __tcp_bpf_recvmsg+0x12c/0x350 [20759.276099] tcp_bpf_recvmsg+0x113/0x370 [20759.281137] inet_recvmsg+0x55/0xc0 [20759.285734] __sys_recvfrom+0xc8/0x130 [20759.290566] ? __audit_syscall_entry+0x103/0x130 [20759.296227] ? syscall_trace_enter+0x1d2/0x2d0 [20759.301700] ? __audit_syscall_exit+0x1e4/0x290 [20759.307235] __x64_sys_recvfrom+0x24/0x30 [20759.312226] do_syscall_64+0x55/0x1b0 [20759.316852] entry_SYSCALL_64_after_hwframe+0x44/0xa9 Signed-off-by: dihu Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20200605084625.9783-1-anny.hu@linux.alibaba.com --- net/ipv4/tcp_bpf.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 629aaa9a1eb9..2b915aafda42 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -64,6 +64,9 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, } while (i != msg_rx->sg.end); if (unlikely(peek)) { + if (msg_rx == list_last_entry(&psock->ingress_msg, + struct sk_msg, list)) + break; msg_rx = list_next_entry(msg_rx, list); continue; } -- cgit v1.2.3 From 33a7c831565c43a7ee2f38c7df4c4a40e1dfdfed Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Sun, 7 Jun 2020 22:52:28 +0200 Subject: bpf, sockhash: Fix memory leak when unlinking sockets in sock_hash_free When a sockhash gets destroyed while sockets are still linked to it, we will walk the bucket lists and delete the links. However, we are not freeing the list elements after processing them, leaking the memory. The leak can be triggered by close()'ing a sockhash map when it still contains sockets, and observed with kmemleak: unreferenced object 0xffff888116e86f00 (size 64): comm "race_sock_unlin", pid 223, jiffies 4294731063 (age 217.404s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 81 de e8 41 00 00 00 00 c0 69 2f 15 81 88 ff ff ...A.....i/..... backtrace: [<00000000dd089ebb>] sock_hash_update_common+0x4ca/0x760 [<00000000b8219bd5>] sock_hash_update_elem+0x1d2/0x200 [<000000005e2c23de>] __do_sys_bpf+0x2046/0x2990 [<00000000d0084618>] do_syscall_64+0xad/0x9a0 [<000000000d96f263>] entry_SYSCALL_64_after_hwframe+0x49/0xb3 Fix it by freeing the list element when we're done with it.
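The leak pattern is easy to reproduce in a minimal user-space analogue (assumed types, not the kernel code): walking a list and unlinking nodes without releasing them leaks every element, which is what the missing sock_hash_free_elem() call amounted to.

    #include <stdlib.h>

    struct elem {
        struct elem *next;
    };

    static void free_all(struct elem **head)
    {
        struct elem *e = *head, *next;

        while (e) {
            next = e->next;
            free(e); /* without this, every element leaks */
            e = next;
        }
        *head = NULL;
    }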
Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200607205229.2389672-2-jakub@cloudflare.com --- net/core/sock_map.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 00a26cf2cfe9..ea46f07a22d8 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -1031,6 +1031,7 @@ static void sock_hash_free(struct bpf_map *map) sock_map_unref(elem->sk, elem); rcu_read_unlock(); release_sock(elem->sk); + sock_hash_free_elem(htab, elem); } } -- cgit v1.2.3 From 75e68e5bf2c7fa9d3e874099139df03d5952a3e1 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Sun, 7 Jun 2020 22:52:29 +0200 Subject: bpf, sockhash: Synchronize delete from bucket list on map free We can end up modifying the sockhash bucket list from two CPUs when a sockhash is being destroyed (sock_hash_free) on one CPU, while a socket that is in the sockhash is unlinking itself from it on another CPU (sock_hash_delete_from_link). This results in accessing a list element that is in an undefined state as reported by KASAN: | ================================================================== | BUG: KASAN: wild-memory-access in sock_hash_free+0x13c/0x280 | Write of size 8 at addr dead000000000122 by task kworker/2:1/95 | | CPU: 2 PID: 95 Comm: kworker/2:1 Not tainted 5.7.0-rc7-02961-ge22c35ab0038-dirty #691 | Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190727_073836-buildvm-ppc64le-16.ppc.fedoraproject.org-3.fc31 04/01/2014 | Workqueue: events bpf_map_free_deferred | Call Trace: | dump_stack+0x97/0xe0 | ? sock_hash_free+0x13c/0x280 | __kasan_report.cold+0x5/0x40 | ? mark_lock+0xbc1/0xc00 | ? sock_hash_free+0x13c/0x280 | kasan_report+0x38/0x50 | ? sock_hash_free+0x152/0x280 | sock_hash_free+0x13c/0x280 | bpf_map_free_deferred+0xb2/0xd0 | ? bpf_map_charge_finish+0x50/0x50 | ? rcu_read_lock_sched_held+0x81/0xb0 | ? rcu_read_lock_bh_held+0x90/0x90 | process_one_work+0x59a/0xac0 | ? lock_release+0x3b0/0x3b0 | ? pwq_dec_nr_in_flight+0x110/0x110 | ? rwlock_bug.part.0+0x60/0x60 | worker_thread+0x7a/0x680 | ? _raw_spin_unlock_irqrestore+0x4c/0x60 | kthread+0x1cc/0x220 | ? process_one_work+0xac0/0xac0 | ? kthread_create_on_node+0xa0/0xa0 | ret_from_fork+0x24/0x30 | ================================================================== Fix it by reintroducing a spin-lock protected critical section around the code that removes the elements from the bucket on sockhash free. To do that we also need to defer processing of the removed elements until we are out of atomic context, so that we can unlink the socket from the map while holding the sock lock.
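The shape of the fix is the classic two-phase drain. A minimal user-space sketch of the pattern (assumed names; a pthread mutex stands in for the bucket spin-lock):

    #include <pthread.h>
    #include <stdlib.h>

    struct elem {
        struct elem *next;
    };

    struct bucket {
        pthread_mutex_t lock; /* stands in for bucket->lock */
        struct elem *head;
    };

    static void bucket_drain(struct bucket *b)
    {
        struct elem *unlink_list, *e, *next;

        /* Phase 1: steal the whole list under the lock, so a
         * concurrent deleter never observes a half-removed node.
         */
        pthread_mutex_lock(&b->lock);
        unlink_list = b->head;
        b->head = NULL;
        pthread_mutex_unlock(&b->lock);

        /* Phase 2: the lock is dropped, so per-element work that may
         * sleep (in the kernel: lock_sock() before sock_map_unref())
         * is now safe.
         */
        for (e = unlink_list; e; e = next) {
            next = e->next;
            free(e);
        }
    }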
Fixes: 90db6d772f74 ("bpf, sockmap: Remove bucket->lock from sock_{hash|map}_free") Reported-by: Eric Dumazet Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200607205229.2389672-3-jakub@cloudflare.com --- net/core/sock_map.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/net/core/sock_map.c b/net/core/sock_map.c index ea46f07a22d8..17a40a947546 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -1013,6 +1013,7 @@ static void sock_hash_free(struct bpf_map *map) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct bpf_htab_bucket *bucket; + struct hlist_head unlink_list; struct bpf_htab_elem *elem; struct hlist_node *node; int i; @@ -1024,13 +1025,31 @@ static void sock_hash_free(struct bpf_map *map) synchronize_rcu(); for (i = 0; i < htab->buckets_num; i++) { bucket = sock_hash_select_bucket(htab, i); - hlist_for_each_entry_safe(elem, node, &bucket->head, node) { - hlist_del_rcu(&elem->node); + + /* We are racing with sock_hash_delete_from_link to + * enter the spin-lock critical section. Every socket on + * the list is still linked to sockhash. Since link + * exists, psock exists and holds a ref to socket. That + * lets us to grab a socket ref too. + */ + raw_spin_lock_bh(&bucket->lock); + hlist_for_each_entry(elem, &bucket->head, node) + sock_hold(elem->sk); + hlist_move_list(&bucket->head, &unlink_list); + raw_spin_unlock_bh(&bucket->lock); + + /* Process removed entries out of atomic context to + * block for socket lock before deleting the psock's + * link to sockhash. + */ + hlist_for_each_entry_safe(elem, node, &unlink_list, node) { + hlist_del(&elem->node); lock_sock(elem->sk); rcu_read_lock(); sock_map_unref(elem->sk, elem); rcu_read_unlock(); release_sock(elem->sk); + sock_put(elem->sk); sock_hash_free_elem(htab, elem); } } -- cgit v1.2.3 From 21a85bd601ee50f2796d52c542c46d04e21cedac Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 8 Jun 2020 10:42:57 +0100 Subject: scripts: Require pahole v1.16 when generating BTF bpf_iter requires the kernel BTF to be generated with pahole >= 1.16, since otherwise the function definitions that the iterator attaches to are not included. This failure mode is indistinguishable from trying to attach to an iterator that really doesn't exist. Since it's really easy to miss this requirement, bump the pahole version check used at build time to at least 1.16.
Fixes: 15d83c4d7cef ("bpf: Allow loading of a bpf_iter program") Suggested-by: Ivan Babrou Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200608094257.47366-1-lmb@cloudflare.com --- scripts/link-vmlinux.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index 3adef49250af..a37875904ca6 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -143,8 +143,8 @@ gen_btf() fi pahole_ver=$(${PAHOLE} --version | sed -E 's/v([0-9]+)\.([0-9]+)/\1\2/') - if [ "${pahole_ver}" -lt "113" ]; then - echo >&2 "BTF: ${1}: pahole version $(${PAHOLE} --version) is too old, need at least v1.13" + if [ "${pahole_ver}" -lt "116" ]; then + echo >&2 "BTF: ${1}: pahole version $(${PAHOLE} --version) is too old, need at least v1.16" return 1 fi -- cgit v1.2.3 From 22d5bd6867364b41576a712755271a7d6161abd6 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Mon, 8 Jun 2020 14:45:32 +0200 Subject: tracing/probe: Fix bpf_task_fd_query() for kprobes and uprobes Commit 60d53e2c3b75 ("tracing/probe: Split trace_event related data from trace_probe") removed the trace_[ku]probe structure from the trace_event_call->data pointer. As bpf_get_[ku]probe_info() were forgotten in that change, fix them now. These functions are currently only used by the bpf_task_fd_query() syscall handler to collect information about a perf event. Fixes: 60d53e2c3b75 ("tracing/probe: Split trace_event related data from trace_probe") Signed-off-by: Jean-Philippe Brucker Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Acked-by: Masami Hiramatsu Link: https://lore.kernel.org/bpf/20200608124531.819838-1-jean-philippe@linaro.org --- kernel/trace/trace_kprobe.c | 2 +- kernel/trace/trace_uprobe.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 35989383ae11..8eeb95e04bf5 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1629,7 +1629,7 @@ int bpf_get_kprobe_info(const struct perf_event *event, u32 *fd_type, if (perf_type_tracepoint) tk = find_trace_kprobe(pevent, group); else - tk = event->tp_event->data; + tk = trace_kprobe_primary_from_call(event->tp_event); if (!tk) return -EINVAL; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 2a8e8e9c1c75..fdd47f99b18f 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1412,7 +1412,7 @@ int bpf_get_uprobe_info(const struct perf_event *event, u32 *fd_type, if (perf_type_tracepoint) tu = find_probe_event(pevent, group); else - tu = event->tp_event->data; + tu = trace_uprobe_primary_from_call(event->tp_event); if (!tu) return -EINVAL; -- cgit v1.2.3 From 26afa0a4eb3fd87757f9de56ec5db5a03b14e120 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 8 Jun 2020 09:17:23 -0600 Subject: bpf: Reset data_meta before running programs attached to devmap entry This is a new context that does not handle metadata at the moment, so mark data_meta invalid. 
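For context, a sketch of what marking the metadata invalid boils down to (simplified from include/net/xdp.h of this era; struct and function names abbreviated here): pointing data_meta one byte past data makes the metadata length (data - data_meta) negative, which consumers treat as "no metadata present".

    struct xdp_buff_sketch {
        void *data;
        void *data_meta;
    };

    static void set_data_meta_invalid(struct xdp_buff_sketch *xdp)
    {
        /* data_meta > data  =>  metadata area has negative size */
        xdp->data_meta = (unsigned char *)xdp->data + 1;
    }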
Fixes: fbee97feed9b ("bpf: Add support to attach bpf program to a devmap entry") Signed-off-by: David Ahern Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200608151723.9539-1-dsahern@kernel.org --- kernel/bpf/devmap.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 854b09beb16b..bfdff2faf5cb 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -479,6 +479,7 @@ static struct xdp_buff *dev_map_run_prog(struct net_device *dev, struct xdp_txq_info txq = { .dev = dev }; u32 act; + xdp_set_data_meta_invalid(xdp); xdp->txq = &txq; act = bpf_prog_run_xdp(xdp_prog, xdp); -- cgit v1.2.3 From 248e00ac47d64e153b9c50f45aad73cd61894a73 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 8 Jun 2020 17:22:01 +0100 Subject: bpf: cgroup: Allow multi-attach program to replace itself When using BPF_PROG_ATTACH to attach a program to a cgroup in BPF_F_ALLOW_MULTI mode, it is not possible to replace a program with itself. This is because the check for duplicate programs doesn't take the replacement program into account. Replacing a program with itself might seem weird, but it has some uses: first, it allows resetting the associated cgroup storage. Second, it makes the API consistent with the non-ALLOW_MULTI usage, where it is possible to replace a program with itself. Third, it aligns BPF_PROG_ATTACH with bpf_link, where replacing itself is also supported. Since this code has been refactored a few times, this change will only apply to v5.7 and later. Adjustments could be made to commit 1020c1f24a94 ("bpf: Simplify __cgroup_bpf_attach") and commit d7bf2c10af05 ("bpf: allocate cgroup storage entries on attaching bpf programs") as well as commit 324bda9e6c5a ("bpf: multi program support for cgroup+bpf"). Fixes: af6eea57437a ("bpf: Implement bpf_link-based cgroup BPF program attachment") Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200608162202.94002-1-lmb@cloudflare.com --- kernel/bpf/cgroup.c | 2 +- tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index fdf7836750a3..4d76f16524cc 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -378,7 +378,7 @@ static struct bpf_prog_list *find_attach_entry(struct list_head *progs, } list_for_each_entry(pl, progs, node) { - if (prog && pl->prog == prog) + if (prog && pl->prog == prog && prog != replace_prog) /* disallow attaching the same prog twice */ return ERR_PTR(-EINVAL); if (link && pl->link == link) diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c b/tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c index 139f8e82c7c6..b549fcfacc0b 100644 --- a/tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c @@ -230,6 +230,13 @@ void test_cgroup_attach_multi(void) "prog_replace", "errno=%d\n", errno)) goto err; + /* replace program with itself */ + attach_opts.replace_prog_fd = allow_prog[6]; + if (CHECK(bpf_prog_attach_xattr(allow_prog[6], cg1, + BPF_CGROUP_INET_EGRESS, &attach_opts), + "prog_replace", "errno=%d\n", errno)) + goto err; + value = 0; CHECK_FAIL(bpf_map_update_elem(map_fd, &key, &value, 0)); CHECK_FAIL(system(PING_CMD)); -- cgit v1.2.3 From 281920b7e0b31e0a7706433ff58e7d52ac97c327 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 9 Jun 2020 15:31:46 +0200 Subject: bpf: Devmap
adjust uapi for attach bpf program V2: - Defer changing BPF-syscall to start at file-descriptor 1 - Use {} to zero initialise struct. The recent commit fbee97feed9b ("bpf: Add support to attach bpf program to a devmap entry") introduced the ability to attach (and run) a separate XDP bpf_prog for each devmap entry. A bpf_prog is added via a file-descriptor. As zero is a valid FD, not using the feature has to be indicated with the value minus-1. The UAPI is extended via tail-extending struct bpf_devmap_val and using map->value_size to determine the feature set. This will break older userspace applications not using the bpf_prog feature. Consider an old userspace app that is compiled against a newer kernel uapi/bpf.h: it will not know that it needs to initialise the member bpf_prog.fd to minus-1. Thus, users would be forced to update their source code to get programs running on newer kernels. This patch removes the minus-1 checks and makes zero mean the feature isn't used. Follow-up patches, either for the kernel or libbpf, should handle this and avoid returning file-descriptor zero in the first place. Fixes: fbee97feed9b ("bpf: Add support to attach bpf program to a devmap entry") Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/159170950687.2102545.7235914718298050113.stgit@firesoul --- include/uapi/linux/bpf.h | 13 +++++++++++++ kernel/bpf/devmap.c | 17 ++++------------- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c65b374a5090..19684813faae 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3761,6 +3761,19 @@ struct xdp_md { __u32 egress_ifindex; /* txq->dev->ifindex */ }; +/* DEVMAP map-value layout + * + * The struct data-layout of map-value is a configuration interface. + * New members can only be added to the end of this structure.
+ */ +struct bpf_devmap_val { + __u32 ifindex; /* device index */ + union { + int fd; /* prog fd on map write */ + __u32 id; /* prog id on map read */ + } bpf_prog; +}; + enum sk_action { SK_DROP = 0, SK_PASS, diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index bfdff2faf5cb..0cbb72cdaf63 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -60,15 +60,6 @@ struct xdp_dev_bulk_queue { unsigned int count; }; -/* DEVMAP values */ -struct bpf_devmap_val { - u32 ifindex; /* device index */ - union { - int fd; /* prog fd on map write */ - u32 id; /* prog id on map read */ - } bpf_prog; -}; - struct bpf_dtab_netdev { struct net_device *dev; /* must be first member, due to tracepoint */ struct hlist_node index_hlist; @@ -619,7 +610,7 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, if (!dev->dev) goto err_out; - if (val->bpf_prog.fd >= 0) { + if (val->bpf_prog.fd > 0) { prog = bpf_prog_get_type_dev(val->bpf_prog.fd, BPF_PROG_TYPE_XDP, false); if (IS_ERR(prog)) @@ -653,8 +644,8 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - struct bpf_devmap_val val = { .bpf_prog.fd = -1 }; struct bpf_dtab_netdev *dev, *old_dev; + struct bpf_devmap_val val = {}; u32 i = *(u32 *)key; if (unlikely(map_flags > BPF_EXIST)) @@ -670,7 +661,7 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map, if (!val.ifindex) { dev = NULL; /* can not specify fd if ifindex is 0 */ - if (val.bpf_prog.fd != -1) + if (val.bpf_prog.fd > 0) return -EINVAL; } else { dev = __dev_map_alloc_node(net, dtab, &val, i); @@ -700,8 +691,8 @@ static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - struct bpf_devmap_val val = { .bpf_prog.fd = -1 }; struct bpf_dtab_netdev *dev, *old_dev; + struct bpf_devmap_val val = {}; u32 idx = *(u32 *)key; unsigned long flags; int err = -EEXIST; -- cgit v1.2.3 From 042b1545fe47788e734b0f074a8ae65856015cdf Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 9 Jun 2020 15:31:52 +0200 Subject: bpf: Selftests and tools use struct bpf_devmap_val from uapi Sync tools uapi bpf.h header file and update selftests that use struct bpf_devmap_val. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/159170951195.2102545.1833108712124273987.stgit@firesoul --- tools/include/uapi/linux/bpf.h | 13 +++++++++++++ tools/testing/selftests/bpf/prog_tests/xdp_devmap_attach.c | 8 -------- tools/testing/selftests/bpf/progs/test_xdp_devmap_helpers.c | 2 +- .../selftests/bpf/progs/test_xdp_with_devmap_helpers.c | 3 +-- 4 files changed, 15 insertions(+), 11 deletions(-) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index c65b374a5090..19684813faae 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3761,6 +3761,19 @@ struct xdp_md { __u32 egress_ifindex; /* txq->dev->ifindex */ }; +/* DEVMAP map-value layout + * + * The struct data-layout of map-value is a configuration interface. + * New members can only be added to the end of this structure. 
+ */ +struct bpf_devmap_val { + __u32 ifindex; /* device index */ + union { + int fd; /* prog fd on map write */ + __u32 id; /* prog id on map read */ + } bpf_prog; +}; + enum sk_action { SK_DROP = 0, SK_PASS, diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_devmap_attach.c b/tools/testing/selftests/bpf/prog_tests/xdp_devmap_attach.c index d19dbd668f6a..88ef3ec8ac4c 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_devmap_attach.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_devmap_attach.c @@ -8,14 +8,6 @@ #define IFINDEX_LO 1 -struct bpf_devmap_val { - u32 ifindex; /* device index */ - union { - int fd; /* prog fd on map write */ - u32 id; /* prog id on map read */ - } bpf_prog; -}; - void test_xdp_with_devmap_helpers(void) { struct test_xdp_with_devmap_helpers *skel; diff --git a/tools/testing/selftests/bpf/progs/test_xdp_devmap_helpers.c b/tools/testing/selftests/bpf/progs/test_xdp_devmap_helpers.c index e5c0f131c8a7..b360ba2bd441 100644 --- a/tools/testing/selftests/bpf/progs/test_xdp_devmap_helpers.c +++ b/tools/testing/selftests/bpf/progs/test_xdp_devmap_helpers.c @@ -2,7 +2,7 @@ /* fails to load without expected_attach_type = BPF_XDP_DEVMAP * because of access to egress_ifindex */ -#include "vmlinux.h" +#include <linux/bpf.h> #include <bpf/bpf_helpers.h> SEC("xdp_dm_log") diff --git a/tools/testing/selftests/bpf/progs/test_xdp_with_devmap_helpers.c b/tools/testing/selftests/bpf/progs/test_xdp_with_devmap_helpers.c index deef0e050863..330811260123 100644 --- a/tools/testing/selftests/bpf/progs/test_xdp_with_devmap_helpers.c +++ b/tools/testing/selftests/bpf/progs/test_xdp_with_devmap_helpers.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 - -#include "vmlinux.h" +#include <linux/bpf.h> #include <bpf/bpf_helpers.h> struct { -- cgit v1.2.3 From 8ca8d4a841730c02e77bf3c87bf658cc44f364b9 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Jun 2020 18:16:53 -0300 Subject: libbpf: Define __WORDSIZE if not available Some systems, such as Android, don't have a define for __WORDSIZE; define it in terms of __SIZEOF_LONG__, as done in perf since 2012: http://git.kernel.org/torvalds/c/3f34f6c0233ae055b5 For reference: https://gcc.gnu.org/onlinedocs/cpp/Common-Predefined-Macros.html I build tested it here and Andrii did some Travis CI build tests too. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200608161150.GA3073@kernel.org --- tools/lib/bpf/hashmap.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tools/lib/bpf/hashmap.h b/tools/lib/bpf/hashmap.h index e823b35e7371..df59fd4fc95b 100644 --- a/tools/lib/bpf/hashmap.h +++ b/tools/lib/bpf/hashmap.h @@ -10,10 +10,9 @@ #include <stdbool.h> #include <stddef.h> -#ifdef __GLIBC__ -#include <bits/wordsize.h> -#else -#include <bits/reg.h> +#include <limits.h> +#ifndef __WORDSIZE +#define __WORDSIZE (__SIZEOF_LONG__ * 8) #endif static inline size_t hash_bits(size_t h, int bits) -- cgit v1.2.3 From 32022fd97ed34f6812802bf1288db27c313576f4 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 9 Jun 2020 22:23:35 -0700 Subject: libbpf: Handle GCC noreturn-turned-volatile quirk Handle a GCC quirk of emitting an extra volatile modifier in DWARF (subsequently preserved in BTF by pahole) for function pointers marked as __attribute__((noreturn)). This was the way to mark such functions before GCC 2.5 added the noreturn attribute. Drop such func_proto modifiers, similarly to how it's done for arrays (also to handle a GCC quirk/bug). Such a volatile attribute is emitted by GCC only, so existing selftests can't express such a test.
Simple repro is like this (compiled with GCC + BTF generated by pahole): struct my_struct { void __attribute__((noreturn)) (*fn)(int); }; struct my_struct a; Without this fix, output will be: struct my_struct { voidvolatile (*fn)(int); }; With the fix: struct my_struct { void (*fn)(int); }; Fixes: 351131b51c7a ("libbpf: add btf_dump API for BTF-to-C conversion") Reported-by: Jean-Philippe Brucker Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Tested-by: Jean-Philippe Brucker Link: https://lore.kernel.org/bpf/20200610052335.2862559-1-andriin@fb.com --- tools/lib/bpf/btf_dump.c | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c index de07e559a11d..bbb430317260 100644 --- a/tools/lib/bpf/btf_dump.c +++ b/tools/lib/bpf/btf_dump.c @@ -1137,6 +1137,20 @@ static void btf_dump_emit_mods(struct btf_dump *d, struct id_stack *decl_stack) } } +static void btf_dump_drop_mods(struct btf_dump *d, struct id_stack *decl_stack) +{ + const struct btf_type *t; + __u32 id; + + while (decl_stack->cnt) { + id = decl_stack->ids[decl_stack->cnt - 1]; + t = btf__type_by_id(d->btf, id); + if (!btf_is_mod(t)) + return; + decl_stack->cnt--; + } +} + static void btf_dump_emit_name(const struct btf_dump *d, const char *name, bool last_was_ptr) { @@ -1235,14 +1249,7 @@ static void btf_dump_emit_type_chain(struct btf_dump *d, * a const/volatile modifier for array, so we are * going to silently skip them here. */ - while (decls->cnt) { - next_id = decls->ids[decls->cnt - 1]; - next_t = btf__type_by_id(d->btf, next_id); - if (btf_is_mod(next_t)) - decls->cnt--; - else - break; - } + btf_dump_drop_mods(d, decls); if (decls->cnt == 0) { btf_dump_emit_name(d, fname, last_was_ptr); @@ -1270,7 +1277,15 @@ static void btf_dump_emit_type_chain(struct btf_dump *d, __u16 vlen = btf_vlen(t); int i; - btf_dump_emit_mods(d, decls); + /* + * GCC emits extra volatile qualifier for + * __attribute__((noreturn)) function pointers. Clang + * doesn't do it. It's a GCC quirk for backwards + * compatibility with code written for GCC <2.5. So, + * similarly to extra qualifiers for array, just drop + * them, instead of handling them. + */ + btf_dump_drop_mods(d, decls); if (decls->cnt) { btf_dump_printf(d, " ("); btf_dump_emit_type_chain(d, decls, fname, lvl); -- cgit v1.2.3 From 47f6bc4ce1ff70d7ba0924c2f1c218c96cd585fb Mon Sep 17 00:00:00 2001 From: Brett Mastbergen Date: Tue, 9 Jun 2020 17:35:06 -0400 Subject: tools, bpf: Do not force gcc as CC This allows transparent cross-compilation with CROSS_COMPILE by relying on 7ed1c1901fe5 ("tools: fix cross-compile var clobbering"). Same change was applied to tools/bpf/bpftool/Makefile in 9e88b9312acb ("tools: bpftool: do not force gcc as CC"). 
Signed-off-by: Brett Mastbergen Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200609213506.3299-1-brett.mastbergen@gmail.com --- tools/bpf/Makefile | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/bpf/Makefile b/tools/bpf/Makefile index 77472e28c8fd..6df1850f8353 100644 --- a/tools/bpf/Makefile +++ b/tools/bpf/Makefile @@ -3,7 +3,6 @@ include ../scripts/Makefile.include prefix ?= /usr/local -CC = gcc LEX = flex YACC = bison MAKE = make -- cgit v1.2.3 From 0f5d82f187e1beda3fe7295dfc500af266a5bd80 Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Wed, 10 Jun 2020 13:41:39 -0500 Subject: net/filter: Permit reading NET in load_bytes_relative when MAC not set Added a check in the switch case on start_header that checks for the existence of the header; in the case that the MAC header is not set and the caller requests MAC, -EFAULT is returned. If the caller requests NET, then the MAC header's existence is completely ignored. There is no function to check the NET header's existence and, as far as cgroup_skb/egress is concerned, it should always be set. Removed the check for ptr >= the start of the header; considering offset is a bounded unsigned value, it should always be true. len <= end - mac is redundant to ptr + len <= end. Fixes: 3eee1f75f2b9 ("bpf: fix bpf_skb_load_bytes_relative pkt length check") Signed-off-by: YiFei Zhu Signed-off-by: Daniel Borkmann Reviewed-by: Stanislav Fomichev Link: https://lore.kernel.org/bpf/76bb820ddb6a95f59a772ecbd8c8a336f646b362.1591812755.git.zhuyifei@google.com --- net/core/filter.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 90d2eb77002f..1b7d1180931c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1755,25 +1755,27 @@ BPF_CALL_5(bpf_skb_load_bytes_relative, const struct sk_buff *, skb, u32, offset, void *, to, u32, len, u32, start_header) { u8 *end = skb_tail_pointer(skb); - u8 *net = skb_network_header(skb); - u8 *mac = skb_mac_header(skb); - u8 *ptr; + u8 *start, *ptr; - if (unlikely(offset > 0xffff || len > (end - mac))) + if (unlikely(offset > 0xffff)) goto err_clear; switch (start_header) { case BPF_HDR_START_MAC: - ptr = mac + offset; + if (unlikely(!skb_mac_header_was_set(skb))) + goto err_clear; + start = skb_mac_header(skb); break; case BPF_HDR_START_NET: - ptr = net + offset; + start = skb_network_header(skb); break; default: goto err_clear; } - if (likely(ptr >= mac && ptr + len <= end)) { + ptr = start + offset; + + if (likely(ptr + len <= end)) { memcpy(to, ptr, len); return 0; } -- cgit v1.2.3 From bd6fecb9a99cceb949271c1821cfbad2b2db97c6 Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Wed, 10 Jun 2020 13:41:40 -0500 Subject: selftests/bpf: Add cgroup_skb/egress test for load_bytes_relative When cgroup_skb/egress triggers, the MAC header is not set. Added a test that asserts that reading the MAC header returns -EFAULT but reading the NET header succeeds. The test result from within the eBPF program is stored in a 1-element array map that the userspace then reads and asserts on. Another assertion is added that reading from a large offset, past the end of the packet, returns -EFAULT.
Signed-off-by: YiFei Zhu Signed-off-by: Daniel Borkmann Reviewed-by: Stanislav Fomichev Link: https://lore.kernel.org/bpf/9028ccbea4385a620e69c0a104f469ffd655c01e.1591812755.git.zhuyifei@google.com --- .../selftests/bpf/prog_tests/load_bytes_relative.c | 71 ++++++++++++++++++++++ .../selftests/bpf/progs/load_bytes_relative.c | 48 +++++++++++++++ 2 files changed, 119 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/load_bytes_relative.c create mode 100644 tools/testing/selftests/bpf/progs/load_bytes_relative.c diff --git a/tools/testing/selftests/bpf/prog_tests/load_bytes_relative.c b/tools/testing/selftests/bpf/prog_tests/load_bytes_relative.c new file mode 100644 index 000000000000..c1168e4a9036 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/load_bytes_relative.c @@ -0,0 +1,71 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* + * Copyright 2020 Google LLC. + */ + +#include <test_progs.h> +#include <network_helpers.h> + +void test_load_bytes_relative(void) +{ + int server_fd, cgroup_fd, prog_fd, map_fd, client_fd; + int err; + struct bpf_object *obj; + struct bpf_program *prog; + struct bpf_map *test_result; + __u32 duration = 0; + + __u32 map_key = 0; + __u32 map_value = 0; + + cgroup_fd = test__join_cgroup("/load_bytes_relative"); + if (CHECK_FAIL(cgroup_fd < 0)) + return; + + server_fd = start_server(AF_INET, SOCK_STREAM); + if (CHECK_FAIL(server_fd < 0)) + goto close_cgroup_fd; + + err = bpf_prog_load("./load_bytes_relative.o", BPF_PROG_TYPE_CGROUP_SKB, + &obj, &prog_fd); + if (CHECK_FAIL(err)) + goto close_server_fd; + + test_result = bpf_object__find_map_by_name(obj, "test_result"); + if (CHECK_FAIL(!test_result)) + goto close_bpf_object; + + map_fd = bpf_map__fd(test_result); + if (map_fd < 0) + goto close_bpf_object; + + prog = bpf_object__find_program_by_name(obj, "load_bytes_relative"); + if (CHECK_FAIL(!prog)) + goto close_bpf_object; + + err = bpf_prog_attach(prog_fd, cgroup_fd, BPF_CGROUP_INET_EGRESS, + BPF_F_ALLOW_MULTI); + if (CHECK_FAIL(err)) + goto close_bpf_object; + + client_fd = connect_to_fd(AF_INET, SOCK_STREAM, server_fd); + if (CHECK_FAIL(client_fd < 0)) + goto close_bpf_object; + close(client_fd); + + err = bpf_map_lookup_elem(map_fd, &map_key, &map_value); + if (CHECK_FAIL(err)) + goto close_bpf_object; + + CHECK(map_value != 1, "bpf", "bpf program returned failure"); + +close_bpf_object: + bpf_object__close(obj); + +close_server_fd: + close(server_fd); + +close_cgroup_fd: + close(cgroup_fd); +} diff --git a/tools/testing/selftests/bpf/progs/load_bytes_relative.c b/tools/testing/selftests/bpf/progs/load_bytes_relative.c new file mode 100644 index 000000000000..dc1d04a7a3d6 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/load_bytes_relative.c @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* + * Copyright 2020 Google LLC.
+ */ + +#include <errno.h> +#include <linux/bpf.h> +#include <linux/if_ether.h> +#include <linux/ip.h> +#include <bpf/bpf_helpers.h> + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, __u32); + __type(value, __u32); +} test_result SEC(".maps"); + +SEC("cgroup_skb/egress") +int load_bytes_relative(struct __sk_buff *skb) +{ + struct ethhdr eth; + struct iphdr iph; + + __u32 map_key = 0; + __u32 test_passed = 0; + + /* MAC header is not set by the time cgroup_skb/egress triggers */ + if (bpf_skb_load_bytes_relative(skb, 0, &eth, sizeof(eth), + BPF_HDR_START_MAC) != -EFAULT) + goto fail; + + if (bpf_skb_load_bytes_relative(skb, 0, &iph, sizeof(iph), + BPF_HDR_START_NET)) + goto fail; + + if (bpf_skb_load_bytes_relative(skb, 0xffff, &iph, sizeof(iph), + BPF_HDR_START_NET) != -EFAULT) + goto fail; + + test_passed = 1; + +fail: + bpf_map_update_elem(&test_result, &map_key, &test_passed, BPF_ANY); + + return 1; +} -- cgit v1.2.3 From d4060ac969563113101c79433f2ae005feca1c29 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Wed, 10 Jun 2020 15:08:04 +0200 Subject: tools, bpftool: Fix memory leak in codegen error cases Free the memory allocated for the template on error paths in function codegen. Signed-off-by: Tobias Klauser Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200610130804.21423-1-tklauser@distanz.ch --- tools/bpf/bpftool/gen.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c index a3c4bb86c05a..ecbae47e66b8 100644 --- a/tools/bpf/bpftool/gen.c +++ b/tools/bpf/bpftool/gen.c @@ -224,6 +224,7 @@ static int codegen(const char *template, ...) } else { p_err("unrecognized character at pos %td in template '%s'", src - template - 1, template); + free(s); return -EINVAL; } } @@ -234,6 +235,7 @@ static int codegen(const char *template, ...) if (*src != '\t') { p_err("not enough tabs at pos %td in template '%s'", src - template - 1, template); + free(s); return -EINVAL; } } -- cgit v1.2.3 From aa2cad0600ed2ca6a0ab39948d4db1666b6c962b Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Thu, 11 Jun 2020 13:11:06 +0800 Subject: xdp: Fix xsk_generic_xmit errno Propagate the sock_alloc_send_skb() error code instead of unconditionally setting it to EAGAIN when skb allocation fails; otherwise user space might loop unnecessarily. Fixes: 35fcde7f8deb ("xsk: support for Tx") Signed-off-by: Li RongQing Signed-off-by: Daniel Borkmann Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/1591852266-24017-1-git-send-email-lirongqing@baidu.com --- net/xdp/xsk.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index b6c0f08bd80d..3700266229f6 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -352,10 +352,8 @@ static int xsk_generic_xmit(struct sock *sk) len = desc.len; skb = sock_alloc_send_skb(sk, len, 1, &err); - if (unlikely(!skb)) { - err = -EAGAIN; + if (unlikely(!skb)) goto out; - } skb_put(skb, len); addr = desc.addr; -- cgit v1.2.3 From 2c4779eff837f1035f6f9650d246905daadd9528 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Thu, 11 Jun 2020 12:33:41 +0200 Subject: tools, bpftool: Exit on error in function codegen Currently, the codegen function might fail and return an error. But its callers continue without checking its return value.
Since codegen can fail only in the unlikely case of the system running out of memory or the static template being malformed, just exit(-1) directly from codegen and make it void-returning. Suggested-by: Andrii Nakryiko Signed-off-by: Tobias Klauser Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200611103341.21532-1-tklauser@distanz.ch --- tools/bpf/bpftool/gen.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c index ecbae47e66b8..7443879e87af 100644 --- a/tools/bpf/bpftool/gen.c +++ b/tools/bpf/bpftool/gen.c @@ -200,7 +200,7 @@ out: return err; } -static int codegen(const char *template, ...) +static void codegen(const char *template, ...) { const char *src, *end; int skip_tabs = 0, n; @@ -211,7 +211,7 @@ static void codegen(const char *template, ...) n = strlen(template); s = malloc(n + 1); if (!s) - return -ENOMEM; + exit(-1); src = template; dst = s; @@ -225,7 +225,7 @@ static void codegen(const char *template, ...) p_err("unrecognized character at pos %td in template '%s'", src - template - 1, template); free(s); - return -EINVAL; + exit(-1); } } @@ -236,7 +236,7 @@ static void codegen(const char *template, ...) p_err("not enough tabs at pos %td in template '%s'", src - template - 1, template); free(s); - return -EINVAL; + exit(-1); } } /* trim trailing whitespace */ @@ -257,7 +257,8 @@ static void codegen(const char *template, ...) va_end(args); free(s); - return n; + if (n) + exit(-1); } static int do_skeleton(int argc, char **argv) -- cgit v1.2.3 From 2c7269b231194aae23fb90ab65842573a91acbc9 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Wed, 10 Jun 2020 12:19:43 +0200 Subject: bpf: tcp: Recv() should return 0 when the peer socket is closed If the peer is closed, we will never get more data, so tcp_bpf_wait_data will get stuck forever. In case we passed MSG_DONTWAIT to recv(), we get EAGAIN but we should actually get 0. From man 2 recv: RETURN VALUE When a stream socket peer has performed an orderly shutdown, the return value will be 0 (the traditional "end-of-file" return). This patch makes tcp_bpf_wait_data always return 1 when the peer socket has been shutdown. Either we have data available, and it would have returned 1 anyway, or there isn't, in which case we'll call tcp_recvmsg which does the right thing in this situation. Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: Sabrina Dubroca Signed-off-by: Alexei Starovoitov Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/26038a28c21fea5d04d4bd4744c5686d3f2e5504.1591784177.git.sd@queasysnail.net --- net/ipv4/tcp_bpf.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 2b915aafda42..7aa68f4aae6c 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -245,6 +245,9 @@ static int tcp_bpf_wait_data(struct sock *sk, struct sk_psock *psock, DEFINE_WAIT_FUNC(wait, woken_wake_function); int ret = 0; + if (sk->sk_shutdown & RCV_SHUTDOWN) + return 1; + if (!timeo) return ret; -- cgit v1.2.3 From f6fede8569689dd31e7b0ed15024b25e5ce2e2e5 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Thu, 11 Jun 2020 18:25:20 +0100 Subject: bpf: sockmap: Don't attach programs to UDP sockets The stream parser infrastructure isn't set up to deal with UDP sockets, so we mustn't try to attach programs to them. I remember making this change at some point, but I must have lost it while rebasing or something similar.
Fixes: 7b98cd42b049 ("bpf: sockmap: Add UDP support") Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20200611172520.327602-1-lmb@cloudflare.com --- net/core/sock_map.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 17a40a947546..a2dc64de5213 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -424,10 +424,7 @@ static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next) return 0; } -static bool sock_map_redirect_allowed(const struct sock *sk) -{ - return sk->sk_state != TCP_LISTEN; -} +static bool sock_map_redirect_allowed(const struct sock *sk); static int sock_map_update_common(struct bpf_map *map, u32 idx, struct sock *sk, u64 flags) @@ -508,6 +505,11 @@ static bool sk_is_udp(const struct sock *sk) sk->sk_protocol == IPPROTO_UDP; } +static bool sock_map_redirect_allowed(const struct sock *sk) +{ + return sk_is_tcp(sk) && sk->sk_state != TCP_LISTEN; +} + static bool sock_map_sk_is_suitable(const struct sock *sk) { return sk_is_tcp(sk) || sk_is_udp(sk); -- cgit v1.2.3 From 60e5ca8a64bad8f3e2e20a1e57846e497361c700 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Thu, 11 Jun 2020 17:08:57 -0700 Subject: bpf: Fix memlock accounting for sock_hash Add missing bpf_map_charge_init() in sock_hash_alloc() and, correspondingly, bpf_map_charge_finish() on ENOMEM. It was found accidentally while working on an unrelated selftest that checks that "map->memory.pages > 0" is true for all map types. Before: # bpftool m l ... 3692: sockhash name m_sockhash flags 0x0 key 4B value 4B max_entries 8 memlock 0B After: # bpftool m l ... 84: sockmap name m_sockmap flags 0x0 key 4B value 4B max_entries 8 memlock 4096B Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200612000857.2881453-1-rdna@fb.com --- net/core/sock_map.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/core/sock_map.c b/net/core/sock_map.c index a2dc64de5213..4059f94e9bb5 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -991,11 +991,15 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr) err = -EINVAL; goto free_htab; } + err = bpf_map_charge_init(&htab->map.memory, cost); + if (err) + goto free_htab; htab->buckets = bpf_map_area_alloc(htab->buckets_num * sizeof(struct bpf_htab_bucket), htab->map.numa_node); if (!htab->buckets) { + bpf_map_charge_finish(&htab->map.memory); err = -ENOMEM; goto free_htab; } -- cgit v1.2.3 From 22eb78792e07a4dfb63c85f34950d4e58eb90326 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 12 Jun 2020 13:16:03 -0700 Subject: tools/bpftool: Fix skeleton codegen Remove the unnecessary check at the end of the codegen() routine, which makes codegen() always fail and exit bpftool with an error code. A positive value of the variable n is not an indicator of failure. Fixes: 2c4779eff837 ("tools, bpftool: Exit on error in function codegen") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Reviewed-by: Tobias Klauser Link: https://lore.kernel.org/bpf/20200612201603.680852-1-andriin@fb.com --- tools/bpf/bpftool/gen.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c index 7443879e87af..10de76b296ba 100644 --- a/tools/bpf/bpftool/gen.c +++ b/tools/bpf/bpftool/gen.c @@ -257,8 +257,6 @@ static void codegen(const char *template, ...)
va_end(args); free(s); - if (n) - exit(-1); } static int do_skeleton(int argc, char **argv) -- cgit v1.2.3 From caf62492f479585296e9d636c798d5ac256b7b04 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 12 Jun 2020 12:45:04 -0700 Subject: libbpf: Support pre-initializing .bss global variables Remove invalid assumption in libbpf that .bss map doesn't have to be updated in kernel. With addition of skeleton and memory-mapped initialization image, .bss doesn't have to be all zeroes when BPF map is created, because user-code might have initialized those variables from user-space. Fixes: eba9c5f498a1 ("libbpf: Refactor global data map initialization") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200612194504.557844-1-andriin@fb.com --- tools/lib/bpf/libbpf.c | 4 -- tools/testing/selftests/bpf/prog_tests/skeleton.c | 45 ++++++++++++++++++++--- tools/testing/selftests/bpf/progs/test_skeleton.c | 19 ++++++++-- 3 files changed, 55 insertions(+), 13 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 7f01be2b88b8..477c679ed945 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -3564,10 +3564,6 @@ bpf_object__populate_internal_map(struct bpf_object *obj, struct bpf_map *map) char *cp, errmsg[STRERR_BUFSIZE]; int err, zero = 0; - /* kernel already zero-initializes .bss map. */ - if (map_type == LIBBPF_MAP_BSS) - return 0; - err = bpf_map_update_elem(map->fd, &zero, map->mmaped, 0); if (err) { err = -errno; diff --git a/tools/testing/selftests/bpf/prog_tests/skeleton.c b/tools/testing/selftests/bpf/prog_tests/skeleton.c index 9264a2736018..fa153cf67b1b 100644 --- a/tools/testing/selftests/bpf/prog_tests/skeleton.c +++ b/tools/testing/selftests/bpf/prog_tests/skeleton.c @@ -15,6 +15,8 @@ void test_skeleton(void) int duration = 0, err; struct test_skeleton* skel; struct test_skeleton__bss *bss; + struct test_skeleton__data *data; + struct test_skeleton__rodata *rodata; struct test_skeleton__kconfig *kcfg; skel = test_skeleton__open(); @@ -24,13 +26,45 @@ void test_skeleton(void) if (CHECK(skel->kconfig, "skel_kconfig", "kconfig is mmaped()!\n")) goto cleanup; + bss = skel->bss; + data = skel->data; + rodata = skel->rodata; + + /* validate values are pre-initialized correctly */ + CHECK(data->in1 != -1, "in1", "got %d != exp %d\n", data->in1, -1); + CHECK(data->out1 != -1, "out1", "got %d != exp %d\n", data->out1, -1); + CHECK(data->in2 != -1, "in2", "got %lld != exp %lld\n", data->in2, -1LL); + CHECK(data->out2 != -1, "out2", "got %lld != exp %lld\n", data->out2, -1LL); + + CHECK(bss->in3 != 0, "in3", "got %d != exp %d\n", bss->in3, 0); + CHECK(bss->out3 != 0, "out3", "got %d != exp %d\n", bss->out3, 0); + CHECK(bss->in4 != 0, "in4", "got %lld != exp %lld\n", bss->in4, 0LL); + CHECK(bss->out4 != 0, "out4", "got %lld != exp %lld\n", bss->out4, 0LL); + + CHECK(rodata->in6 != 0, "in6", "got %d != exp %d\n", rodata->in6, 0); + CHECK(bss->out6 != 0, "out6", "got %d != exp %d\n", bss->out6, 0); + + /* validate we can pre-setup global variables, even in .bss */ + data->in1 = 10; + data->in2 = 11; + bss->in3 = 12; + bss->in4 = 13; + rodata->in6 = 14; + err = test_skeleton__load(skel); if (CHECK(err, "skel_load", "failed to load skeleton: %d\n", err)) goto cleanup; - bss = skel->bss; - bss->in1 = 1; - bss->in2 = 2; + /* validate pre-setup values are still there */ + CHECK(data->in1 != 10, "in1", "got %d != exp %d\n", data->in1, 10); + CHECK(data->in2 != 11, "in2", "got %lld != exp %lld\n", data->in2, 11LL); 
+ CHECK(bss->in3 != 12, "in3", "got %d != exp %d\n", bss->in3, 12); + CHECK(bss->in4 != 13, "in4", "got %lld != exp %lld\n", bss->in4, 13LL); + CHECK(rodata->in6 != 14, "in6", "got %d != exp %d\n", rodata->in6, 14); + + /* now set new values and attach to get them into outX variables */ + data->in1 = 1; + data->in2 = 2; bss->in3 = 3; bss->in4 = 4; bss->in5.a = 5; @@ -44,14 +78,15 @@ void test_skeleton(void) /* trigger tracepoint */ usleep(1); - CHECK(bss->out1 != 1, "res1", "got %d != exp %d\n", bss->out1, 1); - CHECK(bss->out2 != 2, "res2", "got %lld != exp %d\n", bss->out2, 2); + CHECK(data->out1 != 1, "res1", "got %d != exp %d\n", data->out1, 1); + CHECK(data->out2 != 2, "res2", "got %lld != exp %d\n", data->out2, 2); CHECK(bss->out3 != 3, "res3", "got %d != exp %d\n", (int)bss->out3, 3); CHECK(bss->out4 != 4, "res4", "got %lld != exp %d\n", bss->out4, 4); CHECK(bss->handler_out5.a != 5, "res5", "got %d != exp %d\n", bss->handler_out5.a, 5); CHECK(bss->handler_out5.b != 6, "res6", "got %lld != exp %d\n", bss->handler_out5.b, 6); + CHECK(bss->out6 != 14, "res7", "got %d != exp %d\n", bss->out6, 14); CHECK(bss->bpf_syscall != kcfg->CONFIG_BPF_SYSCALL, "ext1", "got %d != exp %d\n", bss->bpf_syscall, kcfg->CONFIG_BPF_SYSCALL); diff --git a/tools/testing/selftests/bpf/progs/test_skeleton.c b/tools/testing/selftests/bpf/progs/test_skeleton.c index de03a90f78ca..77ae86f44db5 100644 --- a/tools/testing/selftests/bpf/progs/test_skeleton.c +++ b/tools/testing/selftests/bpf/progs/test_skeleton.c @@ -10,16 +10,26 @@ struct s { long long b; } __attribute__((packed)); -int in1 = 0; -long long in2 = 0; +/* .data section */ +int in1 = -1; +long long in2 = -1; + +/* .bss section */ char in3 = '\0'; long long in4 __attribute__((aligned(64))) = 0; struct s in5 = {}; -long long out2 = 0; +/* .rodata section */ +const volatile int in6 = 0; + +/* .data section */ +int out1 = -1; +long long out2 = -1; + +/* .bss section */ char out3 = 0; long long out4 = 0; -int out1 = 0; +int out6 = 0; extern bool CONFIG_BPF_SYSCALL __kconfig; extern int LINUX_KERNEL_VERSION __kconfig; @@ -36,6 +46,7 @@ int handler(const void *ctx) out3 = in3; out4 = in4; out5 = in5; + out6 = in6; bpf_syscall = CONFIG_BPF_SYSCALL; kern_ver = LINUX_KERNEL_VERSION; -- cgit v1.2.3 From 29fcb05bbf1a7008900bb9bee347bdbfc7171036 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 12 Jun 2020 17:21:15 -0700 Subject: bpf: Undo internal BPF_PROBE_MEM in BPF insns dump BPF_PROBE_MEM is a kernel-internal implementation detail. When dumping BPF instructions to user-space, it needs to be replaced back with BPF_MEM mode.
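A sketch of the opcode arithmetic involved (constants copied from the BPF uapi headers and the kernel-internal BPF_PROBE_MEM definition as of this release; simplified): the opcode byte packs class, size and mode, so the dump rewrite only swaps the mode bits while preserving class and size.

    #include <stdio.h>

    #define BPF_CLASS(code) ((code) & 0x07)
    #define BPF_SIZE(code)  ((code) & 0x18)
    #define BPF_MODE(code)  ((code) & 0xe0)
    #define BPF_LDX         0x01
    #define BPF_DW          0x18
    #define BPF_MEM         0x60
    #define BPF_PROBE_MEM   0x20 /* kernel-internal LDX mode */

    int main(void)
    {
        unsigned char code = BPF_LDX | BPF_DW | BPF_PROBE_MEM;

        if (BPF_CLASS(code) == BPF_LDX && BPF_MODE(code) == BPF_PROBE_MEM)
            code = BPF_LDX | BPF_SIZE(code) | BPF_MEM;

        printf("0x%02x\n", code); /* 0x79 == BPF_LDX | BPF_DW | BPF_MEM */
        return 0;
    }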
Fixes: 2a02759ef5f8 ("bpf: Add support for BTF pointers to interpreter") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200613002115.1632142-1-andriin@fb.com --- kernel/bpf/syscall.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4d530b1d5683..e9a3ebc00e08 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3158,6 +3158,7 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog) struct bpf_insn *insns; u32 off, type; u64 imm; + u8 code; int i; insns = kmemdup(prog->insnsi, bpf_prog_insn_size(prog), @@ -3166,21 +3167,27 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog) return insns; for (i = 0; i < prog->len; i++) { - if (insns[i].code == (BPF_JMP | BPF_TAIL_CALL)) { + code = insns[i].code; + + if (code == (BPF_JMP | BPF_TAIL_CALL)) { insns[i].code = BPF_JMP | BPF_CALL; insns[i].imm = BPF_FUNC_tail_call; /* fall-through */ } - if (insns[i].code == (BPF_JMP | BPF_CALL) || - insns[i].code == (BPF_JMP | BPF_CALL_ARGS)) { - if (insns[i].code == (BPF_JMP | BPF_CALL_ARGS)) + if (code == (BPF_JMP | BPF_CALL) || + code == (BPF_JMP | BPF_CALL_ARGS)) { + if (code == (BPF_JMP | BPF_CALL_ARGS)) insns[i].code = BPF_JMP | BPF_CALL; if (!bpf_dump_raw_ok()) insns[i].imm = 0; continue; } + if (BPF_CLASS(code) == BPF_LDX && BPF_MODE(code) == BPF_PROBE_MEM) { + insns[i].code = BPF_LDX | BPF_SIZE(code) | BPF_MEM; + continue; + } - if (insns[i].code != (BPF_LD | BPF_IMM | BPF_DW)) + if (code != (BPF_LD | BPF_IMM | BPF_DW)) continue; imm = ((u64)insns[i + 1].imm << 32) | (u32)insns[i].imm; -- cgit v1.2.3
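Taken together, the devmap UAPI changes above mean userspace can populate a DEVMAP entry along the lines of this hedged sketch (hypothetical map_fd/ifindex values; assumes libbpf's bpf_map_update_elem() and the struct bpf_devmap_val now exported in linux/bpf.h):

    #include <string.h>
    #include <linux/bpf.h>
    #include <bpf/bpf.h>

    /* Illustrative helper, not from the patches: zero-initialising the
     * value leaves bpf_prog.fd == 0, i.e. "no per-entry XDP program",
     * matching the fixed semantics.
     */
    static int devmap_set(int map_fd, __u32 slot, __u32 ifindex, int prog_fd)
    {
        struct bpf_devmap_val val;

        memset(&val, 0, sizeof(val));
        val.ifindex = ifindex;
        if (prog_fd > 0)
            val.bpf_prog.fd = prog_fd; /* attach per-entry program */

        return bpf_map_update_elem(map_fd, &slot, &val, BPF_ANY);
    }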