summaryrefslogtreecommitdiffstats
path: root/net/ipv4/tcp_bpf.c
diff options
context:
space:
mode:
authorJakub Kicinski <kuba@kernel.org>2023-05-25 19:56:10 -0700
committerJakub Kicinski <kuba@kernel.org>2023-05-25 19:57:39 -0700
commitd4031ec844bc52fe7f2f844e9c476727fd6b8240 (patch)
treeb350f90348c94ccd6e6d5dcd31ad371e3486e177 /net/ipv4/tcp_bpf.c
parent657d42cf5df64d9f32caab73ba2d2284879a37b0 (diff)
parentad42a35bdfc6d3c0fc4cb4027d7b2757ce665665 (diff)
downloadlinux-d4031ec844bc52fe7f2f844e9c476727fd6b8240.tar.gz
linux-d4031ec844bc52fe7f2f844e9c476727fd6b8240.tar.xz
Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net
Cross-merge networking fixes after downstream PR. Conflicts: net/ipv4/raw.c 3632679d9e4f ("ipv{4,6}/raw: fix output xfrm lookup wrt protocol") c85be08fc4fa ("raw: Stop using RTO_ONLINK.") https://lore.kernel.org/all/20230525110037.2b532b83@canb.auug.org.au/ Adjacent changes: drivers/net/ethernet/freescale/fec_main.c 9025944fddfe ("net: fec: add dma_wmb to ensure correct descriptor values") 144470c88c5d ("net: fec: using the standard return codes when xdp xmit errors") Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Diffstat (limited to 'net/ipv4/tcp_bpf.c')
-rw-r--r--net/ipv4/tcp_bpf.c79
1 files changed, 78 insertions, 1 deletions
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index 0291d15acd19..e75023ea052f 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -11,6 +11,24 @@
#include <net/inet_common.h>
#include <net/tls.h>
+void tcp_eat_skb(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tcp;
+ int copied;
+
+ if (!skb || !skb->len || !sk_is_tcp(sk))
+ return;
+
+ if (skb_bpf_strparser(skb))
+ return;
+
+ tcp = tcp_sk(sk);
+ copied = tcp->copied_seq + skb->len;
+ WRITE_ONCE(tcp->copied_seq, copied);
+ tcp_rcv_space_adjust(sk);
+ __tcp_cleanup_rbuf(sk, skb->len);
+}
+
static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock,
struct sk_msg *msg, u32 apply_bytes, int flags)
{
@@ -178,14 +196,34 @@ static int tcp_msg_wait_data(struct sock *sk, struct sk_psock *psock,
return ret;
}
+static bool is_next_msg_fin(struct sk_psock *psock)
+{
+ struct scatterlist *sge;
+ struct sk_msg *msg_rx;
+ int i;
+
+ msg_rx = sk_psock_peek_msg(psock);
+ i = msg_rx->sg.start;
+ sge = sk_msg_elem(msg_rx, i);
+ if (!sge->length) {
+ struct sk_buff *skb = msg_rx->skb;
+
+ if (skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
+ return true;
+ }
+ return false;
+}
+
static int tcp_bpf_recvmsg_parser(struct sock *sk,
struct msghdr *msg,
size_t len,
int flags,
int *addr_len)
{
+ struct tcp_sock *tcp = tcp_sk(sk);
+ u32 seq = tcp->copied_seq;
struct sk_psock *psock;
- int copied;
+ int copied = 0;
if (unlikely(flags & MSG_ERRQUEUE))
return inet_recv_error(sk, msg, len, addr_len);
@@ -198,8 +236,43 @@ static int tcp_bpf_recvmsg_parser(struct sock *sk,
return tcp_recvmsg(sk, msg, len, flags, addr_len);
lock_sock(sk);
+
+ /* We may have received data on the sk_receive_queue pre-accept and
+ * then we can not use read_skb in this context because we haven't
+ * assigned a sk_socket yet so have no link to the ops. The work-around
+ * is to check the sk_receive_queue and in these cases read skbs off
+ * queue again. The read_skb hook is not running at this point because
+ * of lock_sock so we avoid having multiple runners in read_skb.
+ */
+ if (unlikely(!skb_queue_empty(&sk->sk_receive_queue))) {
+ tcp_data_ready(sk);
+ /* This handles the ENOMEM errors if we both receive data
+ * pre accept and are already under memory pressure. At least
+ * let user know to retry.
+ */
+ if (unlikely(!skb_queue_empty(&sk->sk_receive_queue))) {
+ copied = -EAGAIN;
+ goto out;
+ }
+ }
+
msg_bytes_ready:
copied = sk_msg_recvmsg(sk, psock, msg, len, flags);
+ /* The typical case for EFAULT is the socket was gracefully
+ * shutdown with a FIN pkt. So check here the other case is
+ * some error on copy_page_to_iter which would be unexpected.
+ * On fin return correct return code to zero.
+ */
+ if (copied == -EFAULT) {
+ bool is_fin = is_next_msg_fin(psock);
+
+ if (is_fin) {
+ copied = 0;
+ seq++;
+ goto out;
+ }
+ }
+ seq += copied;
if (!copied) {
long timeo;
int data;
@@ -237,6 +310,10 @@ msg_bytes_ready:
copied = -EAGAIN;
}
out:
+ WRITE_ONCE(tcp->copied_seq, seq);
+ tcp_rcv_space_adjust(sk);
+ if (copied > 0)
+ __tcp_cleanup_rbuf(sk, copied);
release_sock(sk);
sk_psock_put(sk, psock);
return copied;