From 5540fbf43845868defcb599ec91c678275a20671 Mon Sep 17 00:00:00 2001 From: William Tu Date: Tue, 24 Apr 2018 23:46:59 -0700 Subject: bpf: clear the ip_tunnel_info. The percpu metadata_dst might carry the stale ip_tunnel_info and cause incorrect behavior. When mixing tests using ipv4/ipv6 bpf vxlan and geneve tunnel, the ipv6 tunnel info incorrectly uses ipv4's src ip addr as its ipv6 src address, because the previous tunnel info does not clean up. The patch zeros the fields in ip_tunnel_info. Signed-off-by: William Tu Reported-by: Yifeng Sun Signed-off-by: Daniel Borkmann --- net/core/filter.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index d31aff93270d0..e77c30ca491df 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3240,6 +3240,7 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb, skb_dst_set(skb, (struct dst_entry *) md); info = &md->u.tun_info; + memset(info, 0, sizeof(*info)); info->mode = IP_TUNNEL_INFO_TX; info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE; -- cgit v1.2.3 From 070204a34884110ac5e19c1e2e036fcfd033f8e3 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Wed, 25 Apr 2018 12:48:58 +0200 Subject: net/smc: keep clcsock reference in smc_tcp_listen_work() The internal CLC socket should exist till the SMC-socket is released. Function tcp_listen_worker() releases the internal CLC socket of a listen socket, if an smc_close_active() is called. This function is called for the final release(), but it is called for shutdown SHUT_RDWR as well. This opens a door for protection faults, if socket calls using the internal CLC socket are called for a shutdown listen socket. With the changes of commit 3d502067599f ("net/smc: simplify wait when closing listen socket") there is no need anymore to release the internal CLC socket in function tcp_listen_worker((). It is sufficient to release it in smc_release(). Fixes: 127f49705823 ("net/smc: release clcsock from tcp_listen_worker") Signed-off-by: Ursula Braun Reported-by: syzbot+9045fc589fcd196ef522@syzkaller.appspotmail.com Reported-by: syzbot+28a2c86cf19c81d871fa@syzkaller.appspotmail.com Reported-by: syzbot+9605e6cace1b5efd4a0a@syzkaller.appspotmail.com Reported-by: syzbot+cf9012c597c8379d535c@syzkaller.appspotmail.com Signed-off-by: David S. Miller --- net/smc/af_smc.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index f5d4b69dbabc8..4470501374bf9 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -978,10 +978,6 @@ static void smc_tcp_listen_work(struct work_struct *work) } out: - if (lsmc->clcsock) { - sock_release(lsmc->clcsock); - lsmc->clcsock = NULL; - } release_sock(lsk); sock_put(&lsmc->sk); /* sock_hold in smc_listen */ } -- cgit v1.2.3 From 91a825290ca4eae88603bc811bf74a45f94a3f46 Mon Sep 17 00:00:00 2001 From: Dag Moxnes Date: Wed, 25 Apr 2018 13:22:01 +0200 Subject: rds: ib: Fix missing call to rds_ib_dev_put in rds_ib_setup_qp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The function rds_ib_setup_qp is calling rds_ib_get_client_data and should correspondingly call rds_ib_dev_put. This call was lost in the non-error path with the introduction of error handling done in commit 3b12f73a5c29 ("rds: ib: add error handle") Signed-off-by: Dag Moxnes Reviewed-by: HÃ¥kon Bugge Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/ib_cm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index eea1d8611b205..13b38ad0fa4a4 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -547,7 +547,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) rdsdebug("conn %p pd %p cq %p %p\n", conn, ic->i_pd, ic->i_send_cq, ic->i_recv_cq); - return ret; + goto out; sends_out: vfree(ic->i_sends); @@ -572,6 +572,7 @@ send_cq_out: ic->i_send_cq = NULL; rds_ibdev_out: rds_ib_remove_conn(rds_ibdev, conn); +out: rds_ib_dev_put(rds_ibdev); return ret; -- cgit v1.2.3 From 7dbc73e6124ce4d0cfbdd6166de388e9367c47ad Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Wed, 25 Apr 2018 18:29:25 +0200 Subject: tipc: fix bug in function tipc_nl_node_dump_monitor Commit 36a50a989ee8 ("tipc: fix infinite loop when dumping link monitor summary") intended to fix a problem with user tool looping when max number of bearers are enabled. Unfortunately, the wrong version of the commit was posted, so the problem was not solved at all. This commit adds the missing part. Fixes: 36a50a989ee8 ("tipc: fix infinite loop when dumping link monitor summary") Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/node.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/node.c b/net/tipc/node.c index 6f98b56dd48ec..baaf93f12cbd0 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -2244,7 +2244,7 @@ int tipc_nl_node_dump_monitor(struct sk_buff *skb, struct netlink_callback *cb) rtnl_lock(); for (bearer_id = prev_bearer; bearer_id < MAX_BEARERS; bearer_id++) { - err = __tipc_nl_add_monitor(net, &msg, prev_bearer); + err = __tipc_nl_add_monitor(net, &msg, bearer_id); if (err) break; } -- cgit v1.2.3 From 16ae6aa1705299789f71fdea59bfb119c1fbd9c0 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 25 Apr 2018 11:33:08 -0700 Subject: tcp: ignore Fast Open on repair mode The TCP repair sequence of operation is to first set the socket in repair mode, then inject the TCP stats into the socket with repair socket options, then call connect() to re-activate the socket. The connect syscall simply returns and set state to ESTABLISHED mode. As a result Fast Open is meaningless for TCP repair. However allowing sendto() system call with MSG_FASTOPEN flag half-way during the repair operation could unexpectedly cause data to be sent, before the operation finishes changing the internal TCP stats (e.g. MSS). This in turn triggers TCP warnings on inconsistent packet accounting. The fix is to simply disallow Fast Open operation once the socket is in the repair mode. Reported-by: syzbot Signed-off-by: Yuchung Cheng Reviewed-by: Neal Cardwell Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 9ce1c726185eb..4b18ad41d4df3 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1204,7 +1204,8 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) uarg->zerocopy = 0; } - if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect)) { + if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect) && + !tp->repair) { err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size); if (err == -EINPROGRESS && copied_syn > 0) goto out; -- cgit v1.2.3 From d625329b06e46bd20baf9ee40847d11982569204 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 26 Apr 2018 14:13:57 +0800 Subject: sctp: handle two v4 addrs comparison in sctp_inet6_cmp_addr Since sctp ipv6 socket also supports v4 addrs, it's possible to compare two v4 addrs in pf v6 .cmp_addr, sctp_inet6_cmp_addr. However after Commit 1071ec9d453a ("sctp: do not check port in sctp_inet6_cmp_addr"), it no longer calls af1->cmp_addr, which in this case is sctp_v4_cmp_addr, but calls __sctp_v6_cmp_addr where it handles them as two v6 addrs. It would cause a out of bounds crash. syzbot found this crash when trying to bind two v4 addrs to a v6 socket. This patch fixes it by adding the process for two v4 addrs in sctp_inet6_cmp_addr. Fixes: 1071ec9d453a ("sctp: do not check port in sctp_inet6_cmp_addr") Reported-by: syzbot+cd494c1dd681d4d93ebb@syzkaller.appspotmail.com Signed-off-by: Xin Long Acked-by: Neil Horman Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/ipv6.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 2e3f7b75a8ece..42247110d842e 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -895,6 +895,9 @@ static int sctp_inet6_cmp_addr(const union sctp_addr *addr1, if (sctp_is_any(sk, addr1) || sctp_is_any(sk, addr2)) return 1; + if (addr1->sa.sa_family == AF_INET && addr2->sa.sa_family == AF_INET) + return addr1->v4.sin_addr.s_addr == addr2->v4.sin_addr.s_addr; + return __sctp_v6_cmp_addr(addr1, addr2); } -- cgit v1.2.3 From 6a9a27d5397fc6c52f90c09ddab91e65053584aa Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 26 Apr 2018 15:21:44 +0800 Subject: sctp: clear the new asoc's stream outcnt in sctp_stream_update When processing a duplicate cookie-echo chunk, sctp moves the new temp asoc's stream out/in into the old asoc, and later frees this new temp asoc. But now after this move, the new temp asoc's stream->outcnt is not cleared while stream->out is set to NULL, which would cause a same crash as the one fixed in Commit 79d0895140e9 ("sctp: fix error path in sctp_stream_init") when freeing this asoc later. This fix is to clear this outcnt in sctp_stream_update. Fixes: f952be79cebd ("sctp: introduce struct sctp_stream_out_ext") Reported-by: Jianwen Ji Signed-off-by: Xin Long Acked-by: Neil Horman Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/stream.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/sctp/stream.c b/net/sctp/stream.c index f799043abec9a..f1f1d1b232ba3 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -240,6 +240,8 @@ void sctp_stream_update(struct sctp_stream *stream, struct sctp_stream *new) new->out = NULL; new->in = NULL; + new->outcnt = 0; + new->incnt = 0; } static int sctp_send_reconf(struct sctp_association *asoc, -- cgit v1.2.3 From 988bf7243e03ef69238381594e0334a79cef74a6 Mon Sep 17 00:00:00 2001 From: Lance Richardson Date: Wed, 25 Apr 2018 10:21:54 -0400 Subject: net: support compat 64-bit time in {s,g}etsockopt For the x32 ABI, struct timeval has two 64-bit fields. However the kernel currently interprets the user-space values used for the SO_RCVTIMEO and SO_SNDTIMEO socket options as having a pair of 32-bit fields. When the seconds portion of the requested timeout is less than 2**32, the seconds portion of the effective timeout is correct but the microseconds portion is zero. When the seconds portion of the requested timeout is zero and the microseconds portion is non-zero, the kernel interprets the timeout as zero (never timeout). Fix by using 64-bit time for SO_RCVTIMEO/SO_SNDTIMEO as required for the ABI. The code included below demonstrates the problem. Results before patch: $ gcc -m64 -Wall -O2 -o socktmo socktmo.c && ./socktmo recv time: 2.008181 seconds send time: 2.015985 seconds $ gcc -m32 -Wall -O2 -o socktmo socktmo.c && ./socktmo recv time: 2.016763 seconds send time: 2.016062 seconds $ gcc -mx32 -Wall -O2 -o socktmo socktmo.c && ./socktmo recv time: 1.007239 seconds send time: 1.023890 seconds Results after patch: $ gcc -m64 -O2 -Wall -o socktmo socktmo.c && ./socktmo recv time: 2.010062 seconds send time: 2.015836 seconds $ gcc -m32 -O2 -Wall -o socktmo socktmo.c && ./socktmo recv time: 2.013974 seconds send time: 2.015981 seconds $ gcc -mx32 -O2 -Wall -o socktmo socktmo.c && ./socktmo recv time: 2.030257 seconds send time: 2.013383 seconds #include #include #include #include #include void checkrc(char *str, int rc) { if (rc >= 0) return; perror(str); exit(1); } static char buf[1024]; int main(int argc, char **argv) { int rc; int socks[2]; struct timeval tv; struct timeval start, end, delta; rc = socketpair(AF_UNIX, SOCK_STREAM, 0, socks); checkrc("socketpair", rc); /* set timeout to 1.999999 seconds */ tv.tv_sec = 1; tv.tv_usec = 999999; rc = setsockopt(socks[0], SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof tv); rc = setsockopt(socks[0], SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof tv); checkrc("setsockopt", rc); /* measure actual receive timeout */ gettimeofday(&start, NULL); rc = recv(socks[0], buf, sizeof buf, 0); gettimeofday(&end, NULL); timersub(&end, &start, &delta); printf("recv time: %ld.%06ld seconds\n", (long)delta.tv_sec, (long)delta.tv_usec); /* fill send buffer */ do { rc = send(socks[0], buf, sizeof buf, 0); } while (rc > 0); /* measure actual send timeout */ gettimeofday(&start, NULL); rc = send(socks[0], buf, sizeof buf, 0); gettimeofday(&end, NULL); timersub(&end, &start, &delta); printf("send time: %ld.%06ld seconds\n", (long)delta.tv_sec, (long)delta.tv_usec); exit(0); } Fixes: 515c7af85ed9 ("x32: Use compat shims for {g,s}etsockopt") Reported-by: Gopal RajagopalSai Signed-off-by: Lance Richardson Signed-off-by: David S. Miller --- net/compat.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/compat.c b/net/compat.c index 5ae7437d38538..7242cce5631bd 100644 --- a/net/compat.c +++ b/net/compat.c @@ -377,7 +377,8 @@ static int compat_sock_setsockopt(struct socket *sock, int level, int optname, optname == SO_ATTACH_REUSEPORT_CBPF) return do_set_attach_filter(sock, level, optname, optval, optlen); - if (optname == SO_RCVTIMEO || optname == SO_SNDTIMEO) + if (!COMPAT_USE_64BIT_TIME && + (optname == SO_RCVTIMEO || optname == SO_SNDTIMEO)) return do_set_sock_timeout(sock, level, optname, optval, optlen); return sock_setsockopt(sock, level, optname, optval, optlen); @@ -448,7 +449,8 @@ static int do_get_sock_timeout(struct socket *sock, int level, int optname, static int compat_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { - if (optname == SO_RCVTIMEO || optname == SO_SNDTIMEO) + if (!COMPAT_USE_64BIT_TIME && + (optname == SO_RCVTIMEO || optname == SO_SNDTIMEO)) return do_get_sock_timeout(sock, level, optname, optval, optlen); return sock_getsockopt(sock, level, optname, optval, optlen); } -- cgit v1.2.3 From e8238fc2bd7b4c3c7554fa2df067e796610212fc Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Fri, 27 Apr 2018 20:59:24 +0800 Subject: bridge: check iface upper dev when setting master via ioctl When we set a bond slave's master to bridge via ioctl, we only check the IFF_BRIDGE_PORT flag. Although we will find the slave's real master at netdev_master_upper_dev_link() later, it already does some settings and allocates some resources. It would be better to return as early as possible. v1 -> v2: use netdev_master_upper_dev_get() instead of netdev_has_any_upper_dev() to check if we have a master, because not all upper devs are masters, e.g. vlan device. Reported-by: syzbot+de73361ee4971b6e6f75@syzkaller.appspotmail.com Signed-off-by: Hangbin Liu Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_if.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 82c1a6f430b37..5bb6681fa91e6 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -518,8 +518,8 @@ int br_add_if(struct net_bridge *br, struct net_device *dev, return -ELOOP; } - /* Device is already being bridged */ - if (br_port_exists(dev)) + /* Device has master upper dev */ + if (netdev_master_upper_dev_get(dev)) return -EBUSY; /* No bridging devices that dislike that (e.g. wireless) */ -- cgit v1.2.3 From cea67a2dd6b2419dcc13a39309b9a79a1f773193 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 29 Apr 2018 09:54:59 -0700 Subject: ipv6: fix uninit-value in ip6_multipath_l3_keys() syzbot/KMSAN reported an uninit-value in ip6_multipath_l3_keys(), root caused to a bad assumption of ICMP header being already pulled in skb->head ip_multipath_l3_keys() does the correct thing, so it is an IPv6 only bug. BUG: KMSAN: uninit-value in ip6_multipath_l3_keys net/ipv6/route.c:1830 [inline] BUG: KMSAN: uninit-value in rt6_multipath_hash+0x5c4/0x640 net/ipv6/route.c:1858 CPU: 0 PID: 4507 Comm: syz-executor661 Not tainted 4.16.0+ #87 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:17 [inline] dump_stack+0x185/0x1d0 lib/dump_stack.c:53 kmsan_report+0x142/0x240 mm/kmsan/kmsan.c:1067 __msan_warning_32+0x6c/0xb0 mm/kmsan/kmsan_instr.c:683 ip6_multipath_l3_keys net/ipv6/route.c:1830 [inline] rt6_multipath_hash+0x5c4/0x640 net/ipv6/route.c:1858 ip6_route_input+0x65a/0x920 net/ipv6/route.c:1884 ip6_rcv_finish+0x413/0x6e0 net/ipv6/ip6_input.c:69 NF_HOOK include/linux/netfilter.h:288 [inline] ipv6_rcv+0x1e16/0x2340 net/ipv6/ip6_input.c:208 __netif_receive_skb_core+0x47df/0x4a90 net/core/dev.c:4562 __netif_receive_skb net/core/dev.c:4627 [inline] netif_receive_skb_internal+0x49d/0x630 net/core/dev.c:4701 netif_receive_skb+0x230/0x240 net/core/dev.c:4725 tun_rx_batched drivers/net/tun.c:1555 [inline] tun_get_user+0x740f/0x7c60 drivers/net/tun.c:1962 tun_chr_write_iter+0x1d4/0x330 drivers/net/tun.c:1990 call_write_iter include/linux/fs.h:1782 [inline] new_sync_write fs/read_write.c:469 [inline] __vfs_write+0x7fb/0x9f0 fs/read_write.c:482 vfs_write+0x463/0x8d0 fs/read_write.c:544 SYSC_write+0x172/0x360 fs/read_write.c:589 SyS_write+0x55/0x80 fs/read_write.c:581 do_syscall_64+0x309/0x430 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x3d/0xa2 Fixes: 23aebdacb05d ("ipv6: Compute multipath hash for ICMP errors from offending packet") Signed-off-by: Eric Dumazet Reported-by: syzbot Cc: Jakub Sitnicki Acked-by: Jakub Sitnicki Signed-off-by: David S. Miller --- net/ipv6/route.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index cde7d8251377c..f4d61736c41ab 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1835,11 +1835,16 @@ static void ip6_multipath_l3_keys(const struct sk_buff *skb, const struct ipv6hdr *inner_iph; const struct icmp6hdr *icmph; struct ipv6hdr _inner_iph; + struct icmp6hdr _icmph; if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6)) goto out; - icmph = icmp6_hdr(skb); + icmph = skb_header_pointer(skb, skb_transport_offset(skb), + sizeof(_icmph), &_icmph); + if (!icmph) + goto out; + if (icmph->icmp6_type != ICMPV6_DEST_UNREACH && icmph->icmp6_type != ICMPV6_PKT_TOOBIG && icmph->icmp6_type != ICMPV6_TIME_EXCEED && -- cgit v1.2.3 From bf2acc943a45d2b2e8a9f1a5ddff6b6e43cc69d9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 29 Apr 2018 18:55:20 -0700 Subject: tcp: fix TCP_REPAIR_QUEUE bound checking syzbot is able to produce a nasty WARN_ON() in tcp_verify_left_out() with following C-repro : socket(PF_INET, SOCK_STREAM, IPPROTO_IP) = 3 setsockopt(3, SOL_TCP, TCP_REPAIR, [1], 4) = 0 setsockopt(3, SOL_TCP, TCP_REPAIR_QUEUE, [-1], 4) = 0 bind(3, {sa_family=AF_INET, sin_port=htons(20002), sin_addr=inet_addr("0.0.0.0")}, 16) = 0 sendto(3, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 1242, MSG_FASTOPEN, {sa_family=AF_INET, sin_port=htons(20002), sin_addr=inet_addr("127.0.0.1")}, 16) = 1242 setsockopt(3, SOL_TCP, TCP_REPAIR_WINDOW, "\4\0\0@+\205\0\0\377\377\0\0\377\377\377\177\0\0\0\0", 20) = 0 writev(3, [{"\270", 1}], 1) = 1 setsockopt(3, SOL_TCP, TCP_REPAIR_OPTIONS, "\10\0\0\0\0\0\0\0\0\0\0\0|\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 386) = 0 writev(3, [{"\210v\r[\226\320t\231qwQ\204\264l\254\t\1\20\245\214p\350H\223\254;\\\37\345\307p$"..., 3144}], 1) = 3144 The 3rd system call looks odd : setsockopt(3, SOL_TCP, TCP_REPAIR_QUEUE, [-1], 4) = 0 This patch makes sure bound checking is using an unsigned compare. Fixes: ee9952831cfd ("tcp: Initial repair mode") Signed-off-by: Eric Dumazet Reported-by: syzbot Cc: Pavel Emelyanov Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 4b18ad41d4df3..44be7f43455e4 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2674,7 +2674,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, case TCP_REPAIR_QUEUE: if (!tp->repair) err = -EPERM; - else if (val < TCP_QUEUES_NR) + else if ((unsigned int)val < TCP_QUEUES_NR) tp->repair_queue = val; else err = -EINVAL; -- cgit v1.2.3 From d656fe49e33df48ee6bc19e871f5862f49895c9e Mon Sep 17 00:00:00 2001 From: Wenwen Wang Date: Mon, 30 Apr 2018 12:31:13 -0500 Subject: ethtool: fix a potential missing-check bug In ethtool_get_rxnfc(), the object "info" is firstly copied from user-space. If the FLOW_RSS flag is set in the member field flow_type of "info" (and cmd is ETHTOOL_GRXFH), info needs to be copied again from user-space because FLOW_RSS is newer and has new definition, as mentioned in the comment. However, given that the user data resides in user-space, a malicious user can race to change the data after the first copy. By doing so, the user can inject inconsistent data. For example, in the second copy, the FLOW_RSS flag could be cleared in the field flow_type of "info". In the following execution, "info" will be used in the function ops->get_rxnfc(). Such inconsistent data can potentially lead to unexpected information leakage since ops->get_rxnfc() will prepare various types of data according to flow_type, and the prepared data will be eventually copied to user-space. This inconsistent data may also cause undefined behaviors based on how ops->get_rxnfc() is implemented. This patch simply re-verifies the flow_type field of "info" after the second copy. If the value is not as expected, an error code will be returned. Signed-off-by: Wenwen Wang Signed-off-by: David S. Miller --- net/core/ethtool.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 03416e6dd5d7b..ba02f0dfe85cd 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -1032,6 +1032,11 @@ static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev, info_size = sizeof(info); if (copy_from_user(&info, useraddr, info_size)) return -EFAULT; + /* Since malicious users may modify the original data, + * we need to check whether FLOW_RSS is still requested. + */ + if (!(info.flow_type & FLOW_RSS)) + return -EINVAL; } if (info.cmd == ETHTOOL_GRXCLSRLALL) { -- cgit v1.2.3 From edd7ceb78296fb1574958991b6655c3c2cedf124 Mon Sep 17 00:00:00 2001 From: Thomas Winter Date: Tue, 1 May 2018 09:15:29 +1200 Subject: ipv6: Allow non-gateway ECMP for IPv6 It is valid to have static routes where the nexthop is an interface not an address such as tunnels. For IPv4 it was possible to use ECMP on these routes but not for IPv6. Signed-off-by: Thomas Winter Cc: David Ahern Cc: "David S. Miller" Cc: Alexey Kuznetsov Cc: Hideaki YOSHIFUJI Acked-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_route.h | 3 +-- net/ipv6/ip6_fib.c | 3 --- 2 files changed, 1 insertion(+), 5 deletions(-) (limited to 'net') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 08b132381984f..abceb5864d995 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -68,8 +68,7 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr) static inline bool rt6_qualify_for_ecmp(const struct rt6_info *rt) { - return (rt->rt6i_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) == - RTF_GATEWAY; + return (rt->rt6i_flags & (RTF_ADDRCONF | RTF_DYNAMIC)) == 0; } void ip6_route_input(struct sk_buff *skb); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index deab2db6692eb..3c97c29d44013 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -934,9 +934,6 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, * list. * Only static routes (which don't have flag * RTF_EXPIRES) are used for ECMPv6. - * - * To avoid long list, we only had siblings if the - * route have a gateway. */ if (rt_can_ecmp && rt6_qualify_for_ecmp(iter)) -- cgit v1.2.3 From c212d2c7fc4736d49be102fb7a1a545cdc2f1fea Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Tue, 1 May 2018 13:05:39 -0700 Subject: net/tls: Don't recursively call push_record during tls_write_space callbacks It is reported that in some cases, write_space may be called in do_tcp_sendpages, such that we recursively invoke do_tcp_sendpages again: [ 660.468802] ? do_tcp_sendpages+0x8d/0x580 [ 660.468826] ? tls_push_sg+0x74/0x130 [tls] [ 660.468852] ? tls_push_record+0x24a/0x390 [tls] [ 660.468880] ? tls_write_space+0x6a/0x80 [tls] ... tls_push_sg already does a loop over all sending sg's, so ignore any tls_write_space notifications until we are done sending. We then have to call the previous write_space to wake up poll() waiters after we are done with the send loop. Reported-by: Andre Tomt Signed-off-by: Dave Watson Signed-off-by: David S. Miller --- include/net/tls.h | 1 + net/tls/tls_main.c | 7 +++++++ 2 files changed, 8 insertions(+) (limited to 'net') diff --git a/include/net/tls.h b/include/net/tls.h index 3da8e13a6d966..b400d0bb74483 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -148,6 +148,7 @@ struct tls_context { struct scatterlist *partially_sent_record; u16 partially_sent_offset; unsigned long flags; + bool in_tcp_sendpages; u16 pending_open_record_frags; int (*push_pending_record)(struct sock *sk, int flags); diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 0d379970960e6..cc03e00785c7f 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -114,6 +114,7 @@ int tls_push_sg(struct sock *sk, size = sg->length - offset; offset += sg->offset; + ctx->in_tcp_sendpages = true; while (1) { if (sg_is_last(sg)) sendpage_flags = flags; @@ -148,6 +149,8 @@ retry: } clear_bit(TLS_PENDING_CLOSED_RECORD, &ctx->flags); + ctx->in_tcp_sendpages = false; + ctx->sk_write_space(sk); return 0; } @@ -217,6 +220,10 @@ static void tls_write_space(struct sock *sk) { struct tls_context *ctx = tls_get_ctx(sk); + /* We are already sending pages, ignore notification */ + if (ctx->in_tcp_sendpages) + return; + if (!sk->sk_write_pending && tls_is_pending_closed_record(ctx)) { gfp_t sk_allocation = sk->sk_allocation; int rc; -- cgit v1.2.3 From e6e6a278b1eaffa19d42186bfacd1ffc15a50b3f Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Tue, 1 May 2018 21:45:41 -0400 Subject: tcp_bbr: fix to zero idle_restart only upon S/ACKed data Previously the bbr->idle_restart tracking was zeroing out the bbr->idle_restart bit upon ACKs that did not SACK or ACK anything, e.g. receiving incoming data or receiver window updates. In such situations BBR would forget that this was a restart-from-idle situation, and if the min_rtt had expired it would unnecessarily enter PROBE_RTT (even though we were actually restarting from idle but had merely forgotten that fact). The fix is simple: we need to remember we are restarting from idle until we receive a S/ACK for some data (a S/ACK for the first flight of data we send as we are restarting). This commit is a stable candidate for kernels back as far as 4.9. Fixes: 0f8782ea1497 ("tcp_bbr: add BBR congestion control") Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: Priyaranjan Jha Signed-off-by: Yousuk Seung Signed-off-by: David S. Miller --- net/ipv4/tcp_bbr.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 158d105e76da1..58e2f479ffb4d 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -806,7 +806,9 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) } } } - bbr->idle_restart = 0; + /* Restart after idle ends only once we process a new S/ACK for data */ + if (rs->delivered > 0) + bbr->idle_restart = 0; } static void bbr_update_model(struct sock *sk, const struct rate_sample *rs) -- cgit v1.2.3 From 4842a08fb80bc09b7b089af42c58353dfaa8f88f Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 2 May 2018 13:37:44 +0800 Subject: sctp: init active key for the new asoc in dupcook_a and dupcook_b When processing a duplicate cookie-echo chunk, for case 'A' and 'B', after sctp_process_init for the new asoc, if auth is enabled for the cookie-ack chunk, the active key should also be initialized. Otherwise, the cookie-ack chunk made later can not be set with auth shkey properly, and a crash can even be caused by this, as after Commit 1b1e0bc99474 ("sctp: add refcnt support for sh_key"), sctp needs to hold the shkey when making control chunks. Fixes: 1b1e0bc99474 ("sctp: add refcnt support for sh_key") Reported-by: Jianwen Ji Signed-off-by: Xin Long Acked-by: Neil Horman Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/sm_statefuns.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index dd0594a109610..98acfed45e3b6 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -1794,6 +1794,9 @@ static enum sctp_disposition sctp_sf_do_dupcook_a( GFP_ATOMIC)) goto nomem; + if (sctp_auth_asoc_init_active_key(new_asoc, GFP_ATOMIC)) + goto nomem; + /* Make sure no new addresses are being added during the * restart. Though this is a pretty complicated attack * since you'd have to get inside the cookie. @@ -1906,6 +1909,9 @@ static enum sctp_disposition sctp_sf_do_dupcook_b( GFP_ATOMIC)) goto nomem; + if (sctp_auth_asoc_init_active_key(new_asoc, GFP_ATOMIC)) + goto nomem; + /* Update the content of current association. */ sctp_add_cmd_sf(commands, SCTP_CMD_UPDATE_ASSOC, SCTP_ASOC(new_asoc)); sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, -- cgit v1.2.3 From 46e16d4b956867013e0bbd7f2bad206f4aa55752 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 2 May 2018 13:39:46 +0800 Subject: sctp: use the old asoc when making the cookie-ack chunk in dupcook_d When processing a duplicate cookie-echo chunk, for case 'D', sctp will not process the param from this chunk. It means old asoc has nothing to be updated, and the new temp asoc doesn't have the complete info. So there's no reason to use the new asoc when creating the cookie-ack chunk. Otherwise, like when auth is enabled for cookie-ack, the chunk can not be set with auth, and it will definitely be dropped by peer. This issue is there since very beginning, and we fix it by using the old asoc instead. Signed-off-by: Xin Long Acked-by: Neil Horman Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/sm_statefuns.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 98acfed45e3b6..28c070e187c2e 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -2056,7 +2056,7 @@ static enum sctp_disposition sctp_sf_do_dupcook_d( } } - repl = sctp_make_cookie_ack(new_asoc, chunk); + repl = sctp_make_cookie_ack(asoc, chunk); if (!repl) goto nomem; -- cgit v1.2.3 From ce402f044e4e432c296f90eaabb8dbe8f3624391 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 2 May 2018 13:45:12 +0800 Subject: sctp: fix the issue that the cookie-ack with auth can't get processed When auth is enabled for cookie-ack chunk, in sctp_inq_pop, sctp processes auth chunk first, then continues to the next chunk in this packet if chunk_end + chunk_hdr size < skb_tail_pointer(). Otherwise, it will go to the next packet or discard this chunk. However, it missed the fact that cookie-ack chunk's size is equal to chunk_hdr size, which couldn't match that check, and thus this chunk would not get processed. This patch fixes it by changing the check to chunk_end + chunk_hdr size <= skb_tail_pointer(). Fixes: 26b87c788100 ("net: sctp: fix remote memory pressure from excessive queueing") Signed-off-by: Xin Long Acked-by: Neil Horman Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/inqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c index 23ebc5318edc4..eb93ffe2408bd 100644 --- a/net/sctp/inqueue.c +++ b/net/sctp/inqueue.c @@ -217,7 +217,7 @@ new_skb: skb_pull(chunk->skb, sizeof(*ch)); chunk->subh.v = NULL; /* Subheader is no longer valid. */ - if (chunk->chunk_end + sizeof(*ch) < skb_tail_pointer(chunk->skb)) { + if (chunk->chunk_end + sizeof(*ch) <= skb_tail_pointer(chunk->skb)) { /* This is not a singleton */ chunk->singleton = 0; } else if (chunk->chunk_end > skb_tail_pointer(chunk->skb)) { -- cgit v1.2.3 From 784813aed6ba24a1f24e7e11d9d0f208cee37a7d Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Wed, 2 May 2018 16:53:56 +0200 Subject: net/smc: restrict non-blocking connect finish The smc_poll code tries to finish connect() if the socket is in state SMC_INIT and polling of the internal CLC-socket returns with EPOLLOUT. This makes sense for a select/poll call following a connect call, but not without preceding connect(). With this patch smc_poll starts connect logic only, if the CLC-socket is no longer in its initial state TCP_CLOSE. In addition, a poll error on the internal CLC-socket is always propagated to the SMC socket. With this patch the code path mentioned by syzbot https://syzkaller.appspot.com/bug?extid=03faa2dc16b8b64be396 is no longer possible. Signed-off-by: Ursula Braun Reported-by: syzbot+03faa2dc16b8b64be396@syzkaller.appspotmail.com Signed-off-by: David S. Miller --- net/smc/af_smc.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 4470501374bf9..8b4c059bd13b4 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1166,13 +1166,15 @@ static __poll_t smc_poll(struct file *file, struct socket *sock, /* delegate to CLC child sock */ release_sock(sk); mask = smc->clcsock->ops->poll(file, smc->clcsock, wait); - /* if non-blocking connect finished ... */ lock_sock(sk); - if ((sk->sk_state == SMC_INIT) && (mask & EPOLLOUT)) { - sk->sk_err = smc->clcsock->sk->sk_err; - if (sk->sk_err) { - mask |= EPOLLERR; - } else { + sk->sk_err = smc->clcsock->sk->sk_err; + if (sk->sk_err) { + mask |= EPOLLERR; + } else { + /* if non-blocking connect finished ... */ + if (sk->sk_state == SMC_INIT && + mask & EPOLLOUT && + smc->clcsock->sk->sk_state != TCP_CLOSE) { rc = smc_connect_rdma(smc); if (rc < 0) mask |= EPOLLERR; -- cgit v1.2.3 From 30ca22e4a5d0063dd9a9cdf35cd139c5807cbeb3 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 2 May 2018 22:41:56 +0300 Subject: ipv6: Revert "ipv6: Allow non-gateway ECMP for IPv6" This reverts commit edd7ceb78296 ("ipv6: Allow non-gateway ECMP for IPv6"). Eric reported a division by zero in rt6_multipath_rebalance() which is caused by above commit that considers identical local routes to be siblings. The division by zero happens because a nexthop weight is not set for local routes. Revert the commit as it does not fix a bug and has side effects. To reproduce: # ip -6 address add 2001:db8::1/64 dev dummy0 # ip -6 address add 2001:db8::1/64 dev dummy1 Fixes: edd7ceb78296 ("ipv6: Allow non-gateway ECMP for IPv6") Signed-off-by: Ido Schimmel Reported-by: Eric Dumazet Tested-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ip6_route.h | 3 ++- net/ipv6/ip6_fib.c | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index abceb5864d995..08b132381984f 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -68,7 +68,8 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr) static inline bool rt6_qualify_for_ecmp(const struct rt6_info *rt) { - return (rt->rt6i_flags & (RTF_ADDRCONF | RTF_DYNAMIC)) == 0; + return (rt->rt6i_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) == + RTF_GATEWAY; } void ip6_route_input(struct sk_buff *skb); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 3c97c29d44013..deab2db6692eb 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -934,6 +934,9 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, * list. * Only static routes (which don't have flag * RTF_EXPIRES) are used for ECMPv6. + * + * To avoid long list, we only had siblings if the + * route have a gateway. */ if (rt_can_ecmp && rt6_qualify_for_ecmp(iter)) -- cgit v1.2.3 From 7df40c2673a1307c3260aab6f9d4b9bf97ca8fd7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 2 May 2018 10:03:30 -0700 Subject: net_sched: fq: take care of throttled flows before reuse Normally, a socket can not be freed/reused unless all its TX packets left qdisc and were TX-completed. However connect(AF_UNSPEC) allows this to happen. With commit fc59d5bdf1e3 ("pkt_sched: fq: clear time_next_packet for reused flows") we cleared f->time_next_packet but took no special action if the flow was still in the throttled rb-tree. Since f->time_next_packet is the key used in the rb-tree searches, blindly clearing it might break rb-tree integrity. We need to make sure the flow is no longer in the rb-tree to avoid this problem. Fixes: fc59d5bdf1e3 ("pkt_sched: fq: clear time_next_packet for reused flows") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/sch_fq.c | 37 +++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index a366e4c9413ab..4808713c73b98 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -128,6 +128,28 @@ static bool fq_flow_is_detached(const struct fq_flow *f) return f->next == &detached; } +static bool fq_flow_is_throttled(const struct fq_flow *f) +{ + return f->next == &throttled; +} + +static void fq_flow_add_tail(struct fq_flow_head *head, struct fq_flow *flow) +{ + if (head->first) + head->last->next = flow; + else + head->first = flow; + head->last = flow; + flow->next = NULL; +} + +static void fq_flow_unset_throttled(struct fq_sched_data *q, struct fq_flow *f) +{ + rb_erase(&f->rate_node, &q->delayed); + q->throttled_flows--; + fq_flow_add_tail(&q->old_flows, f); +} + static void fq_flow_set_throttled(struct fq_sched_data *q, struct fq_flow *f) { struct rb_node **p = &q->delayed.rb_node, *parent = NULL; @@ -155,15 +177,6 @@ static void fq_flow_set_throttled(struct fq_sched_data *q, struct fq_flow *f) static struct kmem_cache *fq_flow_cachep __read_mostly; -static void fq_flow_add_tail(struct fq_flow_head *head, struct fq_flow *flow) -{ - if (head->first) - head->last->next = flow; - else - head->first = flow; - head->last = flow; - flow->next = NULL; -} /* limit number of collected flows per round */ #define FQ_GC_MAX 8 @@ -267,6 +280,8 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q) f->socket_hash != sk->sk_hash)) { f->credit = q->initial_quantum; f->socket_hash = sk->sk_hash; + if (fq_flow_is_throttled(f)) + fq_flow_unset_throttled(q, f); f->time_next_packet = 0ULL; } return f; @@ -438,9 +453,7 @@ static void fq_check_throttled(struct fq_sched_data *q, u64 now) q->time_next_delayed_flow = f->time_next_packet; break; } - rb_erase(p, &q->delayed); - q->throttled_flows--; - fq_flow_add_tail(&q->old_flows, f); + fq_flow_unset_throttled(q, f); } } -- cgit v1.2.3 From 94720e3aee6884d8c8beb678001629da60ec6366 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Wed, 2 May 2018 09:41:19 +0300 Subject: ipv4: fix fnhe usage by non-cached routes Allow some non-cached routes to use non-expired fnhe: 1. ip_del_fnhe: moved above and now called by find_exception. The 4.5+ commit deed49df7390 expires fnhe only when caching routes. Change that to: 1.1. use fnhe for non-cached local output routes, with the help from (2) 1.2. allow __mkroute_input to detect expired fnhe (outdated fnhe_gw, for example) when do_cache is false, eg. when itag!=0 for unicast destinations. 2. __mkroute_output: keep fi to allow local routes with orig_oif != 0 to use fnhe info even when the new route will not be cached into fnhe. After commit 839da4d98960 ("net: ipv4: set orig_oif based on fib result for local traffic") it means all local routes will be affected because they are not cached. This change is used to solve a PMTU problem with IPVS (and probably Netfilter DNAT) setups that redirect local clients from target local IP (local route to Virtual IP) to new remote IP target, eg. IPVS TUN real server. Loopback has 64K MTU and we need to create fnhe on the local route that will keep the reduced PMTU for the Virtual IP. Without this change fnhe_pmtu is updated from ICMP but never exposed to non-cached local routes. This includes routes with flowi4_oif!=0 for 4.6+ and with flowi4_oif=any for 4.14+). 3. update_or_create_fnhe: make sure fnhe_expires is not 0 for new entries Fixes: 839da4d98960 ("net: ipv4: set orig_oif based on fib result for local traffic") Fixes: d6d5e999e5df ("route: do not cache fib route info on local routes with oif") Fixes: deed49df7390 ("route: check and remove route cache when we get route") Cc: David Ahern Cc: Xin Long Signed-off-by: Julian Anastasov Acked-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/route.c | 118 +++++++++++++++++++++++++------------------------------ 1 file changed, 53 insertions(+), 65 deletions(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index ccb25d80f6795..1412a7baf0b9c 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -709,7 +709,7 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, fnhe->fnhe_gw = gw; fnhe->fnhe_pmtu = pmtu; fnhe->fnhe_mtu_locked = lock; - fnhe->fnhe_expires = expires; + fnhe->fnhe_expires = max(1UL, expires); /* Exception created; mark the cached routes for the nexthop * stale, so anyone caching it rechecks if this exception @@ -1297,6 +1297,36 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst) return mtu - lwtunnel_headroom(dst->lwtstate, mtu); } +static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr) +{ + struct fnhe_hash_bucket *hash; + struct fib_nh_exception *fnhe, __rcu **fnhe_p; + u32 hval = fnhe_hashfun(daddr); + + spin_lock_bh(&fnhe_lock); + + hash = rcu_dereference_protected(nh->nh_exceptions, + lockdep_is_held(&fnhe_lock)); + hash += hval; + + fnhe_p = &hash->chain; + fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock)); + while (fnhe) { + if (fnhe->fnhe_daddr == daddr) { + rcu_assign_pointer(*fnhe_p, rcu_dereference_protected( + fnhe->fnhe_next, lockdep_is_held(&fnhe_lock))); + fnhe_flush_routes(fnhe); + kfree_rcu(fnhe, rcu); + break; + } + fnhe_p = &fnhe->fnhe_next; + fnhe = rcu_dereference_protected(fnhe->fnhe_next, + lockdep_is_held(&fnhe_lock)); + } + + spin_unlock_bh(&fnhe_lock); +} + static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr) { struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions); @@ -1310,8 +1340,14 @@ static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr) for (fnhe = rcu_dereference(hash[hval].chain); fnhe; fnhe = rcu_dereference(fnhe->fnhe_next)) { - if (fnhe->fnhe_daddr == daddr) + if (fnhe->fnhe_daddr == daddr) { + if (fnhe->fnhe_expires && + time_after(jiffies, fnhe->fnhe_expires)) { + ip_del_fnhe(nh, daddr); + break; + } return fnhe; + } } return NULL; } @@ -1636,36 +1672,6 @@ static void ip_handle_martian_source(struct net_device *dev, #endif } -static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr) -{ - struct fnhe_hash_bucket *hash; - struct fib_nh_exception *fnhe, __rcu **fnhe_p; - u32 hval = fnhe_hashfun(daddr); - - spin_lock_bh(&fnhe_lock); - - hash = rcu_dereference_protected(nh->nh_exceptions, - lockdep_is_held(&fnhe_lock)); - hash += hval; - - fnhe_p = &hash->chain; - fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock)); - while (fnhe) { - if (fnhe->fnhe_daddr == daddr) { - rcu_assign_pointer(*fnhe_p, rcu_dereference_protected( - fnhe->fnhe_next, lockdep_is_held(&fnhe_lock))); - fnhe_flush_routes(fnhe); - kfree_rcu(fnhe, rcu); - break; - } - fnhe_p = &fnhe->fnhe_next; - fnhe = rcu_dereference_protected(fnhe->fnhe_next, - lockdep_is_held(&fnhe_lock)); - } - - spin_unlock_bh(&fnhe_lock); -} - /* called in rcu_read_lock() section */ static int __mkroute_input(struct sk_buff *skb, const struct fib_result *res, @@ -1719,20 +1725,10 @@ static int __mkroute_input(struct sk_buff *skb, fnhe = find_exception(&FIB_RES_NH(*res), daddr); if (do_cache) { - if (fnhe) { + if (fnhe) rth = rcu_dereference(fnhe->fnhe_rth_input); - if (rth && rth->dst.expires && - time_after(jiffies, rth->dst.expires)) { - ip_del_fnhe(&FIB_RES_NH(*res), daddr); - fnhe = NULL; - } else { - goto rt_cache; - } - } - - rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input); - -rt_cache: + else + rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input); if (rt_cache_valid(rth)) { skb_dst_set_noref(skb, &rth->dst); goto out; @@ -2216,39 +2212,31 @@ static struct rtable *__mkroute_output(const struct fib_result *res, * the loopback interface and the IP_PKTINFO ipi_ifindex will * be set to the loopback interface as well. */ - fi = NULL; + do_cache = false; } fnhe = NULL; do_cache &= fi != NULL; - if (do_cache) { + if (fi) { struct rtable __rcu **prth; struct fib_nh *nh = &FIB_RES_NH(*res); fnhe = find_exception(nh, fl4->daddr); + if (!do_cache) + goto add; if (fnhe) { prth = &fnhe->fnhe_rth_output; - rth = rcu_dereference(*prth); - if (rth && rth->dst.expires && - time_after(jiffies, rth->dst.expires)) { - ip_del_fnhe(nh, fl4->daddr); - fnhe = NULL; - } else { - goto rt_cache; + } else { + if (unlikely(fl4->flowi4_flags & + FLOWI_FLAG_KNOWN_NH && + !(nh->nh_gw && + nh->nh_scope == RT_SCOPE_LINK))) { + do_cache = false; + goto add; } + prth = raw_cpu_ptr(nh->nh_pcpu_rth_output); } - - if (unlikely(fl4->flowi4_flags & - FLOWI_FLAG_KNOWN_NH && - !(nh->nh_gw && - nh->nh_scope == RT_SCOPE_LINK))) { - do_cache = false; - goto add; - } - prth = raw_cpu_ptr(nh->nh_pcpu_rth_output); rth = rcu_dereference(*prth); - -rt_cache: if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst)) return rth; } -- cgit v1.2.3 From eb80ca476ec11f67a62691a93604b405ffc7d80c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 2 May 2018 14:53:39 -0700 Subject: rds: do not leak kernel memory to user land syzbot/KMSAN reported an uninit-value in put_cmsg(), originating from rds_cmsg_recv(). Simply clear the structure, since we have holes there, or since rx_traces might be smaller than RDS_MSG_RX_DGRAM_TRACE_MAX. BUG: KMSAN: uninit-value in copy_to_user include/linux/uaccess.h:184 [inline] BUG: KMSAN: uninit-value in put_cmsg+0x600/0x870 net/core/scm.c:242 CPU: 0 PID: 4459 Comm: syz-executor582 Not tainted 4.16.0+ #87 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:17 [inline] dump_stack+0x185/0x1d0 lib/dump_stack.c:53 kmsan_report+0x142/0x240 mm/kmsan/kmsan.c:1067 kmsan_internal_check_memory+0x135/0x1e0 mm/kmsan/kmsan.c:1157 kmsan_copy_to_user+0x69/0x160 mm/kmsan/kmsan.c:1199 copy_to_user include/linux/uaccess.h:184 [inline] put_cmsg+0x600/0x870 net/core/scm.c:242 rds_cmsg_recv net/rds/recv.c:570 [inline] rds_recvmsg+0x2db5/0x3170 net/rds/recv.c:657 sock_recvmsg_nosec net/socket.c:803 [inline] sock_recvmsg+0x1d0/0x230 net/socket.c:810 ___sys_recvmsg+0x3fb/0x810 net/socket.c:2205 __sys_recvmsg net/socket.c:2250 [inline] SYSC_recvmsg+0x298/0x3c0 net/socket.c:2262 SyS_recvmsg+0x54/0x80 net/socket.c:2257 do_syscall_64+0x309/0x430 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x3d/0xa2 Fixes: 3289025aedc0 ("RDS: add receive message trace used by application") Signed-off-by: Eric Dumazet Reported-by: syzbot Cc: Santosh Shilimkar Cc: linux-rdma Signed-off-by: David S. Miller --- net/rds/recv.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/rds/recv.c b/net/rds/recv.c index de50e2126e404..dc67458b52f00 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -558,6 +558,7 @@ static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg, struct rds_cmsg_rx_trace t; int i, j; + memset(&t, 0, sizeof(t)); inc->i_rx_lat_trace[RDS_MSG_RX_CMSG] = local_clock(); t.rx_traces = rs->rs_rx_traces; for (i = 0; i < rs->rs_rx_traces; i++) { -- cgit v1.2.3 From 114f39feab360e6c7b0c4238697f223444d662a1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 2 May 2018 20:25:13 -0700 Subject: tcp: restore autocorking When adding rb-tree for TCP retransmit queue, we inadvertently broke TCP autocorking. tcp_should_autocork() should really check if the rtx queue is not empty. Tested: Before the fix : $ nstat -n;./netperf -H 10.246.7.152 -Cc -- -m 500;nstat | grep AutoCork MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 10.246.7.152 () port 0 AF_INET Recv Send Send Utilization Service Demand Socket Socket Message Elapsed Send Recv Send Recv Size Size Size Time Throughput local remote local remote bytes bytes bytes secs. 10^6bits/s % S % S us/KB us/KB 540000 262144 500 10.00 2682.85 2.47 1.59 3.618 2.329 TcpExtTCPAutoCorking 33 0.0 // Same test, but forcing TCP_NODELAY $ nstat -n;./netperf -H 10.246.7.152 -Cc -- -D -m 500;nstat | grep AutoCork MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 10.246.7.152 () port 0 AF_INET : nodelay Recv Send Send Utilization Service Demand Socket Socket Message Elapsed Send Recv Send Recv Size Size Size Time Throughput local remote local remote bytes bytes bytes secs. 10^6bits/s % S % S us/KB us/KB 540000 262144 500 10.00 1408.75 2.44 2.96 6.802 8.259 TcpExtTCPAutoCorking 1 0.0 After the fix : $ nstat -n;./netperf -H 10.246.7.152 -Cc -- -m 500;nstat | grep AutoCork MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 10.246.7.152 () port 0 AF_INET Recv Send Send Utilization Service Demand Socket Socket Message Elapsed Send Recv Send Recv Size Size Size Time Throughput local remote local remote bytes bytes bytes secs. 10^6bits/s % S % S us/KB us/KB 540000 262144 500 10.00 5472.46 2.45 1.43 1.761 1.027 TcpExtTCPAutoCorking 361293 0.0 // With TCP_NODELAY option $ nstat -n;./netperf -H 10.246.7.152 -Cc -- -D -m 500;nstat | grep AutoCork MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 10.246.7.152 () port 0 AF_INET : nodelay Recv Send Send Utilization Service Demand Socket Socket Message Elapsed Send Recv Send Recv Size Size Size Time Throughput local remote local remote bytes bytes bytes secs. 10^6bits/s % S % S us/KB us/KB 540000 262144 500 10.00 5454.96 2.46 1.63 1.775 1.174 TcpExtTCPAutoCorking 315448 0.0 Fixes: 75c119afe14f ("tcp: implement rb-tree based retransmit queue") Signed-off-by: Eric Dumazet Reported-by: Michael Wenig Tested-by: Michael Wenig Signed-off-by: Eric Dumazet Reported-by: Michael Wenig Tested-by: Michael Wenig Acked-by: Neal Cardwell Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 44be7f43455e4..c9d00ef54deca 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -697,7 +697,7 @@ static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb, { return skb->len < size_goal && sock_net(sk)->ipv4.sysctl_tcp_autocorking && - skb != tcp_write_queue_head(sk) && + !tcp_rtx_queue_empty(sk) && refcount_read(&sk->sk_wmem_alloc) > skb->truesize; } -- cgit v1.2.3 From e63a5f8c19d7807823d68830ebe8cfbd4419ab13 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Thu, 3 May 2018 17:57:37 +0200 Subject: net/smc: call consolidation Consolidate the call to smc_wr_reg_send() in a new function. No functional changes. Signed-off-by: Karsten Graul Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/af_smc.c | 35 +++++++++++++++-------------------- 1 file changed, 15 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 8b4c059bd13b4..fdb2976117bd7 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -292,6 +292,15 @@ static void smc_copy_sock_settings_to_smc(struct smc_sock *smc) smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC); } +/* register a new rmb */ +static int smc_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc) +{ + /* register memory region for new rmb */ + if (smc_wr_reg_send(link, rmb_desc->mr_rx[SMC_SINGLE_LINK])) + return -EFAULT; + return 0; +} + static int smc_clnt_conf_first_link(struct smc_sock *smc) { struct smc_link_group *lgr = smc->conn.lgr; @@ -321,9 +330,7 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc) smc_wr_remember_qp_attr(link); - rc = smc_wr_reg_send(link, - smc->conn.rmb_desc->mr_rx[SMC_SINGLE_LINK]); - if (rc) + if (smc_reg_rmb(link, smc->conn.rmb_desc)) return SMC_CLC_DECL_INTERR; /* send CONFIRM LINK response over RoCE fabric */ @@ -473,13 +480,8 @@ static int smc_connect_rdma(struct smc_sock *smc) goto decline_rdma_unlock; } } else { - struct smc_buf_desc *buf_desc = smc->conn.rmb_desc; - - if (!buf_desc->reused) { - /* register memory region for new rmb */ - rc = smc_wr_reg_send(link, - buf_desc->mr_rx[SMC_SINGLE_LINK]); - if (rc) { + if (!smc->conn.rmb_desc->reused) { + if (smc_reg_rmb(link, smc->conn.rmb_desc)) { reason_code = SMC_CLC_DECL_INTERR; goto decline_rdma_unlock; } @@ -719,9 +721,7 @@ static int smc_serv_conf_first_link(struct smc_sock *smc) link = &lgr->lnk[SMC_SINGLE_LINK]; - rc = smc_wr_reg_send(link, - smc->conn.rmb_desc->mr_rx[SMC_SINGLE_LINK]); - if (rc) + if (smc_reg_rmb(link, smc->conn.rmb_desc)) return SMC_CLC_DECL_INTERR; /* send CONFIRM LINK request to client over the RoCE fabric */ @@ -854,13 +854,8 @@ static void smc_listen_work(struct work_struct *work) smc_rx_init(new_smc); if (local_contact != SMC_FIRST_CONTACT) { - struct smc_buf_desc *buf_desc = new_smc->conn.rmb_desc; - - if (!buf_desc->reused) { - /* register memory region for new rmb */ - rc = smc_wr_reg_send(link, - buf_desc->mr_rx[SMC_SINGLE_LINK]); - if (rc) { + if (!new_smc->conn.rmb_desc->reused) { + if (smc_reg_rmb(link, new_smc->conn.rmb_desc)) { reason_code = SMC_CLC_DECL_INTERR; goto decline_rdma_unlock; } -- cgit v1.2.3 From a6920d1d130c3de039be982eba42542d329dc64c Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Thu, 3 May 2018 17:57:38 +0200 Subject: net/smc: handle unregistered buffers When smc_wr_reg_send() fails then tag (regerr) the affected buffer and free it in smc_buf_unuse(). Signed-off-by: Karsten Graul Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/af_smc.c | 4 +++- net/smc/smc_core.c | 22 +++++++++++++++++++--- net/smc/smc_core.h | 3 ++- 3 files changed, 24 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index fdb2976117bd7..d03b8d29ffc05 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -296,8 +296,10 @@ static void smc_copy_sock_settings_to_smc(struct smc_sock *smc) static int smc_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc) { /* register memory region for new rmb */ - if (smc_wr_reg_send(link, rmb_desc->mr_rx[SMC_SINGLE_LINK])) + if (smc_wr_reg_send(link, rmb_desc->mr_rx[SMC_SINGLE_LINK])) { + rmb_desc->regerr = 1; return -EFAULT; + } return 0; } diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index f44f6803f7ff2..d4bd01bb44e17 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -32,6 +32,9 @@ static u32 smc_lgr_num; /* unique link group number */ +static void smc_buf_free(struct smc_buf_desc *buf_desc, struct smc_link *lnk, + bool is_rmb); + static void smc_lgr_schedule_free_work(struct smc_link_group *lgr) { /* client link group creation always follows the server link group @@ -234,9 +237,22 @@ static void smc_buf_unuse(struct smc_connection *conn) conn->sndbuf_size = 0; } if (conn->rmb_desc) { - conn->rmb_desc->reused = true; - conn->rmb_desc->used = 0; - conn->rmbe_size = 0; + if (!conn->rmb_desc->regerr) { + conn->rmb_desc->reused = 1; + conn->rmb_desc->used = 0; + conn->rmbe_size = 0; + } else { + /* buf registration failed, reuse not possible */ + struct smc_link_group *lgr = conn->lgr; + struct smc_link *lnk; + + write_lock_bh(&lgr->rmbs_lock); + list_del(&conn->rmb_desc->list); + write_unlock_bh(&lgr->rmbs_lock); + + lnk = &lgr->lnk[SMC_SINGLE_LINK]; + smc_buf_free(conn->rmb_desc, lnk, true); + } } } diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 07e2a393e6d98..5dfcb15d529f8 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -123,7 +123,8 @@ struct smc_buf_desc { */ u32 order; /* allocation order */ u32 used; /* currently used / unused */ - bool reused; /* new created / reused */ + u8 reused : 1; /* new created / reused */ + u8 regerr : 1; /* err during registration */ }; struct smc_rtoken { /* address/key of remote RMB */ -- cgit v1.2.3 From bda27ff5c4526f80a7620a94ecfe8dca153e3696 Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Thu, 3 May 2018 17:57:39 +0200 Subject: smc: fix sendpage() call The sendpage() call grabs the sock lock before calling the default implementation - which tries to grab it once again. Signed-off-by: Stefan Raspl Signed-off-by: Ursula Braun < Signed-off-by: David S. Miller --- net/smc/af_smc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index d03b8d29ffc05..544bab42f925a 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1315,8 +1315,11 @@ static ssize_t smc_sendpage(struct socket *sock, struct page *page, smc = smc_sk(sk); lock_sock(sk); - if (sk->sk_state != SMC_ACTIVE) + if (sk->sk_state != SMC_ACTIVE) { + release_sock(sk); goto out; + } + release_sock(sk); if (smc->use_fallback) rc = kernel_sendpage(smc->clcsock, page, offset, size, flags); @@ -1324,7 +1327,6 @@ static ssize_t smc_sendpage(struct socket *sock, struct page *page, rc = sock_no_sendpage(sock, page, offset, size, flags); out: - release_sock(sk); return rc; } -- cgit v1.2.3 From a8d7aa17bbc970971ccdf71988ea19230ab368b1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 3 May 2018 09:39:20 -0700 Subject: dccp: fix tasklet usage syzbot reported a crash in tasklet_action_common() caused by dccp. dccp needs to make sure socket wont disappear before tasklet handler has completed. This patch takes a reference on the socket when arming the tasklet, and moves the sock_put() from dccp_write_xmit_timer() to dccp_write_xmitlet() kernel BUG at kernel/softirq.c:514! invalid opcode: 0000 [#1] SMP KASAN Dumping ftrace buffer: (ftrace buffer empty) Modules linked in: CPU: 1 PID: 17 Comm: ksoftirqd/1 Not tainted 4.17.0-rc3+ #30 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:tasklet_action_common.isra.19+0x6db/0x700 kernel/softirq.c:515 RSP: 0018:ffff8801d9b3faf8 EFLAGS: 00010246 dccp_close: ABORT with 65423 bytes unread RAX: 1ffff1003b367f6b RBX: ffff8801daf1f3f0 RCX: 0000000000000000 RDX: ffff8801cf895498 RSI: 0000000000000004 RDI: 0000000000000000 RBP: ffff8801d9b3fc40 R08: ffffed0039f12a95 R09: ffffed0039f12a94 dccp_close: ABORT with 65423 bytes unread R10: ffffed0039f12a94 R11: ffff8801cf8954a3 R12: 0000000000000000 R13: ffff8801d9b3fc18 R14: dffffc0000000000 R15: ffff8801cf895490 FS: 0000000000000000(0000) GS:ffff8801daf00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000001b2bc28000 CR3: 00000001a08a9000 CR4: 00000000001406e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: tasklet_action+0x1d/0x20 kernel/softirq.c:533 __do_softirq+0x2e0/0xaf5 kernel/softirq.c:285 dccp_close: ABORT with 65423 bytes unread run_ksoftirqd+0x86/0x100 kernel/softirq.c:646 smpboot_thread_fn+0x417/0x870 kernel/smpboot.c:164 kthread+0x345/0x410 kernel/kthread.c:238 ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:412 Code: 48 8b 85 e8 fe ff ff 48 8b 95 f0 fe ff ff e9 94 fb ff ff 48 89 95 f0 fe ff ff e8 81 53 6e 00 48 8b 95 f0 fe ff ff e9 62 fb ff ff <0f> 0b 48 89 cf 48 89 8d e8 fe ff ff e8 64 53 6e 00 48 8b 8d e8 RIP: tasklet_action_common.isra.19+0x6db/0x700 kernel/softirq.c:515 RSP: ffff8801d9b3faf8 Fixes: dc841e30eaea ("dccp: Extend CCID packet dequeueing interface") Signed-off-by: Eric Dumazet Reported-by: syzbot Cc: Gerrit Renker Cc: dccp@vger.kernel.org Signed-off-by: David S. Miller --- net/dccp/ccids/ccid2.c | 14 ++++++++++++-- net/dccp/timer.c | 2 +- 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index 92d016e87816e..385f153fe0318 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -126,6 +126,16 @@ static void ccid2_change_l_seq_window(struct sock *sk, u64 val) DCCPF_SEQ_WMAX)); } +static void dccp_tasklet_schedule(struct sock *sk) +{ + struct tasklet_struct *t = &dccp_sk(sk)->dccps_xmitlet; + + if (!test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) { + sock_hold(sk); + __tasklet_schedule(t); + } +} + static void ccid2_hc_tx_rto_expire(struct timer_list *t) { struct ccid2_hc_tx_sock *hc = from_timer(hc, t, tx_rtotimer); @@ -166,7 +176,7 @@ static void ccid2_hc_tx_rto_expire(struct timer_list *t) /* if we were blocked before, we may now send cwnd=1 packet */ if (sender_was_blocked) - tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet); + dccp_tasklet_schedule(sk); /* restart backed-off timer */ sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto); out: @@ -706,7 +716,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) done: /* check if incoming Acks allow pending packets to be sent */ if (sender_was_blocked && !ccid2_cwnd_network_limited(hc)) - tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet); + dccp_tasklet_schedule(sk); dccp_ackvec_parsed_cleanup(&hc->tx_av_chunks); } diff --git a/net/dccp/timer.c b/net/dccp/timer.c index b50a8732ff434..1501a20a94ca0 100644 --- a/net/dccp/timer.c +++ b/net/dccp/timer.c @@ -232,6 +232,7 @@ static void dccp_write_xmitlet(unsigned long data) else dccp_write_xmit(sk); bh_unlock_sock(sk); + sock_put(sk); } static void dccp_write_xmit_timer(struct timer_list *t) @@ -240,7 +241,6 @@ static void dccp_write_xmit_timer(struct timer_list *t) struct sock *sk = &dp->dccps_inet_connection.icsk_inet.sk; dccp_write_xmitlet((unsigned long)sk); - sock_put(sk); } void dccp_init_xmit_timers(struct sock *sk) -- cgit v1.2.3