summaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2013-10-01 12:58:48 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2013-10-01 12:58:48 -0700
commitc31eeaced22ce8bd61268a3c595d542bb38c0a4f (patch)
tree630f25b34aa1182b0a5ac1c5f60b9fa1acd29614 /net
parent0b936842c86392dad2c880539e824881e5d8ba77 (diff)
parent0eab5eb7a3a9a6ccfcdecbffff00d60a86a004bb (diff)
downloadlinux-0-day-c31eeaced22ce8bd61268a3c595d542bb38c0a4f.tar.gz
linux-0-day-c31eeaced22ce8bd61268a3c595d542bb38c0a4f.tar.xz
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net
Pull networking changes from David Miller: 1) Multiply in netfilter IPVS can overflow when calculating destination weight. From Simon Kirby. 2) Use after free fixes in IPVS from Julian Anastasov. 3) SFC driver bug fixes from Daniel Pieczko. 4) Memory leak in pcan_usb_core failure paths, from Alexey Khoroshilov. 5) Locking and encapsulation fixes to serial line CAN driver, from Andrew Naujoks. 6) Duplex and VF handling fixes to bnx2x driver from Yaniv Rosner, Eilon Greenstein, and Ariel Elior. 7) In lapb, if no other packets are outstanding, T1 timeouts actually stall things and no packet gets sent. Fix from Josselin Costanzi. 8) ICMP redirects should not make it to the socket error queues, from Duan Jiong. 9) Fix bugs in skge DMA mapping error handling, from Nikulas Patocka. 10) Fix setting of VLAN priority field on via-rhine driver, from Roget Luethi. 11) Fix TX stalls and VLAN promisc programming in be2net driver from Ajit Khaparde. 12) Packet padding doesn't get handled correctly in new usbnet SG support code, from Ming Lei. 13) Fix races in netdevice teardown wrt. network namespace closing. From Eric W. Biederman. 14) Fix potential missed initialization of net_secret if not TCP connections are openned. From Eric Dumazet. 15) Cinterion PLXX product ID in qmi_wwan driver is wrong, from Aleksander Morgado. 16) skb_cow_head() can change skb->data and thus packet header pointers, don't use stale ip_hdr reference in ip_tunnel code. 17) Backend state transition handling fixes in xen-netback, from Paul Durrant. 18) Packet offset for AH protocol is handled wrong in flow dissector, from Eric Dumazet. 19) Taking down an fq packet scheduler instance can leave stale packets in the queues, fix from Eric Dumazet. 20) Fix performance regressions introduced by TCP Small Queues. From Eric Dumazet. 21) IPV6 GRE tunneling code calculates max_headroom incorrectly, from Hannes Frederic Sowa. 22) Multicast timer handlers in ipv4 and ipv6 can be the last and final reference to the ipv4/ipv6 specific network device state, so use the reference put that will check and release the object if the reference hits zero. From Salam Noureddine. 23) Fix memory corruption in ip_tunnel driver, and use skb_push() instead of __skb_push() so that similar bugs are less hard to find. From Steffen Klassert. 24) Add forgotten hookup of rtnl_ops in SIT and ip6tnl drivers, from Nicolas Dichtel. 25) fq scheduler doesn't accurately rate limit in certain circumstances, from Eric Dumazet. * git://git.kernel.org/pub/scm/linux/kernel/git/davem/net: (103 commits) pkt_sched: fq: rate limiting improvements ip6tnl: allow to use rtnl ops on fb tunnel sit: allow to use rtnl ops on fb tunnel ip_tunnel: Remove double unregister of the fallback device ip_tunnel_core: Change __skb_push back to skb_push ip_tunnel: Add fallback tunnels to the hash lists ip_tunnel: Fix a memory corruption in ip_tunnel_xmit qlcnic: Fix SR-IOV configuration ll_temac: Reset dma descriptors indexes on ndo_open skbuff: size of hole is wrong in a comment ipv6 mcast: use in6_dev_put in timer handlers instead of __in6_dev_put ipv4 igmp: use in_dev_put in timer handlers instead of __in_dev_put ethernet: moxa: fix incorrect placement of __initdata tag ipv6: gre: correct calculation of max_headroom powerpc/83xx: gianfar_ptp: select 1588 clock source through dts file Revert "powerpc/83xx: gianfar_ptp: select 1588 clock source through dts file" bonding: Fix broken promiscuity reference counting issue tcp: TSQ can use a dynamic limit dm9601: fix IFF_ALLMULTI handling pkt_sched: fq: qdisc dismantle fixes ...
Diffstat (limited to 'net')
-rw-r--r--net/802/mrp.c27
-rw-r--r--net/bluetooth/hci_core.c26
-rw-r--r--net/bluetooth/hci_event.c6
-rw-r--r--net/bluetooth/l2cap_core.c7
-rw-r--r--net/bluetooth/rfcomm/tty.c35
-rw-r--r--net/core/dev.c49
-rw-r--r--net/core/flow_dissector.c4
-rw-r--r--net/core/secure_seq.c27
-rw-r--r--net/ipv4/af_inet.c4
-rw-r--r--net/ipv4/igmp.c4
-rw-r--r--net/ipv4/ip_tunnel.c22
-rw-r--r--net/ipv4/ip_tunnel_core.c2
-rw-r--r--net/ipv4/netfilter/ipt_SYNPROXY.c10
-rw-r--r--net/ipv4/raw.c4
-rw-r--r--net/ipv4/tcp_output.c17
-rw-r--r--net/ipv4/udp.c2
-rw-r--r--net/ipv6/addrconf.c79
-rw-r--r--net/ipv6/ip6_gre.c4
-rw-r--r--net/ipv6/ip6_output.c53
-rw-r--r--net/ipv6/ip6_tunnel.c3
-rw-r--r--net/ipv6/mcast.c6
-rw-r--r--net/ipv6/netfilter/ip6t_SYNPROXY.c10
-rw-r--r--net/ipv6/raw.c4
-rw-r--r--net/ipv6/sit.c86
-rw-r--r--net/ipv6/udp.c4
-rw-r--r--net/lapb/lapb_timer.c1
-rw-r--r--net/netfilter/ipvs/ip_vs_core.c12
-rw-r--r--net/netfilter/ipvs/ip_vs_ctl.c86
-rw-r--r--net/netfilter/ipvs/ip_vs_est.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_lblc.c72
-rw-r--r--net/netfilter/ipvs/ip_vs_lblcr.c62
-rw-r--r--net/netfilter/ipvs/ip_vs_nq.c8
-rw-r--r--net/netfilter/ipvs/ip_vs_sed.c8
-rw-r--r--net/netfilter/ipvs/ip_vs_wlc.c6
-rw-r--r--net/netfilter/nf_synproxy_core.c12
-rw-r--r--net/sched/sch_fq.c102
36 files changed, 514 insertions, 354 deletions
diff --git a/net/802/mrp.c b/net/802/mrp.c
index 1eb05d80b07be..3ed616215870c 100644
--- a/net/802/mrp.c
+++ b/net/802/mrp.c
@@ -24,6 +24,11 @@
static unsigned int mrp_join_time __read_mostly = 200;
module_param(mrp_join_time, uint, 0644);
MODULE_PARM_DESC(mrp_join_time, "Join time in ms (default 200ms)");
+
+static unsigned int mrp_periodic_time __read_mostly = 1000;
+module_param(mrp_periodic_time, uint, 0644);
+MODULE_PARM_DESC(mrp_periodic_time, "Periodic time in ms (default 1s)");
+
MODULE_LICENSE("GPL");
static const u8
@@ -595,6 +600,24 @@ static void mrp_join_timer(unsigned long data)
mrp_join_timer_arm(app);
}
+static void mrp_periodic_timer_arm(struct mrp_applicant *app)
+{
+ mod_timer(&app->periodic_timer,
+ jiffies + msecs_to_jiffies(mrp_periodic_time));
+}
+
+static void mrp_periodic_timer(unsigned long data)
+{
+ struct mrp_applicant *app = (struct mrp_applicant *)data;
+
+ spin_lock(&app->lock);
+ mrp_mad_event(app, MRP_EVENT_PERIODIC);
+ mrp_pdu_queue(app);
+ spin_unlock(&app->lock);
+
+ mrp_periodic_timer_arm(app);
+}
+
static int mrp_pdu_parse_end_mark(struct sk_buff *skb, int *offset)
{
__be16 endmark;
@@ -845,6 +868,9 @@ int mrp_init_applicant(struct net_device *dev, struct mrp_application *appl)
rcu_assign_pointer(dev->mrp_port->applicants[appl->type], app);
setup_timer(&app->join_timer, mrp_join_timer, (unsigned long)app);
mrp_join_timer_arm(app);
+ setup_timer(&app->periodic_timer, mrp_periodic_timer,
+ (unsigned long)app);
+ mrp_periodic_timer_arm(app);
return 0;
err3:
@@ -870,6 +896,7 @@ void mrp_uninit_applicant(struct net_device *dev, struct mrp_application *appl)
* all pending messages before the applicant is gone.
*/
del_timer_sync(&app->join_timer);
+ del_timer_sync(&app->periodic_timer);
spin_lock_bh(&app->lock);
mrp_mad_event(app, MRP_EVENT_TX);
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index 634debab4d545..fb7356fcfe51e 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -1146,7 +1146,11 @@ int hci_dev_open(__u16 dev)
goto done;
}
- if (hdev->rfkill && rfkill_blocked(hdev->rfkill)) {
+ /* Check for rfkill but allow the HCI setup stage to proceed
+ * (which in itself doesn't cause any RF activity).
+ */
+ if (test_bit(HCI_RFKILLED, &hdev->dev_flags) &&
+ !test_bit(HCI_SETUP, &hdev->dev_flags)) {
ret = -ERFKILL;
goto done;
}
@@ -1566,10 +1570,13 @@ static int hci_rfkill_set_block(void *data, bool blocked)
BT_DBG("%p name %s blocked %d", hdev, hdev->name, blocked);
- if (!blocked)
- return 0;
-
- hci_dev_do_close(hdev);
+ if (blocked) {
+ set_bit(HCI_RFKILLED, &hdev->dev_flags);
+ if (!test_bit(HCI_SETUP, &hdev->dev_flags))
+ hci_dev_do_close(hdev);
+ } else {
+ clear_bit(HCI_RFKILLED, &hdev->dev_flags);
+ }
return 0;
}
@@ -1591,9 +1598,13 @@ static void hci_power_on(struct work_struct *work)
return;
}
- if (test_bit(HCI_AUTO_OFF, &hdev->dev_flags))
+ if (test_bit(HCI_RFKILLED, &hdev->dev_flags)) {
+ clear_bit(HCI_AUTO_OFF, &hdev->dev_flags);
+ hci_dev_do_close(hdev);
+ } else if (test_bit(HCI_AUTO_OFF, &hdev->dev_flags)) {
queue_delayed_work(hdev->req_workqueue, &hdev->power_off,
HCI_AUTO_OFF_TIMEOUT);
+ }
if (test_and_clear_bit(HCI_SETUP, &hdev->dev_flags))
mgmt_index_added(hdev);
@@ -2209,6 +2220,9 @@ int hci_register_dev(struct hci_dev *hdev)
}
}
+ if (hdev->rfkill && rfkill_blocked(hdev->rfkill))
+ set_bit(HCI_RFKILLED, &hdev->dev_flags);
+
set_bit(HCI_SETUP, &hdev->dev_flags);
if (hdev->dev_type != HCI_AMP)
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 94aab73f89d4c..8db3e89fae354 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -3557,7 +3557,11 @@ static void hci_le_ltk_request_evt(struct hci_dev *hdev, struct sk_buff *skb)
cp.handle = cpu_to_le16(conn->handle);
if (ltk->authenticated)
- conn->sec_level = BT_SECURITY_HIGH;
+ conn->pending_sec_level = BT_SECURITY_HIGH;
+ else
+ conn->pending_sec_level = BT_SECURITY_MEDIUM;
+
+ conn->enc_key_size = ltk->enc_size;
hci_send_cmd(hdev, HCI_OP_LE_LTK_REPLY, sizeof(cp), &cp);
diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index b3bb7bca8e606..63fa11109a1c3 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -3755,6 +3755,13 @@ static struct l2cap_chan *l2cap_connect(struct l2cap_conn *conn,
sk = chan->sk;
+ /* For certain devices (ex: HID mouse), support for authentication,
+ * pairing and bonding is optional. For such devices, inorder to avoid
+ * the ACL alive for too long after L2CAP disconnection, reset the ACL
+ * disc_timeout back to HCI_DISCONN_TIMEOUT during L2CAP connect.
+ */
+ conn->hcon->disc_timeout = HCI_DISCONN_TIMEOUT;
+
bacpy(&bt_sk(sk)->src, conn->src);
bacpy(&bt_sk(sk)->dst, conn->dst);
chan->psm = psm;
diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c
index 6d126faf145fe..84fcf9fff3ea5 100644
--- a/net/bluetooth/rfcomm/tty.c
+++ b/net/bluetooth/rfcomm/tty.c
@@ -569,7 +569,6 @@ static void rfcomm_dev_data_ready(struct rfcomm_dlc *dlc, struct sk_buff *skb)
static void rfcomm_dev_state_change(struct rfcomm_dlc *dlc, int err)
{
struct rfcomm_dev *dev = dlc->owner;
- struct tty_struct *tty;
if (!dev)
return;
@@ -581,38 +580,8 @@ static void rfcomm_dev_state_change(struct rfcomm_dlc *dlc, int err)
DPM_ORDER_DEV_AFTER_PARENT);
wake_up_interruptible(&dev->port.open_wait);
- } else if (dlc->state == BT_CLOSED) {
- tty = tty_port_tty_get(&dev->port);
- if (!tty) {
- if (test_bit(RFCOMM_RELEASE_ONHUP, &dev->flags)) {
- /* Drop DLC lock here to avoid deadlock
- * 1. rfcomm_dev_get will take rfcomm_dev_lock
- * but in rfcomm_dev_add there's lock order:
- * rfcomm_dev_lock -> dlc lock
- * 2. tty_port_put will deadlock if it's
- * the last reference
- *
- * FIXME: when we release the lock anything
- * could happen to dev, even its destruction
- */
- rfcomm_dlc_unlock(dlc);
- if (rfcomm_dev_get(dev->id) == NULL) {
- rfcomm_dlc_lock(dlc);
- return;
- }
-
- if (!test_and_set_bit(RFCOMM_TTY_RELEASED,
- &dev->flags))
- tty_port_put(&dev->port);
-
- tty_port_put(&dev->port);
- rfcomm_dlc_lock(dlc);
- }
- } else {
- tty_hangup(tty);
- tty_kref_put(tty);
- }
- }
+ } else if (dlc->state == BT_CLOSED)
+ tty_port_tty_hangup(&dev->port, false);
}
static void rfcomm_dev_modem_status(struct rfcomm_dlc *dlc, u8 v24_sig)
diff --git a/net/core/dev.c b/net/core/dev.c
index 5c713f2239cc6..65f829cfd928b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5247,10 +5247,12 @@ static int dev_new_index(struct net *net)
/* Delayed registration/unregisteration */
static LIST_HEAD(net_todo_list);
+static DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
static void net_set_todo(struct net_device *dev)
{
list_add_tail(&dev->todo_list, &net_todo_list);
+ dev_net(dev)->dev_unreg_count++;
}
static void rollback_registered_many(struct list_head *head)
@@ -5918,6 +5920,12 @@ void netdev_run_todo(void)
if (dev->destructor)
dev->destructor(dev);
+ /* Report a network device has been unregistered */
+ rtnl_lock();
+ dev_net(dev)->dev_unreg_count--;
+ __rtnl_unlock();
+ wake_up(&netdev_unregistering_wq);
+
/* Free network device */
kobject_put(&dev->dev.kobj);
}
@@ -6603,6 +6611,34 @@ static void __net_exit default_device_exit(struct net *net)
rtnl_unlock();
}
+static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
+{
+ /* Return with the rtnl_lock held when there are no network
+ * devices unregistering in any network namespace in net_list.
+ */
+ struct net *net;
+ bool unregistering;
+ DEFINE_WAIT(wait);
+
+ for (;;) {
+ prepare_to_wait(&netdev_unregistering_wq, &wait,
+ TASK_UNINTERRUPTIBLE);
+ unregistering = false;
+ rtnl_lock();
+ list_for_each_entry(net, net_list, exit_list) {
+ if (net->dev_unreg_count > 0) {
+ unregistering = true;
+ break;
+ }
+ }
+ if (!unregistering)
+ break;
+ __rtnl_unlock();
+ schedule();
+ }
+ finish_wait(&netdev_unregistering_wq, &wait);
+}
+
static void __net_exit default_device_exit_batch(struct list_head *net_list)
{
/* At exit all network devices most be removed from a network
@@ -6614,7 +6650,18 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list)
struct net *net;
LIST_HEAD(dev_kill_list);
- rtnl_lock();
+ /* To prevent network device cleanup code from dereferencing
+ * loopback devices or network devices that have been freed
+ * wait here for all pending unregistrations to complete,
+ * before unregistring the loopback device and allowing the
+ * network namespace be freed.
+ *
+ * The netdev todo list containing all network devices
+ * unregistrations that happen in default_device_exit_batch
+ * will run in the rtnl_unlock() at the end of
+ * default_device_exit_batch.
+ */
+ rtnl_lock_unregistering(net_list);
list_for_each_entry(net, net_list, exit_list) {
for_each_netdev_reverse(net, dev) {
if (dev->rtnl_link_ops)
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 1929af87b2609..8d7d0dd72db21 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -154,8 +154,8 @@ ipv6:
if (poff >= 0) {
__be32 *ports, _ports;
- nhoff += poff;
- ports = skb_header_pointer(skb, nhoff, sizeof(_ports), &_ports);
+ ports = skb_header_pointer(skb, nhoff + poff,
+ sizeof(_ports), &_ports);
if (ports)
flow->ports = *ports;
}
diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c
index 6a2f13cee86a0..3f1ec1586ae17 100644
--- a/net/core/secure_seq.c
+++ b/net/core/secure_seq.c
@@ -10,11 +10,24 @@
#include <net/secure_seq.h>
-static u32 net_secret[MD5_MESSAGE_BYTES / 4] ____cacheline_aligned;
+#define NET_SECRET_SIZE (MD5_MESSAGE_BYTES / 4)
-void net_secret_init(void)
+static u32 net_secret[NET_SECRET_SIZE] ____cacheline_aligned;
+
+static void net_secret_init(void)
{
- get_random_bytes(net_secret, sizeof(net_secret));
+ u32 tmp;
+ int i;
+
+ if (likely(net_secret[0]))
+ return;
+
+ for (i = NET_SECRET_SIZE; i > 0;) {
+ do {
+ get_random_bytes(&tmp, sizeof(tmp));
+ } while (!tmp);
+ cmpxchg(&net_secret[--i], 0, tmp);
+ }
}
#ifdef CONFIG_INET
@@ -42,6 +55,7 @@ __u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr,
u32 hash[MD5_DIGEST_WORDS];
u32 i;
+ net_secret_init();
memcpy(hash, saddr, 16);
for (i = 0; i < 4; i++)
secret[i] = net_secret[i] + (__force u32)daddr[i];
@@ -63,6 +77,7 @@ u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr,
u32 hash[MD5_DIGEST_WORDS];
u32 i;
+ net_secret_init();
memcpy(hash, saddr, 16);
for (i = 0; i < 4; i++)
secret[i] = net_secret[i] + (__force u32) daddr[i];
@@ -82,6 +97,7 @@ __u32 secure_ip_id(__be32 daddr)
{
u32 hash[MD5_DIGEST_WORDS];
+ net_secret_init();
hash[0] = (__force __u32) daddr;
hash[1] = net_secret[13];
hash[2] = net_secret[14];
@@ -96,6 +112,7 @@ __u32 secure_ipv6_id(const __be32 daddr[4])
{
__u32 hash[4];
+ net_secret_init();
memcpy(hash, daddr, 16);
md5_transform(hash, net_secret);
@@ -107,6 +124,7 @@ __u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr,
{
u32 hash[MD5_DIGEST_WORDS];
+ net_secret_init();
hash[0] = (__force u32)saddr;
hash[1] = (__force u32)daddr;
hash[2] = ((__force u16)sport << 16) + (__force u16)dport;
@@ -121,6 +139,7 @@ u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport)
{
u32 hash[MD5_DIGEST_WORDS];
+ net_secret_init();
hash[0] = (__force u32)saddr;
hash[1] = (__force u32)daddr;
hash[2] = (__force u32)dport ^ net_secret[14];
@@ -140,6 +159,7 @@ u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr,
u32 hash[MD5_DIGEST_WORDS];
u64 seq;
+ net_secret_init();
hash[0] = (__force u32)saddr;
hash[1] = (__force u32)daddr;
hash[2] = ((__force u16)sport << 16) + (__force u16)dport;
@@ -164,6 +184,7 @@ u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr,
u64 seq;
u32 i;
+ net_secret_init();
memcpy(hash, saddr, 16);
for (i = 0; i < 4; i++)
secret[i] = net_secret[i] + daddr[i];
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 7a1874b7b8fd4..cfeb85cff4f02 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -263,10 +263,8 @@ void build_ehash_secret(void)
get_random_bytes(&rnd, sizeof(rnd));
} while (rnd == 0);
- if (cmpxchg(&inet_ehash_secret, 0, rnd) == 0) {
+ if (cmpxchg(&inet_ehash_secret, 0, rnd) == 0)
get_random_bytes(&ipv6_hash_secret, sizeof(ipv6_hash_secret));
- net_secret_init();
- }
}
EXPORT_SYMBOL(build_ehash_secret);
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index dace87f06e5f9..7defdc9ba1674 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -736,7 +736,7 @@ static void igmp_gq_timer_expire(unsigned long data)
in_dev->mr_gq_running = 0;
igmpv3_send_report(in_dev, NULL);
- __in_dev_put(in_dev);
+ in_dev_put(in_dev);
}
static void igmp_ifc_timer_expire(unsigned long data)
@@ -749,7 +749,7 @@ static void igmp_ifc_timer_expire(unsigned long data)
igmp_ifc_start_timer(in_dev,
unsolicited_report_interval(in_dev));
}
- __in_dev_put(in_dev);
+ in_dev_put(in_dev);
}
static void igmp_ifc_event(struct in_device *in_dev)
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index ac9fabe0300f6..63a6d6d6b8758 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -623,6 +623,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
tunnel->err_count = 0;
}
+ tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
ttl = tnl_params->ttl;
if (ttl == 0) {
if (skb->protocol == htons(ETH_P_IP))
@@ -641,18 +642,17 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
+ rt->dst.header_len;
- if (max_headroom > dev->needed_headroom) {
+ if (max_headroom > dev->needed_headroom)
dev->needed_headroom = max_headroom;
- if (skb_cow_head(skb, dev->needed_headroom)) {
- dev->stats.tx_dropped++;
- dev_kfree_skb(skb);
- return;
- }
+
+ if (skb_cow_head(skb, dev->needed_headroom)) {
+ dev->stats.tx_dropped++;
+ dev_kfree_skb(skb);
+ return;
}
err = iptunnel_xmit(rt, skb, fl4.saddr, fl4.daddr, protocol,
- ip_tunnel_ecn_encap(tos, inner_iph, skb), ttl, df,
- !net_eq(tunnel->net, dev_net(dev)));
+ tos, ttl, df, !net_eq(tunnel->net, dev_net(dev)));
iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
return;
@@ -853,8 +853,10 @@ int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
/* FB netdevice is special: we have one, and only one per netns.
* Allowing to move it to another netns is clearly unsafe.
*/
- if (!IS_ERR(itn->fb_tunnel_dev))
+ if (!IS_ERR(itn->fb_tunnel_dev)) {
itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
+ ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
+ }
rtnl_unlock();
return PTR_RET(itn->fb_tunnel_dev);
@@ -884,8 +886,6 @@ static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
if (!net_eq(dev_net(t->dev), net))
unregister_netdevice_queue(t->dev, head);
}
- if (itn->fb_tunnel_dev)
- unregister_netdevice_queue(itn->fb_tunnel_dev, head);
}
void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index d6c856b17fd4f..c31e3ad98ef28 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -61,7 +61,7 @@ int iptunnel_xmit(struct rtable *rt, struct sk_buff *skb,
memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
/* Push down and install the IP header. */
- __skb_push(skb, sizeof(struct iphdr));
+ skb_push(skb, sizeof(struct iphdr));
skb_reset_network_header(skb);
iph = ip_hdr(skb);
diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c
index 67e17dcda65e6..b6346bf2fde3b 100644
--- a/net/ipv4/netfilter/ipt_SYNPROXY.c
+++ b/net/ipv4/netfilter/ipt_SYNPROXY.c
@@ -267,7 +267,8 @@ synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par)
if (th == NULL)
return NF_DROP;
- synproxy_parse_options(skb, par->thoff, th, &opts);
+ if (!synproxy_parse_options(skb, par->thoff, th, &opts))
+ return NF_DROP;
if (th->syn && !(th->ack || th->fin || th->rst)) {
/* Initial SYN from client */
@@ -350,7 +351,8 @@ static unsigned int ipv4_synproxy_hook(unsigned int hooknum,
/* fall through */
case TCP_CONNTRACK_SYN_SENT:
- synproxy_parse_options(skb, thoff, th, &opts);
+ if (!synproxy_parse_options(skb, thoff, th, &opts))
+ return NF_DROP;
if (!th->syn && th->ack &&
CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) {
@@ -373,7 +375,9 @@ static unsigned int ipv4_synproxy_hook(unsigned int hooknum,
if (!th->syn || !th->ack)
break;
- synproxy_parse_options(skb, thoff, th, &opts);
+ if (!synproxy_parse_options(skb, thoff, th, &opts))
+ return NF_DROP;
+
if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP)
synproxy->tsoff = opts.tsval - synproxy->its;
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index bfec521c717fd..193db03540ad7 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -218,8 +218,10 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info)
if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
ipv4_sk_update_pmtu(skb, sk, info);
- else if (type == ICMP_REDIRECT)
+ else if (type == ICMP_REDIRECT) {
ipv4_sk_redirect(skb, sk);
+ return;
+ }
/* Report error on raw socket, if:
1. User requested ip_recverr.
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 7c83cb8bf1378..e6bb8256e59f3 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -895,8 +895,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
skb_orphan(skb);
skb->sk = sk;
- skb->destructor = (sysctl_tcp_limit_output_bytes > 0) ?
- tcp_wfree : sock_wfree;
+ skb->destructor = tcp_wfree;
atomic_add(skb->truesize, &sk->sk_wmem_alloc);
/* Build TCP header and checksum it. */
@@ -1840,7 +1839,6 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
while ((skb = tcp_send_head(sk))) {
unsigned int limit;
-
tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
BUG_ON(!tso_segs);
@@ -1869,13 +1867,20 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
break;
}
- /* TSQ : sk_wmem_alloc accounts skb truesize,
- * including skb overhead. But thats OK.
+ /* TCP Small Queues :
+ * Control number of packets in qdisc/devices to two packets / or ~1 ms.
+ * This allows for :
+ * - better RTT estimation and ACK scheduling
+ * - faster recovery
+ * - high rates
*/
- if (atomic_read(&sk->sk_wmem_alloc) >= sysctl_tcp_limit_output_bytes) {
+ limit = max(skb->truesize, sk->sk_pacing_rate >> 10);
+
+ if (atomic_read(&sk->sk_wmem_alloc) > limit) {
set_bit(TSQ_THROTTLED, &tp->tsq_flags);
break;
}
+
limit = mss_now;
if (tso_segs > 1 && !tcp_urg_mode(tp))
limit = tcp_mss_split_point(sk, skb, mss_now,
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 74d2c95db57f3..0ca44df51ee94 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -658,7 +658,7 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
break;
case ICMP_REDIRECT:
ipv4_sk_redirect(skb, sk);
- break;
+ goto out;
}
/*
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index d6ff12617f36f..cd3fb301da38a 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1499,6 +1499,33 @@ static bool ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr,
return false;
}
+/* Compares an address/prefix_len with addresses on device @dev.
+ * If one is found it returns true.
+ */
+bool ipv6_chk_custom_prefix(const struct in6_addr *addr,
+ const unsigned int prefix_len, struct net_device *dev)
+{
+ struct inet6_dev *idev;
+ struct inet6_ifaddr *ifa;
+ bool ret = false;
+
+ rcu_read_lock();
+ idev = __in6_dev_get(dev);
+ if (idev) {
+ read_lock_bh(&idev->lock);
+ list_for_each_entry(ifa, &idev->addr_list, if_list) {
+ ret = ipv6_prefix_equal(addr, &ifa->addr, prefix_len);
+ if (ret)
+ break;
+ }
+ read_unlock_bh(&idev->lock);
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+EXPORT_SYMBOL(ipv6_chk_custom_prefix);
+
int ipv6_chk_prefix(const struct in6_addr *addr, struct net_device *dev)
{
struct inet6_dev *idev;
@@ -2193,43 +2220,21 @@ ok:
else
stored_lft = 0;
if (!update_lft && !create && stored_lft) {
- if (valid_lft > MIN_VALID_LIFETIME ||
- valid_lft > stored_lft)
- update_lft = 1;
- else if (stored_lft <= MIN_VALID_LIFETIME) {
- /* valid_lft <= stored_lft is always true */
- /*
- * RFC 4862 Section 5.5.3e:
- * "Note that the preferred lifetime of
- * the corresponding address is always
- * reset to the Preferred Lifetime in
- * the received Prefix Information
- * option, regardless of whether the
- * valid lifetime is also reset or
- * ignored."
- *
- * So if the preferred lifetime in
- * this advertisement is different
- * than what we have stored, but the
- * valid lifetime is invalid, just
- * reset prefered_lft.
- *
- * We must set the valid lifetime
- * to the stored lifetime since we'll
- * be updating the timestamp below,
- * else we'll set it back to the
- * minimum.
- */
- if (prefered_lft != ifp->prefered_lft) {
- valid_lft = stored_lft;
- update_lft = 1;
- }
- } else {
- valid_lft = MIN_VALID_LIFETIME;
- if (valid_lft < prefered_lft)
- prefered_lft = valid_lft;
- update_lft = 1;
- }
+ const u32 minimum_lft = min(
+ stored_lft, (u32)MIN_VALID_LIFETIME);
+ valid_lft = max(valid_lft, minimum_lft);
+
+ /* RFC4862 Section 5.5.3e:
+ * "Note that the preferred lifetime of the
+ * corresponding address is always reset to
+ * the Preferred Lifetime in the received
+ * Prefix Information option, regardless of
+ * whether the valid lifetime is also reset or
+ * ignored."
+ *
+ * So we should always update prefered_lft here.
+ */
+ update_lft = 1;
}
if (update_lft) {
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 6b26e9feafb98..7bb5446b9d73c 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -618,7 +618,7 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb,
struct ip6_tnl *tunnel = netdev_priv(dev);
struct net_device *tdev; /* Device to other host */
struct ipv6hdr *ipv6h; /* Our new IP header */
- unsigned int max_headroom; /* The extra header space needed */
+ unsigned int max_headroom = 0; /* The extra header space needed */
int gre_hlen;
struct ipv6_tel_txoption opt;
int mtu;
@@ -693,7 +693,7 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb,
skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(dev)));
- max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + dst->header_len;
+ max_headroom += LL_RESERVED_SPACE(tdev) + gre_hlen + dst->header_len;
if (skb_headroom(skb) < max_headroom || skb_shared(skb) ||
(skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 3a692d5291636..a54c45ce4a48f 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1015,6 +1015,8 @@ static inline int ip6_ufo_append_data(struct sock *sk,
* udp datagram
*/
if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
+ struct frag_hdr fhdr;
+
skb = sock_alloc_send_skb(sk,
hh_len + fragheaderlen + transhdrlen + 20,
(flags & MSG_DONTWAIT), &err);
@@ -1036,12 +1038,6 @@ static inline int ip6_ufo_append_data(struct sock *sk,
skb->protocol = htons(ETH_P_IPV6);
skb->ip_summed = CHECKSUM_PARTIAL;
skb->csum = 0;
- }
-
- err = skb_append_datato_frags(sk,skb, getfrag, from,
- (length - transhdrlen));
- if (!err) {
- struct frag_hdr fhdr;
/* Specify the length of each IPv6 datagram fragment.
* It has to be a multiple of 8.
@@ -1052,15 +1048,10 @@ static inline int ip6_ufo_append_data(struct sock *sk,
ipv6_select_ident(&fhdr, rt);
skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
__skb_queue_tail(&sk->sk_write_queue, skb);
-
- return 0;
}
- /* There is not enough support do UPD LSO,
- * so follow normal path
- */
- kfree_skb(skb);
- return err;
+ return skb_append_datato_frags(sk, skb, getfrag, from,
+ (length - transhdrlen));
}
static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
@@ -1227,27 +1218,27 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
* --yoshfuji
*/
- cork->length += length;
- if (length > mtu) {
- int proto = sk->sk_protocol;
- if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)){
- ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
- return -EMSGSIZE;
- }
-
- if (proto == IPPROTO_UDP &&
- (rt->dst.dev->features & NETIF_F_UFO)) {
+ if ((length > mtu) && dontfrag && (sk->sk_protocol == IPPROTO_UDP ||
+ sk->sk_protocol == IPPROTO_RAW)) {
+ ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
+ return -EMSGSIZE;
+ }
- err = ip6_ufo_append_data(sk, getfrag, from, length,
- hh_len, fragheaderlen,
- transhdrlen, mtu, flags, rt);
- if (err)
- goto error;
- return 0;
- }
+ skb = skb_peek_tail(&sk->sk_write_queue);
+ cork->length += length;
+ if (((length > mtu) ||
+ (skb && skb_is_gso(skb))) &&
+ (sk->sk_protocol == IPPROTO_UDP) &&
+ (rt->dst.dev->features & NETIF_F_UFO)) {
+ err = ip6_ufo_append_data(sk, getfrag, from, length,
+ hh_len, fragheaderlen,
+ transhdrlen, mtu, flags, rt);
+ if (err)
+ goto error;
+ return 0;
}
- if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
+ if (!skb)
goto alloc_new_skb;
while (length > 0) {
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 2d8f4829575b2..a791552e04221 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1731,8 +1731,6 @@ static void __net_exit ip6_tnl_destroy_tunnels(struct ip6_tnl_net *ip6n)
}
}
- t = rtnl_dereference(ip6n->tnls_wc[0]);
- unregister_netdevice_queue(t->dev, &list);
unregister_netdevice_many(&list);
}
@@ -1752,6 +1750,7 @@ static int __net_init ip6_tnl_init_net(struct net *net)
if (!ip6n->fb_tnl_dev)
goto err_alloc_dev;
dev_net_set(ip6n->fb_tnl_dev, net);
+ ip6n->fb_tnl_dev->rtnl_link_ops = &ip6_link_ops;
/* FB netdevice is special: we have one, and only one per netns.
* Allowing to move it to another netns is clearly unsafe.
*/
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 096cd67b737c4..d18f9f903db62 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -2034,7 +2034,7 @@ static void mld_dad_timer_expire(unsigned long data)
if (idev->mc_dad_count)
mld_dad_start_timer(idev, idev->mc_maxdelay);
}
- __in6_dev_put(idev);
+ in6_dev_put(idev);
}
static int ip6_mc_del1_src(struct ifmcaddr6 *pmc, int sfmode,
@@ -2379,7 +2379,7 @@ static void mld_gq_timer_expire(unsigned long data)
idev->mc_gq_running = 0;
mld_send_report(idev, NULL);
- __in6_dev_put(idev);
+ in6_dev_put(idev);
}
static void mld_ifc_timer_expire(unsigned long data)
@@ -2392,7 +2392,7 @@ static void mld_ifc_timer_expire(unsigned long data)
if (idev->mc_ifc_count)
mld_ifc_start_timer(idev, idev->mc_maxdelay);
}
- __in6_dev_put(idev);
+ in6_dev_put(idev);
}
static void mld_ifc_event(struct inet6_dev *idev)
diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c
index 19cfea8dbcaa0..2748b042da72e 100644
--- a/net/ipv6/netfilter/ip6t_SYNPROXY.c
+++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c
@@ -282,7 +282,8 @@ synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par)
if (th == NULL)
return NF_DROP;
- synproxy_parse_options(skb, par->thoff, th, &opts);
+ if (!synproxy_parse_options(skb, par->thoff, th, &opts))
+ return NF_DROP;
if (th->syn && !(th->ack || th->fin || th->rst)) {
/* Initial SYN from client */
@@ -372,7 +373,8 @@ static unsigned int ipv6_synproxy_hook(unsigned int hooknum,
/* fall through */
case TCP_CONNTRACK_SYN_SENT:
- synproxy_parse_options(skb, thoff, th, &opts);
+ if (!synproxy_parse_options(skb, thoff, th, &opts))
+ return NF_DROP;
if (!th->syn && th->ack &&
CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) {
@@ -395,7 +397,9 @@ static unsigned int ipv6_synproxy_hook(unsigned int hooknum,
if (!th->syn || !th->ack)
break;
- synproxy_parse_options(skb, thoff, th, &opts);
+ if (!synproxy_parse_options(skb, thoff, th, &opts))
+ return NF_DROP;
+
if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP)
synproxy->tsoff = opts.tsval - synproxy->its;
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 58916bbb17284..a4ed2416399ed 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -335,8 +335,10 @@ static void rawv6_err(struct sock *sk, struct sk_buff *skb,
ip6_sk_update_pmtu(skb, sk, info);
harderr = (np->pmtudisc == IPV6_PMTUDISC_DO);
}
- if (type == NDISC_REDIRECT)
+ if (type == NDISC_REDIRECT) {
ip6_sk_redirect(skb, sk);
+ return;
+ }
if (np->recverr) {
u8 *payload = skb->data;
if (!inet->hdrincl)
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 7ee5cb96db348..19269453a8eac 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -566,6 +566,70 @@ static inline bool is_spoofed_6rd(struct ip_tunnel *tunnel, const __be32 v4addr,
return false;
}
+/* Checks if an address matches an address on the tunnel interface.
+ * Used to detect the NAT of proto 41 packets and let them pass spoofing test.
+ * Long story:
+ * This function is called after we considered the packet as spoofed
+ * in is_spoofed_6rd.
+ * We may have a router that is doing NAT for proto 41 packets
+ * for an internal station. Destination a.a.a.a/PREFIX:bbbb:bbbb
+ * will be translated to n.n.n.n/PREFIX:bbbb:bbbb. And is_spoofed_6rd
+ * function will return true, dropping the packet.
+ * But, we can still check if is spoofed against the IP
+ * addresses associated with the interface.
+ */
+static bool only_dnatted(const struct ip_tunnel *tunnel,
+ const struct in6_addr *v6dst)
+{
+ int prefix_len;
+
+#ifdef CONFIG_IPV6_SIT_6RD
+ prefix_len = tunnel->ip6rd.prefixlen + 32
+ - tunnel->ip6rd.relay_prefixlen;
+#else
+ prefix_len = 48;
+#endif
+ return ipv6_chk_custom_prefix(v6dst, prefix_len, tunnel->dev);
+}
+
+/* Returns true if a packet is spoofed */
+static bool packet_is_spoofed(struct sk_buff *skb,
+ const struct iphdr *iph,
+ struct ip_tunnel *tunnel)
+{
+ const struct ipv6hdr *ipv6h;
+
+ if (tunnel->dev->priv_flags & IFF_ISATAP) {
+ if (!isatap_chksrc(skb, iph, tunnel))
+ return true;
+
+ return false;
+ }
+
+ if (tunnel->dev->flags & IFF_POINTOPOINT)
+ return false;
+
+ ipv6h = ipv6_hdr(skb);
+
+ if (unlikely(is_spoofed_6rd(tunnel, iph->saddr, &ipv6h->saddr))) {
+ net_warn_ratelimited("Src spoofed %pI4/%pI6c -> %pI4/%pI6c\n",
+ &iph->saddr, &ipv6h->saddr,
+ &iph->daddr, &ipv6h->daddr);
+ return true;
+ }
+
+ if (likely(!is_spoofed_6rd(tunnel, iph->daddr, &ipv6h->daddr)))
+ return false;
+
+ if (only_dnatted(tunnel, &ipv6h->daddr))
+ return false;
+
+ net_warn_ratelimited("Dst spoofed %pI4/%pI6c -> %pI4/%pI6c\n",
+ &iph->saddr, &ipv6h->saddr,
+ &iph->daddr, &ipv6h->daddr);
+ return true;
+}
+
static int ipip6_rcv(struct sk_buff *skb)
{
const struct iphdr *iph = ip_hdr(skb);
@@ -586,19 +650,9 @@ static int ipip6_rcv(struct sk_buff *skb)
IPCB(skb)->flags = 0;
skb->protocol = htons(ETH_P_IPV6);
- if (tunnel->dev->priv_flags & IFF_ISATAP) {
- if (!isatap_chksrc(skb, iph, tunnel)) {
- tunnel->dev->stats.rx_errors++;
- goto out;
- }
- } else if (!(tunnel->dev->flags&IFF_POINTOPOINT)) {
- if (is_spoofed_6rd(tunnel, iph->saddr,
- &ipv6_hdr(skb)->saddr) ||
- is_spoofed_6rd(tunnel, iph->daddr,
- &ipv6_hdr(skb)->daddr)) {
- tunnel->dev->stats.rx_errors++;
- goto out;
- }
+ if (packet_is_spoofed(skb, iph, tunnel)) {
+ tunnel->dev->stats.rx_errors++;
+ goto out;
}
__skb_tunnel_rx(skb, tunnel->dev, tunnel->net);
@@ -748,7 +802,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
neigh = dst_neigh_lookup(skb_dst(skb), &iph6->daddr);
if (neigh == NULL) {
- net_dbg_ratelimited("sit: nexthop == NULL\n");
+ net_dbg_ratelimited("nexthop == NULL\n");
goto tx_error;
}
@@ -777,7 +831,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
neigh = dst_neigh_lookup(skb_dst(skb), &iph6->daddr);
if (neigh == NULL) {
- net_dbg_ratelimited("sit: nexthop == NULL\n");
+ net_dbg_ratelimited("nexthop == NULL\n");
goto tx_error;
}
@@ -1612,6 +1666,7 @@ static int __net_init sit_init_net(struct net *net)
goto err_alloc_dev;
}
dev_net_set(sitn->fb_tunnel_dev, net);
+ sitn->fb_tunnel_dev->rtnl_link_ops = &sit_link_ops;
/* FB netdevice is special: we have one, and only one per netns.
* Allowing to move it to another netns is clearly unsafe.
*/
@@ -1646,7 +1701,6 @@ static void __net_exit sit_exit_net(struct net *net)
rtnl_lock();
sit_destroy_tunnels(sitn, &list);
- unregister_netdevice_queue(sitn->fb_tunnel_dev, &list);
unregister_netdevice_many(&list);
rtnl_unlock();
}
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index f4058150262b1..72b7eaaf3ca0e 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -525,8 +525,10 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
if (type == ICMPV6_PKT_TOOBIG)
ip6_sk_update_pmtu(skb, sk, info);
- if (type == NDISC_REDIRECT)
+ if (type == NDISC_REDIRECT) {
ip6_sk_redirect(skb, sk);
+ goto out;
+ }
np = inet6_sk(sk);
diff --git a/net/lapb/lapb_timer.c b/net/lapb/lapb_timer.c
index 54563ad8aeb1f..355cc3b6fa4d3 100644
--- a/net/lapb/lapb_timer.c
+++ b/net/lapb/lapb_timer.c
@@ -154,6 +154,7 @@ static void lapb_t1timer_expiry(unsigned long param)
} else {
lapb->n2count++;
lapb_requeue_frames(lapb);
+ lapb_kick(lapb);
}
break;
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 4f69e83ff836b..74fd00c272100 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -116,6 +116,7 @@ ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
struct ip_vs_cpu_stats *s;
+ struct ip_vs_service *svc;
s = this_cpu_ptr(dest->stats.cpustats);
s->ustats.inpkts++;
@@ -123,11 +124,14 @@ ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
s->ustats.inbytes += skb->len;
u64_stats_update_end(&s->syncp);
- s = this_cpu_ptr(dest->svc->stats.cpustats);
+ rcu_read_lock();
+ svc = rcu_dereference(dest->svc);
+ s = this_cpu_ptr(svc->stats.cpustats);
s->ustats.inpkts++;
u64_stats_update_begin(&s->syncp);
s->ustats.inbytes += skb->len;
u64_stats_update_end(&s->syncp);
+ rcu_read_unlock();
s = this_cpu_ptr(ipvs->tot_stats.cpustats);
s->ustats.inpkts++;
@@ -146,6 +150,7 @@ ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
struct ip_vs_cpu_stats *s;
+ struct ip_vs_service *svc;
s = this_cpu_ptr(dest->stats.cpustats);
s->ustats.outpkts++;
@@ -153,11 +158,14 @@ ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
s->ustats.outbytes += skb->len;
u64_stats_update_end(&s->syncp);
- s = this_cpu_ptr(dest->svc->stats.cpustats);
+ rcu_read_lock();
+ svc = rcu_dereference(dest->svc);
+ s = this_cpu_ptr(svc->stats.cpustats);
s->ustats.outpkts++;
u64_stats_update_begin(&s->syncp);
s->ustats.outbytes += skb->len;
u64_stats_update_end(&s->syncp);
+ rcu_read_unlock();
s = this_cpu_ptr(ipvs->tot_stats.cpustats);
s->ustats.outpkts++;
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index c8148e4873865..a3df9bddc4f76 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -460,7 +460,7 @@ static inline void
__ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
{
atomic_inc(&svc->refcnt);
- dest->svc = svc;
+ rcu_assign_pointer(dest->svc, svc);
}
static void ip_vs_service_free(struct ip_vs_service *svc)
@@ -470,18 +470,25 @@ static void ip_vs_service_free(struct ip_vs_service *svc)
kfree(svc);
}
-static void
-__ip_vs_unbind_svc(struct ip_vs_dest *dest)
+static void ip_vs_service_rcu_free(struct rcu_head *head)
{
- struct ip_vs_service *svc = dest->svc;
+ struct ip_vs_service *svc;
+
+ svc = container_of(head, struct ip_vs_service, rcu_head);
+ ip_vs_service_free(svc);
+}
- dest->svc = NULL;
+static void __ip_vs_svc_put(struct ip_vs_service *svc, bool do_delay)
+{
if (atomic_dec_and_test(&svc->refcnt)) {
IP_VS_DBG_BUF(3, "Removing service %u/%s:%u\n",
svc->fwmark,
IP_VS_DBG_ADDR(svc->af, &svc->addr),
ntohs(svc->port));
- ip_vs_service_free(svc);
+ if (do_delay)
+ call_rcu(&svc->rcu_head, ip_vs_service_rcu_free);
+ else
+ ip_vs_service_free(svc);
}
}
@@ -667,11 +674,6 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
IP_VS_DBG_ADDR(svc->af, &dest->addr),
ntohs(dest->port),
atomic_read(&dest->refcnt));
- /* We can not reuse dest while in grace period
- * because conns still can use dest->svc
- */
- if (test_bit(IP_VS_DEST_STATE_REMOVING, &dest->state))
- continue;
if (dest->af == svc->af &&
ip_vs_addr_equal(svc->af, &dest->addr, daddr) &&
dest->port == dport &&
@@ -697,8 +699,10 @@ out:
static void ip_vs_dest_free(struct ip_vs_dest *dest)
{
+ struct ip_vs_service *svc = rcu_dereference_protected(dest->svc, 1);
+
__ip_vs_dst_cache_reset(dest);
- __ip_vs_unbind_svc(dest);
+ __ip_vs_svc_put(svc, false);
free_percpu(dest->stats.cpustats);
kfree(dest);
}
@@ -771,6 +775,7 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
struct ip_vs_dest_user_kern *udest, int add)
{
struct netns_ipvs *ipvs = net_ipvs(svc->net);
+ struct ip_vs_service *old_svc;
struct ip_vs_scheduler *sched;
int conn_flags;
@@ -792,13 +797,14 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
atomic_set(&dest->conn_flags, conn_flags);
/* bind the service */
- if (!dest->svc) {
+ old_svc = rcu_dereference_protected(dest->svc, 1);
+ if (!old_svc) {
__ip_vs_bind_svc(dest, svc);
} else {
- if (dest->svc != svc) {
- __ip_vs_unbind_svc(dest);
+ if (old_svc != svc) {
ip_vs_zero_stats(&dest->stats);
__ip_vs_bind_svc(dest, svc);
+ __ip_vs_svc_put(old_svc, true);
}
}
@@ -998,16 +1004,6 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
return 0;
}
-static void ip_vs_dest_wait_readers(struct rcu_head *head)
-{
- struct ip_vs_dest *dest = container_of(head, struct ip_vs_dest,
- rcu_head);
-
- /* End of grace period after unlinking */
- clear_bit(IP_VS_DEST_STATE_REMOVING, &dest->state);
-}
-
-
/*
* Delete a destination (must be already unlinked from the service)
*/
@@ -1023,20 +1019,16 @@ static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest,
*/
ip_vs_rs_unhash(dest);
- if (!cleanup) {
- set_bit(IP_VS_DEST_STATE_REMOVING, &dest->state);
- call_rcu(&dest->rcu_head, ip_vs_dest_wait_readers);
- }
-
spin_lock_bh(&ipvs->dest_trash_lock);
IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, dest->refcnt=%d\n",
IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port),
atomic_read(&dest->refcnt));
if (list_empty(&ipvs->dest_trash) && !cleanup)
mod_timer(&ipvs->dest_trash_timer,
- jiffies + IP_VS_DEST_TRASH_PERIOD);
+ jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1));
/* dest lives in trash without reference */
list_add(&dest->t_list, &ipvs->dest_trash);
+ dest->idle_start = 0;
spin_unlock_bh(&ipvs->dest_trash_lock);
ip_vs_dest_put(dest);
}
@@ -1108,24 +1100,30 @@ static void ip_vs_dest_trash_expire(unsigned long data)
struct net *net = (struct net *) data;
struct netns_ipvs *ipvs = net_ipvs(net);
struct ip_vs_dest *dest, *next;
+ unsigned long now = jiffies;
spin_lock(&ipvs->dest_trash_lock);
list_for_each_entry_safe(dest, next, &ipvs->dest_trash, t_list) {
- /* Skip if dest is in grace period */
- if (test_bit(IP_VS_DEST_STATE_REMOVING, &dest->state))
- continue;
if (atomic_read(&dest->refcnt) > 0)
continue;
+ if (dest->idle_start) {
+ if (time_before(now, dest->idle_start +
+ IP_VS_DEST_TRASH_PERIOD))
+ continue;
+ } else {
+ dest->idle_start = max(1UL, now);
+ continue;
+ }
IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u from trash\n",
dest->vfwmark,
- IP_VS_DBG_ADDR(dest->svc->af, &dest->addr),
+ IP_VS_DBG_ADDR(dest->af, &dest->addr),
ntohs(dest->port));
list_del(&dest->t_list);
ip_vs_dest_free(dest);
}
if (!list_empty(&ipvs->dest_trash))
mod_timer(&ipvs->dest_trash_timer,
- jiffies + IP_VS_DEST_TRASH_PERIOD);
+ jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1));
spin_unlock(&ipvs->dest_trash_lock);
}
@@ -1320,14 +1318,6 @@ out:
return ret;
}
-static void ip_vs_service_rcu_free(struct rcu_head *head)
-{
- struct ip_vs_service *svc;
-
- svc = container_of(head, struct ip_vs_service, rcu_head);
- ip_vs_service_free(svc);
-}
-
/*
* Delete a service from the service list
* - The service must be unlinked, unlocked and not referenced!
@@ -1376,13 +1366,7 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup)
/*
* Free the service if nobody refers to it
*/
- if (atomic_dec_and_test(&svc->refcnt)) {
- IP_VS_DBG_BUF(3, "Removing service %u/%s:%u\n",
- svc->fwmark,
- IP_VS_DBG_ADDR(svc->af, &svc->addr),
- ntohs(svc->port));
- call_rcu(&svc->rcu_head, ip_vs_service_rcu_free);
- }
+ __ip_vs_svc_put(svc, true);
/* decrease the module use count */
ip_vs_use_count_dec();
diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c
index 6bee6d0c73a52..1425e9a924c4f 100644
--- a/net/netfilter/ipvs/ip_vs_est.c
+++ b/net/netfilter/ipvs/ip_vs_est.c
@@ -59,12 +59,13 @@ static void ip_vs_read_cpu_stats(struct ip_vs_stats_user *sum,
struct ip_vs_cpu_stats __percpu *stats)
{
int i;
+ bool add = false;
for_each_possible_cpu(i) {
struct ip_vs_cpu_stats *s = per_cpu_ptr(stats, i);
unsigned int start;
__u64 inbytes, outbytes;
- if (i) {
+ if (add) {
sum->conns += s->ustats.conns;
sum->inpkts += s->ustats.inpkts;
sum->outpkts += s->ustats.outpkts;
@@ -76,6 +77,7 @@ static void ip_vs_read_cpu_stats(struct ip_vs_stats_user *sum,
sum->inbytes += inbytes;
sum->outbytes += outbytes;
} else {
+ add = true;
sum->conns = s->ustats.conns;
sum->inpkts = s->ustats.inpkts;
sum->outpkts = s->ustats.outpkts;
diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c
index 1383b0eadc0e7..eff13c94498e0 100644
--- a/net/netfilter/ipvs/ip_vs_lblc.c
+++ b/net/netfilter/ipvs/ip_vs_lblc.c
@@ -93,7 +93,7 @@ struct ip_vs_lblc_entry {
struct hlist_node list;
int af; /* address family */
union nf_inet_addr addr; /* destination IP address */
- struct ip_vs_dest __rcu *dest; /* real server (cache) */
+ struct ip_vs_dest *dest; /* real server (cache) */
unsigned long lastuse; /* last used time */
struct rcu_head rcu_head;
};
@@ -130,20 +130,21 @@ static struct ctl_table vs_vars_table[] = {
};
#endif
-static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en)
+static void ip_vs_lblc_rcu_free(struct rcu_head *head)
{
- struct ip_vs_dest *dest;
+ struct ip_vs_lblc_entry *en = container_of(head,
+ struct ip_vs_lblc_entry,
+ rcu_head);
- hlist_del_rcu(&en->list);
- /*
- * We don't kfree dest because it is referred either by its service
- * or the trash dest list.
- */
- dest = rcu_dereference_protected(en->dest, 1);
- ip_vs_dest_put(dest);
- kfree_rcu(en, rcu_head);
+ ip_vs_dest_put(en->dest);
+ kfree(en);
}
+static inline void ip_vs_lblc_del(struct ip_vs_lblc_entry *en)
+{
+ hlist_del_rcu(&en->list);
+ call_rcu(&en->rcu_head, ip_vs_lblc_rcu_free);
+}
/*
* Returns hash value for IPVS LBLC entry
@@ -203,30 +204,23 @@ ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, const union nf_inet_addr *daddr,
struct ip_vs_lblc_entry *en;
en = ip_vs_lblc_get(dest->af, tbl, daddr);
- if (!en) {
- en = kmalloc(sizeof(*en), GFP_ATOMIC);
- if (!en)
- return NULL;
-
- en->af = dest->af;
- ip_vs_addr_copy(dest->af, &en->addr, daddr);
- en->lastuse = jiffies;
+ if (en) {
+ if (en->dest == dest)
+ return en;
+ ip_vs_lblc_del(en);
+ }
+ en = kmalloc(sizeof(*en), GFP_ATOMIC);
+ if (!en)
+ return NULL;
- ip_vs_dest_hold(dest);
- RCU_INIT_POINTER(en->dest, dest);
+ en->af = dest->af;
+ ip_vs_addr_copy(dest->af, &en->addr, daddr);
+ en->lastuse = jiffies;
- ip_vs_lblc_hash(tbl, en);
- } else {
- struct ip_vs_dest *old_dest;
+ ip_vs_dest_hold(dest);
+ en->dest = dest;
- old_dest = rcu_dereference_protected(en->dest, 1);
- if (old_dest != dest) {
- ip_vs_dest_put(old_dest);
- ip_vs_dest_hold(dest);
- /* No ordering constraints for refcnt */
- RCU_INIT_POINTER(en->dest, dest);
- }
- }
+ ip_vs_lblc_hash(tbl, en);
return en;
}
@@ -246,7 +240,7 @@ static void ip_vs_lblc_flush(struct ip_vs_service *svc)
tbl->dead = 1;
for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) {
- ip_vs_lblc_free(en);
+ ip_vs_lblc_del(en);
atomic_dec(&tbl->entries);
}
}
@@ -281,7 +275,7 @@ static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc)
sysctl_lblc_expiration(svc)))
continue;
- ip_vs_lblc_free(en);
+ ip_vs_lblc_del(en);
atomic_dec(&tbl->entries);
}
spin_unlock(&svc->sched_lock);
@@ -335,7 +329,7 @@ static void ip_vs_lblc_check_expire(unsigned long data)
if (time_before(now, en->lastuse + ENTRY_TIMEOUT))
continue;
- ip_vs_lblc_free(en);
+ ip_vs_lblc_del(en);
atomic_dec(&tbl->entries);
goal--;
}
@@ -443,8 +437,8 @@ __ip_vs_lblc_schedule(struct ip_vs_service *svc)
continue;
doh = ip_vs_dest_conn_overhead(dest);
- if (loh * atomic_read(&dest->weight) >
- doh * atomic_read(&least->weight)) {
+ if ((__s64)loh * atomic_read(&dest->weight) >
+ (__s64)doh * atomic_read(&least->weight)) {
least = dest;
loh = doh;
}
@@ -511,7 +505,7 @@ ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
* free up entries from the trash at any time.
*/
- dest = rcu_dereference(en->dest);
+ dest = en->dest;
if ((dest->flags & IP_VS_DEST_F_AVAILABLE) &&
atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc))
goto out;
@@ -631,7 +625,7 @@ static void __exit ip_vs_lblc_cleanup(void)
{
unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler);
unregister_pernet_subsys(&ip_vs_lblc_ops);
- synchronize_rcu();
+ rcu_barrier();
}
diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c
index 5199448697f64..0b8550089a2e5 100644
--- a/net/netfilter/ipvs/ip_vs_lblcr.c
+++ b/net/netfilter/ipvs/ip_vs_lblcr.c
@@ -89,7 +89,7 @@
*/
struct ip_vs_dest_set_elem {
struct list_head list; /* list link */
- struct ip_vs_dest __rcu *dest; /* destination server */
+ struct ip_vs_dest *dest; /* destination server */
struct rcu_head rcu_head;
};
@@ -107,11 +107,7 @@ static void ip_vs_dest_set_insert(struct ip_vs_dest_set *set,
if (check) {
list_for_each_entry(e, &set->list, list) {
- struct ip_vs_dest *d;
-
- d = rcu_dereference_protected(e->dest, 1);
- if (d == dest)
- /* already existed */
+ if (e->dest == dest)
return;
}
}
@@ -121,7 +117,7 @@ static void ip_vs_dest_set_insert(struct ip_vs_dest_set *set,
return;
ip_vs_dest_hold(dest);
- RCU_INIT_POINTER(e->dest, dest);
+ e->dest = dest;
list_add_rcu(&e->list, &set->list);
atomic_inc(&set->size);
@@ -129,22 +125,27 @@ static void ip_vs_dest_set_insert(struct ip_vs_dest_set *set,
set->lastmod = jiffies;
}
+static void ip_vs_lblcr_elem_rcu_free(struct rcu_head *head)
+{
+ struct ip_vs_dest_set_elem *e;
+
+ e = container_of(head, struct ip_vs_dest_set_elem, rcu_head);
+ ip_vs_dest_put(e->dest);
+ kfree(e);
+}
+
static void
ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
{
struct ip_vs_dest_set_elem *e;
list_for_each_entry(e, &set->list, list) {
- struct ip_vs_dest *d;
-
- d = rcu_dereference_protected(e->dest, 1);
- if (d == dest) {
+ if (e->dest == dest) {
/* HIT */
atomic_dec(&set->size);
set->lastmod = jiffies;
- ip_vs_dest_put(dest);
list_del_rcu(&e->list);
- kfree_rcu(e, rcu_head);
+ call_rcu(&e->rcu_head, ip_vs_lblcr_elem_rcu_free);
break;
}
}
@@ -155,16 +156,8 @@ static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set)
struct ip_vs_dest_set_elem *e, *ep;
list_for_each_entry_safe(e, ep, &set->list, list) {
- struct ip_vs_dest *d;
-
- d = rcu_dereference_protected(e->dest, 1);
- /*
- * We don't kfree dest because it is referred either
- * by its service or by the trash dest list.
- */
- ip_vs_dest_put(d);
list_del_rcu(&e->list);
- kfree_rcu(e, rcu_head);
+ call_rcu(&e->rcu_head, ip_vs_lblcr_elem_rcu_free);
}
}
@@ -175,12 +168,9 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
struct ip_vs_dest *dest, *least;
int loh, doh;
- if (set == NULL)
- return NULL;
-
/* select the first destination server, whose weight > 0 */
list_for_each_entry_rcu(e, &set->list, list) {
- least = rcu_dereference(e->dest);
+ least = e->dest;
if (least->flags & IP_VS_DEST_F_OVERLOAD)
continue;
@@ -195,13 +185,13 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
/* find the destination with the weighted least load */
nextstage:
list_for_each_entry_continue_rcu(e, &set->list, list) {
- dest = rcu_dereference(e->dest);
+ dest = e->dest;
if (dest->flags & IP_VS_DEST_F_OVERLOAD)
continue;
doh = ip_vs_dest_conn_overhead(dest);
- if ((loh * atomic_read(&dest->weight) >
- doh * atomic_read(&least->weight))
+ if (((__s64)loh * atomic_read(&dest->weight) >
+ (__s64)doh * atomic_read(&least->weight))
&& (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
least = dest;
loh = doh;
@@ -232,7 +222,7 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
/* select the first destination server, whose weight > 0 */
list_for_each_entry(e, &set->list, list) {
- most = rcu_dereference_protected(e->dest, 1);
+ most = e->dest;
if (atomic_read(&most->weight) > 0) {
moh = ip_vs_dest_conn_overhead(most);
goto nextstage;
@@ -243,11 +233,11 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
/* find the destination with the weighted most load */
nextstage:
list_for_each_entry_continue(e, &set->list, list) {
- dest = rcu_dereference_protected(e->dest, 1);
+ dest = e->dest;
doh = ip_vs_dest_conn_overhead(dest);
/* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */
- if ((moh * atomic_read(&dest->weight) <
- doh * atomic_read(&most->weight))
+ if (((__s64)moh * atomic_read(&dest->weight) <
+ (__s64)doh * atomic_read(&most->weight))
&& (atomic_read(&dest->weight) > 0)) {
most = dest;
moh = doh;
@@ -611,8 +601,8 @@ __ip_vs_lblcr_schedule(struct ip_vs_service *svc)
continue;
doh = ip_vs_dest_conn_overhead(dest);
- if (loh * atomic_read(&dest->weight) >
- doh * atomic_read(&least->weight)) {
+ if ((__s64)loh * atomic_read(&dest->weight) >
+ (__s64)doh * atomic_read(&least->weight)) {
least = dest;
loh = doh;
}
@@ -819,7 +809,7 @@ static void __exit ip_vs_lblcr_cleanup(void)
{
unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
unregister_pernet_subsys(&ip_vs_lblcr_ops);
- synchronize_rcu();
+ rcu_barrier();
}
diff --git a/net/netfilter/ipvs/ip_vs_nq.c b/net/netfilter/ipvs/ip_vs_nq.c
index d8d9860934fee..961a6de9bb290 100644
--- a/net/netfilter/ipvs/ip_vs_nq.c
+++ b/net/netfilter/ipvs/ip_vs_nq.c
@@ -40,7 +40,7 @@
#include <net/ip_vs.h>
-static inline unsigned int
+static inline int
ip_vs_nq_dest_overhead(struct ip_vs_dest *dest)
{
/*
@@ -59,7 +59,7 @@ ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
struct ip_vs_iphdr *iph)
{
struct ip_vs_dest *dest, *least = NULL;
- unsigned int loh = 0, doh;
+ int loh = 0, doh;
IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
@@ -92,8 +92,8 @@ ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
}
if (!least ||
- (loh * atomic_read(&dest->weight) >
- doh * atomic_read(&least->weight))) {
+ ((__s64)loh * atomic_read(&dest->weight) >
+ (__s64)doh * atomic_read(&least->weight))) {
least = dest;
loh = doh;
}
diff --git a/net/netfilter/ipvs/ip_vs_sed.c b/net/netfilter/ipvs/ip_vs_sed.c
index a5284cc3d8827..e446b9fa7424c 100644
--- a/net/netfilter/ipvs/ip_vs_sed.c
+++ b/net/netfilter/ipvs/ip_vs_sed.c
@@ -44,7 +44,7 @@
#include <net/ip_vs.h>
-static inline unsigned int
+static inline int
ip_vs_sed_dest_overhead(struct ip_vs_dest *dest)
{
/*
@@ -63,7 +63,7 @@ ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
struct ip_vs_iphdr *iph)
{
struct ip_vs_dest *dest, *least;
- unsigned int loh, doh;
+ int loh, doh;
IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
@@ -99,8 +99,8 @@ ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
if (dest->flags & IP_VS_DEST_F_OVERLOAD)
continue;
doh = ip_vs_sed_dest_overhead(dest);
- if (loh * atomic_read(&dest->weight) >
- doh * atomic_read(&least->weight)) {
+ if ((__s64)loh * atomic_read(&dest->weight) >
+ (__s64)doh * atomic_read(&least->weight)) {
least = dest;
loh = doh;
}
diff --git a/net/netfilter/ipvs/ip_vs_wlc.c b/net/netfilter/ipvs/ip_vs_wlc.c
index 6dc1fa1288409..b5b4650d50a91 100644
--- a/net/netfilter/ipvs/ip_vs_wlc.c
+++ b/net/netfilter/ipvs/ip_vs_wlc.c
@@ -35,7 +35,7 @@ ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
struct ip_vs_iphdr *iph)
{
struct ip_vs_dest *dest, *least;
- unsigned int loh, doh;
+ int loh, doh;
IP_VS_DBG(6, "ip_vs_wlc_schedule(): Scheduling...\n");
@@ -71,8 +71,8 @@ ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
if (dest->flags & IP_VS_DEST_F_OVERLOAD)
continue;
doh = ip_vs_dest_conn_overhead(dest);
- if (loh * atomic_read(&dest->weight) >
- doh * atomic_read(&least->weight)) {
+ if ((__s64)loh * atomic_read(&dest->weight) >
+ (__s64)doh * atomic_read(&least->weight)) {
least = dest;
loh = doh;
}
diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c
index 6fd967c6278c8..cdf4567ba9b33 100644
--- a/net/netfilter/nf_synproxy_core.c
+++ b/net/netfilter/nf_synproxy_core.c
@@ -24,7 +24,7 @@
int synproxy_net_id;
EXPORT_SYMBOL_GPL(synproxy_net_id);
-void
+bool
synproxy_parse_options(const struct sk_buff *skb, unsigned int doff,
const struct tcphdr *th, struct synproxy_options *opts)
{
@@ -32,7 +32,8 @@ synproxy_parse_options(const struct sk_buff *skb, unsigned int doff,
u8 buf[40], *ptr;
ptr = skb_header_pointer(skb, doff + sizeof(*th), length, buf);
- BUG_ON(ptr == NULL);
+ if (ptr == NULL)
+ return false;
opts->options = 0;
while (length > 0) {
@@ -41,16 +42,16 @@ synproxy_parse_options(const struct sk_buff *skb, unsigned int doff,
switch (opcode) {
case TCPOPT_EOL:
- return;
+ return true;
case TCPOPT_NOP:
length--;
continue;
default:
opsize = *ptr++;
if (opsize < 2)
- return;
+ return true;
if (opsize > length)
- return;
+ return true;
switch (opcode) {
case TCPOPT_MSS:
@@ -84,6 +85,7 @@ synproxy_parse_options(const struct sk_buff *skb, unsigned int doff,
length -= opsize;
}
}
+ return true;
}
EXPORT_SYMBOL_GPL(synproxy_parse_options);
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index 32ad015ee8ce4..a2fef8b10b960 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -285,7 +285,7 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
/* remove one skb from head of flow queue */
-static struct sk_buff *fq_dequeue_head(struct fq_flow *flow)
+static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow)
{
struct sk_buff *skb = flow->head;
@@ -293,6 +293,8 @@ static struct sk_buff *fq_dequeue_head(struct fq_flow *flow)
flow->head = skb->next;
skb->next = NULL;
flow->qlen--;
+ sch->qstats.backlog -= qdisc_pkt_len(skb);
+ sch->q.qlen--;
}
return skb;
}
@@ -418,8 +420,9 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
struct fq_flow_head *head;
struct sk_buff *skb;
struct fq_flow *f;
+ u32 rate;
- skb = fq_dequeue_head(&q->internal);
+ skb = fq_dequeue_head(sch, &q->internal);
if (skb)
goto out;
fq_check_throttled(q, now);
@@ -449,7 +452,7 @@ begin:
goto begin;
}
- skb = fq_dequeue_head(f);
+ skb = fq_dequeue_head(sch, f);
if (!skb) {
head->first = f->next;
/* force a pass through old_flows to prevent starvation */
@@ -466,43 +469,74 @@ begin:
f->time_next_packet = now;
f->credit -= qdisc_pkt_len(skb);
- if (f->credit <= 0 &&
- q->rate_enable &&
- skb->sk && skb->sk->sk_state != TCP_TIME_WAIT) {
- u32 rate = skb->sk->sk_pacing_rate ?: q->flow_default_rate;
+ if (f->credit > 0 || !q->rate_enable)
+ goto out;
- rate = min(rate, q->flow_max_rate);
- if (rate) {
- u64 len = (u64)qdisc_pkt_len(skb) * NSEC_PER_SEC;
-
- do_div(len, rate);
- /* Since socket rate can change later,
- * clamp the delay to 125 ms.
- * TODO: maybe segment the too big skb, as in commit
- * e43ac79a4bc ("sch_tbf: segment too big GSO packets")
- */
- if (unlikely(len > 125 * NSEC_PER_MSEC)) {
- len = 125 * NSEC_PER_MSEC;
- q->stat_pkts_too_long++;
- }
+ if (skb->sk && skb->sk->sk_state != TCP_TIME_WAIT) {
+ rate = skb->sk->sk_pacing_rate ?: q->flow_default_rate;
- f->time_next_packet = now + len;
+ rate = min(rate, q->flow_max_rate);
+ } else {
+ rate = q->flow_max_rate;
+ if (rate == ~0U)
+ goto out;
+ }
+ if (rate) {
+ u32 plen = max(qdisc_pkt_len(skb), q->quantum);
+ u64 len = (u64)plen * NSEC_PER_SEC;
+
+ do_div(len, rate);
+ /* Since socket rate can change later,
+ * clamp the delay to 125 ms.
+ * TODO: maybe segment the too big skb, as in commit
+ * e43ac79a4bc ("sch_tbf: segment too big GSO packets")
+ */
+ if (unlikely(len > 125 * NSEC_PER_MSEC)) {
+ len = 125 * NSEC_PER_MSEC;
+ q->stat_pkts_too_long++;
}
+
+ f->time_next_packet = now + len;
}
out:
- sch->qstats.backlog -= qdisc_pkt_len(skb);
qdisc_bstats_update(sch, skb);
- sch->q.qlen--;
qdisc_unthrottled(sch);
return skb;
}
static void fq_reset(struct Qdisc *sch)
{
+ struct fq_sched_data *q = qdisc_priv(sch);
+ struct rb_root *root;
struct sk_buff *skb;
+ struct rb_node *p;
+ struct fq_flow *f;
+ unsigned int idx;
- while ((skb = fq_dequeue(sch)) != NULL)
+ while ((skb = fq_dequeue_head(sch, &q->internal)) != NULL)
kfree_skb(skb);
+
+ if (!q->fq_root)
+ return;
+
+ for (idx = 0; idx < (1U << q->fq_trees_log); idx++) {
+ root = &q->fq_root[idx];
+ while ((p = rb_first(root)) != NULL) {
+ f = container_of(p, struct fq_flow, fq_node);
+ rb_erase(p, root);
+
+ while ((skb = fq_dequeue_head(sch, f)) != NULL)
+ kfree_skb(skb);
+
+ kmem_cache_free(fq_flow_cachep, f);
+ }
+ }
+ q->new_flows.first = NULL;
+ q->old_flows.first = NULL;
+ q->delayed = RB_ROOT;
+ q->flows = 0;
+ q->inactive_flows = 0;
+ q->throttled_flows = 0;
}
static void fq_rehash(struct fq_sched_data *q,
@@ -645,6 +679,8 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt)
while (sch->q.qlen > sch->limit) {
struct sk_buff *skb = fq_dequeue(sch);
+ if (!skb)
+ break;
kfree_skb(skb);
drop_count++;
}
@@ -657,21 +693,9 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt)
static void fq_destroy(struct Qdisc *sch)
{
struct fq_sched_data *q = qdisc_priv(sch);
- struct rb_root *root;
- struct rb_node *p;
- unsigned int idx;
- if (q->fq_root) {
- for (idx = 0; idx < (1U << q->fq_trees_log); idx++) {
- root = &q->fq_root[idx];
- while ((p = rb_first(root)) != NULL) {
- rb_erase(p, root);
- kmem_cache_free(fq_flow_cachep,
- container_of(p, struct fq_flow, fq_node));
- }
- }
- kfree(q->fq_root);
- }
+ fq_reset(sch);
+ kfree(q->fq_root);
qdisc_watchdog_cancel(&q->watchdog);
}