summaryrefslogtreecommitdiffstats
path: root/net/core
diff options
context:
space:
mode:
Diffstat (limited to 'net/core')
-rw-r--r--net/core/Makefile2
-rw-r--r--net/core/dev.c148
-rw-r--r--net/core/devlink.c111
-rw-r--r--net/core/dst.c1
-rw-r--r--net/core/ethtool.c63
-rw-r--r--net/core/failover.c315
-rw-r--r--net/core/fib_rules.c495
-rw-r--r--net/core/filter.c1423
-rw-r--r--net/core/flow_dissector.c19
-rw-r--r--net/core/neighbour.c8
-rw-r--r--net/core/net-traces.c4
-rw-r--r--net/core/page_pool.c317
-rw-r--r--net/core/rtnetlink.c34
-rw-r--r--net/core/skbuff.c25
-rw-r--r--net/core/sock.c40
-rw-r--r--net/core/xdp.c299
16 files changed, 2784 insertions, 520 deletions
diff --git a/net/core/Makefile b/net/core/Makefile
index 6dbbba8c57ae7..80175e6a2eb87 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -14,6 +14,7 @@ obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \
fib_notifier.o xdp.o
obj-y += net-sysfs.o
+obj-$(CONFIG_PAGE_POOL) += page_pool.o
obj-$(CONFIG_PROC_FS) += net-procfs.o
obj-$(CONFIG_NET_PKTGEN) += pktgen.o
obj-$(CONFIG_NETPOLL) += netpoll.o
@@ -30,3 +31,4 @@ obj-$(CONFIG_DST_CACHE) += dst_cache.o
obj-$(CONFIG_HWBM) += hwbm.o
obj-$(CONFIG_NET_DEVLINK) += devlink.o
obj-$(CONFIG_GRO_CELLS) += gro_cells.o
+obj-$(CONFIG_FAILOVER) += failover.o
diff --git a/net/core/dev.c b/net/core/dev.c
index 9c149238a4cea..6e18242a1caec 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1285,6 +1285,7 @@ int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
return len;
}
+EXPORT_SYMBOL(dev_set_alias);
/**
* dev_get_alias - get ifalias of a device
@@ -1586,7 +1587,7 @@ const char *netdev_cmd_to_name(enum netdev_cmd cmd)
N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
- };
+ }
#undef N
return "UNKNOWN_NETDEV_EVENT";
}
@@ -1754,38 +1755,38 @@ int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
EXPORT_SYMBOL(call_netdevice_notifiers);
#ifdef CONFIG_NET_INGRESS
-static struct static_key ingress_needed __read_mostly;
+static DEFINE_STATIC_KEY_FALSE(ingress_needed_key);
void net_inc_ingress_queue(void)
{
- static_key_slow_inc(&ingress_needed);
+ static_branch_inc(&ingress_needed_key);
}
EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
void net_dec_ingress_queue(void)
{
- static_key_slow_dec(&ingress_needed);
+ static_branch_dec(&ingress_needed_key);
}
EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
#endif
#ifdef CONFIG_NET_EGRESS
-static struct static_key egress_needed __read_mostly;
+static DEFINE_STATIC_KEY_FALSE(egress_needed_key);
void net_inc_egress_queue(void)
{
- static_key_slow_inc(&egress_needed);
+ static_branch_inc(&egress_needed_key);
}
EXPORT_SYMBOL_GPL(net_inc_egress_queue);
void net_dec_egress_queue(void)
{
- static_key_slow_dec(&egress_needed);
+ static_branch_dec(&egress_needed_key);
}
EXPORT_SYMBOL_GPL(net_dec_egress_queue);
#endif
-static struct static_key netstamp_needed __read_mostly;
+static DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
#ifdef HAVE_JUMP_LABEL
static atomic_t netstamp_needed_deferred;
static atomic_t netstamp_wanted;
@@ -1796,9 +1797,9 @@ static void netstamp_clear(struct work_struct *work)
wanted = atomic_add_return(deferred, &netstamp_wanted);
if (wanted > 0)
- static_key_enable(&netstamp_needed);
+ static_branch_enable(&netstamp_needed_key);
else
- static_key_disable(&netstamp_needed);
+ static_branch_disable(&netstamp_needed_key);
}
static DECLARE_WORK(netstamp_work, netstamp_clear);
#endif
@@ -1818,7 +1819,7 @@ void net_enable_timestamp(void)
atomic_inc(&netstamp_needed_deferred);
schedule_work(&netstamp_work);
#else
- static_key_slow_inc(&netstamp_needed);
+ static_branch_inc(&netstamp_needed_key);
#endif
}
EXPORT_SYMBOL(net_enable_timestamp);
@@ -1838,7 +1839,7 @@ void net_disable_timestamp(void)
atomic_dec(&netstamp_needed_deferred);
schedule_work(&netstamp_work);
#else
- static_key_slow_dec(&netstamp_needed);
+ static_branch_dec(&netstamp_needed_key);
#endif
}
EXPORT_SYMBOL(net_disable_timestamp);
@@ -1846,15 +1847,15 @@ EXPORT_SYMBOL(net_disable_timestamp);
static inline void net_timestamp_set(struct sk_buff *skb)
{
skb->tstamp = 0;
- if (static_key_false(&netstamp_needed))
+ if (static_branch_unlikely(&netstamp_needed_key))
__net_timestamp(skb);
}
-#define net_timestamp_check(COND, SKB) \
- if (static_key_false(&netstamp_needed)) { \
- if ((COND) && !(SKB)->tstamp) \
- __net_timestamp(SKB); \
- } \
+#define net_timestamp_check(COND, SKB) \
+ if (static_branch_unlikely(&netstamp_needed_key)) { \
+ if ((COND) && !(SKB)->tstamp) \
+ __net_timestamp(SKB); \
+ } \
bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
{
@@ -2614,17 +2615,16 @@ EXPORT_SYMBOL(netif_device_attach);
* Returns a Tx hash based on the given packet descriptor a Tx queues' number
* to be used as a distribution range.
*/
-u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
- unsigned int num_tx_queues)
+static u16 skb_tx_hash(const struct net_device *dev, struct sk_buff *skb)
{
u32 hash;
u16 qoffset = 0;
- u16 qcount = num_tx_queues;
+ u16 qcount = dev->real_num_tx_queues;
if (skb_rx_queue_recorded(skb)) {
hash = skb_get_rx_queue(skb);
- while (unlikely(hash >= num_tx_queues))
- hash -= num_tx_queues;
+ while (unlikely(hash >= qcount))
+ hash -= qcount;
return hash;
}
@@ -2637,7 +2637,6 @@ u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
}
-EXPORT_SYMBOL(__skb_tx_hash);
static void skb_warn_bad_offload(const struct sk_buff *skb)
{
@@ -3095,6 +3094,10 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device
if (unlikely(!skb))
goto out_null;
+ skb = sk_validate_xmit_skb(skb, dev);
+ if (unlikely(!skb))
+ goto out_null;
+
if (netif_needs_gso(skb, features)) {
struct sk_buff *segs;
@@ -3223,7 +3226,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
rc = NET_XMIT_DROP;
} else {
rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
- __qdisc_run(q);
+ qdisc_run(q);
}
if (unlikely(to_free))
@@ -3511,7 +3514,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
#ifdef CONFIG_NET_CLS_ACT
skb->tc_at_ingress = 0;
# ifdef CONFIG_NET_EGRESS
- if (static_key_false(&egress_needed)) {
+ if (static_branch_unlikely(&egress_needed_key)) {
skb = sch_handle_egress(skb, &rc, dev);
if (!skb)
goto out;
@@ -3606,6 +3609,44 @@ int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
}
EXPORT_SYMBOL(dev_queue_xmit_accel);
+int dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
+{
+ struct net_device *dev = skb->dev;
+ struct sk_buff *orig_skb = skb;
+ struct netdev_queue *txq;
+ int ret = NETDEV_TX_BUSY;
+ bool again = false;
+
+ if (unlikely(!netif_running(dev) ||
+ !netif_carrier_ok(dev)))
+ goto drop;
+
+ skb = validate_xmit_skb_list(skb, dev, &again);
+ if (skb != orig_skb)
+ goto drop;
+
+ skb_set_queue_mapping(skb, queue_id);
+ txq = skb_get_tx_queue(dev, skb);
+
+ local_bh_disable();
+
+ HARD_TX_LOCK(dev, txq, smp_processor_id());
+ if (!netif_xmit_frozen_or_drv_stopped(txq))
+ ret = netdev_start_xmit(skb, dev, txq, false);
+ HARD_TX_UNLOCK(dev, txq);
+
+ local_bh_enable();
+
+ if (!dev_xmit_complete(ret))
+ kfree_skb(skb);
+
+ return ret;
+drop:
+ atomic_long_inc(&dev->tx_dropped);
+ kfree_skb_list(skb);
+ return NET_XMIT_DROP;
+}
+EXPORT_SYMBOL(dev_direct_xmit);
/*************************************************************************
* Receiver routines
@@ -3975,12 +4016,12 @@ static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
}
static u32 netif_receive_generic_xdp(struct sk_buff *skb,
+ struct xdp_buff *xdp,
struct bpf_prog *xdp_prog)
{
struct netdev_rx_queue *rxqueue;
+ void *orig_data, *orig_data_end;
u32 metalen, act = XDP_DROP;
- struct xdp_buff xdp;
- void *orig_data;
int hlen, off;
u32 mac_len;
@@ -4015,31 +4056,42 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
*/
mac_len = skb->data - skb_mac_header(skb);
hlen = skb_headlen(skb) + mac_len;
- xdp.data = skb->data - mac_len;
- xdp.data_meta = xdp.data;
- xdp.data_end = xdp.data + hlen;
- xdp.data_hard_start = skb->data - skb_headroom(skb);
- orig_data = xdp.data;
+ xdp->data = skb->data - mac_len;
+ xdp->data_meta = xdp->data;
+ xdp->data_end = xdp->data + hlen;
+ xdp->data_hard_start = skb->data - skb_headroom(skb);
+ orig_data_end = xdp->data_end;
+ orig_data = xdp->data;
rxqueue = netif_get_rxqueue(skb);
- xdp.rxq = &rxqueue->xdp_rxq;
+ xdp->rxq = &rxqueue->xdp_rxq;
- act = bpf_prog_run_xdp(xdp_prog, &xdp);
+ act = bpf_prog_run_xdp(xdp_prog, xdp);
- off = xdp.data - orig_data;
+ off = xdp->data - orig_data;
if (off > 0)
__skb_pull(skb, off);
else if (off < 0)
__skb_push(skb, -off);
skb->mac_header += off;
+ /* check if bpf_xdp_adjust_tail was used. it can only "shrink"
+ * pckt.
+ */
+ off = orig_data_end - xdp->data_end;
+ if (off != 0) {
+ skb_set_tail_pointer(skb, xdp->data_end - xdp->data);
+ skb->len -= off;
+
+ }
+
switch (act) {
case XDP_REDIRECT:
case XDP_TX:
__skb_push(skb, mac_len);
break;
case XDP_PASS:
- metalen = xdp.data - xdp.data_meta;
+ metalen = xdp->data - xdp->data_meta;
if (metalen)
skb_metadata_set(skb, metalen);
break;
@@ -4084,22 +4136,24 @@ void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
}
EXPORT_SYMBOL_GPL(generic_xdp_tx);
-static struct static_key generic_xdp_needed __read_mostly;
+static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);
int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb)
{
if (xdp_prog) {
- u32 act = netif_receive_generic_xdp(skb, xdp_prog);
+ struct xdp_buff xdp;
+ u32 act;
int err;
+ act = netif_receive_generic_xdp(skb, &xdp, xdp_prog);
if (act != XDP_PASS) {
switch (act) {
case XDP_REDIRECT:
err = xdp_do_generic_redirect(skb->dev, skb,
- xdp_prog);
+ &xdp, xdp_prog);
if (err)
goto out_redir;
- /* fallthru to submit skb */
+ break;
case XDP_TX:
generic_xdp_tx(skb, xdp_prog);
break;
@@ -4122,7 +4176,7 @@ static int netif_rx_internal(struct sk_buff *skb)
trace_netif_rx(skb);
- if (static_key_false(&generic_xdp_needed)) {
+ if (static_branch_unlikely(&generic_xdp_needed_key)) {
int ret;
preempt_disable();
@@ -4494,7 +4548,7 @@ another_round:
skip_taps:
#ifdef CONFIG_NET_INGRESS
- if (static_key_false(&ingress_needed)) {
+ if (static_branch_unlikely(&ingress_needed_key)) {
skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
if (!skb)
goto out;
@@ -4654,9 +4708,9 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
bpf_prog_put(old);
if (old && !new) {
- static_key_slow_dec(&generic_xdp_needed);
+ static_branch_dec(&generic_xdp_needed_key);
} else if (new && !old) {
- static_key_slow_inc(&generic_xdp_needed);
+ static_branch_inc(&generic_xdp_needed_key);
dev_disable_lro(dev);
dev_disable_gro_hw(dev);
}
@@ -4684,7 +4738,7 @@ static int netif_receive_skb_internal(struct sk_buff *skb)
if (skb_defer_rx_timestamp(skb))
return NET_RX_SUCCESS;
- if (static_key_false(&generic_xdp_needed)) {
+ if (static_branch_unlikely(&generic_xdp_needed_key)) {
int ret;
preempt_disable();
@@ -7852,6 +7906,8 @@ int register_netdevice(struct net_device *dev)
int ret;
struct net *net = dev_net(dev);
+ BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE <
+ NETDEV_FEATURE_COUNT);
BUG_ON(dev_boot_phase);
ASSERT_RTNL();
diff --git a/net/core/devlink.c b/net/core/devlink.c
index ad1317376798c..22099705cc410 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -453,6 +453,27 @@ static void devlink_notify(struct devlink *devlink, enum devlink_command cmd)
msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
}
+static int devlink_nl_port_attrs_put(struct sk_buff *msg,
+ struct devlink_port *devlink_port)
+{
+ struct devlink_port_attrs *attrs = &devlink_port->attrs;
+
+ if (!attrs->set)
+ return 0;
+ if (nla_put_u16(msg, DEVLINK_ATTR_PORT_FLAVOUR, attrs->flavour))
+ return -EMSGSIZE;
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_NUMBER, attrs->port_number))
+ return -EMSGSIZE;
+ if (!attrs->split)
+ return 0;
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_GROUP, attrs->port_number))
+ return -EMSGSIZE;
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_SUBPORT_NUMBER,
+ attrs->split_subport_number))
+ return -EMSGSIZE;
+ return 0;
+}
+
static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink,
struct devlink_port *devlink_port,
enum devlink_command cmd, u32 portid,
@@ -492,9 +513,7 @@ static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink,
ibdev->name))
goto nla_put_failure;
}
- if (devlink_port->split &&
- nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_GROUP,
- devlink_port->split_group))
+ if (devlink_nl_port_attrs_put(msg, devlink_port))
goto nla_put_failure;
genlmsg_end(msg, hdr);
@@ -683,12 +702,13 @@ static int devlink_nl_cmd_port_set_doit(struct sk_buff *skb,
return 0;
}
-static int devlink_port_split(struct devlink *devlink,
- u32 port_index, u32 count)
+static int devlink_port_split(struct devlink *devlink, u32 port_index,
+ u32 count, struct netlink_ext_ack *extack)
{
if (devlink->ops && devlink->ops->port_split)
- return devlink->ops->port_split(devlink, port_index, count);
+ return devlink->ops->port_split(devlink, port_index, count,
+ extack);
return -EOPNOTSUPP;
}
@@ -705,14 +725,15 @@ static int devlink_nl_cmd_port_split_doit(struct sk_buff *skb,
port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]);
count = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_SPLIT_COUNT]);
- return devlink_port_split(devlink, port_index, count);
+ return devlink_port_split(devlink, port_index, count, info->extack);
}
-static int devlink_port_unsplit(struct devlink *devlink, u32 port_index)
+static int devlink_port_unsplit(struct devlink *devlink, u32 port_index,
+ struct netlink_ext_ack *extack)
{
if (devlink->ops && devlink->ops->port_unsplit)
- return devlink->ops->port_unsplit(devlink, port_index);
+ return devlink->ops->port_unsplit(devlink, port_index, extack);
return -EOPNOTSUPP;
}
@@ -726,7 +747,7 @@ static int devlink_nl_cmd_port_unsplit_doit(struct sk_buff *skb,
return -EINVAL;
port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]);
- return devlink_port_unsplit(devlink, port_index);
+ return devlink_port_unsplit(devlink, port_index, info->extack);
}
static int devlink_nl_sb_fill(struct sk_buff *msg, struct devlink *devlink,
@@ -1807,7 +1828,6 @@ send_done:
nla_put_failure:
err = -EMSGSIZE;
err_table_put:
- genlmsg_cancel(skb, hdr);
nlmsg_free(skb);
return err;
}
@@ -2013,7 +2033,6 @@ int devlink_dpipe_entry_ctx_prepare(struct devlink_dpipe_dump_ctx *dump_ctx)
return 0;
nla_put_failure:
- genlmsg_cancel(dump_ctx->skb, dump_ctx->hdr);
nlmsg_free(dump_ctx->skb);
return -EMSGSIZE;
}
@@ -2230,7 +2249,6 @@ send_done:
nla_put_failure:
err = -EMSGSIZE;
err_table_put:
- genlmsg_cancel(skb, hdr);
nlmsg_free(skb);
return err;
}
@@ -2532,7 +2550,6 @@ nla_put_failure:
err = -EMSGSIZE;
err_resource_put:
err_skb_send_alloc:
- genlmsg_cancel(skb, hdr);
nlmsg_free(skb);
return err;
}
@@ -2584,7 +2601,7 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info)
NL_SET_ERR_MSG_MOD(info->extack, "resources size validation failed");
return err;
}
- return devlink->ops->reload(devlink);
+ return devlink->ops->reload(devlink, info->extack);
}
static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
@@ -2737,7 +2754,8 @@ static const struct genl_ops devlink_nl_ops[] = {
.doit = devlink_nl_cmd_eswitch_set_doit,
.policy = devlink_nl_policy,
.flags = GENL_ADMIN_PERM,
- .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
+ DEVLINK_NL_FLAG_NO_LOCK,
},
{
.cmd = DEVLINK_CMD_DPIPE_TABLE_GET,
@@ -2971,19 +2989,64 @@ void devlink_port_type_clear(struct devlink_port *devlink_port)
EXPORT_SYMBOL_GPL(devlink_port_type_clear);
/**
- * devlink_port_split_set - Set port is split
+ * devlink_port_attrs_set - Set port attributes
*
* @devlink_port: devlink port
- * @split_group: split group - identifies group split port is part of
+ * @flavour: flavour of the port
+ * @port_number: number of the port that is facing user, for example
+ * the front panel port number
+ * @split: indicates if this is split port
+ * @split_subport_number: if the port is split, this is the number
+ * of subport.
*/
-void devlink_port_split_set(struct devlink_port *devlink_port,
- u32 split_group)
-{
- devlink_port->split = true;
- devlink_port->split_group = split_group;
+void devlink_port_attrs_set(struct devlink_port *devlink_port,
+ enum devlink_port_flavour flavour,
+ u32 port_number, bool split,
+ u32 split_subport_number)
+{
+ struct devlink_port_attrs *attrs = &devlink_port->attrs;
+
+ attrs->set = true;
+ attrs->flavour = flavour;
+ attrs->port_number = port_number;
+ attrs->split = split;
+ attrs->split_subport_number = split_subport_number;
devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
}
-EXPORT_SYMBOL_GPL(devlink_port_split_set);
+EXPORT_SYMBOL_GPL(devlink_port_attrs_set);
+
+int devlink_port_get_phys_port_name(struct devlink_port *devlink_port,
+ char *name, size_t len)
+{
+ struct devlink_port_attrs *attrs = &devlink_port->attrs;
+ int n = 0;
+
+ if (!attrs->set)
+ return -EOPNOTSUPP;
+
+ switch (attrs->flavour) {
+ case DEVLINK_PORT_FLAVOUR_PHYSICAL:
+ if (!attrs->split)
+ n = snprintf(name, len, "p%u", attrs->port_number);
+ else
+ n = snprintf(name, len, "p%us%u", attrs->port_number,
+ attrs->split_subport_number);
+ break;
+ case DEVLINK_PORT_FLAVOUR_CPU:
+ case DEVLINK_PORT_FLAVOUR_DSA:
+ /* As CPU and DSA ports do not have a netdevice associated
+ * case should not ever happen.
+ */
+ WARN_ON(1);
+ return -EINVAL;
+ }
+
+ if (n >= len)
+ return -EINVAL;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(devlink_port_get_phys_port_name);
int devlink_sb_register(struct devlink *devlink, unsigned int sb_index,
u32 size, u16 ingress_pools_count,
diff --git a/net/core/dst.c b/net/core/dst.c
index 007aa0b082915..2d9b37f8944a9 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -58,6 +58,7 @@ const struct dst_metrics dst_default_metrics = {
*/
.refcnt = REFCOUNT_INIT(1),
};
+EXPORT_SYMBOL(dst_default_metrics);
void dst_init(struct dst_entry *dst, struct dst_ops *ops,
struct net_device *dev, int initial_ref, int initial_obsolete,
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index ba02f0dfe85cd..c15075dc7572c 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -92,6 +92,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
[NETIF_F_GSO_PARTIAL_BIT] = "tx-gso-partial",
[NETIF_F_GSO_SCTP_BIT] = "tx-sctp-segmentation",
[NETIF_F_GSO_ESP_BIT] = "tx-esp-segmentation",
+ [NETIF_F_GSO_UDP_L4_BIT] = "tx-udp-segmentation",
[NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc",
[NETIF_F_SCTP_CRC_BIT] = "tx-checksum-sctp",
@@ -109,6 +110,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
[NETIF_F_HW_ESP_TX_CSUM_BIT] = "esp-tx-csum-hw-offload",
[NETIF_F_RX_UDP_TUNNEL_PORT_BIT] = "rx-udp_tunnel-port-offload",
[NETIF_F_HW_TLS_RECORD_BIT] = "tls-hw-record",
+ [NETIF_F_HW_TLS_TX_BIT] = "tls-hw-tx-offload",
};
static const char
@@ -210,23 +212,6 @@ static int ethtool_set_features(struct net_device *dev, void __user *useraddr)
return ret;
}
-static int phy_get_sset_count(struct phy_device *phydev)
-{
- int ret;
-
- if (phydev->drv->get_sset_count &&
- phydev->drv->get_strings &&
- phydev->drv->get_stats) {
- mutex_lock(&phydev->lock);
- ret = phydev->drv->get_sset_count(phydev);
- mutex_unlock(&phydev->lock);
-
- return ret;
- }
-
- return -EOPNOTSUPP;
-}
-
static int __ethtool_get_sset_count(struct net_device *dev, int sset)
{
const struct ethtool_ops *ops = dev->ethtool_ops;
@@ -243,12 +228,9 @@ static int __ethtool_get_sset_count(struct net_device *dev, int sset)
if (sset == ETH_SS_PHY_TUNABLES)
return ARRAY_SIZE(phy_tunable_strings);
- if (sset == ETH_SS_PHY_STATS) {
- if (dev->phydev)
- return phy_get_sset_count(dev->phydev);
- else
- return -EOPNOTSUPP;
- }
+ if (sset == ETH_SS_PHY_STATS && dev->phydev &&
+ !ops->get_ethtool_phy_stats)
+ return phy_ethtool_get_sset_count(dev->phydev);
if (ops->get_sset_count && ops->get_strings)
return ops->get_sset_count(dev, sset);
@@ -271,17 +253,10 @@ static void __ethtool_get_strings(struct net_device *dev,
memcpy(data, tunable_strings, sizeof(tunable_strings));
else if (stringset == ETH_SS_PHY_TUNABLES)
memcpy(data, phy_tunable_strings, sizeof(phy_tunable_strings));
- else if (stringset == ETH_SS_PHY_STATS) {
- struct phy_device *phydev = dev->phydev;
-
- if (phydev) {
- mutex_lock(&phydev->lock);
- phydev->drv->get_strings(phydev, data);
- mutex_unlock(&phydev->lock);
- } else {
- return;
- }
- } else
+ else if (stringset == ETH_SS_PHY_STATS && dev->phydev &&
+ !ops->get_ethtool_phy_stats)
+ phy_ethtool_get_strings(dev->phydev, data);
+ else
/* ops->get_strings is valid because checked earlier */
ops->get_strings(dev, stringset, data);
}
@@ -1998,15 +1973,19 @@ static int ethtool_get_stats(struct net_device *dev, void __user *useraddr)
static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr)
{
- struct ethtool_stats stats;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
struct phy_device *phydev = dev->phydev;
+ struct ethtool_stats stats;
u64 *data;
int ret, n_stats;
- if (!phydev)
+ if (!phydev && (!ops->get_ethtool_phy_stats || !ops->get_sset_count))
return -EOPNOTSUPP;
- n_stats = phy_get_sset_count(phydev);
+ if (dev->phydev && !ops->get_ethtool_phy_stats)
+ n_stats = phy_ethtool_get_sset_count(dev->phydev);
+ else
+ n_stats = ops->get_sset_count(dev, ETH_SS_PHY_STATS);
if (n_stats < 0)
return n_stats;
if (n_stats > S32_MAX / sizeof(u64))
@@ -2021,9 +2000,13 @@ static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr)
if (n_stats && !data)
return -ENOMEM;
- mutex_lock(&phydev->lock);
- phydev->drv->get_stats(phydev, &stats, data);
- mutex_unlock(&phydev->lock);
+ if (dev->phydev && !ops->get_ethtool_phy_stats) {
+ ret = phy_ethtool_get_stats(dev->phydev, &stats, data);
+ if (ret < 0)
+ return ret;
+ } else {
+ ops->get_ethtool_phy_stats(dev, &stats, data);
+ }
ret = -EFAULT;
if (copy_to_user(useraddr, &stats, sizeof(stats)))
diff --git a/net/core/failover.c b/net/core/failover.c
new file mode 100644
index 0000000000000..4a92a98ccce9a
--- /dev/null
+++ b/net/core/failover.c
@@ -0,0 +1,315 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2018, Intel Corporation. */
+
+/* A common module to handle registrations and notifications for paravirtual
+ * drivers to enable accelerated datapath and support VF live migration.
+ *
+ * The notifier and event handling code is based on netvsc driver.
+ */
+
+#include <linux/module.h>
+#include <linux/etherdevice.h>
+#include <uapi/linux/if_arp.h>
+#include <linux/rtnetlink.h>
+#include <linux/if_vlan.h>
+#include <net/failover.h>
+
+static LIST_HEAD(failover_list);
+static DEFINE_SPINLOCK(failover_lock);
+
+static struct net_device *failover_get_bymac(u8 *mac, struct failover_ops **ops)
+{
+ struct net_device *failover_dev;
+ struct failover *failover;
+
+ spin_lock(&failover_lock);
+ list_for_each_entry(failover, &failover_list, list) {
+ failover_dev = rtnl_dereference(failover->failover_dev);
+ if (ether_addr_equal(failover_dev->perm_addr, mac)) {
+ *ops = rtnl_dereference(failover->ops);
+ spin_unlock(&failover_lock);
+ return failover_dev;
+ }
+ }
+ spin_unlock(&failover_lock);
+ return NULL;
+}
+
+/**
+ * failover_slave_register - Register a slave netdev
+ *
+ * @slave_dev: slave netdev that is being registered
+ *
+ * Registers a slave device to a failover instance. Only ethernet devices
+ * are supported.
+ */
+static int failover_slave_register(struct net_device *slave_dev)
+{
+ struct netdev_lag_upper_info lag_upper_info;
+ struct net_device *failover_dev;
+ struct failover_ops *fops;
+ int err;
+
+ if (slave_dev->type != ARPHRD_ETHER)
+ goto done;
+
+ ASSERT_RTNL();
+
+ failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops);
+ if (!failover_dev)
+ goto done;
+
+ if (fops && fops->slave_pre_register &&
+ fops->slave_pre_register(slave_dev, failover_dev))
+ goto done;
+
+ err = netdev_rx_handler_register(slave_dev, fops->slave_handle_frame,
+ failover_dev);
+ if (err) {
+ netdev_err(slave_dev, "can not register failover rx handler (err = %d)\n",
+ err);
+ goto done;
+ }
+
+ lag_upper_info.tx_type = NETDEV_LAG_TX_TYPE_ACTIVEBACKUP;
+ err = netdev_master_upper_dev_link(slave_dev, failover_dev, NULL,
+ &lag_upper_info, NULL);
+ if (err) {
+ netdev_err(slave_dev, "can not set failover device %s (err = %d)\n",
+ failover_dev->name, err);
+ goto err_upper_link;
+ }
+
+ slave_dev->priv_flags |= IFF_FAILOVER_SLAVE;
+
+ if (fops && fops->slave_register &&
+ !fops->slave_register(slave_dev, failover_dev))
+ return NOTIFY_OK;
+
+ netdev_upper_dev_unlink(slave_dev, failover_dev);
+ slave_dev->priv_flags &= ~IFF_FAILOVER_SLAVE;
+err_upper_link:
+ netdev_rx_handler_unregister(slave_dev);
+done:
+ return NOTIFY_DONE;
+}
+
+/**
+ * failover_slave_unregister - Unregister a slave netdev
+ *
+ * @slave_dev: slave netdev that is being unregistered
+ *
+ * Unregisters a slave device from a failover instance.
+ */
+int failover_slave_unregister(struct net_device *slave_dev)
+{
+ struct net_device *failover_dev;
+ struct failover_ops *fops;
+
+ if (!netif_is_failover_slave(slave_dev))
+ goto done;
+
+ ASSERT_RTNL();
+
+ failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops);
+ if (!failover_dev)
+ goto done;
+
+ if (fops && fops->slave_pre_unregister &&
+ fops->slave_pre_unregister(slave_dev, failover_dev))
+ goto done;
+
+ netdev_rx_handler_unregister(slave_dev);
+ netdev_upper_dev_unlink(slave_dev, failover_dev);
+ slave_dev->priv_flags &= ~IFF_FAILOVER_SLAVE;
+
+ if (fops && fops->slave_unregister &&
+ !fops->slave_unregister(slave_dev, failover_dev))
+ return NOTIFY_OK;
+
+done:
+ return NOTIFY_DONE;
+}
+EXPORT_SYMBOL_GPL(failover_slave_unregister);
+
+static int failover_slave_link_change(struct net_device *slave_dev)
+{
+ struct net_device *failover_dev;
+ struct failover_ops *fops;
+
+ if (!netif_is_failover_slave(slave_dev))
+ goto done;
+
+ ASSERT_RTNL();
+
+ failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops);
+ if (!failover_dev)
+ goto done;
+
+ if (!netif_running(failover_dev))
+ goto done;
+
+ if (fops && fops->slave_link_change &&
+ !fops->slave_link_change(slave_dev, failover_dev))
+ return NOTIFY_OK;
+
+done:
+ return NOTIFY_DONE;
+}
+
+static int failover_slave_name_change(struct net_device *slave_dev)
+{
+ struct net_device *failover_dev;
+ struct failover_ops *fops;
+
+ if (!netif_is_failover_slave(slave_dev))
+ goto done;
+
+ ASSERT_RTNL();
+
+ failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops);
+ if (!failover_dev)
+ goto done;
+
+ if (!netif_running(failover_dev))
+ goto done;
+
+ if (fops && fops->slave_name_change &&
+ !fops->slave_name_change(slave_dev, failover_dev))
+ return NOTIFY_OK;
+
+done:
+ return NOTIFY_DONE;
+}
+
+static int
+failover_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+ struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
+
+ /* Skip parent events */
+ if (netif_is_failover(event_dev))
+ return NOTIFY_DONE;
+
+ switch (event) {
+ case NETDEV_REGISTER:
+ return failover_slave_register(event_dev);
+ case NETDEV_UNREGISTER:
+ return failover_slave_unregister(event_dev);
+ case NETDEV_UP:
+ case NETDEV_DOWN:
+ case NETDEV_CHANGE:
+ return failover_slave_link_change(event_dev);
+ case NETDEV_CHANGENAME:
+ return failover_slave_name_change(event_dev);
+ default:
+ return NOTIFY_DONE;
+ }
+}
+
+static struct notifier_block failover_notifier = {
+ .notifier_call = failover_event,
+};
+
+static void
+failover_existing_slave_register(struct net_device *failover_dev)
+{
+ struct net *net = dev_net(failover_dev);
+ struct net_device *dev;
+
+ rtnl_lock();
+ for_each_netdev(net, dev) {
+ if (netif_is_failover(dev))
+ continue;
+ if (ether_addr_equal(failover_dev->perm_addr, dev->perm_addr))
+ failover_slave_register(dev);
+ }
+ rtnl_unlock();
+}
+
+/**
+ * failover_register - Register a failover instance
+ *
+ * @dev: failover netdev
+ * @ops: failover ops
+ *
+ * Allocate and register a failover instance for a failover netdev. ops
+ * provides handlers for slave device register/unregister/link change/
+ * name change events.
+ *
+ * Return: pointer to failover instance
+ */
+struct failover *failover_register(struct net_device *dev,
+ struct failover_ops *ops)
+{
+ struct failover *failover;
+
+ if (dev->type != ARPHRD_ETHER)
+ return ERR_PTR(-EINVAL);
+
+ failover = kzalloc(sizeof(*failover), GFP_KERNEL);
+ if (!failover)
+ return ERR_PTR(-ENOMEM);
+
+ rcu_assign_pointer(failover->ops, ops);
+ dev_hold(dev);
+ dev->priv_flags |= IFF_FAILOVER;
+ rcu_assign_pointer(failover->failover_dev, dev);
+
+ spin_lock(&failover_lock);
+ list_add_tail(&failover->list, &failover_list);
+ spin_unlock(&failover_lock);
+
+ netdev_info(dev, "failover master:%s registered\n", dev->name);
+
+ failover_existing_slave_register(dev);
+
+ return failover;
+}
+EXPORT_SYMBOL_GPL(failover_register);
+
+/**
+ * failover_unregister - Unregister a failover instance
+ *
+ * @failover: pointer to failover instance
+ *
+ * Unregisters and frees a failover instance.
+ */
+void failover_unregister(struct failover *failover)
+{
+ struct net_device *failover_dev;
+
+ failover_dev = rcu_dereference(failover->failover_dev);
+
+ netdev_info(failover_dev, "failover master:%s unregistered\n",
+ failover_dev->name);
+
+ failover_dev->priv_flags &= ~IFF_FAILOVER;
+ dev_put(failover_dev);
+
+ spin_lock(&failover_lock);
+ list_del(&failover->list);
+ spin_unlock(&failover_lock);
+
+ kfree(failover);
+}
+EXPORT_SYMBOL_GPL(failover_unregister);
+
+static __init int
+failover_init(void)
+{
+ register_netdevice_notifier(&failover_notifier);
+
+ return 0;
+}
+module_init(failover_init);
+
+static __exit
+void failover_exit(void)
+{
+ unregister_netdevice_notifier(&failover_notifier);
+}
+module_exit(failover_exit);
+
+MODULE_DESCRIPTION("Generic failover infrastructure/interface");
+MODULE_LICENSE("GPL v2");
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 33958f84c1736..126ffc5bc630c 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -387,247 +387,304 @@ unsigned int fib_rules_seq_read(struct net *net, int family)
}
EXPORT_SYMBOL_GPL(fib_rules_seq_read);
-static int validate_rulemsg(struct fib_rule_hdr *frh, struct nlattr **tb,
- struct fib_rules_ops *ops)
-{
- int err = -EINVAL;
-
- if (frh->src_len)
- if (tb[FRA_SRC] == NULL ||
- frh->src_len > (ops->addr_size * 8) ||
- nla_len(tb[FRA_SRC]) != ops->addr_size)
- goto errout;
-
- if (frh->dst_len)
- if (tb[FRA_DST] == NULL ||
- frh->dst_len > (ops->addr_size * 8) ||
- nla_len(tb[FRA_DST]) != ops->addr_size)
- goto errout;
-
- err = 0;
-errout:
- return err;
-}
-
-static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh,
- struct nlattr **tb, struct fib_rule *rule)
+static struct fib_rule *rule_find(struct fib_rules_ops *ops,
+ struct fib_rule_hdr *frh,
+ struct nlattr **tb,
+ struct fib_rule *rule,
+ bool user_priority)
{
struct fib_rule *r;
list_for_each_entry(r, &ops->rules_list, list) {
- if (r->action != rule->action)
+ if (rule->action && r->action != rule->action)
continue;
- if (r->table != rule->table)
+ if (rule->table && r->table != rule->table)
continue;
- if (r->pref != rule->pref)
+ if (user_priority && r->pref != rule->pref)
continue;
- if (memcmp(r->iifname, rule->iifname, IFNAMSIZ))
+ if (rule->iifname[0] &&
+ memcmp(r->iifname, rule->iifname, IFNAMSIZ))
continue;
- if (memcmp(r->oifname, rule->oifname, IFNAMSIZ))
+ if (rule->oifname[0] &&
+ memcmp(r->oifname, rule->oifname, IFNAMSIZ))
continue;
- if (r->mark != rule->mark)
+ if (rule->mark && r->mark != rule->mark)
continue;
- if (r->mark_mask != rule->mark_mask)
+ if (rule->mark_mask && r->mark_mask != rule->mark_mask)
continue;
- if (r->tun_id != rule->tun_id)
+ if (rule->tun_id && r->tun_id != rule->tun_id)
continue;
if (r->fr_net != rule->fr_net)
continue;
- if (r->l3mdev != rule->l3mdev)
+ if (rule->l3mdev && r->l3mdev != rule->l3mdev)
continue;
- if (!uid_eq(r->uid_range.start, rule->uid_range.start) ||
- !uid_eq(r->uid_range.end, rule->uid_range.end))
+ if (uid_range_set(&rule->uid_range) &&
+ (!uid_eq(r->uid_range.start, rule->uid_range.start) ||
+ !uid_eq(r->uid_range.end, rule->uid_range.end)))
continue;
- if (r->ip_proto != rule->ip_proto)
+ if (rule->ip_proto && r->ip_proto != rule->ip_proto)
continue;
- if (!fib_rule_port_range_compare(&r->sport_range,
+ if (fib_rule_port_range_set(&rule->sport_range) &&
+ !fib_rule_port_range_compare(&r->sport_range,
&rule->sport_range))
continue;
- if (!fib_rule_port_range_compare(&r->dport_range,
+ if (fib_rule_port_range_set(&rule->dport_range) &&
+ !fib_rule_port_range_compare(&r->dport_range,
&rule->dport_range))
continue;
if (!ops->compare(r, frh, tb))
continue;
- return 1;
+ return r;
+ }
+
+ return NULL;
+}
+
+#ifdef CONFIG_NET_L3_MASTER_DEV
+static int fib_nl2rule_l3mdev(struct nlattr *nla, struct fib_rule *nlrule,
+ struct netlink_ext_ack *extack)
+{
+ nlrule->l3mdev = nla_get_u8(nla);
+ if (nlrule->l3mdev != 1) {
+ NL_SET_ERR_MSG(extack, "Invalid l3mdev attribute");
+ return -1;
}
+
return 0;
}
+#else
+static int fib_nl2rule_l3mdev(struct nlattr *nla, struct fib_rule *nlrule,
+ struct netlink_ext_ack *extack)
+{
+ NL_SET_ERR_MSG(extack, "l3mdev support is not enabled in kernel");
+ return -1;
+}
+#endif
-int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
- struct netlink_ext_ack *extack)
+static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack,
+ struct fib_rules_ops *ops,
+ struct nlattr *tb[],
+ struct fib_rule **rule,
+ bool *user_priority)
{
struct net *net = sock_net(skb->sk);
struct fib_rule_hdr *frh = nlmsg_data(nlh);
- struct fib_rules_ops *ops = NULL;
- struct fib_rule *rule, *r, *last = NULL;
- struct nlattr *tb[FRA_MAX+1];
- int err = -EINVAL, unresolved = 0;
-
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh)))
- goto errout;
+ struct fib_rule *nlrule = NULL;
+ int err = -EINVAL;
- ops = lookup_rules_ops(net, frh->family);
- if (ops == NULL) {
- err = -EAFNOSUPPORT;
- goto errout;
+ if (frh->src_len)
+ if (!tb[FRA_SRC] ||
+ frh->src_len > (ops->addr_size * 8) ||
+ nla_len(tb[FRA_SRC]) != ops->addr_size) {
+ NL_SET_ERR_MSG(extack, "Invalid source address");
+ goto errout;
}
- err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy, extack);
- if (err < 0)
- goto errout;
-
- err = validate_rulemsg(frh, tb, ops);
- if (err < 0)
- goto errout;
+ if (frh->dst_len)
+ if (!tb[FRA_DST] ||
+ frh->dst_len > (ops->addr_size * 8) ||
+ nla_len(tb[FRA_DST]) != ops->addr_size) {
+ NL_SET_ERR_MSG(extack, "Invalid dst address");
+ goto errout;
+ }
- rule = kzalloc(ops->rule_size, GFP_KERNEL);
- if (rule == NULL) {
+ nlrule = kzalloc(ops->rule_size, GFP_KERNEL);
+ if (!nlrule) {
err = -ENOMEM;
goto errout;
}
- refcount_set(&rule->refcnt, 1);
- rule->fr_net = net;
+ refcount_set(&nlrule->refcnt, 1);
+ nlrule->fr_net = net;
- rule->pref = tb[FRA_PRIORITY] ? nla_get_u32(tb[FRA_PRIORITY])
- : fib_default_rule_pref(ops);
+ if (tb[FRA_PRIORITY]) {
+ nlrule->pref = nla_get_u32(tb[FRA_PRIORITY]);
+ *user_priority = true;
+ } else {
+ nlrule->pref = fib_default_rule_pref(ops);
+ }
- rule->proto = tb[FRA_PROTOCOL] ?
+ nlrule->proto = tb[FRA_PROTOCOL] ?
nla_get_u8(tb[FRA_PROTOCOL]) : RTPROT_UNSPEC;
if (tb[FRA_IIFNAME]) {
struct net_device *dev;
- rule->iifindex = -1;
- nla_strlcpy(rule->iifname, tb[FRA_IIFNAME], IFNAMSIZ);
- dev = __dev_get_by_name(net, rule->iifname);
+ nlrule->iifindex = -1;
+ nla_strlcpy(nlrule->iifname, tb[FRA_IIFNAME], IFNAMSIZ);
+ dev = __dev_get_by_name(net, nlrule->iifname);
if (dev)
- rule->iifindex = dev->ifindex;
+ nlrule->iifindex = dev->ifindex;
}
if (tb[FRA_OIFNAME]) {
struct net_device *dev;
- rule->oifindex = -1;
- nla_strlcpy(rule->oifname, tb[FRA_OIFNAME], IFNAMSIZ);
- dev = __dev_get_by_name(net, rule->oifname);
+ nlrule->oifindex = -1;
+ nla_strlcpy(nlrule->oifname, tb[FRA_OIFNAME], IFNAMSIZ);
+ dev = __dev_get_by_name(net, nlrule->oifname);
if (dev)
- rule->oifindex = dev->ifindex;
+ nlrule->oifindex = dev->ifindex;
}
if (tb[FRA_FWMARK]) {
- rule->mark = nla_get_u32(tb[FRA_FWMARK]);
- if (rule->mark)
+ nlrule->mark = nla_get_u32(tb[FRA_FWMARK]);
+ if (nlrule->mark)
/* compatibility: if the mark value is non-zero all bits
* are compared unless a mask is explicitly specified.
*/
- rule->mark_mask = 0xFFFFFFFF;
+ nlrule->mark_mask = 0xFFFFFFFF;
}
if (tb[FRA_FWMASK])
- rule->mark_mask = nla_get_u32(tb[FRA_FWMASK]);
+ nlrule->mark_mask = nla_get_u32(tb[FRA_FWMASK]);
if (tb[FRA_TUN_ID])
- rule->tun_id = nla_get_be64(tb[FRA_TUN_ID]);
+ nlrule->tun_id = nla_get_be64(tb[FRA_TUN_ID]);
err = -EINVAL;
- if (tb[FRA_L3MDEV]) {
-#ifdef CONFIG_NET_L3_MASTER_DEV
- rule->l3mdev = nla_get_u8(tb[FRA_L3MDEV]);
- if (rule->l3mdev != 1)
-#endif
- goto errout_free;
- }
+ if (tb[FRA_L3MDEV] &&
+ fib_nl2rule_l3mdev(tb[FRA_L3MDEV], nlrule, extack) < 0)
+ goto errout_free;
- rule->action = frh->action;
- rule->flags = frh->flags;
- rule->table = frh_get_table(frh, tb);
+ nlrule->action = frh->action;
+ nlrule->flags = frh->flags;
+ nlrule->table = frh_get_table(frh, tb);
if (tb[FRA_SUPPRESS_PREFIXLEN])
- rule->suppress_prefixlen = nla_get_u32(tb[FRA_SUPPRESS_PREFIXLEN]);
+ nlrule->suppress_prefixlen = nla_get_u32(tb[FRA_SUPPRESS_PREFIXLEN]);
else
- rule->suppress_prefixlen = -1;
+ nlrule->suppress_prefixlen = -1;
if (tb[FRA_SUPPRESS_IFGROUP])
- rule->suppress_ifgroup = nla_get_u32(tb[FRA_SUPPRESS_IFGROUP]);
+ nlrule->suppress_ifgroup = nla_get_u32(tb[FRA_SUPPRESS_IFGROUP]);
else
- rule->suppress_ifgroup = -1;
+ nlrule->suppress_ifgroup = -1;
if (tb[FRA_GOTO]) {
- if (rule->action != FR_ACT_GOTO)
+ if (nlrule->action != FR_ACT_GOTO) {
+ NL_SET_ERR_MSG(extack, "Unexpected goto");
goto errout_free;
+ }
- rule->target = nla_get_u32(tb[FRA_GOTO]);
+ nlrule->target = nla_get_u32(tb[FRA_GOTO]);
/* Backward jumps are prohibited to avoid endless loops */
- if (rule->target <= rule->pref)
+ if (nlrule->target <= nlrule->pref) {
+ NL_SET_ERR_MSG(extack, "Backward goto not supported");
goto errout_free;
-
- list_for_each_entry(r, &ops->rules_list, list) {
- if (r->pref == rule->target) {
- RCU_INIT_POINTER(rule->ctarget, r);
- break;
- }
}
-
- if (rcu_dereference_protected(rule->ctarget, 1) == NULL)
- unresolved = 1;
- } else if (rule->action == FR_ACT_GOTO)
+ } else if (nlrule->action == FR_ACT_GOTO) {
+ NL_SET_ERR_MSG(extack, "Missing goto target for action goto");
goto errout_free;
+ }
- if (rule->l3mdev && rule->table)
+ if (nlrule->l3mdev && nlrule->table) {
+ NL_SET_ERR_MSG(extack, "l3mdev and table are mutually exclusive");
goto errout_free;
+ }
if (tb[FRA_UID_RANGE]) {
if (current_user_ns() != net->user_ns) {
err = -EPERM;
+ NL_SET_ERR_MSG(extack, "No permission to set uid");
goto errout_free;
}
- rule->uid_range = nla_get_kuid_range(tb);
+ nlrule->uid_range = nla_get_kuid_range(tb);
- if (!uid_range_set(&rule->uid_range) ||
- !uid_lte(rule->uid_range.start, rule->uid_range.end))
+ if (!uid_range_set(&nlrule->uid_range) ||
+ !uid_lte(nlrule->uid_range.start, nlrule->uid_range.end)) {
+ NL_SET_ERR_MSG(extack, "Invalid uid range");
goto errout_free;
+ }
} else {
- rule->uid_range = fib_kuid_range_unset;
+ nlrule->uid_range = fib_kuid_range_unset;
}
if (tb[FRA_IP_PROTO])
- rule->ip_proto = nla_get_u8(tb[FRA_IP_PROTO]);
+ nlrule->ip_proto = nla_get_u8(tb[FRA_IP_PROTO]);
if (tb[FRA_SPORT_RANGE]) {
err = nla_get_port_range(tb[FRA_SPORT_RANGE],
- &rule->sport_range);
- if (err)
+ &nlrule->sport_range);
+ if (err) {
+ NL_SET_ERR_MSG(extack, "Invalid sport range");
goto errout_free;
+ }
}
if (tb[FRA_DPORT_RANGE]) {
err = nla_get_port_range(tb[FRA_DPORT_RANGE],
- &rule->dport_range);
- if (err)
+ &nlrule->dport_range);
+ if (err) {
+ NL_SET_ERR_MSG(extack, "Invalid dport range");
goto errout_free;
+ }
}
+ *rule = nlrule;
+
+ return 0;
+
+errout_free:
+ kfree(nlrule);
+errout:
+ return err;
+}
+
+int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(skb->sk);
+ struct fib_rule_hdr *frh = nlmsg_data(nlh);
+ struct fib_rules_ops *ops = NULL;
+ struct fib_rule *rule = NULL, *r, *last = NULL;
+ struct nlattr *tb[FRA_MAX + 1];
+ int err = -EINVAL, unresolved = 0;
+ bool user_priority = false;
+
+ if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) {
+ NL_SET_ERR_MSG(extack, "Invalid msg length");
+ goto errout;
+ }
+
+ ops = lookup_rules_ops(net, frh->family);
+ if (!ops) {
+ err = -EAFNOSUPPORT;
+ NL_SET_ERR_MSG(extack, "Rule family not supported");
+ goto errout;
+ }
+
+ err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy, extack);
+ if (err < 0) {
+ NL_SET_ERR_MSG(extack, "Error parsing msg");
+ goto errout;
+ }
+
+ err = fib_nl2rule(skb, nlh, extack, ops, tb, &rule, &user_priority);
+ if (err)
+ goto errout;
+
if ((nlh->nlmsg_flags & NLM_F_EXCL) &&
- rule_exists(ops, frh, tb, rule)) {
+ rule_find(ops, frh, tb, rule, user_priority)) {
err = -EEXIST;
goto errout_free;
}
- err = ops->configure(rule, skb, frh, tb);
+ err = ops->configure(rule, skb, frh, tb, extack);
if (err < 0)
goto errout_free;
@@ -637,6 +694,16 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
goto errout_free;
list_for_each_entry(r, &ops->rules_list, list) {
+ if (r->pref == rule->target) {
+ RCU_INIT_POINTER(rule->ctarget, r);
+ break;
+ }
+ }
+
+ if (rcu_dereference_protected(rule->ctarget, 1) == NULL)
+ unresolved = 1;
+
+ list_for_each_entry(r, &ops->rules_list, list) {
if (r->pref > rule->pref)
break;
last = r;
@@ -690,171 +757,97 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh,
{
struct net *net = sock_net(skb->sk);
struct fib_rule_hdr *frh = nlmsg_data(nlh);
- struct fib_rule_port_range sprange = {0, 0};
- struct fib_rule_port_range dprange = {0, 0};
struct fib_rules_ops *ops = NULL;
- struct fib_rule *rule, *r;
+ struct fib_rule *rule = NULL, *r, *nlrule = NULL;
struct nlattr *tb[FRA_MAX+1];
- struct fib_kuid_range range;
int err = -EINVAL;
+ bool user_priority = false;
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh)))
+ if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) {
+ NL_SET_ERR_MSG(extack, "Invalid msg length");
goto errout;
+ }
ops = lookup_rules_ops(net, frh->family);
if (ops == NULL) {
err = -EAFNOSUPPORT;
+ NL_SET_ERR_MSG(extack, "Rule family not supported");
goto errout;
}
err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy, extack);
- if (err < 0)
+ if (err < 0) {
+ NL_SET_ERR_MSG(extack, "Error parsing msg");
goto errout;
+ }
- err = validate_rulemsg(frh, tb, ops);
- if (err < 0)
+ err = fib_nl2rule(skb, nlh, extack, ops, tb, &nlrule, &user_priority);
+ if (err)
goto errout;
- if (tb[FRA_UID_RANGE]) {
- range = nla_get_kuid_range(tb);
- if (!uid_range_set(&range)) {
- err = -EINVAL;
- goto errout;
- }
- } else {
- range = fib_kuid_range_unset;
+ rule = rule_find(ops, frh, tb, nlrule, user_priority);
+ if (!rule) {
+ err = -ENOENT;
+ goto errout;
}
- if (tb[FRA_SPORT_RANGE]) {
- err = nla_get_port_range(tb[FRA_SPORT_RANGE],
- &sprange);
- if (err)
- goto errout;
+ if (rule->flags & FIB_RULE_PERMANENT) {
+ err = -EPERM;
+ goto errout;
}
- if (tb[FRA_DPORT_RANGE]) {
- err = nla_get_port_range(tb[FRA_DPORT_RANGE],
- &dprange);
+ if (ops->delete) {
+ err = ops->delete(rule);
if (err)
goto errout;
}
- list_for_each_entry(rule, &ops->rules_list, list) {
- if (tb[FRA_PROTOCOL] &&
- (rule->proto != nla_get_u8(tb[FRA_PROTOCOL])))
- continue;
-
- if (frh->action && (frh->action != rule->action))
- continue;
-
- if (frh_get_table(frh, tb) &&
- (frh_get_table(frh, tb) != rule->table))
- continue;
-
- if (tb[FRA_PRIORITY] &&
- (rule->pref != nla_get_u32(tb[FRA_PRIORITY])))
- continue;
-
- if (tb[FRA_IIFNAME] &&
- nla_strcmp(tb[FRA_IIFNAME], rule->iifname))
- continue;
-
- if (tb[FRA_OIFNAME] &&
- nla_strcmp(tb[FRA_OIFNAME], rule->oifname))
- continue;
-
- if (tb[FRA_FWMARK] &&
- (rule->mark != nla_get_u32(tb[FRA_FWMARK])))
- continue;
-
- if (tb[FRA_FWMASK] &&
- (rule->mark_mask != nla_get_u32(tb[FRA_FWMASK])))
- continue;
-
- if (tb[FRA_TUN_ID] &&
- (rule->tun_id != nla_get_be64(tb[FRA_TUN_ID])))
- continue;
-
- if (tb[FRA_L3MDEV] &&
- (rule->l3mdev != nla_get_u8(tb[FRA_L3MDEV])))
- continue;
-
- if (uid_range_set(&range) &&
- (!uid_eq(rule->uid_range.start, range.start) ||
- !uid_eq(rule->uid_range.end, range.end)))
- continue;
-
- if (tb[FRA_IP_PROTO] &&
- (rule->ip_proto != nla_get_u8(tb[FRA_IP_PROTO])))
- continue;
-
- if (fib_rule_port_range_set(&sprange) &&
- !fib_rule_port_range_compare(&rule->sport_range, &sprange))
- continue;
-
- if (fib_rule_port_range_set(&dprange) &&
- !fib_rule_port_range_compare(&rule->dport_range, &dprange))
- continue;
-
- if (!ops->compare(rule, frh, tb))
- continue;
-
- if (rule->flags & FIB_RULE_PERMANENT) {
- err = -EPERM;
- goto errout;
- }
-
- if (ops->delete) {
- err = ops->delete(rule);
- if (err)
- goto errout;
- }
+ if (rule->tun_id)
+ ip_tunnel_unneed_metadata();
- if (rule->tun_id)
- ip_tunnel_unneed_metadata();
+ list_del_rcu(&rule->list);
- list_del_rcu(&rule->list);
-
- if (rule->action == FR_ACT_GOTO) {
- ops->nr_goto_rules--;
- if (rtnl_dereference(rule->ctarget) == NULL)
- ops->unresolved_rules--;
- }
+ if (rule->action == FR_ACT_GOTO) {
+ ops->nr_goto_rules--;
+ if (rtnl_dereference(rule->ctarget) == NULL)
+ ops->unresolved_rules--;
+ }
- /*
- * Check if this rule is a target to any of them. If so,
- * adjust to the next one with the same preference or
- * disable them. As this operation is eventually very
- * expensive, it is only performed if goto rules, except
- * current if it is goto rule, have actually been added.
- */
- if (ops->nr_goto_rules > 0) {
- struct fib_rule *n;
-
- n = list_next_entry(rule, list);
- if (&n->list == &ops->rules_list || n->pref != rule->pref)
- n = NULL;
- list_for_each_entry(r, &ops->rules_list, list) {
- if (rtnl_dereference(r->ctarget) != rule)
- continue;
- rcu_assign_pointer(r->ctarget, n);
- if (!n)
- ops->unresolved_rules++;
- }
+ /*
+ * Check if this rule is a target to any of them. If so,
+ * adjust to the next one with the same preference or
+ * disable them. As this operation is eventually very
+ * expensive, it is only performed if goto rules, except
+ * current if it is goto rule, have actually been added.
+ */
+ if (ops->nr_goto_rules > 0) {
+ struct fib_rule *n;
+
+ n = list_next_entry(rule, list);
+ if (&n->list == &ops->rules_list || n->pref != rule->pref)
+ n = NULL;
+ list_for_each_entry(r, &ops->rules_list, list) {
+ if (rtnl_dereference(r->ctarget) != rule)
+ continue;
+ rcu_assign_pointer(r->ctarget, n);
+ if (!n)
+ ops->unresolved_rules++;
}
-
- call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule, ops,
- NULL);
- notify_rule_change(RTM_DELRULE, rule, ops, nlh,
- NETLINK_CB(skb).portid);
- fib_rule_put(rule);
- flush_route_cache(ops);
- rules_ops_put(ops);
- return 0;
}
- err = -ENOENT;
+ call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule, ops,
+ NULL);
+ notify_rule_change(RTM_DELRULE, rule, ops, nlh,
+ NETLINK_CB(skb).portid);
+ fib_rule_put(rule);
+ flush_route_cache(ops);
+ rules_ops_put(ops);
+ kfree(nlrule);
+ return 0;
+
errout:
+ if (nlrule)
+ kfree(nlrule);
rules_ops_put(ops);
return err;
}
diff --git a/net/core/filter.c b/net/core/filter.c
index 201ff36b17a8c..3d9ba7e5965ad 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -57,7 +57,17 @@
#include <net/sock_reuseport.h>
#include <net/busy_poll.h>
#include <net/tcp.h>
+#include <net/xfrm.h>
#include <linux/bpf_trace.h>
+#include <net/xdp_sock.h>
+#include <linux/inetdevice.h>
+#include <net/ip_fib.h>
+#include <net/flow.h>
+#include <net/arp.h>
+#include <net/ipv6.h>
+#include <linux/seg6_local.h>
+#include <net/seg6.h>
+#include <net/seg6_local.h>
/**
* sk_filter_trim_cap - run a packet through a socket filter
@@ -111,12 +121,12 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap)
}
EXPORT_SYMBOL(sk_filter_trim_cap);
-BPF_CALL_1(__skb_get_pay_offset, struct sk_buff *, skb)
+BPF_CALL_1(bpf_skb_get_pay_offset, struct sk_buff *, skb)
{
return skb_get_poff(skb);
}
-BPF_CALL_3(__skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x)
+BPF_CALL_3(bpf_skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x)
{
struct nlattr *nla;
@@ -136,7 +146,7 @@ BPF_CALL_3(__skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x)
return 0;
}
-BPF_CALL_3(__skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x)
+BPF_CALL_3(bpf_skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x)
{
struct nlattr *nla;
@@ -160,13 +170,94 @@ BPF_CALL_3(__skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x)
return 0;
}
-BPF_CALL_0(__get_raw_cpu_id)
+BPF_CALL_4(bpf_skb_load_helper_8, const struct sk_buff *, skb, const void *,
+ data, int, headlen, int, offset)
+{
+ u8 tmp, *ptr;
+ const int len = sizeof(tmp);
+
+ if (offset >= 0) {
+ if (headlen - offset >= len)
+ return *(u8 *)(data + offset);
+ if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp)))
+ return tmp;
+ } else {
+ ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len);
+ if (likely(ptr))
+ return *(u8 *)ptr;
+ }
+
+ return -EFAULT;
+}
+
+BPF_CALL_2(bpf_skb_load_helper_8_no_cache, const struct sk_buff *, skb,
+ int, offset)
+{
+ return ____bpf_skb_load_helper_8(skb, skb->data, skb->len - skb->data_len,
+ offset);
+}
+
+BPF_CALL_4(bpf_skb_load_helper_16, const struct sk_buff *, skb, const void *,
+ data, int, headlen, int, offset)
+{
+ u16 tmp, *ptr;
+ const int len = sizeof(tmp);
+
+ if (offset >= 0) {
+ if (headlen - offset >= len)
+ return get_unaligned_be16(data + offset);
+ if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp)))
+ return be16_to_cpu(tmp);
+ } else {
+ ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len);
+ if (likely(ptr))
+ return get_unaligned_be16(ptr);
+ }
+
+ return -EFAULT;
+}
+
+BPF_CALL_2(bpf_skb_load_helper_16_no_cache, const struct sk_buff *, skb,
+ int, offset)
+{
+ return ____bpf_skb_load_helper_16(skb, skb->data, skb->len - skb->data_len,
+ offset);
+}
+
+BPF_CALL_4(bpf_skb_load_helper_32, const struct sk_buff *, skb, const void *,
+ data, int, headlen, int, offset)
+{
+ u32 tmp, *ptr;
+ const int len = sizeof(tmp);
+
+ if (likely(offset >= 0)) {
+ if (headlen - offset >= len)
+ return get_unaligned_be32(data + offset);
+ if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp)))
+ return be32_to_cpu(tmp);
+ } else {
+ ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len);
+ if (likely(ptr))
+ return get_unaligned_be32(ptr);
+ }
+
+ return -EFAULT;
+}
+
+BPF_CALL_2(bpf_skb_load_helper_32_no_cache, const struct sk_buff *, skb,
+ int, offset)
+{
+ return ____bpf_skb_load_helper_32(skb, skb->data, skb->len - skb->data_len,
+ offset);
+}
+
+BPF_CALL_0(bpf_get_raw_cpu_id)
{
return raw_smp_processor_id();
}
static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = {
- .func = __get_raw_cpu_id,
+ .func = bpf_get_raw_cpu_id,
.gpl_only = false,
.ret_type = RET_INTEGER,
};
@@ -316,16 +407,16 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
/* Emit call(arg1=CTX, arg2=A, arg3=X) */
switch (fp->k) {
case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
- *insn = BPF_EMIT_CALL(__skb_get_pay_offset);
+ *insn = BPF_EMIT_CALL(bpf_skb_get_pay_offset);
break;
case SKF_AD_OFF + SKF_AD_NLATTR:
- *insn = BPF_EMIT_CALL(__skb_get_nlattr);
+ *insn = BPF_EMIT_CALL(bpf_skb_get_nlattr);
break;
case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
- *insn = BPF_EMIT_CALL(__skb_get_nlattr_nest);
+ *insn = BPF_EMIT_CALL(bpf_skb_get_nlattr_nest);
break;
case SKF_AD_OFF + SKF_AD_CPU:
- *insn = BPF_EMIT_CALL(__get_raw_cpu_id);
+ *insn = BPF_EMIT_CALL(bpf_get_raw_cpu_id);
break;
case SKF_AD_OFF + SKF_AD_RANDOM:
*insn = BPF_EMIT_CALL(bpf_user_rnd_u32);
@@ -352,26 +443,87 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
return true;
}
+static bool convert_bpf_ld_abs(struct sock_filter *fp, struct bpf_insn **insnp)
+{
+ const bool unaligned_ok = IS_BUILTIN(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS);
+ int size = bpf_size_to_bytes(BPF_SIZE(fp->code));
+ bool endian = BPF_SIZE(fp->code) == BPF_H ||
+ BPF_SIZE(fp->code) == BPF_W;
+ bool indirect = BPF_MODE(fp->code) == BPF_IND;
+ const int ip_align = NET_IP_ALIGN;
+ struct bpf_insn *insn = *insnp;
+ int offset = fp->k;
+
+ if (!indirect &&
+ ((unaligned_ok && offset >= 0) ||
+ (!unaligned_ok && offset >= 0 &&
+ offset + ip_align >= 0 &&
+ offset + ip_align % size == 0))) {
+ *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_H);
+ *insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset);
+ *insn++ = BPF_JMP_IMM(BPF_JSLT, BPF_REG_TMP, size, 2 + endian);
+ *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A, BPF_REG_D,
+ offset);
+ if (endian)
+ *insn++ = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, size * 8);
+ *insn++ = BPF_JMP_A(8);
+ }
+
+ *insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX);
+ *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_D);
+ *insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_H);
+ if (!indirect) {
+ *insn++ = BPF_MOV64_IMM(BPF_REG_ARG4, offset);
+ } else {
+ *insn++ = BPF_MOV64_REG(BPF_REG_ARG4, BPF_REG_X);
+ if (fp->k)
+ *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG4, offset);
+ }
+
+ switch (BPF_SIZE(fp->code)) {
+ case BPF_B:
+ *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8);
+ break;
+ case BPF_H:
+ *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16);
+ break;
+ case BPF_W:
+ *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32);
+ break;
+ default:
+ return false;
+ }
+
+ *insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_A, 0, 2);
+ *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A);
+ *insn = BPF_EXIT_INSN();
+
+ *insnp = insn;
+ return true;
+}
+
/**
* bpf_convert_filter - convert filter program
* @prog: the user passed filter program
* @len: the length of the user passed filter program
* @new_prog: allocated 'struct bpf_prog' or NULL
* @new_len: pointer to store length of converted program
+ * @seen_ld_abs: bool whether we've seen ld_abs/ind
*
* Remap 'sock_filter' style classic BPF (cBPF) instruction set to 'bpf_insn'
* style extended BPF (eBPF).
* Conversion workflow:
*
* 1) First pass for calculating the new program length:
- * bpf_convert_filter(old_prog, old_len, NULL, &new_len)
+ * bpf_convert_filter(old_prog, old_len, NULL, &new_len, &seen_ld_abs)
*
* 2) 2nd pass to remap in two passes: 1st pass finds new
* jump offsets, 2nd pass remapping:
- * bpf_convert_filter(old_prog, old_len, new_prog, &new_len);
+ * bpf_convert_filter(old_prog, old_len, new_prog, &new_len, &seen_ld_abs)
*/
static int bpf_convert_filter(struct sock_filter *prog, int len,
- struct bpf_prog *new_prog, int *new_len)
+ struct bpf_prog *new_prog, int *new_len,
+ bool *seen_ld_abs)
{
int new_flen = 0, pass = 0, target, i, stack_off;
struct bpf_insn *new_insn, *first_insn = NULL;
@@ -410,12 +562,27 @@ do_pass:
* do this ourself. Initial CTX is present in BPF_REG_ARG1.
*/
*new_insn++ = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1);
+ if (*seen_ld_abs) {
+ /* For packet access in classic BPF, cache skb->data
+ * in callee-saved BPF R8 and skb->len - skb->data_len
+ * (headlen) in BPF R9. Since classic BPF is read-only
+ * on CTX, we only need to cache it once.
+ */
+ *new_insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
+ BPF_REG_D, BPF_REG_CTX,
+ offsetof(struct sk_buff, data));
+ *new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_H, BPF_REG_CTX,
+ offsetof(struct sk_buff, len));
+ *new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_TMP, BPF_REG_CTX,
+ offsetof(struct sk_buff, data_len));
+ *new_insn++ = BPF_ALU32_REG(BPF_SUB, BPF_REG_H, BPF_REG_TMP);
+ }
} else {
new_insn += 3;
}
for (i = 0; i < len; fp++, i++) {
- struct bpf_insn tmp_insns[6] = { };
+ struct bpf_insn tmp_insns[32] = { };
struct bpf_insn *insn = tmp_insns;
if (addrs)
@@ -458,6 +625,11 @@ do_pass:
BPF_MODE(fp->code) == BPF_ABS &&
convert_bpf_extensions(fp, &insn))
break;
+ if (BPF_CLASS(fp->code) == BPF_LD &&
+ convert_bpf_ld_abs(fp, &insn)) {
+ *seen_ld_abs = true;
+ break;
+ }
if (fp->code == (BPF_ALU | BPF_DIV | BPF_X) ||
fp->code == (BPF_ALU | BPF_MOD | BPF_X)) {
@@ -567,21 +739,31 @@ jmp_rest:
break;
/* ldxb 4 * ([14] & 0xf) is remaped into 6 insns. */
- case BPF_LDX | BPF_MSH | BPF_B:
- /* tmp = A */
- *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_A);
+ case BPF_LDX | BPF_MSH | BPF_B: {
+ struct sock_filter tmp = {
+ .code = BPF_LD | BPF_ABS | BPF_B,
+ .k = fp->k,
+ };
+
+ *seen_ld_abs = true;
+
+ /* X = A */
+ *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
/* A = BPF_R0 = *(u8 *) (skb->data + K) */
- *insn++ = BPF_LD_ABS(BPF_B, fp->k);
+ convert_bpf_ld_abs(&tmp, &insn);
+ insn++;
/* A &= 0xf */
*insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 0xf);
/* A <<= 2 */
*insn++ = BPF_ALU32_IMM(BPF_LSH, BPF_REG_A, 2);
+ /* tmp = X */
+ *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_X);
/* X = A */
*insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
/* A = tmp */
*insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP);
break;
-
+ }
/* RET_K is remaped into 2 insns. RET_A case doesn't need an
* extra mov as BPF_REG_0 is already mapped into BPF_REG_A.
*/
@@ -663,6 +845,8 @@ jmp_rest:
if (!new_prog) {
/* Only calculating new length. */
*new_len = new_insn - first_insn;
+ if (*seen_ld_abs)
+ *new_len += 4; /* Prologue bits. */
return 0;
}
@@ -1024,6 +1208,7 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)
struct sock_filter *old_prog;
struct bpf_prog *old_fp;
int err, new_len, old_len = fp->len;
+ bool seen_ld_abs = false;
/* We are free to overwrite insns et al right here as it
* won't be used at this point in time anymore internally
@@ -1045,7 +1230,8 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)
}
/* 1st pass: calculate the new program length. */
- err = bpf_convert_filter(old_prog, old_len, NULL, &new_len);
+ err = bpf_convert_filter(old_prog, old_len, NULL, &new_len,
+ &seen_ld_abs);
if (err)
goto out_err_free;
@@ -1064,7 +1250,8 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)
fp->len = new_len;
/* 2nd pass: remap sock_filter insns into bpf_insn insns. */
- err = bpf_convert_filter(old_prog, old_len, fp, &new_len);
+ err = bpf_convert_filter(old_prog, old_len, fp, &new_len,
+ &seen_ld_abs);
if (err)
/* 2nd bpf_convert_filter() can fail only if it fails
* to allocate memory, remapping must succeed. Note,
@@ -1512,6 +1699,47 @@ static const struct bpf_func_proto bpf_skb_load_bytes_proto = {
.arg4_type = ARG_CONST_SIZE,
};
+BPF_CALL_5(bpf_skb_load_bytes_relative, const struct sk_buff *, skb,
+ u32, offset, void *, to, u32, len, u32, start_header)
+{
+ u8 *ptr;
+
+ if (unlikely(offset > 0xffff || len > skb_headlen(skb)))
+ goto err_clear;
+
+ switch (start_header) {
+ case BPF_HDR_START_MAC:
+ ptr = skb_mac_header(skb) + offset;
+ break;
+ case BPF_HDR_START_NET:
+ ptr = skb_network_header(skb) + offset;
+ break;
+ default:
+ goto err_clear;
+ }
+
+ if (likely(ptr >= skb_mac_header(skb) &&
+ ptr + len <= skb_tail_pointer(skb))) {
+ memcpy(to, ptr, len);
+ return 0;
+ }
+
+err_clear:
+ memset(to, 0, len);
+ return -EFAULT;
+}
+
+static const struct bpf_func_proto bpf_skb_load_bytes_relative_proto = {
+ .func = bpf_skb_load_bytes_relative,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg4_type = ARG_CONST_SIZE,
+ .arg5_type = ARG_ANYTHING,
+};
+
BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len)
{
/* Idea is the following: should the needed direct read/write
@@ -1857,6 +2085,33 @@ static const struct bpf_func_proto bpf_redirect_proto = {
.arg2_type = ARG_ANYTHING,
};
+BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb,
+ struct bpf_map *, map, void *, key, u64, flags)
+{
+ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
+
+ /* If user passes invalid input drop the packet. */
+ if (unlikely(flags & ~(BPF_F_INGRESS)))
+ return SK_DROP;
+
+ tcb->bpf.flags = flags;
+ tcb->bpf.sk_redir = __sock_hash_lookup_elem(map, key);
+ if (!tcb->bpf.sk_redir)
+ return SK_DROP;
+
+ return SK_PASS;
+}
+
+static const struct bpf_func_proto bpf_sk_redirect_hash_proto = {
+ .func = bpf_sk_redirect_hash,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_PTR_TO_MAP_KEY,
+ .arg4_type = ARG_ANYTHING,
+};
+
BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
struct bpf_map *, map, u32, key, u64, flags)
{
@@ -1866,9 +2121,10 @@ BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
if (unlikely(flags & ~(BPF_F_INGRESS)))
return SK_DROP;
- tcb->bpf.key = key;
tcb->bpf.flags = flags;
- tcb->bpf.map = map;
+ tcb->bpf.sk_redir = __sock_map_lookup_elem(map, key);
+ if (!tcb->bpf.sk_redir)
+ return SK_DROP;
return SK_PASS;
}
@@ -1876,16 +2132,8 @@ BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
struct sock *do_sk_redirect_map(struct sk_buff *skb)
{
struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
- struct sock *sk = NULL;
-
- if (tcb->bpf.map) {
- sk = __sock_map_lookup_elem(tcb->bpf.map, tcb->bpf.key);
- tcb->bpf.key = 0;
- tcb->bpf.map = NULL;
- }
-
- return sk;
+ return tcb->bpf.sk_redir;
}
static const struct bpf_func_proto bpf_sk_redirect_map_proto = {
@@ -1898,32 +2146,49 @@ static const struct bpf_func_proto bpf_sk_redirect_map_proto = {
.arg4_type = ARG_ANYTHING,
};
-BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg_buff *, msg,
- struct bpf_map *, map, u32, key, u64, flags)
+BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg_buff *, msg,
+ struct bpf_map *, map, void *, key, u64, flags)
{
/* If user passes invalid input drop the packet. */
if (unlikely(flags & ~(BPF_F_INGRESS)))
return SK_DROP;
- msg->key = key;
msg->flags = flags;
- msg->map = map;
+ msg->sk_redir = __sock_hash_lookup_elem(map, key);
+ if (!msg->sk_redir)
+ return SK_DROP;
return SK_PASS;
}
-struct sock *do_msg_redirect_map(struct sk_msg_buff *msg)
+static const struct bpf_func_proto bpf_msg_redirect_hash_proto = {
+ .func = bpf_msg_redirect_hash,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_PTR_TO_MAP_KEY,
+ .arg4_type = ARG_ANYTHING,
+};
+
+BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg_buff *, msg,
+ struct bpf_map *, map, u32, key, u64, flags)
{
- struct sock *sk = NULL;
+ /* If user passes invalid input drop the packet. */
+ if (unlikely(flags & ~(BPF_F_INGRESS)))
+ return SK_DROP;
- if (msg->map) {
- sk = __sock_map_lookup_elem(msg->map, msg->key);
+ msg->flags = flags;
+ msg->sk_redir = __sock_map_lookup_elem(map, key);
+ if (!msg->sk_redir)
+ return SK_DROP;
- msg->key = 0;
- msg->map = NULL;
- }
+ return SK_PASS;
+}
- return sk;
+struct sock *do_msg_redirect_map(struct sk_msg_buff *msg)
+{
+ return msg->sk_redir;
}
static const struct bpf_func_proto bpf_msg_redirect_map_proto = {
@@ -2186,7 +2451,7 @@ BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto,
return ret;
}
-const struct bpf_func_proto bpf_skb_vlan_push_proto = {
+static const struct bpf_func_proto bpf_skb_vlan_push_proto = {
.func = bpf_skb_vlan_push,
.gpl_only = false,
.ret_type = RET_INTEGER,
@@ -2194,7 +2459,6 @@ const struct bpf_func_proto bpf_skb_vlan_push_proto = {
.arg2_type = ARG_ANYTHING,
.arg3_type = ARG_ANYTHING,
};
-EXPORT_SYMBOL_GPL(bpf_skb_vlan_push_proto);
BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb)
{
@@ -2208,13 +2472,12 @@ BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb)
return ret;
}
-const struct bpf_func_proto bpf_skb_vlan_pop_proto = {
+static const struct bpf_func_proto bpf_skb_vlan_pop_proto = {
.func = bpf_skb_vlan_pop,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
};
-EXPORT_SYMBOL_GPL(bpf_skb_vlan_pop_proto);
static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len)
{
@@ -2699,8 +2962,9 @@ static unsigned long xdp_get_metalen(const struct xdp_buff *xdp)
BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset)
{
+ void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame);
unsigned long metalen = xdp_get_metalen(xdp);
- void *data_start = xdp->data_hard_start + metalen;
+ void *data_start = xdp_frame_end + metalen;
void *data = xdp->data + offset;
if (unlikely(data < data_start ||
@@ -2724,14 +2988,39 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = {
.arg2_type = ARG_ANYTHING,
};
+BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset)
+{
+ void *data_end = xdp->data_end + offset;
+
+ /* only shrinking is allowed for now. */
+ if (unlikely(offset >= 0))
+ return -EINVAL;
+
+ if (unlikely(data_end < xdp->data + ETH_HLEN))
+ return -EINVAL;
+
+ xdp->data_end = data_end;
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_xdp_adjust_tail_proto = {
+ .func = bpf_xdp_adjust_tail,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+};
+
BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset)
{
+ void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame);
void *meta = xdp->data_meta + offset;
unsigned long metalen = xdp->data - meta;
if (xdp_data_meta_unsupported(xdp))
return -ENOTSUPP;
- if (unlikely(meta < xdp->data_hard_start ||
+ if (unlikely(meta < xdp_frame_end ||
meta > xdp->data))
return -EINVAL;
if (unlikely((metalen & (sizeof(__u32) - 1)) ||
@@ -2756,16 +3045,20 @@ static int __bpf_tx_xdp(struct net_device *dev,
struct xdp_buff *xdp,
u32 index)
{
- int err;
+ struct xdp_frame *xdpf;
+ int sent;
if (!dev->netdev_ops->ndo_xdp_xmit) {
return -EOPNOTSUPP;
}
- err = dev->netdev_ops->ndo_xdp_xmit(dev, xdp);
- if (err)
- return err;
- dev->netdev_ops->ndo_xdp_flush(dev);
+ xdpf = convert_to_xdp_frame(xdp);
+ if (unlikely(!xdpf))
+ return -EOVERFLOW;
+
+ sent = dev->netdev_ops->ndo_xdp_xmit(dev, 1, &xdpf, XDP_XMIT_FLUSH);
+ if (sent <= 0)
+ return sent;
return 0;
}
@@ -2776,24 +3069,33 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
{
int err;
- if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
- struct net_device *dev = fwd;
-
- if (!dev->netdev_ops->ndo_xdp_xmit)
- return -EOPNOTSUPP;
+ switch (map->map_type) {
+ case BPF_MAP_TYPE_DEVMAP: {
+ struct bpf_dtab_netdev *dst = fwd;
- err = dev->netdev_ops->ndo_xdp_xmit(dev, xdp);
+ err = dev_map_enqueue(dst, xdp, dev_rx);
if (err)
return err;
__dev_map_insert_ctx(map, index);
-
- } else if (map->map_type == BPF_MAP_TYPE_CPUMAP) {
+ break;
+ }
+ case BPF_MAP_TYPE_CPUMAP: {
struct bpf_cpu_map_entry *rcpu = fwd;
err = cpu_map_enqueue(rcpu, xdp, dev_rx);
if (err)
return err;
__cpu_map_insert_ctx(map, index);
+ break;
+ }
+ case BPF_MAP_TYPE_XSKMAP: {
+ struct xdp_sock *xs = fwd;
+
+ err = __xsk_map_redirect(map, xdp, xs);
+ return err;
+ }
+ default:
+ break;
}
return 0;
}
@@ -2812,6 +3114,9 @@ void xdp_do_flush_map(void)
case BPF_MAP_TYPE_CPUMAP:
__cpu_map_flush(map);
break;
+ case BPF_MAP_TYPE_XSKMAP:
+ __xsk_map_flush(map);
+ break;
default:
break;
}
@@ -2826,6 +3131,8 @@ static void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index)
return __dev_map_lookup_elem(map, index);
case BPF_MAP_TYPE_CPUMAP:
return __cpu_map_lookup_elem(map, index);
+ case BPF_MAP_TYPE_XSKMAP:
+ return __xsk_map_lookup_elem(map, index);
default:
return NULL;
}
@@ -2923,13 +3230,14 @@ static int __xdp_generic_ok_fwd_dev(struct sk_buff *skb, struct net_device *fwd)
static int xdp_do_generic_redirect_map(struct net_device *dev,
struct sk_buff *skb,
+ struct xdp_buff *xdp,
struct bpf_prog *xdp_prog)
{
struct redirect_info *ri = this_cpu_ptr(&redirect_info);
unsigned long map_owner = ri->map_owner;
struct bpf_map *map = ri->map;
- struct net_device *fwd = NULL;
u32 index = ri->ifindex;
+ void *fwd = NULL;
int err = 0;
ri->ifindex = 0;
@@ -2951,6 +3259,14 @@ static int xdp_do_generic_redirect_map(struct net_device *dev,
if (unlikely((err = __xdp_generic_ok_fwd_dev(skb, fwd))))
goto err;
skb->dev = fwd;
+ generic_xdp_tx(skb, xdp_prog);
+ } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) {
+ struct xdp_sock *xs = fwd;
+
+ err = xsk_generic_rcv(xs, xdp);
+ if (err)
+ goto err;
+ consume_skb(skb);
} else {
/* TODO: Handle BPF_MAP_TYPE_CPUMAP */
err = -EBADRQC;
@@ -2965,7 +3281,7 @@ err:
}
int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
- struct bpf_prog *xdp_prog)
+ struct xdp_buff *xdp, struct bpf_prog *xdp_prog)
{
struct redirect_info *ri = this_cpu_ptr(&redirect_info);
u32 index = ri->ifindex;
@@ -2973,7 +3289,7 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
int err = 0;
if (ri->map)
- return xdp_do_generic_redirect_map(dev, skb, xdp_prog);
+ return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog);
ri->ifindex = 0;
fwd = dev_get_by_index_rcu(dev_net(dev), index);
@@ -2987,6 +3303,7 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
skb->dev = fwd;
_trace_xdp_redirect(dev, xdp_prog, index);
+ generic_xdp_tx(skb, xdp_prog);
return 0;
err:
_trace_xdp_redirect_err(dev, xdp_prog, index, err);
@@ -3045,27 +3362,6 @@ static const struct bpf_func_proto bpf_xdp_redirect_map_proto = {
.arg3_type = ARG_ANYTHING,
};
-bool bpf_helper_changes_pkt_data(void *func)
-{
- if (func == bpf_skb_vlan_push ||
- func == bpf_skb_vlan_pop ||
- func == bpf_skb_store_bytes ||
- func == bpf_skb_change_proto ||
- func == bpf_skb_change_head ||
- func == bpf_skb_change_tail ||
- func == bpf_skb_adjust_room ||
- func == bpf_skb_pull_data ||
- func == bpf_clone_redirect ||
- func == bpf_l3_csum_replace ||
- func == bpf_l4_csum_replace ||
- func == bpf_xdp_adjust_head ||
- func == bpf_xdp_adjust_meta ||
- func == bpf_msg_pull_data)
- return true;
-
- return false;
-}
-
static unsigned long bpf_skb_copy(void *dst_buff, const void *skb,
unsigned long off, unsigned long len)
{
@@ -3148,6 +3444,7 @@ set_compat:
to->tunnel_id = be64_to_cpu(info->key.tun_id);
to->tunnel_tos = info->key.tos;
to->tunnel_ttl = info->key.ttl;
+ to->tunnel_ext = 0;
if (flags & BPF_F_TUNINFO_IPV6) {
memcpy(to->remote_ipv6, &info->key.u.ipv6.src,
@@ -3155,6 +3452,8 @@ set_compat:
to->tunnel_label = be32_to_cpu(info->key.label);
} else {
to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src);
+ memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3);
+ to->tunnel_label = 0;
}
if (unlikely(size != sizeof(struct bpf_tunnel_key)))
@@ -3364,6 +3663,27 @@ static const struct bpf_func_proto bpf_skb_under_cgroup_proto = {
.arg3_type = ARG_ANYTHING,
};
+#ifdef CONFIG_SOCK_CGROUP_DATA
+BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb)
+{
+ struct sock *sk = skb_to_full_sk(skb);
+ struct cgroup *cgrp;
+
+ if (!sk || !sk_fullsock(sk))
+ return 0;
+
+ cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+ return cgrp->kn->id.id;
+}
+
+static const struct bpf_func_proto bpf_skb_cgroup_id_proto = {
+ .func = bpf_skb_cgroup_id,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+#endif
+
static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff,
unsigned long off, unsigned long len)
{
@@ -3711,6 +4031,603 @@ static const struct bpf_func_proto bpf_bind_proto = {
.arg3_type = ARG_CONST_SIZE,
};
+#ifdef CONFIG_XFRM
+BPF_CALL_5(bpf_skb_get_xfrm_state, struct sk_buff *, skb, u32, index,
+ struct bpf_xfrm_state *, to, u32, size, u64, flags)
+{
+ const struct sec_path *sp = skb_sec_path(skb);
+ const struct xfrm_state *x;
+
+ if (!sp || unlikely(index >= sp->len || flags))
+ goto err_clear;
+
+ x = sp->xvec[index];
+
+ if (unlikely(size != sizeof(struct bpf_xfrm_state)))
+ goto err_clear;
+
+ to->reqid = x->props.reqid;
+ to->spi = x->id.spi;
+ to->family = x->props.family;
+ to->ext = 0;
+
+ if (to->family == AF_INET6) {
+ memcpy(to->remote_ipv6, x->props.saddr.a6,
+ sizeof(to->remote_ipv6));
+ } else {
+ to->remote_ipv4 = x->props.saddr.a4;
+ memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3);
+ }
+
+ return 0;
+err_clear:
+ memset(to, 0, size);
+ return -EINVAL;
+}
+
+static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = {
+ .func = bpf_skb_get_xfrm_state,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg4_type = ARG_CONST_SIZE,
+ .arg5_type = ARG_ANYTHING,
+};
+#endif
+
+#if IS_ENABLED(CONFIG_INET) || IS_ENABLED(CONFIG_IPV6)
+static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params,
+ const struct neighbour *neigh,
+ const struct net_device *dev)
+{
+ memcpy(params->dmac, neigh->ha, ETH_ALEN);
+ memcpy(params->smac, dev->dev_addr, ETH_ALEN);
+ params->h_vlan_TCI = 0;
+ params->h_vlan_proto = 0;
+
+ return dev->ifindex;
+}
+#endif
+
+#if IS_ENABLED(CONFIG_INET)
+static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
+ u32 flags, bool check_mtu)
+{
+ struct in_device *in_dev;
+ struct neighbour *neigh;
+ struct net_device *dev;
+ struct fib_result res;
+ struct fib_nh *nh;
+ struct flowi4 fl4;
+ int err;
+ u32 mtu;
+
+ dev = dev_get_by_index_rcu(net, params->ifindex);
+ if (unlikely(!dev))
+ return -ENODEV;
+
+ /* verify forwarding is enabled on this interface */
+ in_dev = __in_dev_get_rcu(dev);
+ if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev)))
+ return 0;
+
+ if (flags & BPF_FIB_LOOKUP_OUTPUT) {
+ fl4.flowi4_iif = 1;
+ fl4.flowi4_oif = params->ifindex;
+ } else {
+ fl4.flowi4_iif = params->ifindex;
+ fl4.flowi4_oif = 0;
+ }
+ fl4.flowi4_tos = params->tos & IPTOS_RT_MASK;
+ fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
+ fl4.flowi4_flags = 0;
+
+ fl4.flowi4_proto = params->l4_protocol;
+ fl4.daddr = params->ipv4_dst;
+ fl4.saddr = params->ipv4_src;
+ fl4.fl4_sport = params->sport;
+ fl4.fl4_dport = params->dport;
+
+ if (flags & BPF_FIB_LOOKUP_DIRECT) {
+ u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
+ struct fib_table *tb;
+
+ tb = fib_get_table(net, tbid);
+ if (unlikely(!tb))
+ return 0;
+
+ err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
+ } else {
+ fl4.flowi4_mark = 0;
+ fl4.flowi4_secid = 0;
+ fl4.flowi4_tun_key.tun_id = 0;
+ fl4.flowi4_uid = sock_net_uid(net, NULL);
+
+ err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF);
+ }
+
+ if (err || res.type != RTN_UNICAST)
+ return 0;
+
+ if (res.fi->fib_nhs > 1)
+ fib_select_path(net, &res, &fl4, NULL);
+
+ if (check_mtu) {
+ mtu = ip_mtu_from_fib_result(&res, params->ipv4_dst);
+ if (params->tot_len > mtu)
+ return 0;
+ }
+
+ nh = &res.fi->fib_nh[res.nh_sel];
+
+ /* do not handle lwt encaps right now */
+ if (nh->nh_lwtstate)
+ return 0;
+
+ dev = nh->nh_dev;
+ if (unlikely(!dev))
+ return 0;
+
+ if (nh->nh_gw)
+ params->ipv4_dst = nh->nh_gw;
+
+ params->rt_metric = res.fi->fib_priority;
+
+ /* xdp and cls_bpf programs are run in RCU-bh so
+ * rcu_read_lock_bh is not needed here
+ */
+ neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)params->ipv4_dst);
+ if (neigh)
+ return bpf_fib_set_fwd_params(params, neigh, dev);
+
+ return 0;
+}
+#endif
+
+#if IS_ENABLED(CONFIG_IPV6)
+static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
+ u32 flags, bool check_mtu)
+{
+ struct in6_addr *src = (struct in6_addr *) params->ipv6_src;
+ struct in6_addr *dst = (struct in6_addr *) params->ipv6_dst;
+ struct neighbour *neigh;
+ struct net_device *dev;
+ struct inet6_dev *idev;
+ struct fib6_info *f6i;
+ struct flowi6 fl6;
+ int strict = 0;
+ int oif;
+ u32 mtu;
+
+ /* link local addresses are never forwarded */
+ if (rt6_need_strict(dst) || rt6_need_strict(src))
+ return 0;
+
+ dev = dev_get_by_index_rcu(net, params->ifindex);
+ if (unlikely(!dev))
+ return -ENODEV;
+
+ idev = __in6_dev_get_safely(dev);
+ if (unlikely(!idev || !net->ipv6.devconf_all->forwarding))
+ return 0;
+
+ if (flags & BPF_FIB_LOOKUP_OUTPUT) {
+ fl6.flowi6_iif = 1;
+ oif = fl6.flowi6_oif = params->ifindex;
+ } else {
+ oif = fl6.flowi6_iif = params->ifindex;
+ fl6.flowi6_oif = 0;
+ strict = RT6_LOOKUP_F_HAS_SADDR;
+ }
+ fl6.flowlabel = params->flowinfo;
+ fl6.flowi6_scope = 0;
+ fl6.flowi6_flags = 0;
+ fl6.mp_hash = 0;
+
+ fl6.flowi6_proto = params->l4_protocol;
+ fl6.daddr = *dst;
+ fl6.saddr = *src;
+ fl6.fl6_sport = params->sport;
+ fl6.fl6_dport = params->dport;
+
+ if (flags & BPF_FIB_LOOKUP_DIRECT) {
+ u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
+ struct fib6_table *tb;
+
+ tb = ipv6_stub->fib6_get_table(net, tbid);
+ if (unlikely(!tb))
+ return 0;
+
+ f6i = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, strict);
+ } else {
+ fl6.flowi6_mark = 0;
+ fl6.flowi6_secid = 0;
+ fl6.flowi6_tun_key.tun_id = 0;
+ fl6.flowi6_uid = sock_net_uid(net, NULL);
+
+ f6i = ipv6_stub->fib6_lookup(net, oif, &fl6, strict);
+ }
+
+ if (unlikely(IS_ERR_OR_NULL(f6i) || f6i == net->ipv6.fib6_null_entry))
+ return 0;
+
+ if (unlikely(f6i->fib6_flags & RTF_REJECT ||
+ f6i->fib6_type != RTN_UNICAST))
+ return 0;
+
+ if (f6i->fib6_nsiblings && fl6.flowi6_oif == 0)
+ f6i = ipv6_stub->fib6_multipath_select(net, f6i, &fl6,
+ fl6.flowi6_oif, NULL,
+ strict);
+
+ if (check_mtu) {
+ mtu = ipv6_stub->ip6_mtu_from_fib6(f6i, dst, src);
+ if (params->tot_len > mtu)
+ return 0;
+ }
+
+ if (f6i->fib6_nh.nh_lwtstate)
+ return 0;
+
+ if (f6i->fib6_flags & RTF_GATEWAY)
+ *dst = f6i->fib6_nh.nh_gw;
+
+ dev = f6i->fib6_nh.nh_dev;
+ params->rt_metric = f6i->fib6_metric;
+
+ /* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is
+ * not needed here. Can not use __ipv6_neigh_lookup_noref here
+ * because we need to get nd_tbl via the stub
+ */
+ neigh = ___neigh_lookup_noref(ipv6_stub->nd_tbl, neigh_key_eq128,
+ ndisc_hashfn, dst, dev);
+ if (neigh)
+ return bpf_fib_set_fwd_params(params, neigh, dev);
+
+ return 0;
+}
+#endif
+
+BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx,
+ struct bpf_fib_lookup *, params, int, plen, u32, flags)
+{
+ if (plen < sizeof(*params))
+ return -EINVAL;
+
+ if (flags & ~(BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT))
+ return -EINVAL;
+
+ switch (params->family) {
+#if IS_ENABLED(CONFIG_INET)
+ case AF_INET:
+ return bpf_ipv4_fib_lookup(dev_net(ctx->rxq->dev), params,
+ flags, true);
+#endif
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6:
+ return bpf_ipv6_fib_lookup(dev_net(ctx->rxq->dev), params,
+ flags, true);
+#endif
+ }
+ return -EAFNOSUPPORT;
+}
+
+static const struct bpf_func_proto bpf_xdp_fib_lookup_proto = {
+ .func = bpf_xdp_fib_lookup,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_CONST_SIZE,
+ .arg4_type = ARG_ANYTHING,
+};
+
+BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb,
+ struct bpf_fib_lookup *, params, int, plen, u32, flags)
+{
+ struct net *net = dev_net(skb->dev);
+ int index = -EAFNOSUPPORT;
+
+ if (plen < sizeof(*params))
+ return -EINVAL;
+
+ if (flags & ~(BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT))
+ return -EINVAL;
+
+ switch (params->family) {
+#if IS_ENABLED(CONFIG_INET)
+ case AF_INET:
+ index = bpf_ipv4_fib_lookup(net, params, flags, false);
+ break;
+#endif
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6:
+ index = bpf_ipv6_fib_lookup(net, params, flags, false);
+ break;
+#endif
+ }
+
+ if (index > 0) {
+ struct net_device *dev;
+
+ dev = dev_get_by_index_rcu(net, index);
+ if (!is_skb_forwardable(dev, skb))
+ index = 0;
+ }
+
+ return index;
+}
+
+static const struct bpf_func_proto bpf_skb_fib_lookup_proto = {
+ .func = bpf_skb_fib_lookup,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_CONST_SIZE,
+ .arg4_type = ARG_ANYTHING,
+};
+
+#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
+static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len)
+{
+ int err;
+ struct ipv6_sr_hdr *srh = (struct ipv6_sr_hdr *)hdr;
+
+ if (!seg6_validate_srh(srh, len))
+ return -EINVAL;
+
+ switch (type) {
+ case BPF_LWT_ENCAP_SEG6_INLINE:
+ if (skb->protocol != htons(ETH_P_IPV6))
+ return -EBADMSG;
+
+ err = seg6_do_srh_inline(skb, srh);
+ break;
+ case BPF_LWT_ENCAP_SEG6:
+ skb_reset_inner_headers(skb);
+ skb->encapsulation = 1;
+ err = seg6_do_srh_encap(skb, srh, IPPROTO_IPV6);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ bpf_compute_data_pointers(skb);
+ if (err)
+ return err;
+
+ ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
+ skb_set_transport_header(skb, sizeof(struct ipv6hdr));
+
+ return seg6_lookup_nexthop(skb, NULL, 0);
+}
+#endif /* CONFIG_IPV6_SEG6_BPF */
+
+BPF_CALL_4(bpf_lwt_push_encap, struct sk_buff *, skb, u32, type, void *, hdr,
+ u32, len)
+{
+ switch (type) {
+#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
+ case BPF_LWT_ENCAP_SEG6:
+ case BPF_LWT_ENCAP_SEG6_INLINE:
+ return bpf_push_seg6_encap(skb, type, hdr, len);
+#endif
+ default:
+ return -EINVAL;
+ }
+}
+
+static const struct bpf_func_proto bpf_lwt_push_encap_proto = {
+ .func = bpf_lwt_push_encap,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_MEM,
+ .arg4_type = ARG_CONST_SIZE
+};
+
+BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff *, skb, u32, offset,
+ const void *, from, u32, len)
+{
+#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
+ struct seg6_bpf_srh_state *srh_state =
+ this_cpu_ptr(&seg6_bpf_srh_states);
+ void *srh_tlvs, *srh_end, *ptr;
+ struct ipv6_sr_hdr *srh;
+ int srhoff = 0;
+
+ if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
+ return -EINVAL;
+
+ srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
+ srh_tlvs = (void *)((char *)srh + ((srh->first_segment + 1) << 4));
+ srh_end = (void *)((char *)srh + sizeof(*srh) + srh_state->hdrlen);
+
+ ptr = skb->data + offset;
+ if (ptr >= srh_tlvs && ptr + len <= srh_end)
+ srh_state->valid = 0;
+ else if (ptr < (void *)&srh->flags ||
+ ptr + len > (void *)&srh->segments)
+ return -EFAULT;
+
+ if (unlikely(bpf_try_make_writable(skb, offset + len)))
+ return -EFAULT;
+
+ memcpy(skb->data + offset, from, len);
+ return 0;
+#else /* CONFIG_IPV6_SEG6_BPF */
+ return -EOPNOTSUPP;
+#endif
+}
+
+static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = {
+ .func = bpf_lwt_seg6_store_bytes,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_MEM,
+ .arg4_type = ARG_CONST_SIZE
+};
+
+BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb,
+ u32, action, void *, param, u32, param_len)
+{
+#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
+ struct seg6_bpf_srh_state *srh_state =
+ this_cpu_ptr(&seg6_bpf_srh_states);
+ struct ipv6_sr_hdr *srh;
+ int srhoff = 0;
+ int err;
+
+ if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
+ return -EINVAL;
+ srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
+
+ if (!srh_state->valid) {
+ if (unlikely((srh_state->hdrlen & 7) != 0))
+ return -EBADMSG;
+
+ srh->hdrlen = (u8)(srh_state->hdrlen >> 3);
+ if (unlikely(!seg6_validate_srh(srh, (srh->hdrlen + 1) << 3)))
+ return -EBADMSG;
+
+ srh_state->valid = 1;
+ }
+
+ switch (action) {
+ case SEG6_LOCAL_ACTION_END_X:
+ if (param_len != sizeof(struct in6_addr))
+ return -EINVAL;
+ return seg6_lookup_nexthop(skb, (struct in6_addr *)param, 0);
+ case SEG6_LOCAL_ACTION_END_T:
+ if (param_len != sizeof(int))
+ return -EINVAL;
+ return seg6_lookup_nexthop(skb, NULL, *(int *)param);
+ case SEG6_LOCAL_ACTION_END_B6:
+ err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6_INLINE,
+ param, param_len);
+ if (!err)
+ srh_state->hdrlen =
+ ((struct ipv6_sr_hdr *)param)->hdrlen << 3;
+ return err;
+ case SEG6_LOCAL_ACTION_END_B6_ENCAP:
+ err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6,
+ param, param_len);
+ if (!err)
+ srh_state->hdrlen =
+ ((struct ipv6_sr_hdr *)param)->hdrlen << 3;
+ return err;
+ default:
+ return -EINVAL;
+ }
+#else /* CONFIG_IPV6_SEG6_BPF */
+ return -EOPNOTSUPP;
+#endif
+}
+
+static const struct bpf_func_proto bpf_lwt_seg6_action_proto = {
+ .func = bpf_lwt_seg6_action,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_MEM,
+ .arg4_type = ARG_CONST_SIZE
+};
+
+BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset,
+ s32, len)
+{
+#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
+ struct seg6_bpf_srh_state *srh_state =
+ this_cpu_ptr(&seg6_bpf_srh_states);
+ void *srh_end, *srh_tlvs, *ptr;
+ struct ipv6_sr_hdr *srh;
+ struct ipv6hdr *hdr;
+ int srhoff = 0;
+ int ret;
+
+ if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
+ return -EINVAL;
+ srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
+
+ srh_tlvs = (void *)((unsigned char *)srh + sizeof(*srh) +
+ ((srh->first_segment + 1) << 4));
+ srh_end = (void *)((unsigned char *)srh + sizeof(*srh) +
+ srh_state->hdrlen);
+ ptr = skb->data + offset;
+
+ if (unlikely(ptr < srh_tlvs || ptr > srh_end))
+ return -EFAULT;
+ if (unlikely(len < 0 && (void *)((char *)ptr - len) > srh_end))
+ return -EFAULT;
+
+ if (len > 0) {
+ ret = skb_cow_head(skb, len);
+ if (unlikely(ret < 0))
+ return ret;
+
+ ret = bpf_skb_net_hdr_push(skb, offset, len);
+ } else {
+ ret = bpf_skb_net_hdr_pop(skb, offset, -1 * len);
+ }
+
+ bpf_compute_data_pointers(skb);
+ if (unlikely(ret < 0))
+ return ret;
+
+ hdr = (struct ipv6hdr *)skb->data;
+ hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
+
+ srh_state->hdrlen += len;
+ srh_state->valid = 0;
+ return 0;
+#else /* CONFIG_IPV6_SEG6_BPF */
+ return -EOPNOTSUPP;
+#endif
+}
+
+static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = {
+ .func = bpf_lwt_seg6_adjust_srh,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+};
+
+bool bpf_helper_changes_pkt_data(void *func)
+{
+ if (func == bpf_skb_vlan_push ||
+ func == bpf_skb_vlan_pop ||
+ func == bpf_skb_store_bytes ||
+ func == bpf_skb_change_proto ||
+ func == bpf_skb_change_head ||
+ func == bpf_skb_change_tail ||
+ func == bpf_skb_adjust_room ||
+ func == bpf_skb_pull_data ||
+ func == bpf_clone_redirect ||
+ func == bpf_l3_csum_replace ||
+ func == bpf_l4_csum_replace ||
+ func == bpf_xdp_adjust_head ||
+ func == bpf_xdp_adjust_meta ||
+ func == bpf_msg_pull_data ||
+ func == bpf_xdp_adjust_tail ||
+ func == bpf_lwt_push_encap ||
+ func == bpf_lwt_seg6_store_bytes ||
+ func == bpf_lwt_seg6_adjust_srh ||
+ func == bpf_lwt_seg6_action
+ )
+ return true;
+
+ return false;
+}
+
static const struct bpf_func_proto *
bpf_base_func_proto(enum bpf_func_id func_id)
{
@@ -3781,6 +4698,8 @@ sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
switch (func_id) {
case BPF_FUNC_skb_load_bytes:
return &bpf_skb_load_bytes_proto;
+ case BPF_FUNC_skb_load_bytes_relative:
+ return &bpf_skb_load_bytes_relative_proto;
case BPF_FUNC_get_socket_cookie:
return &bpf_get_socket_cookie_proto;
case BPF_FUNC_get_socket_uid:
@@ -3798,6 +4717,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_skb_store_bytes_proto;
case BPF_FUNC_skb_load_bytes:
return &bpf_skb_load_bytes_proto;
+ case BPF_FUNC_skb_load_bytes_relative:
+ return &bpf_skb_load_bytes_relative_proto;
case BPF_FUNC_skb_pull_data:
return &bpf_skb_pull_data_proto;
case BPF_FUNC_csum_diff:
@@ -3852,6 +4773,16 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_socket_cookie_proto;
case BPF_FUNC_get_socket_uid:
return &bpf_get_socket_uid_proto;
+ case BPF_FUNC_fib_lookup:
+ return &bpf_skb_fib_lookup_proto;
+#ifdef CONFIG_XFRM
+ case BPF_FUNC_skb_get_xfrm_state:
+ return &bpf_skb_get_xfrm_state_proto;
+#endif
+#ifdef CONFIG_SOCK_CGROUP_DATA
+ case BPF_FUNC_skb_cgroup_id:
+ return &bpf_skb_cgroup_id_proto;
+#endif
default:
return bpf_base_func_proto(func_id);
}
@@ -3875,33 +4806,10 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_xdp_redirect_proto;
case BPF_FUNC_redirect_map:
return &bpf_xdp_redirect_map_proto;
- default:
- return bpf_base_func_proto(func_id);
- }
-}
-
-static const struct bpf_func_proto *
-lwt_inout_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
-{
- switch (func_id) {
- case BPF_FUNC_skb_load_bytes:
- return &bpf_skb_load_bytes_proto;
- case BPF_FUNC_skb_pull_data:
- return &bpf_skb_pull_data_proto;
- case BPF_FUNC_csum_diff:
- return &bpf_csum_diff_proto;
- case BPF_FUNC_get_cgroup_classid:
- return &bpf_get_cgroup_classid_proto;
- case BPF_FUNC_get_route_realm:
- return &bpf_get_route_realm_proto;
- case BPF_FUNC_get_hash_recalc:
- return &bpf_get_hash_recalc_proto;
- case BPF_FUNC_perf_event_output:
- return &bpf_skb_event_output_proto;
- case BPF_FUNC_get_smp_processor_id:
- return &bpf_get_smp_processor_id_proto;
- case BPF_FUNC_skb_under_cgroup:
- return &bpf_skb_under_cgroup_proto;
+ case BPF_FUNC_xdp_adjust_tail:
+ return &bpf_xdp_adjust_tail_proto;
+ case BPF_FUNC_fib_lookup:
+ return &bpf_xdp_fib_lookup_proto;
default:
return bpf_base_func_proto(func_id);
}
@@ -3919,6 +4827,8 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_sock_ops_cb_flags_set_proto;
case BPF_FUNC_sock_map_update:
return &bpf_sock_map_update_proto;
+ case BPF_FUNC_sock_hash_update:
+ return &bpf_sock_hash_update_proto;
default:
return bpf_base_func_proto(func_id);
}
@@ -3930,6 +4840,8 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
switch (func_id) {
case BPF_FUNC_msg_redirect_map:
return &bpf_msg_redirect_map_proto;
+ case BPF_FUNC_msg_redirect_hash:
+ return &bpf_msg_redirect_hash_proto;
case BPF_FUNC_msg_apply_bytes:
return &bpf_msg_apply_bytes_proto;
case BPF_FUNC_msg_cork_bytes:
@@ -3961,12 +4873,52 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_socket_uid_proto;
case BPF_FUNC_sk_redirect_map:
return &bpf_sk_redirect_map_proto;
+ case BPF_FUNC_sk_redirect_hash:
+ return &bpf_sk_redirect_hash_proto;
default:
return bpf_base_func_proto(func_id);
}
}
static const struct bpf_func_proto *
+lwt_out_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_skb_load_bytes:
+ return &bpf_skb_load_bytes_proto;
+ case BPF_FUNC_skb_pull_data:
+ return &bpf_skb_pull_data_proto;
+ case BPF_FUNC_csum_diff:
+ return &bpf_csum_diff_proto;
+ case BPF_FUNC_get_cgroup_classid:
+ return &bpf_get_cgroup_classid_proto;
+ case BPF_FUNC_get_route_realm:
+ return &bpf_get_route_realm_proto;
+ case BPF_FUNC_get_hash_recalc:
+ return &bpf_get_hash_recalc_proto;
+ case BPF_FUNC_perf_event_output:
+ return &bpf_skb_event_output_proto;
+ case BPF_FUNC_get_smp_processor_id:
+ return &bpf_get_smp_processor_id_proto;
+ case BPF_FUNC_skb_under_cgroup:
+ return &bpf_skb_under_cgroup_proto;
+ default:
+ return bpf_base_func_proto(func_id);
+ }
+}
+
+static const struct bpf_func_proto *
+lwt_in_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_lwt_push_encap:
+ return &bpf_lwt_push_encap_proto;
+ default:
+ return lwt_out_func_proto(func_id, prog);
+ }
+}
+
+static const struct bpf_func_proto *
lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
switch (func_id) {
@@ -3997,7 +4949,22 @@ lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_set_hash_invalid:
return &bpf_set_hash_invalid_proto;
default:
- return lwt_inout_func_proto(func_id, prog);
+ return lwt_out_func_proto(func_id, prog);
+ }
+}
+
+static const struct bpf_func_proto *
+lwt_seg6local_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_lwt_seg6_store_bytes:
+ return &bpf_lwt_seg6_store_bytes_proto;
+ case BPF_FUNC_lwt_seg6_action:
+ return &bpf_lwt_seg6_action_proto;
+ case BPF_FUNC_lwt_seg6_adjust_srh:
+ return &bpf_lwt_seg6_adjust_srh_proto;
+ default:
+ return lwt_out_func_proto(func_id, prog);
}
}
@@ -4105,7 +5072,6 @@ static bool lwt_is_valid_access(int off, int size,
return bpf_skb_is_valid_access(off, size, type, prog, info);
}
-
/* Attach type specific accesses */
static bool __sock_filter_check_attach_type(int off,
enum bpf_access_type access_type,
@@ -4221,6 +5187,41 @@ static int bpf_unclone_prologue(struct bpf_insn *insn_buf, bool direct_write,
return insn - insn_buf;
}
+static int bpf_gen_ld_abs(const struct bpf_insn *orig,
+ struct bpf_insn *insn_buf)
+{
+ bool indirect = BPF_MODE(orig->code) == BPF_IND;
+ struct bpf_insn *insn = insn_buf;
+
+ /* We're guaranteed here that CTX is in R6. */
+ *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_CTX);
+ if (!indirect) {
+ *insn++ = BPF_MOV64_IMM(BPF_REG_2, orig->imm);
+ } else {
+ *insn++ = BPF_MOV64_REG(BPF_REG_2, orig->src_reg);
+ if (orig->imm)
+ *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, orig->imm);
+ }
+
+ switch (BPF_SIZE(orig->code)) {
+ case BPF_B:
+ *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8_no_cache);
+ break;
+ case BPF_H:
+ *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16_no_cache);
+ break;
+ case BPF_W:
+ *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32_no_cache);
+ break;
+ }
+
+ *insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 0, 2);
+ *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_0, BPF_REG_0);
+ *insn++ = BPF_EXIT_INSN();
+
+ return insn - insn_buf;
+}
+
static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write,
const struct bpf_prog *prog)
{
@@ -4279,8 +5280,15 @@ static bool xdp_is_valid_access(int off, int size,
const struct bpf_prog *prog,
struct bpf_insn_access_aux *info)
{
- if (type == BPF_WRITE)
+ if (type == BPF_WRITE) {
+ if (bpf_prog_is_dev_bound(prog->aux)) {
+ switch (off) {
+ case offsetof(struct xdp_md, rx_queue_index):
+ return __is_valid_xdp_access(off, size);
+ }
+ }
return false;
+ }
switch (off) {
case offsetof(struct xdp_md, data):
@@ -4327,6 +5335,7 @@ static bool sock_addr_is_valid_access(int off, int size,
switch (prog->expected_attach_type) {
case BPF_CGROUP_INET4_BIND:
case BPF_CGROUP_INET4_CONNECT:
+ case BPF_CGROUP_UDP4_SENDMSG:
break;
default:
return false;
@@ -4336,6 +5345,24 @@ static bool sock_addr_is_valid_access(int off, int size,
switch (prog->expected_attach_type) {
case BPF_CGROUP_INET6_BIND:
case BPF_CGROUP_INET6_CONNECT:
+ case BPF_CGROUP_UDP6_SENDMSG:
+ break;
+ default:
+ return false;
+ }
+ break;
+ case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4):
+ switch (prog->expected_attach_type) {
+ case BPF_CGROUP_UDP4_SENDMSG:
+ break;
+ default:
+ return false;
+ }
+ break;
+ case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
+ msg_src_ip6[3]):
+ switch (prog->expected_attach_type) {
+ case BPF_CGROUP_UDP6_SENDMSG:
break;
default:
return false;
@@ -4346,6 +5373,9 @@ static bool sock_addr_is_valid_access(int off, int size,
switch (off) {
case bpf_ctx_range(struct bpf_sock_addr, user_ip4):
case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]):
+ case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4):
+ case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
+ msg_src_ip6[3]):
/* Only narrow read access allowed for now. */
if (type == BPF_READ) {
bpf_ctx_record_field_size(info, size_default);
@@ -4465,18 +5495,23 @@ static bool sk_msg_is_valid_access(int off, int size,
switch (off) {
case offsetof(struct sk_msg_md, data):
info->reg_type = PTR_TO_PACKET;
+ if (size != sizeof(__u64))
+ return false;
break;
case offsetof(struct sk_msg_md, data_end):
info->reg_type = PTR_TO_PACKET_END;
+ if (size != sizeof(__u64))
+ return false;
break;
+ default:
+ if (size != sizeof(__u32))
+ return false;
}
if (off < 0 || off >= sizeof(struct sk_msg_md))
return false;
if (off % size != 0)
return false;
- if (size != sizeof(__u64))
- return false;
return true;
}
@@ -5095,6 +6130,23 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type,
*insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg,
SK_FL_PROTO_SHIFT);
break;
+
+ case offsetof(struct bpf_sock_addr, msg_src_ip4):
+ /* Treat t_ctx as struct in_addr for msg_src_ip4. */
+ SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
+ struct bpf_sock_addr_kern, struct in_addr, t_ctx,
+ s_addr, BPF_SIZE(si->code), 0, tmp_reg);
+ break;
+
+ case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
+ msg_src_ip6[3]):
+ off = si->off;
+ off -= offsetof(struct bpf_sock_addr, msg_src_ip6[0]);
+ /* Treat t_ctx as struct in6_addr for msg_src_ip6. */
+ SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
+ struct bpf_sock_addr_kern, struct in6_addr, t_ctx,
+ s6_addr32[0], BPF_SIZE(si->code), off, tmp_reg);
+ break;
}
return insn - insn_buf;
@@ -5152,7 +6204,8 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
break;
case offsetof(struct bpf_sock_ops, local_ip4):
- BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_rcv_saddr) != 4);
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
+ skc_rcv_saddr) != 4);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
struct bpf_sock_ops_kern, sk),
@@ -5469,6 +6522,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
struct bpf_prog *prog, u32 *target_size)
{
struct bpf_insn *insn = insn_buf;
+#if IS_ENABLED(CONFIG_IPV6)
+ int off;
+#endif
switch (si->off) {
case offsetof(struct sk_msg_md, data):
@@ -5481,6 +6537,107 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
si->dst_reg, si->src_reg,
offsetof(struct sk_msg_buff, data_end));
break;
+ case offsetof(struct sk_msg_md, family):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_family) != 2);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct sk_msg_buff, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_msg_buff, sk));
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common, skc_family));
+ break;
+
+ case offsetof(struct sk_msg_md, remote_ip4):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_daddr) != 4);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct sk_msg_buff, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_msg_buff, sk));
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common, skc_daddr));
+ break;
+
+ case offsetof(struct sk_msg_md, local_ip4):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
+ skc_rcv_saddr) != 4);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct sk_msg_buff, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_msg_buff, sk));
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common,
+ skc_rcv_saddr));
+ break;
+
+ case offsetof(struct sk_msg_md, remote_ip6[0]) ...
+ offsetof(struct sk_msg_md, remote_ip6[3]):
+#if IS_ENABLED(CONFIG_IPV6)
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
+ skc_v6_daddr.s6_addr32[0]) != 4);
+
+ off = si->off;
+ off -= offsetof(struct sk_msg_md, remote_ip6[0]);
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct sk_msg_buff, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_msg_buff, sk));
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common,
+ skc_v6_daddr.s6_addr32[0]) +
+ off);
+#else
+ *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
+#endif
+ break;
+
+ case offsetof(struct sk_msg_md, local_ip6[0]) ...
+ offsetof(struct sk_msg_md, local_ip6[3]):
+#if IS_ENABLED(CONFIG_IPV6)
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
+ skc_v6_rcv_saddr.s6_addr32[0]) != 4);
+
+ off = si->off;
+ off -= offsetof(struct sk_msg_md, local_ip6[0]);
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct sk_msg_buff, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_msg_buff, sk));
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common,
+ skc_v6_rcv_saddr.s6_addr32[0]) +
+ off);
+#else
+ *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
+#endif
+ break;
+
+ case offsetof(struct sk_msg_md, remote_port):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_dport) != 2);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct sk_msg_buff, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_msg_buff, sk));
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common, skc_dport));
+#ifndef __BIG_ENDIAN_BITFIELD
+ *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16);
+#endif
+ break;
+
+ case offsetof(struct sk_msg_md, local_port):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_num) != 2);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct sk_msg_buff, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_msg_buff, sk));
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common, skc_num));
+ break;
}
return insn - insn_buf;
@@ -5490,6 +6647,7 @@ const struct bpf_verifier_ops sk_filter_verifier_ops = {
.get_func_proto = sk_filter_func_proto,
.is_valid_access = sk_filter_is_valid_access,
.convert_ctx_access = bpf_convert_ctx_access,
+ .gen_ld_abs = bpf_gen_ld_abs,
};
const struct bpf_prog_ops sk_filter_prog_ops = {
@@ -5501,6 +6659,7 @@ const struct bpf_verifier_ops tc_cls_act_verifier_ops = {
.is_valid_access = tc_cls_act_is_valid_access,
.convert_ctx_access = tc_cls_act_convert_ctx_access,
.gen_prologue = tc_cls_act_prologue,
+ .gen_ld_abs = bpf_gen_ld_abs,
};
const struct bpf_prog_ops tc_cls_act_prog_ops = {
@@ -5527,13 +6686,23 @@ const struct bpf_prog_ops cg_skb_prog_ops = {
.test_run = bpf_prog_test_run_skb,
};
-const struct bpf_verifier_ops lwt_inout_verifier_ops = {
- .get_func_proto = lwt_inout_func_proto,
+const struct bpf_verifier_ops lwt_in_verifier_ops = {
+ .get_func_proto = lwt_in_func_proto,
.is_valid_access = lwt_is_valid_access,
.convert_ctx_access = bpf_convert_ctx_access,
};
-const struct bpf_prog_ops lwt_inout_prog_ops = {
+const struct bpf_prog_ops lwt_in_prog_ops = {
+ .test_run = bpf_prog_test_run_skb,
+};
+
+const struct bpf_verifier_ops lwt_out_verifier_ops = {
+ .get_func_proto = lwt_out_func_proto,
+ .is_valid_access = lwt_is_valid_access,
+ .convert_ctx_access = bpf_convert_ctx_access,
+};
+
+const struct bpf_prog_ops lwt_out_prog_ops = {
.test_run = bpf_prog_test_run_skb,
};
@@ -5548,6 +6717,16 @@ const struct bpf_prog_ops lwt_xmit_prog_ops = {
.test_run = bpf_prog_test_run_skb,
};
+const struct bpf_verifier_ops lwt_seg6local_verifier_ops = {
+ .get_func_proto = lwt_seg6local_func_proto,
+ .is_valid_access = lwt_is_valid_access,
+ .convert_ctx_access = bpf_convert_ctx_access,
+};
+
+const struct bpf_prog_ops lwt_seg6local_prog_ops = {
+ .test_run = bpf_prog_test_run_skb,
+};
+
const struct bpf_verifier_ops cg_sock_verifier_ops = {
.get_func_proto = sock_filter_func_proto,
.is_valid_access = sock_filter_is_valid_access,
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index d29f09bc5ff90..53f96e4f7bf59 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -1253,7 +1253,7 @@ __u32 skb_get_hash_perturb(const struct sk_buff *skb, u32 perturb)
EXPORT_SYMBOL(skb_get_hash_perturb);
u32 __skb_get_poff(const struct sk_buff *skb, void *data,
- const struct flow_keys *keys, int hlen)
+ const struct flow_keys_basic *keys, int hlen)
{
u32 poff = keys->control.thoff;
@@ -1314,9 +1314,9 @@ u32 __skb_get_poff(const struct sk_buff *skb, void *data,
*/
u32 skb_get_poff(const struct sk_buff *skb)
{
- struct flow_keys keys;
+ struct flow_keys_basic keys;
- if (!skb_flow_dissect_flow_keys(skb, &keys, 0))
+ if (!skb_flow_dissect_flow_keys_basic(skb, &keys, NULL, 0, 0, 0, 0))
return 0;
return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb));
@@ -1334,7 +1334,7 @@ __u32 __get_hash_from_flowi6(const struct flowi6 *fl6, struct flow_keys *keys)
keys->ports.src = fl6->fl6_sport;
keys->ports.dst = fl6->fl6_dport;
keys->keyid.keyid = fl6->fl6_gre_key;
- keys->tags.flow_label = (__force u32)fl6->flowlabel;
+ keys->tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
keys->basic.ip_proto = fl6->flowi6_proto;
return flow_hash_from_keys(keys);
@@ -1403,7 +1403,7 @@ static const struct flow_dissector_key flow_keys_dissector_symmetric_keys[] = {
},
};
-static const struct flow_dissector_key flow_keys_buf_dissector_keys[] = {
+static const struct flow_dissector_key flow_keys_basic_dissector_keys[] = {
{
.key_id = FLOW_DISSECTOR_KEY_CONTROL,
.offset = offsetof(struct flow_keys, control),
@@ -1417,7 +1417,8 @@ static const struct flow_dissector_key flow_keys_buf_dissector_keys[] = {
struct flow_dissector flow_keys_dissector __read_mostly;
EXPORT_SYMBOL(flow_keys_dissector);
-struct flow_dissector flow_keys_buf_dissector __read_mostly;
+struct flow_dissector flow_keys_basic_dissector __read_mostly;
+EXPORT_SYMBOL(flow_keys_basic_dissector);
static int __init init_default_flow_dissectors(void)
{
@@ -1427,9 +1428,9 @@ static int __init init_default_flow_dissectors(void)
skb_flow_dissector_init(&flow_keys_dissector_symmetric,
flow_keys_dissector_symmetric_keys,
ARRAY_SIZE(flow_keys_dissector_symmetric_keys));
- skb_flow_dissector_init(&flow_keys_buf_dissector,
- flow_keys_buf_dissector_keys,
- ARRAY_SIZE(flow_keys_buf_dissector_keys));
+ skb_flow_dissector_init(&flow_keys_basic_dissector,
+ flow_keys_basic_dissector_keys,
+ ARRAY_SIZE(flow_keys_basic_dissector_keys));
return 0;
}
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 1fb43bff417d7..a7a9c3d738ba8 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -820,7 +820,8 @@ static void neigh_periodic_work(struct work_struct *work)
write_lock(&n->lock);
state = n->nud_state;
- if (state & (NUD_PERMANENT | NUD_IN_TIMER)) {
+ if ((state & (NUD_PERMANENT | NUD_IN_TIMER)) ||
+ (n->flags & NTF_EXT_LEARNED)) {
write_unlock(&n->lock);
goto next_elt;
}
@@ -1136,6 +1137,8 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
if (neigh->dead)
goto out;
+ neigh_update_ext_learned(neigh, flags, &notify);
+
if (!(new & NUD_VALID)) {
neigh_del_timer(neigh);
if (old & NUD_CONNECTED)
@@ -1781,6 +1784,9 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh,
flags &= ~NEIGH_UPDATE_F_OVERRIDE;
}
+ if (ndm->ndm_flags & NTF_EXT_LEARNED)
+ flags |= NEIGH_UPDATE_F_EXT_LEARNED;
+
if (ndm->ndm_flags & NTF_USE) {
neigh_event_send(neigh, NULL);
err = 0;
diff --git a/net/core/net-traces.c b/net/core/net-traces.c
index 380934580fa14..419af6dfe29f4 100644
--- a/net/core/net-traces.c
+++ b/net/core/net-traces.c
@@ -35,10 +35,6 @@
#include <trace/events/tcp.h>
#include <trace/events/fib.h>
#include <trace/events/qdisc.h>
-#if IS_ENABLED(CONFIG_IPV6)
-#include <trace/events/fib6.h>
-EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
-#endif
#if IS_ENABLED(CONFIG_BRIDGE)
#include <trace/events/bridge.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_add);
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
new file mode 100644
index 0000000000000..68bf072067442
--- /dev/null
+++ b/net/core/page_pool.c
@@ -0,0 +1,317 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * page_pool.c
+ * Author: Jesper Dangaard Brouer <netoptimizer@brouer.com>
+ * Copyright (C) 2016 Red Hat, Inc.
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+
+#include <net/page_pool.h>
+#include <linux/dma-direction.h>
+#include <linux/dma-mapping.h>
+#include <linux/page-flags.h>
+#include <linux/mm.h> /* for __put_page() */
+
+static int page_pool_init(struct page_pool *pool,
+ const struct page_pool_params *params)
+{
+ unsigned int ring_qsize = 1024; /* Default */
+
+ memcpy(&pool->p, params, sizeof(pool->p));
+
+ /* Validate only known flags were used */
+ if (pool->p.flags & ~(PP_FLAG_ALL))
+ return -EINVAL;
+
+ if (pool->p.pool_size)
+ ring_qsize = pool->p.pool_size;
+
+ /* Sanity limit mem that can be pinned down */
+ if (ring_qsize > 32768)
+ return -E2BIG;
+
+ /* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL.
+ * DMA_BIDIRECTIONAL is for allowing page used for DMA sending,
+ * which is the XDP_TX use-case.
+ */
+ if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
+ (pool->p.dma_dir != DMA_BIDIRECTIONAL))
+ return -EINVAL;
+
+ if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0)
+ return -ENOMEM;
+
+ return 0;
+}
+
+struct page_pool *page_pool_create(const struct page_pool_params *params)
+{
+ struct page_pool *pool;
+ int err = 0;
+
+ pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid);
+ if (!pool)
+ return ERR_PTR(-ENOMEM);
+
+ err = page_pool_init(pool, params);
+ if (err < 0) {
+ pr_warn("%s() gave up with errno %d\n", __func__, err);
+ kfree(pool);
+ return ERR_PTR(err);
+ }
+ return pool;
+}
+EXPORT_SYMBOL(page_pool_create);
+
+/* fast path */
+static struct page *__page_pool_get_cached(struct page_pool *pool)
+{
+ struct ptr_ring *r = &pool->ring;
+ struct page *page;
+
+ /* Quicker fallback, avoid locks when ring is empty */
+ if (__ptr_ring_empty(r))
+ return NULL;
+
+ /* Test for safe-context, caller should provide this guarantee */
+ if (likely(in_serving_softirq())) {
+ if (likely(pool->alloc.count)) {
+ /* Fast-path */
+ page = pool->alloc.cache[--pool->alloc.count];
+ return page;
+ }
+ /* Slower-path: Alloc array empty, time to refill
+ *
+ * Open-coded bulk ptr_ring consumer.
+ *
+ * Discussion: the ring consumer lock is not really
+ * needed due to the softirq/NAPI protection, but
+ * later need the ability to reclaim pages on the
+ * ring. Thus, keeping the locks.
+ */
+ spin_lock(&r->consumer_lock);
+ while ((page = __ptr_ring_consume(r))) {
+ if (pool->alloc.count == PP_ALLOC_CACHE_REFILL)
+ break;
+ pool->alloc.cache[pool->alloc.count++] = page;
+ }
+ spin_unlock(&r->consumer_lock);
+ return page;
+ }
+
+ /* Slow-path: Get page from locked ring queue */
+ page = ptr_ring_consume(&pool->ring);
+ return page;
+}
+
+/* slow path */
+noinline
+static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
+ gfp_t _gfp)
+{
+ struct page *page;
+ gfp_t gfp = _gfp;
+ dma_addr_t dma;
+
+ /* We could always set __GFP_COMP, and avoid this branch, as
+ * prep_new_page() can handle order-0 with __GFP_COMP.
+ */
+ if (pool->p.order)
+ gfp |= __GFP_COMP;
+
+ /* FUTURE development:
+ *
+ * Current slow-path essentially falls back to single page
+ * allocations, which doesn't improve performance. This code
+ * need bulk allocation support from the page allocator code.
+ */
+
+ /* Cache was empty, do real allocation */
+ page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
+ if (!page)
+ return NULL;
+
+ if (!(pool->p.flags & PP_FLAG_DMA_MAP))
+ goto skip_dma_map;
+
+ /* Setup DMA mapping: use page->private for DMA-addr
+ * This mapping is kept for lifetime of page, until leaving pool.
+ */
+ dma = dma_map_page(pool->p.dev, page, 0,
+ (PAGE_SIZE << pool->p.order),
+ pool->p.dma_dir);
+ if (dma_mapping_error(pool->p.dev, dma)) {
+ put_page(page);
+ return NULL;
+ }
+ set_page_private(page, dma); /* page->private = dma; */
+
+skip_dma_map:
+ /* When page just alloc'ed is should/must have refcnt 1. */
+ return page;
+}
+
+/* For using page_pool replace: alloc_pages() API calls, but provide
+ * synchronization guarantee for allocation side.
+ */
+struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
+{
+ struct page *page;
+
+ /* Fast-path: Get a page from cache */
+ page = __page_pool_get_cached(pool);
+ if (page)
+ return page;
+
+ /* Slow-path: cache empty, do real allocation */
+ page = __page_pool_alloc_pages_slow(pool, gfp);
+ return page;
+}
+EXPORT_SYMBOL(page_pool_alloc_pages);
+
+/* Cleanup page_pool state from page */
+static void __page_pool_clean_page(struct page_pool *pool,
+ struct page *page)
+{
+ if (!(pool->p.flags & PP_FLAG_DMA_MAP))
+ return;
+
+ /* DMA unmap */
+ dma_unmap_page(pool->p.dev, page_private(page),
+ PAGE_SIZE << pool->p.order, pool->p.dma_dir);
+ set_page_private(page, 0);
+}
+
+/* Return a page to the page allocator, cleaning up our state */
+static void __page_pool_return_page(struct page_pool *pool, struct page *page)
+{
+ __page_pool_clean_page(pool, page);
+ put_page(page);
+ /* An optimization would be to call __free_pages(page, pool->p.order)
+ * knowing page is not part of page-cache (thus avoiding a
+ * __page_cache_release() call).
+ */
+}
+
+static bool __page_pool_recycle_into_ring(struct page_pool *pool,
+ struct page *page)
+{
+ int ret;
+ /* BH protection not needed if current is serving softirq */
+ if (in_serving_softirq())
+ ret = ptr_ring_produce(&pool->ring, page);
+ else
+ ret = ptr_ring_produce_bh(&pool->ring, page);
+
+ return (ret == 0) ? true : false;
+}
+
+/* Only allow direct recycling in special circumstances, into the
+ * alloc side cache. E.g. during RX-NAPI processing for XDP_DROP use-case.
+ *
+ * Caller must provide appropriate safe context.
+ */
+static bool __page_pool_recycle_direct(struct page *page,
+ struct page_pool *pool)
+{
+ if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE))
+ return false;
+
+ /* Caller MUST have verified/know (page_ref_count(page) == 1) */
+ pool->alloc.cache[pool->alloc.count++] = page;
+ return true;
+}
+
+void __page_pool_put_page(struct page_pool *pool,
+ struct page *page, bool allow_direct)
+{
+ /* This allocator is optimized for the XDP mode that uses
+ * one-frame-per-page, but have fallbacks that act like the
+ * regular page allocator APIs.
+ *
+ * refcnt == 1 means page_pool owns page, and can recycle it.
+ */
+ if (likely(page_ref_count(page) == 1)) {
+ /* Read barrier done in page_ref_count / READ_ONCE */
+
+ if (allow_direct && in_serving_softirq())
+ if (__page_pool_recycle_direct(page, pool))
+ return;
+
+ if (!__page_pool_recycle_into_ring(pool, page)) {
+ /* Cache full, fallback to free pages */
+ __page_pool_return_page(pool, page);
+ }
+ return;
+ }
+ /* Fallback/non-XDP mode: API user have elevated refcnt.
+ *
+ * Many drivers split up the page into fragments, and some
+ * want to keep doing this to save memory and do refcnt based
+ * recycling. Support this use case too, to ease drivers
+ * switching between XDP/non-XDP.
+ *
+ * In-case page_pool maintains the DMA mapping, API user must
+ * call page_pool_put_page once. In this elevated refcnt
+ * case, the DMA is unmapped/released, as driver is likely
+ * doing refcnt based recycle tricks, meaning another process
+ * will be invoking put_page.
+ */
+ __page_pool_clean_page(pool, page);
+ put_page(page);
+}
+EXPORT_SYMBOL(__page_pool_put_page);
+
+static void __page_pool_empty_ring(struct page_pool *pool)
+{
+ struct page *page;
+
+ /* Empty recycle ring */
+ while ((page = ptr_ring_consume(&pool->ring))) {
+ /* Verify the refcnt invariant of cached pages */
+ if (!(page_ref_count(page) == 1))
+ pr_crit("%s() page_pool refcnt %d violation\n",
+ __func__, page_ref_count(page));
+
+ __page_pool_return_page(pool, page);
+ }
+}
+
+static void __page_pool_destroy_rcu(struct rcu_head *rcu)
+{
+ struct page_pool *pool;
+
+ pool = container_of(rcu, struct page_pool, rcu);
+
+ WARN(pool->alloc.count, "API usage violation");
+
+ __page_pool_empty_ring(pool);
+ ptr_ring_cleanup(&pool->ring, NULL);
+ kfree(pool);
+}
+
+/* Cleanup and release resources */
+void page_pool_destroy(struct page_pool *pool)
+{
+ struct page *page;
+
+ /* Empty alloc cache, assume caller made sure this is
+ * no-longer in use, and page_pool_alloc_pages() cannot be
+ * call concurrently.
+ */
+ while (pool->alloc.count) {
+ page = pool->alloc.cache[--pool->alloc.count];
+ __page_pool_return_page(pool, page);
+ }
+
+ /* No more consumers should exist, but producers could still
+ * be in-flight.
+ */
+ __page_pool_empty_ring(pool);
+
+ /* An xdp_mem_allocator can still ref page_pool pointer */
+ call_rcu(&pool->rcu, __page_pool_destroy_rcu);
+}
+EXPORT_SYMBOL(page_pool_destroy);
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 45936922d7e23..5ef61222fdef1 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -59,6 +59,9 @@
#include <net/rtnetlink.h>
#include <net/net_namespace.h>
+#define RTNL_MAX_TYPE 48
+#define RTNL_SLAVE_MAX_TYPE 36
+
struct rtnl_link {
rtnl_doit_func doit;
rtnl_dumpit_func dumpit;
@@ -389,6 +392,11 @@ int rtnl_link_register(struct rtnl_link_ops *ops)
{
int err;
+ /* Sanity-check max sizes to avoid stack buffer overflow. */
+ if (WARN_ON(ops->maxtype > RTNL_MAX_TYPE ||
+ ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE))
+ return -EINVAL;
+
rtnl_lock();
err = __rtnl_link_register(ops);
rtnl_unlock();
@@ -785,13 +793,15 @@ int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id,
long expires, u32 error)
{
struct rta_cacheinfo ci = {
- .rta_lastuse = jiffies_delta_to_clock_t(jiffies - dst->lastuse),
- .rta_used = dst->__use,
- .rta_clntref = atomic_read(&(dst->__refcnt)),
.rta_error = error,
.rta_id = id,
};
+ if (dst) {
+ ci.rta_lastuse = jiffies_delta_to_clock_t(jiffies - dst->lastuse);
+ ci.rta_used = dst->__use;
+ ci.rta_clntref = atomic_read(&dst->__refcnt);
+ }
if (expires) {
unsigned long clock;
@@ -2256,6 +2266,10 @@ static int do_setlink(const struct sk_buff *skb,
const struct net_device_ops *ops = dev->netdev_ops;
int err;
+ err = validate_linkmsg(dev, tb);
+ if (err < 0)
+ return err;
+
if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD] || tb[IFLA_IF_NETNSID]) {
struct net *net = rtnl_link_get_net_capable(skb, dev_net(dev),
tb, CAP_NET_ADMIN);
@@ -2619,10 +2633,6 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh,
goto errout;
}
- err = validate_linkmsg(dev, tb);
- if (err < 0)
- goto errout;
-
err = do_setlink(skb, dev, ifm, extack, tb, ifname, 0);
errout:
return err;
@@ -2900,13 +2910,16 @@ replay:
}
if (1) {
- struct nlattr *attr[ops ? ops->maxtype + 1 : 1];
- struct nlattr *slave_attr[m_ops ? m_ops->slave_maxtype + 1 : 1];
+ struct nlattr *attr[RTNL_MAX_TYPE + 1];
+ struct nlattr *slave_attr[RTNL_SLAVE_MAX_TYPE + 1];
struct nlattr **data = NULL;
struct nlattr **slave_data = NULL;
struct net *dest_net, *link_net = NULL;
if (ops) {
+ if (ops->maxtype > RTNL_MAX_TYPE)
+ return -EINVAL;
+
if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) {
err = nla_parse_nested(attr, ops->maxtype,
linkinfo[IFLA_INFO_DATA],
@@ -2923,6 +2936,9 @@ replay:
}
if (m_ops) {
+ if (m_ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE)
+ return -EINVAL;
+
if (m_ops->slave_maxtype &&
linkinfo[IFLA_INFO_SLAVE_DATA]) {
err = nla_parse_nested(slave_attr,
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 345b51837ca80..c642304f178ce 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1305,7 +1305,7 @@ static void skb_headers_offset_update(struct sk_buff *skb, int off)
skb->inner_mac_header += off;
}
-static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
+void skb_copy_header(struct sk_buff *new, const struct sk_buff *old)
{
__copy_skb_header(new, old);
@@ -1313,6 +1313,7 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs;
skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type;
}
+EXPORT_SYMBOL(skb_copy_header);
static inline int skb_alloc_rx_flag(const struct sk_buff *skb)
{
@@ -1355,7 +1356,7 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
BUG_ON(skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len));
- copy_skb_header(n, skb);
+ skb_copy_header(n, skb);
return n;
}
EXPORT_SYMBOL(skb_copy);
@@ -1419,7 +1420,7 @@ struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom,
skb_clone_fraglist(n);
}
- copy_skb_header(n, skb);
+ skb_copy_header(n, skb);
out:
return n;
}
@@ -1599,7 +1600,7 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
BUG_ON(skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off,
skb->len + head_copy_len));
- copy_skb_header(n, skb);
+ skb_copy_header(n, skb);
skb_headers_offset_update(n, newheadroom - oldheadroom);
@@ -1839,6 +1840,20 @@ done:
}
EXPORT_SYMBOL(___pskb_trim);
+/* Note : use pskb_trim_rcsum() instead of calling this directly
+ */
+int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len)
+{
+ if (skb->ip_summed == CHECKSUM_COMPLETE) {
+ int delta = skb->len - len;
+
+ skb->csum = csum_sub(skb->csum,
+ skb_checksum(skb, len, delta, 0));
+ }
+ return __pskb_trim(skb, len);
+}
+EXPORT_SYMBOL(pskb_trim_rcsum_slow);
+
/**
* __pskb_pull_tail - advance tail of skb header
* @skb: buffer to reallocate
@@ -4926,6 +4941,8 @@ static unsigned int skb_gso_transport_seglen(const struct sk_buff *skb)
thlen = tcp_hdrlen(skb);
} else if (unlikely(skb_is_gso_sctp(skb))) {
thlen = sizeof(struct sctphdr);
+ } else if (shinfo->gso_type & SKB_GSO_UDP_L4) {
+ thlen = sizeof(struct udphdr);
}
/* UFO sets gso_size to the size of the fragmentation
* payload, i.e. the size of the L4 (UDP) header is already
diff --git a/net/core/sock.c b/net/core/sock.c
index 2aed99a541d5d..f333d75ef1a9c 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -226,7 +226,8 @@ static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \
x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \
x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \
- x "AF_QIPCRTR", x "AF_SMC" , x "AF_MAX"
+ x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \
+ x "AF_MAX"
static const char *const af_family_key_strings[AF_MAX+1] = {
_sock_locks("sk_lock-")
@@ -262,7 +263,8 @@ static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
"rlock-AF_RXRPC" , "rlock-AF_ISDN" , "rlock-AF_PHONET" ,
"rlock-AF_IEEE802154", "rlock-AF_CAIF" , "rlock-AF_ALG" ,
"rlock-AF_NFC" , "rlock-AF_VSOCK" , "rlock-AF_KCM" ,
- "rlock-AF_QIPCRTR", "rlock-AF_SMC" , "rlock-AF_MAX"
+ "rlock-AF_QIPCRTR", "rlock-AF_SMC" , "rlock-AF_XDP" ,
+ "rlock-AF_MAX"
};
static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
"wlock-AF_UNSPEC", "wlock-AF_UNIX" , "wlock-AF_INET" ,
@@ -279,7 +281,8 @@ static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
"wlock-AF_RXRPC" , "wlock-AF_ISDN" , "wlock-AF_PHONET" ,
"wlock-AF_IEEE802154", "wlock-AF_CAIF" , "wlock-AF_ALG" ,
"wlock-AF_NFC" , "wlock-AF_VSOCK" , "wlock-AF_KCM" ,
- "wlock-AF_QIPCRTR", "wlock-AF_SMC" , "wlock-AF_MAX"
+ "wlock-AF_QIPCRTR", "wlock-AF_SMC" , "wlock-AF_XDP" ,
+ "wlock-AF_MAX"
};
static const char *const af_family_elock_key_strings[AF_MAX+1] = {
"elock-AF_UNSPEC", "elock-AF_UNIX" , "elock-AF_INET" ,
@@ -296,7 +299,8 @@ static const char *const af_family_elock_key_strings[AF_MAX+1] = {
"elock-AF_RXRPC" , "elock-AF_ISDN" , "elock-AF_PHONET" ,
"elock-AF_IEEE802154", "elock-AF_CAIF" , "elock-AF_ALG" ,
"elock-AF_NFC" , "elock-AF_VSOCK" , "elock-AF_KCM" ,
- "elock-AF_QIPCRTR", "elock-AF_SMC" , "elock-AF_MAX"
+ "elock-AF_QIPCRTR", "elock-AF_SMC" , "elock-AF_XDP" ,
+ "elock-AF_MAX"
};
/*
@@ -323,8 +327,8 @@ EXPORT_SYMBOL(sysctl_optmem_max);
int sysctl_tstamp_allow_data __read_mostly = 1;
-struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
-EXPORT_SYMBOL_GPL(memalloc_socks);
+DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
+EXPORT_SYMBOL_GPL(memalloc_socks_key);
/**
* sk_set_memalloc - sets %SOCK_MEMALLOC
@@ -338,7 +342,7 @@ void sk_set_memalloc(struct sock *sk)
{
sock_set_flag(sk, SOCK_MEMALLOC);
sk->sk_allocation |= __GFP_MEMALLOC;
- static_key_slow_inc(&memalloc_socks);
+ static_branch_inc(&memalloc_socks_key);
}
EXPORT_SYMBOL_GPL(sk_set_memalloc);
@@ -346,7 +350,7 @@ void sk_clear_memalloc(struct sock *sk)
{
sock_reset_flag(sk, SOCK_MEMALLOC);
sk->sk_allocation &= ~__GFP_MEMALLOC;
- static_key_slow_dec(&memalloc_socks);
+ static_branch_dec(&memalloc_socks_key);
/*
* SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
@@ -724,9 +728,22 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
sock_valbool_flag(sk, SOCK_DBG, valbool);
break;
case SO_REUSEADDR:
- sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
+ val = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
+ if ((sk->sk_family == PF_INET || sk->sk_family == PF_INET6) &&
+ inet_sk(sk)->inet_num &&
+ (sk->sk_reuse != val)) {
+ ret = (sk->sk_state == TCP_ESTABLISHED) ? -EISCONN : -EUCLEAN;
+ break;
+ }
+ sk->sk_reuse = val;
break;
case SO_REUSEPORT:
+ if ((sk->sk_family == PF_INET || sk->sk_family == PF_INET6) &&
+ inet_sk(sk)->inet_num &&
+ (sk->sk_reuseport != valbool)) {
+ ret = (sk->sk_state == TCP_ESTABLISHED) ? -EISCONN : -EUCLEAN;
+ break;
+ }
sk->sk_reuseport = valbool;
break;
case SO_TYPE:
@@ -905,7 +922,10 @@ set_rcvbuf:
case SO_RCVLOWAT:
if (val < 0)
val = INT_MAX;
- sk->sk_rcvlowat = val ? : 1;
+ if (sock->ops->set_rcvlowat)
+ ret = sock->ops->set_rcvlowat(sk, val);
+ else
+ sk->sk_rcvlowat = val ? : 1;
break;
case SO_RCVTIMEO:
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 097a0f74e0048..9d1f22072d5d5 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -5,6 +5,10 @@
*/
#include <linux/types.h>
#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/idr.h>
+#include <linux/rhashtable.h>
+#include <net/page_pool.h>
#include <net/xdp.h>
@@ -13,6 +17,105 @@
#define REG_STATE_UNREGISTERED 0x2
#define REG_STATE_UNUSED 0x3
+static DEFINE_IDA(mem_id_pool);
+static DEFINE_MUTEX(mem_id_lock);
+#define MEM_ID_MAX 0xFFFE
+#define MEM_ID_MIN 1
+static int mem_id_next = MEM_ID_MIN;
+
+static bool mem_id_init; /* false */
+static struct rhashtable *mem_id_ht;
+
+struct xdp_mem_allocator {
+ struct xdp_mem_info mem;
+ union {
+ void *allocator;
+ struct page_pool *page_pool;
+ struct zero_copy_allocator *zc_alloc;
+ };
+ struct rhash_head node;
+ struct rcu_head rcu;
+};
+
+static u32 xdp_mem_id_hashfn(const void *data, u32 len, u32 seed)
+{
+ const u32 *k = data;
+ const u32 key = *k;
+
+ BUILD_BUG_ON(FIELD_SIZEOF(struct xdp_mem_allocator, mem.id)
+ != sizeof(u32));
+
+ /* Use cyclic increasing ID as direct hash key, see rht_bucket_index */
+ return key << RHT_HASH_RESERVED_SPACE;
+}
+
+static int xdp_mem_id_cmp(struct rhashtable_compare_arg *arg,
+ const void *ptr)
+{
+ const struct xdp_mem_allocator *xa = ptr;
+ u32 mem_id = *(u32 *)arg->key;
+
+ return xa->mem.id != mem_id;
+}
+
+static const struct rhashtable_params mem_id_rht_params = {
+ .nelem_hint = 64,
+ .head_offset = offsetof(struct xdp_mem_allocator, node),
+ .key_offset = offsetof(struct xdp_mem_allocator, mem.id),
+ .key_len = FIELD_SIZEOF(struct xdp_mem_allocator, mem.id),
+ .max_size = MEM_ID_MAX,
+ .min_size = 8,
+ .automatic_shrinking = true,
+ .hashfn = xdp_mem_id_hashfn,
+ .obj_cmpfn = xdp_mem_id_cmp,
+};
+
+static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu)
+{
+ struct xdp_mem_allocator *xa;
+
+ xa = container_of(rcu, struct xdp_mem_allocator, rcu);
+
+ /* Allow this ID to be reused */
+ ida_simple_remove(&mem_id_pool, xa->mem.id);
+
+ /* Notice, driver is expected to free the *allocator,
+ * e.g. page_pool, and MUST also use RCU free.
+ */
+
+ /* Poison memory */
+ xa->mem.id = 0xFFFF;
+ xa->mem.type = 0xF0F0;
+ xa->allocator = (void *)0xDEAD9001;
+
+ kfree(xa);
+}
+
+static void __xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
+{
+ struct xdp_mem_allocator *xa;
+ int id = xdp_rxq->mem.id;
+ int err;
+
+ if (id == 0)
+ return;
+
+ mutex_lock(&mem_id_lock);
+
+ xa = rhashtable_lookup(mem_id_ht, &id, mem_id_rht_params);
+ if (!xa) {
+ mutex_unlock(&mem_id_lock);
+ return;
+ }
+
+ err = rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params);
+ WARN_ON(err);
+
+ call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free);
+
+ mutex_unlock(&mem_id_lock);
+}
+
void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq)
{
/* Simplify driver cleanup code paths, allow unreg "unused" */
@@ -21,8 +124,14 @@ void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq)
WARN(!(xdp_rxq->reg_state == REG_STATE_REGISTERED), "Driver BUG");
+ __xdp_rxq_info_unreg_mem_model(xdp_rxq);
+
xdp_rxq->reg_state = REG_STATE_UNREGISTERED;
xdp_rxq->dev = NULL;
+
+ /* Reset mem info to defaults */
+ xdp_rxq->mem.id = 0;
+ xdp_rxq->mem.type = 0;
}
EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg);
@@ -71,3 +180,193 @@ bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq)
return (xdp_rxq->reg_state == REG_STATE_REGISTERED);
}
EXPORT_SYMBOL_GPL(xdp_rxq_info_is_reg);
+
+static int __mem_id_init_hash_table(void)
+{
+ struct rhashtable *rht;
+ int ret;
+
+ if (unlikely(mem_id_init))
+ return 0;
+
+ rht = kzalloc(sizeof(*rht), GFP_KERNEL);
+ if (!rht)
+ return -ENOMEM;
+
+ ret = rhashtable_init(rht, &mem_id_rht_params);
+ if (ret < 0) {
+ kfree(rht);
+ return ret;
+ }
+ mem_id_ht = rht;
+ smp_mb(); /* mutex lock should provide enough pairing */
+ mem_id_init = true;
+
+ return 0;
+}
+
+/* Allocate a cyclic ID that maps to allocator pointer.
+ * See: https://www.kernel.org/doc/html/latest/core-api/idr.html
+ *
+ * Caller must lock mem_id_lock.
+ */
+static int __mem_id_cyclic_get(gfp_t gfp)
+{
+ int retries = 1;
+ int id;
+
+again:
+ id = ida_simple_get(&mem_id_pool, mem_id_next, MEM_ID_MAX, gfp);
+ if (id < 0) {
+ if (id == -ENOSPC) {
+ /* Cyclic allocator, reset next id */
+ if (retries--) {
+ mem_id_next = MEM_ID_MIN;
+ goto again;
+ }
+ }
+ return id; /* errno */
+ }
+ mem_id_next = id + 1;
+
+ return id;
+}
+
+static bool __is_supported_mem_type(enum xdp_mem_type type)
+{
+ if (type == MEM_TYPE_PAGE_POOL)
+ return is_page_pool_compiled_in();
+
+ if (type >= MEM_TYPE_MAX)
+ return false;
+
+ return true;
+}
+
+int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
+ enum xdp_mem_type type, void *allocator)
+{
+ struct xdp_mem_allocator *xdp_alloc;
+ gfp_t gfp = GFP_KERNEL;
+ int id, errno, ret;
+ void *ptr;
+
+ if (xdp_rxq->reg_state != REG_STATE_REGISTERED) {
+ WARN(1, "Missing register, driver bug");
+ return -EFAULT;
+ }
+
+ if (!__is_supported_mem_type(type))
+ return -EOPNOTSUPP;
+
+ xdp_rxq->mem.type = type;
+
+ if (!allocator) {
+ if (type == MEM_TYPE_PAGE_POOL || type == MEM_TYPE_ZERO_COPY)
+ return -EINVAL; /* Setup time check page_pool req */
+ return 0;
+ }
+
+ /* Delay init of rhashtable to save memory if feature isn't used */
+ if (!mem_id_init) {
+ mutex_lock(&mem_id_lock);
+ ret = __mem_id_init_hash_table();
+ mutex_unlock(&mem_id_lock);
+ if (ret < 0) {
+ WARN_ON(1);
+ return ret;
+ }
+ }
+
+ xdp_alloc = kzalloc(sizeof(*xdp_alloc), gfp);
+ if (!xdp_alloc)
+ return -ENOMEM;
+
+ mutex_lock(&mem_id_lock);
+ id = __mem_id_cyclic_get(gfp);
+ if (id < 0) {
+ errno = id;
+ goto err;
+ }
+ xdp_rxq->mem.id = id;
+ xdp_alloc->mem = xdp_rxq->mem;
+ xdp_alloc->allocator = allocator;
+
+ /* Insert allocator into ID lookup table */
+ ptr = rhashtable_insert_slow(mem_id_ht, &id, &xdp_alloc->node);
+ if (IS_ERR(ptr)) {
+ errno = PTR_ERR(ptr);
+ goto err;
+ }
+
+ mutex_unlock(&mem_id_lock);
+
+ return 0;
+err:
+ mutex_unlock(&mem_id_lock);
+ kfree(xdp_alloc);
+ return errno;
+}
+EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model);
+
+/* XDP RX runs under NAPI protection, and in different delivery error
+ * scenarios (e.g. queue full), it is possible to return the xdp_frame
+ * while still leveraging this protection. The @napi_direct boolian
+ * is used for those calls sites. Thus, allowing for faster recycling
+ * of xdp_frames/pages in those cases.
+ */
+static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
+ unsigned long handle)
+{
+ struct xdp_mem_allocator *xa;
+ struct page *page;
+
+ switch (mem->type) {
+ case MEM_TYPE_PAGE_POOL:
+ rcu_read_lock();
+ /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */
+ xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
+ page = virt_to_head_page(data);
+ if (xa)
+ page_pool_put_page(xa->page_pool, page, napi_direct);
+ else
+ put_page(page);
+ rcu_read_unlock();
+ break;
+ case MEM_TYPE_PAGE_SHARED:
+ page_frag_free(data);
+ break;
+ case MEM_TYPE_PAGE_ORDER0:
+ page = virt_to_page(data); /* Assumes order0 page*/
+ put_page(page);
+ break;
+ case MEM_TYPE_ZERO_COPY:
+ /* NB! Only valid from an xdp_buff! */
+ rcu_read_lock();
+ /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */
+ xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
+ xa->zc_alloc->free(xa->zc_alloc, handle);
+ rcu_read_unlock();
+ default:
+ /* Not possible, checked in xdp_rxq_info_reg_mem_model() */
+ break;
+ }
+}
+
+void xdp_return_frame(struct xdp_frame *xdpf)
+{
+ __xdp_return(xdpf->data, &xdpf->mem, false, 0);
+}
+EXPORT_SYMBOL_GPL(xdp_return_frame);
+
+void xdp_return_frame_rx_napi(struct xdp_frame *xdpf)
+{
+ __xdp_return(xdpf->data, &xdpf->mem, true, 0);
+}
+EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi);
+
+void xdp_return_buff(struct xdp_buff *xdp)
+{
+ __xdp_return(xdp->data, &xdp->rxq->mem, true, xdp->handle);
+}
+EXPORT_SYMBOL_GPL(xdp_return_buff);