diff options
Diffstat (limited to 'net/netfilter/ipvs')
-rw-r--r-- | net/netfilter/ipvs/Kconfig | 37 | ||||
-rw-r--r-- | net/netfilter/ipvs/Makefile | 1 | ||||
-rw-r--r-- | net/netfilter/ipvs/ip_vs_app.c | 24 | ||||
-rw-r--r-- | net/netfilter/ipvs/ip_vs_ctl.c | 4 | ||||
-rw-r--r-- | net/netfilter/ipvs/ip_vs_dh.c | 3 | ||||
-rw-r--r-- | net/netfilter/ipvs/ip_vs_ftp.c | 467 | ||||
-rw-r--r-- | net/netfilter/ipvs/ip_vs_lblc.c | 4 | ||||
-rw-r--r-- | net/netfilter/ipvs/ip_vs_lblcr.c | 4 | ||||
-rw-r--r-- | net/netfilter/ipvs/ip_vs_mh.c | 540 | ||||
-rw-r--r-- | net/netfilter/ipvs/ip_vs_nfct.c | 101 | ||||
-rw-r--r-- | net/netfilter/ipvs/ip_vs_proto_sctp.c | 4 | ||||
-rw-r--r-- | net/netfilter/ipvs/ip_vs_proto_tcp.c | 8 | ||||
-rw-r--r-- | net/netfilter/ipvs/ip_vs_proto_udp.c | 4 | ||||
-rw-r--r-- | net/netfilter/ipvs/ip_vs_sh.c | 3 | ||||
-rw-r--r-- | net/netfilter/ipvs/ip_vs_xmit.c | 5 |
15 files changed, 971 insertions, 238 deletions
diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig index b32fb0dbe237d..05dc1b77e466f 100644 --- a/net/netfilter/ipvs/Kconfig +++ b/net/netfilter/ipvs/Kconfig @@ -225,6 +225,25 @@ config IP_VS_SH If you want to compile it in kernel, say Y. To compile it as a module, choose M here. If unsure, say N. +config IP_VS_MH + tristate "maglev hashing scheduling" + ---help--- + The maglev consistent hashing scheduling algorithm provides the + Google's Maglev hashing algorithm as a IPVS scheduler. It assigns + network connections to the servers through looking up a statically + assigned special hash table called the lookup table. Maglev hashing + is to assign a preference list of all the lookup table positions + to each destination. + + Through this operation, The maglev hashing gives an almost equal + share of the lookup table to each of the destinations and provides + minimal disruption by using the lookup table. When the set of + destinations changes, a connection will likely be sent to the same + destination as it was before. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + config IP_VS_SED tristate "shortest expected delay scheduling" ---help--- @@ -266,6 +285,24 @@ config IP_VS_SH_TAB_BITS needs to be large enough to effectively fit all the destinations multiplied by their respective weights. +comment 'IPVS MH scheduler' + +config IP_VS_MH_TAB_INDEX + int "IPVS maglev hashing table index of size (the prime numbers)" + range 8 17 + default 12 + ---help--- + The maglev hashing scheduler maps source IPs to destinations + stored in a hash table. This table is assigned by a preference + list of the positions to each destination until all slots in + the table are filled. The index determines the prime for size of + the table as 251, 509, 1021, 2039, 4093, 8191, 16381, 32749, + 65521 or 131071. When using weights to allow destinations to + receive more connections, the table is assigned an amount + proportional to the weights specified. The table needs to be large + enough to effectively fit all the destinations multiplied by their + respective weights. + comment 'IPVS application helper' config IP_VS_FTP diff --git a/net/netfilter/ipvs/Makefile b/net/netfilter/ipvs/Makefile index c552993fa4b91..bfce2677fda26 100644 --- a/net/netfilter/ipvs/Makefile +++ b/net/netfilter/ipvs/Makefile @@ -33,6 +33,7 @@ obj-$(CONFIG_IP_VS_LBLC) += ip_vs_lblc.o obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o +obj-$(CONFIG_IP_VS_MH) += ip_vs_mh.o obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c index c3db074fc1f73..7588aeaa605ff 100644 --- a/net/netfilter/ipvs/ip_vs_app.c +++ b/net/netfilter/ipvs/ip_vs_app.c @@ -355,7 +355,8 @@ static inline void vs_seq_update(struct ip_vs_conn *cp, struct ip_vs_seq *vseq, } static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb, - struct ip_vs_app *app) + struct ip_vs_app *app, + struct ip_vs_iphdr *ipvsh) { int diff; const unsigned int tcp_offset = ip_hdrlen(skb); @@ -386,7 +387,7 @@ static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb, if (app->pkt_out == NULL) return 1; - if (!app->pkt_out(app, cp, skb, &diff)) + if (!app->pkt_out(app, cp, skb, &diff, ipvsh)) return 0; /* @@ -404,7 +405,8 @@ static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb, * called by ipvs packet handler, assumes previously checked cp!=NULL * returns false if it can't handle packet (oom) */ -int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb) +int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb, + struct ip_vs_iphdr *ipvsh) { struct ip_vs_app *app; @@ -417,7 +419,7 @@ int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb) /* TCP is complicated */ if (cp->protocol == IPPROTO_TCP) - return app_tcp_pkt_out(cp, skb, app); + return app_tcp_pkt_out(cp, skb, app, ipvsh); /* * Call private output hook function @@ -425,12 +427,13 @@ int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb) if (app->pkt_out == NULL) return 1; - return app->pkt_out(app, cp, skb, NULL); + return app->pkt_out(app, cp, skb, NULL, ipvsh); } static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb, - struct ip_vs_app *app) + struct ip_vs_app *app, + struct ip_vs_iphdr *ipvsh) { int diff; const unsigned int tcp_offset = ip_hdrlen(skb); @@ -461,7 +464,7 @@ static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb, if (app->pkt_in == NULL) return 1; - if (!app->pkt_in(app, cp, skb, &diff)) + if (!app->pkt_in(app, cp, skb, &diff, ipvsh)) return 0; /* @@ -479,7 +482,8 @@ static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb, * called by ipvs packet handler, assumes previously checked cp!=NULL. * returns false if can't handle packet (oom). */ -int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb) +int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb, + struct ip_vs_iphdr *ipvsh) { struct ip_vs_app *app; @@ -492,7 +496,7 @@ int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb) /* TCP is complicated */ if (cp->protocol == IPPROTO_TCP) - return app_tcp_pkt_in(cp, skb, app); + return app_tcp_pkt_in(cp, skb, app, ipvsh); /* * Call private input hook function @@ -500,7 +504,7 @@ int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb) if (app->pkt_in == NULL) return 1; - return app->pkt_in(app, cp, skb, NULL); + return app->pkt_in(app, cp, skb, NULL, ipvsh); } diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 141b1509c9483..0c03c0e16a964 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -821,6 +821,10 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, if (add && udest->af != svc->af) ipvs->mixed_address_family_dests++; + /* keep the last_weight with latest non-0 weight */ + if (add || udest->weight != 0) + atomic_set(&dest->last_weight, udest->weight); + /* set the weight and the flags */ atomic_set(&dest->weight, udest->weight); conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK; diff --git a/net/netfilter/ipvs/ip_vs_dh.c b/net/netfilter/ipvs/ip_vs_dh.c index 75f798f8e83b7..07459e71d9072 100644 --- a/net/netfilter/ipvs/ip_vs_dh.c +++ b/net/netfilter/ipvs/ip_vs_dh.c @@ -43,6 +43,7 @@ #include <linux/module.h> #include <linux/kernel.h> #include <linux/skbuff.h> +#include <linux/hash.h> #include <net/ip_vs.h> @@ -81,7 +82,7 @@ static inline unsigned int ip_vs_dh_hashkey(int af, const union nf_inet_addr *ad addr_fold = addr->ip6[0]^addr->ip6[1]^ addr->ip6[2]^addr->ip6[3]; #endif - return (ntohl(addr_fold)*2654435761UL) & IP_VS_DH_TAB_MASK; + return hash_32(ntohl(addr_fold), IP_VS_DH_TAB_BITS); } diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c index 58d5d05aec24c..4398a72edec59 100644 --- a/net/netfilter/ipvs/ip_vs_ftp.c +++ b/net/netfilter/ipvs/ip_vs_ftp.c @@ -29,6 +29,8 @@ #include <linux/moduleparam.h> #include <linux/kernel.h> #include <linux/skbuff.h> +#include <linux/ctype.h> +#include <linux/inet.h> #include <linux/in.h> #include <linux/ip.h> #include <linux/netfilter.h> @@ -44,9 +46,18 @@ #include <net/ip_vs.h> -#define SERVER_STRING "227 " -#define CLIENT_STRING "PORT" +#define SERVER_STRING_PASV "227 " +#define CLIENT_STRING_PORT "PORT" +#define SERVER_STRING_EPSV "229 " +#define CLIENT_STRING_EPRT "EPRT" +enum { + IP_VS_FTP_ACTIVE = 0, + IP_VS_FTP_PORT = 0, + IP_VS_FTP_PASV, + IP_VS_FTP_EPRT, + IP_VS_FTP_EPSV, +}; /* * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper @@ -58,9 +69,15 @@ module_param_array(ports, ushort, &ports_count, 0444); MODULE_PARM_DESC(ports, "Ports to monitor for FTP control commands"); -/* Dummy variable */ -static int ip_vs_ftp_pasv; +static char *ip_vs_ftp_data_ptr(struct sk_buff *skb, struct ip_vs_iphdr *ipvsh) +{ + struct tcphdr *th = (struct tcphdr *)((char *)skb->data + ipvsh->len); + + if ((th->doff << 2) < sizeof(struct tcphdr)) + return NULL; + return (char *)th + (th->doff << 2); +} static int ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp) @@ -78,20 +95,20 @@ ip_vs_ftp_done_conn(struct ip_vs_app *app, struct ip_vs_conn *cp) } -/* - * Get <addr,port> from the string "xxx.xxx.xxx.xxx,ppp,ppp", started - * with the "pattern", ignoring before "skip" and terminated with - * the "term" character. - * <addr,port> is in network order. +/* Get <addr,port> from the string "xxx.xxx.xxx.xxx,ppp,ppp", started + * with the "pattern". <addr,port> is in network order. + * Parse extended format depending on ext. In this case addr can be pre-set. */ static int ip_vs_ftp_get_addrport(char *data, char *data_limit, const char *pattern, size_t plen, - char skip, char term, - __be32 *addr, __be16 *port, - char **start, char **end) + char skip, bool ext, int mode, + union nf_inet_addr *addr, __be16 *port, + __u16 af, char **start, char **end) { char *s, c; unsigned char p[6]; + char edelim; + __u16 hport; int i = 0; if (data_limit - data < plen) { @@ -113,6 +130,11 @@ static int ip_vs_ftp_get_addrport(char *data, char *data_limit, if (s == data_limit) return -1; if (!found) { + /* "(" is optional for non-extended format, + * so catch the start of IPv4 address + */ + if (!ext && isdigit(*s)) + break; if (*s == skip) found = 1; } else if (*s != skip) { @@ -120,41 +142,102 @@ static int ip_vs_ftp_get_addrport(char *data, char *data_limit, } } } + /* Old IPv4-only format? */ + if (!ext) { + p[0] = 0; + for (data = s; ; data++) { + if (data == data_limit) + return -1; + c = *data; + if (isdigit(c)) { + p[i] = p[i]*10 + c - '0'; + } else if (c == ',' && i < 5) { + i++; + p[i] = 0; + } else { + /* unexpected character or terminator */ + break; + } + } - for (data = s; ; data++) { - if (data == data_limit) + if (i != 5) return -1; - if (*data == term) - break; + + *start = s; + *end = data; + addr->ip = get_unaligned((__be32 *) p); + *port = get_unaligned((__be16 *) (p + 4)); + return 1; } - *end = data; + if (s == data_limit) + return -1; + *start = s; + edelim = *s++; + if (edelim < 33 || edelim > 126) + return -1; + if (s == data_limit) + return -1; + if (*s == edelim) { + /* Address family is usually missing for EPSV response */ + if (mode != IP_VS_FTP_EPSV) + return -1; + s++; + if (s == data_limit) + return -1; + /* Then address should be missing too */ + if (*s != edelim) + return -1; + /* Caller can pre-set addr, if needed */ + s++; + } else { + const char *ep; - memset(p, 0, sizeof(p)); - for (data = s; ; data++) { - c = *data; - if (c == term) - break; - if (c >= '0' && c <= '9') { - p[i] = p[i]*10 + c - '0'; - } else if (c == ',' && i < 5) { - i++; - } else { - /* unexpected character */ + /* We allow address only from same family */ + if (af == AF_INET6 && *s != '2') return -1; + if (af == AF_INET && *s != '1') + return -1; + s++; + if (s == data_limit) + return -1; + if (*s != edelim) + return -1; + s++; + if (s == data_limit) + return -1; + if (af == AF_INET6) { + if (in6_pton(s, data_limit - s, (u8 *)addr, edelim, + &ep) <= 0) + return -1; + } else { + if (in4_pton(s, data_limit - s, (u8 *)addr, edelim, + &ep) <= 0) + return -1; } + s = (char *) ep; + if (s == data_limit) + return -1; + if (*s != edelim) + return -1; + s++; } - - if (i != 5) + for (hport = 0; ; s++) + { + if (s == data_limit) + return -1; + if (!isdigit(*s)) + break; + hport = hport * 10 + *s - '0'; + } + if (s == data_limit || !hport || *s != edelim) return -1; - - *start = s; - *addr = get_unaligned((__be32 *) p); - *port = get_unaligned((__be16 *) (p + 4)); + s++; + *end = s; + *port = htons(hport); return 1; } -/* - * Look at outgoing ftp packets to catch the response to a PASV command +/* Look at outgoing ftp packets to catch the response to a PASV/EPSV command * from the server (inside-to-outside). * When we see one, we build a connection entry with the client address, * client port 0 (unknown at the moment), the server address and the @@ -165,12 +248,13 @@ static int ip_vs_ftp_get_addrport(char *data, char *data_limit, * The outgoing packet should be something like * "227 Entering Passive Mode (xxx,xxx,xxx,xxx,ppp,ppp)". * xxx,xxx,xxx,xxx is the server address, ppp,ppp is the server port number. + * The extended format for EPSV response provides usually only port: + * "229 Entering Extended Passive Mode (|||ppp|)" */ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, - struct sk_buff *skb, int *diff) + struct sk_buff *skb, int *diff, + struct ip_vs_iphdr *ipvsh) { - struct iphdr *iph; - struct tcphdr *th; char *data, *data_limit; char *start, *end; union nf_inet_addr from; @@ -184,14 +268,6 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, *diff = 0; -#ifdef CONFIG_IP_VS_IPV6 - /* This application helper doesn't work with IPv6 yet, - * so turn this into a no-op for IPv6 packets - */ - if (cp->af == AF_INET6) - return 1; -#endif - /* Only useful for established sessions */ if (cp->state != IP_VS_TCP_S_ESTABLISHED) return 1; @@ -200,53 +276,77 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, if (!skb_make_writable(skb, skb->len)) return 0; - if (cp->app_data == &ip_vs_ftp_pasv) { - iph = ip_hdr(skb); - th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]); - data = (char *)th + (th->doff << 2); + if (cp->app_data == (void *) IP_VS_FTP_PASV) { + data = ip_vs_ftp_data_ptr(skb, ipvsh); data_limit = skb_tail_pointer(skb); + if (!data || data >= data_limit) + return 1; + if (ip_vs_ftp_get_addrport(data, data_limit, - SERVER_STRING, - sizeof(SERVER_STRING)-1, - '(', ')', - &from.ip, &port, + SERVER_STRING_PASV, + sizeof(SERVER_STRING_PASV)-1, + '(', false, IP_VS_FTP_PASV, + &from, &port, cp->af, &start, &end) != 1) return 1; - IP_VS_DBG(7, "PASV response (%pI4:%d) -> %pI4:%d detected\n", + IP_VS_DBG(7, "PASV response (%pI4:%u) -> %pI4:%u detected\n", &from.ip, ntohs(port), &cp->caddr.ip, 0); + } else if (cp->app_data == (void *) IP_VS_FTP_EPSV) { + data = ip_vs_ftp_data_ptr(skb, ipvsh); + data_limit = skb_tail_pointer(skb); - /* - * Now update or create an connection entry for it + if (!data || data >= data_limit) + return 1; + + /* Usually, data address is not specified but + * we support different address, so pre-set it. */ - { - struct ip_vs_conn_param p; - ip_vs_conn_fill_param(cp->ipvs, AF_INET, - iph->protocol, &from, port, - &cp->caddr, 0, &p); - n_cp = ip_vs_conn_out_get(&p); - } - if (!n_cp) { - struct ip_vs_conn_param p; - ip_vs_conn_fill_param(cp->ipvs, - AF_INET, IPPROTO_TCP, &cp->caddr, - 0, &cp->vaddr, port, &p); - /* As above, this is ipv4 only */ - n_cp = ip_vs_conn_new(&p, AF_INET, &from, port, - IP_VS_CONN_F_NO_CPORT | - IP_VS_CONN_F_NFCT, - cp->dest, skb->mark); - if (!n_cp) - return 0; + from = cp->daddr; + if (ip_vs_ftp_get_addrport(data, data_limit, + SERVER_STRING_EPSV, + sizeof(SERVER_STRING_EPSV)-1, + '(', true, IP_VS_FTP_EPSV, + &from, &port, cp->af, + &start, &end) != 1) + return 1; - /* add its controller */ - ip_vs_control_add(n_cp, cp); - } + IP_VS_DBG_BUF(7, "EPSV response (%s:%u) -> %s:%u detected\n", + IP_VS_DBG_ADDR(cp->af, &from), ntohs(port), + IP_VS_DBG_ADDR(cp->af, &cp->caddr), 0); + } else { + return 1; + } - /* - * Replace the old passive address with the new one - */ + /* Now update or create a connection entry for it */ + { + struct ip_vs_conn_param p; + + ip_vs_conn_fill_param(cp->ipvs, cp->af, + ipvsh->protocol, &from, port, + &cp->caddr, 0, &p); + n_cp = ip_vs_conn_out_get(&p); + } + if (!n_cp) { + struct ip_vs_conn_param p; + + ip_vs_conn_fill_param(cp->ipvs, + cp->af, ipvsh->protocol, &cp->caddr, + 0, &cp->vaddr, port, &p); + n_cp = ip_vs_conn_new(&p, cp->af, &from, port, + IP_VS_CONN_F_NO_CPORT | + IP_VS_CONN_F_NFCT, + cp->dest, skb->mark); + if (!n_cp) + return 0; + + /* add its controller */ + ip_vs_control_add(n_cp, cp); + } + + /* Replace the old passive address with the new one */ + if (cp->app_data == (void *) IP_VS_FTP_PASV) { from.ip = n_cp->vaddr.ip; port = n_cp->vport; snprintf(buf, sizeof(buf), "%u,%u,%u,%u,%u,%u", @@ -256,50 +356,54 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, ((unsigned char *)&from.ip)[3], ntohs(port) >> 8, ntohs(port) & 0xFF); + } else if (cp->app_data == (void *) IP_VS_FTP_EPSV) { + from = n_cp->vaddr; + port = n_cp->vport; + /* Only port, client will use VIP for the data connection */ + snprintf(buf, sizeof(buf), "|||%u|", + ntohs(port)); + } else { + *buf = 0; + } + buf_len = strlen(buf); - buf_len = strlen(buf); - - ct = nf_ct_get(skb, &ctinfo); - if (ct) { - bool mangled; - - /* If mangling fails this function will return 0 - * which will cause the packet to be dropped. - * Mangling can only fail under memory pressure, - * hopefully it will succeed on the retransmitted - * packet. - */ - mangled = nf_nat_mangle_tcp_packet(skb, ct, ctinfo, - iph->ihl * 4, - start - data, - end - start, - buf, buf_len); - if (mangled) { - ip_vs_nfct_expect_related(skb, ct, n_cp, - IPPROTO_TCP, 0, 0); - if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->ip_summed = CHECKSUM_UNNECESSARY; - /* csum is updated */ - ret = 1; - } - } + ct = nf_ct_get(skb, &ctinfo); + if (ct) { + bool mangled; - /* - * Not setting 'diff' is intentional, otherwise the sequence - * would be adjusted twice. + /* If mangling fails this function will return 0 + * which will cause the packet to be dropped. + * Mangling can only fail under memory pressure, + * hopefully it will succeed on the retransmitted + * packet. */ - - cp->app_data = NULL; - ip_vs_tcp_conn_listen(n_cp); - ip_vs_conn_put(n_cp); - return ret; + mangled = nf_nat_mangle_tcp_packet(skb, ct, ctinfo, + ipvsh->len, + start - data, + end - start, + buf, buf_len); + if (mangled) { + ip_vs_nfct_expect_related(skb, ct, n_cp, + ipvsh->protocol, 0, 0); + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->ip_summed = CHECKSUM_UNNECESSARY; + /* csum is updated */ + ret = 1; + } } - return 1; + + /* Not setting 'diff' is intentional, otherwise the sequence + * would be adjusted twice. + */ + + cp->app_data = (void *) IP_VS_FTP_ACTIVE; + ip_vs_tcp_conn_listen(n_cp); + ip_vs_conn_put(n_cp); + return ret; } -/* - * Look at incoming ftp packets to catch the PASV/PORT command +/* Look at incoming ftp packets to catch the PASV/PORT/EPRT/EPSV command * (outside-to-inside). * * The incoming packet having the PORT command should be something like @@ -308,12 +412,19 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, * In this case, we create a connection entry using the client address and * port, so that the active ftp data connection from the server can reach * the client. + * Extended format: + * "EPSV\r\n" when client requests server address from same family + * "EPSV 1\r\n" when client requests IPv4 server address + * "EPSV 2\r\n" when client requests IPv6 server address + * "EPSV ALL\r\n" - not supported + * EPRT with specified delimiter (ASCII 33..126), "|" by default: + * "EPRT |1|IPv4ADDR|PORT|\r\n" when client provides IPv4 addrport + * "EPRT |2|IPv6ADDR|PORT|\r\n" when client provides IPv6 addrport */ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, - struct sk_buff *skb, int *diff) + struct sk_buff *skb, int *diff, + struct ip_vs_iphdr *ipvsh) { - struct iphdr *iph; - struct tcphdr *th; char *data, *data_start, *data_limit; char *start, *end; union nf_inet_addr to; @@ -323,14 +434,6 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, /* no diff required for incoming packets */ *diff = 0; -#ifdef CONFIG_IP_VS_IPV6 - /* This application helper doesn't work with IPv6 yet, - * so turn this into a no-op for IPv6 packets - */ - if (cp->af == AF_INET6) - return 1; -#endif - /* Only useful for established sessions */ if (cp->state != IP_VS_TCP_S_ESTABLISHED) return 1; @@ -339,27 +442,48 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, if (!skb_make_writable(skb, skb->len)) return 0; - /* - * Detecting whether it is passive - */ - iph = ip_hdr(skb); - th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]); - - /* Since there may be OPTIONS in the TCP packet and the HLEN is - the length of the header in 32-bit multiples, it is accurate - to calculate data address by th+HLEN*4 */ - data = data_start = (char *)th + (th->doff << 2); + data = data_start = ip_vs_ftp_data_ptr(skb, ipvsh); data_limit = skb_tail_pointer(skb); + if (!data || data >= data_limit) + return 1; while (data <= data_limit - 6) { - if (strncasecmp(data, "PASV\r\n", 6) == 0) { + if (cp->af == AF_INET && + strncasecmp(data, "PASV\r\n", 6) == 0) { /* Passive mode on */ IP_VS_DBG(7, "got PASV at %td of %td\n", data - data_start, data_limit - data_start); - cp->app_data = &ip_vs_ftp_pasv; + cp->app_data = (void *) IP_VS_FTP_PASV; return 1; } + + /* EPSV or EPSV<space><net-prt> */ + if (strncasecmp(data, "EPSV", 4) == 0 && + (data[4] == ' ' || data[4] == '\r')) { + if (data[4] == ' ') { + char proto = data[5]; + + if (data > data_limit - 7 || data[6] != '\r') + return 1; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6 && proto == '2') { + } else +#endif + if (cp->af == AF_INET && proto == '1') { + } else { + return 1; + } + } + /* Extended Passive mode on */ + IP_VS_DBG(7, "got EPSV at %td of %td\n", + data - data_start, + data_limit - data_start); + cp->app_data = (void *) IP_VS_FTP_EPSV; + return 1; + } + data++; } @@ -370,33 +494,52 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, * then create a new connection entry for the coming data * connection. */ - if (ip_vs_ftp_get_addrport(data_start, data_limit, - CLIENT_STRING, sizeof(CLIENT_STRING)-1, - ' ', '\r', &to.ip, &port, - &start, &end) != 1) + if (cp->af == AF_INET && + ip_vs_ftp_get_addrport(data_start, data_limit, + CLIENT_STRING_PORT, + sizeof(CLIENT_STRING_PORT)-1, + ' ', false, IP_VS_FTP_PORT, + &to, &port, cp->af, + &start, &end) == 1) { + + IP_VS_DBG(7, "PORT %pI4:%u detected\n", &to.ip, ntohs(port)); + + /* Now update or create a connection entry for it */ + IP_VS_DBG(7, "protocol %s %pI4:%u %pI4:%u\n", + ip_vs_proto_name(ipvsh->protocol), + &to.ip, ntohs(port), &cp->vaddr.ip, + ntohs(cp->vport)-1); + } else if (ip_vs_ftp_get_addrport(data_start, data_limit, + CLIENT_STRING_EPRT, + sizeof(CLIENT_STRING_EPRT)-1, + ' ', true, IP_VS_FTP_EPRT, + &to, &port, cp->af, + &start, &end) == 1) { + + IP_VS_DBG_BUF(7, "EPRT %s:%u detected\n", + IP_VS_DBG_ADDR(cp->af, &to), ntohs(port)); + + /* Now update or create a connection entry for it */ + IP_VS_DBG_BUF(7, "protocol %s %s:%u %s:%u\n", + ip_vs_proto_name(ipvsh->protocol), + IP_VS_DBG_ADDR(cp->af, &to), ntohs(port), + IP_VS_DBG_ADDR(cp->af, &cp->vaddr), + ntohs(cp->vport)-1); + } else { return 1; - - IP_VS_DBG(7, "PORT %pI4:%d detected\n", &to.ip, ntohs(port)); + } /* Passive mode off */ - cp->app_data = NULL; - - /* - * Now update or create a connection entry for it - */ - IP_VS_DBG(7, "protocol %s %pI4:%d %pI4:%d\n", - ip_vs_proto_name(iph->protocol), - &to.ip, ntohs(port), &cp->vaddr.ip, 0); + cp->app_data = (void *) IP_VS_FTP_ACTIVE; { struct ip_vs_conn_param p; - ip_vs_conn_fill_param(cp->ipvs, AF_INET, - iph->protocol, &to, port, &cp->vaddr, + ip_vs_conn_fill_param(cp->ipvs, cp->af, + ipvsh->protocol, &to, port, &cp->vaddr, htons(ntohs(cp->vport)-1), &p); n_cp = ip_vs_conn_in_get(&p); if (!n_cp) { - /* This is ipv4 only */ - n_cp = ip_vs_conn_new(&p, AF_INET, &cp->daddr, + n_cp = ip_vs_conn_new(&p, cp->af, &cp->daddr, htons(ntohs(cp->dport)-1), IP_VS_CONN_F_NFCT, cp->dest, skb->mark); @@ -454,7 +597,7 @@ static int __net_init __ip_vs_ftp_init(struct net *net) ret = register_ip_vs_app_inc(ipvs, app, app->protocol, ports[i]); if (ret) goto err_unreg; - pr_info("%s: loaded support on port[%d] = %d\n", + pr_info("%s: loaded support on port[%d] = %u\n", app->name, i, ports[i]); } return 0; diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c index 3057e453bf317..b9f375e6dc937 100644 --- a/net/netfilter/ipvs/ip_vs_lblc.c +++ b/net/netfilter/ipvs/ip_vs_lblc.c @@ -48,6 +48,7 @@ #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/jiffies.h> +#include <linux/hash.h> /* for sysctl */ #include <linux/fs.h> @@ -160,7 +161,7 @@ ip_vs_lblc_hashkey(int af, const union nf_inet_addr *addr) addr_fold = addr->ip6[0]^addr->ip6[1]^ addr->ip6[2]^addr->ip6[3]; #endif - return (ntohl(addr_fold)*2654435761UL) & IP_VS_LBLC_TAB_MASK; + return hash_32(ntohl(addr_fold), IP_VS_LBLC_TAB_BITS); } @@ -371,6 +372,7 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc) tbl->counter = 1; tbl->dead = false; tbl->svc = svc; + atomic_set(&tbl->entries, 0); /* * Hook periodic timer for garbage collection diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c index 92adc04557ed9..542c4949937ab 100644 --- a/net/netfilter/ipvs/ip_vs_lblcr.c +++ b/net/netfilter/ipvs/ip_vs_lblcr.c @@ -47,6 +47,7 @@ #include <linux/jiffies.h> #include <linux/list.h> #include <linux/slab.h> +#include <linux/hash.h> /* for sysctl */ #include <linux/fs.h> @@ -323,7 +324,7 @@ ip_vs_lblcr_hashkey(int af, const union nf_inet_addr *addr) addr_fold = addr->ip6[0]^addr->ip6[1]^ addr->ip6[2]^addr->ip6[3]; #endif - return (ntohl(addr_fold)*2654435761UL) & IP_VS_LBLCR_TAB_MASK; + return hash_32(ntohl(addr_fold), IP_VS_LBLCR_TAB_BITS); } @@ -534,6 +535,7 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc) tbl->counter = 1; tbl->dead = false; tbl->svc = svc; + atomic_set(&tbl->entries, 0); /* * Hook periodic timer for garbage collection diff --git a/net/netfilter/ipvs/ip_vs_mh.c b/net/netfilter/ipvs/ip_vs_mh.c new file mode 100644 index 0000000000000..0f795b186eb3d --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_mh.c @@ -0,0 +1,540 @@ +// SPDX-License-Identifier: GPL-2.0 +/* IPVS: Maglev Hashing scheduling module + * + * Authors: Inju Song <inju.song@navercorp.com> + * + */ + +/* The mh algorithm is to assign a preference list of all the lookup + * table positions to each destination and populate the table with + * the most-preferred position of destinations. Then it is to select + * destination with the hash key of source IP address through looking + * up a the lookup table. + * + * The algorithm is detailed in: + * [3.4 Consistent Hasing] +https://www.usenix.org/system/files/conference/nsdi16/nsdi16-paper-eisenbud.pdf + * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/ip.h> +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/skbuff.h> + +#include <net/ip_vs.h> + +#include <linux/siphash.h> +#include <linux/bitops.h> +#include <linux/gcd.h> + +#define IP_VS_SVC_F_SCHED_MH_FALLBACK IP_VS_SVC_F_SCHED1 /* MH fallback */ +#define IP_VS_SVC_F_SCHED_MH_PORT IP_VS_SVC_F_SCHED2 /* MH use port */ + +struct ip_vs_mh_lookup { + struct ip_vs_dest __rcu *dest; /* real server (cache) */ +}; + +struct ip_vs_mh_dest_setup { + unsigned int offset; /* starting offset */ + unsigned int skip; /* skip */ + unsigned int perm; /* next_offset */ + int turns; /* weight / gcd() and rshift */ +}; + +/* Available prime numbers for MH table */ +static int primes[] = {251, 509, 1021, 2039, 4093, + 8191, 16381, 32749, 65521, 131071}; + +/* For IPVS MH entry hash table */ +#ifndef CONFIG_IP_VS_MH_TAB_INDEX +#define CONFIG_IP_VS_MH_TAB_INDEX 12 +#endif +#define IP_VS_MH_TAB_BITS (CONFIG_IP_VS_MH_TAB_INDEX / 2) +#define IP_VS_MH_TAB_INDEX (CONFIG_IP_VS_MH_TAB_INDEX - 8) +#define IP_VS_MH_TAB_SIZE primes[IP_VS_MH_TAB_INDEX] + +struct ip_vs_mh_state { + struct rcu_head rcu_head; + struct ip_vs_mh_lookup *lookup; + struct ip_vs_mh_dest_setup *dest_setup; + hsiphash_key_t hash1, hash2; + int gcd; + int rshift; +}; + +static inline void generate_hash_secret(hsiphash_key_t *hash1, + hsiphash_key_t *hash2) +{ + hash1->key[0] = 2654435761UL; + hash1->key[1] = 2654435761UL; + + hash2->key[0] = 2654446892UL; + hash2->key[1] = 2654446892UL; +} + +/* Helper function to determine if server is unavailable */ +static inline bool is_unavailable(struct ip_vs_dest *dest) +{ + return atomic_read(&dest->weight) <= 0 || + dest->flags & IP_VS_DEST_F_OVERLOAD; +} + +/* Returns hash value for IPVS MH entry */ +static inline unsigned int +ip_vs_mh_hashkey(int af, const union nf_inet_addr *addr, + __be16 port, hsiphash_key_t *key, unsigned int offset) +{ + unsigned int v; + __be32 addr_fold = addr->ip; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + addr_fold = addr->ip6[0] ^ addr->ip6[1] ^ + addr->ip6[2] ^ addr->ip6[3]; +#endif + v = (offset + ntohs(port) + ntohl(addr_fold)); + return hsiphash(&v, sizeof(v), key); +} + +/* Reset all the hash buckets of the specified table. */ +static void ip_vs_mh_reset(struct ip_vs_mh_state *s) +{ + int i; + struct ip_vs_mh_lookup *l; + struct ip_vs_dest *dest; + + l = &s->lookup[0]; + for (i = 0; i < IP_VS_MH_TAB_SIZE; i++) { + dest = rcu_dereference_protected(l->dest, 1); + if (dest) { + ip_vs_dest_put(dest); + RCU_INIT_POINTER(l->dest, NULL); + } + l++; + } +} + +static int ip_vs_mh_permutate(struct ip_vs_mh_state *s, + struct ip_vs_service *svc) +{ + struct list_head *p; + struct ip_vs_mh_dest_setup *ds; + struct ip_vs_dest *dest; + int lw; + + /* If gcd is smaller then 1, number of dests or + * all last_weight of dests are zero. So, skip + * permutation for the dests. + */ + if (s->gcd < 1) + return 0; + + /* Set dest_setup for the dests permutation */ + p = &svc->destinations; + ds = &s->dest_setup[0]; + while ((p = p->next) != &svc->destinations) { + dest = list_entry(p, struct ip_vs_dest, n_list); + + ds->offset = ip_vs_mh_hashkey(svc->af, &dest->addr, + dest->port, &s->hash1, 0) % + IP_VS_MH_TAB_SIZE; + ds->skip = ip_vs_mh_hashkey(svc->af, &dest->addr, + dest->port, &s->hash2, 0) % + (IP_VS_MH_TAB_SIZE - 1) + 1; + ds->perm = ds->offset; + + lw = atomic_read(&dest->last_weight); + ds->turns = ((lw / s->gcd) >> s->rshift) ? : (lw != 0); + ds++; + } + + return 0; +} + +static int ip_vs_mh_populate(struct ip_vs_mh_state *s, + struct ip_vs_service *svc) +{ + int n, c, dt_count; + unsigned long *table; + struct list_head *p; + struct ip_vs_mh_dest_setup *ds; + struct ip_vs_dest *dest, *new_dest; + + /* If gcd is smaller then 1, number of dests or + * all last_weight of dests are zero. So, skip + * the population for the dests and reset lookup table. + */ + if (s->gcd < 1) { + ip_vs_mh_reset(s); + return 0; + } + + table = kcalloc(BITS_TO_LONGS(IP_VS_MH_TAB_SIZE), + sizeof(unsigned long), GFP_KERNEL); + if (!table) + return -ENOMEM; + + p = &svc->destinations; + n = 0; + dt_count = 0; + while (n < IP_VS_MH_TAB_SIZE) { + if (p == &svc->destinations) + p = p->next; + + ds = &s->dest_setup[0]; + while (p != &svc->destinations) { + /* Ignore added server with zero weight */ + if (ds->turns < 1) { + p = p->next; + ds++; + continue; + } + + c = ds->perm; + while (test_bit(c, table)) { + /* Add skip, mod IP_VS_MH_TAB_SIZE */ + ds->perm += ds->skip; + if (ds->perm >= IP_VS_MH_TAB_SIZE) + ds->perm -= IP_VS_MH_TAB_SIZE; + c = ds->perm; + } + + __set_bit(c, table); + + dest = rcu_dereference_protected(s->lookup[c].dest, 1); + new_dest = list_entry(p, struct ip_vs_dest, n_list); + if (dest != new_dest) { + if (dest) + ip_vs_dest_put(dest); + ip_vs_dest_hold(new_dest); + RCU_INIT_POINTER(s->lookup[c].dest, new_dest); + } + + if (++n == IP_VS_MH_TAB_SIZE) + goto out; + + if (++dt_count >= ds->turns) { + dt_count = 0; + p = p->next; + ds++; + } + } + } + +out: + kfree(table); + return 0; +} + +/* Get ip_vs_dest associated with supplied parameters. */ +static inline struct ip_vs_dest * +ip_vs_mh_get(struct ip_vs_service *svc, struct ip_vs_mh_state *s, + const union nf_inet_addr *addr, __be16 port) +{ + unsigned int hash = ip_vs_mh_hashkey(svc->af, addr, port, &s->hash1, 0) + % IP_VS_MH_TAB_SIZE; + struct ip_vs_dest *dest = rcu_dereference(s->lookup[hash].dest); + + return (!dest || is_unavailable(dest)) ? NULL : dest; +} + +/* As ip_vs_mh_get, but with fallback if selected server is unavailable */ +static inline struct ip_vs_dest * +ip_vs_mh_get_fallback(struct ip_vs_service *svc, struct ip_vs_mh_state *s, + const union nf_inet_addr *addr, __be16 port) +{ + unsigned int offset, roffset; + unsigned int hash, ihash; + struct ip_vs_dest *dest; + + /* First try the dest it's supposed to go to */ + ihash = ip_vs_mh_hashkey(svc->af, addr, port, + &s->hash1, 0) % IP_VS_MH_TAB_SIZE; + dest = rcu_dereference(s->lookup[ihash].dest); + if (!dest) + return NULL; + if (!is_unavailable(dest)) + return dest; + + IP_VS_DBG_BUF(6, "MH: selected unavailable server %s:%u, reselecting", + IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port)); + + /* If the original dest is unavailable, loop around the table + * starting from ihash to find a new dest + */ + for (offset = 0; offset < IP_VS_MH_TAB_SIZE; offset++) { + roffset = (offset + ihash) % IP_VS_MH_TAB_SIZE; + hash = ip_vs_mh_hashkey(svc->af, addr, port, &s->hash1, + roffset) % IP_VS_MH_TAB_SIZE; + dest = rcu_dereference(s->lookup[hash].dest); + if (!dest) + break; + if (!is_unavailable(dest)) + return dest; + IP_VS_DBG_BUF(6, + "MH: selected unavailable server %s:%u (offset %u), reselecting", + IP_VS_DBG_ADDR(dest->af, &dest->addr), + ntohs(dest->port), roffset); + } + + return NULL; +} + +/* Assign all the hash buckets of the specified table with the service. */ +static int ip_vs_mh_reassign(struct ip_vs_mh_state *s, + struct ip_vs_service *svc) +{ + int ret; + + if (svc->num_dests > IP_VS_MH_TAB_SIZE) + return -EINVAL; + + if (svc->num_dests >= 1) { + s->dest_setup = kcalloc(svc->num_dests, + sizeof(struct ip_vs_mh_dest_setup), + GFP_KERNEL); + if (!s->dest_setup) + return -ENOMEM; + } + + ip_vs_mh_permutate(s, svc); + + ret = ip_vs_mh_populate(s, svc); + if (ret < 0) + goto out; + + IP_VS_DBG_BUF(6, "MH: reassign lookup table of %s:%u\n", + IP_VS_DBG_ADDR(svc->af, &svc->addr), + ntohs(svc->port)); + +out: + if (svc->num_dests >= 1) { + kfree(s->dest_setup); + s->dest_setup = NULL; + } + return ret; +} + +static int ip_vs_mh_gcd_weight(struct ip_vs_service *svc) +{ + struct ip_vs_dest *dest; + int weight; + int g = 0; + + list_for_each_entry(dest, &svc->destinations, n_list) { + weight = atomic_read(&dest->last_weight); + if (weight > 0) { + if (g > 0) + g = gcd(weight, g); + else + g = weight; + } + } + return g; +} + +/* To avoid assigning huge weight for the MH table, + * calculate shift value with gcd. + */ +static int ip_vs_mh_shift_weight(struct ip_vs_service *svc, int gcd) +{ + struct ip_vs_dest *dest; + int new_weight, weight = 0; + int mw, shift; + + /* If gcd is smaller then 1, number of dests or + * all last_weight of dests are zero. So, return + * shift value as zero. + */ + if (gcd < 1) + return 0; + + list_for_each_entry(dest, &svc->destinations, n_list) { + new_weight = atomic_read(&dest->last_weight); + if (new_weight > weight) + weight = new_weight; + } + + /* Because gcd is greater than zero, + * the maximum weight and gcd are always greater than zero + */ + mw = weight / gcd; + + /* shift = occupied bits of weight/gcd - MH highest bits */ + shift = fls(mw) - IP_VS_MH_TAB_BITS; + return (shift >= 0) ? shift : 0; +} + +static void ip_vs_mh_state_free(struct rcu_head *head) +{ + struct ip_vs_mh_state *s; + + s = container_of(head, struct ip_vs_mh_state, rcu_head); + kfree(s->lookup); + kfree(s); +} + +static int ip_vs_mh_init_svc(struct ip_vs_service *svc) +{ + int ret; + struct ip_vs_mh_state *s; + + /* Allocate the MH table for this service */ + s = kzalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + s->lookup = kcalloc(IP_VS_MH_TAB_SIZE, sizeof(struct ip_vs_mh_lookup), + GFP_KERNEL); + if (!s->lookup) { + kfree(s); + return -ENOMEM; + } + + generate_hash_secret(&s->hash1, &s->hash2); + s->gcd = ip_vs_mh_gcd_weight(svc); + s->rshift = ip_vs_mh_shift_weight(svc, s->gcd); + + IP_VS_DBG(6, + "MH lookup table (memory=%zdbytes) allocated for current service\n", + sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_TAB_SIZE); + + /* Assign the lookup table with current dests */ + ret = ip_vs_mh_reassign(s, svc); + if (ret < 0) { + ip_vs_mh_reset(s); + ip_vs_mh_state_free(&s->rcu_head); + return ret; + } + + /* No more failures, attach state */ + svc->sched_data = s; + return 0; +} + +static void ip_vs_mh_done_svc(struct ip_vs_service *svc) +{ + struct ip_vs_mh_state *s = svc->sched_data; + + /* Got to clean up lookup entry here */ + ip_vs_mh_reset(s); + + call_rcu(&s->rcu_head, ip_vs_mh_state_free); + IP_VS_DBG(6, "MH lookup table (memory=%zdbytes) released\n", + sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_TAB_SIZE); +} + +static int ip_vs_mh_dest_changed(struct ip_vs_service *svc, + struct ip_vs_dest *dest) +{ + struct ip_vs_mh_state *s = svc->sched_data; + + s->gcd = ip_vs_mh_gcd_weight(svc); + s->rshift = ip_vs_mh_shift_weight(svc, s->gcd); + + /* Assign the lookup table with the updated service */ + return ip_vs_mh_reassign(s, svc); +} + +/* Helper function to get port number */ +static inline __be16 +ip_vs_mh_get_port(const struct sk_buff *skb, struct ip_vs_iphdr *iph) +{ + __be16 _ports[2], *ports; + + /* At this point we know that we have a valid packet of some kind. + * Because ICMP packets are only guaranteed to have the first 8 + * bytes, let's just grab the ports. Fortunately they're in the + * same position for all three of the protocols we care about. + */ + switch (iph->protocol) { + case IPPROTO_TCP: + case IPPROTO_UDP: + case IPPROTO_SCTP: + ports = skb_header_pointer(skb, iph->len, sizeof(_ports), + &_ports); + if (unlikely(!ports)) + return 0; + + if (likely(!ip_vs_iph_inverse(iph))) + return ports[0]; + else + return ports[1]; + default: + return 0; + } +} + +/* Maglev Hashing scheduling */ +static struct ip_vs_dest * +ip_vs_mh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, + struct ip_vs_iphdr *iph) +{ + struct ip_vs_dest *dest; + struct ip_vs_mh_state *s; + __be16 port = 0; + const union nf_inet_addr *hash_addr; + + hash_addr = ip_vs_iph_inverse(iph) ? &iph->daddr : &iph->saddr; + + IP_VS_DBG(6, "%s : Scheduling...\n", __func__); + + if (svc->flags & IP_VS_SVC_F_SCHED_MH_PORT) + port = ip_vs_mh_get_port(skb, iph); + + s = (struct ip_vs_mh_state *)svc->sched_data; + + if (svc->flags & IP_VS_SVC_F_SCHED_MH_FALLBACK) + dest = ip_vs_mh_get_fallback(svc, s, hash_addr, port); + else + dest = ip_vs_mh_get(svc, s, hash_addr, port); + + if (!dest) { + ip_vs_scheduler_err(svc, "no destination available"); + return NULL; + } + + IP_VS_DBG_BUF(6, "MH: source IP address %s:%u --> server %s:%u\n", + IP_VS_DBG_ADDR(svc->af, hash_addr), + ntohs(port), + IP_VS_DBG_ADDR(dest->af, &dest->addr), + ntohs(dest->port)); + + return dest; +} + +/* IPVS MH Scheduler structure */ +static struct ip_vs_scheduler ip_vs_mh_scheduler = { + .name = "mh", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_mh_scheduler.n_list), + .init_service = ip_vs_mh_init_svc, + .done_service = ip_vs_mh_done_svc, + .add_dest = ip_vs_mh_dest_changed, + .del_dest = ip_vs_mh_dest_changed, + .upd_dest = ip_vs_mh_dest_changed, + .schedule = ip_vs_mh_schedule, +}; + +static int __init ip_vs_mh_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_mh_scheduler); +} + +static void __exit ip_vs_mh_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_mh_scheduler); + rcu_barrier(); +} + +module_init(ip_vs_mh_init); +module_exit(ip_vs_mh_cleanup); +MODULE_DESCRIPTION("Maglev hashing ipvs scheduler"); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Inju Song <inju.song@navercorp.com>"); diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c index 6cf3fd81a5eca..eb8b9c883889c 100644 --- a/net/netfilter/ipvs/ip_vs_nfct.c +++ b/net/netfilter/ipvs/ip_vs_nfct.c @@ -67,15 +67,20 @@ #include <net/netfilter/nf_conntrack_zones.h> -#define FMT_TUPLE "%pI4:%u->%pI4:%u/%u" -#define ARG_TUPLE(T) &(T)->src.u3.ip, ntohs((T)->src.u.all), \ - &(T)->dst.u3.ip, ntohs((T)->dst.u.all), \ +#define FMT_TUPLE "%s:%u->%s:%u/%u" +#define ARG_TUPLE(T) IP_VS_DBG_ADDR((T)->src.l3num, &(T)->src.u3), \ + ntohs((T)->src.u.all), \ + IP_VS_DBG_ADDR((T)->src.l3num, &(T)->dst.u3), \ + ntohs((T)->dst.u.all), \ (T)->dst.protonum -#define FMT_CONN "%pI4:%u->%pI4:%u->%pI4:%u/%u:%u" -#define ARG_CONN(C) &((C)->caddr.ip), ntohs((C)->cport), \ - &((C)->vaddr.ip), ntohs((C)->vport), \ - &((C)->daddr.ip), ntohs((C)->dport), \ +#define FMT_CONN "%s:%u->%s:%u->%s:%u/%u:%u" +#define ARG_CONN(C) IP_VS_DBG_ADDR((C)->af, &((C)->caddr)), \ + ntohs((C)->cport), \ + IP_VS_DBG_ADDR((C)->af, &((C)->vaddr)), \ + ntohs((C)->vport), \ + IP_VS_DBG_ADDR((C)->daf, &((C)->daddr)), \ + ntohs((C)->dport), \ (C)->protocol, (C)->state void @@ -127,13 +132,17 @@ ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin) new_tuple.dst.protonum != IPPROTO_ICMPV6) new_tuple.dst.u.tcp.port = cp->vport; } - IP_VS_DBG(7, "%s: Updating conntrack ct=%p, status=0x%lX, " - "ctinfo=%d, old reply=" FMT_TUPLE - ", new reply=" FMT_TUPLE ", cp=" FMT_CONN "\n", - __func__, ct, ct->status, ctinfo, - ARG_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple), - ARG_TUPLE(&new_tuple), ARG_CONN(cp)); + IP_VS_DBG_BUF(7, "%s: Updating conntrack ct=%p, status=0x%lX, " + "ctinfo=%d, old reply=" FMT_TUPLE "\n", + __func__, ct, ct->status, ctinfo, + ARG_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple)); + IP_VS_DBG_BUF(7, "%s: Updating conntrack ct=%p, status=0x%lX, " + "ctinfo=%d, new reply=" FMT_TUPLE "\n", + __func__, ct, ct->status, ctinfo, + ARG_TUPLE(&new_tuple)); nf_conntrack_alter_reply(ct, &new_tuple); + IP_VS_DBG_BUF(7, "%s: Updated conntrack ct=%p for cp=" FMT_CONN "\n", + __func__, ct, ARG_CONN(cp)); } int ip_vs_confirm_conntrack(struct sk_buff *skb) @@ -152,9 +161,6 @@ static void ip_vs_nfct_expect_callback(struct nf_conn *ct, struct ip_vs_conn_param p; struct net *net = nf_ct_net(ct); - if (exp->tuple.src.l3num != PF_INET) - return; - /* * We assume that no NF locks are held before this callback. * ip_vs_conn_out_get and ip_vs_conn_in_get should match their @@ -171,19 +177,15 @@ static void ip_vs_nfct_expect_callback(struct nf_conn *ct, cp = ip_vs_conn_out_get(&p); if (cp) { /* Change reply CLIENT->RS to CLIENT->VS */ + IP_VS_DBG_BUF(7, "%s: for ct=%p, status=0x%lX found inout cp=" + FMT_CONN "\n", + __func__, ct, ct->status, ARG_CONN(cp)); new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple; - IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", " - FMT_TUPLE ", found inout cp=" FMT_CONN "\n", - __func__, ct, ct->status, - ARG_TUPLE(orig), ARG_TUPLE(&new_reply), - ARG_CONN(cp)); + IP_VS_DBG_BUF(7, "%s: ct=%p before alter: reply tuple=" + FMT_TUPLE "\n", + __func__, ct, ARG_TUPLE(&new_reply)); new_reply.dst.u3 = cp->vaddr; new_reply.dst.u.tcp.port = cp->vport; - IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", " FMT_TUPLE - ", inout cp=" FMT_CONN "\n", - __func__, ct, - ARG_TUPLE(orig), ARG_TUPLE(&new_reply), - ARG_CONN(cp)); goto alter; } @@ -191,25 +193,21 @@ static void ip_vs_nfct_expect_callback(struct nf_conn *ct, cp = ip_vs_conn_in_get(&p); if (cp) { /* Change reply VS->CLIENT to RS->CLIENT */ + IP_VS_DBG_BUF(7, "%s: for ct=%p, status=0x%lX found outin cp=" + FMT_CONN "\n", + __func__, ct, ct->status, ARG_CONN(cp)); new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple; - IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", " - FMT_TUPLE ", found outin cp=" FMT_CONN "\n", - __func__, ct, ct->status, - ARG_TUPLE(orig), ARG_TUPLE(&new_reply), - ARG_CONN(cp)); + IP_VS_DBG_BUF(7, "%s: ct=%p before alter: reply tuple=" + FMT_TUPLE "\n", + __func__, ct, ARG_TUPLE(&new_reply)); new_reply.src.u3 = cp->daddr; new_reply.src.u.tcp.port = cp->dport; - IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", " - FMT_TUPLE ", outin cp=" FMT_CONN "\n", - __func__, ct, - ARG_TUPLE(orig), ARG_TUPLE(&new_reply), - ARG_CONN(cp)); goto alter; } - IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuple=" FMT_TUPLE - " - unknown expect\n", - __func__, ct, ct->status, ARG_TUPLE(orig)); + IP_VS_DBG_BUF(7, "%s: ct=%p, status=0x%lX, tuple=" FMT_TUPLE + " - unknown expect\n", + __func__, ct, ct->status, ARG_TUPLE(orig)); return; alter: @@ -247,8 +245,8 @@ void ip_vs_nfct_expect_related(struct sk_buff *skb, struct nf_conn *ct, exp->expectfn = ip_vs_nfct_expect_callback; - IP_VS_DBG(7, "%s: ct=%p, expect tuple=" FMT_TUPLE "\n", - __func__, ct, ARG_TUPLE(&exp->tuple)); + IP_VS_DBG_BUF(7, "%s: ct=%p, expect tuple=" FMT_TUPLE "\n", + __func__, ct, ARG_TUPLE(&exp->tuple)); nf_ct_expect_related(exp); nf_ct_expect_put(exp); } @@ -274,26 +272,25 @@ void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp) tuple.dst.u3 = cp->vaddr; tuple.dst.u.all = cp->vport; - IP_VS_DBG(7, "%s: dropping conntrack with tuple=" FMT_TUPLE - " for conn " FMT_CONN "\n", - __func__, ARG_TUPLE(&tuple), ARG_CONN(cp)); + IP_VS_DBG_BUF(7, "%s: dropping conntrack for conn " FMT_CONN "\n", + __func__, ARG_CONN(cp)); h = nf_conntrack_find_get(cp->ipvs->net, &nf_ct_zone_dflt, &tuple); if (h) { ct = nf_ct_tuplehash_to_ctrack(h); if (nf_ct_kill(ct)) { - IP_VS_DBG(7, "%s: ct=%p, deleted conntrack for tuple=" - FMT_TUPLE "\n", - __func__, ct, ARG_TUPLE(&tuple)); + IP_VS_DBG_BUF(7, "%s: ct=%p deleted for tuple=" + FMT_TUPLE "\n", + __func__, ct, ARG_TUPLE(&tuple)); } else { - IP_VS_DBG(7, "%s: ct=%p, no conntrack timer for tuple=" - FMT_TUPLE "\n", - __func__, ct, ARG_TUPLE(&tuple)); + IP_VS_DBG_BUF(7, "%s: ct=%p, no conntrack for tuple=" + FMT_TUPLE "\n", + __func__, ct, ARG_TUPLE(&tuple)); } nf_ct_put(ct); } else { - IP_VS_DBG(7, "%s: no conntrack for tuple=" FMT_TUPLE "\n", - __func__, ARG_TUPLE(&tuple)); + IP_VS_DBG_BUF(7, "%s: no conntrack for tuple=" FMT_TUPLE "\n", + __func__, ARG_TUPLE(&tuple)); } } diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c index eff7569824e5b..3250c4a1111e2 100644 --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -109,7 +109,7 @@ sctp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, return 0; /* Call application helper if needed */ - ret = ip_vs_app_pkt_out(cp, skb); + ret = ip_vs_app_pkt_out(cp, skb, iph); if (ret == 0) return 0; /* ret=2: csum update is needed after payload mangling */ @@ -156,7 +156,7 @@ sctp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, return 0; /* Call application helper if needed */ - ret = ip_vs_app_pkt_in(cp, skb); + ret = ip_vs_app_pkt_in(cp, skb, iph); if (ret == 0) return 0; /* ret=2: csum update is needed after payload mangling */ diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c index bcd9b7bde4ee3..80d10ad12a15f 100644 --- a/net/netfilter/ipvs/ip_vs_proto_tcp.c +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -170,7 +170,7 @@ tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, return 0; /* Call application helper if needed */ - if (!(ret = ip_vs_app_pkt_out(cp, skb))) + if (!(ret = ip_vs_app_pkt_out(cp, skb, iph))) return 0; /* ret=2: csum update is needed after payload mangling */ if (ret == 1) @@ -251,7 +251,7 @@ tcp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, * Attempt ip_vs_app call. * It will fix ip_vs_conn and iph ack_seq stuff */ - if (!(ret = ip_vs_app_pkt_in(cp, skb))) + if (!(ret = ip_vs_app_pkt_in(cp, skb, iph))) return 0; /* ret=2: csum update is needed after payload mangling */ if (ret == 1) @@ -436,7 +436,7 @@ static bool tcp_state_active(int state) return tcp_state_active_table[state]; } -static struct tcp_states_t tcp_states [] = { +static struct tcp_states_t tcp_states[] = { /* INPUT */ /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }}, @@ -459,7 +459,7 @@ static struct tcp_states_t tcp_states [] = { /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, }; -static struct tcp_states_t tcp_states_dos [] = { +static struct tcp_states_t tcp_states_dos[] = { /* INPUT */ /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }}, diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c index c15ef7c2a1fae..e0ef11c3691e4 100644 --- a/net/netfilter/ipvs/ip_vs_proto_udp.c +++ b/net/netfilter/ipvs/ip_vs_proto_udp.c @@ -162,7 +162,7 @@ udp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, /* * Call application helper if needed */ - if (!(ret = ip_vs_app_pkt_out(cp, skb))) + if (!(ret = ip_vs_app_pkt_out(cp, skb, iph))) return 0; /* ret=2: csum update is needed after payload mangling */ if (ret == 1) @@ -246,7 +246,7 @@ udp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, * Attempt ip_vs_app call. * It will fix ip_vs_conn */ - if (!(ret = ip_vs_app_pkt_in(cp, skb))) + if (!(ret = ip_vs_app_pkt_in(cp, skb, iph))) return 0; /* ret=2: csum update is needed after payload mangling */ if (ret == 1) diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c index 16aaac6eedc96..1e01c782583a6 100644 --- a/net/netfilter/ipvs/ip_vs_sh.c +++ b/net/netfilter/ipvs/ip_vs_sh.c @@ -96,7 +96,8 @@ ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr, addr_fold = addr->ip6[0]^addr->ip6[1]^ addr->ip6[2]^addr->ip6[3]; #endif - return (offset + (ntohs(port) + ntohl(addr_fold))*2654435761UL) & + return (offset + hash_32(ntohs(port) + ntohl(addr_fold), + IP_VS_SH_TAB_BITS)) & IP_VS_SH_TAB_MASK; } diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 4527921b1c3ac..ba0a0fd045c85 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -266,12 +266,13 @@ static inline bool decrement_ttl(struct netns_ipvs *ipvs, /* check and decrement ttl */ if (ipv6_hdr(skb)->hop_limit <= 1) { + struct inet6_dev *idev = __in6_dev_get_safely(skb->dev); + /* Force OUTPUT device used as source address */ skb->dev = dst->dev; icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0); - __IP6_INC_STATS(net, ip6_dst_idev(dst), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); return false; } |