aboutsummaryrefslogtreecommitdiffstats
path: root/net/netfilter/ipvs
diff options
context:
space:
mode:
Diffstat (limited to 'net/netfilter/ipvs')
-rw-r--r--net/netfilter/ipvs/ip_vs_app.c98
-rw-r--r--net/netfilter/ipvs/ip_vs_conn.c248
-rw-r--r--net/netfilter/ipvs/ip_vs_core.c456
-rw-r--r--net/netfilter/ipvs/ip_vs_ctl.c1002
-rw-r--r--net/netfilter/ipvs/ip_vs_est.c171
-rw-r--r--net/netfilter/ipvs/ip_vs_ftp.c61
-rw-r--r--net/netfilter/ipvs/ip_vs_lblc.c99
-rw-r--r--net/netfilter/ipvs/ip_vs_lblcr.c114
-rw-r--r--net/netfilter/ipvs/ip_vs_lc.c20
-rw-r--r--net/netfilter/ipvs/ip_vs_nfct.c6
-rw-r--r--net/netfilter/ipvs/ip_vs_nq.c2
-rw-r--r--net/netfilter/ipvs/ip_vs_pe.c17
-rw-r--r--net/netfilter/ipvs/ip_vs_pe_sip.c12
-rw-r--r--net/netfilter/ipvs/ip_vs_proto.c129
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_ah_esp.c45
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_sctp.c153
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_tcp.c142
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_udp.c110
-rw-r--r--net/netfilter/ipvs/ip_vs_rr.c2
-rw-r--r--net/netfilter/ipvs/ip_vs_sched.c25
-rw-r--r--net/netfilter/ipvs/ip_vs_sed.c2
-rw-r--r--net/netfilter/ipvs/ip_vs_sh.c2
-rw-r--r--net/netfilter/ipvs/ip_vs_sync.c1238
-rw-r--r--net/netfilter/ipvs/ip_vs_wlc.c22
-rw-r--r--net/netfilter/ipvs/ip_vs_wrr.c14
-rw-r--r--net/netfilter/ipvs/ip_vs_xmit.c117
26 files changed, 2918 insertions, 1389 deletions
diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c
index a475ede..5c48ffb 100644
--- a/net/netfilter/ipvs/ip_vs_app.c
+++ b/net/netfilter/ipvs/ip_vs_app.c
@@ -43,11 +43,6 @@ EXPORT_SYMBOL(register_ip_vs_app);
EXPORT_SYMBOL(unregister_ip_vs_app);
EXPORT_SYMBOL(register_ip_vs_app_inc);
-/* ipvs application list head */
-static LIST_HEAD(ip_vs_app_list);
-static DEFINE_MUTEX(__ip_vs_app_mutex);
-
-
/*
* Get an ip_vs_app object
*/
@@ -67,7 +62,8 @@ static inline void ip_vs_app_put(struct ip_vs_app *app)
* Allocate/initialize app incarnation and register it in proto apps.
*/
static int
-ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port)
+ip_vs_app_inc_new(struct net *net, struct ip_vs_app *app, __u16 proto,
+ __u16 port)
{
struct ip_vs_protocol *pp;
struct ip_vs_app *inc;
@@ -98,7 +94,7 @@ ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port)
}
}
- ret = pp->register_app(inc);
+ ret = pp->register_app(net, inc);
if (ret)
goto out;
@@ -119,7 +115,7 @@ ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port)
* Release app incarnation
*/
static void
-ip_vs_app_inc_release(struct ip_vs_app *inc)
+ip_vs_app_inc_release(struct net *net, struct ip_vs_app *inc)
{
struct ip_vs_protocol *pp;
@@ -127,7 +123,7 @@ ip_vs_app_inc_release(struct ip_vs_app *inc)
return;
if (pp->unregister_app)
- pp->unregister_app(inc);
+ pp->unregister_app(net, inc);
IP_VS_DBG(9, "%s App %s:%u unregistered\n",
pp->name, inc->name, ntohs(inc->port));
@@ -168,15 +164,17 @@ void ip_vs_app_inc_put(struct ip_vs_app *inc)
* Register an application incarnation in protocol applications
*/
int
-register_ip_vs_app_inc(struct ip_vs_app *app, __u16 proto, __u16 port)
+register_ip_vs_app_inc(struct net *net, struct ip_vs_app *app, __u16 proto,
+ __u16 port)
{
+ struct netns_ipvs *ipvs = net_ipvs(net);
int result;
- mutex_lock(&__ip_vs_app_mutex);
+ mutex_lock(&ipvs->app_mutex);
- result = ip_vs_app_inc_new(app, proto, port);
+ result = ip_vs_app_inc_new(net, app, proto, port);
- mutex_unlock(&__ip_vs_app_mutex);
+ mutex_unlock(&ipvs->app_mutex);
return result;
}
@@ -185,16 +183,17 @@ register_ip_vs_app_inc(struct ip_vs_app *app, __u16 proto, __u16 port)
/*
* ip_vs_app registration routine
*/
-int register_ip_vs_app(struct ip_vs_app *app)
+int register_ip_vs_app(struct net *net, struct ip_vs_app *app)
{
+ struct netns_ipvs *ipvs = net_ipvs(net);
/* increase the module use count */
ip_vs_use_count_inc();
- mutex_lock(&__ip_vs_app_mutex);
+ mutex_lock(&ipvs->app_mutex);
- list_add(&app->a_list, &ip_vs_app_list);
+ list_add(&app->a_list, &ipvs->app_list);
- mutex_unlock(&__ip_vs_app_mutex);
+ mutex_unlock(&ipvs->app_mutex);
return 0;
}
@@ -204,19 +203,20 @@ int register_ip_vs_app(struct ip_vs_app *app)
* ip_vs_app unregistration routine
* We are sure there are no app incarnations attached to services
*/
-void unregister_ip_vs_app(struct ip_vs_app *app)
+void unregister_ip_vs_app(struct net *net, struct ip_vs_app *app)
{
+ struct netns_ipvs *ipvs = net_ipvs(net);
struct ip_vs_app *inc, *nxt;
- mutex_lock(&__ip_vs_app_mutex);
+ mutex_lock(&ipvs->app_mutex);
list_for_each_entry_safe(inc, nxt, &app->incs_list, a_list) {
- ip_vs_app_inc_release(inc);
+ ip_vs_app_inc_release(net, inc);
}
list_del(&app->a_list);
- mutex_unlock(&__ip_vs_app_mutex);
+ mutex_unlock(&ipvs->app_mutex);
/* decrease the module use count */
ip_vs_use_count_dec();
@@ -226,7 +226,8 @@ void unregister_ip_vs_app(struct ip_vs_app *app)
/*
* Bind ip_vs_conn to its ip_vs_app (called by cp constructor)
*/
-int ip_vs_bind_app(struct ip_vs_conn *cp, struct ip_vs_protocol *pp)
+int ip_vs_bind_app(struct ip_vs_conn *cp,
+ struct ip_vs_protocol *pp)
{
return pp->app_conn_bind(cp);
}
@@ -481,11 +482,11 @@ int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb)
* /proc/net/ip_vs_app entry function
*/
-static struct ip_vs_app *ip_vs_app_idx(loff_t pos)
+static struct ip_vs_app *ip_vs_app_idx(struct netns_ipvs *ipvs, loff_t pos)
{
struct ip_vs_app *app, *inc;
- list_for_each_entry(app, &ip_vs_app_list, a_list) {
+ list_for_each_entry(app, &ipvs->app_list, a_list) {
list_for_each_entry(inc, &app->incs_list, a_list) {
if (pos-- == 0)
return inc;
@@ -497,19 +498,24 @@ static struct ip_vs_app *ip_vs_app_idx(loff_t pos)
static void *ip_vs_app_seq_start(struct seq_file *seq, loff_t *pos)
{
- mutex_lock(&__ip_vs_app_mutex);
+ struct net *net = seq_file_net(seq);
+ struct netns_ipvs *ipvs = net_ipvs(net);
- return *pos ? ip_vs_app_idx(*pos - 1) : SEQ_START_TOKEN;
+ mutex_lock(&ipvs->app_mutex);
+
+ return *pos ? ip_vs_app_idx(ipvs, *pos - 1) : SEQ_START_TOKEN;
}
static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct ip_vs_app *inc, *app;
struct list_head *e;
+ struct net *net = seq_file_net(seq);
+ struct netns_ipvs *ipvs = net_ipvs(net);
++*pos;
if (v == SEQ_START_TOKEN)
- return ip_vs_app_idx(0);
+ return ip_vs_app_idx(ipvs, 0);
inc = v;
app = inc->app;
@@ -518,7 +524,7 @@ static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos)
return list_entry(e, struct ip_vs_app, a_list);
/* go on to next application */
- for (e = app->a_list.next; e != &ip_vs_app_list; e = e->next) {
+ for (e = app->a_list.next; e != &ipvs->app_list; e = e->next) {
app = list_entry(e, struct ip_vs_app, a_list);
list_for_each_entry(inc, &app->incs_list, a_list) {
return inc;
@@ -529,7 +535,9 @@ static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos)
static void ip_vs_app_seq_stop(struct seq_file *seq, void *v)
{
- mutex_unlock(&__ip_vs_app_mutex);
+ struct netns_ipvs *ipvs = net_ipvs(seq_file_net(seq));
+
+ mutex_unlock(&ipvs->app_mutex);
}
static int ip_vs_app_seq_show(struct seq_file *seq, void *v)
@@ -557,7 +565,8 @@ static const struct seq_operations ip_vs_app_seq_ops = {
static int ip_vs_app_open(struct inode *inode, struct file *file)
{
- return seq_open(file, &ip_vs_app_seq_ops);
+ return seq_open_net(inode, file, &ip_vs_app_seq_ops,
+ sizeof(struct seq_net_private));
}
static const struct file_operations ip_vs_app_fops = {
@@ -569,15 +578,36 @@ static const struct file_operations ip_vs_app_fops = {
};
#endif
-int __init ip_vs_app_init(void)
+static int __net_init __ip_vs_app_init(struct net *net)
{
- /* we will replace it with proc_net_ipvs_create() soon */
- proc_net_fops_create(&init_net, "ip_vs_app", 0, &ip_vs_app_fops);
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ INIT_LIST_HEAD(&ipvs->app_list);
+ __mutex_init(&ipvs->app_mutex, "ipvs->app_mutex", &ipvs->app_key);
+ proc_net_fops_create(net, "ip_vs_app", 0, &ip_vs_app_fops);
return 0;
}
+static void __net_exit __ip_vs_app_cleanup(struct net *net)
+{
+ proc_net_remove(net, "ip_vs_app");
+}
+
+static struct pernet_operations ip_vs_app_ops = {
+ .init = __ip_vs_app_init,
+ .exit = __ip_vs_app_cleanup,
+};
+
+int __init ip_vs_app_init(void)
+{
+ int rv;
+
+ rv = register_pernet_subsys(&ip_vs_app_ops);
+ return rv;
+}
+
void ip_vs_app_cleanup(void)
{
- proc_net_remove(&init_net, "ip_vs_app");
+ unregister_pernet_subsys(&ip_vs_app_ops);
}
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index e9adecd..f289306 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -48,35 +48,32 @@
/*
* Connection hash size. Default is what was selected at compile time.
*/
-int ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS;
+static int ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS;
module_param_named(conn_tab_bits, ip_vs_conn_tab_bits, int, 0444);
MODULE_PARM_DESC(conn_tab_bits, "Set connections' hash size");
/* size and mask values */
-int ip_vs_conn_tab_size;
-int ip_vs_conn_tab_mask;
+int ip_vs_conn_tab_size __read_mostly;
+static int ip_vs_conn_tab_mask __read_mostly;
/*
* Connection hash table: for input and output packets lookups of IPVS
*/
-static struct list_head *ip_vs_conn_tab;
+static struct hlist_head *ip_vs_conn_tab __read_mostly;
/* SLAB cache for IPVS connections */
static struct kmem_cache *ip_vs_conn_cachep __read_mostly;
-/* counter for current IPVS connections */
-static atomic_t ip_vs_conn_count = ATOMIC_INIT(0);
-
/* counter for no client port connections */
static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0);
/* random value for IPVS connection hash */
-static unsigned int ip_vs_conn_rnd;
+static unsigned int ip_vs_conn_rnd __read_mostly;
/*
* Fine locking granularity for big connection hash table
*/
-#define CT_LOCKARRAY_BITS 4
+#define CT_LOCKARRAY_BITS 5
#define CT_LOCKARRAY_SIZE (1<<CT_LOCKARRAY_BITS)
#define CT_LOCKARRAY_MASK (CT_LOCKARRAY_SIZE-1)
@@ -133,19 +130,19 @@ static inline void ct_write_unlock_bh(unsigned key)
/*
* Returns hash value for IPVS connection entry
*/
-static unsigned int ip_vs_conn_hashkey(int af, unsigned proto,
+static unsigned int ip_vs_conn_hashkey(struct net *net, int af, unsigned proto,
const union nf_inet_addr *addr,
__be16 port)
{
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6)
- return jhash_3words(jhash(addr, 16, ip_vs_conn_rnd),
- (__force u32)port, proto, ip_vs_conn_rnd)
- & ip_vs_conn_tab_mask;
+ return (jhash_3words(jhash(addr, 16, ip_vs_conn_rnd),
+ (__force u32)port, proto, ip_vs_conn_rnd) ^
+ ((size_t)net>>8)) & ip_vs_conn_tab_mask;
#endif
- return jhash_3words((__force u32)addr->ip, (__force u32)port, proto,
- ip_vs_conn_rnd)
- & ip_vs_conn_tab_mask;
+ return (jhash_3words((__force u32)addr->ip, (__force u32)port, proto,
+ ip_vs_conn_rnd) ^
+ ((size_t)net>>8)) & ip_vs_conn_tab_mask;
}
static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p,
@@ -166,18 +163,18 @@ static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p,
port = p->vport;
}
- return ip_vs_conn_hashkey(p->af, p->protocol, addr, port);
+ return ip_vs_conn_hashkey(p->net, p->af, p->protocol, addr, port);
}
static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp)
{
struct ip_vs_conn_param p;
- ip_vs_conn_fill_param(cp->af, cp->protocol, &cp->caddr, cp->cport,
- NULL, 0, &p);
+ ip_vs_conn_fill_param(ip_vs_conn_net(cp), cp->af, cp->protocol,
+ &cp->caddr, cp->cport, NULL, 0, &p);
- if (cp->dest && cp->dest->svc->pe) {
- p.pe = cp->dest->svc->pe;
+ if (cp->pe) {
+ p.pe = cp->pe;
p.pe_data = cp->pe_data;
p.pe_data_len = cp->pe_data_len;
}
@@ -186,7 +183,7 @@ static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp)
}
/*
- * Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port.
+ * Hashes ip_vs_conn in ip_vs_conn_tab by netns,proto,addr,port.
* returns bool success.
*/
static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
@@ -204,7 +201,7 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
spin_lock(&cp->lock);
if (!(cp->flags & IP_VS_CONN_F_HASHED)) {
- list_add(&cp->c_list, &ip_vs_conn_tab[hash]);
+ hlist_add_head(&cp->c_list, &ip_vs_conn_tab[hash]);
cp->flags |= IP_VS_CONN_F_HASHED;
atomic_inc(&cp->refcnt);
ret = 1;
@@ -237,7 +234,7 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
spin_lock(&cp->lock);
if (cp->flags & IP_VS_CONN_F_HASHED) {
- list_del(&cp->c_list);
+ hlist_del(&cp->c_list);
cp->flags &= ~IP_VS_CONN_F_HASHED;
atomic_dec(&cp->refcnt);
ret = 1;
@@ -262,18 +259,20 @@ __ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
{
unsigned hash;
struct ip_vs_conn *cp;
+ struct hlist_node *n;
hash = ip_vs_conn_hashkey_param(p, false);
ct_read_lock(hash);
- list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
+ hlist_for_each_entry(cp, n, &ip_vs_conn_tab[hash], c_list) {
if (cp->af == p->af &&
+ p->cport == cp->cport && p->vport == cp->vport &&
ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) &&
ip_vs_addr_equal(p->af, p->vaddr, &cp->vaddr) &&
- p->cport == cp->cport && p->vport == cp->vport &&
((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) &&
- p->protocol == cp->protocol) {
+ p->protocol == cp->protocol &&
+ ip_vs_conn_net_eq(cp, p->net)) {
/* HIT */
atomic_inc(&cp->refcnt);
ct_read_unlock(hash);
@@ -313,23 +312,23 @@ ip_vs_conn_fill_param_proto(int af, const struct sk_buff *skb,
struct ip_vs_conn_param *p)
{
__be16 _ports[2], *pptr;
+ struct net *net = skb_net(skb);
pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
if (pptr == NULL)
return 1;
if (likely(!inverse))
- ip_vs_conn_fill_param(af, iph->protocol, &iph->saddr, pptr[0],
- &iph->daddr, pptr[1], p);
+ ip_vs_conn_fill_param(net, af, iph->protocol, &iph->saddr,
+ pptr[0], &iph->daddr, pptr[1], p);
else
- ip_vs_conn_fill_param(af, iph->protocol, &iph->daddr, pptr[1],
- &iph->saddr, pptr[0], p);
+ ip_vs_conn_fill_param(net, af, iph->protocol, &iph->daddr,
+ pptr[1], &iph->saddr, pptr[0], p);
return 0;
}
struct ip_vs_conn *
ip_vs_conn_in_get_proto(int af, const struct sk_buff *skb,
- struct ip_vs_protocol *pp,
const struct ip_vs_iphdr *iph,
unsigned int proto_off, int inverse)
{
@@ -347,14 +346,17 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p)
{
unsigned hash;
struct ip_vs_conn *cp;
+ struct hlist_node *n;
hash = ip_vs_conn_hashkey_param(p, false);
ct_read_lock(hash);
- list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
+ hlist_for_each_entry(cp, n, &ip_vs_conn_tab[hash], c_list) {
+ if (!ip_vs_conn_net_eq(cp, p->net))
+ continue;
if (p->pe_data && p->pe->ct_match) {
- if (p->pe->ct_match(p, cp))
+ if (p->pe == cp->pe && p->pe->ct_match(p, cp))
goto out;
continue;
}
@@ -394,6 +396,7 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)
{
unsigned hash;
struct ip_vs_conn *cp, *ret=NULL;
+ struct hlist_node *n;
/*
* Check for "full" addressed entries
@@ -402,12 +405,13 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)
ct_read_lock(hash);
- list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
+ hlist_for_each_entry(cp, n, &ip_vs_conn_tab[hash], c_list) {
if (cp->af == p->af &&
+ p->vport == cp->cport && p->cport == cp->dport &&
ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) &&
ip_vs_addr_equal(p->af, p->caddr, &cp->daddr) &&
- p->vport == cp->cport && p->cport == cp->dport &&
- p->protocol == cp->protocol) {
+ p->protocol == cp->protocol &&
+ ip_vs_conn_net_eq(cp, p->net)) {
/* HIT */
atomic_inc(&cp->refcnt);
ret = cp;
@@ -428,7 +432,6 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)
struct ip_vs_conn *
ip_vs_conn_out_get_proto(int af, const struct sk_buff *skb,
- struct ip_vs_protocol *pp,
const struct ip_vs_iphdr *iph,
unsigned int proto_off, int inverse)
{
@@ -611,9 +614,9 @@ struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp)
struct ip_vs_dest *dest;
if ((cp) && (!cp->dest)) {
- dest = ip_vs_find_dest(cp->af, &cp->daddr, cp->dport,
- &cp->vaddr, cp->vport,
- cp->protocol);
+ dest = ip_vs_find_dest(ip_vs_conn_net(cp), cp->af, &cp->daddr,
+ cp->dport, &cp->vaddr, cp->vport,
+ cp->protocol, cp->fwmark);
ip_vs_bind_dest(cp, dest);
return dest;
} else
@@ -677,6 +680,16 @@ static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)
atomic_dec(&dest->refcnt);
}
+static int expire_quiescent_template(struct netns_ipvs *ipvs,
+ struct ip_vs_dest *dest)
+{
+#ifdef CONFIG_SYSCTL
+ return ipvs->sysctl_expire_quiescent_template &&
+ (atomic_read(&dest->weight) == 0);
+#else
+ return 0;
+#endif
+}
/*
* Checking if the destination of a connection template is available.
@@ -686,14 +699,14 @@ static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)
int ip_vs_check_template(struct ip_vs_conn *ct)
{
struct ip_vs_dest *dest = ct->dest;
+ struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(ct));
/*
* Checking the dest server status.
*/
if ((dest == NULL) ||
!(dest->flags & IP_VS_DEST_F_AVAILABLE) ||
- (sysctl_ip_vs_expire_quiescent_template &&
- (atomic_read(&dest->weight) == 0))) {
+ expire_quiescent_template(ipvs, dest)) {
IP_VS_DBG_BUF(9, "check_template: dest not available for "
"protocol %s s:%s:%d v:%s:%d "
"-> d:%s:%d\n",
@@ -730,6 +743,7 @@ int ip_vs_check_template(struct ip_vs_conn *ct)
static void ip_vs_conn_expire(unsigned long data)
{
struct ip_vs_conn *cp = (struct ip_vs_conn *)data;
+ struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp));
cp->timeout = 60*HZ;
@@ -765,13 +779,14 @@ static void ip_vs_conn_expire(unsigned long data)
if (cp->flags & IP_VS_CONN_F_NFCT)
ip_vs_conn_drop_conntrack(cp);
+ ip_vs_pe_put(cp->pe);
kfree(cp->pe_data);
if (unlikely(cp->app != NULL))
ip_vs_unbind_app(cp);
ip_vs_unbind_dest(cp);
if (cp->flags & IP_VS_CONN_F_NO_CPORT)
atomic_dec(&ip_vs_conn_no_cport_cnt);
- atomic_dec(&ip_vs_conn_count);
+ atomic_dec(&ipvs->conn_count);
kmem_cache_free(ip_vs_conn_cachep, cp);
return;
@@ -802,10 +817,12 @@ void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
struct ip_vs_conn *
ip_vs_conn_new(const struct ip_vs_conn_param *p,
const union nf_inet_addr *daddr, __be16 dport, unsigned flags,
- struct ip_vs_dest *dest)
+ struct ip_vs_dest *dest, __u32 fwmark)
{
struct ip_vs_conn *cp;
- struct ip_vs_protocol *pp = ip_vs_proto_get(p->protocol);
+ struct netns_ipvs *ipvs = net_ipvs(p->net);
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(p->net,
+ p->protocol);
cp = kmem_cache_zalloc(ip_vs_conn_cachep, GFP_ATOMIC);
if (cp == NULL) {
@@ -813,8 +830,9 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,
return NULL;
}
- INIT_LIST_HEAD(&cp->c_list);
+ INIT_HLIST_NODE(&cp->c_list);
setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp);
+ ip_vs_conn_net_set(cp, p->net);
cp->af = p->af;
cp->protocol = p->protocol;
ip_vs_addr_copy(p->af, &cp->caddr, p->caddr);
@@ -826,7 +844,10 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,
&cp->daddr, daddr);
cp->dport = dport;
cp->flags = flags;
- if (flags & IP_VS_CONN_F_TEMPLATE && p->pe_data) {
+ cp->fwmark = fwmark;
+ if (flags & IP_VS_CONN_F_TEMPLATE && p->pe) {
+ ip_vs_pe_get(p->pe);
+ cp->pe = p->pe;
cp->pe_data = p->pe_data;
cp->pe_data_len = p->pe_data_len;
}
@@ -842,7 +863,7 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,
atomic_set(&cp->n_control, 0);
atomic_set(&cp->in_pkts, 0);
- atomic_inc(&ip_vs_conn_count);
+ atomic_inc(&ipvs->conn_count);
if (flags & IP_VS_CONN_F_NO_CPORT)
atomic_inc(&ip_vs_conn_no_cport_cnt);
@@ -861,8 +882,8 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,
#endif
ip_vs_bind_xmit(cp);
- if (unlikely(pp && atomic_read(&pp->appcnt)))
- ip_vs_bind_app(cp, pp);
+ if (unlikely(pd && atomic_read(&pd->appcnt)))
+ ip_vs_bind_app(cp, pd->pp);
/*
* Allow conntrack to be preserved. By default, conntrack
@@ -871,7 +892,7 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,
* IP_VS_CONN_F_ONE_PACKET too.
*/
- if (ip_vs_conntrack_enabled())
+ if (ip_vs_conntrack_enabled(ipvs))
cp->flags |= IP_VS_CONN_F_NFCT;
/* Hash it in the ip_vs_conn_tab finally */
@@ -884,18 +905,24 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,
* /proc/net/ip_vs_conn entries
*/
#ifdef CONFIG_PROC_FS
+struct ip_vs_iter_state {
+ struct seq_net_private p;
+ struct hlist_head *l;
+};
static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
{
int idx;
struct ip_vs_conn *cp;
+ struct ip_vs_iter_state *iter = seq->private;
+ struct hlist_node *n;
for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
ct_read_lock_bh(idx);
- list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
+ hlist_for_each_entry(cp, n, &ip_vs_conn_tab[idx], c_list) {
if (pos-- == 0) {
- seq->private = &ip_vs_conn_tab[idx];
- return cp;
+ iter->l = &ip_vs_conn_tab[idx];
+ return cp;
}
}
ct_read_unlock_bh(idx);
@@ -906,14 +933,18 @@ static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos)
{
- seq->private = NULL;
+ struct ip_vs_iter_state *iter = seq->private;
+
+ iter->l = NULL;
return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN;
}
static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct ip_vs_conn *cp = v;
- struct list_head *e, *l = seq->private;
+ struct ip_vs_iter_state *iter = seq->private;
+ struct hlist_node *e;
+ struct hlist_head *l = iter->l;
int idx;
++*pos;
@@ -921,27 +952,28 @@ static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
return ip_vs_conn_array(seq, 0);
/* more on same hash chain? */
- if ((e = cp->c_list.next) != l)
- return list_entry(e, struct ip_vs_conn, c_list);
+ if ((e = cp->c_list.next))
+ return hlist_entry(e, struct ip_vs_conn, c_list);
idx = l - ip_vs_conn_tab;
ct_read_unlock_bh(idx);
while (++idx < ip_vs_conn_tab_size) {
ct_read_lock_bh(idx);
- list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
- seq->private = &ip_vs_conn_tab[idx];
+ hlist_for_each_entry(cp, e, &ip_vs_conn_tab[idx], c_list) {
+ iter->l = &ip_vs_conn_tab[idx];
return cp;
}
ct_read_unlock_bh(idx);
}
- seq->private = NULL;
+ iter->l = NULL;
return NULL;
}
static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v)
{
- struct list_head *l = seq->private;
+ struct ip_vs_iter_state *iter = seq->private;
+ struct hlist_head *l = iter->l;
if (l)
ct_read_unlock_bh(l - ip_vs_conn_tab);
@@ -955,18 +987,19 @@ static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
"Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires PEName PEData\n");
else {
const struct ip_vs_conn *cp = v;
+ struct net *net = seq_file_net(seq);
char pe_data[IP_VS_PENAME_MAXLEN + IP_VS_PEDATA_MAXLEN + 3];
size_t len = 0;
- if (cp->dest && cp->pe_data &&
- cp->dest->svc->pe->show_pe_data) {
+ if (!ip_vs_conn_net_eq(cp, net))
+ return 0;
+ if (cp->pe_data) {
pe_data[0] = ' ';
- len = strlen(cp->dest->svc->pe->name);
- memcpy(pe_data + 1, cp->dest->svc->pe->name, len);
+ len = strlen(cp->pe->name);
+ memcpy(pe_data + 1, cp->pe->name, len);
pe_data[len + 1] = ' ';
len += 2;
- len += cp->dest->svc->pe->show_pe_data(cp,
- pe_data + len);
+ len += cp->pe->show_pe_data(cp, pe_data + len);
}
pe_data[len] = '\0';
@@ -1004,7 +1037,8 @@ static const struct seq_operations ip_vs_conn_seq_ops = {
static int ip_vs_conn_open(struct inode *inode, struct file *file)
{
- return seq_open(file, &ip_vs_conn_seq_ops);
+ return seq_open_net(inode, file, &ip_vs_conn_seq_ops,
+ sizeof(struct ip_vs_iter_state));
}
static const struct file_operations ip_vs_conn_fops = {
@@ -1031,6 +1065,10 @@ static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v)
"Pro FromIP FPrt ToIP TPrt DestIP DPrt State Origin Expires\n");
else {
const struct ip_vs_conn *cp = v;
+ struct net *net = seq_file_net(seq);
+
+ if (!ip_vs_conn_net_eq(cp, net))
+ return 0;
#ifdef CONFIG_IP_VS_IPV6
if (cp->af == AF_INET6)
@@ -1067,7 +1105,8 @@ static const struct seq_operations ip_vs_conn_sync_seq_ops = {
static int ip_vs_conn_sync_open(struct inode *inode, struct file *file)
{
- return seq_open(file, &ip_vs_conn_sync_seq_ops);
+ return seq_open_net(inode, file, &ip_vs_conn_sync_seq_ops,
+ sizeof(struct ip_vs_iter_state));
}
static const struct file_operations ip_vs_conn_sync_fops = {
@@ -1113,7 +1152,7 @@ static inline int todrop_entry(struct ip_vs_conn *cp)
}
/* Called from keventd and must protect itself from softirqs */
-void ip_vs_random_dropentry(void)
+void ip_vs_random_dropentry(struct net *net)
{
int idx;
struct ip_vs_conn *cp;
@@ -1123,17 +1162,19 @@ void ip_vs_random_dropentry(void)
*/
for (idx = 0; idx < (ip_vs_conn_tab_size>>5); idx++) {
unsigned hash = net_random() & ip_vs_conn_tab_mask;
+ struct hlist_node *n;
/*
* Lock is actually needed in this loop.
*/
ct_write_lock_bh(hash);
- list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
+ hlist_for_each_entry(cp, n, &ip_vs_conn_tab[hash], c_list) {
if (cp->flags & IP_VS_CONN_F_TEMPLATE)
/* connection template */
continue;
-
+ if (!ip_vs_conn_net_eq(cp, net))
+ continue;
if (cp->protocol == IPPROTO_TCP) {
switch(cp->state) {
case IP_VS_TCP_S_SYN_RECV:
@@ -1168,20 +1209,24 @@ void ip_vs_random_dropentry(void)
/*
* Flush all the connection entries in the ip_vs_conn_tab
*/
-static void ip_vs_conn_flush(void)
+static void ip_vs_conn_flush(struct net *net)
{
int idx;
struct ip_vs_conn *cp;
+ struct netns_ipvs *ipvs = net_ipvs(net);
- flush_again:
+flush_again:
for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
+ struct hlist_node *n;
+
/*
* Lock is actually needed in this loop.
*/
ct_write_lock_bh(idx);
- list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
-
+ hlist_for_each_entry(cp, n, &ip_vs_conn_tab[idx], c_list) {
+ if (!ip_vs_conn_net_eq(cp, net))
+ continue;
IP_VS_DBG(4, "del connection\n");
ip_vs_conn_expire_now(cp);
if (cp->control) {
@@ -1194,16 +1239,41 @@ static void ip_vs_conn_flush(void)
/* the counter may be not NULL, because maybe some conn entries
are run by slow timer handler or unhashed but still referred */
- if (atomic_read(&ip_vs_conn_count) != 0) {
+ if (atomic_read(&ipvs->conn_count) != 0) {
schedule();
goto flush_again;
}
}
+/*
+ * per netns init and exit
+ */
+int __net_init __ip_vs_conn_init(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ atomic_set(&ipvs->conn_count, 0);
+ proc_net_fops_create(net, "ip_vs_conn", 0, &ip_vs_conn_fops);
+ proc_net_fops_create(net, "ip_vs_conn_sync", 0, &ip_vs_conn_sync_fops);
+ return 0;
+}
+
+static void __net_exit __ip_vs_conn_cleanup(struct net *net)
+{
+ /* flush all the connection entries first */
+ ip_vs_conn_flush(net);
+ proc_net_remove(net, "ip_vs_conn");
+ proc_net_remove(net, "ip_vs_conn_sync");
+}
+static struct pernet_operations ipvs_conn_ops = {
+ .init = __ip_vs_conn_init,
+ .exit = __ip_vs_conn_cleanup,
+};
int __init ip_vs_conn_init(void)
{
int idx;
+ int retc;
/* Compute size and mask */
ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits;
@@ -1212,8 +1282,7 @@ int __init ip_vs_conn_init(void)
/*
* Allocate the connection hash table and initialize its list heads
*/
- ip_vs_conn_tab = vmalloc(ip_vs_conn_tab_size *
- sizeof(struct list_head));
+ ip_vs_conn_tab = vmalloc(ip_vs_conn_tab_size * sizeof(*ip_vs_conn_tab));
if (!ip_vs_conn_tab)
return -ENOMEM;
@@ -1233,32 +1302,25 @@ int __init ip_vs_conn_init(void)
IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n",
sizeof(struct ip_vs_conn));
- for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
- INIT_LIST_HEAD(&ip_vs_conn_tab[idx]);
- }
+ for (idx = 0; idx < ip_vs_conn_tab_size; idx++)
+ INIT_HLIST_HEAD(&ip_vs_conn_tab[idx]);
for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) {
rwlock_init(&__ip_vs_conntbl_lock_array[idx].l);
}
- proc_net_fops_create(&init_net, "ip_vs_conn", 0, &ip_vs_conn_fops);
- proc_net_fops_create(&init_net, "ip_vs_conn_sync", 0, &ip_vs_conn_sync_fops);
+ retc = register_pernet_subsys(&ipvs_conn_ops);
/* calculate the random value for connection hash */
get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd));
- return 0;
+ return retc;
}
-
void ip_vs_conn_cleanup(void)
{
- /* flush all the connection entries first */
- ip_vs_conn_flush();
-
+ unregister_pernet_subsys(&ipvs_conn_ops);
/* Release the empty cache */
kmem_cache_destroy(ip_vs_conn_cachep);
- proc_net_remove(&init_net, "ip_vs_conn");
- proc_net_remove(&init_net, "ip_vs_conn_sync");
vfree(ip_vs_conn_tab);
}
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index b4e51e9..07accf6 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -41,6 +41,7 @@
#include <net/icmp.h> /* for icmp_send */
#include <net/route.h>
#include <net/ip6_checksum.h>
+#include <net/netns/generic.h> /* net_generic() */
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
@@ -68,6 +69,12 @@ EXPORT_SYMBOL(ip_vs_conn_put);
EXPORT_SYMBOL(ip_vs_get_debug_level);
#endif
+int ip_vs_net_id __read_mostly;
+#ifdef IP_VS_GENERIC_NETNS
+EXPORT_SYMBOL(ip_vs_net_id);
+#endif
+/* netns cnt used for uniqueness */
+static atomic_t ipvs_netns_cnt = ATOMIC_INIT(0);
/* ID used in ICMP lookups */
#define icmp_id(icmph) (((icmph)->un).echo.id)
@@ -108,21 +115,28 @@ static inline void
ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
{
struct ip_vs_dest *dest = cp->dest;
+ struct netns_ipvs *ipvs = net_ipvs(skb_net(skb));
+
if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
- spin_lock(&dest->stats.lock);
- dest->stats.ustats.inpkts++;
- dest->stats.ustats.inbytes += skb->len;
- spin_unlock(&dest->stats.lock);
-
- spin_lock(&dest->svc->stats.lock);
- dest->svc->stats.ustats.inpkts++;
- dest->svc->stats.ustats.inbytes += skb->len;
- spin_unlock(&dest->svc->stats.lock);
-
- spin_lock(&ip_vs_stats.lock);
- ip_vs_stats.ustats.inpkts++;
- ip_vs_stats.ustats.inbytes += skb->len;
- spin_unlock(&ip_vs_stats.lock);
+ struct ip_vs_cpu_stats *s;
+
+ s = this_cpu_ptr(dest->stats.cpustats);
+ s->ustats.inpkts++;
+ u64_stats_update_begin(&s->syncp);
+ s->ustats.inbytes += skb->len;
+ u64_stats_update_end(&s->syncp);
+
+ s = this_cpu_ptr(dest->svc->stats.cpustats);
+ s->ustats.inpkts++;
+ u64_stats_update_begin(&s->syncp);
+ s->ustats.inbytes += skb->len;
+ u64_stats_update_end(&s->syncp);
+
+ s = this_cpu_ptr(ipvs->tot_stats.cpustats);
+ s->ustats.inpkts++;
+ u64_stats_update_begin(&s->syncp);
+ s->ustats.inbytes += skb->len;
+ u64_stats_update_end(&s->syncp);
}
}
@@ -131,21 +145,28 @@ static inline void
ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
{
struct ip_vs_dest *dest = cp->dest;
+ struct netns_ipvs *ipvs = net_ipvs(skb_net(skb));
+
if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
- spin_lock(&dest->stats.lock);
- dest->stats.ustats.outpkts++;
- dest->stats.ustats.outbytes += skb->len;
- spin_unlock(&dest->stats.lock);
-
- spin_lock(&dest->svc->stats.lock);
- dest->svc->stats.ustats.outpkts++;
- dest->svc->stats.ustats.outbytes += skb->len;
- spin_unlock(&dest->svc->stats.lock);
-
- spin_lock(&ip_vs_stats.lock);
- ip_vs_stats.ustats.outpkts++;
- ip_vs_stats.ustats.outbytes += skb->len;
- spin_unlock(&ip_vs_stats.lock);
+ struct ip_vs_cpu_stats *s;
+
+ s = this_cpu_ptr(dest->stats.cpustats);
+ s->ustats.outpkts++;
+ u64_stats_update_begin(&s->syncp);
+ s->ustats.outbytes += skb->len;
+ u64_stats_update_end(&s->syncp);
+
+ s = this_cpu_ptr(dest->svc->stats.cpustats);
+ s->ustats.outpkts++;
+ u64_stats_update_begin(&s->syncp);
+ s->ustats.outbytes += skb->len;
+ u64_stats_update_end(&s->syncp);
+
+ s = this_cpu_ptr(ipvs->tot_stats.cpustats);
+ s->ustats.outpkts++;
+ u64_stats_update_begin(&s->syncp);
+ s->ustats.outbytes += skb->len;
+ u64_stats_update_end(&s->syncp);
}
}
@@ -153,41 +174,44 @@ ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
static inline void
ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
{
- spin_lock(&cp->dest->stats.lock);
- cp->dest->stats.ustats.conns++;
- spin_unlock(&cp->dest->stats.lock);
+ struct netns_ipvs *ipvs = net_ipvs(svc->net);
+ struct ip_vs_cpu_stats *s;
+
+ s = this_cpu_ptr(cp->dest->stats.cpustats);
+ s->ustats.conns++;
- spin_lock(&svc->stats.lock);
- svc->stats.ustats.conns++;
- spin_unlock(&svc->stats.lock);
+ s = this_cpu_ptr(svc->stats.cpustats);
+ s->ustats.conns++;
- spin_lock(&ip_vs_stats.lock);
- ip_vs_stats.ustats.conns++;
- spin_unlock(&ip_vs_stats.lock);
+ s = this_cpu_ptr(ipvs->tot_stats.cpustats);
+ s->ustats.conns++;
}
static inline int
ip_vs_set_state(struct ip_vs_conn *cp, int direction,
const struct sk_buff *skb,
- struct ip_vs_protocol *pp)
+ struct ip_vs_proto_data *pd)
{
- if (unlikely(!pp->state_transition))
+ if (unlikely(!pd->pp->state_transition))
return 0;
- return pp->state_transition(cp, direction, skb, pp);
+ return pd->pp->state_transition(cp, direction, skb, pd);
}
-static inline void
+static inline int
ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc,
struct sk_buff *skb, int protocol,
const union nf_inet_addr *caddr, __be16 cport,
const union nf_inet_addr *vaddr, __be16 vport,
struct ip_vs_conn_param *p)
{
- ip_vs_conn_fill_param(svc->af, protocol, caddr, cport, vaddr, vport, p);
+ ip_vs_conn_fill_param(svc->net, svc->af, protocol, caddr, cport, vaddr,
+ vport, p);
p->pe = svc->pe;
if (p->pe && p->pe->fill_param)
- p->pe->fill_param(p, skb);
+ return p->pe->fill_param(p, skb);
+
+ return 0;
}
/*
@@ -200,7 +224,7 @@ ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc,
static struct ip_vs_conn *
ip_vs_sched_persist(struct ip_vs_service *svc,
struct sk_buff *skb,
- __be16 ports[2])
+ __be16 src_port, __be16 dst_port, int *ignored)
{
struct ip_vs_conn *cp = NULL;
struct ip_vs_iphdr iph;
@@ -224,8 +248,8 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u "
"mnet %s\n",
- IP_VS_DBG_ADDR(svc->af, &iph.saddr), ntohs(ports[0]),
- IP_VS_DBG_ADDR(svc->af, &iph.daddr), ntohs(ports[1]),
+ IP_VS_DBG_ADDR(svc->af, &iph.saddr), ntohs(src_port),
+ IP_VS_DBG_ADDR(svc->af, &iph.daddr), ntohs(dst_port),
IP_VS_DBG_ADDR(svc->af, &snet));
/*
@@ -247,14 +271,14 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) };
__be16 vport = 0;
- if (ports[1] == svc->port) {
+ if (dst_port == svc->port) {
/* non-FTP template:
* <protocol, caddr, 0, vaddr, vport, daddr, dport>
* FTP template:
* <protocol, caddr, 0, vaddr, 0, daddr, 0>
*/
if (svc->port != FTPPORT)
- vport = ports[1];
+ vport = dst_port;
} else {
/* Note: persistent fwmark-based services and
* persistent port zero service are handled here.
@@ -268,24 +292,31 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
vaddr = &fwmark;
}
}
- ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0,
- vaddr, vport, &param);
+ /* return *ignored = -1 so NF_DROP can be used */
+ if (ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0,
+ vaddr, vport, &param) < 0) {
+ *ignored = -1;
+ return NULL;
+ }
}
/* Check if a template already exists */
ct = ip_vs_ct_in_get(&param);
if (!ct || !ip_vs_check_template(ct)) {
- /* No template found or the dest of the connection
+ /*
+ * No template found or the dest of the connection
* template is not available.
+ * return *ignored=0 i.e. ICMP and NF_DROP
*/
dest = svc->scheduler->schedule(svc, skb);
if (!dest) {
IP_VS_DBG(1, "p-schedule: no dest found.\n");
kfree(param.pe_data);
+ *ignored = 0;
return NULL;
}
- if (ports[1] == svc->port && svc->port != FTPPORT)
+ if (dst_port == svc->port && svc->port != FTPPORT)
dport = dest->port;
/* Create a template
@@ -293,9 +324,10 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
* and thus param.pe_data will be destroyed
* when the template expires */
ct = ip_vs_conn_new(&param, &dest->addr, dport,
- IP_VS_CONN_F_TEMPLATE, dest);
+ IP_VS_CONN_F_TEMPLATE, dest, skb->mark);
if (ct == NULL) {
kfree(param.pe_data);
+ *ignored = -1;
return NULL;
}
@@ -306,7 +338,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
kfree(param.pe_data);
}
- dport = ports[1];
+ dport = dst_port;
if (dport == svc->port && dest->port)
dport = dest->port;
@@ -317,11 +349,13 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
/*
* Create a new connection according to the template
*/
- ip_vs_conn_fill_param(svc->af, iph.protocol, &iph.saddr, ports[0],
- &iph.daddr, ports[1], &param);
- cp = ip_vs_conn_new(&param, &dest->addr, dport, flags, dest);
+ ip_vs_conn_fill_param(svc->net, svc->af, iph.protocol, &iph.saddr,
+ src_port, &iph.daddr, dst_port, &param);
+
+ cp = ip_vs_conn_new(&param, &dest->addr, dport, flags, dest, skb->mark);
if (cp == NULL) {
ip_vs_conn_put(ct);
+ *ignored = -1;
return NULL;
}
@@ -341,11 +375,27 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
* It selects a server according to the virtual service, and
* creates a connection entry.
* Protocols supported: TCP, UDP
+ *
+ * Usage of *ignored
+ *
+ * 1 : protocol tried to schedule (eg. on SYN), found svc but the
+ * svc/scheduler decides that this packet should be accepted with
+ * NF_ACCEPT because it must not be scheduled.
+ *
+ * 0 : scheduler can not find destination, so try bypass or
+ * return ICMP and then NF_DROP (ip_vs_leave).
+ *
+ * -1 : scheduler tried to schedule but fatal error occurred, eg.
+ * ip_vs_conn_new failure (ENOMEM) or ip_vs_sip_fill_param
+ * failure such as missing Call-ID, ENOMEM on skb_linearize
+ * or pe_data. In this case we should return NF_DROP without
+ * any attempts to send ICMP with ip_vs_leave.
*/
struct ip_vs_conn *
ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
- struct ip_vs_protocol *pp, int *ignored)
+ struct ip_vs_proto_data *pd, int *ignored)
{
+ struct ip_vs_protocol *pp = pd->pp;
struct ip_vs_conn *cp = NULL;
struct ip_vs_iphdr iph;
struct ip_vs_dest *dest;
@@ -371,12 +421,10 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
}
/*
- * Do not schedule replies from local real server. It is risky
- * for fwmark services but mostly for persistent services.
+ * Do not schedule replies from local real server.
*/
if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
- (svc->flags & IP_VS_SVC_F_PERSISTENT || svc->fwmark) &&
- (cp = pp->conn_in_get(svc->af, skb, pp, &iph, iph.len, 1))) {
+ (cp = pp->conn_in_get(svc->af, skb, &iph, iph.len, 1))) {
IP_VS_DBG_PKT(12, svc->af, pp, skb, 0,
"Not scheduling reply for existing connection");
__ip_vs_conn_put(cp);
@@ -386,10 +434,10 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
/*
* Persistent service
*/
- if (svc->flags & IP_VS_SVC_F_PERSISTENT) {
- *ignored = 0;
- return ip_vs_sched_persist(svc, skb, pptr);
- }
+ if (svc->flags & IP_VS_SVC_F_PERSISTENT)
+ return ip_vs_sched_persist(svc, skb, pptr[0], pptr[1], ignored);
+
+ *ignored = 0;
/*
* Non-persistent service
@@ -402,8 +450,6 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
return NULL;
}
- *ignored = 0;
-
dest = svc->scheduler->schedule(svc, skb);
if (dest == NULL) {
IP_VS_DBG(1, "Schedule: no dest found.\n");
@@ -419,13 +465,17 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
*/
{
struct ip_vs_conn_param p;
- ip_vs_conn_fill_param(svc->af, iph.protocol, &iph.saddr,
- pptr[0], &iph.daddr, pptr[1], &p);
+
+ ip_vs_conn_fill_param(svc->net, svc->af, iph.protocol,
+ &iph.saddr, pptr[0], &iph.daddr, pptr[1],
+ &p);
cp = ip_vs_conn_new(&p, &dest->addr,
dest->port ? dest->port : pptr[1],
- flags, dest);
- if (!cp)
+ flags, dest, skb->mark);
+ if (!cp) {
+ *ignored = -1;
return NULL;
+ }
}
IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u "
@@ -447,11 +497,16 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
* no destination is available for a new connection.
*/
int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
- struct ip_vs_protocol *pp)
+ struct ip_vs_proto_data *pd)
{
__be16 _ports[2], *pptr;
struct ip_vs_iphdr iph;
+#ifdef CONFIG_SYSCTL
+ struct net *net;
+ struct netns_ipvs *ipvs;
int unicast;
+#endif
+
ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
@@ -460,17 +515,21 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
return NF_DROP;
}
+#ifdef CONFIG_SYSCTL
+ net = skb_net(skb);
+
#ifdef CONFIG_IP_VS_IPV6
if (svc->af == AF_INET6)
unicast = ipv6_addr_type(&iph.daddr.in6) & IPV6_ADDR_UNICAST;
else
#endif
- unicast = (inet_addr_type(&init_net, iph.daddr.ip) == RTN_UNICAST);
+ unicast = (inet_addr_type(net, iph.daddr.ip) == RTN_UNICAST);
/* if it is fwmark-based service, the cache_bypass sysctl is up
and the destination is a non-local unicast, then create
a cache_bypass connection entry */
- if (sysctl_ip_vs_cache_bypass && svc->fwmark && unicast) {
+ ipvs = net_ipvs(net);
+ if (ipvs->sysctl_cache_bypass && svc->fwmark && unicast) {
int ret, cs;
struct ip_vs_conn *cp;
unsigned int flags = (svc->flags & IP_VS_SVC_F_ONEPACKET &&
@@ -484,12 +543,12 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__);
{
struct ip_vs_conn_param p;
- ip_vs_conn_fill_param(svc->af, iph.protocol,
+ ip_vs_conn_fill_param(svc->net, svc->af, iph.protocol,
&iph.saddr, pptr[0],
&iph.daddr, pptr[1], &p);
cp = ip_vs_conn_new(&p, &daddr, 0,
IP_VS_CONN_F_BYPASS | flags,
- NULL);
+ NULL, skb->mark);
if (!cp)
return NF_DROP;
}
@@ -498,16 +557,17 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
ip_vs_in_stats(cp, skb);
/* set state */
- cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
+ cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd);
/* transmit the first SYN packet */
- ret = cp->packet_xmit(skb, cp, pp);
+ ret = cp->packet_xmit(skb, cp, pd->pp);
/* do not touch skb anymore */
atomic_inc(&cp->in_pkts);
ip_vs_conn_put(cp);
return ret;
}
+#endif
/*
* When the virtual ftp service is presented, packets destined
@@ -544,6 +604,33 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
return NF_DROP;
}
+#ifdef CONFIG_SYSCTL
+
+static int sysctl_snat_reroute(struct sk_buff *skb)
+{
+ struct netns_ipvs *ipvs = net_ipvs(skb_net(skb));
+ return ipvs->sysctl_snat_reroute;
+}
+
+static int sysctl_nat_icmp_send(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ return ipvs->sysctl_nat_icmp_send;
+}
+
+static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs)
+{
+ return ipvs->sysctl_expire_nodest_conn;
+}
+
+#else
+
+static int sysctl_snat_reroute(struct sk_buff *skb) { return 0; }
+static int sysctl_nat_icmp_send(struct net *net) { return 0; }
+static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs) { return 0; }
+
+#endif
+
__sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
{
return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
@@ -576,6 +663,22 @@ static inline int ip_vs_gather_frags_v6(struct sk_buff *skb, u_int32_t user)
}
#endif
+static int ip_vs_route_me_harder(int af, struct sk_buff *skb)
+{
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6) {
+ if (sysctl_snat_reroute(skb) && ip6_route_me_harder(skb) != 0)
+ return 1;
+ } else
+#endif
+ if ((sysctl_snat_reroute(skb) ||
+ skb_rtable(skb)->rt_flags & RTCF_LOCAL) &&
+ ip_route_me_harder(skb, RTN_LOCAL) != 0)
+ return 1;
+
+ return 0;
+}
+
/*
* Packet has been made sufficiently writable in caller
* - inout: 1=in->out, 0=out->in
@@ -674,7 +777,7 @@ void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp,
#endif
/* Handle relevant response ICMP messages - forward to the right
- * destination host. Used for NAT and local client.
+ * destination host.
*/
static int handle_response_icmp(int af, struct sk_buff *skb,
union nf_inet_addr *snet,
@@ -710,16 +813,8 @@ static int handle_response_icmp(int af, struct sk_buff *skb,
#endif
ip_vs_nat_icmp(skb, pp, cp, 1);
-#ifdef CONFIG_IP_VS_IPV6
- if (af == AF_INET6) {
- if (sysctl_ip_vs_snat_reroute && ip6_route_me_harder(skb) != 0)
- goto out;
- } else
-#endif
- if ((sysctl_ip_vs_snat_reroute ||
- skb_rtable(skb)->rt_flags & RTCF_LOCAL) &&
- ip_route_me_harder(skb, RTN_LOCAL) != 0)
- goto out;
+ if (ip_vs_route_me_harder(af, skb))
+ goto out;
/* do the statistics and put it back */
ip_vs_out_stats(cp, skb);
@@ -808,7 +903,7 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related,
ip_vs_fill_iphdr(AF_INET, cih, &ciph);
/* The embedded headers contain source and dest in reverse order */
- cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1);
+ cp = pp->conn_out_get(AF_INET, skb, &ciph, offset, 1);
if (!cp)
return NF_ACCEPT;
@@ -885,7 +980,7 @@ static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related,
ip_vs_fill_iphdr(AF_INET6, cih, &ciph);
/* The embedded headers contain source and dest in reverse order */
- cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1);
+ cp = pp->conn_out_get(AF_INET6, skb, &ciph, offset, 1);
if (!cp)
return NF_ACCEPT;
@@ -921,12 +1016,13 @@ static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len)
}
/* Handle response packets: rewrite addresses and send away...
- * Used for NAT and local client.
*/
static unsigned int
-handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
+handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
struct ip_vs_conn *cp, int ihl)
{
+ struct ip_vs_protocol *pp = pd->pp;
+
IP_VS_DBG_PKT(11, af, pp, skb, 0, "Outgoing packet");
if (!skb_make_writable(skb, ihl))
@@ -961,21 +1057,13 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
* if it came from this machine itself. So re-compute
* the routing information.
*/
-#ifdef CONFIG_IP_VS_IPV6
- if (af == AF_INET6) {
- if (sysctl_ip_vs_snat_reroute && ip6_route_me_harder(skb) != 0)
- goto drop;
- } else
-#endif
- if ((sysctl_ip_vs_snat_reroute ||
- skb_rtable(skb)->rt_flags & RTCF_LOCAL) &&
- ip_route_me_harder(skb, RTN_LOCAL) != 0)
- goto drop;
+ if (ip_vs_route_me_harder(af, skb))
+ goto drop;
IP_VS_DBG_PKT(10, af, pp, skb, 0, "After SNAT");
ip_vs_out_stats(cp, skb);
- ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
+ ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pd);
skb->ipvs_property = 1;
if (!(cp->flags & IP_VS_CONN_F_NFCT))
ip_vs_notrack(skb);
@@ -999,8 +1087,10 @@ drop:
static unsigned int
ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
{
+ struct net *net = NULL;
struct ip_vs_iphdr iph;
struct ip_vs_protocol *pp;
+ struct ip_vs_proto_data *pd;
struct ip_vs_conn *cp;
EnterFunction(11);
@@ -1022,6 +1112,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
if (unlikely(!skb_dst(skb)))
return NF_ACCEPT;
+ net = skb_net(skb);
ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6) {
@@ -1045,9 +1136,10 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
}
- pp = ip_vs_proto_get(iph.protocol);
- if (unlikely(!pp))
+ pd = ip_vs_proto_data_get(net, iph.protocol);
+ if (unlikely(!pd))
return NF_ACCEPT;
+ pp = pd->pp;
/* reassemble IP fragments */
#ifdef CONFIG_IP_VS_IPV6
@@ -1073,11 +1165,11 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
/*
* Check if the packet belongs to an existing entry
*/
- cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
+ cp = pp->conn_out_get(af, skb, &iph, iph.len, 0);
if (likely(cp))
- return handle_response(af, skb, pp, cp, iph.len);
- if (sysctl_ip_vs_nat_icmp_send &&
+ return handle_response(af, skb, pd, cp, iph.len);
+ if (sysctl_nat_icmp_send(net) &&
(pp->protocol == IPPROTO_TCP ||
pp->protocol == IPPROTO_UDP ||
pp->protocol == IPPROTO_SCTP)) {
@@ -1087,7 +1179,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
sizeof(_ports), _ports);
if (pptr == NULL)
return NF_ACCEPT; /* Not for me */
- if (ip_vs_lookup_real_service(af, iph.protocol,
+ if (ip_vs_lookup_real_service(net, af, iph.protocol,
&iph.saddr,
pptr[0])) {
/*
@@ -1202,14 +1294,15 @@ ip_vs_local_reply6(unsigned int hooknum, struct sk_buff *skb,
static int
ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
{
+ struct net *net = NULL;
struct iphdr *iph;
struct icmphdr _icmph, *ic;
struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */
struct ip_vs_iphdr ciph;
struct ip_vs_conn *cp;
struct ip_vs_protocol *pp;
+ struct ip_vs_proto_data *pd;
unsigned int offset, ihl, verdict;
- union nf_inet_addr snet;
*related = 1;
@@ -1249,9 +1342,11 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
if (cih == NULL)
return NF_ACCEPT; /* The packet looks wrong, ignore */
- pp = ip_vs_proto_get(cih->protocol);
- if (!pp)
+ net = skb_net(skb);
+ pd = ip_vs_proto_data_get(net, cih->protocol);
+ if (!pd)
return NF_ACCEPT;
+ pp = pd->pp;
/* Is the embedded protocol header present? */
if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
@@ -1265,18 +1360,9 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
ip_vs_fill_iphdr(AF_INET, cih, &ciph);
/* The embedded headers contain source and dest in reverse order */
- cp = pp->conn_in_get(AF_INET, skb, pp, &ciph, offset, 1);
- if (!cp) {
- /* The packet could also belong to a local client */
- cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1);
- if (cp) {
- snet.ip = iph->saddr;
- return handle_response_icmp(AF_INET, skb, &snet,
- cih->protocol, cp, pp,
- offset, ihl);
- }
+ cp = pp->conn_in_get(AF_INET, skb, &ciph, offset, 1);
+ if (!cp)
return NF_ACCEPT;
- }
verdict = NF_DROP;
@@ -1312,6 +1398,7 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
static int
ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
{
+ struct net *net = NULL;
struct ipv6hdr *iph;
struct icmp6hdr _icmph, *ic;
struct ipv6hdr _ciph, *cih; /* The ip header contained
@@ -1319,8 +1406,8 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
struct ip_vs_iphdr ciph;
struct ip_vs_conn *cp;
struct ip_vs_protocol *pp;
+ struct ip_vs_proto_data *pd;
unsigned int offset, verdict;
- union nf_inet_addr snet;
struct rt6_info *rt;
*related = 1;
@@ -1361,9 +1448,11 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
if (cih == NULL)
return NF_ACCEPT; /* The packet looks wrong, ignore */
- pp = ip_vs_proto_get(cih->nexthdr);
- if (!pp)
+ net = skb_net(skb);
+ pd = ip_vs_proto_data_get(net, cih->nexthdr);
+ if (!pd)
return NF_ACCEPT;
+ pp = pd->pp;
/* Is the embedded protocol header present? */
/* TODO: we don't support fragmentation at the moment anyways */
@@ -1377,19 +1466,9 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
ip_vs_fill_iphdr(AF_INET6, cih, &ciph);
/* The embedded headers contain source and dest in reverse order */
- cp = pp->conn_in_get(AF_INET6, skb, pp, &ciph, offset, 1);
- if (!cp) {
- /* The packet could also belong to a local client */
- cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1);
- if (cp) {
- ipv6_addr_copy(&snet.in6, &iph->saddr);
- return handle_response_icmp(AF_INET6, skb, &snet,
- cih->nexthdr,
- cp, pp, offset,
- sizeof(struct ipv6hdr));
- }
+ cp = pp->conn_in_get(AF_INET6, skb, &ciph, offset, 1);
+ if (!cp)
return NF_ACCEPT;
- }
verdict = NF_DROP;
@@ -1423,10 +1502,13 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
static unsigned int
ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
{
+ struct net *net;
struct ip_vs_iphdr iph;
struct ip_vs_protocol *pp;
+ struct ip_vs_proto_data *pd;
struct ip_vs_conn *cp;
int ret, restart, pkts;
+ struct netns_ipvs *ipvs;
/* Already marked as IPVS request or reply? */
if (skb->ipvs_property)
@@ -1480,20 +1562,21 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
}
+ net = skb_net(skb);
/* Protocol supported? */
- pp = ip_vs_proto_get(iph.protocol);
- if (unlikely(!pp))
+ pd = ip_vs_proto_data_get(net, iph.protocol);
+ if (unlikely(!pd))
return NF_ACCEPT;
-
+ pp = pd->pp;
/*
* Check if the packet belongs to an existing connection entry
*/
- cp = pp->conn_in_get(af, skb, pp, &iph, iph.len, 0);
+ cp = pp->conn_in_get(af, skb, &iph, iph.len, 0);
if (unlikely(!cp)) {
int v;
- if (!pp->conn_schedule(af, skb, pp, &v, &cp))
+ if (!pp->conn_schedule(af, skb, pd, &v, &cp))
return v;
}
@@ -1505,12 +1588,13 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
}
IP_VS_DBG_PKT(11, af, pp, skb, 0, "Incoming packet");
-
+ net = skb_net(skb);
+ ipvs = net_ipvs(net);
/* Check the server status */
if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
/* the destination server is not available */
- if (sysctl_ip_vs_expire_nodest_conn) {
+ if (sysctl_expire_nodest_conn(ipvs)) {
/* try to expire the connection immediately */
ip_vs_conn_expire_now(cp);
}
@@ -1521,7 +1605,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
}
ip_vs_in_stats(cp, skb);
- restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
+ restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd);
if (cp->packet_xmit)
ret = cp->packet_xmit(skb, cp, pp);
/* do not touch skb anymore */
@@ -1535,35 +1619,41 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
*
* Sync connection if it is about to close to
* encorage the standby servers to update the connections timeout
+ *
+ * For ONE_PKT let ip_vs_sync_conn() do the filter work.
*/
- pkts = atomic_add_return(1, &cp->in_pkts);
- if (af == AF_INET && (ip_vs_sync_state & IP_VS_STATE_MASTER) &&
+
+ if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
+ pkts = sysctl_sync_threshold(ipvs);
+ else
+ pkts = atomic_add_return(1, &cp->in_pkts);
+
+ if ((ipvs->sync_state & IP_VS_STATE_MASTER) &&
cp->protocol == IPPROTO_SCTP) {
if ((cp->state == IP_VS_SCTP_S_ESTABLISHED &&
- (pkts % sysctl_ip_vs_sync_threshold[1]
- == sysctl_ip_vs_sync_threshold[0])) ||
+ (pkts % sysctl_sync_period(ipvs)
+ == sysctl_sync_threshold(ipvs))) ||
(cp->old_state != cp->state &&
((cp->state == IP_VS_SCTP_S_CLOSED) ||
(cp->state == IP_VS_SCTP_S_SHUT_ACK_CLI) ||
(cp->state == IP_VS_SCTP_S_SHUT_ACK_SER)))) {
- ip_vs_sync_conn(cp);
+ ip_vs_sync_conn(net, cp);
goto out;
}
}
/* Keep this block last: TCP and others with pp->num_states <= 1 */
- else if (af == AF_INET &&
- (ip_vs_sync_state & IP_VS_STATE_MASTER) &&
+ else if ((ipvs->sync_state & IP_VS_STATE_MASTER) &&
(((cp->protocol != IPPROTO_TCP ||
cp->state == IP_VS_TCP_S_ESTABLISHED) &&
- (pkts % sysctl_ip_vs_sync_threshold[1]
- == sysctl_ip_vs_sync_threshold[0])) ||
+ (pkts % sysctl_sync_period(ipvs)
+ == sysctl_sync_threshold(ipvs))) ||
((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) &&
((cp->state == IP_VS_TCP_S_FIN_WAIT) ||
(cp->state == IP_VS_TCP_S_CLOSE) ||
(cp->state == IP_VS_TCP_S_CLOSE_WAIT) ||
(cp->state == IP_VS_TCP_S_TIME_WAIT)))))
- ip_vs_sync_conn(cp);
+ ip_vs_sync_conn(net, cp);
out:
cp->old_state = cp->state;
@@ -1782,7 +1872,39 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
},
#endif
};
+/*
+ * Initialize IP Virtual Server netns mem.
+ */
+static int __net_init __ip_vs_init(struct net *net)
+{
+ struct netns_ipvs *ipvs;
+
+ ipvs = net_generic(net, ip_vs_net_id);
+ if (ipvs == NULL) {
+ pr_err("%s(): no memory.\n", __func__);
+ return -ENOMEM;
+ }
+ ipvs->net = net;
+ /* Counters used for creating unique names */
+ ipvs->gen = atomic_read(&ipvs_netns_cnt);
+ atomic_inc(&ipvs_netns_cnt);
+ net->ipvs = ipvs;
+ printk(KERN_INFO "IPVS: Creating netns size=%zu id=%d\n",
+ sizeof(struct netns_ipvs), ipvs->gen);
+ return 0;
+}
+static void __net_exit __ip_vs_cleanup(struct net *net)
+{
+ IP_VS_DBG(10, "ipvs netns %d released\n", net_ipvs(net)->gen);
+}
+
+static struct pernet_operations ipvs_core_ops = {
+ .init = __ip_vs_init,
+ .exit = __ip_vs_cleanup,
+ .id = &ip_vs_net_id,
+ .size = sizeof(struct netns_ipvs),
+};
/*
* Initialize IP Virtual Server
@@ -1791,8 +1913,11 @@ static int __init ip_vs_init(void)
{
int ret;
- ip_vs_estimator_init();
+ ret = register_pernet_subsys(&ipvs_core_ops); /* Alloc ip_vs struct */
+ if (ret < 0)
+ return ret;
+ ip_vs_estimator_init();
ret = ip_vs_control_init();
if (ret < 0) {
pr_err("can't setup control.\n");
@@ -1813,15 +1938,23 @@ static int __init ip_vs_init(void)
goto cleanup_app;
}
+ ret = ip_vs_sync_init();
+ if (ret < 0) {
+ pr_err("can't setup sync data.\n");
+ goto cleanup_conn;
+ }
+
ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
if (ret < 0) {
pr_err("can't register hooks.\n");
- goto cleanup_conn;
+ goto cleanup_sync;
}
pr_info("ipvs loaded.\n");
return ret;
+cleanup_sync:
+ ip_vs_sync_cleanup();
cleanup_conn:
ip_vs_conn_cleanup();
cleanup_app:
@@ -1831,17 +1964,20 @@ static int __init ip_vs_init(void)
ip_vs_control_cleanup();
cleanup_estimator:
ip_vs_estimator_cleanup();
+ unregister_pernet_subsys(&ipvs_core_ops); /* free ip_vs struct */
return ret;
}
static void __exit ip_vs_cleanup(void)
{
nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
+ ip_vs_sync_cleanup();
ip_vs_conn_cleanup();
ip_vs_app_cleanup();
ip_vs_protocol_cleanup();
ip_vs_control_cleanup();
ip_vs_estimator_cleanup();
+ unregister_pernet_subsys(&ipvs_core_ops); /* free ip_vs struct */
pr_info("ipvs unloaded.\n");
}
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index ba98e13..b799cea 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -38,6 +38,7 @@
#include <linux/mutex.h>
#include <net/net_namespace.h>
+#include <linux/nsproxy.h>
#include <net/ip.h>
#ifdef CONFIG_IP_VS_IPV6
#include <net/ipv6.h>
@@ -57,42 +58,7 @@ static DEFINE_MUTEX(__ip_vs_mutex);
/* lock for service table */
static DEFINE_RWLOCK(__ip_vs_svc_lock);
-/* lock for table with the real services */
-static DEFINE_RWLOCK(__ip_vs_rs_lock);
-
-/* lock for state and timeout tables */
-static DEFINE_SPINLOCK(ip_vs_securetcp_lock);
-
-/* lock for drop entry handling */
-static DEFINE_SPINLOCK(__ip_vs_dropentry_lock);
-
-/* lock for drop packet handling */
-static DEFINE_SPINLOCK(__ip_vs_droppacket_lock);
-
-/* 1/rate drop and drop-entry variables */
-int ip_vs_drop_rate = 0;
-int ip_vs_drop_counter = 0;
-static atomic_t ip_vs_dropentry = ATOMIC_INIT(0);
-
-/* number of virtual services */
-static int ip_vs_num_services = 0;
-
/* sysctl variables */
-static int sysctl_ip_vs_drop_entry = 0;
-static int sysctl_ip_vs_drop_packet = 0;
-static int sysctl_ip_vs_secure_tcp = 0;
-static int sysctl_ip_vs_amemthresh = 1024;
-static int sysctl_ip_vs_am_droprate = 10;
-int sysctl_ip_vs_cache_bypass = 0;
-int sysctl_ip_vs_expire_nodest_conn = 0;
-int sysctl_ip_vs_expire_quiescent_template = 0;
-int sysctl_ip_vs_sync_threshold[2] = { 3, 50 };
-int sysctl_ip_vs_nat_icmp_send = 0;
-#ifdef CONFIG_IP_VS_NFCT
-int sysctl_ip_vs_conntrack;
-#endif
-int sysctl_ip_vs_snat_reroute = 1;
-
#ifdef CONFIG_IP_VS_DEBUG
static int sysctl_ip_vs_debug_level = 0;
@@ -105,27 +71,28 @@ int ip_vs_get_debug_level(void)
#ifdef CONFIG_IP_VS_IPV6
/* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */
-static int __ip_vs_addr_is_local_v6(const struct in6_addr *addr)
+static int __ip_vs_addr_is_local_v6(struct net *net,
+ const struct in6_addr *addr)
{
struct rt6_info *rt;
- struct flowi fl = {
- .oif = 0,
- .fl6_dst = *addr,
- .fl6_src = { .s6_addr32 = {0, 0, 0, 0} },
+ struct flowi6 fl6 = {
+ .daddr = *addr,
};
- rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
+ rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
if (rt && rt->rt6i_dev && (rt->rt6i_dev->flags & IFF_LOOPBACK))
- return 1;
+ return 1;
return 0;
}
#endif
+
+#ifdef CONFIG_SYSCTL
/*
* update_defense_level is called from keventd and from sysctl,
* so it needs to protect itself from softirqs
*/
-static void update_defense_level(void)
+static void update_defense_level(struct netns_ipvs *ipvs)
{
struct sysinfo i;
static int old_secure_tcp = 0;
@@ -141,73 +108,73 @@ static void update_defense_level(void)
/* si_swapinfo(&i); */
/* availmem = availmem - (i.totalswap - i.freeswap); */
- nomem = (availmem < sysctl_ip_vs_amemthresh);
+ nomem = (availmem < ipvs->sysctl_amemthresh);
local_bh_disable();
/* drop_entry */
- spin_lock(&__ip_vs_dropentry_lock);
- switch (sysctl_ip_vs_drop_entry) {
+ spin_lock(&ipvs->dropentry_lock);
+ switch (ipvs->sysctl_drop_entry) {
case 0:
- atomic_set(&ip_vs_dropentry, 0);
+ atomic_set(&ipvs->dropentry, 0);
break;
case 1:
if (nomem) {
- atomic_set(&ip_vs_dropentry, 1);
- sysctl_ip_vs_drop_entry = 2;
+ atomic_set(&ipvs->dropentry, 1);
+ ipvs->sysctl_drop_entry = 2;
} else {
- atomic_set(&ip_vs_dropentry, 0);
+ atomic_set(&ipvs->dropentry, 0);
}
break;
case 2:
if (nomem) {
- atomic_set(&ip_vs_dropentry, 1);
+ atomic_set(&ipvs->dropentry, 1);
} else {
- atomic_set(&ip_vs_dropentry, 0);
- sysctl_ip_vs_drop_entry = 1;
+ atomic_set(&ipvs->dropentry, 0);
+ ipvs->sysctl_drop_entry = 1;
};
break;
case 3:
- atomic_set(&ip_vs_dropentry, 1);
+ atomic_set(&ipvs->dropentry, 1);
break;
}
- spin_unlock(&__ip_vs_dropentry_lock);
+ spin_unlock(&ipvs->dropentry_lock);
/* drop_packet */
- spin_lock(&__ip_vs_droppacket_lock);
- switch (sysctl_ip_vs_drop_packet) {
+ spin_lock(&ipvs->droppacket_lock);
+ switch (ipvs->sysctl_drop_packet) {
case 0:
- ip_vs_drop_rate = 0;
+ ipvs->drop_rate = 0;
break;
case 1:
if (nomem) {
- ip_vs_drop_rate = ip_vs_drop_counter
- = sysctl_ip_vs_amemthresh /
- (sysctl_ip_vs_amemthresh-availmem);
- sysctl_ip_vs_drop_packet = 2;
+ ipvs->drop_rate = ipvs->drop_counter
+ = ipvs->sysctl_amemthresh /
+ (ipvs->sysctl_amemthresh-availmem);
+ ipvs->sysctl_drop_packet = 2;
} else {
- ip_vs_drop_rate = 0;
+ ipvs->drop_rate = 0;
}
break;
case 2:
if (nomem) {
- ip_vs_drop_rate = ip_vs_drop_counter
- = sysctl_ip_vs_amemthresh /
- (sysctl_ip_vs_amemthresh-availmem);
+ ipvs->drop_rate = ipvs->drop_counter
+ = ipvs->sysctl_amemthresh /
+ (ipvs->sysctl_amemthresh-availmem);
} else {
- ip_vs_drop_rate = 0;
- sysctl_ip_vs_drop_packet = 1;
+ ipvs->drop_rate = 0;
+ ipvs->sysctl_drop_packet = 1;
}
break;
case 3:
- ip_vs_drop_rate = sysctl_ip_vs_am_droprate;
+ ipvs->drop_rate = ipvs->sysctl_am_droprate;
break;
}
- spin_unlock(&__ip_vs_droppacket_lock);
+ spin_unlock(&ipvs->droppacket_lock);
/* secure_tcp */
- spin_lock(&ip_vs_securetcp_lock);
- switch (sysctl_ip_vs_secure_tcp) {
+ spin_lock(&ipvs->securetcp_lock);
+ switch (ipvs->sysctl_secure_tcp) {
case 0:
if (old_secure_tcp >= 2)
to_change = 0;
@@ -216,7 +183,7 @@ static void update_defense_level(void)
if (nomem) {
if (old_secure_tcp < 2)
to_change = 1;
- sysctl_ip_vs_secure_tcp = 2;
+ ipvs->sysctl_secure_tcp = 2;
} else {
if (old_secure_tcp >= 2)
to_change = 0;
@@ -229,7 +196,7 @@ static void update_defense_level(void)
} else {
if (old_secure_tcp >= 2)
to_change = 0;
- sysctl_ip_vs_secure_tcp = 1;
+ ipvs->sysctl_secure_tcp = 1;
}
break;
case 3:
@@ -237,10 +204,11 @@ static void update_defense_level(void)
to_change = 1;
break;
}
- old_secure_tcp = sysctl_ip_vs_secure_tcp;
+ old_secure_tcp = ipvs->sysctl_secure_tcp;
if (to_change >= 0)
- ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1);
- spin_unlock(&ip_vs_securetcp_lock);
+ ip_vs_protocol_timeout_change(ipvs,
+ ipvs->sysctl_secure_tcp > 1);
+ spin_unlock(&ipvs->securetcp_lock);
local_bh_enable();
}
@@ -250,17 +218,18 @@ static void update_defense_level(void)
* Timer for checking the defense
*/
#define DEFENSE_TIMER_PERIOD 1*HZ
-static void defense_work_handler(struct work_struct *work);
-static DECLARE_DELAYED_WORK(defense_work, defense_work_handler);
static void defense_work_handler(struct work_struct *work)
{
- update_defense_level();
- if (atomic_read(&ip_vs_dropentry))
- ip_vs_random_dropentry();
+ struct netns_ipvs *ipvs =
+ container_of(work, struct netns_ipvs, defense_work.work);
- schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
+ update_defense_level(ipvs);
+ if (atomic_read(&ipvs->dropentry))
+ ip_vs_random_dropentry(ipvs->net);
+ schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD);
}
+#endif
int
ip_vs_use_count_inc(void)
@@ -287,33 +256,13 @@ static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
/* the service table hashed by fwmark */
static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
-/*
- * Hash table: for real service lookups
- */
-#define IP_VS_RTAB_BITS 4
-#define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS)
-#define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1)
-
-static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE];
-
-/*
- * Trash for destinations
- */
-static LIST_HEAD(ip_vs_dest_trash);
-
-/*
- * FTP & NULL virtual service counters
- */
-static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0);
-static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0);
-
/*
* Returns hash value for virtual service
*/
-static __inline__ unsigned
-ip_vs_svc_hashkey(int af, unsigned proto, const union nf_inet_addr *addr,
- __be16 port)
+static inline unsigned
+ip_vs_svc_hashkey(struct net *net, int af, unsigned proto,
+ const union nf_inet_addr *addr, __be16 port)
{
register unsigned porth = ntohs(port);
__be32 addr_fold = addr->ip;
@@ -323,6 +272,7 @@ ip_vs_svc_hashkey(int af, unsigned proto, const union nf_inet_addr *addr,
addr_fold = addr->ip6[0]^addr->ip6[1]^
addr->ip6[2]^addr->ip6[3];
#endif
+ addr_fold ^= ((size_t)net>>8);
return (proto^ntohl(addr_fold)^(porth>>IP_VS_SVC_TAB_BITS)^porth)
& IP_VS_SVC_TAB_MASK;
@@ -331,13 +281,13 @@ ip_vs_svc_hashkey(int af, unsigned proto, const union nf_inet_addr *addr,
/*
* Returns hash value of fwmark for virtual service lookup
*/
-static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark)
+static inline unsigned ip_vs_svc_fwm_hashkey(struct net *net, __u32 fwmark)
{
- return fwmark & IP_VS_SVC_TAB_MASK;
+ return (((size_t)net>>8) ^ fwmark) & IP_VS_SVC_TAB_MASK;
}
/*
- * Hashes a service in the ip_vs_svc_table by <proto,addr,port>
+ * Hashes a service in the ip_vs_svc_table by <netns,proto,addr,port>
* or in the ip_vs_svc_fwm_table by fwmark.
* Should be called with locked tables.
*/
@@ -353,16 +303,16 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc)
if (svc->fwmark == 0) {
/*
- * Hash it by <protocol,addr,port> in ip_vs_svc_table
+ * Hash it by <netns,protocol,addr,port> in ip_vs_svc_table
*/
- hash = ip_vs_svc_hashkey(svc->af, svc->protocol, &svc->addr,
- svc->port);
+ hash = ip_vs_svc_hashkey(svc->net, svc->af, svc->protocol,
+ &svc->addr, svc->port);
list_add(&svc->s_list, &ip_vs_svc_table[hash]);
} else {
/*
- * Hash it by fwmark in ip_vs_svc_fwm_table
+ * Hash it by fwmark in svc_fwm_table
*/
- hash = ip_vs_svc_fwm_hashkey(svc->fwmark);
+ hash = ip_vs_svc_fwm_hashkey(svc->net, svc->fwmark);
list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
}
@@ -374,7 +324,7 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc)
/*
- * Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table.
+ * Unhashes a service from svc_table / svc_fwm_table.
* Should be called with locked tables.
*/
static int ip_vs_svc_unhash(struct ip_vs_service *svc)
@@ -386,10 +336,10 @@ static int ip_vs_svc_unhash(struct ip_vs_service *svc)
}
if (svc->fwmark == 0) {
- /* Remove it from the ip_vs_svc_table table */
+ /* Remove it from the svc_table table */
list_del(&svc->s_list);
} else {
- /* Remove it from the ip_vs_svc_fwm_table table */
+ /* Remove it from the svc_fwm_table table */
list_del(&svc->f_list);
}
@@ -400,23 +350,24 @@ static int ip_vs_svc_unhash(struct ip_vs_service *svc)
/*
- * Get service by {proto,addr,port} in the service table.
+ * Get service by {netns, proto,addr,port} in the service table.
*/
static inline struct ip_vs_service *
-__ip_vs_service_find(int af, __u16 protocol, const union nf_inet_addr *vaddr,
- __be16 vport)
+__ip_vs_service_find(struct net *net, int af, __u16 protocol,
+ const union nf_inet_addr *vaddr, __be16 vport)
{
unsigned hash;
struct ip_vs_service *svc;
/* Check for "full" addressed entries */
- hash = ip_vs_svc_hashkey(af, protocol, vaddr, vport);
+ hash = ip_vs_svc_hashkey(net, af, protocol, vaddr, vport);
list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){
if ((svc->af == af)
&& ip_vs_addr_equal(af, &svc->addr, vaddr)
&& (svc->port == vport)
- && (svc->protocol == protocol)) {
+ && (svc->protocol == protocol)
+ && net_eq(svc->net, net)) {
/* HIT */
return svc;
}
@@ -430,16 +381,17 @@ __ip_vs_service_find(int af, __u16 protocol, const union nf_inet_addr *vaddr,
* Get service by {fwmark} in the service table.
*/
static inline struct ip_vs_service *
-__ip_vs_svc_fwm_find(int af, __u32 fwmark)
+__ip_vs_svc_fwm_find(struct net *net, int af, __u32 fwmark)
{
unsigned hash;
struct ip_vs_service *svc;
/* Check for fwmark addressed entries */
- hash = ip_vs_svc_fwm_hashkey(fwmark);
+ hash = ip_vs_svc_fwm_hashkey(net, fwmark);
list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
- if (svc->fwmark == fwmark && svc->af == af) {
+ if (svc->fwmark == fwmark && svc->af == af
+ && net_eq(svc->net, net)) {
/* HIT */
return svc;
}
@@ -449,42 +401,46 @@ __ip_vs_svc_fwm_find(int af, __u32 fwmark)
}
struct ip_vs_service *
-ip_vs_service_get(int af, __u32 fwmark, __u16 protocol,
+ip_vs_service_get(struct net *net, int af, __u32 fwmark, __u16 protocol,
const union nf_inet_addr *vaddr, __be16 vport)
{
struct ip_vs_service *svc;
+ struct netns_ipvs *ipvs = net_ipvs(net);
read_lock(&__ip_vs_svc_lock);
/*
* Check the table hashed by fwmark first
*/
- if (fwmark && (svc = __ip_vs_svc_fwm_find(af, fwmark)))
- goto out;
+ if (fwmark) {
+ svc = __ip_vs_svc_fwm_find(net, af, fwmark);
+ if (svc)
+ goto out;
+ }
/*
* Check the table hashed by <protocol,addr,port>
* for "full" addressed entries
*/
- svc = __ip_vs_service_find(af, protocol, vaddr, vport);
+ svc = __ip_vs_service_find(net, af, protocol, vaddr, vport);
if (svc == NULL
&& protocol == IPPROTO_TCP
- && atomic_read(&ip_vs_ftpsvc_counter)
+ && atomic_read(&ipvs->ftpsvc_counter)
&& (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
/*
* Check if ftp service entry exists, the packet
* might belong to FTP data connections.
*/
- svc = __ip_vs_service_find(af, protocol, vaddr, FTPPORT);
+ svc = __ip_vs_service_find(net, af, protocol, vaddr, FTPPORT);
}
if (svc == NULL
- && atomic_read(&ip_vs_nullsvc_counter)) {
+ && atomic_read(&ipvs->nullsvc_counter)) {
/*
* Check if the catch-all port (port zero) exists
*/
- svc = __ip_vs_service_find(af, protocol, vaddr, 0);
+ svc = __ip_vs_service_find(net, af, protocol, vaddr, 0);
}
out:
@@ -519,6 +475,7 @@ __ip_vs_unbind_svc(struct ip_vs_dest *dest)
svc->fwmark,
IP_VS_DBG_ADDR(svc->af, &svc->addr),
ntohs(svc->port), atomic_read(&svc->usecnt));
+ free_percpu(svc->stats.cpustats);
kfree(svc);
}
}
@@ -545,10 +502,10 @@ static inline unsigned ip_vs_rs_hashkey(int af,
}
/*
- * Hashes ip_vs_dest in ip_vs_rtable by <proto,addr,port>.
+ * Hashes ip_vs_dest in rs_table by <proto,addr,port>.
* should be called with locked tables.
*/
-static int ip_vs_rs_hash(struct ip_vs_dest *dest)
+static int ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest)
{
unsigned hash;
@@ -562,19 +519,19 @@ static int ip_vs_rs_hash(struct ip_vs_dest *dest)
*/
hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port);
- list_add(&dest->d_list, &ip_vs_rtable[hash]);
+ list_add(&dest->d_list, &ipvs->rs_table[hash]);
return 1;
}
/*
- * UNhashes ip_vs_dest from ip_vs_rtable.
+ * UNhashes ip_vs_dest from rs_table.
* should be called with locked tables.
*/
static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
{
/*
- * Remove it from the ip_vs_rtable table.
+ * Remove it from the rs_table table.
*/
if (!list_empty(&dest->d_list)) {
list_del(&dest->d_list);
@@ -588,10 +545,11 @@ static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
* Lookup real service by <proto,addr,port> in the real service table.
*/
struct ip_vs_dest *
-ip_vs_lookup_real_service(int af, __u16 protocol,
+ip_vs_lookup_real_service(struct net *net, int af, __u16 protocol,
const union nf_inet_addr *daddr,
__be16 dport)
{
+ struct netns_ipvs *ipvs = net_ipvs(net);
unsigned hash;
struct ip_vs_dest *dest;
@@ -601,19 +559,19 @@ ip_vs_lookup_real_service(int af, __u16 protocol,
*/
hash = ip_vs_rs_hashkey(af, daddr, dport);
- read_lock(&__ip_vs_rs_lock);
- list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) {
+ read_lock(&ipvs->rs_lock);
+ list_for_each_entry(dest, &ipvs->rs_table[hash], d_list) {
if ((dest->af == af)
&& ip_vs_addr_equal(af, &dest->addr, daddr)
&& (dest->port == dport)
&& ((dest->protocol == protocol) ||
dest->vfwmark)) {
/* HIT */
- read_unlock(&__ip_vs_rs_lock);
+ read_unlock(&ipvs->rs_lock);
return dest;
}
}
- read_unlock(&__ip_vs_rs_lock);
+ read_unlock(&ipvs->rs_lock);
return NULL;
}
@@ -652,15 +610,16 @@ ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
* ip_vs_lookup_real_service() looked promissing, but
* seems not working as expected.
*/
-struct ip_vs_dest *ip_vs_find_dest(int af, const union nf_inet_addr *daddr,
+struct ip_vs_dest *ip_vs_find_dest(struct net *net, int af,
+ const union nf_inet_addr *daddr,
__be16 dport,
const union nf_inet_addr *vaddr,
- __be16 vport, __u16 protocol)
+ __be16 vport, __u16 protocol, __u32 fwmark)
{
struct ip_vs_dest *dest;
struct ip_vs_service *svc;
- svc = ip_vs_service_get(af, 0, protocol, vaddr, vport);
+ svc = ip_vs_service_get(net, af, fwmark, protocol, vaddr, vport);
if (!svc)
return NULL;
dest = ip_vs_lookup_dest(svc, daddr, dport);
@@ -685,11 +644,12 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
__be16 dport)
{
struct ip_vs_dest *dest, *nxt;
+ struct netns_ipvs *ipvs = net_ipvs(svc->net);
/*
* Find the destination in trash
*/
- list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
+ list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, n_list) {
IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, "
"dest->refcnt=%d\n",
dest->vfwmark,
@@ -720,6 +680,7 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
list_del(&dest->n_list);
ip_vs_dst_reset(dest);
__ip_vs_unbind_svc(dest);
+ free_percpu(dest->stats.cpustats);
kfree(dest);
}
}
@@ -737,25 +698,53 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
* are expired, and the refcnt of each destination in the trash must
* be 1, so we simply release them here.
*/
-static void ip_vs_trash_cleanup(void)
+static void ip_vs_trash_cleanup(struct net *net)
{
struct ip_vs_dest *dest, *nxt;
+ struct netns_ipvs *ipvs = net_ipvs(net);
- list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
+ list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, n_list) {
list_del(&dest->n_list);
ip_vs_dst_reset(dest);
__ip_vs_unbind_svc(dest);
+ free_percpu(dest->stats.cpustats);
kfree(dest);
}
}
+static void
+ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
+{
+#define IP_VS_SHOW_STATS_COUNTER(c) dst->c = src->ustats.c - src->ustats0.c
+
+ spin_lock_bh(&src->lock);
+
+ IP_VS_SHOW_STATS_COUNTER(conns);
+ IP_VS_SHOW_STATS_COUNTER(inpkts);
+ IP_VS_SHOW_STATS_COUNTER(outpkts);
+ IP_VS_SHOW_STATS_COUNTER(inbytes);
+ IP_VS_SHOW_STATS_COUNTER(outbytes);
+
+ ip_vs_read_estimator(dst, src);
+
+ spin_unlock_bh(&src->lock);
+}
static void
ip_vs_zero_stats(struct ip_vs_stats *stats)
{
spin_lock_bh(&stats->lock);
- memset(&stats->ustats, 0, sizeof(stats->ustats));
+ /* get current counters as zero point, rates are zeroed */
+
+#define IP_VS_ZERO_STATS_COUNTER(c) stats->ustats0.c = stats->ustats.c
+
+ IP_VS_ZERO_STATS_COUNTER(conns);
+ IP_VS_ZERO_STATS_COUNTER(inpkts);
+ IP_VS_ZERO_STATS_COUNTER(outpkts);
+ IP_VS_ZERO_STATS_COUNTER(inbytes);
+ IP_VS_ZERO_STATS_COUNTER(outbytes);
+
ip_vs_zero_estimator(stats);
spin_unlock_bh(&stats->lock);
@@ -768,6 +757,7 @@ static void
__ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
struct ip_vs_dest_user_kern *udest, int add)
{
+ struct netns_ipvs *ipvs = net_ipvs(svc->net);
int conn_flags;
/* set the weight and the flags */
@@ -780,12 +770,12 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
conn_flags |= IP_VS_CONN_F_NOOUTPUT;
} else {
/*
- * Put the real service in ip_vs_rtable if not present.
+ * Put the real service in rs_table if not present.
* For now only for NAT!
*/
- write_lock_bh(&__ip_vs_rs_lock);
- ip_vs_rs_hash(dest);
- write_unlock_bh(&__ip_vs_rs_lock);
+ write_lock_bh(&ipvs->rs_lock);
+ ip_vs_rs_hash(ipvs, dest);
+ write_unlock_bh(&ipvs->rs_lock);
}
atomic_set(&dest->conn_flags, conn_flags);
@@ -813,7 +803,7 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
spin_unlock_bh(&dest->dst_lock);
if (add)
- ip_vs_new_estimator(&dest->stats);
+ ip_vs_start_estimator(svc->net, &dest->stats);
write_lock_bh(&__ip_vs_svc_lock);
@@ -850,12 +840,12 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
atype = ipv6_addr_type(&udest->addr.in6);
if ((!(atype & IPV6_ADDR_UNICAST) ||
atype & IPV6_ADDR_LINKLOCAL) &&
- !__ip_vs_addr_is_local_v6(&udest->addr.in6))
+ !__ip_vs_addr_is_local_v6(svc->net, &udest->addr.in6))
return -EINVAL;
} else
#endif
{
- atype = inet_addr_type(&init_net, udest->addr.ip);
+ atype = inet_addr_type(svc->net, udest->addr.ip);
if (atype != RTN_LOCAL && atype != RTN_UNICAST)
return -EINVAL;
}
@@ -865,6 +855,11 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
pr_err("%s(): no memory.\n", __func__);
return -ENOMEM;
}
+ dest->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
+ if (!dest->stats.cpustats) {
+ pr_err("%s() alloc_percpu failed\n", __func__);
+ goto err_alloc;
+ }
dest->af = svc->af;
dest->protocol = svc->protocol;
@@ -888,6 +883,10 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
LeaveFunction(2);
return 0;
+
+err_alloc:
+ kfree(dest);
+ return -ENOMEM;
}
@@ -1006,16 +1005,18 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
/*
* Delete a destination (must be already unlinked from the service)
*/
-static void __ip_vs_del_dest(struct ip_vs_dest *dest)
+static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest)
{
- ip_vs_kill_estimator(&dest->stats);
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ ip_vs_stop_estimator(net, &dest->stats);
/*
* Remove it from the d-linked list with the real services.
*/
- write_lock_bh(&__ip_vs_rs_lock);
+ write_lock_bh(&ipvs->rs_lock);
ip_vs_rs_unhash(dest);
- write_unlock_bh(&__ip_vs_rs_lock);
+ write_unlock_bh(&ipvs->rs_lock);
/*
* Decrease the refcnt of the dest, and free the dest
@@ -1034,6 +1035,7 @@ static void __ip_vs_del_dest(struct ip_vs_dest *dest)
and only one user context can update virtual service at a
time, so the operation here is OK */
atomic_dec(&dest->svc->refcnt);
+ free_percpu(dest->stats.cpustats);
kfree(dest);
} else {
IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, "
@@ -1041,7 +1043,7 @@ static void __ip_vs_del_dest(struct ip_vs_dest *dest)
IP_VS_DBG_ADDR(dest->af, &dest->addr),
ntohs(dest->port),
atomic_read(&dest->refcnt));
- list_add(&dest->n_list, &ip_vs_dest_trash);
+ list_add(&dest->n_list, &ipvs->dest_trash);
atomic_inc(&dest->refcnt);
}
}
@@ -1105,7 +1107,7 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
/*
* Delete the destination
*/
- __ip_vs_del_dest(dest);
+ __ip_vs_del_dest(svc->net, dest);
LeaveFunction(2);
@@ -1117,13 +1119,14 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
* Add a service into the service hash table
*/
static int
-ip_vs_add_service(struct ip_vs_service_user_kern *u,
+ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
struct ip_vs_service **svc_p)
{
int ret = 0;
struct ip_vs_scheduler *sched = NULL;
struct ip_vs_pe *pe = NULL;
struct ip_vs_service *svc = NULL;
+ struct netns_ipvs *ipvs = net_ipvs(net);
/* increase the module use count */
ip_vs_use_count_inc();
@@ -1137,7 +1140,7 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
}
if (u->pe_name && *u->pe_name) {
- pe = ip_vs_pe_get(u->pe_name);
+ pe = ip_vs_pe_getbyname(u->pe_name);
if (pe == NULL) {
pr_info("persistence engine module ip_vs_pe_%s "
"not found\n", u->pe_name);
@@ -1159,6 +1162,11 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
ret = -ENOMEM;
goto out_err;
}
+ svc->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
+ if (!svc->stats.cpustats) {
+ pr_err("%s() alloc_percpu failed\n", __func__);
+ goto out_err;
+ }
/* I'm the first user of the service */
atomic_set(&svc->usecnt, 0);
@@ -1172,6 +1180,7 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
svc->flags = u->flags;
svc->timeout = u->timeout * HZ;
svc->netmask = u->netmask;
+ svc->net = net;
INIT_LIST_HEAD(&svc->destinations);
rwlock_init(&svc->sched_lock);
@@ -1189,15 +1198,15 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
/* Update the virtual service counters */
if (svc->port == FTPPORT)
- atomic_inc(&ip_vs_ftpsvc_counter);
+ atomic_inc(&ipvs->ftpsvc_counter);
else if (svc->port == 0)
- atomic_inc(&ip_vs_nullsvc_counter);
+ atomic_inc(&ipvs->nullsvc_counter);
- ip_vs_new_estimator(&svc->stats);
+ ip_vs_start_estimator(net, &svc->stats);
/* Count only IPv4 services for old get/setsockopt interface */
if (svc->af == AF_INET)
- ip_vs_num_services++;
+ ipvs->num_services++;
/* Hash the service into the service table */
write_lock_bh(&__ip_vs_svc_lock);
@@ -1207,6 +1216,7 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
*svc_p = svc;
return 0;
+
out_err:
if (svc != NULL) {
ip_vs_unbind_scheduler(svc);
@@ -1215,6 +1225,8 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
ip_vs_app_inc_put(svc->inc);
local_bh_enable();
}
+ if (svc->stats.cpustats)
+ free_percpu(svc->stats.cpustats);
kfree(svc);
}
ip_vs_scheduler_put(sched);
@@ -1248,7 +1260,7 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
old_sched = sched;
if (u->pe_name && *u->pe_name) {
- pe = ip_vs_pe_get(u->pe_name);
+ pe = ip_vs_pe_getbyname(u->pe_name);
if (pe == NULL) {
pr_info("persistence engine module ip_vs_pe_%s "
"not found\n", u->pe_name);
@@ -1334,14 +1346,15 @@ static void __ip_vs_del_service(struct ip_vs_service *svc)
struct ip_vs_dest *dest, *nxt;
struct ip_vs_scheduler *old_sched;
struct ip_vs_pe *old_pe;
+ struct netns_ipvs *ipvs = net_ipvs(svc->net);
pr_info("%s: enter\n", __func__);
/* Count only IPv4 services for old get/setsockopt interface */
if (svc->af == AF_INET)
- ip_vs_num_services--;
+ ipvs->num_services--;
- ip_vs_kill_estimator(&svc->stats);
+ ip_vs_stop_estimator(svc->net, &svc->stats);
/* Unbind scheduler */
old_sched = svc->scheduler;
@@ -1364,16 +1377,16 @@ static void __ip_vs_del_service(struct ip_vs_service *svc)
*/
list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
__ip_vs_unlink_dest(svc, dest, 0);
- __ip_vs_del_dest(dest);
+ __ip_vs_del_dest(svc->net, dest);
}
/*
* Update the virtual service counters
*/
if (svc->port == FTPPORT)
- atomic_dec(&ip_vs_ftpsvc_counter);
+ atomic_dec(&ipvs->ftpsvc_counter);
else if (svc->port == 0)
- atomic_dec(&ip_vs_nullsvc_counter);
+ atomic_dec(&ipvs->nullsvc_counter);
/*
* Free the service if nobody refers to it
@@ -1383,6 +1396,7 @@ static void __ip_vs_del_service(struct ip_vs_service *svc)
svc->fwmark,
IP_VS_DBG_ADDR(svc->af, &svc->addr),
ntohs(svc->port), atomic_read(&svc->usecnt));
+ free_percpu(svc->stats.cpustats);
kfree(svc);
}
@@ -1428,17 +1442,19 @@ static int ip_vs_del_service(struct ip_vs_service *svc)
/*
* Flush all the virtual services
*/
-static int ip_vs_flush(void)
+static int ip_vs_flush(struct net *net)
{
int idx;
struct ip_vs_service *svc, *nxt;
/*
- * Flush the service table hashed by <protocol,addr,port>
+ * Flush the service table hashed by <netns,protocol,addr,port>
*/
for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
- list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) {
- ip_vs_unlink_service(svc);
+ list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx],
+ s_list) {
+ if (net_eq(svc->net, net))
+ ip_vs_unlink_service(svc);
}
}
@@ -1448,7 +1464,8 @@ static int ip_vs_flush(void)
for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
list_for_each_entry_safe(svc, nxt,
&ip_vs_svc_fwm_table[idx], f_list) {
- ip_vs_unlink_service(svc);
+ if (net_eq(svc->net, net))
+ ip_vs_unlink_service(svc);
}
}
@@ -1472,32 +1489,35 @@ static int ip_vs_zero_service(struct ip_vs_service *svc)
return 0;
}
-static int ip_vs_zero_all(void)
+static int ip_vs_zero_all(struct net *net)
{
int idx;
struct ip_vs_service *svc;
for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
- ip_vs_zero_service(svc);
+ if (net_eq(svc->net, net))
+ ip_vs_zero_service(svc);
}
}
for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
- ip_vs_zero_service(svc);
+ if (net_eq(svc->net, net))
+ ip_vs_zero_service(svc);
}
}
- ip_vs_zero_stats(&ip_vs_stats);
+ ip_vs_zero_stats(&net_ipvs(net)->tot_stats);
return 0;
}
-
+#ifdef CONFIG_SYSCTL
static int
proc_do_defense_mode(ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
+ struct net *net = current->nsproxy->net_ns;
int *valp = table->data;
int val = *valp;
int rc;
@@ -1508,13 +1528,12 @@ proc_do_defense_mode(ctl_table *table, int write,
/* Restore the correct value */
*valp = val;
} else {
- update_defense_level();
+ update_defense_level(net_ipvs(net));
}
}
return rc;
}
-
static int
proc_do_sync_threshold(ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
@@ -1534,45 +1553,54 @@ proc_do_sync_threshold(ctl_table *table, int write,
return rc;
}
+static int
+proc_do_sync_mode(ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int *valp = table->data;
+ int val = *valp;
+ int rc;
+
+ rc = proc_dointvec(table, write, buffer, lenp, ppos);
+ if (write && (*valp != val)) {
+ if ((*valp < 0) || (*valp > 1)) {
+ /* Restore the correct value */
+ *valp = val;
+ } else {
+ struct net *net = current->nsproxy->net_ns;
+ ip_vs_sync_switch_mode(net, val);
+ }
+ }
+ return rc;
+}
/*
* IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
+ * Do not change order or insert new entries without
+ * align with netns init in __ip_vs_control_init()
*/
static struct ctl_table vs_vars[] = {
{
.procname = "amemthresh",
- .data = &sysctl_ip_vs_amemthresh,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
-#ifdef CONFIG_IP_VS_DEBUG
- {
- .procname = "debug_level",
- .data = &sysctl_ip_vs_debug_level,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
-#endif
{
.procname = "am_droprate",
- .data = &sysctl_ip_vs_am_droprate,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "drop_entry",
- .data = &sysctl_ip_vs_drop_entry,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_do_defense_mode,
},
{
.procname = "drop_packet",
- .data = &sysctl_ip_vs_drop_packet,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_do_defense_mode,
@@ -1580,7 +1608,6 @@ static struct ctl_table vs_vars[] = {
#ifdef CONFIG_IP_VS_NFCT
{
.procname = "conntrack",
- .data = &sysctl_ip_vs_conntrack,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = &proc_dointvec,
@@ -1588,18 +1615,62 @@ static struct ctl_table vs_vars[] = {
#endif
{
.procname = "secure_tcp",
- .data = &sysctl_ip_vs_secure_tcp,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_do_defense_mode,
},
{
.procname = "snat_reroute",
- .data = &sysctl_ip_vs_snat_reroute,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = &proc_dointvec,
},
+ {
+ .procname = "sync_version",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_do_sync_mode,
+ },
+ {
+ .procname = "cache_bypass",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "expire_nodest_conn",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "expire_quiescent_template",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "sync_threshold",
+ .maxlen =
+ sizeof(((struct netns_ipvs *)0)->sysctl_sync_threshold),
+ .mode = 0644,
+ .proc_handler = proc_do_sync_threshold,
+ },
+ {
+ .procname = "nat_icmp_send",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#ifdef CONFIG_IP_VS_DEBUG
+ {
+ .procname = "debug_level",
+ .data = &sysctl_ip_vs_debug_level,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif
#if 0
{
.procname = "timeout_established",
@@ -1686,41 +1757,6 @@ static struct ctl_table vs_vars[] = {
.proc_handler = proc_dointvec_jiffies,
},
#endif
- {
- .procname = "cache_bypass",
- .data = &sysctl_ip_vs_cache_bypass,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "expire_nodest_conn",
- .data = &sysctl_ip_vs_expire_nodest_conn,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "expire_quiescent_template",
- .data = &sysctl_ip_vs_expire_quiescent_template,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "sync_threshold",
- .data = &sysctl_ip_vs_sync_threshold,
- .maxlen = sizeof(sysctl_ip_vs_sync_threshold),
- .mode = 0644,
- .proc_handler = proc_do_sync_threshold,
- },
- {
- .procname = "nat_icmp_send",
- .data = &sysctl_ip_vs_nat_icmp_send,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
{ }
};
@@ -1731,12 +1767,12 @@ const struct ctl_path net_vs_ctl_path[] = {
{ }
};
EXPORT_SYMBOL_GPL(net_vs_ctl_path);
-
-static struct ctl_table_header * sysctl_header;
+#endif
#ifdef CONFIG_PROC_FS
struct ip_vs_iter {
+ struct seq_net_private p; /* Do not move this, netns depends upon it*/
struct list_head *table;
int bucket;
};
@@ -1763,6 +1799,7 @@ static inline const char *ip_vs_fwd_name(unsigned flags)
/* Get the Nth entry in the two lists */
static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
{
+ struct net *net = seq_file_net(seq);
struct ip_vs_iter *iter = seq->private;
int idx;
struct ip_vs_service *svc;
@@ -1770,7 +1807,7 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
/* look in hash by protocol */
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
- if (pos-- == 0){
+ if (net_eq(svc->net, net) && pos-- == 0) {
iter->table = ip_vs_svc_table;
iter->bucket = idx;
return svc;
@@ -1781,7 +1818,7 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
/* keep looking in fwmark */
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
- if (pos-- == 0) {
+ if (net_eq(svc->net, net) && pos-- == 0) {
iter->table = ip_vs_svc_fwm_table;
iter->bucket = idx;
return svc;
@@ -1935,7 +1972,7 @@ static const struct seq_operations ip_vs_info_seq_ops = {
static int ip_vs_info_open(struct inode *inode, struct file *file)
{
- return seq_open_private(file, &ip_vs_info_seq_ops,
+ return seq_open_net(inode, file, &ip_vs_info_seq_ops,
sizeof(struct ip_vs_iter));
}
@@ -1949,13 +1986,11 @@ static const struct file_operations ip_vs_info_fops = {
#endif
-struct ip_vs_stats ip_vs_stats = {
- .lock = __SPIN_LOCK_UNLOCKED(ip_vs_stats.lock),
-};
-
#ifdef CONFIG_PROC_FS
static int ip_vs_stats_show(struct seq_file *seq, void *v)
{
+ struct net *net = seq_file_single_net(seq);
+ struct ip_vs_stats_user show;
/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
seq_puts(seq,
@@ -1963,29 +1998,25 @@ static int ip_vs_stats_show(struct seq_file *seq, void *v)
seq_printf(seq,
" Conns Packets Packets Bytes Bytes\n");
- spin_lock_bh(&ip_vs_stats.lock);
- seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.ustats.conns,
- ip_vs_stats.ustats.inpkts, ip_vs_stats.ustats.outpkts,
- (unsigned long long) ip_vs_stats.ustats.inbytes,
- (unsigned long long) ip_vs_stats.ustats.outbytes);
+ ip_vs_copy_stats(&show, &net_ipvs(net)->tot_stats);
+ seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", show.conns,
+ show.inpkts, show.outpkts,
+ (unsigned long long) show.inbytes,
+ (unsigned long long) show.outbytes);
/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
seq_puts(seq,
" Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n");
- seq_printf(seq,"%8X %8X %8X %16X %16X\n",
- ip_vs_stats.ustats.cps,
- ip_vs_stats.ustats.inpps,
- ip_vs_stats.ustats.outpps,
- ip_vs_stats.ustats.inbps,
- ip_vs_stats.ustats.outbps);
- spin_unlock_bh(&ip_vs_stats.lock);
+ seq_printf(seq, "%8X %8X %8X %16X %16X\n",
+ show.cps, show.inpps, show.outpps,
+ show.inbps, show.outbps);
return 0;
}
static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
{
- return single_open(file, ip_vs_stats_show, NULL);
+ return single_open_net(inode, file, ip_vs_stats_show);
}
static const struct file_operations ip_vs_stats_fops = {
@@ -1996,13 +2027,85 @@ static const struct file_operations ip_vs_stats_fops = {
.release = single_release,
};
+static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v)
+{
+ struct net *net = seq_file_single_net(seq);
+ struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats;
+ struct ip_vs_cpu_stats *cpustats = tot_stats->cpustats;
+ struct ip_vs_stats_user rates;
+ int i;
+
+/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
+ seq_puts(seq,
+ " Total Incoming Outgoing Incoming Outgoing\n");
+ seq_printf(seq,
+ "CPU Conns Packets Packets Bytes Bytes\n");
+
+ for_each_possible_cpu(i) {
+ struct ip_vs_cpu_stats *u = per_cpu_ptr(cpustats, i);
+ unsigned int start;
+ __u64 inbytes, outbytes;
+
+ do {
+ start = u64_stats_fetch_begin_bh(&u->syncp);
+ inbytes = u->ustats.inbytes;
+ outbytes = u->ustats.outbytes;
+ } while (u64_stats_fetch_retry_bh(&u->syncp, start));
+
+ seq_printf(seq, "%3X %8X %8X %8X %16LX %16LX\n",
+ i, u->ustats.conns, u->ustats.inpkts,
+ u->ustats.outpkts, (__u64)inbytes,
+ (__u64)outbytes);
+ }
+
+ spin_lock_bh(&tot_stats->lock);
+
+ seq_printf(seq, " ~ %8X %8X %8X %16LX %16LX\n\n",
+ tot_stats->ustats.conns, tot_stats->ustats.inpkts,
+ tot_stats->ustats.outpkts,
+ (unsigned long long) tot_stats->ustats.inbytes,
+ (unsigned long long) tot_stats->ustats.outbytes);
+
+ ip_vs_read_estimator(&rates, tot_stats);
+
+ spin_unlock_bh(&tot_stats->lock);
+
+/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
+ seq_puts(seq,
+ " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n");
+ seq_printf(seq, " %8X %8X %8X %16X %16X\n",
+ rates.cps,
+ rates.inpps,
+ rates.outpps,
+ rates.inbps,
+ rates.outbps);
+
+ return 0;
+}
+
+static int ip_vs_stats_percpu_seq_open(struct inode *inode, struct file *file)
+{
+ return single_open_net(inode, file, ip_vs_stats_percpu_show);
+}
+
+static const struct file_operations ip_vs_stats_percpu_fops = {
+ .owner = THIS_MODULE,
+ .open = ip_vs_stats_percpu_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
#endif
/*
* Set timeout values for tcp tcpfin udp in the timeout_table.
*/
-static int ip_vs_set_timeout(struct ip_vs_timeout_user *u)
+static int ip_vs_set_timeout(struct net *net, struct ip_vs_timeout_user *u)
{
+#if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP)
+ struct ip_vs_proto_data *pd;
+#endif
+
IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
u->tcp_timeout,
u->tcp_fin_timeout,
@@ -2010,19 +2113,22 @@ static int ip_vs_set_timeout(struct ip_vs_timeout_user *u)
#ifdef CONFIG_IP_VS_PROTO_TCP
if (u->tcp_timeout) {
- ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED]
+ pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
+ pd->timeout_table[IP_VS_TCP_S_ESTABLISHED]
= u->tcp_timeout * HZ;
}
if (u->tcp_fin_timeout) {
- ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT]
+ pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
+ pd->timeout_table[IP_VS_TCP_S_FIN_WAIT]
= u->tcp_fin_timeout * HZ;
}
#endif
#ifdef CONFIG_IP_VS_PROTO_UDP
if (u->udp_timeout) {
- ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL]
+ pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
+ pd->timeout_table[IP_VS_UDP_S_NORMAL]
= u->udp_timeout * HZ;
}
#endif
@@ -2087,6 +2193,7 @@ static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest,
static int
do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
{
+ struct net *net = sock_net(sk);
int ret;
unsigned char arg[MAX_ARG_LEN];
struct ip_vs_service_user *usvc_compat;
@@ -2121,19 +2228,20 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
if (cmd == IP_VS_SO_SET_FLUSH) {
/* Flush the virtual service */
- ret = ip_vs_flush();
+ ret = ip_vs_flush(net);
goto out_unlock;
} else if (cmd == IP_VS_SO_SET_TIMEOUT) {
/* Set timeout values for (tcp tcpfin udp) */
- ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg);
+ ret = ip_vs_set_timeout(net, (struct ip_vs_timeout_user *)arg);
goto out_unlock;
} else if (cmd == IP_VS_SO_SET_STARTDAEMON) {
struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
- ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid);
+ ret = start_sync_thread(net, dm->state, dm->mcast_ifn,
+ dm->syncid);
goto out_unlock;
} else if (cmd == IP_VS_SO_SET_STOPDAEMON) {
struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
- ret = stop_sync_thread(dm->state);
+ ret = stop_sync_thread(net, dm->state);
goto out_unlock;
}
@@ -2148,7 +2256,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
if (cmd == IP_VS_SO_SET_ZERO) {
/* if no service address is set, zero counters in all */
if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) {
- ret = ip_vs_zero_all();
+ ret = ip_vs_zero_all(net);
goto out_unlock;
}
}
@@ -2165,10 +2273,10 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
/* Lookup the exact service by <protocol, addr, port> or fwmark */
if (usvc.fwmark == 0)
- svc = __ip_vs_service_find(usvc.af, usvc.protocol,
+ svc = __ip_vs_service_find(net, usvc.af, usvc.protocol,
&usvc.addr, usvc.port);
else
- svc = __ip_vs_svc_fwm_find(usvc.af, usvc.fwmark);
+ svc = __ip_vs_svc_fwm_find(net, usvc.af, usvc.fwmark);
if (cmd != IP_VS_SO_SET_ADD
&& (svc == NULL || svc->protocol != usvc.protocol)) {
@@ -2181,7 +2289,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
if (svc != NULL)
ret = -EEXIST;
else
- ret = ip_vs_add_service(&usvc, &svc);
+ ret = ip_vs_add_service(net, &usvc, &svc);
break;
case IP_VS_SO_SET_EDIT:
ret = ip_vs_edit_service(svc, &usvc);
@@ -2218,14 +2326,6 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
static void
-ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
-{
- spin_lock_bh(&src->lock);
- memcpy(dst, &src->ustats, sizeof(*dst));
- spin_unlock_bh(&src->lock);
-}
-
-static void
ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
{
dst->protocol = src->protocol;
@@ -2241,7 +2341,8 @@ ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
}
static inline int
-__ip_vs_get_service_entries(const struct ip_vs_get_services *get,
+__ip_vs_get_service_entries(struct net *net,
+ const struct ip_vs_get_services *get,
struct ip_vs_get_services __user *uptr)
{
int idx, count=0;
@@ -2252,7 +2353,7 @@ __ip_vs_get_service_entries(const struct ip_vs_get_services *get,
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
/* Only expose IPv4 entries to old interface */
- if (svc->af != AF_INET)
+ if (svc->af != AF_INET || !net_eq(svc->net, net))
continue;
if (count >= get->num_services)
@@ -2271,7 +2372,7 @@ __ip_vs_get_service_entries(const struct ip_vs_get_services *get,
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
/* Only expose IPv4 entries to old interface */
- if (svc->af != AF_INET)
+ if (svc->af != AF_INET || !net_eq(svc->net, net))
continue;
if (count >= get->num_services)
@@ -2291,7 +2392,7 @@ __ip_vs_get_service_entries(const struct ip_vs_get_services *get,
}
static inline int
-__ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
+__ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get,
struct ip_vs_get_dests __user *uptr)
{
struct ip_vs_service *svc;
@@ -2299,9 +2400,9 @@ __ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
int ret = 0;
if (get->fwmark)
- svc = __ip_vs_svc_fwm_find(AF_INET, get->fwmark);
+ svc = __ip_vs_svc_fwm_find(net, AF_INET, get->fwmark);
else
- svc = __ip_vs_service_find(AF_INET, get->protocol, &addr,
+ svc = __ip_vs_service_find(net, AF_INET, get->protocol, &addr,
get->port);
if (svc) {
@@ -2336,17 +2437,21 @@ __ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
}
static inline void
-__ip_vs_get_timeouts(struct ip_vs_timeout_user *u)
+__ip_vs_get_timeouts(struct net *net, struct ip_vs_timeout_user *u)
{
+#if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP)
+ struct ip_vs_proto_data *pd;
+#endif
+
#ifdef CONFIG_IP_VS_PROTO_TCP
- u->tcp_timeout =
- ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
- u->tcp_fin_timeout =
- ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
+ pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
+ u->tcp_timeout = pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
+ u->tcp_fin_timeout = pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
#endif
#ifdef CONFIG_IP_VS_PROTO_UDP
+ pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
u->udp_timeout =
- ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
+ pd->timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
#endif
}
@@ -2375,7 +2480,10 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
unsigned char arg[128];
int ret = 0;
unsigned int copylen;
+ struct net *net = sock_net(sk);
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ BUG_ON(!net);
if (!capable(CAP_NET_ADMIN))
return -EPERM;
@@ -2418,7 +2526,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
struct ip_vs_getinfo info;
info.version = IP_VS_VERSION_CODE;
info.size = ip_vs_conn_tab_size;
- info.num_services = ip_vs_num_services;
+ info.num_services = ipvs->num_services;
if (copy_to_user(user, &info, sizeof(info)) != 0)
ret = -EFAULT;
}
@@ -2437,7 +2545,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
ret = -EINVAL;
goto out;
}
- ret = __ip_vs_get_service_entries(get, user);
+ ret = __ip_vs_get_service_entries(net, get, user);
}
break;
@@ -2450,10 +2558,11 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
entry = (struct ip_vs_service_entry *)arg;
addr.ip = entry->addr;
if (entry->fwmark)
- svc = __ip_vs_svc_fwm_find(AF_INET, entry->fwmark);
+ svc = __ip_vs_svc_fwm_find(net, AF_INET, entry->fwmark);
else
- svc = __ip_vs_service_find(AF_INET, entry->protocol,
- &addr, entry->port);
+ svc = __ip_vs_service_find(net, AF_INET,
+ entry->protocol, &addr,
+ entry->port);
if (svc) {
ip_vs_copy_service(entry, svc);
if (copy_to_user(user, entry, sizeof(*entry)) != 0)
@@ -2476,7 +2585,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
ret = -EINVAL;
goto out;
}
- ret = __ip_vs_get_dest_entries(get, user);
+ ret = __ip_vs_get_dest_entries(net, get, user);
}
break;
@@ -2484,7 +2593,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
{
struct ip_vs_timeout_user t;
- __ip_vs_get_timeouts(&t);
+ __ip_vs_get_timeouts(net, &t);
if (copy_to_user(user, &t, sizeof(t)) != 0)
ret = -EFAULT;
}
@@ -2495,15 +2604,17 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
struct ip_vs_daemon_user d[2];
memset(&d, 0, sizeof(d));
- if (ip_vs_sync_state & IP_VS_STATE_MASTER) {
+ if (ipvs->sync_state & IP_VS_STATE_MASTER) {
d[0].state = IP_VS_STATE_MASTER;
- strlcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn, sizeof(d[0].mcast_ifn));
- d[0].syncid = ip_vs_master_syncid;
+ strlcpy(d[0].mcast_ifn, ipvs->master_mcast_ifn,
+ sizeof(d[0].mcast_ifn));
+ d[0].syncid = ipvs->master_syncid;
}
- if (ip_vs_sync_state & IP_VS_STATE_BACKUP) {
+ if (ipvs->sync_state & IP_VS_STATE_BACKUP) {
d[1].state = IP_VS_STATE_BACKUP;
- strlcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn, sizeof(d[1].mcast_ifn));
- d[1].syncid = ip_vs_backup_syncid;
+ strlcpy(d[1].mcast_ifn, ipvs->backup_mcast_ifn,
+ sizeof(d[1].mcast_ifn));
+ d[1].syncid = ipvs->backup_syncid;
}
if (copy_to_user(user, &d, sizeof(d)) != 0)
ret = -EFAULT;
@@ -2542,6 +2653,7 @@ static struct genl_family ip_vs_genl_family = {
.name = IPVS_GENL_NAME,
.version = IPVS_GENL_VERSION,
.maxattr = IPVS_CMD_MAX,
+ .netnsok = true, /* Make ipvsadm to work on netns */
};
/* Policy used for first-level command attributes */
@@ -2599,31 +2711,29 @@ static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = {
static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type,
struct ip_vs_stats *stats)
{
+ struct ip_vs_stats_user ustats;
struct nlattr *nl_stats = nla_nest_start(skb, container_type);
if (!nl_stats)
return -EMSGSIZE;
- spin_lock_bh(&stats->lock);
-
- NLA_PUT_U32(skb, IPVS_STATS_ATTR_CONNS, stats->ustats.conns);
- NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPKTS, stats->ustats.inpkts);
- NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPKTS, stats->ustats.outpkts);
- NLA_PUT_U64(skb, IPVS_STATS_ATTR_INBYTES, stats->ustats.inbytes);
- NLA_PUT_U64(skb, IPVS_STATS_ATTR_OUTBYTES, stats->ustats.outbytes);
- NLA_PUT_U32(skb, IPVS_STATS_ATTR_CPS, stats->ustats.cps);
- NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPPS, stats->ustats.inpps);
- NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPPS, stats->ustats.outpps);
- NLA_PUT_U32(skb, IPVS_STATS_ATTR_INBPS, stats->ustats.inbps);
- NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTBPS, stats->ustats.outbps);
+ ip_vs_copy_stats(&ustats, stats);
- spin_unlock_bh(&stats->lock);
+ NLA_PUT_U32(skb, IPVS_STATS_ATTR_CONNS, ustats.conns);
+ NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPKTS, ustats.inpkts);
+ NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPKTS, ustats.outpkts);
+ NLA_PUT_U64(skb, IPVS_STATS_ATTR_INBYTES, ustats.inbytes);
+ NLA_PUT_U64(skb, IPVS_STATS_ATTR_OUTBYTES, ustats.outbytes);
+ NLA_PUT_U32(skb, IPVS_STATS_ATTR_CPS, ustats.cps);
+ NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPPS, ustats.inpps);
+ NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPPS, ustats.outpps);
+ NLA_PUT_U32(skb, IPVS_STATS_ATTR_INBPS, ustats.inbps);
+ NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTBPS, ustats.outbps);
nla_nest_end(skb, nl_stats);
return 0;
nla_put_failure:
- spin_unlock_bh(&stats->lock);
nla_nest_cancel(skb, nl_stats);
return -EMSGSIZE;
}
@@ -2696,11 +2806,12 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb,
int idx = 0, i;
int start = cb->args[0];
struct ip_vs_service *svc;
+ struct net *net = skb_sknet(skb);
mutex_lock(&__ip_vs_mutex);
for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
list_for_each_entry(svc, &ip_vs_svc_table[i], s_list) {
- if (++idx <= start)
+ if (++idx <= start || !net_eq(svc->net, net))
continue;
if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
idx--;
@@ -2711,7 +2822,7 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb,
for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
list_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) {
- if (++idx <= start)
+ if (++idx <= start || !net_eq(svc->net, net))
continue;
if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
idx--;
@@ -2727,7 +2838,8 @@ nla_put_failure:
return skb->len;
}
-static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc,
+static int ip_vs_genl_parse_service(struct net *net,
+ struct ip_vs_service_user_kern *usvc,
struct nlattr *nla, int full_entry,
struct ip_vs_service **ret_svc)
{
@@ -2770,9 +2882,9 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc,
}
if (usvc->fwmark)
- svc = __ip_vs_svc_fwm_find(usvc->af, usvc->fwmark);
+ svc = __ip_vs_svc_fwm_find(net, usvc->af, usvc->fwmark);
else
- svc = __ip_vs_service_find(usvc->af, usvc->protocol,
+ svc = __ip_vs_service_find(net, usvc->af, usvc->protocol,
&usvc->addr, usvc->port);
*ret_svc = svc;
@@ -2809,13 +2921,14 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc,
return 0;
}
-static struct ip_vs_service *ip_vs_genl_find_service(struct nlattr *nla)
+static struct ip_vs_service *ip_vs_genl_find_service(struct net *net,
+ struct nlattr *nla)
{
struct ip_vs_service_user_kern usvc;
struct ip_vs_service *svc;
int ret;
- ret = ip_vs_genl_parse_service(&usvc, nla, 0, &svc);
+ ret = ip_vs_genl_parse_service(net, &usvc, nla, 0, &svc);
return ret ? ERR_PTR(ret) : svc;
}
@@ -2883,6 +2996,7 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb,
struct ip_vs_service *svc;
struct ip_vs_dest *dest;
struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1];
+ struct net *net = skb_sknet(skb);
mutex_lock(&__ip_vs_mutex);
@@ -2891,7 +3005,8 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb,
IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy))
goto out_err;
- svc = ip_vs_genl_find_service(attrs[IPVS_CMD_ATTR_SERVICE]);
+
+ svc = ip_vs_genl_find_service(net, attrs[IPVS_CMD_ATTR_SERVICE]);
if (IS_ERR(svc) || svc == NULL)
goto out_err;
@@ -3005,20 +3120,23 @@ nla_put_failure:
static int ip_vs_genl_dump_daemons(struct sk_buff *skb,
struct netlink_callback *cb)
{
+ struct net *net = skb_net(skb);
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
mutex_lock(&__ip_vs_mutex);
- if ((ip_vs_sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) {
+ if ((ipvs->sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) {
if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER,
- ip_vs_master_mcast_ifn,
- ip_vs_master_syncid, cb) < 0)
+ ipvs->master_mcast_ifn,
+ ipvs->master_syncid, cb) < 0)
goto nla_put_failure;
cb->args[0] = 1;
}
- if ((ip_vs_sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) {
+ if ((ipvs->sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) {
if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP,
- ip_vs_backup_mcast_ifn,
- ip_vs_backup_syncid, cb) < 0)
+ ipvs->backup_mcast_ifn,
+ ipvs->backup_syncid, cb) < 0)
goto nla_put_failure;
cb->args[1] = 1;
@@ -3030,31 +3148,33 @@ nla_put_failure:
return skb->len;
}
-static int ip_vs_genl_new_daemon(struct nlattr **attrs)
+static int ip_vs_genl_new_daemon(struct net *net, struct nlattr **attrs)
{
if (!(attrs[IPVS_DAEMON_ATTR_STATE] &&
attrs[IPVS_DAEMON_ATTR_MCAST_IFN] &&
attrs[IPVS_DAEMON_ATTR_SYNC_ID]))
return -EINVAL;
- return start_sync_thread(nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]),
+ return start_sync_thread(net,
+ nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]),
nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]),
nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]));
}
-static int ip_vs_genl_del_daemon(struct nlattr **attrs)
+static int ip_vs_genl_del_daemon(struct net *net, struct nlattr **attrs)
{
if (!attrs[IPVS_DAEMON_ATTR_STATE])
return -EINVAL;
- return stop_sync_thread(nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
+ return stop_sync_thread(net,
+ nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
}
-static int ip_vs_genl_set_config(struct nlattr **attrs)
+static int ip_vs_genl_set_config(struct net *net, struct nlattr **attrs)
{
struct ip_vs_timeout_user t;
- __ip_vs_get_timeouts(&t);
+ __ip_vs_get_timeouts(net, &t);
if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP])
t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]);
@@ -3066,7 +3186,7 @@ static int ip_vs_genl_set_config(struct nlattr **attrs)
if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP])
t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]);
- return ip_vs_set_timeout(&t);
+ return ip_vs_set_timeout(net, &t);
}
static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
@@ -3076,16 +3196,20 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
struct ip_vs_dest_user_kern udest;
int ret = 0, cmd;
int need_full_svc = 0, need_full_dest = 0;
+ struct net *net;
+ struct netns_ipvs *ipvs;
+ net = skb_sknet(skb);
+ ipvs = net_ipvs(net);
cmd = info->genlhdr->cmd;
mutex_lock(&__ip_vs_mutex);
if (cmd == IPVS_CMD_FLUSH) {
- ret = ip_vs_flush();
+ ret = ip_vs_flush(net);
goto out;
} else if (cmd == IPVS_CMD_SET_CONFIG) {
- ret = ip_vs_genl_set_config(info->attrs);
+ ret = ip_vs_genl_set_config(net, info->attrs);
goto out;
} else if (cmd == IPVS_CMD_NEW_DAEMON ||
cmd == IPVS_CMD_DEL_DAEMON) {
@@ -3101,13 +3225,13 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
}
if (cmd == IPVS_CMD_NEW_DAEMON)
- ret = ip_vs_genl_new_daemon(daemon_attrs);
+ ret = ip_vs_genl_new_daemon(net, daemon_attrs);
else
- ret = ip_vs_genl_del_daemon(daemon_attrs);
+ ret = ip_vs_genl_del_daemon(net, daemon_attrs);
goto out;
} else if (cmd == IPVS_CMD_ZERO &&
!info->attrs[IPVS_CMD_ATTR_SERVICE]) {
- ret = ip_vs_zero_all();
+ ret = ip_vs_zero_all(net);
goto out;
}
@@ -3117,7 +3241,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE)
need_full_svc = 1;
- ret = ip_vs_genl_parse_service(&usvc,
+ ret = ip_vs_genl_parse_service(net, &usvc,
info->attrs[IPVS_CMD_ATTR_SERVICE],
need_full_svc, &svc);
if (ret)
@@ -3147,7 +3271,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
switch (cmd) {
case IPVS_CMD_NEW_SERVICE:
if (svc == NULL)
- ret = ip_vs_add_service(&usvc, &svc);
+ ret = ip_vs_add_service(net, &usvc, &svc);
else
ret = -EEXIST;
break;
@@ -3185,7 +3309,11 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
struct sk_buff *msg;
void *reply;
int ret, cmd, reply_cmd;
+ struct net *net;
+ struct netns_ipvs *ipvs;
+ net = skb_sknet(skb);
+ ipvs = net_ipvs(net);
cmd = info->genlhdr->cmd;
if (cmd == IPVS_CMD_GET_SERVICE)
@@ -3214,7 +3342,8 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
{
struct ip_vs_service *svc;
- svc = ip_vs_genl_find_service(info->attrs[IPVS_CMD_ATTR_SERVICE]);
+ svc = ip_vs_genl_find_service(net,
+ info->attrs[IPVS_CMD_ATTR_SERVICE]);
if (IS_ERR(svc)) {
ret = PTR_ERR(svc);
goto out_err;
@@ -3234,7 +3363,7 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
{
struct ip_vs_timeout_user t;
- __ip_vs_get_timeouts(&t);
+ __ip_vs_get_timeouts(net, &t);
#ifdef CONFIG_IP_VS_PROTO_TCP
NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP, t.tcp_timeout);
NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN,
@@ -3380,62 +3509,189 @@ static void ip_vs_genl_unregister(void)
/* End of Generic Netlink interface definitions */
+/*
+ * per netns intit/exit func.
+ */
+#ifdef CONFIG_SYSCTL
+int __net_init __ip_vs_control_init_sysctl(struct net *net)
+{
+ int idx;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ctl_table *tbl;
+
+ atomic_set(&ipvs->dropentry, 0);
+ spin_lock_init(&ipvs->dropentry_lock);
+ spin_lock_init(&ipvs->droppacket_lock);
+ spin_lock_init(&ipvs->securetcp_lock);
+
+ if (!net_eq(net, &init_net)) {
+ tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL);
+ if (tbl == NULL)
+ return -ENOMEM;
+ } else
+ tbl = vs_vars;
+ /* Initialize sysctl defaults */
+ idx = 0;
+ ipvs->sysctl_amemthresh = 1024;
+ tbl[idx++].data = &ipvs->sysctl_amemthresh;
+ ipvs->sysctl_am_droprate = 10;
+ tbl[idx++].data = &ipvs->sysctl_am_droprate;
+ tbl[idx++].data = &ipvs->sysctl_drop_entry;
+ tbl[idx++].data = &ipvs->sysctl_drop_packet;
+#ifdef CONFIG_IP_VS_NFCT
+ tbl[idx++].data = &ipvs->sysctl_conntrack;
+#endif
+ tbl[idx++].data = &ipvs->sysctl_secure_tcp;
+ ipvs->sysctl_snat_reroute = 1;
+ tbl[idx++].data = &ipvs->sysctl_snat_reroute;
+ ipvs->sysctl_sync_ver = 1;
+ tbl[idx++].data = &ipvs->sysctl_sync_ver;
+ tbl[idx++].data = &ipvs->sysctl_cache_bypass;
+ tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn;
+ tbl[idx++].data = &ipvs->sysctl_expire_quiescent_template;
+ ipvs->sysctl_sync_threshold[0] = DEFAULT_SYNC_THRESHOLD;
+ ipvs->sysctl_sync_threshold[1] = DEFAULT_SYNC_PERIOD;
+ tbl[idx].data = &ipvs->sysctl_sync_threshold;
+ tbl[idx++].maxlen = sizeof(ipvs->sysctl_sync_threshold);
+ tbl[idx++].data = &ipvs->sysctl_nat_icmp_send;
+
+
+ ipvs->sysctl_hdr = register_net_sysctl_table(net, net_vs_ctl_path,
+ tbl);
+ if (ipvs->sysctl_hdr == NULL) {
+ if (!net_eq(net, &init_net))
+ kfree(tbl);
+ return -ENOMEM;
+ }
+ ip_vs_start_estimator(net, &ipvs->tot_stats);
+ ipvs->sysctl_tbl = tbl;
+ /* Schedule defense work */
+ INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler);
+ schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD);
+
+ return 0;
+}
+
+void __net_init __ip_vs_control_cleanup_sysctl(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ cancel_delayed_work_sync(&ipvs->defense_work);
+ cancel_work_sync(&ipvs->defense_work.work);
+ unregister_net_sysctl_table(ipvs->sysctl_hdr);
+}
+
+#else
+
+int __net_init __ip_vs_control_init_sysctl(struct net *net) { return 0; }
+void __net_init __ip_vs_control_cleanup_sysctl(struct net *net) { }
+
+#endif
+
+int __net_init __ip_vs_control_init(struct net *net)
+{
+ int idx;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ ipvs->rs_lock = __RW_LOCK_UNLOCKED(ipvs->rs_lock);
+
+ /* Initialize rs_table */
+ for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++)
+ INIT_LIST_HEAD(&ipvs->rs_table[idx]);
+
+ INIT_LIST_HEAD(&ipvs->dest_trash);
+ atomic_set(&ipvs->ftpsvc_counter, 0);
+ atomic_set(&ipvs->nullsvc_counter, 0);
+
+ /* procfs stats */
+ ipvs->tot_stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
+ if (ipvs->tot_stats.cpustats) {
+ pr_err("%s(): alloc_percpu.\n", __func__);
+ return -ENOMEM;
+ }
+ spin_lock_init(&ipvs->tot_stats.lock);
+
+ proc_net_fops_create(net, "ip_vs", 0, &ip_vs_info_fops);
+ proc_net_fops_create(net, "ip_vs_stats", 0, &ip_vs_stats_fops);
+ proc_net_fops_create(net, "ip_vs_stats_percpu", 0,
+ &ip_vs_stats_percpu_fops);
+
+ if (__ip_vs_control_init_sysctl(net))
+ goto err;
+
+ return 0;
+
+err:
+ free_percpu(ipvs->tot_stats.cpustats);
+ return -ENOMEM;
+}
+
+static void __net_exit __ip_vs_control_cleanup(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ ip_vs_trash_cleanup(net);
+ ip_vs_stop_estimator(net, &ipvs->tot_stats);
+ __ip_vs_control_cleanup_sysctl(net);
+ proc_net_remove(net, "ip_vs_stats_percpu");
+ proc_net_remove(net, "ip_vs_stats");
+ proc_net_remove(net, "ip_vs");
+ free_percpu(ipvs->tot_stats.cpustats);
+}
+
+static struct pernet_operations ipvs_control_ops = {
+ .init = __ip_vs_control_init,
+ .exit = __ip_vs_control_cleanup,
+};
int __init ip_vs_control_init(void)
{
- int ret;
int idx;
+ int ret;
EnterFunction(2);
- /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */
+ /* Initialize svc_table, ip_vs_svc_fwm_table, rs_table */
for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
}
- for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++) {
- INIT_LIST_HEAD(&ip_vs_rtable[idx]);
+
+ ret = register_pernet_subsys(&ipvs_control_ops);
+ if (ret) {
+ pr_err("cannot register namespace.\n");
+ goto err;
}
- smp_wmb();
+
+ smp_wmb(); /* Do we really need it now ? */
ret = nf_register_sockopt(&ip_vs_sockopts);
if (ret) {
pr_err("cannot register sockopt.\n");
- return ret;
+ goto err_net;
}
ret = ip_vs_genl_register();
if (ret) {
pr_err("cannot register Generic Netlink interface.\n");
nf_unregister_sockopt(&ip_vs_sockopts);
- return ret;
+ goto err_net;
}
- proc_net_fops_create(&init_net, "ip_vs", 0, &ip_vs_info_fops);
- proc_net_fops_create(&init_net, "ip_vs_stats",0, &ip_vs_stats_fops);
-
- sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars);
-
- ip_vs_new_estimator(&ip_vs_stats);
-
- /* Hook the defense timer */
- schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
-
LeaveFunction(2);
return 0;
+
+err_net:
+ unregister_pernet_subsys(&ipvs_control_ops);
+err:
+ return ret;
}
void ip_vs_control_cleanup(void)
{
EnterFunction(2);
- ip_vs_trash_cleanup();
- cancel_delayed_work_sync(&defense_work);
- cancel_work_sync(&defense_work.work);
- ip_vs_kill_estimator(&ip_vs_stats);
- unregister_sysctl_table(sysctl_header);
- proc_net_remove(&init_net, "ip_vs_stats");
- proc_net_remove(&init_net, "ip_vs");
+ unregister_pernet_subsys(&ipvs_control_ops);
ip_vs_genl_unregister();
nf_unregister_sockopt(&ip_vs_sockopts);
LeaveFunction(2);
diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c
index ff28801..8c8766c 100644
--- a/net/netfilter/ipvs/ip_vs_est.c
+++ b/net/netfilter/ipvs/ip_vs_est.c
@@ -8,8 +8,12 @@
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
- * Changes:
- *
+ * Changes: Hans Schillstrom <hans.schillstrom@ericsson.com>
+ * Network name space (netns) aware.
+ * Global data moved to netns i.e struct netns_ipvs
+ * Affected data: est_list and est_lock.
+ * estimation_timer() runs with timer per netns.
+ * get_stats()) do the per cpu summing.
*/
#define KMSG_COMPONENT "IPVS"
@@ -48,11 +52,42 @@
*/
-static void estimation_timer(unsigned long arg);
+/*
+ * Make a summary from each cpu
+ */
+static void ip_vs_read_cpu_stats(struct ip_vs_stats_user *sum,
+ struct ip_vs_cpu_stats *stats)
+{
+ int i;
+
+ for_each_possible_cpu(i) {
+ struct ip_vs_cpu_stats *s = per_cpu_ptr(stats, i);
+ unsigned int start;
+ __u64 inbytes, outbytes;
+ if (i) {
+ sum->conns += s->ustats.conns;
+ sum->inpkts += s->ustats.inpkts;
+ sum->outpkts += s->ustats.outpkts;
+ do {
+ start = u64_stats_fetch_begin(&s->syncp);
+ inbytes = s->ustats.inbytes;
+ outbytes = s->ustats.outbytes;
+ } while (u64_stats_fetch_retry(&s->syncp, start));
+ sum->inbytes += inbytes;
+ sum->outbytes += outbytes;
+ } else {
+ sum->conns = s->ustats.conns;
+ sum->inpkts = s->ustats.inpkts;
+ sum->outpkts = s->ustats.outpkts;
+ do {
+ start = u64_stats_fetch_begin(&s->syncp);
+ sum->inbytes = s->ustats.inbytes;
+ sum->outbytes = s->ustats.outbytes;
+ } while (u64_stats_fetch_retry(&s->syncp, start));
+ }
+ }
+}
-static LIST_HEAD(est_list);
-static DEFINE_SPINLOCK(est_lock);
-static DEFINE_TIMER(est_timer, estimation_timer, 0, 0);
static void estimation_timer(unsigned long arg)
{
@@ -62,12 +97,16 @@ static void estimation_timer(unsigned long arg)
u32 n_inpkts, n_outpkts;
u64 n_inbytes, n_outbytes;
u32 rate;
+ struct net *net = (struct net *)arg;
+ struct netns_ipvs *ipvs;
- spin_lock(&est_lock);
- list_for_each_entry(e, &est_list, list) {
+ ipvs = net_ipvs(net);
+ spin_lock(&ipvs->est_lock);
+ list_for_each_entry(e, &ipvs->est_list, list) {
s = container_of(e, struct ip_vs_stats, est);
spin_lock(&s->lock);
+ ip_vs_read_cpu_stats(&s->ustats, s->cpustats);
n_conns = s->ustats.conns;
n_inpkts = s->ustats.inpkts;
n_outpkts = s->ustats.outpkts;
@@ -75,81 +114,64 @@ static void estimation_timer(unsigned long arg)
n_outbytes = s->ustats.outbytes;
/* scaled by 2^10, but divided 2 seconds */
- rate = (n_conns - e->last_conns)<<9;
+ rate = (n_conns - e->last_conns) << 9;
e->last_conns = n_conns;
- e->cps += ((long)rate - (long)e->cps)>>2;
- s->ustats.cps = (e->cps+0x1FF)>>10;
+ e->cps += ((long)rate - (long)e->cps) >> 2;
- rate = (n_inpkts - e->last_inpkts)<<9;
+ rate = (n_inpkts - e->last_inpkts) << 9;
e->last_inpkts = n_inpkts;
- e->inpps += ((long)rate - (long)e->inpps)>>2;
- s->ustats.inpps = (e->inpps+0x1FF)>>10;
+ e->inpps += ((long)rate - (long)e->inpps) >> 2;
- rate = (n_outpkts - e->last_outpkts)<<9;
+ rate = (n_outpkts - e->last_outpkts) << 9;
e->last_outpkts = n_outpkts;
- e->outpps += ((long)rate - (long)e->outpps)>>2;
- s->ustats.outpps = (e->outpps+0x1FF)>>10;
+ e->outpps += ((long)rate - (long)e->outpps) >> 2;
- rate = (n_inbytes - e->last_inbytes)<<4;
+ rate = (n_inbytes - e->last_inbytes) << 4;
e->last_inbytes = n_inbytes;
- e->inbps += ((long)rate - (long)e->inbps)>>2;
- s->ustats.inbps = (e->inbps+0xF)>>5;
+ e->inbps += ((long)rate - (long)e->inbps) >> 2;
- rate = (n_outbytes - e->last_outbytes)<<4;
+ rate = (n_outbytes - e->last_outbytes) << 4;
e->last_outbytes = n_outbytes;
- e->outbps += ((long)rate - (long)e->outbps)>>2;
- s->ustats.outbps = (e->outbps+0xF)>>5;
+ e->outbps += ((long)rate - (long)e->outbps) >> 2;
spin_unlock(&s->lock);
}
- spin_unlock(&est_lock);
- mod_timer(&est_timer, jiffies + 2*HZ);
+ spin_unlock(&ipvs->est_lock);
+ mod_timer(&ipvs->est_timer, jiffies + 2*HZ);
}
-void ip_vs_new_estimator(struct ip_vs_stats *stats)
+void ip_vs_start_estimator(struct net *net, struct ip_vs_stats *stats)
{
+ struct netns_ipvs *ipvs = net_ipvs(net);
struct ip_vs_estimator *est = &stats->est;
INIT_LIST_HEAD(&est->list);
- est->last_conns = stats->ustats.conns;
- est->cps = stats->ustats.cps<<10;
-
- est->last_inpkts = stats->ustats.inpkts;
- est->inpps = stats->ustats.inpps<<10;
-
- est->last_outpkts = stats->ustats.outpkts;
- est->outpps = stats->ustats.outpps<<10;
-
- est->last_inbytes = stats->ustats.inbytes;
- est->inbps = stats->ustats.inbps<<5;
-
- est->last_outbytes = stats->ustats.outbytes;
- est->outbps = stats->ustats.outbps<<5;
-
- spin_lock_bh(&est_lock);
- list_add(&est->list, &est_list);
- spin_unlock_bh(&est_lock);
+ spin_lock_bh(&ipvs->est_lock);
+ list_add(&est->list, &ipvs->est_list);
+ spin_unlock_bh(&ipvs->est_lock);
}
-void ip_vs_kill_estimator(struct ip_vs_stats *stats)
+void ip_vs_stop_estimator(struct net *net, struct ip_vs_stats *stats)
{
+ struct netns_ipvs *ipvs = net_ipvs(net);
struct ip_vs_estimator *est = &stats->est;
- spin_lock_bh(&est_lock);
+ spin_lock_bh(&ipvs->est_lock);
list_del(&est->list);
- spin_unlock_bh(&est_lock);
+ spin_unlock_bh(&ipvs->est_lock);
}
void ip_vs_zero_estimator(struct ip_vs_stats *stats)
{
struct ip_vs_estimator *est = &stats->est;
-
- /* set counters zero, caller must hold the stats->lock lock */
- est->last_inbytes = 0;
- est->last_outbytes = 0;
- est->last_conns = 0;
- est->last_inpkts = 0;
- est->last_outpkts = 0;
+ struct ip_vs_stats_user *u = &stats->ustats;
+
+ /* reset counters, caller must hold the stats->lock lock */
+ est->last_inbytes = u->inbytes;
+ est->last_outbytes = u->outbytes;
+ est->last_conns = u->conns;
+ est->last_inpkts = u->inpkts;
+ est->last_outpkts = u->outpkts;
est->cps = 0;
est->inpps = 0;
est->outpps = 0;
@@ -157,13 +179,48 @@ void ip_vs_zero_estimator(struct ip_vs_stats *stats)
est->outbps = 0;
}
-int __init ip_vs_estimator_init(void)
+/* Get decoded rates */
+void ip_vs_read_estimator(struct ip_vs_stats_user *dst,
+ struct ip_vs_stats *stats)
{
- mod_timer(&est_timer, jiffies + 2 * HZ);
+ struct ip_vs_estimator *e = &stats->est;
+
+ dst->cps = (e->cps + 0x1FF) >> 10;
+ dst->inpps = (e->inpps + 0x1FF) >> 10;
+ dst->outpps = (e->outpps + 0x1FF) >> 10;
+ dst->inbps = (e->inbps + 0xF) >> 5;
+ dst->outbps = (e->outbps + 0xF) >> 5;
+}
+
+static int __net_init __ip_vs_estimator_init(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ INIT_LIST_HEAD(&ipvs->est_list);
+ spin_lock_init(&ipvs->est_lock);
+ setup_timer(&ipvs->est_timer, estimation_timer, (unsigned long)net);
+ mod_timer(&ipvs->est_timer, jiffies + 2 * HZ);
return 0;
}
+static void __net_exit __ip_vs_estimator_exit(struct net *net)
+{
+ del_timer_sync(&net_ipvs(net)->est_timer);
+}
+static struct pernet_operations ip_vs_app_ops = {
+ .init = __ip_vs_estimator_init,
+ .exit = __ip_vs_estimator_exit,
+};
+
+int __init ip_vs_estimator_init(void)
+{
+ int rv;
+
+ rv = register_pernet_subsys(&ip_vs_app_ops);
+ return rv;
+}
+
void ip_vs_estimator_cleanup(void)
{
- del_timer_sync(&est_timer);
+ unregister_pernet_subsys(&ip_vs_app_ops);
}
diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c
index 7545500..6b5dd6d 100644
--- a/net/netfilter/ipvs/ip_vs_ftp.c
+++ b/net/netfilter/ipvs/ip_vs_ftp.c
@@ -157,6 +157,7 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
int ret = 0;
enum ip_conntrack_info ctinfo;
struct nf_conn *ct;
+ struct net *net;
#ifdef CONFIG_IP_VS_IPV6
/* This application helper doesn't work with IPv6 yet,
@@ -197,18 +198,20 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
*/
{
struct ip_vs_conn_param p;
- ip_vs_conn_fill_param(AF_INET, iph->protocol,
- &from, port, &cp->caddr, 0, &p);
+ ip_vs_conn_fill_param(ip_vs_conn_net(cp), AF_INET,
+ iph->protocol, &from, port,
+ &cp->caddr, 0, &p);
n_cp = ip_vs_conn_out_get(&p);
}
if (!n_cp) {
struct ip_vs_conn_param p;
- ip_vs_conn_fill_param(AF_INET, IPPROTO_TCP, &cp->caddr,
+ ip_vs_conn_fill_param(ip_vs_conn_net(cp),
+ AF_INET, IPPROTO_TCP, &cp->caddr,
0, &cp->vaddr, port, &p);
n_cp = ip_vs_conn_new(&p, &from, port,
IP_VS_CONN_F_NO_CPORT |
IP_VS_CONN_F_NFCT,
- cp->dest);
+ cp->dest, skb->mark);
if (!n_cp)
return 0;
@@ -257,8 +260,9 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
* would be adjusted twice.
*/
+ net = skb_net(skb);
cp->app_data = NULL;
- ip_vs_tcp_conn_listen(n_cp);
+ ip_vs_tcp_conn_listen(net, n_cp);
ip_vs_conn_put(n_cp);
return ret;
}
@@ -287,6 +291,7 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
union nf_inet_addr to;
__be16 port;
struct ip_vs_conn *n_cp;
+ struct net *net;
#ifdef CONFIG_IP_VS_IPV6
/* This application helper doesn't work with IPv6 yet,
@@ -358,14 +363,15 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
{
struct ip_vs_conn_param p;
- ip_vs_conn_fill_param(AF_INET, iph->protocol, &to, port,
- &cp->vaddr, htons(ntohs(cp->vport)-1),
- &p);
+ ip_vs_conn_fill_param(ip_vs_conn_net(cp), AF_INET,
+ iph->protocol, &to, port, &cp->vaddr,
+ htons(ntohs(cp->vport)-1), &p);
n_cp = ip_vs_conn_in_get(&p);
if (!n_cp) {
n_cp = ip_vs_conn_new(&p, &cp->daddr,
htons(ntohs(cp->dport)-1),
- IP_VS_CONN_F_NFCT, cp->dest);
+ IP_VS_CONN_F_NFCT, cp->dest,
+ skb->mark);
if (!n_cp)
return 0;
@@ -377,7 +383,8 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
/*
* Move tunnel to listen state
*/
- ip_vs_tcp_conn_listen(n_cp);
+ net = skb_net(skb);
+ ip_vs_tcp_conn_listen(net, n_cp);
ip_vs_conn_put(n_cp);
return 1;
@@ -398,23 +405,22 @@ static struct ip_vs_app ip_vs_ftp = {
.pkt_in = ip_vs_ftp_in,
};
-
/*
- * ip_vs_ftp initialization
+ * per netns ip_vs_ftp initialization
*/
-static int __init ip_vs_ftp_init(void)
+static int __net_init __ip_vs_ftp_init(struct net *net)
{
int i, ret;
struct ip_vs_app *app = &ip_vs_ftp;
- ret = register_ip_vs_app(app);
+ ret = register_ip_vs_app(net, app);
if (ret)
return ret;
for (i=0; i<IP_VS_APP_MAX_PORTS; i++) {
if (!ports[i])
continue;
- ret = register_ip_vs_app_inc(app, app->protocol, ports[i]);
+ ret = register_ip_vs_app_inc(net, app, app->protocol, ports[i]);
if (ret)
break;
pr_info("%s: loaded support on port[%d] = %d\n",
@@ -422,18 +428,39 @@ static int __init ip_vs_ftp_init(void)
}
if (ret)
- unregister_ip_vs_app(app);
+ unregister_ip_vs_app(net, app);
return ret;
}
+/*
+ * netns exit
+ */
+static void __ip_vs_ftp_exit(struct net *net)
+{
+ struct ip_vs_app *app = &ip_vs_ftp;
+
+ unregister_ip_vs_app(net, app);
+}
+
+static struct pernet_operations ip_vs_ftp_ops = {
+ .init = __ip_vs_ftp_init,
+ .exit = __ip_vs_ftp_exit,
+};
+int __init ip_vs_ftp_init(void)
+{
+ int rv;
+
+ rv = register_pernet_subsys(&ip_vs_ftp_ops);
+ return rv;
+}
/*
* ip_vs_ftp finish.
*/
static void __exit ip_vs_ftp_exit(void)
{
- unregister_ip_vs_app(&ip_vs_ftp);
+ unregister_pernet_subsys(&ip_vs_ftp_ops);
}
diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c
index 9323f89..f276df9 100644
--- a/net/netfilter/ipvs/ip_vs_lblc.c
+++ b/net/netfilter/ipvs/ip_vs_lblc.c
@@ -63,6 +63,8 @@
#define CHECK_EXPIRE_INTERVAL (60*HZ)
#define ENTRY_TIMEOUT (6*60*HZ)
+#define DEFAULT_EXPIRATION (24*60*60*HZ)
+
/*
* It is for full expiration check.
* When there is no partial expiration check (garbage collection)
@@ -70,7 +72,6 @@
* entries that haven't been touched for a day.
*/
#define COUNT_FOR_FULL_EXPIRATION 30
-static int sysctl_ip_vs_lblc_expiration = 24*60*60*HZ;
/*
@@ -113,19 +114,18 @@ struct ip_vs_lblc_table {
/*
* IPVS LBLC sysctl table
*/
-
+#ifdef CONFIG_SYSCTL
static ctl_table vs_vars_table[] = {
{
.procname = "lblc_expiration",
- .data = &sysctl_ip_vs_lblc_expiration,
+ .data = NULL,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{ }
};
-
-static struct ctl_table_header * sysctl_header;
+#endif
static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en)
{
@@ -241,6 +241,15 @@ static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl)
}
}
+static int sysctl_lblc_expiration(struct ip_vs_service *svc)
+{
+#ifdef CONFIG_SYSCTL
+ struct netns_ipvs *ipvs = net_ipvs(svc->net);
+ return ipvs->sysctl_lblc_expiration;
+#else
+ return DEFAULT_EXPIRATION;
+#endif
+}
static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc)
{
@@ -255,7 +264,8 @@ static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc)
write_lock(&svc->sched_lock);
list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
if (time_before(now,
- en->lastuse + sysctl_ip_vs_lblc_expiration))
+ en->lastuse +
+ sysctl_lblc_expiration(svc)))
continue;
ip_vs_lblc_free(en);
@@ -390,12 +400,7 @@ __ip_vs_lblc_schedule(struct ip_vs_service *svc)
int loh, doh;
/*
- * We think the overhead of processing active connections is fifty
- * times higher than that of inactive connections in average. (This
- * fifty times might not be accurate, we will change it later.) We
- * use the following formula to estimate the overhead:
- * dest->activeconns*50 + dest->inactconns
- * and the load:
+ * We use the following formula to estimate the load:
* (dest overhead) / dest->weight
*
* Remember -- no floats in kernel mode!!!
@@ -411,8 +416,7 @@ __ip_vs_lblc_schedule(struct ip_vs_service *svc)
continue;
if (atomic_read(&dest->weight) > 0) {
least = dest;
- loh = atomic_read(&least->activeconns) * 50
- + atomic_read(&least->inactconns);
+ loh = ip_vs_dest_conn_overhead(least);
goto nextstage;
}
}
@@ -426,8 +430,7 @@ __ip_vs_lblc_schedule(struct ip_vs_service *svc)
if (dest->flags & IP_VS_DEST_F_OVERLOAD)
continue;
- doh = atomic_read(&dest->activeconns) * 50
- + atomic_read(&dest->inactconns);
+ doh = ip_vs_dest_conn_overhead(dest);
if (loh * atomic_read(&dest->weight) >
doh * atomic_read(&least->weight)) {
least = dest;
@@ -511,7 +514,7 @@ ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
/* No cache entry or it is invalid, time to schedule */
dest = __ip_vs_lblc_schedule(svc);
if (!dest) {
- IP_VS_ERR_RL("LBLC: no destination available\n");
+ ip_vs_scheduler_err(svc, "no destination available");
return NULL;
}
@@ -543,23 +546,77 @@ static struct ip_vs_scheduler ip_vs_lblc_scheduler =
.schedule = ip_vs_lblc_schedule,
};
+/*
+ * per netns init.
+ */
+#ifdef CONFIG_SYSCTL
+static int __net_init __ip_vs_lblc_init(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ if (!net_eq(net, &init_net)) {
+ ipvs->lblc_ctl_table = kmemdup(vs_vars_table,
+ sizeof(vs_vars_table),
+ GFP_KERNEL);
+ if (ipvs->lblc_ctl_table == NULL)
+ return -ENOMEM;
+ } else
+ ipvs->lblc_ctl_table = vs_vars_table;
+ ipvs->sysctl_lblc_expiration = DEFAULT_EXPIRATION;
+ ipvs->lblc_ctl_table[0].data = &ipvs->sysctl_lblc_expiration;
+
+ ipvs->lblc_ctl_header =
+ register_net_sysctl_table(net, net_vs_ctl_path,
+ ipvs->lblc_ctl_table);
+ if (!ipvs->lblc_ctl_header) {
+ if (!net_eq(net, &init_net))
+ kfree(ipvs->lblc_ctl_table);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void __net_exit __ip_vs_lblc_exit(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ unregister_net_sysctl_table(ipvs->lblc_ctl_header);
+
+ if (!net_eq(net, &init_net))
+ kfree(ipvs->lblc_ctl_table);
+}
+
+#else
+
+static int __net_init __ip_vs_lblc_init(struct net *net) { return 0; }
+static void __net_exit __ip_vs_lblc_exit(struct net *net) { }
+
+#endif
+
+static struct pernet_operations ip_vs_lblc_ops = {
+ .init = __ip_vs_lblc_init,
+ .exit = __ip_vs_lblc_exit,
+};
static int __init ip_vs_lblc_init(void)
{
int ret;
- sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table);
+ ret = register_pernet_subsys(&ip_vs_lblc_ops);
+ if (ret)
+ return ret;
+
ret = register_ip_vs_scheduler(&ip_vs_lblc_scheduler);
if (ret)
- unregister_sysctl_table(sysctl_header);
+ unregister_pernet_subsys(&ip_vs_lblc_ops);
return ret;
}
-
static void __exit ip_vs_lblc_cleanup(void)
{
- unregister_sysctl_table(sysctl_header);
unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler);
+ unregister_pernet_subsys(&ip_vs_lblc_ops);
}
diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c
index dbeed8e..cb1c991 100644
--- a/net/netfilter/ipvs/ip_vs_lblcr.c
+++ b/net/netfilter/ipvs/ip_vs_lblcr.c
@@ -63,6 +63,8 @@
#define CHECK_EXPIRE_INTERVAL (60*HZ)
#define ENTRY_TIMEOUT (6*60*HZ)
+#define DEFAULT_EXPIRATION (24*60*60*HZ)
+
/*
* It is for full expiration check.
* When there is no partial expiration check (garbage collection)
@@ -70,8 +72,6 @@
* entries that haven't been touched for a day.
*/
#define COUNT_FOR_FULL_EXPIRATION 30
-static int sysctl_ip_vs_lblcr_expiration = 24*60*60*HZ;
-
/*
* for IPVS lblcr entry hash table
@@ -180,8 +180,7 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
if ((atomic_read(&least->weight) > 0)
&& (least->flags & IP_VS_DEST_F_AVAILABLE)) {
- loh = atomic_read(&least->activeconns) * 50
- + atomic_read(&least->inactconns);
+ loh = ip_vs_dest_conn_overhead(least);
goto nextstage;
}
}
@@ -194,8 +193,7 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
if (dest->flags & IP_VS_DEST_F_OVERLOAD)
continue;
- doh = atomic_read(&dest->activeconns) * 50
- + atomic_read(&dest->inactconns);
+ doh = ip_vs_dest_conn_overhead(dest);
if ((loh * atomic_read(&dest->weight) >
doh * atomic_read(&least->weight))
&& (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
@@ -230,8 +228,7 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
list_for_each_entry(e, &set->list, list) {
most = e->dest;
if (atomic_read(&most->weight) > 0) {
- moh = atomic_read(&most->activeconns) * 50
- + atomic_read(&most->inactconns);
+ moh = ip_vs_dest_conn_overhead(most);
goto nextstage;
}
}
@@ -241,8 +238,7 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
nextstage:
list_for_each_entry(e, &set->list, list) {
dest = e->dest;
- doh = atomic_read(&dest->activeconns) * 50
- + atomic_read(&dest->inactconns);
+ doh = ip_vs_dest_conn_overhead(dest);
/* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */
if ((moh * atomic_read(&dest->weight) <
doh * atomic_read(&most->weight))
@@ -289,6 +285,7 @@ struct ip_vs_lblcr_table {
};
+#ifdef CONFIG_SYSCTL
/*
* IPVS LBLCR sysctl table
*/
@@ -296,15 +293,14 @@ struct ip_vs_lblcr_table {
static ctl_table vs_vars_table[] = {
{
.procname = "lblcr_expiration",
- .data = &sysctl_ip_vs_lblcr_expiration,
+ .data = NULL,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{ }
};
-
-static struct ctl_table_header * sysctl_header;
+#endif
static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en)
{
@@ -418,6 +414,15 @@ static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl)
}
}
+static int sysctl_lblcr_expiration(struct ip_vs_service *svc)
+{
+#ifdef CONFIG_SYSCTL
+ struct netns_ipvs *ipvs = net_ipvs(svc->net);
+ return ipvs->sysctl_lblcr_expiration;
+#else
+ return DEFAULT_EXPIRATION;
+#endif
+}
static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc)
{
@@ -431,8 +436,8 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc)
write_lock(&svc->sched_lock);
list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
- if (time_after(en->lastuse+sysctl_ip_vs_lblcr_expiration,
- now))
+ if (time_after(en->lastuse +
+ sysctl_lblcr_expiration(svc), now))
continue;
ip_vs_lblcr_free(en);
@@ -566,12 +571,7 @@ __ip_vs_lblcr_schedule(struct ip_vs_service *svc)
int loh, doh;
/*
- * We think the overhead of processing active connections is fifty
- * times higher than that of inactive connections in average. (This
- * fifty times might not be accurate, we will change it later.) We
- * use the following formula to estimate the overhead:
- * dest->activeconns*50 + dest->inactconns
- * and the load:
+ * We use the following formula to estimate the load:
* (dest overhead) / dest->weight
*
* Remember -- no floats in kernel mode!!!
@@ -588,8 +588,7 @@ __ip_vs_lblcr_schedule(struct ip_vs_service *svc)
if (atomic_read(&dest->weight) > 0) {
least = dest;
- loh = atomic_read(&least->activeconns) * 50
- + atomic_read(&least->inactconns);
+ loh = ip_vs_dest_conn_overhead(least);
goto nextstage;
}
}
@@ -603,8 +602,7 @@ __ip_vs_lblcr_schedule(struct ip_vs_service *svc)
if (dest->flags & IP_VS_DEST_F_OVERLOAD)
continue;
- doh = atomic_read(&dest->activeconns) * 50
- + atomic_read(&dest->inactconns);
+ doh = ip_vs_dest_conn_overhead(dest);
if (loh * atomic_read(&dest->weight) >
doh * atomic_read(&least->weight)) {
least = dest;
@@ -675,7 +673,7 @@ ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
/* More than one destination + enough time passed by, cleanup */
if (atomic_read(&en->set.size) > 1 &&
time_after(jiffies, en->set.lastmod +
- sysctl_ip_vs_lblcr_expiration)) {
+ sysctl_lblcr_expiration(svc))) {
struct ip_vs_dest *m;
write_lock(&en->set.lock);
@@ -694,7 +692,7 @@ ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
/* The cache entry is invalid, time to schedule */
dest = __ip_vs_lblcr_schedule(svc);
if (!dest) {
- IP_VS_ERR_RL("LBLCR: no destination available\n");
+ ip_vs_scheduler_err(svc, "no destination available");
read_unlock(&svc->sched_lock);
return NULL;
}
@@ -744,23 +742,77 @@ static struct ip_vs_scheduler ip_vs_lblcr_scheduler =
.schedule = ip_vs_lblcr_schedule,
};
+/*
+ * per netns init.
+ */
+#ifdef CONFIG_SYSCTL
+static int __net_init __ip_vs_lblcr_init(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ if (!net_eq(net, &init_net)) {
+ ipvs->lblcr_ctl_table = kmemdup(vs_vars_table,
+ sizeof(vs_vars_table),
+ GFP_KERNEL);
+ if (ipvs->lblcr_ctl_table == NULL)
+ return -ENOMEM;
+ } else
+ ipvs->lblcr_ctl_table = vs_vars_table;
+ ipvs->sysctl_lblcr_expiration = DEFAULT_EXPIRATION;
+ ipvs->lblcr_ctl_table[0].data = &ipvs->sysctl_lblcr_expiration;
+
+ ipvs->lblcr_ctl_header =
+ register_net_sysctl_table(net, net_vs_ctl_path,
+ ipvs->lblcr_ctl_table);
+ if (!ipvs->lblcr_ctl_header) {
+ if (!net_eq(net, &init_net))
+ kfree(ipvs->lblcr_ctl_table);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void __net_exit __ip_vs_lblcr_exit(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ unregister_net_sysctl_table(ipvs->lblcr_ctl_header);
+
+ if (!net_eq(net, &init_net))
+ kfree(ipvs->lblcr_ctl_table);
+}
+
+#else
+
+static int __net_init __ip_vs_lblcr_init(struct net *net) { return 0; }
+static void __net_exit __ip_vs_lblcr_exit(struct net *net) { }
+
+#endif
+
+static struct pernet_operations ip_vs_lblcr_ops = {
+ .init = __ip_vs_lblcr_init,
+ .exit = __ip_vs_lblcr_exit,
+};
static int __init ip_vs_lblcr_init(void)
{
int ret;
- sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table);
+ ret = register_pernet_subsys(&ip_vs_lblcr_ops);
+ if (ret)
+ return ret;
+
ret = register_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
if (ret)
- unregister_sysctl_table(sysctl_header);
+ unregister_pernet_subsys(&ip_vs_lblcr_ops);
return ret;
}
-
static void __exit ip_vs_lblcr_cleanup(void)
{
- unregister_sysctl_table(sysctl_header);
unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
+ unregister_pernet_subsys(&ip_vs_lblcr_ops);
}
diff --git a/net/netfilter/ipvs/ip_vs_lc.c b/net/netfilter/ipvs/ip_vs_lc.c
index 4f69db1..f391819 100644
--- a/net/netfilter/ipvs/ip_vs_lc.c
+++ b/net/netfilter/ipvs/ip_vs_lc.c
@@ -22,22 +22,6 @@
#include <net/ip_vs.h>
-
-static inline unsigned int
-ip_vs_lc_dest_overhead(struct ip_vs_dest *dest)
-{
- /*
- * We think the overhead of processing active connections is 256
- * times higher than that of inactive connections in average. (This
- * 256 times might not be accurate, we will change it later) We
- * use the following formula to estimate the overhead now:
- * dest->activeconns*256 + dest->inactconns
- */
- return (atomic_read(&dest->activeconns) << 8) +
- atomic_read(&dest->inactconns);
-}
-
-
/*
* Least Connection scheduling
*/
@@ -62,7 +46,7 @@ ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
if ((dest->flags & IP_VS_DEST_F_OVERLOAD) ||
atomic_read(&dest->weight) == 0)
continue;
- doh = ip_vs_lc_dest_overhead(dest);
+ doh = ip_vs_dest_conn_overhead(dest);
if (!least || doh < loh) {
least = dest;
loh = doh;
@@ -70,7 +54,7 @@ ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
}
if (!least)
- IP_VS_ERR_RL("LC: no destination available\n");
+ ip_vs_scheduler_err(svc, "no destination available");
else
IP_VS_DBG_BUF(6, "LC: server %s:%u activeconns %d "
"inactconns %d\n",
diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c
index 4680647..f454c80 100644
--- a/net/netfilter/ipvs/ip_vs_nfct.c
+++ b/net/netfilter/ipvs/ip_vs_nfct.c
@@ -141,6 +141,7 @@ static void ip_vs_nfct_expect_callback(struct nf_conn *ct,
struct nf_conntrack_tuple *orig, new_reply;
struct ip_vs_conn *cp;
struct ip_vs_conn_param p;
+ struct net *net = nf_ct_net(ct);
if (exp->tuple.src.l3num != PF_INET)
return;
@@ -155,7 +156,7 @@ static void ip_vs_nfct_expect_callback(struct nf_conn *ct,
/* RS->CLIENT */
orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
- ip_vs_conn_fill_param(exp->tuple.src.l3num, orig->dst.protonum,
+ ip_vs_conn_fill_param(net, exp->tuple.src.l3num, orig->dst.protonum,
&orig->src.u3, orig->src.u.tcp.port,
&orig->dst.u3, orig->dst.u.tcp.port, &p);
cp = ip_vs_conn_out_get(&p);
@@ -268,7 +269,8 @@ void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp)
" for conn " FMT_CONN "\n",
__func__, ARG_TUPLE(&tuple), ARG_CONN(cp));
- h = nf_conntrack_find_get(&init_net, NF_CT_DEFAULT_ZONE, &tuple);
+ h = nf_conntrack_find_get(ip_vs_conn_net(cp), NF_CT_DEFAULT_ZONE,
+ &tuple);
if (h) {
ct = nf_ct_tuplehash_to_ctrack(h);
/* Show what happens instead of calling nf_ct_kill() */
diff --git a/net/netfilter/ipvs/ip_vs_nq.c b/net/netfilter/ipvs/ip_vs_nq.c
index c413e18..984d9c1 100644
--- a/net/netfilter/ipvs/ip_vs_nq.c
+++ b/net/netfilter/ipvs/ip_vs_nq.c
@@ -99,7 +99,7 @@ ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
}
if (!least) {
- IP_VS_ERR_RL("NQ: no destination available\n");
+ ip_vs_scheduler_err(svc, "no destination available");
return NULL;
}
diff --git a/net/netfilter/ipvs/ip_vs_pe.c b/net/netfilter/ipvs/ip_vs_pe.c
index 3414af7..5cf859c 100644
--- a/net/netfilter/ipvs/ip_vs_pe.c
+++ b/net/netfilter/ipvs/ip_vs_pe.c
@@ -29,12 +29,11 @@ void ip_vs_unbind_pe(struct ip_vs_service *svc)
}
/* Get pe in the pe list by name */
-static struct ip_vs_pe *
-ip_vs_pe_getbyname(const char *pe_name)
+struct ip_vs_pe *__ip_vs_pe_getbyname(const char *pe_name)
{
struct ip_vs_pe *pe;
- IP_VS_DBG(2, "%s(): pe_name \"%s\"\n", __func__,
+ IP_VS_DBG(10, "%s(): pe_name \"%s\"\n", __func__,
pe_name);
spin_lock_bh(&ip_vs_pe_lock);
@@ -60,28 +59,22 @@ ip_vs_pe_getbyname(const char *pe_name)
}
/* Lookup pe and try to load it if it doesn't exist */
-struct ip_vs_pe *ip_vs_pe_get(const char *name)
+struct ip_vs_pe *ip_vs_pe_getbyname(const char *name)
{
struct ip_vs_pe *pe;
/* Search for the pe by name */
- pe = ip_vs_pe_getbyname(name);
+ pe = __ip_vs_pe_getbyname(name);
/* If pe not found, load the module and search again */
if (!pe) {
request_module("ip_vs_pe_%s", name);
- pe = ip_vs_pe_getbyname(name);
+ pe = __ip_vs_pe_getbyname(name);
}
return pe;
}
-void ip_vs_pe_put(struct ip_vs_pe *pe)
-{
- if (pe && pe->module)
- module_put(pe->module);
-}
-
/* Register a pe in the pe list */
int register_ip_vs_pe(struct ip_vs_pe *pe)
{
diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c
index b8b4e96..13d607a 100644
--- a/net/netfilter/ipvs/ip_vs_pe_sip.c
+++ b/net/netfilter/ipvs/ip_vs_pe_sip.c
@@ -71,6 +71,7 @@ ip_vs_sip_fill_param(struct ip_vs_conn_param *p, struct sk_buff *skb)
struct ip_vs_iphdr iph;
unsigned int dataoff, datalen, matchoff, matchlen;
const char *dptr;
+ int retc;
ip_vs_fill_iphdr(p->af, skb_network_header(skb), &iph);
@@ -83,20 +84,21 @@ ip_vs_sip_fill_param(struct ip_vs_conn_param *p, struct sk_buff *skb)
if (dataoff >= skb->len)
return -EINVAL;
+ if ((retc=skb_linearize(skb)) < 0)
+ return retc;
dptr = skb->data + dataoff;
datalen = skb->len - dataoff;
if (get_callid(dptr, dataoff, datalen, &matchoff, &matchlen))
return -EINVAL;
- p->pe_data = kmalloc(matchlen, GFP_ATOMIC);
- if (!p->pe_data)
- return -ENOMEM;
-
/* N.B: pe_data is only set on success,
* this allows fallback to the default persistence logic on failure
*/
- memcpy(p->pe_data, dptr + matchoff, matchlen);
+ p->pe_data = kmemdup(dptr + matchoff, matchlen, GFP_ATOMIC);
+ if (!p->pe_data)
+ return -ENOMEM;
+
p->pe_data_len = matchlen;
return 0;
diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c
index c539983..17484a4 100644
--- a/net/netfilter/ipvs/ip_vs_proto.c
+++ b/net/netfilter/ipvs/ip_vs_proto.c
@@ -60,6 +60,35 @@ static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp)
return 0;
}
+#if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP) || \
+ defined(CONFIG_IP_VS_PROTO_SCTP) || defined(CONFIG_IP_VS_PROTO_AH) || \
+ defined(CONFIG_IP_VS_PROTO_ESP)
+/*
+ * register an ipvs protocols netns related data
+ */
+static int
+register_ip_vs_proto_netns(struct net *net, struct ip_vs_protocol *pp)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ unsigned hash = IP_VS_PROTO_HASH(pp->protocol);
+ struct ip_vs_proto_data *pd =
+ kzalloc(sizeof(struct ip_vs_proto_data), GFP_ATOMIC);
+
+ if (!pd) {
+ pr_err("%s(): no memory.\n", __func__);
+ return -ENOMEM;
+ }
+ pd->pp = pp; /* For speed issues */
+ pd->next = ipvs->proto_data_table[hash];
+ ipvs->proto_data_table[hash] = pd;
+ atomic_set(&pd->appcnt, 0); /* Init app counter */
+
+ if (pp->init_netns != NULL)
+ pp->init_netns(net, pd);
+
+ return 0;
+}
+#endif
/*
* unregister an ipvs protocol
@@ -82,6 +111,29 @@ static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp)
return -ESRCH;
}
+/*
+ * unregister an ipvs protocols netns data
+ */
+static int
+unregister_ip_vs_proto_netns(struct net *net, struct ip_vs_proto_data *pd)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_proto_data **pd_p;
+ unsigned hash = IP_VS_PROTO_HASH(pd->pp->protocol);
+
+ pd_p = &ipvs->proto_data_table[hash];
+ for (; *pd_p; pd_p = &(*pd_p)->next) {
+ if (*pd_p == pd) {
+ *pd_p = pd->next;
+ if (pd->pp->exit_netns != NULL)
+ pd->pp->exit_netns(net, pd);
+ kfree(pd);
+ return 0;
+ }
+ }
+
+ return -ESRCH;
+}
/*
* get ip_vs_protocol object by its proto.
@@ -100,19 +152,44 @@ struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto)
}
EXPORT_SYMBOL(ip_vs_proto_get);
+/*
+ * get ip_vs_protocol object data by netns and proto
+ */
+struct ip_vs_proto_data *
+__ipvs_proto_data_get(struct netns_ipvs *ipvs, unsigned short proto)
+{
+ struct ip_vs_proto_data *pd;
+ unsigned hash = IP_VS_PROTO_HASH(proto);
+
+ for (pd = ipvs->proto_data_table[hash]; pd; pd = pd->next) {
+ if (pd->pp->protocol == proto)
+ return pd;
+ }
+
+ return NULL;
+}
+
+struct ip_vs_proto_data *
+ip_vs_proto_data_get(struct net *net, unsigned short proto)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ return __ipvs_proto_data_get(ipvs, proto);
+}
+EXPORT_SYMBOL(ip_vs_proto_data_get);
/*
* Propagate event for state change to all protocols
*/
-void ip_vs_protocol_timeout_change(int flags)
+void ip_vs_protocol_timeout_change(struct netns_ipvs *ipvs, int flags)
{
- struct ip_vs_protocol *pp;
+ struct ip_vs_proto_data *pd;
int i;
for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
- for (pp = ip_vs_proto_table[i]; pp; pp = pp->next) {
- if (pp->timeout_change)
- pp->timeout_change(pp, flags);
+ for (pd = ipvs->proto_data_table[i]; pd; pd = pd->next) {
+ if (pd->pp->timeout_change)
+ pd->pp->timeout_change(pd, flags);
}
}
}
@@ -236,6 +313,46 @@ ip_vs_tcpudp_debug_packet(int af, struct ip_vs_protocol *pp,
ip_vs_tcpudp_debug_packet_v4(pp, skb, offset, msg);
}
+/*
+ * per network name-space init
+ */
+static int __net_init __ip_vs_protocol_init(struct net *net)
+{
+#ifdef CONFIG_IP_VS_PROTO_TCP
+ register_ip_vs_proto_netns(net, &ip_vs_protocol_tcp);
+#endif
+#ifdef CONFIG_IP_VS_PROTO_UDP
+ register_ip_vs_proto_netns(net, &ip_vs_protocol_udp);
+#endif
+#ifdef CONFIG_IP_VS_PROTO_SCTP
+ register_ip_vs_proto_netns(net, &ip_vs_protocol_sctp);
+#endif
+#ifdef CONFIG_IP_VS_PROTO_AH
+ register_ip_vs_proto_netns(net, &ip_vs_protocol_ah);
+#endif
+#ifdef CONFIG_IP_VS_PROTO_ESP
+ register_ip_vs_proto_netns(net, &ip_vs_protocol_esp);
+#endif
+ return 0;
+}
+
+static void __net_exit __ip_vs_protocol_cleanup(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_proto_data *pd;
+ int i;
+
+ /* unregister all the ipvs proto data for this netns */
+ for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
+ while ((pd = ipvs->proto_data_table[i]) != NULL)
+ unregister_ip_vs_proto_netns(net, pd);
+ }
+}
+
+static struct pernet_operations ipvs_proto_ops = {
+ .init = __ip_vs_protocol_init,
+ .exit = __ip_vs_protocol_cleanup,
+};
int __init ip_vs_protocol_init(void)
{
@@ -265,6 +382,7 @@ int __init ip_vs_protocol_init(void)
REGISTER_PROTOCOL(&ip_vs_protocol_esp);
#endif
pr_info("Registered protocols (%s)\n", &protocols[2]);
+ return register_pernet_subsys(&ipvs_proto_ops);
return 0;
}
@@ -275,6 +393,7 @@ void ip_vs_protocol_cleanup(void)
struct ip_vs_protocol *pp;
int i;
+ unregister_pernet_subsys(&ipvs_proto_ops);
/* unregister all the ipvs protocols */
for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
while ((pp = ip_vs_proto_table[i]) != NULL)
diff --git a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
index 3a04611..5b8eb8b 100644
--- a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
@@ -41,28 +41,30 @@ struct isakmp_hdr {
#define PORT_ISAKMP 500
static void
-ah_esp_conn_fill_param_proto(int af, const struct ip_vs_iphdr *iph,
- int inverse, struct ip_vs_conn_param *p)
+ah_esp_conn_fill_param_proto(struct net *net, int af,
+ const struct ip_vs_iphdr *iph, int inverse,
+ struct ip_vs_conn_param *p)
{
if (likely(!inverse))
- ip_vs_conn_fill_param(af, IPPROTO_UDP,
+ ip_vs_conn_fill_param(net, af, IPPROTO_UDP,
&iph->saddr, htons(PORT_ISAKMP),
&iph->daddr, htons(PORT_ISAKMP), p);
else
- ip_vs_conn_fill_param(af, IPPROTO_UDP,
+ ip_vs_conn_fill_param(net, af, IPPROTO_UDP,
&iph->daddr, htons(PORT_ISAKMP),
&iph->saddr, htons(PORT_ISAKMP), p);
}
static struct ip_vs_conn *
-ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
+ah_esp_conn_in_get(int af, const struct sk_buff *skb,
const struct ip_vs_iphdr *iph, unsigned int proto_off,
int inverse)
{
struct ip_vs_conn *cp;
struct ip_vs_conn_param p;
+ struct net *net = skb_net(skb);
- ah_esp_conn_fill_param_proto(af, iph, inverse, &p);
+ ah_esp_conn_fill_param_proto(net, af, iph, inverse, &p);
cp = ip_vs_conn_in_get(&p);
if (!cp) {
/*
@@ -72,7 +74,7 @@ ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for outin packet "
"%s%s %s->%s\n",
inverse ? "ICMP+" : "",
- pp->name,
+ ip_vs_proto_get(iph->protocol)->name,
IP_VS_DBG_ADDR(af, &iph->saddr),
IP_VS_DBG_ADDR(af, &iph->daddr));
}
@@ -83,21 +85,21 @@ ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
static struct ip_vs_conn *
ah_esp_conn_out_get(int af, const struct sk_buff *skb,
- struct ip_vs_protocol *pp,
const struct ip_vs_iphdr *iph,
unsigned int proto_off,
int inverse)
{
struct ip_vs_conn *cp;
struct ip_vs_conn_param p;
+ struct net *net = skb_net(skb);
- ah_esp_conn_fill_param_proto(af, iph, inverse, &p);
+ ah_esp_conn_fill_param_proto(net, af, iph, inverse, &p);
cp = ip_vs_conn_out_get(&p);
if (!cp) {
IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for inout packet "
"%s%s %s->%s\n",
inverse ? "ICMP+" : "",
- pp->name,
+ ip_vs_proto_get(iph->protocol)->name,
IP_VS_DBG_ADDR(af, &iph->saddr),
IP_VS_DBG_ADDR(af, &iph->daddr));
}
@@ -107,7 +109,7 @@ ah_esp_conn_out_get(int af, const struct sk_buff *skb,
static int
-ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
+ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
int *verdict, struct ip_vs_conn **cpp)
{
/*
@@ -117,26 +119,14 @@ ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
return 0;
}
-static void ah_esp_init(struct ip_vs_protocol *pp)
-{
- /* nothing to do now */
-}
-
-
-static void ah_esp_exit(struct ip_vs_protocol *pp)
-{
- /* nothing to do now */
-}
-
-
#ifdef CONFIG_IP_VS_PROTO_AH
struct ip_vs_protocol ip_vs_protocol_ah = {
.name = "AH",
.protocol = IPPROTO_AH,
.num_states = 1,
.dont_defrag = 1,
- .init = ah_esp_init,
- .exit = ah_esp_exit,
+ .init = NULL,
+ .exit = NULL,
.conn_schedule = ah_esp_conn_schedule,
.conn_in_get = ah_esp_conn_in_get,
.conn_out_get = ah_esp_conn_out_get,
@@ -149,7 +139,6 @@ struct ip_vs_protocol ip_vs_protocol_ah = {
.app_conn_bind = NULL,
.debug_packet = ip_vs_tcpudp_debug_packet,
.timeout_change = NULL, /* ISAKMP */
- .set_state_timeout = NULL,
};
#endif
@@ -159,8 +148,8 @@ struct ip_vs_protocol ip_vs_protocol_esp = {
.protocol = IPPROTO_ESP,
.num_states = 1,
.dont_defrag = 1,
- .init = ah_esp_init,
- .exit = ah_esp_exit,
+ .init = NULL,
+ .exit = NULL,
.conn_schedule = ah_esp_conn_schedule,
.conn_in_get = ah_esp_conn_in_get,
.conn_out_get = ah_esp_conn_out_get,
diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
index 1ea96bc..b027ccc 100644
--- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -9,9 +9,10 @@
#include <net/ip_vs.h>
static int
-sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
+sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
int *verdict, struct ip_vs_conn **cpp)
{
+ struct net *net;
struct ip_vs_service *svc;
sctp_chunkhdr_t _schunkh, *sch;
sctp_sctphdr_t *sh, _sctph;
@@ -27,13 +28,13 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
sizeof(_schunkh), &_schunkh);
if (sch == NULL)
return 0;
-
+ net = skb_net(skb);
if ((sch->type == SCTP_CID_INIT) &&
- (svc = ip_vs_service_get(af, skb->mark, iph.protocol,
+ (svc = ip_vs_service_get(net, af, skb->mark, iph.protocol,
&iph.daddr, sh->dest))) {
int ignored;
- if (ip_vs_todrop()) {
+ if (ip_vs_todrop(net_ipvs(net))) {
/*
* It seems that we are very loaded.
* We have to drop this packet :(
@@ -46,14 +47,19 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
* Let the virtual server select a real server for the
* incoming connection, and create a connection entry.
*/
- *cpp = ip_vs_schedule(svc, skb, pp, &ignored);
- if (!*cpp && !ignored) {
- *verdict = ip_vs_leave(svc, skb, pp);
+ *cpp = ip_vs_schedule(svc, skb, pd, &ignored);
+ if (!*cpp && ignored <= 0) {
+ if (!ignored)
+ *verdict = ip_vs_leave(svc, skb, pd);
+ else {
+ ip_vs_service_put(svc);
+ *verdict = NF_DROP;
+ }
return 0;
}
ip_vs_service_put(svc);
}
-
+ /* NF_ACCEPT */
return 1;
}
@@ -856,7 +862,7 @@ static struct ipvs_sctp_nextstate
/*
* Timeout table[state]
*/
-static int sctp_timeouts[IP_VS_SCTP_S_LAST + 1] = {
+static const int sctp_timeouts[IP_VS_SCTP_S_LAST + 1] = {
[IP_VS_SCTP_S_NONE] = 2 * HZ,
[IP_VS_SCTP_S_INIT_CLI] = 1 * 60 * HZ,
[IP_VS_SCTP_S_INIT_SER] = 1 * 60 * HZ,
@@ -900,20 +906,8 @@ static const char *sctp_state_name(int state)
return "?";
}
-static void sctp_timeout_change(struct ip_vs_protocol *pp, int flags)
-{
-}
-
-static int
-sctp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
-{
-
-return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_SCTP_S_LAST,
- sctp_state_name_table, sname, to);
-}
-
static inline int
-set_sctp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
+set_sctp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,
int direction, const struct sk_buff *skb)
{
sctp_chunkhdr_t _sctpch, *sch;
@@ -971,7 +965,7 @@ set_sctp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
IP_VS_DBG_BUF(8, "%s %s %s:%d->"
"%s:%d state: %s->%s conn->refcnt:%d\n",
- pp->name,
+ pd->pp->name,
((direction == IP_VS_DIR_OUTPUT) ?
"output " : "input "),
IP_VS_DBG_ADDR(cp->af, &cp->daddr),
@@ -995,75 +989,73 @@ set_sctp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
}
}
}
+ if (likely(pd))
+ cp->timeout = pd->timeout_table[cp->state = next_state];
+ else /* What to do ? */
+ cp->timeout = sctp_timeouts[cp->state = next_state];
- cp->timeout = pp->timeout_table[cp->state = next_state];
-
- return 1;
+ return 1;
}
static int
sctp_state_transition(struct ip_vs_conn *cp, int direction,
- const struct sk_buff *skb, struct ip_vs_protocol *pp)
+ const struct sk_buff *skb, struct ip_vs_proto_data *pd)
{
int ret = 0;
spin_lock(&cp->lock);
- ret = set_sctp_state(pp, cp, direction, skb);
+ ret = set_sctp_state(pd, cp, direction, skb);
spin_unlock(&cp->lock);
return ret;
}
-/*
- * Hash table for SCTP application incarnations
- */
-#define SCTP_APP_TAB_BITS 4
-#define SCTP_APP_TAB_SIZE (1 << SCTP_APP_TAB_BITS)
-#define SCTP_APP_TAB_MASK (SCTP_APP_TAB_SIZE - 1)
-
-static struct list_head sctp_apps[SCTP_APP_TAB_SIZE];
-static DEFINE_SPINLOCK(sctp_app_lock);
-
static inline __u16 sctp_app_hashkey(__be16 port)
{
return (((__force u16)port >> SCTP_APP_TAB_BITS) ^ (__force u16)port)
& SCTP_APP_TAB_MASK;
}
-static int sctp_register_app(struct ip_vs_app *inc)
+static int sctp_register_app(struct net *net, struct ip_vs_app *inc)
{
struct ip_vs_app *i;
__u16 hash;
__be16 port = inc->port;
int ret = 0;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_SCTP);
hash = sctp_app_hashkey(port);
- spin_lock_bh(&sctp_app_lock);
- list_for_each_entry(i, &sctp_apps[hash], p_list) {
+ spin_lock_bh(&ipvs->sctp_app_lock);
+ list_for_each_entry(i, &ipvs->sctp_apps[hash], p_list) {
if (i->port == port) {
ret = -EEXIST;
goto out;
}
}
- list_add(&inc->p_list, &sctp_apps[hash]);
- atomic_inc(&ip_vs_protocol_sctp.appcnt);
+ list_add(&inc->p_list, &ipvs->sctp_apps[hash]);
+ atomic_inc(&pd->appcnt);
out:
- spin_unlock_bh(&sctp_app_lock);
+ spin_unlock_bh(&ipvs->sctp_app_lock);
return ret;
}
-static void sctp_unregister_app(struct ip_vs_app *inc)
+static void sctp_unregister_app(struct net *net, struct ip_vs_app *inc)
{
- spin_lock_bh(&sctp_app_lock);
- atomic_dec(&ip_vs_protocol_sctp.appcnt);
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_SCTP);
+
+ spin_lock_bh(&ipvs->sctp_app_lock);
+ atomic_dec(&pd->appcnt);
list_del(&inc->p_list);
- spin_unlock_bh(&sctp_app_lock);
+ spin_unlock_bh(&ipvs->sctp_app_lock);
}
static int sctp_app_conn_bind(struct ip_vs_conn *cp)
{
+ struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp));
int hash;
struct ip_vs_app *inc;
int result = 0;
@@ -1074,12 +1066,12 @@ static int sctp_app_conn_bind(struct ip_vs_conn *cp)
/* Lookup application incarnations and bind the right one */
hash = sctp_app_hashkey(cp->vport);
- spin_lock(&sctp_app_lock);
- list_for_each_entry(inc, &sctp_apps[hash], p_list) {
+ spin_lock(&ipvs->sctp_app_lock);
+ list_for_each_entry(inc, &ipvs->sctp_apps[hash], p_list) {
if (inc->port == cp->vport) {
if (unlikely(!ip_vs_app_inc_get(inc)))
break;
- spin_unlock(&sctp_app_lock);
+ spin_unlock(&ipvs->sctp_app_lock);
IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->"
"%s:%u to app %s on port %u\n",
@@ -1095,43 +1087,50 @@ static int sctp_app_conn_bind(struct ip_vs_conn *cp)
goto out;
}
}
- spin_unlock(&sctp_app_lock);
+ spin_unlock(&ipvs->sctp_app_lock);
out:
return result;
}
-static void ip_vs_sctp_init(struct ip_vs_protocol *pp)
+/* ---------------------------------------------
+ * timeouts is netns related now.
+ * ---------------------------------------------
+ */
+static void __ip_vs_sctp_init(struct net *net, struct ip_vs_proto_data *pd)
{
- IP_VS_INIT_HASH_TABLE(sctp_apps);
- pp->timeout_table = sctp_timeouts;
-}
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ ip_vs_init_hash_table(ipvs->sctp_apps, SCTP_APP_TAB_SIZE);
+ spin_lock_init(&ipvs->sctp_app_lock);
+ pd->timeout_table = ip_vs_create_timeout_table((int *)sctp_timeouts,
+ sizeof(sctp_timeouts));
+}
-static void ip_vs_sctp_exit(struct ip_vs_protocol *pp)
+static void __ip_vs_sctp_exit(struct net *net, struct ip_vs_proto_data *pd)
{
-
+ kfree(pd->timeout_table);
}
struct ip_vs_protocol ip_vs_protocol_sctp = {
- .name = "SCTP",
- .protocol = IPPROTO_SCTP,
- .num_states = IP_VS_SCTP_S_LAST,
- .dont_defrag = 0,
- .appcnt = ATOMIC_INIT(0),
- .init = ip_vs_sctp_init,
- .exit = ip_vs_sctp_exit,
- .register_app = sctp_register_app,
+ .name = "SCTP",
+ .protocol = IPPROTO_SCTP,
+ .num_states = IP_VS_SCTP_S_LAST,
+ .dont_defrag = 0,
+ .init = NULL,
+ .exit = NULL,
+ .init_netns = __ip_vs_sctp_init,
+ .exit_netns = __ip_vs_sctp_exit,
+ .register_app = sctp_register_app,
.unregister_app = sctp_unregister_app,
- .conn_schedule = sctp_conn_schedule,
- .conn_in_get = ip_vs_conn_in_get_proto,
- .conn_out_get = ip_vs_conn_out_get_proto,
- .snat_handler = sctp_snat_handler,
- .dnat_handler = sctp_dnat_handler,
- .csum_check = sctp_csum_check,
- .state_name = sctp_state_name,
+ .conn_schedule = sctp_conn_schedule,
+ .conn_in_get = ip_vs_conn_in_get_proto,
+ .conn_out_get = ip_vs_conn_out_get_proto,
+ .snat_handler = sctp_snat_handler,
+ .dnat_handler = sctp_dnat_handler,
+ .csum_check = sctp_csum_check,
+ .state_name = sctp_state_name,
.state_transition = sctp_state_transition,
- .app_conn_bind = sctp_app_conn_bind,
- .debug_packet = ip_vs_tcpudp_debug_packet,
- .timeout_change = sctp_timeout_change,
- .set_state_timeout = sctp_set_state_timeout,
+ .app_conn_bind = sctp_app_conn_bind,
+ .debug_packet = ip_vs_tcpudp_debug_packet,
+ .timeout_change = NULL,
};
diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c
index f6c5200..c0cc341 100644
--- a/net/netfilter/ipvs/ip_vs_proto_tcp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -9,8 +9,12 @@
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
- * Changes:
+ * Changes: Hans Schillstrom <hans.schillstrom@ericsson.com>
*
+ * Network name space (netns) aware.
+ * Global data moved to netns i.e struct netns_ipvs
+ * tcp_timeouts table has copy per netns in a hash table per
+ * protocol ip_vs_proto_data and is handled by netns
*/
#define KMSG_COMPONENT "IPVS"
@@ -28,9 +32,10 @@
#include <net/ip_vs.h>
static int
-tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
+tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
int *verdict, struct ip_vs_conn **cpp)
{
+ struct net *net;
struct ip_vs_service *svc;
struct tcphdr _tcph, *th;
struct ip_vs_iphdr iph;
@@ -42,14 +47,14 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
*verdict = NF_DROP;
return 0;
}
-
+ net = skb_net(skb);
/* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */
if (th->syn &&
- (svc = ip_vs_service_get(af, skb->mark, iph.protocol, &iph.daddr,
- th->dest))) {
+ (svc = ip_vs_service_get(net, af, skb->mark, iph.protocol,
+ &iph.daddr, th->dest))) {
int ignored;
- if (ip_vs_todrop()) {
+ if (ip_vs_todrop(net_ipvs(net))) {
/*
* It seems that we are very loaded.
* We have to drop this packet :(
@@ -63,13 +68,19 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
* Let the virtual server select a real server for the
* incoming connection, and create a connection entry.
*/
- *cpp = ip_vs_schedule(svc, skb, pp, &ignored);
- if (!*cpp && !ignored) {
- *verdict = ip_vs_leave(svc, skb, pp);
+ *cpp = ip_vs_schedule(svc, skb, pd, &ignored);
+ if (!*cpp && ignored <= 0) {
+ if (!ignored)
+ *verdict = ip_vs_leave(svc, skb, pd);
+ else {
+ ip_vs_service_put(svc);
+ *verdict = NF_DROP;
+ }
return 0;
}
ip_vs_service_put(svc);
}
+ /* NF_ACCEPT */
return 1;
}
@@ -338,7 +349,7 @@ static const int tcp_state_off[IP_VS_DIR_LAST] = {
/*
* Timeout table[state]
*/
-static int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
+static const int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
[IP_VS_TCP_S_NONE] = 2*HZ,
[IP_VS_TCP_S_ESTABLISHED] = 15*60*HZ,
[IP_VS_TCP_S_SYN_SENT] = 2*60*HZ,
@@ -437,10 +448,7 @@ static struct tcp_states_t tcp_states_dos [] = {
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
};
-static struct tcp_states_t *tcp_state_table = tcp_states;
-
-
-static void tcp_timeout_change(struct ip_vs_protocol *pp, int flags)
+static void tcp_timeout_change(struct ip_vs_proto_data *pd, int flags)
{
int on = (flags & 1); /* secure_tcp */
@@ -450,14 +458,7 @@ static void tcp_timeout_change(struct ip_vs_protocol *pp, int flags)
** for most if not for all of the applications. Something
** like "capabilities" (flags) for each object.
*/
- tcp_state_table = (on? tcp_states_dos : tcp_states);
-}
-
-static int
-tcp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
-{
- return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_TCP_S_LAST,
- tcp_state_name_table, sname, to);
+ pd->tcp_state_table = (on ? tcp_states_dos : tcp_states);
}
static inline int tcp_state_idx(struct tcphdr *th)
@@ -474,7 +475,7 @@ static inline int tcp_state_idx(struct tcphdr *th)
}
static inline void
-set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
+set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,
int direction, struct tcphdr *th)
{
int state_idx;
@@ -497,7 +498,8 @@ set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
goto tcp_state_out;
}
- new_state = tcp_state_table[state_off+state_idx].next_state[cp->state];
+ new_state =
+ pd->tcp_state_table[state_off+state_idx].next_state[cp->state];
tcp_state_out:
if (new_state != cp->state) {
@@ -505,7 +507,7 @@ set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] %s:%d->"
"%s:%d state: %s->%s conn->refcnt:%d\n",
- pp->name,
+ pd->pp->name,
((state_off == TCP_DIR_OUTPUT) ?
"output " : "input "),
th->syn ? 'S' : '.',
@@ -535,17 +537,19 @@ set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
}
}
- cp->timeout = pp->timeout_table[cp->state = new_state];
+ if (likely(pd))
+ cp->timeout = pd->timeout_table[cp->state = new_state];
+ else /* What to do ? */
+ cp->timeout = tcp_timeouts[cp->state = new_state];
}
-
/*
* Handle state transitions
*/
static int
tcp_state_transition(struct ip_vs_conn *cp, int direction,
const struct sk_buff *skb,
- struct ip_vs_protocol *pp)
+ struct ip_vs_proto_data *pd)
{
struct tcphdr _tcph, *th;
@@ -560,23 +564,12 @@ tcp_state_transition(struct ip_vs_conn *cp, int direction,
return 0;
spin_lock(&cp->lock);
- set_tcp_state(pp, cp, direction, th);
+ set_tcp_state(pd, cp, direction, th);
spin_unlock(&cp->lock);
return 1;
}
-
-/*
- * Hash table for TCP application incarnations
- */
-#define TCP_APP_TAB_BITS 4
-#define TCP_APP_TAB_SIZE (1 << TCP_APP_TAB_BITS)
-#define TCP_APP_TAB_MASK (TCP_APP_TAB_SIZE - 1)
-
-static struct list_head tcp_apps[TCP_APP_TAB_SIZE];
-static DEFINE_SPINLOCK(tcp_app_lock);
-
static inline __u16 tcp_app_hashkey(__be16 port)
{
return (((__force u16)port >> TCP_APP_TAB_BITS) ^ (__force u16)port)
@@ -584,44 +577,50 @@ static inline __u16 tcp_app_hashkey(__be16 port)
}
-static int tcp_register_app(struct ip_vs_app *inc)
+static int tcp_register_app(struct net *net, struct ip_vs_app *inc)
{
struct ip_vs_app *i;
__u16 hash;
__be16 port = inc->port;
int ret = 0;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
hash = tcp_app_hashkey(port);
- spin_lock_bh(&tcp_app_lock);
- list_for_each_entry(i, &tcp_apps[hash], p_list) {
+ spin_lock_bh(&ipvs->tcp_app_lock);
+ list_for_each_entry(i, &ipvs->tcp_apps[hash], p_list) {
if (i->port == port) {
ret = -EEXIST;
goto out;
}
}
- list_add(&inc->p_list, &tcp_apps[hash]);
- atomic_inc(&ip_vs_protocol_tcp.appcnt);
+ list_add(&inc->p_list, &ipvs->tcp_apps[hash]);
+ atomic_inc(&pd->appcnt);
out:
- spin_unlock_bh(&tcp_app_lock);
+ spin_unlock_bh(&ipvs->tcp_app_lock);
return ret;
}
static void
-tcp_unregister_app(struct ip_vs_app *inc)
+tcp_unregister_app(struct net *net, struct ip_vs_app *inc)
{
- spin_lock_bh(&tcp_app_lock);
- atomic_dec(&ip_vs_protocol_tcp.appcnt);
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
+
+ spin_lock_bh(&ipvs->tcp_app_lock);
+ atomic_dec(&pd->appcnt);
list_del(&inc->p_list);
- spin_unlock_bh(&tcp_app_lock);
+ spin_unlock_bh(&ipvs->tcp_app_lock);
}
static int
tcp_app_conn_bind(struct ip_vs_conn *cp)
{
+ struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp));
int hash;
struct ip_vs_app *inc;
int result = 0;
@@ -633,12 +632,12 @@ tcp_app_conn_bind(struct ip_vs_conn *cp)
/* Lookup application incarnations and bind the right one */
hash = tcp_app_hashkey(cp->vport);
- spin_lock(&tcp_app_lock);
- list_for_each_entry(inc, &tcp_apps[hash], p_list) {
+ spin_lock(&ipvs->tcp_app_lock);
+ list_for_each_entry(inc, &ipvs->tcp_apps[hash], p_list) {
if (inc->port == cp->vport) {
if (unlikely(!ip_vs_app_inc_get(inc)))
break;
- spin_unlock(&tcp_app_lock);
+ spin_unlock(&ipvs->tcp_app_lock);
IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->"
"%s:%u to app %s on port %u\n",
@@ -655,7 +654,7 @@ tcp_app_conn_bind(struct ip_vs_conn *cp)
goto out;
}
}
- spin_unlock(&tcp_app_lock);
+ spin_unlock(&ipvs->tcp_app_lock);
out:
return result;
@@ -665,24 +664,35 @@ tcp_app_conn_bind(struct ip_vs_conn *cp)
/*
* Set LISTEN timeout. (ip_vs_conn_put will setup timer)
*/
-void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp)
+void ip_vs_tcp_conn_listen(struct net *net, struct ip_vs_conn *cp)
{
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
+
spin_lock(&cp->lock);
cp->state = IP_VS_TCP_S_LISTEN;
- cp->timeout = ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_LISTEN];
+ cp->timeout = (pd ? pd->timeout_table[IP_VS_TCP_S_LISTEN]
+ : tcp_timeouts[IP_VS_TCP_S_LISTEN]);
spin_unlock(&cp->lock);
}
-
-static void ip_vs_tcp_init(struct ip_vs_protocol *pp)
+/* ---------------------------------------------
+ * timeouts is netns related now.
+ * ---------------------------------------------
+ */
+static void __ip_vs_tcp_init(struct net *net, struct ip_vs_proto_data *pd)
{
- IP_VS_INIT_HASH_TABLE(tcp_apps);
- pp->timeout_table = tcp_timeouts;
-}
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ ip_vs_init_hash_table(ipvs->tcp_apps, TCP_APP_TAB_SIZE);
+ spin_lock_init(&ipvs->tcp_app_lock);
+ pd->timeout_table = ip_vs_create_timeout_table((int *)tcp_timeouts,
+ sizeof(tcp_timeouts));
+ pd->tcp_state_table = tcp_states;
+}
-static void ip_vs_tcp_exit(struct ip_vs_protocol *pp)
+static void __ip_vs_tcp_exit(struct net *net, struct ip_vs_proto_data *pd)
{
+ kfree(pd->timeout_table);
}
@@ -691,9 +701,10 @@ struct ip_vs_protocol ip_vs_protocol_tcp = {
.protocol = IPPROTO_TCP,
.num_states = IP_VS_TCP_S_LAST,
.dont_defrag = 0,
- .appcnt = ATOMIC_INIT(0),
- .init = ip_vs_tcp_init,
- .exit = ip_vs_tcp_exit,
+ .init = NULL,
+ .exit = NULL,
+ .init_netns = __ip_vs_tcp_init,
+ .exit_netns = __ip_vs_tcp_exit,
.register_app = tcp_register_app,
.unregister_app = tcp_unregister_app,
.conn_schedule = tcp_conn_schedule,
@@ -707,5 +718,4 @@ struct ip_vs_protocol ip_vs_protocol_tcp = {
.app_conn_bind = tcp_app_conn_bind,
.debug_packet = ip_vs_tcpudp_debug_packet,
.timeout_change = tcp_timeout_change,
- .set_state_timeout = tcp_set_state_timeout,
};
diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c
index 9d106a0..f1282cb 100644
--- a/net/netfilter/ipvs/ip_vs_proto_udp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_udp.c
@@ -9,7 +9,8 @@
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
- * Changes:
+ * Changes: Hans Schillstrom <hans.schillstrom@ericsson.com>
+ * Network name space (netns) aware.
*
*/
@@ -28,9 +29,10 @@
#include <net/ip6_checksum.h>
static int
-udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
+udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
int *verdict, struct ip_vs_conn **cpp)
{
+ struct net *net;
struct ip_vs_service *svc;
struct udphdr _udph, *uh;
struct ip_vs_iphdr iph;
@@ -42,13 +44,13 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
*verdict = NF_DROP;
return 0;
}
-
- svc = ip_vs_service_get(af, skb->mark, iph.protocol,
+ net = skb_net(skb);
+ svc = ip_vs_service_get(net, af, skb->mark, iph.protocol,
&iph.daddr, uh->dest);
if (svc) {
int ignored;
- if (ip_vs_todrop()) {
+ if (ip_vs_todrop(net_ipvs(net))) {
/*
* It seems that we are very loaded.
* We have to drop this packet :(
@@ -62,13 +64,19 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
* Let the virtual server select a real server for the
* incoming connection, and create a connection entry.
*/
- *cpp = ip_vs_schedule(svc, skb, pp, &ignored);
- if (!*cpp && !ignored) {
- *verdict = ip_vs_leave(svc, skb, pp);
+ *cpp = ip_vs_schedule(svc, skb, pd, &ignored);
+ if (!*cpp && ignored <= 0) {
+ if (!ignored)
+ *verdict = ip_vs_leave(svc, skb, pd);
+ else {
+ ip_vs_service_put(svc);
+ *verdict = NF_DROP;
+ }
return 0;
}
ip_vs_service_put(svc);
}
+ /* NF_ACCEPT */
return 1;
}
@@ -338,19 +346,6 @@ udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
return 1;
}
-
-/*
- * Note: the caller guarantees that only one of register_app,
- * unregister_app or app_conn_bind is called each time.
- */
-
-#define UDP_APP_TAB_BITS 4
-#define UDP_APP_TAB_SIZE (1 << UDP_APP_TAB_BITS)
-#define UDP_APP_TAB_MASK (UDP_APP_TAB_SIZE - 1)
-
-static struct list_head udp_apps[UDP_APP_TAB_SIZE];
-static DEFINE_SPINLOCK(udp_app_lock);
-
static inline __u16 udp_app_hashkey(__be16 port)
{
return (((__force u16)port >> UDP_APP_TAB_BITS) ^ (__force u16)port)
@@ -358,44 +353,50 @@ static inline __u16 udp_app_hashkey(__be16 port)
}
-static int udp_register_app(struct ip_vs_app *inc)
+static int udp_register_app(struct net *net, struct ip_vs_app *inc)
{
struct ip_vs_app *i;
__u16 hash;
__be16 port = inc->port;
int ret = 0;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
hash = udp_app_hashkey(port);
- spin_lock_bh(&udp_app_lock);
- list_for_each_entry(i, &udp_apps[hash], p_list) {
+ spin_lock_bh(&ipvs->udp_app_lock);
+ list_for_each_entry(i, &ipvs->udp_apps[hash], p_list) {
if (i->port == port) {
ret = -EEXIST;
goto out;
}
}
- list_add(&inc->p_list, &udp_apps[hash]);
- atomic_inc(&ip_vs_protocol_udp.appcnt);
+ list_add(&inc->p_list, &ipvs->udp_apps[hash]);
+ atomic_inc(&pd->appcnt);
out:
- spin_unlock_bh(&udp_app_lock);
+ spin_unlock_bh(&ipvs->udp_app_lock);
return ret;
}
static void
-udp_unregister_app(struct ip_vs_app *inc)
+udp_unregister_app(struct net *net, struct ip_vs_app *inc)
{
- spin_lock_bh(&udp_app_lock);
- atomic_dec(&ip_vs_protocol_udp.appcnt);
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ spin_lock_bh(&ipvs->udp_app_lock);
+ atomic_dec(&pd->appcnt);
list_del(&inc->p_list);
- spin_unlock_bh(&udp_app_lock);
+ spin_unlock_bh(&ipvs->udp_app_lock);
}
static int udp_app_conn_bind(struct ip_vs_conn *cp)
{
+ struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp));
int hash;
struct ip_vs_app *inc;
int result = 0;
@@ -407,12 +408,12 @@ static int udp_app_conn_bind(struct ip_vs_conn *cp)
/* Lookup application incarnations and bind the right one */
hash = udp_app_hashkey(cp->vport);
- spin_lock(&udp_app_lock);
- list_for_each_entry(inc, &udp_apps[hash], p_list) {
+ spin_lock(&ipvs->udp_app_lock);
+ list_for_each_entry(inc, &ipvs->udp_apps[hash], p_list) {
if (inc->port == cp->vport) {
if (unlikely(!ip_vs_app_inc_get(inc)))
break;
- spin_unlock(&udp_app_lock);
+ spin_unlock(&ipvs->udp_app_lock);
IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->"
"%s:%u to app %s on port %u\n",
@@ -429,14 +430,14 @@ static int udp_app_conn_bind(struct ip_vs_conn *cp)
goto out;
}
}
- spin_unlock(&udp_app_lock);
+ spin_unlock(&ipvs->udp_app_lock);
out:
return result;
}
-static int udp_timeouts[IP_VS_UDP_S_LAST+1] = {
+static const int udp_timeouts[IP_VS_UDP_S_LAST+1] = {
[IP_VS_UDP_S_NORMAL] = 5*60*HZ,
[IP_VS_UDP_S_LAST] = 2*HZ,
};
@@ -446,14 +447,6 @@ static const char *const udp_state_name_table[IP_VS_UDP_S_LAST+1] = {
[IP_VS_UDP_S_LAST] = "BUG!",
};
-
-static int
-udp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
-{
- return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_UDP_S_LAST,
- udp_state_name_table, sname, to);
-}
-
static const char * udp_state_name(int state)
{
if (state >= IP_VS_UDP_S_LAST)
@@ -464,20 +457,30 @@ static const char * udp_state_name(int state)
static int
udp_state_transition(struct ip_vs_conn *cp, int direction,
const struct sk_buff *skb,
- struct ip_vs_protocol *pp)
+ struct ip_vs_proto_data *pd)
{
- cp->timeout = pp->timeout_table[IP_VS_UDP_S_NORMAL];
+ if (unlikely(!pd)) {
+ pr_err("UDP no ns data\n");
+ return 0;
+ }
+
+ cp->timeout = pd->timeout_table[IP_VS_UDP_S_NORMAL];
return 1;
}
-static void udp_init(struct ip_vs_protocol *pp)
+static void __udp_init(struct net *net, struct ip_vs_proto_data *pd)
{
- IP_VS_INIT_HASH_TABLE(udp_apps);
- pp->timeout_table = udp_timeouts;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ ip_vs_init_hash_table(ipvs->udp_apps, UDP_APP_TAB_SIZE);
+ spin_lock_init(&ipvs->udp_app_lock);
+ pd->timeout_table = ip_vs_create_timeout_table((int *)udp_timeouts,
+ sizeof(udp_timeouts));
}
-static void udp_exit(struct ip_vs_protocol *pp)
+static void __udp_exit(struct net *net, struct ip_vs_proto_data *pd)
{
+ kfree(pd->timeout_table);
}
@@ -486,8 +489,10 @@ struct ip_vs_protocol ip_vs_protocol_udp = {
.protocol = IPPROTO_UDP,
.num_states = IP_VS_UDP_S_LAST,
.dont_defrag = 0,
- .init = udp_init,
- .exit = udp_exit,
+ .init = NULL,
+ .exit = NULL,
+ .init_netns = __udp_init,
+ .exit_netns = __udp_exit,
.conn_schedule = udp_conn_schedule,
.conn_in_get = ip_vs_conn_in_get_proto,
.conn_out_get = ip_vs_conn_out_get_proto,
@@ -501,5 +506,4 @@ struct ip_vs_protocol ip_vs_protocol_udp = {
.app_conn_bind = udp_app_conn_bind,
.debug_packet = ip_vs_tcpudp_debug_packet,
.timeout_change = NULL,
- .set_state_timeout = udp_set_state_timeout,
};
diff --git a/net/netfilter/ipvs/ip_vs_rr.c b/net/netfilter/ipvs/ip_vs_rr.c
index e210f37..c49b388 100644
--- a/net/netfilter/ipvs/ip_vs_rr.c
+++ b/net/netfilter/ipvs/ip_vs_rr.c
@@ -72,7 +72,7 @@ ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
q = q->next;
} while (q != p);
write_unlock(&svc->sched_lock);
- IP_VS_ERR_RL("RR: no destination available\n");
+ ip_vs_scheduler_err(svc, "no destination available");
return NULL;
out:
diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c
index 076ebe0..08dbdd5 100644
--- a/net/netfilter/ipvs/ip_vs_sched.c
+++ b/net/netfilter/ipvs/ip_vs_sched.c
@@ -29,6 +29,7 @@
#include <net/ip_vs.h>
+EXPORT_SYMBOL(ip_vs_scheduler_err);
/*
* IPVS scheduler list
*/
@@ -146,6 +147,30 @@ void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler)
module_put(scheduler->module);
}
+/*
+ * Common error output helper for schedulers
+ */
+
+void ip_vs_scheduler_err(struct ip_vs_service *svc, const char *msg)
+{
+ if (svc->fwmark) {
+ IP_VS_ERR_RL("%s: FWM %u 0x%08X - %s\n",
+ svc->scheduler->name, svc->fwmark,
+ svc->fwmark, msg);
+#ifdef CONFIG_IP_VS_IPV6
+ } else if (svc->af == AF_INET6) {
+ IP_VS_ERR_RL("%s: %s [%pI6]:%d - %s\n",
+ svc->scheduler->name,
+ ip_vs_proto_name(svc->protocol),
+ &svc->addr.in6, ntohs(svc->port), msg);
+#endif
+ } else {
+ IP_VS_ERR_RL("%s: %s %pI4:%d - %s\n",
+ svc->scheduler->name,
+ ip_vs_proto_name(svc->protocol),
+ &svc->addr.ip, ntohs(svc->port), msg);
+ }
+}
/*
* Register a scheduler in the scheduler list
diff --git a/net/netfilter/ipvs/ip_vs_sed.c b/net/netfilter/ipvs/ip_vs_sed.c
index 1ab75a9..89ead24 100644
--- a/net/netfilter/ipvs/ip_vs_sed.c
+++ b/net/netfilter/ipvs/ip_vs_sed.c
@@ -87,7 +87,7 @@ ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
goto nextstage;
}
}
- IP_VS_ERR_RL("SED: no destination available\n");
+ ip_vs_scheduler_err(svc, "no destination available");
return NULL;
/*
diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c
index e6cc174..b5e2556 100644
--- a/net/netfilter/ipvs/ip_vs_sh.c
+++ b/net/netfilter/ipvs/ip_vs_sh.c
@@ -223,7 +223,7 @@ ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
|| !(dest->flags & IP_VS_DEST_F_AVAILABLE)
|| atomic_read(&dest->weight) <= 0
|| is_overloaded(dest)) {
- IP_VS_ERR_RL("SH: no destination available\n");
+ ip_vs_scheduler_err(svc, "no destination available");
return NULL;
}
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index ab85aed..3e7961e 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -5,6 +5,18 @@
* high-performance and highly available server based on a
* cluster of servers.
*
+ * Version 1, is capable of handling both version 0 and 1 messages.
+ * Version 0 is the plain old format.
+ * Note Version 0 receivers will just drop Ver 1 messages.
+ * Version 1 is capable of handle IPv6, Persistence data,
+ * time-outs, and firewall marks.
+ * In ver.1 "ip_vs_sync_conn_options" will be sent in netw. order.
+ * Ver. 0 can be turned on by sysctl -w net.ipv4.vs.sync_version=0
+ *
+ * Definitions Message: is a complete datagram
+ * Sync_conn: is a part of a Message
+ * Param Data is an option to a Sync_conn.
+ *
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
*
* ip_vs_sync: sync connection info from master load balancer to backups
@@ -15,6 +27,8 @@
* Alexandre Cassen : Added SyncID support for incoming sync
* messages filtering.
* Justin Ossevoort : Fix endian problem on sync message size.
+ * Hans Schillstrom : Added Version 1: i.e. IPv6,
+ * Persistence support, fwmark and time-out.
*/
#define KMSG_COMPONENT "IPVS"
@@ -35,6 +49,8 @@
#include <linux/wait.h>
#include <linux/kernel.h>
+#include <asm/unaligned.h> /* Used for ntoh_seq and hton_seq */
+
#include <net/ip.h>
#include <net/sock.h>
@@ -43,11 +59,13 @@
#define IP_VS_SYNC_GROUP 0xe0000051 /* multicast addr - 224.0.0.81 */
#define IP_VS_SYNC_PORT 8848 /* multicast port */
+#define SYNC_PROTO_VER 1 /* Protocol version in header */
/*
* IPVS sync connection entry
+ * Version 0, i.e. original version.
*/
-struct ip_vs_sync_conn {
+struct ip_vs_sync_conn_v0 {
__u8 reserved;
/* Protocol, addresses and port numbers */
@@ -71,41 +89,159 @@ struct ip_vs_sync_conn_options {
struct ip_vs_seq out_seq; /* outgoing seq. struct */
};
+/*
+ Sync Connection format (sync_conn)
+
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Type | Protocol | Ver. | Size |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Flags |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | State | cport |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | vport | dport |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | fwmark |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | timeout (in sec.) |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | ... |
+ | IP-Addresses (v4 or v6) |
+ | ... |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ Optional Parameters.
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Param. Type | Param. Length | Param. data |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |
+ | ... |
+ | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | | Param Type | Param. Length |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Param data |
+ | Last Param data should be padded for 32 bit alignment |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*/
+
+/*
+ * Type 0, IPv4 sync connection format
+ */
+struct ip_vs_sync_v4 {
+ __u8 type;
+ __u8 protocol; /* Which protocol (TCP/UDP) */
+ __be16 ver_size; /* Version msb 4 bits */
+ /* Flags and state transition */
+ __be32 flags; /* status flags */
+ __be16 state; /* state info */
+ /* Protocol, addresses and port numbers */
+ __be16 cport;
+ __be16 vport;
+ __be16 dport;
+ __be32 fwmark; /* Firewall mark from skb */
+ __be32 timeout; /* cp timeout */
+ __be32 caddr; /* client address */
+ __be32 vaddr; /* virtual address */
+ __be32 daddr; /* destination address */
+ /* The sequence options start here */
+ /* PE data padded to 32bit alignment after seq. options */
+};
+/*
+ * Type 2 messages IPv6
+ */
+struct ip_vs_sync_v6 {
+ __u8 type;
+ __u8 protocol; /* Which protocol (TCP/UDP) */
+ __be16 ver_size; /* Version msb 4 bits */
+ /* Flags and state transition */
+ __be32 flags; /* status flags */
+ __be16 state; /* state info */
+ /* Protocol, addresses and port numbers */
+ __be16 cport;
+ __be16 vport;
+ __be16 dport;
+ __be32 fwmark; /* Firewall mark from skb */
+ __be32 timeout; /* cp timeout */
+ struct in6_addr caddr; /* client address */
+ struct in6_addr vaddr; /* virtual address */
+ struct in6_addr daddr; /* destination address */
+ /* The sequence options start here */
+ /* PE data padded to 32bit alignment after seq. options */
+};
+
+union ip_vs_sync_conn {
+ struct ip_vs_sync_v4 v4;
+ struct ip_vs_sync_v6 v6;
+};
+
+/* Bits in Type field in above */
+#define STYPE_INET6 0
+#define STYPE_F_INET6 (1 << STYPE_INET6)
+
+#define SVER_SHIFT 12 /* Shift to get version */
+#define SVER_MASK 0x0fff /* Mask to strip version */
+
+#define IPVS_OPT_SEQ_DATA 1
+#define IPVS_OPT_PE_DATA 2
+#define IPVS_OPT_PE_NAME 3
+#define IPVS_OPT_PARAM 7
+
+#define IPVS_OPT_F_SEQ_DATA (1 << (IPVS_OPT_SEQ_DATA-1))
+#define IPVS_OPT_F_PE_DATA (1 << (IPVS_OPT_PE_DATA-1))
+#define IPVS_OPT_F_PE_NAME (1 << (IPVS_OPT_PE_NAME-1))
+#define IPVS_OPT_F_PARAM (1 << (IPVS_OPT_PARAM-1))
+
struct ip_vs_sync_thread_data {
+ struct net *net;
struct socket *sock;
char *buf;
};
-#define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn))
+/* Version 0 definition of packet sizes */
+#define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn_v0))
#define FULL_CONN_SIZE \
-(sizeof(struct ip_vs_sync_conn) + sizeof(struct ip_vs_sync_conn_options))
+(sizeof(struct ip_vs_sync_conn_v0) + sizeof(struct ip_vs_sync_conn_options))
/*
- The master mulitcasts messages to the backup load balancers in the
- following format.
+ The master mulitcasts messages (Datagrams) to the backup load balancers
+ in the following format.
+
+ Version 1:
+ Note, first byte should be Zero, so ver 0 receivers will drop the packet.
0 1 2 3
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
- | Count Conns | SyncID | Size |
+ | 0 | SyncID | Size |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Count Conns | Version | Reserved, set to Zero |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
| |
| IPVS Sync Connection (1) |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
| . |
- | . |
+ ~ . ~
| . |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
| |
| IPVS Sync Connection (n) |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+ Version 0 Header
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Count Conns | SyncID | Size |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | IPVS Sync Connection (1) |
*/
#define SYNC_MESG_HEADER_LEN 4
#define MAX_CONNS_PER_SYNCBUFF 255 /* nr_conns in ip_vs_sync_mesg is 8 bit */
-struct ip_vs_sync_mesg {
+/* Version 0 header */
+struct ip_vs_sync_mesg_v0 {
__u8 nr_conns;
__u8 syncid;
__u16 size;
@@ -113,9 +249,16 @@ struct ip_vs_sync_mesg {
/* ip_vs_sync_conn entries start here */
};
-/* the maximum length of sync (sending/receiving) message */
-static int sync_send_mesg_maxlen;
-static int sync_recv_mesg_maxlen;
+/* Version 1 header */
+struct ip_vs_sync_mesg {
+ __u8 reserved; /* must be zero */
+ __u8 syncid;
+ __u16 size;
+ __u8 nr_conns;
+ __s8 version; /* SYNC_PROTO_VER */
+ __u16 spare;
+ /* ip_vs_sync_conn entries start here */
+};
struct ip_vs_sync_buff {
struct list_head list;
@@ -127,28 +270,6 @@ struct ip_vs_sync_buff {
unsigned char *end;
};
-
-/* the sync_buff list head and the lock */
-static LIST_HEAD(ip_vs_sync_queue);
-static DEFINE_SPINLOCK(ip_vs_sync_lock);
-
-/* current sync_buff for accepting new conn entries */
-static struct ip_vs_sync_buff *curr_sb = NULL;
-static DEFINE_SPINLOCK(curr_sb_lock);
-
-/* ipvs sync daemon state */
-volatile int ip_vs_sync_state = IP_VS_STATE_NONE;
-volatile int ip_vs_master_syncid = 0;
-volatile int ip_vs_backup_syncid = 0;
-
-/* multicast interface name */
-char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN];
-char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN];
-
-/* sync daemon tasks */
-static struct task_struct *sync_master_thread;
-static struct task_struct *sync_backup_thread;
-
/* multicast addr */
static struct sockaddr_in mcast_addr = {
.sin_family = AF_INET,
@@ -156,41 +277,71 @@ static struct sockaddr_in mcast_addr = {
.sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP),
};
+/*
+ * Copy of struct ip_vs_seq
+ * From unaligned network order to aligned host order
+ */
+static void ntoh_seq(struct ip_vs_seq *no, struct ip_vs_seq *ho)
+{
+ ho->init_seq = get_unaligned_be32(&no->init_seq);
+ ho->delta = get_unaligned_be32(&no->delta);
+ ho->previous_delta = get_unaligned_be32(&no->previous_delta);
+}
+
+/*
+ * Copy of struct ip_vs_seq
+ * From Aligned host order to unaligned network order
+ */
+static void hton_seq(struct ip_vs_seq *ho, struct ip_vs_seq *no)
+{
+ put_unaligned_be32(ho->init_seq, &no->init_seq);
+ put_unaligned_be32(ho->delta, &no->delta);
+ put_unaligned_be32(ho->previous_delta, &no->previous_delta);
+}
-static inline struct ip_vs_sync_buff *sb_dequeue(void)
+static inline struct ip_vs_sync_buff *sb_dequeue(struct netns_ipvs *ipvs)
{
struct ip_vs_sync_buff *sb;
- spin_lock_bh(&ip_vs_sync_lock);
- if (list_empty(&ip_vs_sync_queue)) {
+ spin_lock_bh(&ipvs->sync_lock);
+ if (list_empty(&ipvs->sync_queue)) {
sb = NULL;
} else {
- sb = list_entry(ip_vs_sync_queue.next,
+ sb = list_entry(ipvs->sync_queue.next,
struct ip_vs_sync_buff,
list);
list_del(&sb->list);
}
- spin_unlock_bh(&ip_vs_sync_lock);
+ spin_unlock_bh(&ipvs->sync_lock);
return sb;
}
-static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create(void)
+/*
+ * Create a new sync buffer for Version 1 proto.
+ */
+static inline struct ip_vs_sync_buff *
+ip_vs_sync_buff_create(struct netns_ipvs *ipvs)
{
struct ip_vs_sync_buff *sb;
if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
return NULL;
- if (!(sb->mesg=kmalloc(sync_send_mesg_maxlen, GFP_ATOMIC))) {
+ sb->mesg = kmalloc(ipvs->send_mesg_maxlen, GFP_ATOMIC);
+ if (!sb->mesg) {
kfree(sb);
return NULL;
}
+ sb->mesg->reserved = 0; /* old nr_conns i.e. must be zeo now */
+ sb->mesg->version = SYNC_PROTO_VER;
+ sb->mesg->syncid = ipvs->master_syncid;
+ sb->mesg->size = sizeof(struct ip_vs_sync_mesg);
sb->mesg->nr_conns = 0;
- sb->mesg->syncid = ip_vs_master_syncid;
- sb->mesg->size = 4;
- sb->head = (unsigned char *)sb->mesg + 4;
- sb->end = (unsigned char *)sb->mesg + sync_send_mesg_maxlen;
+ sb->mesg->spare = 0;
+ sb->head = (unsigned char *)sb->mesg + sizeof(struct ip_vs_sync_mesg);
+ sb->end = (unsigned char *)sb->mesg + ipvs->send_mesg_maxlen;
+
sb->firstuse = jiffies;
return sb;
}
@@ -201,14 +352,16 @@ static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb)
kfree(sb);
}
-static inline void sb_queue_tail(struct ip_vs_sync_buff *sb)
+static inline void sb_queue_tail(struct netns_ipvs *ipvs)
{
- spin_lock(&ip_vs_sync_lock);
- if (ip_vs_sync_state & IP_VS_STATE_MASTER)
- list_add_tail(&sb->list, &ip_vs_sync_queue);
+ struct ip_vs_sync_buff *sb = ipvs->sync_buff;
+
+ spin_lock(&ipvs->sync_lock);
+ if (ipvs->sync_state & IP_VS_STATE_MASTER)
+ list_add_tail(&sb->list, &ipvs->sync_queue);
else
ip_vs_sync_buff_release(sb);
- spin_unlock(&ip_vs_sync_lock);
+ spin_unlock(&ipvs->sync_lock);
}
/*
@@ -216,36 +369,101 @@ static inline void sb_queue_tail(struct ip_vs_sync_buff *sb)
* than the specified time or the specified time is zero.
*/
static inline struct ip_vs_sync_buff *
-get_curr_sync_buff(unsigned long time)
+get_curr_sync_buff(struct netns_ipvs *ipvs, unsigned long time)
{
struct ip_vs_sync_buff *sb;
- spin_lock_bh(&curr_sb_lock);
- if (curr_sb && (time == 0 ||
- time_before(jiffies - curr_sb->firstuse, time))) {
- sb = curr_sb;
- curr_sb = NULL;
+ spin_lock_bh(&ipvs->sync_buff_lock);
+ if (ipvs->sync_buff &&
+ time_after_eq(jiffies - ipvs->sync_buff->firstuse, time)) {
+ sb = ipvs->sync_buff;
+ ipvs->sync_buff = NULL;
} else
sb = NULL;
- spin_unlock_bh(&curr_sb_lock);
+ spin_unlock_bh(&ipvs->sync_buff_lock);
return sb;
}
+/*
+ * Switch mode from sending version 0 or 1
+ * - must handle sync_buf
+ */
+void ip_vs_sync_switch_mode(struct net *net, int mode)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ if (!(ipvs->sync_state & IP_VS_STATE_MASTER))
+ return;
+ if (mode == sysctl_sync_ver(ipvs) || !ipvs->sync_buff)
+ return;
+
+ spin_lock_bh(&ipvs->sync_buff_lock);
+ /* Buffer empty ? then let buf_create do the job */
+ if (ipvs->sync_buff->mesg->size <= sizeof(struct ip_vs_sync_mesg)) {
+ kfree(ipvs->sync_buff);
+ ipvs->sync_buff = NULL;
+ } else {
+ spin_lock_bh(&ipvs->sync_lock);
+ if (ipvs->sync_state & IP_VS_STATE_MASTER)
+ list_add_tail(&ipvs->sync_buff->list,
+ &ipvs->sync_queue);
+ else
+ ip_vs_sync_buff_release(ipvs->sync_buff);
+ spin_unlock_bh(&ipvs->sync_lock);
+ }
+ spin_unlock_bh(&ipvs->sync_buff_lock);
+}
/*
+ * Create a new sync buffer for Version 0 proto.
+ */
+static inline struct ip_vs_sync_buff *
+ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs)
+{
+ struct ip_vs_sync_buff *sb;
+ struct ip_vs_sync_mesg_v0 *mesg;
+
+ if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
+ return NULL;
+
+ sb->mesg = kmalloc(ipvs->send_mesg_maxlen, GFP_ATOMIC);
+ if (!sb->mesg) {
+ kfree(sb);
+ return NULL;
+ }
+ mesg = (struct ip_vs_sync_mesg_v0 *)sb->mesg;
+ mesg->nr_conns = 0;
+ mesg->syncid = ipvs->master_syncid;
+ mesg->size = sizeof(struct ip_vs_sync_mesg_v0);
+ sb->head = (unsigned char *)mesg + sizeof(struct ip_vs_sync_mesg_v0);
+ sb->end = (unsigned char *)mesg + ipvs->send_mesg_maxlen;
+ sb->firstuse = jiffies;
+ return sb;
+}
+
+/*
+ * Version 0 , could be switched in by sys_ctl.
* Add an ip_vs_conn information into the current sync_buff.
- * Called by ip_vs_in.
*/
-void ip_vs_sync_conn(struct ip_vs_conn *cp)
+void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp)
{
- struct ip_vs_sync_mesg *m;
- struct ip_vs_sync_conn *s;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_sync_mesg_v0 *m;
+ struct ip_vs_sync_conn_v0 *s;
int len;
- spin_lock(&curr_sb_lock);
- if (!curr_sb) {
- if (!(curr_sb=ip_vs_sync_buff_create())) {
- spin_unlock(&curr_sb_lock);
+ if (unlikely(cp->af != AF_INET))
+ return;
+ /* Do not sync ONE PACKET */
+ if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
+ return;
+
+ spin_lock(&ipvs->sync_buff_lock);
+ if (!ipvs->sync_buff) {
+ ipvs->sync_buff =
+ ip_vs_sync_buff_create_v0(ipvs);
+ if (!ipvs->sync_buff) {
+ spin_unlock(&ipvs->sync_buff_lock);
pr_err("ip_vs_sync_buff_create failed.\n");
return;
}
@@ -253,10 +471,11 @@ void ip_vs_sync_conn(struct ip_vs_conn *cp)
len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE :
SIMPLE_CONN_SIZE;
- m = curr_sb->mesg;
- s = (struct ip_vs_sync_conn *)curr_sb->head;
+ m = (struct ip_vs_sync_mesg_v0 *)ipvs->sync_buff->mesg;
+ s = (struct ip_vs_sync_conn_v0 *)ipvs->sync_buff->head;
/* copy members */
+ s->reserved = 0;
s->protocol = cp->protocol;
s->cport = cp->cport;
s->vport = cp->vport;
@@ -274,83 +493,365 @@ void ip_vs_sync_conn(struct ip_vs_conn *cp)
m->nr_conns++;
m->size += len;
- curr_sb->head += len;
+ ipvs->sync_buff->head += len;
/* check if there is a space for next one */
- if (curr_sb->head+FULL_CONN_SIZE > curr_sb->end) {
- sb_queue_tail(curr_sb);
- curr_sb = NULL;
+ if (ipvs->sync_buff->head + FULL_CONN_SIZE > ipvs->sync_buff->end) {
+ sb_queue_tail(ipvs);
+ ipvs->sync_buff = NULL;
}
- spin_unlock(&curr_sb_lock);
+ spin_unlock(&ipvs->sync_buff_lock);
/* synchronize its controller if it has */
if (cp->control)
- ip_vs_sync_conn(cp->control);
+ ip_vs_sync_conn(net, cp->control);
+}
+
+/*
+ * Add an ip_vs_conn information into the current sync_buff.
+ * Called by ip_vs_in.
+ * Sending Version 1 messages
+ */
+void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_sync_mesg *m;
+ union ip_vs_sync_conn *s;
+ __u8 *p;
+ unsigned int len, pe_name_len, pad;
+
+ /* Handle old version of the protocol */
+ if (sysctl_sync_ver(ipvs) == 0) {
+ ip_vs_sync_conn_v0(net, cp);
+ return;
+ }
+ /* Do not sync ONE PACKET */
+ if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
+ goto control;
+sloop:
+ /* Sanity checks */
+ pe_name_len = 0;
+ if (cp->pe_data_len) {
+ if (!cp->pe_data || !cp->dest) {
+ IP_VS_ERR_RL("SYNC, connection pe_data invalid\n");
+ return;
+ }
+ pe_name_len = strnlen(cp->pe->name, IP_VS_PENAME_MAXLEN);
+ }
+
+ spin_lock(&ipvs->sync_buff_lock);
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (cp->af == AF_INET6)
+ len = sizeof(struct ip_vs_sync_v6);
+ else
+#endif
+ len = sizeof(struct ip_vs_sync_v4);
+
+ if (cp->flags & IP_VS_CONN_F_SEQ_MASK)
+ len += sizeof(struct ip_vs_sync_conn_options) + 2;
+
+ if (cp->pe_data_len)
+ len += cp->pe_data_len + 2; /* + Param hdr field */
+ if (pe_name_len)
+ len += pe_name_len + 2;
+
+ /* check if there is a space for this one */
+ pad = 0;
+ if (ipvs->sync_buff) {
+ pad = (4 - (size_t)ipvs->sync_buff->head) & 3;
+ if (ipvs->sync_buff->head + len + pad > ipvs->sync_buff->end) {
+ sb_queue_tail(ipvs);
+ ipvs->sync_buff = NULL;
+ pad = 0;
+ }
+ }
+
+ if (!ipvs->sync_buff) {
+ ipvs->sync_buff = ip_vs_sync_buff_create(ipvs);
+ if (!ipvs->sync_buff) {
+ spin_unlock(&ipvs->sync_buff_lock);
+ pr_err("ip_vs_sync_buff_create failed.\n");
+ return;
+ }
+ }
+
+ m = ipvs->sync_buff->mesg;
+ p = ipvs->sync_buff->head;
+ ipvs->sync_buff->head += pad + len;
+ m->size += pad + len;
+ /* Add ev. padding from prev. sync_conn */
+ while (pad--)
+ *(p++) = 0;
+
+ s = (union ip_vs_sync_conn *)p;
+
+ /* Set message type & copy members */
+ s->v4.type = (cp->af == AF_INET6 ? STYPE_F_INET6 : 0);
+ s->v4.ver_size = htons(len & SVER_MASK); /* Version 0 */
+ s->v4.flags = htonl(cp->flags & ~IP_VS_CONN_F_HASHED);
+ s->v4.state = htons(cp->state);
+ s->v4.protocol = cp->protocol;
+ s->v4.cport = cp->cport;
+ s->v4.vport = cp->vport;
+ s->v4.dport = cp->dport;
+ s->v4.fwmark = htonl(cp->fwmark);
+ s->v4.timeout = htonl(cp->timeout / HZ);
+ m->nr_conns++;
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (cp->af == AF_INET6) {
+ p += sizeof(struct ip_vs_sync_v6);
+ ipv6_addr_copy(&s->v6.caddr, &cp->caddr.in6);
+ ipv6_addr_copy(&s->v6.vaddr, &cp->vaddr.in6);
+ ipv6_addr_copy(&s->v6.daddr, &cp->daddr.in6);
+ } else
+#endif
+ {
+ p += sizeof(struct ip_vs_sync_v4); /* options ptr */
+ s->v4.caddr = cp->caddr.ip;
+ s->v4.vaddr = cp->vaddr.ip;
+ s->v4.daddr = cp->daddr.ip;
+ }
+ if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
+ *(p++) = IPVS_OPT_SEQ_DATA;
+ *(p++) = sizeof(struct ip_vs_sync_conn_options);
+ hton_seq((struct ip_vs_seq *)p, &cp->in_seq);
+ p += sizeof(struct ip_vs_seq);
+ hton_seq((struct ip_vs_seq *)p, &cp->out_seq);
+ p += sizeof(struct ip_vs_seq);
+ }
+ /* Handle pe data */
+ if (cp->pe_data_len && cp->pe_data) {
+ *(p++) = IPVS_OPT_PE_DATA;
+ *(p++) = cp->pe_data_len;
+ memcpy(p, cp->pe_data, cp->pe_data_len);
+ p += cp->pe_data_len;
+ if (pe_name_len) {
+ /* Add PE_NAME */
+ *(p++) = IPVS_OPT_PE_NAME;
+ *(p++) = pe_name_len;
+ memcpy(p, cp->pe->name, pe_name_len);
+ p += pe_name_len;
+ }
+ }
+
+ spin_unlock(&ipvs->sync_buff_lock);
+
+control:
+ /* synchronize its controller if it has */
+ cp = cp->control;
+ if (!cp)
+ return;
+ /*
+ * Reduce sync rate for templates
+ * i.e only increment in_pkts for Templates.
+ */
+ if (cp->flags & IP_VS_CONN_F_TEMPLATE) {
+ int pkts = atomic_add_return(1, &cp->in_pkts);
+
+ if (pkts % sysctl_sync_period(ipvs) != 1)
+ return;
+ }
+ goto sloop;
}
+/*
+ * fill_param used by version 1
+ */
static inline int
-ip_vs_conn_fill_param_sync(int af, int protocol,
- const union nf_inet_addr *caddr, __be16 cport,
- const union nf_inet_addr *vaddr, __be16 vport,
- struct ip_vs_conn_param *p)
+ip_vs_conn_fill_param_sync(struct net *net, int af, union ip_vs_sync_conn *sc,
+ struct ip_vs_conn_param *p,
+ __u8 *pe_data, unsigned int pe_data_len,
+ __u8 *pe_name, unsigned int pe_name_len)
{
- /* XXX: Need to take into account persistence engine */
- ip_vs_conn_fill_param(af, protocol, caddr, cport, vaddr, vport, p);
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6)
+ ip_vs_conn_fill_param(net, af, sc->v6.protocol,
+ (const union nf_inet_addr *)&sc->v6.caddr,
+ sc->v6.cport,
+ (const union nf_inet_addr *)&sc->v6.vaddr,
+ sc->v6.vport, p);
+ else
+#endif
+ ip_vs_conn_fill_param(net, af, sc->v4.protocol,
+ (const union nf_inet_addr *)&sc->v4.caddr,
+ sc->v4.cport,
+ (const union nf_inet_addr *)&sc->v4.vaddr,
+ sc->v4.vport, p);
+ /* Handle pe data */
+ if (pe_data_len) {
+ if (pe_name_len) {
+ char buff[IP_VS_PENAME_MAXLEN+1];
+
+ memcpy(buff, pe_name, pe_name_len);
+ buff[pe_name_len]=0;
+ p->pe = __ip_vs_pe_getbyname(buff);
+ if (!p->pe) {
+ IP_VS_DBG(3, "BACKUP, no %s engine found/loaded\n",
+ buff);
+ return 1;
+ }
+ } else {
+ IP_VS_ERR_RL("BACKUP, Invalid PE parameters\n");
+ return 1;
+ }
+
+ p->pe_data = kmemdup(pe_data, pe_data_len, GFP_ATOMIC);
+ if (!p->pe_data) {
+ if (p->pe->module)
+ module_put(p->pe->module);
+ return -ENOMEM;
+ }
+ p->pe_data_len = pe_data_len;
+ }
return 0;
}
/*
- * Process received multicast message and create the corresponding
- * ip_vs_conn entries.
+ * Connection Add / Update.
+ * Common for version 0 and 1 reception of backup sync_conns.
+ * Param: ...
+ * timeout is in sec.
*/
-static void ip_vs_process_message(const char *buffer, const size_t buflen)
+static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param,
+ unsigned int flags, unsigned int state,
+ unsigned int protocol, unsigned int type,
+ const union nf_inet_addr *daddr, __be16 dport,
+ unsigned long timeout, __u32 fwmark,
+ struct ip_vs_sync_conn_options *opt)
{
- struct ip_vs_sync_mesg *m = (struct ip_vs_sync_mesg *)buffer;
- struct ip_vs_sync_conn *s;
- struct ip_vs_sync_conn_options *opt;
- struct ip_vs_conn *cp;
- struct ip_vs_protocol *pp;
struct ip_vs_dest *dest;
- struct ip_vs_conn_param param;
- char *p;
- int i;
+ struct ip_vs_conn *cp;
+ struct netns_ipvs *ipvs = net_ipvs(net);
- if (buflen < sizeof(struct ip_vs_sync_mesg)) {
- IP_VS_ERR_RL("sync message header too short\n");
- return;
- }
+ if (!(flags & IP_VS_CONN_F_TEMPLATE))
+ cp = ip_vs_conn_in_get(param);
+ else
+ cp = ip_vs_ct_in_get(param);
- /* Convert size back to host byte order */
- m->size = ntohs(m->size);
+ if (cp && param->pe_data) /* Free pe_data */
+ kfree(param->pe_data);
+ if (!cp) {
+ /*
+ * Find the appropriate destination for the connection.
+ * If it is not found the connection will remain unbound
+ * but still handled.
+ */
+ dest = ip_vs_find_dest(net, type, daddr, dport, param->vaddr,
+ param->vport, protocol, fwmark);
- if (buflen != m->size) {
- IP_VS_ERR_RL("bogus sync message size\n");
- return;
+ /* Set the approprite ativity flag */
+ if (protocol == IPPROTO_TCP) {
+ if (state != IP_VS_TCP_S_ESTABLISHED)
+ flags |= IP_VS_CONN_F_INACTIVE;
+ else
+ flags &= ~IP_VS_CONN_F_INACTIVE;
+ } else if (protocol == IPPROTO_SCTP) {
+ if (state != IP_VS_SCTP_S_ESTABLISHED)
+ flags |= IP_VS_CONN_F_INACTIVE;
+ else
+ flags &= ~IP_VS_CONN_F_INACTIVE;
+ }
+ cp = ip_vs_conn_new(param, daddr, dport, flags, dest, fwmark);
+ if (dest)
+ atomic_dec(&dest->refcnt);
+ if (!cp) {
+ if (param->pe_data)
+ kfree(param->pe_data);
+ IP_VS_DBG(2, "BACKUP, add new conn. failed\n");
+ return;
+ }
+ } else if (!cp->dest) {
+ dest = ip_vs_try_bind_dest(cp);
+ if (dest)
+ atomic_dec(&dest->refcnt);
+ } else if ((cp->dest) && (cp->protocol == IPPROTO_TCP) &&
+ (cp->state != state)) {
+ /* update active/inactive flag for the connection */
+ dest = cp->dest;
+ if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
+ (state != IP_VS_TCP_S_ESTABLISHED)) {
+ atomic_dec(&dest->activeconns);
+ atomic_inc(&dest->inactconns);
+ cp->flags |= IP_VS_CONN_F_INACTIVE;
+ } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
+ (state == IP_VS_TCP_S_ESTABLISHED)) {
+ atomic_inc(&dest->activeconns);
+ atomic_dec(&dest->inactconns);
+ cp->flags &= ~IP_VS_CONN_F_INACTIVE;
+ }
+ } else if ((cp->dest) && (cp->protocol == IPPROTO_SCTP) &&
+ (cp->state != state)) {
+ dest = cp->dest;
+ if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
+ (state != IP_VS_SCTP_S_ESTABLISHED)) {
+ atomic_dec(&dest->activeconns);
+ atomic_inc(&dest->inactconns);
+ cp->flags &= ~IP_VS_CONN_F_INACTIVE;
+ }
}
- /* SyncID sanity check */
- if (ip_vs_backup_syncid != 0 && m->syncid != ip_vs_backup_syncid) {
- IP_VS_DBG(7, "Ignoring incoming msg with syncid = %d\n",
- m->syncid);
- return;
+ if (opt)
+ memcpy(&cp->in_seq, opt, sizeof(*opt));
+ atomic_set(&cp->in_pkts, sysctl_sync_threshold(ipvs));
+ cp->state = state;
+ cp->old_state = cp->state;
+ /*
+ * For Ver 0 messages style
+ * - Not possible to recover the right timeout for templates
+ * - can not find the right fwmark
+ * virtual service. If needed, we can do it for
+ * non-fwmark persistent services.
+ * Ver 1 messages style.
+ * - No problem.
+ */
+ if (timeout) {
+ if (timeout > MAX_SCHEDULE_TIMEOUT / HZ)
+ timeout = MAX_SCHEDULE_TIMEOUT / HZ;
+ cp->timeout = timeout*HZ;
+ } else {
+ struct ip_vs_proto_data *pd;
+
+ pd = ip_vs_proto_data_get(net, protocol);
+ if (!(flags & IP_VS_CONN_F_TEMPLATE) && pd && pd->timeout_table)
+ cp->timeout = pd->timeout_table[state];
+ else
+ cp->timeout = (3*60*HZ);
}
+ ip_vs_conn_put(cp);
+}
- p = (char *)buffer + sizeof(struct ip_vs_sync_mesg);
+/*
+ * Process received multicast message for Version 0
+ */
+static void ip_vs_process_message_v0(struct net *net, const char *buffer,
+ const size_t buflen)
+{
+ struct ip_vs_sync_mesg_v0 *m = (struct ip_vs_sync_mesg_v0 *)buffer;
+ struct ip_vs_sync_conn_v0 *s;
+ struct ip_vs_sync_conn_options *opt;
+ struct ip_vs_protocol *pp;
+ struct ip_vs_conn_param param;
+ char *p;
+ int i;
+
+ p = (char *)buffer + sizeof(struct ip_vs_sync_mesg_v0);
for (i=0; i<m->nr_conns; i++) {
unsigned flags, state;
if (p + SIMPLE_CONN_SIZE > buffer+buflen) {
- IP_VS_ERR_RL("bogus conn in sync message\n");
+ IP_VS_ERR_RL("BACKUP v0, bogus conn\n");
return;
}
- s = (struct ip_vs_sync_conn *) p;
+ s = (struct ip_vs_sync_conn_v0 *) p;
flags = ntohs(s->flags) | IP_VS_CONN_F_SYNC;
flags &= ~IP_VS_CONN_F_HASHED;
if (flags & IP_VS_CONN_F_SEQ_MASK) {
opt = (struct ip_vs_sync_conn_options *)&s[1];
p += FULL_CONN_SIZE;
if (p > buffer+buflen) {
- IP_VS_ERR_RL("bogus conn options in sync message\n");
+ IP_VS_ERR_RL("BACKUP v0, Dropping buffer bogus conn options\n");
return;
}
} else {
@@ -362,118 +863,286 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen)
if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
pp = ip_vs_proto_get(s->protocol);
if (!pp) {
- IP_VS_ERR_RL("Unsupported protocol %u in sync msg\n",
+ IP_VS_DBG(2, "BACKUP v0, Unsupported protocol %u\n",
s->protocol);
continue;
}
if (state >= pp->num_states) {
- IP_VS_DBG(2, "Invalid %s state %u in sync msg\n",
+ IP_VS_DBG(2, "BACKUP v0, Invalid %s state %u\n",
pp->name, state);
continue;
}
} else {
/* protocol in templates is not used for state/timeout */
- pp = NULL;
if (state > 0) {
- IP_VS_DBG(2, "Invalid template state %u in sync msg\n",
+ IP_VS_DBG(2, "BACKUP v0, Invalid template state %u\n",
state);
state = 0;
}
}
- {
- if (ip_vs_conn_fill_param_sync(AF_INET, s->protocol,
- (union nf_inet_addr *)&s->caddr,
- s->cport,
- (union nf_inet_addr *)&s->vaddr,
- s->vport, &param)) {
- pr_err("ip_vs_conn_fill_param_sync failed");
- return;
+ ip_vs_conn_fill_param(net, AF_INET, s->protocol,
+ (const union nf_inet_addr *)&s->caddr,
+ s->cport,
+ (const union nf_inet_addr *)&s->vaddr,
+ s->vport, &param);
+
+ /* Send timeout as Zero */
+ ip_vs_proc_conn(net, &param, flags, state, s->protocol, AF_INET,
+ (union nf_inet_addr *)&s->daddr, s->dport,
+ 0, 0, opt);
+ }
+}
+
+/*
+ * Handle options
+ */
+static inline int ip_vs_proc_seqopt(__u8 *p, unsigned int plen,
+ __u32 *opt_flags,
+ struct ip_vs_sync_conn_options *opt)
+{
+ struct ip_vs_sync_conn_options *topt;
+
+ topt = (struct ip_vs_sync_conn_options *)p;
+
+ if (plen != sizeof(struct ip_vs_sync_conn_options)) {
+ IP_VS_DBG(2, "BACKUP, bogus conn options length\n");
+ return -EINVAL;
+ }
+ if (*opt_flags & IPVS_OPT_F_SEQ_DATA) {
+ IP_VS_DBG(2, "BACKUP, conn options found twice\n");
+ return -EINVAL;
+ }
+ ntoh_seq(&topt->in_seq, &opt->in_seq);
+ ntoh_seq(&topt->out_seq, &opt->out_seq);
+ *opt_flags |= IPVS_OPT_F_SEQ_DATA;
+ return 0;
+}
+
+static int ip_vs_proc_str(__u8 *p, unsigned int plen, unsigned int *data_len,
+ __u8 **data, unsigned int maxlen,
+ __u32 *opt_flags, __u32 flag)
+{
+ if (plen > maxlen) {
+ IP_VS_DBG(2, "BACKUP, bogus par.data len > %d\n", maxlen);
+ return -EINVAL;
+ }
+ if (*opt_flags & flag) {
+ IP_VS_DBG(2, "BACKUP, Par.data found twice 0x%x\n", flag);
+ return -EINVAL;
+ }
+ *data_len = plen;
+ *data = p;
+ *opt_flags |= flag;
+ return 0;
+}
+/*
+ * Process a Version 1 sync. connection
+ */
+static inline int ip_vs_proc_sync_conn(struct net *net, __u8 *p, __u8 *msg_end)
+{
+ struct ip_vs_sync_conn_options opt;
+ union ip_vs_sync_conn *s;
+ struct ip_vs_protocol *pp;
+ struct ip_vs_conn_param param;
+ __u32 flags;
+ unsigned int af, state, pe_data_len=0, pe_name_len=0;
+ __u8 *pe_data=NULL, *pe_name=NULL;
+ __u32 opt_flags=0;
+ int retc=0;
+
+ s = (union ip_vs_sync_conn *) p;
+
+ if (s->v6.type & STYPE_F_INET6) {
+#ifdef CONFIG_IP_VS_IPV6
+ af = AF_INET6;
+ p += sizeof(struct ip_vs_sync_v6);
+#else
+ IP_VS_DBG(3,"BACKUP, IPv6 msg received, and IPVS is not compiled for IPv6\n");
+ retc = 10;
+ goto out;
+#endif
+ } else if (!s->v4.type) {
+ af = AF_INET;
+ p += sizeof(struct ip_vs_sync_v4);
+ } else {
+ return -10;
+ }
+ if (p > msg_end)
+ return -20;
+
+ /* Process optional params check Type & Len. */
+ while (p < msg_end) {
+ int ptype;
+ int plen;
+
+ if (p+2 > msg_end)
+ return -30;
+ ptype = *(p++);
+ plen = *(p++);
+
+ if (!plen || ((p + plen) > msg_end))
+ return -40;
+ /* Handle seq option p = param data */
+ switch (ptype & ~IPVS_OPT_F_PARAM) {
+ case IPVS_OPT_SEQ_DATA:
+ if (ip_vs_proc_seqopt(p, plen, &opt_flags, &opt))
+ return -50;
+ break;
+
+ case IPVS_OPT_PE_DATA:
+ if (ip_vs_proc_str(p, plen, &pe_data_len, &pe_data,
+ IP_VS_PEDATA_MAXLEN, &opt_flags,
+ IPVS_OPT_F_PE_DATA))
+ return -60;
+ break;
+
+ case IPVS_OPT_PE_NAME:
+ if (ip_vs_proc_str(p, plen,&pe_name_len, &pe_name,
+ IP_VS_PENAME_MAXLEN, &opt_flags,
+ IPVS_OPT_F_PE_NAME))
+ return -70;
+ break;
+
+ default:
+ /* Param data mandatory ? */
+ if (!(ptype & IPVS_OPT_F_PARAM)) {
+ IP_VS_DBG(3, "BACKUP, Unknown mandatory param %d found\n",
+ ptype & ~IPVS_OPT_F_PARAM);
+ retc = 20;
+ goto out;
}
- if (!(flags & IP_VS_CONN_F_TEMPLATE))
- cp = ip_vs_conn_in_get(&param);
- else
- cp = ip_vs_ct_in_get(&param);
}
- if (!cp) {
- /*
- * Find the appropriate destination for the connection.
- * If it is not found the connection will remain unbound
- * but still handled.
- */
- dest = ip_vs_find_dest(AF_INET,
- (union nf_inet_addr *)&s->daddr,
- s->dport,
- (union nf_inet_addr *)&s->vaddr,
- s->vport,
- s->protocol);
- /* Set the approprite ativity flag */
- if (s->protocol == IPPROTO_TCP) {
- if (state != IP_VS_TCP_S_ESTABLISHED)
- flags |= IP_VS_CONN_F_INACTIVE;
- else
- flags &= ~IP_VS_CONN_F_INACTIVE;
- } else if (s->protocol == IPPROTO_SCTP) {
- if (state != IP_VS_SCTP_S_ESTABLISHED)
- flags |= IP_VS_CONN_F_INACTIVE;
- else
- flags &= ~IP_VS_CONN_F_INACTIVE;
+ p += plen; /* Next option */
+ }
+
+ /* Get flags and Mask off unsupported */
+ flags = ntohl(s->v4.flags) & IP_VS_CONN_F_BACKUP_MASK;
+ flags |= IP_VS_CONN_F_SYNC;
+ state = ntohs(s->v4.state);
+
+ if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
+ pp = ip_vs_proto_get(s->v4.protocol);
+ if (!pp) {
+ IP_VS_DBG(3,"BACKUP, Unsupported protocol %u\n",
+ s->v4.protocol);
+ retc = 30;
+ goto out;
+ }
+ if (state >= pp->num_states) {
+ IP_VS_DBG(3, "BACKUP, Invalid %s state %u\n",
+ pp->name, state);
+ retc = 40;
+ goto out;
+ }
+ } else {
+ /* protocol in templates is not used for state/timeout */
+ if (state > 0) {
+ IP_VS_DBG(3, "BACKUP, Invalid template state %u\n",
+ state);
+ state = 0;
+ }
+ }
+ if (ip_vs_conn_fill_param_sync(net, af, s, &param, pe_data,
+ pe_data_len, pe_name, pe_name_len)) {
+ retc = 50;
+ goto out;
+ }
+ /* If only IPv4, just silent skip IPv6 */
+ if (af == AF_INET)
+ ip_vs_proc_conn(net, &param, flags, state, s->v4.protocol, af,
+ (union nf_inet_addr *)&s->v4.daddr, s->v4.dport,
+ ntohl(s->v4.timeout), ntohl(s->v4.fwmark),
+ (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL)
+ );
+#ifdef CONFIG_IP_VS_IPV6
+ else
+ ip_vs_proc_conn(net, &param, flags, state, s->v6.protocol, af,
+ (union nf_inet_addr *)&s->v6.daddr, s->v6.dport,
+ ntohl(s->v6.timeout), ntohl(s->v6.fwmark),
+ (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL)
+ );
+#endif
+ return 0;
+ /* Error exit */
+out:
+ IP_VS_DBG(2, "BACKUP, Single msg dropped err:%d\n", retc);
+ return retc;
+
+}
+/*
+ * Process received multicast message and create the corresponding
+ * ip_vs_conn entries.
+ * Handles Version 0 & 1
+ */
+static void ip_vs_process_message(struct net *net, __u8 *buffer,
+ const size_t buflen)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_sync_mesg *m2 = (struct ip_vs_sync_mesg *)buffer;
+ __u8 *p, *msg_end;
+ int i, nr_conns;
+
+ if (buflen < sizeof(struct ip_vs_sync_mesg_v0)) {
+ IP_VS_DBG(2, "BACKUP, message header too short\n");
+ return;
+ }
+ /* Convert size back to host byte order */
+ m2->size = ntohs(m2->size);
+
+ if (buflen != m2->size) {
+ IP_VS_DBG(2, "BACKUP, bogus message size\n");
+ return;
+ }
+ /* SyncID sanity check */
+ if (ipvs->backup_syncid != 0 && m2->syncid != ipvs->backup_syncid) {
+ IP_VS_DBG(7, "BACKUP, Ignoring syncid = %d\n", m2->syncid);
+ return;
+ }
+ /* Handle version 1 message */
+ if ((m2->version == SYNC_PROTO_VER) && (m2->reserved == 0)
+ && (m2->spare == 0)) {
+
+ msg_end = buffer + sizeof(struct ip_vs_sync_mesg);
+ nr_conns = m2->nr_conns;
+
+ for (i=0; i<nr_conns; i++) {
+ union ip_vs_sync_conn *s;
+ unsigned size;
+ int retc;
+
+ p = msg_end;
+ if (p + sizeof(s->v4) > buffer+buflen) {
+ IP_VS_ERR_RL("BACKUP, Dropping buffer, to small\n");
+ return;
}
- cp = ip_vs_conn_new(&param,
- (union nf_inet_addr *)&s->daddr,
- s->dport, flags, dest);
- if (dest)
- atomic_dec(&dest->refcnt);
- if (!cp) {
- pr_err("ip_vs_conn_new failed\n");
+ s = (union ip_vs_sync_conn *)p;
+ size = ntohs(s->v4.ver_size) & SVER_MASK;
+ msg_end = p + size;
+ /* Basic sanity checks */
+ if (msg_end > buffer+buflen) {
+ IP_VS_ERR_RL("BACKUP, Dropping buffer, msg > buffer\n");
return;
}
- } else if (!cp->dest) {
- dest = ip_vs_try_bind_dest(cp);
- if (dest)
- atomic_dec(&dest->refcnt);
- } else if ((cp->dest) && (cp->protocol == IPPROTO_TCP) &&
- (cp->state != state)) {
- /* update active/inactive flag for the connection */
- dest = cp->dest;
- if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
- (state != IP_VS_TCP_S_ESTABLISHED)) {
- atomic_dec(&dest->activeconns);
- atomic_inc(&dest->inactconns);
- cp->flags |= IP_VS_CONN_F_INACTIVE;
- } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
- (state == IP_VS_TCP_S_ESTABLISHED)) {
- atomic_inc(&dest->activeconns);
- atomic_dec(&dest->inactconns);
- cp->flags &= ~IP_VS_CONN_F_INACTIVE;
+ if (ntohs(s->v4.ver_size) >> SVER_SHIFT) {
+ IP_VS_ERR_RL("BACKUP, Dropping buffer, Unknown version %d\n",
+ ntohs(s->v4.ver_size) >> SVER_SHIFT);
+ return;
}
- } else if ((cp->dest) && (cp->protocol == IPPROTO_SCTP) &&
- (cp->state != state)) {
- dest = cp->dest;
- if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
- (state != IP_VS_SCTP_S_ESTABLISHED)) {
- atomic_dec(&dest->activeconns);
- atomic_inc(&dest->inactconns);
- cp->flags &= ~IP_VS_CONN_F_INACTIVE;
+ /* Process a single sync_conn */
+ retc = ip_vs_proc_sync_conn(net, p, msg_end);
+ if (retc < 0) {
+ IP_VS_ERR_RL("BACKUP, Dropping buffer, Err: %d in decoding\n",
+ retc);
+ return;
}
+ /* Make sure we have 32 bit alignment */
+ msg_end = p + ((size + 3) & ~3);
}
-
- if (opt)
- memcpy(&cp->in_seq, opt, sizeof(*opt));
- atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]);
- cp->state = state;
- cp->old_state = cp->state;
- /*
- * We can not recover the right timeout for templates
- * in all cases, we can not find the right fwmark
- * virtual service. If needed, we can do it for
- * non-fwmark persistent services.
- */
- if (!(flags & IP_VS_CONN_F_TEMPLATE) && pp->timeout_table)
- cp->timeout = pp->timeout_table[state];
- else
- cp->timeout = (3*60*HZ);
- ip_vs_conn_put(cp);
+ } else {
+ /* Old type of message */
+ ip_vs_process_message_v0(net, buffer, buflen);
+ return;
}
}
@@ -511,8 +1180,10 @@ static int set_mcast_if(struct sock *sk, char *ifname)
{
struct net_device *dev;
struct inet_sock *inet = inet_sk(sk);
+ struct net *net = sock_net(sk);
- if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL)
+ dev = __dev_get_by_name(net, ifname);
+ if (!dev)
return -ENODEV;
if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
@@ -531,30 +1202,33 @@ static int set_mcast_if(struct sock *sk, char *ifname)
* Set the maximum length of sync message according to the
* specified interface's MTU.
*/
-static int set_sync_mesg_maxlen(int sync_state)
+static int set_sync_mesg_maxlen(struct net *net, int sync_state)
{
+ struct netns_ipvs *ipvs = net_ipvs(net);
struct net_device *dev;
int num;
if (sync_state == IP_VS_STATE_MASTER) {
- if ((dev = __dev_get_by_name(&init_net, ip_vs_master_mcast_ifn)) == NULL)
+ dev = __dev_get_by_name(net, ipvs->master_mcast_ifn);
+ if (!dev)
return -ENODEV;
num = (dev->mtu - sizeof(struct iphdr) -
sizeof(struct udphdr) -
SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE;
- sync_send_mesg_maxlen = SYNC_MESG_HEADER_LEN +
+ ipvs->send_mesg_maxlen = SYNC_MESG_HEADER_LEN +
SIMPLE_CONN_SIZE * min(num, MAX_CONNS_PER_SYNCBUFF);
IP_VS_DBG(7, "setting the maximum length of sync sending "
- "message %d.\n", sync_send_mesg_maxlen);
+ "message %d.\n", ipvs->send_mesg_maxlen);
} else if (sync_state == IP_VS_STATE_BACKUP) {
- if ((dev = __dev_get_by_name(&init_net, ip_vs_backup_mcast_ifn)) == NULL)
+ dev = __dev_get_by_name(net, ipvs->backup_mcast_ifn);
+ if (!dev)
return -ENODEV;
- sync_recv_mesg_maxlen = dev->mtu -
+ ipvs->recv_mesg_maxlen = dev->mtu -
sizeof(struct iphdr) - sizeof(struct udphdr);
IP_VS_DBG(7, "setting the maximum length of sync receiving "
- "message %d.\n", sync_recv_mesg_maxlen);
+ "message %d.\n", ipvs->recv_mesg_maxlen);
}
return 0;
@@ -569,6 +1243,7 @@ static int set_sync_mesg_maxlen(int sync_state)
static int
join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
{
+ struct net *net = sock_net(sk);
struct ip_mreqn mreq;
struct net_device *dev;
int ret;
@@ -576,7 +1251,8 @@ join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
memset(&mreq, 0, sizeof(mreq));
memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr));
- if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL)
+ dev = __dev_get_by_name(net, ifname);
+ if (!dev)
return -ENODEV;
if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
return -EINVAL;
@@ -593,11 +1269,13 @@ join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
static int bind_mcastif_addr(struct socket *sock, char *ifname)
{
+ struct net *net = sock_net(sock->sk);
struct net_device *dev;
__be32 addr;
struct sockaddr_in sin;
- if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL)
+ dev = __dev_get_by_name(net, ifname);
+ if (!dev)
return -ENODEV;
addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
@@ -619,19 +1297,20 @@ static int bind_mcastif_addr(struct socket *sock, char *ifname)
/*
* Set up sending multicast socket over UDP
*/
-static struct socket * make_send_sock(void)
+static struct socket *make_send_sock(struct net *net)
{
+ struct netns_ipvs *ipvs = net_ipvs(net);
struct socket *sock;
int result;
/* First create a socket */
- result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
+ result = __sock_create(net, PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock, 1);
if (result < 0) {
pr_err("Error during creation of socket; terminating\n");
return ERR_PTR(result);
}
- result = set_mcast_if(sock->sk, ip_vs_master_mcast_ifn);
+ result = set_mcast_if(sock->sk, ipvs->master_mcast_ifn);
if (result < 0) {
pr_err("Error setting outbound mcast interface\n");
goto error;
@@ -640,7 +1319,7 @@ static struct socket * make_send_sock(void)
set_mcast_loop(sock->sk, 0);
set_mcast_ttl(sock->sk, 1);
- result = bind_mcastif_addr(sock, ip_vs_master_mcast_ifn);
+ result = bind_mcastif_addr(sock, ipvs->master_mcast_ifn);
if (result < 0) {
pr_err("Error binding address of the mcast interface\n");
goto error;
@@ -664,13 +1343,14 @@ static struct socket * make_send_sock(void)
/*
* Set up receiving multicast socket over UDP
*/
-static struct socket * make_receive_sock(void)
+static struct socket *make_receive_sock(struct net *net)
{
+ struct netns_ipvs *ipvs = net_ipvs(net);
struct socket *sock;
int result;
/* First create a socket */
- result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
+ result = __sock_create(net, PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock, 1);
if (result < 0) {
pr_err("Error during creation of socket; terminating\n");
return ERR_PTR(result);
@@ -689,7 +1369,7 @@ static struct socket * make_receive_sock(void)
/* join the multicast group */
result = join_mcast_group(sock->sk,
(struct in_addr *) &mcast_addr.sin_addr,
- ip_vs_backup_mcast_ifn);
+ ipvs->backup_mcast_ifn);
if (result < 0) {
pr_err("Error joining to the multicast group\n");
goto error;
@@ -760,20 +1440,21 @@ ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen)
static int sync_thread_master(void *data)
{
struct ip_vs_sync_thread_data *tinfo = data;
+ struct netns_ipvs *ipvs = net_ipvs(tinfo->net);
struct ip_vs_sync_buff *sb;
pr_info("sync thread started: state = MASTER, mcast_ifn = %s, "
"syncid = %d\n",
- ip_vs_master_mcast_ifn, ip_vs_master_syncid);
+ ipvs->master_mcast_ifn, ipvs->master_syncid);
while (!kthread_should_stop()) {
- while ((sb = sb_dequeue())) {
+ while ((sb = sb_dequeue(ipvs))) {
ip_vs_send_sync_msg(tinfo->sock, sb->mesg);
ip_vs_sync_buff_release(sb);
}
- /* check if entries stay in curr_sb for 2 seconds */
- sb = get_curr_sync_buff(2 * HZ);
+ /* check if entries stay in ipvs->sync_buff for 2 seconds */
+ sb = get_curr_sync_buff(ipvs, 2 * HZ);
if (sb) {
ip_vs_send_sync_msg(tinfo->sock, sb->mesg);
ip_vs_sync_buff_release(sb);
@@ -783,14 +1464,13 @@ static int sync_thread_master(void *data)
}
/* clean up the sync_buff queue */
- while ((sb=sb_dequeue())) {
+ while ((sb = sb_dequeue(ipvs)))
ip_vs_sync_buff_release(sb);
- }
/* clean up the current sync_buff */
- if ((sb = get_curr_sync_buff(0))) {
+ sb = get_curr_sync_buff(ipvs, 0);
+ if (sb)
ip_vs_sync_buff_release(sb);
- }
/* release the sending multicast socket */
sock_release(tinfo->sock);
@@ -803,11 +1483,12 @@ static int sync_thread_master(void *data)
static int sync_thread_backup(void *data)
{
struct ip_vs_sync_thread_data *tinfo = data;
+ struct netns_ipvs *ipvs = net_ipvs(tinfo->net);
int len;
pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, "
"syncid = %d\n",
- ip_vs_backup_mcast_ifn, ip_vs_backup_syncid);
+ ipvs->backup_mcast_ifn, ipvs->backup_syncid);
while (!kthread_should_stop()) {
wait_event_interruptible(*sk_sleep(tinfo->sock->sk),
@@ -817,7 +1498,7 @@ static int sync_thread_backup(void *data)
/* do we have data now? */
while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) {
len = ip_vs_receive(tinfo->sock, tinfo->buf,
- sync_recv_mesg_maxlen);
+ ipvs->recv_mesg_maxlen);
if (len <= 0) {
pr_err("receiving message error\n");
break;
@@ -826,7 +1507,7 @@ static int sync_thread_backup(void *data)
/* disable bottom half, because it accesses the data
shared by softirq while getting/creating conns */
local_bh_disable();
- ip_vs_process_message(tinfo->buf, len);
+ ip_vs_process_message(tinfo->net, tinfo->buf, len);
local_bh_enable();
}
}
@@ -840,41 +1521,42 @@ static int sync_thread_backup(void *data)
}
-int start_sync_thread(int state, char *mcast_ifn, __u8 syncid)
+int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid)
{
struct ip_vs_sync_thread_data *tinfo;
struct task_struct **realtask, *task;
struct socket *sock;
+ struct netns_ipvs *ipvs = net_ipvs(net);
char *name, *buf = NULL;
int (*threadfn)(void *data);
int result = -ENOMEM;
IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current));
IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n",
- sizeof(struct ip_vs_sync_conn));
+ sizeof(struct ip_vs_sync_conn_v0));
if (state == IP_VS_STATE_MASTER) {
- if (sync_master_thread)
+ if (ipvs->master_thread)
return -EEXIST;
- strlcpy(ip_vs_master_mcast_ifn, mcast_ifn,
- sizeof(ip_vs_master_mcast_ifn));
- ip_vs_master_syncid = syncid;
- realtask = &sync_master_thread;
- name = "ipvs_syncmaster";
+ strlcpy(ipvs->master_mcast_ifn, mcast_ifn,
+ sizeof(ipvs->master_mcast_ifn));
+ ipvs->master_syncid = syncid;
+ realtask = &ipvs->master_thread;
+ name = "ipvs_master:%d";
threadfn = sync_thread_master;
- sock = make_send_sock();
+ sock = make_send_sock(net);
} else if (state == IP_VS_STATE_BACKUP) {
- if (sync_backup_thread)
+ if (ipvs->backup_thread)
return -EEXIST;
- strlcpy(ip_vs_backup_mcast_ifn, mcast_ifn,
- sizeof(ip_vs_backup_mcast_ifn));
- ip_vs_backup_syncid = syncid;
- realtask = &sync_backup_thread;
- name = "ipvs_syncbackup";
+ strlcpy(ipvs->backup_mcast_ifn, mcast_ifn,
+ sizeof(ipvs->backup_mcast_ifn));
+ ipvs->backup_syncid = syncid;
+ realtask = &ipvs->backup_thread;
+ name = "ipvs_backup:%d";
threadfn = sync_thread_backup;
- sock = make_receive_sock();
+ sock = make_receive_sock(net);
} else {
return -EINVAL;
}
@@ -884,9 +1566,9 @@ int start_sync_thread(int state, char *mcast_ifn, __u8 syncid)
goto out;
}
- set_sync_mesg_maxlen(state);
+ set_sync_mesg_maxlen(net, state);
if (state == IP_VS_STATE_BACKUP) {
- buf = kmalloc(sync_recv_mesg_maxlen, GFP_KERNEL);
+ buf = kmalloc(ipvs->recv_mesg_maxlen, GFP_KERNEL);
if (!buf)
goto outsocket;
}
@@ -895,10 +1577,11 @@ int start_sync_thread(int state, char *mcast_ifn, __u8 syncid)
if (!tinfo)
goto outbuf;
+ tinfo->net = net;
tinfo->sock = sock;
tinfo->buf = buf;
- task = kthread_run(threadfn, tinfo, name);
+ task = kthread_run(threadfn, tinfo, name, ipvs->gen);
if (IS_ERR(task)) {
result = PTR_ERR(task);
goto outtinfo;
@@ -906,7 +1589,7 @@ int start_sync_thread(int state, char *mcast_ifn, __u8 syncid)
/* mark as active */
*realtask = task;
- ip_vs_sync_state |= state;
+ ipvs->sync_state |= state;
/* increase the module use count */
ip_vs_use_count_inc();
@@ -924,16 +1607,18 @@ out:
}
-int stop_sync_thread(int state)
+int stop_sync_thread(struct net *net, int state)
{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current));
if (state == IP_VS_STATE_MASTER) {
- if (!sync_master_thread)
+ if (!ipvs->master_thread)
return -ESRCH;
pr_info("stopping master sync thread %d ...\n",
- task_pid_nr(sync_master_thread));
+ task_pid_nr(ipvs->master_thread));
/*
* The lock synchronizes with sb_queue_tail(), so that we don't
@@ -941,21 +1626,21 @@ int stop_sync_thread(int state)
* progress of stopping the master sync daemon.
*/
- spin_lock_bh(&ip_vs_sync_lock);
- ip_vs_sync_state &= ~IP_VS_STATE_MASTER;
- spin_unlock_bh(&ip_vs_sync_lock);
- kthread_stop(sync_master_thread);
- sync_master_thread = NULL;
+ spin_lock_bh(&ipvs->sync_lock);
+ ipvs->sync_state &= ~IP_VS_STATE_MASTER;
+ spin_unlock_bh(&ipvs->sync_lock);
+ kthread_stop(ipvs->master_thread);
+ ipvs->master_thread = NULL;
} else if (state == IP_VS_STATE_BACKUP) {
- if (!sync_backup_thread)
+ if (!ipvs->backup_thread)
return -ESRCH;
pr_info("stopping backup sync thread %d ...\n",
- task_pid_nr(sync_backup_thread));
+ task_pid_nr(ipvs->backup_thread));
- ip_vs_sync_state &= ~IP_VS_STATE_BACKUP;
- kthread_stop(sync_backup_thread);
- sync_backup_thread = NULL;
+ ipvs->sync_state &= ~IP_VS_STATE_BACKUP;
+ kthread_stop(ipvs->backup_thread);
+ ipvs->backup_thread = NULL;
} else {
return -EINVAL;
}
@@ -965,3 +1650,42 @@ int stop_sync_thread(int state)
return 0;
}
+
+/*
+ * Initialize data struct for each netns
+ */
+static int __net_init __ip_vs_sync_init(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ INIT_LIST_HEAD(&ipvs->sync_queue);
+ spin_lock_init(&ipvs->sync_lock);
+ spin_lock_init(&ipvs->sync_buff_lock);
+
+ ipvs->sync_mcast_addr.sin_family = AF_INET;
+ ipvs->sync_mcast_addr.sin_port = cpu_to_be16(IP_VS_SYNC_PORT);
+ ipvs->sync_mcast_addr.sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP);
+ return 0;
+}
+
+static void __ip_vs_sync_cleanup(struct net *net)
+{
+ stop_sync_thread(net, IP_VS_STATE_MASTER);
+ stop_sync_thread(net, IP_VS_STATE_BACKUP);
+}
+
+static struct pernet_operations ipvs_sync_ops = {
+ .init = __ip_vs_sync_init,
+ .exit = __ip_vs_sync_cleanup,
+};
+
+
+int __init ip_vs_sync_init(void)
+{
+ return register_pernet_subsys(&ipvs_sync_ops);
+}
+
+void ip_vs_sync_cleanup(void)
+{
+ unregister_pernet_subsys(&ipvs_sync_ops);
+}
diff --git a/net/netfilter/ipvs/ip_vs_wlc.c b/net/netfilter/ipvs/ip_vs_wlc.c
index bbddfdb..bc1bfc4 100644
--- a/net/netfilter/ipvs/ip_vs_wlc.c
+++ b/net/netfilter/ipvs/ip_vs_wlc.c
@@ -27,22 +27,6 @@
#include <net/ip_vs.h>
-
-static inline unsigned int
-ip_vs_wlc_dest_overhead(struct ip_vs_dest *dest)
-{
- /*
- * We think the overhead of processing active connections is 256
- * times higher than that of inactive connections in average. (This
- * 256 times might not be accurate, we will change it later) We
- * use the following formula to estimate the overhead now:
- * dest->activeconns*256 + dest->inactconns
- */
- return (atomic_read(&dest->activeconns) << 8) +
- atomic_read(&dest->inactconns);
-}
-
-
/*
* Weighted Least Connection scheduling
*/
@@ -71,11 +55,11 @@ ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
atomic_read(&dest->weight) > 0) {
least = dest;
- loh = ip_vs_wlc_dest_overhead(least);
+ loh = ip_vs_dest_conn_overhead(least);
goto nextstage;
}
}
- IP_VS_ERR_RL("WLC: no destination available\n");
+ ip_vs_scheduler_err(svc, "no destination available");
return NULL;
/*
@@ -85,7 +69,7 @@ ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
list_for_each_entry_continue(dest, &svc->destinations, n_list) {
if (dest->flags & IP_VS_DEST_F_OVERLOAD)
continue;
- doh = ip_vs_wlc_dest_overhead(dest);
+ doh = ip_vs_dest_conn_overhead(dest);
if (loh * atomic_read(&dest->weight) >
doh * atomic_read(&least->weight)) {
least = dest;
diff --git a/net/netfilter/ipvs/ip_vs_wrr.c b/net/netfilter/ipvs/ip_vs_wrr.c
index 30db633..1ef41f5 100644
--- a/net/netfilter/ipvs/ip_vs_wrr.c
+++ b/net/netfilter/ipvs/ip_vs_wrr.c
@@ -147,8 +147,9 @@ ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
if (mark->cl == mark->cl->next) {
/* no dest entry */
- IP_VS_ERR_RL("WRR: no destination available: "
- "no destinations present\n");
+ ip_vs_scheduler_err(svc,
+ "no destination available: "
+ "no destinations present");
dest = NULL;
goto out;
}
@@ -162,8 +163,8 @@ ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
*/
if (mark->cw == 0) {
mark->cl = &svc->destinations;
- IP_VS_ERR_RL("WRR: no destination "
- "available\n");
+ ip_vs_scheduler_err(svc,
+ "no destination available");
dest = NULL;
goto out;
}
@@ -185,8 +186,9 @@ ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
/* back to the start, and no dest is found.
It is only possible when all dests are OVERLOADED */
dest = NULL;
- IP_VS_ERR_RL("WRR: no destination available: "
- "all destinations are overloaded\n");
+ ip_vs_scheduler_err(svc,
+ "no destination available: "
+ "all destinations are overloaded");
goto out;
}
}
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 5325a3f..6132b21 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -43,6 +43,13 @@
#include <net/ip_vs.h>
+enum {
+ IP_VS_RT_MODE_LOCAL = 1, /* Allow local dest */
+ IP_VS_RT_MODE_NON_LOCAL = 2, /* Allow non-local dest */
+ IP_VS_RT_MODE_RDR = 4, /* Allow redirect from remote daddr to
+ * local
+ */
+};
/*
* Destination cache to speed up outgoing route lookup
@@ -77,11 +84,7 @@ __ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos)
return dst;
}
-/*
- * Get route to destination or remote server
- * rt_mode: flags, &1=Allow local dest, &2=Allow non-local dest,
- * &4=Allow redirect from remote daddr to local
- */
+/* Get route to destination or remote server */
static struct rtable *
__ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest,
__be32 daddr, u32 rtos, int rt_mode)
@@ -95,12 +98,8 @@ __ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest,
spin_lock(&dest->dst_lock);
if (!(rt = (struct rtable *)
__ip_vs_dst_check(dest, rtos))) {
- struct flowi fl = {
- .fl4_dst = dest->addr.ip,
- .fl4_tos = rtos,
- };
-
- if (ip_route_output_key(net, &rt, &fl)) {
+ rt = ip_route_output(net, dest->addr.ip, 0, rtos, 0);
+ if (IS_ERR(rt)) {
spin_unlock(&dest->dst_lock);
IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n",
&dest->addr.ip);
@@ -113,12 +112,8 @@ __ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest,
}
spin_unlock(&dest->dst_lock);
} else {
- struct flowi fl = {
- .fl4_dst = daddr,
- .fl4_tos = rtos,
- };
-
- if (ip_route_output_key(net, &rt, &fl)) {
+ rt = ip_route_output(net, daddr, 0, rtos, 0);
+ if (IS_ERR(rt)) {
IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n",
&daddr);
return NULL;
@@ -126,15 +121,16 @@ __ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest,
}
local = rt->rt_flags & RTCF_LOCAL;
- if (!((local ? 1 : 2) & rt_mode)) {
+ if (!((local ? IP_VS_RT_MODE_LOCAL : IP_VS_RT_MODE_NON_LOCAL) &
+ rt_mode)) {
IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI4\n",
(rt->rt_flags & RTCF_LOCAL) ?
"local":"non-local", &rt->rt_dst);
ip_rt_put(rt);
return NULL;
}
- if (local && !(rt_mode & 4) && !((ort = skb_rtable(skb)) &&
- ort->rt_flags & RTCF_LOCAL)) {
+ if (local && !(rt_mode & IP_VS_RT_MODE_RDR) &&
+ !((ort = skb_rtable(skb)) && ort->rt_flags & RTCF_LOCAL)) {
IP_VS_DBG_RL("Redirect from non-local address %pI4 to local "
"requires NAT method, dest: %pI4\n",
&ip_hdr(skb)->daddr, &rt->rt_dst);
@@ -169,15 +165,15 @@ __ip_vs_reroute_locally(struct sk_buff *skb)
return 0;
refdst_drop(orefdst);
} else {
- struct flowi fl = {
- .fl4_dst = iph->daddr,
- .fl4_src = iph->saddr,
- .fl4_tos = RT_TOS(iph->tos),
- .mark = skb->mark,
+ struct flowi4 fl4 = {
+ .daddr = iph->daddr,
+ .saddr = iph->saddr,
+ .flowi4_tos = RT_TOS(iph->tos),
+ .flowi4_mark = skb->mark,
};
- struct rtable *rt;
- if (ip_route_output_key(net, &rt, &fl))
+ rt = ip_route_output_key(net, &fl4);
+ if (IS_ERR(rt))
return 0;
if (!(rt->rt_flags & RTCF_LOCAL)) {
ip_rt_put(rt);
@@ -202,22 +198,27 @@ __ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr,
struct in6_addr *ret_saddr, int do_xfrm)
{
struct dst_entry *dst;
- struct flowi fl = {
- .fl6_dst = *daddr,
+ struct flowi6 fl6 = {
+ .daddr = *daddr,
};
- dst = ip6_route_output(net, NULL, &fl);
+ dst = ip6_route_output(net, NULL, &fl6);
if (dst->error)
goto out_err;
if (!ret_saddr)
return dst;
- if (ipv6_addr_any(&fl.fl6_src) &&
+ if (ipv6_addr_any(&fl6.saddr) &&
ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev,
- &fl.fl6_dst, 0, &fl.fl6_src) < 0)
- goto out_err;
- if (do_xfrm && xfrm_lookup(net, &dst, &fl, NULL, 0) < 0)
+ &fl6.daddr, 0, &fl6.saddr) < 0)
goto out_err;
- ipv6_addr_copy(ret_saddr, &fl.fl6_src);
+ if (do_xfrm) {
+ dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0);
+ if (IS_ERR(dst)) {
+ dst = NULL;
+ goto out_err;
+ }
+ }
+ ipv6_addr_copy(ret_saddr, &fl6.saddr);
return dst;
out_err:
@@ -384,13 +385,14 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
EnterFunction(10);
- if (!(rt = __ip_vs_get_out_rt(skb, NULL, iph->daddr,
- RT_TOS(iph->tos), 2)))
+ if (!(rt = __ip_vs_get_out_rt(skb, NULL, iph->daddr, RT_TOS(iph->tos),
+ IP_VS_RT_MODE_NON_LOCAL)))
goto tx_error_icmp;
/* MTU checking */
mtu = dst_mtu(&rt->dst);
- if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) {
+ if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF)) &&
+ !skb_is_gso(skb)) {
ip_rt_put(rt);
icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
IP_VS_DBG_RL("%s(): frag needed\n", __func__);
@@ -443,7 +445,7 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
/* MTU checking */
mtu = dst_mtu(&rt->dst);
- if (skb->len > mtu) {
+ if (skb->len > mtu && !skb_is_gso(skb)) {
if (!skb->dev) {
struct net *net = dev_net(skb_dst(skb)->dev);
@@ -512,7 +514,10 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
}
if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
- RT_TOS(iph->tos), 1|2|4)))
+ RT_TOS(iph->tos),
+ IP_VS_RT_MODE_LOCAL |
+ IP_VS_RT_MODE_NON_LOCAL |
+ IP_VS_RT_MODE_RDR)))
goto tx_error_icmp;
local = rt->rt_flags & RTCF_LOCAL;
/*
@@ -543,7 +548,8 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
/* MTU checking */
mtu = dst_mtu(&rt->dst);
- if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) {
+ if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF)) &&
+ !skb_is_gso(skb)) {
icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
IP_VS_DBG_RL_PKT(0, AF_INET, pp, skb, 0,
"ip_vs_nat_xmit(): frag needed for");
@@ -658,7 +664,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
/* MTU checking */
mtu = dst_mtu(&rt->dst);
- if (skb->len > mtu) {
+ if (skb->len > mtu && !skb_is_gso(skb)) {
if (!skb->dev) {
struct net *net = dev_net(skb_dst(skb)->dev);
@@ -754,7 +760,8 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
EnterFunction(10);
if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
- RT_TOS(tos), 1|2)))
+ RT_TOS(tos), IP_VS_RT_MODE_LOCAL |
+ IP_VS_RT_MODE_NON_LOCAL)))
goto tx_error_icmp;
if (rt->rt_flags & RTCF_LOCAL) {
ip_rt_put(rt);
@@ -773,8 +780,8 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
df |= (old_iph->frag_off & htons(IP_DF));
- if ((old_iph->frag_off & htons(IP_DF))
- && mtu < ntohs(old_iph->tot_len)) {
+ if ((old_iph->frag_off & htons(IP_DF) &&
+ mtu < ntohs(old_iph->tot_len) && !skb_is_gso(skb))) {
icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
IP_VS_DBG_RL("%s(): frag needed\n", __func__);
goto tx_error_put;
@@ -886,7 +893,8 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
if (skb_dst(skb))
skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
- if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr)) {
+ if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr) &&
+ !skb_is_gso(skb)) {
if (!skb->dev) {
struct net *net = dev_net(skb_dst(skb)->dev);
@@ -982,7 +990,9 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
EnterFunction(10);
if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
- RT_TOS(iph->tos), 1|2)))
+ RT_TOS(iph->tos),
+ IP_VS_RT_MODE_LOCAL |
+ IP_VS_RT_MODE_NON_LOCAL)))
goto tx_error_icmp;
if (rt->rt_flags & RTCF_LOCAL) {
ip_rt_put(rt);
@@ -991,7 +1001,8 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
/* MTU checking */
mtu = dst_mtu(&rt->dst);
- if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu) {
+ if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu &&
+ !skb_is_gso(skb)) {
icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
ip_rt_put(rt);
IP_VS_DBG_RL("%s(): frag needed\n", __func__);
@@ -1125,7 +1136,10 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
*/
if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
- RT_TOS(ip_hdr(skb)->tos), 1|2|4)))
+ RT_TOS(ip_hdr(skb)->tos),
+ IP_VS_RT_MODE_LOCAL |
+ IP_VS_RT_MODE_NON_LOCAL |
+ IP_VS_RT_MODE_RDR)))
goto tx_error_icmp;
local = rt->rt_flags & RTCF_LOCAL;
@@ -1158,7 +1172,8 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
/* MTU checking */
mtu = dst_mtu(&rt->dst);
- if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF))) {
+ if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF)) &&
+ !skb_is_gso(skb)) {
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
IP_VS_DBG_RL("%s(): frag needed\n", __func__);
goto tx_error_put;
@@ -1272,7 +1287,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
/* MTU checking */
mtu = dst_mtu(&rt->dst);
- if (skb->len > mtu) {
+ if (skb->len > mtu && !skb_is_gso(skb)) {
if (!skb->dev) {
struct net *net = dev_net(skb_dst(skb)->dev);