From 669502ff31d7dba1849aec7ee2450a3c61f57d39 Mon Sep 17 00:00:00 2001 From: Andy Chittenden Date: Tue, 10 Aug 2010 10:19:53 -0400 Subject: SUNRPC: fix NFS client over TCP hangs due to packet loss (Bug 16494) When reusing a TCP connection, ensure that it's aborted if a previous shutdown attempt has been made on that connection so that the RPC over TCP recovery mechanism succeeds. Signed-off-by: Andy Chittenden Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 7ca65c7..66d1369 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1305,10 +1305,11 @@ static void xs_tcp_state_change(struct sock *sk) if (!(xprt = xprt_from_sock(sk))) goto out; dprintk("RPC: xs_tcp_state_change client %p...\n", xprt); - dprintk("RPC: state %x conn %d dead %d zapped %d\n", + dprintk("RPC: state %x conn %d dead %d zapped %d sk_shutdown %d\n", sk->sk_state, xprt_connected(xprt), sock_flag(sk, SOCK_DEAD), - sock_flag(sk, SOCK_ZAPPED)); + sock_flag(sk, SOCK_ZAPPED), + sk->sk_shutdown); switch (sk->sk_state) { case TCP_ESTABLISHED: @@ -1779,10 +1780,25 @@ static void xs_tcp_reuse_connection(struct rpc_xprt *xprt, struct sock_xprt *tra { unsigned int state = transport->inet->sk_state; - if (state == TCP_CLOSE && transport->sock->state == SS_UNCONNECTED) - return; - if ((1 << state) & (TCPF_ESTABLISHED|TCPF_SYN_SENT)) - return; + if (state == TCP_CLOSE && transport->sock->state == SS_UNCONNECTED) { + /* we don't need to abort the connection if the socket + * hasn't undergone a shutdown + */ + if (transport->inet->sk_shutdown == 0) + return; + dprintk("RPC: %s: TCP_CLOSEd and sk_shutdown set to %d\n", + __func__, transport->inet->sk_shutdown); + } + if ((1 << state) & (TCPF_ESTABLISHED|TCPF_SYN_SENT)) { + /* we don't need to abort the connection if the socket + * hasn't undergone a shutdown + */ + if (transport->inet->sk_shutdown == 0) + return; + dprintk("RPC: %s: ESTABLISHED/SYN_SENT " + "sk_shutdown set to %d\n", + __func__, transport->inet->sk_shutdown); + } xs_abort_connection(xprt, transport); } -- cgit v1.1 From 7a8b80eb38b248cfdf84048dad12073d5cfba3e6 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Wed, 11 Aug 2010 12:47:08 -0400 Subject: xprtrdma: Do not truncate iova_start values in frmr registrations. A bad cast causes the iova_start, which in this case is a 64b DMA bus address, to be truncated on 32b systems. This breaks frmrs on 32b systems. No cast is needed. Signed-off-by: Steve Wise Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/verbs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 27015c6..3bdbd9f 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1490,7 +1490,7 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, memset(&frmr_wr, 0, sizeof frmr_wr); frmr_wr.opcode = IB_WR_FAST_REG_MR; frmr_wr.send_flags = 0; /* unsignaled */ - frmr_wr.wr.fast_reg.iova_start = (unsigned long)seg1->mr_dma; + frmr_wr.wr.fast_reg.iova_start = seg1->mr_dma; frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl; frmr_wr.wr.fast_reg.page_list_len = i; frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT; -- cgit v1.1 From 15cdc644b268a9a9ce73ce0b153129222c254b7b Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Wed, 11 Aug 2010 12:47:24 -0400 Subject: rpcrdma: Fix SQ size calculation when memreg is FRMR This patch updates the computation to include the worst case situation where three FRMR are required to map a single RPC REQ. Signed-off-by: Tom Tucker Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/rpc_rdma.c | 2 ++ net/sunrpc/xprtrdma/verbs.c | 20 ++++++++++++++++---- 2 files changed, 18 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index e5e28d1..2ac3f6e 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -249,6 +249,8 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target, req->rl_nchunks = nchunks; BUG_ON(nchunks == 0); + BUG_ON((r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_FRMR) + && (nchunks > 3)); /* * finish off header. If write, marshal discrim and nchunks. diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 3bdbd9f..5f4c7b3 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -650,10 +650,22 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, ep->rep_attr.cap.max_send_wr = cdata->max_requests; switch (ia->ri_memreg_strategy) { case RPCRDMA_FRMR: - /* Add room for frmr register and invalidate WRs */ - ep->rep_attr.cap.max_send_wr *= 3; - if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) - return -EINVAL; + /* Add room for frmr register and invalidate WRs. + * 1. FRMR reg WR for head + * 2. FRMR invalidate WR for head + * 3. FRMR reg WR for pagelist + * 4. FRMR invalidate WR for pagelist + * 5. FRMR reg WR for tail + * 6. FRMR invalidate WR for tail + * 7. The RDMA_SEND WR + */ + ep->rep_attr.cap.max_send_wr *= 7; + if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) { + cdata->max_requests = devattr.max_qp_wr / 7; + if (!cdata->max_requests) + return -EINVAL; + ep->rep_attr.cap.max_send_wr = cdata->max_requests * 7; + } break; case RPCRDMA_MEMWINDOWS_ASYNC: case RPCRDMA_MEMWINDOWS: -- cgit v1.1 From df486a25900f4dba9cdc3886c4ac871951c6aef3 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 17 Aug 2010 17:42:45 -0400 Subject: NFS: Fix the selection of security flavours in Kconfig Randy Dunlap reports: ERROR: "svc_gss_principal" [fs/nfs/nfs.ko] undefined! because in fs/nfs/Kconfig, NFS_V4 selects RPCSEC_GSS_KRB5 and/or in fs/nfsd/Kconfig, NFSD_V4 selects RPCSEC_GSS_KRB5. RPCSEC_GSS_KRB5 does 5 selects, but none of these is enforced/followed by the fs/nfs[d]/Kconfig configs: select SUNRPC_GSS select CRYPTO select CRYPTO_MD5 select CRYPTO_DES select CRYPTO_CBC Reported-by: Randy Dunlap Cc: J. Bruce Fields Acked-by: Randy Dunlap Signed-off-by: Trond Myklebust --- net/sunrpc/Kconfig | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig index 443c161..3376d76 100644 --- a/net/sunrpc/Kconfig +++ b/net/sunrpc/Kconfig @@ -18,10 +18,11 @@ config SUNRPC_XPRT_RDMA If unsure, say N. config RPCSEC_GSS_KRB5 - tristate "Secure RPC: Kerberos V mechanism (EXPERIMENTAL)" - depends on SUNRPC && EXPERIMENTAL + tristate + depends on SUNRPC && CRYPTO + prompt "Secure RPC: Kerberos V mechanism" if !(NFS_V4 || NFSD_V4) + default y select SUNRPC_GSS - select CRYPTO select CRYPTO_MD5 select CRYPTO_DES select CRYPTO_CBC @@ -34,7 +35,7 @@ config RPCSEC_GSS_KRB5 available from http://linux-nfs.org/. In addition, user-space Kerberos support should be installed. - If unsure, say N. + If unsure, say Y. config RPCSEC_GSS_SPKM3 tristate "Secure RPC: SPKM3 mechanism (EXPERIMENTAL)" -- cgit v1.1 From cca77b7c81876d819a5806f408b3c29b5b61a815 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 23 Aug 2010 14:41:22 -0700 Subject: netfilter: fix CONFIG_COMPAT support commit f3c5c1bfd430858d3a05436f82c51e53104feb6b (netfilter: xtables: make ip_tables reentrant) forgot to also compute the jumpstack size in the compat handlers. Result is that "iptables -I INPUT -j userchain" turns into -j DROP. Reported by Sebastian Roesner on #netfilter, closes http://bugzilla.netfilter.org/show_bug.cgi?id=669. Note: arptables change is compile-tested only. Signed-off-by: Florian Westphal Acked-by: Eric Dumazet Tested-by: Mikael Pettersson Signed-off-by: David S. Miller --- net/ipv4/netfilter/arp_tables.c | 3 +++ net/ipv4/netfilter/ip_tables.c | 3 +++ net/ipv6/netfilter/ip6_tables.c | 3 +++ 3 files changed, 9 insertions(+) (limited to 'net') diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 51d6c31..e8f4f9a 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -1420,6 +1420,9 @@ static int translate_compat_table(const char *name, if (ret != 0) break; ++i; + if (strcmp(arpt_get_target(iter1)->u.user.name, + XT_ERROR_TARGET) == 0) + ++newinfo->stacksize; } if (ret) { /* diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 97b64b2..d163f2e 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1751,6 +1751,9 @@ translate_compat_table(struct net *net, if (ret != 0) break; ++i; + if (strcmp(ipt_get_target(iter1)->u.user.name, + XT_ERROR_TARGET) == 0) + ++newinfo->stacksize; } if (ret) { /* diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 29a7bca..8e754be 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -1766,6 +1766,9 @@ translate_compat_table(struct net *net, if (ret != 0) break; ++i; + if (strcmp(ip6t_get_target(iter1)->u.user.name, + XT_ERROR_TARGET) == 0) + ++newinfo->stacksize; } if (ret) { /* -- cgit v1.1 From 4c3a76abd379d9a4668b2d417baa991de9757dc2 Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Sun, 22 Aug 2010 19:03:26 +0000 Subject: bridge: netfilter: fix a memory leak nf_bridge_alloc() always reset the skb->nf_bridge, so we should always put the old one. Signed-off-by: Changli Gao Signed-off-by: Bart De Schuymer Signed-off-by: David S. Miller --- net/bridge/br_netfilter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index 2c911c0..5ed00bd 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -162,8 +162,8 @@ static inline struct nf_bridge_info *nf_bridge_unshare(struct sk_buff *skb) if (tmp) { memcpy(tmp, nf_bridge, sizeof(struct nf_bridge_info)); atomic_set(&tmp->use, 1); - nf_bridge_put(nf_bridge); } + nf_bridge_put(nf_bridge); nf_bridge = tmp; } return nf_bridge; -- cgit v1.1 From ad1af0fedba14f82b240a03fe20eb9b2fdbd0357 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 25 Aug 2010 02:27:49 -0700 Subject: tcp: Combat per-cpu skew in orphan tests. As reported by Anton Blanchard when we use percpu_counter_read_positive() to make our orphan socket limit checks, the check can be off by up to num_cpus_online() * batch (which is 32 by default) which on a 128 cpu machine can be as large as the default orphan limit itself. Fix this by doing the full expensive sum check if the optimized check triggers. Reported-by: Anton Blanchard Signed-off-by: David S. Miller Acked-by: Eric Dumazet --- net/ipv4/tcp.c | 5 +---- net/ipv4/tcp_timer.c | 8 ++++---- 2 files changed, 5 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 176e11a..197b9b7 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2011,11 +2011,8 @@ adjudge_to_death: } } if (sk->sk_state != TCP_CLOSE) { - int orphan_count = percpu_counter_read_positive( - sk->sk_prot->orphan_count); - sk_mem_reclaim(sk); - if (tcp_too_many_orphans(sk, orphan_count)) { + if (tcp_too_many_orphans(sk, 0)) { if (net_ratelimit()) printk(KERN_INFO "TCP: too many of orphaned " "sockets\n"); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 808bb92..c35b469 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -66,18 +66,18 @@ static void tcp_write_err(struct sock *sk) static int tcp_out_of_resources(struct sock *sk, int do_reset) { struct tcp_sock *tp = tcp_sk(sk); - int orphans = percpu_counter_read_positive(&tcp_orphan_count); + int shift = 0; /* If peer does not open window for long time, or did not transmit * anything for long time, penalize it. */ if ((s32)(tcp_time_stamp - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset) - orphans <<= 1; + shift++; /* If some dubious ICMP arrived, penalize even more. */ if (sk->sk_err_soft) - orphans <<= 1; + shift++; - if (tcp_too_many_orphans(sk, orphans)) { + if (tcp_too_many_orphans(sk, shift)) { if (net_ratelimit()) printk(KERN_INFO "Out of socket memory\n"); -- cgit v1.1 From c5ed63d66f24fd4f7089b5a6e087b0ce7202aa8e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 25 Aug 2010 23:02:17 -0700 Subject: tcp: fix three tcp sysctls tuning As discovered by Anton Blanchard, current code to autotune tcp_death_row.sysctl_max_tw_buckets, sysctl_tcp_max_orphans and sysctl_max_syn_backlog makes little sense. The bigger a page is, the less tcp_max_orphans is : 4096 on a 512GB machine in Anton's case. (tcp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket)) is much bigger if spinlock debugging is on. Its wrong to select bigger limits in this case (where kernel structures are also bigger) bhash_size max is 65536, and we get this value even for small machines. A better ground is to use size of ehash table, this also makes code shorter and more obvious. Based on a patch from Anton, and another from David. Reported-and-tested-by: Anton Blanchard Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 197b9b7..e2add5f 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3209,7 +3209,7 @@ void __init tcp_init(void) { struct sk_buff *skb = NULL; unsigned long nr_pages, limit; - int order, i, max_share; + int i, max_share, cnt; unsigned long jiffy = jiffies; BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)); @@ -3258,22 +3258,12 @@ void __init tcp_init(void) INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain); } - /* Try to be a bit smarter and adjust defaults depending - * on available memory. - */ - for (order = 0; ((1 << order) << PAGE_SHIFT) < - (tcp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket)); - order++) - ; - if (order >= 4) { - tcp_death_row.sysctl_max_tw_buckets = 180000; - sysctl_tcp_max_orphans = 4096 << (order - 4); - sysctl_max_syn_backlog = 1024; - } else if (order < 3) { - tcp_death_row.sysctl_max_tw_buckets >>= (3 - order); - sysctl_tcp_max_orphans >>= (3 - order); - sysctl_max_syn_backlog = 128; - } + + cnt = tcp_hashinfo.ehash_mask + 1; + + tcp_death_row.sysctl_max_tw_buckets = cnt / 2; + sysctl_tcp_max_orphans = cnt / 2; + sysctl_max_syn_backlog = max(128, cnt / 256); /* Set the pressure threshold to be a fraction of global memory that * is up to 1/2 at 256 MB, decreasing toward zero with the amount of -- cgit v1.1 From d84ba638e4ba3c40023ff997aa5e8d3ed002af36 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 24 Aug 2010 16:05:48 +0000 Subject: tcp: select(writefds) don't hang up when a peer close connection This issue come from ruby language community. Below test program hang up when only run on Linux. % uname -mrsv Linux 2.6.26-2-486 #1 Sat Dec 26 08:37:39 UTC 2009 i686 % ruby -rsocket -ve ' BasicSocket.do_not_reverse_lookup = true serv = TCPServer.open("127.0.0.1", 0) s1 = TCPSocket.open("127.0.0.1", serv.addr[1]) s2 = serv.accept s2.close s1.write("a") rescue p $! s1.write("a") rescue p $! Thread.new { s1.write("a") }.join' ruby 1.9.3dev (2010-07-06 trunk 28554) [i686-linux] # [Hang Here] FreeBSD, Solaris, Mac doesn't. because Ruby's write() method call select() internally. and tcp_poll has a bug. SUS defined 'ready for writing' of select() as following. | A descriptor shall be considered ready for writing when a call to an output | function with O_NONBLOCK clear would not block, whether or not the function | would transfer data successfully. That said, EPIPE situation is clearly one of 'ready for writing'. We don't have read-side issue because tcp_poll() already has read side shutdown care. | if (sk->sk_shutdown & RCV_SHUTDOWN) | mask |= POLLIN | POLLRDNORM | POLLRDHUP; So, Let's insert same logic in write side. - reference url http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-core/31065 http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-core/31068 Signed-off-by: KOSAKI Motohiro Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index e2add5f..3fb1428 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -451,7 +451,8 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) mask |= POLLOUT | POLLWRNORM; } - } + } else + mask |= POLLOUT | POLLWRNORM; if (tp->urg_data & TCP_URG_VALID) mask |= POLLPRI; -- cgit v1.1 From bfc960a8eec023a170a80697fe65157cd4f44f81 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 25 Aug 2010 23:44:35 +0000 Subject: l2tp: test for ethernet header in l2tp_eth_dev_recv() close https://bugzilla.kernel.org/show_bug.cgi?id=16529 Before calling dev_forward_skb(), we should make sure skb head contains at least an ethernet header, even if length included in upper layer said so. Use pskb_may_pull() to make sure this ethernet header is present in skb head. Reported-by: Thomas Heil Reported-by: Ian Campbell Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/l2tp/l2tp_eth.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c index 58c6c4c..1ae6976 100644 --- a/net/l2tp/l2tp_eth.c +++ b/net/l2tp/l2tp_eth.c @@ -132,7 +132,7 @@ static void l2tp_eth_dev_recv(struct l2tp_session *session, struct sk_buff *skb, printk("\n"); } - if (data_len < ETH_HLEN) + if (!pskb_may_pull(skb, sizeof(ETH_HLEN))) goto error; secpath_reset(skb); -- cgit v1.1 From d71b0e9c0028f3af910226f995e0074873e16979 Mon Sep 17 00:00:00 2001 From: Bernard Pidoux F6BVP Date: Thu, 26 Aug 2010 11:40:00 +0000 Subject: ax25: missplaced sock_put(sk) This patch moves a missplaced sock_put(sk) after bh_unlock_sock(sk) like in other parts of AX25 driver. Signed-off-by: Bernard Pidoux Signed-off-by: David S. Miller --- net/ax25/ax25_ds_timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ax25/ax25_ds_timer.c b/net/ax25/ax25_ds_timer.c index 2ce79df..c7d8143 100644 --- a/net/ax25/ax25_ds_timer.c +++ b/net/ax25/ax25_ds_timer.c @@ -112,8 +112,8 @@ void ax25_ds_heartbeat_expiry(ax25_cb *ax25) if (sk) { sock_hold(sk); ax25_destroy_socket(ax25); - sock_put(sk); bh_unlock_sock(sk); + sock_put(sk); } else ax25_destroy_socket(ax25); return; -- cgit v1.1 From 7e368739e3b3f1d7944794c178a15f05829b56bc Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Thu, 26 Aug 2010 16:11:08 -0700 Subject: net/caif/cfrfml.c: use asm/unaligned.h caif does not build on ia64 starting with 2.6.32-rc1. Using asm/unaligned.h instead of linux/unaligned/le_byteshift.h fixes the issue. include/linux/unaligned/le_byteshift.h:40:50: error: redefinition of 'get_unaligned_le16' include/linux/unaligned/le_byteshift.h:45:50: error: redefinition of 'get_unaligned_le32' include/linux/unaligned/le_byteshift.h:50:50: error: redefinition of 'get_unaligned_le64' include/linux/unaligned/le_byteshift.h:55:51: error: redefinition of 'put_unaligned_le16' include/linux/unaligned/le_byteshift.h:60:51: error: redefinition of 'put_unaligned_le32' include/linux/unaligned/le_byteshift.h:65:51: error: redefinition of 'put_unaligned_le64' include/linux/unaligned/le_struct.h:31:51: note: previous definition of 'put_unaligned_le64' was here Signed-off-by: Andrew Morton Signed-off-by: David S. Miller --- net/caif/cfrfml.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/caif/cfrfml.c b/net/caif/cfrfml.c index eb16020..9a69924 100644 --- a/net/caif/cfrfml.c +++ b/net/caif/cfrfml.c @@ -7,7 +7,7 @@ #include #include #include -#include +#include #include #include #include -- cgit v1.1 From c34186ed008229e7f7e3f1de8e6acf6374995358 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Fri, 27 Aug 2010 19:31:56 -0700 Subject: net/ipv4: Eliminate kstrdup memory leak The string clone is only used as a temporary copy of the argument val within the while loop, and so it should be freed before leaving the function. The call to strsep, however, modifies clone, so a pointer to the front of the string is kept in saved_clone, to make it possible to free it. The sematic match that finds this problem is as follows: (http://coccinelle.lip6.fr/) // @r exists@ local idexpression x; expression E; identifier l; statement S; @@ *x= \(kasprintf\|kstrdup\)(...); ... if (x == NULL) S ... when != kfree(x) when != E = x if (...) { <... when != kfree(x) * goto l; ...> * return ...; } // Signed-off-by: Julia Lawall Signed-off-by: David S. Miller --- net/ipv4/tcp_cong.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 0ec9bd0..850c737 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -196,10 +196,10 @@ void tcp_get_allowed_congestion_control(char *buf, size_t maxlen) int tcp_set_allowed_congestion_control(char *val) { struct tcp_congestion_ops *ca; - char *clone, *name; + char *saved_clone, *clone, *name; int ret = 0; - clone = kstrdup(val, GFP_USER); + saved_clone = clone = kstrdup(val, GFP_USER); if (!clone) return -ENOMEM; @@ -226,6 +226,7 @@ int tcp_set_allowed_congestion_control(char *val) } out: spin_unlock(&tcp_cong_list_lock); + kfree(saved_clone); return ret; } -- cgit v1.1 From 071249b1d501b1f31a6b1af3fbcbe03158a84e5c Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 25 Aug 2010 14:47:38 +0200 Subject: mac80211: delete work timer The new workqueue changes helped me find this bug that's been lingering since the changes to the work processing in mac80211 -- the work timer is never deleted properly. Do that to avoid having it fire after all data structures have been freed. It can't be re-armed because all it will do, if running, is schedule the work, but that gets flushed later and won't have anything to do since all work items are gone by now (by way of interface removal). Cc: stable@kernel.org [2.6.34+] Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/mac80211/main.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 798a91b..ded5c38 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -732,6 +732,12 @@ void ieee80211_unregister_hw(struct ieee80211_hw *hw) rtnl_unlock(); + /* + * Now all work items will be gone, but the + * timer might still be armed, so delete it + */ + del_timer_sync(&local->work_timer); + cancel_work_sync(&local->reconfig_filter); ieee80211_clear_tx_pending(local); -- cgit v1.1 From 42da2f948d949efd0111309f5827bf0298bcc9a4 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 30 Aug 2010 12:24:54 +0200 Subject: wireless extensions: fix kernel heap content leak Wireless extensions have an unfortunate, undocumented requirement which requires drivers to always fill iwp->length when returning a successful status. When a driver doesn't do this, it leads to a kernel heap content leak when userspace offers a larger buffer than would have been necessary. Arguably, this is a driver bug, as it should, if it returns 0, fill iwp->length, even if it separately indicated that the buffer contents was not valid. However, we can also at least avoid the memory content leak if the driver doesn't do this by setting the iwp length to max_tokens, which then reflects how big the buffer is that the driver may fill, regardless of how big the userspace buffer is. To illustrate the point, this patch also fixes a corresponding cfg80211 bug (since this requirement isn't documented nor was ever pointed out by anyone during code review, I don't trust all drivers nor all cfg80211 handlers to implement it correctly). Cc: stable@kernel.org [all the way back] Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/wireless/wext-compat.c | 3 +++ net/wireless/wext-core.c | 16 ++++++++++++++++ 2 files changed, 19 insertions(+) (limited to 'net') diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c index bb5e0a5..7e5c3a4 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -1420,6 +1420,9 @@ int cfg80211_wext_giwessid(struct net_device *dev, { struct wireless_dev *wdev = dev->ieee80211_ptr; + data->flags = 0; + data->length = 0; + switch (wdev->iftype) { case NL80211_IFTYPE_ADHOC: return cfg80211_ibss_wext_giwessid(dev, info, data, ssid); diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c index 0ef17bc..8f5116f 100644 --- a/net/wireless/wext-core.c +++ b/net/wireless/wext-core.c @@ -782,6 +782,22 @@ static int ioctl_standard_iw_point(struct iw_point *iwp, unsigned int cmd, } } + if (IW_IS_GET(cmd) && !(descr->flags & IW_DESCR_FLAG_NOMAX)) { + /* + * If this is a GET, but not NOMAX, it means that the extra + * data is not bounded by userspace, but by max_tokens. Thus + * set the length to max_tokens. This matches the extra data + * allocation. + * The driver should fill it with the number of tokens it + * provided, and it may check iwp->length rather than having + * knowledge of max_tokens. If the driver doesn't change the + * iwp->length, this ioctl just copies back max_token tokens + * filled with zeroes. Hopefully the driver isn't claiming + * them to be valid data. + */ + iwp->length = descr->max_tokens; + } + err = handler(dev, info, (union iwreq_data *) iwp, extra); iwp->length += essid_compat; -- cgit v1.1 From 628e300cccaa628d8fb92aa28cb7530a3d5f2257 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 30 Aug 2010 18:35:24 -0700 Subject: irda: Correctly clean up self->ias_obj on irda_bind() failure. If irda_open_tsap() fails, the irda_bind() code tries to destroy the ->ias_obj object by hand, but does so wrongly. In particular, it fails to a) release the hashbin attached to the object and b) reset the self->ias_obj pointer to NULL. Fix both problems by using irias_delete_object() and explicitly setting self->ias_obj to NULL, just as irda_release() does. Reported-by: Tavis Ormandy Signed-off-by: David S. Miller --- net/irda/af_irda.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c index 79986a6..fd55b51 100644 --- a/net/irda/af_irda.c +++ b/net/irda/af_irda.c @@ -824,8 +824,8 @@ static int irda_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) err = irda_open_tsap(self, addr->sir_lsap_sel, addr->sir_name); if (err < 0) { - kfree(self->ias_obj->name); - kfree(self->ias_obj); + irias_delete_object(self->ias_obj); + self->ias_obj = NULL; goto out; } -- cgit v1.1 From b963ea89f00f13c648af4082e5efb2172d369727 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 30 Aug 2010 19:08:01 -0700 Subject: netlink: Make NETLINK_USERSOCK work again. Once we started enforcing the a nl_table[] entry exist for a protocol, NETLINK_USERSOCK stopped working. Add a dummy table entry so that it works again. Reported-by: Thomas Voegtle Tested-by: Thomas Voegtle Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 980fe4a..cd96ed3 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2102,6 +2102,26 @@ static void __net_exit netlink_net_exit(struct net *net) #endif } +static void __init netlink_add_usersock_entry(void) +{ + unsigned long *listeners; + int groups = 32; + + listeners = kzalloc(NLGRPSZ(groups) + sizeof(struct listeners_rcu_head), + GFP_KERNEL); + if (!listeners) + panic("netlink_add_usersock_entry: Cannot allocate listneres\n"); + + netlink_table_grab(); + + nl_table[NETLINK_USERSOCK].groups = groups; + nl_table[NETLINK_USERSOCK].listeners = listeners; + nl_table[NETLINK_USERSOCK].module = THIS_MODULE; + nl_table[NETLINK_USERSOCK].registered = 1; + + netlink_table_ungrab(); +} + static struct pernet_operations __net_initdata netlink_net_ops = { .init = netlink_net_init, .exit = netlink_net_exit, @@ -2150,6 +2170,8 @@ static int __init netlink_proto_init(void) hash->rehash_time = jiffies; } + netlink_add_usersock_entry(); + sock_register(&netlink_family_ops); register_pernet_subsys(&netlink_net_ops); /* The netlink device handler may be needed early. */ -- cgit v1.1 From c3d34d5d9654ec9c2510f9341bfb1030b8f029d1 Mon Sep 17 00:00:00 2001 From: "John W. Linville" Date: Mon, 30 Aug 2010 17:36:40 -0400 Subject: wireless: register wiphy rfkill w/o holding cfg80211_mutex Otherwise lockdep complains... https://bugzilla.kernel.org/show_bug.cgi?id=17311 [ INFO: possible circular locking dependency detected ] 2.6.36-rc2-git4 #12 ------------------------------------------------------- kworker/0:3/3630 is trying to acquire lock: (rtnl_mutex){+.+.+.}, at: [] rtnl_lock+0x12/0x14 but task is already holding lock: (rfkill_global_mutex){+.+.+.}, at: [] rfkill_switch_all+0x24/0x49 [rfkill] which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #2 (rfkill_global_mutex){+.+.+.}: [] lock_acquire+0x120/0x15b [] __mutex_lock_common+0x54/0x52e [] mutex_lock_nested+0x34/0x39 [] rfkill_register+0x2b/0x29c [rfkill] [] wiphy_register+0x1ae/0x270 [cfg80211] [] ieee80211_register_hw+0x1b4/0x3cf [mac80211] [] iwl_ucode_callback+0x9e9/0xae3 [iwlagn] [] request_firmware_work_func+0x54/0x6f [] kthread+0x8c/0x94 [] kernel_thread_helper+0x4/0x10 -> #1 (cfg80211_mutex){+.+.+.}: [] lock_acquire+0x120/0x15b [] __mutex_lock_common+0x54/0x52e [] mutex_lock_nested+0x34/0x39 [] cfg80211_get_dev_from_ifindex+0x1b/0x7c [cfg80211] [] cfg80211_wext_giwscan+0x58/0x990 [cfg80211] [] ioctl_standard_iw_point+0x1a8/0x272 [] ioctl_standard_call+0x91/0xa7 [] T.723+0xbd/0x12c [] wext_handle_ioctl+0x31/0x6d [] dev_ioctl+0x63d/0x67a [] sock_ioctl+0x48/0x21d [] do_vfs_ioctl+0x4ba/0x509 [] sys_ioctl+0x51/0x74 [] system_call_fastpath+0x16/0x1b -> #0 (rtnl_mutex){+.+.+.}: [] __lock_acquire+0xa93/0xd9a [] lock_acquire+0x120/0x15b [] __mutex_lock_common+0x54/0x52e [] mutex_lock_nested+0x34/0x39 [] rtnl_lock+0x12/0x14 [] cfg80211_rfkill_set_block+0x1a/0x7b [cfg80211] [] rfkill_set_block+0x80/0xd5 [rfkill] [] __rfkill_switch_all+0x3f/0x6f [rfkill] [] rfkill_switch_all+0x38/0x49 [rfkill] [] rfkill_op_handler+0x105/0x136 [rfkill] [] process_one_work+0x248/0x403 [] worker_thread+0x139/0x214 [] kthread+0x8c/0x94 [] kernel_thread_helper+0x4/0x10 Signed-off-by: John W. Linville Acked-by: Johannes Berg --- net/wireless/core.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/wireless/core.c b/net/wireless/core.c index 541e2ff..d6d046b 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -475,12 +475,10 @@ int wiphy_register(struct wiphy *wiphy) mutex_lock(&cfg80211_mutex); res = device_add(&rdev->wiphy.dev); - if (res) - goto out_unlock; - - res = rfkill_register(rdev->rfkill); - if (res) - goto out_rm_dev; + if (res) { + mutex_unlock(&cfg80211_mutex); + return res; + } /* set up regulatory info */ wiphy_update_regulatory(wiphy, NL80211_REGDOM_SET_BY_CORE); @@ -509,13 +507,18 @@ int wiphy_register(struct wiphy *wiphy) cfg80211_debugfs_rdev_add(rdev); mutex_unlock(&cfg80211_mutex); + /* + * due to a locking dependency this has to be outside of the + * cfg80211_mutex lock + */ + res = rfkill_register(rdev->rfkill); + if (res) + goto out_rm_dev; + return 0; out_rm_dev: device_del(&rdev->wiphy.dev); - -out_unlock: - mutex_unlock(&cfg80211_mutex); return res; } EXPORT_SYMBOL(wiphy_register); -- cgit v1.1 From 0f04cfd098fb81fded74e78ea1a1b86cc6c6c31e Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Tue, 31 Aug 2010 13:21:42 +0000 Subject: net sched: fix kernel leak in act_police While reviewing commit 1c40be12f7d8ca1d387510d39787b12e512a7ce8, I audited other users of tc_action_ops->dump for information leaks. That commit covered almost all of them but act_police still had a leak. opt.limit and opt.capab aren't zeroed out before the structure is passed out. This patch uses the C99 initializers to zero everything unused out. Signed-off-by: Jeff Mahoney Acked-by: Jeff Mahoney Signed-off-by: David S. Miller --- net/sched/act_police.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 537a487..7ebf743 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -350,22 +350,19 @@ tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); struct tcf_police *police = a->priv; - struct tc_police opt; - - opt.index = police->tcf_index; - opt.action = police->tcf_action; - opt.mtu = police->tcfp_mtu; - opt.burst = police->tcfp_burst; - opt.refcnt = police->tcf_refcnt - ref; - opt.bindcnt = police->tcf_bindcnt - bind; + struct tc_police opt = { + .index = police->tcf_index, + .action = police->tcf_action, + .mtu = police->tcfp_mtu, + .burst = police->tcfp_burst, + .refcnt = police->tcf_refcnt - ref, + .bindcnt = police->tcf_bindcnt - bind, + }; + if (police->tcfp_R_tab) opt.rate = police->tcfp_R_tab->rate; - else - memset(&opt.rate, 0, sizeof(opt.rate)); if (police->tcfp_P_tab) opt.peakrate = police->tcfp_P_tab->rate; - else - memset(&opt.peakrate, 0, sizeof(opt.peakrate)); NLA_PUT(skb, TCA_POLICE_TBF, sizeof(opt), &opt); if (police->tcfp_result) NLA_PUT_U32(skb, TCA_POLICE_RESULT, police->tcfp_result); -- cgit v1.1 From 3b2eb6131e2f6ff646abb0fc69648179b8b70216 Mon Sep 17 00:00:00 2001 From: Michal Soltys Date: Mon, 30 Aug 2010 11:34:10 +0000 Subject: net/sched/sch_hfsc.c: initialize parent's cl_cfmin properly in init_vf() This patch fixes init_vf() function, so on each new backlog period parent's cl_cfmin is properly updated (including further propgation towards the root), even if the activated leaf has no upperlimit curve defined. Signed-off-by: Michal Soltys Signed-off-by: David S. Miller --- net/sched/sch_hfsc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index abd904b..4749609 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -761,8 +761,8 @@ init_vf(struct hfsc_class *cl, unsigned int len) if (f != cl->cl_f) { cl->cl_f = f; cftree_update(cl); - update_cfmin(cl->cl_parent); } + update_cfmin(cl->cl_parent); } } -- cgit v1.1 From 928497f0209027ccd649480a38a499fab9c3f6f6 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Tue, 31 Aug 2010 05:54:00 +0000 Subject: xfrm_user: avoid a warning with some compiler Attached is a small patch to remove a warning ("warning: ISO C90 forbids mixed declarations and code" with gcc 4.3.2). Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/xfrm/xfrm_user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index b14ed4b..8bae6b2 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1801,7 +1801,7 @@ static int xfrm_add_sa_expire(struct sk_buff *skb, struct nlmsghdr *nlh, struct xfrm_user_expire *ue = nlmsg_data(nlh); struct xfrm_usersa_info *p = &ue->state; struct xfrm_mark m; - u32 mark = xfrm_mark_get(attrs, &m);; + u32 mark = xfrm_mark_get(attrs, &m); x = xfrm_state_lookup(net, mark, &p->id.daddr, p->id.spi, p->id.proto, p->family); -- cgit v1.1 From 750e9fad8c7f44e0005ffb7195f72dd76978c2cf Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Tue, 31 Aug 2010 05:50:43 +0000 Subject: ipv4: minor fix about RPF in help of Kconfig Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/ipv4/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 7c3a7d1..571f895 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -46,7 +46,7 @@ config IP_ADVANCED_ROUTER rp_filter on use: echo 1 > /proc/sys/net/ipv4/conf//rp_filter - and + or echo 1 > /proc/sys/net/ipv4/conf/all/rp_filter Note that some distributions enable it in startup scripts. -- cgit v1.1 From 87f94b4e91dc042620c527f3c30c37e5127ef757 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 1 Sep 2010 18:06:39 -0700 Subject: bridge: Clear INET control block of SKBs passed into ip_fragment(). In a similar vain to commit 17762060c25590bfddd68cc1131f28ec720f405f ("bridge: Clear IPCB before possible entry into IP stack") Any time we call into the IP stack we have to make sure the state there is as expected by the ipv4 code. With help from Eric Dumazet and Herbert Xu. Reported-by: Bandan Das Signed-off-by: David S. Miller --- net/bridge/br_netfilter.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index 5ed00bd..137f232 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -761,9 +761,11 @@ static int br_nf_dev_queue_xmit(struct sk_buff *skb) { if (skb->nfct != NULL && skb->protocol == htons(ETH_P_IP) && skb->len + nf_bridge_mtu_reduction(skb) > skb->dev->mtu && - !skb_is_gso(skb)) + !skb_is_gso(skb)) { + /* BUG: Should really parse the IP options here. */ + memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); return ip_fragment(skb, br_dev_queue_push_xmit); - else + } else return br_dev_queue_push_xmit(skb); } #else -- cgit v1.1 From 3d3be4333fdf6faa080947b331a6a19bce1a4f57 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 1 Sep 2010 00:50:51 +0000 Subject: gro: fix different skb headrooms Packets entering GRO might have different headrooms, even for a given flow (because of implementation details in drivers, like copybreak). We cant force drivers to deliver packets with a fixed headroom. 1) fix skb_segment() skb_segment() makes the false assumption headrooms of fragments are same than the head. When CHECKSUM_PARTIAL is used, this can give csum_start errors, and crash later in skb_copy_and_csum_dev() 2) allocate a minimal skb for head of frag_list skb_gro_receive() uses netdev_alloc_skb(headroom + skb_gro_offset(p)) to allocate a fresh skb. This adds NET_SKB_PAD to a padding already provided by netdevice, depending on various things, like copybreak. Use alloc_skb() to allocate an exact padding, to reduce cache line needs: NET_SKB_PAD + NET_IP_ALIGN bugzilla : https://bugzilla.kernel.org/show_bug.cgi?id=16626 Many thanks to Plamen Petrov, testing many debugging patches ! With help of Jarek Poplawski. Reported-by: Plamen Petrov Signed-off-by: Eric Dumazet CC: Jarek Poplawski Signed-off-by: David S. Miller --- net/core/skbuff.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 3a2513f..26396ff 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2573,6 +2573,10 @@ struct sk_buff *skb_segment(struct sk_buff *skb, int features) __copy_skb_header(nskb, skb); nskb->mac_len = skb->mac_len; + /* nskb and skb might have different headroom */ + if (nskb->ip_summed == CHECKSUM_PARTIAL) + nskb->csum_start += skb_headroom(nskb) - headroom; + skb_reset_mac_header(nskb); skb_set_network_header(nskb, skb->mac_len); nskb->transport_header = (nskb->network_header + @@ -2702,8 +2706,8 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) } else if (skb_gro_len(p) != pinfo->gso_size) return -E2BIG; - headroom = skb_headroom(p); - nskb = netdev_alloc_skb(p->dev, headroom + skb_gro_offset(p)); + headroom = NET_SKB_PAD + NET_IP_ALIGN; + nskb = alloc_skb(headroom + skb_gro_offset(p), GFP_ATOMIC); if (unlikely(!nskb)) return -ENOMEM; -- cgit v1.1 From 7bcbf81a2296a8f71342445560dcbe16100b567c Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Wed, 1 Sep 2010 23:07:10 +0000 Subject: ipvs: avoid oops for passive FTP Fix Passive FTP problem in ip_vs_ftp: - Do not oops in nf_nat_set_seq_adjust (adjust_tcp_sequence) when iptable_nat module is not loaded Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman Signed-off-by: David S. Miller --- net/netfilter/ipvs/ip_vs_ftp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c index f228a17..33b329b 100644 --- a/net/netfilter/ipvs/ip_vs_ftp.c +++ b/net/netfilter/ipvs/ip_vs_ftp.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include @@ -359,7 +360,7 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, buf_len = strlen(buf); ct = nf_ct_get(skb, &ctinfo); - if (ct && !nf_ct_is_untracked(ct)) { + if (ct && !nf_ct_is_untracked(ct) && nfct_nat(ct)) { /* If mangling fails this function will return 0 * which will cause the packet to be dropped. * Mangling can only fail under memory pressure, -- cgit v1.1 From 0b5d404e349c0236b11466c0a4785520c0be6982 Mon Sep 17 00:00:00 2001 From: Jarek Poplawski Date: Thu, 2 Sep 2010 13:22:11 -0700 Subject: pkt_sched: Fix lockdep warning on est_tree_lock in gen_estimator This patch fixes a lockdep warning: [ 516.287584] ========================================================= [ 516.288386] [ INFO: possible irq lock inversion dependency detected ] [ 516.288386] 2.6.35b #7 [ 516.288386] --------------------------------------------------------- [ 516.288386] swapper/0 just changed the state of lock: [ 516.288386] (&qdisc_tx_lock){+.-...}, at: [] est_timer+0x62/0x1b4 [ 516.288386] but this lock took another, SOFTIRQ-unsafe lock in the past: [ 516.288386] (est_tree_lock){+.+...} [ 516.288386] [ 516.288386] and interrupts could create inverse lock ordering between them. ... So, est_tree_lock needs BH protection because it's taken by qdisc_tx_lock, which is used both in BH and process contexts. (Full warning with this patch at netdev, 02 Sep 2010.) Fixes commit: ae638c47dc040b8def16d05dc6acdd527628f231 ("pkt_sched: gen_estimator: add a new lock") Signed-off-by: Jarek Poplawski Signed-off-by: David S. Miller --- net/core/gen_estimator.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index 9fbe7f7..6743146 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -232,7 +232,7 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats, est->last_packets = bstats->packets; est->avpps = rate_est->pps<<10; - spin_lock(&est_tree_lock); + spin_lock_bh(&est_tree_lock); if (!elist[idx].timer.function) { INIT_LIST_HEAD(&elist[idx].list); setup_timer(&elist[idx].timer, est_timer, idx); @@ -243,7 +243,7 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats, list_add_rcu(&est->list, &elist[idx].list); gen_add_node(est); - spin_unlock(&est_tree_lock); + spin_unlock_bh(&est_tree_lock); return 0; } @@ -270,7 +270,7 @@ void gen_kill_estimator(struct gnet_stats_basic_packed *bstats, { struct gen_estimator *e; - spin_lock(&est_tree_lock); + spin_lock_bh(&est_tree_lock); while ((e = gen_find_node(bstats, rate_est))) { rb_erase(&e->node, &est_root); @@ -281,7 +281,7 @@ void gen_kill_estimator(struct gnet_stats_basic_packed *bstats, list_del_rcu(&e->list); call_rcu(&e->e_rcu, __gen_kill_estimator); } - spin_unlock(&est_tree_lock); + spin_unlock_bh(&est_tree_lock); } EXPORT_SYMBOL(gen_kill_estimator); @@ -320,9 +320,9 @@ bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats, ASSERT_RTNL(); - spin_lock(&est_tree_lock); + spin_lock_bh(&est_tree_lock); res = gen_find_node(bstats, rate_est) != NULL; - spin_unlock(&est_tree_lock); + spin_unlock_bh(&est_tree_lock); return res; } -- cgit v1.1 From deabc772f39405054a438d711f408d2d94d26d96 Mon Sep 17 00:00:00 2001 From: Helmut Schaa Date: Fri, 3 Sep 2010 02:39:56 +0000 Subject: net: fix tx queue selection for bridged devices implementing select_queue When a net device is implementing the select_queue callback and is part of a bridge, frames coming from the bridge already have a tx queue associated to the socket (introduced in commit a4ee3ce3293dc931fab19beb472a8bde1295aebe, "net: Use sk_tx_queue_mapping for connected sockets"). The call to sk_tx_queue_get will then return the tx queue used by the bridge instead of calling the select_queue callback. In case of mac80211 this broke QoS which is implemented by using the select_queue callback. Furthermore it introduced problems with rt2x00 because frames with the same TID and RA sometimes appeared on different tx queues which the hw cannot handle correctly. Fix this by always calling select_queue first if it is available and only afterwards use the socket tx queue mapping. Signed-off-by: Helmut Schaa Signed-off-by: David S. Miller --- net/core/dev.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 3721fbb..b9b22a3 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2058,16 +2058,16 @@ static struct netdev_queue *dev_pick_tx(struct net_device *dev, struct sk_buff *skb) { int queue_index; - struct sock *sk = skb->sk; + const struct net_device_ops *ops = dev->netdev_ops; - queue_index = sk_tx_queue_get(sk); - if (queue_index < 0) { - const struct net_device_ops *ops = dev->netdev_ops; + if (ops->ndo_select_queue) { + queue_index = ops->ndo_select_queue(dev, skb); + queue_index = dev_cap_txqueue(dev, queue_index); + } else { + struct sock *sk = skb->sk; + queue_index = sk_tx_queue_get(sk); + if (queue_index < 0) { - if (ops->ndo_select_queue) { - queue_index = ops->ndo_select_queue(dev, skb); - queue_index = dev_cap_txqueue(dev, queue_index); - } else { queue_index = 0; if (dev->real_num_tx_queues > 1) queue_index = skb_tx_hash(dev, skb); -- cgit v1.1 From 70789d7052239992824628db8133de08dc78e593 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Fri, 3 Sep 2010 05:13:05 +0000 Subject: ipv6: discard overlapping fragment RFC5722 prohibits reassembling fragments when some data overlaps. Bug spotted by Zhang Zuotao . Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/ipv6/reassembly.c | 71 +++++++++++---------------------------------------- 1 file changed, 15 insertions(+), 56 deletions(-) (limited to 'net') diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 545c414..64cfef1 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -149,13 +149,6 @@ int ip6_frag_match(struct inet_frag_queue *q, void *a) } EXPORT_SYMBOL(ip6_frag_match); -/* Memory Tracking Functions. */ -static void frag_kfree_skb(struct netns_frags *nf, struct sk_buff *skb) -{ - atomic_sub(skb->truesize, &nf->mem); - kfree_skb(skb); -} - void ip6_frag_init(struct inet_frag_queue *q, void *a) { struct frag_queue *fq = container_of(q, struct frag_queue, q); @@ -346,58 +339,22 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, } found: - /* We found where to put this one. Check for overlap with - * preceding fragment, and, if needed, align things so that - * any overlaps are eliminated. + /* RFC5722, Section 4: + * When reassembling an IPv6 datagram, if + * one or more its constituent fragments is determined to be an + * overlapping fragment, the entire datagram (and any constituent + * fragments, including those not yet received) MUST be silently + * discarded. */ - if (prev) { - int i = (FRAG6_CB(prev)->offset + prev->len) - offset; - if (i > 0) { - offset += i; - if (end <= offset) - goto err; - if (!pskb_pull(skb, i)) - goto err; - if (skb->ip_summed != CHECKSUM_UNNECESSARY) - skb->ip_summed = CHECKSUM_NONE; - } - } + /* Check for overlap with preceding fragment. */ + if (prev && + (FRAG6_CB(prev)->offset + prev->len) - offset > 0) + goto discard_fq; - /* Look for overlap with succeeding segments. - * If we can merge fragments, do it. - */ - while (next && FRAG6_CB(next)->offset < end) { - int i = end - FRAG6_CB(next)->offset; /* overlap is 'i' bytes */ - - if (i < next->len) { - /* Eat head of the next overlapped fragment - * and leave the loop. The next ones cannot overlap. - */ - if (!pskb_pull(next, i)) - goto err; - FRAG6_CB(next)->offset += i; /* next fragment */ - fq->q.meat -= i; - if (next->ip_summed != CHECKSUM_UNNECESSARY) - next->ip_summed = CHECKSUM_NONE; - break; - } else { - struct sk_buff *free_it = next; - - /* Old fragment is completely overridden with - * new one drop it. - */ - next = next->next; - - if (prev) - prev->next = next; - else - fq->q.fragments = next; - - fq->q.meat -= free_it->len; - frag_kfree_skb(fq->q.net, free_it); - } - } + /* Look for overlap with succeeding segment. */ + if (next && FRAG6_CB(next)->offset < end) + goto discard_fq; FRAG6_CB(skb)->offset = offset; @@ -436,6 +393,8 @@ found: write_unlock(&ip6_frags.lock); return -1; +discard_fq: + fq_kill(fq); err: IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMFAILS); -- cgit v1.1 From 1ee89bd0fe72e381c8e4ef887d53c19c8adc6c93 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Fri, 3 Sep 2010 05:13:07 +0000 Subject: netfilter: discard overlapping IPv6 fragment RFC5722 prohibits reassembling IPv6 fragments when some data overlaps. Bug spotted by Zhang Zuotao . Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/ipv6/netfilter/nf_conntrack_reasm.c | 80 +++++++-------------------------- 1 file changed, 15 insertions(+), 65 deletions(-) (limited to 'net') diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 13ef5bc..578f3c1 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -113,14 +113,6 @@ static void nf_skb_free(struct sk_buff *skb) kfree_skb(NFCT_FRAG6_CB(skb)->orig); } -/* Memory Tracking Functions. */ -static void frag_kfree_skb(struct sk_buff *skb) -{ - atomic_sub(skb->truesize, &nf_init_frags.mem); - nf_skb_free(skb); - kfree_skb(skb); -} - /* Destruction primitives. */ static __inline__ void fq_put(struct nf_ct_frag6_queue *fq) @@ -282,66 +274,22 @@ static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb, } found: - /* We found where to put this one. Check for overlap with - * preceding fragment, and, if needed, align things so that - * any overlaps are eliminated. - */ - if (prev) { - int i = (NFCT_FRAG6_CB(prev)->offset + prev->len) - offset; - - if (i > 0) { - offset += i; - if (end <= offset) { - pr_debug("overlap\n"); - goto err; - } - if (!pskb_pull(skb, i)) { - pr_debug("Can't pull\n"); - goto err; - } - if (skb->ip_summed != CHECKSUM_UNNECESSARY) - skb->ip_summed = CHECKSUM_NONE; - } - } - - /* Look for overlap with succeeding segments. - * If we can merge fragments, do it. + /* RFC5722, Section 4: + * When reassembling an IPv6 datagram, if + * one or more its constituent fragments is determined to be an + * overlapping fragment, the entire datagram (and any constituent + * fragments, including those not yet received) MUST be silently + * discarded. */ - while (next && NFCT_FRAG6_CB(next)->offset < end) { - /* overlap is 'i' bytes */ - int i = end - NFCT_FRAG6_CB(next)->offset; - - if (i < next->len) { - /* Eat head of the next overlapped fragment - * and leave the loop. The next ones cannot overlap. - */ - pr_debug("Eat head of the overlapped parts.: %d", i); - if (!pskb_pull(next, i)) - goto err; - /* next fragment */ - NFCT_FRAG6_CB(next)->offset += i; - fq->q.meat -= i; - if (next->ip_summed != CHECKSUM_UNNECESSARY) - next->ip_summed = CHECKSUM_NONE; - break; - } else { - struct sk_buff *free_it = next; - - /* Old fragmnet is completely overridden with - * new one drop it. - */ - next = next->next; + /* Check for overlap with preceding fragment. */ + if (prev && + (NFCT_FRAG6_CB(prev)->offset + prev->len) - offset > 0) + goto discard_fq; - if (prev) - prev->next = next; - else - fq->q.fragments = next; - - fq->q.meat -= free_it->len; - frag_kfree_skb(free_it); - } - } + /* Look for overlap with succeeding segment. */ + if (next && NFCT_FRAG6_CB(next)->offset < end) + goto discard_fq; NFCT_FRAG6_CB(skb)->offset = offset; @@ -371,6 +319,8 @@ found: write_unlock(&nf_frags.lock); return 0; +discard_fq: + fq_kill(fq); err: return -1; } -- cgit v1.1 From cf9b94f88bdbe8a02015fc30d7c232b2d262d4ad Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 4 Sep 2010 03:14:35 +0000 Subject: irda: off by one This is an off by one. We would go past the end when we NUL terminate the "value" string at end of the function. The "value" buffer is allocated in irlan_client_parse_response() or irlan_provider_parse_command(). CC: stable@kernel.org Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- net/irda/irlan/irlan_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/irda/irlan/irlan_common.c b/net/irda/irlan/irlan_common.c index a788f9e..6130f9d 100644 --- a/net/irda/irlan/irlan_common.c +++ b/net/irda/irlan/irlan_common.c @@ -1102,7 +1102,7 @@ int irlan_extract_param(__u8 *buf, char *name, char *value, __u16 *len) memcpy(&val_len, buf+n, 2); /* To avoid alignment problems */ le16_to_cpus(&val_len); n+=2; - if (val_len > 1016) { + if (val_len >= 1016) { IRDA_DEBUG(2, "%s(), parameter length to long\n", __func__ ); return -RSP_INVALID_COMMAND_FORMAT; } -- cgit v1.1 From 8df73ff90f00f14d2c7ff7156f7ef153f7e9d3b7 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Sat, 4 Sep 2010 01:34:28 +0000 Subject: UNIX: Do not loop forever at unix_autobind(). We assumed that unix_autobind() never fails if kzalloc() succeeded. But unix_autobind() allows only 1048576 names. If /proc/sys/fs/file-max is larger than 1048576 (e.g. systems with more than 10GB of RAM), a local user can consume all names using fork()/socket()/bind(). If all names are in use, those who call bind() with addr_len == sizeof(short) or connect()/sendmsg() with setsockopt(SO_PASSCRED) will continue while (1) yield(); loop at unix_autobind() till a name becomes available. This patch adds a loop counter in order to give up after 1048576 attempts. Calling yield() for once per 256 attempts may not be sufficient when many names are already in use, for __unix_find_socket_byname() can take long time under such circumstance. Therefore, this patch also adds cond_resched() call. Note that currently a local user can consume 2GB of kernel memory if the user is allowed to create and autobind 1048576 UNIX domain sockets. We should consider adding some restriction for autobind operation. Signed-off-by: Tetsuo Handa Signed-off-by: David S. Miller --- net/unix/af_unix.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 4414a18..0b39b24 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -692,6 +692,7 @@ static int unix_autobind(struct socket *sock) static u32 ordernum = 1; struct unix_address *addr; int err; + unsigned int retries = 0; mutex_lock(&u->readlock); @@ -717,9 +718,17 @@ retry: if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type, addr->hash)) { spin_unlock(&unix_table_lock); - /* Sanity yield. It is unusual case, but yet... */ - if (!(ordernum&0xFF)) - yield(); + /* + * __unix_find_socket_byname() may take long time if many names + * are already in use. + */ + cond_resched(); + /* Give up if all names seems to be in use. */ + if (retries++ == 0xFFFFF) { + err = -ENOSPC; + kfree(addr); + goto out; + } goto retry; } addr->hash ^= sk->sk_type; -- cgit v1.1 From 6f86b325189e0a53c97bf86cff0c8b02ff624934 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 6 Sep 2010 22:36:19 -0700 Subject: ipv4: Fix reverse path filtering with multipath routing. Actually iterate over the next-hops to make sure we have a device match. Otherwise RP filtering is always elided when the route matched has multiple next-hops. Reported-by: Igor M Podlesny Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index a439689..7d02a9f 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -246,6 +246,7 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, struct fib_result res; int no_addr, rpf, accept_local; + bool dev_match; int ret; struct net *net; @@ -273,12 +274,22 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, } *spec_dst = FIB_RES_PREFSRC(res); fib_combine_itag(itag, &res); + dev_match = false; + #ifdef CONFIG_IP_ROUTE_MULTIPATH - if (FIB_RES_DEV(res) == dev || res.fi->fib_nhs > 1) + for (ret = 0; ret < res.fi->fib_nhs; ret++) { + struct fib_nh *nh = &res.fi->fib_nh[ret]; + + if (nh->nh_dev == dev) { + dev_match = true; + break; + } + } #else if (FIB_RES_DEV(res) == dev) + dev_match = true; #endif - { + if (dev_match) { ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; fib_res_put(&res); return ret; -- cgit v1.1 From 64289c8e6851bca0e589e064c9a5c9fbd6ae5dd4 Mon Sep 17 00:00:00 2001 From: Jarek Poplawski Date: Sat, 4 Sep 2010 10:34:29 +0000 Subject: gro: Re-fix different skb headrooms The patch: "gro: fix different skb headrooms" in its part: "2) allocate a minimal skb for head of frag_list" is buggy. The copied skb has p->data set at the ip header at the moment, and skb_gro_offset is the length of ip + tcp headers. So, after the change the length of mac header is skipped. Later skb_set_mac_header() sets it into the NET_SKB_PAD area (if it's long enough) and ip header is misaligned at NET_SKB_PAD + NET_IP_ALIGN offset. There is no reason to assume the original skb was wrongly allocated, so let's copy it as it was. bugzilla : https://bugzilla.kernel.org/show_bug.cgi?id=16626 fixes commit: 3d3be4333fdf6faa080947b331a6a19bce1a4f57 Reported-by: Plamen Petrov Signed-off-by: Jarek Poplawski CC: Eric Dumazet Acked-by: Eric Dumazet Tested-by: Plamen Petrov Signed-off-by: David S. Miller --- net/core/skbuff.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 26396ff..c83b421 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2706,7 +2706,7 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) } else if (skb_gro_len(p) != pinfo->gso_size) return -E2BIG; - headroom = NET_SKB_PAD + NET_IP_ALIGN; + headroom = skb_headroom(p); nskb = alloc_skb(headroom + skb_gro_offset(p), GFP_ATOMIC); if (unlikely(!nskb)) return -ENOMEM; -- cgit v1.1 From 6523ce1525e88c598c75a1a6b8c4edddfa9defe8 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sun, 5 Sep 2010 18:02:29 +0000 Subject: ipvs: fix active FTP - Do not create expectation when forwarding the PORT command to avoid blocking the connection. The problem is that nf_conntrack_ftp.c:help() tries to create the same expectation later in POST_ROUTING and drops the packet with "dropping packet" message after failure in nf_ct_expect_related. - Change ip_vs_update_conntrack to alter the conntrack for related connections from real server. If we do not alter the reply in this direction the next packet from client sent to vport 20 comes as NEW connection. We alter it but may be some collision happens for both conntracks and the second conntrack gets destroyed immediately. The connection stucks too. Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman Signed-off-by: David S. Miller --- net/netfilter/ipvs/ip_vs_core.c | 1 + net/netfilter/ipvs/ip_vs_ftp.c | 6 ------ net/netfilter/ipvs/ip_vs_xmit.c | 18 ++++++++++++------ 3 files changed, 13 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 4f8ddba..4c2f89d 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -924,6 +924,7 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, ip_vs_out_stats(cp, skb); ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp); + ip_vs_update_conntrack(skb, cp, 0); ip_vs_conn_put(cp); skb->ipvs_property = 1; diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c index 33b329b..7e9af5b 100644 --- a/net/netfilter/ipvs/ip_vs_ftp.c +++ b/net/netfilter/ipvs/ip_vs_ftp.c @@ -410,7 +410,6 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, union nf_inet_addr to; __be16 port; struct ip_vs_conn *n_cp; - struct nf_conn *ct; #ifdef CONFIG_IP_VS_IPV6 /* This application helper doesn't work with IPv6 yet, @@ -497,11 +496,6 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, ip_vs_control_add(n_cp, cp); } - ct = (struct nf_conn *)skb->nfct; - if (ct && ct != &nf_conntrack_untracked) - ip_vs_expect_related(skb, ct, n_cp, - IPPROTO_TCP, &n_cp->dport, 1); - /* * Move tunnel to listen state */ diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 21e1a5e..49df6be 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -349,8 +349,8 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, } #endif -static void -ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp) +void +ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin) { struct nf_conn *ct = (struct nf_conn *)skb->nfct; struct nf_conntrack_tuple new_tuple; @@ -365,11 +365,17 @@ ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp) * real-server we will see RIP->DIP. */ new_tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple; - new_tuple.src.u3 = cp->daddr; + if (outin) + new_tuple.src.u3 = cp->daddr; + else + new_tuple.dst.u3 = cp->vaddr; /* * This will also take care of UDP and other protocols. */ - new_tuple.src.u.tcp.port = cp->dport; + if (outin) + new_tuple.src.u.tcp.port = cp->dport; + else + new_tuple.dst.u.tcp.port = cp->vport; nf_conntrack_alter_reply(ct, &new_tuple); } @@ -428,7 +434,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT"); - ip_vs_update_conntrack(skb, cp); + ip_vs_update_conntrack(skb, cp, 1); /* FIXME: when application helper enlarges the packet and the length is larger than the MTU of outgoing device, there will be still @@ -506,7 +512,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT"); - ip_vs_update_conntrack(skb, cp); + ip_vs_update_conntrack(skb, cp, 1); /* FIXME: when application helper enlarges the packet and the length is larger than the MTU of outgoing device, there will be still -- cgit v1.1 From f6b085b69d1cbbd62f49f34e71a3d58cb6d34b7e Mon Sep 17 00:00:00 2001 From: Jarek Poplawski Date: Tue, 7 Sep 2010 07:51:17 +0000 Subject: ipv4: Suppress lockdep-RCU false positive in FIB trie (3) Hi, Here is one more of these warnings and a patch below: Sep 5 23:52:33 del kernel: [46044.244833] =================================================== Sep 5 23:52:33 del kernel: [46044.269681] [ INFO: suspicious rcu_dereference_check() usage. ] Sep 5 23:52:33 del kernel: [46044.277000] --------------------------------------------------- Sep 5 23:52:33 del kernel: [46044.285185] net/ipv4/fib_trie.c:1756 invoked rcu_dereference_check() without protection! Sep 5 23:52:33 del kernel: [46044.293627] Sep 5 23:52:33 del kernel: [46044.293632] other info that might help us debug this: Sep 5 23:52:33 del kernel: [46044.293634] Sep 5 23:52:33 del kernel: [46044.325333] Sep 5 23:52:33 del kernel: [46044.325335] rcu_scheduler_active = 1, debug_locks = 0 Sep 5 23:52:33 del kernel: [46044.348013] 1 lock held by pppd/1717: Sep 5 23:52:33 del kernel: [46044.357548] #0: (rtnl_mutex){+.+.+.}, at: [] rtnl_lock+0xf/0x20 Sep 5 23:52:33 del kernel: [46044.367647] Sep 5 23:52:33 del kernel: [46044.367652] stack backtrace: Sep 5 23:52:33 del kernel: [46044.387429] Pid: 1717, comm: pppd Not tainted 2.6.35.4.4a #3 Sep 5 23:52:33 del kernel: [46044.398764] Call Trace: Sep 5 23:52:33 del kernel: [46044.409596] [] ? printk+0x18/0x1e Sep 5 23:52:33 del kernel: [46044.420761] [] lockdep_rcu_dereference+0xa9/0xb0 Sep 5 23:52:33 del kernel: [46044.432229] [] trie_firstleaf+0x65/0x70 Sep 5 23:52:33 del kernel: [46044.443941] [] fib_table_flush+0x14/0x170 Sep 5 23:52:33 del kernel: [46044.455823] [] ? local_bh_enable_ip+0x62/0xd0 Sep 5 23:52:33 del kernel: [46044.467995] [] ? _raw_spin_unlock_bh+0x2f/0x40 Sep 5 23:52:33 del kernel: [46044.480404] [] ? fib_sync_down_dev+0x120/0x180 Sep 5 23:52:33 del kernel: [46044.493025] [] fib_flush+0x2d/0x60 Sep 5 23:52:33 del kernel: [46044.505796] [] fib_disable_ip+0x25/0x50 Sep 5 23:52:33 del kernel: [46044.518772] [] fib_netdev_event+0x73/0xd0 Sep 5 23:52:33 del kernel: [46044.531918] [] notifier_call_chain+0x2d/0x70 Sep 5 23:52:33 del kernel: [46044.545358] [] raw_notifier_call_chain+0x1a/0x20 Sep 5 23:52:33 del kernel: [46044.559092] [] call_netdevice_notifiers+0x27/0x60 Sep 5 23:52:33 del kernel: [46044.573037] [] __dev_notify_flags+0x5c/0x80 Sep 5 23:52:33 del kernel: [46044.586489] [] dev_change_flags+0x37/0x60 Sep 5 23:52:33 del kernel: [46044.599394] [] devinet_ioctl+0x54d/0x630 Sep 5 23:52:33 del kernel: [46044.612277] [] inet_ioctl+0x97/0xc0 Sep 5 23:52:34 del kernel: [46044.625208] [] sock_ioctl+0x6f/0x270 Sep 5 23:52:34 del kernel: [46044.638046] [] ? handle_mm_fault+0x420/0x6c0 Sep 5 23:52:34 del kernel: [46044.650968] [] ? sock_ioctl+0x0/0x270 Sep 5 23:52:34 del kernel: [46044.663865] [] vfs_ioctl+0x28/0xa0 Sep 5 23:52:34 del kernel: [46044.676556] [] do_vfs_ioctl+0x6a/0x5c0 Sep 5 23:52:34 del kernel: [46044.688989] [] ? up_read+0x16/0x30 Sep 5 23:52:34 del kernel: [46044.701411] [] ? do_page_fault+0x1d6/0x3a0 Sep 5 23:52:34 del kernel: [46044.714223] [] ? fget_light+0xf8/0x2f0 Sep 5 23:52:34 del kernel: [46044.726601] [] ? sys_socketcall+0x208/0x2c0 Sep 5 23:52:34 del kernel: [46044.739140] [] sys_ioctl+0x63/0x70 Sep 5 23:52:34 del kernel: [46044.751967] [] syscall_call+0x7/0xb Sep 5 23:52:34 del kernel: [46044.764734] [] ? cookie_v6_check+0x3d0/0x630 --------------> This patch fixes the warning: =================================================== [ INFO: suspicious rcu_dereference_check() usage. ] --------------------------------------------------- net/ipv4/fib_trie.c:1756 invoked rcu_dereference_check() without protection! other info that might help us debug this: rcu_scheduler_active = 1, debug_locks = 0 1 lock held by pppd/1717: #0: (rtnl_mutex){+.+.+.}, at: [] rtnl_lock+0xf/0x20 stack backtrace: Pid: 1717, comm: pppd Not tainted 2.6.35.4a #3 Call Trace: [] ? printk+0x18/0x1e [] lockdep_rcu_dereference+0xa9/0xb0 [] trie_firstleaf+0x65/0x70 [] fib_table_flush+0x14/0x170 ... Allow trie_firstleaf() to be called either under rcu_read_lock() protection or with RTNL held. The same annotation is added to node_parent_rcu() to prevent a similar warning a bit later. Followup of commits 634a4b20 and 4eaa0e3c. Signed-off-by: Jarek Poplawski Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 79d057a..4a8e370 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -186,7 +186,9 @@ static inline struct tnode *node_parent_rcu(struct node *node) { struct tnode *ret = node_parent(node); - return rcu_dereference(ret); + return rcu_dereference_check(ret, + rcu_read_lock_held() || + lockdep_rtnl_is_held()); } /* Same as rcu_assign_pointer @@ -1753,7 +1755,9 @@ static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c) static struct leaf *trie_firstleaf(struct trie *t) { - struct tnode *n = (struct tnode *) rcu_dereference(t->trie); + struct tnode *n = (struct tnode *) rcu_dereference_check(t->trie, + rcu_read_lock_held() || + lockdep_rtnl_is_held()); if (!n) return NULL; -- cgit v1.1 From ae2688d59b5f861dc70a091d003773975d2ae7fb Mon Sep 17 00:00:00 2001 From: Jianzhao Wang Date: Wed, 8 Sep 2010 14:35:43 -0700 Subject: net: blackhole route should always be recalculated Blackhole routes are used when xfrm_lookup() returns -EREMOTE (error triggered by IKE for example), hence this kind of route is always temporary and so we should check if a better route exists for next packets. Bug has been introduced by commit d11a4dc18bf41719c9f0d7ed494d295dd2973b92. Signed-off-by: Jianzhao Wang Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/ipv4/route.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 3f56b6e..6298f75 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2738,6 +2738,11 @@ slow_output: } EXPORT_SYMBOL_GPL(__ip_route_output_key); +static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie) +{ + return NULL; +} + static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu) { } @@ -2746,7 +2751,7 @@ static struct dst_ops ipv4_dst_blackhole_ops = { .family = AF_INET, .protocol = cpu_to_be16(ETH_P_IP), .destroy = ipv4_dst_destroy, - .check = ipv4_dst_check, + .check = ipv4_blackhole_dst_check, .update_pmtu = ipv4_rt_blackhole_update_pmtu, .entries = ATOMIC_INIT(0), }; -- cgit v1.1 From 719f835853a92f6090258114a72ffe41f09155cd Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 8 Sep 2010 05:08:44 +0000 Subject: udp: add rehash on connect() commit 30fff923 introduced in linux-2.6.33 (udp: bind() optimisation) added a secondary hash on UDP, hashed on (local addr, local port). Problem is that following sequence : fd = socket(...) connect(fd, &remote, ...) not only selects remote end point (address and port), but also sets local address, while UDP stack stored in secondary hash table the socket while its local address was INADDR_ANY (or ipv6 equivalent) Sequence is : - autobind() : choose a random local port, insert socket in hash tables [while local address is INADDR_ANY] - connect() : set remote address and port, change local address to IP given by a route lookup. When an incoming UDP frame comes, if more than 10 sockets are found in primary hash table, we switch to secondary table, and fail to find socket because its local address changed. One solution to this problem is to rehash datagram socket if needed. We add a new rehash(struct socket *) method in "struct proto", and implement this method for UDP v4 & v6, using a common helper. This rehashing only takes care of secondary hash table, since primary hash (based on local port only) is not changed. Reported-by: Krzysztof Piotr Oledzki Signed-off-by: Eric Dumazet Tested-by: Krzysztof Piotr Oledzki Signed-off-by: David S. Miller --- net/ipv4/datagram.c | 5 ++++- net/ipv4/udp.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ net/ipv6/datagram.c | 7 ++++++- net/ipv6/udp.c | 10 ++++++++++ 4 files changed, 64 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index f055094..721a8a3 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -62,8 +62,11 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) } if (!inet->inet_saddr) inet->inet_saddr = rt->rt_src; /* Update source address */ - if (!inet->inet_rcv_saddr) + if (!inet->inet_rcv_saddr) { inet->inet_rcv_saddr = rt->rt_src; + if (sk->sk_prot->rehash) + sk->sk_prot->rehash(sk); + } inet->inet_daddr = rt->rt_dst; inet->inet_dport = usin->sin_port; sk->sk_state = TCP_ESTABLISHED; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 32e0bef..fb23c2e 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1260,6 +1260,49 @@ void udp_lib_unhash(struct sock *sk) } EXPORT_SYMBOL(udp_lib_unhash); +/* + * inet_rcv_saddr was changed, we must rehash secondary hash + */ +void udp_lib_rehash(struct sock *sk, u16 newhash) +{ + if (sk_hashed(sk)) { + struct udp_table *udptable = sk->sk_prot->h.udp_table; + struct udp_hslot *hslot, *hslot2, *nhslot2; + + hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); + nhslot2 = udp_hashslot2(udptable, newhash); + udp_sk(sk)->udp_portaddr_hash = newhash; + if (hslot2 != nhslot2) { + hslot = udp_hashslot(udptable, sock_net(sk), + udp_sk(sk)->udp_port_hash); + /* we must lock primary chain too */ + spin_lock_bh(&hslot->lock); + + spin_lock(&hslot2->lock); + hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_portaddr_node); + hslot2->count--; + spin_unlock(&hslot2->lock); + + spin_lock(&nhslot2->lock); + hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node, + &nhslot2->head); + nhslot2->count++; + spin_unlock(&nhslot2->lock); + + spin_unlock_bh(&hslot->lock); + } + } +} +EXPORT_SYMBOL(udp_lib_rehash); + +static void udp_v4_rehash(struct sock *sk) +{ + u16 new_hash = udp4_portaddr_hash(sock_net(sk), + inet_sk(sk)->inet_rcv_saddr, + inet_sk(sk)->inet_num); + udp_lib_rehash(sk, new_hash); +} + static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { int rc; @@ -1843,6 +1886,7 @@ struct proto udp_prot = { .backlog_rcv = __udp_queue_rcv_skb, .hash = udp_lib_hash, .unhash = udp_lib_unhash, + .rehash = udp_v4_rehash, .get_port = udp_v4_get_port, .memory_allocated = &udp_memory_allocated, .sysctl_mem = sysctl_udp_mem, diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 7d929a2..ef371aa 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -105,9 +105,12 @@ ipv4_connected: if (ipv6_addr_any(&np->saddr)) ipv6_addr_set_v4mapped(inet->inet_saddr, &np->saddr); - if (ipv6_addr_any(&np->rcv_saddr)) + if (ipv6_addr_any(&np->rcv_saddr)) { ipv6_addr_set_v4mapped(inet->inet_rcv_saddr, &np->rcv_saddr); + if (sk->sk_prot->rehash) + sk->sk_prot->rehash(sk); + } goto out; } @@ -181,6 +184,8 @@ ipv4_connected: if (ipv6_addr_any(&np->rcv_saddr)) { ipv6_addr_copy(&np->rcv_saddr, &fl.fl6_src); inet->inet_rcv_saddr = LOOPBACK4_IPV6; + if (sk->sk_prot->rehash) + sk->sk_prot->rehash(sk); } ip6_dst_store(sk, dst, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 1dd1aff..5acb356 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -111,6 +111,15 @@ int udp_v6_get_port(struct sock *sk, unsigned short snum) return udp_lib_get_port(sk, snum, ipv6_rcv_saddr_equal, hash2_nulladdr); } +static void udp_v6_rehash(struct sock *sk) +{ + u16 new_hash = udp6_portaddr_hash(sock_net(sk), + &inet6_sk(sk)->rcv_saddr, + inet_sk(sk)->inet_num); + + udp_lib_rehash(sk, new_hash); +} + static inline int compute_score(struct sock *sk, struct net *net, unsigned short hnum, struct in6_addr *saddr, __be16 sport, @@ -1447,6 +1456,7 @@ struct proto udpv6_prot = { .backlog_rcv = udpv6_queue_rcv_skb, .hash = udp_lib_hash, .unhash = udp_lib_unhash, + .rehash = udp_v6_rehash, .get_port = udp_v6_get_port, .memory_allocated = &udp_memory_allocated, .sysctl_mem = sysctl_udp_mem, -- cgit v1.1 From 123031c0eeda6144b4002dc3285375aa9ae9dc11 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 8 Sep 2010 11:04:21 +0000 Subject: sctp: fix test for end of loop Add a list_has_sctp_addr function to simplify loop Based on a patches by Dan Carpenter and David Miller Signed-off-by: Joe Perches Acked-by: Vlad Yasevich Signed-off-by: David S. Miller --- net/sctp/sm_statefuns.c | 46 +++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 24b2cd5..d344dc4 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -1232,6 +1232,18 @@ out: return 0; } +static bool list_has_sctp_addr(const struct list_head *list, + union sctp_addr *ipaddr) +{ + struct sctp_transport *addr; + + list_for_each_entry(addr, list, transports) { + if (sctp_cmp_addr_exact(ipaddr, &addr->ipaddr)) + return true; + } + + return false; +} /* A restart is occurring, check to make sure no new addresses * are being added as we may be under a takeover attack. */ @@ -1240,10 +1252,10 @@ static int sctp_sf_check_restart_addrs(const struct sctp_association *new_asoc, struct sctp_chunk *init, sctp_cmd_seq_t *commands) { - struct sctp_transport *new_addr, *addr; - int found; + struct sctp_transport *new_addr; + int ret = 1; - /* Implementor's Guide - Sectin 5.2.2 + /* Implementor's Guide - Section 5.2.2 * ... * Before responding the endpoint MUST check to see if the * unexpected INIT adds new addresses to the association. If new @@ -1254,31 +1266,19 @@ static int sctp_sf_check_restart_addrs(const struct sctp_association *new_asoc, /* Search through all current addresses and make sure * we aren't adding any new ones. */ - new_addr = NULL; - found = 0; - list_for_each_entry(new_addr, &new_asoc->peer.transport_addr_list, - transports) { - found = 0; - list_for_each_entry(addr, &asoc->peer.transport_addr_list, - transports) { - if (sctp_cmp_addr_exact(&new_addr->ipaddr, - &addr->ipaddr)) { - found = 1; - break; - } - } - if (!found) + transports) { + if (!list_has_sctp_addr(&asoc->peer.transport_addr_list, + &new_addr->ipaddr)) { + sctp_sf_send_restart_abort(&new_addr->ipaddr, init, + commands); + ret = 0; break; - } - - /* If a new address was added, ABORT the sender. */ - if (!found && new_addr) { - sctp_sf_send_restart_abort(&new_addr->ipaddr, init, commands); + } } /* Return success if all addresses were found. */ - return found; + return ret; } /* Populate the verification/tie tags based on overlapping INIT -- cgit v1.1