From 919130981e72465867038ac1dec823dbd7a87eb5 Mon Sep 17 00:00:00 2001 From: Ted Feng Date: Thu, 8 Dec 2011 00:46:21 +0000 Subject: ipip, sit: copy parms.name after register_netdevice commit 72b36015ba43a3cca5303f5534d2c3e1899eae29 upstream. Same fix as 731abb9cb2 for ipip and sit tunnel. Commit 1c5cae815d removed an explicit call to dev_alloc_name in ipip_tunnel_locate and ipip6_tunnel_locate, because register_netdevice will now create a valid name, however the tunnel keeps a copy of the name in the private parms structure. Fix this by copying the name back after register_netdevice has successfully returned. This shows up if you do a simple tunnel add, followed by a tunnel show: $ sudo ip tunnel add mode ipip remote 10.2.20.211 $ ip tunnel tunl0: ip/ip remote any local any ttl inherit nopmtudisc tunl%d: ip/ip remote 10.2.20.211 local any ttl inherit $ sudo ip tunnel add mode sit remote 10.2.20.212 $ ip tunnel sit0: ipv6/ip remote any local any ttl 64 nopmtudisc 6rd-prefix 2002::/16 sit%d: ioctl 89f8 failed: No such device sit%d: ipv6/ip remote 10.2.20.212 local any ttl inherit Signed-off-by: Ted Feng Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/ipip.c | 7 ++++++- net/ipv6/sit.c | 7 ++++++- 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 378b20b..6f06f7f 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -285,6 +285,8 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net, if (register_netdevice(dev) < 0) goto failed_free; + strcpy(nt->parms.name, dev->name); + dev_hold(dev); ipip_tunnel_link(ipn, nt); return nt; @@ -759,7 +761,6 @@ static int ipip_tunnel_init(struct net_device *dev) struct ip_tunnel *tunnel = netdev_priv(dev); tunnel->dev = dev; - strcpy(tunnel->parms.name, dev->name); memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4); memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); @@ -825,6 +826,7 @@ static void ipip_destroy_tunnels(struct ipip_net *ipn, struct list_head *head) static int __net_init ipip_init_net(struct net *net) { struct ipip_net *ipn = net_generic(net, ipip_net_id); + struct ip_tunnel *t; int err; ipn->tunnels[0] = ipn->tunnels_wc; @@ -848,6 +850,9 @@ static int __net_init ipip_init_net(struct net *net) if ((err = register_netdev(ipn->fb_tunnel_dev))) goto err_reg_dev; + t = netdev_priv(ipn->fb_tunnel_dev); + + strcpy(t->parms.name, ipn->fb_tunnel_dev->name); return 0; err_reg_dev: diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 1cca576..38490d5 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -263,6 +263,8 @@ static struct ip_tunnel *ipip6_tunnel_locate(struct net *net, if (register_netdevice(dev) < 0) goto failed_free; + strcpy(nt->parms.name, dev->name); + dev_hold(dev); ipip6_tunnel_link(sitn, nt); @@ -1141,7 +1143,6 @@ static int ipip6_tunnel_init(struct net_device *dev) struct ip_tunnel *tunnel = netdev_priv(dev); tunnel->dev = dev; - strcpy(tunnel->parms.name, dev->name); memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4); memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); @@ -1204,6 +1205,7 @@ static void __net_exit sit_destroy_tunnels(struct sit_net *sitn, struct list_hea static int __net_init sit_init_net(struct net *net) { struct sit_net *sitn = net_generic(net, sit_net_id); + struct ip_tunnel *t; int err; sitn->tunnels[0] = sitn->tunnels_wc; @@ -1228,6 +1230,9 @@ static int __net_init sit_init_net(struct net *net) if ((err = register_netdev(sitn->fb_tunnel_dev))) goto err_reg_dev; + t = netdev_priv(sitn->fb_tunnel_dev); + + 
strcpy(t->parms.name, sitn->fb_tunnel_dev->name); return 0; err_reg_dev: -- cgit v1.1 From afa2450ce311b3182c737c3fda59bb557da93409 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 7 Dec 2011 09:02:21 +0100 Subject: mac80211: fix another race in aggregation start commit 15062e6a8524f5977f2cbdf6e3eb2f144262f74e upstream. Emmanuel noticed that when mac80211 stops the queues for aggregation that can leave a packet pending. This packet will be given to the driver after the AMPDU callback, but as a non-aggregated packet which messes up the sequence number etc. I also noticed by looking at the code that if packets are being processed while we clear the WANT_START bit, they might see it cleared already and queue up on tid_tx->pending. If the driver then rejects the new aggregation session we leak the packet. Fix both of these issues by changing this code to not stop the queues at all. Instead, let packets queue up on the tid_tx->pending queue instead of letting them get to the driver, and add code to recover properly in case the driver rejects the session. (The patch looks large because it has to move two functions to before their new use.) Reported-by: Emmanuel Grumbach Signed-off-by: Johannes Berg Signed-off-by: John W. Linville Signed-off-by: Greg Kroah-Hartman --- net/mac80211/agg-tx.c | 86 ++++++++++++++++++++++++--------------------------- 1 file changed, 41 insertions(+), 45 deletions(-) (limited to 'net') diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index db7db43..b7f4f5c 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -304,6 +304,38 @@ ieee80211_wake_queue_agg(struct ieee80211_local *local, int tid) __release(agg_queue); } +/* + * splice packets from the STA's pending to the local pending, + * requires a call to ieee80211_agg_splice_finish later + */ +static void __acquires(agg_queue) +ieee80211_agg_splice_packets(struct ieee80211_local *local, + struct tid_ampdu_tx *tid_tx, u16 tid) +{ + int queue = ieee80211_ac_from_tid(tid); + unsigned long flags; + + ieee80211_stop_queue_agg(local, tid); + + if (WARN(!tid_tx, "TID %d gone but expected when splicing aggregates" + " from the pending queue\n", tid)) + return; + + if (!skb_queue_empty(&tid_tx->pending)) { + spin_lock_irqsave(&local->queue_stop_reason_lock, flags); + /* copy over remaining packets */ + skb_queue_splice_tail_init(&tid_tx->pending, + &local->pending[queue]); + spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); + } +} + +static void __releases(agg_queue) +ieee80211_agg_splice_finish(struct ieee80211_local *local, u16 tid) +{ + ieee80211_wake_queue_agg(local, tid); +} + void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid) { struct tid_ampdu_tx *tid_tx; @@ -315,19 +347,17 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid) tid_tx = rcu_dereference_protected_tid_tx(sta, tid); /* - * While we're asking the driver about the aggregation, - * stop the AC queue so that we don't have to worry - * about frames that came in while we were doing that, - * which would require us to put them to the AC pending - * afterwards which just makes the code more complex. + * Start queuing up packets for this aggregation session. + * We're going to release them once the driver is OK with + * that. */ - ieee80211_stop_queue_agg(local, tid); - clear_bit(HT_AGG_STATE_WANT_START, &tid_tx->state); /* - * make sure no packets are being processed to get - * valid starting sequence number + * Make sure no packets are being processed. 
This ensures that + * we have a valid starting sequence number and that in-flight + * packets have been flushed out and no packets for this TID + * will go into the driver during the ampdu_action call. */ synchronize_net(); @@ -341,17 +371,15 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid) " tid %d\n", tid); #endif spin_lock_bh(&sta->lock); + ieee80211_agg_splice_packets(local, tid_tx, tid); ieee80211_assign_tid_tx(sta, tid, NULL); + ieee80211_agg_splice_finish(local, tid); spin_unlock_bh(&sta->lock); - ieee80211_wake_queue_agg(local, tid); kfree_rcu(tid_tx, rcu_head); return; } - /* we can take packets again now */ - ieee80211_wake_queue_agg(local, tid); - /* activate the timer for the recipient's addBA response */ mod_timer(&tid_tx->addba_resp_timer, jiffies + ADDBA_RESP_INTERVAL); #ifdef CONFIG_MAC80211_HT_DEBUG @@ -471,38 +499,6 @@ int ieee80211_start_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid, } EXPORT_SYMBOL(ieee80211_start_tx_ba_session); -/* - * splice packets from the STA's pending to the local pending, - * requires a call to ieee80211_agg_splice_finish later - */ -static void __acquires(agg_queue) -ieee80211_agg_splice_packets(struct ieee80211_local *local, - struct tid_ampdu_tx *tid_tx, u16 tid) -{ - int queue = ieee80211_ac_from_tid(tid); - unsigned long flags; - - ieee80211_stop_queue_agg(local, tid); - - if (WARN(!tid_tx, "TID %d gone but expected when splicing aggregates" - " from the pending queue\n", tid)) - return; - - if (!skb_queue_empty(&tid_tx->pending)) { - spin_lock_irqsave(&local->queue_stop_reason_lock, flags); - /* copy over remaining packets */ - skb_queue_splice_tail_init(&tid_tx->pending, - &local->pending[queue]); - spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); - } -} - -static void __releases(agg_queue) -ieee80211_agg_splice_finish(struct ieee80211_local *local, u16 tid) -{ - ieee80211_wake_queue_agg(local, tid); -} - static void ieee80211_agg_tx_operational(struct ieee80211_local *local, struct sta_info *sta, u16 tid) { -- cgit v1.1 From 7eac8f9de24674cc55ee9797d05447bbfbdf1a96 Mon Sep 17 00:00:00 2001 From: Alex Juncu Date: Thu, 15 Dec 2011 23:01:25 +0000 Subject: llc: llc_cmsg_rcv was getting called after sk_eat_skb. [ Upstream commit 9cef310fcdee12b49b8b4c96fd8f611c8873d284 ] Received non stream protocol packets were calling llc_cmsg_rcv that used a skb after that skb was released by sk_eat_skb. This caused received STP packets to generate kernel panics. Signed-off-by: Alexandru Juncu Signed-off-by: Kunjan Naik Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman --- net/llc/af_llc.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index dfd3a64..a18e6c3 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -833,15 +833,15 @@ static int llc_ui_recvmsg(struct kiocb *iocb, struct socket *sock, copied += used; len -= used; + /* For non stream protcols we get one packet per recvmsg call */ + if (sk->sk_type != SOCK_STREAM) + goto copy_uaddr; + if (!(flags & MSG_PEEK)) { sk_eat_skb(sk, skb, 0); *seq = 0; } - /* For non stream protcols we get one packet per recvmsg call */ - if (sk->sk_type != SOCK_STREAM) - goto copy_uaddr; - /* Partial read */ if (used + offset < skb->len) continue; @@ -857,6 +857,12 @@ copy_uaddr: } if (llc_sk(sk)->cmsg_flags) llc_cmsg_rcv(msg, skb); + + if (!(flags & MSG_PEEK)) { + sk_eat_skb(sk, skb, 0); + *seq = 0; + } + goto out; } -- cgit v1.1 From 477a897533f9ab9a6ebb6eedfa9ca3760caa94b2 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Thu, 22 Dec 2011 02:05:07 +0000 Subject: mqprio: Avoid panic if no options are provided [ Upstream commit 7838f2ce36b6ab5c13ef20b1857e3bbd567f1759 ] Userspace may not provide TCA_OPTIONS; in fact tc currently does not do so if no arguments are specified on the command line. Return EINVAL instead of panicking. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/sched/sch_mqprio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index ea17cbe..59b26b8 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -106,7 +106,7 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt) if (!netif_is_multiqueue(dev)) return -EOPNOTSUPP; - if (nla_len(opt) < sizeof(*qopt)) + if (!opt || nla_len(opt) < sizeof(*qopt)) return -EINVAL; qopt = nla_data(opt); -- cgit v1.1 From 9ec14c04ec6be93ff397adf250bc91ee77742bfb Mon Sep 17 00:00:00 2001 From: Gerlando Falauto Date: Mon, 19 Dec 2011 22:58:04 +0000 Subject: net: have ipconfig not wait if no dev is available [ Upstream commit cd7816d14953c8af910af5bb92f488b0b277e29d ] Previous commit 3fb72f1e6e6165c5f495e8dc11c5bbd14c73385c makes IP-Config wait for carrier on at least one network device. Before waiting (predefined value 120s), check that at least one device was successfully brought up. Otherwise (e.g. buggy bootloader which does not set the MAC address) there is no point in waiting for carrier. Cc: Micha Nelissen Cc: Holger Brunck Signed-off-by: Gerlando Falauto Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/ipconfig.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index ab7e554..7fbcaba 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -252,6 +252,10 @@ static int __init ic_open_devs(void) } } + /* no point in waiting if we could not bring up at least one device */ + if (!ic_first_dev) + goto have_carrier; + /* wait for a carrier on at least one device */ start = jiffies; while (jiffies - start < msecs_to_jiffies(CONF_CARRIER_TIMEOUT)) { -- cgit v1.1 From 01d6bbab3834409c220083f25810be9f1a553054 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 11 Dec 2011 23:42:53 +0000 Subject: sch_gred: should not use GFP_KERNEL while holding a spinlock [ Upstream commit 3f1e6d3fd37bd4f25e5b19f1c7ca21850426c33f ] gred_change_vq() is called under sch_tree_lock(sch).
This means a spinlock is held, and we are not allowed to sleep in this context. We might pre-allocate memory using GFP_KERNEL before taking spinlock, but this is not suitable for stable material. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/sched/sch_gred.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index b9493a0..6cd8ddf 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -385,7 +385,7 @@ static inline int gred_change_vq(struct Qdisc *sch, int dp, struct gred_sched_data *q; if (table->tab[dp] == NULL) { - table->tab[dp] = kzalloc(sizeof(*q), GFP_KERNEL); + table->tab[dp] = kzalloc(sizeof(*q), GFP_ATOMIC); if (table->tab[dp] == NULL) return -ENOMEM; } -- cgit v1.1 From f6e4c89e089ae671a677242edb9e8b08c369c415 Mon Sep 17 00:00:00 2001 From: Xi Wang Date: Fri, 16 Dec 2011 12:44:15 +0000 Subject: sctp: fix incorrect overflow check on autoclose [ Upstream commit 2692ba61a82203404abd7dd2a027bda962861f74 ] Commit 8ffd3208 voids the previous patches f6778aab and 810c0719 for limiting the autoclose value. If userspace passes in -1 on 32-bit platform, the overflow check didn't work and autoclose would be set to 0xffffffff. This patch defines a max_autoclose (in seconds) for limiting the value and exposes it through sysctl, with the following intentions. 1) Avoid overflowing autoclose * HZ. 2) Keep the default autoclose bound consistent across 32- and 64-bit platforms (INT_MAX / HZ in this patch). 3) Keep the autoclose value consistent between setsockopt() and getsockopt() calls. Suggested-by: Vlad Yasevich Signed-off-by: Xi Wang Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/sctp/associola.c | 2 +- net/sctp/protocol.c | 3 +++ net/sctp/socket.c | 2 -- net/sctp/sysctl.c | 13 +++++++++++++ 4 files changed, 17 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 4a62888..17a6e65 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -173,7 +173,7 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a asoc->timeouts[SCTP_EVENT_TIMEOUT_HEARTBEAT] = 0; asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] = asoc->sackdelay; asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] = - (unsigned long)sp->autoclose * HZ; + min_t(unsigned long, sp->autoclose, sctp_max_autoclose) * HZ; /* Initializes the timers */ for (i = SCTP_EVENT_TIMEOUT_NONE; i < SCTP_NUM_TIMEOUT_TYPES; ++i) diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 207175b..946afd6 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1144,6 +1144,9 @@ SCTP_STATIC __init int sctp_init(void) sctp_max_instreams = SCTP_DEFAULT_INSTREAMS; sctp_max_outstreams = SCTP_DEFAULT_OUTSTREAMS; + /* Initialize maximum autoclose timeout. */ + sctp_max_autoclose = INT_MAX / HZ; + /* Initialize handle used for association ids. 
*/ idr_init(&sctp_assocs_id); diff --git a/net/sctp/socket.c b/net/sctp/socket.c index d3ccf79..fa9b5c7 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -2129,8 +2129,6 @@ static int sctp_setsockopt_autoclose(struct sock *sk, char __user *optval, return -EINVAL; if (copy_from_user(&sp->autoclose, optval, optlen)) return -EFAULT; - /* make sure it won't exceed MAX_SCHEDULE_TIMEOUT */ - sp->autoclose = min_t(long, sp->autoclose, MAX_SCHEDULE_TIMEOUT / HZ); return 0; } diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index 50cb57f..6752f48 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -53,6 +53,10 @@ static int sack_timer_min = 1; static int sack_timer_max = 500; static int addr_scope_max = 3; /* check sctp_scope_policy_t in include/net/sctp/constants.h for max entries */ static int rwnd_scale_max = 16; +static unsigned long max_autoclose_min = 0; +static unsigned long max_autoclose_max = + (MAX_SCHEDULE_TIMEOUT / HZ > UINT_MAX) + ? UINT_MAX : MAX_SCHEDULE_TIMEOUT / HZ; extern long sysctl_sctp_mem[3]; extern int sysctl_sctp_rmem[3]; @@ -251,6 +255,15 @@ static ctl_table sctp_table[] = { .extra1 = &one, .extra2 = &rwnd_scale_max, }, + { + .procname = "max_autoclose", + .data = &sctp_max_autoclose, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = &proc_doulongvec_minmax, + .extra1 = &max_autoclose_min, + .extra2 = &max_autoclose_max, + }, { /* sentinel */ } }; -- cgit v1.1 From 0e5fe3ed8d751c7be333fa193882e91dcc289158 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Mon, 19 Dec 2011 04:11:40 +0000 Subject: sctp: Do not account for sizeof(struct sk_buff) in estimated rwnd [ Upstream commit a76c0adf60f6ca5ff3481992e4ea0383776b24d2 ] When checking whether a DATA chunk fits into the estimated rwnd a full sizeof(struct sk_buff) is added to the needed chunk size. This quickly exhausts the available rwnd space and leads to packets being sent which are much below the PMTU limit. This can lead to much worse performance. The reason for this behaviour was to avoid putting too much memory pressure on the receiver. The concept is not completely irrational because a Linux receiver does in fact clone an skb for each DATA chunk delivered. However, Linux also reserves half the available socket buffer space for data structures, therefore usage of it is already accounted for. When proposing to change this the last time it was noted that this behaviour was introduced to solve a performance issue caused by rwnd overusage in combination with small DATA chunks. Trying to reproduce this I found that with the sk_buff overhead removed, the performance would improve significantly unless socket buffer limits are increased. The following numbers have been gathered using a patched iperf supporting SCTP over a live 1 Gbit ethernet network. The -l option was used to limit DATA chunk sizes. The numbers listed are based on the average of 3 test runs each. Default values have been used for sk_(r|w)mem.
Chunk Size    Unpatched        No Overhead
-------------------------------------------
      4       15.2 Kbit [!]    12.2 Mbit [!]
      8       35.8 Kbit [!]    26.0 Mbit [!]
     16       95.5 Kbit [!]    54.4 Mbit [!]
     32      106.7 Mbit       102.3 Mbit
     64      189.2 Mbit       188.3 Mbit
    128      331.2 Mbit       334.8 Mbit
    256      537.7 Mbit       536.0 Mbit
    512      766.9 Mbit       766.6 Mbit
   1024      810.1 Mbit       808.6 Mbit
Signed-off-by: Thomas Graf Signed-off-by: David S.
Miller Signed-off-by: Greg Kroah-Hartman --- net/sctp/output.c | 8 +------- net/sctp/outqueue.c | 6 ++---- 2 files changed, 3 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/sctp/output.c b/net/sctp/output.c index 08b3cea..817174e 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -697,13 +697,7 @@ static void sctp_packet_append_data(struct sctp_packet *packet, /* Keep track of how many bytes are in flight to the receiver. */ asoc->outqueue.outstanding_bytes += datasize; - /* Update our view of the receiver's rwnd. Include sk_buff overhead - * while updating peer.rwnd so that it reduces the chances of a - * receiver running out of receive buffer space even when receive - * window is still open. This can happen when a sender is sending - * sending small messages. - */ - datasize += sizeof(struct sk_buff); + /* Update our view of the receiver's rwnd. */ if (datasize < rwnd) rwnd -= datasize; else diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index d036821..1f2938f 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -411,8 +411,7 @@ void sctp_retransmit_mark(struct sctp_outq *q, chunk->transport->flight_size -= sctp_data_size(chunk); q->outstanding_bytes -= sctp_data_size(chunk); - q->asoc->peer.rwnd += (sctp_data_size(chunk) + - sizeof(struct sk_buff)); + q->asoc->peer.rwnd += sctp_data_size(chunk); } continue; } @@ -432,8 +431,7 @@ void sctp_retransmit_mark(struct sctp_outq *q, * (Section 7.2.4)), add the data size of those * chunks to the rwnd. */ - q->asoc->peer.rwnd += (sctp_data_size(chunk) + - sizeof(struct sk_buff)); + q->asoc->peer.rwnd += sctp_data_size(chunk); q->outstanding_bytes -= sctp_data_size(chunk); if (chunk->transport) transport->flight_size -= sctp_data_size(chunk); -- cgit v1.1 From 6c3efb1526c3fcdab3e5bbc9c77710b306493507 Mon Sep 17 00:00:00 2001 From: Weiping Pan Date: Thu, 1 Dec 2011 15:47:06 +0000 Subject: ipv4: flush route cache after change accept_local [ Upstream commit d01ff0a049f749e0bf10a35bb23edd012718c8c2 ] After reset ipv4_devconf->data[IPV4_DEVCONF_ACCEPT_LOCAL] to 0, we should flush route cache, or it will continue receive packets with local source address, which should be dropped. Signed-off-by: Weiping Pan Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/devinet.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 4155abc..7d7fb20 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1490,7 +1490,9 @@ static int devinet_conf_proc(ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + int old_value = *(int *)ctl->data; int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); + int new_value = *(int *)ctl->data; if (write) { struct ipv4_devconf *cnf = ctl->extra1; @@ -1501,6 +1503,9 @@ static int devinet_conf_proc(ctl_table *ctl, int write, if (cnf == net->ipv4.devconf_dflt) devinet_copy_dflt_conf(net, i); + if (i == IPV4_DEVCONF_ACCEPT_LOCAL - 1) + if ((new_value == 0) && (old_value != 0)) + rt_cache_flush(net, 0); } return ret; -- cgit v1.1 From ad5dd5dc45d80c397dfe314934e91d0ead793928 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 21 Dec 2011 15:47:16 -0500 Subject: ipv4: reintroduce route cache garbage collector [ Upstream commit 9f28a2fc0bd77511f649c0a788c7bf9a5fd04edb ] Commit 2c8cec5c10b (ipv4: Cache learned PMTU information in inetpeer) removed IP route cache garbage collector a bit too soon, as this gc was responsible for expired routes cleanup, releasing their neighbour reference. As pointed out by Robert Gladewitz, recent kernels can fill and exhaust their neighbour cache. Reintroduce the garbage collection, since we'll have to wait our neighbour lookups become refcount-less to not depend on this stuff. Reported-by: Robert Gladewitz Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/route.c | 106 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 75ef66f..b01f569 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -132,6 +132,9 @@ static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20; static int ip_rt_min_advmss __read_mostly = 256; static int rt_chain_length_max __read_mostly = 20; +static struct delayed_work expires_work; +static unsigned long expires_ljiffies; + /* * Interface to generic destination cache. 
*/ @@ -821,6 +824,97 @@ static int has_noalias(const struct rtable *head, const struct rtable *rth) return ONE; } +static void rt_check_expire(void) +{ + static unsigned int rover; + unsigned int i = rover, goal; + struct rtable *rth; + struct rtable __rcu **rthp; + unsigned long samples = 0; + unsigned long sum = 0, sum2 = 0; + unsigned long delta; + u64 mult; + + delta = jiffies - expires_ljiffies; + expires_ljiffies = jiffies; + mult = ((u64)delta) << rt_hash_log; + if (ip_rt_gc_timeout > 1) + do_div(mult, ip_rt_gc_timeout); + goal = (unsigned int)mult; + if (goal > rt_hash_mask) + goal = rt_hash_mask + 1; + for (; goal > 0; goal--) { + unsigned long tmo = ip_rt_gc_timeout; + unsigned long length; + + i = (i + 1) & rt_hash_mask; + rthp = &rt_hash_table[i].chain; + + if (need_resched()) + cond_resched(); + + samples++; + + if (rcu_dereference_raw(*rthp) == NULL) + continue; + length = 0; + spin_lock_bh(rt_hash_lock_addr(i)); + while ((rth = rcu_dereference_protected(*rthp, + lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) { + prefetch(rth->dst.rt_next); + if (rt_is_expired(rth)) { + *rthp = rth->dst.rt_next; + rt_free(rth); + continue; + } + if (rth->dst.expires) { + /* Entry is expired even if it is in use */ + if (time_before_eq(jiffies, rth->dst.expires)) { +nofree: + tmo >>= 1; + rthp = &rth->dst.rt_next; + /* + * We only count entries on + * a chain with equal hash inputs once + * so that entries for different QOS + * levels, and other non-hash input + * attributes don't unfairly skew + * the length computation + */ + length += has_noalias(rt_hash_table[i].chain, rth); + continue; + } + } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) + goto nofree; + + /* Cleanup aged off entries. */ + *rthp = rth->dst.rt_next; + rt_free(rth); + } + spin_unlock_bh(rt_hash_lock_addr(i)); + sum += length; + sum2 += length*length; + } + if (samples) { + unsigned long avg = sum / samples; + unsigned long sd = int_sqrt(sum2 / samples - avg*avg); + rt_chain_length_max = max_t(unsigned long, + ip_rt_gc_elasticity, + (avg + 4*sd) >> FRACT_BITS); + } + rover = i; +} + +/* + * rt_worker_func() is run in process context. 
+ * we call rt_check_expire() to scan part of the hash table + */ +static void rt_worker_func(struct work_struct *work) +{ + rt_check_expire(); + schedule_delayed_work(&expires_work, ip_rt_gc_interval); +} + /* * Perturbation of rt_genid by a small quantity [1..256] * Using 8 bits of shuffling ensure we can call rt_cache_invalidate() @@ -3088,6 +3182,13 @@ static ctl_table ipv4_route_table[] = { .proc_handler = proc_dointvec_jiffies, }, { + .procname = "gc_interval", + .data = &ip_rt_gc_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { .procname = "redirect_load", .data = &ip_rt_redirect_load, .maxlen = sizeof(int), @@ -3297,6 +3398,11 @@ int __init ip_rt_init(void) devinet_init(); ip_fib_init(); + INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func); + expires_ljiffies = jiffies; + schedule_delayed_work(&expires_work, + net_random() % ip_rt_gc_interval + ip_rt_gc_interval); + if (ip_rt_proc_init()) printk(KERN_ERR "Unable to create route proc files\n"); #ifdef CONFIG_XFRM -- cgit v1.1 From 732e81a7579eb0adb26aeadb209e919ee984d01e Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Thu, 22 Dec 2011 17:03:29 +1100 Subject: ipv4: using prefetch requires including prefetch.h [ Upstream commit b9eda06f80b0db61a73bd87c6b0eb67d8aca55ad ] Signed-off-by: Stephen Rothwell Acked-by: Eric Dumazet Acked-by: David Miller Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- net/ipv4/route.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index b01f569..4845bfe 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -91,6 +91,7 @@ #include #include #include +#include #include #include #include -- cgit v1.1 From 49ffa26eca87d3518ed88d3e6feebf1b80837a15 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Mon, 9 Jan 2012 14:06:46 -0800 Subject: igmp: Avoid zero delay when receiving odd mixture of IGMP queries commit a8c1f65c79cbbb2f7da782d4c9d15639a9b94b27 upstream. Commit 5b7c84066733c5dfb0e4016d939757b38de189e4 ('ipv4: correct IGMP behavior on v3 query during v2-compatibility mode') added yet another case for query parsing, which can result in max_delay = 0. Substitute a value of 1, as in the usual v3 case. Reported-by: Simon McVittie References: http://bugs.debian.org/654876 Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- net/ipv4/igmp.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index d577199..e0d42db 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -875,6 +875,8 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, * to be intended in a v3 query. */ max_delay = IGMPV3_MRC(ih3->code)*(HZ/IGMP_TIMER_SCALE); + if (!max_delay) + max_delay = 1; /* can't mod w/ 0 */ } else { /* v3 */ if (!pskb_may_pull(skb, sizeof(struct igmpv3_query))) return; -- cgit v1.1 From b9e11747e1227d7ad67c5b80be4b206e4059687e Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Wed, 11 Jan 2012 09:26:54 +0100 Subject: mac80211: fix rx->key NULL pointer dereference in promiscuous mode commit 1140afa862842ac3e56678693050760edc4ecde9 upstream. Since: commit 816c04fe7ef01dd9649f5ccfe796474db8708be5 Author: Christian Lamparter Date: Sat Apr 30 15:24:30 2011 +0200 mac80211: consolidate MIC failure report handling is possible to that we dereference rx->key == NULL when driver set RX_FLAG_MMIC_STRIPPED and not RX_FLAG_IV_STRIPPED and we are in promiscuous mode. 
This happens with rt73usb and rt61pci at least. Before the commit we always checked rx->key against NULL, so I assume the fix should be done in mac80211 (also the mic_fail path has a similar check). References: https://bugzilla.redhat.com/show_bug.cgi?id=769766 http://rt2x00.serialmonkey.com/pipermail/users_rt2x00.serialmonkey.com/2012-January/004395.html Reported-by: Stuart D Gathman Reported-by: Kai Wohlfahrt Signed-off-by: Stanislaw Gruszka Signed-off-by: John W. Linville Signed-off-by: Greg Kroah-Hartman --- net/mac80211/wpa.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c index 8f6a302..aa1c40a 100644 --- a/net/mac80211/wpa.c +++ b/net/mac80211/wpa.c @@ -109,7 +109,7 @@ ieee80211_rx_h_michael_mic_verify(struct ieee80211_rx_data *rx) if (status->flag & RX_FLAG_MMIC_ERROR) goto mic_fail; - if (!(status->flag & RX_FLAG_IV_STRIPPED)) + if (!(status->flag & RX_FLAG_IV_STRIPPED) && rx->key) goto update_iv; return RX_CONTINUE; -- cgit v1.1 From b09577ca6680033a4315e2f5cb3a95ebbb8dea79 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 22 Dec 2011 18:22:49 -0700 Subject: svcrpc: fix double-free on shutdown of nfsd after changing pool mode commit 61c8504c428edcebf23b97775a129c5b393a302b upstream. The pool_to and to_pool fields of the global svc_pool_map are freed on shutdown, but are initialized in nfsd startup only in the SVC_POOL_PERCPU and SVC_POOL_PERNODE cases. They *are* initialized to zero on kernel startup. So as long as you use only SVC_POOL_GLOBAL (the default), this will never be a problem. You're also OK if you only ever use SVC_POOL_PERCPU or SVC_POOL_PERNODE. However, the following sequence of events leads to a double-free:
1. set SVC_POOL_PERCPU or SVC_POOL_PERNODE
2. start nfsd: both fields are initialized.
3. shutdown nfsd: both fields are freed.
4. set SVC_POOL_GLOBAL
5. start nfsd: the fields are left untouched.
6. shutdown nfsd: now we try to free them again.
Step 4 is actually unnecessary, since (for some bizarre reason), nfsd automatically resets the pool mode to SVC_POOL_GLOBAL on shutdown. Signed-off-by: J. Bruce Fields Signed-off-by: Greg Kroah-Hartman --- net/sunrpc/svc.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 2b90292..131da58 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -167,6 +167,7 @@ svc_pool_map_alloc_arrays(struct svc_pool_map *m, unsigned int maxpools) fail_free: kfree(m->to_pool); + m->to_pool = NULL; fail: return -ENOMEM; } @@ -287,7 +288,9 @@ svc_pool_map_put(void) if (!--m->count) { m->mode = SVC_POOL_DEFAULT; kfree(m->to_pool); + m->to_pool = NULL; kfree(m->pool_to); + m->pool_to = NULL; m->npools = 0; } -- cgit v1.1 From 7df22768c0af8769d805f6db21144d71d91fe13d Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 29 Nov 2011 11:35:35 -0500 Subject: svcrpc: destroy server sockets all at once commit 2fefb8a09e7ed251ae8996e0c69066e74c5aa560 upstream. There's no reason I can see that we need to call sv_shutdown between closing the two lists of sockets. Signed-off-by: J.
Bruce Fields Signed-off-by: Greg Kroah-Hartman --- net/sunrpc/svc.c | 7 +------ net/sunrpc/svc_xprt.c | 11 ++++++++++- 2 files changed, 11 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 131da58..4d5cb99 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -476,16 +476,11 @@ svc_destroy(struct svc_serv *serv) del_timer_sync(&serv->sv_temptimer); - svc_close_all(&serv->sv_tempsocks); + svc_close_all(serv); if (serv->sv_shutdown) serv->sv_shutdown(serv); - svc_close_all(&serv->sv_permsocks); - - BUG_ON(!list_empty(&serv->sv_permsocks)); - BUG_ON(!list_empty(&serv->sv_tempsocks)); - cache_clean_deferred(serv); if (svc_serv_is_pooled(serv)) diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index bd31208..9cb2621 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -936,7 +936,7 @@ void svc_close_xprt(struct svc_xprt *xprt) } EXPORT_SYMBOL_GPL(svc_close_xprt); -void svc_close_all(struct list_head *xprt_list) +static void svc_close_list(struct list_head *xprt_list) { struct svc_xprt *xprt; struct svc_xprt *tmp; @@ -954,6 +954,15 @@ void svc_close_all(struct list_head *xprt_list) } } +void svc_close_all(struct svc_serv *serv) +{ + svc_close_list(&serv->sv_tempsocks); + svc_close_list(&serv->sv_permsocks); + BUG_ON(!list_empty(&serv->sv_permsocks)); + BUG_ON(!list_empty(&serv->sv_tempsocks)); + +} + /* * Handle defer and revisit of requests */ -- cgit v1.1 From a141a5eb3ab45131cb168e7a561d662722b43ec3 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 29 Nov 2011 17:00:26 -0500 Subject: svcrpc: avoid memory-corruption on pool shutdown commit b4f36f88b3ee7cf26bf0be84e6c7fc15f84dcb71 upstream. Socket callbacks use svc_xprt_enqueue() to add an xprt to a pool->sp_sockets list. In normal operation a server thread will later come along and take the xprt off that list. On shutdown, after all the threads have exited, we instead manually walk the sv_tempsocks and sv_permsocks lists to find all the xprt's and delete them. So the sp_sockets lists don't really matter any more. As a result, we've mostly just ignored them and hoped they would go away. Which has gotten us into trouble; witness for example ebc63e531cc6 "svcrpc: fix list-corrupting race on nfsd shutdown", the result of Ben Greear noticing that a still-running svc_xprt_enqueue() could re-add an xprt to an sp_sockets list just before it was deleted. The fix was to remove it from the list at the end of svc_delete_xprt(). But that only made corruption less likely--I can see nothing that prevents a svc_xprt_enqueue() from adding another xprt to the list at the same moment that we're removing this xprt from the list. In fact, despite the earlier xpo_detach(), I don't even see what guarantees that svc_xprt_enqueue() couldn't still be running on this xprt. So, instead, note that svc_xprt_enqueue() essentially does: lock sp_lock if XPT_BUSY unset add to sp_sockets unlock sp_lock So, if we do: set XPT_BUSY on every xprt. Empty every sp_sockets list, under the sp_socks locks. Then we're left knowing that the sp_sockets lists are all empty and will stay that way, since any svc_xprt_enqueue() will check XPT_BUSY under the sp_lock and see it set. And *then* we can continue deleting the xprt's. (Thanks to Jeff Layton for being correctly suspicious of this code....) Cc: Ben Greear Cc: Jeff Layton Signed-off-by: J. 
Bruce Fields Signed-off-by: Greg Kroah-Hartman --- net/sunrpc/svc.c | 10 +++++++++- net/sunrpc/svc_xprt.c | 48 +++++++++++++++++++++++++++++------------------- 2 files changed, 38 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 4d5cb99..ce5f111 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -475,7 +475,15 @@ svc_destroy(struct svc_serv *serv) printk("svc_destroy: no threads for serv=%p!\n", serv); del_timer_sync(&serv->sv_temptimer); - + /* + * The set of xprts (contained in the sv_tempsocks and + * sv_permsocks lists) is now constant, since it is modified + * only by accepting new sockets (done by service threads in + * svc_recv) or aging old ones (done by sv_temptimer), or + * configuration changes (excluded by whatever locking the + * caller is using--nfsd_mutex in the case of nfsd). So it's + * safe to traverse those lists and shut everything down: + */ svc_close_all(serv); if (serv->sv_shutdown) diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 9cb2621..9d7ed0b 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -901,14 +901,7 @@ void svc_delete_xprt(struct svc_xprt *xprt) spin_lock_bh(&serv->sv_lock); if (!test_and_set_bit(XPT_DETACHED, &xprt->xpt_flags)) list_del_init(&xprt->xpt_list); - /* - * The only time we're called while xpt_ready is still on a list - * is while the list itself is about to be destroyed (in - * svc_destroy). BUT svc_xprt_enqueue could still be attempting - * to add new entries to the sp_sockets list, so we can't leave - * a freed xprt on it. - */ - list_del_init(&xprt->xpt_ready); + BUG_ON(!list_empty(&xprt->xpt_ready)); if (test_bit(XPT_TEMP, &xprt->xpt_flags)) serv->sv_tmpcnt--; spin_unlock_bh(&serv->sv_lock); @@ -939,28 +932,45 @@ EXPORT_SYMBOL_GPL(svc_close_xprt); static void svc_close_list(struct list_head *xprt_list) { struct svc_xprt *xprt; - struct svc_xprt *tmp; - /* - * The server is shutting down, and no more threads are running. - * svc_xprt_enqueue() might still be running, but at worst it - * will re-add the xprt to sp_sockets, which will soon get - * freed. So we don't bother with any more locking, and don't - * leave the close to the (nonexistent) server threads: - */ - list_for_each_entry_safe(xprt, tmp, xprt_list, xpt_list) { + list_for_each_entry(xprt, xprt_list, xpt_list) { set_bit(XPT_CLOSE, &xprt->xpt_flags); - svc_delete_xprt(xprt); + set_bit(XPT_BUSY, &xprt->xpt_flags); } } void svc_close_all(struct svc_serv *serv) { + struct svc_pool *pool; + struct svc_xprt *xprt; + struct svc_xprt *tmp; + int i; + svc_close_list(&serv->sv_tempsocks); svc_close_list(&serv->sv_permsocks); + + for (i = 0; i < serv->sv_nrpools; i++) { + pool = &serv->sv_pools[i]; + + spin_lock_bh(&pool->sp_lock); + while (!list_empty(&pool->sp_sockets)) { + xprt = list_first_entry(&pool->sp_sockets, struct svc_xprt, xpt_ready); + list_del_init(&xprt->xpt_ready); + } + spin_unlock_bh(&pool->sp_lock); + } + /* + * At this point the sp_sockets lists will stay empty, since + * svc_enqueue will not add new entries without taking the + * sp_lock and checking XPT_BUSY. 
+ */ + list_for_each_entry_safe(xprt, tmp, &serv->sv_tempsocks, xpt_list) + svc_delete_xprt(xprt); + list_for_each_entry_safe(xprt, tmp, &serv->sv_permsocks, xpt_list) + svc_delete_xprt(xprt); + BUG_ON(!list_empty(&serv->sv_permsocks)); BUG_ON(!list_empty(&serv->sv_tempsocks)); - } /* -- cgit v1.1 From d253520a7b2c2223fb4f704f06d10f2c547bdeef Mon Sep 17 00:00:00 2001 From: Nick Bowler Date: Tue, 8 Nov 2011 12:12:44 +0000 Subject: ah: Correctly pass error codes in ahash output callback. commit 069294e813ed5f27f82613b027609bcda5f1b914 upstream. The AH4/6 ahash output callbacks pass nexthdr to xfrm_output_resume instead of the error code. This appears to be a copy+paste error from the input case, where nexthdr is expected. This causes the driver to continuously add AH headers to the datagram until either an allocation fails and the packet is dropped or the ahash driver hits a synchronous fallback and the resulting monstrosity is transmitted. Correct this issue by simply passing the error code unadulterated. Signed-off-by: Nick Bowler Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/ah4.c | 2 -- net/ipv6/ah6.c | 2 -- 2 files changed, 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index c1f4154..33ca186 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -136,8 +136,6 @@ static void ah_output_done(struct crypto_async_request *base, int err) memcpy(top_iph+1, iph+1, top_iph->ihl*4 - sizeof(struct iphdr)); } - err = ah->nexthdr; - kfree(AH_SKB_CB(skb)->tmp); xfrm_output_resume(skb, err); } diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index 2195ae6..ede4d9d 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -324,8 +324,6 @@ static void ah6_output_done(struct crypto_async_request *base, int err) #endif } - err = ah->nexthdr; - kfree(AH_SKB_CB(skb)->tmp); xfrm_output_resume(skb, err); } -- cgit v1.1 From c0ab420c6822529fa5aba05668e1e983b065460f Mon Sep 17 00:00:00 2001 From: Nick Bowler Date: Tue, 8 Nov 2011 12:12:45 +0000 Subject: ah: Read nexthdr value before overwriting it in ahash input callback. commit b7ea81a58adc123a4e980cb0eff9eb5c144b5dc7 upstream. The AH4/6 ahash input callbacks read out the nexthdr field from the AH header *after* they overwrite that header. This is obviously not going to end well. Fix it up. Signed-off-by: Nick Bowler Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/ah4.c | 4 ++-- net/ipv6/ah6.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index 33ca186..c7056b2 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -262,12 +262,12 @@ static void ah_input_done(struct crypto_async_request *base, int err) if (err) goto out; + err = ah->nexthdr; + skb->network_header += ah_hlen; memcpy(skb_network_header(skb), work_iph, ihl); __skb_pull(skb, ah_hlen + ihl); skb_set_transport_header(skb, -ihl); - - err = ah->nexthdr; out: kfree(AH_SKB_CB(skb)->tmp); xfrm_input_resume(skb, err); diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index ede4d9d..7a33aaa 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -464,12 +464,12 @@ static void ah6_input_done(struct crypto_async_request *base, int err) if (err) goto out; + err = ah->nexthdr; + skb->network_header += ah_hlen; memcpy(skb_network_header(skb), work_iph, hdr_len); __skb_pull(skb, ah_hlen + hdr_len); skb_set_transport_header(skb, -hdr_len); - - err = ah->nexthdr; out: kfree(AH_SKB_CB(skb)->tmp); xfrm_input_resume(skb, err); -- cgit v1.1 From ffee9a18f29a0645c2d117083e025f557c738018 Mon Sep 17 00:00:00 2001 From: Nick Bowler Date: Thu, 10 Nov 2011 09:01:27 +0000 Subject: ah: Don't return NET_XMIT_DROP on input. commit 4b90a603a1b21d63cf743cc833680cb195a729f6 upstream. When the ahash driver returns -EBUSY, AH4/6 input functions return NET_XMIT_DROP, presumably copied from the output code path. But returning transmit codes on input doesn't make a lot of sense. Since NET_XMIT_DROP is a positive int, this gets interpreted as the next header type (i.e., success). As that can only end badly, remove the check. Signed-off-by: Nick Bowler Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/ah4.c | 2 -- net/ipv6/ah6.c | 2 -- 2 files changed, 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index c7056b2..36d1440 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -369,8 +369,6 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) if (err == -EINPROGRESS) goto out; - if (err == -EBUSY) - err = NET_XMIT_DROP; goto out_free; } diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index 7a33aaa..4c0f894 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -581,8 +581,6 @@ static int ah6_input(struct xfrm_state *x, struct sk_buff *skb) if (err == -EINPROGRESS) goto out; - if (err == -EBUSY) - err = NET_XMIT_DROP; goto out_free; } -- cgit v1.1 From 561331eae0a03d0c4cf60f3cf485aa3e8aa5ab48 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Jan 2012 00:41:38 +0000 Subject: netns: fix net_alloc_generic() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 073862ba5d249c20bd5c49fc6d904ff0e1f6a672 ] When a new net namespace is created, we should attach to it a "struct net_generic" with enough slots (even empty), or we can hit the following BUG_ON() : [ 200.752016] kernel BUG at include/net/netns/generic.h:40! ... [ 200.752016] [] ? get_cfcnfg+0x3a/0x180 [ 200.752016] [] ? 
lockdep_rtnl_is_held+0x10/0x20 [ 200.752016] [] caif_device_notify+0x2e/0x530 [ 200.752016] [] notifier_call_chain+0x67/0x110 [ 200.752016] [] raw_notifier_call_chain+0x11/0x20 [ 200.752016] [] call_netdevice_notifiers+0x32/0x60 [ 200.752016] [] register_netdevice+0x196/0x300 [ 200.752016] [] register_netdev+0x19/0x30 [ 200.752016] [] loopback_net_init+0x4a/0xa0 [ 200.752016] [] ops_init+0x42/0x180 [ 200.752016] [] setup_net+0x6b/0x100 [ 200.752016] [] copy_net_ns+0x86/0x110 [ 200.752016] [] create_new_namespaces+0xd9/0x190 net_alloc_generic() should take into account the maximum index into the ptr array, as a subsystem might use net_generic() anytime. This also reduces number of reallocations in net_assign_generic() Reported-by: Sasha Levin Tested-by: Sasha Levin Signed-off-by: Eric Dumazet Cc: Sjur Brændeland Cc: Eric W. Biederman Cc: Pavel Emelyanov Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/core/net_namespace.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index ea489db..0b0211d 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -29,6 +29,20 @@ EXPORT_SYMBOL(init_net); #define INITIAL_NET_GEN_PTRS 13 /* +1 for len +2 for rcu_head */ +static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS; + +static struct net_generic *net_alloc_generic(void) +{ + struct net_generic *ng; + size_t generic_size = offsetof(struct net_generic, ptr[max_gen_ptrs]); + + ng = kzalloc(generic_size, GFP_KERNEL); + if (ng) + ng->len = max_gen_ptrs; + + return ng; +} + static int net_assign_generic(struct net *net, int id, void *data) { struct net_generic *ng, *old_ng; @@ -42,8 +56,7 @@ static int net_assign_generic(struct net *net, int id, void *data) if (old_ng->len >= id) goto assign; - ng = kzalloc(sizeof(struct net_generic) + - id * sizeof(void *), GFP_KERNEL); + ng = net_alloc_generic(); if (ng == NULL) return -ENOMEM; @@ -58,7 +71,6 @@ static int net_assign_generic(struct net *net, int id, void *data) * the old copy for kfree after a grace period. */ - ng->len = id; memcpy(&ng->ptr, &old_ng->ptr, old_ng->len * sizeof(void*)); rcu_assign_pointer(net->gen, ng); @@ -159,18 +171,6 @@ out_undo: goto out; } -static struct net_generic *net_alloc_generic(void) -{ - struct net_generic *ng; - size_t generic_size = sizeof(struct net_generic) + - INITIAL_NET_GEN_PTRS * sizeof(void *); - - ng = kzalloc(generic_size, GFP_KERNEL); - if (ng) - ng->len = INITIAL_NET_GEN_PTRS; - - return ng; -} #ifdef CONFIG_NET_NS static struct kmem_cache *net_cachep; @@ -481,6 +481,7 @@ again: } return error; } + max_gen_ptrs = max_t(unsigned int, max_gen_ptrs, *ops->id); } error = __register_pernet_operations(list, ops); if (error) { -- cgit v1.1 From 62252cba2867cec7cc484ebb2d3ec705c41d9684 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 26 Jan 2012 14:04:53 +0000 Subject: net caif: Register properly as a pernet subsystem. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 8a8ee9aff6c3077dd9c2c7a77478e8ed362b96c6 ] caif is a subsystem and as such it needs to register with register_pernet_subsys instead of register_pernet_device. Among other problems using register_pernet_device was resulting in net_generic being called before the caif_net structure was allocated. Which has been causing net_generic to fail with either BUG_ON's or by return NULL pointers. 
A more ugly problem that could be caused is packets in flight while the subsystem is shutting down. To remove confusion also remove the cruft caused by inappropriately trying to fix this bug. With the aid of the previous patch I have tested this patch and confirmed that using register_pernet_subsys makes the failure go away as it should. Signed-off-by: Eric W. Biederman Acked-by: Sjur Brændeland Tested-by: Sasha Levin Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/caif/caif_dev.c | 11 ++++------- net/caif/cfcnfg.c | 1 - 2 files changed, 4 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/caif/caif_dev.c b/net/caif/caif_dev.c index dbdaa95..5ba4366 100644 --- a/net/caif/caif_dev.c +++ b/net/caif/caif_dev.c @@ -53,7 +53,6 @@ struct cfcnfg *get_cfcnfg(struct net *net) struct caif_net *caifn; BUG_ON(!net); caifn = net_generic(net, caif_net_id); - BUG_ON(!caifn); return caifn->cfg; } EXPORT_SYMBOL(get_cfcnfg); @@ -63,7 +62,6 @@ static struct caif_device_entry_list *caif_device_list(struct net *net) struct caif_net *caifn; BUG_ON(!net); caifn = net_generic(net, caif_net_id); - BUG_ON(!caifn); return &caifn->caifdevs; } @@ -92,7 +90,6 @@ static struct caif_device_entry *caif_device_alloc(struct net_device *dev) struct caif_device_entry *caifd; caifdevs = caif_device_list(dev_net(dev)); - BUG_ON(!caifdevs); caifd = kzalloc(sizeof(*caifd), GFP_ATOMIC); if (!caifd) @@ -108,7 +105,7 @@ static struct caif_device_entry *caif_get(struct net_device *dev) struct caif_device_entry_list *caifdevs = caif_device_list(dev_net(dev)); struct caif_device_entry *caifd; - BUG_ON(!caifdevs); + list_for_each_entry_rcu(caifd, &caifdevs->list, list) { if (caifd->netdev == dev) return caifd; @@ -349,7 +346,7 @@ static struct notifier_block caif_device_notifier = { static int caif_init_net(struct net *net) { struct caif_net *caifn = net_generic(net, caif_net_id); - BUG_ON(!caifn); + INIT_LIST_HEAD(&caifn->caifdevs.list); mutex_init(&caifn->caifdevs.lock); @@ -414,7 +411,7 @@ static int __init caif_device_init(void) { int result; - result = register_pernet_device(&caif_net_ops); + result = register_pernet_subsys(&caif_net_ops); if (result) return result; @@ -427,7 +424,7 @@ static int __init caif_device_init(void) static void __exit caif_device_exit(void) { - unregister_pernet_device(&caif_net_ops); + unregister_pernet_subsys(&caif_net_ops); unregister_netdevice_notifier(&caif_device_notifier); dev_remove_pack(&caif_packet_type); } diff --git a/net/caif/cfcnfg.c b/net/caif/cfcnfg.c index 52fe33b..bca32d7 100644 --- a/net/caif/cfcnfg.c +++ b/net/caif/cfcnfg.c @@ -313,7 +313,6 @@ int caif_connect_client(struct net *net, struct caif_connect_request *conn_req, int err; struct cfctrl_link_param param; struct cfcnfg *cfg = get_cfcnfg(net); - caif_assert(cfg != NULL); rcu_read_lock(); err = caif_connect_req_to_link_param(cfg, conn_req, &param); -- cgit v1.1 From 1334533665277ccc5568c5104cd2358788a02e02 Mon Sep 17 00:00:00 2001 From: James Chapman Date: Wed, 25 Jan 2012 02:39:05 +0000 Subject: l2tp: l2tp_ip - fix possible oops on packet receive [ Upstream commit 68315801dbf3ab2001679fd2074c9dc5dcf87dfa ] When a packet is received on an L2TP IP socket (L2TPv3 IP link encapsulation), the l2tpip socket's backlog_rcv function calls xfrm4_policy_check(). This is not necessary, since it was called before the skb was added to the backlog. With CONFIG_NET_NS enabled, xfrm4_policy_check() will oops if skb->dev is null, so this trivial patch removes the call.
This bug has always been present, but only when CONFIG_NET_NS is enabled does it cause problems. Most users are probably using UDP encapsulation for L2TP, hence the problem has only recently surfaced. EIP: 0060:[] EFLAGS: 00210246 CPU: 0 EIP is at l2tp_ip_recvmsg+0xd4/0x2a7 EAX: 00000001 EBX: d77b5180 ECX: 00000000 EDX: 00200246 ESI: 00000000 EDI: d63cbd30 EBP: d63cbd18 ESP: d63cbcf4 DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 Call Trace: [] sock_common_recvmsg+0x31/0x46 [] __sock_recvmsg_nosec+0x45/0x4d [] __sock_recvmsg+0x31/0x3b [] sock_recvmsg+0x96/0xab [] ? might_fault+0x47/0x81 [] ? might_fault+0x47/0x81 [] ? _copy_from_user+0x31/0x115 [] ? copy_from_user+0x8/0xa [] ? verify_iovec+0x3e/0x78 [] __sys_recvmsg+0x10a/0x1aa [] ? sock_recvmsg+0x0/0xab [] ? __lock_acquire+0xbdf/0xbee [] ? do_page_fault+0x193/0x375 [] ? fcheck_files+0x9b/0xca [] ? fget_light+0x2a/0x9c [] sys_recvmsg+0x2b/0x43 [] sys_socketcall+0x16d/0x1a5 [] ? trace_hardirqs_on_thunk+0xc/0x10 [] sysenter_do_call+0x12/0x38 Code: c6 05 8c ea a8 c1 01 e8 0c d4 d9 ff 85 f6 74 07 3e ff 86 80 00 00 00 b9 17 b6 2b c1 ba 01 00 00 00 b8 78 ed 48 c1 e8 23 f6 d9 ff 76 0c 68 28 e3 30 c1 68 2d 44 41 c1 e8 89 57 01 00 83 c4 0c Signed-off-by: James Chapman Acked-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/l2tp/l2tp_ip.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'net') diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index b6466e7..858ca23 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -393,11 +393,6 @@ static int l2tp_ip_backlog_recv(struct sock *sk, struct sk_buff *skb) { int rc; - if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) - goto drop; - - nf_reset(skb); - /* Charge it to the socket, dropping if the queue is full. */ rc = sock_queue_rcv_skb(sk, skb); if (rc < 0) -- cgit v1.1 From f217c4711d71aa6811b6e71d219b9efafa5d55a6 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 24 Jan 2012 17:03:44 -0500 Subject: rds: Make rds_sock_lock BH rather than IRQ safe. [ Upstream commit efc3dbc37412c027e363736b4f4c74ee5e8ecffc ] rds_sock_info() triggers locking warnings because we try to perform a local_bh_enable() (via sock_i_ino()) while hardware interrupts are disabled (via taking rds_sock_lock). There is no reason for rds_sock_lock to be a hardware IRQ disabling lock, none of these access paths run in hardware interrupt context. Therefore making it a BH disabling lock is safe and sufficient to fix this bug. Reported-by: Kumar Sanghvi Reported-by: Josh Boyer Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman --- net/rds/af_rds.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index bb6ad81..424ff62 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -68,7 +68,6 @@ static int rds_release(struct socket *sock) { struct sock *sk = sock->sk; struct rds_sock *rs; - unsigned long flags; if (!sk) goto out; @@ -94,10 +93,10 @@ static int rds_release(struct socket *sock) rds_rdma_drop_keys(rs); rds_notify_queue_get(rs, NULL); - spin_lock_irqsave(&rds_sock_lock, flags); + spin_lock_bh(&rds_sock_lock); list_del_init(&rs->rs_item); rds_sock_count--; - spin_unlock_irqrestore(&rds_sock_lock, flags); + spin_unlock_bh(&rds_sock_lock); rds_trans_put(rs->rs_transport); @@ -409,7 +408,6 @@ static const struct proto_ops rds_proto_ops = { static int __rds_create(struct socket *sock, struct sock *sk, int protocol) { - unsigned long flags; struct rds_sock *rs; sock_init_data(sock, sk); @@ -426,10 +424,10 @@ static int __rds_create(struct socket *sock, struct sock *sk, int protocol) spin_lock_init(&rs->rs_rdma_lock); rs->rs_rdma_keys = RB_ROOT; - spin_lock_irqsave(&rds_sock_lock, flags); + spin_lock_bh(&rds_sock_lock); list_add_tail(&rs->rs_item, &rds_sock_list); rds_sock_count++; - spin_unlock_irqrestore(&rds_sock_lock, flags); + spin_unlock_bh(&rds_sock_lock); return 0; } @@ -471,12 +469,11 @@ static void rds_sock_inc_info(struct socket *sock, unsigned int len, { struct rds_sock *rs; struct rds_incoming *inc; - unsigned long flags; unsigned int total = 0; len /= sizeof(struct rds_info_message); - spin_lock_irqsave(&rds_sock_lock, flags); + spin_lock_bh(&rds_sock_lock); list_for_each_entry(rs, &rds_sock_list, rs_item) { read_lock(&rs->rs_recv_lock); @@ -492,7 +489,7 @@ static void rds_sock_inc_info(struct socket *sock, unsigned int len, read_unlock(&rs->rs_recv_lock); } - spin_unlock_irqrestore(&rds_sock_lock, flags); + spin_unlock_bh(&rds_sock_lock); lens->nr = total; lens->each = sizeof(struct rds_info_message); @@ -504,11 +501,10 @@ static void rds_sock_info(struct socket *sock, unsigned int len, { struct rds_info_socket sinfo; struct rds_sock *rs; - unsigned long flags; len /= sizeof(struct rds_info_socket); - spin_lock_irqsave(&rds_sock_lock, flags); + spin_lock_bh(&rds_sock_lock); if (len < rds_sock_count) goto out; @@ -529,7 +525,7 @@ out: lens->nr = rds_sock_count; lens->each = sizeof(struct rds_info_socket); - spin_unlock_irqrestore(&rds_sock_lock, flags); + spin_unlock_bh(&rds_sock_lock); } static void rds_exit(void) -- cgit v1.1 From 8b4bb350e120fe0b32a0b1b8d227e65af03e3993 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Sat, 28 Jan 2012 17:29:46 +0000 Subject: tcp: fix tcp_trim_head() to adjust segment count with skb MSS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 5b35e1e6e9ca651e6b291c96d1106043c9af314a ] This commit fixes tcp_trim_head() to recalculate the number of segments in the skb with the skb's existing MSS, so trimming the head causes the skb segment count to be monotonically non-increasing - it should stay the same or go down, but not increase. Previously tcp_trim_head() used the current MSS of the connection. But if there was a decrease in MSS between original transmission and ACK (e.g. due to PMTUD), this could cause tcp_trim_head() to counter-intuitively increase the segment count when trimming bytes off the head of an skb. 
This violated assumptions in tcp_tso_acked() that tcp_trim_head() only decreases the packet count, so that packets_acked in tcp_tso_acked() could underflow, leading tcp_clean_rtx_queue() to pass u32 pkts_acked values as large as 0xffffffff to ca_ops->pkts_acked(). As an aside, if tcp_trim_head() had really wanted the skb to reflect the current MSS, it should have called tcp_set_skb_tso_segs() unconditionally, since a decrease in MSS would mean that a single-packet skb should now be sliced into multiple segments. Signed-off-by: Neal Cardwell Acked-by: Nandita Dukkipati Acked-by: Ilpo Järvinen Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp_output.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 882e0b0..faf257b 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1134,11 +1134,9 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) sk_mem_uncharge(sk, len); sock_set_flag(sk, SOCK_QUEUE_SHRUNK); - /* Any change of skb->len requires recalculation of tso - * factor and mss. - */ + /* Any change of skb->len requires recalculation of tso factor. */ if (tcp_skb_pcount(skb) > 1) - tcp_set_skb_tso_segs(sk, skb, tcp_current_mss(sk)); + tcp_set_skb_tso_segs(sk, skb, tcp_skb_mss(skb)); return 0; } -- cgit v1.1 From 81ecd154d0b07bd5dab6e4f09336cb068b70bcb9 Mon Sep 17 00:00:00 2001 From: shawnlu Date: Fri, 20 Jan 2012 12:22:04 +0000 Subject: tcp: md5: using remote adress for md5 lookup in rst packet [ Upstream commit 8a622e71f58ec9f092fc99eacae0e6cf14f6e742 ] md5 key is added in socket through remote address. remote address should be used in finding md5 key when sending out reset packet. Signed-off-by: shawnlu Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp_ipv4.c | 2 +- net/ipv6/tcp_ipv6.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 69790aa..53b0125 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -630,7 +630,7 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) arg.iov[0].iov_len = sizeof(rep.th); #ifdef CONFIG_TCP_MD5SIG - key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL; + key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->saddr) : NULL; if (key) { rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 296510a..51587a0 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1096,7 +1096,7 @@ static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb) #ifdef CONFIG_TCP_MD5SIG if (sk) - key = tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr); + key = tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr); #endif if (th->ack) -- cgit v1.1 From 8a533666d1591cf4ea596c6bd710e2fe682cb56a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 9 Feb 2012 16:13:19 -0500 Subject: net: fix NULL dereferences in check_peer_redir() [ Upstream commit d3aaeb38c40e5a6c08dd31a1b64da65c4352be36, along with dependent backports of commits: 69cce1d1404968f78b177a0314f5822d5afdbbfb 9de79c127cccecb11ae6a21ab1499e87aa222880 218fa90f072e4aeff9003d57e390857f4f35513e 580da35a31f91a594f3090b7a2c39b85cb051a12 f7e57044eeb1841847c24aa06766c8290c202583 e049f28883126c689cf95859480d9ee4ab23b7fa ] Gergely Kalman reported crashes in check_peer_redir(). It appears commit f39925dbde778 (ipv4: Cache learned redirect information in inetpeer.) 
added a race, leading to possible NULL ptr dereference. Since we can now change dst neighbour, we should make sure a reader can safely use a neighbour. Add RCU protection to dst neighbour, and make sure check_peer_redir() can be called safely by different cpus in parallel. As neighbours are already freed after one RCU grace period, this patch should not add typical RCU penalty (cache cold effects) Many thanks to Gergely for providing a pretty report pointing to the bug. Reported-by: Gergely Kalman Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/atm/clip.c | 16 ++++++++----- net/bridge/br_netfilter.c | 6 +++-- net/core/dst.c | 15 ++++++++---- net/core/neighbour.c | 19 +++++++++------ net/decnet/dn_neigh.c | 8 +++---- net/decnet/dn_route.c | 18 ++++++++------- net/ipv4/arp.c | 28 +++++++++++++--------- net/ipv4/ip_gre.c | 2 +- net/ipv4/ip_output.c | 22 ++++++++++++++---- net/ipv4/route.c | 31 +++++++++++++++---------- net/ipv6/addrconf.c | 2 +- net/ipv6/ip6_fib.c | 2 +- net/ipv6/ip6_output.c | 40 ++++++++++++++++++++++++-------- net/ipv6/ndisc.c | 4 ++-- net/ipv6/route.c | 59 +++++++++++++++++++++++++++++------------------ net/ipv6/sit.c | 4 ++-- net/sched/sch_teql.c | 31 ++++++++++++++++--------- net/xfrm/xfrm_policy.c | 2 +- 18 files changed, 200 insertions(+), 109 deletions(-) (limited to 'net') diff --git a/net/atm/clip.c b/net/atm/clip.c index 1d4be60..5889074 100644 --- a/net/atm/clip.c +++ b/net/atm/clip.c @@ -364,33 +364,37 @@ static netdev_tx_t clip_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct clip_priv *clip_priv = PRIV(dev); + struct dst_entry *dst = skb_dst(skb); struct atmarp_entry *entry; + struct neighbour *n; struct atm_vcc *vcc; int old; unsigned long flags; pr_debug("(skb %p)\n", skb); - if (!skb_dst(skb)) { + if (!dst) { pr_err("skb_dst(skb) == NULL\n"); dev_kfree_skb(skb); dev->stats.tx_dropped++; return NETDEV_TX_OK; } - if (!skb_dst(skb)->neighbour) { + n = dst_get_neighbour(dst); + if (!n) { #if 0 - skb_dst(skb)->neighbour = clip_find_neighbour(skb_dst(skb), 1); - if (!skb_dst(skb)->neighbour) { + n = clip_find_neighbour(skb_dst(skb), 1); + if (!n) { dev_kfree_skb(skb); /* lost that one */ dev->stats.tx_dropped++; return 0; } + dst_set_neighbour(dst, n); #endif pr_err("NO NEIGHBOUR !\n"); dev_kfree_skb(skb); dev->stats.tx_dropped++; return NETDEV_TX_OK; } - entry = NEIGH2ENTRY(skb_dst(skb)->neighbour); + entry = NEIGH2ENTRY(n); if (!entry->vccs) { if (time_after(jiffies, entry->expires)) { /* should be resolved */ @@ -407,7 +411,7 @@ static netdev_tx_t clip_start_xmit(struct sk_buff *skb, } pr_debug("neigh %p, vccs %p\n", entry, entry->vccs); ATM_SKB(skb)->vcc = vcc = entry->vccs->vcc; - pr_debug("using neighbour %p, vcc %p\n", skb_dst(skb)->neighbour, vcc); + pr_debug("using neighbour %p, vcc %p\n", n, vcc); if (entry->vccs->encap) { void *here; diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index 56149ec..3dc7f54 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -343,24 +343,26 @@ static int br_nf_pre_routing_finish_ipv6(struct sk_buff *skb) static int br_nf_pre_routing_finish_bridge(struct sk_buff *skb) { struct nf_bridge_info *nf_bridge = skb->nf_bridge; + struct neighbour *neigh; struct dst_entry *dst; skb->dev = bridge_parent(skb->dev); if (!skb->dev) goto free_skb; dst = skb_dst(skb); + neigh = dst_get_neighbour(dst); if (dst->hh) { neigh_hh_bridge(dst->hh, skb); skb->dev = nf_bridge->physindev; return br_handle_frame_finish(skb); - } else if 
(dst->neighbour) { + } else if (neigh) { /* the neighbour function below overwrites the complete * MAC header, so we save the Ethernet source address and * protocol number. */ skb_copy_from_linear_data_offset(skb, -(ETH_HLEN-ETH_ALEN), skb->nf_bridge->data, ETH_HLEN-ETH_ALEN); /* tell br_dev_xmit to continue with forwarding */ nf_bridge->mask |= BRNF_BRIDGED_DNAT; - return dst->neighbour->output(skb); + return neigh->output(skb); } free_skb: kfree_skb(skb); diff --git a/net/core/dst.c b/net/core/dst.c index 6135f36..8246d47 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -171,7 +171,7 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev, dst_init_metrics(dst, dst_default_metrics, true); dst->expires = 0UL; dst->path = dst; - dst->neighbour = NULL; + RCU_INIT_POINTER(dst->_neighbour, NULL); dst->hh = NULL; #ifdef CONFIG_XFRM dst->xfrm = NULL; @@ -231,7 +231,7 @@ struct dst_entry *dst_destroy(struct dst_entry * dst) smp_rmb(); again: - neigh = dst->neighbour; + neigh = rcu_dereference_protected(dst->_neighbour, 1); hh = dst->hh; child = dst->child; @@ -240,7 +240,7 @@ again: hh_cache_put(hh); if (neigh) { - dst->neighbour = NULL; + RCU_INIT_POINTER(dst->_neighbour, NULL); neigh_release(neigh); } @@ -367,14 +367,19 @@ static void dst_ifdown(struct dst_entry *dst, struct net_device *dev, if (!unregister) { dst->input = dst->output = dst_discard; } else { + struct neighbour *neigh; + dst->dev = dev_net(dst->dev)->loopback_dev; dev_hold(dst->dev); dev_put(dev); - if (dst->neighbour && dst->neighbour->dev == dev) { - dst->neighbour->dev = dst->dev; + rcu_read_lock(); + neigh = dst_get_neighbour(dst); + if (neigh && neigh->dev == dev) { + neigh->dev = dst->dev; dev_hold(dst->dev); dev_put(dev); } + rcu_read_unlock(); } } diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 16db887..8c54aff 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1173,12 +1173,17 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, while (neigh->nud_state & NUD_VALID && (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) { - struct neighbour *n1 = neigh; + struct dst_entry *dst = skb_dst(skb); + struct neighbour *n2, *n1 = neigh; write_unlock_bh(&neigh->lock); + + rcu_read_lock(); /* On shaper/eql skb->dst->neighbour != neigh :( */ - if (skb_dst(skb) && skb_dst(skb)->neighbour) - n1 = skb_dst(skb)->neighbour; + if (dst && (n2 = dst_get_neighbour(dst)) != NULL) + n1 = n2; n1->output(skb); + rcu_read_unlock(); + write_lock_bh(&neigh->lock); } skb_queue_purge(&neigh->arp_queue); @@ -1300,10 +1305,10 @@ EXPORT_SYMBOL(neigh_compat_output); int neigh_resolve_output(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); - struct neighbour *neigh; + struct neighbour *neigh = dst_get_neighbour(dst); int rc = 0; - if (!dst || !(neigh = dst->neighbour)) + if (!dst) goto discard; __skb_pull(skb, skb_network_offset(skb)); @@ -1333,7 +1338,7 @@ out: return rc; discard: NEIGH_PRINTK1("neigh_resolve_output: dst=%p neigh=%p\n", - dst, dst ? 
dst->neighbour : NULL); + dst, neigh); out_kfree_skb: rc = -EINVAL; kfree_skb(skb); @@ -1347,7 +1352,7 @@ int neigh_connected_output(struct sk_buff *skb) { int err; struct dst_entry *dst = skb_dst(skb); - struct neighbour *neigh = dst->neighbour; + struct neighbour *neigh = dst_get_neighbour(dst); struct net_device *dev = neigh->dev; unsigned int seq; diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c index 602dade..9810610 100644 --- a/net/decnet/dn_neigh.c +++ b/net/decnet/dn_neigh.c @@ -208,7 +208,7 @@ static int dn_neigh_output_packet(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct dn_route *rt = (struct dn_route *)dst; - struct neighbour *neigh = dst->neighbour; + struct neighbour *neigh = dst_get_neighbour(dst); struct net_device *dev = neigh->dev; char mac_addr[ETH_ALEN]; @@ -227,7 +227,7 @@ static int dn_neigh_output_packet(struct sk_buff *skb) static int dn_long_output(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); - struct neighbour *neigh = dst->neighbour; + struct neighbour *neigh = dst_get_neighbour(dst); struct net_device *dev = neigh->dev; int headroom = dev->hard_header_len + sizeof(struct dn_long_packet) + 3; unsigned char *data; @@ -274,7 +274,7 @@ static int dn_long_output(struct sk_buff *skb) static int dn_short_output(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); - struct neighbour *neigh = dst->neighbour; + struct neighbour *neigh = dst_get_neighbour(dst); struct net_device *dev = neigh->dev; int headroom = dev->hard_header_len + sizeof(struct dn_short_packet) + 2; struct dn_short_packet *sp; @@ -318,7 +318,7 @@ static int dn_short_output(struct sk_buff *skb) static int dn_phase3_output(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); - struct neighbour *neigh = dst->neighbour; + struct neighbour *neigh = dst_get_neighbour(dst); struct net_device *dev = neigh->dev; int headroom = dev->hard_header_len + sizeof(struct dn_short_packet) + 2; struct dn_short_packet *sp; diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index 74544bc..b91b603 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -241,9 +241,11 @@ static int dn_dst_gc(struct dst_ops *ops) */ static void dn_dst_update_pmtu(struct dst_entry *dst, u32 mtu) { + struct neighbour *n = dst_get_neighbour(dst); u32 min_mtu = 230; - struct dn_dev *dn = dst->neighbour ? - rcu_dereference_raw(dst->neighbour->dev->dn_ptr) : NULL; + struct dn_dev *dn; + + dn = n ? 
rcu_dereference_raw(n->dev->dn_ptr) : NULL; if (dn && dn->use_long == 0) min_mtu -= 6; @@ -715,7 +717,7 @@ static int dn_output(struct sk_buff *skb) int err = -EINVAL; - if ((neigh = dst->neighbour) == NULL) + if ((neigh = dst_get_neighbour(dst)) == NULL) goto error; skb->dev = dev; @@ -750,7 +752,7 @@ static int dn_forward(struct sk_buff *skb) struct dst_entry *dst = skb_dst(skb); struct dn_dev *dn_db = rcu_dereference(dst->dev->dn_ptr); struct dn_route *rt; - struct neighbour *neigh = dst->neighbour; + struct neighbour *neigh = dst_get_neighbour(dst); int header_len; #ifdef CONFIG_NETFILTER struct net_device *dev = skb->dev; @@ -833,11 +835,11 @@ static int dn_rt_set_next_hop(struct dn_route *rt, struct dn_fib_res *res) } rt->rt_type = res->type; - if (dev != NULL && rt->dst.neighbour == NULL) { + if (dev != NULL && dst_get_neighbour(&rt->dst) == NULL) { n = __neigh_lookup_errno(&dn_neigh_table, &rt->rt_gateway, dev); if (IS_ERR(n)) return PTR_ERR(n); - rt->dst.neighbour = n; + dst_set_neighbour(&rt->dst, n); } if (dst_metric(&rt->dst, RTAX_MTU) > rt->dst.dev->mtu) @@ -1144,7 +1146,7 @@ make_route: rt->rt_dst_map = fld.daddr; rt->rt_src_map = fld.saddr; - rt->dst.neighbour = neigh; + dst_set_neighbour(&rt->dst, neigh); neigh = NULL; rt->dst.lastuse = jiffies; @@ -1416,7 +1418,7 @@ make_route: rt->fld.flowidn_iif = in_dev->ifindex; rt->fld.flowidn_mark = fld.flowidn_mark; - rt->dst.neighbour = neigh; + dst_set_neighbour(&rt->dst, neigh); rt->dst.lastuse = jiffies; rt->dst.output = dn_rt_bug; switch(res.type) { diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 1b74d3b..1d5675e 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -518,26 +518,32 @@ EXPORT_SYMBOL(arp_find); /* END OF OBSOLETE FUNCTIONS */ +struct neighbour *__arp_bind_neighbour(struct dst_entry *dst, __be32 nexthop) +{ + struct net_device *dev = dst->dev; + + if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) + nexthop = 0; + return __neigh_lookup_errno( +#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE) + dev->type == ARPHRD_ATM ? + clip_tbl_hook : +#endif + &arp_tbl, &nexthop, dev); +} + int arp_bind_neighbour(struct dst_entry *dst) { struct net_device *dev = dst->dev; - struct neighbour *n = dst->neighbour; + struct neighbour *n = dst_get_neighbour(dst); if (dev == NULL) return -EINVAL; if (n == NULL) { - __be32 nexthop = ((struct rtable *)dst)->rt_gateway; - if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) - nexthop = 0; - n = __neigh_lookup_errno( -#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE) - dev->type == ARPHRD_ATM ? 
- clip_tbl_hook : -#endif - &arp_tbl, &nexthop, dev); + n = __arp_bind_neighbour(dst, ((struct rtable *)dst)->rt_gateway); if (IS_ERR(n)) return PTR_ERR(n); - dst->neighbour = n; + dst_set_neighbour(dst, n); } return 0; } diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 8871067..d7bb94c 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -731,9 +731,9 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev } #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) else if (skb->protocol == htons(ETH_P_IPV6)) { + struct neighbour *neigh = dst_get_neighbour(skb_dst(skb)); const struct in6_addr *addr6; int addr_type; - struct neighbour *neigh = skb_dst(skb)->neighbour; if (neigh == NULL) goto tx_error; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 0c99db4..51a3eec 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -182,6 +182,8 @@ static inline int ip_finish_output2(struct sk_buff *skb) struct rtable *rt = (struct rtable *)dst; struct net_device *dev = dst->dev; unsigned int hh_len = LL_RESERVED_SPACE(dev); + struct neighbour *neigh; + int res; if (rt->rt_type == RTN_MULTICAST) { IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len); @@ -203,10 +205,22 @@ static inline int ip_finish_output2(struct sk_buff *skb) skb = skb2; } - if (dst->hh) - return neigh_hh_output(dst->hh, skb); - else if (dst->neighbour) - return dst->neighbour->output(skb); + rcu_read_lock(); + if (dst->hh) { + int res = neigh_hh_output(dst->hh, skb); + + rcu_read_unlock(); + return res; + } else { + neigh = dst_get_neighbour(dst); + if (neigh) { + res = neigh->output(skb); + + rcu_read_unlock(); + return res; + } + rcu_read_unlock(); + } if (net_ratelimit()) printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n"); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 4845bfe..65ff2e5 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -416,7 +416,13 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v) "HHUptod\tSpecDst"); else { struct rtable *r = v; - int len; + struct neighbour *n; + int len, HHUptod; + + rcu_read_lock(); + n = dst_get_neighbour(&r->dst); + HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0; + rcu_read_unlock(); seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t" "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n", @@ -431,8 +437,7 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v) dst_metric(&r->dst, RTAX_RTTVAR)), r->rt_key_tos, r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1, - r->dst.hh ? 
(r->dst.hh->hh_output == - dev_queue_xmit) : 0, + HHUptod, r->rt_spec_dst, &len); seq_printf(seq, "%*s\n", 127 - len, ""); @@ -1688,23 +1693,25 @@ static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer) { struct rtable *rt = (struct rtable *) dst; __be32 orig_gw = rt->rt_gateway; + struct neighbour *n, *old_n; dst_confirm(&rt->dst); - neigh_release(rt->dst.neighbour); - rt->dst.neighbour = NULL; - rt->rt_gateway = peer->redirect_learned.a4; - if (arp_bind_neighbour(&rt->dst) || - !(rt->dst.neighbour->nud_state & NUD_VALID)) { - if (rt->dst.neighbour) - neigh_event_send(rt->dst.neighbour, NULL); + n = __arp_bind_neighbour(&rt->dst, rt->rt_gateway); + if (IS_ERR(n)) + return PTR_ERR(n); + old_n = xchg(&rt->dst._neighbour, n); + if (old_n) + neigh_release(old_n); + if (!n || !(n->nud_state & NUD_VALID)) { + if (n) + neigh_event_send(n, NULL); rt->rt_gateway = orig_gw; return -EAGAIN; } else { rt->rt_flags |= RTCF_REDIRECTED; - call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, - rt->dst.neighbour); + call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n); } return 0; } diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 498b927..0f335c6 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -656,7 +656,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, int pfxlen, * layer address of our nexhop router */ - if (rt->rt6i_nexthop == NULL) + if (dst_get_neighbour_raw(&rt->dst) == NULL) ifa->flags &= ~IFA_F_OPTIMISTIC; ifa->idev = idev; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 4076a0b..0f9b37a 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1455,7 +1455,7 @@ static int fib6_age(struct rt6_info *rt, void *arg) RT6_TRACE("aging clone %p\n", rt); return -1; } else if ((rt->rt6i_flags & RTF_GATEWAY) && - (!(rt->rt6i_nexthop->flags & NTF_ROUTER))) { + (!(dst_get_neighbour_raw(&rt->dst)->flags & NTF_ROUTER))) { RT6_TRACE("purging route %p via non-router but gateway\n", rt); return -1; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index e17596b..9cbf176 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -100,6 +100,8 @@ static int ip6_finish_output2(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct net_device *dev = dst->dev; + struct neighbour *neigh; + int res; skb->protocol = htons(ETH_P_IPV6); skb->dev = dev; @@ -134,10 +136,22 @@ static int ip6_finish_output2(struct sk_buff *skb) skb->len); } - if (dst->hh) - return neigh_hh_output(dst->hh, skb); - else if (dst->neighbour) - return dst->neighbour->output(skb); + rcu_read_lock(); + if (dst->hh) { + res = neigh_hh_output(dst->hh, skb); + + rcu_read_unlock(); + return res; + } else { + neigh = dst_get_neighbour(dst); + if (neigh) { + res = neigh->output(skb); + + rcu_read_unlock(); + return res; + } + rcu_read_unlock(); + } IP6_INC_STATS_BH(dev_net(dst->dev), ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); @@ -385,6 +399,7 @@ int ip6_forward(struct sk_buff *skb) struct ipv6hdr *hdr = ipv6_hdr(skb); struct inet6_skb_parm *opt = IP6CB(skb); struct net *net = dev_net(dst->dev); + struct neighbour *n; u32 mtu; if (net->ipv6.devconf_all->forwarding == 0) @@ -459,11 +474,10 @@ int ip6_forward(struct sk_buff *skb) send redirects to source routed frames. We don't send redirects to frames decapsulated from IPsec. 
*/ - if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 && - !skb_sec_path(skb)) { + n = dst_get_neighbour(dst); + if (skb->dev == dst->dev && n && opt->srcrt == 0 && !skb_sec_path(skb)) { struct in6_addr *target = NULL; struct rt6_info *rt; - struct neighbour *n = dst->neighbour; /* * incoming and outgoing devices are the same @@ -949,8 +963,11 @@ out: static int ip6_dst_lookup_tail(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6) { - int err; struct net *net = sock_net(sk); +#ifdef CONFIG_IPV6_OPTIMISTIC_DAD + struct neighbour *n; +#endif + int err; if (*dst == NULL) *dst = ip6_route_output(net, sk, fl6); @@ -976,11 +993,14 @@ static int ip6_dst_lookup_tail(struct sock *sk, * dst entry and replace it instead with the * dst entry of the nexthop router */ - if ((*dst)->neighbour && !((*dst)->neighbour->nud_state & NUD_VALID)) { + rcu_read_lock(); + n = dst_get_neighbour(*dst); + if (n && !(n->nud_state & NUD_VALID)) { struct inet6_ifaddr *ifp; struct flowi6 fl_gw6; int redirect; + rcu_read_unlock(); ifp = ipv6_get_ifaddr(net, &fl6->saddr, (*dst)->dev, 1); @@ -1000,6 +1020,8 @@ static int ip6_dst_lookup_tail(struct sock *sk, if ((err = (*dst)->error)) goto out_err_release; } + } else { + rcu_read_unlock(); } #endif diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 7596f07..10a8d41 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1244,7 +1244,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) rt = rt6_get_dflt_router(&ipv6_hdr(skb)->saddr, skb->dev); if (rt) - neigh = rt->rt6i_nexthop; + neigh = dst_get_neighbour(&rt->dst); if (rt && lifetime == 0) { neigh_clone(neigh); @@ -1265,7 +1265,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) return; } - neigh = rt->rt6i_nexthop; + neigh = dst_get_neighbour(&rt->dst); if (neigh == NULL) { ND_PRINTK0(KERN_ERR "ICMPv6 RA: %s() got default router without neighbour.\n", diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 0ef1f08..e70e902 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -356,7 +356,7 @@ out: #ifdef CONFIG_IPV6_ROUTER_PREF static void rt6_probe(struct rt6_info *rt) { - struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL; + struct neighbour *neigh; /* * Okay, this does not seem to be appropriate * for now, however, we need to check if it @@ -365,8 +365,10 @@ static void rt6_probe(struct rt6_info *rt) * Router Reachability Probe MUST be rate-limited * to no more than one per minute. */ + rcu_read_lock(); + neigh = rt ? 
dst_get_neighbour(&rt->dst) : NULL; if (!neigh || (neigh->nud_state & NUD_VALID)) - return; + goto out; read_lock_bh(&neigh->lock); if (!(neigh->nud_state & NUD_VALID) && time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) { @@ -379,8 +381,11 @@ static void rt6_probe(struct rt6_info *rt) target = (struct in6_addr *)&neigh->primary_key; addrconf_addr_solict_mult(target, &mcaddr); ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL); - } else + } else { read_unlock_bh(&neigh->lock); + } +out: + rcu_read_unlock(); } #else static inline void rt6_probe(struct rt6_info *rt) @@ -404,8 +409,11 @@ static inline int rt6_check_dev(struct rt6_info *rt, int oif) static inline int rt6_check_neigh(struct rt6_info *rt) { - struct neighbour *neigh = rt->rt6i_nexthop; + struct neighbour *neigh; int m; + + rcu_read_lock(); + neigh = dst_get_neighbour(&rt->dst); if (rt->rt6i_flags & RTF_NONEXTHOP || !(rt->rt6i_flags & RTF_GATEWAY)) m = 1; @@ -422,6 +430,7 @@ static inline int rt6_check_neigh(struct rt6_info *rt) read_unlock_bh(&neigh->lock); } else m = 0; + rcu_read_unlock(); return m; } @@ -745,8 +754,7 @@ static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, const struct in6_add dst_free(&rt->dst); return NULL; } - rt->rt6i_nexthop = neigh; - + dst_set_neighbour(&rt->dst, neigh); } return rt; @@ -760,7 +768,7 @@ static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, const struct in6_a rt->rt6i_dst.plen = 128; rt->rt6i_flags |= RTF_CACHE; rt->dst.flags |= DST_HOST; - rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop); + dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour_raw(&ort->dst))); } return rt; } @@ -794,7 +802,7 @@ restart: dst_hold(&rt->dst); read_unlock_bh(&table->tb6_lock); - if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP)) + if (!dst_get_neighbour_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP)) nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr); else if (!(rt->dst.flags & DST_HOST)) nrt = rt6_alloc_clone(rt, &fl6->daddr); @@ -1058,7 +1066,7 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, } rt->rt6i_idev = idev; - rt->rt6i_nexthop = neigh; + dst_set_neighbour(&rt->dst, neigh); atomic_set(&rt->dst.__refcnt, 1); dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255); rt->dst.output = ip6_output; @@ -1338,12 +1346,12 @@ int ip6_route_add(struct fib6_config *cfg) rt->rt6i_prefsrc.plen = 0; if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) { - rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev); - if (IS_ERR(rt->rt6i_nexthop)) { - err = PTR_ERR(rt->rt6i_nexthop); - rt->rt6i_nexthop = NULL; + struct neighbour *neigh = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev); + if (IS_ERR(neigh)) { + err = PTR_ERR(neigh); goto out; } + dst_set_neighbour(&rt->dst, neigh); } rt->rt6i_flags = cfg->fc_flags; @@ -1574,7 +1582,7 @@ void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src, dst_confirm(&rt->dst); /* Duplicate redirect: silently ignore. */ - if (neigh == rt->dst.neighbour) + if (neigh == dst_get_neighbour_raw(&rt->dst)) goto out; nrt = ip6_rt_copy(rt); @@ -1590,7 +1598,7 @@ void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src, nrt->dst.flags |= DST_HOST; ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key); - nrt->rt6i_nexthop = neigh_clone(neigh); + dst_set_neighbour(&nrt->dst, neigh_clone(neigh)); if (ip6_ins_rt(nrt)) goto out; @@ -1670,7 +1678,7 @@ again: 1. It is connected route. Action: COW 2. It is gatewayed route or NONEXTHOP route. Action: clone it. 
*/ - if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP)) + if (!dst_get_neighbour_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP)) nrt = rt6_alloc_cow(rt, daddr, saddr); else nrt = rt6_alloc_clone(rt, daddr); @@ -2035,7 +2043,7 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, return ERR_CAST(neigh); } - rt->rt6i_nexthop = neigh; + dst_set_neighbour(&rt->dst, neigh); ipv6_addr_copy(&rt->rt6i_dst.addr, addr); rt->rt6i_dst.plen = 128; @@ -2312,6 +2320,7 @@ static int rt6_fill_node(struct net *net, struct nlmsghdr *nlh; long expires; u32 table; + struct neighbour *n; if (prefix) { /* user wants prefix routes only */ if (!(rt->rt6i_flags & RTF_PREFIX_RT)) { @@ -2400,8 +2409,11 @@ static int rt6_fill_node(struct net *net, if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0) goto nla_put_failure; - if (rt->dst.neighbour) - NLA_PUT(skb, RTA_GATEWAY, 16, &rt->dst.neighbour->primary_key); + rcu_read_lock(); + n = dst_get_neighbour(&rt->dst); + if (n) + NLA_PUT(skb, RTA_GATEWAY, 16, &n->primary_key); + rcu_read_unlock(); if (rt->dst.dev) NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex); @@ -2585,6 +2597,7 @@ struct rt6_proc_arg static int rt6_info_route(struct rt6_info *rt, void *p_arg) { struct seq_file *m = p_arg; + struct neighbour *n; seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen); @@ -2593,12 +2606,14 @@ static int rt6_info_route(struct rt6_info *rt, void *p_arg) #else seq_puts(m, "00000000000000000000000000000000 00 "); #endif - - if (rt->rt6i_nexthop) { - seq_printf(m, "%pi6", rt->rt6i_nexthop->primary_key); + rcu_read_lock(); + n = dst_get_neighbour(&rt->dst); + if (n) { + seq_printf(m, "%pi6", n->primary_key); } else { seq_puts(m, "00000000000000000000000000000000"); } + rcu_read_unlock(); seq_printf(m, " %08x %08x %08x %08x %8s\n", rt->rt6i_metric, atomic_read(&rt->dst.__refcnt), rt->dst.__use, rt->rt6i_flags, diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 38490d5..f56acd0 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -679,7 +679,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, struct neighbour *neigh = NULL; if (skb_dst(skb)) - neigh = skb_dst(skb)->neighbour; + neigh = dst_get_neighbour(skb_dst(skb)); if (neigh == NULL) { if (net_ratelimit()) @@ -704,7 +704,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, struct neighbour *neigh = NULL; if (skb_dst(skb)) - neigh = skb_dst(skb)->neighbour; + neigh = dst_get_neighbour(skb_dst(skb)); if (neigh == NULL) { if (net_ratelimit()) diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index 45cd300..4f4c52c 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -225,11 +225,11 @@ static int teql_qdisc_init(struct Qdisc *sch, struct nlattr *opt) static int -__teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res, struct net_device *dev) +__teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res, + struct net_device *dev, struct netdev_queue *txq, + struct neighbour *mn) { - struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, 0); - struct teql_sched_data *q = qdisc_priv(dev_queue->qdisc); - struct neighbour *mn = skb_dst(skb)->neighbour; + struct teql_sched_data *q = qdisc_priv(txq->qdisc); struct neighbour *n = q->ncache; if (mn->tbl == NULL) @@ -262,17 +262,26 @@ __teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res, struct net_device * } static inline int teql_resolve(struct sk_buff *skb, - struct sk_buff *skb_res, struct net_device *dev) + struct sk_buff *skb_res, + struct net_device *dev, + struct netdev_queue *txq) { - 
struct netdev_queue *txq = netdev_get_tx_queue(dev, 0); + struct dst_entry *dst = skb_dst(skb); + struct neighbour *mn; + int res; + if (txq->qdisc == &noop_qdisc) return -ENODEV; - if (dev->header_ops == NULL || - skb_dst(skb) == NULL || - skb_dst(skb)->neighbour == NULL) + if (!dev->header_ops || !dst) return 0; - return __teql_resolve(skb, skb_res, dev); + + rcu_read_lock(); + mn = dst_get_neighbour(dst); + res = mn ? __teql_resolve(skb, skb_res, dev, txq, mn) : 0; + rcu_read_unlock(); + + return res; } static netdev_tx_t teql_master_xmit(struct sk_buff *skb, struct net_device *dev) @@ -307,7 +316,7 @@ restart: continue; } - switch (teql_resolve(skb, skb_res, slave)) { + switch (teql_resolve(skb, skb_res, slave, slave_txq)) { case 0: if (__netif_tx_trylock(slave_txq)) { unsigned int length = qdisc_pkt_len(skb); diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 5ce74a3..7803eb6 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1497,7 +1497,7 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, goto free_dst; /* Copy neighbour for reachability confirmation */ - dst0->neighbour = neigh_clone(dst->neighbour); + dst_set_neighbour(dst0, neigh_clone(dst_get_neighbour(dst))); xfrm_init_path((struct xfrm_dst *)dst0, dst, nfheader_len); xfrm_init_pmtu(dst_prev); -- cgit v1.1 From 36935521cd67e3df9a1db71591cf224252d6082c Mon Sep 17 00:00:00 2001 From: Eliad Peller Date: Wed, 1 Feb 2012 18:48:09 +0200 Subject: mac80211: timeout a single frame in the rx reorder buffer commit 07ae2dfcf4f7143ce191c6436da1c33f179af0d6 upstream. The current code checks for stored_mpdu_num > 1, causing the reorder_timer to be triggered indefinitely, but the frame is never timed-out (until the next packet is received) Signed-off-by: Eliad Peller Acked-by: Johannes Berg Signed-off-by: John W. Linville Signed-off-by: Greg Kroah-Hartman --- net/mac80211/rx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 378bd67..4100065 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -610,7 +610,7 @@ static void ieee80211_sta_reorder_release(struct ieee80211_hw *hw, index = seq_sub(tid_agg_rx->head_seq_num, tid_agg_rx->ssn) % tid_agg_rx->buf_size; if (!tid_agg_rx->reorder_buf[index] && - tid_agg_rx->stored_mpdu_num > 1) { + tid_agg_rx->stored_mpdu_num) { /* * No buffers ready to be released, but check whether any * frames in the reorder buffer have timed out. -- cgit v1.1 From 0a3e045705af3ea9d61560d4f6ffe2ce62f81992 Mon Sep 17 00:00:00 2001 From: Mohammed Shafi Shajakhan Date: Thu, 9 Feb 2012 19:59:43 +0530 Subject: mac80211: Fix a rwlock bad magic bug commit b57e6b560fc2a2742910ac5ca0eb2c46e45aeac2 upstream. read_lock(&tpt_trig->trig.leddev_list_lock) is accessed via the path ieee80211_open (->) ieee80211_do_open (->) ieee80211_mod_tpt_led_trig (->) ieee80211_start_tpt_led_trig (->) tpt_trig_timer before initializing it. the intilization of this read/write lock happens via the path ieee80211_led_init (->) led_trigger_register, but we are doing 'ieee80211_led_init' after 'ieeee80211_if_add' where we register netdev_ops. 
so we access leddev_list_lock before initializing it and causes the following bug in chrome laptops with AR928X cards with the following script while true do sudo modprobe -v ath9k sleep 3 sudo modprobe -r ath9k sleep 3 done BUG: rwlock bad magic on CPU#1, wpa_supplicant/358, f5b9eccc Pid: 358, comm: wpa_supplicant Not tainted 3.0.13 #1 Call Trace: [<8137b9df>] rwlock_bug+0x3d/0x47 [<81179830>] do_raw_read_lock+0x19/0x29 [<8137f063>] _raw_read_lock+0xd/0xf [] tpt_trig_timer+0xc3/0x145 [mac80211] [] ieee80211_mod_tpt_led_trig+0x152/0x174 [mac80211] [] ieee80211_do_open+0x11e/0x42e [mac80211] [] ? ieee80211_check_concurrent_iface+0x26/0x13c [mac80211] [] ieee80211_open+0x48/0x4c [mac80211] [<812dbed8>] __dev_open+0x82/0xab [<812dc0c9>] __dev_change_flags+0x9c/0x113 [<812dc1ae>] dev_change_flags+0x18/0x44 [<8132144f>] devinet_ioctl+0x243/0x51a [<81321ba9>] inet_ioctl+0x93/0xac [<812cc951>] sock_ioctl+0x1c6/0x1ea [<812cc78b>] ? might_fault+0x20/0x20 [<810b1ebb>] do_vfs_ioctl+0x46e/0x4a2 [<810a6ebb>] ? fget_light+0x2f/0x70 [<812ce549>] ? sys_recvmsg+0x3e/0x48 [<810b1f35>] sys_ioctl+0x46/0x69 [<8137fa77>] sysenter_do_call+0x12/0x2 Cc: Gary Morain Cc: Paul Stewart Cc: Abhijit Pradhan Cc: Vasanthakumar Thiagarajan Cc: Rajkumar Manoharan Acked-by: Johannes Berg Tested-by: Mohammed Shafi Shajakhan Signed-off-by: Mohammed Shafi Shajakhan Signed-off-by: John W. Linville Signed-off-by: Greg Kroah-Hartman --- net/mac80211/main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 866f269..1e36fb3 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -910,6 +910,8 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) wiphy_debug(local->hw.wiphy, "Failed to initialize wep: %d\n", result); + ieee80211_led_init(local); + rtnl_lock(); result = ieee80211_init_rate_ctrl_alg(local, @@ -931,8 +933,6 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) rtnl_unlock(); - ieee80211_led_init(local); - local->network_latency_notifier.notifier_call = ieee80211_max_network_latency; result = pm_qos_add_notifier(PM_QOS_NETWORK_LATENCY, -- cgit v1.1 From c3e8445f6ec4ad66c5143d6df8528f7440429b91 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 6 Feb 2012 15:14:37 -0500 Subject: net: Make qdisc_skb_cb upper size bound explicit. [ Upstream commit 16bda13d90c8d5da243e2cfa1677e62ecce26860 ] Just like skb->cb[], so that qdisc_skb_cb can be encapsulated inside of other data structures. This is intended to be used by IPoIB so that it can remember addressing information stored at hard_header_ops->create() time that it can fetch when the packet gets to the transmit routine. Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman --- net/sched/sch_choke.c | 3 +-- net/sched/sch_netem.c | 3 +-- net/sched/sch_sfb.c | 3 +-- 3 files changed, 3 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index 06afbae..178ee83 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -225,8 +225,7 @@ struct choke_skb_cb { static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb) { - BUILD_BUG_ON(sizeof(skb->cb) < - sizeof(struct qdisc_skb_cb) + sizeof(struct choke_skb_cb)); + qdisc_cb_private_validate(skb, sizeof(struct choke_skb_cb)); return (struct choke_skb_cb *)qdisc_skb_cb(skb)->data; } diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 69c35f6..87b9658 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -117,8 +117,7 @@ struct netem_skb_cb { static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb) { - BUILD_BUG_ON(sizeof(skb->cb) < - sizeof(struct qdisc_skb_cb) + sizeof(struct netem_skb_cb)); + qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb)); return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data; } diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index 0a833d0..47ee29f 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -93,8 +93,7 @@ struct sfb_skb_cb { static inline struct sfb_skb_cb *sfb_skb_cb(const struct sk_buff *skb) { - BUILD_BUG_ON(sizeof(skb->cb) < - sizeof(struct qdisc_skb_cb) + sizeof(struct sfb_skb_cb)); + qdisc_cb_private_validate(skb, sizeof(struct sfb_skb_cb)); return (struct sfb_skb_cb *)qdisc_skb_cb(skb)->data; } -- cgit v1.1 From 32fa5d83232b026092505e9165e119e5806b930d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 8 Feb 2012 08:51:50 +0000 Subject: gro: more generic L2 header check [ Upstream commit 5ca3b72c5da47d95b83857b768def6172fbc080a ] Shlomo Pongratz reported GRO L2 header check was suited for Ethernet only, and failed on IB/ipoib traffic. He provided a patch faking a zeroed header to let GRO aggregates frames. Roland Dreier, Herbert Xu, and others suggested we change GRO L2 header check to be more generic, ie not assuming L2 header is 14 bytes, but taking into account hard_header_len. __napi_gro_receive() has special handling for the common case (Ethernet) to avoid a memcmp() call and use an inline optimized function instead. Signed-off-by: Eric Dumazet Reported-by: Shlomo Pongratz Cc: Roland Dreier Cc: Or Gerlitz Cc: Herbert Xu Tested-by: Sean Hefty Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman --- net/core/dev.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index f14f601..17fdbf8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3434,14 +3434,20 @@ static inline gro_result_t __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { struct sk_buff *p; + unsigned int maclen = skb->dev->hard_header_len; for (p = napi->gro_list; p; p = p->next) { unsigned long diffs; diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; diffs |= p->vlan_tci ^ skb->vlan_tci; - diffs |= compare_ether_header(skb_mac_header(p), - skb_gro_mac_header(skb)); + if (maclen == ETH_HLEN) + diffs |= compare_ether_header(skb_mac_header(p), + skb_gro_mac_header(skb)); + else if (!diffs) + diffs = memcmp(skb_mac_header(p), + skb_gro_mac_header(skb), + maclen); NAPI_GRO_CB(p)->same_flow = !diffs; NAPI_GRO_CB(p)->flush = 0; } -- cgit v1.1 From 23b139ecf944b097a0493262cbc04886363bd8e6 Mon Sep 17 00:00:00 2001 From: Ben Greear Date: Tue, 27 Sep 2011 15:16:08 -0400 Subject: ipv6-multicast: Fix memory leak in input path. [ Upstream commit 2015de5fe2a47086a3260802275932bfd810884e ] Have to free the skb before returning if we fail the fib lookup. Signed-off-by: Ben Greear Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv6/ip6mr.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 82a8099..450a1ff 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -2051,8 +2051,10 @@ int ip6_mr_input(struct sk_buff *skb) int err; err = ip6mr_fib_lookup(net, &fl6, &mrt); - if (err < 0) + if (err < 0) { + kfree_skb(skb); return err; + } read_lock(&mrt_lock); cache = ip6mr_cache_find(mrt, -- cgit v1.1 From 24190a04c967d91e718bbe7a871e418edb9424aa Mon Sep 17 00:00:00 2001 From: Ben Greear Date: Fri, 23 Sep 2011 13:11:01 +0000 Subject: ipv6-multicast: Fix memory leak in IPv6 multicast. [ Upstream commit 67928c4041606f02725f3c95c4c0404e4532df1b ] If reg_vif_xmit cannot find a routing entry, be sure to free the skb before returning the error. Signed-off-by: Ben Greear Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv6/ip6mr.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 450a1ff..86e3cc1 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -696,8 +696,10 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, int err; err = ip6mr_fib_lookup(net, &fl6, &mrt); - if (err < 0) + if (err < 0) { + kfree_skb(skb); return err; + } read_lock(&mrt_lock); dev->stats.tx_bytes += skb->len; -- cgit v1.1 From c4f2403478002bdb0a46a62f87909aeda8058d8b Mon Sep 17 00:00:00 2001 From: Li Wei Date: Tue, 8 Nov 2011 21:39:28 +0000 Subject: ipv4: fix for ip_options_rcv_srr() daddr update. [ Upstream commit b12f62efb8ec0b9523bdb6c2d412c07193086de9 ] When opt->srr_is_hit is set skb_rtable(skb) has been updated for 'nexthop' and iph->daddr should always equals to skb_rtable->rt_dst holds, We need update iph->daddr either. Signed-off-by: Li Wei Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/ip_options.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index ec93335..05d20cc 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -640,6 +640,7 @@ int ip_options_rcv_srr(struct sk_buff *skb) } if (srrptr <= srrspace) { opt->srr_is_hit = 1; + iph->daddr = nexthop; opt->is_changed = 1; } return 0; -- cgit v1.1 From 5805d4729059f578c8283ccc44021342fbe8c93d Mon Sep 17 00:00:00 2001 From: Li Wei Date: Tue, 22 Nov 2011 23:33:10 +0000 Subject: ipv4: Save nexthop address of LSRR/SSRR option to IPCB. [ Upstream commit ac8a48106be49c422575ddc7531b776f8eb49610 ] We can not update iph->daddr in ip_options_rcv_srr(), It is too early. When some exception ocurred later (eg. in ip_forward() when goto sr_failed) we need the ip header be identical to the original one as ICMP need it. Add a field 'nexthop' in struct ip_options to save nexthop of LSRR or SSRR option. Signed-off-by: Li Wei Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/ip_forward.c | 2 +- net/ipv4/ip_options.c | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 3b34d1c..29a07b6 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -84,7 +84,7 @@ int ip_forward(struct sk_buff *skb) rt = skb_rtable(skb); - if (opt->is_strictroute && ip_hdr(skb)->daddr != rt->rt_gateway) + if (opt->is_strictroute && opt->nexthop != rt->rt_gateway) goto sr_failed; if (unlikely(skb->len > dst_mtu(&rt->dst) && !skb_is_gso(skb) && diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 05d20cc..1e60f76 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -568,12 +568,13 @@ void ip_forward_options(struct sk_buff *skb) ) { if (srrptr + 3 > srrspace) break; - if (memcmp(&ip_hdr(skb)->daddr, &optptr[srrptr-1], 4) == 0) + if (memcmp(&opt->nexthop, &optptr[srrptr-1], 4) == 0) break; } if (srrptr + 3 <= srrspace) { opt->is_changed = 1; ip_rt_get_source(&optptr[srrptr-1], skb, rt); + ip_hdr(skb)->daddr = opt->nexthop; optptr[2] = srrptr+4; } else if (net_ratelimit()) printk(KERN_CRIT "ip_forward(): Argh! Destination lost!\n"); @@ -640,7 +641,7 @@ int ip_options_rcv_srr(struct sk_buff *skb) } if (srrptr <= srrspace) { opt->srr_is_hit = 1; - iph->daddr = nexthop; + opt->nexthop = nexthop; opt->is_changed = 1; } return 0; -- cgit v1.1 From ab2fd30a38e23f0b6f2e331889b31f4f6fb2378a Mon Sep 17 00:00:00 2001 From: Li Wei Date: Thu, 9 Feb 2012 21:15:25 +0000 Subject: ipv4: Fix wrong order of ip_rt_get_source() and update iph->daddr. [ Upstream commit 5dc7883f2a7c25f8df40d7479687153558cd531b ] This patch fix a bug which introduced by commit ac8a4810 (ipv4: Save nexthop address of LSRR/SSRR option to IPCB.).In that patch, we saved the nexthop of SRR in ip_option->nexthop and update iph->daddr until we get to ip_forward_options(), but we need to update it before ip_rt_get_source(), otherwise we may get a wrong src. Signed-off-by: Li Wei Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/ip_options.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 1e60f76..42dd1a9 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -573,8 +573,8 @@ void ip_forward_options(struct sk_buff *skb) } if (srrptr + 3 <= srrspace) { opt->is_changed = 1; - ip_rt_get_source(&optptr[srrptr-1], skb, rt); ip_hdr(skb)->daddr = opt->nexthop; + ip_rt_get_source(&optptr[srrptr-1], skb, rt); optptr[2] = srrptr+4; } else if (net_ratelimit()) printk(KERN_CRIT "ip_forward(): Argh! Destination lost!\n"); -- cgit v1.1 From 1831cd9e1fb43c2e700e795827cb9531e679b0ef Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Fri, 10 Feb 2012 04:07:11 +0000 Subject: net: Don't proxy arp respond if iif == rt->dst.dev if private VLAN is disabled [ Upstream commit 70620c46ac2b45c24b0f22002fdf5ddd1f7daf81 ] Commit 653241 (net: RFC3069, private VLAN proxy arp support) changed the behavior of arp proxy to send arp replies back out on the interface the request came in even if the private VLAN feature is disabled. Previously we checked rt->dst.dev != skb->dev for in scenarios, when proxy arp is enabled on for the netdevice and also when individual proxy neighbour entries have been added. This patch adds the check back for the pneigh_lookup() scenario. Signed-off-by: Thomas Graf Acked-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/arp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 1d5675e..d8f852d 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -906,7 +906,8 @@ static int arp_process(struct sk_buff *skb) if (addr_type == RTN_UNICAST && (arp_fwd_proxy(in_dev, dev, rt) || arp_fwd_pvlan(in_dev, dev, rt, sip, tip) || - pneigh_lookup(&arp_tbl, net, &tip, dev, 0))) { + (rt->dst.dev != dev && + pneigh_lookup(&arp_tbl, net, &tip, dev, 0)))) { n = neigh_event_ns(&arp_tbl, sha, &sip, dev); if (n) neigh_release(n); -- cgit v1.1 From 9f8a28dca634c4aa13127ed2ea5de94c1e47dbb7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 14 Feb 2012 10:11:59 +0000 Subject: netpoll: netpoll_poll_dev() should access dev->flags [ Upstream commit 58e05f357a039a94aa36475f8c110256f693a239 ] commit 5a698af53f (bond: service netpoll arp queue on master device) tested IFF_SLAVE flag against dev->priv_flags instead of dev->flags Signed-off-by: Eric Dumazet Cc: WANG Cong Acked-by: Neil Horman Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/core/netpoll.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 18d9cbd..05db410 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -193,7 +193,7 @@ void netpoll_poll_dev(struct net_device *dev) poll_napi(dev); - if (dev->priv_flags & IFF_SLAVE) { + if (dev->flags & IFF_SLAVE) { if (dev->npinfo) { struct net_device *bond_dev = dev->master; struct sk_buff *skb; -- cgit v1.1 From 1609e23b0c313d58664bcc0677eadd9464c8c2cc Mon Sep 17 00:00:00 2001 From: Hagen Paul Pfeifer Date: Wed, 4 Jan 2012 17:35:26 +0000 Subject: net_sched: Bug in netem reordering [ Upstream commit eb10192447370f19a215a8c2749332afa1199d46 ] Not now, but it looks you are correct. q->qdisc is NULL until another additional qdisc is attached (beside tfifo). See 50612537e9ab2969312. The following patch should work. 
From: Hagen Paul Pfeifer netem: catch NULL pointer by updating the real qdisc statistic Reported-by: Vijay Subramanian Signed-off-by: Hagen Paul Pfeifer Acked-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/sched/sch_netem.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 87b9658..2f68459 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -381,8 +381,8 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) q->counter = 0; __skb_queue_head(&q->qdisc->q, skb); - q->qdisc->qstats.backlog += qdisc_pkt_len(skb); - q->qdisc->qstats.requeues++; + sch->qstats.backlog += qdisc_pkt_len(skb); + sch->qstats.requeues++; ret = NET_XMIT_SUCCESS; } -- cgit v1.1 From 39b73fb4fedd23980b720d0272e5e26510cd6940 Mon Sep 17 00:00:00 2001 From: Shawn Lu Date: Sat, 4 Feb 2012 12:38:09 +0000 Subject: tcp_v4_send_reset: binding oif to iif in no sock case [ Upstream commit e2446eaab5585555a38ea0df4e01ff313dbb4ac9 ] Binding RST packet outgoing interface to incoming interface for tcp v4 when there is no socket associate with it. when sk is not NULL, using sk->sk_bound_dev_if instead. (suggested by Eric Dumazet). This has few benefits: 1. tcp_v6_send_reset already did that. 2. This helps tcp connect with SO_BINDTODEVICE set. When connection is lost, we still able to sending out RST using same interface. 3. we are sending reply, it is most likely to be succeed if iif is used Signed-off-by: Shawn Lu Acked-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp_ipv4.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 53b0125..04c6592 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -650,6 +650,11 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) arg.iov[0].iov_len, IPPROTO_TCP, 0); arg.csumoffset = offsetof(struct tcphdr, check) / 2; arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0; + /* When socket is gone, all binding information is lost. + * routing might fail in this case. using iif for oif to + * make sure we can deliver it + */ + arg.bound_dev_if = sk ? sk->sk_bound_dev_if : inet_iif(skb); net = dev_net(skb_dst(skb)->dev); ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr, -- cgit v1.1 From 382e8f84cb3db5295aa2f142a11e42eac6544ab4 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Sun, 12 Feb 2012 18:37:09 +0000 Subject: tcp: allow tcp_sacktag_one() to tag ranges not aligned with skbs [ Upstream commit cc9a672ee522d4805495b98680f4a3db5d0a0af9 ] This commit allows callers of tcp_sacktag_one() to pass in sequence ranges that do not align with skb boundaries, as tcp_shifted_skb() needs to do in an upcoming fix in this patch series. In fact, now tcp_sacktag_one() does not need to depend on an input skb at all, which makes its semantics and dependencies more clear. Signed-off-by: Neal Cardwell Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp_input.c | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c68040f..af41044 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1289,25 +1289,26 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, return in_sack; } -static u8 tcp_sacktag_one(struct sk_buff *skb, struct sock *sk, - struct tcp_sacktag_state *state, +/* Mark the given newly-SACKed range as such, adjusting counters and hints. */ +static u8 tcp_sacktag_one(struct sock *sk, + struct tcp_sacktag_state *state, u8 sacked, + u32 start_seq, u32 end_seq, int dup_sack, int pcount) { struct tcp_sock *tp = tcp_sk(sk); - u8 sacked = TCP_SKB_CB(skb)->sacked; int fack_count = state->fack_count; /* Account D-SACK for retransmitted packet. */ if (dup_sack && (sacked & TCPCB_RETRANS)) { if (tp->undo_marker && tp->undo_retrans && - after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker)) + after(end_seq, tp->undo_marker)) tp->undo_retrans--; if (sacked & TCPCB_SACKED_ACKED) state->reord = min(fack_count, state->reord); } /* Nothing to do; acked frame is about to be dropped (was ACKed). */ - if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) + if (!after(end_seq, tp->snd_una)) return sacked; if (!(sacked & TCPCB_SACKED_ACKED)) { @@ -1326,13 +1327,13 @@ static u8 tcp_sacktag_one(struct sk_buff *skb, struct sock *sk, /* New sack for not retransmitted frame, * which was in hole. It is reordering. */ - if (before(TCP_SKB_CB(skb)->seq, + if (before(start_seq, tcp_highest_sack_seq(tp))) state->reord = min(fack_count, state->reord); /* SACK enhanced F-RTO (RFC4138; Appendix B) */ - if (!after(TCP_SKB_CB(skb)->end_seq, tp->frto_highmark)) + if (!after(end_seq, tp->frto_highmark)) state->flag |= FLAG_ONLY_ORIG_SACKED; } @@ -1350,8 +1351,7 @@ static u8 tcp_sacktag_one(struct sk_buff *skb, struct sock *sk, /* Lost marker hint past SACKed? Tweak RFC3517 cnt */ if (!tcp_is_fack(tp) && (tp->lost_skb_hint != NULL) && - before(TCP_SKB_CB(skb)->seq, - TCP_SKB_CB(tp->lost_skb_hint)->seq)) + before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq)) tp->lost_cnt_hint += pcount; if (fack_count > tp->fackets_out) @@ -1407,7 +1407,11 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, } /* We discard results */ - tcp_sacktag_one(skb, sk, state, dup_sack, pcount); + tcp_sacktag_one(sk, state, + TCP_SKB_CB(skb)->sacked, + TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(skb)->end_seq, + dup_sack, pcount); /* Difference in this won't matter, both ACKed by the same cumul. ACK */ TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS); @@ -1646,10 +1650,14 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, break; if (in_sack) { - TCP_SKB_CB(skb)->sacked = tcp_sacktag_one(skb, sk, - state, - dup_sack, - tcp_skb_pcount(skb)); + TCP_SKB_CB(skb)->sacked = + tcp_sacktag_one(sk, + state, + TCP_SKB_CB(skb)->sacked, + TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(skb)->end_seq, + dup_sack, + tcp_skb_pcount(skb)); if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) -- cgit v1.1 From dd31c1ce7ef7b363215081fde02f13bf3e50b5a1 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Sun, 12 Feb 2012 18:37:10 +0000 Subject: tcp: fix range tcp_shifted_skb() passes to tcp_sacktag_one() [ Upstream commit daef52bab1fd26e24e8e9578f8fb33ba1d0cb412 ] Fix the newly-SACKed range to be the range of newly-shifted bytes. 
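A hypothetical illustration of the range involved (sequence numbers made up for this note, not taken from the patch):

	/*
	 * Suppose skb covers sequence range [1000, 3000) and 1000 bytes are
	 * shifted into the already-SACKed 'prev' skb, i.e. shifted == 1000.
	 *
	 *   newly-SACKed bytes:                          [1000, 2000)
	 *   range the old code passed (skb->seq read
	 *   after the "+= shifted" adjustments):         [2000, 3000)
	 *
	 * This patch records start_seq = 1000 and end_seq = start_seq +
	 * shifted = 2000 before the skbs are adjusted, so tcp_sacktag_one()
	 * is told about [1000, 2000) as intended.
	 */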
Previously - since 832d11c5cd076abc0aa1eaf7be96c81d1a59ce41 - tcp_shifted_skb() incorrectly called tcp_sacktag_one() with the start and end sequence numbers of the skb it passes in set to the range just beyond the range that is newly-SACKed. This commit also removes a special-case adjustment to lost_cnt_hint in tcp_shifted_skb() since the pre-existing adjustment of lost_cnt_hint in tcp_sacktag_one() now properly handles this things now that the correct start sequence number is passed in. Signed-off-by: Neal Cardwell Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp_input.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index af41044..78689e5 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1370,6 +1370,9 @@ static u8 tcp_sacktag_one(struct sock *sk, return sacked; } +/* Shift newly-SACKed bytes from this skb to the immediately previous + * already-SACKed sk_buff. Mark the newly-SACKed bytes as such. + */ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, struct tcp_sacktag_state *state, unsigned int pcount, int shifted, int mss, @@ -1377,12 +1380,11 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *prev = tcp_write_queue_prev(sk, skb); + u32 start_seq = TCP_SKB_CB(skb)->seq; /* start of newly-SACKed */ + u32 end_seq = start_seq + shifted; /* end of newly-SACKed */ BUG_ON(!pcount); - if (skb == tp->lost_skb_hint) - tp->lost_cnt_hint += pcount; - TCP_SKB_CB(prev)->end_seq += shifted; TCP_SKB_CB(skb)->seq += shifted; @@ -1406,12 +1408,11 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, skb_shinfo(skb)->gso_type = 0; } - /* We discard results */ - tcp_sacktag_one(sk, state, - TCP_SKB_CB(skb)->sacked, - TCP_SKB_CB(skb)->seq, - TCP_SKB_CB(skb)->end_seq, - dup_sack, pcount); + /* Adjust counters and hints for the newly sacked sequence range but + * discard the return value since prev is already marked. + */ + tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked, + start_seq, end_seq, dup_sack, pcount); /* Difference in this won't matter, both ACKed by the same cumul. ACK */ TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS); -- cgit v1.1 From 623f1904ef55789082259573bb6248df5fea3d92 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Mon, 13 Feb 2012 20:22:08 +0000 Subject: tcp: fix tcp_shifted_skb() adjustment of lost_cnt_hint for FACK [ Upstream commit 0af2a0d0576205dda778d25c6c344fc6508fc81d ] This commit ensures that lost_cnt_hint is correctly updated in tcp_shifted_skb() for FACK TCP senders. The lost_cnt_hint adjustment in tcp_sacktag_one() only applies to non-FACK senders, so FACK senders need their own adjustment. This applies the spirit of 1e5289e121372a3494402b1b131b41bfe1cf9b7f - except now that the sequence range passed into tcp_sacktag_one() is correct we need only have a special case adjustment for FACK. Signed-off-by: Neal Cardwell Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp_input.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 78689e5..ee08f11f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1385,6 +1385,10 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, BUG_ON(!pcount); + /* Adjust hint for FACK. Non-FACK is handled in tcp_sacktag_one(). 
*/ + if (tcp_is_fack(tp) && (skb == tp->lost_skb_hint)) + tp->lost_cnt_hint += pcount; + TCP_SKB_CB(prev)->end_seq += shifted; TCP_SKB_CB(skb)->seq += shifted; -- cgit v1.1 From bebee22bcbf0026f92141990972bd5863ef9b69c Mon Sep 17 00:00:00 2001 From: Flavio Leitner Date: Mon, 24 Oct 2011 02:56:38 -0400 Subject: route: fix ICMP redirect validation [ Upstream commit 7cc9150ebe8ec06cafea9f1c10d92ddacf88d8ae ] The commit f39925dbde7788cfb96419c0f092b086aa325c0f (ipv4: Cache learned redirect information in inetpeer.) removed some ICMP packet validations which are required by RFC 1122, section 3.2.2.2: ... A Redirect message SHOULD be silently discarded if the new gateway address it specifies is not on the same connected (sub-) net through which the Redirect arrived [INTRO:2, Appendix A], or if the source of the Redirect is not the current first-hop gateway for the specified destination (see Section 3.3.1). Signed-off-by: Flavio Leitner Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/route.c | 36 +++++++++++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 65ff2e5..f881be2 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1373,7 +1373,12 @@ static void rt_del(unsigned hash, struct rtable *rt) void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, __be32 saddr, struct net_device *dev) { + int s, i; struct in_device *in_dev = __in_dev_get_rcu(dev); + struct rtable *rt; + __be32 skeys[2] = { saddr, 0 }; + int ikeys[2] = { dev->ifindex, 0 }; + struct flowi4 fl4; struct inet_peer *peer; struct net *net; @@ -1396,13 +1401,34 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, goto reject_redirect; } - peer = inet_getpeer_v4(daddr, 1); - if (peer) { - peer->redirect_learned.a4 = new_gw; + memset(&fl4, 0, sizeof(fl4)); + fl4.daddr = daddr; + for (s = 0; s < 2; s++) { + for (i = 0; i < 2; i++) { + fl4.flowi4_oif = ikeys[i]; + fl4.saddr = skeys[s]; + rt = __ip_route_output_key(net, &fl4); + if (IS_ERR(rt)) + continue; - inet_putpeer(peer); + if (rt->dst.error || rt->dst.dev != dev || + rt->rt_gateway != old_gw) { + ip_rt_put(rt); + continue; + } - atomic_inc(&__rt_peer_genid); + if (!rt->peer) + rt_bind_peer(rt, rt->rt_dst, 1); + + peer = rt->peer; + if (peer) { + peer->redirect_learned.a4 = new_gw; + atomic_inc(&__rt_peer_genid); + } + + ip_rt_put(rt); + return; + } } return; -- cgit v1.1 From 42ab5316ddcaa0de23e88e8a3d363c767b9ab0b3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 18 Nov 2011 15:24:32 -0500 Subject: ipv4: fix redirect handling [ Upstream commit 9cc20b268a5a14f5e57b8ad405a83513ab0d78dc ] commit f39925dbde77 (ipv4: Cache learned redirect information in inetpeer.) introduced a regression in ICMP redirect handling. It assumed ipv4_dst_check() would be called because all possible routes were attached to the inetpeer we modify in ip_rt_redirect(), but thats not true. commit 7cc9150ebe (route: fix ICMP redirect validation) tried to fix this but solution was not complete. (It fixed only one route) So we must lookup existing routes (including different TOS values) and call check_peer_redir() on them. Reported-by: Ivan Zahariev Signed-off-by: Eric Dumazet CC: Flavio Leitner Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/route.c | 109 +++++++++++++++++++++++++++++-------------------------- 1 file changed, 58 insertions(+), 51 deletions(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index f881be2..6b95f74 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1369,16 +1369,41 @@ static void rt_del(unsigned hash, struct rtable *rt) spin_unlock_bh(rt_hash_lock_addr(hash)); } +static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer) +{ + struct rtable *rt = (struct rtable *) dst; + __be32 orig_gw = rt->rt_gateway; + struct neighbour *n, *old_n; + + dst_confirm(&rt->dst); + + rt->rt_gateway = peer->redirect_learned.a4; + n = __arp_bind_neighbour(&rt->dst, rt->rt_gateway); + if (IS_ERR(n)) + return PTR_ERR(n); + old_n = xchg(&rt->dst._neighbour, n); + if (old_n) + neigh_release(old_n); + if (!n || !(n->nud_state & NUD_VALID)) { + if (n) + neigh_event_send(n, NULL); + rt->rt_gateway = orig_gw; + return -EAGAIN; + } else { + rt->rt_flags |= RTCF_REDIRECTED; + call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n); + } + return 0; +} + /* called in rcu_read_lock() section */ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, __be32 saddr, struct net_device *dev) { int s, i; struct in_device *in_dev = __in_dev_get_rcu(dev); - struct rtable *rt; __be32 skeys[2] = { saddr, 0 }; int ikeys[2] = { dev->ifindex, 0 }; - struct flowi4 fl4; struct inet_peer *peer; struct net *net; @@ -1401,33 +1426,42 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, goto reject_redirect; } - memset(&fl4, 0, sizeof(fl4)); - fl4.daddr = daddr; for (s = 0; s < 2; s++) { for (i = 0; i < 2; i++) { - fl4.flowi4_oif = ikeys[i]; - fl4.saddr = skeys[s]; - rt = __ip_route_output_key(net, &fl4); - if (IS_ERR(rt)) - continue; - - if (rt->dst.error || rt->dst.dev != dev || - rt->rt_gateway != old_gw) { - ip_rt_put(rt); - continue; - } + unsigned int hash; + struct rtable __rcu **rthp; + struct rtable *rt; + + hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net)); + + rthp = &rt_hash_table[hash].chain; + + while ((rt = rcu_dereference(*rthp)) != NULL) { + rthp = &rt->dst.rt_next; + + if (rt->rt_key_dst != daddr || + rt->rt_key_src != skeys[s] || + rt->rt_oif != ikeys[i] || + rt_is_input_route(rt) || + rt_is_expired(rt) || + !net_eq(dev_net(rt->dst.dev), net) || + rt->dst.error || + rt->dst.dev != dev || + rt->rt_gateway != old_gw) + continue; - if (!rt->peer) - rt_bind_peer(rt, rt->rt_dst, 1); + if (!rt->peer) + rt_bind_peer(rt, rt->rt_dst, 1); - peer = rt->peer; - if (peer) { - peer->redirect_learned.a4 = new_gw; - atomic_inc(&__rt_peer_genid); + peer = rt->peer; + if (peer) { + if (peer->redirect_learned.a4 != new_gw) { + peer->redirect_learned.a4 = new_gw; + atomic_inc(&__rt_peer_genid); + } + check_peer_redir(&rt->dst, peer); + } } - - ip_rt_put(rt); - return; } } return; @@ -1715,33 +1749,6 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu) } } -static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer) -{ - struct rtable *rt = (struct rtable *) dst; - __be32 orig_gw = rt->rt_gateway; - struct neighbour *n, *old_n; - - dst_confirm(&rt->dst); - - rt->rt_gateway = peer->redirect_learned.a4; - n = __arp_bind_neighbour(&rt->dst, rt->rt_gateway); - if (IS_ERR(n)) - return PTR_ERR(n); - old_n = xchg(&rt->dst._neighbour, n); - if (old_n) - neigh_release(old_n); - if (!n || !(n->nud_state & NUD_VALID)) { - if (n) - neigh_event_send(n, NULL); - rt->rt_gateway = orig_gw; - return -EAGAIN; - } else { - 
rt->rt_flags |= RTCF_REDIRECTED; - call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n); - } - return 0; -} - static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) { struct rtable *rt = (struct rtable *) dst; -- cgit v1.1 From 426f45680cc71385a8929f11654c789f5019315c Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Fri, 27 Jan 2012 10:45:27 +0900 Subject: ipvs: fix matching of fwmark templates during scheduling commit e0aac52e17a3db68fe2ceae281780a70fc69957f upstream. Commit f11017ec2d1859c661f4e2b12c4a8d250e1f47cf (2.6.37) moved the fwmark variable in subcontext that is invalidated before reaching the ip_vs_ct_in_get call. As vaddr is provided as pointer in the param structure make sure the fwmark variable is in same context. As the fwmark templates can not be matched, more and more template connections are created and the controlled connections can not go to single real server. Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman Signed-off-by: Pablo Neira Ayuso Signed-off-by: Greg Kroah-Hartman --- net/netfilter/ipvs/ip_vs_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 24c28d2..0787bed 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -233,6 +233,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc, __be16 dport = 0; /* destination port to forward */ unsigned int flags; struct ip_vs_conn_param param; + const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) }; union nf_inet_addr snet; /* source network of the client, after masking */ @@ -268,7 +269,6 @@ ip_vs_sched_persist(struct ip_vs_service *svc, { int protocol = iph.protocol; const union nf_inet_addr *vaddr = &iph.daddr; - const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) }; __be16 vport = 0; if (dst_port == svc->port) { -- cgit v1.1 From 34a9660ba1a8b98adf852f4f1090bdf084ccf991 Mon Sep 17 00:00:00 2001 From: Mohammed Shafi Shajakhan Date: Mon, 20 Feb 2012 10:05:31 +0530 Subject: mac80211: zero initialize count field in ieee80211_tx_rate commit 8617b093d0031837a7be9b32bc674580cfb5f6b5 upstream. rate control algorithms concludes the rate as invalid with rate[i].idx < -1 , while they do also check for rate[i].count is non-zero. it would be safer to zero initialize the 'count' field. recently we had a ath9k rate control crash where the ath9k rate control in ath_tx_status assumed to check only for rate[i].count being non-zero in one instance and ended up in using invalid rate index for 'connection monitoring NULL func frames' which eventually lead to the crash. thanks to Pavel Roskin for fixing it and finding the root cause. https://bugzilla.redhat.com/show_bug.cgi?id=768639 Cc: Pavel Roskin Signed-off-by: Mohammed Shafi Shajakhan Signed-off-by: John W. 
Linville Signed-off-by: Greg Kroah-Hartman --- net/mac80211/rate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c index 3d5a2cb..816590b 100644 --- a/net/mac80211/rate.c +++ b/net/mac80211/rate.c @@ -314,7 +314,7 @@ void rate_control_get_rate(struct ieee80211_sub_if_data *sdata, for (i = 0; i < IEEE80211_TX_MAX_RATES; i++) { info->control.rates[i].idx = -1; info->control.rates[i].flags = 0; - info->control.rates[i].count = 1; + info->control.rates[i].count = 0; } if (sdata->local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL) -- cgit v1.1 From 035e3f6e8d1353abcbefd5b87710f8ae8bf1b4f6 Mon Sep 17 00:00:00 2001 From: Michel Machado Date: Tue, 21 Feb 2012 11:04:13 +0000 Subject: neighbour: Fixed race condition at tbl->nht [ Upstream commit 84338a6c9dbb6ff3de4749864020f8f25d86fc81 ] When the fixed race condition happens: 1. While function neigh_periodic_work scans the neighbor hash table pointed by field tbl->nht, it unlocks and locks tbl->lock between buckets in order to call cond_resched. 2. Assume that function neigh_periodic_work calls cond_resched, that is, the lock tbl->lock is available, and function neigh_hash_grow runs. 3. Once function neigh_hash_grow finishes, and RCU calls neigh_hash_free_rcu, the original struct neigh_hash_table that function neigh_periodic_work was using doesn't exist anymore. 4. Once back at neigh_periodic_work, whenever the old struct neigh_hash_table is accessed, things can go badly. Signed-off-by: Michel Machado CC: "David S. Miller" CC: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/core/neighbour.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 8c54aff..96bb0a3 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -823,6 +823,8 @@ next_elt: write_unlock_bh(&tbl->lock); cond_resched(); write_lock_bh(&tbl->lock); + nht = rcu_dereference_protected(tbl->nht, + lockdep_is_held(&tbl->lock)); } /* Cycle through all hash buckets every base_reachable_time/2 ticks. * ARP entry timeouts range from 1/2 base_reachable_time to 3/2 -- cgit v1.1 From e38b849e2fc481c5a6924f6872468104969e5d3c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 23 Feb 2012 10:55:02 +0000 Subject: ipsec: be careful of non existing mac headers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 03606895cd98c0a628b17324fd7b5ff15db7e3cd ] Niccolo Belli reported ipsec crashes in case we handle a frame without mac header (atm in his case) Before copying mac header, better make sure it is present. Bugzilla reference: https://bugzilla.kernel.org/show_bug.cgi?id=42809 Reported-by: Niccolò Belli Tested-by: Niccolò Belli Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/xfrm4_mode_beet.c | 5 +---- net/ipv4/xfrm4_mode_tunnel.c | 6 ++---- net/ipv6/xfrm6_mode_beet.c | 6 +----- net/ipv6/xfrm6_mode_tunnel.c | 6 ++---- 4 files changed, 6 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/ipv4/xfrm4_mode_beet.c b/net/ipv4/xfrm4_mode_beet.c index 6341818..e3db3f9 100644 --- a/net/ipv4/xfrm4_mode_beet.c +++ b/net/ipv4/xfrm4_mode_beet.c @@ -110,10 +110,7 @@ static int xfrm4_beet_input(struct xfrm_state *x, struct sk_buff *skb) skb_push(skb, sizeof(*iph)); skb_reset_network_header(skb); - - memmove(skb->data - skb->mac_len, skb_mac_header(skb), - skb->mac_len); - skb_set_mac_header(skb, -skb->mac_len); + skb_mac_header_rebuild(skb); xfrm4_beet_make_header(skb); diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c index 534972e..ed4bf11 100644 --- a/net/ipv4/xfrm4_mode_tunnel.c +++ b/net/ipv4/xfrm4_mode_tunnel.c @@ -66,7 +66,6 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) { - const unsigned char *old_mac; int err = -EINVAL; if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPIP) @@ -84,10 +83,9 @@ static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) if (!(x->props.flags & XFRM_STATE_NOECN)) ipip_ecn_decapsulate(skb); - old_mac = skb_mac_header(skb); - skb_set_mac_header(skb, -skb->mac_len); - memmove(skb_mac_header(skb), old_mac, skb->mac_len); skb_reset_network_header(skb); + skb_mac_header_rebuild(skb); + err = 0; out: diff --git a/net/ipv6/xfrm6_mode_beet.c b/net/ipv6/xfrm6_mode_beet.c index 3437d7d..f37cba9 100644 --- a/net/ipv6/xfrm6_mode_beet.c +++ b/net/ipv6/xfrm6_mode_beet.c @@ -80,7 +80,6 @@ static int xfrm6_beet_output(struct xfrm_state *x, struct sk_buff *skb) static int xfrm6_beet_input(struct xfrm_state *x, struct sk_buff *skb) { struct ipv6hdr *ip6h; - const unsigned char *old_mac; int size = sizeof(struct ipv6hdr); int err; @@ -90,10 +89,7 @@ static int xfrm6_beet_input(struct xfrm_state *x, struct sk_buff *skb) __skb_push(skb, size); skb_reset_network_header(skb); - - old_mac = skb_mac_header(skb); - skb_set_mac_header(skb, -skb->mac_len); - memmove(skb_mac_header(skb), old_mac, skb->mac_len); + skb_mac_header_rebuild(skb); xfrm6_beet_make_header(skb); diff --git a/net/ipv6/xfrm6_mode_tunnel.c b/net/ipv6/xfrm6_mode_tunnel.c index 4d6edff..23ecd68 100644 --- a/net/ipv6/xfrm6_mode_tunnel.c +++ b/net/ipv6/xfrm6_mode_tunnel.c @@ -63,7 +63,6 @@ static int xfrm6_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) static int xfrm6_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) { int err = -EINVAL; - const unsigned char *old_mac; if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPV6) goto out; @@ -80,10 +79,9 @@ static int xfrm6_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) if (!(x->props.flags & XFRM_STATE_NOECN)) ipip6_ecn_decapsulate(skb); - old_mac = skb_mac_header(skb); - skb_set_mac_header(skb, -skb->mac_len); - memmove(skb_mac_header(skb), old_mac, skb->mac_len); skb_reset_network_header(skb); + skb_mac_header_rebuild(skb); + err = 0; out: -- cgit v1.1 From 85526d578a0c7b6723c1a429b39870ce3bfec11c Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Sun, 26 Feb 2012 10:06:19 +0000 Subject: tcp: fix false reordering signal in tcp_shifted_skb [ Upstream commit 4c90d3b30334833450ccbb02f452d4972a3c3c3f ] When tcp_shifted_skb() shifts bytes from the skb that is currently pointed to by 'highest_sack' 
then the increment of TCP_SKB_CB(skb)->seq implicitly advances tcp_highest_sack_seq(). This implicit advancement, combined with the recent fix to pass the correct SACKed range into tcp_sacktag_one(), caused tcp_sacktag_one() to think that the newly SACKed range was before the tcp_highest_sack_seq(), leading to a call to tcp_update_reordering() with a degree of reordering matching the size of the newly SACKed range (typically just 1 packet, which is a NOP, but potentially larger). This commit fixes this by simply calling tcp_sacktag_one() before the TCP_SKB_CB(skb)->seq advancement that can advance our notion of the highest SACKed sequence. Correspondingly, we can simplify the code a little now that tcp_shifted_skb() should update the lost_cnt_hint in all cases where skb == tp->lost_skb_hint. Signed-off-by: Neal Cardwell Acked-by: Yuchung Cheng Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp_input.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index ee08f11f..8eb1302 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1385,8 +1385,16 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, BUG_ON(!pcount); - /* Adjust hint for FACK. Non-FACK is handled in tcp_sacktag_one(). */ - if (tcp_is_fack(tp) && (skb == tp->lost_skb_hint)) + /* Adjust counters and hints for the newly sacked sequence + * range but discard the return value since prev is already + * marked. We must tag the range first because the seq + * advancement below implicitly advances + * tcp_highest_sack_seq() when skb is highest_sack. + */ + tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked, + start_seq, end_seq, dup_sack, pcount); + + if (skb == tp->lost_skb_hint) tp->lost_cnt_hint += pcount; TCP_SKB_CB(prev)->end_seq += shifted; @@ -1412,12 +1420,6 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, skb_shinfo(skb)->gso_type = 0; } - /* Adjust counters and hints for the newly sacked sequence range but - * discard the return value since prev is already marked. - */ - tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked, - start_seq, end_seq, dup_sack, pcount); - /* Difference in this won't matter, both ACKed by the same cumul. ACK */ TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS); -- cgit v1.1 From 6046dc7d1b061d0f8e820757716de7df5079aa35 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Fri, 2 Mar 2012 21:36:51 +0000 Subject: tcp: don't fragment SACKed skbs in tcp_mark_head_lost() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit c0638c247f559e1a16ee79e54df14bca2cb679ea ] In tcp_mark_head_lost() we should not attempt to fragment a SACKed skb to mark the first portion as lost. This is for two primary reasons: (1) tcp_shifted_skb() coalesces adjacent regions of SACKed skbs. When doing this, it preserves the sum of their packet counts in order to reflect the real-world dynamics on the wire. But given that skbs can have remainders that do not align to MSS boundaries, this packet count preservation means that for SACKed skbs there is not necessarily a direct linear relationship between tcp_skb_pcount(skb) and skb->len. 
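(A hypothetical illustration, with numbers chosen only for simplicity: with an MSS of 1000 bytes, two SACKed skbs of 1500 bytes each carry a pcount of 2 apiece; once they are coalesced, the merged skb has an skb->len of 3000 but a preserved pcount of 4, so a prefix length computed from the packet count, 4 * 1000 = 4000 bytes, overstates the bytes the skb actually holds.)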
Thus tcp_mark_head_lost()'s previous attempts to fragment off and mark as lost a prefix of length (packets - oldcnt)*mss from SACKed skbs were leading to occasional failures of the WARN_ON(len > skb->len) in tcp_fragment() (which used to be a BUG_ON(); see the recent "crash in tcp_fragment" thread on netdev). (2) there is no real point in fragmenting off part of a SACKed skb and calling tcp_skb_mark_lost() on it, since tcp_skb_mark_lost() is a NOP for SACKed skbs. Signed-off-by: Neal Cardwell Acked-by: Ilpo Järvinen Acked-by: Yuchung Cheng Acked-by: Nandita Dukkipati Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp_input.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8eb1302..ab9dcfb 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2549,6 +2549,7 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) if (cnt > packets) { if ((tcp_is_sack(tp) && !tcp_is_fack(tp)) || + (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) || (oldcnt >= packets)) break; -- cgit v1.1 From b77a726051a42d382ec1d20c49b3a60bf7dd54d8 Mon Sep 17 00:00:00 2001 From: Ulrich Weber Date: Mon, 5 Mar 2012 04:52:44 +0000 Subject: bridge: check return value of ipv6_dev_get_saddr() [ Upstream commit d1d81d4c3dd886d5fa25a2c4fa1e39cb89613712 ] otherwise source IPv6 address of ICMPV6_MGM_QUERY packet might be random junk if IPv6 is disabled on interface or link-local address is not yet ready (DAD). Signed-off-by: Ulrich Weber Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/bridge/br_multicast.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 995cbe0..c23a4b1 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -446,8 +446,11 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br, ip6h->nexthdr = IPPROTO_HOPOPTS; ip6h->hop_limit = 1; ipv6_addr_set(&ip6h->daddr, htonl(0xff020000), 0, 0, htonl(1)); - ipv6_dev_get_saddr(dev_net(br->dev), br->dev, &ip6h->daddr, 0, - &ip6h->saddr); + if (ipv6_dev_get_saddr(dev_net(br->dev), br->dev, &ip6h->daddr, 0, + &ip6h->saddr)) { + kfree_skb(skb); + return NULL; + } ipv6_eth_mc_map(&ip6h->daddr, eth->h_dest); hopopt = (u8 *)(ip6h + 1); -- cgit v1.1 From 619b6e476fdf85f17d80df72f647f2fb85535339 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Mon, 5 Mar 2012 19:35:04 +0000 Subject: tcp: fix tcp_shift_skb_data() to not shift SACKed data below snd_una [ Upstream commit 4648dc97af9d496218a05353b0e442b3dfa6aaab ] This commit fixes tcp_shift_skb_data() so that it does not shift SACKed data below snd_una. This fixes an issue whose symptoms exactly match reports showing tp->sacked_out going negative since 3.3.0-rc4 (see "WARNING: at net/ipv4/tcp_input.c:3418" thread on netdev). Since 2008 (832d11c5cd076abc0aa1eaf7be96c81d1a59ce41) tcp_shift_skb_data() had been shifting SACKed ranges that were below snd_una. It checked that the *end* of the skb it was about to shift from was above snd_una, but did not check that the end of the actual shifted range was above snd_una; this commit adds that check. Shifting SACKed ranges below snd_una is problematic because for such ranges tcp_sacktag_one() short-circuits: it does not declare anything as SACKed and does not increase sacked_out. 
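(Concretely, the short-circuit is the early return in tcp_sacktag_one() visible in the refactoring hunk earlier in this log: if (!after(end_seq, tp->snd_una)) return sacked; a range whose end does not lie beyond snd_una is never tagged and sacked_out is never incremented.)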
Before the fixes in commits cc9a672ee522d4805495b98680f4a3db5d0a0af9 and daef52bab1fd26e24e8e9578f8fb33ba1d0cb412, shifting SACKed ranges below snd_una happened to work because tcp_shifted_skb() was always (incorrectly) passing in to tcp_sacktag_one() an skb whose end_seq tcp_shift_skb_data() had already guaranteed was beyond snd_una. Hence tcp_sacktag_one() never short-circuited and always increased tp->sacked_out in this case. After those two fixes, my testing has verified that shifting SACKed ranges below snd_una could cause tp->sacked_out to go negative with the following sequence of events: (1) tcp_shift_skb_data() sees an skb whose end_seq is beyond snd_una, then shifts a prefix of that skb that is below snd_una (2) tcp_shifted_skb() increments the packet count of the already-SACKed prev sk_buff (3) tcp_sacktag_one() sees the end of the new SACKed range is below snd_una, so it short-circuits and doesn't increase tp->sacked_out (5) tcp_clean_rtx_queue() sees the SACKed skb has been ACKed, decrements tp->sacked_out by this "inflated" pcount that was missing a matching increase in tp->sacked_out, and hence tp->sacked_out underflows to a u32 like 0xFFFFFFFF, which casted to s32 is negative. (6) this leads to the warnings seen in the recent "WARNING: at net/ipv4/tcp_input.c:3418" thread on the netdev list; e.g.: tcp_input.c:3418 WARN_ON((int)tp->sacked_out < 0); More generally, I think this bug can be tickled in some cases where two or more ACKs from the receiver are lost and then a DSACK arrives that is immediately above an existing SACKed skb in the write queue. This fix changes tcp_shift_skb_data() to abort this sequence at step (1) in the scenario above by noticing that the bytes are below snd_una and not shifting them. Signed-off-by: Neal Cardwell Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp_input.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index ab9dcfb..72b1857 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1567,6 +1567,10 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, } } + /* tcp_sacktag_one() won't SACK-tag ranges below snd_una */ + if (!after(TCP_SKB_CB(skb)->seq + len, tp->snd_una)) + goto fallback; + if (!skb_shift(prev, skb, len)) goto fallback; if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss, dup_sack)) -- cgit v1.1 From 94962718da9d6df11a3a30511016707e4e9451dc Mon Sep 17 00:00:00 2001 From: Li Wei Date: Mon, 5 Mar 2012 14:45:17 +0000 Subject: IPv6: Fix not join all-router mcast group when forwarding set. [ Upstream commit d6ddef9e641d1229d4ec841dc75ae703171c3e92 ] When forwarding was set and a new net device is register, we need add this device to the all-router mcast group. Signed-off-by: Li Wei Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv6/addrconf.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 0f335c6..be29337 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -433,6 +433,10 @@ static struct inet6_dev * ipv6_add_dev(struct net_device *dev) /* Join all-node multicast group */ ipv6_dev_mc_inc(dev, &in6addr_linklocal_allnodes); + /* Join all-router multicast group if forwarding is set */ + if (ndev->cnf.forwarding && dev && (dev->flags & IFF_MULTICAST)) + ipv6_dev_mc_inc(dev, &in6addr_linklocal_allrouters); + return ndev; } -- cgit v1.1 From 137a954db947096bd9378ff5a6a77336231f4a90 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 10 Mar 2012 09:20:21 +0000 Subject: tcp: fix syncookie regression [ Upstream commit dfd25ffffc132c00070eed64200e8950da5d7e9d ] commit ea4fc0d619 (ipv4: Don't use rt->rt_{src,dst} in ip_queue_xmit()) added a serious regression on synflood handling. Simon Kirby discovered a successful connection was delayed by 20 seconds before being responsive. In my tests, I discovered that xmit frames were lost, and needed ~4 retransmits and a socket dst rebuild before being really sent. In case of syncookie initiated connection, we use a different path to initialize the socket dst, and inet->cork.fl.u.ip4 is left cleared. As ip_queue_xmit() now depends on inet flow being setup, fix this by copying the temp flowi4 we use in cookie_v4_check(). Reported-by: Simon Kirby Bisected-by: Simon Kirby Signed-off-by: Eric Dumazet Tested-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/syncookies.c | 30 ++++++++++++++++-------------- net/ipv4/tcp_ipv4.c | 10 +++++++--- 2 files changed, 23 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 4382629..895f215 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -277,6 +277,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, struct rtable *rt; __u8 rcv_wscale; bool ecn_ok = false; + struct flowi4 fl4; if (!sysctl_tcp_syncookies || !th->ack || th->rst) goto out; @@ -344,20 +345,16 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, * hasn't changed since we received the original syn, but I see * no easy way to do this. */ - { - struct flowi4 fl4; - - flowi4_init_output(&fl4, 0, sk->sk_mark, RT_CONN_FLAGS(sk), - RT_SCOPE_UNIVERSE, IPPROTO_TCP, - inet_sk_flowi_flags(sk), - (opt && opt->srr) ? opt->faddr : ireq->rmt_addr, - ireq->loc_addr, th->source, th->dest); - security_req_classify_flow(req, flowi4_to_flowi(&fl4)); - rt = ip_route_output_key(sock_net(sk), &fl4); - if (IS_ERR(rt)) { - reqsk_free(req); - goto out; - } + flowi4_init_output(&fl4, 0, sk->sk_mark, RT_CONN_FLAGS(sk), + RT_SCOPE_UNIVERSE, IPPROTO_TCP, + inet_sk_flowi_flags(sk), + (opt && opt->srr) ? opt->faddr : ireq->rmt_addr, + ireq->loc_addr, th->source, th->dest); + security_req_classify_flow(req, flowi4_to_flowi(&fl4)); + rt = ip_route_output_key(sock_net(sk), &fl4); + if (IS_ERR(rt)) { + reqsk_free(req); + goto out; } /* Try to redo what tcp_v4_send_synack did. 
*/ @@ -371,5 +368,10 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, ireq->rcv_wscale = rcv_wscale; ret = get_cookie_sock(sk, skb, req, &rt->dst); + /* ip_queue_xmit() depends on our flow being setup + * Normal sockets get it right from inet_csk_route_child_sock() + */ + if (ret) + inet_sk(ret)->cork.fl.u.ip4 = fl4; out: return ret; } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 04c6592..53a5af6 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1454,9 +1454,13 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; newinet->inet_id = newtp->write_seq ^ jiffies; - if (!dst && (dst = inet_csk_route_child_sock(sk, newsk, req)) == NULL) - goto put_and_exit; - + if (!dst) { + dst = inet_csk_route_child_sock(sk, newsk, req); + if (!dst) + goto put_and_exit; + } else { + /* syncookie case : see end of cookie_v4_check() */ + } sk_setup_caps(newsk, dst); tcp_mtup_init(newsk); -- cgit v1.1 From 97490c46fea06ecf95f989789e72c54fba2e2584 Mon Sep 17 00:00:00 2001 From: "RongQing.Li" Date: Thu, 15 Mar 2012 22:54:14 +0000 Subject: ipv6: Don't dev_hold(dev) in ip6_mc_find_dev_rcu. [ Upstream commit c577923756b7fe9071f28a76b66b83b306d1d001 ] ip6_mc_find_dev_rcu() is called with rcu_read_lock(), so don't need to dev_hold(). With dev_hold(), not corresponding dev_put(), will lead to leak. [ bug introduced in 96b52e61be1 (ipv6: mcast: RCU conversions) ] Signed-off-by: RongQing.Li Acked-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv6/mcast.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index ee7839f..2257366 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -257,7 +257,6 @@ static struct inet6_dev *ip6_mc_find_dev_rcu(struct net *net, if (rt) { dev = rt->rt6i_dev; - dev_hold(dev); dst_release(&rt->dst); } } else -- cgit v1.1 From 77d77ab09b1b57cf3cc30d0dbdf8c55137146a8f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 19 Mar 2012 13:39:35 -0400 Subject: SUNRPC: We must not use list_for_each_entry_safe() in rpc_wake_up() commit 540a0f7584169651f485e8ab67461fcb06934e38 upstream. The problem is that for the case of priority queues, we have to assume that __rpc_remove_wait_queue_priority will move new elements from the tk_wait.links lists into the queue->tasks[] list. We therefore cannot use list_for_each_entry_safe() on queue->tasks[], since that will skip these new tasks that __rpc_remove_wait_queue_priority is adding. Without this fix, rpc_wake_up and rpc_wake_up_status will both fail to wake up all functions on priority wait queues, which can result in some nasty hangs. 
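A minimal sketch of the loop shape the fix below adopts (queue->lock is assumed to be held, as in the real functions): re-read the list head on every iteration instead of caching a next pointer, so tasks that __rpc_remove_wait_queue_priority splices onto the list are not skipped:

    while (!list_empty(head)) {
        struct rpc_task *task;

        task = list_first_entry(head, struct rpc_task, u.tk_wait.list);
        rpc_wake_up_task_queue_locked(queue, task);
    }

Each wake-up removes the task from the queue, so the loop terminates once the priority level is drained.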
Reported-by: Andy Adamson Signed-off-by: Trond Myklebust Signed-off-by: Greg Kroah-Hartman --- net/sunrpc/sched.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 4814e24..b6bb225 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -480,14 +480,18 @@ EXPORT_SYMBOL_GPL(rpc_wake_up_next); */ void rpc_wake_up(struct rpc_wait_queue *queue) { - struct rpc_task *task, *next; struct list_head *head; spin_lock_bh(&queue->lock); head = &queue->tasks[queue->maxpriority]; for (;;) { - list_for_each_entry_safe(task, next, head, u.tk_wait.list) + while (!list_empty(head)) { + struct rpc_task *task; + task = list_first_entry(head, + struct rpc_task, + u.tk_wait.list); rpc_wake_up_task_queue_locked(queue, task); + } if (head == &queue->tasks[0]) break; head--; @@ -505,13 +509,16 @@ EXPORT_SYMBOL_GPL(rpc_wake_up); */ void rpc_wake_up_status(struct rpc_wait_queue *queue, int status) { - struct rpc_task *task, *next; struct list_head *head; spin_lock_bh(&queue->lock); head = &queue->tasks[queue->maxpriority]; for (;;) { - list_for_each_entry_safe(task, next, head, u.tk_wait.list) { + while (!list_empty(head)) { + struct rpc_task *task; + task = list_first_entry(head, + struct rpc_task, + u.tk_wait.list); task->tk_status = status; rpc_wake_up_task_queue_locked(queue, task); } -- cgit v1.1 From 6a26d49c67b852cfd9144f86f628f57c8ca00977 Mon Sep 17 00:00:00 2001 From: Benjamin LaHaise Date: Tue, 20 Mar 2012 03:57:54 +0000 Subject: Fix pppol2tp getsockname() [ Upstream commit bbdb32cb5b73597386913d052165423b9d736145 ] While testing L2TP functionality, I came across a bug in getsockname(). The IP address returned within the pppol2tp_addr's addr memember was not being set to the IP address in use. This bug is caused by using inet_sk() on the wrong socket (the L2TP socket rather than the underlying UDP socket), and was likely introduced during the addition of L2TPv3 support. Signed-off-by: Benjamin LaHaise Signed-off-by: James Chapman Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/l2tp/l2tp_ppp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 39a21d0..13f9868 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -908,7 +908,7 @@ static int pppol2tp_getname(struct socket *sock, struct sockaddr *uaddr, goto end_put_sess; } - inet = inet_sk(sk); + inet = inet_sk(tunnel->sock); if (tunnel->version == 2) { struct sockaddr_pppol2tp sp; len = sizeof(sp); -- cgit v1.1 From e033155d0b495becfdc28f46ad20089dc8a13060 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 27 Mar 2012 09:53:52 +0000 Subject: net: fix a potential rcu_read_lock() imbalance in rt6_fill_node() [ Upstream commit 94f826b8076e2cb92242061e92f21b5baa3eccc2 ] Commit f2c31e32b378 (net: fix NULL dereferences in check_peer_redir() ) added a regression in rt6_fill_node(), leading to rcu_read_lock() imbalance. Thats because NLA_PUT() can make a jump to nla_put_failure label. Fix this by using nla_put() Many thanks to Ben Greear for his help Reported-by: Ben Greear Reported-by: Dave Jones Signed-off-by: Eric Dumazet Tested-by: Ben Greear Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv6/route.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index e70e902..8e600f8 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2411,8 +2411,12 @@ static int rt6_fill_node(struct net *net, rcu_read_lock(); n = dst_get_neighbour(&rt->dst); - if (n) - NLA_PUT(skb, RTA_GATEWAY, 16, &n->primary_key); + if (n) { + if (nla_put(skb, RTA_GATEWAY, 16, &n->primary_key) < 0) { + rcu_read_unlock(); + goto nla_put_failure; + } + } rcu_read_unlock(); if (rt->dst.dev) -- cgit v1.1 From 99b8230daca979e0c0b988cfb80566791354a077 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 21 Mar 2012 06:58:03 +0000 Subject: net: fix napi_reuse_skb() skb reserve [ Upstream commit 2a2a459eeeff48640dc557548ce576d666ab06ed ] napi->skb is allocated in napi_get_frags() using netdev_alloc_skb_ip_align(), with a reserve of NET_SKB_PAD + NET_IP_ALIGN bytes. However, when such skb is recycled in napi_reuse_skb(), it ends with a reserve of NET_IP_ALIGN which is suboptimal. Signed-off-by: Eric Dumazet Cc: Herbert Xu Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/core/dev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 17fdbf8..f134f88 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3504,7 +3504,8 @@ EXPORT_SYMBOL(napi_gro_receive); static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) { __skb_pull(skb, skb_headlen(skb)); - skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb)); + /* restore the reserve we had after netdev_alloc_skb_ip_align() */ + skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb)); skb->vlan_tci = 0; skb->dev = napi->dev; skb->skb_iif = 0; -- cgit v1.1 From 276b5b3b43807632637cdd160eba2c5facb7e14a Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Mon, 19 Mar 2012 13:01:07 +0000 Subject: Remove printk from rds_sendmsg [ Upstream commit a6506e1486181975d318344143aca722b2b91621 ] no socket layer outputs a message for this error and neither should rds. Signed-off-by: Dave Jones Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/rds/send.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/rds/send.c b/net/rds/send.c index d58ae5f..c803341 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -932,7 +932,6 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, /* Mirror Linux UDP mirror of BSD error message compatibility */ /* XXX: Perhaps MSG_MORE someday */ if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_CMSG_COMPAT)) { - printk(KERN_INFO "msg_flags 0x%08X\n", msg->msg_flags); ret = -EOPNOTSUPP; goto out; } -- cgit v1.1 From 096e4eafb36436b6ce6e8dc372bc5f6e11804e68 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Wed, 21 Mar 2012 23:36:13 +0000 Subject: xfrm: Access the replay notify functions via the registered callbacks [ Upstream commit 1265fd616782ef03b98fd19f65c2b47fcd4ea11f ] We call the wrong replay notify function when we use ESN replay handling. This leads to the fact that we don't send notifications if we use ESN. Fix this by calling the registered callbacks instead of xfrm_replay_notify(). Signed-off-by: Steffen Klassert Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman --- net/xfrm/xfrm_replay.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c index b11ea69..3235023 100644 --- a/net/xfrm/xfrm_replay.c +++ b/net/xfrm/xfrm_replay.c @@ -166,7 +166,7 @@ static void xfrm_replay_advance(struct xfrm_state *x, __be32 net_seq) } if (xfrm_aevent_is_on(xs_net(x))) - xfrm_replay_notify(x, XFRM_REPLAY_UPDATE); + x->repl->notify(x, XFRM_REPLAY_UPDATE); } static int xfrm_replay_overflow_bmp(struct xfrm_state *x, struct sk_buff *skb) @@ -293,7 +293,7 @@ static void xfrm_replay_advance_bmp(struct xfrm_state *x, __be32 net_seq) } if (xfrm_aevent_is_on(xs_net(x))) - xfrm_replay_notify(x, XFRM_REPLAY_UPDATE); + x->repl->notify(x, XFRM_REPLAY_UPDATE); } static void xfrm_replay_notify_bmp(struct xfrm_state *x, int event) @@ -502,7 +502,7 @@ static void xfrm_replay_advance_esn(struct xfrm_state *x, __be32 net_seq) } if (xfrm_aevent_is_on(xs_net(x))) - xfrm_replay_notify(x, XFRM_REPLAY_UPDATE); + x->repl->notify(x, XFRM_REPLAY_UPDATE); } static struct xfrm_replay xfrm_replay_legacy = { -- cgit v1.1 From 8bb8ebe7b77228209006ea945104e37294608b93 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 18 Jan 2012 12:56:02 +0300 Subject: nfsd: don't allow zero length strings in cache_parse() commit 6d8d17499810479eabd10731179c04b2ca22152f upstream. There is no point in passing a zero length string here and quite a few of that cache_parse() implementations will Oops if count is zero. Signed-off-by: Dan Carpenter Signed-off-by: J. Bruce Fields Signed-off-by: Greg Kroah-Hartman --- net/sunrpc/cache.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 72ad836..4530a91 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -828,6 +828,8 @@ static ssize_t cache_do_downcall(char *kaddr, const char __user *buf, { ssize_t ret; + if (count == 0) + return -EINVAL; if (copy_from_user(kaddr, buf, count)) return -EFAULT; kaddr[count] = '\0'; -- cgit v1.1 From cea90bebaab0e962cf9a954ef41cc1893da7b3bf Mon Sep 17 00:00:00 2001 From: "danborkmann@iogearbox.net" Date: Tue, 27 Mar 2012 22:47:43 +0000 Subject: rose_dev: fix memcpy-bug in rose_set_mac_address [ Upstream commit 81213b5e8ae68e204aa7a3f83c4f9100405dbff9 ] If both addresses equal, nothing needs to be done. If the device is down, then we simply copy the new address to dev->dev_addr. If the device is up, then we add another loopback device with the new address, and if that does not fail, we remove the loopback device with the old address. And only then, we update the dev->dev_addr. Signed-off-by: Daniel Borkmann Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman --- net/rose/rose_dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/rose/rose_dev.c b/net/rose/rose_dev.c index 178ff4f..2679507 100644 --- a/net/rose/rose_dev.c +++ b/net/rose/rose_dev.c @@ -96,11 +96,11 @@ static int rose_set_mac_address(struct net_device *dev, void *addr) struct sockaddr *sa = addr; int err; - if (!memcpy(dev->dev_addr, sa->sa_data, dev->addr_len)) + if (!memcmp(dev->dev_addr, sa->sa_data, dev->addr_len)) return 0; if (dev->flags & IFF_UP) { - err = rose_add_loopback_node((rose_address *)dev->dev_addr); + err = rose_add_loopback_node((rose_address *)sa->sa_data); if (err) return err; -- cgit v1.1 From eb221774b352966c562b5c92a28d01ddc1bc4393 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Mon, 19 Mar 2012 16:00:26 +0100 Subject: mac80211: fix possible tid_rx->reorder_timer use after free commit d72308bff5c2fa207949a5925b020bce74495e33 upstream. Is possible that we will arm the tid_rx->reorder_timer after del_timer_sync() in ___ieee80211_stop_rx_ba_session(). We need to stop timer after RCU grace period finish, so move it to ieee80211_free_tid_rx(). Timer will not be armed again, as rcu_dereference(sta->ampdu_mlme.tid_rx[tid]) will return NULL. Debug object detected problem with the following warning: ODEBUG: free active (active state 0) object type: timer_list hint: sta_rx_agg_reorder_timer_expired+0x0/0xf0 [mac80211] Bug report (with all warning messages): https://bugzilla.redhat.com/show_bug.cgi?id=804007 Reported-by: "jan p. springer" Signed-off-by: Stanislaw Gruszka Signed-off-by: John W. Linville Signed-off-by: Greg Kroah-Hartman --- net/mac80211/agg-rx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index 9c0d76c..1a41b14 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -48,6 +48,8 @@ static void ieee80211_free_tid_rx(struct rcu_head *h) container_of(h, struct tid_ampdu_rx, rcu_head); int i; + del_timer_sync(&tid_rx->reorder_timer); + for (i = 0; i < tid_rx->buf_size; i++) dev_kfree_skb(tid_rx->reorder_buf[i]); kfree(tid_rx->reorder_buf); @@ -87,7 +89,6 @@ void ___ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid, tid, 0, reason); del_timer_sync(&tid_rx->session_timer); - del_timer_sync(&tid_rx->reorder_timer); call_rcu(&tid_rx->rcu_head, ieee80211_free_tid_rx); } -- cgit v1.1 From 073c4aec557b87d0c5b6f8b6b1910b356088eaf3 Mon Sep 17 00:00:00 2001 From: Peter Hurley Date: Mon, 2 Apr 2012 13:44:56 +0200 Subject: Bluetooth: Fix l2cap conn failures for ssp devices commit 18daf1644e634bae951a6e3d4d19d89170209762 upstream Commit 330605423c fixed l2cap conn establishment for non-ssp remote devices by not setting HCI_CONN_ENCRYPT_PEND every time conn security is tested (which was always returning failure on any subsequent security checks). However, this broke l2cap conn establishment for ssp remote devices when an ACL link was already established at SDP-level security. This fix ensures that encryption must be pending whenever authentication is also pending. 
Signed-off-by: Peter Hurley Tested-by: Daniel Wagner Acked-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/hci_conn.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index bcd158f..4bb16b8 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -548,6 +548,10 @@ static int hci_conn_auth(struct hci_conn *conn, __u8 sec_level, __u8 auth_type) if (!test_and_set_bit(HCI_CONN_AUTH_PEND, &conn->pend)) { struct hci_cp_auth_requested cp; + + /* encrypt must be pending if auth is also pending */ + set_bit(HCI_CONN_ENCRYPT_PEND, &conn->pend); + cp.handle = cpu_to_le16(conn->handle); hci_send_cmd(conn->hdev, HCI_OP_AUTH_REQUESTED, sizeof(cp), &cp); -- cgit v1.1 From c1a658c9440a201f46a367a4da6e0bd99c9beea4 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Thu, 15 Mar 2012 14:48:41 +0100 Subject: Bluetooth: hci_core: fix NULL-pointer dereference at unregister commit 94324962066231a938564bebad0f941cd2d06bb2 upstream. Make sure hci_dev_open returns immediately if hci_dev_unregister has been called. This fixes a race between hci_dev_open and hci_dev_unregister which can lead to a NULL-pointer dereference. Bug is 100% reproducible using hciattach and a disconnected serial port: 0. # hciattach -n /dev/ttyO1 any noflow 1. hci_dev_open called from hci_power_on grabs req lock 2. hci_init_req executes but device fails to initialise (times out eventually) 3. hci_dev_open is called from hci_sock_ioctl and sleeps on req lock 4. hci_uart_tty_close calls hci_dev_unregister and sleeps on req lock in hci_dev_do_close 5. hci_dev_open (1) releases req lock 6. hci_dev_do_close grabs req lock and returns as device is not up 7. hci_dev_unregister sleeps in destroy_workqueue 8. hci_dev_open (3) grabs req lock, calls hci_init_req and eventually sleeps 9. hci_dev_unregister finishes, while hci_dev_open is still running... [ 79.627136] INFO: trying to register non-static key. [ 79.632354] the code is fine but needs lockdep annotation. [ 79.638122] turning off the locking correctness validator. 
[ 79.643920] [] (unwind_backtrace+0x0/0xf8) from [] (__lock_acquire+0x1590/0x1ab0) [ 79.653594] [] (__lock_acquire+0x1590/0x1ab0) from [] (lock_acquire+0x9c/0x128) [ 79.663085] [] (lock_acquire+0x9c/0x128) from [] (run_timer_softirq+0x150/0x3ac) [ 79.672668] [] (run_timer_softirq+0x150/0x3ac) from [] (__do_softirq+0xd4/0x22c) [ 79.682281] [] (__do_softirq+0xd4/0x22c) from [] (irq_exit+0x8c/0x94) [ 79.690856] [] (irq_exit+0x8c/0x94) from [] (handle_IRQ+0x34/0x84) [ 79.699157] [] (handle_IRQ+0x34/0x84) from [] (omap3_intc_handle_irq+0x48/0x4c) [ 79.708648] [] (omap3_intc_handle_irq+0x48/0x4c) from [] (__irq_usr+0x3c/0x60) [ 79.718048] Exception stack(0xcf281fb0 to 0xcf281ff8) [ 79.723358] 1fa0: 0001e6a0 be8dab00 0001e698 00036698 [ 79.731933] 1fc0: 0002df98 0002df38 0000001f 00000000 b6f234d0 00000000 00000004 00000000 [ 79.740509] 1fe0: 0001e6f8 be8d6aa0 be8dac50 0000aab8 80000010 ffffffff [ 79.747497] Unable to handle kernel NULL pointer dereference at virtual address 00000000 [ 79.756011] pgd = cf3b4000 [ 79.758850] [00000000] *pgd=8f0c7831, *pte=00000000, *ppte=00000000 [ 79.765502] Internal error: Oops: 80000007 [#1] [ 79.770294] Modules linked in: [ 79.773529] CPU: 0 Tainted: G W (3.3.0-rc6-00002-gb5d5c87 #421) [ 79.781066] PC is at 0x0 [ 79.783721] LR is at run_timer_softirq+0x16c/0x3ac [ 79.788787] pc : [<00000000>] lr : [] psr: 60000113 [ 79.788787] sp : cf281ee0 ip : 00000000 fp : cf280000 [ 79.800903] r10: 00000004 r9 : 00000100 r8 : b6f234d0 [ 79.806427] r7 : c0519c28 r6 : cf093488 r5 : c0561a00 r4 : 00000000 [ 79.813323] r3 : 00000000 r2 : c054eee0 r1 : 00000001 r0 : 00000000 [ 79.820190] Flags: nZCv IRQs on FIQs on Mode SVC_32 ISA ARM Segment user [ 79.827728] Control: 10c5387d Table: 8f3b4019 DAC: 00000015 [ 79.833801] Process gpsd (pid: 1265, stack limit = 0xcf2802e8) [ 79.839965] Stack: (0xcf281ee0 to 0xcf282000) [ 79.844573] 1ee0: 00000002 00000000 c0040a24 00000000 00000002 cf281f08 00200200 00000000 [ 79.853210] 1f00: 00000000 cf281f18 cf281f08 00000000 00000000 00000000 cf281f18 cf281f18 [ 79.861816] 1f20: 00000000 00000001 c056184c 00000000 00000001 b6f234d0 c0561848 00000004 [ 79.870452] 1f40: cf280000 c003a3b8 c051e79c 00000001 00000000 00000100 3fa9e7b8 0000000a [ 79.879089] 1f60: 00000025 cf280000 00000025 00000000 00000000 b6f234d0 00000000 00000004 [ 79.887756] 1f80: 00000000 c003a924 c053ad38 c0013a50 fa200000 cf281fb0 ffffffff c0008530 [ 79.896362] 1fa0: 0001e6a0 0000aab8 80000010 c037499c 0001e6a0 be8dab00 0001e698 00036698 [ 79.904998] 1fc0: 0002df98 0002df38 0000001f 00000000 b6f234d0 00000000 00000004 00000000 [ 79.913665] 1fe0: 0001e6f8 be8d6aa0 be8dac50 0000aab8 80000010 ffffffff 00fbf700 04ffff00 [ 79.922302] [] (run_timer_softirq+0x16c/0x3ac) from [] (__do_softirq+0xd4/0x22c) [ 79.931945] [] (__do_softirq+0xd4/0x22c) from [] (irq_exit+0x8c/0x94) [ 79.940582] [] (irq_exit+0x8c/0x94) from [] (handle_IRQ+0x34/0x84) [ 79.948913] [] (handle_IRQ+0x34/0x84) from [] (omap3_intc_handle_irq+0x48/0x4c) [ 79.958404] [] (omap3_intc_handle_irq+0x48/0x4c) from [] (__irq_usr+0x3c/0x60) [ 79.967773] Exception stack(0xcf281fb0 to 0xcf281ff8) [ 79.973083] 1fa0: 0001e6a0 be8dab00 0001e698 00036698 [ 79.981658] 1fc0: 0002df98 0002df38 0000001f 00000000 b6f234d0 00000000 00000004 00000000 [ 79.990234] 1fe0: 0001e6f8 be8d6aa0 be8dac50 0000aab8 80000010 ffffffff [ 79.997161] Code: bad PC value [ 80.000396] ---[ end trace 6f6739840475f9ee ]--- [ 80.005279] Kernel panic - not syncing: Fatal exception in interrupt Signed-off-by: Johan Hovold Acked-by: Marcel Holtmann 
Signed-off-by: Johan Hedberg Signed-off-by: Greg Kroah-Hartman --- net/bluetooth/hci_core.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net') diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 815269b..cb9cb48 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -509,6 +509,11 @@ int hci_dev_open(__u16 dev) hci_req_lock(hdev); + if (test_bit(HCI_UNREGISTER, &hdev->flags)) { + ret = -ENODEV; + goto done; + } + if (hdev->rfkill && rfkill_blocked(hdev->rfkill)) { ret = -ERFKILL; goto done; @@ -1317,6 +1322,8 @@ int hci_unregister_dev(struct hci_dev *hdev) BT_DBG("%p name %s bus %d", hdev, hdev->name, hdev->bus); + set_bit(HCI_UNREGISTER, &hdev->flags); + write_lock_bh(&hci_dev_list_lock); list_del(&hdev->list); write_unlock_bh(&hci_dev_list_lock); -- cgit v1.1 From 027e5d441e6a46ab6cfd5f0e27c5eebd533b8131 Mon Sep 17 00:00:00 2001 From: Lukasz Kucharczyk Date: Wed, 11 Apr 2012 14:55:10 +0200 Subject: cfg80211: fix interface combinations check. commit e55a4046dab28c440c96890bdddcf02dc8981f2d upstream. Signed-off-by: Lukasz Kucharczyk Signed-off-by: John W. Linville Signed-off-by: Greg Kroah-Hartman --- net/wireless/util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/util.c b/net/wireless/util.c index 4d7b83f..9c22330 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -990,7 +990,7 @@ int cfg80211_can_change_interface(struct cfg80211_registered_device *rdev, if (rdev->wiphy.software_iftypes & BIT(iftype)) continue; for (j = 0; j < c->n_limits; j++) { - if (!(limits[j].types & iftype)) + if (!(limits[j].types & BIT(iftype))) continue; if (limits[j].max < num[iftype]) goto cont; -- cgit v1.1 From 8d2228dd95c656e5fc9af2e8776f9c95e269806f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 24 Apr 2012 22:12:06 -0400 Subject: tcp: allow splice() to build full TSO packets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ This combines upstream commit 2f53384424251c06038ae612e56231b96ab610ee and the follow-on bug fix commit 35f9c09fe9c72eb8ca2b8e89a593e1c151f28fc2 ] vmsplice()/splice(pipe, socket) call do_tcp_sendpages() one page at a time, adding at most 4096 bytes to an skb. (assuming PAGE_SIZE=4096) The call to tcp_push() at the end of do_tcp_sendpages() forces an immediate xmit when pipe is not already filled, and tso_fragment() try to split these skb to MSS multiples. 4096 bytes are usually split in a skb with 2 MSS, and a remaining sub-mss skb (assuming MTU=1500) This makes slow start suboptimal because many small frames are sent to qdisc/driver layers instead of big ones (constrained by cwnd and packets in flight of course) In fact, applications using sendmsg() (adding an additional memory copy) instead of vmsplice()/splice()/sendfile() are a bit faster because of this anomaly, especially if serving small files in environments with large initial [c]wnd. Call tcp_push() only if MSG_MORE is not set in the flags parameter. This bit is automatically provided by splice() internals but for the last page, or on all pages if user specified SPLICE_F_MORE splice() flag. In some workloads, this can reduce number of sent logical packets by an order of magnitude, making zero-copy TCP actually faster than one-copy :) Reported-by: Tom Herbert Cc: Nandita Dukkipati Cc: Neal Cardwell Cc: Tom Herbert Cc: Yuchung Cheng Cc: H.K. Jerry Chu Cc: Maciej Żenczykowski Cc: Mahesh Bandewar Cc: Ilpo Järvinen Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp.c | 2 +- net/socket.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 46febca..80b988f 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -860,7 +860,7 @@ wait_for_memory: } out: - if (copied) + if (copied && !(flags & MSG_SENDPAGE_NOTLAST)) tcp_push(sk, flags, mss_now, tp->nonagle); return copied; diff --git a/net/socket.c b/net/socket.c index 1ad42d3..cf41afc 100644 --- a/net/socket.c +++ b/net/socket.c @@ -791,9 +791,9 @@ static ssize_t sock_sendpage(struct file *file, struct page *page, sock = file->private_data; - flags = !(file->f_flags & O_NONBLOCK) ? 0 : MSG_DONTWAIT; - if (more) - flags |= MSG_MORE; + flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0; + /* more is a combination of MSG_MORE and MSG_SENDPAGE_NOTLAST */ + flags |= more; return kernel_sendpage(sock, page, offset, size, flags); } -- cgit v1.1 From 3109ea06da8538ee3ded3489b26065a3be32f360 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 3 Apr 2012 22:17:53 +0000 Subject: sctp: Allow struct sctp_event_subscribe to grow without breaking binaries [ Upstream commit acdd5985364f8dc511a0762fab2e683f29d9d692 ] getsockopt(..., SCTP_EVENTS, ...) performs a length check and returns an error if the user provides less bytes than the size of struct sctp_event_subscribe. Struct sctp_event_subscribe needs to be extended by an u8 for every new event or notification type that is added. This obviously makes getsockopt fail for binaries that are compiled against an older versions of which do not contain all event types. This patch changes getsockopt behaviour to no longer return an error if not enough bytes are being provided by the user. Instead, it returns as much of sctp_event_subscribe as fits into the provided buffer. This leads to the new behavior that users see what they have been aware of at compile time. The setsockopt(..., SCTP_EVENTS, ...) API is already behaving like this. Signed-off-by: Thomas Graf Acked-by: Vlad Yasevich Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/sctp/socket.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index fa9b5c7..4434853 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -4009,9 +4009,10 @@ static int sctp_getsockopt_disable_fragments(struct sock *sk, int len, static int sctp_getsockopt_events(struct sock *sk, int len, char __user *optval, int __user *optlen) { - if (len < sizeof(struct sctp_event_subscribe)) + if (len <= 0) return -EINVAL; - len = sizeof(struct sctp_event_subscribe); + if (len > sizeof(struct sctp_event_subscribe)) + len = sizeof(struct sctp_event_subscribe); if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &sctp_sk(sk)->subscribe, len)) -- cgit v1.1 From 4baf6fcf14c1c8e228dbf73bc3b5393c80ce6065 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 4 Apr 2012 01:01:20 +0000 Subject: bridge: Do not send queries on multicast group leaves [ Upstream commit 996304bbea3d2a094b7ba54c3bd65d3fffeac57b ] As it stands the bridge IGMP snooping system will respond to group leave messages with queries for remaining membership. This is both unnecessary and undesirable. First of all any multicast routers present should be doing this rather than us. What's more the queries that we send may end up upsetting other multicast snooping swithces in the system that are buggy. 
In fact, we can simply remove the code that send these queries because the existing membership expiry mechanism doesn't rely on them anyway. So this patch simply removes all code associated with group queries in response to group leave messages. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/bridge/br_multicast.c | 81 ----------------------------------------------- net/bridge/br_private.h | 4 --- 2 files changed, 85 deletions(-) (limited to 'net') diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index c23a4b1..e78269d 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -241,7 +241,6 @@ static void br_multicast_group_expired(unsigned long data) hlist_del_rcu(&mp->hlist[mdb->ver]); mdb->size--; - del_timer(&mp->query_timer); call_rcu_bh(&mp->rcu, br_multicast_free_group); out: @@ -271,7 +270,6 @@ static void br_multicast_del_pg(struct net_bridge *br, rcu_assign_pointer(*pp, p->next); hlist_del_init(&p->mglist); del_timer(&p->timer); - del_timer(&p->query_timer); call_rcu_bh(&p->rcu, br_multicast_free_pg); if (!mp->ports && !mp->mglist && @@ -507,74 +505,6 @@ static struct sk_buff *br_multicast_alloc_query(struct net_bridge *br, return NULL; } -static void br_multicast_send_group_query(struct net_bridge_mdb_entry *mp) -{ - struct net_bridge *br = mp->br; - struct sk_buff *skb; - - skb = br_multicast_alloc_query(br, &mp->addr); - if (!skb) - goto timer; - - netif_rx(skb); - -timer: - if (++mp->queries_sent < br->multicast_last_member_count) - mod_timer(&mp->query_timer, - jiffies + br->multicast_last_member_interval); -} - -static void br_multicast_group_query_expired(unsigned long data) -{ - struct net_bridge_mdb_entry *mp = (void *)data; - struct net_bridge *br = mp->br; - - spin_lock(&br->multicast_lock); - if (!netif_running(br->dev) || !mp->mglist || - mp->queries_sent >= br->multicast_last_member_count) - goto out; - - br_multicast_send_group_query(mp); - -out: - spin_unlock(&br->multicast_lock); -} - -static void br_multicast_send_port_group_query(struct net_bridge_port_group *pg) -{ - struct net_bridge_port *port = pg->port; - struct net_bridge *br = port->br; - struct sk_buff *skb; - - skb = br_multicast_alloc_query(br, &pg->addr); - if (!skb) - goto timer; - - br_deliver(port, skb); - -timer: - if (++pg->queries_sent < br->multicast_last_member_count) - mod_timer(&pg->query_timer, - jiffies + br->multicast_last_member_interval); -} - -static void br_multicast_port_group_query_expired(unsigned long data) -{ - struct net_bridge_port_group *pg = (void *)data; - struct net_bridge_port *port = pg->port; - struct net_bridge *br = port->br; - - spin_lock(&br->multicast_lock); - if (!netif_running(br->dev) || hlist_unhashed(&pg->mglist) || - pg->queries_sent >= br->multicast_last_member_count) - goto out; - - br_multicast_send_port_group_query(pg); - -out: - spin_unlock(&br->multicast_lock); -} - static struct net_bridge_mdb_entry *br_multicast_get_group( struct net_bridge *br, struct net_bridge_port *port, struct br_ip *group, int hash) @@ -690,8 +620,6 @@ rehash: mp->addr = *group; setup_timer(&mp->timer, br_multicast_group_expired, (unsigned long)mp); - setup_timer(&mp->query_timer, br_multicast_group_query_expired, - (unsigned long)mp); hlist_add_head_rcu(&mp->hlist[mdb->ver], &mdb->mhash[hash]); mdb->size++; @@ -746,8 +674,6 @@ static int br_multicast_add_group(struct net_bridge *br, hlist_add_head(&p->mglist, &port->mglist); setup_timer(&p->timer, br_multicast_port_group_expired, (unsigned 
long)p); - setup_timer(&p->query_timer, br_multicast_port_group_query_expired, - (unsigned long)p); rcu_assign_pointer(*pp, p); @@ -1291,9 +1217,6 @@ static void br_multicast_leave_group(struct net_bridge *br, time_after(mp->timer.expires, time) : try_to_del_timer_sync(&mp->timer) >= 0)) { mod_timer(&mp->timer, time); - - mp->queries_sent = 0; - mod_timer(&mp->query_timer, now); } goto out; @@ -1310,9 +1233,6 @@ static void br_multicast_leave_group(struct net_bridge *br, time_after(p->timer.expires, time) : try_to_del_timer_sync(&p->timer) >= 0)) { mod_timer(&p->timer, time); - - p->queries_sent = 0; - mod_timer(&p->query_timer, now); } break; @@ -1678,7 +1598,6 @@ void br_multicast_stop(struct net_bridge *br) hlist_for_each_entry_safe(mp, p, n, &mdb->mhash[i], hlist[ver]) { del_timer(&mp->timer); - del_timer(&mp->query_timer); call_rcu_bh(&mp->rcu, br_multicast_free_group); } } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 857a021..1ca1b1c 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -77,9 +77,7 @@ struct net_bridge_port_group { struct hlist_node mglist; struct rcu_head rcu; struct timer_list timer; - struct timer_list query_timer; struct br_ip addr; - u32 queries_sent; }; struct net_bridge_mdb_entry @@ -89,10 +87,8 @@ struct net_bridge_mdb_entry struct net_bridge_port_group __rcu *ports; struct rcu_head rcu; struct timer_list timer; - struct timer_list query_timer; struct br_ip addr; bool mglist; - u32 queries_sent; }; struct net_bridge_mdb_htable -- cgit v1.1 From 93deb00abf9ccfcd66060760df11061cc0b9f030 Mon Sep 17 00:00:00 2001 From: "RongQing.Li" Date: Wed, 4 Apr 2012 16:47:04 +0000 Subject: ipv6: fix array index in ip6_mc_add_src() [ Upstream commit 78d50217baf36093ab320f95bae0d6452daec85c ] Convert the array index from the loop bound to the loop index. Also remove the void cast of the ip6_mc_del1_src() return value; it is unnecessary, since ip6_mc_del1_src() is not annotated with a __must_check-style attribute, so no compiler will warn when the cast is removed. v2: enrich the commit header Signed-off-by: RongQing.Li Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv6/mcast.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 2257366..f2d74ea 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -2054,7 +2054,7 @@ static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca, if (!delta) pmc->mca_sfcount[sfmode]--; for (j=0; j<i; j++) - (void) ip6_mc_del1_src(pmc, sfmode, &psfsrc[i]); + ip6_mc_del1_src(pmc, sfmode, &psfsrc[j]); } else if (isexclude != (pmc->mca_sfcount[MCAST_EXCLUDE] != 0)) { struct ip6_sf_list *psf; -- cgit v1.1 From 9acd6c30514c0801e94e1bb3fa4734e9e47ab560 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Thu, 5 Apr 2012 12:07:45 +0000 Subject: phonet: Check input from user before allocating MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit bcf1b70ac6eb0ed8286c66e6bf37cb747cbaa04c ] A phonet packet is limited to USHRT_MAX bytes; this is never checked during tx, which means that the user can specify any size he wishes, and the kernel will attempt to allocate that size. In the good case, it'll lead to the following warning, but it may also cause the kernel to invoke the OOM killer and kill a random task on the server.
[ 8921.744094] WARNING: at mm/page_alloc.c:2255 __alloc_pages_slowpath+0x65/0x730() [ 8921.749770] Pid: 5081, comm: trinity Tainted: G W 3.4.0-rc1-next-20120402-sasha #46 [ 8921.756672] Call Trace: [ 8921.758185] [] warn_slowpath_common+0x87/0xb0 [ 8921.762868] [] warn_slowpath_null+0x15/0x20 [ 8921.765399] [] __alloc_pages_slowpath+0x65/0x730 [ 8921.769226] [] ? zone_watermark_ok+0x1a/0x20 [ 8921.771686] [] ? get_page_from_freelist+0x625/0x660 [ 8921.773919] [] __alloc_pages_nodemask+0x1f8/0x240 [ 8921.776248] [] kmalloc_large_node+0x70/0xc0 [ 8921.778294] [] __kmalloc_node_track_caller+0x34/0x1c0 [ 8921.780847] [] ? sock_alloc_send_pskb+0xbc/0x260 [ 8921.783179] [] __alloc_skb+0x75/0x170 [ 8921.784971] [] sock_alloc_send_pskb+0xbc/0x260 [ 8921.787111] [] ? release_sock+0x7e/0x90 [ 8921.788973] [] sock_alloc_send_skb+0x10/0x20 [ 8921.791052] [] pep_sendmsg+0x60/0x380 [ 8921.792931] [] ? pn_socket_bind+0x156/0x180 [ 8921.794917] [] ? pn_socket_autobind+0x3f/0x90 [ 8921.797053] [] pn_socket_sendmsg+0x4f/0x70 [ 8921.798992] [] sock_aio_write+0x187/0x1b0 [ 8921.801395] [] ? sub_preempt_count+0xae/0xf0 [ 8921.803501] [] ? __lock_acquire+0x42c/0x4b0 [ 8921.805505] [] ? __sock_recv_ts_and_drops+0x140/0x140 [ 8921.807860] [] do_sync_readv_writev+0xbc/0x110 [ 8921.809986] [] ? might_fault+0x97/0xa0 [ 8921.811998] [] ? security_file_permission+0x1e/0x90 [ 8921.814595] [] do_readv_writev+0xe2/0x1e0 [ 8921.816702] [] ? do_setitimer+0x1ac/0x200 [ 8921.818819] [] ? get_parent_ip+0x11/0x50 [ 8921.820863] [] ? sub_preempt_count+0xae/0xf0 [ 8921.823318] [] vfs_writev+0x46/0x60 [ 8921.825219] [] sys_writev+0x4f/0xb0 [ 8921.827127] [] system_call_fastpath+0x16/0x1b [ 8921.829384] ---[ end trace dffe390f30db9eb7 ]--- Signed-off-by: Sasha Levin Acked-by: Rémi Denis-Courmont Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/phonet/pep.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/phonet/pep.c b/net/phonet/pep.c index f17fd84..d29a7fb 100644 --- a/net/phonet/pep.c +++ b/net/phonet/pep.c @@ -1045,6 +1045,9 @@ static int pep_sendmsg(struct kiocb *iocb, struct sock *sk, int flags = msg->msg_flags; int err, done; + if (len > USHRT_MAX) + return -EMSGSIZE; + if ((msg->msg_flags & ~(MSG_DONTWAIT|MSG_EOR|MSG_NOSIGNAL| MSG_CMSG_COMPAT)) || !(msg->msg_flags & MSG_EOR)) -- cgit v1.1 From 19a8321ccebc1db80a75d32d0235f2beb646d8f7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 5 Apr 2012 22:17:46 +0000 Subject: netlink: fix races after skb queueing [ Upstream commit 4a7e7c2ad540e54c75489a70137bf0ec15d3a127 ] As soon as an skb is queued into socket receive_queue, another thread can consume it, so we are not allowed to reference skb anymore, or risk use after free. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman --- net/netlink/af_netlink.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 6ef64ad..24bc620 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -830,12 +830,19 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb, return 0; } -int netlink_sendskb(struct sock *sk, struct sk_buff *skb) +static int __netlink_sendskb(struct sock *sk, struct sk_buff *skb) { int len = skb->len; skb_queue_tail(&sk->sk_receive_queue, skb); sk->sk_data_ready(sk, len); + return len; +} + +int netlink_sendskb(struct sock *sk, struct sk_buff *skb) +{ + int len = __netlink_sendskb(sk, skb); + sock_put(sk); return len; } @@ -960,8 +967,7 @@ static inline int netlink_broadcast_deliver(struct sock *sk, if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && !test_bit(0, &nlk->state)) { skb_set_owner_r(skb, sk); - skb_queue_tail(&sk->sk_receive_queue, skb); - sk->sk_data_ready(sk, skb->len); + __netlink_sendskb(sk, skb); return atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf; } return -1; @@ -1682,10 +1688,8 @@ static int netlink_dump(struct sock *sk) if (sk_filter(sk, skb)) kfree_skb(skb); - else { - skb_queue_tail(&sk->sk_receive_queue, skb); - sk->sk_data_ready(sk, skb->len); - } + else + __netlink_sendskb(sk, skb); return 0; } @@ -1697,10 +1701,8 @@ static int netlink_dump(struct sock *sk) if (sk_filter(sk, skb)) kfree_skb(skb); - else { - skb_queue_tail(&sk->sk_receive_queue, skb); - sk->sk_data_ready(sk, skb->len); - } + else + __netlink_sendskb(sk, skb); if (cb->done) cb->done(cb); -- cgit v1.1 From 6d7946bd33e29b2659504ffb3b98aa9fdb2229d8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 6 Apr 2012 10:49:10 +0200 Subject: net: fix a race in sock_queue_err_skb() [ Upstream commit 110c43304db6f06490961529536c362d9ac5732f ] As soon as an skb is queued into socket error queue, another thread can consume it, so we are not allowed to reference skb anymore, or risk use after free. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/core/skbuff.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 46cbd28..4821df8 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2985,6 +2985,8 @@ static void sock_rmem_free(struct sk_buff *skb) */ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) { + int len = skb->len; + if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= (unsigned)sk->sk_rcvbuf) return -ENOMEM; @@ -2999,7 +3001,7 @@ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) skb_queue_tail(&sk->sk_error_queue, skb); if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk, skb->len); + sk->sk_data_ready(sk, len); return 0; } EXPORT_SYMBOL(sock_queue_err_skb); -- cgit v1.1 From 4a1abcbd24d856fe49029c3dc6f7fe8dc66ddaac Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Tue, 10 Apr 2012 07:59:20 +0000 Subject: tcp: fix tcp_rcv_rtt_update() use of an unscaled RTT sample [ Upstream commit 18a223e0b9ec8979320ba364b47c9772391d6d05 ] Fix a code path in tcp_rcv_rtt_update() that was comparing scaled and unscaled RTT samples. The intent in the code was to only use the 'm' measurement if it was a new minimum. 
However, since 'm' had not yet been shifted left 3 bits but 'new_sample' had, this comparison would nearly always succeed, leading us to erroneously set our receive-side RTT estimate to the 'm' sample when that sample could be nearly 8x too high to use. The overall effect is to often cause the receive-side RTT estimate to be significantly too large (up to 40% too large for brief periods in my tests). Signed-off-by: Neal Cardwell Acked-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp_input.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 72b1857..104b02e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -460,8 +460,11 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep) if (!win_dep) { m -= (new_sample >> 3); new_sample += m; - } else if (m < new_sample) - new_sample = m << 3; + } else { + m <<= 3; + if (m < new_sample) + new_sample = m; + } } else { /* No previous measure. */ new_sample = m << 3; -- cgit v1.1 From 11e8e6af6ec119f354354f69cc9aade7c6a0bb42 Mon Sep 17 00:00:00 2001 From: David Ward Date: Sun, 15 Apr 2012 12:31:45 +0000 Subject: net_sched: gred: Fix oops in gred_dump() in WRED mode [ Upstream commit 244b65dbfede788f2fa3fe2463c44d0809e97c6b ] A parameter set exists for WRED mode, called wred_set, to hold the same values for qavg and qidlestart across all VQs. The WRED mode values had been previously held in the VQ for the default DP. After these values were moved to wred_set, the VQ for the default DP was no longer created automatically (so that it could be omitted on purpose, to have packets in the default DP enqueued directly to the device without using RED). However, gred_dump() was overlooked during that change; in WRED mode it still reads qavg/qidlestart from the VQ for the default DP, which might not even exist. As a result, this command sequence will cause an oops: tc qdisc add dev $DEV handle $HANDLE parent $PARENT gred setup \ DPs 3 default 2 grio tc qdisc change dev $DEV handle $HANDLE gred DP 0 prio 8 $RED_OPTIONS tc qdisc change dev $DEV handle $HANDLE gred DP 1 prio 8 $RED_OPTIONS This fixes gred_dump() in WRED mode to use the values held in wred_set. Signed-off-by: David Ward Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/sched/sch_gred.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index 6cd8ddf..e1afe0c 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -544,11 +544,8 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb) opt.packets = q->packetsin; opt.bytesin = q->bytesin; - if (gred_wred_mode(table)) { - q->parms.qidlestart = - table->tab[table->def]->parms.qidlestart; - q->parms.qavg = table->tab[table->def]->parms.qavg; - } + if (gred_wred_mode(table)) + gred_load_wred_set(table, q); opt.qave = red_calc_qavg(&q->parms, q->parms.qavg); -- cgit v1.1 From 6a0e69cea2ec1499224067989470d3c929dbd67e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 16 Apr 2012 23:28:07 +0000 Subject: tcp: fix tcp_grow_window() for large incoming frames [ Upstream commit 4d846f02392a710f9604892ac3329e628e60a230 ] tcp_grow_window() has to grow rcv_ssthresh up to window_clamp, allowing the sender to increase its window. tcp_grow_window() still assumes a tcp frame is under MSS, but it's no longer true with LRO/GRO.
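As a stand-alone rendering of the corrected win_dep branch in the tcp_rcv_rtt_update() hunk above (the receiver keeps its RTT estimate scaled by 8; the helper name is invented for illustration):

/* new_sample holds 8 * the current RTT estimate; m is an unscaled sample in
 * the same time units.  Adopt m only when it really is a new minimum. */
static unsigned int rcv_rtt_minimum(unsigned int new_sample, unsigned int m)
{
        m <<= 3;                /* scale the sample before comparing */
        if (m < new_sample)     /* e.g. estimate 40 (stored 320): a sample of 200 */
                new_sample = m; /* (scaled 1600) is now rejected, while a sample  */
        return new_sample;      /* of 20 (scaled 160) replaces it, as intended    */
}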
This patch fixes one of the performance issues we noticed with GRO on. Signed-off-by: Eric Dumazet Cc: Neal Cardwell Cc: Tom Herbert Acked-by: Neal Cardwell Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp_input.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 104b02e..c3a9f03 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -328,6 +328,7 @@ static void tcp_grow_window(struct sock *sk, struct sk_buff *skb) incr = __tcp_grow_window(sk, skb); if (incr) { + incr = max_t(int, incr, 2 * skb->len); tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr, tp->window_clamp); inet_csk(sk)->icsk_ack.quick |= 1; -- cgit v1.1 From 0958c122f47f4ef2a1ae552fed56a8bf8502c32b Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Mon, 16 Apr 2012 04:43:15 +0000 Subject: netns: do not leak net_generic data on failed init [ Upstream commit b922934d017f1cc831b017913ed7d1a56c558b43 ] ops_init should free the net_generic data on init failure and __register_pernet_operations should not call ops_free when NET_NS is not enabled. Signed-off-by: Julian Anastasov Reviewed-by: "Eric W. Biederman" Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/core/net_namespace.c | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 0b0211d..2772ed1 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -82,21 +82,29 @@ assign: static int ops_init(const struct pernet_operations *ops, struct net *net) { - int err; + int err = -ENOMEM; + void *data = NULL; + if (ops->id && ops->size) { - void *data = kzalloc(ops->size, GFP_KERNEL); + data = kzalloc(ops->size, GFP_KERNEL); if (!data) - return -ENOMEM; + goto out; err = net_assign_generic(net, *ops->id, data); - if (err) { - kfree(data); - return err; - } + if (err) + goto cleanup; } + err = 0; if (ops->init) - return ops->init(net); - return 0; + err = ops->init(net); + if (!err) + return 0; + +cleanup: + kfree(data); + +out: + return err; } static void ops_free(const struct pernet_operations *ops, struct net *net) @@ -446,12 +454,7 @@ static void __unregister_pernet_operations(struct pernet_operations *ops) static int __register_pernet_operations(struct list_head *list, struct pernet_operations *ops) { - int err = 0; - err = ops_init(ops, &init_net); - if (err) - ops_free(ops, &init_net); - return err; - + return ops_init(ops, &init_net); } static void __unregister_pernet_operations(struct pernet_operations *ops) -- cgit v1.1 From 28b78eb401b7e60c0984eacf3837cf09a2307253 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 18 Apr 2012 16:11:23 +0000 Subject: net ax25: Reorder ax25_exit to remove races. [ Upstream commit 3adadc08cc1e2cbcc15a640d639297ef5fcb17f5 ] While reviewing the sysctl code in ax25, I spotted races in ax25_exit where it is possible to receive notifications and packets after already freeing up some of the data structures needed to process those notifications and updates. Call unregister_netdevice_notifier early so that the rest of the cleanup code does not need to deal with network devices. This takes advantage of my recent enhancement to unregister_netdevice_notifier to send unregister notifications of all network devices that are currently registered.
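The reworked ops_init() in the netns hunk above is the common allocate/assign/init shape with a single unwind path; a condensed, self-contained sketch of that shape (a user-space stand-in with simplified names, not the exact kernel code):

#include <stdlib.h>
#include <errno.h>

struct ops {
        size_t size;
        int (*init)(void *priv);
};

/* Allocate per-instance data, publish it, then run init; on any failure the
 * data is freed right here so the caller never has to guess who owns it. */
static int ops_init_sketch(const struct ops *ops, void **slot)
{
        void *data = NULL;
        int err = -ENOMEM;

        if (ops->size) {
                data = calloc(1, ops->size);
                if (!data)
                        goto out;
                *slot = data;           /* stands in for net_assign_generic() */
        }

        err = 0;
        if (ops->init)
                err = ops->init(data);
        if (!err)
                return 0;

        free(data);                     /* single cleanup path on failure */
        *slot = NULL;
out:
        return err;
}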
Move the unregistration for packet types, socket types and protocol types before we clean up any of the ax25 data structures, to remove the possibility of other races. Signed-off-by: Eric W. Biederman Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ax25/af_ax25.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index e7c69f4..b04a6ef 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -2006,16 +2006,17 @@ static void __exit ax25_exit(void) proc_net_remove(&init_net, "ax25_route"); proc_net_remove(&init_net, "ax25"); proc_net_remove(&init_net, "ax25_calls"); - ax25_rt_free(); - ax25_uid_free(); - ax25_dev_free(); - ax25_unregister_sysctl(); unregister_netdevice_notifier(&ax25_dev_notifier); + ax25_unregister_sysctl(); dev_remove_pack(&ax25_packet_type); sock_unregister(PF_AX25); proto_unregister(&ax25_proto); + + ax25_rt_free(); + ax25_uid_free(); + ax25_dev_free(); } module_exit(ax25_exit); -- cgit v1.1 From ad24d0be9d68b8907bcdd170649214db4ce6a55c Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Sun, 22 Apr 2012 09:45:47 +0000 Subject: tcp: fix TCP_MAXSEG for established IPv6 passive sockets [ Upstream commit d135c522f1234f62e81be29cebdf59e9955139ad ] Commit f5fff5d forgot to fix TCP_MAXSEG behavior for IPv6 sockets, so IPv6 TCP server sockets that used TCP_MAXSEG would find that the advmss of child sockets would be incorrect. This commit mirrors the advmss logic from tcp_v4_syn_recv_sock in tcp_v6_syn_recv_sock. Eventually this logic should probably be shared between IPv4 and IPv6, but this at least fixes this issue. Signed-off-by: Neal Cardwell Acked-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv6/tcp_ipv6.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 51587a0..848f963 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1514,6 +1514,10 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, tcp_mtup_init(newsk); tcp_sync_mss(newsk, dst_mtu(dst)); newtp->advmss = dst_metric_advmss(dst); + if (tcp_sk(sk)->rx_opt.user_mss && + tcp_sk(sk)->rx_opt.user_mss < newtp->advmss) + newtp->advmss = tcp_sk(sk)->rx_opt.user_mss; + tcp_initialize_rcv_mss(newsk); newinet->inet_daddr = newinet->inet_saddr = LOOPBACK4_IPV6; -- cgit v1.1 From 20eae41274bb811063f95a2dde0b3dda88a3d5a0 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 2 Apr 2012 10:51:55 +0200 Subject: nl80211: ensure interface is up in various APIs commit 2b5f8b0b44e17e625cfba1e7b88db44f4dcc0441 upstream. [backported by Ben Greear] The nl80211 handling code should ensure as much as it can that the interface is in a valid state; it can certainly ensure the interface is running. Not doing so can cause calls through mac80211 into the driver that result in warnings and unspecified behaviour in the driver. Reported-by: Ben Greear Signed-off-by: Johannes Berg Signed-off-by: John W. 
Linville Signed-off-by: Ben Greear Signed-off-by: Greg Kroah-Hartman --- net/wireless/nl80211.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 0c2b808..f310a0d 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -1181,6 +1181,11 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) goto bad_res; } + if (!netif_running(netdev)) { + result = -ENETDOWN; + goto bad_res; + } + nla_for_each_nested(nl_txq_params, info->attrs[NL80211_ATTR_WIPHY_TXQ_PARAMS], rem_txq_params) { @@ -5432,7 +5437,7 @@ static struct genl_ops nl80211_ops[] = { .doit = nl80211_get_key, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV | + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { @@ -5464,7 +5469,7 @@ static struct genl_ops nl80211_ops[] = { .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, .doit = nl80211_addset_beacon, - .internal_flags = NL80211_FLAG_NEED_NETDEV | + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { @@ -5472,7 +5477,7 @@ static struct genl_ops nl80211_ops[] = { .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, .doit = nl80211_addset_beacon, - .internal_flags = NL80211_FLAG_NEED_NETDEV | + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { @@ -5496,7 +5501,7 @@ static struct genl_ops nl80211_ops[] = { .doit = nl80211_set_station, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV | + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { @@ -5512,7 +5517,7 @@ static struct genl_ops nl80211_ops[] = { .doit = nl80211_del_station, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV | + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { @@ -5545,7 +5550,7 @@ static struct genl_ops nl80211_ops[] = { .doit = nl80211_del_mpath, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV | + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { @@ -5553,7 +5558,7 @@ static struct genl_ops nl80211_ops[] = { .doit = nl80211_set_bss, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV | + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { @@ -5579,7 +5584,7 @@ static struct genl_ops nl80211_ops[] = { .doit = nl80211_get_mesh_config, .policy = nl80211_policy, /* can be retrieved by unprivileged users */ - .internal_flags = NL80211_FLAG_NEED_NETDEV | + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { @@ -5711,7 +5716,7 @@ static struct genl_ops nl80211_ops[] = { .doit = nl80211_setdel_pmksa, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV | + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { @@ -5719,7 +5724,7 @@ static struct genl_ops nl80211_ops[] = { .doit = nl80211_setdel_pmksa, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV | + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { @@ -5727,7 +5732,7 @@ static struct genl_ops nl80211_ops[] = { .doit = nl80211_flush_pmksa, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV | + 
.internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { @@ -5815,7 +5820,7 @@ static struct genl_ops nl80211_ops[] = { .doit = nl80211_set_wds_peer, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV | + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { -- cgit v1.1 From 9bd46fe16654ee5a10dc269ebe3fc44903424707 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Sun, 29 Apr 2012 15:44:16 +0200 Subject: mac80211: fix AP mode EAP tx for VLAN stations commit 66f2c99af3d6f2d0aa1120884cf1c60613ef61c0 upstream. EAP frames for stations in an AP VLAN are sent on the main AP interface to avoid race conditions wrt. moving stations. For that to work properly, sta_info_get_bss must be used instead of sta_info_get when sending EAP packets. Previously this was only done for cooked monitor injected packets, so this patch adds a check for tx->skb->protocol to the same place. Signed-off-by: Felix Fietkau Signed-off-by: John W. Linville Signed-off-by: Greg Kroah-Hartman --- net/mac80211/tx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 3104c84..da878c1 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1222,7 +1222,8 @@ ieee80211_tx_prepare(struct ieee80211_sub_if_data *sdata, tx->sta = rcu_dereference(sdata->u.vlan.sta); if (!tx->sta && sdata->dev->ieee80211_ptr->use_4addr) return TX_DROP; - } else if (info->flags & IEEE80211_TX_CTL_INJECTED) { + } else if (info->flags & IEEE80211_TX_CTL_INJECTED || + tx->sdata->control_port_protocol == tx->skb->protocol) { tx->sta = sta_info_get_bss(sdata, hdr->addr1); } if (!tx->sta) -- cgit v1.1
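For the mac80211 change just above: the added condition matches frames whose ethertype equals the interface's configured control port protocol, which is the EAPOL ethertype 0x888E unless userspace selected something else. A trivial sketch of that test (helper name invented for illustration):

#include <linux/types.h>
#include <linux/if_ether.h>     /* ETH_P_PAE == 0x888E */

/* skb_proto is skb->protocol; ctl_proto mirrors sdata->control_port_protocol,
 * which is normally htons(ETH_P_PAE). */
static inline bool is_control_port_frame(__be16 skb_proto, __be16 ctl_proto)
{
        return skb_proto == ctl_proto;
}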