From df9f86faf3ee610527ed02031fe7dd3c8b752e44 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 1 Nov 2010 15:49:23 -0700 Subject: ceph: fix small seq message skipping If the client gets out of sync with the server message sequence number, we normally skip low seq messages (ones we already received). The skip code was also incrementing the expected seq, such that all subsequent messages also appeared old and got skipped, and an eventual timeout on the osd connection. This resulted in some lagging requests and console messages like [233480.882885] ceph: skipping osd22 10.138.138.13:6804 seq 2016, expected 2017 [233480.882919] ceph: skipping osd22 10.138.138.13:6804 seq 2017, expected 2018 [233480.882963] ceph: skipping osd22 10.138.138.13:6804 seq 2018, expected 2019 [233480.883488] ceph: skipping osd22 10.138.138.13:6804 seq 2019, expected 2020 [233485.219558] ceph: skipping osd22 10.138.138.13:6804 seq 2020, expected 2021 [233485.906595] ceph: skipping osd22 10.138.138.13:6804 seq 2021, expected 2022 [233490.379536] ceph: skipping osd22 10.138.138.13:6804 seq 2022, expected 2023 [233495.523260] ceph: skipping osd22 10.138.138.13:6804 seq 2023, expected 2024 [233495.923194] ceph: skipping osd22 10.138.138.13:6804 seq 2024, expected 2025 [233500.534614] ceph: tid 6023602 timed out on osd22, will reset osd Reported-by: Theodore Ts'o Signed-off-by: Sage Weil --- net/ceph/messenger.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 0e8157e..d379abf 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1532,14 +1532,13 @@ static int read_partial_message(struct ceph_connection *con) /* verify seq# */ seq = le64_to_cpu(con->in_hdr.seq); if ((s64)seq - (s64)con->in_seq < 1) { - pr_info("skipping %s%lld %s seq %lld, expected %lld\n", + pr_info("skipping %s%lld %s seq %lld expected %lld\n", ENTITY_NAME(con->peer_name), ceph_pr_addr(&con->peer_addr.in_addr), seq, con->in_seq + 1); con->in_base_pos = -front_len - middle_len - data_len - sizeof(m->footer); con->in_tag = CEPH_MSGR_TAG_READY; - con->in_seq++; return 0; } else if ((s64)seq - (s64)con->in_seq > 1) { pr_err("read_partial_message bad seq %lld expected %lld\n", -- cgit v1.1 From e98b6fed84d0f0155d7b398e0dfeac74c792f2d0 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 9 Nov 2010 12:24:53 -0800 Subject: ceph: fix comment, remove extraneous args The offset/length arguments aren't used. Signed-off-by: Sage Weil --- net/ceph/pagevec.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c index 54caf06..ac34fee 100644 --- a/net/ceph/pagevec.c +++ b/net/ceph/pagevec.c @@ -13,8 +13,7 @@ * build a vector of user pages */ struct page **ceph_get_direct_page_vector(const char __user *data, - int num_pages, - loff_t off, size_t len) + int num_pages) { struct page **pages; int rc; -- cgit v1.1 From b7495fc2ff941db6a118a93ab8d61149e3f4cef8 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 9 Nov 2010 12:43:12 -0800 Subject: ceph: make page alignment explicit in osd interface We used to infer alignment of IOs within a page based on the file offset, which assumed they matched. This broke with direct IO that was not aligned to pages (e.g., 512-byte aligned IO). We were also trusting the alignment specified in the OSD reply, which could have been adjusted by the server. Explicitly specify the page alignment when setting up OSD IO requests. Signed-off-by: Sage Weil --- net/ceph/osd_client.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 7939199..6c09623 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -71,6 +71,7 @@ void ceph_calc_raw_layout(struct ceph_osd_client *osdc, op->extent.length = objlen; } req->r_num_pages = calc_pages_for(off, *plen); + req->r_page_alignment = off & ~PAGE_MASK; if (op->op == CEPH_OSD_OP_WRITE) op->payload_len = *plen; @@ -419,7 +420,8 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, u32 truncate_seq, u64 truncate_size, struct timespec *mtime, - bool use_mempool, int num_reply) + bool use_mempool, int num_reply, + int page_align) { struct ceph_osd_req_op ops[3]; struct ceph_osd_request *req; @@ -447,6 +449,10 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, calc_layout(osdc, vino, layout, off, plen, req, ops); req->r_file_layout = *layout; /* keep a copy */ + /* in case it differs from natural alignment that calc_layout + filled in for us */ + req->r_page_alignment = page_align; + ceph_osdc_build_request(req, off, plen, ops, snapc, mtime, @@ -1489,7 +1495,7 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc, struct ceph_vino vino, struct ceph_file_layout *layout, u64 off, u64 *plen, u32 truncate_seq, u64 truncate_size, - struct page **pages, int num_pages) + struct page **pages, int num_pages, int page_align) { struct ceph_osd_request *req; int rc = 0; @@ -1499,15 +1505,15 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc, req = ceph_osdc_new_request(osdc, layout, vino, off, plen, CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, NULL, 0, truncate_seq, truncate_size, NULL, - false, 1); + false, 1, page_align); if (!req) return -ENOMEM; /* it may be a short read due to an object boundary */ req->r_pages = pages; - dout("readpages final extent is %llu~%llu (%d pages)\n", - off, *plen, req->r_num_pages); + dout("readpages final extent is %llu~%llu (%d pages align %d)\n", + off, *plen, req->r_num_pages, page_align); rc = ceph_osdc_start_request(osdc, req, false); if (!rc) @@ -1533,6 +1539,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, { struct ceph_osd_request *req; int rc = 0; + int page_align = off & ~PAGE_MASK; BUG_ON(vino.snap != CEPH_NOSNAP); req = ceph_osdc_new_request(osdc, layout, vino, off, &len, @@ -1541,7 +1548,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, CEPH_OSD_FLAG_WRITE, snapc, do_sync, truncate_seq, truncate_size, mtime, - nofail, 1); + nofail, 1, page_align); if (!req) return -ENOMEM; @@ -1638,8 +1645,7 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, m = ceph_msg_get(req->r_reply); if (data_len > 0) { - unsigned data_off = le16_to_cpu(hdr->data_off); - int want = calc_pages_for(data_off & ~PAGE_MASK, data_len); + int want = calc_pages_for(req->r_page_alignment, data_len); if (unlikely(req->r_num_pages < want)) { pr_warning("tid %lld reply %d > expected %d pages\n", -- cgit v1.1 From c5c6b19d4b8f5431fca05f28ae9e141045022149 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 9 Nov 2010 12:40:00 -0800 Subject: ceph: explicitly specify page alignment in network messages The alignment used for reading data into or out of pages used to be taken from the data_off field in the message header. This only worked as long as the page alignment matched the object offset, breaking direct io to non-page aligned offsets. Instead, explicitly specify the page alignment next to the page vector in the ceph_msg struct, and use that instead of the message header (which probably shouldn't be trusted). The alloc_msg callback is responsible for filling in this field properly when it sets up the page vector. Signed-off-by: Sage Weil --- net/ceph/messenger.c | 10 +++++----- net/ceph/osd_client.c | 3 +++ 2 files changed, 8 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index d379abf..1c7a2ec 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -540,8 +540,7 @@ static void prepare_write_message(struct ceph_connection *con) /* initialize page iterator */ con->out_msg_pos.page = 0; if (m->pages) - con->out_msg_pos.page_pos = - le16_to_cpu(m->hdr.data_off) & ~PAGE_MASK; + con->out_msg_pos.page_pos = m->page_alignment; else con->out_msg_pos.page_pos = 0; con->out_msg_pos.data_pos = 0; @@ -1491,7 +1490,7 @@ static int read_partial_message(struct ceph_connection *con) struct ceph_msg *m = con->in_msg; int ret; int to, left; - unsigned front_len, middle_len, data_len, data_off; + unsigned front_len, middle_len, data_len; int datacrc = con->msgr->nocrc; int skip; u64 seq; @@ -1527,7 +1526,6 @@ static int read_partial_message(struct ceph_connection *con) data_len = le32_to_cpu(con->in_hdr.data_len); if (data_len > CEPH_MSG_MAX_DATA_LEN) return -EIO; - data_off = le16_to_cpu(con->in_hdr.data_off); /* verify seq# */ seq = le64_to_cpu(con->in_hdr.seq); @@ -1575,7 +1573,7 @@ static int read_partial_message(struct ceph_connection *con) con->in_msg_pos.page = 0; if (m->pages) - con->in_msg_pos.page_pos = data_off & ~PAGE_MASK; + con->in_msg_pos.page_pos = m->page_alignment; else con->in_msg_pos.page_pos = 0; con->in_msg_pos.data_pos = 0; @@ -2300,6 +2298,7 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags) /* data */ m->nr_pages = 0; + m->page_alignment = 0; m->pages = NULL; m->pagelist = NULL; m->bio = NULL; @@ -2369,6 +2368,7 @@ static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con, type, front_len); return NULL; } + msg->page_alignment = le16_to_cpu(hdr->data_off); } memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr)); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 6c09623..3e20a12 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -391,6 +391,8 @@ void ceph_osdc_build_request(struct ceph_osd_request *req, req->r_request->hdr.data_len = cpu_to_le32(data_len); } + req->r_request->page_alignment = req->r_page_alignment; + BUG_ON(p > msg->front.iov_base + msg->front.iov_len); msg_size = p - msg->front.iov_base; msg->front.iov_len = msg_size; @@ -1657,6 +1659,7 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, } m->pages = req->r_pages; m->nr_pages = req->r_num_pages; + m->page_alignment = req->r_page_alignment; #ifdef CONFIG_BLOCK m->bio = req->r_bio; #endif -- cgit v1.1 From 94f58df8e545657f0b2d16eca1ac7a4ec39ed6be Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Sun, 7 Nov 2010 22:11:34 +0100 Subject: SUNRPC: Simplify rpc_alloc_iostats by removing pointless local variable Hi, We can simplify net/sunrpc/stats.c::rpc_alloc_iostats() a bit by getting rid of the unneeded local variable 'new'. Please CC me on replies. Signed-off-by: Jesper Juhl Signed-off-by: Trond Myklebust --- net/sunrpc/stats.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c index f71a731..80df89d 100644 --- a/net/sunrpc/stats.c +++ b/net/sunrpc/stats.c @@ -115,9 +115,7 @@ EXPORT_SYMBOL_GPL(svc_seq_show); */ struct rpc_iostats *rpc_alloc_iostats(struct rpc_clnt *clnt) { - struct rpc_iostats *new; - new = kcalloc(clnt->cl_maxproc, sizeof(struct rpc_iostats), GFP_KERNEL); - return new; + return kcalloc(clnt->cl_maxproc, sizeof(struct rpc_iostats), GFP_KERNEL); } EXPORT_SYMBOL_GPL(rpc_alloc_iostats); -- cgit v1.1 From 4c62ab9c538bc09c38093fa079e6902ea4d42b98 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Tue, 16 Nov 2010 09:50:47 -0800 Subject: irda: irttp: allow zero byte packets Sending zero byte packets is not neccessarily an error (AF_INET accepts it, too), so just apply a shortcut. This was discovered because of a non-working software with WINE. See http://bugs.winehq.org/show_bug.cgi?id=19397#c86 http://thread.gmane.org/gmane.linux.irda.general/1643 for very detailed debugging information and a testcase. Kudos to Wolfgang for those! Reported-by: Wolfgang Schwotzer Signed-off-by: Wolfram Sang Tested-by: Mike Evans Signed-off-by: David S. Miller --- net/irda/irttp.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/irda/irttp.c b/net/irda/irttp.c index 285761e..6cfaeaf 100644 --- a/net/irda/irttp.c +++ b/net/irda/irttp.c @@ -550,16 +550,23 @@ EXPORT_SYMBOL(irttp_close_tsap); */ int irttp_udata_request(struct tsap_cb *self, struct sk_buff *skb) { + int ret = -1; + IRDA_ASSERT(self != NULL, return -1;); IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return -1;); IRDA_ASSERT(skb != NULL, return -1;); IRDA_DEBUG(4, "%s()\n", __func__); + /* Take shortcut on zero byte packets */ + if (skb->len == 0) { + ret = 0; + goto err; + } + /* Check that nothing bad happens */ - if ((skb->len == 0) || (!self->connected)) { - IRDA_DEBUG(1, "%s(), No data, or not connected\n", - __func__); + if (!self->connected) { + IRDA_DEBUG(1, "%s(), Not connected\n", __func__); goto err; } @@ -576,7 +583,7 @@ int irttp_udata_request(struct tsap_cb *self, struct sk_buff *skb) err: dev_kfree_skb(skb); - return -1; + return ret; } EXPORT_SYMBOL(irttp_udata_request); @@ -599,9 +606,15 @@ int irttp_data_request(struct tsap_cb *self, struct sk_buff *skb) IRDA_DEBUG(2, "%s() : queue len = %d\n", __func__, skb_queue_len(&self->tx_queue)); + /* Take shortcut on zero byte packets */ + if (skb->len == 0) { + ret = 0; + goto err; + } + /* Check that nothing bad happens */ - if ((skb->len == 0) || (!self->connected)) { - IRDA_WARNING("%s: No data, or not connected\n", __func__); + if (!self->connected) { + IRDA_WARNING("%s: Not connected\n", __func__); ret = -ENOTCONN; goto err; } -- cgit v1.1 From 7d98ffd8c2d1da6cec5d84eba42c4aa836a93f85 Mon Sep 17 00:00:00 2001 From: Ulrich Weber Date: Fri, 5 Nov 2010 01:39:12 +0000 Subject: xfrm: update flowi saddr in icmp_send if unset otherwise xfrm_lookup will fail to find correct policy Signed-off-by: Ulrich Weber Signed-off-by: David S. Miller --- net/ipv4/icmp.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 96bc7f9..e5d1a44 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -569,6 +569,9 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) /* No need to clone since we're just using its address. */ rt2 = rt; + if (!fl.nl_u.ip4_u.saddr) + fl.nl_u.ip4_u.saddr = rt->rt_src; + err = xfrm_lookup(net, (struct dst_entry **)&rt, &fl, NULL, 0); switch (err) { case 0: -- cgit v1.1 From 9236d838c920e90708570d9bbd7bb82d30a38130 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Fri, 12 Nov 2010 16:31:23 -0800 Subject: cfg80211: fix extension channel checks to initiate communication When operating in a mode that initiates communication and using HT40 we should fail if we cannot use both primary and secondary channels to initiate communication. Our current ht40 allowmap only covers STA mode of operation, for beaconing modes we need a check on the fly as the mode of operation is dynamic and there other flags other than disable which we should read to check if we can initiate communication. Do not allow for initiating communication if our secondary HT40 channel has is either disabled, has a passive scan flag, a no-ibss flag or is a radar channel. Userspace now has similar checks but this is also needed in-kernel. Reported-by: Jouni Malinen Cc: stable@kernel.org Signed-off-by: Luis R. Rodriguez Signed-off-by: John W. Linville --- net/wireless/chan.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) (limited to 'net') diff --git a/net/wireless/chan.c b/net/wireless/chan.c index d0c92dd..c8d190d 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -44,6 +44,36 @@ rdev_freq_to_chan(struct cfg80211_registered_device *rdev, return chan; } +static bool can_beacon_sec_chan(struct wiphy *wiphy, + struct ieee80211_channel *chan, + enum nl80211_channel_type channel_type) +{ + struct ieee80211_channel *sec_chan; + int diff; + + switch (channel_type) { + case NL80211_CHAN_HT40PLUS: + diff = 20; + case NL80211_CHAN_HT40MINUS: + diff = -20; + default: + return false; + } + + sec_chan = ieee80211_get_channel(wiphy, chan->center_freq + diff); + if (!sec_chan) + return false; + + /* we'll need a DFS capability later */ + if (sec_chan->flags & (IEEE80211_CHAN_DISABLED | + IEEE80211_CHAN_PASSIVE_SCAN | + IEEE80211_CHAN_NO_IBSS | + IEEE80211_CHAN_RADAR)) + return false; + + return true; +} + int cfg80211_set_freq(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, int freq, enum nl80211_channel_type channel_type) @@ -68,6 +98,28 @@ int cfg80211_set_freq(struct cfg80211_registered_device *rdev, if (!chan) return -EINVAL; + /* Both channels should be able to initiate communication */ + if (wdev && (wdev->iftype == NL80211_IFTYPE_ADHOC || + wdev->iftype == NL80211_IFTYPE_AP || + wdev->iftype == NL80211_IFTYPE_AP_VLAN || + wdev->iftype == NL80211_IFTYPE_MESH_POINT || + wdev->iftype == NL80211_IFTYPE_P2P_GO)) { + switch (channel_type) { + case NL80211_CHAN_HT40PLUS: + case NL80211_CHAN_HT40MINUS: + if (!can_beacon_sec_chan(&rdev->wiphy, chan, + channel_type)) { + printk(KERN_DEBUG + "cfg80211: Secondary channel not " + "allowed to initiate communication\n"); + return -EINVAL; + } + break; + default: + break; + } + } + result = rdev->ops->set_channel(&rdev->wiphy, wdev ? wdev->netdev : NULL, chan, channel_type); -- cgit v1.1 From 451a3c24b0135bce54542009b5fde43846c7cf67 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 17 Nov 2010 16:26:55 +0100 Subject: BKL: remove extraneous #include The big kernel lock has been removed from all these files at some point, leaving only the #include. Remove this too as a cleanup. Signed-off-by: Arnd Bergmann Signed-off-by: Linus Torvalds --- net/irda/af_irda.c | 1 - net/irda/irnet/irnet_ppp.c | 1 - net/sunrpc/svc_xprt.c | 1 - 3 files changed, 3 deletions(-) (limited to 'net') diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c index 7f097989..a6de305 100644 --- a/net/irda/af_irda.c +++ b/net/irda/af_irda.c @@ -45,7 +45,6 @@ #include #include #include -#include #include #include #include diff --git a/net/irda/irnet/irnet_ppp.c b/net/irda/irnet/irnet_ppp.c index 7fa8637..7c567b8 100644 --- a/net/irda/irnet/irnet_ppp.c +++ b/net/irda/irnet/irnet_ppp.c @@ -15,7 +15,6 @@ #include #include -#include #include "irnet_ppp.h" /* Private header */ /* Please put other headers in irnet.h - Thanks */ diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index c82fe73..ea2ff78 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -5,7 +5,6 @@ */ #include -#include #include #include #include -- cgit v1.1 From 218854af84038d828a32f061858b1902ed2beec6 Mon Sep 17 00:00:00 2001 From: Dan Rosenberg Date: Wed, 17 Nov 2010 06:37:16 +0000 Subject: rds: Integer overflow in RDS cmsg handling In rds_cmsg_rdma_args(), the user-provided args->nr_local value is restricted to less than UINT_MAX. This seems to need a tighter upper bound, since the calculation of total iov_size can overflow, resulting in a small sock_kmalloc() allocation. This would probably just result in walking off the heap and crashing when calling rds_rdma_pages() with a high count value. If it somehow doesn't crash here, then memory corruption could occur soon after. Signed-off-by: Dan Rosenberg Signed-off-by: David S. Miller --- net/rds/rdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rds/rdma.c b/net/rds/rdma.c index 8920f2a..4e37c1c 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -567,7 +567,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, goto out; } - if (args->nr_local > (u64)UINT_MAX) { + if (args->nr_local > UIO_MAXIOV) { ret = -EMSGSIZE; goto out; } -- cgit v1.1 From 09a02fdb919876c01e8f05960750a418b3f7fa48 Mon Sep 17 00:00:00 2001 From: Mark Mentovai Date: Wed, 17 Nov 2010 16:34:37 -0500 Subject: cfg80211: fix can_beacon_sec_chan, reenable HT40 This follows wireless-testing 9236d838c920e90708570d9bbd7bb82d30a38130 ("cfg80211: fix extension channel checks to initiate communication") and fixes accidental case fall-through. Without this fix, HT40 is entirely blocked. Signed-off-by: Mark Mentovai Cc: stable@kernel.org Acked-by: Luis R. Rodriguez --- net/wireless/chan.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/wireless/chan.c b/net/wireless/chan.c index c8d190d..17cd0c0 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -54,8 +54,10 @@ static bool can_beacon_sec_chan(struct wiphy *wiphy, switch (channel_type) { case NL80211_CHAN_HT40PLUS: diff = 20; + break; case NL80211_CHAN_HT40MINUS: diff = -20; + break; default: return false; } -- cgit v1.1 From 7d8e76bf9ac3604897f0ce12e8bf09b68c2a2c89 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 16 Nov 2010 19:42:53 +0000 Subject: net: zero kobject in rx_queue_release netif_set_real_num_rx_queues() can decrement and increment the number of rx queues. For example ixgbe does this as features and offloads are toggled. Presumably this could also happen across down/up on most devices if the available resources changed (cpu offlined). The kobject needs to be zero'd in this case so that the state is not preserved across kobject_put()/kobject_init_and_add(). This resolves the following error report. ixgbe 0000:03:00.0: eth2: NIC Link is Up 10 Gbps, Flow Control: RX/TX kobject (ffff880324b83210): tried to init an initialized object, something is seriously wrong. Pid: 1972, comm: lldpad Not tainted 2.6.37-rc18021qaz+ #169 Call Trace: [] kobject_init+0x3a/0x83 [] kobject_init_and_add+0x23/0x57 [] ? mark_lock+0x21/0x267 [] net_rx_queue_update_kobjects+0x63/0xc6 [] netif_set_real_num_rx_queues+0x5f/0x78 [] ixgbe_set_num_queues+0x1c6/0x1ca [ixgbe] [] ixgbe_init_interrupt_scheme+0x1e/0x79c [ixgbe] [] ixgbe_dcbnl_set_state+0x167/0x189 [ixgbe] Signed-off-by: John Fastabend Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/net-sysfs.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index a5ff5a8..7f902ca 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -712,15 +712,21 @@ static void rx_queue_release(struct kobject *kobj) map = rcu_dereference_raw(queue->rps_map); - if (map) + if (map) { + RCU_INIT_POINTER(queue->rps_map, NULL); call_rcu(&map->rcu, rps_map_release); + } flow_table = rcu_dereference_raw(queue->rps_flow_table); - if (flow_table) + if (flow_table) { + RCU_INIT_POINTER(queue->rps_flow_table, NULL); call_rcu(&flow_table->rcu, rps_dev_flow_table_release); + } if (atomic_dec_and_test(&first->count)) kfree(first); + else + memset(kobj, 0, sizeof(*kobj)); } static struct kobj_type rx_queue_ktype = { -- cgit v1.1 From 93908d192686d8285dd6441ff855df92a40103d2 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Wed, 17 Nov 2010 01:44:24 +0000 Subject: ipv6: Expose IFLA_PROTINFO timer values in msecs instead of jiffies IFLA_PROTINFO exposes timer related per device settings in jiffies. Change it to expose these values in msecs like the sysctl interface does. I did not find any users of IFLA_PROTINFO which rely on any of these values and even if there are, they are likely already broken because there is no way for them to reliably convert such a value to another time format. Signed-off-by: Thomas Graf Cc: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index b41ce0f..52b7df7 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3798,8 +3798,10 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_AUTOCONF] = cnf->autoconf; array[DEVCONF_DAD_TRANSMITS] = cnf->dad_transmits; array[DEVCONF_RTR_SOLICITS] = cnf->rtr_solicits; - array[DEVCONF_RTR_SOLICIT_INTERVAL] = cnf->rtr_solicit_interval; - array[DEVCONF_RTR_SOLICIT_DELAY] = cnf->rtr_solicit_delay; + array[DEVCONF_RTR_SOLICIT_INTERVAL] = + jiffies_to_msecs(cnf->rtr_solicit_interval); + array[DEVCONF_RTR_SOLICIT_DELAY] = + jiffies_to_msecs(cnf->rtr_solicit_delay); array[DEVCONF_FORCE_MLD_VERSION] = cnf->force_mld_version; #ifdef CONFIG_IPV6_PRIVACY array[DEVCONF_USE_TEMPADDR] = cnf->use_tempaddr; @@ -3813,7 +3815,8 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_ACCEPT_RA_PINFO] = cnf->accept_ra_pinfo; #ifdef CONFIG_IPV6_ROUTER_PREF array[DEVCONF_ACCEPT_RA_RTR_PREF] = cnf->accept_ra_rtr_pref; - array[DEVCONF_RTR_PROBE_INTERVAL] = cnf->rtr_probe_interval; + array[DEVCONF_RTR_PROBE_INTERVAL] = + jiffies_to_msecs(cnf->rtr_probe_interval); #ifdef CONFIG_IPV6_ROUTE_INFO array[DEVCONF_ACCEPT_RA_RT_INFO_MAX_PLEN] = cnf->accept_ra_rt_info_max_plen; #endif -- cgit v1.1 From 18a31e1e282f9ed563b131526a88162ccbe04ee3 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Wed, 17 Nov 2010 04:12:02 +0000 Subject: ipv6: Expose reachable and retrans timer values as msecs Expose reachable and retrans timer values in msecs instead of jiffies. Both timer values are already exposed as msecs in the neighbour table netlink interface. The creation timestamp format with increased precision is kept but cleaned up. Signed-off-by: Thomas Graf Cc: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 52b7df7..2fc35b3 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -98,7 +98,11 @@ #endif #define INFINITY_LIFE_TIME 0xFFFFFFFF -#define TIME_DELTA(a, b) ((unsigned long)((long)(a) - (long)(b))) + +static inline u32 cstamp_delta(unsigned long cstamp) +{ + return (cstamp - INITIAL_JIFFIES) * 100UL / HZ; +} #define ADDRCONF_TIMER_FUZZ_MINUS (HZ > 50 ? HZ/50 : 1) #define ADDRCONF_TIMER_FUZZ (HZ / 4) @@ -3444,10 +3448,8 @@ static int put_cacheinfo(struct sk_buff *skb, unsigned long cstamp, { struct ifa_cacheinfo ci; - ci.cstamp = (u32)(TIME_DELTA(cstamp, INITIAL_JIFFIES) / HZ * 100 - + TIME_DELTA(cstamp, INITIAL_JIFFIES) % HZ * 100 / HZ); - ci.tstamp = (u32)(TIME_DELTA(tstamp, INITIAL_JIFFIES) / HZ * 100 - + TIME_DELTA(tstamp, INITIAL_JIFFIES) % HZ * 100 / HZ); + ci.cstamp = cstamp_delta(cstamp); + ci.tstamp = cstamp_delta(tstamp); ci.ifa_prefered = preferred; ci.ifa_valid = valid; @@ -3932,10 +3934,9 @@ static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev, NLA_PUT_U32(skb, IFLA_INET6_FLAGS, idev->if_flags); ci.max_reasm_len = IPV6_MAXPLEN; - ci.tstamp = (__u32)(TIME_DELTA(idev->tstamp, INITIAL_JIFFIES) / HZ * 100 - + TIME_DELTA(idev->tstamp, INITIAL_JIFFIES) % HZ * 100 / HZ); - ci.reachable_time = idev->nd_parms->reachable_time; - ci.retrans_time = idev->nd_parms->retrans_time; + ci.tstamp = cstamp_delta(idev->tstamp); + ci.reachable_time = jiffies_to_msecs(idev->nd_parms->reachable_time); + ci.retrans_time = jiffies_to_msecs(idev->nd_parms->retrans_time); NLA_PUT(skb, IFLA_INET6_CACHEINFO, sizeof(ci), &ci); nla = nla_reserve(skb, IFLA_INET6_CONF, DEVCONF_MAX * sizeof(s32)); -- cgit v1.1 From 925e277f5221defdc53cbef1ac3ed1803fa32357 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Tue, 16 Nov 2010 09:40:02 +0000 Subject: net: irda: irttp: sync error paths of data- and udata-requests irttp_data_request() returns meaningful errorcodes, while irttp_udata_request() just returns -1 in similar situations. Sync the two and the loglevels of the accompanying output. Signed-off-by: Wolfram Sang Cc: Samuel Ortiz Cc: David Miller Signed-off-by: David S. Miller --- net/irda/irttp.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/irda/irttp.c b/net/irda/irttp.c index 6cfaeaf..f6054f9 100644 --- a/net/irda/irttp.c +++ b/net/irda/irttp.c @@ -550,7 +550,7 @@ EXPORT_SYMBOL(irttp_close_tsap); */ int irttp_udata_request(struct tsap_cb *self, struct sk_buff *skb) { - int ret = -1; + int ret; IRDA_ASSERT(self != NULL, return -1;); IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return -1;); @@ -566,13 +566,14 @@ int irttp_udata_request(struct tsap_cb *self, struct sk_buff *skb) /* Check that nothing bad happens */ if (!self->connected) { - IRDA_DEBUG(1, "%s(), Not connected\n", __func__); + IRDA_WARNING("%s(), Not connected\n", __func__); + ret = -ENOTCONN; goto err; } if (skb->len > self->max_seg_size) { - IRDA_DEBUG(1, "%s(), UData is too large for IrLAP!\n", - __func__); + IRDA_ERROR("%s(), UData is too large for IrLAP!\n", __func__); + ret = -EMSGSIZE; goto err; } -- cgit v1.1 From dba4490d22a496f9b7c21919cf3effbed5851213 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Thu, 18 Nov 2010 08:20:57 +0000 Subject: netfilter: fix IP_VS dependencies When NF_CONNTRACK is enabled, IP_VS uses conntrack symbols. Therefore IP_VS can't be linked statically when conntrack is built modular. Reported-by: Justin P. Mattock Tested-by: Justin P. Mattock Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/netfilter/ipvs/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig index a22dac2..70bd1d0 100644 --- a/net/netfilter/ipvs/Kconfig +++ b/net/netfilter/ipvs/Kconfig @@ -4,6 +4,7 @@ menuconfig IP_VS tristate "IP virtual server support" depends on NET && INET && NETFILTER + depends on (NF_CONNTRACK || NF_CONNTRACK=n) ---help--- IP Virtual Server support will let you build a high-performance virtual server based on cluster of two or more real servers. This -- cgit v1.1 From 0302b8622ce696af1cda22fcf207d3793350e896 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 18 Nov 2010 13:02:37 +0000 Subject: net: fix kernel-doc for sk_filter_rcu_release Fix kernel-doc warning for sk_filter_rcu_release(): Warning(net/core/filter.c:586): missing initial short description on line: * sk_filter_rcu_release: Release a socket filter by rcu_head Signed-off-by: Randy Dunlap Cc: "David S. Miller" Cc: netdev@vger.kernel.org Signed-off-by: David S. Miller --- net/core/filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 23e9b2a..c1ee800 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -589,7 +589,7 @@ int sk_chk_filter(struct sock_filter *filter, int flen) EXPORT_SYMBOL(sk_chk_filter); /** - * sk_filter_rcu_release: Release a socket filter by rcu_head + * sk_filter_rcu_release - Release a socket filter by rcu_head * @rcu: rcu_head that contains the sk_filter to free */ static void sk_filter_rcu_release(struct rcu_head *rcu) -- cgit v1.1 From 7a1c8e5ab120a5f352e78bbc1fa5bb64e6f23639 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 20 Nov 2010 07:46:35 +0000 Subject: net: allow GFP_HIGHMEM in __vmalloc() We forgot to use __GFP_HIGHMEM in several __vmalloc() calls. In ceph, add the missing flag. In fib_trie.c, xfrm_hash.c and request_sock.c, using vzalloc() is cleaner and allows using HIGHMEM pages as well. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ceph/buffer.c | 2 +- net/core/request_sock.c | 4 +--- net/ipv4/fib_trie.c | 2 +- net/xfrm/xfrm_hash.c | 2 +- 4 files changed, 4 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ceph/buffer.c b/net/ceph/buffer.c index 53d8abf..bf3e6a1 100644 --- a/net/ceph/buffer.c +++ b/net/ceph/buffer.c @@ -19,7 +19,7 @@ struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp) if (b->vec.iov_base) { b->is_vmalloc = false; } else { - b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL); + b->vec.iov_base = __vmalloc(len, gfp | __GFP_HIGHMEM, PAGE_KERNEL); if (!b->vec.iov_base) { kfree(b); return NULL; diff --git a/net/core/request_sock.c b/net/core/request_sock.c index 7552495..fceeb37 100644 --- a/net/core/request_sock.c +++ b/net/core/request_sock.c @@ -45,9 +45,7 @@ int reqsk_queue_alloc(struct request_sock_queue *queue, nr_table_entries = roundup_pow_of_two(nr_table_entries + 1); lopt_size += nr_table_entries * sizeof(struct request_sock *); if (lopt_size > PAGE_SIZE) - lopt = __vmalloc(lopt_size, - GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, - PAGE_KERNEL); + lopt = vzalloc(lopt_size); else lopt = kzalloc(lopt_size, GFP_KERNEL); if (lopt == NULL) diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 200eb53..0f28034 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -365,7 +365,7 @@ static struct tnode *tnode_alloc(size_t size) if (size <= PAGE_SIZE) return kzalloc(size, GFP_KERNEL); else - return __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL); + return vzalloc(size); } static void __tnode_vfree(struct work_struct *arg) diff --git a/net/xfrm/xfrm_hash.c b/net/xfrm/xfrm_hash.c index a2023ec..1e98bc0 100644 --- a/net/xfrm/xfrm_hash.c +++ b/net/xfrm/xfrm_hash.c @@ -19,7 +19,7 @@ struct hlist_head *xfrm_hash_alloc(unsigned int sz) if (sz <= PAGE_SIZE) n = kzalloc(sz, GFP_KERNEL); else if (hashdist) - n = __vmalloc(sz, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL); + n = vzalloc(sz); else n = (struct hlist_head *) __get_free_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO, -- cgit v1.1 From 88b2a9a3d98a19496d64aadda7158c0ad51cbe7d Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 15 Nov 2010 20:29:21 +0000 Subject: ipv6: fix missing in6_ifa_put in addrconf Fix ref count bug introduced by commit 2de795707294972f6c34bae9de713e502c431296 Author: Lorenzo Colitti Date: Wed Oct 27 18:16:49 2010 +0000 ipv6: addrconf: don't remove address state on ifdown if the address is being kept Fix logic so that addrconf_ifdown() decrements the inet6_ifaddr refcnt correctly with in6_ifa_put(). Reported-by: Stephen Hemminger Signed-off-by: John Fastabend Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 2fc35b3..23cc8e1 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2758,13 +2758,13 @@ static int addrconf_ifdown(struct net_device *dev, int how) ifa->state = INET6_IFADDR_STATE_DEAD; spin_unlock_bh(&ifa->state_lock); - if (state == INET6_IFADDR_STATE_DEAD) { - in6_ifa_put(ifa); - } else { + if (state != INET6_IFADDR_STATE_DEAD) { __ipv6_ifa_notify(RTM_DELADDR, ifa); atomic_notifier_call_chain(&inet6addr_chain, NETDEV_DOWN, ifa); } + + in6_ifa_put(ifa); write_lock_bh(&idev->lock); } } -- cgit v1.1 From 5fc43978a79e8021c189660ab63249fd29c5fb32 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 20 Nov 2010 11:13:31 -0500 Subject: SUNRPC: Fix an infinite loop in call_refresh/call_refreshresult If the rpcauth_refreshcred() call returns an error other than EACCES, ENOMEM or ETIMEDOUT, we currently end up looping forever between call_refresh and call_refreshresult. The correct thing to do here is to exit on all errors except EAGAIN and ETIMEDOUT, for which case we retry 3 times, then return EACCES. Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 9dab957..92ce94f 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -989,20 +989,26 @@ call_refreshresult(struct rpc_task *task) dprint_status(task); task->tk_status = 0; - task->tk_action = call_allocate; - if (status >= 0 && rpcauth_uptodatecred(task)) - return; + task->tk_action = call_refresh; switch (status) { - case -EACCES: - rpc_exit(task, -EACCES); - return; - case -ENOMEM: - rpc_exit(task, -ENOMEM); + case 0: + if (rpcauth_uptodatecred(task)) + task->tk_action = call_allocate; return; case -ETIMEDOUT: rpc_delay(task, 3*HZ); + case -EAGAIN: + status = -EACCES; + if (!task->tk_cred_retry) + break; + task->tk_cred_retry--; + dprintk("RPC: %5u %s: retry refresh creds\n", + task->tk_pid, __func__); + return; } - task->tk_action = call_refresh; + dprintk("RPC: %5u %s: refresh creds failed with error %d\n", + task->tk_pid, __func__, status); + rpc_exit(task, status); } /* -- cgit v1.1 From c89ad7372232b69fd37edf90d6f5d2a8d6381214 Mon Sep 17 00:00:00 2001 From: "Gustavo F. Padovan" Date: Mon, 1 Nov 2010 19:08:50 +0000 Subject: Bluetooth: Fix not returning proper error in SCO Return 0 in that situation could lead to errors in the caller. Signed-off-by: Gustavo F. Padovan --- net/bluetooth/sco.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index d0927d1..66b9e5c 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -882,7 +882,7 @@ static int sco_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 type) int lm = 0; if (type != SCO_LINK && type != ESCO_LINK) - return 0; + return -EINVAL; BT_DBG("hdev %s, bdaddr %s", hdev->name, batostr(bdaddr)); @@ -908,7 +908,7 @@ static int sco_connect_cfm(struct hci_conn *hcon, __u8 status) BT_DBG("hcon %p bdaddr %s status %d", hcon, batostr(&hcon->dst), status); if (hcon->type != SCO_LINK && hcon->type != ESCO_LINK) - return 0; + return -EINVAL; if (!status) { struct sco_conn *conn; @@ -927,7 +927,7 @@ static int sco_disconn_cfm(struct hci_conn *hcon, __u8 reason) BT_DBG("hcon %p reason %d", hcon, reason); if (hcon->type != SCO_LINK && hcon->type != ESCO_LINK) - return 0; + return -EINVAL; sco_conn_del(hcon, bt_err(reason)); -- cgit v1.1 From 9915672d41273f5b77f1b3c29b391ffb7732b84b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 24 Nov 2010 09:15:27 -0800 Subject: af_unix: limit unix_tot_inflight Vegard Nossum found a unix socket OOM was possible, posting an exploit program. My analysis is we can eat all LOWMEM memory before unix_gc() being called from unix_release_sock(). Moreover, the thread blocked in unix_gc() can consume huge amount of time to perform cleanup because of huge working set. One way to handle this is to have a sensible limit on unix_tot_inflight, tested from wait_for_unix_gc() and to force a call to unix_gc() if this limit is hit. This solves the OOM and also reduce overall latencies, and should not slowdown normal workloads. Reported-by: Vegard Nossum Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/unix/garbage.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net') diff --git a/net/unix/garbage.c b/net/unix/garbage.c index c8df6fd..40df93d 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -259,9 +259,16 @@ static void inc_inflight_move_tail(struct unix_sock *u) } static bool gc_in_progress = false; +#define UNIX_INFLIGHT_TRIGGER_GC 16000 void wait_for_unix_gc(void) { + /* + * If number of inflight sockets is insane, + * force a garbage collect right now. + */ + if (unix_tot_inflight > UNIX_INFLIGHT_TRIGGER_GC && !gc_in_progress) + unix_gc(); wait_event(unix_gc_wait, gc_in_progress == false); } -- cgit v1.1 From c39508d6f118308355468314ff414644115a07f3 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 24 Nov 2010 11:47:22 -0800 Subject: tcp: Make TCP_MAXSEG minimum more correct. Use TCP_MIN_MSS instead of constant 64. Reported-by: Min Zhang Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 0814199..f15c36a 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2246,7 +2246,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, /* Values greater than interface MTU won't take effect. However * at the point when this call is done we typically don't yet * know which interface is going to be used */ - if (val < 64 || val > MAX_TCP_WINDOW) { + if (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW) { err = -EINVAL; break; } -- cgit v1.1 From fa0e846494792e722d817b9d3d625a4ef4896c96 Mon Sep 17 00:00:00 2001 From: Phil Blundell Date: Wed, 24 Nov 2010 11:49:19 -0800 Subject: econet: disallow NULL remote addr for sendmsg(), fixes CVE-2010-3849 Later parts of econet_sendmsg() rely on saddr != NULL, so return early with EINVAL if NULL was passed otherwise an oops may occur. Signed-off-by: Phil Blundell Signed-off-by: David S. Miller --- net/econet/af_econet.c | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c index f8c1ae4..e366f1b 100644 --- a/net/econet/af_econet.c +++ b/net/econet/af_econet.c @@ -297,23 +297,14 @@ static int econet_sendmsg(struct kiocb *iocb, struct socket *sock, mutex_lock(&econet_mutex); - if (saddr == NULL) { - struct econet_sock *eo = ec_sk(sk); - - addr.station = eo->station; - addr.net = eo->net; - port = eo->port; - cb = eo->cb; - } else { - if (msg->msg_namelen < sizeof(struct sockaddr_ec)) { - mutex_unlock(&econet_mutex); - return -EINVAL; - } - addr.station = saddr->addr.station; - addr.net = saddr->addr.net; - port = saddr->port; - cb = saddr->cb; - } + if (saddr == NULL || msg->msg_namelen < sizeof(struct sockaddr_ec)) { + mutex_unlock(&econet_mutex); + return -EINVAL; + } + addr.station = saddr->addr.station; + addr.net = saddr->addr.net; + port = saddr->port; + cb = saddr->cb; /* Look for a device with the right network number. */ dev = net2dev_map[addr.net]; @@ -351,7 +342,6 @@ static int econet_sendmsg(struct kiocb *iocb, struct socket *sock, eb = (struct ec_cb *)&skb->cb; - /* BUG: saddr may be NULL */ eb->cookie = saddr->cookie; eb->sec = *saddr; eb->sent = ec_tx_done; -- cgit v1.1 From 16c41745c7b92a243d0874f534c1655196c64b74 Mon Sep 17 00:00:00 2001 From: Phil Blundell Date: Wed, 24 Nov 2010 11:49:53 -0800 Subject: econet: fix CVE-2010-3850 Add missing check for capable(CAP_NET_ADMIN) in SIOCSIFADDR operation. Signed-off-by: Phil Blundell Signed-off-by: David S. Miller --- net/econet/af_econet.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c index e366f1b..d41ba8e 100644 --- a/net/econet/af_econet.c +++ b/net/econet/af_econet.c @@ -661,6 +661,9 @@ static int ec_dev_ioctl(struct socket *sock, unsigned int cmd, void __user *arg) err = 0; switch (cmd) { case SIOCSIFADDR: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + edev = dev->ec_ptr; if (edev == NULL) { /* Magic up a new one. */ -- cgit v1.1 From a27e13d370415add3487949c60810e36069a23a6 Mon Sep 17 00:00:00 2001 From: Phil Blundell Date: Wed, 24 Nov 2010 11:51:47 -0800 Subject: econet: fix CVE-2010-3848 Don't declare variable sized array of iovecs on the stack since this could cause stack overflow if msg->msgiovlen is large. Instead, coalesce the user-supplied data into a new buffer and use a single iovec for it. Signed-off-by: Phil Blundell Signed-off-by: David S. Miller --- net/econet/af_econet.c | 62 +++++++++++++++++++++++++------------------------- 1 file changed, 31 insertions(+), 31 deletions(-) (limited to 'net') diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c index d41ba8e..13992e1 100644 --- a/net/econet/af_econet.c +++ b/net/econet/af_econet.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -276,12 +277,12 @@ static int econet_sendmsg(struct kiocb *iocb, struct socket *sock, #endif #ifdef CONFIG_ECONET_AUNUDP struct msghdr udpmsg; - struct iovec iov[msg->msg_iovlen+1]; + struct iovec iov[2]; struct aunhdr ah; struct sockaddr_in udpdest; __kernel_size_t size; - int i; mm_segment_t oldfs; + char *userbuf; #endif /* @@ -319,17 +320,17 @@ static int econet_sendmsg(struct kiocb *iocb, struct socket *sock, } } - if (len + 15 > dev->mtu) { - mutex_unlock(&econet_mutex); - return -EMSGSIZE; - } - if (dev->type == ARPHRD_ECONET) { /* Real hardware Econet. We're not worthy etc. */ #ifdef CONFIG_ECONET_NATIVE unsigned short proto = 0; int res; + if (len + 15 > dev->mtu) { + mutex_unlock(&econet_mutex); + return -EMSGSIZE; + } + dev_hold(dev); skb = sock_alloc_send_skb(sk, len+LL_ALLOCATED_SPACE(dev), @@ -405,6 +406,11 @@ static int econet_sendmsg(struct kiocb *iocb, struct socket *sock, return -ENETDOWN; /* No socket - can't send */ } + if (len > 32768) { + err = -E2BIG; + goto error; + } + /* Make up a UDP datagram and hand it off to some higher intellect. */ memset(&udpdest, 0, sizeof(udpdest)); @@ -436,36 +442,26 @@ static int econet_sendmsg(struct kiocb *iocb, struct socket *sock, /* tack our header on the front of the iovec */ size = sizeof(struct aunhdr); - /* - * XXX: that is b0rken. We can't mix userland and kernel pointers - * in iovec, since on a lot of platforms copy_from_user() will - * *not* work with the kernel and userland ones at the same time, - * regardless of what we do with set_fs(). And we are talking about - * econet-over-ethernet here, so "it's only ARM anyway" doesn't - * apply. Any suggestions on fixing that code? -- AV - */ iov[0].iov_base = (void *)&ah; iov[0].iov_len = size; - for (i = 0; i < msg->msg_iovlen; i++) { - void __user *base = msg->msg_iov[i].iov_base; - size_t iov_len = msg->msg_iov[i].iov_len; - /* Check it now since we switch to KERNEL_DS later. */ - if (!access_ok(VERIFY_READ, base, iov_len)) { - mutex_unlock(&econet_mutex); - return -EFAULT; - } - iov[i+1].iov_base = base; - iov[i+1].iov_len = iov_len; - size += iov_len; + + userbuf = vmalloc(len); + if (userbuf == NULL) { + err = -ENOMEM; + goto error; } + iov[1].iov_base = userbuf; + iov[1].iov_len = len; + err = memcpy_fromiovec(userbuf, msg->msg_iov, len); + if (err) + goto error_free_buf; + /* Get a skbuff (no data, just holds our cb information) */ if ((skb = sock_alloc_send_skb(sk, 0, msg->msg_flags & MSG_DONTWAIT, - &err)) == NULL) { - mutex_unlock(&econet_mutex); - return err; - } + &err)) == NULL) + goto error_free_buf; eb = (struct ec_cb *)&skb->cb; @@ -481,7 +477,7 @@ static int econet_sendmsg(struct kiocb *iocb, struct socket *sock, udpmsg.msg_name = (void *)&udpdest; udpmsg.msg_namelen = sizeof(udpdest); udpmsg.msg_iov = &iov[0]; - udpmsg.msg_iovlen = msg->msg_iovlen + 1; + udpmsg.msg_iovlen = 2; udpmsg.msg_control = NULL; udpmsg.msg_controllen = 0; udpmsg.msg_flags=0; @@ -489,9 +485,13 @@ static int econet_sendmsg(struct kiocb *iocb, struct socket *sock, oldfs = get_fs(); set_fs(KERNEL_DS); /* More privs :-) */ err = sock_sendmsg(udpsock, &udpmsg, size); set_fs(oldfs); + +error_free_buf: + vfree(userbuf); #else err = -EPROTOTYPE; #endif + error: mutex_unlock(&econet_mutex); return err; -- cgit v1.1 From 4cb6a614ba0e58cae8abdadbf73bcb4d37a3f599 Mon Sep 17 00:00:00 2001 From: Tracey Dent Date: Sun, 21 Nov 2010 15:23:50 +0000 Subject: Net: ceph: Makefile: Remove unnessary code Remove the if and else conditional because the code is in mainline and there is no need in it being there. Signed-off-by: Tracey Dent Signed-off-by: David S. Miller --- net/ceph/Makefile | 22 ---------------------- 1 file changed, 22 deletions(-) (limited to 'net') diff --git a/net/ceph/Makefile b/net/ceph/Makefile index aab1cab..5f19415 100644 --- a/net/ceph/Makefile +++ b/net/ceph/Makefile @@ -1,9 +1,6 @@ # # Makefile for CEPH filesystem. # - -ifneq ($(KERNELRELEASE),) - obj-$(CONFIG_CEPH_LIB) += libceph.o libceph-objs := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \ @@ -16,22 +13,3 @@ libceph-objs := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \ ceph_fs.o ceph_strings.o ceph_hash.o \ pagevec.o -else -#Otherwise we were called directly from the command -# line; invoke the kernel build system. - -KERNELDIR ?= /lib/modules/$(shell uname -r)/build -PWD := $(shell pwd) - -default: all - -all: - $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_LIB=m modules - -modules_install: - $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_LIB=m modules_install - -clean: - $(MAKE) -C $(KERNELDIR) M=$(PWD) clean - -endif -- cgit v1.1 From 8475ef9fd16cadbfc692f78e608d1941a340beb2 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Mon, 22 Nov 2010 03:26:12 +0000 Subject: netns: Don't leak others' openreq-s in proc The /proc/net/tcp leaks openreq sockets from other namespaces. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 69ccbc1..e13da6d 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2043,7 +2043,9 @@ get_req: } get_sk: sk_nulls_for_each_from(sk, node) { - if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) { + if (!net_eq(sock_net(sk), net)) + continue; + if (sk->sk_family == st->family) { cur = sk; goto out; } -- cgit v1.1 From 0147fc058d11bd4009b126d09974d2c8f48fef15 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 22 Nov 2010 12:54:21 +0000 Subject: tcp: restrict net.ipv4.tcp_adv_win_scale (#20312) tcp_win_from_space() does the following: if (sysctl_tcp_adv_win_scale <= 0) return space >> (-sysctl_tcp_adv_win_scale); else return space - (space >> sysctl_tcp_adv_win_scale); "space" is int. As per C99 6.5.7 (3) shifting int for 32 or more bits is undefined behaviour. Indeed, if sysctl_tcp_adv_win_scale is exactly 32, space >> 32 equals space and function returns 0. Which means we busyloop in tcp_fixup_rcvbuf(). Restrict net.ipv4.tcp_adv_win_scale to [-31, 31]. Fix https://bugzilla.kernel.org/show_bug.cgi?id=20312 Steps to reproduce: echo 32 >/proc/sys/net/ipv4/tcp_adv_win_scale wget www.kernel.org [softlockup] Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- net/ipv4/sysctl_net_ipv4.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index e91911d..1b4ec21 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -26,6 +26,8 @@ static int zero; static int tcp_retr1_max = 255; static int ip_local_port_range_min[] = { 1, 1 }; static int ip_local_port_range_max[] = { 65535, 65535 }; +static int tcp_adv_win_scale_min = -31; +static int tcp_adv_win_scale_max = 31; /* Update system visible IP port range */ static void set_local_port_range(int range[2]) @@ -426,7 +428,9 @@ static struct ctl_table ipv4_table[] = { .data = &sysctl_tcp_adv_win_scale, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dointvec_minmax, + .extra1 = &tcp_adv_win_scale_min, + .extra2 = &tcp_adv_win_scale_max, }, { .procname = "tcp_tw_reuse", -- cgit v1.1 From 0ac78870220b6e0ac74dd9292bcfa7b18718babd Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Tue, 23 Nov 2010 02:36:56 +0000 Subject: dccp: fix error in updating the GAR This fixes a bug in updating the Greatest Acknowledgment number Received (GAR): the current implementation does not track the greatest received value - lower values in the range AWL..AWH (RFC 4340, 7.5.1) erase higher ones. Signed-off-by: Gerrit Renker Signed-off-by: David S. Miller --- net/dccp/input.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/dccp/input.c b/net/dccp/input.c index 2659853..e424a09 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -239,7 +239,8 @@ static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb) dccp_update_gsr(sk, seqno); if (dh->dccph_type != DCCP_PKT_SYNC && - (ackno != DCCP_PKT_WITHOUT_ACK_SEQ)) + ackno != DCCP_PKT_WITHOUT_ACK_SEQ && + after48(ackno, dp->dccps_gar)) dp->dccps_gar = ackno; } else { unsigned long now = jiffies; -- cgit v1.1 From 3c6f27bf33052ea6ba9d82369fb460726fb779c0 Mon Sep 17 00:00:00 2001 From: Dan Rosenberg Date: Tue, 23 Nov 2010 11:02:13 +0000 Subject: DECnet: don't leak uninitialized stack byte A single uninitialized padding byte is leaked to userspace. Signed-off-by: Dan Rosenberg CC: stable Signed-off-by: David S. Miller --- net/decnet/af_decnet.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index a76b78d..6f97268 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -1556,6 +1556,8 @@ static int __dn_getsockopt(struct socket *sock, int level,int optname, char __us if (r_len > sizeof(struct linkinfo_dn)) r_len = sizeof(struct linkinfo_dn); + memset(&link, 0, sizeof(link)); + switch(sock->state) { case SS_CONNECTING: link.idn_linkstate = LL_CONNECTING; -- cgit v1.1 From b4ff3c90e6066bacc8a92111752fe9e4f4c45cca Mon Sep 17 00:00:00 2001 From: Nagendra Tomar Date: Fri, 26 Nov 2010 14:26:27 +0000 Subject: inet: Fix __inet_inherit_port() to correctly increment bsockets and num_owners inet sockets corresponding to passive connections are added to the bind hash using ___inet_inherit_port(). These sockets are later removed from the bind hash using __inet_put_port(). These two functions are not exactly symmetrical. __inet_put_port() decrements hashinfo->bsockets and tb->num_owners, whereas ___inet_inherit_port() does not increment them. This results in both of these going to -ve values. This patch fixes this by calling inet_bind_hash() from ___inet_inherit_port(), which does the right thing. 'bsockets' and 'num_owners' were introduced by commit a9d8f9110d7e953c (inet: Allowing more than 64k connections and heavily optimize bind(0)) Signed-off-by: Nagendra Singh Tomar Acked-by: Eric Dumazet Acked-by: Evgeniy Polyakov Signed-off-by: David S. Miller --- net/ipv4/inet_hashtables.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 1b344f3..3c0369a 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -133,8 +133,7 @@ int __inet_inherit_port(struct sock *sk, struct sock *child) } } } - sk_add_bind_node(child, &tb->owners); - inet_csk(child)->icsk_bind_hash = tb; + inet_bind_hash(child, tb, port); spin_unlock(&head->lock); return 0; -- cgit v1.1 From 25888e30319f8896fc656fc68643e6a078263060 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 25 Nov 2010 04:11:39 +0000 Subject: af_unix: limit recursion level MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Its easy to eat all kernel memory and trigger NMI watchdog, using an exploit program that queues unix sockets on top of others. lkml ref : http://lkml.org/lkml/2010/11/25/8 This mechanism is used in applications, one choice we have is to have a recursion limit. Other limits might be needed as well (if we queue other types of files), since the passfd mechanism is currently limited by socket receive queue sizes only. Add a recursion_level to unix socket, allowing up to 4 levels. Each time we send an unix socket through sendfd mechanism, we copy its recursion level (plus one) to receiver. This recursion level is cleared when socket receive queue is emptied. Reported-by: Марк Коренберг Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/unix/af_unix.c | 37 ++++++++++++++++++++++++++++++++----- net/unix/garbage.c | 2 +- 2 files changed, 33 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 3c95304..2268e67 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1343,9 +1343,25 @@ static void unix_destruct_scm(struct sk_buff *skb) sock_wfree(skb); } +#define MAX_RECURSION_LEVEL 4 + static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) { int i; + unsigned char max_level = 0; + int unix_sock_count = 0; + + for (i = scm->fp->count - 1; i >= 0; i--) { + struct sock *sk = unix_get_socket(scm->fp->fp[i]); + + if (sk) { + unix_sock_count++; + max_level = max(max_level, + unix_sk(sk)->recursion_level); + } + } + if (unlikely(max_level > MAX_RECURSION_LEVEL)) + return -ETOOMANYREFS; /* * Need to duplicate file references for the sake of garbage @@ -1356,9 +1372,11 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) if (!UNIXCB(skb).fp) return -ENOMEM; - for (i = scm->fp->count-1; i >= 0; i--) - unix_inflight(scm->fp->fp[i]); - return 0; + if (unix_sock_count) { + for (i = scm->fp->count - 1; i >= 0; i--) + unix_inflight(scm->fp->fp[i]); + } + return max_level; } static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds) @@ -1393,6 +1411,7 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock, struct sk_buff *skb; long timeo; struct scm_cookie tmp_scm; + int max_level; if (NULL == siocb->scm) siocb->scm = &tmp_scm; @@ -1431,8 +1450,9 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock, goto out; err = unix_scm_to_skb(siocb->scm, skb, true); - if (err) + if (err < 0) goto out_free; + max_level = err + 1; unix_get_secdata(siocb->scm, skb); skb_reset_transport_header(skb); @@ -1514,6 +1534,8 @@ restart: if (sock_flag(other, SOCK_RCVTSTAMP)) __net_timestamp(skb); skb_queue_tail(&other->sk_receive_queue, skb); + if (max_level > unix_sk(other)->recursion_level) + unix_sk(other)->recursion_level = max_level; unix_state_unlock(other); other->sk_data_ready(other, len); sock_put(other); @@ -1544,6 +1566,7 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, int sent = 0; struct scm_cookie tmp_scm; bool fds_sent = false; + int max_level; if (NULL == siocb->scm) siocb->scm = &tmp_scm; @@ -1607,10 +1630,11 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, /* Only send the fds in the first buffer */ err = unix_scm_to_skb(siocb->scm, skb, !fds_sent); - if (err) { + if (err < 0) { kfree_skb(skb); goto out_err; } + max_level = err + 1; fds_sent = true; err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size); @@ -1626,6 +1650,8 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, goto pipe_err_free; skb_queue_tail(&other->sk_receive_queue, skb); + if (max_level > unix_sk(other)->recursion_level) + unix_sk(other)->recursion_level = max_level; unix_state_unlock(other); other->sk_data_ready(other, size); sent += size; @@ -1845,6 +1871,7 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, unix_state_lock(sk); skb = skb_dequeue(&sk->sk_receive_queue); if (skb == NULL) { + unix_sk(sk)->recursion_level = 0; if (copied >= target) goto unlock; diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 40df93d..f89f83b 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -96,7 +96,7 @@ static DECLARE_WAIT_QUEUE_HEAD(unix_gc_wait); unsigned int unix_tot_inflight; -static struct sock *unix_get_socket(struct file *filp) +struct sock *unix_get_socket(struct file *filp) { struct sock *u_sock = NULL; struct inode *inode = filp->f_path.dentry->d_inode; -- cgit v1.1 From 7dff3125534c1d035a910052335a3a39fbb31aa7 Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Fri, 26 Nov 2010 20:41:55 +0200 Subject: mac80211: Fix frame injection using non-AP vif In order for frame injection to work properly for some use cases (e.g., finding the station entry and keys for encryption), mac80211 needs to find the correct sdata entry. This works when the main vif is in AP mode, but commit a2c1e3dad516618cb0fbfb1a62c36d0b0744573a broke this particular use case for station main vif. While this type of injection is quite unusual operation, it has some uses and we should fix it. Do this by changing the monitor vif sdata selection to allow station vif to be selected instead of limiting it to just AP vifs. We still need to skip some iftypes to avoid selecting unsuitable vif for injection. Signed-off-by: Jouni Malinen Signed-off-by: John W. Linville --- net/mac80211/tx.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 96c5943..df6aac5 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1587,7 +1587,12 @@ static void ieee80211_xmit(struct ieee80211_sub_if_data *sdata, list) { if (!ieee80211_sdata_running(tmp_sdata)) continue; - if (tmp_sdata->vif.type != NL80211_IFTYPE_AP) + if (tmp_sdata->vif.type == + NL80211_IFTYPE_MONITOR || + tmp_sdata->vif.type == + NL80211_IFTYPE_AP_VLAN || + tmp_sdata->vif.type == + NL80211_IFTYPE_WDS) continue; if (compare_ether_addr(tmp_sdata->vif.addr, hdr->addr2) == 0) { -- cgit v1.1 From 2c31333a8fde7e26936a9f5371d02ff12c490993 Mon Sep 17 00:00:00 2001 From: Christian Lamparter Date: Mon, 29 Nov 2010 20:53:23 +0100 Subject: mac80211: ignore non-bcast mcast deauth/disassoc franes This patch fixes an curious issue due to insufficient rx frame filtering. Saqeb Akhter reported frequent disconnects while streaming videos over samba: > [ 1166.512087] wlan1: deauthenticated from 30:46:9a:10:49:f7 (Reason: 7) > [ 1526.059997] wlan1: deauthenticated from 30:46:9a:10:49:f7 (Reason: 7) > [ 2125.324356] wlan1: deauthenticated from 30:46:9a:10:49:f7 (Reason: 7) > [...] The reason is that the device generates frames with slightly bogus SA/TA addresses. e.g.: [ 2314.402316] Ignore 9f:1f:31:f8:64:ff [ 2314.402321] Ignore 9f:1f:31:f8:64:ff [ 2352.453804] Ignore 0d:1f:31:f8:64:ff [ 2352.453808] Ignore 0d:1f:31:f8:64:ff ^^ the group-address flag is set! (the correct SA/TA would be: 00:1f:31:f8:64:ff) Since the AP does not know from where the frames come, it generates a DEAUTH response for the (invalid) mcast address. This mcast deauth frame then passes through all filters and tricks the stack into thinking that the AP brutally kicked us! This patch fixes the problem by simply ignoring non-broadcast, group-addressed deauth/disassoc frames. Cc: Jouni Malinen Cc: Johannes Berg Reported-by: Saqeb Akhter Signed-off-by: Christian Lamparter Signed-off-by: John W. Linville --- net/mac80211/rx.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 902b03e..3c87293 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2247,6 +2247,10 @@ ieee80211_rx_h_mgmt(struct ieee80211_rx_data *rx) break; case cpu_to_le16(IEEE80211_STYPE_DEAUTH): case cpu_to_le16(IEEE80211_STYPE_DISASSOC): + if (is_multicast_ether_addr(mgmt->da) && + !is_broadcast_ether_addr(mgmt->da)) + return RX_DROP_MONITOR; + /* process only for station */ if (sdata->vif.type != NL80211_IFTYPE_STATION) return RX_DROP_MONITOR; -- cgit v1.1 From 8e26d5ad2f9c038609d42eebc676cd1107709eef Mon Sep 17 00:00:00 2001 From: Senthil Balasubramanian Date: Tue, 30 Nov 2010 20:15:38 +0530 Subject: mac80211: Fix STA disconnect due to MIC failure Th commit titled "mac80211: clean up rx handling wrt. found_sta" removed found_sta variable which caused a MIC failure event to be reported twice for a single failure to supplicant resulted in STA disconnect. This should fix WPA specific countermeasures WiFi test case (5.2.17) issues with mac80211 based drivers which report MIC failure events in rx status. Cc: Stable (2.6.37) Signed-off-by: Senthil Balasubramanian Signed-off-by: John W. Linville --- net/mac80211/rx.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 3c87293..54fb4a0e 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2745,6 +2745,7 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, if (ieee80211_prepare_and_rx_handle(&rx, skb, true)) return; + goto out; } } @@ -2784,6 +2785,7 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, return; } + out: dev_kfree_skb(skb); } -- cgit v1.1 From 381601e5bbae78d7c18d946fe874a63957edea13 Mon Sep 17 00:00:00 2001 From: Anders Franzen Date: Wed, 24 Nov 2010 05:47:18 +0000 Subject: Make the ip6_tunnel reflect the true mtu. The ip6_tunnel always assumes it consumes 40 bytes (ip6 hdr) of the mtu of the underlaying device. So for a normal ethernet bearer, the mtu of the ip6_tunnel is 1460. However, when creating a tunnel the encap limit option is enabled by default, and it consumes 8 bytes more, so the true mtu shall be 1452. I dont really know if this breaks some statement in some RFC, so this is a request for comments. Signed-off-by: Anders Franzen Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net') diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 2a59610..70e891a 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1175,6 +1175,8 @@ static void ip6_tnl_link_config(struct ip6_tnl *t) sizeof (struct ipv6hdr); dev->mtu = rt->rt6i_dev->mtu - sizeof (struct ipv6hdr); + if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) + dev->mtu-=8; if (dev->mtu < IPV6_MIN_MTU) dev->mtu = IPV6_MIN_MTU; @@ -1363,12 +1365,17 @@ static const struct net_device_ops ip6_tnl_netdev_ops = { static void ip6_tnl_dev_setup(struct net_device *dev) { + struct ip6_tnl *t; + dev->netdev_ops = &ip6_tnl_netdev_ops; dev->destructor = ip6_dev_free; dev->type = ARPHRD_TUNNEL6; dev->hard_header_len = LL_MAX_HEADER + sizeof (struct ipv6hdr); dev->mtu = ETH_DATA_LEN - sizeof (struct ipv6hdr); + t = netdev_priv(dev); + if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) + dev->mtu-=8; dev->flags |= IFF_NOARP; dev->addr_len = sizeof(struct in6_addr); dev->features |= NETIF_F_NETNS_LOCAL; -- cgit v1.1 From 6dcdd1b3694a4fa2b85167a9c860c7613a7553c7 Mon Sep 17 00:00:00 2001 From: David McCullough Date: Mon, 29 Nov 2010 19:32:34 +0000 Subject: net/ipv6/sit.c: return unhandled skb to tunnel4_rcv I found a problem using an IPv6 over IPv4 tunnel. When CONFIG_IPV6_SIT was enabled, the packets would be rejected as net/ipv6/sit.c was catching all IPPROTO_IPV6 packets and returning an ICMP port unreachable error. I think this patch fixes the problem cleanly. I believe the code in net/ipv4/tunnel4.c:tunnel4_rcv takes care of it properly if none of the handlers claim the skb. Signed-off-by: David McCullough Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/sit.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index d6bfaec..8c4d00c 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -606,8 +606,9 @@ static int ipip6_rcv(struct sk_buff *skb) return 0; } - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); + /* no tunnel matched, let upstream know, ipsec may handle it */ rcu_read_unlock(); + return 1; out: kfree_skb(skb); return 0; -- cgit v1.1 From 0bae35e14b68f5e7075bc96e5ea608b42bdf8f59 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 2 Dec 2010 14:31:14 -0800 Subject: leds: fix up dependencies It's not useful to build LED triggers when there's no LEDs that can be triggered by them. Therefore, fix up the dependencies so that this cannot happen, and fix a few users that select triggers to depend on LEDS_CLASS as well (there is also one user that also selects LEDS_CLASS, which is OK). Signed-off-by: Johannes Berg Reported-by: Randy Dunlap Acked-by: Randy Dunlap Tested-by: Ingo Molnar Cc: Arnd Hannemann Cc: Michal Hocko Cc: Richard Purdie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- net/mac80211/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/Kconfig b/net/mac80211/Kconfig index 4d6f865..8e8ea9c 100644 --- a/net/mac80211/Kconfig +++ b/net/mac80211/Kconfig @@ -92,7 +92,7 @@ config MAC80211_MESH config MAC80211_LEDS bool "Enable LED triggers" depends on MAC80211 - select NEW_LEDS + depends on LEDS_CLASS select LEDS_TRIGGERS ---help--- This option enables a few LED triggers for different -- cgit v1.1 From 46bcf14f44d8f31ecfdc8b6708ec15a3b33316d9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 6 Dec 2010 09:29:43 -0800 Subject: filter: fix sk_filter rcu handling Pavel Emelyanov tried to fix a race between sk_filter_(de|at)tach and sk_clone() in commit 47e958eac280c263397 Problem is we can have several clones sharing a common sk_filter, and these clones might want to sk_filter_attach() their own filters at the same time, and can overwrite old_filter->rcu, corrupting RCU queues. We can not use filter->rcu without being sure no other thread could do the same thing. Switch code to a more conventional ref-counting technique : Do the atomic decrement immediately and queue one rcu call back when last reference is released. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/filter.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index c1ee800..ae21a0d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -589,23 +589,16 @@ int sk_chk_filter(struct sock_filter *filter, int flen) EXPORT_SYMBOL(sk_chk_filter); /** - * sk_filter_rcu_release - Release a socket filter by rcu_head + * sk_filter_release_rcu - Release a socket filter by rcu_head * @rcu: rcu_head that contains the sk_filter to free */ -static void sk_filter_rcu_release(struct rcu_head *rcu) +void sk_filter_release_rcu(struct rcu_head *rcu) { struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu); - sk_filter_release(fp); -} - -static void sk_filter_delayed_uncharge(struct sock *sk, struct sk_filter *fp) -{ - unsigned int size = sk_filter_len(fp); - - atomic_sub(size, &sk->sk_omem_alloc); - call_rcu_bh(&fp->rcu, sk_filter_rcu_release); + kfree(fp); } +EXPORT_SYMBOL(sk_filter_release_rcu); /** * sk_attach_filter - attach a socket filter @@ -649,7 +642,7 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) rcu_assign_pointer(sk->sk_filter, fp); if (old_fp) - sk_filter_delayed_uncharge(sk, old_fp); + sk_filter_uncharge(sk, old_fp); return 0; } EXPORT_SYMBOL_GPL(sk_attach_filter); @@ -663,7 +656,7 @@ int sk_detach_filter(struct sock *sk) sock_owned_by_user(sk)); if (filter) { rcu_assign_pointer(sk->sk_filter, NULL); - sk_filter_delayed_uncharge(sk, filter); + sk_filter_uncharge(sk, filter); ret = 0; } return ret; -- cgit v1.1 From ed2849d3ecfa339435818eeff28f6c3424300cec Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 16 Nov 2010 16:55:19 +1100 Subject: sunrpc: prevent use-after-free on clearing XPT_BUSY When an xprt is created, it has a refcount of 1, and XPT_BUSY is set. The refcount is *not* owned by the thread that created the xprt (as is clear from the fact that creators never put the reference). Rather, it is owned by the absence of XPT_DEAD. Once XPT_DEAD is set, (And XPT_BUSY is clear) that initial reference is dropped and the xprt can be freed. So when a creator clears XPT_BUSY it is dropping its only reference and so must not touch the xprt again. However svc_recv, after calling ->xpo_accept (and so getting an XPT_BUSY reference on a new xprt), calls svc_xprt_recieved. This clears XPT_BUSY and then svc_xprt_enqueue - this last without owning a reference. This is dangerous and has been seen to leave svc_xprt_enqueue working with an xprt containing garbage. So we need to hold an extra counted reference over that call to svc_xprt_received. For safety, any time we clear XPT_BUSY and then use the xprt again, we first get a reference, and the put it again afterwards. Note that svc_close_all does not need this extra protection as there are no threads running, and the final free can only be called asynchronously from such a thread. Signed-off-by: NeilBrown Cc: stable@kernel.org Signed-off-by: J. Bruce Fields --- net/sunrpc/svc_xprt.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index ea2ff78..3f2c555 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -212,6 +212,7 @@ int svc_create_xprt(struct svc_serv *serv, const char *xprt_name, spin_lock(&svc_xprt_class_lock); list_for_each_entry(xcl, &svc_xprt_class_list, xcl_list) { struct svc_xprt *newxprt; + unsigned short newport; if (strcmp(xprt_name, xcl->xcl_name)) continue; @@ -230,8 +231,9 @@ int svc_create_xprt(struct svc_serv *serv, const char *xprt_name, spin_lock_bh(&serv->sv_lock); list_add(&newxprt->xpt_list, &serv->sv_permsocks); spin_unlock_bh(&serv->sv_lock); + newport = svc_xprt_local_port(newxprt); clear_bit(XPT_BUSY, &newxprt->xpt_flags); - return svc_xprt_local_port(newxprt); + return newport; } err: spin_unlock(&svc_xprt_class_lock); @@ -425,8 +427,13 @@ void svc_xprt_received(struct svc_xprt *xprt) { BUG_ON(!test_bit(XPT_BUSY, &xprt->xpt_flags)); xprt->xpt_pool = NULL; + /* As soon as we clear busy, the xprt could be closed and + * 'put', so we need a reference to call svc_xprt_enqueue with: + */ + svc_xprt_get(xprt); clear_bit(XPT_BUSY, &xprt->xpt_flags); svc_xprt_enqueue(xprt); + svc_xprt_put(xprt); } EXPORT_SYMBOL_GPL(svc_xprt_received); -- cgit v1.1 From b1afde60f2b9ee8444fba4e012dc99a3b28d224d Mon Sep 17 00:00:00 2001 From: Nandita Dukkipati Date: Fri, 3 Dec 2010 13:33:44 +0000 Subject: tcp: Bug fix in initialization of receive window. The bug has to do with boundary checks on the initial receive window. If the initial receive window falls between init_cwnd and the receive window specified by the user, the initial window is incorrectly brought down to init_cwnd. The correct behavior is to allow it to remain unchanged. Signed-off-by: Nandita Dukkipati Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 05b1ecf..3c59ab4 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -231,11 +231,10 @@ void tcp_select_initial_window(int __space, __u32 mss, /* when initializing use the value from init_rcv_wnd * rather than the default from above */ - if (init_rcv_wnd && - (*rcv_wnd > init_rcv_wnd * mss)) - *rcv_wnd = init_rcv_wnd * mss; - else if (*rcv_wnd > init_cwnd * mss) - *rcv_wnd = init_cwnd * mss; + if (init_rcv_wnd) + *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss); + else + *rcv_wnd = min(*rcv_wnd, init_cwnd * mss); } /* Set the clamp no higher than max representable value */ -- cgit v1.1 From 35d9b0c906ad92d32a0b8db5daa6fabfcc2f068d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 5 Dec 2010 02:03:26 +0000 Subject: llc: fix a device refcount imbalance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Le dimanche 05 décembre 2010 à 12:23 +0100, Eric Dumazet a écrit : > Le dimanche 05 décembre 2010 à 09:19 +0100, Eric Dumazet a écrit : > > > Hmm.. > > > > If somebody can explain why RTNL is held in arp_ioctl() (and therefore > > in arp_req_delete()), we might first remove RTNL use in arp_ioctl() so > > that your patch can be applied. > > > > Right now it is not good, because RTNL wont be necessarly held when you > > are going to call arp_invalidate() ? > > While doing this analysis, I found a refcount bug in llc, I'll send a > patch for net-2.6 Oh well, of course I must first fix the bug in net-2.6, and wait David pull the fix in net-next-2.6 before sending this rcu conversion. Note: this patch should be sent to stable teams (2.6.34 and up) [PATCH net-2.6] llc: fix a device refcount imbalance commit abf9d537fea225 (llc: add support for SO_BINDTODEVICE) added one refcount imbalance in llc_ui_bind(), because dev_getbyhwaddr() doesnt take a reference on device, while dev_get_by_index() does. Fix this using RCU locking. And since an RCU conversion will be done for 2.6.38 for dev_getbyhwaddr(), put the rcu_read_lock/unlock exactly at their final place. Signed-off-by: Eric Dumazet Cc: stable@kernel.org Cc: Octavian Purdila Signed-off-by: David S. Miller --- net/llc/af_llc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 5826129..e35dbe5 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -317,8 +317,9 @@ static int llc_ui_bind(struct socket *sock, struct sockaddr *uaddr, int addrlen) goto out; rc = -ENODEV; rtnl_lock(); + rcu_read_lock(); if (sk->sk_bound_dev_if) { - llc->dev = dev_get_by_index(&init_net, sk->sk_bound_dev_if); + llc->dev = dev_get_by_index_rcu(&init_net, sk->sk_bound_dev_if); if (llc->dev) { if (!addr->sllc_arphrd) addr->sllc_arphrd = llc->dev->type; @@ -329,13 +330,13 @@ static int llc_ui_bind(struct socket *sock, struct sockaddr *uaddr, int addrlen) !llc_mac_match(addr->sllc_mac, llc->dev->dev_addr)) { rc = -EINVAL; - dev_put(llc->dev); llc->dev = NULL; } } } else llc->dev = dev_getbyhwaddr(&init_net, addr->sllc_arphrd, addr->sllc_mac); + rcu_read_unlock(); rtnl_unlock(); if (!llc->dev) goto out; -- cgit v1.1 From 0c62fc6dd02c8d793c75ae76a9b6881fc36388ad Mon Sep 17 00:00:00 2001 From: Nelson Elhage Date: Wed, 8 Dec 2010 10:13:55 -0800 Subject: econet: Do the correct cleanup after an unprivileged SIOCSIFADDR. We need to drop the mutex and do a dev_put, so set an error code and break like the other paths, instead of returning directly. Signed-off-by: Nelson Elhage Signed-off-by: David S. Miller --- net/econet/af_econet.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c index 13992e1..f180371 100644 --- a/net/econet/af_econet.c +++ b/net/econet/af_econet.c @@ -661,8 +661,10 @@ static int ec_dev_ioctl(struct socket *sock, unsigned int cmd, void __user *arg) err = 0; switch (cmd) { case SIOCSIFADDR: - if (!capable(CAP_NET_ADMIN)) - return -EPERM; + if (!capable(CAP_NET_ADMIN)) { + err = -EPERM; + break; + } edev = dev->ec_ptr; if (edev == NULL) { -- cgit v1.1 From e8d34a884e4ff118920bb57664def8a73b1b784f Mon Sep 17 00:00:00 2001 From: Michal Marek Date: Mon, 6 Dec 2010 02:39:12 +0000 Subject: l2tp: Fix modalias of l2tp_ip Using the SOCK_DGRAM enum results in "net-pf-2-proto-SOCK_DGRAM-type-115", so use the numeric value like it is done in net/dccp. Signed-off-by: Michal Marek Signed-off-by: David S. Miller --- net/l2tp/l2tp_ip.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index 0bf6a59..522e219 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -674,4 +674,8 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("James Chapman "); MODULE_DESCRIPTION("L2TP over IP"); MODULE_VERSION("1.0"); -MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET, SOCK_DGRAM, IPPROTO_L2TP); + +/* Use the value of SOCK_DGRAM (2) directory, because __stringify does't like + * enums + */ +MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET, 2, IPPROTO_L2TP); -- cgit v1.1 From 171995e5d82dcc92bea37a7d2a2ecc21068a0f19 Mon Sep 17 00:00:00 2001 From: Apollon Oikonomopoulos Date: Tue, 7 Dec 2010 09:43:30 +0000 Subject: x25: decrement netdev reference counts on unload x25 does not decrement the network device reference counts on module unload. Thus unregistering any pre-existing interface after unloading the x25 module hangs and results in unregister_netdevice: waiting for tap0 to become free. Usage count = 1 This patch decrements the reference counts of all interfaces in x25_link_free, the way it is already done in x25_link_device_down for NETDEV_DOWN events. Signed-off-by: Apollon Oikonomopoulos Signed-off-by: David S. Miller --- net/x25/x25_link.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/x25/x25_link.c b/net/x25/x25_link.c index 73e7b95..b25c646 100644 --- a/net/x25/x25_link.c +++ b/net/x25/x25_link.c @@ -394,6 +394,7 @@ void __exit x25_link_free(void) list_for_each_safe(entry, tmp, &x25_neigh_list) { nb = list_entry(entry, struct x25_neigh, node); __x25_remove_neigh(nb); + dev_put(nb->dev); } write_unlock_bh(&x25_neigh_list_lock); } -- cgit v1.1 From 67631510a318d5a930055fe927607f483716e100 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Wed, 8 Dec 2010 12:16:33 -0800 Subject: tcp: Replace time wait bucket msg by counter Rather than printing the message to the log, use a mib counter to keep track of the count of occurences of time wait bucket overflow. Reduces spam in logs. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/ipv4/proc.c | 1 + net/ipv4/tcp_minisocks.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 1b48eb1..b14ec7d 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -253,6 +253,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPMinTTLDrop", LINUX_MIB_TCPMINTTLDROP), SNMP_MIB_ITEM("TCPDeferAcceptDrop", LINUX_MIB_TCPDEFERACCEPTDROP), SNMP_MIB_ITEM("IPReversePathFilter", LINUX_MIB_IPRPFILTER), + SNMP_MIB_ITEM("TCPTimeWaitOverflow", LINUX_MIB_TCPTIMEWAITOVERFLOW), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 43cf901..a66735f 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -347,7 +347,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) * socket up. We've got bigger problems than * non-graceful socket closings. */ - LIMIT_NETDEBUG(KERN_INFO "TCP: time wait bucket table overflow\n"); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPTIMEWAITOVERFLOW); } tcp_update_metrics(sk); -- cgit v1.1 From 7e2447075690860e2cea96b119fc9cadbaa7e83c Mon Sep 17 00:00:00 2001 From: Helmut Schaa Date: Thu, 2 Dec 2010 18:44:09 +0100 Subject: mac80211: Fix BUG in pskb_expand_head when transmitting shared skbs mac80211 doesn't handle shared skbs correctly at the moment. As a result a possible resize can trigger a BUG in pskb_expand_head. [ 676.030000] Kernel bug detected[#1]: [ 676.030000] Cpu 0 [ 676.030000] $ 0 : 00000000 00000000 819662ff 00000002 [ 676.030000] $ 4 : 81966200 00000020 00000000 00000020 [ 676.030000] $ 8 : 819662e0 800043c0 00000002 00020000 [ 676.030000] $12 : 3b9aca00 00000000 00000000 00470000 [ 676.030000] $16 : 80ea2000 00000000 00000000 00000000 [ 676.030000] $20 : 818aa200 80ea2018 80ea2000 00000008 [ 676.030000] $24 : 00000002 800ace5c [ 676.030000] $28 : 8199a000 8199bd20 81938f88 80f180d4 [ 676.030000] Hi : 0000026e [ 676.030000] Lo : 0000757e [ 676.030000] epc : 801245e4 pskb_expand_head+0x44/0x1d8 [ 676.030000] Not tainted [ 676.030000] ra : 80f180d4 ieee80211_skb_resize+0xb0/0x114 [mac80211] [ 676.030000] Status: 1000a403 KERNEL EXL IE [ 676.030000] Cause : 10800024 [ 676.030000] PrId : 0001964c (MIPS 24Kc) [ 676.030000] Modules linked in: mac80211_hwsim rt2800lib rt2x00soc rt2x00pci rt2x00lib mac80211 crc_itu_t crc_ccitt cfg80211 compat arc4 aes_generic deflate ecb cbc [last unloaded: rt2800pci] [ 676.030000] Process kpktgend_0 (pid: 97, threadinfo=8199a000, task=81879f48, tls=00000000) [ 676.030000] Stack : ffffffff 00000000 00000000 00000014 00000004 80ea2000 00000000 00000000 [ 676.030000] 818aa200 80f180d4 ffffffff 0000000a 81879f78 81879f48 81879f48 00000018 [ 676.030000] 81966246 80ea2000 818432e0 80f1a420 80203050 81814d98 00000001 81879f48 [ 676.030000] 81879f48 00000018 81966246 818432e0 0000001a 8199bdd4 0000001c 80f1b72c [ 676.030000] 80203020 8001292c 80ef4aa2 7f10b55d 801ab5b8 81879f48 00000188 80005c90 [ 676.030000] ... [ 676.030000] Call Trace: [ 676.030000] [<801245e4>] pskb_expand_head+0x44/0x1d8 [ 676.030000] [<80f180d4>] ieee80211_skb_resize+0xb0/0x114 [mac80211] [ 676.030000] [<80f1a420>] ieee80211_xmit+0x150/0x22c [mac80211] [ 676.030000] [<80f1b72c>] ieee80211_subif_start_xmit+0x6f4/0x73c [mac80211] [ 676.030000] [<8014361c>] pktgen_thread_worker+0xfac/0x16f8 [ 676.030000] [<8002ebe8>] kthread+0x7c/0x88 [ 676.030000] [<80008e0c>] kernel_thread_helper+0x10/0x18 [ 676.030000] [ 676.030000] [ 676.030000] Code: 24020001 10620005 2502001f <0200000d> 0804917a 00000000 2502001f 00441023 00531021 Fix this by making a local copy of shared skbs prior to mangeling them. To avoid copying the skb unnecessarily move the skb_copy call below the checks that don't need write access to the skb. Also, move the assignment of nh_pos and h_pos below the skb_copy to point to the correct skb. It would be possible to avoid another resize of the copied skb by using skb_copy_expand instead of skb_copy but that would make the patch more complex. Also, shared skbs are a corner case right now, so the resize shouldn't matter much. Cc: Johannes Berg Signed-off-by: Helmut Schaa Cc: stable@kernel.org Signed-off-by: John W. Linville --- net/mac80211/tx.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index df6aac5..7a637b8 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1737,15 +1737,13 @@ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb, int nh_pos, h_pos; struct sta_info *sta = NULL; u32 sta_flags = 0; + struct sk_buff *tmp_skb; if (unlikely(skb->len < ETH_HLEN)) { ret = NETDEV_TX_OK; goto fail; } - nh_pos = skb_network_header(skb) - skb->data; - h_pos = skb_transport_header(skb) - skb->data; - /* convert Ethernet header to proper 802.11 header (based on * operation mode) */ ethertype = (skb->data[12] << 8) | skb->data[13]; @@ -1918,6 +1916,20 @@ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb, goto fail; } + /* + * If the skb is shared we need to obtain our own copy. + */ + if (skb_shared(skb)) { + tmp_skb = skb; + skb = skb_copy(skb, GFP_ATOMIC); + kfree_skb(tmp_skb); + + if (!skb) { + ret = NETDEV_TX_OK; + goto fail; + } + } + hdr.frame_control = fc; hdr.duration_id = 0; hdr.seq_ctrl = 0; @@ -1936,6 +1948,9 @@ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb, encaps_len = 0; } + nh_pos = skb_network_header(skb) - skb->data; + h_pos = skb_transport_header(skb) - skb->data; + skb_pull(skb, skip_header_bytes); nh_pos -= skip_header_bytes; h_pos -= skip_header_bytes; -- cgit v1.1 From ad9f4f50fe9288bbe65b7dfd76d8820afac6a24c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 7 Dec 2010 12:03:55 +0000 Subject: tcp: avoid a possible divide by zero sysctl_tcp_tso_win_divisor might be set to zero while one cpu runs in tcp_tso_should_defer(). Make sure we dont allow a divide by zero by reading sysctl_tcp_tso_win_divisor exactly once. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 3c59ab4..0d4a3ce 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1512,6 +1512,7 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); u32 send_win, cong_win, limit, in_flight; + int win_divisor; if (TCP_SKB_CB(skb)->flags & TCPHDR_FIN) goto send_now; @@ -1543,13 +1544,14 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len)) goto send_now; - if (sysctl_tcp_tso_win_divisor) { + win_divisor = ACCESS_ONCE(sysctl_tcp_tso_win_divisor); + if (win_divisor) { u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache); /* If at least some fraction of a window is available, * just use it. */ - chunk /= sysctl_tcp_tso_win_divisor; + chunk /= win_divisor; if (limit >= chunk) goto send_now; } else { -- cgit v1.1 From f19872575ff7819a3723154657a497d9bca66b33 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 7 Dec 2010 12:20:47 +0000 Subject: tcp: protect sysctl_tcp_cookie_size reads Make sure sysctl_tcp_cookie_size is read once in tcp_cookie_size_check(), or we might return an illegal value to caller if sysctl_tcp_cookie_size is changed by another cpu. Signed-off-by: Eric Dumazet Cc: Ben Hutchings Cc: William Allen Simpson Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 0d4a3ce..61c2463 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -385,27 +385,30 @@ struct tcp_out_options { */ static u8 tcp_cookie_size_check(u8 desired) { - if (desired > 0) { + int cookie_size; + + if (desired > 0) /* previously specified */ return desired; - } - if (sysctl_tcp_cookie_size <= 0) { + + cookie_size = ACCESS_ONCE(sysctl_tcp_cookie_size); + if (cookie_size <= 0) /* no default specified */ return 0; - } - if (sysctl_tcp_cookie_size <= TCP_COOKIE_MIN) { + + if (cookie_size <= TCP_COOKIE_MIN) /* value too small, specify minimum */ return TCP_COOKIE_MIN; - } - if (sysctl_tcp_cookie_size >= TCP_COOKIE_MAX) { + + if (cookie_size >= TCP_COOKIE_MAX) /* value too large, specify maximum */ return TCP_COOKIE_MAX; - } - if (0x1 & sysctl_tcp_cookie_size) { + + if (cookie_size & 1) /* 8-bit multiple, illegal, fix it */ - return (u8)(sysctl_tcp_cookie_size + 0x1); - } - return (u8)sysctl_tcp_cookie_size; + cookie_size++; + + return (u8)cookie_size; } /* Write previously computed TCP options to the packet. -- cgit v1.1 From 4e085e76cbe558b79b54cbab772f61185879bc64 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 8 Dec 2010 18:42:23 -0800 Subject: econet: Fix crash in aun_incoming(). Unconditional use of skb->dev won't work here, try to fetch the econet device via skb_dst()->dev instead. Suggested by Eric Dumazet. Reported-by: Nelson Elhage Tested-by: Nelson Elhage Signed-off-by: David S. Miller --- net/econet/af_econet.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c index f180371..15dcc1a 100644 --- a/net/econet/af_econet.c +++ b/net/econet/af_econet.c @@ -851,9 +851,13 @@ static void aun_incoming(struct sk_buff *skb, struct aunhdr *ah, size_t len) { struct iphdr *ip = ip_hdr(skb); unsigned char stn = ntohl(ip->saddr) & 0xff; + struct dst_entry *dst = skb_dst(skb); + struct ec_device *edev = NULL; struct sock *sk = NULL; struct sk_buff *newskb; - struct ec_device *edev = skb->dev->ec_ptr; + + if (dst) + edev = dst->dev->ec_ptr; if (! edev) goto bad; -- cgit v1.1 From 78347c8c6b2ddf20535bc1b18d749a3bbdea2a60 Mon Sep 17 00:00:00 2001 From: Thomas Egerer Date: Mon, 6 Dec 2010 23:28:56 +0000 Subject: xfrm: Fix xfrm_state_migrate leak xfrm_state_migrate calls kfree instead of xfrm_state_put to free a failed state. According to git commit 553f9118 this can cause memory leaks. Signed-off-by: Thomas Egerer Signed-off-by: Steffen Klassert Acked-by: Herbert Xu Signed-off-by: David S. Miller --- net/xfrm/xfrm_state.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index eb96ce5..220ebc0 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1268,7 +1268,7 @@ struct xfrm_state * xfrm_state_migrate(struct xfrm_state *x, return xc; error: - kfree(xc); + xfrm_state_put(xc); return NULL; } EXPORT_SYMBOL(xfrm_state_migrate); -- cgit v1.1 From c1249c0aae4c93a753c70496ab2e9a51430a6f02 Mon Sep 17 00:00:00 2001 From: Martin Lucina Date: Fri, 10 Dec 2010 00:04:05 +0000 Subject: net: Document the kernel_recvmsg() function Signed-off-by: Martin Lucina Signed-off-by: David S. Miller --- net/socket.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'net') diff --git a/net/socket.c b/net/socket.c index 3ca2fd9..088fb3f 100644 --- a/net/socket.c +++ b/net/socket.c @@ -732,6 +732,21 @@ static int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg, return ret; } +/** + * kernel_recvmsg - Receive a message from a socket (kernel space) + * @sock: The socket to receive the message from + * @msg: Received message + * @vec: Input s/g array for message data + * @num: Size of input s/g array + * @size: Number of bytes to read + * @flags: Message flags (MSG_DONTWAIT, etc...) + * + * On return the msg structure contains the scatter/gather array passed in the + * vec argument. The array is modified so that it consists of the unfilled + * portion of the original array. + * + * The returned value is the total number of bytes received, or an error. + */ int kernel_recvmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t num, size_t size, int flags) { -- cgit v1.1 From 5f75a1042feca37c0a436ba42a4b1f7f75c35778 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Tue, 7 Dec 2010 23:38:31 +0000 Subject: ipv6: fix nl group when advertising a new link New idev are advertised with NL group RTNLGRP_IPV6_IFADDR, but should use RTNLGRP_IPV6_IFINFO. Bug was introduced by commit 8d7a76c9. Signed-off-by: Wang Xuefu Signed-off-by: Nicolas Dichtel Acked-by: Thomas Graf Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 23cc8e1..93b7a93 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -4021,11 +4021,11 @@ void inet6_ifinfo_notify(int event, struct inet6_dev *idev) kfree_skb(skb); goto errout; } - rtnl_notify(skb, net, 0, RTNLGRP_IPV6_IFADDR, NULL, GFP_ATOMIC); + rtnl_notify(skb, net, 0, RTNLGRP_IPV6_IFINFO, NULL, GFP_ATOMIC); return; errout: if (err < 0) - rtnl_set_sk_err(net, RTNLGRP_IPV6_IFADDR, err); + rtnl_set_sk_err(net, RTNLGRP_IPV6_IFINFO, err); } static inline size_t inet6_prefix_nlmsg_size(void) -- cgit v1.1 From 40a010395cd66053f07bffeb3da5e44683bac30e Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 7 Dec 2010 17:11:09 +0000 Subject: SCTP: Fix SCTP_SET_PEER_PRIMARY_ADDR to accpet v4mapped address SCTP_SET_PEER_PRIMARY_ADDR does not accpet v4mapped address, using v4mapped address in SCTP_SET_PEER_PRIMARY_ADDR socket option will get -EADDRNOTAVAIL error if v4map is enabled. This patch try to fix it by mapping v4mapped address to v4 address if allowed. Signed-off-by: Wei Yongjun Acked-by: Vlad Yasevich Signed-off-by: David S. Miller --- net/sctp/socket.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'net') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 6bd5543..0b9ee34 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -2932,6 +2932,7 @@ static int sctp_setsockopt_peer_primary_addr(struct sock *sk, char __user *optva struct sctp_association *asoc = NULL; struct sctp_setpeerprim prim; struct sctp_chunk *chunk; + struct sctp_af *af; int err; sp = sctp_sk(sk); @@ -2959,6 +2960,13 @@ static int sctp_setsockopt_peer_primary_addr(struct sock *sk, char __user *optva if (!sctp_state(asoc, ESTABLISHED)) return -ENOTCONN; + af = sctp_get_af_specific(prim.sspp_addr.ss_family); + if (!af) + return -EINVAL; + + if (!af->addr_valid((union sctp_addr *)&prim.sspp_addr, sp, NULL)) + return -EADDRNOTAVAIL; + if (!sctp_assoc_lookup_laddr(asoc, (union sctp_addr *)&prim.sspp_addr)) return -EADDRNOTAVAIL; -- cgit v1.1 From d9ca676bcb26e1fdff9265a3e70f697cd381c889 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 8 Dec 2010 19:40:47 +0000 Subject: atm: correct sysfs 'device' link creation and parent relationships The ATM subsystem was incorrectly creating the 'device' link for ATM nodes in sysfs. This led to incorrect device/parent relationships exposed by sysfs and udev. Instead of rolling the 'device' link by hand in the generic ATM code, pass each ATM driver's bus device down to the sysfs code and let sysfs do this stuff correctly. Signed-off-by: Dan Williams Signed-off-by: David S. Miller --- net/atm/atm_sysfs.c | 3 ++- net/atm/resources.c | 7 ++++--- net/atm/resources.h | 2 +- 3 files changed, 7 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/atm/atm_sysfs.c b/net/atm/atm_sysfs.c index 799c631..f7fa67c 100644 --- a/net/atm/atm_sysfs.c +++ b/net/atm/atm_sysfs.c @@ -143,12 +143,13 @@ static struct class atm_class = { .dev_uevent = atm_uevent, }; -int atm_register_sysfs(struct atm_dev *adev) +int atm_register_sysfs(struct atm_dev *adev, struct device *parent) { struct device *cdev = &adev->class_dev; int i, j, err; cdev->class = &atm_class; + cdev->parent = parent; dev_set_drvdata(cdev, adev); dev_set_name(cdev, "%s%d", adev->type, adev->number); diff --git a/net/atm/resources.c b/net/atm/resources.c index d29e582..23f45ce 100644 --- a/net/atm/resources.c +++ b/net/atm/resources.c @@ -74,8 +74,9 @@ struct atm_dev *atm_dev_lookup(int number) } EXPORT_SYMBOL(atm_dev_lookup); -struct atm_dev *atm_dev_register(const char *type, const struct atmdev_ops *ops, - int number, unsigned long *flags) +struct atm_dev *atm_dev_register(const char *type, struct device *parent, + const struct atmdev_ops *ops, int number, + unsigned long *flags) { struct atm_dev *dev, *inuse; @@ -115,7 +116,7 @@ struct atm_dev *atm_dev_register(const char *type, const struct atmdev_ops *ops, goto out_fail; } - if (atm_register_sysfs(dev) < 0) { + if (atm_register_sysfs(dev, parent) < 0) { pr_err("atm_register_sysfs failed for dev %s\n", type); atm_proc_dev_deregister(dev); goto out_fail; diff --git a/net/atm/resources.h b/net/atm/resources.h index 126fb18..521431e 100644 --- a/net/atm/resources.h +++ b/net/atm/resources.h @@ -42,6 +42,6 @@ static inline void atm_proc_dev_deregister(struct atm_dev *dev) #endif /* CONFIG_PROC_FS */ -int atm_register_sysfs(struct atm_dev *adev); +int atm_register_sysfs(struct atm_dev *adev, struct device *parent); void atm_unregister_sysfs(struct atm_dev *adev); #endif -- cgit v1.1 From a19faf0250e09b16cac169354126404bc8aa342b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 5 Dec 2010 18:50:32 +0000 Subject: net: fix skb_defer_rx_timestamp() After commit c1f19b51d1d8 (net: support time stamping in phy devices.), kernel might crash if CONFIG_NETWORK_PHY_TIMESTAMPING=y and skb_defer_rx_timestamp() handles a packet without an ethernet header. Fixes kernel bugzilla #24102 Reference: https://bugzilla.kernel.org/show_bug.cgi?id=24102 Reported-and-tested-by: Andrew Watts Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/timestamping.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/timestamping.c b/net/core/timestamping.c index 0ae6c22..c19bb4e 100644 --- a/net/core/timestamping.c +++ b/net/core/timestamping.c @@ -96,11 +96,13 @@ bool skb_defer_rx_timestamp(struct sk_buff *skb) struct phy_device *phydev; unsigned int type; - skb_push(skb, ETH_HLEN); + if (skb_headroom(skb) < ETH_HLEN) + return false; + __skb_push(skb, ETH_HLEN); type = classify(skb); - skb_pull(skb, ETH_HLEN); + __skb_pull(skb, ETH_HLEN); switch (type) { case PTP_CLASS_V1_IPV4: -- cgit v1.1