From c80a5cdfc5ca6533cb893154f546370da1fdb8f0 Mon Sep 17 00:00:00 2001 From: Doug Leith Date: Mon, 25 May 2009 22:44:59 -0700 Subject: tcp: tcp_vegas ssthresh bugfix This patch fixes ssthresh accounting issues in tcp_vegas when cwnd decreases Signed-off-by: Doug Leith Signed-off-by: David S. Miller --- net/ipv4/tcp_vegas.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index a453aac..c6743ee 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -158,6 +158,11 @@ void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event) } EXPORT_SYMBOL_GPL(tcp_vegas_cwnd_event); +static inline u32 tcp_vegas_ssthresh(struct tcp_sock *tp) +{ + return min(tp->snd_ssthresh, tp->snd_cwnd-1); +} + static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) { struct tcp_sock *tp = tcp_sk(sk); @@ -221,11 +226,10 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) */ diff = tp->snd_cwnd * (rtt-vegas->baseRTT) / vegas->baseRTT; - if (diff > gamma && tp->snd_ssthresh > 2 ) { + if (diff > gamma && tp->snd_cwnd <= tp->snd_ssthresh) { /* Going too fast. Time to slow down * and switch to congestion avoidance. */ - tp->snd_ssthresh = 2; /* Set cwnd to match the actual rate * exactly: @@ -235,6 +239,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) * utilization. */ tp->snd_cwnd = min(tp->snd_cwnd, (u32)target_cwnd+1); + tp->snd_ssthresh = tcp_vegas_ssthresh(tp); } else if (tp->snd_cwnd <= tp->snd_ssthresh) { /* Slow start. */ @@ -250,6 +255,8 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) * we slow down. */ tp->snd_cwnd--; + tp->snd_ssthresh + = tcp_vegas_ssthresh(tp); } else if (diff < alpha) { /* We don't have enough extra packets * in the network, so speed up. -- cgit v1.1 From 915219441d566f1da0caa0e262be49b666159e17 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 28 May 2009 21:35:47 -0700 Subject: tcp: Use SKB queue and list helpers instead of doing it by-hand. Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 17 +++----- net/ipv4/tcp_input.c | 118 ++++++++++++++++++++++++++++++++++++--------------- 2 files changed, 90 insertions(+), 45 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 0fb8b44..17b89c5 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -439,12 +439,14 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) !tp->urg_data || before(tp->urg_seq, tp->copied_seq) || !before(tp->urg_seq, tp->rcv_nxt)) { + struct sk_buff *skb; + answ = tp->rcv_nxt - tp->copied_seq; /* Subtract 1, if FIN is in queue. */ - if (answ && !skb_queue_empty(&sk->sk_receive_queue)) - answ -= - tcp_hdr((struct sk_buff *)sk->sk_receive_queue.prev)->fin; + skb = skb_peek_tail(&sk->sk_receive_queue); + if (answ && skb) + answ -= tcp_hdr(skb)->fin; } else answ = tp->urg_seq - tp->copied_seq; release_sock(sk); @@ -1382,11 +1384,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, /* Next get a buffer. */ - skb = skb_peek(&sk->sk_receive_queue); - do { - if (!skb) - break; - + skb_queue_walk(&sk->sk_receive_queue, skb) { /* Now that we have two receive queues this * shouldn't happen. */ @@ -1403,8 +1401,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, if (tcp_hdr(skb)->fin) goto found_fin_ok; WARN_ON(!(flags & MSG_PEEK)); - skb = skb->next; - } while (skb != (struct sk_buff *)&sk->sk_receive_queue); + } /* Well, if we have backlog, try to process it now yet. */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index eeb8a92..ba34a23 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4426,7 +4426,7 @@ drop: } __skb_queue_head(&tp->out_of_order_queue, skb); } else { - struct sk_buff *skb1 = tp->out_of_order_queue.prev; + struct sk_buff *skb1 = skb_peek_tail(&tp->out_of_order_queue); u32 seq = TCP_SKB_CB(skb)->seq; u32 end_seq = TCP_SKB_CB(skb)->end_seq; @@ -4443,15 +4443,18 @@ drop: } /* Find place to insert this segment. */ - do { + while (1) { if (!after(TCP_SKB_CB(skb1)->seq, seq)) break; - } while ((skb1 = skb1->prev) != - (struct sk_buff *)&tp->out_of_order_queue); + if (skb_queue_is_first(&tp->out_of_order_queue, skb1)) { + skb1 = NULL; + break; + } + skb1 = skb_queue_prev(&tp->out_of_order_queue, skb1); + } /* Do skb overlap to previous one? */ - if (skb1 != (struct sk_buff *)&tp->out_of_order_queue && - before(seq, TCP_SKB_CB(skb1)->end_seq)) { + if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) { if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { /* All the bits are present. Drop. */ __kfree_skb(skb); @@ -4463,24 +4466,41 @@ drop: tcp_dsack_set(sk, seq, TCP_SKB_CB(skb1)->end_seq); } else { - skb1 = skb1->prev; + if (skb_queue_is_first(&tp->out_of_order_queue, + skb1)) + skb1 = NULL; + else + skb1 = skb_queue_prev( + &tp->out_of_order_queue, + skb1); } } - __skb_queue_after(&tp->out_of_order_queue, skb1, skb); + if (!skb1) + __skb_queue_head(&tp->out_of_order_queue, skb); + else + __skb_queue_after(&tp->out_of_order_queue, skb1, skb); /* And clean segments covered by new one as whole. */ - while ((skb1 = skb->next) != - (struct sk_buff *)&tp->out_of_order_queue && - after(end_seq, TCP_SKB_CB(skb1)->seq)) { - if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) { + if (skb1 && !skb_queue_is_last(&tp->out_of_order_queue, skb1)) { + struct sk_buff *n; + + skb1 = skb_queue_next(&tp->out_of_order_queue, skb1); + skb_queue_walk_from_safe(&tp->out_of_order_queue, + skb1, n) { + if (!after(end_seq, TCP_SKB_CB(skb1)->seq)) + break; + if (before(end_seq, + TCP_SKB_CB(skb1)->end_seq)) { + tcp_dsack_extend(sk, + TCP_SKB_CB(skb1)->seq, + end_seq); + break; + } + __skb_unlink(skb1, &tp->out_of_order_queue); tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, - end_seq); - break; + TCP_SKB_CB(skb1)->end_seq); + __kfree_skb(skb1); } - __skb_unlink(skb1, &tp->out_of_order_queue); - tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, - TCP_SKB_CB(skb1)->end_seq); - __kfree_skb(skb1); } add_sack: @@ -4492,7 +4512,10 @@ add_sack: static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb, struct sk_buff_head *list) { - struct sk_buff *next = skb->next; + struct sk_buff *next = NULL; + + if (!skb_queue_is_last(list, skb)) + next = skb_queue_next(list, skb); __skb_unlink(skb, list); __kfree_skb(skb); @@ -4503,6 +4526,9 @@ static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb, /* Collapse contiguous sequence of skbs head..tail with * sequence numbers start..end. + * + * If tail is NULL, this means until the end of the list. + * * Segments with FIN/SYN are not collapsed (only because this * simplifies code) */ @@ -4511,15 +4537,23 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct sk_buff *head, struct sk_buff *tail, u32 start, u32 end) { - struct sk_buff *skb; + struct sk_buff *skb, *n; + bool end_of_skbs; /* First, check that queue is collapsible and find * the point where collapsing can be useful. */ - for (skb = head; skb != tail;) { + skb = head; +restart: + end_of_skbs = true; + skb_queue_walk_from_safe(list, skb, n) { + if (skb == tail) + break; /* No new bits? It is possible on ofo queue. */ if (!before(start, TCP_SKB_CB(skb)->end_seq)) { skb = tcp_collapse_one(sk, skb, list); - continue; + if (!skb) + break; + goto restart; } /* The first skb to collapse is: @@ -4529,16 +4563,24 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, */ if (!tcp_hdr(skb)->syn && !tcp_hdr(skb)->fin && (tcp_win_from_space(skb->truesize) > skb->len || - before(TCP_SKB_CB(skb)->seq, start) || - (skb->next != tail && - TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb->next)->seq))) + before(TCP_SKB_CB(skb)->seq, start))) { + end_of_skbs = false; break; + } + + if (!skb_queue_is_last(list, skb)) { + struct sk_buff *next = skb_queue_next(list, skb); + if (next != tail && + TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(next)->seq) { + end_of_skbs = false; + break; + } + } /* Decided to skip this, advance start seq. */ start = TCP_SKB_CB(skb)->end_seq; - skb = skb->next; } - if (skb == tail || tcp_hdr(skb)->syn || tcp_hdr(skb)->fin) + if (end_of_skbs || tcp_hdr(skb)->syn || tcp_hdr(skb)->fin) return; while (before(start, end)) { @@ -4583,7 +4625,8 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, } if (!before(start, TCP_SKB_CB(skb)->end_seq)) { skb = tcp_collapse_one(sk, skb, list); - if (skb == tail || + if (!skb || + skb == tail || tcp_hdr(skb)->syn || tcp_hdr(skb)->fin) return; @@ -4610,17 +4653,21 @@ static void tcp_collapse_ofo_queue(struct sock *sk) head = skb; for (;;) { - skb = skb->next; + struct sk_buff *next = NULL; + + if (!skb_queue_is_last(&tp->out_of_order_queue, skb)) + next = skb_queue_next(&tp->out_of_order_queue, skb); + skb = next; /* Segment is terminated when we see gap or when * we are at the end of all the queue. */ - if (skb == (struct sk_buff *)&tp->out_of_order_queue || + if (!skb || after(TCP_SKB_CB(skb)->seq, end) || before(TCP_SKB_CB(skb)->end_seq, start)) { tcp_collapse(sk, &tp->out_of_order_queue, head, skb, start, end); head = skb; - if (skb == (struct sk_buff *)&tp->out_of_order_queue) + if (!skb) break; /* Start new segment */ start = TCP_SKB_CB(skb)->seq; @@ -4681,10 +4728,11 @@ static int tcp_prune_queue(struct sock *sk) tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); tcp_collapse_ofo_queue(sk); - tcp_collapse(sk, &sk->sk_receive_queue, - sk->sk_receive_queue.next, - (struct sk_buff *)&sk->sk_receive_queue, - tp->copied_seq, tp->rcv_nxt); + if (!skb_queue_empty(&sk->sk_receive_queue)) + tcp_collapse(sk, &sk->sk_receive_queue, + skb_peek(&sk->sk_receive_queue), + NULL, + tp->copied_seq, tp->rcv_nxt); sk_mem_reclaim(sk); if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) -- cgit v1.1 From 28e72216d7e7af7050f171a87c1eecba93d01ea6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 28 May 2009 10:44:30 +0000 Subject: net: unset IFF_XMIT_DST_RELEASE in ipip_tunnel_setup() ipip_tunnel_xmit() might need skb->dst, so tell dev_hard_start_xmit() to no release it. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ipip.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4') diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 9054139..bb2f1b1 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -713,6 +713,7 @@ static void ipip_tunnel_setup(struct net_device *dev) dev->iflink = 0; dev->addr_len = 4; dev->features |= NETIF_F_NETNS_LOCAL; + dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; } static void ipip_tunnel_init(struct net_device *dev) -- cgit v1.1 From 108bfa895cacd1a7c1767e85be105b213e5dce50 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 28 May 2009 22:35:10 +0000 Subject: net: unset IFF_XMIT_DST_RELEASE in ipgre_tunnel_setup() ipgre_tunnel_xmit() might need skb->dst, so tell dev_hard_start_xmit() to no release it. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index e62510d..77436e2 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1238,6 +1238,7 @@ static void ipgre_tunnel_setup(struct net_device *dev) dev->iflink = 0; dev->addr_len = 4; dev->features |= NETIF_F_NETNS_LOCAL; + dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; } static int ipgre_tunnel_init(struct net_device *dev) -- cgit v1.1 From 2df9001edc382c331f338f45d259feeaa740c418 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Fri, 29 May 2009 15:02:29 -0700 Subject: tcp: fix loop in ofo handling code and reduce its complexity MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Somewhat luckily, I was looking into these parts with very fine comb because I've made somewhat similar changes on the same area (conflicts that arose weren't that lucky though). The loop was very much overengineered recently in commit 915219441d566 (tcp: Use SKB queue and list helpers instead of doing it by-hand), while it basically just wants to know if there are skbs after 'skb'. Also it got broken because skb1 = skb->next got translated into skb1 = skb1->next (though abstracted) improperly. Note that 'skb1' is pointing to previous sk_buff than skb or NULL if at head. Two things went wrong: - We'll kfree 'skb' on the first iteration instead of the skbuff following 'skb' (it would require required SACK reneging to recover I think). - The list head case where 'skb1' is NULL is checked too early and the loop won't execute whereas it previously did. Conclusion, mostly revert the recent changes which makes the cset very messy looking but using proper accessor in the previous-like version. The effective changes against the original can be viewed with: git-diff 915219441d566f1da0caa0e262be49b666159e17^ \ net/ipv4/tcp_input.c | sed -n -e '57,70 p' Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index ba34a23..2bdb0da 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4481,26 +4481,20 @@ drop: __skb_queue_after(&tp->out_of_order_queue, skb1, skb); /* And clean segments covered by new one as whole. */ - if (skb1 && !skb_queue_is_last(&tp->out_of_order_queue, skb1)) { - struct sk_buff *n; + while (!skb_queue_is_last(&tp->out_of_order_queue, skb)) { + skb1 = skb_queue_next(&tp->out_of_order_queue, skb); - skb1 = skb_queue_next(&tp->out_of_order_queue, skb1); - skb_queue_walk_from_safe(&tp->out_of_order_queue, - skb1, n) { - if (!after(end_seq, TCP_SKB_CB(skb1)->seq)) - break; - if (before(end_seq, - TCP_SKB_CB(skb1)->end_seq)) { - tcp_dsack_extend(sk, - TCP_SKB_CB(skb1)->seq, - end_seq); - break; - } - __skb_unlink(skb1, &tp->out_of_order_queue); + if (!after(end_seq, TCP_SKB_CB(skb1)->seq)) + break; + if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) { tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, - TCP_SKB_CB(skb1)->end_seq); - __kfree_skb(skb1); + end_seq); + break; } + __skb_unlink(skb1, &tp->out_of_order_queue); + tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, + TCP_SKB_CB(skb1)->end_seq); + __kfree_skb(skb1); } add_sack: -- cgit v1.1 From 4d52cfbef6266092d535237ba5a4b981458ab171 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 2 Jun 2009 00:42:16 -0700 Subject: net: ipv4/ip_sockglue.c cleanups Pure cleanups Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ip_sockglue.c | 73 +++++++++++++++++++++++++------------------------- 1 file changed, 36 insertions(+), 37 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 43c0585..21b0187 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -157,38 +157,39 @@ void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb) /* Ordered by supposed usage frequency */ if (flags & 1) ip_cmsg_recv_pktinfo(msg, skb); - if ((flags>>=1) == 0) + if ((flags >>= 1) == 0) return; if (flags & 1) ip_cmsg_recv_ttl(msg, skb); - if ((flags>>=1) == 0) + if ((flags >>= 1) == 0) return; if (flags & 1) ip_cmsg_recv_tos(msg, skb); - if ((flags>>=1) == 0) + if ((flags >>= 1) == 0) return; if (flags & 1) ip_cmsg_recv_opts(msg, skb); - if ((flags>>=1) == 0) + if ((flags >>= 1) == 0) return; if (flags & 1) ip_cmsg_recv_retopts(msg, skb); - if ((flags>>=1) == 0) + if ((flags >>= 1) == 0) return; if (flags & 1) ip_cmsg_recv_security(msg, skb); - if ((flags>>=1) == 0) + if ((flags >>= 1) == 0) return; if (flags & 1) ip_cmsg_recv_dstaddr(msg, skb); } +EXPORT_SYMBOL(ip_cmsg_recv); int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc) { @@ -203,7 +204,8 @@ int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc) switch (cmsg->cmsg_type) { case IP_RETOPTS: err = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr)); - err = ip_options_get(net, &ipc->opt, CMSG_DATA(cmsg), err < 40 ? err : 40); + err = ip_options_get(net, &ipc->opt, CMSG_DATA(cmsg), + err < 40 ? err : 40); if (err) return err; break; @@ -238,7 +240,8 @@ int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc) struct ip_ra_chain *ip_ra_chain; DEFINE_RWLOCK(ip_ra_lock); -int ip_ra_control(struct sock *sk, unsigned char on, void (*destructor)(struct sock *)) +int ip_ra_control(struct sock *sk, unsigned char on, + void (*destructor)(struct sock *)) { struct ip_ra_chain *ra, *new_ra, **rap; @@ -248,7 +251,7 @@ int ip_ra_control(struct sock *sk, unsigned char on, void (*destructor)(struct s new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL; write_lock_bh(&ip_ra_lock); - for (rap = &ip_ra_chain; (ra=*rap) != NULL; rap = &ra->next) { + for (rap = &ip_ra_chain; (ra = *rap) != NULL; rap = &ra->next) { if (ra->sk == sk) { if (on) { write_unlock_bh(&ip_ra_lock); @@ -416,7 +419,8 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len) /* Reset and regenerate socket error */ spin_lock_bh(&sk->sk_error_queue.lock); sk->sk_err = 0; - if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) { + skb2 = skb_peek(&sk->sk_error_queue); + if (skb2 != NULL) { sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno; spin_unlock_bh(&sk->sk_error_queue.lock); sk->sk_error_report(sk); @@ -431,8 +435,8 @@ out: /* - * Socket option code for IP. This is the end of the line after any TCP,UDP etc options on - * an IP socket. + * Socket option code for IP. This is the end of the line after any + * TCP,UDP etc options on an IP socket. */ static int do_ip_setsockopt(struct sock *sk, int level, @@ -474,7 +478,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, switch (optname) { case IP_OPTIONS: { - struct ip_options * opt = NULL; + struct ip_options *opt = NULL; if (optlen > 40 || optlen < 0) goto e_inval; err = ip_options_get_from_user(sock_net(sk), &opt, @@ -556,9 +560,9 @@ static int do_ip_setsockopt(struct sock *sk, int level, } break; case IP_TTL: - if (optlen<1) + if (optlen < 1) goto e_inval; - if (val != -1 && (val < 1 || val>255)) + if (val != -1 && (val < 0 || val > 255)) goto e_inval; inet->uc_ttl = val; break; @@ -570,7 +574,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, inet->hdrincl = val ? 1 : 0; break; case IP_MTU_DISCOVER: - if (val<0 || val>3) + if (val < 0 || val > 3) goto e_inval; inet->pmtudisc = val; break; @@ -582,7 +586,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, case IP_MULTICAST_TTL: if (sk->sk_type == SOCK_STREAM) goto e_inval; - if (optlen<1) + if (optlen < 1) goto e_inval; if (val == -1) val = 1; @@ -591,7 +595,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, inet->mc_ttl = val; break; case IP_MULTICAST_LOOP: - if (optlen<1) + if (optlen < 1) goto e_inval; inet->mc_loop = !!val; break; @@ -613,7 +617,8 @@ static int do_ip_setsockopt(struct sock *sk, int level, } else { memset(&mreq, 0, sizeof(mreq)); if (optlen >= sizeof(struct in_addr) && - copy_from_user(&mreq.imr_address, optval, sizeof(struct in_addr))) + copy_from_user(&mreq.imr_address, optval, + sizeof(struct in_addr))) break; } @@ -677,7 +682,6 @@ static int do_ip_setsockopt(struct sock *sk, int level, } case IP_MSFILTER: { - extern int sysctl_igmp_max_msf; struct ip_msfilter *msf; if (optlen < IP_MSFILTER_SIZE(0)) @@ -831,7 +835,6 @@ static int do_ip_setsockopt(struct sock *sk, int level, } case MCAST_MSFILTER: { - extern int sysctl_igmp_max_msf; struct sockaddr_in *psin; struct ip_msfilter *msf = NULL; struct group_filter *gsf = NULL; @@ -849,9 +852,9 @@ static int do_ip_setsockopt(struct sock *sk, int level, break; } err = -EFAULT; - if (copy_from_user(gsf, optval, optlen)) { + if (copy_from_user(gsf, optval, optlen)) goto mc_msf_out; - } + /* numsrc >= (4G-140)/128 overflow in 32 bits */ if (gsf->gf_numsrc >= 0x1ffffff || gsf->gf_numsrc > sysctl_igmp_max_msf) { @@ -879,7 +882,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, msf->imsf_fmode = gsf->gf_fmode; msf->imsf_numsrc = gsf->gf_numsrc; err = -EADDRNOTAVAIL; - for (i=0; igf_numsrc; ++i) { + for (i = 0; i < gsf->gf_numsrc; ++i) { psin = (struct sockaddr_in *)&gsf->gf_slist[i]; if (psin->sin_family != AF_INET) @@ -890,7 +893,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, gsf = NULL; err = ip_mc_msfilter(sk, msf, ifindex); - mc_msf_out: +mc_msf_out: kfree(msf); kfree(gsf); break; @@ -900,7 +903,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, break; case IP_FREEBIND: - if (optlen<1) + if (optlen < 1) goto e_inval; inet->freebind = !!val; break; @@ -957,6 +960,7 @@ int ip_setsockopt(struct sock *sk, int level, #endif return err; } +EXPORT_SYMBOL(ip_setsockopt); #ifdef CONFIG_COMPAT int compat_ip_setsockopt(struct sock *sk, int level, int optname, @@ -986,13 +990,12 @@ int compat_ip_setsockopt(struct sock *sk, int level, int optname, #endif return err; } - EXPORT_SYMBOL(compat_ip_setsockopt); #endif /* - * Get the options. Note for future reference. The GET of IP options gets the - * _received_ ones. The set sets the _sent_ ones. + * Get the options. Note for future reference. The GET of IP options gets + * the _received_ ones. The set sets the _sent_ ones. */ static int do_ip_getsockopt(struct sock *sk, int level, int optname, @@ -1143,7 +1146,8 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, return -EFAULT; } err = ip_mc_gsfget(sk, &gsf, - (struct group_filter __user *)optval, optlen); + (struct group_filter __user *)optval, + optlen); release_sock(sk); return err; } @@ -1187,7 +1191,7 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, } release_sock(sk); - if (len < sizeof(int) && len > 0 && val>=0 && val<=255) { + if (len < sizeof(int) && len > 0 && val >= 0 && val <= 255) { unsigned char ucval = (unsigned char)val; len = 1; if (put_user(len, optlen)) @@ -1230,6 +1234,7 @@ int ip_getsockopt(struct sock *sk, int level, #endif return err; } +EXPORT_SYMBOL(ip_getsockopt); #ifdef CONFIG_COMPAT int compat_ip_getsockopt(struct sock *sk, int level, int optname, @@ -1262,11 +1267,5 @@ int compat_ip_getsockopt(struct sock *sk, int level, int optname, #endif return err; } - EXPORT_SYMBOL(compat_ip_getsockopt); #endif - -EXPORT_SYMBOL(ip_cmsg_recv); - -EXPORT_SYMBOL(ip_getsockopt); -EXPORT_SYMBOL(ip_setsockopt); -- cgit v1.1 From f771bef98004d9d141b085d987a77d06669d4f4f Mon Sep 17 00:00:00 2001 From: Nivedita Singhvi Date: Thu, 28 May 2009 07:00:46 +0000 Subject: ipv4: New multicast-all socket option After some discussion offline with Christoph Lameter and David Stevens regarding multicast behaviour in Linux, I'm submitting a slightly modified patch from the one Christoph submitted earlier. This patch provides a new socket option IP_MULTICAST_ALL. In this case, default behaviour is _unchanged_ from the current Linux standard. The socket option is set by default to provide original behaviour. Sockets wishing to receive data only from multicast groups they join explicitly will need to clear this socket option. Signed-off-by: Nivedita Singhvi Signed-off-by: Christoph Lameter Acked-by: David Stevens Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 1 + net/ipv4/igmp.c | 2 +- net/ipv4/ip_sockglue.c | 11 +++++++++++ 3 files changed, 13 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 5abee4c..d873621 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -375,6 +375,7 @@ lookup_protocol: inet->uc_ttl = -1; inet->mc_loop = 1; inet->mc_ttl = 1; + inet->mc_all = 1; inet->mc_index = 0; inet->mc_list = NULL; diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 9eb6219..e6058a5 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -2196,7 +2196,7 @@ int ip_mc_sf_allow(struct sock *sk, __be32 loc_addr, __be32 rmt_addr, int dif) break; } if (!pmc) - return 1; + return inet->mc_all; psl = pmc->sflist; if (!psl) return pmc->sfmode == MCAST_EXCLUDE; diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 21b0187..cb49936 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -453,6 +453,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, (1<= sizeof(int)) { @@ -898,6 +899,13 @@ mc_msf_out: kfree(gsf); break; } + case IP_MULTICAST_ALL: + if (optlen < 1) + goto e_inval; + if (val != 0 && val != 1) + goto e_inval; + inet->mc_all = val; + break; case IP_ROUTER_ALERT: err = ip_ra_control(sk, val ? 1 : 0, NULL); break; @@ -1151,6 +1159,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, release_sock(sk); return err; } + case IP_MULTICAST_ALL: + val = inet->mc_all; + break; case IP_PKTOPTIONS: { struct msghdr msg; -- cgit v1.1 From 511c3f92ad5b6d9f8f6464be1b4f85f0422be91a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 2 Jun 2009 05:14:27 +0000 Subject: net: skb->rtable accessor Define skb_rtable(const struct sk_buff *skb) accessor to get rtable from skb Delete skb->rtable field Setting rtable is not allowed, just set dst instead as rtable is an alias. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/arp.c | 4 ++-- net/ipv4/icmp.c | 10 +++++----- net/ipv4/igmp.c | 2 +- net/ipv4/ip_forward.c | 2 +- net/ipv4/ip_gre.c | 4 ++-- net/ipv4/ip_input.c | 2 +- net/ipv4/ip_options.c | 16 ++++++++-------- net/ipv4/ip_output.c | 10 +++++----- net/ipv4/ip_sockglue.c | 2 +- net/ipv4/ipip.c | 2 +- net/ipv4/ipmr.c | 6 +++--- net/ipv4/netfilter/ipt_MASQUERADE.c | 2 +- net/ipv4/netfilter/nf_nat_helper.c | 4 ++-- net/ipv4/route.c | 37 ++++++++++++++++++++++--------------- net/ipv4/tcp_ipv4.c | 4 ++-- 15 files changed, 57 insertions(+), 50 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index f11931c..816494f 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -474,7 +474,7 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb) return 1; } - paddr = skb->rtable->rt_gateway; + paddr = skb_rtable(skb)->rt_gateway; if (arp_set_predefined(inet_addr_type(dev_net(dev), paddr), haddr, paddr, dev)) return 0; @@ -817,7 +817,7 @@ static int arp_process(struct sk_buff *skb) if (arp->ar_op == htons(ARPOP_REQUEST) && ip_route_input(skb, tip, sip, 0, dev) == 0) { - rt = skb->rtable; + rt = skb_rtable(skb); addr_type = rt->rt_type; if (addr_type == RTN_LOCAL) { diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 3f50807..94f75ef 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -356,7 +356,7 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param, static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) { struct ipcm_cookie ipc; - struct rtable *rt = skb->rtable; + struct rtable *rt = skb_rtable(skb); struct net *net = dev_net(rt->u.dst.dev); struct sock *sk; struct inet_sock *inet; @@ -416,7 +416,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) struct iphdr *iph; int room; struct icmp_bxm icmp_param; - struct rtable *rt = skb_in->rtable; + struct rtable *rt = skb_rtable(skb_in); struct ipcm_cookie ipc; __be32 saddr; u8 tos; @@ -596,7 +596,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) RT_TOS(tos), rt2->u.dst.dev); dst_release(&rt2->u.dst); - rt2 = skb_in->rtable; + rt2 = skb_rtable(skb_in); skb_in->dst = odst; } @@ -926,7 +926,7 @@ static void icmp_address(struct sk_buff *skb) static void icmp_address_reply(struct sk_buff *skb) { - struct rtable *rt = skb->rtable; + struct rtable *rt = skb_rtable(skb); struct net_device *dev = skb->dev; struct in_device *in_dev; struct in_ifaddr *ifa; @@ -970,7 +970,7 @@ static void icmp_discard(struct sk_buff *skb) int icmp_rcv(struct sk_buff *skb) { struct icmphdr *icmph; - struct rtable *rt = skb->rtable; + struct rtable *rt = skb_rtable(skb); struct net *net = dev_net(rt->u.dst.dev); if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index e6058a5..afabd27 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -948,7 +948,7 @@ int igmp_rcv(struct sk_buff *skb) case IGMPV2_HOST_MEMBERSHIP_REPORT: case IGMPV3_HOST_MEMBERSHIP_REPORT: /* Is it our report looped back? */ - if (skb->rtable->fl.iif == 0) + if (skb_rtable(skb)->fl.iif == 0) break; /* don't rely on MC router hearing unicast reports */ if (skb->pkt_type == PACKET_MULTICAST || diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index df3fe50..0761cd9 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -81,7 +81,7 @@ int ip_forward(struct sk_buff *skb) if (!xfrm4_route_forward(skb)) goto drop; - rt = skb->rtable; + rt = skb_rtable(skb); if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway) goto sr_failed; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 77436e2..85ddad4 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -602,7 +602,7 @@ static int ipgre_rcv(struct sk_buff *skb) #ifdef CONFIG_NET_IPGRE_BROADCAST if (ipv4_is_multicast(iph->daddr)) { /* Looped back packet, drop it! */ - if (skb->rtable->fl.iif == 0) + if (skb_rtable(skb)->fl.iif == 0) goto drop; stats->multicast++; skb->pkt_type = PACKET_BROADCAST; @@ -704,7 +704,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) } if (skb->protocol == htons(ETH_P_IP)) { - rt = skb->rtable; + rt = skb_rtable(skb); if ((dst = rt->rt_gateway) == 0) goto tx_error_icmp; } diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 40f6206..cea784b 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -357,7 +357,7 @@ static int ip_rcv_finish(struct sk_buff *skb) if (iph->ihl > 5 && ip_rcv_options(skb)) goto drop; - rt = skb->rtable; + rt = skb_rtable(skb); if (rt->rt_type == RTN_MULTICAST) { IP_UPD_PO_STATS_BH(dev_net(rt->u.dst.dev), IPSTATS_MIB_INMCAST, skb->len); diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 2c88da6..7e1074f 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -102,7 +102,7 @@ int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb) sptr = skb_network_header(skb); dptr = dopt->__data; - daddr = skb->rtable->rt_spec_dst; + daddr = skb_rtable(skb)->rt_spec_dst; if (sopt->rr) { optlen = sptr[sopt->rr+1]; @@ -257,7 +257,7 @@ int ip_options_compile(struct net *net, struct rtable *rt = NULL; if (skb != NULL) { - rt = skb->rtable; + rt = skb_rtable(skb); optptr = (unsigned char *)&(ip_hdr(skb)[1]); } else optptr = opt->__data; @@ -550,7 +550,7 @@ void ip_forward_options(struct sk_buff *skb) { struct ip_options * opt = &(IPCB(skb)->opt); unsigned char * optptr; - struct rtable *rt = skb->rtable; + struct rtable *rt = skb_rtable(skb); unsigned char *raw = skb_network_header(skb); if (opt->rr_needaddr) { @@ -598,7 +598,7 @@ int ip_options_rcv_srr(struct sk_buff *skb) __be32 nexthop; struct iphdr *iph = ip_hdr(skb); unsigned char *optptr = skb_network_header(skb) + opt->srr; - struct rtable *rt = skb->rtable; + struct rtable *rt = skb_rtable(skb); struct rtable *rt2; int err; @@ -623,13 +623,13 @@ int ip_options_rcv_srr(struct sk_buff *skb) } memcpy(&nexthop, &optptr[srrptr-1], 4); - rt = skb->rtable; - skb->rtable = NULL; + rt = skb_rtable(skb); + skb->dst = NULL; err = ip_route_input(skb, nexthop, iph->saddr, iph->tos, skb->dev); - rt2 = skb->rtable; + rt2 = skb_rtable(skb); if (err || (rt2->rt_type != RTN_UNICAST && rt2->rt_type != RTN_LOCAL)) { ip_rt_put(rt2); - skb->rtable = rt; + skb->dst = &rt->u.dst; return -EINVAL; } ip_rt_put(rt); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index ea19c37..8d845eb 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -140,7 +140,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, __be32 saddr, __be32 daddr, struct ip_options *opt) { struct inet_sock *inet = inet_sk(sk); - struct rtable *rt = skb->rtable; + struct rtable *rt = skb_rtable(skb); struct iphdr *iph; /* Build the IP header. */ @@ -238,7 +238,7 @@ static int ip_finish_output(struct sk_buff *skb) int ip_mc_output(struct sk_buff *skb) { struct sock *sk = skb->sk; - struct rtable *rt = skb->rtable; + struct rtable *rt = skb_rtable(skb); struct net_device *dev = rt->u.dst.dev; /* @@ -319,7 +319,7 @@ int ip_queue_xmit(struct sk_buff *skb, int ipfragok) /* Skip all of this if the packet is already routed, * f.e. by something like SCTP. */ - rt = skb->rtable; + rt = skb_rtable(skb); if (rt != NULL) goto packet_routed; @@ -440,7 +440,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) unsigned int mtu, hlen, left, len, ll_rs, pad; int offset; __be16 not_last_frag; - struct rtable *rt = skb->rtable; + struct rtable *rt = skb_rtable(skb); int err = 0; dev = rt->u.dst.dev; @@ -1362,7 +1362,7 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar } replyopts; struct ipcm_cookie ipc; __be32 daddr; - struct rtable *rt = skb->rtable; + struct rtable *rt = skb_rtable(skb); if (ip_options_echo(&replyopts.opt, skb)) return; diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index cb49936..fc7993e 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -57,7 +57,7 @@ static void ip_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb) { struct in_pktinfo info; - struct rtable *rt = skb->rtable; + struct rtable *rt = skb_rtable(skb); info.ipi_addr.s_addr = ip_hdr(skb)->daddr; if (rt) { diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index bb2f1b1..0c6e7bf 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -416,7 +416,7 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) if (!dst) { /* NBMA tunnel */ - if ((rt = skb->rtable) == NULL) { + if ((rt = skb_rtable(skb)) == NULL) { stats->tx_fifo_errors++; goto tx_error; } diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 13e9dd3..69dd058 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1354,7 +1354,7 @@ static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local if (net->ipv4.vif_table[vif].dev != skb->dev) { int true_vifi; - if (skb->rtable->fl.iif == 0) { + if (skb_rtable(skb)->fl.iif == 0) { /* It is our own packet, looped back. Very complicated situation... @@ -1430,7 +1430,7 @@ int ip_mr_input(struct sk_buff *skb) { struct mfc_cache *cache; struct net *net = dev_net(skb->dev); - int local = skb->rtable->rt_flags&RTCF_LOCAL; + int local = skb_rtable(skb)->rt_flags & RTCF_LOCAL; /* Packet is looped back after forward, it should not be forwarded second time, but still can be delivered locally. @@ -1646,7 +1646,7 @@ int ipmr_get_route(struct net *net, { int err; struct mfc_cache *cache; - struct rtable *rt = skb->rtable; + struct rtable *rt = skb_rtable(skb); read_lock(&mrt_lock); cache = ipmr_cache_find(net, rt->rt_src, rt->rt_dst); diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index f389f60..c0992c7 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c @@ -72,7 +72,7 @@ masquerade_tg(struct sk_buff *skb, const struct xt_target_param *par) return NF_ACCEPT; mr = par->targinfo; - rt = skb->rtable; + rt = skb_rtable(skb); newsrc = inet_select_addr(par->out, rt->rt_gateway, RT_SCOPE_UNIVERSE); if (!newsrc) { printk("MASQUERADE: %s ate my IP address\n", par->out->name); diff --git a/net/ipv4/netfilter/nf_nat_helper.c b/net/ipv4/netfilter/nf_nat_helper.c index cf7a42b..155c008 100644 --- a/net/ipv4/netfilter/nf_nat_helper.c +++ b/net/ipv4/netfilter/nf_nat_helper.c @@ -140,7 +140,7 @@ nf_nat_mangle_tcp_packet(struct sk_buff *skb, const char *rep_buffer, unsigned int rep_len) { - struct rtable *rt = skb->rtable; + struct rtable *rt = skb_rtable(skb); struct iphdr *iph; struct tcphdr *tcph; int oldlen, datalen; @@ -218,7 +218,7 @@ nf_nat_mangle_udp_packet(struct sk_buff *skb, const char *rep_buffer, unsigned int rep_len) { - struct rtable *rt = skb->rtable; + struct rtable *rt = skb_rtable(skb); struct iphdr *iph; struct udphdr *udph; int datalen, oldlen; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 28205e5..f20060a 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1064,7 +1064,8 @@ work_done: out: return 0; } -static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp) +static int rt_intern_hash(unsigned hash, struct rtable *rt, + struct rtable **rp, struct sk_buff *skb) { struct rtable *rth, **rthp; unsigned long now; @@ -1114,7 +1115,10 @@ restart: spin_unlock_bh(rt_hash_lock_addr(hash)); rt_drop(rt); - *rp = rth; + if (rp) + *rp = rth; + else + skb->dst = &rth->u.dst; return 0; } @@ -1210,7 +1214,10 @@ restart: rcu_assign_pointer(rt_hash_table[hash].chain, rt); spin_unlock_bh(rt_hash_lock_addr(hash)); - *rp = rt; + if (rp) + *rp = rt; + else + skb->dst = &rt->u.dst; return 0; } @@ -1407,7 +1414,7 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, &netevent); rt_del(hash, rth); - if (!rt_intern_hash(hash, rt, &rt)) + if (!rt_intern_hash(hash, rt, &rt, NULL)) ip_rt_put(rt); goto do_next; } @@ -1473,7 +1480,7 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) void ip_rt_send_redirect(struct sk_buff *skb) { - struct rtable *rt = skb->rtable; + struct rtable *rt = skb_rtable(skb); struct in_device *in_dev = in_dev_get(rt->u.dst.dev); if (!in_dev) @@ -1521,7 +1528,7 @@ out: static int ip_error(struct sk_buff *skb) { - struct rtable *rt = skb->rtable; + struct rtable *rt = skb_rtable(skb); unsigned long now; int code; @@ -1698,7 +1705,7 @@ static void ipv4_link_failure(struct sk_buff *skb) icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); - rt = skb->rtable; + rt = skb_rtable(skb); if (rt) dst_set_expires(&rt->u.dst, 0); } @@ -1858,7 +1865,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, in_dev_put(in_dev); hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev))); - return rt_intern_hash(hash, rth, &skb->rtable); + return rt_intern_hash(hash, rth, NULL, skb); e_nobufs: in_dev_put(in_dev); @@ -2019,7 +2026,7 @@ static int ip_mkroute_input(struct sk_buff *skb, /* put it into the cache */ hash = rt_hash(daddr, saddr, fl->iif, rt_genid(dev_net(rth->u.dst.dev))); - return rt_intern_hash(hash, rth, &skb->rtable); + return rt_intern_hash(hash, rth, NULL, skb); } /* @@ -2175,7 +2182,7 @@ local_input: } rth->rt_type = res.type; hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net)); - err = rt_intern_hash(hash, rth, &skb->rtable); + err = rt_intern_hash(hash, rth, NULL, skb); goto done; no_route: @@ -2244,7 +2251,7 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, dst_use(&rth->u.dst, jiffies); RT_CACHE_STAT_INC(in_hit); rcu_read_unlock(); - skb->rtable = rth; + skb->dst = &rth->u.dst; return 0; } RT_CACHE_STAT_INC(in_hlist_search); @@ -2420,7 +2427,7 @@ static int ip_mkroute_output(struct rtable **rp, if (err == 0) { hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif, rt_genid(dev_net(dev_out))); - err = rt_intern_hash(hash, rth, rp); + err = rt_intern_hash(hash, rth, rp, NULL); } return err; @@ -2763,7 +2770,7 @@ static int rt_fill_info(struct net *net, struct sk_buff *skb, u32 pid, u32 seq, int event, int nowait, unsigned int flags) { - struct rtable *rt = skb->rtable; + struct rtable *rt = skb_rtable(skb); struct rtmsg *r; struct nlmsghdr *nlh; long expires; @@ -2907,7 +2914,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev); local_bh_enable(); - rt = skb->rtable; + rt = skb_rtable(skb); if (err == 0 && rt->u.dst.error) err = -rt->u.dst.error; } else { @@ -2927,7 +2934,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void if (err) goto errout_free; - skb->rtable = rt; + skb->dst = &rt->u.dst; if (rtm->rtm_flags & RTM_F_NOTIFY) rt->rt_flags |= RTCF_NOTIFY; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index fc79e34..319c885 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -546,7 +546,7 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) if (th->rst) return; - if (skb->rtable->rt_type != RTN_LOCAL) + if (skb_rtable(skb)->rt_type != RTN_LOCAL) return; /* Swap the send and the receive. */ @@ -1185,7 +1185,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) #endif /* Never answer to SYNs send to broadcast or multicast */ - if (skb->rtable->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) + if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) goto drop; /* TW buckets are converted to open requests without -- cgit v1.1 From adf30907d63893e4208dfe3f5c88ae12bc2f25d5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 2 Jun 2009 05:19:30 +0000 Subject: net: skb->dst accessors Define three accessors to get/set dst attached to a skb struct dst_entry *skb_dst(const struct sk_buff *skb) void skb_dst_set(struct sk_buff *skb, struct dst_entry *dst) void skb_dst_drop(struct sk_buff *skb) This one should replace occurrences of : dst_release(skb->dst) skb->dst = NULL; Delete skb->dst field Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/arp.c | 2 +- net/ipv4/icmp.c | 10 +++++----- net/ipv4/igmp.c | 4 ++-- net/ipv4/ip_forward.c | 4 ++-- net/ipv4/ip_fragment.c | 2 +- net/ipv4/ip_gre.c | 23 +++++++++++------------ net/ipv4/ip_input.c | 6 +++--- net/ipv4/ip_options.c | 6 +++--- net/ipv4/ip_output.c | 20 ++++++++++---------- net/ipv4/ipip.c | 13 ++++++------- net/ipv4/ipmr.c | 13 ++++++------- net/ipv4/netfilter.c | 28 ++++++++++++++++------------ net/ipv4/netfilter/ipt_REJECT.c | 7 +++---- net/ipv4/netfilter/nf_nat_standalone.c | 7 +++---- net/ipv4/raw.c | 2 +- net/ipv4/route.c | 14 +++++++------- net/ipv4/tcp_ipv4.c | 4 ++-- net/ipv4/tcp_output.c | 2 +- net/ipv4/udp.c | 4 ++-- net/ipv4/xfrm4_input.c | 2 +- net/ipv4/xfrm4_mode_tunnel.c | 4 ++-- net/ipv4/xfrm4_output.c | 6 +++--- 22 files changed, 91 insertions(+), 92 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 816494f..8a3881e 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -468,7 +468,7 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb) __be32 paddr; struct neighbour *n; - if (!skb->dst) { + if (!skb_dst(skb)) { printk(KERN_DEBUG "arp_find is called with dst==NULL\n"); kfree_skb(skb); return 1; diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 94f75ef..97c410e 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -591,13 +591,13 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) goto relookup_failed; /* Ugh! */ - odst = skb_in->dst; + odst = skb_dst(skb_in); err = ip_route_input(skb_in, fl.fl4_dst, fl.fl4_src, RT_TOS(tos), rt2->u.dst.dev); dst_release(&rt2->u.dst); rt2 = skb_rtable(skb_in); - skb_in->dst = odst; + skb_dst_set(skb_in, odst); } if (err) @@ -659,7 +659,7 @@ static void icmp_unreach(struct sk_buff *skb) u32 info = 0; struct net *net; - net = dev_net(skb->dst->dev); + net = dev_net(skb_dst(skb)->dev); /* * Incomplete header ? @@ -822,7 +822,7 @@ static void icmp_echo(struct sk_buff *skb) { struct net *net; - net = dev_net(skb->dst->dev); + net = dev_net(skb_dst(skb)->dev); if (!net->ipv4.sysctl_icmp_echo_ignore_all) { struct icmp_bxm icmp_param; @@ -873,7 +873,7 @@ static void icmp_timestamp(struct sk_buff *skb) out: return; out_err: - ICMP_INC_STATS_BH(dev_net(skb->dst->dev), ICMP_MIB_INERRORS); + ICMP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), ICMP_MIB_INERRORS); goto out; } diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index afabd27..01b4284 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -311,7 +311,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) return NULL; } - skb->dst = &rt->u.dst; + skb_dst_set(skb, &rt->u.dst); skb->dev = dev; skb_reserve(skb, LL_RESERVED_SPACE(dev)); @@ -659,7 +659,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, return -1; } - skb->dst = &rt->u.dst; + skb_dst_set(skb, &rt->u.dst); skb_reserve(skb, LL_RESERVED_SPACE(dev)); diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 0761cd9..a2991bc 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -42,7 +42,7 @@ static int ip_forward_finish(struct sk_buff *skb) { struct ip_options * opt = &(IPCB(skb)->opt); - IP_INC_STATS_BH(dev_net(skb->dst->dev), IPSTATS_MIB_OUTFORWDATAGRAMS); + IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS); if (unlikely(opt->optlen)) ip_forward_options(skb); @@ -123,7 +123,7 @@ sr_failed: too_many_hops: /* Tell the sender its packet died... */ - IP_INC_STATS_BH(dev_net(skb->dst->dev), IPSTATS_MIB_INHDRERRORS); + IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_INHDRERRORS); icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0); drop: kfree_skb(skb); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 7985346..1f1b824 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -573,7 +573,7 @@ int ip_defrag(struct sk_buff *skb, u32 user) struct ipq *qp; struct net *net; - net = skb->dev ? dev_net(skb->dev) : dev_net(skb->dst->dev); + net = skb->dev ? dev_net(skb->dev) : dev_net(skb_dst(skb)->dev); IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS); /* Start by cleaning up the memory. */ diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 85ddad4..44e2a3d 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -643,8 +643,7 @@ static int ipgre_rcv(struct sk_buff *skb) stats->rx_packets++; stats->rx_bytes += len; skb->dev = tunnel->dev; - dst_release(skb->dst); - skb->dst = NULL; + skb_dst_drop(skb); nf_reset(skb); skb_reset_network_header(skb); @@ -698,7 +697,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) if ((dst = tiph->daddr) == 0) { /* NBMA tunnel */ - if (skb->dst == NULL) { + if (skb_dst(skb) == NULL) { stats->tx_fifo_errors++; goto tx_error; } @@ -712,7 +711,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) else if (skb->protocol == htons(ETH_P_IPV6)) { struct in6_addr *addr6; int addr_type; - struct neighbour *neigh = skb->dst->neighbour; + struct neighbour *neigh = skb_dst(skb)->neighbour; if (neigh == NULL) goto tx_error; @@ -766,10 +765,10 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) if (df) mtu = dst_mtu(&rt->u.dst) - dev->hard_header_len - tunnel->hlen; else - mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu; + mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu; - if (skb->dst) - skb->dst->ops->update_pmtu(skb->dst, mtu); + if (skb_dst(skb)) + skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); if (skb->protocol == htons(ETH_P_IP)) { df |= (old_iph->frag_off&htons(IP_DF)); @@ -783,14 +782,14 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) } #ifdef CONFIG_IPV6 else if (skb->protocol == htons(ETH_P_IPV6)) { - struct rt6_info *rt6 = (struct rt6_info *)skb->dst; + struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb); - if (rt6 && mtu < dst_mtu(skb->dst) && mtu >= IPV6_MIN_MTU) { + if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) { if ((tunnel->parms.iph.daddr && !ipv4_is_multicast(tunnel->parms.iph.daddr)) || rt6->rt6i_dst.plen == 128) { rt6->rt6i_flags |= RTF_MODIFIED; - skb->dst->metrics[RTAX_MTU-1] = mtu; + skb_dst(skb)->metrics[RTAX_MTU-1] = mtu; } } @@ -837,8 +836,8 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | IPSKB_REROUTED); - dst_release(skb->dst); - skb->dst = &rt->u.dst; + skb_dst_drop(skb); + skb_dst_set(skb, &rt->u.dst); /* * Push down and install the IPIP header. diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index cea784b..490ce20 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -329,7 +329,7 @@ static int ip_rcv_finish(struct sk_buff *skb) * Initialise the virtual path cache for the packet. It describes * how the packet travels inside Linux networking. */ - if (skb->dst == NULL) { + if (skb_dst(skb) == NULL) { int err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, skb->dev); if (unlikely(err)) { @@ -344,9 +344,9 @@ static int ip_rcv_finish(struct sk_buff *skb) } #ifdef CONFIG_NET_CLS_ROUTE - if (unlikely(skb->dst->tclassid)) { + if (unlikely(skb_dst(skb)->tclassid)) { struct ip_rt_acct *st = per_cpu_ptr(ip_rt_acct, smp_processor_id()); - u32 idx = skb->dst->tclassid; + u32 idx = skb_dst(skb)->tclassid; st[idx&0xFF].o_packets++; st[idx&0xFF].o_bytes += skb->len; st[(idx>>16)&0xFF].i_packets++; diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 7e1074f..94bf105 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -143,7 +143,7 @@ int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb) __be32 addr; memcpy(&addr, sptr+soffset-1, 4); - if (inet_addr_type(dev_net(skb->dst->dev), addr) != RTN_LOCAL) { + if (inet_addr_type(dev_net(skb_dst(skb)->dev), addr) != RTN_LOCAL) { dopt->ts_needtime = 1; soffset += 8; } @@ -624,12 +624,12 @@ int ip_options_rcv_srr(struct sk_buff *skb) memcpy(&nexthop, &optptr[srrptr-1], 4); rt = skb_rtable(skb); - skb->dst = NULL; + skb_dst_set(skb, NULL); err = ip_route_input(skb, nexthop, iph->saddr, iph->tos, skb->dev); rt2 = skb_rtable(skb); if (err || (rt2->rt_type != RTN_UNICAST && rt2->rt_type != RTN_LOCAL)) { ip_rt_put(rt2); - skb->dst = &rt->u.dst; + skb_dst_set(skb, &rt->u.dst); return -EINVAL; } ip_rt_put(rt); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 8d845eb..3d6167f 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -95,7 +95,7 @@ int __ip_local_out(struct sk_buff *skb) iph->tot_len = htons(skb->len); ip_send_check(iph); - return nf_hook(PF_INET, NF_INET_LOCAL_OUT, skb, NULL, skb->dst->dev, + return nf_hook(PF_INET, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev, dst_output); } @@ -118,7 +118,7 @@ static int ip_dev_loopback_xmit(struct sk_buff *newskb) __skb_pull(newskb, skb_network_offset(newskb)); newskb->pkt_type = PACKET_LOOPBACK; newskb->ip_summed = CHECKSUM_UNNECESSARY; - WARN_ON(!newskb->dst); + WARN_ON(!skb_dst(newskb)); netif_rx(newskb); return 0; } @@ -176,7 +176,7 @@ EXPORT_SYMBOL_GPL(ip_build_and_send_pkt); static inline int ip_finish_output2(struct sk_buff *skb) { - struct dst_entry *dst = skb->dst; + struct dst_entry *dst = skb_dst(skb); struct rtable *rt = (struct rtable *)dst; struct net_device *dev = dst->dev; unsigned int hh_len = LL_RESERVED_SPACE(dev); @@ -217,14 +217,14 @@ static inline int ip_skb_dst_mtu(struct sk_buff *skb) struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL; return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ? - skb->dst->dev->mtu : dst_mtu(skb->dst); + skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb)); } static int ip_finish_output(struct sk_buff *skb) { #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) /* Policy lookup after SNAT yielded a new policy */ - if (skb->dst->xfrm != NULL) { + if (skb_dst(skb)->xfrm != NULL) { IPCB(skb)->flags |= IPSKB_REROUTED; return dst_output(skb); } @@ -296,7 +296,7 @@ int ip_mc_output(struct sk_buff *skb) int ip_output(struct sk_buff *skb) { - struct net_device *dev = skb->dst->dev; + struct net_device *dev = skb_dst(skb)->dev; IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len); @@ -355,7 +355,7 @@ int ip_queue_xmit(struct sk_buff *skb, int ipfragok) } sk_setup_caps(sk, &rt->u.dst); } - skb->dst = dst_clone(&rt->u.dst); + skb_dst_set(skb, dst_clone(&rt->u.dst)); packet_routed: if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) @@ -401,8 +401,8 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) to->pkt_type = from->pkt_type; to->priority = from->priority; to->protocol = from->protocol; - dst_release(to->dst); - to->dst = dst_clone(from->dst); + skb_dst_drop(to); + skb_dst_set(to, dst_clone(skb_dst(from))); to->dev = from->dev; to->mark = from->mark; @@ -1294,7 +1294,7 @@ int ip_push_pending_frames(struct sock *sk) * on dst refcount */ inet->cork.dst = NULL; - skb->dst = &rt->u.dst; + skb_dst_set(skb, &rt->u.dst); if (iph->protocol == IPPROTO_ICMP) icmp_out_count(net, ((struct icmphdr *) diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 0c6e7bf..93e2b78 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -370,8 +370,7 @@ static int ipip_rcv(struct sk_buff *skb) tunnel->dev->stats.rx_packets++; tunnel->dev->stats.rx_bytes += skb->len; skb->dev = tunnel->dev; - dst_release(skb->dst); - skb->dst = NULL; + skb_dst_drop(skb); nf_reset(skb); ipip_ecn_decapsulate(iph, skb); netif_rx(skb); @@ -447,15 +446,15 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) if (tiph->frag_off) mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr); else - mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu; + mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu; if (mtu < 68) { stats->collisions++; ip_rt_put(rt); goto tx_error; } - if (skb->dst) - skb->dst->ops->update_pmtu(skb->dst, mtu); + if (skb_dst(skb)) + skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); df |= (old_iph->frag_off&htons(IP_DF)); @@ -502,8 +501,8 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | IPSKB_REROUTED); - dst_release(skb->dst); - skb->dst = &rt->u.dst; + skb_dst_drop(skb); + skb_dst_set(skb, &rt->u.dst); /* * Push down and install the IPIP header. diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 69dd058..ffd9861 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -651,7 +651,7 @@ static int ipmr_cache_report(struct net *net, ip_hdr(skb)->protocol = 0; /* Flag to the kernel this is a route add */ msg = (struct igmpmsg *)skb_network_header(skb); msg->im_vif = vifi; - skb->dst = dst_clone(pkt->dst); + skb_dst_set(skb, dst_clone(skb_dst(pkt))); /* * Add our header @@ -1201,7 +1201,7 @@ static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr) iph->protocol = IPPROTO_IPIP; iph->ihl = 5; iph->tot_len = htons(skb->len); - ip_select_ident(iph, skb->dst, NULL); + ip_select_ident(iph, skb_dst(skb), NULL); ip_send_check(iph); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); @@ -1212,7 +1212,7 @@ static inline int ipmr_forward_finish(struct sk_buff *skb) { struct ip_options * opt = &(IPCB(skb)->opt); - IP_INC_STATS_BH(dev_net(skb->dst->dev), IPSTATS_MIB_OUTFORWDATAGRAMS); + IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS); if (unlikely(opt->optlen)) ip_forward_options(skb); @@ -1290,8 +1290,8 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi) vif->pkt_out++; vif->bytes_out += skb->len; - dst_release(skb->dst); - skb->dst = &rt->u.dst; + skb_dst_drop(skb); + skb_dst_set(skb, &rt->u.dst); ip_decrease_ttl(ip_hdr(skb)); /* FIXME: forward and output firewalls used to be called here. @@ -1543,8 +1543,7 @@ static int __pim_rcv(struct sk_buff *skb, unsigned int pimlen) skb->protocol = htons(ETH_P_IP); skb->ip_summed = 0; skb->pkt_type = PACKET_HOST; - dst_release(skb->dst); - skb->dst = NULL; + skb_dst_drop(skb); reg_dev->stats.rx_bytes += skb->len; reg_dev->stats.rx_packets++; nf_reset(skb); diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index fdf6811..1725dc0 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -12,7 +12,7 @@ /* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) { - struct net *net = dev_net(skb->dst->dev); + struct net *net = dev_net(skb_dst(skb)->dev); const struct iphdr *iph = ip_hdr(skb); struct rtable *rt; struct flowi fl = {}; @@ -41,8 +41,8 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) return -1; /* Drop old route. */ - dst_release(skb->dst); - skb->dst = &rt->u.dst; + skb_dst_drop(skb); + skb_dst_set(skb, &rt->u.dst); } else { /* non-local src, find valid iif to satisfy * rp-filter when calling ip_route_input. */ @@ -50,7 +50,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) if (ip_route_output_key(net, &rt, &fl) != 0) return -1; - odst = skb->dst; + odst = skb_dst(skb); if (ip_route_input(skb, iph->daddr, iph->saddr, RT_TOS(iph->tos), rt->u.dst.dev) != 0) { dst_release(&rt->u.dst); @@ -60,18 +60,22 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) dst_release(odst); } - if (skb->dst->error) + if (skb_dst(skb)->error) return -1; #ifdef CONFIG_XFRM if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && - xfrm_decode_session(skb, &fl, AF_INET) == 0) - if (xfrm_lookup(net, &skb->dst, &fl, skb->sk, 0)) + xfrm_decode_session(skb, &fl, AF_INET) == 0) { + struct dst_entry *dst = skb_dst(skb); + skb_dst_set(skb, NULL); + if (xfrm_lookup(net, &dst, &fl, skb->sk, 0)) return -1; + skb_dst_set(skb, dst); + } #endif /* Change in oif may mean change in hh_len. */ - hh_len = skb->dst->dev->hard_header_len; + hh_len = skb_dst(skb)->dev->hard_header_len; if (skb_headroom(skb) < hh_len && pskb_expand_head(skb, hh_len - skb_headroom(skb), 0, GFP_ATOMIC)) return -1; @@ -92,7 +96,7 @@ int ip_xfrm_me_harder(struct sk_buff *skb) if (xfrm_decode_session(skb, &fl, AF_INET) < 0) return -1; - dst = skb->dst; + dst = skb_dst(skb); if (dst->xfrm) dst = ((struct xfrm_dst *)dst)->route; dst_hold(dst); @@ -100,11 +104,11 @@ int ip_xfrm_me_harder(struct sk_buff *skb) if (xfrm_lookup(dev_net(dst->dev), &dst, &fl, skb->sk, 0) < 0) return -1; - dst_release(skb->dst); - skb->dst = dst; + skb_dst_drop(skb); + skb_dst_set(skb, dst); /* Change in oif may mean change in hh_len. */ - hh_len = skb->dst->dev->hard_header_len; + hh_len = skb_dst(skb)->dev->hard_header_len; if (skb_headroom(skb) < hh_len && pskb_expand_head(skb, hh_len - skb_headroom(skb), 0, GFP_ATOMIC)) return -1; diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index 0b4b6e0..c93ae44 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c @@ -108,17 +108,16 @@ static void send_reset(struct sk_buff *oldskb, int hook) addr_type = RTN_LOCAL; /* ip_route_me_harder expects skb->dst to be set */ - dst_hold(oldskb->dst); - nskb->dst = oldskb->dst; + skb_dst_set(nskb, dst_clone(skb_dst(oldskb))); if (ip_route_me_harder(nskb, addr_type)) goto free_nskb; - niph->ttl = dst_metric(nskb->dst, RTAX_HOPLIMIT); + niph->ttl = dst_metric(skb_dst(nskb), RTAX_HOPLIMIT); nskb->ip_summed = CHECKSUM_NONE; /* "Never happens" */ - if (nskb->len > dst_mtu(nskb->dst)) + if (nskb->len > dst_mtu(skb_dst(nskb))) goto free_nskb; nf_ct_attach(nskb, oldskb); diff --git a/net/ipv4/netfilter/nf_nat_standalone.c b/net/ipv4/netfilter/nf_nat_standalone.c index b7dd695..5567bd0 100644 --- a/net/ipv4/netfilter/nf_nat_standalone.c +++ b/net/ipv4/netfilter/nf_nat_standalone.c @@ -167,10 +167,9 @@ nf_nat_in(unsigned int hooknum, ret = nf_nat_fn(hooknum, skb, in, out, okfn); if (ret != NF_DROP && ret != NF_STOLEN && - daddr != ip_hdr(skb)->daddr) { - dst_release(skb->dst); - skb->dst = NULL; - } + daddr != ip_hdr(skb)->daddr) + skb_dst_drop(skb); + return ret; } diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index f774651..3dc9171 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -343,7 +343,7 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length, skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; - skb->dst = dst_clone(&rt->u.dst); + skb_dst_set(skb, dst_clone(&rt->u.dst)); skb_reset_network_header(skb); iph = ip_hdr(skb); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index f20060a..a849bb1 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1118,7 +1118,7 @@ restart: if (rp) *rp = rth; else - skb->dst = &rth->u.dst; + skb_dst_set(skb, &rth->u.dst); return 0; } @@ -1217,7 +1217,7 @@ restart: if (rp) *rp = rt; else - skb->dst = &rt->u.dst; + skb_dst_set(skb, &rt->u.dst); return 0; } @@ -2251,7 +2251,7 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, dst_use(&rth->u.dst, jiffies); RT_CACHE_STAT_INC(in_hit); rcu_read_unlock(); - skb->dst = &rth->u.dst; + skb_dst_set(skb, &rth->u.dst); return 0; } RT_CACHE_STAT_INC(in_hlist_search); @@ -2934,7 +2934,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void if (err) goto errout_free; - skb->dst = &rt->u.dst; + skb_dst_set(skb, &rt->u.dst); if (rtm->rtm_flags & RTM_F_NOTIFY) rt->rt_flags |= RTCF_NOTIFY; @@ -2975,15 +2975,15 @@ int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb) continue; if (rt_is_expired(rt)) continue; - skb->dst = dst_clone(&rt->u.dst); + skb_dst_set(skb, dst_clone(&rt->u.dst)); if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, RTM_NEWROUTE, 1, NLM_F_MULTI) <= 0) { - dst_release(xchg(&skb->dst, NULL)); + skb_dst_drop(skb); rcu_read_unlock_bh(); goto done; } - dst_release(xchg(&skb->dst, NULL)); + skb_dst_drop(skb); } rcu_read_unlock_bh(); } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 319c885..5a1ca26 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -590,7 +590,7 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) arg.csumoffset = offsetof(struct tcphdr, check) / 2; arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0; - net = dev_net(skb->dst->dev); + net = dev_net(skb_dst(skb)->dev); ip_send_reply(net->ipv4.tcp_sock, skb, &arg, arg.iov[0].iov_len); @@ -617,7 +617,7 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, ]; } rep; struct ip_reply_arg arg; - struct net *net = dev_net(skb->dst->dev); + struct net *net = dev_net(skb_dst(skb)->dev); memset(&rep.th, 0, sizeof(struct tcphdr)); memset(&arg, 0, sizeof(arg)); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 79c39dc..416fc4c 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2202,7 +2202,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, /* Reserve space for headers. */ skb_reserve(skb, MAX_TCP_HEADER); - skb->dst = dst_clone(dst); + skb_dst_set(skb, dst_clone(dst)); mss = dst_metric(dst, RTAX_ADVMSS); if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 7a1d1ce..8f4158d 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -328,7 +328,7 @@ static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb, if (unlikely(sk = skb_steal_sock(skb))) return sk; else - return __udp4_lib_lookup(dev_net(skb->dst->dev), iph->saddr, sport, + return __udp4_lib_lookup(dev_net(skb_dst(skb)->dev), iph->saddr, sport, iph->daddr, dport, inet_iif(skb), udptable); } @@ -1237,7 +1237,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, struct sock *sk; struct udphdr *uh; unsigned short ulen; - struct rtable *rt = (struct rtable*)skb->dst; + struct rtable *rt = skb_rtable(skb); __be32 saddr, daddr; struct net *net = dev_net(skb->dev); diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index 4ec2162..f9f922a 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -23,7 +23,7 @@ int xfrm4_extract_input(struct xfrm_state *x, struct sk_buff *skb) static inline int xfrm4_rcv_encap_finish(struct sk_buff *skb) { - if (skb->dst == NULL) { + if (skb_dst(skb) == NULL) { const struct iphdr *iph = ip_hdr(skb); if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c index 7135279..3444f3b 100644 --- a/net/ipv4/xfrm4_mode_tunnel.c +++ b/net/ipv4/xfrm4_mode_tunnel.c @@ -28,7 +28,7 @@ static inline void ipip_ecn_decapsulate(struct sk_buff *skb) */ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) { - struct dst_entry *dst = skb->dst; + struct dst_entry *dst = skb_dst(skb); struct iphdr *top_iph; int flags; @@ -41,7 +41,7 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) top_iph->ihl = 5; top_iph->version = 4; - top_iph->protocol = xfrm_af2proto(skb->dst->ops->family); + top_iph->protocol = xfrm_af2proto(skb_dst(skb)->ops->family); /* DS disclosed */ top_iph->tos = INET_ECN_encapsulate(XFRM_MODE_SKB_CB(skb)->tos, diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index 8c3180a..c908bd9 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -29,7 +29,7 @@ static int xfrm4_tunnel_check_size(struct sk_buff *skb) if (!(ip_hdr(skb)->frag_off & htons(IP_DF)) || skb->local_df) goto out; - dst = skb->dst; + dst = skb_dst(skb); mtu = dst_mtu(dst); if (skb->len > mtu) { icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); @@ -72,7 +72,7 @@ EXPORT_SYMBOL(xfrm4_prepare_output); static int xfrm4_output_finish(struct sk_buff *skb) { #ifdef CONFIG_NETFILTER - if (!skb->dst->xfrm) { + if (!skb_dst(skb)->xfrm) { IPCB(skb)->flags |= IPSKB_REROUTED; return dst_output(skb); } @@ -87,6 +87,6 @@ static int xfrm4_output_finish(struct sk_buff *skb) int xfrm4_output(struct sk_buff *skb) { return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb, - NULL, skb->dst->dev, xfrm4_output_finish, + NULL, skb_dst(skb)->dev, xfrm4_output_finish, !(IPCB(skb)->flags & IPSKB_REROUTED)); } -- cgit v1.1 From 2307f866f542f3397d24f78d0efd74f4ab214a96 Mon Sep 17 00:00:00 2001 From: Rami Rosen Date: Wed, 3 Jun 2009 21:43:26 -0700 Subject: ipv4: remove ip_mc_drop_socket() declaration from af_inet.c. ip_mc_drop_socket() method is declared in linux/igmp.h, which is included anyhow in af_inet.c. So there is no need for this declaration. This patch removes it from af_inet.c. Signed-off-by: Rami Rosen Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index d873621..566ea6c 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -116,7 +116,6 @@ #include #endif -extern void ip_mc_drop_socket(struct sock *sk); /* The inetsw table contains everything that inet_create needs to * build a new socket. -- cgit v1.1 From d7fcf1a5cae2c970e9afe7192fe0c13d931247e0 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 9 Jun 2009 00:19:37 -0700 Subject: ipv4: Use frag list abstraction interfaces. Signed-off-by: David S. Miller --- net/ipv4/ip_fragment.c | 4 ++-- net/ipv4/ip_output.c | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 1f1b824..575f9bd 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -507,7 +507,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, /* If the first fragment is fragmented itself, we split * it to two chunks: the first with data and paged part * and the second, holding only fragments. */ - if (skb_shinfo(head)->frag_list) { + if (skb_has_frags(head)) { struct sk_buff *clone; int i, plen = 0; @@ -516,7 +516,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, clone->next = head->next; head->next = clone; skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; - skb_shinfo(head)->frag_list = NULL; + skb_frag_list_init(head); for (i=0; inr_frags; i++) plen += skb_shinfo(head)->frags[i].size; clone->len = clone->data_len = head->data_len - plen; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 3d6167f..9248d28 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -474,7 +474,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) * LATER: this step can be merged to real generation of fragments, * we can switch to copy when see the first bad fragment. */ - if (skb_shinfo(skb)->frag_list) { + if (skb_has_frags(skb)) { struct sk_buff *frag; int first_len = skb_pagelen(skb); int truesizes = 0; @@ -485,7 +485,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) skb_cloned(skb)) goto slow_path; - for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) { + skb_walk_frags(skb, frag) { /* Correct geometry. */ if (frag->len > mtu || ((frag->len & 7) && frag->next) || @@ -510,7 +510,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) err = 0; offset = 0; frag = skb_shinfo(skb)->frag_list; - skb_shinfo(skb)->frag_list = NULL; + skb_frag_list_init(skb); skb->data_len = first_len - skb_headlen(skb); skb->truesize -= truesizes; skb->len = first_len; -- cgit v1.1 From 343a99724e4431d8618bea2eb7552e12c965425a Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 9 Jun 2009 00:23:58 -0700 Subject: netfilter: Use frag list abstraction interfaces. Signed-off-by: David S. Miller --- net/ipv4/netfilter/nf_nat_proto_sctp.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/nf_nat_proto_sctp.c b/net/ipv4/netfilter/nf_nat_proto_sctp.c index 65e470b..6983e41 100644 --- a/net/ipv4/netfilter/nf_nat_proto_sctp.c +++ b/net/ipv4/netfilter/nf_nat_proto_sctp.c @@ -33,6 +33,7 @@ sctp_manip_pkt(struct sk_buff *skb, enum nf_nat_manip_type maniptype) { const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff); + struct sk_buff *frag; sctp_sctphdr_t *hdr; unsigned int hdroff = iphdroff + iph->ihl*4; __be32 oldip, newip; @@ -57,8 +58,8 @@ sctp_manip_pkt(struct sk_buff *skb, } crc32 = sctp_start_cksum((u8 *)hdr, skb_headlen(skb) - hdroff); - for (skb = skb_shinfo(skb)->frag_list; skb; skb = skb->next) - crc32 = sctp_update_cksum((u8 *)skb->data, skb_headlen(skb), + skb_walk_frags(skb, frag); + crc32 = sctp_update_cksum((u8 *)frag->data, skb_headlen(frag), crc32); crc32 = sctp_end_cksum(crc32); hdr->checksum = crc32; -- cgit v1.1 From 0808dc80939b08ec215f472e17a5d8f6b148037e Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 9 Jun 2009 18:05:28 -0700 Subject: netfilter: Fix extra semi-colon in skb_walk_frags() changes. Noticed by Jesper Dangaard Brouer Signed-off-by: David S. Miller --- net/ipv4/netfilter/nf_nat_proto_sctp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/nf_nat_proto_sctp.c b/net/ipv4/netfilter/nf_nat_proto_sctp.c index 6983e41..3fc598e 100644 --- a/net/ipv4/netfilter/nf_nat_proto_sctp.c +++ b/net/ipv4/netfilter/nf_nat_proto_sctp.c @@ -58,7 +58,7 @@ sctp_manip_pkt(struct sk_buff *skb, } crc32 = sctp_start_cksum((u8 *)hdr, skb_headlen(skb) - hdroff); - skb_walk_frags(skb, frag); + skb_walk_frags(skb, frag) crc32 = sctp_update_cksum((u8 *)frag->data, skb_headlen(frag), crc32); crc32 = sctp_end_cksum(crc32); -- cgit v1.1 From 2b85a34e911bf483c27cfdd124aeb1605145dc80 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 11 Jun 2009 02:55:43 -0700 Subject: net: No more expensive sock_hold()/sock_put() on each tx One of the problem with sock memory accounting is it uses a pair of sock_hold()/sock_put() for each transmitted packet. This slows down bidirectional flows because the receive path also needs to take a refcount on socket and might use a different cpu than transmit path or transmit completion path. So these two atomic operations also trigger cache line bounces. We can see this in tx or tx/rx workloads (media gateways for example), where sock_wfree() can be in top five functions in profiles. We use this sock_hold()/sock_put() so that sock freeing is delayed until all tx packets are completed. As we also update sk_wmem_alloc, we could offset sk_wmem_alloc by one unit at init time, until sk_free() is called. Once sk_free() is called, we atomic_dec_and_test(sk_wmem_alloc) to decrement initial offset and atomicaly check if any packets are in flight. skb_set_owner_w() doesnt call sock_hold() anymore sock_wfree() doesnt call sock_put() anymore, but check if sk_wmem_alloc reached 0 to perform the final freeing. Drawback is that a skb->truesize error could lead to unfreeable sockets, or even worse, prematurely calling __sk_free() on a live socket. Nice speedups on SMP. tbench for example, going from 2691 MB/s to 2711 MB/s on my 8 cpu dev machine, even if tbench was not really hitting sk_refcnt contention point. 5 % speedup on a UDP transmit workload (depends on number of flows), lowering TX completion cpu usage. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 9248d28..2470262 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -498,7 +498,6 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) BUG_ON(frag->sk); if (skb->sk) { - sock_hold(skb->sk); frag->sk = skb->sk; frag->destructor = sock_wfree; truesizes += frag->truesize; -- cgit v1.1