diff options
Diffstat (limited to 'net/ipv4')
32 files changed, 1006 insertions, 697 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 7cd7760..e848e6c 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -215,9 +215,15 @@ config NET_IPIP be inserted in and removed from the running kernel whenever you want). Most people won't need this and can say N. +config NET_IPGRE_DEMUX + tristate "IP: GRE demultiplexer" + help + This is helper module to demultiplex GRE packets on GRE version field criteria. + Required by ip_gre and pptp modules. + config NET_IPGRE tristate "IP: GRE tunnels over IP" - depends on IPV6 || IPV6=n + depends on (IPV6 || IPV6=n) && NET_IPGRE_DEMUX help Tunneling means encapsulating data of one protocol type within another protocol and sending it over a channel that understands the diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 80ff87c..4978d22 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -20,6 +20,7 @@ obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o obj-$(CONFIG_IP_MROUTE) += ipmr.o obj-$(CONFIG_NET_IPIP) += ipip.o +obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o obj-$(CONFIG_NET_IPGRE) += ip_gre.o obj-$(CONFIG_SYN_COOKIES) += syncookies.o obj-$(CONFIG_INET_AH) += ah4.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 6a1100c..f581f77 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -227,18 +227,16 @@ EXPORT_SYMBOL(inet_ehash_secret); /* * inet_ehash_secret must be set exactly once - * Instead of using a dedicated spinlock, we (ab)use inetsw_lock */ void build_ehash_secret(void) { u32 rnd; + do { get_random_bytes(&rnd, sizeof(rnd)); } while (rnd == 0); - spin_lock_bh(&inetsw_lock); - if (!inet_ehash_secret) - inet_ehash_secret = rnd; - spin_unlock_bh(&inetsw_lock); + + cmpxchg(&inet_ehash_secret, 0, rnd); } EXPORT_SYMBOL(build_ehash_secret); diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 96c1955..d9031ad 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -55,7 +55,7 @@ * Stuart Cheshire : Metricom and grat arp fixes * *** FOR 2.1 clean this up *** * Lawrence V. Stefani: (08/12/96) Added FDDI support. - * Alan Cox : Took the AP1000 nasty FDDI hack and + * Alan Cox : Took the AP1000 nasty FDDI hack and * folded into the mainstream FDDI code. * Ack spit, Linus how did you allow that * one in... @@ -120,7 +120,7 @@ EXPORT_SYMBOL(clip_tbl_hook); #endif #include <asm/system.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/netfilter_arp.h> @@ -161,7 +161,7 @@ static const struct neigh_ops arp_direct_ops = { .queue_xmit = dev_queue_xmit, }; -const struct neigh_ops arp_broken_ops = { +static const struct neigh_ops arp_broken_ops = { .family = AF_INET, .solicit = arp_solicit, .error_report = arp_error_report, @@ -170,35 +170,34 @@ const struct neigh_ops arp_broken_ops = { .hh_output = dev_queue_xmit, .queue_xmit = dev_queue_xmit, }; -EXPORT_SYMBOL(arp_broken_ops); struct neigh_table arp_tbl = { - .family = AF_INET, - .entry_size = sizeof(struct neighbour) + 4, - .key_len = 4, - .hash = arp_hash, - .constructor = arp_constructor, - .proxy_redo = parp_redo, - .id = "arp_cache", - .parms = { - .tbl = &arp_tbl, - .base_reachable_time = 30 * HZ, - .retrans_time = 1 * HZ, - .gc_staletime = 60 * HZ, - .reachable_time = 30 * HZ, - .delay_probe_time = 5 * HZ, - .queue_len = 3, - .ucast_probes = 3, - .mcast_probes = 3, - .anycast_delay = 1 * HZ, - .proxy_delay = (8 * HZ) / 10, - .proxy_qlen = 64, - .locktime = 1 * HZ, + .family = AF_INET, + .entry_size = sizeof(struct neighbour) + 4, + .key_len = 4, + .hash = arp_hash, + .constructor = arp_constructor, + .proxy_redo = parp_redo, + .id = "arp_cache", + .parms = { + .tbl = &arp_tbl, + .base_reachable_time = 30 * HZ, + .retrans_time = 1 * HZ, + .gc_staletime = 60 * HZ, + .reachable_time = 30 * HZ, + .delay_probe_time = 5 * HZ, + .queue_len = 3, + .ucast_probes = 3, + .mcast_probes = 3, + .anycast_delay = 1 * HZ, + .proxy_delay = (8 * HZ) / 10, + .proxy_qlen = 64, + .locktime = 1 * HZ, }, - .gc_interval = 30 * HZ, - .gc_thresh1 = 128, - .gc_thresh2 = 512, - .gc_thresh3 = 1024, + .gc_interval = 30 * HZ, + .gc_thresh1 = 128, + .gc_thresh2 = 512, + .gc_thresh3 = 1024, }; EXPORT_SYMBOL(arp_tbl); @@ -233,7 +232,7 @@ static u32 arp_hash(const void *pkey, const struct net_device *dev) static int arp_constructor(struct neighbour *neigh) { - __be32 addr = *(__be32*)neigh->primary_key; + __be32 addr = *(__be32 *)neigh->primary_key; struct net_device *dev = neigh->dev; struct in_device *in_dev; struct neigh_parms *parms; @@ -296,16 +295,19 @@ static int arp_constructor(struct neighbour *neigh) neigh->ops = &arp_broken_ops; neigh->output = neigh->ops->output; return 0; +#else + break; #endif - ;} + } #endif if (neigh->type == RTN_MULTICAST) { neigh->nud_state = NUD_NOARP; arp_mc_map(addr, neigh->ha, dev, 1); - } else if (dev->flags&(IFF_NOARP|IFF_LOOPBACK)) { + } else if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) { neigh->nud_state = NUD_NOARP; memcpy(neigh->ha, dev->dev_addr, dev->addr_len); - } else if (neigh->type == RTN_BROADCAST || dev->flags&IFF_POINTOPOINT) { + } else if (neigh->type == RTN_BROADCAST || + (dev->flags & IFF_POINTOPOINT)) { neigh->nud_state = NUD_NOARP; memcpy(neigh->ha, dev->broadcast, dev->addr_len); } @@ -315,7 +317,7 @@ static int arp_constructor(struct neighbour *neigh) else neigh->ops = &arp_generic_ops; - if (neigh->nud_state&NUD_VALID) + if (neigh->nud_state & NUD_VALID) neigh->output = neigh->ops->connected_output; else neigh->output = neigh->ops->output; @@ -334,7 +336,7 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) __be32 saddr = 0; u8 *dst_ha = NULL; struct net_device *dev = neigh->dev; - __be32 target = *(__be32*)neigh->primary_key; + __be32 target = *(__be32 *)neigh->primary_key; int probes = atomic_read(&neigh->probes); struct in_device *in_dev; @@ -347,7 +349,8 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) switch (IN_DEV_ARP_ANNOUNCE(in_dev)) { default: case 0: /* By default announce any local IP */ - if (skb && inet_addr_type(dev_net(dev), ip_hdr(skb)->saddr) == RTN_LOCAL) + if (skb && inet_addr_type(dev_net(dev), + ip_hdr(skb)->saddr) == RTN_LOCAL) saddr = ip_hdr(skb)->saddr; break; case 1: /* Restrict announcements of saddr in same subnet */ @@ -369,16 +372,21 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) if (!saddr) saddr = inet_select_addr(dev, target, RT_SCOPE_LINK); - if ((probes -= neigh->parms->ucast_probes) < 0) { - if (!(neigh->nud_state&NUD_VALID)) - printk(KERN_DEBUG "trying to ucast probe in NUD_INVALID\n"); + probes -= neigh->parms->ucast_probes; + if (probes < 0) { + if (!(neigh->nud_state & NUD_VALID)) + printk(KERN_DEBUG + "trying to ucast probe in NUD_INVALID\n"); dst_ha = neigh->ha; read_lock_bh(&neigh->lock); - } else if ((probes -= neigh->parms->app_probes) < 0) { + } else { + probes -= neigh->parms->app_probes; + if (probes < 0) { #ifdef CONFIG_ARPD - neigh_app_ns(neigh); + neigh_app_ns(neigh); #endif - return; + return; + } } arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr, @@ -451,7 +459,8 @@ static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev) * is allowed to use this function, it is scheduled to be removed. --ANK */ -static int arp_set_predefined(int addr_hint, unsigned char * haddr, __be32 paddr, struct net_device * dev) +static int arp_set_predefined(int addr_hint, unsigned char *haddr, + __be32 paddr, struct net_device *dev) { switch (addr_hint) { case RTN_LOCAL: @@ -483,7 +492,8 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb) paddr = skb_rtable(skb)->rt_gateway; - if (arp_set_predefined(inet_addr_type(dev_net(dev), paddr), haddr, paddr, dev)) + if (arp_set_predefined(inet_addr_type(dev_net(dev), paddr), haddr, + paddr, dev)) return 0; n = __neigh_lookup(&arp_tbl, &paddr, dev, 1); @@ -515,13 +525,14 @@ int arp_bind_neighbour(struct dst_entry *dst) return -EINVAL; if (n == NULL) { __be32 nexthop = ((struct rtable *)dst)->rt_gateway; - if (dev->flags&(IFF_LOOPBACK|IFF_POINTOPOINT)) + if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) nexthop = 0; n = __neigh_lookup_errno( #if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE) - dev->type == ARPHRD_ATM ? clip_tbl_hook : + dev->type == ARPHRD_ATM ? + clip_tbl_hook : #endif - &arp_tbl, &nexthop, dev); + &arp_tbl, &nexthop, dev); if (IS_ERR(n)) return PTR_ERR(n); dst->neighbour = n; @@ -543,8 +554,8 @@ static inline int arp_fwd_proxy(struct in_device *in_dev, if (!IN_DEV_PROXY_ARP(in_dev)) return 0; - - if ((imi = IN_DEV_MEDIUM_ID(in_dev)) == 0) + imi = IN_DEV_MEDIUM_ID(in_dev); + if (imi == 0) return 1; if (imi == -1) return 0; @@ -555,7 +566,7 @@ static inline int arp_fwd_proxy(struct in_device *in_dev, if (out_dev) omi = IN_DEV_MEDIUM_ID(out_dev); - return (omi != imi && omi != -1); + return omi != imi && omi != -1; } /* @@ -685,7 +696,7 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip, arp->ar_pln = 4; arp->ar_op = htons(type); - arp_ptr=(unsigned char *)(arp+1); + arp_ptr = (unsigned char *)(arp + 1); memcpy(arp_ptr, src_hw, dev->addr_len); arp_ptr += dev->addr_len; @@ -735,9 +746,8 @@ void arp_send(int type, int ptype, __be32 dest_ip, skb = arp_create(type, ptype, dest_ip, dev, src_ip, dest_hw, src_hw, target_hw); - if (skb == NULL) { + if (skb == NULL) return; - } arp_xmit(skb); } @@ -815,7 +825,7 @@ static int arp_process(struct sk_buff *skb) /* * Extract fields */ - arp_ptr= (unsigned char *)(arp+1); + arp_ptr = (unsigned char *)(arp + 1); sha = arp_ptr; arp_ptr += dev->addr_len; memcpy(&sip, arp_ptr, 4); @@ -869,16 +879,17 @@ static int arp_process(struct sk_buff *skb) addr_type = rt->rt_type; if (addr_type == RTN_LOCAL) { - int dont_send = 0; + int dont_send; - if (!dont_send) - dont_send |= arp_ignore(in_dev,sip,tip); + dont_send = arp_ignore(in_dev, sip, tip); if (!dont_send && IN_DEV_ARPFILTER(in_dev)) - dont_send |= arp_filter(sip,tip,dev); + dont_send |= arp_filter(sip, tip, dev); if (!dont_send) { n = neigh_event_ns(&arp_tbl, sha, &sip, dev); if (n) { - arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha); + arp_send(ARPOP_REPLY, ETH_P_ARP, sip, + dev, tip, sha, dev->dev_addr, + sha); neigh_release(n); } } @@ -887,8 +898,7 @@ static int arp_process(struct sk_buff *skb) if (addr_type == RTN_UNICAST && (arp_fwd_proxy(in_dev, dev, rt) || arp_fwd_pvlan(in_dev, dev, rt, sip, tip) || - pneigh_lookup(&arp_tbl, net, &tip, dev, 0))) - { + pneigh_lookup(&arp_tbl, net, &tip, dev, 0))) { n = neigh_event_ns(&arp_tbl, sha, &sip, dev); if (n) neigh_release(n); @@ -896,9 +906,12 @@ static int arp_process(struct sk_buff *skb) if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED || skb->pkt_type == PACKET_HOST || in_dev->arp_parms->proxy_delay == 0) { - arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha); + arp_send(ARPOP_REPLY, ETH_P_ARP, sip, + dev, tip, sha, dev->dev_addr, + sha); } else { - pneigh_enqueue(&arp_tbl, in_dev->arp_parms, skb); + pneigh_enqueue(&arp_tbl, + in_dev->arp_parms, skb); return 0; } goto out; @@ -939,7 +952,8 @@ static int arp_process(struct sk_buff *skb) if (arp->ar_op != htons(ARPOP_REPLY) || skb->pkt_type != PACKET_HOST) state = NUD_STALE; - neigh_update(n, sha, state, override ? NEIGH_UPDATE_F_OVERRIDE : 0); + neigh_update(n, sha, state, + override ? NEIGH_UPDATE_F_OVERRIDE : 0); neigh_release(n); } @@ -975,7 +989,8 @@ static int arp_rcv(struct sk_buff *skb, struct net_device *dev, arp->ar_pln != 4) goto freeskb; - if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) + skb = skb_share_check(skb, GFP_ATOMIC); + if (skb == NULL) goto out_of_mem; memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb)); @@ -1019,7 +1034,7 @@ static int arp_req_set_public(struct net *net, struct arpreq *r, return -EINVAL; if (!dev && (r->arp_flags & ATF_COM)) { dev = dev_getbyhwaddr(net, r->arp_ha.sa_family, - r->arp_ha.sa_data); + r->arp_ha.sa_data); if (!dev) return -ENODEV; } @@ -1033,7 +1048,7 @@ static int arp_req_set_public(struct net *net, struct arpreq *r, } static int arp_req_set(struct net *net, struct arpreq *r, - struct net_device * dev) + struct net_device *dev) { __be32 ip; struct neighbour *neigh; @@ -1046,10 +1061,11 @@ static int arp_req_set(struct net *net, struct arpreq *r, if (r->arp_flags & ATF_PERM) r->arp_flags |= ATF_COM; if (dev == NULL) { - struct flowi fl = { .nl_u = { .ip4_u = { .daddr = ip, - .tos = RTO_ONLINK } } }; - struct rtable * rt; - if ((err = ip_route_output_key(net, &rt, &fl)) != 0) + struct flowi fl = { .nl_u.ip4_u = { .daddr = ip, + .tos = RTO_ONLINK } }; + struct rtable *rt; + err = ip_route_output_key(net, &rt, &fl); + if (err != 0) return err; dev = rt->dst.dev; ip_rt_put(rt); @@ -1083,9 +1099,9 @@ static int arp_req_set(struct net *net, struct arpreq *r, unsigned state = NUD_STALE; if (r->arp_flags & ATF_PERM) state = NUD_PERMANENT; - err = neigh_update(neigh, (r->arp_flags&ATF_COM) ? + err = neigh_update(neigh, (r->arp_flags & ATF_COM) ? r->arp_ha.sa_data : NULL, state, - NEIGH_UPDATE_F_OVERRIDE| + NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_ADMIN); neigh_release(neigh); } @@ -1094,12 +1110,12 @@ static int arp_req_set(struct net *net, struct arpreq *r, static unsigned arp_state_to_flags(struct neighbour *neigh) { - unsigned flags = 0; if (neigh->nud_state&NUD_PERMANENT) - flags = ATF_PERM|ATF_COM; + return ATF_PERM | ATF_COM; else if (neigh->nud_state&NUD_VALID) - flags = ATF_COM; - return flags; + return ATF_COM; + else + return 0; } /* @@ -1142,7 +1158,7 @@ static int arp_req_delete_public(struct net *net, struct arpreq *r, } static int arp_req_delete(struct net *net, struct arpreq *r, - struct net_device * dev) + struct net_device *dev) { int err; __be32 ip; @@ -1153,10 +1169,11 @@ static int arp_req_delete(struct net *net, struct arpreq *r, ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; if (dev == NULL) { - struct flowi fl = { .nl_u = { .ip4_u = { .daddr = ip, - .tos = RTO_ONLINK } } }; - struct rtable * rt; - if ((err = ip_route_output_key(net, &rt, &fl)) != 0) + struct flowi fl = { .nl_u.ip4_u = { .daddr = ip, + .tos = RTO_ONLINK } }; + struct rtable *rt; + err = ip_route_output_key(net, &rt, &fl); + if (err != 0) return err; dev = rt->dst.dev; ip_rt_put(rt); @@ -1166,7 +1183,7 @@ static int arp_req_delete(struct net *net, struct arpreq *r, err = -ENXIO; neigh = neigh_lookup(&arp_tbl, &ip, dev); if (neigh) { - if (neigh->nud_state&~NUD_NOARP) + if (neigh->nud_state & ~NUD_NOARP) err = neigh_update(neigh, NULL, NUD_FAILED, NEIGH_UPDATE_F_OVERRIDE| NEIGH_UPDATE_F_ADMIN); @@ -1186,24 +1203,24 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg) struct net_device *dev = NULL; switch (cmd) { - case SIOCDARP: - case SIOCSARP: - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - case SIOCGARP: - err = copy_from_user(&r, arg, sizeof(struct arpreq)); - if (err) - return -EFAULT; - break; - default: - return -EINVAL; + case SIOCDARP: + case SIOCSARP: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + case SIOCGARP: + err = copy_from_user(&r, arg, sizeof(struct arpreq)); + if (err) + return -EFAULT; + break; + default: + return -EINVAL; } if (r.arp_pa.sa_family != AF_INET) return -EPFNOSUPPORT; if (!(r.arp_flags & ATF_PUBL) && - (r.arp_flags & (ATF_NETMASK|ATF_DONTPUB))) + (r.arp_flags & (ATF_NETMASK | ATF_DONTPUB))) return -EINVAL; if (!(r.arp_flags & ATF_NETMASK)) ((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr = @@ -1211,7 +1228,8 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg) rtnl_lock(); if (r.arp_dev[0]) { err = -ENODEV; - if ((dev = __dev_get_by_name(net, r.arp_dev)) == NULL) + dev = __dev_get_by_name(net, r.arp_dev); + if (dev == NULL) goto out; /* Mmmm... It is wrong... ARPHRD_NETROM==0 */ @@ -1243,7 +1261,8 @@ out: return err; } -static int arp_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) +static int arp_netdev_event(struct notifier_block *this, unsigned long event, + void *ptr) { struct net_device *dev = ptr; @@ -1311,12 +1330,13 @@ static char *ax2asc2(ax25_address *a, char *buf) for (n = 0, s = buf; n < 6; n++) { c = (a->ax25_call[n] >> 1) & 0x7F; - if (c != ' ') *s++ = c; + if (c != ' ') + *s++ = c; } *s++ = '-'; - - if ((n = ((a->ax25_call[6] >> 1) & 0x0F)) > 9) { + n = (a->ax25_call[6] >> 1) & 0x0F; + if (n > 9) { *s++ = '1'; n -= 10; } @@ -1325,10 +1345,9 @@ static char *ax2asc2(ax25_address *a, char *buf) *s++ = '\0'; if (*buf == '\0' || *buf == '-') - return "*"; + return "*"; return buf; - } #endif /* CONFIG_AX25 */ @@ -1408,10 +1427,10 @@ static void *arp_seq_start(struct seq_file *seq, loff_t *pos) /* ------------------------------------------------------------------------ */ static const struct seq_operations arp_seq_ops = { - .start = arp_seq_start, - .next = neigh_seq_next, - .stop = neigh_seq_stop, - .show = arp_seq_show, + .start = arp_seq_start, + .next = neigh_seq_next, + .stop = neigh_seq_stop, + .show = arp_seq_show, }; static int arp_seq_open(struct inode *inode, struct file *file) diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index 721a8a3..174be6c 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -73,6 +73,6 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) inet->inet_id = jiffies; sk_dst_set(sk, &rt->dst); - return(0); + return 0; } EXPORT_SYMBOL(ip4_datagram_connect); diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index da14c49..c2ff48f 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -209,7 +209,7 @@ static void inetdev_destroy(struct in_device *in_dev) inet_free_ifa(ifa); } - dev->ip_ptr = NULL; + rcu_assign_pointer(dev->ip_ptr, NULL); devinet_sysctl_unregister(in_dev); neigh_parms_release(&arp_tbl, in_dev->arp_parms); @@ -1059,7 +1059,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event, switch (event) { case NETDEV_REGISTER: printk(KERN_DEBUG "inetdev_event: bug\n"); - dev->ip_ptr = NULL; + rcu_assign_pointer(dev->ip_ptr, NULL); break; case NETDEV_UP: if (!inetdev_valid_mtu(dev->mtu)) diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 7d02a9f..4a69a95 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -147,35 +147,40 @@ static void fib_flush(struct net *net) rt_cache_flush(net, -1); } -/* - * Find the first device with a given source address. +/** + * __ip_dev_find - find the first device with a given source address. + * @net: the net namespace + * @addr: the source address + * @devref: if true, take a reference on the found device + * + * If a caller uses devref=false, it should be protected by RCU */ - -struct net_device * ip_dev_find(struct net *net, __be32 addr) +struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref) { - struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } } }; - struct fib_result res; + struct flowi fl = { + .nl_u = { + .ip4_u = { + .daddr = addr + } + }, + .flags = FLOWI_FLAG_MATCH_ANY_IIF + }; + struct fib_result res = { 0 }; struct net_device *dev = NULL; - struct fib_table *local_table; -#ifdef CONFIG_IP_MULTIPLE_TABLES - res.r = NULL; -#endif - - local_table = fib_get_table(net, RT_TABLE_LOCAL); - if (!local_table || fib_table_lookup(local_table, &fl, &res)) + if (fib_lookup(net, &fl, &res)) return NULL; if (res.type != RTN_LOCAL) goto out; dev = FIB_RES_DEV(res); - if (dev) + if (dev && devref) dev_hold(dev); out: fib_res_put(&res); return dev; } -EXPORT_SYMBOL(ip_dev_find); +EXPORT_SYMBOL(__ip_dev_find); /* * Find address type as if only "dev" was present in the system. If diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 4a8e370..a96e5ec 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -186,9 +186,7 @@ static inline struct tnode *node_parent_rcu(struct node *node) { struct tnode *ret = node_parent(node); - return rcu_dereference_check(ret, - rcu_read_lock_held() || - lockdep_rtnl_is_held()); + return rcu_dereference_rtnl(ret); } /* Same as rcu_assign_pointer @@ -211,9 +209,7 @@ static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i) { struct node *ret = tnode_get_child(tn, i); - return rcu_dereference_check(ret, - rcu_read_lock_held() || - lockdep_rtnl_is_held()); + return rcu_dereference_rtnl(ret); } static inline int tnode_child_length(const struct tnode *tn) @@ -459,8 +455,8 @@ static struct tnode *tnode_new(t_key key, int pos, int bits) tn->empty_children = 1<<bits; } - pr_debug("AT %p s=%u %lu\n", tn, (unsigned int) sizeof(struct tnode), - (unsigned long) (sizeof(struct node) << bits)); + pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode), + sizeof(struct node) << bits); return tn; } @@ -609,11 +605,10 @@ static struct node *resize(struct trie *t, struct tnode *tn) /* Keep root node larger */ - if (!node_parent((struct node*) tn)) { + if (!node_parent((struct node *)tn)) { inflate_threshold_use = inflate_threshold_root; halve_threshold_use = halve_threshold_root; - } - else { + } else { inflate_threshold_use = inflate_threshold; halve_threshold_use = halve_threshold; } @@ -639,7 +634,7 @@ static struct node *resize(struct trie *t, struct tnode *tn) check_tnode(tn); /* Return if at least one inflate is run */ - if( max_work != MAX_WORK) + if (max_work != MAX_WORK) return (struct node *) tn; /* @@ -966,9 +961,7 @@ fib_find_node(struct trie *t, u32 key) struct node *n; pos = 0; - n = rcu_dereference_check(t->trie, - rcu_read_lock_held() || - lockdep_rtnl_is_held()); + n = rcu_dereference_rtnl(t->trie); while (n != NULL && NODE_TYPE(n) == T_TNODE) { tn = (struct tnode *) n; @@ -1748,16 +1741,14 @@ static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c) /* Node empty, walk back up to parent */ c = (struct node *) p; - } while ( (p = node_parent_rcu(c)) != NULL); + } while ((p = node_parent_rcu(c)) != NULL); return NULL; /* Root of trie */ } static struct leaf *trie_firstleaf(struct trie *t) { - struct tnode *n = (struct tnode *) rcu_dereference_check(t->trie, - rcu_read_lock_held() || - lockdep_rtnl_is_held()); + struct tnode *n = (struct tnode *)rcu_dereference_rtnl(t->trie); if (!n) return NULL; @@ -2043,14 +2034,14 @@ struct fib_trie_iter { struct seq_net_private p; struct fib_table *tb; struct tnode *tnode; - unsigned index; - unsigned depth; + unsigned int index; + unsigned int depth; }; static struct node *fib_trie_get_next(struct fib_trie_iter *iter) { struct tnode *tn = iter->tnode; - unsigned cindex = iter->index; + unsigned int cindex = iter->index; struct tnode *p; /* A single entry routing table */ @@ -2159,7 +2150,7 @@ static void trie_collect_stats(struct trie *t, struct trie_stat *s) */ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat) { - unsigned i, max, pointers, bytes, avdepth; + unsigned int i, max, pointers, bytes, avdepth; if (stat->leaves) avdepth = stat->totdepth*100 / stat->leaves; @@ -2356,7 +2347,8 @@ static void fib_trie_seq_stop(struct seq_file *seq, void *v) static void seq_indent(struct seq_file *seq, int n) { - while (n-- > 0) seq_puts(seq, " "); + while (n-- > 0) + seq_puts(seq, " "); } static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s) @@ -2388,7 +2380,7 @@ static const char *const rtn_type_names[__RTN_MAX] = { [RTN_XRESOLVE] = "XRESOLVE", }; -static inline const char *rtn_type(char *buf, size_t len, unsigned t) +static inline const char *rtn_type(char *buf, size_t len, unsigned int t) { if (t < __RTN_MAX && rtn_type_names[t]) return rtn_type_names[t]; @@ -2544,13 +2536,12 @@ static void fib_route_seq_stop(struct seq_file *seq, void *v) rcu_read_unlock(); } -static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi) +static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi) { - static unsigned type2flags[RTN_MAX + 1] = { - [7] = RTF_REJECT, [8] = RTF_REJECT, - }; - unsigned flags = type2flags[type]; + unsigned int flags = 0; + if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT) + flags = RTF_REJECT; if (fi && fi->fib_nh->nh_gw) flags |= RTF_GATEWAY; if (mask == htonl(0xFFFFFFFF)) @@ -2562,7 +2553,7 @@ static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi) /* * This outputs /proc/net/route. * The format of the file is not supposed to be changed - * and needs to be same as fib_hash output to avoid breaking + * and needs to be same as fib_hash output to avoid breaking * legacy utilities */ static int fib_route_seq_show(struct seq_file *seq, void *v) @@ -2587,7 +2578,7 @@ static int fib_route_seq_show(struct seq_file *seq, void *v) list_for_each_entry_rcu(fa, &li->falh, fa_list) { const struct fib_info *fi = fa->fa_info; - unsigned flags = fib_flag_trans(fa->fa_type, mask, fi); + unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi); int len; if (fa->fa_type == RTN_BROADCAST diff --git a/net/ipv4/gre.c b/net/ipv4/gre.c new file mode 100644 index 0000000..caea688 --- /dev/null +++ b/net/ipv4/gre.c @@ -0,0 +1,151 @@ +/* + * GRE over IPv4 demultiplexer driver + * + * Authors: Dmitry Kozlov (xeb@mail.ru) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/kmod.h> +#include <linux/skbuff.h> +#include <linux/in.h> +#include <linux/netdevice.h> +#include <linux/version.h> +#include <linux/spinlock.h> +#include <net/protocol.h> +#include <net/gre.h> + + +static const struct gre_protocol *gre_proto[GREPROTO_MAX] __read_mostly; +static DEFINE_SPINLOCK(gre_proto_lock); + +int gre_add_protocol(const struct gre_protocol *proto, u8 version) +{ + if (version >= GREPROTO_MAX) + goto err_out; + + spin_lock(&gre_proto_lock); + if (gre_proto[version]) + goto err_out_unlock; + + rcu_assign_pointer(gre_proto[version], proto); + spin_unlock(&gre_proto_lock); + return 0; + +err_out_unlock: + spin_unlock(&gre_proto_lock); +err_out: + return -1; +} +EXPORT_SYMBOL_GPL(gre_add_protocol); + +int gre_del_protocol(const struct gre_protocol *proto, u8 version) +{ + if (version >= GREPROTO_MAX) + goto err_out; + + spin_lock(&gre_proto_lock); + if (gre_proto[version] != proto) + goto err_out_unlock; + rcu_assign_pointer(gre_proto[version], NULL); + spin_unlock(&gre_proto_lock); + synchronize_rcu(); + return 0; + +err_out_unlock: + spin_unlock(&gre_proto_lock); +err_out: + return -1; +} +EXPORT_SYMBOL_GPL(gre_del_protocol); + +static int gre_rcv(struct sk_buff *skb) +{ + const struct gre_protocol *proto; + u8 ver; + int ret; + + if (!pskb_may_pull(skb, 12)) + goto drop; + + ver = skb->data[1]&0x7f; + if (ver >= GREPROTO_MAX) + goto drop; + + rcu_read_lock(); + proto = rcu_dereference(gre_proto[ver]); + if (!proto || !proto->handler) + goto drop_unlock; + ret = proto->handler(skb); + rcu_read_unlock(); + return ret; + +drop_unlock: + rcu_read_unlock(); +drop: + kfree_skb(skb); + return NET_RX_DROP; +} + +static void gre_err(struct sk_buff *skb, u32 info) +{ + const struct gre_protocol *proto; + u8 ver; + + if (!pskb_may_pull(skb, 12)) + goto drop; + + ver = skb->data[1]&0x7f; + if (ver >= GREPROTO_MAX) + goto drop; + + rcu_read_lock(); + proto = rcu_dereference(gre_proto[ver]); + if (!proto || !proto->err_handler) + goto drop_unlock; + proto->err_handler(skb, info); + rcu_read_unlock(); + return; + +drop_unlock: + rcu_read_unlock(); +drop: + kfree_skb(skb); +} + +static const struct net_protocol net_gre_protocol = { + .handler = gre_rcv, + .err_handler = gre_err, + .netns_ok = 1, +}; + +static int __init gre_init(void) +{ + pr_info("GRE over IPv4 demultiplexor driver"); + + if (inet_add_protocol(&net_gre_protocol, IPPROTO_GRE) < 0) { + pr_err("gre: can't add protocol\n"); + return -EAGAIN; + } + + return 0; +} + +static void __exit gre_exit(void) +{ + inet_del_protocol(&net_gre_protocol, IPPROTO_GRE); +} + +module_init(gre_init); +module_exit(gre_exit); + +MODULE_DESCRIPTION("GRE over IPv4 demultiplexer driver"); +MODULE_AUTHOR("D. Kozlov (xeb@mail.ru)"); +MODULE_LICENSE("GPL"); + diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index a0d847c..96bc7f9 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -379,7 +379,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) inet->tos = ip_hdr(skb)->tos; daddr = ipc.addr = rt->rt_src; ipc.opt = NULL; - ipc.shtx.flags = 0; + ipc.tx_flags = 0; if (icmp_param->replyopts.optlen) { ipc.opt = &icmp_param->replyopts; if (ipc.opt->srr) @@ -538,7 +538,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) inet_sk(sk)->tos = tos; ipc.addr = iph->saddr; ipc.opt = &icmp_param.replyopts; - ipc.shtx.flags = 0; + ipc.tx_flags = 0; { struct flowi fl = { diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index e5fa2dd..ba80426 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -425,7 +425,7 @@ static int inet_diag_bc_run(const void *bc, int len, bc += op->no; } } - return (len == 0); + return len == 0; } static int valid_cc(const void *bc, int len, int cc) diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index b7c4165..1684408 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -116,11 +116,11 @@ static int ip4_frag_match(struct inet_frag_queue *q, void *a) struct ip4_create_arg *arg = a; qp = container_of(q, struct ipq, q); - return (qp->id == arg->iph->id && + return qp->id == arg->iph->id && qp->saddr == arg->iph->saddr && qp->daddr == arg->iph->daddr && qp->protocol == arg->iph->protocol && - qp->user == arg->user); + qp->user == arg->user; } /* Memory Tracking Functions. */ @@ -542,7 +542,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, /* If the first fragment is fragmented itself, we split * it to two chunks: the first with data and paged part * and the second, holding only fragments. */ - if (skb_has_frags(head)) { + if (skb_has_frag_list(head)) { struct sk_buff *clone; int i, plen = 0; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 35c93e8..fbe2c47 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -44,6 +44,7 @@ #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/rtnetlink.h> +#include <net/gre.h> #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) #include <net/ipv6.h> @@ -63,13 +64,13 @@ We cannot track such dead loops during route installation, it is infeasible task. The most general solutions would be to keep skb->encapsulation counter (sort of local ttl), - and silently drop packet when it expires. It is the best + and silently drop packet when it expires. It is a good solution, but it supposes maintaing new variable in ALL skb, even if no tunneling is used. - Current solution: HARD_TX_LOCK lock breaks dead loops. - - + Current solution: xmit_recursion breaks dead loops. This is a percpu + counter, since when we enter the first ndo_xmit(), cpu migration is + forbidden. We force an exit if this counter reaches RECURSION_LIMIT 2. Networking dead loops would not kill routers, but would really kill network. IP hop limit plays role of "t->recursion" in this case, @@ -128,7 +129,7 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev); static int ipgre_net_id __read_mostly; struct ipgre_net { - struct ip_tunnel *tunnels[4][HASH_SIZE]; + struct ip_tunnel __rcu *tunnels[4][HASH_SIZE]; struct net_device *fb_tunnel_dev; }; @@ -158,13 +159,40 @@ struct ipgre_net { #define tunnels_l tunnels[1] #define tunnels_wc tunnels[0] /* - * Locking : hash tables are protected by RCU and a spinlock + * Locking : hash tables are protected by RCU and RTNL */ -static DEFINE_SPINLOCK(ipgre_lock); #define for_each_ip_tunnel_rcu(start) \ for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) +/* often modified stats are per cpu, other are shared (netdev->stats) */ +struct pcpu_tstats { + unsigned long rx_packets; + unsigned long rx_bytes; + unsigned long tx_packets; + unsigned long tx_bytes; +}; + +static struct net_device_stats *ipgre_get_stats(struct net_device *dev) +{ + struct pcpu_tstats sum = { 0 }; + int i; + + for_each_possible_cpu(i) { + const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i); + + sum.rx_packets += tstats->rx_packets; + sum.rx_bytes += tstats->rx_bytes; + sum.tx_packets += tstats->tx_packets; + sum.tx_bytes += tstats->tx_bytes; + } + dev->stats.rx_packets = sum.rx_packets; + dev->stats.rx_bytes = sum.rx_bytes; + dev->stats.tx_packets = sum.tx_packets; + dev->stats.tx_bytes = sum.tx_bytes; + return &dev->stats; +} + /* Given src, dst and key, find appropriate for input tunnel. */ static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev, @@ -173,8 +201,8 @@ static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev, { struct net *net = dev_net(dev); int link = dev->ifindex; - unsigned h0 = HASH(remote); - unsigned h1 = HASH(key); + unsigned int h0 = HASH(remote); + unsigned int h1 = HASH(key); struct ip_tunnel *t, *cand = NULL; struct ipgre_net *ign = net_generic(net, ipgre_net_id); int dev_type = (gre_proto == htons(ETH_P_TEB)) ? @@ -289,13 +317,13 @@ static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev, return NULL; } -static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign, +static struct ip_tunnel __rcu **__ipgre_bucket(struct ipgre_net *ign, struct ip_tunnel_parm *parms) { __be32 remote = parms->iph.daddr; __be32 local = parms->iph.saddr; __be32 key = parms->i_key; - unsigned h = HASH(key); + unsigned int h = HASH(key); int prio = 0; if (local) @@ -308,7 +336,7 @@ static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign, return &ign->tunnels[prio][h]; } -static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign, +static inline struct ip_tunnel __rcu **ipgre_bucket(struct ipgre_net *ign, struct ip_tunnel *t) { return __ipgre_bucket(ign, &t->parms); @@ -316,23 +344,22 @@ static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign, static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t) { - struct ip_tunnel **tp = ipgre_bucket(ign, t); + struct ip_tunnel __rcu **tp = ipgre_bucket(ign, t); - spin_lock_bh(&ipgre_lock); - t->next = *tp; + rcu_assign_pointer(t->next, rtnl_dereference(*tp)); rcu_assign_pointer(*tp, t); - spin_unlock_bh(&ipgre_lock); } static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t) { - struct ip_tunnel **tp; - - for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) { - if (t == *tp) { - spin_lock_bh(&ipgre_lock); - *tp = t->next; - spin_unlock_bh(&ipgre_lock); + struct ip_tunnel __rcu **tp; + struct ip_tunnel *iter; + + for (tp = ipgre_bucket(ign, t); + (iter = rtnl_dereference(*tp)) != NULL; + tp = &iter->next) { + if (t == iter) { + rcu_assign_pointer(*tp, t->next); break; } } @@ -346,10 +373,13 @@ static struct ip_tunnel *ipgre_tunnel_find(struct net *net, __be32 local = parms->iph.saddr; __be32 key = parms->i_key; int link = parms->link; - struct ip_tunnel *t, **tp; + struct ip_tunnel *t; + struct ip_tunnel __rcu **tp; struct ipgre_net *ign = net_generic(net, ipgre_net_id); - for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next) + for (tp = __ipgre_bucket(ign, parms); + (t = rtnl_dereference(*tp)) != NULL; + tp = &t->next) if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr && key == t->parms.i_key && @@ -360,7 +390,7 @@ static struct ip_tunnel *ipgre_tunnel_find(struct net *net, return t; } -static struct ip_tunnel * ipgre_tunnel_locate(struct net *net, +static struct ip_tunnel *ipgre_tunnel_locate(struct net *net, struct ip_tunnel_parm *parms, int create) { struct ip_tunnel *t, *nt; @@ -582,7 +612,7 @@ static int ipgre_rcv(struct sk_buff *skb) if ((tunnel = ipgre_tunnel_lookup(skb->dev, iph->saddr, iph->daddr, key, gre_proto))) { - struct net_device_stats *stats = &tunnel->dev->stats; + struct pcpu_tstats *tstats; secpath_reset(skb); @@ -606,22 +636,22 @@ static int ipgre_rcv(struct sk_buff *skb) /* Looped back packet, drop it! */ if (skb_rtable(skb)->fl.iif == 0) goto drop; - stats->multicast++; + tunnel->dev->stats.multicast++; skb->pkt_type = PACKET_BROADCAST; } #endif if (((flags&GRE_CSUM) && csum) || (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) { - stats->rx_crc_errors++; - stats->rx_errors++; + tunnel->dev->stats.rx_crc_errors++; + tunnel->dev->stats.rx_errors++; goto drop; } if (tunnel->parms.i_flags&GRE_SEQ) { if (!(flags&GRE_SEQ) || (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) { - stats->rx_fifo_errors++; - stats->rx_errors++; + tunnel->dev->stats.rx_fifo_errors++; + tunnel->dev->stats.rx_errors++; goto drop; } tunnel->i_seqno = seqno + 1; @@ -630,8 +660,8 @@ static int ipgre_rcv(struct sk_buff *skb) /* Warning: All skb pointers will be invalidated! */ if (tunnel->dev->type == ARPHRD_ETHER) { if (!pskb_may_pull(skb, ETH_HLEN)) { - stats->rx_length_errors++; - stats->rx_errors++; + tunnel->dev->stats.rx_length_errors++; + tunnel->dev->stats.rx_errors++; goto drop; } @@ -640,14 +670,20 @@ static int ipgre_rcv(struct sk_buff *skb) skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); } - skb_tunnel_rx(skb, tunnel->dev); + tstats = this_cpu_ptr(tunnel->dev->tstats); + tstats->rx_packets++; + tstats->rx_bytes += skb->len; + + __skb_tunnel_rx(skb, tunnel->dev); skb_reset_network_header(skb); ipgre_ecn_decapsulate(iph, skb); - netif_rx(skb); + if (netif_rx(skb) == NET_RX_DROP) + tunnel->dev->stats.rx_dropped++; + rcu_read_unlock(); - return(0); + return 0; } icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); @@ -655,20 +691,19 @@ drop: rcu_read_unlock(); drop_nolock: kfree_skb(skb); - return(0); + return 0; } static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); - struct net_device_stats *stats = &dev->stats; - struct netdev_queue *txq = netdev_get_tx_queue(dev, 0); + struct pcpu_tstats *tstats; struct iphdr *old_iph = ip_hdr(skb); struct iphdr *tiph; u8 tos; __be16 df; struct rtable *rt; /* Route to the other host */ - struct net_device *tdev; /* Device to other host */ + struct net_device *tdev; /* Device to other host */ struct iphdr *iph; /* Our new IP header */ unsigned int max_headroom; /* The extra header space needed */ int gre_hlen; @@ -690,7 +725,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev /* NBMA tunnel */ if (skb_dst(skb) == NULL) { - stats->tx_fifo_errors++; + dev->stats.tx_fifo_errors++; goto tx_error; } @@ -736,14 +771,20 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev } { - struct flowi fl = { .oif = tunnel->parms.link, - .nl_u = { .ip4_u = - { .daddr = dst, - .saddr = tiph->saddr, - .tos = RT_TOS(tos) } }, - .proto = IPPROTO_GRE }; + struct flowi fl = { + .oif = tunnel->parms.link, + .nl_u = { + .ip4_u = { + .daddr = dst, + .saddr = tiph->saddr, + .tos = RT_TOS(tos) + } + }, + .proto = IPPROTO_GRE + } +; if (ip_route_output_key(dev_net(dev), &rt, &fl)) { - stats->tx_carrier_errors++; + dev->stats.tx_carrier_errors++; goto tx_error; } } @@ -751,7 +792,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev if (tdev == dev) { ip_rt_put(rt); - stats->collisions++; + dev->stats.collisions++; goto tx_error; } @@ -814,7 +855,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev dev->needed_headroom = max_headroom; if (!new_skb) { ip_rt_put(rt); - txq->tx_dropped++; + dev->stats.tx_dropped++; dev_kfree_skb(skb); return NETDEV_TX_OK; } @@ -881,15 +922,15 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev } nf_reset(skb); - - IPTUNNEL_XMIT(); + tstats = this_cpu_ptr(dev->tstats); + __IPTUNNEL_XMIT(tstats, &dev->stats); return NETDEV_TX_OK; tx_error_icmp: dst_link_failure(skb); tx_error: - stats->tx_errors++; + dev->stats.tx_errors++; dev_kfree_skb(skb); return NETDEV_TX_OK; } @@ -909,13 +950,19 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev) /* Guess output device to choose reasonable mtu and needed_headroom */ if (iph->daddr) { - struct flowi fl = { .oif = tunnel->parms.link, - .nl_u = { .ip4_u = - { .daddr = iph->daddr, - .saddr = iph->saddr, - .tos = RT_TOS(iph->tos) } }, - .proto = IPPROTO_GRE }; + struct flowi fl = { + .oif = tunnel->parms.link, + .nl_u = { + .ip4_u = { + .daddr = iph->daddr, + .saddr = iph->saddr, + .tos = RT_TOS(iph->tos) + } + }, + .proto = IPPROTO_GRE + }; struct rtable *rt; + if (!ip_route_output_key(dev_net(dev), &rt, &fl)) { tdev = rt->dst.dev; ip_rt_put(rt); @@ -1012,7 +1059,7 @@ ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) break; } } else { - unsigned nflags = 0; + unsigned int nflags = 0; t = netdev_priv(dev); @@ -1125,7 +1172,7 @@ static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu) static int ipgre_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, - const void *daddr, const void *saddr, unsigned len) + const void *daddr, const void *saddr, unsigned int len) { struct ip_tunnel *t = netdev_priv(dev); struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen); @@ -1167,13 +1214,19 @@ static int ipgre_open(struct net_device *dev) struct ip_tunnel *t = netdev_priv(dev); if (ipv4_is_multicast(t->parms.iph.daddr)) { - struct flowi fl = { .oif = t->parms.link, - .nl_u = { .ip4_u = - { .daddr = t->parms.iph.daddr, - .saddr = t->parms.iph.saddr, - .tos = RT_TOS(t->parms.iph.tos) } }, - .proto = IPPROTO_GRE }; + struct flowi fl = { + .oif = t->parms.link, + .nl_u = { + .ip4_u = { + .daddr = t->parms.iph.daddr, + .saddr = t->parms.iph.saddr, + .tos = RT_TOS(t->parms.iph.tos) + } + }, + .proto = IPPROTO_GRE + }; struct rtable *rt; + if (ip_route_output_key(dev_net(dev), &rt, &fl)) return -EADDRNOTAVAIL; dev = rt->dst.dev; @@ -1213,12 +1266,19 @@ static const struct net_device_ops ipgre_netdev_ops = { .ndo_start_xmit = ipgre_tunnel_xmit, .ndo_do_ioctl = ipgre_tunnel_ioctl, .ndo_change_mtu = ipgre_tunnel_change_mtu, + .ndo_get_stats = ipgre_get_stats, }; +static void ipgre_dev_free(struct net_device *dev) +{ + free_percpu(dev->tstats); + free_netdev(dev); +} + static void ipgre_tunnel_setup(struct net_device *dev) { dev->netdev_ops = &ipgre_netdev_ops; - dev->destructor = free_netdev; + dev->destructor = ipgre_dev_free; dev->type = ARPHRD_IPGRE; dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4; @@ -1256,6 +1316,10 @@ static int ipgre_tunnel_init(struct net_device *dev) } else dev->header_ops = &ipgre_header_ops; + dev->tstats = alloc_percpu(struct pcpu_tstats); + if (!dev->tstats) + return -ENOMEM; + return 0; } @@ -1274,14 +1338,13 @@ static void ipgre_fb_tunnel_init(struct net_device *dev) tunnel->hlen = sizeof(struct iphdr) + 4; dev_hold(dev); - ign->tunnels_wc[0] = tunnel; + rcu_assign_pointer(ign->tunnels_wc[0], tunnel); } -static const struct net_protocol ipgre_protocol = { - .handler = ipgre_rcv, - .err_handler = ipgre_err, - .netns_ok = 1, +static const struct gre_protocol ipgre_protocol = { + .handler = ipgre_rcv, + .err_handler = ipgre_err, }; static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head) @@ -1291,11 +1354,13 @@ static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head) for (prio = 0; prio < 4; prio++) { int h; for (h = 0; h < HASH_SIZE; h++) { - struct ip_tunnel *t = ign->tunnels[prio][h]; + struct ip_tunnel *t; + + t = rtnl_dereference(ign->tunnels[prio][h]); while (t != NULL) { unregister_netdevice_queue(t->dev, head); - t = t->next; + t = rtnl_dereference(t->next); } } } @@ -1441,6 +1506,10 @@ static int ipgre_tap_init(struct net_device *dev) ipgre_tunnel_bind_dev(dev); + dev->tstats = alloc_percpu(struct pcpu_tstats); + if (!dev->tstats) + return -ENOMEM; + return 0; } @@ -1451,6 +1520,7 @@ static const struct net_device_ops ipgre_tap_netdev_ops = { .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, .ndo_change_mtu = ipgre_tunnel_change_mtu, + .ndo_get_stats = ipgre_get_stats, }; static void ipgre_tap_setup(struct net_device *dev) @@ -1459,7 +1529,7 @@ static void ipgre_tap_setup(struct net_device *dev) ether_setup(dev); dev->netdev_ops = &ipgre_tap_netdev_ops; - dev->destructor = free_netdev; + dev->destructor = ipgre_dev_free; dev->iflink = 0; dev->features |= NETIF_F_NETNS_LOCAL; @@ -1487,6 +1557,10 @@ static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nla if (!tb[IFLA_MTU]) dev->mtu = mtu; + /* Can use a lockless transmit, unless we generate output sequences */ + if (!(nt->parms.o_flags & GRE_SEQ)) + dev->features |= NETIF_F_LLTX; + err = register_netdevice(dev); if (err) goto out; @@ -1522,7 +1596,7 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[], t = nt; if (dev->type != ARPHRD_ETHER) { - unsigned nflags = 0; + unsigned int nflags = 0; if (ipv4_is_multicast(p.iph.daddr)) nflags = IFF_BROADCAST; @@ -1663,7 +1737,7 @@ static int __init ipgre_init(void) if (err < 0) return err; - err = inet_add_protocol(&ipgre_protocol, IPPROTO_GRE); + err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO); if (err < 0) { printk(KERN_INFO "ipgre init: can't add protocol\n"); goto add_proto_failed; @@ -1683,7 +1757,7 @@ out: tap_ops_failed: rtnl_link_unregister(&ipgre_link_ops); rtnl_link_failed: - inet_del_protocol(&ipgre_protocol, IPPROTO_GRE); + gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO); add_proto_failed: unregister_pernet_device(&ipgre_net_ops); goto out; @@ -1693,7 +1767,7 @@ static void __exit ipgre_fini(void) { rtnl_link_unregister(&ipgre_tap_ops); rtnl_link_unregister(&ipgre_link_ops); - if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) + if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0) printk(KERN_INFO "ipgre close: can't remove protocol\n"); unregister_pernet_device(&ipgre_net_ops); } diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index ba9836c..1906fa3 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -466,7 +466,7 @@ error: } return -EINVAL; } - +EXPORT_SYMBOL(ip_options_compile); /* * Undo all the changes done by ip_options_compile(). @@ -646,3 +646,4 @@ int ip_options_rcv_srr(struct sk_buff *skb) } return 0; } +EXPORT_SYMBOL(ip_options_rcv_srr); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 7649d77..439d2a3 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -487,7 +487,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) * LATER: this step can be merged to real generation of fragments, * we can switch to copy when see the first bad fragment. */ - if (skb_has_frags(skb)) { + if (skb_has_frag_list(skb)) { struct sk_buff *frag, *frag2; int first_len = skb_pagelen(skb); @@ -844,10 +844,9 @@ int ip_append_data(struct sock *sk, inet->cork.length = 0; sk->sk_sndmsg_page = NULL; sk->sk_sndmsg_off = 0; - if ((exthdrlen = rt->dst.header_len) != 0) { - length += exthdrlen; - transhdrlen += exthdrlen; - } + exthdrlen = rt->dst.header_len; + length += exthdrlen; + transhdrlen += exthdrlen; } else { rt = (struct rtable *)inet->cork.dst; if (inet->cork.flags & IPCORK_OPT) @@ -934,16 +933,19 @@ alloc_new_skb: !(rt->dst.dev->features&NETIF_F_SG)) alloclen = mtu; else - alloclen = datalen + fragheaderlen; + alloclen = fraglen; /* The last fragment gets additional space at tail. * Note, with MSG_MORE we overallocate on fragments, * because we have no idea what fragment will be * the last. */ - if (datalen == length + fraggap) + if (datalen == length + fraggap) { alloclen += rt->dst.trailer_len; - + /* make sure mtu is not reached */ + if (datalen > mtu - fragheaderlen - rt->dst.trailer_len) + datalen -= ALIGN(rt->dst.trailer_len, 8); + } if (transhdrlen) { skb = sock_alloc_send_skb(sk, alloclen + hh_len + 15, @@ -960,7 +962,7 @@ alloc_new_skb: else /* only the initial fragment is time stamped */ - ipc->shtx.flags = 0; + ipc->tx_flags = 0; } if (skb == NULL) goto error; @@ -971,7 +973,7 @@ alloc_new_skb: skb->ip_summed = csummode; skb->csum = 0; skb_reserve(skb, hh_len); - *skb_tx(skb) = ipc->shtx; + skb_shinfo(skb)->tx_flags = ipc->tx_flags; /* * Find where to start putting bytes. @@ -1391,7 +1393,7 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar daddr = ipc.addr = rt->rt_src; ipc.opt = NULL; - ipc.shtx.flags = 0; + ipc.tx_flags = 0; if (replyopts.opt.optlen) { ipc.opt = &replyopts.opt; diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index ec03673..6ad46c2 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -122,31 +122,59 @@ static int ipip_net_id __read_mostly; struct ipip_net { - struct ip_tunnel *tunnels_r_l[HASH_SIZE]; - struct ip_tunnel *tunnels_r[HASH_SIZE]; - struct ip_tunnel *tunnels_l[HASH_SIZE]; - struct ip_tunnel *tunnels_wc[1]; - struct ip_tunnel **tunnels[4]; + struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE]; + struct ip_tunnel __rcu *tunnels_r[HASH_SIZE]; + struct ip_tunnel __rcu *tunnels_l[HASH_SIZE]; + struct ip_tunnel __rcu *tunnels_wc[1]; + struct ip_tunnel __rcu **tunnels[4]; struct net_device *fb_tunnel_dev; }; -static void ipip_tunnel_init(struct net_device *dev); +static int ipip_tunnel_init(struct net_device *dev); static void ipip_tunnel_setup(struct net_device *dev); +static void ipip_dev_free(struct net_device *dev); /* - * Locking : hash tables are protected by RCU and a spinlock + * Locking : hash tables are protected by RCU and RTNL */ -static DEFINE_SPINLOCK(ipip_lock); #define for_each_ip_tunnel_rcu(start) \ for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) +/* often modified stats are per cpu, other are shared (netdev->stats) */ +struct pcpu_tstats { + unsigned long rx_packets; + unsigned long rx_bytes; + unsigned long tx_packets; + unsigned long tx_bytes; +}; + +static struct net_device_stats *ipip_get_stats(struct net_device *dev) +{ + struct pcpu_tstats sum = { 0 }; + int i; + + for_each_possible_cpu(i) { + const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i); + + sum.rx_packets += tstats->rx_packets; + sum.rx_bytes += tstats->rx_bytes; + sum.tx_packets += tstats->tx_packets; + sum.tx_bytes += tstats->tx_bytes; + } + dev->stats.rx_packets = sum.rx_packets; + dev->stats.rx_bytes = sum.rx_bytes; + dev->stats.tx_packets = sum.tx_packets; + dev->stats.tx_bytes = sum.tx_bytes; + return &dev->stats; +} + static struct ip_tunnel * ipip_tunnel_lookup(struct net *net, __be32 remote, __be32 local) { - unsigned h0 = HASH(remote); - unsigned h1 = HASH(local); + unsigned int h0 = HASH(remote); + unsigned int h1 = HASH(local); struct ip_tunnel *t; struct ipip_net *ipn = net_generic(net, ipip_net_id); @@ -169,12 +197,12 @@ static struct ip_tunnel * ipip_tunnel_lookup(struct net *net, return NULL; } -static struct ip_tunnel **__ipip_bucket(struct ipip_net *ipn, +static struct ip_tunnel __rcu **__ipip_bucket(struct ipip_net *ipn, struct ip_tunnel_parm *parms) { __be32 remote = parms->iph.daddr; __be32 local = parms->iph.saddr; - unsigned h = 0; + unsigned int h = 0; int prio = 0; if (remote) { @@ -188,7 +216,7 @@ static struct ip_tunnel **__ipip_bucket(struct ipip_net *ipn, return &ipn->tunnels[prio][h]; } -static inline struct ip_tunnel **ipip_bucket(struct ipip_net *ipn, +static inline struct ip_tunnel __rcu **ipip_bucket(struct ipip_net *ipn, struct ip_tunnel *t) { return __ipip_bucket(ipn, &t->parms); @@ -196,13 +224,14 @@ static inline struct ip_tunnel **ipip_bucket(struct ipip_net *ipn, static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t) { - struct ip_tunnel **tp; - - for (tp = ipip_bucket(ipn, t); *tp; tp = &(*tp)->next) { - if (t == *tp) { - spin_lock_bh(&ipip_lock); - *tp = t->next; - spin_unlock_bh(&ipip_lock); + struct ip_tunnel __rcu **tp; + struct ip_tunnel *iter; + + for (tp = ipip_bucket(ipn, t); + (iter = rtnl_dereference(*tp)) != NULL; + tp = &iter->next) { + if (t == iter) { + rcu_assign_pointer(*tp, t->next); break; } } @@ -210,12 +239,10 @@ static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t) static void ipip_tunnel_link(struct ipip_net *ipn, struct ip_tunnel *t) { - struct ip_tunnel **tp = ipip_bucket(ipn, t); + struct ip_tunnel __rcu **tp = ipip_bucket(ipn, t); - spin_lock_bh(&ipip_lock); - t->next = *tp; + rcu_assign_pointer(t->next, rtnl_dereference(*tp)); rcu_assign_pointer(*tp, t); - spin_unlock_bh(&ipip_lock); } static struct ip_tunnel * ipip_tunnel_locate(struct net *net, @@ -223,12 +250,15 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net, { __be32 remote = parms->iph.daddr; __be32 local = parms->iph.saddr; - struct ip_tunnel *t, **tp, *nt; + struct ip_tunnel *t, *nt; + struct ip_tunnel __rcu **tp; struct net_device *dev; char name[IFNAMSIZ]; struct ipip_net *ipn = net_generic(net, ipip_net_id); - for (tp = __ipip_bucket(ipn, parms); (t = *tp) != NULL; tp = &t->next) { + for (tp = __ipip_bucket(ipn, parms); + (t = rtnl_dereference(*tp)) != NULL; + tp = &t->next) { if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) return t; } @@ -238,7 +268,7 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net, if (parms->name[0]) strlcpy(name, parms->name, IFNAMSIZ); else - sprintf(name, "tunl%%d"); + strcpy(name, "tunl%d"); dev = alloc_netdev(sizeof(*t), name, ipip_tunnel_setup); if (dev == NULL) @@ -254,7 +284,8 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net, nt = netdev_priv(dev); nt->parms = *parms; - ipip_tunnel_init(dev); + if (ipip_tunnel_init(dev) < 0) + goto failed_free; if (register_netdevice(dev) < 0) goto failed_free; @@ -264,20 +295,19 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net, return nt; failed_free: - free_netdev(dev); + ipip_dev_free(dev); return NULL; } +/* called with RTNL */ static void ipip_tunnel_uninit(struct net_device *dev) { struct net *net = dev_net(dev); struct ipip_net *ipn = net_generic(net, ipip_net_id); - if (dev == ipn->fb_tunnel_dev) { - spin_lock_bh(&ipip_lock); - ipn->tunnels_wc[0] = NULL; - spin_unlock_bh(&ipip_lock); - } else + if (dev == ipn->fb_tunnel_dev) + rcu_assign_pointer(ipn->tunnels_wc[0], NULL); + else ipip_tunnel_unlink(ipn, netdev_priv(dev)); dev_put(dev); } @@ -359,8 +389,10 @@ static int ipip_rcv(struct sk_buff *skb) const struct iphdr *iph = ip_hdr(skb); rcu_read_lock(); - if ((tunnel = ipip_tunnel_lookup(dev_net(skb->dev), - iph->saddr, iph->daddr)) != NULL) { + tunnel = ipip_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr); + if (tunnel != NULL) { + struct pcpu_tstats *tstats; + if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { rcu_read_unlock(); kfree_skb(skb); @@ -374,10 +406,17 @@ static int ipip_rcv(struct sk_buff *skb) skb->protocol = htons(ETH_P_IP); skb->pkt_type = PACKET_HOST; - skb_tunnel_rx(skb, tunnel->dev); + tstats = this_cpu_ptr(tunnel->dev->tstats); + tstats->rx_packets++; + tstats->rx_bytes += skb->len; + + __skb_tunnel_rx(skb, tunnel->dev); ipip_ecn_decapsulate(iph, skb); - netif_rx(skb); + + if (netif_rx(skb) == NET_RX_DROP) + tunnel->dev->stats.rx_dropped++; + rcu_read_unlock(); return 0; } @@ -394,13 +433,12 @@ static int ipip_rcv(struct sk_buff *skb) static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); - struct net_device_stats *stats = &dev->stats; - struct netdev_queue *txq = netdev_get_tx_queue(dev, 0); + struct pcpu_tstats *tstats; struct iphdr *tiph = &tunnel->parms.iph; u8 tos = tunnel->parms.iph.tos; __be16 df = tiph->frag_off; struct rtable *rt; /* Route to the other host */ - struct net_device *tdev; /* Device to other host */ + struct net_device *tdev; /* Device to other host */ struct iphdr *old_iph = ip_hdr(skb); struct iphdr *iph; /* Our new IP header */ unsigned int max_headroom; /* The extra header space needed */ @@ -410,13 +448,13 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) if (skb->protocol != htons(ETH_P_IP)) goto tx_error; - if (tos&1) + if (tos & 1) tos = old_iph->tos; if (!dst) { /* NBMA tunnel */ if ((rt = skb_rtable(skb)) == NULL) { - stats->tx_fifo_errors++; + dev->stats.tx_fifo_errors++; goto tx_error; } if ((dst = rt->rt_gateway) == 0) @@ -424,14 +462,20 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) } { - struct flowi fl = { .oif = tunnel->parms.link, - .nl_u = { .ip4_u = - { .daddr = dst, - .saddr = tiph->saddr, - .tos = RT_TOS(tos) } }, - .proto = IPPROTO_IPIP }; + struct flowi fl = { + .oif = tunnel->parms.link, + .nl_u = { + .ip4_u = { + .daddr = dst, + .saddr = tiph->saddr, + .tos = RT_TOS(tos) + } + }, + .proto = IPPROTO_IPIP + }; + if (ip_route_output_key(dev_net(dev), &rt, &fl)) { - stats->tx_carrier_errors++; + dev->stats.tx_carrier_errors++; goto tx_error_icmp; } } @@ -439,7 +483,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) if (tdev == dev) { ip_rt_put(rt); - stats->collisions++; + dev->stats.collisions++; goto tx_error; } @@ -449,7 +493,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); if (mtu < 68) { - stats->collisions++; + dev->stats.collisions++; ip_rt_put(rt); goto tx_error; } @@ -485,7 +529,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); if (!new_skb) { ip_rt_put(rt); - txq->tx_dropped++; + dev->stats.tx_dropped++; dev_kfree_skb(skb); return NETDEV_TX_OK; } @@ -522,14 +566,14 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) iph->ttl = old_iph->ttl; nf_reset(skb); - - IPTUNNEL_XMIT(); + tstats = this_cpu_ptr(dev->tstats); + __IPTUNNEL_XMIT(tstats, &dev->stats); return NETDEV_TX_OK; tx_error_icmp: dst_link_failure(skb); tx_error: - stats->tx_errors++; + dev->stats.tx_errors++; dev_kfree_skb(skb); return NETDEV_TX_OK; } @@ -544,13 +588,19 @@ static void ipip_tunnel_bind_dev(struct net_device *dev) iph = &tunnel->parms.iph; if (iph->daddr) { - struct flowi fl = { .oif = tunnel->parms.link, - .nl_u = { .ip4_u = - { .daddr = iph->daddr, - .saddr = iph->saddr, - .tos = RT_TOS(iph->tos) } }, - .proto = IPPROTO_IPIP }; + struct flowi fl = { + .oif = tunnel->parms.link, + .nl_u = { + .ip4_u = { + .daddr = iph->daddr, + .saddr = iph->saddr, + .tos = RT_TOS(iph->tos) + } + }, + .proto = IPPROTO_IPIP + }; struct rtable *rt; + if (!ip_route_output_key(dev_net(dev), &rt, &fl)) { tdev = rt->dst.dev; ip_rt_put(rt); @@ -696,13 +746,19 @@ static const struct net_device_ops ipip_netdev_ops = { .ndo_start_xmit = ipip_tunnel_xmit, .ndo_do_ioctl = ipip_tunnel_ioctl, .ndo_change_mtu = ipip_tunnel_change_mtu, - + .ndo_get_stats = ipip_get_stats, }; +static void ipip_dev_free(struct net_device *dev) +{ + free_percpu(dev->tstats); + free_netdev(dev); +} + static void ipip_tunnel_setup(struct net_device *dev) { dev->netdev_ops = &ipip_netdev_ops; - dev->destructor = free_netdev; + dev->destructor = ipip_dev_free; dev->type = ARPHRD_TUNNEL; dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr); @@ -711,10 +767,11 @@ static void ipip_tunnel_setup(struct net_device *dev) dev->iflink = 0; dev->addr_len = 4; dev->features |= NETIF_F_NETNS_LOCAL; + dev->features |= NETIF_F_LLTX; dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; } -static void ipip_tunnel_init(struct net_device *dev) +static int ipip_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); @@ -725,9 +782,15 @@ static void ipip_tunnel_init(struct net_device *dev) memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); ipip_tunnel_bind_dev(dev); + + dev->tstats = alloc_percpu(struct pcpu_tstats); + if (!dev->tstats) + return -ENOMEM; + + return 0; } -static void __net_init ipip_fb_tunnel_init(struct net_device *dev) +static int __net_init ipip_fb_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct iphdr *iph = &tunnel->parms.iph; @@ -740,11 +803,16 @@ static void __net_init ipip_fb_tunnel_init(struct net_device *dev) iph->protocol = IPPROTO_IPIP; iph->ihl = 5; + dev->tstats = alloc_percpu(struct pcpu_tstats); + if (!dev->tstats) + return -ENOMEM; + dev_hold(dev); - ipn->tunnels_wc[0] = tunnel; + rcu_assign_pointer(ipn->tunnels_wc[0], tunnel); + return 0; } -static struct xfrm_tunnel ipip_handler = { +static struct xfrm_tunnel ipip_handler __read_mostly = { .handler = ipip_rcv, .err_handler = ipip_err, .priority = 1, @@ -760,11 +828,12 @@ static void ipip_destroy_tunnels(struct ipip_net *ipn, struct list_head *head) for (prio = 1; prio < 4; prio++) { int h; for (h = 0; h < HASH_SIZE; h++) { - struct ip_tunnel *t = ipn->tunnels[prio][h]; + struct ip_tunnel *t; + t = rtnl_dereference(ipn->tunnels[prio][h]); while (t != NULL) { unregister_netdevice_queue(t->dev, head); - t = t->next; + t = rtnl_dereference(t->next); } } } @@ -789,7 +858,9 @@ static int __net_init ipip_init_net(struct net *net) } dev_net_set(ipn->fb_tunnel_dev, net); - ipip_fb_tunnel_init(ipn->fb_tunnel_dev); + err = ipip_fb_tunnel_init(ipn->fb_tunnel_dev); + if (err) + goto err_reg_dev; if ((err = register_netdev(ipn->fb_tunnel_dev))) goto err_reg_dev; @@ -797,7 +868,7 @@ static int __net_init ipip_init_net(struct net *net) return 0; err_reg_dev: - free_netdev(ipn->fb_tunnel_dev); + ipip_dev_free(ipn->fb_tunnel_dev); err_alloc_dev: /* nothing */ return err; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 179fcab..86dd569 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -75,7 +75,7 @@ struct mr_table { struct net *net; #endif u32 id; - struct sock *mroute_sk; + struct sock __rcu *mroute_sk; struct timer_list ipmr_expire_timer; struct list_head mfc_unres_queue; struct list_head mfc_cache_array[MFC_LINES]; @@ -98,7 +98,7 @@ struct ipmr_result { }; /* Big lock, protecting vif table, mrt cache and mroute socket state. - Note that the changes are semaphored via rtnl_lock. + * Note that the changes are semaphored via rtnl_lock. */ static DEFINE_RWLOCK(mrt_lock); @@ -113,11 +113,11 @@ static DEFINE_RWLOCK(mrt_lock); static DEFINE_SPINLOCK(mfc_unres_lock); /* We return to original Alan's scheme. Hash table of resolved - entries is changed only in process context and protected - with weak lock mrt_lock. Queue of unresolved entries is protected - with strong spinlock mfc_unres_lock. - - In this case data path is free of exclusive locks at all. + * entries is changed only in process context and protected + * with weak lock mrt_lock. Queue of unresolved entries is protected + * with strong spinlock mfc_unres_lock. + * + * In this case data path is free of exclusive locks at all. */ static struct kmem_cache *mrt_cachep __read_mostly; @@ -396,9 +396,9 @@ struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v) set_fs(KERNEL_DS); err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL); set_fs(oldfs); - } else + } else { err = -EOPNOTSUPP; - + } dev = NULL; if (err == 0 && @@ -495,7 +495,8 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt) dev->iflink = 0; rcu_read_lock(); - if ((in_dev = __in_dev_get_rcu(dev)) == NULL) { + in_dev = __in_dev_get_rcu(dev); + if (!in_dev) { rcu_read_unlock(); goto failure; } @@ -552,9 +553,10 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify, mrt->mroute_reg_vif_num = -1; #endif - if (vifi+1 == mrt->maxvif) { + if (vifi + 1 == mrt->maxvif) { int tmp; - for (tmp=vifi-1; tmp>=0; tmp--) { + + for (tmp = vifi - 1; tmp >= 0; tmp--) { if (VIF_EXISTS(mrt, tmp)) break; } @@ -565,25 +567,33 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify, dev_set_allmulti(dev, -1); - if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) { + in_dev = __in_dev_get_rtnl(dev); + if (in_dev) { IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--; ip_rt_multicast_event(in_dev); } - if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER) && !notify) + if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER) && !notify) unregister_netdevice_queue(dev, head); dev_put(dev); return 0; } -static inline void ipmr_cache_free(struct mfc_cache *c) +static void ipmr_cache_free_rcu(struct rcu_head *head) { + struct mfc_cache *c = container_of(head, struct mfc_cache, rcu); + kmem_cache_free(mrt_cachep, c); } +static inline void ipmr_cache_free(struct mfc_cache *c) +{ + call_rcu(&c->rcu, ipmr_cache_free_rcu); +} + /* Destroy an unresolved cache entry, killing queued skbs - and reporting error to netlink readers. + * and reporting error to netlink readers. */ static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c) @@ -605,8 +615,9 @@ static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c) memset(&e->msg, 0, sizeof(e->msg)); rtnl_unicast(skb, net, NETLINK_CB(skb).pid); - } else + } else { kfree_skb(skb); + } } ipmr_cache_free(c); @@ -724,13 +735,13 @@ static int vif_add(struct net *net, struct mr_table *mrt, case 0: if (vifc->vifc_flags == VIFF_USE_IFINDEX) { dev = dev_get_by_index(net, vifc->vifc_lcl_ifindex); - if (dev && dev->ip_ptr == NULL) { + if (dev && __in_dev_get_rtnl(dev) == NULL) { dev_put(dev); return -EADDRNOTAVAIL; } - } else + } else { dev = ip_dev_find(net, vifc->vifc_lcl_addr.s_addr); - + } if (!dev) return -EADDRNOTAVAIL; err = dev_set_allmulti(dev, 1); @@ -743,16 +754,16 @@ static int vif_add(struct net *net, struct mr_table *mrt, return -EINVAL; } - if ((in_dev = __in_dev_get_rtnl(dev)) == NULL) { + in_dev = __in_dev_get_rtnl(dev); + if (!in_dev) { dev_put(dev); return -EADDRNOTAVAIL; } IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++; ip_rt_multicast_event(in_dev); - /* - * Fill in the VIF structures - */ + /* Fill in the VIF structures */ + v->rate_limit = vifc->vifc_rate_limit; v->local = vifc->vifc_lcl_addr.s_addr; v->remote = vifc->vifc_rmt_addr.s_addr; @@ -765,14 +776,14 @@ static int vif_add(struct net *net, struct mr_table *mrt, v->pkt_in = 0; v->pkt_out = 0; v->link = dev->ifindex; - if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER)) + if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER)) v->link = dev->iflink; /* And finish update writing critical data */ write_lock_bh(&mrt_lock); v->dev = dev; #ifdef CONFIG_IP_PIMSM - if (v->flags&VIFF_REGISTER) + if (v->flags & VIFF_REGISTER) mrt->mroute_reg_vif_num = vifi; #endif if (vifi+1 > mrt->maxvif) @@ -781,6 +792,7 @@ static int vif_add(struct net *net, struct mr_table *mrt, return 0; } +/* called with rcu_read_lock() */ static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt, __be32 origin, __be32 mcastgrp) @@ -788,7 +800,7 @@ static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt, int line = MFC_HASH(mcastgrp, origin); struct mfc_cache *c; - list_for_each_entry(c, &mrt->mfc_cache_array[line], list) { + list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list) { if (c->mfc_origin == origin && c->mfc_mcastgrp == mcastgrp) return c; } @@ -801,19 +813,20 @@ static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt, static struct mfc_cache *ipmr_cache_alloc(void) { struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL); - if (c == NULL) - return NULL; - c->mfc_un.res.minvif = MAXVIFS; + + if (c) + c->mfc_un.res.minvif = MAXVIFS; return c; } static struct mfc_cache *ipmr_cache_alloc_unres(void) { struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC); - if (c == NULL) - return NULL; - skb_queue_head_init(&c->mfc_un.unres.unresolved); - c->mfc_un.unres.expires = jiffies + 10*HZ; + + if (c) { + skb_queue_head_init(&c->mfc_un.unres.unresolved); + c->mfc_un.unres.expires = jiffies + 10*HZ; + } return c; } @@ -827,17 +840,15 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt, struct sk_buff *skb; struct nlmsgerr *e; - /* - * Play the pending entries through our router - */ + /* Play the pending entries through our router */ while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) { if (ip_hdr(skb)->version == 0) { struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr)); if (__ipmr_fill_mroute(mrt, skb, c, NLMSG_DATA(nlh)) > 0) { - nlh->nlmsg_len = (skb_tail_pointer(skb) - - (u8 *)nlh); + nlh->nlmsg_len = skb_tail_pointer(skb) - + (u8 *)nlh; } else { nlh->nlmsg_type = NLMSG_ERROR; nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr)); @@ -848,8 +859,9 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt, } rtnl_unicast(skb, net, NETLINK_CB(skb).pid); - } else + } else { ip_mr_forward(net, mrt, skb, c, 0); + } } } @@ -867,6 +879,7 @@ static int ipmr_cache_report(struct mr_table *mrt, const int ihl = ip_hdrlen(pkt); struct igmphdr *igmp; struct igmpmsg *msg; + struct sock *mroute_sk; int ret; #ifdef CONFIG_IP_PIMSM @@ -882,9 +895,9 @@ static int ipmr_cache_report(struct mr_table *mrt, #ifdef CONFIG_IP_PIMSM if (assert == IGMPMSG_WHOLEPKT) { /* Ugly, but we have no choice with this interface. - Duplicate old header, fix ihl, length etc. - And all this only to mangle msg->im_msgtype and - to set msg->im_mbz to "mbz" :-) + * Duplicate old header, fix ihl, length etc. + * And all this only to mangle msg->im_msgtype and + * to set msg->im_mbz to "mbz" :-) */ skb_push(skb, sizeof(struct iphdr)); skb_reset_network_header(skb); @@ -901,39 +914,38 @@ static int ipmr_cache_report(struct mr_table *mrt, #endif { - /* - * Copy the IP header - */ + /* Copy the IP header */ skb->network_header = skb->tail; skb_put(skb, ihl); skb_copy_to_linear_data(skb, pkt->data, ihl); - ip_hdr(skb)->protocol = 0; /* Flag to the kernel this is a route add */ + ip_hdr(skb)->protocol = 0; /* Flag to the kernel this is a route add */ msg = (struct igmpmsg *)skb_network_header(skb); msg->im_vif = vifi; skb_dst_set(skb, dst_clone(skb_dst(pkt))); - /* - * Add our header - */ + /* Add our header */ - igmp=(struct igmphdr *)skb_put(skb, sizeof(struct igmphdr)); + igmp = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr)); igmp->type = msg->im_msgtype = assert; - igmp->code = 0; - ip_hdr(skb)->tot_len = htons(skb->len); /* Fix the length */ + igmp->code = 0; + ip_hdr(skb)->tot_len = htons(skb->len); /* Fix the length */ skb->transport_header = skb->network_header; } - if (mrt->mroute_sk == NULL) { + rcu_read_lock(); + mroute_sk = rcu_dereference(mrt->mroute_sk); + if (mroute_sk == NULL) { + rcu_read_unlock(); kfree_skb(skb); return -EINVAL; } - /* - * Deliver to mrouted - */ - ret = sock_queue_rcv_skb(mrt->mroute_sk, skb); + /* Deliver to mrouted */ + + ret = sock_queue_rcv_skb(mroute_sk, skb); + rcu_read_unlock(); if (ret < 0) { if (net_ratelimit()) printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n"); @@ -965,9 +977,7 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb) } if (!found) { - /* - * Create a new entry if allowable - */ + /* Create a new entry if allowable */ if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 || (c = ipmr_cache_alloc_unres()) == NULL) { @@ -977,16 +987,14 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb) return -ENOBUFS; } - /* - * Fill in the new cache entry - */ + /* Fill in the new cache entry */ + c->mfc_parent = -1; c->mfc_origin = iph->saddr; c->mfc_mcastgrp = iph->daddr; - /* - * Reflect first query at mrouted. - */ + /* Reflect first query at mrouted. */ + err = ipmr_cache_report(mrt, skb, vifi, IGMPMSG_NOCACHE); if (err < 0) { /* If the report failed throw the cache entry @@ -1006,10 +1014,9 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb) mod_timer(&mrt->ipmr_expire_timer, c->mfc_un.unres.expires); } - /* - * See if we can append the packet - */ - if (c->mfc_un.unres.unresolved.qlen>3) { + /* See if we can append the packet */ + + if (c->mfc_un.unres.unresolved.qlen > 3) { kfree_skb(skb); err = -ENOBUFS; } else { @@ -1035,9 +1042,7 @@ static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc) list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[line], list) { if (c->mfc_origin == mfc->mfcc_origin.s_addr && c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) { - write_lock_bh(&mrt_lock); - list_del(&c->list); - write_unlock_bh(&mrt_lock); + list_del_rcu(&c->list); ipmr_cache_free(c); return 0; @@ -1090,9 +1095,7 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, if (!mrtsock) c->mfc_flags |= MFC_STATIC; - write_lock_bh(&mrt_lock); - list_add(&c->list, &mrt->mfc_cache_array[line]); - write_unlock_bh(&mrt_lock); + list_add_rcu(&c->list, &mrt->mfc_cache_array[line]); /* * Check to see if we resolved a queued list. If so we @@ -1130,26 +1133,21 @@ static void mroute_clean_tables(struct mr_table *mrt) LIST_HEAD(list); struct mfc_cache *c, *next; - /* - * Shut down all active vif entries - */ + /* Shut down all active vif entries */ + for (i = 0; i < mrt->maxvif; i++) { - if (!(mrt->vif_table[i].flags&VIFF_STATIC)) + if (!(mrt->vif_table[i].flags & VIFF_STATIC)) vif_delete(mrt, i, 0, &list); } unregister_netdevice_many(&list); - /* - * Wipe the cache - */ + /* Wipe the cache */ + for (i = 0; i < MFC_LINES; i++) { list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[i], list) { - if (c->mfc_flags&MFC_STATIC) + if (c->mfc_flags & MFC_STATIC) continue; - write_lock_bh(&mrt_lock); - list_del(&c->list); - write_unlock_bh(&mrt_lock); - + list_del_rcu(&c->list); ipmr_cache_free(c); } } @@ -1164,6 +1162,9 @@ static void mroute_clean_tables(struct mr_table *mrt) } } +/* called from ip_ra_control(), before an RCU grace period, + * we dont need to call synchronize_rcu() here + */ static void mrtsock_destruct(struct sock *sk) { struct net *net = sock_net(sk); @@ -1171,13 +1172,9 @@ static void mrtsock_destruct(struct sock *sk) rtnl_lock(); ipmr_for_each_table(mrt, net) { - if (sk == mrt->mroute_sk) { + if (sk == rtnl_dereference(mrt->mroute_sk)) { IPV4_DEVCONF_ALL(net, MC_FORWARDING)--; - - write_lock_bh(&mrt_lock); - mrt->mroute_sk = NULL; - write_unlock_bh(&mrt_lock); - + rcu_assign_pointer(mrt->mroute_sk, NULL); mroute_clean_tables(mrt); } } @@ -1204,7 +1201,8 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi return -ENOENT; if (optname != MRT_INIT) { - if (sk != mrt->mroute_sk && !capable(CAP_NET_ADMIN)) + if (sk != rcu_dereference_raw(mrt->mroute_sk) && + !capable(CAP_NET_ADMIN)) return -EACCES; } @@ -1217,23 +1215,20 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi return -ENOPROTOOPT; rtnl_lock(); - if (mrt->mroute_sk) { + if (rtnl_dereference(mrt->mroute_sk)) { rtnl_unlock(); return -EADDRINUSE; } ret = ip_ra_control(sk, 1, mrtsock_destruct); if (ret == 0) { - write_lock_bh(&mrt_lock); - mrt->mroute_sk = sk; - write_unlock_bh(&mrt_lock); - + rcu_assign_pointer(mrt->mroute_sk, sk); IPV4_DEVCONF_ALL(net, MC_FORWARDING)++; } rtnl_unlock(); return ret; case MRT_DONE: - if (sk != mrt->mroute_sk) + if (sk != rcu_dereference_raw(mrt->mroute_sk)) return -EACCES; return ip_ra_control(sk, 0, NULL); case MRT_ADD_VIF: @@ -1246,7 +1241,8 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi return -ENFILE; rtnl_lock(); if (optname == MRT_ADD_VIF) { - ret = vif_add(net, mrt, &vif, sk == mrt->mroute_sk); + ret = vif_add(net, mrt, &vif, + sk == rtnl_dereference(mrt->mroute_sk)); } else { ret = vif_delete(mrt, vif.vifc_vifi, 0, NULL); } @@ -1267,7 +1263,8 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi if (optname == MRT_DEL_MFC) ret = ipmr_mfc_delete(mrt, &mfc); else - ret = ipmr_mfc_add(net, mrt, &mfc, sk == mrt->mroute_sk); + ret = ipmr_mfc_add(net, mrt, &mfc, + sk == rtnl_dereference(mrt->mroute_sk)); rtnl_unlock(); return ret; /* @@ -1276,7 +1273,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi case MRT_ASSERT: { int v; - if (get_user(v,(int __user *)optval)) + if (get_user(v, (int __user *)optval)) return -EFAULT; mrt->mroute_do_assert = (v) ? 1 : 0; return 0; @@ -1286,7 +1283,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi { int v; - if (get_user(v,(int __user *)optval)) + if (get_user(v, (int __user *)optval)) return -EFAULT; v = (v) ? 1 : 0; @@ -1309,14 +1306,16 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi return -EINVAL; if (get_user(v, (u32 __user *)optval)) return -EFAULT; - if (sk == mrt->mroute_sk) - return -EBUSY; rtnl_lock(); ret = 0; - if (!ipmr_new_table(net, v)) - ret = -ENOMEM; - raw_sk(sk)->ipmr_table = v; + if (sk == rtnl_dereference(mrt->mroute_sk)) { + ret = -EBUSY; + } else { + if (!ipmr_new_table(net, v)) + ret = -ENOMEM; + raw_sk(sk)->ipmr_table = v; + } rtnl_unlock(); return ret; } @@ -1347,9 +1346,9 @@ int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int if (optname != MRT_VERSION && #ifdef CONFIG_IP_PIMSM - optname!=MRT_PIM && + optname != MRT_PIM && #endif - optname!=MRT_ASSERT) + optname != MRT_ASSERT) return -ENOPROTOOPT; if (get_user(olr, optlen)) @@ -1416,19 +1415,19 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg) if (copy_from_user(&sr, arg, sizeof(sr))) return -EFAULT; - read_lock(&mrt_lock); + rcu_read_lock(); c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr); if (c) { sr.pktcnt = c->mfc_un.res.pkt; sr.bytecnt = c->mfc_un.res.bytes; sr.wrong_if = c->mfc_un.res.wrong_if; - read_unlock(&mrt_lock); + rcu_read_unlock(); if (copy_to_user(arg, &sr, sizeof(sr))) return -EFAULT; return 0; } - read_unlock(&mrt_lock); + rcu_read_unlock(); return -EADDRNOTAVAIL; default: return -ENOIOCTLCMD; @@ -1465,7 +1464,7 @@ static struct notifier_block ip_mr_notifier = { }; /* - * Encapsulate a packet by attaching a valid IPIP header to it. + * Encapsulate a packet by attaching a valid IPIP header to it. * This avoids tunnel drivers and other mess and gives us the speed so * important for multicast video. */ @@ -1480,7 +1479,7 @@ static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr) skb_reset_network_header(skb); iph = ip_hdr(skb); - iph->version = 4; + iph->version = 4; iph->tos = old_iph->tos; iph->ttl = old_iph->ttl; iph->frag_off = 0; @@ -1498,7 +1497,7 @@ static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr) static inline int ipmr_forward_finish(struct sk_buff *skb) { - struct ip_options * opt = &(IPCB(skb)->opt); + struct ip_options *opt = &(IPCB(skb)->opt); IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS); @@ -1535,22 +1534,34 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, } #endif - if (vif->flags&VIFF_TUNNEL) { - struct flowi fl = { .oif = vif->link, - .nl_u = { .ip4_u = - { .daddr = vif->remote, - .saddr = vif->local, - .tos = RT_TOS(iph->tos) } }, - .proto = IPPROTO_IPIP }; + if (vif->flags & VIFF_TUNNEL) { + struct flowi fl = { + .oif = vif->link, + .nl_u = { + .ip4_u = { + .daddr = vif->remote, + .saddr = vif->local, + .tos = RT_TOS(iph->tos) + } + }, + .proto = IPPROTO_IPIP + }; + if (ip_route_output_key(net, &rt, &fl)) goto out_free; encap = sizeof(struct iphdr); } else { - struct flowi fl = { .oif = vif->link, - .nl_u = { .ip4_u = - { .daddr = iph->daddr, - .tos = RT_TOS(iph->tos) } }, - .proto = IPPROTO_IPIP }; + struct flowi fl = { + .oif = vif->link, + .nl_u = { + .ip4_u = { + .daddr = iph->daddr, + .tos = RT_TOS(iph->tos) + } + }, + .proto = IPPROTO_IPIP + }; + if (ip_route_output_key(net, &rt, &fl)) goto out_free; } @@ -1559,8 +1570,8 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, if (skb->len+encap > dst_mtu(&rt->dst) && (ntohs(iph->frag_off) & IP_DF)) { /* Do not fragment multicasts. Alas, IPv4 does not - allow to send ICMP, so that packets will disappear - to blackhole. + * allow to send ICMP, so that packets will disappear + * to blackhole. */ IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_FRAGFAILS); @@ -1583,7 +1594,8 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, ip_decrease_ttl(ip_hdr(skb)); /* FIXME: forward and output firewalls used to be called here. - * What do we do with netfilter? -- RR */ + * What do we do with netfilter? -- RR + */ if (vif->flags & VIFF_TUNNEL) { ip_encap(skb, vif->local, vif->remote); /* FIXME: extra output firewall step used to be here. --RR */ @@ -1644,15 +1656,15 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt, if (skb_rtable(skb)->fl.iif == 0) { /* It is our own packet, looped back. - Very complicated situation... - - The best workaround until routing daemons will be - fixed is not to redistribute packet, if it was - send through wrong interface. It means, that - multicast applications WILL NOT work for - (S,G), which have default multicast route pointing - to wrong oif. In any case, it is not a good - idea to use multicasting applications on router. + * Very complicated situation... + * + * The best workaround until routing daemons will be + * fixed is not to redistribute packet, if it was + * send through wrong interface. It means, that + * multicast applications WILL NOT work for + * (S,G), which have default multicast route pointing + * to wrong oif. In any case, it is not a good + * idea to use multicasting applications on router. */ goto dont_forward; } @@ -1662,9 +1674,9 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt, if (true_vifi >= 0 && mrt->mroute_do_assert && /* pimsm uses asserts, when switching from RPT to SPT, - so that we cannot check that packet arrived on an oif. - It is bad, but otherwise we would need to move pretty - large chunk of pimd to kernel. Ough... --ANK + * so that we cannot check that packet arrived on an oif. + * It is bad, but otherwise we would need to move pretty + * large chunk of pimd to kernel. Ough... --ANK */ (mrt->mroute_do_pim || cache->mfc_un.res.ttls[true_vifi] < 255) && @@ -1682,10 +1694,12 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt, /* * Forward the frame */ - for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) { + for (ct = cache->mfc_un.res.maxvif - 1; + ct >= cache->mfc_un.res.minvif; ct--) { if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) { if (psend != -1) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); + if (skb2) ipmr_queue_xmit(net, mrt, skb2, cache, psend); @@ -1696,6 +1710,7 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt, if (psend != -1) { if (local) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); + if (skb2) ipmr_queue_xmit(net, mrt, skb2, cache, psend); } else { @@ -1713,6 +1728,7 @@ dont_forward: /* * Multicast packets for forwarding arrive here + * Called with rcu_read_lock(); */ int ip_mr_input(struct sk_buff *skb) @@ -1724,9 +1740,9 @@ int ip_mr_input(struct sk_buff *skb) int err; /* Packet is looped back after forward, it should not be - forwarded second time, but still can be delivered locally. + * forwarded second time, but still can be delivered locally. */ - if (IPCB(skb)->flags&IPSKB_FORWARDED) + if (IPCB(skb)->flags & IPSKB_FORWARDED) goto dont_forward; err = ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt); @@ -1736,28 +1752,28 @@ int ip_mr_input(struct sk_buff *skb) } if (!local) { - if (IPCB(skb)->opt.router_alert) { - if (ip_call_ra_chain(skb)) - return 0; - } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP){ - /* IGMPv1 (and broken IGMPv2 implementations sort of - Cisco IOS <= 11.2(8)) do not put router alert - option to IGMP packets destined to routable - groups. It is very bad, because it means - that we can forward NO IGMP messages. - */ - read_lock(&mrt_lock); - if (mrt->mroute_sk) { - nf_reset(skb); - raw_rcv(mrt->mroute_sk, skb); - read_unlock(&mrt_lock); - return 0; - } - read_unlock(&mrt_lock); + if (IPCB(skb)->opt.router_alert) { + if (ip_call_ra_chain(skb)) + return 0; + } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP) { + /* IGMPv1 (and broken IGMPv2 implementations sort of + * Cisco IOS <= 11.2(8)) do not put router alert + * option to IGMP packets destined to routable + * groups. It is very bad, because it means + * that we can forward NO IGMP messages. + */ + struct sock *mroute_sk; + + mroute_sk = rcu_dereference(mrt->mroute_sk); + if (mroute_sk) { + nf_reset(skb); + raw_rcv(mroute_sk, skb); + return 0; + } } } - read_lock(&mrt_lock); + /* already under rcu_read_lock() */ cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr); /* @@ -1769,13 +1785,12 @@ int ip_mr_input(struct sk_buff *skb) if (local) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); ip_local_deliver(skb); - if (skb2 == NULL) { - read_unlock(&mrt_lock); + if (skb2 == NULL) return -ENOBUFS; - } skb = skb2; } + read_lock(&mrt_lock); vif = ipmr_find_vif(mrt, skb->dev); if (vif >= 0) { int err2 = ipmr_cache_unresolved(mrt, vif, skb); @@ -1788,8 +1803,8 @@ int ip_mr_input(struct sk_buff *skb) return -ENODEV; } + read_lock(&mrt_lock); ip_mr_forward(net, mrt, skb, cache, local); - read_unlock(&mrt_lock); if (local) @@ -1805,6 +1820,7 @@ dont_forward: } #ifdef CONFIG_IP_PIMSM +/* called with rcu_read_lock() */ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb, unsigned int pimlen) { @@ -1813,10 +1829,10 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb, encap = (struct iphdr *)(skb_transport_header(skb) + pimlen); /* - Check that: - a. packet is really destinted to a multicast group - b. packet is not a NULL-REGISTER - c. packet is not truncated + * Check that: + * a. packet is really sent to a multicast group + * b. packet is not a NULL-REGISTER + * c. packet is not truncated */ if (!ipv4_is_multicast(encap->daddr) || encap->tot_len == 0 || @@ -1826,26 +1842,23 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb, read_lock(&mrt_lock); if (mrt->mroute_reg_vif_num >= 0) reg_dev = mrt->vif_table[mrt->mroute_reg_vif_num].dev; - if (reg_dev) - dev_hold(reg_dev); read_unlock(&mrt_lock); if (reg_dev == NULL) return 1; skb->mac_header = skb->network_header; - skb_pull(skb, (u8*)encap - skb->data); + skb_pull(skb, (u8 *)encap - skb->data); skb_reset_network_header(skb); skb->protocol = htons(ETH_P_IP); - skb->ip_summed = 0; + skb->ip_summed = CHECKSUM_NONE; skb->pkt_type = PACKET_HOST; skb_tunnel_rx(skb, reg_dev); netif_rx(skb); - dev_put(reg_dev); - return 0; + return NET_RX_SUCCESS; } #endif @@ -1854,7 +1867,7 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb, * Handle IGMP messages of PIMv1 */ -int pim_rcv_v1(struct sk_buff * skb) +int pim_rcv_v1(struct sk_buff *skb) { struct igmphdr *pim; struct net *net = dev_net(skb->dev); @@ -1881,7 +1894,7 @@ drop: #endif #ifdef CONFIG_IP_PIMSM_V2 -static int pim_rcv(struct sk_buff * skb) +static int pim_rcv(struct sk_buff *skb) { struct pimreghdr *pim; struct net *net = dev_net(skb->dev); @@ -1891,8 +1904,8 @@ static int pim_rcv(struct sk_buff * skb) goto drop; pim = (struct pimreghdr *)skb_transport_header(skb); - if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) || - (pim->flags&PIM_NULL_REGISTER) || + if (pim->type != ((PIM_VERSION << 4) | (PIM_REGISTER)) || + (pim->flags & PIM_NULL_REGISTER) || (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 && csum_fold(skb_checksum(skb, 0, skb->len, 0)))) goto drop; @@ -1958,28 +1971,33 @@ int ipmr_get_route(struct net *net, if (mrt == NULL) return -ENOENT; - read_lock(&mrt_lock); + rcu_read_lock(); cache = ipmr_cache_find(mrt, rt->rt_src, rt->rt_dst); if (cache == NULL) { struct sk_buff *skb2; struct iphdr *iph; struct net_device *dev; - int vif; + int vif = -1; if (nowait) { - read_unlock(&mrt_lock); + rcu_read_unlock(); return -EAGAIN; } dev = skb->dev; - if (dev == NULL || (vif = ipmr_find_vif(mrt, dev)) < 0) { + read_lock(&mrt_lock); + if (dev) + vif = ipmr_find_vif(mrt, dev); + if (vif < 0) { read_unlock(&mrt_lock); + rcu_read_unlock(); return -ENODEV; } skb2 = skb_clone(skb, GFP_ATOMIC); if (!skb2) { read_unlock(&mrt_lock); + rcu_read_unlock(); return -ENOMEM; } @@ -1992,13 +2010,16 @@ int ipmr_get_route(struct net *net, iph->version = 0; err = ipmr_cache_unresolved(mrt, vif, skb2); read_unlock(&mrt_lock); + rcu_read_unlock(); return err; } - if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY)) + read_lock(&mrt_lock); + if (!nowait && (rtm->rtm_flags & RTM_F_NOTIFY)) cache->mfc_flags |= MFC_NOTIFY; err = __ipmr_fill_mroute(mrt, skb, cache, rtm); read_unlock(&mrt_lock); + rcu_read_unlock(); return err; } @@ -2050,14 +2071,14 @@ static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) s_h = cb->args[1]; s_e = cb->args[2]; - read_lock(&mrt_lock); + rcu_read_lock(); ipmr_for_each_table(mrt, net) { if (t < s_t) goto next_table; if (t > s_t) s_h = 0; for (h = s_h; h < MFC_LINES; h++) { - list_for_each_entry(mfc, &mrt->mfc_cache_array[h], list) { + list_for_each_entry_rcu(mfc, &mrt->mfc_cache_array[h], list) { if (e < s_e) goto next_entry; if (ipmr_fill_mroute(mrt, skb, @@ -2075,7 +2096,7 @@ next_table: t++; } done: - read_unlock(&mrt_lock); + rcu_read_unlock(); cb->args[2] = e; cb->args[1] = h; @@ -2086,7 +2107,8 @@ done: #ifdef CONFIG_PROC_FS /* - * The /proc interfaces to multicast routing /proc/ip_mr_cache /proc/ip_mr_vif + * The /proc interfaces to multicast routing : + * /proc/net/ip_mr_cache & /proc/net/ip_mr_vif */ struct ipmr_vif_iter { struct seq_net_private p; @@ -2208,14 +2230,14 @@ static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net, struct mr_table *mrt = it->mrt; struct mfc_cache *mfc; - read_lock(&mrt_lock); + rcu_read_lock(); for (it->ct = 0; it->ct < MFC_LINES; it->ct++) { it->cache = &mrt->mfc_cache_array[it->ct]; - list_for_each_entry(mfc, it->cache, list) + list_for_each_entry_rcu(mfc, it->cache, list) if (pos-- == 0) return mfc; } - read_unlock(&mrt_lock); + rcu_read_unlock(); spin_lock_bh(&mfc_unres_lock); it->cache = &mrt->mfc_unres_queue; @@ -2274,7 +2296,7 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos) } /* exhausted cache_array, show unresolved */ - read_unlock(&mrt_lock); + rcu_read_unlock(); it->cache = &mrt->mfc_unres_queue; it->ct = 0; @@ -2282,7 +2304,7 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos) if (!list_empty(it->cache)) return list_first_entry(it->cache, struct mfc_cache, list); - end_of_list: +end_of_list: spin_unlock_bh(&mfc_unres_lock); it->cache = NULL; @@ -2297,7 +2319,7 @@ static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v) if (it->cache == &mrt->mfc_unres_queue) spin_unlock_bh(&mfc_unres_lock); else if (it->cache == &mrt->mfc_cache_array[it->ct]) - read_unlock(&mrt_lock); + rcu_read_unlock(); } static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) @@ -2323,7 +2345,7 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) mfc->mfc_un.res.bytes, mfc->mfc_un.res.wrong_if); for (n = mfc->mfc_un.res.minvif; - n < mfc->mfc_un.res.maxvif; n++ ) { + n < mfc->mfc_un.res.maxvif; n++) { if (VIF_EXISTS(mrt, n) && mfc->mfc_un.res.ttls[n] < 255) seq_printf(seq, @@ -2421,7 +2443,7 @@ int __init ip_mr_init(void) mrt_cachep = kmem_cache_create("ip_mrt_cache", sizeof(struct mfc_cache), - 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, + 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); if (!mrt_cachep) return -ENOMEM; diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index e8f4f9a..8b642f1 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -72,7 +72,7 @@ static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap, for (i = 0; i < len; i++) ret |= (hdr_addr[i] ^ ap->addr[i]) & ap->mask[i]; - return (ret != 0); + return ret != 0; } /* diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 3a43cf3..1e26a48 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -29,6 +29,7 @@ #include <net/netfilter/nf_conntrack.h> #include <net/net_namespace.h> #include <net/checksum.h> +#include <net/ip.h> #define CLUSTERIP_VERSION "0.8" @@ -231,24 +232,22 @@ clusterip_hashfn(const struct sk_buff *skb, { const struct iphdr *iph = ip_hdr(skb); unsigned long hashval; - u_int16_t sport, dport; - const u_int16_t *ports; - - switch (iph->protocol) { - case IPPROTO_TCP: - case IPPROTO_UDP: - case IPPROTO_UDPLITE: - case IPPROTO_SCTP: - case IPPROTO_DCCP: - case IPPROTO_ICMP: - ports = (const void *)iph+iph->ihl*4; - sport = ports[0]; - dport = ports[1]; - break; - default: + u_int16_t sport = 0, dport = 0; + int poff; + + poff = proto_ports_offset(iph->protocol); + if (poff >= 0) { + const u_int16_t *ports; + u16 _ports[2]; + + ports = skb_header_pointer(skb, iph->ihl * 4 + poff, 4, _ports); + if (ports) { + sport = ports[0]; + dport = ports[1]; + } + } else { if (net_ratelimit()) pr_info("unknown protocol %u\n", iph->protocol); - sport = dport = 0; } switch (config->hash_mode) { diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c index f2d2973..65699c2 100644 --- a/net/ipv4/protocol.c +++ b/net/ipv4/protocol.c @@ -28,8 +28,7 @@ #include <linux/spinlock.h> #include <net/protocol.h> -const struct net_protocol *inet_protos[MAX_INET_PROTOS] ____cacheline_aligned_in_smp; -static DEFINE_SPINLOCK(inet_proto_lock); +const struct net_protocol *inet_protos[MAX_INET_PROTOS] __read_mostly; /* * Add a protocol handler to the hash tables @@ -37,20 +36,9 @@ static DEFINE_SPINLOCK(inet_proto_lock); int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol) { - int hash, ret; + int hash = protocol & (MAX_INET_PROTOS - 1); - hash = protocol & (MAX_INET_PROTOS - 1); - - spin_lock_bh(&inet_proto_lock); - if (inet_protos[hash]) { - ret = -1; - } else { - inet_protos[hash] = prot; - ret = 0; - } - spin_unlock_bh(&inet_proto_lock); - - return ret; + return !cmpxchg(&inet_protos[hash], NULL, prot) ? 0 : -1; } EXPORT_SYMBOL(inet_add_protocol); @@ -60,18 +48,9 @@ EXPORT_SYMBOL(inet_add_protocol); int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol) { - int hash, ret; - - hash = protocol & (MAX_INET_PROTOS - 1); + int ret, hash = protocol & (MAX_INET_PROTOS - 1); - spin_lock_bh(&inet_proto_lock); - if (inet_protos[hash] == prot) { - inet_protos[hash] = NULL; - ret = 0; - } else { - ret = -1; - } - spin_unlock_bh(&inet_proto_lock); + ret = (cmpxchg(&inet_protos[hash], prot, NULL) == prot) ? 0 : -1; synchronize_net(); diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 009a7b2..1f85ef2 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -505,7 +505,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, ipc.addr = inet->inet_saddr; ipc.opt = NULL; - ipc.shtx.flags = 0; + ipc.tx_flags = 0; ipc.oif = sk->sk_bound_dev_if; if (msg->msg_controllen) { diff --git a/net/ipv4/route.c b/net/ipv4/route.c index ac6559c..04e0df8 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1107,6 +1107,7 @@ restart: * on the route gc list. */ + rt->dst.flags |= DST_NOCACHE; if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) { int err = arp_bind_neighbour(&rt->dst); if (err) { @@ -1268,18 +1269,11 @@ skip_hashing: void rt_bind_peer(struct rtable *rt, int create) { - static DEFINE_SPINLOCK(rt_peer_lock); struct inet_peer *peer; peer = inet_getpeer(rt->rt_dst, create); - spin_lock_bh(&rt_peer_lock); - if (rt->peer == NULL) { - rt->peer = peer; - peer = NULL; - } - spin_unlock_bh(&rt_peer_lock); - if (peer) + if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL) inet_putpeer(peer); } @@ -2365,9 +2359,8 @@ static int __mkroute_output(struct rtable **result, struct rtable *rth; struct in_device *in_dev; u32 tos = RT_FL_TOS(oldflp); - int err = 0; - if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK)) + if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags & IFF_LOOPBACK)) return -EINVAL; if (fl->fl4_dst == htonl(0xFFFFFFFF)) @@ -2380,11 +2373,12 @@ static int __mkroute_output(struct rtable **result, if (dev_out->flags & IFF_LOOPBACK) flags |= RTCF_LOCAL; - /* get work reference to inet device */ - in_dev = in_dev_get(dev_out); - if (!in_dev) + rcu_read_lock(); + in_dev = __in_dev_get_rcu(dev_out); + if (!in_dev) { + rcu_read_unlock(); return -EINVAL; - + } if (res->type == RTN_BROADCAST) { flags |= RTCF_BROADCAST | RTCF_LOCAL; if (res->fi) { @@ -2392,13 +2386,13 @@ static int __mkroute_output(struct rtable **result, res->fi = NULL; } } else if (res->type == RTN_MULTICAST) { - flags |= RTCF_MULTICAST|RTCF_LOCAL; + flags |= RTCF_MULTICAST | RTCF_LOCAL; if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src, oldflp->proto)) flags &= ~RTCF_LOCAL; /* If multicast route do not exist use - default one, but do not gateway in this case. - Yes, it is hack. + * default one, but do not gateway in this case. + * Yes, it is hack. */ if (res->fi && res->prefixlen < 4) { fib_info_put(res->fi); @@ -2409,9 +2403,12 @@ static int __mkroute_output(struct rtable **result, rth = dst_alloc(&ipv4_dst_ops); if (!rth) { - err = -ENOBUFS; - goto cleanup; + rcu_read_unlock(); + return -ENOBUFS; } + in_dev_hold(in_dev); + rcu_read_unlock(); + rth->idev = in_dev; atomic_set(&rth->dst.__refcnt, 1); rth->dst.flags= DST_HOST; @@ -2432,7 +2429,6 @@ static int __mkroute_output(struct rtable **result, cache entry */ rth->dst.dev = dev_out; dev_hold(dev_out); - rth->idev = in_dev_get(dev_out); rth->rt_gateway = fl->fl4_dst; rth->rt_spec_dst= fl->fl4_src; @@ -2467,13 +2463,8 @@ static int __mkroute_output(struct rtable **result, rt_set_nexthop(rth, res, 0); rth->rt_flags = flags; - *result = rth; - cleanup: - /* release work reference to inet device */ - in_dev_put(in_dev); - - return err; + return 0; } static int ip_mkroute_output(struct rtable **rp, @@ -2497,6 +2488,7 @@ static int ip_mkroute_output(struct rtable **rp, /* * Major route resolver routine. + * called with rcu_read_lock(); */ static int ip_route_output_slow(struct net *net, struct rtable **rp, @@ -2515,7 +2507,7 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, .iif = net->loopback_dev->ifindex, .oif = oldflp->oif }; struct fib_result res; - unsigned flags = 0; + unsigned int flags = 0; struct net_device *dev_out = NULL; int free_res = 0; int err; @@ -2545,7 +2537,7 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, (ipv4_is_multicast(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF))) { /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ - dev_out = ip_dev_find(net, oldflp->fl4_src); + dev_out = __ip_dev_find(net, oldflp->fl4_src, false); if (dev_out == NULL) goto out; @@ -2570,26 +2562,21 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) { /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ - dev_out = ip_dev_find(net, oldflp->fl4_src); - if (dev_out == NULL) + if (!__ip_dev_find(net, oldflp->fl4_src, false)) goto out; - dev_put(dev_out); - dev_out = NULL; } } if (oldflp->oif) { - dev_out = dev_get_by_index(net, oldflp->oif); + dev_out = dev_get_by_index_rcu(net, oldflp->oif); err = -ENODEV; if (dev_out == NULL) goto out; /* RACE: Check return value of inet_select_addr instead. */ - if (__in_dev_get_rtnl(dev_out) == NULL) { - dev_put(dev_out); + if (rcu_dereference(dev_out->ip_ptr) == NULL) goto out; /* Wrong error code */ - } if (ipv4_is_local_multicast(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF)) { @@ -2612,10 +2599,7 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, fl.fl4_dst = fl.fl4_src; if (!fl.fl4_dst) fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK); - if (dev_out) - dev_put(dev_out); dev_out = net->loopback_dev; - dev_hold(dev_out); fl.oif = net->loopback_dev->ifindex; res.type = RTN_LOCAL; flags |= RTCF_LOCAL; @@ -2649,8 +2633,6 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, res.type = RTN_UNICAST; goto make_route; } - if (dev_out) - dev_put(dev_out); err = -ENETUNREACH; goto out; } @@ -2659,10 +2641,7 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, if (res.type == RTN_LOCAL) { if (!fl.fl4_src) fl.fl4_src = fl.fl4_dst; - if (dev_out) - dev_put(dev_out); dev_out = net->loopback_dev; - dev_hold(dev_out); fl.oif = dev_out->ifindex; if (res.fi) fib_info_put(res.fi); @@ -2682,28 +2661,23 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, if (!fl.fl4_src) fl.fl4_src = FIB_RES_PREFSRC(res); - if (dev_out) - dev_put(dev_out); dev_out = FIB_RES_DEV(res); - dev_hold(dev_out); fl.oif = dev_out->ifindex; make_route: err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags); - if (free_res) fib_res_put(&res); - if (dev_out) - dev_put(dev_out); out: return err; } int __ip_route_output_key(struct net *net, struct rtable **rp, const struct flowi *flp) { - unsigned hash; + unsigned int hash; + int res; struct rtable *rth; if (!rt_caching(net)) @@ -2734,7 +2708,10 @@ int __ip_route_output_key(struct net *net, struct rtable **rp, rcu_read_unlock_bh(); slow_output: - return ip_route_output_slow(net, rp, flp); + rcu_read_lock(); + res = ip_route_output_slow(net, rp, flp); + rcu_read_unlock(); + return res; } EXPORT_SYMBOL_GPL(__ip_route_output_key); @@ -2798,7 +2775,7 @@ static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi dst_release(&(*rp)->dst); *rp = rt; - return (rt ? 0 : -ENOMEM); + return rt ? 0 : -ENOMEM; } int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f115ea6..1664a05 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2392,7 +2392,12 @@ static int do_tcp_setsockopt(struct sock *sk, int level, err = tp->af_specific->md5_parse(sk, optval, optlen); break; #endif - + case TCP_USER_TIMEOUT: + /* Cap the max timeout in ms TCP will retry/retrans + * before giving up and aborting (ETIMEDOUT) a connection. + */ + icsk->icsk_user_timeout = msecs_to_jiffies(val); + break; default: err = -ENOPROTOOPT; break; @@ -2611,6 +2616,10 @@ static int do_tcp_getsockopt(struct sock *sk, int level, case TCP_THIN_DUPACK: val = tp->thin_dupack; break; + + case TCP_USER_TIMEOUT: + val = jiffies_to_msecs(icsk->icsk_user_timeout); + break; default: return -ENOPROTOOPT; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b55f60f..f6fdd72 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -182,7 +182,7 @@ static void tcp_incr_quickack(struct sock *sk) icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS); } -void tcp_enter_quickack_mode(struct sock *sk) +static void tcp_enter_quickack_mode(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); tcp_incr_quickack(sk); @@ -805,25 +805,12 @@ void tcp_update_metrics(struct sock *sk) } } -/* Numbers are taken from RFC3390. - * - * John Heffner states: - * - * The RFC specifies a window of no more than 4380 bytes - * unless 2*MSS > 4380. Reading the pseudocode in the RFC - * is a bit misleading because they use a clamp at 4380 bytes - * rather than use a multiplier in the relevant range. - */ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst) { __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); - if (!cwnd) { - if (tp->mss_cache > 1460) - cwnd = 2; - else - cwnd = (tp->mss_cache > 1095) ? 3 : 4; - } + if (!cwnd) + cwnd = rfc3390_bytes_to_packets(tp->mss_cache); return min_t(__u32, cwnd, tp->snd_cwnd_clamp); } @@ -2314,7 +2301,7 @@ static inline int tcp_dupack_heuristics(struct tcp_sock *tp) static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb) { - return (tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto); + return tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto; } static inline int tcp_head_timedout(struct sock *sk) @@ -3412,8 +3399,8 @@ static void tcp_ack_probe(struct sock *sk) static inline int tcp_ack_is_dubious(const struct sock *sk, const int flag) { - return (!(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) || - inet_csk(sk)->icsk_ca_state != TCP_CA_Open); + return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) || + inet_csk(sk)->icsk_ca_state != TCP_CA_Open; } static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag) @@ -3430,9 +3417,9 @@ static inline int tcp_may_update_window(const struct tcp_sock *tp, const u32 ack, const u32 ack_seq, const u32 nwin) { - return (after(ack, tp->snd_una) || + return after(ack, tp->snd_una) || after(ack_seq, tp->snd_wl1) || - (ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd)); + (ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd); } /* Update our send window. diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 0207662..a0232f3 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2571,7 +2571,6 @@ struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) return tcp_gro_receive(head, skb); } -EXPORT_SYMBOL(tcp4_gro_receive); int tcp4_gro_complete(struct sk_buff *skb) { @@ -2584,7 +2583,6 @@ int tcp4_gro_complete(struct sk_buff *skb) return tcp_gro_complete(skb); } -EXPORT_SYMBOL(tcp4_gro_complete); struct proto tcp_prot = { .name = "TCP", diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index f25b56c..43cf901 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -55,7 +55,7 @@ static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) return 1; if (after(end_seq, s_win) && before(seq, e_win)) return 1; - return (seq == e_win && seq == end_seq); + return seq == e_win && seq == end_seq; } /* diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index de3bd84..05b1ecf 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -224,16 +224,10 @@ void tcp_select_initial_window(int __space, __u32 mss, } } - /* Set initial window to value enough for senders, - * following RFC2414. Senders, not following this RFC, - * will be satisfied with 2. - */ + /* Set initial window to value enough for senders, following RFC5681. */ if (mss > (1 << *rcv_wscale)) { - int init_cwnd = 4; - if (mss > 1460 * 3) - init_cwnd = 2; - else if (mss > 1460) - init_cwnd = 3; + int init_cwnd = rfc3390_bytes_to_packets(mss); + /* when initializing use the value from init_rcv_wnd * rather than the default from above */ @@ -1376,9 +1370,9 @@ static inline int tcp_nagle_check(const struct tcp_sock *tp, const struct sk_buff *skb, unsigned mss_now, int nonagle) { - return (skb->len < mss_now && + return skb->len < mss_now && ((nonagle & TCP_NAGLE_CORK) || - (!nonagle && tp->packets_out && tcp_minshall_check(tp)))); + (!nonagle && tp->packets_out && tcp_minshall_check(tp))); } /* Return non-zero if the Nagle test allows this packet to be @@ -1449,10 +1443,10 @@ int tcp_may_send_now(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb = tcp_send_head(sk); - return (skb && + return skb && tcp_snd_test(sk, skb, tcp_current_mss(sk), (tcp_skb_is_last(sk, skb) ? - tp->nonagle : TCP_NAGLE_PUSH))); + tp->nonagle : TCP_NAGLE_PUSH)); } /* Trim TSO SKB to LEN bytes, put the remaining data into a new packet @@ -2429,6 +2423,12 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, __u8 rcv_wscale; /* Set this up on the first call only */ req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW); + + /* limit the window selection if the user enforce a smaller rx buffer */ + if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && + (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0)) + req->window_clamp = tcp_full_space(sk); + /* tcp_full_space because it is guaranteed to be the first packet */ tcp_select_initial_window(tcp_full_space(sk), mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), @@ -2555,6 +2555,11 @@ static void tcp_connect_init(struct sock *sk) tcp_initialize_rcv_mss(sk); + /* limit the window selection if the user enforce a smaller rx buffer */ + if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && + (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0)) + tp->window_clamp = tcp_full_space(sk); + tcp_select_initial_window(tcp_full_space(sk), tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), &tp->rcv_wnd, diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 74c54b3..f3c8c6c 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -140,10 +140,10 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk) */ static bool retransmits_timed_out(struct sock *sk, unsigned int boundary, + unsigned int timeout, bool syn_set) { - unsigned int timeout, linear_backoff_thresh; - unsigned int start_ts; + unsigned int linear_backoff_thresh, start_ts; unsigned int rto_base = syn_set ? TCP_TIMEOUT_INIT : TCP_RTO_MIN; if (!inet_csk(sk)->icsk_retransmits) @@ -154,14 +154,15 @@ static bool retransmits_timed_out(struct sock *sk, else start_ts = tcp_sk(sk)->retrans_stamp; - linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base); - - if (boundary <= linear_backoff_thresh) - timeout = ((2 << boundary) - 1) * rto_base; - else - timeout = ((2 << linear_backoff_thresh) - 1) * rto_base + - (boundary - linear_backoff_thresh) * TCP_RTO_MAX; + if (likely(timeout == 0)) { + linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base); + if (boundary <= linear_backoff_thresh) + timeout = ((2 << boundary) - 1) * rto_base; + else + timeout = ((2 << linear_backoff_thresh) - 1) * rto_base + + (boundary - linear_backoff_thresh) * TCP_RTO_MAX; + } return (tcp_time_stamp - start_ts) >= timeout; } @@ -178,7 +179,7 @@ static int tcp_write_timeout(struct sock *sk) retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; syn_set = 1; } else { - if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0)) { + if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0, 0)) { /* Black hole detection */ tcp_mtu_probing(icsk, sk); @@ -191,14 +192,15 @@ static int tcp_write_timeout(struct sock *sk) retry_until = tcp_orphan_retries(sk, alive); do_reset = alive || - !retransmits_timed_out(sk, retry_until, 0); + !retransmits_timed_out(sk, retry_until, 0, 0); if (tcp_out_of_resources(sk, do_reset)) return 1; } } - if (retransmits_timed_out(sk, retry_until, syn_set)) { + if (retransmits_timed_out(sk, retry_until, + syn_set ? 0 : icsk->icsk_user_timeout, syn_set)) { /* Has it gone just too far? */ tcp_write_err(sk); return 1; @@ -440,7 +442,7 @@ out_reset_timer: icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); } inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); - if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0)) + if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0, 0)) __sk_dst_reset(sk); out:; @@ -560,7 +562,14 @@ static void tcp_keepalive_timer (unsigned long data) elapsed = keepalive_time_elapsed(tp); if (elapsed >= keepalive_time_when(tp)) { - if (icsk->icsk_probes_out >= keepalive_probes(tp)) { + /* If the TCP_USER_TIMEOUT option is enabled, use that + * to determine when to timeout instead. + */ + if ((icsk->icsk_user_timeout != 0 && + elapsed >= icsk->icsk_user_timeout && + icsk->icsk_probes_out > 0) || + (icsk->icsk_user_timeout == 0 && + icsk->icsk_probes_out >= keepalive_probes(tp))) { tcp_send_active_reset(sk, GFP_ATOMIC); tcp_write_err(sk); goto out; diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index 20151d6..a534dda 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -80,7 +80,7 @@ static void tcp_westwood_init(struct sock *sk) */ static inline u32 westwood_do_filter(u32 a, u32 b) { - return (((7 * a) + b) >> 3); + return ((7 * a) + b) >> 3; } static void westwood_filter(struct westwood *w, u32 delta) diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c index 59186ca..9a17bd2 100644 --- a/net/ipv4/tunnel4.c +++ b/net/ipv4/tunnel4.c @@ -14,8 +14,8 @@ #include <net/protocol.h> #include <net/xfrm.h> -static struct xfrm_tunnel *tunnel4_handlers; -static struct xfrm_tunnel *tunnel64_handlers; +static struct xfrm_tunnel *tunnel4_handlers __read_mostly; +static struct xfrm_tunnel *tunnel64_handlers __read_mostly; static DEFINE_MUTEX(tunnel4_mutex); static inline struct xfrm_tunnel **fam_handlers(unsigned short family) @@ -39,7 +39,7 @@ int xfrm4_tunnel_register(struct xfrm_tunnel *handler, unsigned short family) } handler->next = *pprev; - *pprev = handler; + rcu_assign_pointer(*pprev, handler); ret = 0; @@ -73,6 +73,11 @@ int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family) } EXPORT_SYMBOL(xfrm4_tunnel_deregister); +#define for_each_tunnel_rcu(head, handler) \ + for (handler = rcu_dereference(head); \ + handler != NULL; \ + handler = rcu_dereference(handler->next)) \ + static int tunnel4_rcv(struct sk_buff *skb) { struct xfrm_tunnel *handler; @@ -80,7 +85,7 @@ static int tunnel4_rcv(struct sk_buff *skb) if (!pskb_may_pull(skb, sizeof(struct iphdr))) goto drop; - for (handler = tunnel4_handlers; handler; handler = handler->next) + for_each_tunnel_rcu(tunnel4_handlers, handler) if (!handler->handler(skb)) return 0; @@ -99,7 +104,7 @@ static int tunnel64_rcv(struct sk_buff *skb) if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) goto drop; - for (handler = tunnel64_handlers; handler; handler = handler->next) + for_each_tunnel_rcu(tunnel64_handlers, handler) if (!handler->handler(skb)) return 0; @@ -115,7 +120,7 @@ static void tunnel4_err(struct sk_buff *skb, u32 info) { struct xfrm_tunnel *handler; - for (handler = tunnel4_handlers; handler; handler = handler->next) + for_each_tunnel_rcu(tunnel4_handlers, handler) if (!handler->err_handler(skb, info)) break; } @@ -125,7 +130,7 @@ static void tunnel64_err(struct sk_buff *skb, u32 info) { struct xfrm_tunnel *handler; - for (handler = tunnel64_handlers; handler; handler = handler->next) + for_each_tunnel_rcu(tunnel64_handlers, handler) if (!handler->err_handler(skb, info)) break; } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index fb23c2e..b3f7e8c 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -797,7 +797,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, return -EOPNOTSUPP; ipc.opt = NULL; - ipc.shtx.flags = 0; + ipc.tx_flags = 0; if (up->pending) { /* @@ -845,7 +845,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, ipc.addr = inet->inet_saddr; ipc.oif = sk->sk_bound_dev_if; - err = sock_tx_timestamp(msg, sk, &ipc.shtx); + err = sock_tx_timestamp(sk, &ipc.tx_flags); if (err) return err; if (msg->msg_controllen) { diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c index 41f5982..8280645 100644 --- a/net/ipv4/xfrm4_tunnel.c +++ b/net/ipv4/xfrm4_tunnel.c @@ -58,14 +58,14 @@ static int xfrm_tunnel_err(struct sk_buff *skb, u32 info) return -ENOENT; } -static struct xfrm_tunnel xfrm_tunnel_handler = { +static struct xfrm_tunnel xfrm_tunnel_handler __read_mostly = { .handler = xfrm_tunnel_rcv, .err_handler = xfrm_tunnel_err, .priority = 2, }; #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -static struct xfrm_tunnel xfrm64_tunnel_handler = { +static struct xfrm_tunnel xfrm64_tunnel_handler __read_mostly = { .handler = xfrm_tunnel_rcv, .err_handler = xfrm_tunnel_err, .priority = 2, |