Diffstat (limited to 'net/ipv4')
-rw-r--r-- | net/ipv4/af_inet.c | 9
-rw-r--r-- | net/ipv4/esp4.c | 6
-rw-r--r-- | net/ipv4/fib_trie.c | 5
-rw-r--r-- | net/ipv4/igmp.c | 8
-rw-r--r-- | net/ipv4/inet_fragment.c | 20
-rw-r--r-- | net/ipv4/inetpeer.c | 4
-rw-r--r-- | net/ipv4/ip_fragment.c | 27
-rw-r--r-- | net/ipv4/ip_gre.c | 2
-rw-r--r-- | net/ipv4/ip_options.c | 5
-rw-r--r-- | net/ipv4/ip_output.c | 6
-rw-r--r-- | net/ipv4/ipip.c | 2
-rw-r--r-- | net/ipv4/ipmr.c | 2
-rw-r--r-- | net/ipv4/ping.c | 4
-rw-r--r-- | net/ipv4/raw.c | 2
-rw-r--r-- | net/ipv4/syncookies.c | 4
-rw-r--r-- | net/ipv4/sysctl_net_ipv4.c | 6
-rw-r--r-- | net/ipv4/tcp.c | 6
-rw-r--r-- | net/ipv4/tcp_cubic.c | 12
-rw-r--r-- | net/ipv4/tcp_input.c | 78
-rw-r--r-- | net/ipv4/tcp_output.c | 17
-rw-r--r-- | net/ipv4/udp.c | 3
-rw-r--r-- | net/ipv4/xfrm4_mode_tunnel.c | 2
22 files changed, 133 insertions, 97 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 0747294..ba78b76 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -239,8 +239,12 @@ EXPORT_SYMBOL(inet_listen);
 u32 inet_ehash_secret __read_mostly;
 EXPORT_SYMBOL(inet_ehash_secret);
 
+u32 ipv6_hash_secret __read_mostly;
+EXPORT_SYMBOL(ipv6_hash_secret);
+
 /*
- * inet_ehash_secret must be set exactly once
+ * inet_ehash_secret must be set exactly once, and to a non nul value
+ * ipv6_hash_secret must be set exactly once.
  */
 void build_ehash_secret(void)
 {
@@ -250,7 +254,8 @@ void build_ehash_secret(void)
 		get_random_bytes(&rnd, sizeof(rnd));
 	} while (rnd == 0);
 
-	cmpxchg(&inet_ehash_secret, 0, rnd);
+	if (cmpxchg(&inet_ehash_secret, 0, rnd) == 0)
+		get_random_bytes(&ipv6_hash_secret, sizeof(ipv6_hash_secret));
 }
 EXPORT_SYMBOL(build_ehash_secret);
 
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 530787b..238fc3b 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -137,8 +137,6 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
 
 	/* skb is pure payload to encrypt */
 
-	err = -ENOMEM;
-
 	esp = x->data;
 	aead = esp->aead;
 	alen = crypto_aead_authsize(aead);
@@ -174,8 +172,10 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
 	}
 
 	tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen);
-	if (!tmp)
+	if (!tmp) {
+		err = -ENOMEM;
 		goto error;
+	}
 
 	seqhi = esp_tmp_seqhi(tmp);
 	iv = esp_tmp_iv(aead, tmp, seqhilen);
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 0d884eb..e11f4a5 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -72,7 +72,6 @@
 #include <linux/init.h>
 #include <linux/list.h>
 #include <linux/slab.h>
-#include <linux/prefetch.h>
 #include <net/net_namespace.h>
 #include <net/ip.h>
 #include <net/protocol.h>
@@ -1772,10 +1771,8 @@ static struct leaf *leaf_walk_rcu(struct tnode *p, struct rt_trie_node *c)
 		if (!c)
 			continue;
 
-		if (IS_LEAF(c)) {
-			prefetch(rcu_dereference_rtnl(p->child[idx]));
+		if (IS_LEAF(c))
 			return (struct leaf *) c;
-		}
 
 		/* Rescan start scanning in new node */
 		p = (struct tnode *) c;
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index e0d42db..508bae1 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -342,7 +342,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
 	pip->saddr = fl4.saddr;
 	pip->protocol = IPPROTO_IGMP;
 	pip->tot_len = 0;	/* filled in later */
-	ip_select_ident(pip, &rt->dst, NULL);
+	ip_select_ident(skb, &rt->dst, NULL);
 	((u8*)&pip[1])[0] = IPOPT_RA;
 	((u8*)&pip[1])[1] = 4;
 	((u8*)&pip[1])[2] = 0;
@@ -683,7 +683,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
 	iph->daddr = dst;
 	iph->saddr = fl4.saddr;
 	iph->protocol = IPPROTO_IGMP;
-	ip_select_ident(iph, &rt->dst, NULL);
+	ip_select_ident(skb, &rt->dst, NULL);
 	((u8*)&iph[1])[0] = IPOPT_RA;
 	((u8*)&iph[1])[1] = 4;
 	((u8*)&iph[1])[2] = 0;
@@ -705,7 +705,7 @@ static void igmp_gq_timer_expire(unsigned long data)
 
 	in_dev->mr_gq_running = 0;
 	igmpv3_send_report(in_dev, NULL);
-	__in_dev_put(in_dev);
+	in_dev_put(in_dev);
 }
 
 static void igmp_ifc_timer_expire(unsigned long data)
@@ -717,7 +717,7 @@ static void igmp_ifc_timer_expire(unsigned long data)
 		in_dev->mr_ifc_count--;
 		igmp_ifc_start_timer(in_dev, IGMP_Unsolicited_Report_Interval);
 	}
-	__in_dev_put(in_dev);
+	in_dev_put(in_dev);
 }
 
 static void igmp_ifc_event(struct in_device *in_dev)
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 5ff2a51..210b710 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -21,6 +21,7 @@
 #include <linux/rtnetlink.h>
 #include <linux/slab.h>
 
+#include <net/sock.h>
 #include <net/inet_frag.h>
 
 static void inet_frag_secret_rebuild(unsigned long dummy)
@@ -271,6 +272,7 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
 {
 	struct inet_frag_queue *q;
 	struct hlist_node *n;
+	int depth = 0;
 
 	hlist_for_each_entry(q, n, &f->hash[hash], list) {
 		if (q->net == nf && f->match(q, key)) {
@@ -278,9 +280,25 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
 			read_unlock(&f->lock);
 			return q;
 		}
+		depth++;
 	}
 	read_unlock(&f->lock);
 
-	return inet_frag_create(nf, f, key);
+	if (depth <= INETFRAGS_MAXDEPTH)
+		return inet_frag_create(nf, f, key);
+	else
+		return ERR_PTR(-ENOBUFS);
 }
 EXPORT_SYMBOL(inet_frag_find);
+
+void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
+				   const char *prefix)
+{
+	static const char msg[] = "inet_frag_find: Fragment hash bucket"
+		" list length grew over limit " __stringify(INETFRAGS_MAXDEPTH)
+		". Dropping fragment.\n";
+
+	if (PTR_ERR(q) == -ENOBUFS)
+		LIMIT_NETDEBUG(KERN_WARNING "%s%s", prefix, msg);
+}
+EXPORT_SYMBOL(inet_frag_maybe_warn_overflow);
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index 6877645..996df36 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -31,8 +31,8 @@
  *  At the moment of writing this notes identifier of IP packets is generated
  *  to be unpredictable using this code only for packets subjected
  *  (actually or potentially) to defragmentation.  I.e. DF packets less than
- *  PMTU in size uses a constant ID and do not use this code (see
- *  ip_select_ident() in include/net/ip.h).
+ *  PMTU in size when local fragmentation is disabled use a constant ID and do
+ *  not use this code (see ip_select_ident() in include/net/ip.h).
  *
  *  Route cache entries hold references to our nodes.
  *  New cache entries get references via lookup by destination IP address in
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 0ad6035..daf19ad 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -20,6 +20,8 @@
  *		Patrick McHardy :	LRU queue of frag heads for evictor.
  */
 
+#define pr_fmt(fmt) "IPv4: " fmt
+
 #include <linux/compiler.h>
 #include <linux/module.h>
 #include <linux/types.h>
@@ -249,8 +251,7 @@ static void ip_expire(unsigned long arg)
 		if (!head->dev)
 			goto out_rcu_unlock;
 
-		/* skb dst is stale, drop it, and perform route lookup again */
-		skb_dst_drop(head);
+		/* skb has no dst, perform route lookup again */
 		iph = ip_hdr(head);
 		err = ip_route_input_noref(head, iph->daddr, iph->saddr,
 					   iph->tos, head->dev);
@@ -292,14 +293,12 @@ static inline struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user)
 	hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol);
 
 	q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash);
-	if (q == NULL)
-		goto out_nomem;
+	if (IS_ERR_OR_NULL(q)) {
+		inet_frag_maybe_warn_overflow(q, pr_fmt());
+		return NULL;
+	}
 	return container_of(q, struct ipq, q);
-
-out_nomem:
-	LIMIT_NETDEBUG(KERN_ERR "ip_frag_create: no memory left !\n");
-	return NULL;
 }
 
 /* Is the fragment too far ahead to be part of ipq?
  */
@@ -517,8 +516,16 @@ found:
 		qp->q.last_in |= INET_FRAG_FIRST_IN;
 
 	if (qp->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
-	    qp->q.meat == qp->q.len)
-		return ip_frag_reasm(qp, prev, dev);
+	    qp->q.meat == qp->q.len) {
+		unsigned long orefdst = skb->_skb_refdst;
+
+		skb->_skb_refdst = 0UL;
+		err = ip_frag_reasm(qp, prev, dev);
+		skb->_skb_refdst = orefdst;
+		return err;
+	}
+
+	skb_dst_drop(skb);
 
 	write_lock(&ip4_frags.lock);
 	list_move_tail(&qp->q.lru_list, &qp->q.net->lru_list);
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index d7bb94c..91ddbd4 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -716,6 +716,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
 		tiph = &tunnel->parms.iph;
 	}
 
+	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
 	if ((dst = tiph->daddr) == 0) {
 		/* NBMA tunnel */
 
@@ -853,7 +854,6 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
 	skb_reset_transport_header(skb);
 	skb_push(skb, gre_hlen);
 	skb_reset_network_header(skb);
-	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
 	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
 			      IPSKB_REROUTED);
 	skb_dst_drop(skb);
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index 42dd1a9..40eb4fc 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -358,7 +358,6 @@ int ip_options_compile(struct net *net,
 				}
 				switch (optptr[3]&0xF) {
 				      case IPOPT_TS_TSONLY:
-					opt->ts = optptr - iph;
 					if (skb)
 						timeptr = &optptr[optptr[2]-1];
 					opt->ts_needtime = 1;
@@ -369,7 +368,6 @@ int ip_options_compile(struct net *net,
 						pp_ptr = optptr + 2;
 						goto error;
 					}
-					opt->ts = optptr - iph;
 					if (rt)  {
 						memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4);
 						timeptr = &optptr[optptr[2]+3];
@@ -383,7 +381,6 @@ int ip_options_compile(struct net *net,
 						pp_ptr = optptr + 2;
 						goto error;
 					}
-					opt->ts = optptr - iph;
 					{
 						__be32 addr;
 						memcpy(&addr, &optptr[optptr[2]-1], 4);
@@ -416,12 +413,12 @@ int ip_options_compile(struct net *net,
 					pp_ptr = optptr + 3;
 					goto error;
 				}
-				opt->ts = optptr - iph;
 				if (skb) {
 					optptr[3] = (optptr[3]&0xF)|((overflow+1)<<4);
 					opt->is_changed = 1;
 				}
 			}
+			opt->ts = optptr - iph;
 			break;
 		      case IPOPT_RA:
 			if (optlen < 4) {
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 6daaf60..69b32cb 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -161,7 +161,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
 	iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
 	iph->saddr = saddr;
 	iph->protocol = sk->sk_protocol;
-	ip_select_ident(iph, &rt->dst, sk);
+	ip_select_ident(skb, &rt->dst, sk);
 
 	if (opt && opt->opt.optlen) {
 		iph->ihl += opt->opt.optlen>>2;
@@ -411,7 +411,7 @@ packet_routed:
 		ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
 	}
 
-	ip_select_ident_more(iph, &rt->dst, sk,
+	ip_select_ident_more(skb, &rt->dst, sk,
 			     (skb_shinfo(skb)->gso_segs ?: 1) - 1);
 
 	skb->priority = sk->sk_priority;
@@ -1354,7 +1354,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
 	iph->ihl = 5;
 	iph->tos = inet->tos;
 	iph->frag_off = df;
-	ip_select_ident(iph, &rt->dst, sk);
+	ip_select_ident(skb, &rt->dst, sk);
 	iph->ttl = ttl;
 	iph->protocol = sk->sk_protocol;
 	ip_copy_addrs(iph, fl4);
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 6f06f7f..6d1165f 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -448,6 +448,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 	if (tos & 1)
 		tos = old_iph->tos;
 
+	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
 	if (!dst) {
 		/* NBMA tunnel */
 		if ((rt = skb_rtable(skb)) == NULL) {
@@ -531,7 +532,6 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 	skb->transport_header = skb->network_header;
 	skb_push(skb, sizeof(struct iphdr));
 	skb_reset_network_header(skb);
-	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
 	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
 			      IPSKB_REROUTED);
 	skb_dst_drop(skb);
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index dc89714..033ea7e 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1575,7 +1575,7 @@ static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
 	iph->protocol	=	IPPROTO_IPIP;
 	iph->ihl	=	5;
 	iph->tot_len	=	htons(skb->len);
-	ip_select_ident(iph, skb_dst(skb), NULL);
+	ip_select_ident(skb, skb_dst(skb), NULL);
 	ip_send_check(iph);
 
 	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index fba9f908..06ca6fe 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -458,8 +458,8 @@ void ping_err(struct sk_buff *skb, int offset, u32 info)
 	int family;
 	struct icmphdr *icmph;
 	struct inet_sock *inet_sock;
-	int type;
-	int code;
+	int type = icmp_hdr(skb)->type;
+	int code = icmp_hdr(skb)->code;
 	struct net *net = dev_net(skb->dev);
 	struct sock *sk;
 	int harderr;
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 4b32679..14b0486 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -379,7 +379,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
 		iph->check   = 0;
 		iph->tot_len = htons(length);
 		if (!iph->id)
-			ip_select_ident(iph, &rt->dst, NULL);
+			ip_select_ident(skb, &rt->dst, NULL);
 
 		iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
 	}
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 3f6a67d..62d9020 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -345,8 +345,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
 	 * hasn't changed since we received the original syn, but I see
 	 * no easy way to do this.
 	 */
-	flowi4_init_output(&fl4, 0, sk->sk_mark, RT_CONN_FLAGS(sk),
-			   RT_SCOPE_UNIVERSE, IPPROTO_TCP,
+	flowi4_init_output(&fl4, sk->sk_bound_dev_if, sk->sk_mark,
+			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, IPPROTO_TCP,
 			   inet_sk_flowi_flags(sk),
 			   (opt && opt->srr) ? opt->faddr : ireq->rmt_addr,
 			   ireq->loc_addr, th->source, th->dest,
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 46b5950..3a22b1c 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -32,6 +32,8 @@ static int tcp_adv_win_scale_min = -31;
 static int tcp_adv_win_scale_max = 31;
 static int ip_ttl_min = 1;
 static int ip_ttl_max = 255;
+static int tcp_syn_retries_min = 1;
+static int tcp_syn_retries_max = MAX_TCP_SYNCNT;
 static int ip_ping_group_range_min[] = { 0, 0 };
 static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };
 
@@ -231,7 +233,9 @@ static struct ctl_table ipv4_table[] = {
 		.data		= &sysctl_tcp_syn_retries,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &tcp_syn_retries_min,
+		.extra2		= &tcp_syn_retries_max
 	},
 	{
 		.procname	= "tcp_synack_retries",
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 0e8fe7b..96bb1dc 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3058,7 +3058,11 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
 
 	for (i = 0; i < shi->nr_frags; ++i) {
 		const struct skb_frag_struct *f = &shi->frags[i];
-		sg_set_page(&sg, f->page, f->size, f->page_offset);
+		unsigned int offset = f->page_offset;
+		struct page *page = f->page + (offset >> PAGE_SHIFT);
+
+		sg_set_page(&sg, page, f->size,
+			    offset_in_page(offset));
 		if (crypto_hash_update(desc, &sg, f->size))
 			return 1;
 	}
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index f376b05..b78eac2 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -204,8 +204,8 @@ static u32 cubic_root(u64 a)
  */
 static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
 {
-	u64 offs;
-	u32 delta, t, bic_target, max_cnt;
+	u32 delta, bic_target, max_cnt;
+	u64 offs, t;
 
 	ca->ack_cnt++;	/* count the number of ACKs */
 
@@ -248,9 +248,11 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
 	 *	if the cwnd < 1 million packets !!!
 	 */
 
+	t = (s32)(tcp_time_stamp - ca->epoch_start);
+	t += msecs_to_jiffies(ca->delay_min >> 3);
 	/* change the unit from HZ to bictcp_HZ */
-	t = ((tcp_time_stamp + msecs_to_jiffies(ca->delay_min>>3)
-	      - ca->epoch_start) << BICTCP_HZ) / HZ;
+	t <<= BICTCP_HZ;
+	do_div(t, HZ);
 
 	if (t < ca->bic_K)		/* t - K */
 		offs = ca->bic_K - t;
@@ -412,7 +414,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us)
 		return;
 
 	/* Discard delay samples right after fast recovery */
-	if ((s32)(tcp_time_stamp - ca->epoch_start) < HZ)
+	if (ca->epoch_start && (s32)(tcp_time_stamp - ca->epoch_start) < HZ)
 		return;
 
 	delay = (rtt_us << 3) / USEC_PER_MSEC;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 7f4e391..b1172b4 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -115,6 +115,7 @@ int sysctl_tcp_abc __read_mostly;
 #define FLAG_DSACKING_ACK	0x800 /* SACK blocks contained D-SACK info */
 #define FLAG_NONHEAD_RETRANS_ACKED	0x1000 /* Non-head rexmitted data was ACKed */
 #define FLAG_SACK_RENEGING	0x2000 /* snd_una advanced to a sacked seq */
+#define FLAG_UPDATE_TS_RECENT	0x4000 /* tcp_replace_ts_recent() */
 
 #define FLAG_ACKED		(FLAG_DATA_ACKED|FLAG_SYN_ACKED)
 #define FLAG_NOT_DUP		(FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
@@ -2244,11 +2245,8 @@ void tcp_enter_loss(struct sock *sk, int how)
 	if (tcp_is_reno(tp))
 		tcp_reset_reno_sack(tp);
 
-	if (!how) {
-		/* Push undo marker, if it was plain RTO and nothing
-		 * was retransmitted. */
-		tp->undo_marker = tp->snd_una;
-	} else {
+	tp->undo_marker = tp->snd_una;
+	if (how) {
 		tp->sacked_out = 0;
 		tp->fackets_out = 0;
 	}
@@ -3659,6 +3657,27 @@ static void tcp_send_challenge_ack(struct sock *sk)
 	}
 }
 
+static void tcp_store_ts_recent(struct tcp_sock *tp)
+{
+	tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval;
+	tp->rx_opt.ts_recent_stamp = get_seconds();
+}
+
+static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
+{
+	if (tp->rx_opt.saw_tstamp && !after(seq, tp->rcv_wup)) {
+		/* PAWS bug workaround wrt. ACK frames, the PAWS discard
+		 * extra check below makes sure this can only happen
+		 * for pure ACK frames.  -DaveM
+		 *
+		 * Not only, also it occurs for expired timestamps.
+		 */
+
+		if (tcp_paws_check(&tp->rx_opt, 0))
+			tcp_store_ts_recent(tp);
+	}
+}
+
 /* This routine deals with incoming acks, but not outgoing ones. */
 static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
 {
@@ -3705,6 +3724,12 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
 	prior_fackets = tp->fackets_out;
 	prior_in_flight = tcp_packets_in_flight(tp);
 
+	/* ts_recent update must be made after we are sure that the packet
+	 * is in window.
+	 */
+	if (flag & FLAG_UPDATE_TS_RECENT)
+		tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
+
 	if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) {
 		/* Window is constant, pure forward advance.
 		 * No more checks are required.
@@ -3991,27 +4016,6 @@ u8 *tcp_parse_md5sig_option(struct tcphdr *th)
 EXPORT_SYMBOL(tcp_parse_md5sig_option);
 #endif
 
-static inline void tcp_store_ts_recent(struct tcp_sock *tp)
-{
-	tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval;
-	tp->rx_opt.ts_recent_stamp = get_seconds();
-}
-
-static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
-{
-	if (tp->rx_opt.saw_tstamp && !after(seq, tp->rcv_wup)) {
-		/* PAWS bug workaround wrt. ACK frames, the PAWS discard
-		 * extra check below makes sure this can only happen
-		 * for pure ACK frames.  -DaveM
-		 *
-		 * Not only, also it occurs for expired timestamps.
-		 */
-
-		if (tcp_paws_check(&tp->rx_opt, 0))
-			tcp_store_ts_recent(tp);
-	}
-}
-
 /* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM
  *
  * It is not fatal. If this ACK does _not_ change critical state (seqs, window)
@@ -5419,6 +5423,9 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 				if (tcp_checksum_complete_user(sk, skb))
 					goto csum_error;
 
+				if ((int)skb->truesize > sk->sk_forward_alloc)
+					goto step5;
+
 				/* Predicted packet is in window by definition.
 				 * seq == rcv_nxt and rcv_wup <= rcv_nxt.
 				 * Hence, check seq<=rcv_wup reduces to:
@@ -5430,9 +5437,6 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 
 				tcp_rcv_rtt_measure_ts(sk, skb);
 
-				if ((int)skb->truesize > sk->sk_forward_alloc)
-					goto step5;
-
 				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITS);
 
 				/* Bulk data transfer: receiver */
@@ -5480,14 +5484,10 @@ slow_path:
 	return 0;
 
 step5:
-	if (th->ack && tcp_ack(sk, skb, FLAG_SLOWPATH) < 0)
+	if (th->ack &&
+	    tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0)
 		goto discard;
 
-	/* ts_recent update must be made after we are sure that the packet
-	 * is in window.
-	 */
-	tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
-
 	tcp_rcv_rtt_measure_ts(sk, skb);
 
 	/* Process urgent data. */
@@ -5851,7 +5851,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 
 	/* step 5: check the ACK field */
 	if (th->ack) {
-		int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH) > 0;
+		int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH |
+						  FLAG_UPDATE_TS_RECENT) > 0;
 
 		switch (sk->sk_state) {
 		case TCP_SYN_RECV:
@@ -5964,11 +5965,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 	} else
 		goto discard;
 
-	/* ts_recent update must be made after we are sure that the packet
-	 * is in window.
-	 */
-	tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
-
 	/* step 6: check the URG bit */
 	tcp_urg(sk, skb, th);
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 893da5e..4966d3e 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -833,11 +833,13 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 							   &md5);
 	tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
 
-	if (tcp_packets_in_flight(tp) == 0) {
+	if (tcp_packets_in_flight(tp) == 0)
 		tcp_ca_event(sk, CA_EVENT_TX_START);
-		skb->ooo_okay = 1;
-	} else
-		skb->ooo_okay = 0;
+
+	/* if no packet is in qdisc/device queue, then allow XPS to select
+	 * another queue.
+	 */
+	skb->ooo_okay = sk_wmem_alloc_get(sk) == 0;
 
 	skb_push(skb, tcp_header_size);
 	skb_reset_transport_header(skb);
@@ -1579,8 +1581,11 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
 		goto send_now;
 	}
 
-	/* Ok, it looks like it is advisable to defer.  */
-	tp->tso_deferred = 1 | (jiffies << 1);
+	/* Ok, it looks like it is advisable to defer.
+	 * Do not rearm the timer if already set to not break TCP ACK clocking.
+	 */
+	if (!tp->tso_deferred)
+		tp->tso_deferred = 1 | (jiffies << 1);
 
 	return 1;
 
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 68277fd..966c3e1 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -765,7 +765,7 @@ send:
 /*
  * Push out all pending data as one UDP datagram. Socket is locked.
  */
-static int udp_push_pending_frames(struct sock *sk)
+int udp_push_pending_frames(struct sock *sk)
 {
 	struct udp_sock *up = udp_sk(sk);
 	struct inet_sock *inet = inet_sk(sk);
@@ -784,6 +784,7 @@ out:
 	up->pending = 0;
 	return err;
 }
+EXPORT_SYMBOL(udp_push_pending_frames);
 
 int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		size_t len)
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
index ed4bf11..938553e 100644
--- a/net/ipv4/xfrm4_mode_tunnel.c
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -54,7 +54,7 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
 
 	top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ?
 		0 : (XFRM_MODE_SKB_CB(skb)->frag_off & htons(IP_DF));
-	ip_select_ident(top_iph, dst->child, NULL);
+	ip_select_ident(skb, dst->child, NULL);
 
 	top_iph->ttl = ip4_dst_hoplimit(dst->child);