tcp: add new TCP_TW_ACK_OOW state and allow ECN bits in TOS

ECN bits in TOS are always cleared when sending in ACKs in TW. Clearing
them is problematic for TCP flows that used Accurate ECN because ECN bits
decide which service queue the packet is placed into (L4S vs Classic).
Effectively, TW ACKs are always downgraded from L4S to Classic queue
which might impact, e.g., delay the ACK will experience on the path
compared with the other packets of the flow.

Change the TW ACK sending code to differentiate:
- In tcp_v4_send_reset(), commit ba9e04a7dd ("ip: fix tos reflection
  in ack and reset packets") cleans ECN bits for TW reset and this is
  not affected.
- In tcp_v4_timewait_ack(), ECN bits for all TW ACKs are cleaned. But now
  only ECN bits of ACKs for oow data or paws_reject are cleaned, and ECN
  bits of other ACKs will not be cleaned.
- In tcp_v4_reqsk_send_ack(), commit 66b13d99d9 ("ipv4: tcp: fix TOS
  value in ACK messages sent from TIME_WAIT") did not clean ECN bits of
  ACKs for oow data or paws_reject. But now the ECN bits rae cleaned for
  these ACKs.

Signed-off-by: Ilpo Järvinen <ij@kernel.org>
Signed-off-by: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
Ilpo Järvinen 2025-03-05 23:38:51 +01:00 committed by David S. Miller
parent d722762c4e
commit 4618e195f9
5 changed files with 44 additions and 17 deletions

View File

@ -419,7 +419,8 @@ enum tcp_tw_status {
TCP_TW_SUCCESS = 0,
TCP_TW_RST = 1,
TCP_TW_ACK = 2,
TCP_TW_SYN = 3
TCP_TW_SYN = 3,
TCP_TW_ACK_OOW = 4
};

View File

@ -75,7 +75,6 @@
#include <net/checksum.h>
#include <net/gso.h>
#include <net/inetpeer.h>
#include <net/inet_ecn.h>
#include <net/lwtunnel.h>
#include <net/inet_dscp.h>
#include <linux/bpf-cgroup.h>
@ -1640,7 +1639,7 @@ void ip_send_unicast_reply(struct sock *sk, const struct sock *orig_sk,
if (IS_ERR(rt))
return;
inet_sk(sk)->tos = arg->tos & ~INET_ECN_MASK;
inet_sk(sk)->tos = arg->tos;
sk->sk_protocol = ip_hdr(skb)->protocol;
sk->sk_bound_dev_if = arg->bound_dev_if;

View File

@ -66,6 +66,7 @@
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/inet_ecn.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
@ -887,7 +888,8 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
offsetof(struct inet_timewait_sock, tw_bound_dev_if));
arg.tos = ip_hdr(skb)->tos;
/* ECN bits of TW reset are cleared */
arg.tos = ip_hdr(skb)->tos & ~INET_ECN_MASK;
arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
local_bh_disable();
local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
@ -1033,11 +1035,21 @@ static void tcp_v4_send_ack(const struct sock *sk,
local_bh_enable();
}
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb,
enum tcp_tw_status tw_status)
{
struct inet_timewait_sock *tw = inet_twsk(sk);
struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
struct tcp_key key = {};
u8 tos = tw->tw_tos;
/* Cleaning only ECN bits of TW ACKs of oow data or is paws_reject,
* while not cleaning ECN bits of other TW ACKs to avoid these ACKs
* being placed in a different service queues (Classic rather than L4S)
*/
if (tw_status == TCP_TW_ACK_OOW)
tos &= ~INET_ECN_MASK;
#ifdef CONFIG_TCP_AO
struct tcp_ao_info *ao_info;
@ -1081,7 +1093,7 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
READ_ONCE(tcptw->tw_ts_recent),
tw->tw_bound_dev_if, &key,
tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
tw->tw_tos,
tos,
tw->tw_txhash);
inet_twsk_put(tw);
@ -1151,6 +1163,7 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
key.type = TCP_KEY_MD5;
}
/* Cleaning ECN bits of TW ACKs of oow data or is paws_reject */
tcp_v4_send_ack(sk, skb, seq,
tcp_rsk(req)->rcv_nxt,
tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale,
@ -1158,7 +1171,7 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
req->ts_recent,
0, &key,
inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
ip_hdr(skb)->tos,
ip_hdr(skb)->tos & ~INET_ECN_MASK,
READ_ONCE(tcp_rsk(req)->txhash));
if (tcp_key_is_ao(&key))
kfree(key.traffic_key);
@ -2175,6 +2188,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
{
struct net *net = dev_net_rcu(skb->dev);
enum skb_drop_reason drop_reason;
enum tcp_tw_status tw_status;
int sdif = inet_sdif(skb);
int dif = inet_iif(skb);
const struct iphdr *iph;
@ -2402,7 +2416,9 @@ do_time_wait:
inet_twsk_put(inet_twsk(sk));
goto csum_error;
}
switch (tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn)) {
tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn);
switch (tw_status) {
case TCP_TW_SYN: {
struct sock *sk2 = inet_lookup_listener(net,
net->ipv4.tcp_death_row.hashinfo,
@ -2423,7 +2439,8 @@ do_time_wait:
/* to ACK */
fallthrough;
case TCP_TW_ACK:
tcp_v4_timewait_ack(sk, skb);
case TCP_TW_ACK_OOW:
tcp_v4_timewait_ack(sk, skb, tw_status);
break;
case TCP_TW_RST:
tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET);

View File

@ -44,7 +44,7 @@ tcp_timewait_check_oow_rate_limit(struct inet_timewait_sock *tw,
/* Send ACK. Note, we do not put the bucket,
* it will be released by caller.
*/
return TCP_TW_ACK;
return TCP_TW_ACK_OOW;
}
/* We are rate-limiting, so just release the tw sock and drop skb. */

View File

@ -999,7 +999,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
if (!IS_ERR(dst)) {
skb_dst_set(buff, dst);
ip6_xmit(ctl_sk, buff, &fl6, fl6.flowi6_mark, NULL,
tclass & ~INET_ECN_MASK, priority);
tclass, priority);
TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
if (rst)
TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
@ -1135,7 +1135,8 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb,
trace_tcp_send_reset(sk, skb, reason);
tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, 1,
ipv6_get_dsfield(ipv6h), label, priority, txhash,
ipv6_get_dsfield(ipv6h) & ~INET_ECN_MASK,
label, priority, txhash,
&key);
#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
@ -1155,11 +1156,16 @@ static void tcp_v6_send_ack(const struct sock *sk, struct sk_buff *skb, u32 seq,
tclass, label, priority, txhash, key);
}
static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb)
static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb,
enum tcp_tw_status tw_status)
{
struct inet_timewait_sock *tw = inet_twsk(sk);
struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
u8 tclass = tw->tw_tclass;
struct tcp_key key = {};
if (tw_status == TCP_TW_ACK_OOW)
tclass &= ~INET_ECN_MASK;
#ifdef CONFIG_TCP_AO
struct tcp_ao_info *ao_info;
@ -1203,7 +1209,7 @@ static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb)
tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
tcp_tw_tsval(tcptw),
READ_ONCE(tcptw->tw_ts_recent), tw->tw_bound_dev_if,
&key, tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel),
&key, tclass, cpu_to_be32(tw->tw_flowlabel),
tw->tw_priority, tw->tw_txhash);
#ifdef CONFIG_TCP_AO
@ -1280,7 +1286,8 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale,
tcp_rsk_tsval(tcp_rsk(req)),
req->ts_recent, sk->sk_bound_dev_if,
&key, ipv6_get_dsfield(ipv6_hdr(skb)), 0,
&key, ipv6_get_dsfield(ipv6_hdr(skb)) & ~INET_ECN_MASK,
0,
READ_ONCE(sk->sk_priority),
READ_ONCE(tcp_rsk(req)->txhash));
if (tcp_key_is_ao(&key))
@ -1742,6 +1749,7 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb)
{
struct net *net = dev_net_rcu(skb->dev);
enum skb_drop_reason drop_reason;
enum tcp_tw_status tw_status;
int sdif = inet6_sdif(skb);
int dif = inet6_iif(skb);
const struct tcphdr *th;
@ -1962,7 +1970,8 @@ do_time_wait:
goto csum_error;
}
switch (tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn)) {
tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn);
switch (tw_status) {
case TCP_TW_SYN:
{
struct sock *sk2;
@ -1987,7 +1996,8 @@ do_time_wait:
/* to ACK */
fallthrough;
case TCP_TW_ACK:
tcp_v6_timewait_ack(sk, skb);
case TCP_TW_ACK_OOW:
tcp_v6_timewait_ack(sk, skb, tw_status);
break;
case TCP_TW_RST:
tcp_v6_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET);