From 94d077c331730510d5611b438640a292097341f0 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 23 May 2025 17:11:17 +0200 Subject: [PATCH 1/8] xfrm: state: initialize state_ptrs earlier in xfrm_state_find In case of preemption, xfrm_state_look_at will find a different pcpu_id and look up states for that other CPU. If we matched a state for CPU2 in the state_cache while the lookup started on CPU1, we will jump to "found", but the "best" state that we got will be ignored and we will enter the "acquire" block. This block uses state_ptrs, which isn't initialized at this point. Let's initialize state_ptrs just after taking rcu_read_lock. This will also prevent a possible misuse in the future, if someone adjusts this function. Reported-by: syzbot+7ed9d47e15e88581dc5b@syzkaller.appspotmail.com Fixes: e952837f3ddb ("xfrm: state: fix out-of-bounds read during lookup") Signed-off-by: Sabrina Dubroca Reviewed-by: Florian Westphal Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 203b585c2ae2..2e2e95d2a06f 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1389,6 +1389,8 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, sequence = read_seqcount_begin(&net->xfrm.xfrm_state_hash_generation); rcu_read_lock(); + xfrm_hash_ptrs_get(net, &state_ptrs); + hlist_for_each_entry_rcu(x, &pol->state_cache_list, state_cache) { if (x->props.family == encap_family && x->props.reqid == tmpl->reqid && @@ -1429,8 +1431,6 @@ cached: else if (acquire_in_progress) /* XXX: acquire_in_progress should not happen */ WARN_ON(1); - xfrm_hash_ptrs_get(net, &state_ptrs); - h = __xfrm_dst_hash(daddr, saddr, tmpl->reqid, encap_family, state_ptrs.hmask); hlist_for_each_entry_rcu(x, state_ptrs.bydst + h, bydst) { #ifdef CONFIG_XFRM_OFFLOAD From 7eb11c0ab70777b9e5145a5ba1c0a2312c3980b2 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 23 May 2025 17:11:18 +0200 Subject: [PATCH 2/8] xfrm: state: use a consistent pcpu_id in xfrm_state_find If we get preempted during xfrm_state_find, we could run xfrm_state_look_at using a different pcpu_id than the one xfrm_state_find saw. This could lead to ignoring states that should have matched, and triggering acquires on a CPU that already has a pcpu state. xfrm_state_find starts on CPU1 pcpu_id = 1 lookup starts xfrm_state_look_at pcpu_id = 2 finds a state found: best->pcpu_num != pcpu_id (2 != 1) if (!x && !error && !acquire_in_progress) { ... xfrm_state_alloc xfrm_init_tempstate ... This can be avoided by passing the original pcpu_id down to all xfrm_state_look_at() calls. Also switch to raw_smp_processor_id, disabling preempting just to re-enable it immediately doesn't really make sense. Fixes: 1ddf9916ac09 ("xfrm: Add support for per cpu xfrm state handling.") Signed-off-by: Sabrina Dubroca Reviewed-by: Florian Westphal Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 2e2e95d2a06f..7e34fc94f668 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1307,14 +1307,8 @@ static void xfrm_hash_grow_check(struct net *net, int have_hash_collision) static void xfrm_state_look_at(struct xfrm_policy *pol, struct xfrm_state *x, const struct flowi *fl, unsigned short family, struct xfrm_state **best, int *acq_in_progress, - int *error) + int *error, unsigned int pcpu_id) { - /* We need the cpu id just as a lookup key, - * we don't require it to be stable. - */ - unsigned int pcpu_id = get_cpu(); - put_cpu(); - /* Resolution logic: * 1. There is a valid state with matching selector. Done. * 2. Valid state with inappropriate selector. Skip. @@ -1381,8 +1375,7 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, /* We need the cpu id just as a lookup key, * we don't require it to be stable. */ - pcpu_id = get_cpu(); - put_cpu(); + pcpu_id = raw_smp_processor_id(); to_put = NULL; @@ -1402,7 +1395,7 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, tmpl->id.proto == x->id.proto && (tmpl->id.spi == x->id.spi || !tmpl->id.spi)) xfrm_state_look_at(pol, x, fl, encap_family, - &best, &acquire_in_progress, &error); + &best, &acquire_in_progress, &error, pcpu_id); } if (best) @@ -1419,7 +1412,7 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, tmpl->id.proto == x->id.proto && (tmpl->id.spi == x->id.spi || !tmpl->id.spi)) xfrm_state_look_at(pol, x, fl, family, - &best, &acquire_in_progress, &error); + &best, &acquire_in_progress, &error, pcpu_id); } cached: @@ -1460,7 +1453,7 @@ cached: tmpl->id.proto == x->id.proto && (tmpl->id.spi == x->id.spi || !tmpl->id.spi)) xfrm_state_look_at(pol, x, fl, family, - &best, &acquire_in_progress, &error); + &best, &acquire_in_progress, &error, pcpu_id); } if (best || acquire_in_progress) goto found; @@ -1495,7 +1488,7 @@ cached: tmpl->id.proto == x->id.proto && (tmpl->id.spi == x->id.spi || !tmpl->id.spi)) xfrm_state_look_at(pol, x, fl, family, - &best, &acquire_in_progress, &error); + &best, &acquire_in_progress, &error, pcpu_id); } found: From c0f21029f123d1b15f8eddc8e3976bf0c8781c43 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 8 Jun 2025 10:42:53 +0300 Subject: [PATCH 3/8] xfrm: always initialize offload path Offload path is used for GRO with SW IPsec, and not just for HW offload. So initialize it anyway. Fixes: 585b64f5a620 ("xfrm: delay initialization of offload path till its actually requested") Reported-by: Sabrina Dubroca Closes: https://lore.kernel.org/all/aEGW_5HfPqU1rFjl@krikkit Signed-off-by: Leon Romanovsky Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 2 +- net/xfrm/xfrm_device.c | 1 - net/xfrm/xfrm_state.c | 6 ++---- net/xfrm/xfrm_user.c | 1 + 4 files changed, 4 insertions(+), 6 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index a21e276dbe44..e45a275fca26 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -474,7 +474,7 @@ struct xfrm_type_offload { int xfrm_register_type_offload(const struct xfrm_type_offload *type, unsigned short family); void xfrm_unregister_type_offload(const struct xfrm_type_offload *type, unsigned short family); -void xfrm_set_type_offload(struct xfrm_state *x); +void xfrm_set_type_offload(struct xfrm_state *x, bool try_load); static inline void xfrm_unset_type_offload(struct xfrm_state *x) { if (!x->type_offload) diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 81fd486b5e56..d2819baea414 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -305,7 +305,6 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, return -EINVAL; } - xfrm_set_type_offload(x); if (!x->type_offload) { NL_SET_ERR_MSG(extack, "Type doesn't support offload"); dev_put(dev); diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 7e34fc94f668..c7e6472c623d 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -424,11 +424,10 @@ void xfrm_unregister_type_offload(const struct xfrm_type_offload *type, } EXPORT_SYMBOL(xfrm_unregister_type_offload); -void xfrm_set_type_offload(struct xfrm_state *x) +void xfrm_set_type_offload(struct xfrm_state *x, bool try_load) { const struct xfrm_type_offload *type = NULL; struct xfrm_state_afinfo *afinfo; - bool try_load = true; retry: afinfo = xfrm_state_get_afinfo(x->props.family); @@ -607,6 +606,7 @@ static void ___xfrm_state_destroy(struct xfrm_state *x) kfree(x->coaddr); kfree(x->replay_esn); kfree(x->preplay_esn); + xfrm_unset_type_offload(x); if (x->type) { x->type->destructor(x); xfrm_put_type(x->type); @@ -780,8 +780,6 @@ void xfrm_dev_state_free(struct xfrm_state *x) struct xfrm_dev_offload *xso = &x->xso; struct net_device *dev = READ_ONCE(xso->dev); - xfrm_unset_type_offload(x); - if (dev && dev->xfrmdev_ops) { spin_lock_bh(&xfrm_state_dev_gc_lock); if (!hlist_unhashed(&x->dev_gclist)) diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 59f258daf830..1db18f470f42 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -977,6 +977,7 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, /* override default values from above */ xfrm_update_ae_params(x, attrs, 0); + xfrm_set_type_offload(x, attrs[XFRMA_OFFLOAD_DEV]); /* configure the hardware if offload is requested */ if (attrs[XFRMA_OFFLOAD_DEV]) { err = xfrm_dev_state_add(net, x, From 3ac9e29211fa2df5539ba0d742c8fe9fe95fdc79 Mon Sep 17 00:00:00 2001 From: Tobias Brunner Date: Tue, 24 Jun 2025 14:47:20 +0200 Subject: [PATCH 4/8] xfrm: Set transport header to fix UDP GRO handling The referenced commit replaced a call to __xfrm4|6_udp_encap_rcv() with a custom check for non-ESP markers. But what the called function also did was setting the transport header to the ESP header. The function that follows, esp4|6_gro_receive(), relies on that being set when it calls xfrm_parse_spi(). We have to set the full offset as the skb's head was not moved yet so adding just the UDP header length won't work. Fixes: e3fd05777685 ("xfrm: Fix UDP GRO handling for some corner cases") Signed-off-by: Tobias Brunner Signed-off-by: Steffen Klassert --- net/ipv4/xfrm4_input.c | 3 +++ net/ipv6/xfrm6_input.c | 3 +++ 2 files changed, 6 insertions(+) diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index 0d31a8c108d4..f28cfd88eaf5 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -202,6 +202,9 @@ struct sk_buff *xfrm4_gro_udp_encap_rcv(struct sock *sk, struct list_head *head, if (len <= sizeof(struct ip_esp_hdr) || udpdata32[0] == 0) goto out; + /* set the transport header to ESP */ + skb_set_transport_header(skb, offset); + NAPI_GRO_CB(skb)->proto = IPPROTO_UDP; pp = call_gro_receive(ops->callbacks.gro_receive, head, skb); diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c index 841c81abaaf4..9005fc156a20 100644 --- a/net/ipv6/xfrm6_input.c +++ b/net/ipv6/xfrm6_input.c @@ -202,6 +202,9 @@ struct sk_buff *xfrm6_gro_udp_encap_rcv(struct sock *sk, struct list_head *head, if (len <= sizeof(struct ip_esp_hdr) || udpdata32[0] == 0) goto out; + /* set the transport header to ESP */ + skb_set_transport_header(skb, offset); + NAPI_GRO_CB(skb)->proto = IPPROTO_UDP; pp = call_gro_receive(ops->callbacks.gro_receive, head, skb); From 2ca58d87ebae20906cf808ef813d747db0177a18 Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Tue, 24 Jun 2025 15:11:15 +0200 Subject: [PATCH 5/8] xfrm: ipcomp: adjust transport header after decompressing The skb transport header pointer needs to be adjusted by network header pointer plus the size of the ipcomp header. This shows up when running traffic over ipcomp using transport mode. After being reinjected, packets are dropped because the header isn't adjusted properly and some checks can be triggered. E.g the skb is mistakenly considered as IP fragmented packet and later dropped. kworker/30:1-mm 443 [030] 102.055250: skb:kfree_skb:skbaddr=0xffff8f104aa3ce00 rx_sk=( ffffffff8419f1f4 sk_skb_reason_drop+0x94 ([kernel.kallsyms]) ffffffff8419f1f4 sk_skb_reason_drop+0x94 ([kernel.kallsyms]) ffffffff84281420 ip_defrag+0x4b0 ([kernel.kallsyms]) ffffffff8428006e ip_local_deliver+0x4e ([kernel.kallsyms]) ffffffff8432afb1 xfrm_trans_reinject+0xe1 ([kernel.kallsyms]) ffffffff83758230 process_one_work+0x190 ([kernel.kallsyms]) ffffffff83758f37 worker_thread+0x2d7 ([kernel.kallsyms]) ffffffff83761cc9 kthread+0xf9 ([kernel.kallsyms]) ffffffff836c3437 ret_from_fork+0x197 ([kernel.kallsyms]) ffffffff836718da ret_from_fork_asm+0x1a ([kernel.kallsyms]) Fixes: eb2953d26971 ("xfrm: ipcomp: Use crypto_acomp interface") Link: https://bugzilla.suse.com/1244532 Signed-off-by: Fernando Fernandez Mancera Acked-by: Herbert Xu Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_ipcomp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/xfrm/xfrm_ipcomp.c b/net/xfrm/xfrm_ipcomp.c index 907c3ccb440d..a38545413b80 100644 --- a/net/xfrm/xfrm_ipcomp.c +++ b/net/xfrm/xfrm_ipcomp.c @@ -97,7 +97,7 @@ static int ipcomp_input_done2(struct sk_buff *skb, int err) struct ip_comp_hdr *ipch = ip_comp_hdr(skb); const int plen = skb->len; - skb_reset_transport_header(skb); + skb->transport_header = skb->network_header + sizeof(*ipch); return ipcomp_post_acomp(skb, err, 0) ?: skb->len < (plen + sizeof(ip_comp_hdr)) ? -EINVAL : From a90b2a1aaacbcf0f91d7e4868ad6c51c5dee814b Mon Sep 17 00:00:00 2001 From: Eyal Birger Date: Thu, 3 Jul 2025 10:02:58 -0700 Subject: [PATCH 6/8] xfrm: interface: fix use-after-free after changing collect_md xfrm interface collect_md property on xfrm interfaces can only be set on device creation, thus xfrmi_changelink() should fail when called on such interfaces. The check to enforce this was done only in the case where the xi was returned from xfrmi_locate() which doesn't look for the collect_md interface, and thus the validation was never reached. Calling changelink would thus errornously place the special interface xi in the xfrmi_net->xfrmi hash, but since it also exists in the xfrmi_net->collect_md_xfrmi pointer it would lead to a double free when the net namespace was taken down [1]. Change the check to use the xi from netdev_priv which is available earlier in the function to prevent changes in xfrm collect_md interfaces. [1] resulting oops: [ 8.516540] kernel BUG at net/core/dev.c:12029! [ 8.516552] Oops: invalid opcode: 0000 [#1] SMP NOPTI [ 8.516559] CPU: 0 UID: 0 PID: 12 Comm: kworker/u80:0 Not tainted 6.15.0-virtme #5 PREEMPT(voluntary) [ 8.516565] Hardware name: QEMU Ubuntu 24.04 PC (i440FX + PIIX, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 [ 8.516569] Workqueue: netns cleanup_net [ 8.516579] RIP: 0010:unregister_netdevice_many_notify+0x101/0xab0 [ 8.516590] Code: 90 0f 0b 90 48 8b b0 78 01 00 00 48 8b 90 80 01 00 00 48 89 56 08 48 89 32 4c 89 80 78 01 00 00 48 89 b8 80 01 00 00 eb ac 90 <0f> 0b 48 8b 45 00 4c 8d a0 88 fe ff ff 48 39 c5 74 5c 41 80 bc 24 [ 8.516593] RSP: 0018:ffffa93b8006bd30 EFLAGS: 00010206 [ 8.516598] RAX: ffff98fe4226e000 RBX: ffffa93b8006bd58 RCX: ffffa93b8006bc60 [ 8.516601] RDX: 0000000000000004 RSI: 0000000000000000 RDI: dead000000000122 [ 8.516603] RBP: ffffa93b8006bdd8 R08: dead000000000100 R09: ffff98fe4133c100 [ 8.516605] R10: 0000000000000000 R11: 00000000000003d2 R12: ffffa93b8006be00 [ 8.516608] R13: ffffffff96c1a510 R14: ffffffff96c1a510 R15: ffffa93b8006be00 [ 8.516615] FS: 0000000000000000(0000) GS:ffff98fee73b7000(0000) knlGS:0000000000000000 [ 8.516619] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 8.516622] CR2: 00007fcd2abd0700 CR3: 000000003aa40000 CR4: 0000000000752ef0 [ 8.516625] PKRU: 55555554 [ 8.516627] Call Trace: [ 8.516632] [ 8.516635] ? rtnl_is_locked+0x15/0x20 [ 8.516641] ? unregister_netdevice_queue+0x29/0xf0 [ 8.516650] ops_undo_list+0x1f2/0x220 [ 8.516659] cleanup_net+0x1ad/0x2e0 [ 8.516664] process_one_work+0x160/0x380 [ 8.516673] worker_thread+0x2aa/0x3c0 [ 8.516679] ? __pfx_worker_thread+0x10/0x10 [ 8.516686] kthread+0xfb/0x200 [ 8.516690] ? __pfx_kthread+0x10/0x10 [ 8.516693] ? __pfx_kthread+0x10/0x10 [ 8.516697] ret_from_fork+0x82/0xf0 [ 8.516705] ? __pfx_kthread+0x10/0x10 [ 8.516709] ret_from_fork_asm+0x1a/0x30 [ 8.516718] Fixes: abc340b38ba2 ("xfrm: interface: support collect metadata mode") Reported-by: Lonial Con Signed-off-by: Eyal Birger Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_interface_core.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/net/xfrm/xfrm_interface_core.c b/net/xfrm/xfrm_interface_core.c index cb1e12740c87..330a05286a56 100644 --- a/net/xfrm/xfrm_interface_core.c +++ b/net/xfrm/xfrm_interface_core.c @@ -875,7 +875,7 @@ static int xfrmi_changelink(struct net_device *dev, struct nlattr *tb[], return -EINVAL; } - if (p.collect_md) { + if (p.collect_md || xi->p.collect_md) { NL_SET_ERR_MSG(extack, "collect_md can't be changed"); return -EINVAL; } @@ -886,11 +886,6 @@ static int xfrmi_changelink(struct net_device *dev, struct nlattr *tb[], } else { if (xi->dev != dev) return -EEXIST; - if (xi->p.collect_md) { - NL_SET_ERR_MSG(extack, - "device can't be changed to collect_md"); - return -EINVAL; - } } return xfrmi_update(xi, &p); From b441cf3f8c4b8576639d20c8eb4aa32917602ecd Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 4 Jul 2025 16:54:33 +0200 Subject: [PATCH 7/8] xfrm: delete x->tunnel as we delete x The ipcomp fallback tunnels currently get deleted (from the various lists and hashtables) as the last user state that needed that fallback is destroyed (not deleted). If a reference to that user state still exists, the fallback state will remain on the hashtables/lists, triggering the WARN in xfrm_state_fini. Because of those remaining references, the fix in commit f75a2804da39 ("xfrm: destroy xfrm_state synchronously on net exit path") is not complete. We recently fixed one such situation in TCP due to defered freeing of skbs (commit 9b6412e6979f ("tcp: drop secpath at the same time as we currently drop dst")). This can also happen due to IP reassembly: skbs with a secpath remain on the reassembly queue until netns destruction. If we can't guarantee that the queues are flushed by the time xfrm_state_fini runs, there may still be references to a (user) xfrm_state, preventing the timely deletion of the corresponding fallback state. Instead of chasing each instance of skbs holding a secpath one by one, this patch fixes the issue directly within xfrm, by deleting the fallback state as soon as the last user state depending on it has been deleted. Destruction will still happen when the final reference is dropped. A separate lockdep class for the fallback state is required since we're going to lock x->tunnel while x is locked. Fixes: 9d4139c76905 ("netns xfrm: per-netns xfrm_state_all list") Signed-off-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 1 - net/ipv4/ipcomp.c | 2 ++ net/ipv6/ipcomp6.c | 2 ++ net/ipv6/xfrm6_tunnel.c | 2 +- net/xfrm/xfrm_ipcomp.c | 1 - net/xfrm/xfrm_state.c | 19 ++++++++----------- 6 files changed, 13 insertions(+), 14 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index e45a275fca26..91d52a380e37 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -441,7 +441,6 @@ int xfrm_input_register_afinfo(const struct xfrm_input_afinfo *afinfo); int xfrm_input_unregister_afinfo(const struct xfrm_input_afinfo *afinfo); void xfrm_flush_gc(void); -void xfrm_state_delete_tunnel(struct xfrm_state *x); struct xfrm_type { struct module *owner; diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index 5a4fb2539b08..9a45aed508d1 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -54,6 +54,7 @@ static int ipcomp4_err(struct sk_buff *skb, u32 info) } /* We always hold one tunnel user reference to indicate a tunnel */ +static struct lock_class_key xfrm_state_lock_key; static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x) { struct net *net = xs_net(x); @@ -62,6 +63,7 @@ static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x) t = xfrm_state_alloc(net); if (!t) goto out; + lockdep_set_class(&t->lock, &xfrm_state_lock_key); t->id.proto = IPPROTO_IPIP; t->id.spi = x->props.saddr.a4; diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c index 72d4858dec18..8607569de34f 100644 --- a/net/ipv6/ipcomp6.c +++ b/net/ipv6/ipcomp6.c @@ -71,6 +71,7 @@ static int ipcomp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, return 0; } +static struct lock_class_key xfrm_state_lock_key; static struct xfrm_state *ipcomp6_tunnel_create(struct xfrm_state *x) { struct net *net = xs_net(x); @@ -79,6 +80,7 @@ static struct xfrm_state *ipcomp6_tunnel_create(struct xfrm_state *x) t = xfrm_state_alloc(net); if (!t) goto out; + lockdep_set_class(&t->lock, &xfrm_state_lock_key); t->id.proto = IPPROTO_IPV6; t->id.spi = xfrm6_tunnel_alloc_spi(net, (xfrm_address_t *)&x->props.saddr); diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c index bf140ef781c1..7fd8bc08e6eb 100644 --- a/net/ipv6/xfrm6_tunnel.c +++ b/net/ipv6/xfrm6_tunnel.c @@ -334,8 +334,8 @@ static void __net_exit xfrm6_tunnel_net_exit(struct net *net) struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); unsigned int i; - xfrm_flush_gc(); xfrm_state_flush(net, 0, false, true); + xfrm_flush_gc(); for (i = 0; i < XFRM6_TUNNEL_SPI_BYADDR_HSIZE; i++) WARN_ON_ONCE(!hlist_empty(&xfrm6_tn->spi_byaddr[i])); diff --git a/net/xfrm/xfrm_ipcomp.c b/net/xfrm/xfrm_ipcomp.c index a38545413b80..43fdc6ed8dd1 100644 --- a/net/xfrm/xfrm_ipcomp.c +++ b/net/xfrm/xfrm_ipcomp.c @@ -313,7 +313,6 @@ void ipcomp_destroy(struct xfrm_state *x) struct ipcomp_data *ipcd = x->data; if (!ipcd) return; - xfrm_state_delete_tunnel(x); ipcomp_free_data(ipcd); kfree(ipcd); } diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index c7e6472c623d..f7110a658897 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -811,6 +811,7 @@ void __xfrm_state_destroy(struct xfrm_state *x, bool sync) } EXPORT_SYMBOL(__xfrm_state_destroy); +static void xfrm_state_delete_tunnel(struct xfrm_state *x); int __xfrm_state_delete(struct xfrm_state *x) { struct net *net = xs_net(x); @@ -838,6 +839,8 @@ int __xfrm_state_delete(struct xfrm_state *x) xfrm_dev_state_delete(x); + xfrm_state_delete_tunnel(x); + /* All xfrm_state objects are created by xfrm_state_alloc. * The xfrm_state_alloc call gives a reference, and that * is what we are dropping here. @@ -941,10 +944,7 @@ restart: err = xfrm_state_delete(x); xfrm_audit_state_delete(x, err ? 0 : 1, task_valid); - if (sync) - xfrm_state_put_sync(x); - else - xfrm_state_put(x); + xfrm_state_put(x); if (!err) cnt++; @@ -3068,20 +3068,17 @@ void xfrm_flush_gc(void) } EXPORT_SYMBOL(xfrm_flush_gc); -/* Temporarily located here until net/xfrm/xfrm_tunnel.c is created */ -void xfrm_state_delete_tunnel(struct xfrm_state *x) +static void xfrm_state_delete_tunnel(struct xfrm_state *x) { if (x->tunnel) { struct xfrm_state *t = x->tunnel; - if (atomic_read(&t->tunnel_users) == 2) + if (atomic_dec_return(&t->tunnel_users) == 1) xfrm_state_delete(t); - atomic_dec(&t->tunnel_users); - xfrm_state_put_sync(t); + xfrm_state_put(t); x->tunnel = NULL; } } -EXPORT_SYMBOL(xfrm_state_delete_tunnel); u32 xfrm_state_mtu(struct xfrm_state *x, int mtu) { @@ -3286,8 +3283,8 @@ void xfrm_state_fini(struct net *net) unsigned int sz; flush_work(&net->xfrm.state_hash_work); - flush_work(&xfrm_state_gc_work); xfrm_state_flush(net, 0, false, true); + flush_work(&xfrm_state_gc_work); WARN_ON(!list_empty(&net->xfrm.state_all)); From 2a198bbec6913ae1c90ec963750003c6213668c7 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 4 Jul 2025 16:54:34 +0200 Subject: [PATCH 8/8] Revert "xfrm: destroy xfrm_state synchronously on net exit path" This reverts commit f75a2804da391571563c4b6b29e7797787332673. With all states (whether user or kern) removed from the hashtables during deletion, there's no need for synchronous destruction of states. xfrm6_tunnel states still need to have been destroyed (which will be the case when its last user is deleted (not destroyed)) so that xfrm6_tunnel_free_spi removes it from the per-netns hashtable before the netns is destroyed. This has the benefit of skipping one synchronize_rcu per state (in __xfrm_state_destroy(sync=true)) when we exit a netns. Signed-off-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 12 +++--------- net/ipv6/xfrm6_tunnel.c | 2 +- net/key/af_key.c | 2 +- net/xfrm/xfrm_state.c | 23 +++++++++-------------- net/xfrm/xfrm_user.c | 2 +- 5 files changed, 15 insertions(+), 26 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 91d52a380e37..f3014e4f54fc 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -915,7 +915,7 @@ static inline void xfrm_pols_put(struct xfrm_policy **pols, int npols) xfrm_pol_put(pols[i]); } -void __xfrm_state_destroy(struct xfrm_state *, bool); +void __xfrm_state_destroy(struct xfrm_state *); static inline void __xfrm_state_put(struct xfrm_state *x) { @@ -925,13 +925,7 @@ static inline void __xfrm_state_put(struct xfrm_state *x) static inline void xfrm_state_put(struct xfrm_state *x) { if (refcount_dec_and_test(&x->refcnt)) - __xfrm_state_destroy(x, false); -} - -static inline void xfrm_state_put_sync(struct xfrm_state *x) -{ - if (refcount_dec_and_test(&x->refcnt)) - __xfrm_state_destroy(x, true); + __xfrm_state_destroy(x); } static inline void xfrm_state_hold(struct xfrm_state *x) @@ -1769,7 +1763,7 @@ struct xfrmk_spdinfo { struct xfrm_state *xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq, u32 pcpu_num); int xfrm_state_delete(struct xfrm_state *x); -int xfrm_state_flush(struct net *net, u8 proto, bool task_valid, bool sync); +int xfrm_state_flush(struct net *net, u8 proto, bool task_valid); int xfrm_dev_state_flush(struct net *net, struct net_device *dev, bool task_valid); int xfrm_dev_policy_flush(struct net *net, struct net_device *dev, bool task_valid); diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c index 7fd8bc08e6eb..5120a763da0d 100644 --- a/net/ipv6/xfrm6_tunnel.c +++ b/net/ipv6/xfrm6_tunnel.c @@ -334,7 +334,7 @@ static void __net_exit xfrm6_tunnel_net_exit(struct net *net) struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); unsigned int i; - xfrm_state_flush(net, 0, false, true); + xfrm_state_flush(net, IPSEC_PROTO_ANY, false); xfrm_flush_gc(); for (i = 0; i < XFRM6_TUNNEL_SPI_BYADDR_HSIZE; i++) diff --git a/net/key/af_key.c b/net/key/af_key.c index efc2a91f4c48..b5d761700776 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -1766,7 +1766,7 @@ static int pfkey_flush(struct sock *sk, struct sk_buff *skb, const struct sadb_m if (proto == 0) return -EINVAL; - err = xfrm_state_flush(net, proto, true, false); + err = xfrm_state_flush(net, proto, true); err2 = unicast_flush_resp(sk, hdr); if (err || err2) { if (err == -ESRCH) /* empty table - go quietly */ diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index f7110a658897..327a1a6f892c 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -592,7 +592,7 @@ void xfrm_state_free(struct xfrm_state *x) } EXPORT_SYMBOL(xfrm_state_free); -static void ___xfrm_state_destroy(struct xfrm_state *x) +static void xfrm_state_gc_destroy(struct xfrm_state *x) { if (x->mode_cbs && x->mode_cbs->destroy_state) x->mode_cbs->destroy_state(x); @@ -631,7 +631,7 @@ static void xfrm_state_gc_task(struct work_struct *work) synchronize_rcu(); hlist_for_each_entry_safe(x, tmp, &gc_list, gclist) - ___xfrm_state_destroy(x); + xfrm_state_gc_destroy(x); } static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me) @@ -795,19 +795,14 @@ void xfrm_dev_state_free(struct xfrm_state *x) } #endif -void __xfrm_state_destroy(struct xfrm_state *x, bool sync) +void __xfrm_state_destroy(struct xfrm_state *x) { WARN_ON(x->km.state != XFRM_STATE_DEAD); - if (sync) { - synchronize_rcu(); - ___xfrm_state_destroy(x); - } else { - spin_lock_bh(&xfrm_state_gc_lock); - hlist_add_head(&x->gclist, &xfrm_state_gc_list); - spin_unlock_bh(&xfrm_state_gc_lock); - schedule_work(&xfrm_state_gc_work); - } + spin_lock_bh(&xfrm_state_gc_lock); + hlist_add_head(&x->gclist, &xfrm_state_gc_list); + spin_unlock_bh(&xfrm_state_gc_lock); + schedule_work(&xfrm_state_gc_work); } EXPORT_SYMBOL(__xfrm_state_destroy); @@ -922,7 +917,7 @@ xfrm_dev_state_flush_secctx_check(struct net *net, struct net_device *dev, bool } #endif -int xfrm_state_flush(struct net *net, u8 proto, bool task_valid, bool sync) +int xfrm_state_flush(struct net *net, u8 proto, bool task_valid) { int i, err = 0, cnt = 0; @@ -3283,7 +3278,7 @@ void xfrm_state_fini(struct net *net) unsigned int sz; flush_work(&net->xfrm.state_hash_work); - xfrm_state_flush(net, 0, false, true); + xfrm_state_flush(net, IPSEC_PROTO_ANY, false); flush_work(&xfrm_state_gc_work); WARN_ON(!list_empty(&net->xfrm.state_all)); diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 1db18f470f42..684239018bec 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -2635,7 +2635,7 @@ static int xfrm_flush_sa(struct sk_buff *skb, struct nlmsghdr *nlh, struct xfrm_usersa_flush *p = nlmsg_data(nlh); int err; - err = xfrm_state_flush(net, p->proto, true, false); + err = xfrm_state_flush(net, p->proto, true); if (err) { if (err == -ESRCH) /* empty table */ return 0;