From 94d077c331730510d5611b438640a292097341f0 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 23 May 2025 17:11:17 +0200 Subject: [PATCH 01/34] xfrm: state: initialize state_ptrs earlier in xfrm_state_find In case of preemption, xfrm_state_look_at will find a different pcpu_id and look up states for that other CPU. If we matched a state for CPU2 in the state_cache while the lookup started on CPU1, we will jump to "found", but the "best" state that we got will be ignored and we will enter the "acquire" block. This block uses state_ptrs, which isn't initialized at this point. Let's initialize state_ptrs just after taking rcu_read_lock. This will also prevent a possible misuse in the future, if someone adjusts this function. Reported-by: syzbot+7ed9d47e15e88581dc5b@syzkaller.appspotmail.com Fixes: e952837f3ddb ("xfrm: state: fix out-of-bounds read during lookup") Signed-off-by: Sabrina Dubroca Reviewed-by: Florian Westphal Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 203b585c2ae2..2e2e95d2a06f 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1389,6 +1389,8 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, sequence = read_seqcount_begin(&net->xfrm.xfrm_state_hash_generation); rcu_read_lock(); + xfrm_hash_ptrs_get(net, &state_ptrs); + hlist_for_each_entry_rcu(x, &pol->state_cache_list, state_cache) { if (x->props.family == encap_family && x->props.reqid == tmpl->reqid && @@ -1429,8 +1431,6 @@ cached: else if (acquire_in_progress) /* XXX: acquire_in_progress should not happen */ WARN_ON(1); - xfrm_hash_ptrs_get(net, &state_ptrs); - h = __xfrm_dst_hash(daddr, saddr, tmpl->reqid, encap_family, state_ptrs.hmask); hlist_for_each_entry_rcu(x, state_ptrs.bydst + h, bydst) { #ifdef CONFIG_XFRM_OFFLOAD From 7eb11c0ab70777b9e5145a5ba1c0a2312c3980b2 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 23 May 2025 17:11:18 +0200 Subject: [PATCH 02/34] xfrm: state: use a consistent pcpu_id in xfrm_state_find If we get preempted during xfrm_state_find, we could run xfrm_state_look_at using a different pcpu_id than the one xfrm_state_find saw. This could lead to ignoring states that should have matched, and triggering acquires on a CPU that already has a pcpu state. xfrm_state_find starts on CPU1 pcpu_id = 1 lookup starts xfrm_state_look_at pcpu_id = 2 finds a state found: best->pcpu_num != pcpu_id (2 != 1) if (!x && !error && !acquire_in_progress) { ... xfrm_state_alloc xfrm_init_tempstate ... This can be avoided by passing the original pcpu_id down to all xfrm_state_look_at() calls. Also switch to raw_smp_processor_id, disabling preempting just to re-enable it immediately doesn't really make sense. Fixes: 1ddf9916ac09 ("xfrm: Add support for per cpu xfrm state handling.") Signed-off-by: Sabrina Dubroca Reviewed-by: Florian Westphal Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 2e2e95d2a06f..7e34fc94f668 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1307,14 +1307,8 @@ static void xfrm_hash_grow_check(struct net *net, int have_hash_collision) static void xfrm_state_look_at(struct xfrm_policy *pol, struct xfrm_state *x, const struct flowi *fl, unsigned short family, struct xfrm_state **best, int *acq_in_progress, - int *error) + int *error, unsigned int pcpu_id) { - /* We need the cpu id just as a lookup key, - * we don't require it to be stable. - */ - unsigned int pcpu_id = get_cpu(); - put_cpu(); - /* Resolution logic: * 1. There is a valid state with matching selector. Done. * 2. Valid state with inappropriate selector. Skip. @@ -1381,8 +1375,7 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, /* We need the cpu id just as a lookup key, * we don't require it to be stable. */ - pcpu_id = get_cpu(); - put_cpu(); + pcpu_id = raw_smp_processor_id(); to_put = NULL; @@ -1402,7 +1395,7 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, tmpl->id.proto == x->id.proto && (tmpl->id.spi == x->id.spi || !tmpl->id.spi)) xfrm_state_look_at(pol, x, fl, encap_family, - &best, &acquire_in_progress, &error); + &best, &acquire_in_progress, &error, pcpu_id); } if (best) @@ -1419,7 +1412,7 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, tmpl->id.proto == x->id.proto && (tmpl->id.spi == x->id.spi || !tmpl->id.spi)) xfrm_state_look_at(pol, x, fl, family, - &best, &acquire_in_progress, &error); + &best, &acquire_in_progress, &error, pcpu_id); } cached: @@ -1460,7 +1453,7 @@ cached: tmpl->id.proto == x->id.proto && (tmpl->id.spi == x->id.spi || !tmpl->id.spi)) xfrm_state_look_at(pol, x, fl, family, - &best, &acquire_in_progress, &error); + &best, &acquire_in_progress, &error, pcpu_id); } if (best || acquire_in_progress) goto found; @@ -1495,7 +1488,7 @@ cached: tmpl->id.proto == x->id.proto && (tmpl->id.spi == x->id.spi || !tmpl->id.spi)) xfrm_state_look_at(pol, x, fl, family, - &best, &acquire_in_progress, &error); + &best, &acquire_in_progress, &error, pcpu_id); } found: From c0f21029f123d1b15f8eddc8e3976bf0c8781c43 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 8 Jun 2025 10:42:53 +0300 Subject: [PATCH 03/34] xfrm: always initialize offload path Offload path is used for GRO with SW IPsec, and not just for HW offload. So initialize it anyway. Fixes: 585b64f5a620 ("xfrm: delay initialization of offload path till its actually requested") Reported-by: Sabrina Dubroca Closes: https://lore.kernel.org/all/aEGW_5HfPqU1rFjl@krikkit Signed-off-by: Leon Romanovsky Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 2 +- net/xfrm/xfrm_device.c | 1 - net/xfrm/xfrm_state.c | 6 ++---- net/xfrm/xfrm_user.c | 1 + 4 files changed, 4 insertions(+), 6 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index a21e276dbe44..e45a275fca26 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -474,7 +474,7 @@ struct xfrm_type_offload { int xfrm_register_type_offload(const struct xfrm_type_offload *type, unsigned short family); void xfrm_unregister_type_offload(const struct xfrm_type_offload *type, unsigned short family); -void xfrm_set_type_offload(struct xfrm_state *x); +void xfrm_set_type_offload(struct xfrm_state *x, bool try_load); static inline void xfrm_unset_type_offload(struct xfrm_state *x) { if (!x->type_offload) diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 81fd486b5e56..d2819baea414 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -305,7 +305,6 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, return -EINVAL; } - xfrm_set_type_offload(x); if (!x->type_offload) { NL_SET_ERR_MSG(extack, "Type doesn't support offload"); dev_put(dev); diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 7e34fc94f668..c7e6472c623d 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -424,11 +424,10 @@ void xfrm_unregister_type_offload(const struct xfrm_type_offload *type, } EXPORT_SYMBOL(xfrm_unregister_type_offload); -void xfrm_set_type_offload(struct xfrm_state *x) +void xfrm_set_type_offload(struct xfrm_state *x, bool try_load) { const struct xfrm_type_offload *type = NULL; struct xfrm_state_afinfo *afinfo; - bool try_load = true; retry: afinfo = xfrm_state_get_afinfo(x->props.family); @@ -607,6 +606,7 @@ static void ___xfrm_state_destroy(struct xfrm_state *x) kfree(x->coaddr); kfree(x->replay_esn); kfree(x->preplay_esn); + xfrm_unset_type_offload(x); if (x->type) { x->type->destructor(x); xfrm_put_type(x->type); @@ -780,8 +780,6 @@ void xfrm_dev_state_free(struct xfrm_state *x) struct xfrm_dev_offload *xso = &x->xso; struct net_device *dev = READ_ONCE(xso->dev); - xfrm_unset_type_offload(x); - if (dev && dev->xfrmdev_ops) { spin_lock_bh(&xfrm_state_dev_gc_lock); if (!hlist_unhashed(&x->dev_gclist)) diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 59f258daf830..1db18f470f42 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -977,6 +977,7 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, /* override default values from above */ xfrm_update_ae_params(x, attrs, 0); + xfrm_set_type_offload(x, attrs[XFRMA_OFFLOAD_DEV]); /* configure the hardware if offload is requested */ if (attrs[XFRMA_OFFLOAD_DEV]) { err = xfrm_dev_state_add(net, x, From 3ac9e29211fa2df5539ba0d742c8fe9fe95fdc79 Mon Sep 17 00:00:00 2001 From: Tobias Brunner Date: Tue, 24 Jun 2025 14:47:20 +0200 Subject: [PATCH 04/34] xfrm: Set transport header to fix UDP GRO handling The referenced commit replaced a call to __xfrm4|6_udp_encap_rcv() with a custom check for non-ESP markers. But what the called function also did was setting the transport header to the ESP header. The function that follows, esp4|6_gro_receive(), relies on that being set when it calls xfrm_parse_spi(). We have to set the full offset as the skb's head was not moved yet so adding just the UDP header length won't work. Fixes: e3fd05777685 ("xfrm: Fix UDP GRO handling for some corner cases") Signed-off-by: Tobias Brunner Signed-off-by: Steffen Klassert --- net/ipv4/xfrm4_input.c | 3 +++ net/ipv6/xfrm6_input.c | 3 +++ 2 files changed, 6 insertions(+) diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index 0d31a8c108d4..f28cfd88eaf5 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -202,6 +202,9 @@ struct sk_buff *xfrm4_gro_udp_encap_rcv(struct sock *sk, struct list_head *head, if (len <= sizeof(struct ip_esp_hdr) || udpdata32[0] == 0) goto out; + /* set the transport header to ESP */ + skb_set_transport_header(skb, offset); + NAPI_GRO_CB(skb)->proto = IPPROTO_UDP; pp = call_gro_receive(ops->callbacks.gro_receive, head, skb); diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c index 841c81abaaf4..9005fc156a20 100644 --- a/net/ipv6/xfrm6_input.c +++ b/net/ipv6/xfrm6_input.c @@ -202,6 +202,9 @@ struct sk_buff *xfrm6_gro_udp_encap_rcv(struct sock *sk, struct list_head *head, if (len <= sizeof(struct ip_esp_hdr) || udpdata32[0] == 0) goto out; + /* set the transport header to ESP */ + skb_set_transport_header(skb, offset); + NAPI_GRO_CB(skb)->proto = IPPROTO_UDP; pp = call_gro_receive(ops->callbacks.gro_receive, head, skb); From 2ca58d87ebae20906cf808ef813d747db0177a18 Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Tue, 24 Jun 2025 15:11:15 +0200 Subject: [PATCH 05/34] xfrm: ipcomp: adjust transport header after decompressing The skb transport header pointer needs to be adjusted by network header pointer plus the size of the ipcomp header. This shows up when running traffic over ipcomp using transport mode. After being reinjected, packets are dropped because the header isn't adjusted properly and some checks can be triggered. E.g the skb is mistakenly considered as IP fragmented packet and later dropped. kworker/30:1-mm 443 [030] 102.055250: skb:kfree_skb:skbaddr=0xffff8f104aa3ce00 rx_sk=( ffffffff8419f1f4 sk_skb_reason_drop+0x94 ([kernel.kallsyms]) ffffffff8419f1f4 sk_skb_reason_drop+0x94 ([kernel.kallsyms]) ffffffff84281420 ip_defrag+0x4b0 ([kernel.kallsyms]) ffffffff8428006e ip_local_deliver+0x4e ([kernel.kallsyms]) ffffffff8432afb1 xfrm_trans_reinject+0xe1 ([kernel.kallsyms]) ffffffff83758230 process_one_work+0x190 ([kernel.kallsyms]) ffffffff83758f37 worker_thread+0x2d7 ([kernel.kallsyms]) ffffffff83761cc9 kthread+0xf9 ([kernel.kallsyms]) ffffffff836c3437 ret_from_fork+0x197 ([kernel.kallsyms]) ffffffff836718da ret_from_fork_asm+0x1a ([kernel.kallsyms]) Fixes: eb2953d26971 ("xfrm: ipcomp: Use crypto_acomp interface") Link: https://bugzilla.suse.com/1244532 Signed-off-by: Fernando Fernandez Mancera Acked-by: Herbert Xu Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_ipcomp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/xfrm/xfrm_ipcomp.c b/net/xfrm/xfrm_ipcomp.c index 907c3ccb440d..a38545413b80 100644 --- a/net/xfrm/xfrm_ipcomp.c +++ b/net/xfrm/xfrm_ipcomp.c @@ -97,7 +97,7 @@ static int ipcomp_input_done2(struct sk_buff *skb, int err) struct ip_comp_hdr *ipch = ip_comp_hdr(skb); const int plen = skb->len; - skb_reset_transport_header(skb); + skb->transport_header = skb->network_header + sizeof(*ipch); return ipcomp_post_acomp(skb, err, 0) ?: skb->len < (plen + sizeof(ip_comp_hdr)) ? -EINVAL : From a90b2a1aaacbcf0f91d7e4868ad6c51c5dee814b Mon Sep 17 00:00:00 2001 From: Eyal Birger Date: Thu, 3 Jul 2025 10:02:58 -0700 Subject: [PATCH 06/34] xfrm: interface: fix use-after-free after changing collect_md xfrm interface collect_md property on xfrm interfaces can only be set on device creation, thus xfrmi_changelink() should fail when called on such interfaces. The check to enforce this was done only in the case where the xi was returned from xfrmi_locate() which doesn't look for the collect_md interface, and thus the validation was never reached. Calling changelink would thus errornously place the special interface xi in the xfrmi_net->xfrmi hash, but since it also exists in the xfrmi_net->collect_md_xfrmi pointer it would lead to a double free when the net namespace was taken down [1]. Change the check to use the xi from netdev_priv which is available earlier in the function to prevent changes in xfrm collect_md interfaces. [1] resulting oops: [ 8.516540] kernel BUG at net/core/dev.c:12029! [ 8.516552] Oops: invalid opcode: 0000 [#1] SMP NOPTI [ 8.516559] CPU: 0 UID: 0 PID: 12 Comm: kworker/u80:0 Not tainted 6.15.0-virtme #5 PREEMPT(voluntary) [ 8.516565] Hardware name: QEMU Ubuntu 24.04 PC (i440FX + PIIX, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 [ 8.516569] Workqueue: netns cleanup_net [ 8.516579] RIP: 0010:unregister_netdevice_many_notify+0x101/0xab0 [ 8.516590] Code: 90 0f 0b 90 48 8b b0 78 01 00 00 48 8b 90 80 01 00 00 48 89 56 08 48 89 32 4c 89 80 78 01 00 00 48 89 b8 80 01 00 00 eb ac 90 <0f> 0b 48 8b 45 00 4c 8d a0 88 fe ff ff 48 39 c5 74 5c 41 80 bc 24 [ 8.516593] RSP: 0018:ffffa93b8006bd30 EFLAGS: 00010206 [ 8.516598] RAX: ffff98fe4226e000 RBX: ffffa93b8006bd58 RCX: ffffa93b8006bc60 [ 8.516601] RDX: 0000000000000004 RSI: 0000000000000000 RDI: dead000000000122 [ 8.516603] RBP: ffffa93b8006bdd8 R08: dead000000000100 R09: ffff98fe4133c100 [ 8.516605] R10: 0000000000000000 R11: 00000000000003d2 R12: ffffa93b8006be00 [ 8.516608] R13: ffffffff96c1a510 R14: ffffffff96c1a510 R15: ffffa93b8006be00 [ 8.516615] FS: 0000000000000000(0000) GS:ffff98fee73b7000(0000) knlGS:0000000000000000 [ 8.516619] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 8.516622] CR2: 00007fcd2abd0700 CR3: 000000003aa40000 CR4: 0000000000752ef0 [ 8.516625] PKRU: 55555554 [ 8.516627] Call Trace: [ 8.516632] [ 8.516635] ? rtnl_is_locked+0x15/0x20 [ 8.516641] ? unregister_netdevice_queue+0x29/0xf0 [ 8.516650] ops_undo_list+0x1f2/0x220 [ 8.516659] cleanup_net+0x1ad/0x2e0 [ 8.516664] process_one_work+0x160/0x380 [ 8.516673] worker_thread+0x2aa/0x3c0 [ 8.516679] ? __pfx_worker_thread+0x10/0x10 [ 8.516686] kthread+0xfb/0x200 [ 8.516690] ? __pfx_kthread+0x10/0x10 [ 8.516693] ? __pfx_kthread+0x10/0x10 [ 8.516697] ret_from_fork+0x82/0xf0 [ 8.516705] ? __pfx_kthread+0x10/0x10 [ 8.516709] ret_from_fork_asm+0x1a/0x30 [ 8.516718] Fixes: abc340b38ba2 ("xfrm: interface: support collect metadata mode") Reported-by: Lonial Con Signed-off-by: Eyal Birger Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_interface_core.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/net/xfrm/xfrm_interface_core.c b/net/xfrm/xfrm_interface_core.c index cb1e12740c87..330a05286a56 100644 --- a/net/xfrm/xfrm_interface_core.c +++ b/net/xfrm/xfrm_interface_core.c @@ -875,7 +875,7 @@ static int xfrmi_changelink(struct net_device *dev, struct nlattr *tb[], return -EINVAL; } - if (p.collect_md) { + if (p.collect_md || xi->p.collect_md) { NL_SET_ERR_MSG(extack, "collect_md can't be changed"); return -EINVAL; } @@ -886,11 +886,6 @@ static int xfrmi_changelink(struct net_device *dev, struct nlattr *tb[], } else { if (xi->dev != dev) return -EEXIST; - if (xi->p.collect_md) { - NL_SET_ERR_MSG(extack, - "device can't be changed to collect_md"); - return -EINVAL; - } } return xfrmi_update(xi, &p); From b441cf3f8c4b8576639d20c8eb4aa32917602ecd Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 4 Jul 2025 16:54:33 +0200 Subject: [PATCH 07/34] xfrm: delete x->tunnel as we delete x The ipcomp fallback tunnels currently get deleted (from the various lists and hashtables) as the last user state that needed that fallback is destroyed (not deleted). If a reference to that user state still exists, the fallback state will remain on the hashtables/lists, triggering the WARN in xfrm_state_fini. Because of those remaining references, the fix in commit f75a2804da39 ("xfrm: destroy xfrm_state synchronously on net exit path") is not complete. We recently fixed one such situation in TCP due to defered freeing of skbs (commit 9b6412e6979f ("tcp: drop secpath at the same time as we currently drop dst")). This can also happen due to IP reassembly: skbs with a secpath remain on the reassembly queue until netns destruction. If we can't guarantee that the queues are flushed by the time xfrm_state_fini runs, there may still be references to a (user) xfrm_state, preventing the timely deletion of the corresponding fallback state. Instead of chasing each instance of skbs holding a secpath one by one, this patch fixes the issue directly within xfrm, by deleting the fallback state as soon as the last user state depending on it has been deleted. Destruction will still happen when the final reference is dropped. A separate lockdep class for the fallback state is required since we're going to lock x->tunnel while x is locked. Fixes: 9d4139c76905 ("netns xfrm: per-netns xfrm_state_all list") Signed-off-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 1 - net/ipv4/ipcomp.c | 2 ++ net/ipv6/ipcomp6.c | 2 ++ net/ipv6/xfrm6_tunnel.c | 2 +- net/xfrm/xfrm_ipcomp.c | 1 - net/xfrm/xfrm_state.c | 19 ++++++++----------- 6 files changed, 13 insertions(+), 14 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index e45a275fca26..91d52a380e37 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -441,7 +441,6 @@ int xfrm_input_register_afinfo(const struct xfrm_input_afinfo *afinfo); int xfrm_input_unregister_afinfo(const struct xfrm_input_afinfo *afinfo); void xfrm_flush_gc(void); -void xfrm_state_delete_tunnel(struct xfrm_state *x); struct xfrm_type { struct module *owner; diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index 5a4fb2539b08..9a45aed508d1 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -54,6 +54,7 @@ static int ipcomp4_err(struct sk_buff *skb, u32 info) } /* We always hold one tunnel user reference to indicate a tunnel */ +static struct lock_class_key xfrm_state_lock_key; static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x) { struct net *net = xs_net(x); @@ -62,6 +63,7 @@ static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x) t = xfrm_state_alloc(net); if (!t) goto out; + lockdep_set_class(&t->lock, &xfrm_state_lock_key); t->id.proto = IPPROTO_IPIP; t->id.spi = x->props.saddr.a4; diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c index 72d4858dec18..8607569de34f 100644 --- a/net/ipv6/ipcomp6.c +++ b/net/ipv6/ipcomp6.c @@ -71,6 +71,7 @@ static int ipcomp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, return 0; } +static struct lock_class_key xfrm_state_lock_key; static struct xfrm_state *ipcomp6_tunnel_create(struct xfrm_state *x) { struct net *net = xs_net(x); @@ -79,6 +80,7 @@ static struct xfrm_state *ipcomp6_tunnel_create(struct xfrm_state *x) t = xfrm_state_alloc(net); if (!t) goto out; + lockdep_set_class(&t->lock, &xfrm_state_lock_key); t->id.proto = IPPROTO_IPV6; t->id.spi = xfrm6_tunnel_alloc_spi(net, (xfrm_address_t *)&x->props.saddr); diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c index bf140ef781c1..7fd8bc08e6eb 100644 --- a/net/ipv6/xfrm6_tunnel.c +++ b/net/ipv6/xfrm6_tunnel.c @@ -334,8 +334,8 @@ static void __net_exit xfrm6_tunnel_net_exit(struct net *net) struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); unsigned int i; - xfrm_flush_gc(); xfrm_state_flush(net, 0, false, true); + xfrm_flush_gc(); for (i = 0; i < XFRM6_TUNNEL_SPI_BYADDR_HSIZE; i++) WARN_ON_ONCE(!hlist_empty(&xfrm6_tn->spi_byaddr[i])); diff --git a/net/xfrm/xfrm_ipcomp.c b/net/xfrm/xfrm_ipcomp.c index a38545413b80..43fdc6ed8dd1 100644 --- a/net/xfrm/xfrm_ipcomp.c +++ b/net/xfrm/xfrm_ipcomp.c @@ -313,7 +313,6 @@ void ipcomp_destroy(struct xfrm_state *x) struct ipcomp_data *ipcd = x->data; if (!ipcd) return; - xfrm_state_delete_tunnel(x); ipcomp_free_data(ipcd); kfree(ipcd); } diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index c7e6472c623d..f7110a658897 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -811,6 +811,7 @@ void __xfrm_state_destroy(struct xfrm_state *x, bool sync) } EXPORT_SYMBOL(__xfrm_state_destroy); +static void xfrm_state_delete_tunnel(struct xfrm_state *x); int __xfrm_state_delete(struct xfrm_state *x) { struct net *net = xs_net(x); @@ -838,6 +839,8 @@ int __xfrm_state_delete(struct xfrm_state *x) xfrm_dev_state_delete(x); + xfrm_state_delete_tunnel(x); + /* All xfrm_state objects are created by xfrm_state_alloc. * The xfrm_state_alloc call gives a reference, and that * is what we are dropping here. @@ -941,10 +944,7 @@ restart: err = xfrm_state_delete(x); xfrm_audit_state_delete(x, err ? 0 : 1, task_valid); - if (sync) - xfrm_state_put_sync(x); - else - xfrm_state_put(x); + xfrm_state_put(x); if (!err) cnt++; @@ -3068,20 +3068,17 @@ void xfrm_flush_gc(void) } EXPORT_SYMBOL(xfrm_flush_gc); -/* Temporarily located here until net/xfrm/xfrm_tunnel.c is created */ -void xfrm_state_delete_tunnel(struct xfrm_state *x) +static void xfrm_state_delete_tunnel(struct xfrm_state *x) { if (x->tunnel) { struct xfrm_state *t = x->tunnel; - if (atomic_read(&t->tunnel_users) == 2) + if (atomic_dec_return(&t->tunnel_users) == 1) xfrm_state_delete(t); - atomic_dec(&t->tunnel_users); - xfrm_state_put_sync(t); + xfrm_state_put(t); x->tunnel = NULL; } } -EXPORT_SYMBOL(xfrm_state_delete_tunnel); u32 xfrm_state_mtu(struct xfrm_state *x, int mtu) { @@ -3286,8 +3283,8 @@ void xfrm_state_fini(struct net *net) unsigned int sz; flush_work(&net->xfrm.state_hash_work); - flush_work(&xfrm_state_gc_work); xfrm_state_flush(net, 0, false, true); + flush_work(&xfrm_state_gc_work); WARN_ON(!list_empty(&net->xfrm.state_all)); From 2a198bbec6913ae1c90ec963750003c6213668c7 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 4 Jul 2025 16:54:34 +0200 Subject: [PATCH 08/34] Revert "xfrm: destroy xfrm_state synchronously on net exit path" This reverts commit f75a2804da391571563c4b6b29e7797787332673. With all states (whether user or kern) removed from the hashtables during deletion, there's no need for synchronous destruction of states. xfrm6_tunnel states still need to have been destroyed (which will be the case when its last user is deleted (not destroyed)) so that xfrm6_tunnel_free_spi removes it from the per-netns hashtable before the netns is destroyed. This has the benefit of skipping one synchronize_rcu per state (in __xfrm_state_destroy(sync=true)) when we exit a netns. Signed-off-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 12 +++--------- net/ipv6/xfrm6_tunnel.c | 2 +- net/key/af_key.c | 2 +- net/xfrm/xfrm_state.c | 23 +++++++++-------------- net/xfrm/xfrm_user.c | 2 +- 5 files changed, 15 insertions(+), 26 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 91d52a380e37..f3014e4f54fc 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -915,7 +915,7 @@ static inline void xfrm_pols_put(struct xfrm_policy **pols, int npols) xfrm_pol_put(pols[i]); } -void __xfrm_state_destroy(struct xfrm_state *, bool); +void __xfrm_state_destroy(struct xfrm_state *); static inline void __xfrm_state_put(struct xfrm_state *x) { @@ -925,13 +925,7 @@ static inline void __xfrm_state_put(struct xfrm_state *x) static inline void xfrm_state_put(struct xfrm_state *x) { if (refcount_dec_and_test(&x->refcnt)) - __xfrm_state_destroy(x, false); -} - -static inline void xfrm_state_put_sync(struct xfrm_state *x) -{ - if (refcount_dec_and_test(&x->refcnt)) - __xfrm_state_destroy(x, true); + __xfrm_state_destroy(x); } static inline void xfrm_state_hold(struct xfrm_state *x) @@ -1769,7 +1763,7 @@ struct xfrmk_spdinfo { struct xfrm_state *xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq, u32 pcpu_num); int xfrm_state_delete(struct xfrm_state *x); -int xfrm_state_flush(struct net *net, u8 proto, bool task_valid, bool sync); +int xfrm_state_flush(struct net *net, u8 proto, bool task_valid); int xfrm_dev_state_flush(struct net *net, struct net_device *dev, bool task_valid); int xfrm_dev_policy_flush(struct net *net, struct net_device *dev, bool task_valid); diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c index 7fd8bc08e6eb..5120a763da0d 100644 --- a/net/ipv6/xfrm6_tunnel.c +++ b/net/ipv6/xfrm6_tunnel.c @@ -334,7 +334,7 @@ static void __net_exit xfrm6_tunnel_net_exit(struct net *net) struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); unsigned int i; - xfrm_state_flush(net, 0, false, true); + xfrm_state_flush(net, IPSEC_PROTO_ANY, false); xfrm_flush_gc(); for (i = 0; i < XFRM6_TUNNEL_SPI_BYADDR_HSIZE; i++) diff --git a/net/key/af_key.c b/net/key/af_key.c index efc2a91f4c48..b5d761700776 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -1766,7 +1766,7 @@ static int pfkey_flush(struct sock *sk, struct sk_buff *skb, const struct sadb_m if (proto == 0) return -EINVAL; - err = xfrm_state_flush(net, proto, true, false); + err = xfrm_state_flush(net, proto, true); err2 = unicast_flush_resp(sk, hdr); if (err || err2) { if (err == -ESRCH) /* empty table - go quietly */ diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index f7110a658897..327a1a6f892c 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -592,7 +592,7 @@ void xfrm_state_free(struct xfrm_state *x) } EXPORT_SYMBOL(xfrm_state_free); -static void ___xfrm_state_destroy(struct xfrm_state *x) +static void xfrm_state_gc_destroy(struct xfrm_state *x) { if (x->mode_cbs && x->mode_cbs->destroy_state) x->mode_cbs->destroy_state(x); @@ -631,7 +631,7 @@ static void xfrm_state_gc_task(struct work_struct *work) synchronize_rcu(); hlist_for_each_entry_safe(x, tmp, &gc_list, gclist) - ___xfrm_state_destroy(x); + xfrm_state_gc_destroy(x); } static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me) @@ -795,19 +795,14 @@ void xfrm_dev_state_free(struct xfrm_state *x) } #endif -void __xfrm_state_destroy(struct xfrm_state *x, bool sync) +void __xfrm_state_destroy(struct xfrm_state *x) { WARN_ON(x->km.state != XFRM_STATE_DEAD); - if (sync) { - synchronize_rcu(); - ___xfrm_state_destroy(x); - } else { - spin_lock_bh(&xfrm_state_gc_lock); - hlist_add_head(&x->gclist, &xfrm_state_gc_list); - spin_unlock_bh(&xfrm_state_gc_lock); - schedule_work(&xfrm_state_gc_work); - } + spin_lock_bh(&xfrm_state_gc_lock); + hlist_add_head(&x->gclist, &xfrm_state_gc_list); + spin_unlock_bh(&xfrm_state_gc_lock); + schedule_work(&xfrm_state_gc_work); } EXPORT_SYMBOL(__xfrm_state_destroy); @@ -922,7 +917,7 @@ xfrm_dev_state_flush_secctx_check(struct net *net, struct net_device *dev, bool } #endif -int xfrm_state_flush(struct net *net, u8 proto, bool task_valid, bool sync) +int xfrm_state_flush(struct net *net, u8 proto, bool task_valid) { int i, err = 0, cnt = 0; @@ -3283,7 +3278,7 @@ void xfrm_state_fini(struct net *net) unsigned int sz; flush_work(&net->xfrm.state_hash_work); - xfrm_state_flush(net, 0, false, true); + xfrm_state_flush(net, IPSEC_PROTO_ANY, false); flush_work(&xfrm_state_gc_work); WARN_ON(!list_empty(&net->xfrm.state_all)); diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 1db18f470f42..684239018bec 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -2635,7 +2635,7 @@ static int xfrm_flush_sa(struct sk_buff *skb, struct nlmsghdr *nlh, struct xfrm_usersa_flush *p = nlmsg_data(nlh); int err; - err = xfrm_state_flush(net, p->proto, true, false); + err = xfrm_state_flush(net, p->proto, true); if (err) { if (err == -ESRCH) /* empty table */ return 0; From bddbe13d36a02d5097b99cf02354d5752ad1ac60 Mon Sep 17 00:00:00 2001 From: Ma Ke Date: Thu, 17 Jul 2025 10:23:07 +0800 Subject: [PATCH 09/34] bus: fsl-mc: Fix potential double device reference in fsl_mc_get_endpoint() The fsl_mc_get_endpoint() function may call fsl_mc_device_lookup() twice, which would increment the device's reference count twice if both lookups find a device. This could lead to a reference count leak. Found by code review. Cc: stable@vger.kernel.org Fixes: 1ac210d128ef ("bus: fsl-mc: add the fsl_mc_get_endpoint function") Signed-off-by: Ma Ke Tested-by: Ioana Ciornei Reviewed-by: Simon Horman Fixes: 8567494cebe5 ("bus: fsl-mc: rescan devices if endpoint not found") Link: https://patch.msgid.link/20250717022309.3339976-1-make24@iscas.ac.cn Signed-off-by: Jakub Kicinski --- drivers/bus/fsl-mc/fsl-mc-bus.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/drivers/bus/fsl-mc/fsl-mc-bus.c b/drivers/bus/fsl-mc/fsl-mc-bus.c index 7671bd158545..c1c0a4759c7e 100644 --- a/drivers/bus/fsl-mc/fsl-mc-bus.c +++ b/drivers/bus/fsl-mc/fsl-mc-bus.c @@ -943,6 +943,7 @@ struct fsl_mc_device *fsl_mc_get_endpoint(struct fsl_mc_device *mc_dev, struct fsl_mc_obj_desc endpoint_desc = {{ 0 }}; struct dprc_endpoint endpoint1 = {{ 0 }}; struct dprc_endpoint endpoint2 = {{ 0 }}; + struct fsl_mc_bus *mc_bus; int state, err; mc_bus_dev = to_fsl_mc_device(mc_dev->dev.parent); @@ -966,6 +967,8 @@ struct fsl_mc_device *fsl_mc_get_endpoint(struct fsl_mc_device *mc_dev, strcpy(endpoint_desc.type, endpoint2.type); endpoint_desc.id = endpoint2.id; endpoint = fsl_mc_device_lookup(&endpoint_desc, mc_bus_dev); + if (endpoint) + return endpoint; /* * We know that the device has an endpoint because we verified by @@ -973,17 +976,13 @@ struct fsl_mc_device *fsl_mc_get_endpoint(struct fsl_mc_device *mc_dev, * yet discovered by the fsl-mc bus, thus the lookup returned NULL. * Force a rescan of the devices in this container and retry the lookup. */ - if (!endpoint) { - struct fsl_mc_bus *mc_bus = to_fsl_mc_bus(mc_bus_dev); - - if (mutex_trylock(&mc_bus->scan_mutex)) { - err = dprc_scan_objects(mc_bus_dev, true); - mutex_unlock(&mc_bus->scan_mutex); - } - - if (err < 0) - return ERR_PTR(err); + mc_bus = to_fsl_mc_bus(mc_bus_dev); + if (mutex_trylock(&mc_bus->scan_mutex)) { + err = dprc_scan_objects(mc_bus_dev, true); + mutex_unlock(&mc_bus->scan_mutex); } + if (err < 0) + return ERR_PTR(err); endpoint = fsl_mc_device_lookup(&endpoint_desc, mc_bus_dev); /* From ee9f3a81ab08dfe0538dbd1746f81fd4d5147fdc Mon Sep 17 00:00:00 2001 From: Ma Ke Date: Thu, 17 Jul 2025 10:23:08 +0800 Subject: [PATCH 10/34] dpaa2-eth: Fix device reference count leak in MAC endpoint handling The fsl_mc_get_endpoint() function uses device_find_child() for localization, which implicitly calls get_device() to increment the device's reference count before returning the pointer. However, the caller dpaa2_eth_connect_mac() fails to properly release this reference in multiple scenarios. We should call put_device() to decrement reference count properly. As comment of device_find_child() says, 'NOTE: you will need to drop the reference with put_device() after use'. Found by code review. Cc: stable@vger.kernel.org Fixes: 719479230893 ("dpaa2-eth: add MAC/PHY support through phylink") Signed-off-by: Ma Ke Tested-by: Ioana Ciornei Reviewed-by: Ioana Ciornei Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250717022309.3339976-2-make24@iscas.ac.cn Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c index b82f121cadad..0f4efd505332 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c @@ -4666,12 +4666,19 @@ static int dpaa2_eth_connect_mac(struct dpaa2_eth_priv *priv) return PTR_ERR(dpmac_dev); } - if (IS_ERR(dpmac_dev) || dpmac_dev->dev.type != &fsl_mc_bus_dpmac_type) + if (IS_ERR(dpmac_dev)) return 0; + if (dpmac_dev->dev.type != &fsl_mc_bus_dpmac_type) { + err = 0; + goto out_put_device; + } + mac = kzalloc(sizeof(struct dpaa2_mac), GFP_KERNEL); - if (!mac) - return -ENOMEM; + if (!mac) { + err = -ENOMEM; + goto out_put_device; + } mac->mc_dev = dpmac_dev; mac->mc_io = priv->mc_io; @@ -4705,6 +4712,8 @@ err_close_mac: dpaa2_mac_close(mac); err_free_mac: kfree(mac); +out_put_device: + put_device(&dpmac_dev->dev); return err; } From 96e056ffba912ef18a72177f71956a5b347b5177 Mon Sep 17 00:00:00 2001 From: Ma Ke Date: Thu, 17 Jul 2025 10:23:09 +0800 Subject: [PATCH 11/34] dpaa2-switch: Fix device reference count leak in MAC endpoint handling The fsl_mc_get_endpoint() function uses device_find_child() for localization, which implicitly calls get_device() to increment the device's reference count before returning the pointer. However, the caller dpaa2_switch_port_connect_mac() fails to properly release this reference in multiple scenarios. We should call put_device() to decrement reference count properly. As comment of device_find_child() says, 'NOTE: you will need to drop the reference with put_device() after use'. Found by code review. Cc: stable@vger.kernel.org Fixes: 84cba72956fd ("dpaa2-switch: integrate the MAC endpoint support") Signed-off-by: Ma Ke Tested-by: Ioana Ciornei Reviewed-by: Ioana Ciornei Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250717022309.3339976-3-make24@iscas.ac.cn Signed-off-by: Jakub Kicinski --- .../net/ethernet/freescale/dpaa2/dpaa2-switch.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c index 147a93bf9fa9..4643a3380618 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c @@ -1448,12 +1448,19 @@ static int dpaa2_switch_port_connect_mac(struct ethsw_port_priv *port_priv) if (PTR_ERR(dpmac_dev) == -EPROBE_DEFER) return PTR_ERR(dpmac_dev); - if (IS_ERR(dpmac_dev) || dpmac_dev->dev.type != &fsl_mc_bus_dpmac_type) + if (IS_ERR(dpmac_dev)) return 0; + if (dpmac_dev->dev.type != &fsl_mc_bus_dpmac_type) { + err = 0; + goto out_put_device; + } + mac = kzalloc(sizeof(*mac), GFP_KERNEL); - if (!mac) - return -ENOMEM; + if (!mac) { + err = -ENOMEM; + goto out_put_device; + } mac->mc_dev = dpmac_dev; mac->mc_io = port_priv->ethsw_data->mc_io; @@ -1483,6 +1490,8 @@ err_close_mac: dpaa2_mac_close(mac); err_free_mac: kfree(mac); +out_put_device: + put_device(&dpmac_dev->dev); return err; } From 6e86fb73de0fe3ec5cdcd5873ad1d6005f295b64 Mon Sep 17 00:00:00 2001 From: Himanshu Mittal Date: Thu, 17 Jul 2025 15:12:20 +0530 Subject: [PATCH 12/34] net: ti: icssg-prueth: Fix buffer allocation for ICSSG Fixes overlapping buffer allocation for ICSSG peripheral used for storing packets to be received/transmitted. There are 3 buffers: 1. Buffer for Locally Injected Packets 2. Buffer for Forwarding Packets 3. Buffer for Host Egress Packets In existing allocation buffers for 2. and 3. are overlapping causing packet corruption. Packet corruption observations: During tcp iperf testing, due to overlapping buffers the received ack packet overwrites the packet to be transmitted. So, we see packets on wire with the ack packet content inside the content of next TCP packet from sender device. Details for AM64x switch mode: -> Allocation by existing driver: +---------+-------------------------------------------------------------+ | | SLICE 0 | SLICE 1 | | +------+--------------+--------+------+--------------+--------+ | | Slot | Base Address | Size | Slot | Base Address | Size | |---------+------+--------------+--------+------+--------------+--------+ | | 0 | 70000000 | 0x2000 | 0 | 70010000 | 0x2000 | | | 1 | 70002000 | 0x2000 | 1 | 70012000 | 0x2000 | | | 2 | 70004000 | 0x2000 | 2 | 70014000 | 0x2000 | | FWD | 3 | 70006000 | 0x2000 | 3 | 70016000 | 0x2000 | | Buffers | 4 | 70008000 | 0x2000 | 4 | 70018000 | 0x2000 | | | 5 | 7000A000 | 0x2000 | 5 | 7001A000 | 0x2000 | | | 6 | 7000C000 | 0x2000 | 6 | 7001C000 | 0x2000 | | | 7 | 7000E000 | 0x2000 | 7 | 7001E000 | 0x2000 | +---------+------+--------------+--------+------+--------------+--------+ | | 8 | 70020000 | 0x1000 | 8 | 70028000 | 0x1000 | | | 9 | 70021000 | 0x1000 | 9 | 70029000 | 0x1000 | | | 10 | 70022000 | 0x1000 | 10 | 7002A000 | 0x1000 | | Our | 11 | 70023000 | 0x1000 | 11 | 7002B000 | 0x1000 | | LI | 12 | 00000000 | 0x0 | 12 | 00000000 | 0x0 | | Buffers | 13 | 00000000 | 0x0 | 13 | 00000000 | 0x0 | | | 14 | 00000000 | 0x0 | 14 | 00000000 | 0x0 | | | 15 | 00000000 | 0x0 | 15 | 00000000 | 0x0 | +---------+------+--------------+--------+------+--------------+--------+ | | 16 | 70024000 | 0x1000 | 16 | 7002C000 | 0x1000 | | | 17 | 70025000 | 0x1000 | 17 | 7002D000 | 0x1000 | | | 18 | 70026000 | 0x1000 | 18 | 7002E000 | 0x1000 | | Their | 19 | 70027000 | 0x1000 | 19 | 7002F000 | 0x1000 | | LI | 20 | 00000000 | 0x0 | 20 | 00000000 | 0x0 | | Buffers | 21 | 00000000 | 0x0 | 21 | 00000000 | 0x0 | | | 22 | 00000000 | 0x0 | 22 | 00000000 | 0x0 | | | 23 | 00000000 | 0x0 | 23 | 00000000 | 0x0 | +---------+------+--------------+--------+------+--------------+--------+ --> here 16, 17, 18, 19 overlapping with below express buffer +-----+-----------------------------------------------+ | | SLICE 0 | SLICE 1 | | +------------+----------+------------+----------+ | | Start addr | End addr | Start addr | End addr | +-----+------------+----------+------------+----------+ | EXP | 70024000 | 70028000 | 7002C000 | 70030000 | <-- Overlapping | PRE | 70030000 | 70033800 | 70034000 | 70037800 | +-----+------------+----------+------------+----------+ +---------------------+----------+----------+ | | SLICE 0 | SLICE 1 | +---------------------+----------+----------+ | Default Drop Offset | 00000000 | 00000000 | <-- Field not configured +---------------------+----------+----------+ -> Allocation this patch brings: +---------+-------------------------------------------------------------+ | | SLICE 0 | SLICE 1 | | +------+--------------+--------+------+--------------+--------+ | | Slot | Base Address | Size | Slot | Base Address | Size | |---------+------+--------------+--------+------+--------------+--------+ | | 0 | 70000000 | 0x2000 | 0 | 70040000 | 0x2000 | | | 1 | 70002000 | 0x2000 | 1 | 70042000 | 0x2000 | | | 2 | 70004000 | 0x2000 | 2 | 70044000 | 0x2000 | | FWD | 3 | 70006000 | 0x2000 | 3 | 70046000 | 0x2000 | | Buffers | 4 | 70008000 | 0x2000 | 4 | 70048000 | 0x2000 | | | 5 | 7000A000 | 0x2000 | 5 | 7004A000 | 0x2000 | | | 6 | 7000C000 | 0x2000 | 6 | 7004C000 | 0x2000 | | | 7 | 7000E000 | 0x2000 | 7 | 7004E000 | 0x2000 | +---------+------+--------------+--------+------+--------------+--------+ | | 8 | 70010000 | 0x1000 | 8 | 70050000 | 0x1000 | | | 9 | 70011000 | 0x1000 | 9 | 70051000 | 0x1000 | | | 10 | 70012000 | 0x1000 | 10 | 70052000 | 0x1000 | | Our | 11 | 70013000 | 0x1000 | 11 | 70053000 | 0x1000 | | LI | 12 | 00000000 | 0x0 | 12 | 00000000 | 0x0 | | Buffers | 13 | 00000000 | 0x0 | 13 | 00000000 | 0x0 | | | 14 | 00000000 | 0x0 | 14 | 00000000 | 0x0 | | | 15 | 00000000 | 0x0 | 15 | 00000000 | 0x0 | +---------+------+--------------+--------+------+--------------+--------+ | | 16 | 70014000 | 0x1000 | 16 | 70054000 | 0x1000 | | | 17 | 70015000 | 0x1000 | 17 | 70055000 | 0x1000 | | | 18 | 70016000 | 0x1000 | 18 | 70056000 | 0x1000 | | Their | 19 | 70017000 | 0x1000 | 19 | 70057000 | 0x1000 | | LI | 20 | 00000000 | 0x0 | 20 | 00000000 | 0x0 | | Buffers | 21 | 00000000 | 0x0 | 21 | 00000000 | 0x0 | | | 22 | 00000000 | 0x0 | 22 | 00000000 | 0x0 | | | 23 | 00000000 | 0x0 | 23 | 00000000 | 0x0 | +---------+------+--------------+--------+------+--------------+--------+ +-----+-----------------------------------------------+ | | SLICE 0 | SLICE 1 | | +------------+----------+------------+----------+ | | Start addr | End addr | Start addr | End addr | +-----+------------+----------+------------+----------+ | EXP | 70018000 | 7001C000 | 70058000 | 7005C000 | | PRE | 7001C000 | 7001F800 | 7005C000 | 7005F800 | +-----+------------+----------+------------+----------+ +---------------------+----------+----------+ | | SLICE 0 | SLICE 1 | +---------------------+----------+----------+ | Default Drop Offset | 7001F800 | 7005F800 | +---------------------+----------+----------+ Rootcause: missing buffer configuration for Express frames in function: prueth_fw_offload_buffer_setup() Details: Driver implements two distinct buffer configuration functions that are invoked based on the driver state and ICSSG firmware:- - prueth_fw_offload_buffer_setup() - prueth_emac_buffer_setup() During initialization, driver creates standard network interfaces (netdevs) and configures buffers via prueth_emac_buffer_setup(). This function properly allocates and configures all required memory regions including: - LI buffers - Express packet buffers - Preemptible packet buffers However, when the driver transitions to an offload mode (switch/HSR/PRP), buffer reconfiguration is handled by prueth_fw_offload_buffer_setup(). This function does not reconfigure the buffer regions required for Express packets, leading to incorrect buffer allocation. Fixes: abd5576b9c57 ("net: ti: icssg-prueth: Add support for ICSSG switch firmware") Signed-off-by: Himanshu Mittal Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250717094220.546388-1-h-mittal1@ti.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ti/icssg/icssg_config.c | 162 ++++++++++++------ drivers/net/ethernet/ti/icssg/icssg_config.h | 78 +++++++-- drivers/net/ethernet/ti/icssg/icssg_prueth.c | 20 ++- drivers/net/ethernet/ti/icssg/icssg_prueth.h | 2 + .../net/ethernet/ti/icssg/icssg_switch_map.h | 3 + 5 files changed, 191 insertions(+), 74 deletions(-) diff --git a/drivers/net/ethernet/ti/icssg/icssg_config.c b/drivers/net/ethernet/ti/icssg/icssg_config.c index ddfd1c02a885..da53eb04b0a4 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_config.c +++ b/drivers/net/ethernet/ti/icssg/icssg_config.c @@ -288,8 +288,12 @@ static int prueth_fw_offload_buffer_setup(struct prueth_emac *emac) int i; addr = lower_32_bits(prueth->msmcram.pa); - if (slice) - addr += PRUETH_NUM_BUF_POOLS * PRUETH_EMAC_BUF_POOL_SIZE; + if (slice) { + if (prueth->pdata.banked_ms_ram) + addr += MSMC_RAM_BANK_SIZE; + else + addr += PRUETH_SW_TOTAL_BUF_SIZE_PER_SLICE; + } if (addr % SZ_64K) { dev_warn(prueth->dev, "buffer pool needs to be 64KB aligned\n"); @@ -297,43 +301,66 @@ static int prueth_fw_offload_buffer_setup(struct prueth_emac *emac) } bpool_cfg = emac->dram.va + BUFFER_POOL_0_ADDR_OFFSET; - /* workaround for f/w bug. bpool 0 needs to be initialized */ - for (i = 0; i < PRUETH_NUM_BUF_POOLS; i++) { + + /* Configure buffer pools for forwarding buffers + * - used by firmware to store packets to be forwarded to other port + * - 8 total pools per slice + */ + for (i = 0; i < PRUETH_NUM_FWD_BUF_POOLS_PER_SLICE; i++) { writel(addr, &bpool_cfg[i].addr); - writel(PRUETH_EMAC_BUF_POOL_SIZE, &bpool_cfg[i].len); - addr += PRUETH_EMAC_BUF_POOL_SIZE; + writel(PRUETH_SW_FWD_BUF_POOL_SIZE, &bpool_cfg[i].len); + addr += PRUETH_SW_FWD_BUF_POOL_SIZE; } - if (!slice) - addr += PRUETH_NUM_BUF_POOLS * PRUETH_EMAC_BUF_POOL_SIZE; - else - addr += PRUETH_SW_NUM_BUF_POOLS_HOST * PRUETH_SW_BUF_POOL_SIZE_HOST; + /* Configure buffer pools for Local Injection buffers + * - used by firmware to store packets received from host core + * - 16 total pools per slice + */ + for (i = 0; i < PRUETH_NUM_LI_BUF_POOLS_PER_SLICE; i++) { + int cfg_idx = i + PRUETH_NUM_FWD_BUF_POOLS_PER_SLICE; - for (i = PRUETH_NUM_BUF_POOLS; - i < 2 * PRUETH_SW_NUM_BUF_POOLS_HOST + PRUETH_NUM_BUF_POOLS; - i++) { - /* The driver only uses first 4 queues per PRU so only initialize them */ - if (i % PRUETH_SW_NUM_BUF_POOLS_HOST < PRUETH_SW_NUM_BUF_POOLS_PER_PRU) { - writel(addr, &bpool_cfg[i].addr); - writel(PRUETH_SW_BUF_POOL_SIZE_HOST, &bpool_cfg[i].len); - addr += PRUETH_SW_BUF_POOL_SIZE_HOST; + /* The driver only uses first 4 queues per PRU, + * so only initialize buffer for them + */ + if ((i % PRUETH_NUM_LI_BUF_POOLS_PER_PORT_PER_SLICE) + < PRUETH_SW_USED_LI_BUF_POOLS_PER_PORT_PER_SLICE) { + writel(addr, &bpool_cfg[cfg_idx].addr); + writel(PRUETH_SW_LI_BUF_POOL_SIZE, + &bpool_cfg[cfg_idx].len); + addr += PRUETH_SW_LI_BUF_POOL_SIZE; } else { - writel(0, &bpool_cfg[i].addr); - writel(0, &bpool_cfg[i].len); + writel(0, &bpool_cfg[cfg_idx].addr); + writel(0, &bpool_cfg[cfg_idx].len); } } - if (!slice) - addr += PRUETH_SW_NUM_BUF_POOLS_HOST * PRUETH_SW_BUF_POOL_SIZE_HOST; - else - addr += PRUETH_EMAC_RX_CTX_BUF_SIZE; + /* Express RX buffer queue + * - used by firmware to store express packets to be transmitted + * to the host core + */ + rxq_ctx = emac->dram.va + HOST_RX_Q_EXP_CONTEXT_OFFSET; + for (i = 0; i < 3; i++) + writel(addr, &rxq_ctx->start[i]); + addr += PRUETH_SW_HOST_EXP_BUF_POOL_SIZE; + writel(addr, &rxq_ctx->end); + + /* Pre-emptible RX buffer queue + * - used by firmware to store preemptible packets to be transmitted + * to the host core + */ rxq_ctx = emac->dram.va + HOST_RX_Q_PRE_CONTEXT_OFFSET; for (i = 0; i < 3; i++) writel(addr, &rxq_ctx->start[i]); - addr += PRUETH_EMAC_RX_CTX_BUF_SIZE; - writel(addr - SZ_2K, &rxq_ctx->end); + addr += PRUETH_SW_HOST_PRE_BUF_POOL_SIZE; + writel(addr, &rxq_ctx->end); + + /* Set pointer for default dropped packet write + * - used by firmware to temporarily store packet to be dropped + */ + rxq_ctx = emac->dram.va + DEFAULT_MSMC_Q_OFFSET; + writel(addr, &rxq_ctx->start[0]); return 0; } @@ -347,13 +374,13 @@ static int prueth_emac_buffer_setup(struct prueth_emac *emac) u32 addr; int i; - /* Layout to have 64KB aligned buffer pool - * |BPOOL0|BPOOL1|RX_CTX0|RX_CTX1| - */ - addr = lower_32_bits(prueth->msmcram.pa); - if (slice) - addr += PRUETH_NUM_BUF_POOLS * PRUETH_EMAC_BUF_POOL_SIZE; + if (slice) { + if (prueth->pdata.banked_ms_ram) + addr += MSMC_RAM_BANK_SIZE; + else + addr += PRUETH_EMAC_TOTAL_BUF_SIZE_PER_SLICE; + } if (addr % SZ_64K) { dev_warn(prueth->dev, "buffer pool needs to be 64KB aligned\n"); @@ -361,39 +388,66 @@ static int prueth_emac_buffer_setup(struct prueth_emac *emac) } bpool_cfg = emac->dram.va + BUFFER_POOL_0_ADDR_OFFSET; - /* workaround for f/w bug. bpool 0 needs to be initilalized */ - writel(addr, &bpool_cfg[0].addr); - writel(0, &bpool_cfg[0].len); - for (i = PRUETH_EMAC_BUF_POOL_START; - i < PRUETH_EMAC_BUF_POOL_START + PRUETH_NUM_BUF_POOLS; - i++) { - writel(addr, &bpool_cfg[i].addr); - writel(PRUETH_EMAC_BUF_POOL_SIZE, &bpool_cfg[i].len); - addr += PRUETH_EMAC_BUF_POOL_SIZE; + /* Configure buffer pools for forwarding buffers + * - in mac mode - no forwarding so initialize all pools to 0 + * - 8 total pools per slice + */ + for (i = 0; i < PRUETH_NUM_FWD_BUF_POOLS_PER_SLICE; i++) { + writel(0, &bpool_cfg[i].addr); + writel(0, &bpool_cfg[i].len); } - if (!slice) - addr += PRUETH_NUM_BUF_POOLS * PRUETH_EMAC_BUF_POOL_SIZE; - else - addr += PRUETH_EMAC_RX_CTX_BUF_SIZE * 2; + /* Configure buffer pools for Local Injection buffers + * - used by firmware to store packets received from host core + * - 16 total pools per slice + */ + bpool_cfg = emac->dram.va + BUFFER_POOL_0_ADDR_OFFSET; + for (i = 0; i < PRUETH_NUM_LI_BUF_POOLS_PER_SLICE; i++) { + int cfg_idx = i + PRUETH_NUM_FWD_BUF_POOLS_PER_SLICE; - /* Pre-emptible RX buffer queue */ - rxq_ctx = emac->dram.va + HOST_RX_Q_PRE_CONTEXT_OFFSET; - for (i = 0; i < 3; i++) - writel(addr, &rxq_ctx->start[i]); + /* In EMAC mode, only first 4 buffers are used, + * as 1 slice needs to handle only 1 port + */ + if (i < PRUETH_EMAC_USED_LI_BUF_POOLS_PER_PORT_PER_SLICE) { + writel(addr, &bpool_cfg[cfg_idx].addr); + writel(PRUETH_EMAC_LI_BUF_POOL_SIZE, + &bpool_cfg[cfg_idx].len); + addr += PRUETH_EMAC_LI_BUF_POOL_SIZE; + } else { + writel(0, &bpool_cfg[cfg_idx].addr); + writel(0, &bpool_cfg[cfg_idx].len); + } + } - addr += PRUETH_EMAC_RX_CTX_BUF_SIZE; - writel(addr, &rxq_ctx->end); - - /* Express RX buffer queue */ + /* Express RX buffer queue + * - used by firmware to store express packets to be transmitted + * to host core + */ rxq_ctx = emac->dram.va + HOST_RX_Q_EXP_CONTEXT_OFFSET; for (i = 0; i < 3; i++) writel(addr, &rxq_ctx->start[i]); - addr += PRUETH_EMAC_RX_CTX_BUF_SIZE; + addr += PRUETH_EMAC_HOST_EXP_BUF_POOL_SIZE; writel(addr, &rxq_ctx->end); + /* Pre-emptible RX buffer queue + * - used by firmware to store preemptible packets to be transmitted + * to host core + */ + rxq_ctx = emac->dram.va + HOST_RX_Q_PRE_CONTEXT_OFFSET; + for (i = 0; i < 3; i++) + writel(addr, &rxq_ctx->start[i]); + + addr += PRUETH_EMAC_HOST_PRE_BUF_POOL_SIZE; + writel(addr, &rxq_ctx->end); + + /* Set pointer for default dropped packet write + * - used by firmware to temporarily store packet to be dropped + */ + rxq_ctx = emac->dram.va + DEFAULT_MSMC_Q_OFFSET; + writel(addr, &rxq_ctx->start[0]); + return 0; } diff --git a/drivers/net/ethernet/ti/icssg/icssg_config.h b/drivers/net/ethernet/ti/icssg/icssg_config.h index c884e9fa099e..60d69744ffae 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_config.h +++ b/drivers/net/ethernet/ti/icssg/icssg_config.h @@ -26,21 +26,71 @@ struct icssg_flow_cfg { #define PRUETH_MAX_RX_FLOWS 1 /* excluding default flow */ #define PRUETH_RX_FLOW_DATA 0 -#define PRUETH_EMAC_BUF_POOL_SIZE SZ_8K -#define PRUETH_EMAC_POOLS_PER_SLICE 24 -#define PRUETH_EMAC_BUF_POOL_START 8 -#define PRUETH_NUM_BUF_POOLS 8 -#define PRUETH_EMAC_RX_CTX_BUF_SIZE SZ_16K /* per slice */ -#define MSMC_RAM_SIZE \ - (2 * (PRUETH_EMAC_BUF_POOL_SIZE * PRUETH_NUM_BUF_POOLS + \ - PRUETH_EMAC_RX_CTX_BUF_SIZE * 2)) +/* Defines for forwarding path buffer pools: + * - used by firmware to store packets to be forwarded to other port + * - 8 total pools per slice + * - only used in switch mode (as no forwarding in mac mode) + */ +#define PRUETH_NUM_FWD_BUF_POOLS_PER_SLICE 8 +#define PRUETH_SW_FWD_BUF_POOL_SIZE (SZ_8K) -#define PRUETH_SW_BUF_POOL_SIZE_HOST SZ_4K -#define PRUETH_SW_NUM_BUF_POOLS_HOST 8 -#define PRUETH_SW_NUM_BUF_POOLS_PER_PRU 4 -#define MSMC_RAM_SIZE_SWITCH_MODE \ - (MSMC_RAM_SIZE + \ - (2 * PRUETH_SW_BUF_POOL_SIZE_HOST * PRUETH_SW_NUM_BUF_POOLS_HOST)) +/* Defines for local injection path buffer pools: + * - used by firmware to store packets received from host core + * - 16 total pools per slice + * - 8 pools per port per slice and each slice handles both ports + * - only 4 out of 8 pools used per port (as only 4 real QoS levels in ICSSG) + * - switch mode: 8 total pools used + * - mac mode: 4 total pools used + */ +#define PRUETH_NUM_LI_BUF_POOLS_PER_SLICE 16 +#define PRUETH_NUM_LI_BUF_POOLS_PER_PORT_PER_SLICE 8 +#define PRUETH_SW_LI_BUF_POOL_SIZE SZ_4K +#define PRUETH_SW_USED_LI_BUF_POOLS_PER_SLICE 8 +#define PRUETH_SW_USED_LI_BUF_POOLS_PER_PORT_PER_SLICE 4 +#define PRUETH_EMAC_LI_BUF_POOL_SIZE SZ_8K +#define PRUETH_EMAC_USED_LI_BUF_POOLS_PER_SLICE 4 +#define PRUETH_EMAC_USED_LI_BUF_POOLS_PER_PORT_PER_SLICE 4 + +/* Defines for host egress path - express and preemptible buffers + * - used by firmware to store express and preemptible packets + * to be transmitted to host core + * - used by both mac/switch modes + */ +#define PRUETH_SW_HOST_EXP_BUF_POOL_SIZE SZ_16K +#define PRUETH_SW_HOST_PRE_BUF_POOL_SIZE (SZ_16K - SZ_2K) +#define PRUETH_EMAC_HOST_EXP_BUF_POOL_SIZE PRUETH_SW_HOST_EXP_BUF_POOL_SIZE +#define PRUETH_EMAC_HOST_PRE_BUF_POOL_SIZE PRUETH_SW_HOST_PRE_BUF_POOL_SIZE + +/* Buffer used by firmware to temporarily store packet to be dropped */ +#define PRUETH_SW_DROP_PKT_BUF_SIZE SZ_2K +#define PRUETH_EMAC_DROP_PKT_BUF_SIZE PRUETH_SW_DROP_PKT_BUF_SIZE + +/* Total switch mode memory usage for buffers per slice */ +#define PRUETH_SW_TOTAL_BUF_SIZE_PER_SLICE \ + (PRUETH_SW_FWD_BUF_POOL_SIZE * PRUETH_NUM_FWD_BUF_POOLS_PER_SLICE + \ + PRUETH_SW_LI_BUF_POOL_SIZE * PRUETH_SW_USED_LI_BUF_POOLS_PER_SLICE + \ + PRUETH_SW_HOST_EXP_BUF_POOL_SIZE + \ + PRUETH_SW_HOST_PRE_BUF_POOL_SIZE + \ + PRUETH_SW_DROP_PKT_BUF_SIZE) + +/* Total switch mode memory usage for all buffers */ +#define PRUETH_SW_TOTAL_BUF_SIZE \ + (2 * PRUETH_SW_TOTAL_BUF_SIZE_PER_SLICE) + +/* Total mac mode memory usage for buffers per slice */ +#define PRUETH_EMAC_TOTAL_BUF_SIZE_PER_SLICE \ + (PRUETH_EMAC_LI_BUF_POOL_SIZE * \ + PRUETH_EMAC_USED_LI_BUF_POOLS_PER_SLICE + \ + PRUETH_EMAC_HOST_EXP_BUF_POOL_SIZE + \ + PRUETH_EMAC_HOST_PRE_BUF_POOL_SIZE + \ + PRUETH_EMAC_DROP_PKT_BUF_SIZE) + +/* Total mac mode memory usage for all buffers */ +#define PRUETH_EMAC_TOTAL_BUF_SIZE \ + (2 * PRUETH_EMAC_TOTAL_BUF_SIZE_PER_SLICE) + +/* Size of 1 bank of MSMC/OC_SRAM memory */ +#define MSMC_RAM_BANK_SIZE SZ_256K #define PRUETH_SWITCH_FDB_MASK ((SIZE_OF_FDB / NUMBER_OF_FDB_BUCKET_ENTRIES) - 1) diff --git a/drivers/net/ethernet/ti/icssg/icssg_prueth.c b/drivers/net/ethernet/ti/icssg/icssg_prueth.c index 86fc1278127c..2f5c4335dec3 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_prueth.c +++ b/drivers/net/ethernet/ti/icssg/icssg_prueth.c @@ -1764,10 +1764,15 @@ static int prueth_probe(struct platform_device *pdev) goto put_mem; } - msmc_ram_size = MSMC_RAM_SIZE; prueth->is_switchmode_supported = prueth->pdata.switch_mode; - if (prueth->is_switchmode_supported) - msmc_ram_size = MSMC_RAM_SIZE_SWITCH_MODE; + if (prueth->pdata.banked_ms_ram) { + /* Reserve 2 MSMC RAM banks for buffers to avoid arbitration */ + msmc_ram_size = (2 * MSMC_RAM_BANK_SIZE); + } else { + msmc_ram_size = PRUETH_EMAC_TOTAL_BUF_SIZE; + if (prueth->is_switchmode_supported) + msmc_ram_size = PRUETH_SW_TOTAL_BUF_SIZE; + } /* NOTE: FW bug needs buffer base to be 64KB aligned */ prueth->msmcram.va = @@ -1924,7 +1929,8 @@ put_iep0: free_pool: gen_pool_free(prueth->sram_pool, - (unsigned long)prueth->msmcram.va, msmc_ram_size); + (unsigned long)prueth->msmcram.va, + prueth->msmcram.size); put_mem: pruss_release_mem_region(prueth->pruss, &prueth->shram); @@ -1976,8 +1982,8 @@ static void prueth_remove(struct platform_device *pdev) icss_iep_put(prueth->iep0); gen_pool_free(prueth->sram_pool, - (unsigned long)prueth->msmcram.va, - MSMC_RAM_SIZE); + (unsigned long)prueth->msmcram.va, + prueth->msmcram.size); pruss_release_mem_region(prueth->pruss, &prueth->shram); @@ -1994,12 +2000,14 @@ static const struct prueth_pdata am654_icssg_pdata = { .fdqring_mode = K3_RINGACC_RING_MODE_MESSAGE, .quirk_10m_link_issue = 1, .switch_mode = 1, + .banked_ms_ram = 0, }; static const struct prueth_pdata am64x_icssg_pdata = { .fdqring_mode = K3_RINGACC_RING_MODE_RING, .quirk_10m_link_issue = 1, .switch_mode = 1, + .banked_ms_ram = 1, }; static const struct of_device_id prueth_dt_match[] = { diff --git a/drivers/net/ethernet/ti/icssg/icssg_prueth.h b/drivers/net/ethernet/ti/icssg/icssg_prueth.h index 23c465f1ce7f..3bb9fd8c3804 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_prueth.h +++ b/drivers/net/ethernet/ti/icssg/icssg_prueth.h @@ -251,11 +251,13 @@ struct prueth_emac { * @fdqring_mode: Free desc queue mode * @quirk_10m_link_issue: 10M link detect errata * @switch_mode: switch firmware support + * @banked_ms_ram: banked memory support */ struct prueth_pdata { enum k3_ring_mode fdqring_mode; u32 quirk_10m_link_issue:1; u32 switch_mode:1; + u32 banked_ms_ram:1; }; struct icssg_firmwares { diff --git a/drivers/net/ethernet/ti/icssg/icssg_switch_map.h b/drivers/net/ethernet/ti/icssg/icssg_switch_map.h index 490a9cc06fb0..7e053b8af3ec 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_switch_map.h +++ b/drivers/net/ethernet/ti/icssg/icssg_switch_map.h @@ -180,6 +180,9 @@ /* Used to notify the FW of the current link speed */ #define PORT_LINK_SPEED_OFFSET 0x00A8 +/* 2k memory pointer reserved for default writes by PRU0*/ +#define DEFAULT_MSMC_Q_OFFSET 0x00AC + /* TAS gate mask for windows list0 */ #define TAS_GATE_MASK_LIST0 0x0100 From 3afa3ae3db52e3c216d77bd5907a5a86833806cc Mon Sep 17 00:00:00 2001 From: Chiara Meiohas Date: Thu, 17 Jul 2025 15:06:09 +0300 Subject: [PATCH 13/34] net/mlx5: Fix memory leak in cmd_exec() If cmd_exec() is called with callback and mlx5_cmd_invoke() returns an error, resources allocated in cmd_exec() will not be freed. Fix the code to release the resources if mlx5_cmd_invoke() returns an error. Fixes: f086470122d5 ("net/mlx5: cmdif, Return value improvements") Reported-by: Alex Tereshkin Signed-off-by: Chiara Meiohas Reviewed-by: Moshe Shemesh Signed-off-by: Vlad Dumitrescu Signed-off-by: Tariq Toukan Reviewed-by: Simon Horman Link: https://patch.msgid.link/1752753970-261832-2-git-send-email-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index b1aeea7c4a91..e395ef5f356e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -1947,8 +1947,8 @@ static int cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out, err = mlx5_cmd_invoke(dev, inb, outb, out, out_size, callback, context, pages_queue, token, force_polling); - if (callback) - return err; + if (callback && !err) + return 0; if (err > 0) /* Failed in FW, command didn't execute */ err = deliv_status_to_err(err); From 5b4c56ad4da0aa00b258ab50b1f5775b7d3108c7 Mon Sep 17 00:00:00 2001 From: Shahar Shitrit Date: Thu, 17 Jul 2025 15:06:10 +0300 Subject: [PATCH 14/34] net/mlx5: E-Switch, Fix peer miss rules to use peer eswitch In the original design, it is assumed local and peer eswitches have the same number of vfs. However, in new firmware, local and peer eswitches can have different number of vfs configured by mlxconfig. In such configuration, it is incorrect to derive the number of vfs from the local device's eswitch. Fix this by updating the peer miss rules add and delete functions to use the peer device's eswitch and vf count instead of the local device's information, ensuring correct behavior regardless of vf configuration differences. Fixes: ac004b832128 ("net/mlx5e: E-Switch, Add peer miss rules") Signed-off-by: Shahar Shitrit Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan Reviewed-by: Simon Horman Link: https://patch.msgid.link/1752753970-261832-3-git-send-email-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../mellanox/mlx5/core/eswitch_offloads.c | 108 +++++++++--------- 1 file changed, 54 insertions(+), 54 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index 0e3a977d5332..bee906661282 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -1182,19 +1182,19 @@ static void esw_set_peer_miss_rule_source_port(struct mlx5_eswitch *esw, static int esw_add_fdb_peer_miss_rules(struct mlx5_eswitch *esw, struct mlx5_core_dev *peer_dev) { + struct mlx5_eswitch *peer_esw = peer_dev->priv.eswitch; struct mlx5_flow_destination dest = {}; struct mlx5_flow_act flow_act = {0}; struct mlx5_flow_handle **flows; - /* total vports is the same for both e-switches */ - int nvports = esw->total_vports; struct mlx5_flow_handle *flow; + struct mlx5_vport *peer_vport; struct mlx5_flow_spec *spec; - struct mlx5_vport *vport; int err, pfindex; unsigned long i; void *misc; - if (!MLX5_VPORT_MANAGER(esw->dev) && !mlx5_core_is_ecpf_esw_manager(esw->dev)) + if (!MLX5_VPORT_MANAGER(peer_dev) && + !mlx5_core_is_ecpf_esw_manager(peer_dev)) return 0; spec = kvzalloc(sizeof(*spec), GFP_KERNEL); @@ -1203,7 +1203,7 @@ static int esw_add_fdb_peer_miss_rules(struct mlx5_eswitch *esw, peer_miss_rules_setup(esw, peer_dev, spec, &dest); - flows = kvcalloc(nvports, sizeof(*flows), GFP_KERNEL); + flows = kvcalloc(peer_esw->total_vports, sizeof(*flows), GFP_KERNEL); if (!flows) { err = -ENOMEM; goto alloc_flows_err; @@ -1213,10 +1213,10 @@ static int esw_add_fdb_peer_miss_rules(struct mlx5_eswitch *esw, misc = MLX5_ADDR_OF(fte_match_param, spec->match_value, misc_parameters); - if (mlx5_core_is_ecpf_esw_manager(esw->dev)) { - vport = mlx5_eswitch_get_vport(esw, MLX5_VPORT_PF); - esw_set_peer_miss_rule_source_port(esw, peer_dev->priv.eswitch, - spec, MLX5_VPORT_PF); + if (mlx5_core_is_ecpf_esw_manager(peer_dev)) { + peer_vport = mlx5_eswitch_get_vport(peer_esw, MLX5_VPORT_PF); + esw_set_peer_miss_rule_source_port(esw, peer_esw, spec, + MLX5_VPORT_PF); flow = mlx5_add_flow_rules(mlx5_eswitch_get_slow_fdb(esw), spec, &flow_act, &dest, 1); @@ -1224,11 +1224,11 @@ static int esw_add_fdb_peer_miss_rules(struct mlx5_eswitch *esw, err = PTR_ERR(flow); goto add_pf_flow_err; } - flows[vport->index] = flow; + flows[peer_vport->index] = flow; } - if (mlx5_ecpf_vport_exists(esw->dev)) { - vport = mlx5_eswitch_get_vport(esw, MLX5_VPORT_ECPF); + if (mlx5_ecpf_vport_exists(peer_dev)) { + peer_vport = mlx5_eswitch_get_vport(peer_esw, MLX5_VPORT_ECPF); MLX5_SET(fte_match_set_misc, misc, source_port, MLX5_VPORT_ECPF); flow = mlx5_add_flow_rules(mlx5_eswitch_get_slow_fdb(esw), spec, &flow_act, &dest, 1); @@ -1236,13 +1236,14 @@ static int esw_add_fdb_peer_miss_rules(struct mlx5_eswitch *esw, err = PTR_ERR(flow); goto add_ecpf_flow_err; } - flows[vport->index] = flow; + flows[peer_vport->index] = flow; } - mlx5_esw_for_each_vf_vport(esw, i, vport, mlx5_core_max_vfs(esw->dev)) { + mlx5_esw_for_each_vf_vport(peer_esw, i, peer_vport, + mlx5_core_max_vfs(peer_dev)) { esw_set_peer_miss_rule_source_port(esw, - peer_dev->priv.eswitch, - spec, vport->vport); + peer_esw, + spec, peer_vport->vport); flow = mlx5_add_flow_rules(mlx5_eswitch_get_slow_fdb(esw), spec, &flow_act, &dest, 1); @@ -1250,22 +1251,22 @@ static int esw_add_fdb_peer_miss_rules(struct mlx5_eswitch *esw, err = PTR_ERR(flow); goto add_vf_flow_err; } - flows[vport->index] = flow; + flows[peer_vport->index] = flow; } - if (mlx5_core_ec_sriov_enabled(esw->dev)) { - mlx5_esw_for_each_ec_vf_vport(esw, i, vport, mlx5_core_max_ec_vfs(esw->dev)) { - if (i >= mlx5_core_max_ec_vfs(peer_dev)) - break; - esw_set_peer_miss_rule_source_port(esw, peer_dev->priv.eswitch, - spec, vport->vport); + if (mlx5_core_ec_sriov_enabled(peer_dev)) { + mlx5_esw_for_each_ec_vf_vport(peer_esw, i, peer_vport, + mlx5_core_max_ec_vfs(peer_dev)) { + esw_set_peer_miss_rule_source_port(esw, peer_esw, + spec, + peer_vport->vport); flow = mlx5_add_flow_rules(esw->fdb_table.offloads.slow_fdb, spec, &flow_act, &dest, 1); if (IS_ERR(flow)) { err = PTR_ERR(flow); goto add_ec_vf_flow_err; } - flows[vport->index] = flow; + flows[peer_vport->index] = flow; } } @@ -1282,25 +1283,27 @@ static int esw_add_fdb_peer_miss_rules(struct mlx5_eswitch *esw, return 0; add_ec_vf_flow_err: - mlx5_esw_for_each_ec_vf_vport(esw, i, vport, mlx5_core_max_ec_vfs(esw->dev)) { - if (!flows[vport->index]) + mlx5_esw_for_each_ec_vf_vport(peer_esw, i, peer_vport, + mlx5_core_max_ec_vfs(peer_dev)) { + if (!flows[peer_vport->index]) continue; - mlx5_del_flow_rules(flows[vport->index]); + mlx5_del_flow_rules(flows[peer_vport->index]); } add_vf_flow_err: - mlx5_esw_for_each_vf_vport(esw, i, vport, mlx5_core_max_vfs(esw->dev)) { - if (!flows[vport->index]) + mlx5_esw_for_each_vf_vport(peer_esw, i, peer_vport, + mlx5_core_max_vfs(peer_dev)) { + if (!flows[peer_vport->index]) continue; - mlx5_del_flow_rules(flows[vport->index]); + mlx5_del_flow_rules(flows[peer_vport->index]); } - if (mlx5_ecpf_vport_exists(esw->dev)) { - vport = mlx5_eswitch_get_vport(esw, MLX5_VPORT_ECPF); - mlx5_del_flow_rules(flows[vport->index]); + if (mlx5_ecpf_vport_exists(peer_dev)) { + peer_vport = mlx5_eswitch_get_vport(peer_esw, MLX5_VPORT_ECPF); + mlx5_del_flow_rules(flows[peer_vport->index]); } add_ecpf_flow_err: - if (mlx5_core_is_ecpf_esw_manager(esw->dev)) { - vport = mlx5_eswitch_get_vport(esw, MLX5_VPORT_PF); - mlx5_del_flow_rules(flows[vport->index]); + if (mlx5_core_is_ecpf_esw_manager(peer_dev)) { + peer_vport = mlx5_eswitch_get_vport(peer_esw, MLX5_VPORT_PF); + mlx5_del_flow_rules(flows[peer_vport->index]); } add_pf_flow_err: esw_warn(esw->dev, "FDB: Failed to add peer miss flow rule err %d\n", err); @@ -1313,37 +1316,34 @@ alloc_flows_err: static void esw_del_fdb_peer_miss_rules(struct mlx5_eswitch *esw, struct mlx5_core_dev *peer_dev) { + struct mlx5_eswitch *peer_esw = peer_dev->priv.eswitch; u16 peer_index = mlx5_get_dev_index(peer_dev); struct mlx5_flow_handle **flows; - struct mlx5_vport *vport; + struct mlx5_vport *peer_vport; unsigned long i; flows = esw->fdb_table.offloads.peer_miss_rules[peer_index]; if (!flows) return; - if (mlx5_core_ec_sriov_enabled(esw->dev)) { - mlx5_esw_for_each_ec_vf_vport(esw, i, vport, mlx5_core_max_ec_vfs(esw->dev)) { - /* The flow for a particular vport could be NULL if the other ECPF - * has fewer or no VFs enabled - */ - if (!flows[vport->index]) - continue; - mlx5_del_flow_rules(flows[vport->index]); - } + if (mlx5_core_ec_sriov_enabled(peer_dev)) { + mlx5_esw_for_each_ec_vf_vport(peer_esw, i, peer_vport, + mlx5_core_max_ec_vfs(peer_dev)) + mlx5_del_flow_rules(flows[peer_vport->index]); } - mlx5_esw_for_each_vf_vport(esw, i, vport, mlx5_core_max_vfs(esw->dev)) - mlx5_del_flow_rules(flows[vport->index]); + mlx5_esw_for_each_vf_vport(peer_esw, i, peer_vport, + mlx5_core_max_vfs(peer_dev)) + mlx5_del_flow_rules(flows[peer_vport->index]); - if (mlx5_ecpf_vport_exists(esw->dev)) { - vport = mlx5_eswitch_get_vport(esw, MLX5_VPORT_ECPF); - mlx5_del_flow_rules(flows[vport->index]); + if (mlx5_ecpf_vport_exists(peer_dev)) { + peer_vport = mlx5_eswitch_get_vport(peer_esw, MLX5_VPORT_ECPF); + mlx5_del_flow_rules(flows[peer_vport->index]); } - if (mlx5_core_is_ecpf_esw_manager(esw->dev)) { - vport = mlx5_eswitch_get_vport(esw, MLX5_VPORT_PF); - mlx5_del_flow_rules(flows[vport->index]); + if (mlx5_core_is_ecpf_esw_manager(peer_dev)) { + peer_vport = mlx5_eswitch_get_vport(peer_esw, MLX5_VPORT_PF); + mlx5_del_flow_rules(flows[peer_vport->index]); } kvfree(flows); From 50b2af451597ca6eefe9d4543f8bbf8de8aa00e7 Mon Sep 17 00:00:00 2001 From: Dennis Chen Date: Wed, 18 Jun 2025 15:52:40 -0400 Subject: [PATCH 15/34] i40e: report VF tx_dropped with tx_errors instead of tx_discards Currently the tx_dropped field in VF stats is not updated correctly when reading stats from the PF. This is because it reads from i40e_eth_stats.tx_discards which seems to be unused for per VSI stats, as it is not updated by i40e_update_eth_stats() and the corresponding register, GLV_TDPC, is not implemented[1]. Use i40e_eth_stats.tx_errors instead, which is actually updated by i40e_update_eth_stats() by reading from GLV_TEPC. To test, create a VF and try to send bad packets through it: $ echo 1 > /sys/class/net/enp2s0f0/device/sriov_numvfs $ cat test.py from scapy.all import * vlan_pkt = Ether(dst="ff:ff:ff:ff:ff:ff") / Dot1Q(vlan=999) / IP(dst="192.168.0.1") / ICMP() ttl_pkt = IP(dst="8.8.8.8", ttl=0) / ICMP() print("Send packet with bad VLAN tag") sendp(vlan_pkt, iface="enp2s0f0v0") print("Send packet with TTL=0") sendp(ttl_pkt, iface="enp2s0f0v0") $ ip -s link show dev enp2s0f0 16: enp2s0f0: mtu 1500 qdisc mq state UP mode DEFAULT group default qlen 1000 link/ether 3c:ec:ef:b7:e0:ac brd ff:ff:ff:ff:ff:ff RX: bytes packets errors dropped missed mcast 0 0 0 0 0 0 TX: bytes packets errors dropped carrier collsns 0 0 0 0 0 0 vf 0 link/ether e2:c6:fd:c1:1e:92 brd ff:ff:ff:ff:ff:ff, spoof checking on, link-state auto, trust off RX: bytes packets mcast bcast dropped 0 0 0 0 0 TX: bytes packets dropped 0 0 0 $ python test.py Send packet with bad VLAN tag . Sent 1 packets. Send packet with TTL=0 . Sent 1 packets. $ ip -s link show dev enp2s0f0 16: enp2s0f0: mtu 1500 qdisc mq state UP mode DEFAULT group default qlen 1000 link/ether 3c:ec:ef:b7:e0:ac brd ff:ff:ff:ff:ff:ff RX: bytes packets errors dropped missed mcast 0 0 0 0 0 0 TX: bytes packets errors dropped carrier collsns 0 0 0 0 0 0 vf 0 link/ether e2:c6:fd:c1:1e:92 brd ff:ff:ff:ff:ff:ff, spoof checking on, link-state auto, trust off RX: bytes packets mcast bcast dropped 0 0 0 0 0 TX: bytes packets dropped 0 0 0 A packet with non-existent VLAN tag and a packet with TTL = 0 are sent, but tx_dropped is not incremented. After patch: $ ip -s link show dev enp2s0f0 19: enp2s0f0: mtu 1500 qdisc mq state UP mode DEFAULT group default qlen 1000 link/ether 3c:ec:ef:b7:e0:ac brd ff:ff:ff:ff:ff:ff RX: bytes packets errors dropped missed mcast 0 0 0 0 0 0 TX: bytes packets errors dropped carrier collsns 0 0 0 0 0 0 vf 0 link/ether 4a:b7:3d:37:f7:56 brd ff:ff:ff:ff:ff:ff, spoof checking on, link-state auto, trust off RX: bytes packets mcast bcast dropped 0 0 0 0 0 TX: bytes packets dropped 0 0 2 Fixes: dc645daef9af5bcbd9c ("i40e: implement VF stats NDO") Signed-off-by: Dennis Chen Link: https://www.intel.com/content/www/us/en/content-details/596333/intel-ethernet-controller-x710-tm4-at2-carlsville-datasheet.html Reviewed-by: Simon Horman Tested-by: Rafal Romanowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index 88e6bef69342..2dbe38eb9494 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -5006,7 +5006,7 @@ int i40e_get_vf_stats(struct net_device *netdev, int vf_id, vf_stats->broadcast = stats->rx_broadcast; vf_stats->multicast = stats->rx_multicast; vf_stats->rx_dropped = stats->rx_discards + stats->rx_discards_other; - vf_stats->tx_dropped = stats->tx_discards; + vf_stats->tx_dropped = stats->tx_errors; return 0; } From 5a0df02999dbe838c3feed54b1d59e9445f68b89 Mon Sep 17 00:00:00 2001 From: Jamie Bainbridge Date: Wed, 25 Jun 2025 09:29:18 +1000 Subject: [PATCH 16/34] i40e: When removing VF MAC filters, only check PF-set MAC When the PF is processing an Admin Queue message to delete a VF's MACs from the MAC filter, we currently check if the PF set the MAC and if the VF is trusted. This results in undesirable behaviour, where if a trusted VF with a PF-set MAC sets itself down (which sends an AQ message to delete the VF's MAC filters) then the VF MAC is erased from the interface. This results in the VF losing its PF-set MAC which should not happen. There is no need to check for trust at all, because an untrusted VF cannot change its own MAC. The only check needed is whether the PF set the MAC. If the PF set the MAC, then don't erase the MAC on link-down. Resolve this by changing the deletion check only for PF-set MAC. (the out-of-tree driver has also intentionally removed the check for VF trust here with OOT driver version 2.26.8, this changes the Linux kernel driver behaviour and comment to match the OOT driver behaviour) Fixes: ea2a1cfc3b201 ("i40e: Fix VF MAC filter removal") Signed-off-by: Jamie Bainbridge Reviewed-by: Simon Horman Tested-by: Rafal Romanowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index 2dbe38eb9494..7ccfc1191ae5 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -3137,10 +3137,10 @@ static int i40e_vc_del_mac_addr_msg(struct i40e_vf *vf, u8 *msg) const u8 *addr = al->list[i].addr; /* Allow to delete VF primary MAC only if it was not set - * administratively by PF or if VF is trusted. + * administratively by PF. */ if (ether_addr_equal(addr, vf->default_lan_addr.addr)) { - if (i40e_can_vf_change_mac(vf)) + if (!vf->pf_set_mac) was_unimac_deleted = true; else continue; From 4ff12d82dac119b4b99b5a78b5af3bf2474c0a36 Mon Sep 17 00:00:00 2001 From: Haoxiang Li Date: Thu, 3 Jul 2025 17:52:32 +0800 Subject: [PATCH 17/34] ice: Fix a null pointer dereference in ice_copy_and_init_pkg() Add check for the return value of devm_kmemdup() to prevent potential null pointer dereference. Fixes: c76488109616 ("ice: Implement Dynamic Device Personalization (DDP) download") Cc: stable@vger.kernel.org Signed-off-by: Haoxiang Li Reviewed-by: Michal Swiatkowski Reviewed-by: Aleksandr Loktionov Reviewed-by: Simon Horman Tested-by: Rinitha S (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_ddp.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/intel/ice/ice_ddp.c b/drivers/net/ethernet/intel/ice/ice_ddp.c index 59323c019544..351824dc3c62 100644 --- a/drivers/net/ethernet/intel/ice/ice_ddp.c +++ b/drivers/net/ethernet/intel/ice/ice_ddp.c @@ -2301,6 +2301,8 @@ enum ice_ddp_state ice_copy_and_init_pkg(struct ice_hw *hw, const u8 *buf, return ICE_DDP_PKG_ERR; buf_copy = devm_kmemdup(ice_hw_to_dev(hw), buf, len, GFP_KERNEL); + if (!buf_copy) + return ICE_DDP_PKG_ERR; state = ice_init_pkg(hw, buf_copy, len); if (!ice_is_init_pkg_successful(state)) { From 536fd741c7ac907d63166cdae1081b1febfab613 Mon Sep 17 00:00:00 2001 From: Jacek Kowalski Date: Mon, 30 Jun 2025 10:33:39 +0200 Subject: [PATCH 18/34] e1000e: disregard NVM checksum on tgp when valid checksum bit is not set As described by Vitaly Lifshits: > Starting from Tiger Lake, LAN NVM is locked for writes by SW, so the > driver cannot perform checksum validation and correction. This means > that all NVM images must leave the factory with correct checksum and > checksum valid bit set. Since Tiger Lake devices were the first to have > this lock, some systems in the field did not meet this requirement. > Therefore, for these transitional devices we skip checksum update and > verification, if the valid bit is not set. Signed-off-by: Jacek Kowalski Reviewed-by: Simon Horman Reviewed-by: Vitaly Lifshits Fixes: 4051f68318ca9 ("e1000e: Do not take care about recovery NVM checksum") Cc: stable@vger.kernel.org Tested-by: Mor Bar-Gabay Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/e1000e/ich8lan.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/intel/e1000e/ich8lan.c b/drivers/net/ethernet/intel/e1000e/ich8lan.c index 364378133526..df4e7d781cb1 100644 --- a/drivers/net/ethernet/intel/e1000e/ich8lan.c +++ b/drivers/net/ethernet/intel/e1000e/ich8lan.c @@ -4274,6 +4274,8 @@ static s32 e1000_validate_nvm_checksum_ich8lan(struct e1000_hw *hw) ret_val = e1000e_update_nvm_checksum(hw); if (ret_val) return ret_val; + } else if (hw->mac.type == e1000_pch_tgp) { + return 0; } } From 61114910a5f6a71d0b6ea3b95082dfe031b19dfe Mon Sep 17 00:00:00 2001 From: Jacek Kowalski Date: Mon, 30 Jun 2025 10:35:00 +0200 Subject: [PATCH 19/34] e1000e: ignore uninitialized checksum word on tgp As described by Vitaly Lifshits: > Starting from Tiger Lake, LAN NVM is locked for writes by SW, so the > driver cannot perform checksum validation and correction. This means > that all NVM images must leave the factory with correct checksum and > checksum valid bit set. Unfortunately some systems have left the factory with an uninitialized value of 0xFFFF at register address 0x3F (checksum word location). So on Tiger Lake platform we ignore the computed checksum when such condition is encountered. Signed-off-by: Jacek Kowalski Tested-by: Vlad URSU Fixes: 4051f68318ca9 ("e1000e: Do not take care about recovery NVM checksum") Cc: stable@vger.kernel.org Reviewed-by: Simon Horman Reviewed-by: Vitaly Lifshits Tested-by: Mor Bar-Gabay Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/e1000e/defines.h | 3 +++ drivers/net/ethernet/intel/e1000e/nvm.c | 6 ++++++ 2 files changed, 9 insertions(+) diff --git a/drivers/net/ethernet/intel/e1000e/defines.h b/drivers/net/ethernet/intel/e1000e/defines.h index 8294a7c4f122..ba331899d186 100644 --- a/drivers/net/ethernet/intel/e1000e/defines.h +++ b/drivers/net/ethernet/intel/e1000e/defines.h @@ -638,6 +638,9 @@ /* For checksumming, the sum of all words in the NVM should equal 0xBABA. */ #define NVM_SUM 0xBABA +/* Uninitialized ("empty") checksum word value */ +#define NVM_CHECKSUM_UNINITIALIZED 0xFFFF + /* PBA (printed board assembly) number words */ #define NVM_PBA_OFFSET_0 8 #define NVM_PBA_OFFSET_1 9 diff --git a/drivers/net/ethernet/intel/e1000e/nvm.c b/drivers/net/ethernet/intel/e1000e/nvm.c index e609f4df86f4..16369e6d245a 100644 --- a/drivers/net/ethernet/intel/e1000e/nvm.c +++ b/drivers/net/ethernet/intel/e1000e/nvm.c @@ -558,6 +558,12 @@ s32 e1000e_validate_nvm_checksum_generic(struct e1000_hw *hw) checksum += nvm_data; } + if (hw->mac.type == e1000_pch_tgp && + nvm_data == NVM_CHECKSUM_UNINITIALIZED) { + e_dbg("Uninitialized NVM Checksum on TGP platform - ignoring\n"); + return 0; + } + if (checksum != (u16)NVM_SUM) { e_dbg("NVM Checksum Invalid\n"); return -E1000_ERR_NVM; From 37848a456fc38c191aedfe41f662cc24db8c23d9 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Tue, 15 Jul 2025 20:43:28 +0200 Subject: [PATCH 20/34] selftests: mptcp: connect: also cover alt modes The "mmap" and "sendfile" alternate modes for mptcp_connect.sh/.c are available from the beginning, but only tested when mptcp_connect.sh is manually launched with "-m mmap" or "-m sendfile", not via the kselftests helpers. The MPTCP CI was manually running "mptcp_connect.sh -m mmap", but not "-m sendfile". Plus other CIs, especially the ones validating the stable releases, were not validating these alternate modes. To make sure these modes are validated by these CIs, add two new test programs executing mptcp_connect.sh with the alternate modes. Fixes: 048d19d444be ("mptcp: add basic kselftest for mptcp") Cc: stable@vger.kernel.org Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250715-net-mptcp-sft-connect-alt-v2-1-8230ddd82454@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/Makefile | 3 ++- tools/testing/selftests/net/mptcp/mptcp_connect_mmap.sh | 5 +++++ tools/testing/selftests/net/mptcp/mptcp_connect_sendfile.sh | 5 +++++ 3 files changed, 12 insertions(+), 1 deletion(-) create mode 100755 tools/testing/selftests/net/mptcp/mptcp_connect_mmap.sh create mode 100755 tools/testing/selftests/net/mptcp/mptcp_connect_sendfile.sh diff --git a/tools/testing/selftests/net/mptcp/Makefile b/tools/testing/selftests/net/mptcp/Makefile index e47788bfa671..c6b030babba8 100644 --- a/tools/testing/selftests/net/mptcp/Makefile +++ b/tools/testing/selftests/net/mptcp/Makefile @@ -4,7 +4,8 @@ top_srcdir = ../../../../.. CFLAGS += -Wall -Wl,--no-as-needed -O2 -g -I$(top_srcdir)/usr/include $(KHDR_INCLUDES) -TEST_PROGS := mptcp_connect.sh pm_netlink.sh mptcp_join.sh diag.sh \ +TEST_PROGS := mptcp_connect.sh mptcp_connect_mmap.sh mptcp_connect_sendfile.sh \ + pm_netlink.sh mptcp_join.sh diag.sh \ simult_flows.sh mptcp_sockopt.sh userspace_pm.sh TEST_GEN_FILES = mptcp_connect pm_nl_ctl mptcp_sockopt mptcp_inq mptcp_diag diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect_mmap.sh b/tools/testing/selftests/net/mptcp/mptcp_connect_mmap.sh new file mode 100755 index 000000000000..5dd30f9394af --- /dev/null +++ b/tools/testing/selftests/net/mptcp/mptcp_connect_mmap.sh @@ -0,0 +1,5 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +MPTCP_LIB_KSFT_TEST="$(basename "${0}" .sh)" \ + "$(dirname "${0}")/mptcp_connect.sh" -m mmap "${@}" diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect_sendfile.sh b/tools/testing/selftests/net/mptcp/mptcp_connect_sendfile.sh new file mode 100755 index 000000000000..1d16fb1cc9bb --- /dev/null +++ b/tools/testing/selftests/net/mptcp/mptcp_connect_sendfile.sh @@ -0,0 +1,5 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +MPTCP_LIB_KSFT_TEST="$(basename "${0}" .sh)" \ + "$(dirname "${0}")/mptcp_connect.sh" -m sendfile "${@}" From fdf0f60a2bb02ba581d9e71d583e69dd0714a521 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Tue, 15 Jul 2025 20:43:29 +0200 Subject: [PATCH 21/34] selftests: mptcp: connect: also cover checksum The checksum mode has been added a while ago, but it is only validated when manually launching mptcp_connect.sh with "-C". The different CIs were then not validating these MPTCP Connect tests with checksum enabled. To make sure they do, add a new test program executing mptcp_connect.sh with the checksum mode. Fixes: 94d66ba1d8e4 ("selftests: mptcp: enable checksum in mptcp_connect.sh") Cc: stable@vger.kernel.org Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250715-net-mptcp-sft-connect-alt-v2-2-8230ddd82454@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/Makefile | 2 +- tools/testing/selftests/net/mptcp/mptcp_connect_checksum.sh | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) create mode 100755 tools/testing/selftests/net/mptcp/mptcp_connect_checksum.sh diff --git a/tools/testing/selftests/net/mptcp/Makefile b/tools/testing/selftests/net/mptcp/Makefile index c6b030babba8..4c7e51336ab2 100644 --- a/tools/testing/selftests/net/mptcp/Makefile +++ b/tools/testing/selftests/net/mptcp/Makefile @@ -5,7 +5,7 @@ top_srcdir = ../../../../.. CFLAGS += -Wall -Wl,--no-as-needed -O2 -g -I$(top_srcdir)/usr/include $(KHDR_INCLUDES) TEST_PROGS := mptcp_connect.sh mptcp_connect_mmap.sh mptcp_connect_sendfile.sh \ - pm_netlink.sh mptcp_join.sh diag.sh \ + mptcp_connect_checksum.sh pm_netlink.sh mptcp_join.sh diag.sh \ simult_flows.sh mptcp_sockopt.sh userspace_pm.sh TEST_GEN_FILES = mptcp_connect pm_nl_ctl mptcp_sockopt mptcp_inq mptcp_diag diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect_checksum.sh b/tools/testing/selftests/net/mptcp/mptcp_connect_checksum.sh new file mode 100755 index 000000000000..ce93ec2f107f --- /dev/null +++ b/tools/testing/selftests/net/mptcp/mptcp_connect_checksum.sh @@ -0,0 +1,5 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +MPTCP_LIB_KSFT_TEST="$(basename "${0}" .sh)" \ + "$(dirname "${0}")/mptcp_connect.sh" -C "${@}" From 18ff09c1b94fa1584b31d3f4e9eecdca29230ce5 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 18 Jul 2025 14:22:42 -0700 Subject: [PATCH 22/34] net: bcmasp: Restore programming of TX map vector register On ASP versions v2.x we need to program the TX map vector register to properly exercise end-to-end flow control, otherwise the TX engine can either lock-up, or cause the hardware calculated checksum to be wrong/corrupted when multiple back to back packets are being submitted for transmission. This register defaults to 0, which means no flow control being applied. Fixes: e9f31435ee7d ("net: bcmasp: Add support for asp-v3.0") Signed-off-by: Florian Fainelli Link: https://patch.msgid.link/20250718212242.3447751-1-florian.fainelli@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/asp2/bcmasp_intf.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/broadcom/asp2/bcmasp_intf.c b/drivers/net/ethernet/broadcom/asp2/bcmasp_intf.c index 0d61b8580d72..f832562bd7a3 100644 --- a/drivers/net/ethernet/broadcom/asp2/bcmasp_intf.c +++ b/drivers/net/ethernet/broadcom/asp2/bcmasp_intf.c @@ -818,6 +818,9 @@ static void bcmasp_init_tx(struct bcmasp_intf *intf) /* Tx SPB */ tx_spb_ctrl_wl(intf, ((intf->channel + 8) << TX_SPB_CTRL_XF_BID_SHIFT), TX_SPB_CTRL_XF_CTRL2); + + if (intf->parent->tx_chan_offset) + tx_pause_ctrl_wl(intf, (1 << (intf->channel + 8)), TX_PAUSE_MAP_VECTOR); tx_spb_top_wl(intf, 0x1e, TX_SPB_TOP_BLKOUT); tx_spb_dma_wq(intf, intf->tx_spb_dma_addr, TX_SPB_DMA_READ); From 6c4a92d07b0850342d3becf2e608f805e972467c Mon Sep 17 00:00:00 2001 From: "Kito Xu (veritas501)" Date: Thu, 17 Jul 2025 01:28:43 +0000 Subject: [PATCH 23/34] net: appletalk: Fix use-after-free in AARP proxy probe MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The AARP proxy‐probe routine (aarp_proxy_probe_network) sends a probe, releases the aarp_lock, sleeps, then re-acquires the lock. During that window an expire timer thread (__aarp_expire_timer) can remove and kfree() the same entry, leading to a use-after-free. race condition: cpu 0 | cpu 1 atalk_sendmsg() | atif_proxy_probe_device() aarp_send_ddp() | aarp_proxy_probe_network() mod_timer() | lock(aarp_lock) // LOCK!! timeout around 200ms | alloc(aarp_entry) and then call | proxies[hash] = aarp_entry aarp_expire_timeout() | aarp_send_probe() | unlock(aarp_lock) // UNLOCK!! lock(aarp_lock) // LOCK!! | msleep(100); __aarp_expire_timer(&proxies[ct]) | free(aarp_entry) | unlock(aarp_lock) // UNLOCK!! | | lock(aarp_lock) // LOCK!! | UAF aarp_entry !! ================================================================== BUG: KASAN: slab-use-after-free in aarp_proxy_probe_network+0x560/0x630 net/appletalk/aarp.c:493 Read of size 4 at addr ffff8880123aa360 by task repro/13278 CPU: 3 UID: 0 PID: 13278 Comm: repro Not tainted 6.15.2 #3 PREEMPT(full) Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x116/0x1b0 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:408 [inline] print_report+0xc1/0x630 mm/kasan/report.c:521 kasan_report+0xca/0x100 mm/kasan/report.c:634 aarp_proxy_probe_network+0x560/0x630 net/appletalk/aarp.c:493 atif_proxy_probe_device net/appletalk/ddp.c:332 [inline] atif_ioctl+0xb58/0x16c0 net/appletalk/ddp.c:857 atalk_ioctl+0x198/0x2f0 net/appletalk/ddp.c:1818 sock_do_ioctl+0xdc/0x260 net/socket.c:1190 sock_ioctl+0x239/0x6a0 net/socket.c:1311 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:906 [inline] __se_sys_ioctl fs/ioctl.c:892 [inline] __x64_sys_ioctl+0x194/0x200 fs/ioctl.c:892 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xcb/0x250 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f Allocated: aarp_alloc net/appletalk/aarp.c:382 [inline] aarp_proxy_probe_network+0xd8/0x630 net/appletalk/aarp.c:468 atif_proxy_probe_device net/appletalk/ddp.c:332 [inline] atif_ioctl+0xb58/0x16c0 net/appletalk/ddp.c:857 atalk_ioctl+0x198/0x2f0 net/appletalk/ddp.c:1818 Freed: kfree+0x148/0x4d0 mm/slub.c:4841 __aarp_expire net/appletalk/aarp.c:90 [inline] __aarp_expire_timer net/appletalk/aarp.c:261 [inline] aarp_expire_timeout+0x480/0x6e0 net/appletalk/aarp.c:317 The buggy address belongs to the object at ffff8880123aa300 which belongs to the cache kmalloc-192 of size 192 The buggy address is located 96 bytes inside of freed 192-byte region [ffff8880123aa300, ffff8880123aa3c0) Memory state around the buggy address: ffff8880123aa200: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ffff8880123aa280: 00 00 00 00 fc fc fc fc fc fc fc fc fc fc fc fc >ffff8880123aa300: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff8880123aa380: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc ffff8880123aa400: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ================================================================== Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Kito Xu (veritas501) Link: https://patch.msgid.link/20250717012843.880423-1-hxzene@gmail.com Signed-off-by: Jakub Kicinski --- net/appletalk/aarp.c | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/net/appletalk/aarp.c b/net/appletalk/aarp.c index 9c787e2e4b17..4744e3fd4544 100644 --- a/net/appletalk/aarp.c +++ b/net/appletalk/aarp.c @@ -35,6 +35,7 @@ #include #include #include +#include int sysctl_aarp_expiry_time = AARP_EXPIRY_TIME; int sysctl_aarp_tick_time = AARP_TICK_TIME; @@ -44,6 +45,7 @@ int sysctl_aarp_resolve_time = AARP_RESOLVE_TIME; /* Lists of aarp entries */ /** * struct aarp_entry - AARP entry + * @refcnt: Reference count * @last_sent: Last time we xmitted the aarp request * @packet_queue: Queue of frames wait for resolution * @status: Used for proxy AARP @@ -55,6 +57,7 @@ int sysctl_aarp_resolve_time = AARP_RESOLVE_TIME; * @next: Next entry in chain */ struct aarp_entry { + refcount_t refcnt; /* These first two are only used for unresolved entries */ unsigned long last_sent; struct sk_buff_head packet_queue; @@ -79,6 +82,17 @@ static DEFINE_RWLOCK(aarp_lock); /* Used to walk the list and purge/kick entries. */ static struct timer_list aarp_timer; +static inline void aarp_entry_get(struct aarp_entry *a) +{ + refcount_inc(&a->refcnt); +} + +static inline void aarp_entry_put(struct aarp_entry *a) +{ + if (refcount_dec_and_test(&a->refcnt)) + kfree(a); +} + /* * Delete an aarp queue * @@ -87,7 +101,7 @@ static struct timer_list aarp_timer; static void __aarp_expire(struct aarp_entry *a) { skb_queue_purge(&a->packet_queue); - kfree(a); + aarp_entry_put(a); } /* @@ -380,9 +394,11 @@ static void aarp_purge(void) static struct aarp_entry *aarp_alloc(void) { struct aarp_entry *a = kmalloc(sizeof(*a), GFP_ATOMIC); + if (!a) + return NULL; - if (a) - skb_queue_head_init(&a->packet_queue); + refcount_set(&a->refcnt, 1); + skb_queue_head_init(&a->packet_queue); return a; } @@ -477,6 +493,7 @@ int aarp_proxy_probe_network(struct atalk_iface *atif, struct atalk_addr *sa) entry->dev = atif->dev; write_lock_bh(&aarp_lock); + aarp_entry_get(entry); hash = sa->s_node % (AARP_HASH_SIZE - 1); entry->next = proxies[hash]; @@ -502,6 +519,7 @@ int aarp_proxy_probe_network(struct atalk_iface *atif, struct atalk_addr *sa) retval = 1; } + aarp_entry_put(entry); write_unlock_bh(&aarp_lock); out: return retval; From b03f15c0192b184078206760c839054ae6eb4eaa Mon Sep 17 00:00:00 2001 From: Praveen Kaligineedi Date: Thu, 17 Jul 2025 19:20:24 +0000 Subject: [PATCH 24/34] gve: Fix stuck TX queue for DQ queue format gve_tx_timeout was calculating missed completions in a way that is only relevant in the GQ queue format. Additionally, it was attempting to disable device interrupts, which is not needed in either GQ or DQ queue formats. As a result, TX timeouts with the DQ queue format likely would have triggered early resets without kicking the queue at all. This patch drops the check for pending work altogether and always kicks the queue after validating the queue has not seen a TX timeout too recently. Cc: stable@vger.kernel.org Fixes: 87a7f321bb6a ("gve: Recover from queue stall due to missed IRQ") Co-developed-by: Tim Hostetler Signed-off-by: Tim Hostetler Signed-off-by: Praveen Kaligineedi Signed-off-by: Harshitha Ramamurthy Link: https://patch.msgid.link/20250717192024.1820931-1-hramamurthy@google.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/google/gve/gve_main.c | 67 ++++++++++++---------- 1 file changed, 37 insertions(+), 30 deletions(-) diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c index dc35a23ec47f..d1aeb722d48f 100644 --- a/drivers/net/ethernet/google/gve/gve_main.c +++ b/drivers/net/ethernet/google/gve/gve_main.c @@ -1917,49 +1917,56 @@ static void gve_turnup_and_check_status(struct gve_priv *priv) gve_handle_link_status(priv, GVE_DEVICE_STATUS_LINK_STATUS_MASK & status); } -static void gve_tx_timeout(struct net_device *dev, unsigned int txqueue) +static struct gve_notify_block *gve_get_tx_notify_block(struct gve_priv *priv, + unsigned int txqueue) { - struct gve_notify_block *block; - struct gve_tx_ring *tx = NULL; - struct gve_priv *priv; - u32 last_nic_done; - u32 current_time; u32 ntfy_idx; - netdev_info(dev, "Timeout on tx queue, %d", txqueue); - priv = netdev_priv(dev); if (txqueue > priv->tx_cfg.num_queues) - goto reset; + return NULL; ntfy_idx = gve_tx_idx_to_ntfy(priv, txqueue); if (ntfy_idx >= priv->num_ntfy_blks) - goto reset; + return NULL; - block = &priv->ntfy_blocks[ntfy_idx]; - tx = block->tx; + return &priv->ntfy_blocks[ntfy_idx]; +} + +static bool gve_tx_timeout_try_q_kick(struct gve_priv *priv, + unsigned int txqueue) +{ + struct gve_notify_block *block; + u32 current_time; + + block = gve_get_tx_notify_block(priv, txqueue); + + if (!block) + return false; current_time = jiffies_to_msecs(jiffies); - if (tx->last_kick_msec + MIN_TX_TIMEOUT_GAP > current_time) - goto reset; + if (block->tx->last_kick_msec + MIN_TX_TIMEOUT_GAP > current_time) + return false; - /* Check to see if there are missed completions, which will allow us to - * kick the queue. - */ - last_nic_done = gve_tx_load_event_counter(priv, tx); - if (last_nic_done - tx->done) { - netdev_info(dev, "Kicking queue %d", txqueue); - iowrite32be(GVE_IRQ_MASK, gve_irq_doorbell(priv, block)); - napi_schedule(&block->napi); - tx->last_kick_msec = current_time; - goto out; - } // Else reset. + netdev_info(priv->dev, "Kicking queue %d", txqueue); + napi_schedule(&block->napi); + block->tx->last_kick_msec = current_time; + return true; +} -reset: - gve_schedule_reset(priv); +static void gve_tx_timeout(struct net_device *dev, unsigned int txqueue) +{ + struct gve_notify_block *block; + struct gve_priv *priv; -out: - if (tx) - tx->queue_timeout++; + netdev_info(dev, "Timeout on tx queue, %d", txqueue); + priv = netdev_priv(dev); + + if (!gve_tx_timeout_try_q_kick(priv, txqueue)) + gve_schedule_reset(priv); + + block = gve_get_tx_notify_block(priv, txqueue); + if (block) + block->tx->queue_timeout++; priv->tx_timeo_cnt++; } From cf074eca0065bc5142e6004ae236bb35a2687fdf Mon Sep 17 00:00:00 2001 From: Xiang Mei Date: Thu, 17 Jul 2025 16:01:28 -0700 Subject: [PATCH 25/34] net/sched: sch_qfq: Avoid triggering might_sleep in atomic context in qfq_delete_class might_sleep could be trigger in the atomic context in qfq_delete_class. qfq_destroy_class was moved into atomic context locked by sch_tree_lock to avoid a race condition bug on qfq_aggregate. However, might_sleep could be triggered by qfq_destroy_class, which introduced sleeping in atomic context (path: qfq_destroy_class->qdisc_put->__qdisc_destroy->lockdep_unregister_key ->might_sleep). Considering the race is on the qfq_aggregate objects, keeping qfq_rm_from_agg in the lock but moving the left part out can solve this issue. Fixes: 5e28d5a3f774 ("net/sched: sch_qfq: Fix race condition on qfq_aggregate") Reported-by: Dan Carpenter Signed-off-by: Xiang Mei Link: https://patch.msgid.link/4a04e0cc-a64b-44e7-9213-2880ed641d77@sabinyo.mountain Reviewed-by: Cong Wang Reviewed-by: Dan Carpenter Link: https://patch.msgid.link/20250717230128.159766-1-xmei5@asu.edu Signed-off-by: Paolo Abeni --- net/sched/sch_qfq.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index f0eb70353744..2255355e51d3 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -536,9 +536,6 @@ destroy_class: static void qfq_destroy_class(struct Qdisc *sch, struct qfq_class *cl) { - struct qfq_sched *q = qdisc_priv(sch); - - qfq_rm_from_agg(q, cl); gen_kill_estimator(&cl->rate_est); qdisc_put(cl->qdisc); kfree(cl); @@ -559,10 +556,11 @@ static int qfq_delete_class(struct Qdisc *sch, unsigned long arg, qdisc_purge_queue(cl->qdisc); qdisc_class_hash_remove(&q->clhash, &cl->common); - qfq_destroy_class(sch, cl); + qfq_rm_from_agg(q, cl); sch_tree_unlock(sch); + qfq_destroy_class(sch, cl); return 0; } @@ -1503,6 +1501,7 @@ static void qfq_destroy_qdisc(struct Qdisc *sch) for (i = 0; i < q->clhash.hashsize; i++) { hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i], common.hnode) { + qfq_rm_from_agg(q, cl); qfq_destroy_class(sch, cl); } } From c1f3f9797c1f44a762e6f5f72520b2e520537b52 Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Tue, 15 Jul 2025 22:35:46 +0200 Subject: [PATCH 26/34] can: netlink: can_changelink(): fix NULL pointer deref of struct can_priv::do_set_mode Andrei Lalaev reported a NULL pointer deref when a CAN device is restarted from Bus Off and the driver does not implement the struct can_priv::do_set_mode callback. There are 2 code path that call struct can_priv::do_set_mode: - directly by a manual restart from the user space, via can_changelink() - delayed automatic restart after bus off (deactivated by default) To prevent the NULL pointer deference, refuse a manual restart or configure the automatic restart delay in can_changelink() and report the error via extack to user space. As an additional safety measure let can_restart() return an error if can_priv::do_set_mode is not set instead of dereferencing it unchecked. Reported-by: Andrei Lalaev Closes: https://lore.kernel.org/all/20250714175520.307467-1-andrey.lalaev@gmail.com Fixes: 39549eef3587 ("can: CAN Network device driver and Netlink interface") Link: https://patch.msgid.link/20250718-fix-nullptr-deref-do_set_mode-v1-1-0b520097bb96@pengutronix.de Signed-off-by: Marc Kleine-Budde --- drivers/net/can/dev/dev.c | 12 +++++++++--- drivers/net/can/dev/netlink.c | 12 ++++++++++++ 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/drivers/net/can/dev/dev.c b/drivers/net/can/dev/dev.c index ea8c807af4d8..3913971125de 100644 --- a/drivers/net/can/dev/dev.c +++ b/drivers/net/can/dev/dev.c @@ -145,13 +145,16 @@ void can_change_state(struct net_device *dev, struct can_frame *cf, EXPORT_SYMBOL_GPL(can_change_state); /* CAN device restart for bus-off recovery */ -static void can_restart(struct net_device *dev) +static int can_restart(struct net_device *dev) { struct can_priv *priv = netdev_priv(dev); struct sk_buff *skb; struct can_frame *cf; int err; + if (!priv->do_set_mode) + return -EOPNOTSUPP; + if (netif_carrier_ok(dev)) netdev_err(dev, "Attempt to restart for bus-off recovery, but carrier is OK?\n"); @@ -173,10 +176,14 @@ static void can_restart(struct net_device *dev) if (err) { netdev_err(dev, "Restart failed, error %pe\n", ERR_PTR(err)); netif_carrier_off(dev); + + return err; } else { netdev_dbg(dev, "Restarted\n"); priv->can_stats.restarts++; } + + return 0; } static void can_restart_work(struct work_struct *work) @@ -201,9 +208,8 @@ int can_restart_now(struct net_device *dev) return -EBUSY; cancel_delayed_work_sync(&priv->restart_work); - can_restart(dev); - return 0; + return can_restart(dev); } /* CAN bus-off diff --git a/drivers/net/can/dev/netlink.c b/drivers/net/can/dev/netlink.c index a36842ace084..f0e3f0d538fb 100644 --- a/drivers/net/can/dev/netlink.c +++ b/drivers/net/can/dev/netlink.c @@ -285,6 +285,12 @@ static int can_changelink(struct net_device *dev, struct nlattr *tb[], } if (data[IFLA_CAN_RESTART_MS]) { + if (!priv->do_set_mode) { + NL_SET_ERR_MSG(extack, + "Device doesn't support restart from Bus Off"); + return -EOPNOTSUPP; + } + /* Do not allow changing restart delay while running */ if (dev->flags & IFF_UP) return -EBUSY; @@ -292,6 +298,12 @@ static int can_changelink(struct net_device *dev, struct nlattr *tb[], } if (data[IFLA_CAN_RESTART]) { + if (!priv->do_set_mode) { + NL_SET_ERR_MSG(extack, + "Device doesn't support restart from Bus Off"); + return -EOPNOTSUPP; + } + /* Do not allow a restart while not running */ if (!(dev->flags & IFF_UP)) return -EINVAL; From dca56cc8b5c3bee889d6cb0d2ee3e933b21cb4ec Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 22 Jul 2025 00:36:49 +0200 Subject: [PATCH 27/34] selftests: netfilter: tone-down conntrack clash test The test is supposed to observe that the 'clash_resolve' stat counter incremented (i.e., the code path was covered). This check was incorrect, 'conntrack -S' needs to be called in the revevant namespace, not the initial netns. The clash resolution logic in conntrack is only exercised when multiple packets with the same udp quadruple race. Depending on kernel config, number of CPUs, scheduling policy etc. this might not trigger even after several retries. Thus the script eventually returns SKIP if the retry count is exceeded. The udpclash tool with also exit with a failure if it did not observe the expected number of replies. In the script, make a note of this but do not fail anymore, just check if the clash resolution logic triggered after all. Remove the 'single-core' test: while unlikely, with preemptible kernel it should be possible to also trigger clash resolution logic. With this change the test will either SKIP or pass. Hard error could be restored later once its clear whats going on, so also dump 'conntrack -S' when some packets went missing to see if conntrack dropped them on insert. Fixes: 78a588363587 ("selftests: netfilter: add conntrack clash resolution test case") Signed-off-by: Florian Westphal Link: https://patch.msgid.link/20250721223652.6956-1-fw@strlen.de Signed-off-by: Jakub Kicinski --- .../net/netfilter/conntrack_clash.sh | 45 +++++++++---------- 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/tools/testing/selftests/net/netfilter/conntrack_clash.sh b/tools/testing/selftests/net/netfilter/conntrack_clash.sh index 3712c1b9b38b..606a43a60f73 100755 --- a/tools/testing/selftests/net/netfilter/conntrack_clash.sh +++ b/tools/testing/selftests/net/netfilter/conntrack_clash.sh @@ -93,32 +93,28 @@ ping_test() run_one_clash_test() { local ns="$1" - local daddr="$2" - local dport="$3" + local ctns="$2" + local daddr="$3" + local dport="$4" local entries local cre if ! ip netns exec "$ns" ./udpclash $daddr $dport;then - echo "FAIL: did not receive expected number of replies for $daddr:$dport" - ret=1 - return 1 + echo "INFO: did not receive expected number of replies for $daddr:$dport" + ip netns exec "$ctns" conntrack -S + # don't fail: check if clash resolution triggered after all. fi - entries=$(conntrack -S | wc -l) - cre=$(conntrack -S | grep -v "clash_resolve=0" | wc -l) + entries=$(ip netns exec "$ctns" conntrack -S | wc -l) + cre=$(ip netns exec "$ctns" conntrack -S | grep "clash_resolve=0" | wc -l) - if [ "$cre" -ne "$entries" ] ;then + if [ "$cre" -ne "$entries" ];then clash_resolution_active=1 return 0 fi - # 1 cpu -> parallel insertion impossible - if [ "$entries" -eq 1 ]; then - return 0 - fi - - # not a failure: clash resolution logic did not trigger, but all replies - # were received. With right timing, xmit completed sequentially and + # not a failure: clash resolution logic did not trigger. + # With right timing, xmit completed sequentially and # no parallel insertion occurs. return $ksft_skip } @@ -126,20 +122,23 @@ run_one_clash_test() run_clash_test() { local ns="$1" - local daddr="$2" - local dport="$3" + local ctns="$2" + local daddr="$3" + local dport="$4" + local softerr=0 for i in $(seq 1 10);do - run_one_clash_test "$ns" "$daddr" "$dport" + run_one_clash_test "$ns" "$ctns" "$daddr" "$dport" local rv=$? if [ $rv -eq 0 ];then echo "PASS: clash resolution test for $daddr:$dport on attempt $i" return 0 - elif [ $rv -eq 1 ];then - echo "FAIL: clash resolution test for $daddr:$dport on attempt $i" - return 1 + elif [ $rv -eq $ksft_skip ]; then + softerr=1 fi done + + [ $softerr -eq 1 ] && echo "SKIP: clash resolution for $daddr:$dport did not trigger" } ip link add veth0 netns "$nsclient1" type veth peer name veth0 netns "$nsrouter" @@ -161,11 +160,11 @@ spawn_servers "$nsclient2" # exercise clash resolution with nat: # nsrouter is supposed to dnat to 10.0.2.1:900{0,1,2,3}. -run_clash_test "$nsclient1" 10.0.1.99 "$dport" +run_clash_test "$nsclient1" "$nsrouter" 10.0.1.99 "$dport" # exercise clash resolution without nat. load_simple_ruleset "$nsclient2" -run_clash_test "$nsclient2" 127.0.0.1 9001 +run_clash_test "$nsclient2" "$nsclient2" 127.0.0.1 9001 if [ $clash_resolution_active -eq 0 ];then [ "$ret" -eq 0 ] && ret=$ksft_skip From 14822f782700a094baa9b5966cc047a9b1b31701 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 22 Jul 2025 09:56:49 -0700 Subject: [PATCH 28/34] MAINTAINERS: Add in6.h to MAINTAINERS My CC-adding automation returned nothing on a future patch to the include/linux/in6.h file, and I went looking for why. Add the missed in6.h to MAINTAINERS. Signed-off-by: Kees Cook Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250722165645.work.047-kees@kernel.org Signed-off-by: Jakub Kicinski --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 60bba48f5479..f31380451431 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -17384,6 +17384,7 @@ F: include/linux/ethtool.h F: include/linux/framer/framer-provider.h F: include/linux/framer/framer.h F: include/linux/in.h +F: include/linux/in6.h F: include/linux/indirect_call_wrapper.h F: include/linux/inet.h F: include/linux/inet_diag.h From 86941382508850d58c11bdafe0fec646dfd31b09 Mon Sep 17 00:00:00 2001 From: Nimrod Oren Date: Tue, 22 Jul 2025 15:26:55 +0300 Subject: [PATCH 29/34] selftests: drv-net: wait for iperf client to stop sending A few packets may still be sent out during the termination of iperf processes. These late packets cause failures in rss_ctx.py when they arrive on queues expected to be empty. Example failure observed: Check failed 2 != 0 traffic on inactive queues (context 1): [0, 0, 1, 1, 386385, 397196, 0, 0, 0, 0, ...] Check failed 4 != 0 traffic on inactive queues (context 2): [0, 0, 0, 0, 2, 2, 247152, 253013, 0, 0, ...] Check failed 2 != 0 traffic on inactive queues (context 3): [0, 0, 0, 0, 0, 0, 1, 1, 282434, 283070, ...] To avoid such failures, wait until all client sockets for the requested port are either closed or in the TIME_WAIT state. Fixes: 847aa551fa78 ("selftests: drv-net: rss_ctx: factor out send traffic and check") Signed-off-by: Nimrod Oren Reviewed-by: Gal Pressman Reviewed-by: Carolina Jubran Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250722122655.3194442-1-noren@nvidia.com Signed-off-by: Jakub Kicinski --- .../selftests/drivers/net/lib/py/load.py | 23 +++++++++++++++---- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/drivers/net/lib/py/load.py b/tools/testing/selftests/drivers/net/lib/py/load.py index d9c10613ae67..44151b7b1a24 100644 --- a/tools/testing/selftests/drivers/net/lib/py/load.py +++ b/tools/testing/selftests/drivers/net/lib/py/load.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 +import re import time from lib.py import ksft_pr, cmd, ip, rand_port, wait_port_listen @@ -10,12 +11,11 @@ class GenerateTraffic: self.env = env - if port is None: - port = rand_port() - self._iperf_server = cmd(f"iperf3 -s -1 -p {port}", background=True) - wait_port_listen(port) + self.port = rand_port() if port is None else port + self._iperf_server = cmd(f"iperf3 -s -1 -p {self.port}", background=True) + wait_port_listen(self.port) time.sleep(0.1) - self._iperf_client = cmd(f"iperf3 -c {env.addr} -P 16 -p {port} -t 86400", + self._iperf_client = cmd(f"iperf3 -c {env.addr} -P 16 -p {self.port} -t 86400", background=True, host=env.remote) # Wait for traffic to ramp up @@ -56,3 +56,16 @@ class GenerateTraffic: ksft_pr(">> Server:") ksft_pr(self._iperf_server.stdout) ksft_pr(self._iperf_server.stderr) + self._wait_client_stopped() + + def _wait_client_stopped(self, sleep=0.005, timeout=5): + end = time.monotonic() + timeout + + live_port_pattern = re.compile(fr":{self.port:04X} 0[^6] ") + + while time.monotonic() < end: + data = cmd("cat /proc/net/tcp*", host=self.env.remote).stdout + if not live_port_pattern.search(data): + return + time.sleep(sleep) + raise Exception(f"Waiting for client to stop timed out after {timeout}s") From 897e8601b9cff1d054cdd53047f568b0e1995726 Mon Sep 17 00:00:00 2001 From: Halil Pasic Date: Tue, 22 Jul 2025 18:18:17 +0200 Subject: [PATCH 30/34] s390/ism: fix concurrency management in ism_cmd() The s390x ISM device data sheet clearly states that only one request-response sequence is allowable per ISM function at any point in time. Unfortunately as of today the s390/ism driver in Linux does not honor that requirement. This patch aims to rectify that. This problem was discovered based on Aliaksei's bug report which states that for certain workloads the ISM functions end up entering error state (with PEC 2 as seen from the logs) after a while and as a consequence connections handled by the respective function break, and for future connection requests the ISM device is not considered -- given it is in a dysfunctional state. During further debugging PEC 3A was observed as well. A kernel message like [ 1211.244319] zpci: 061a:00:00.0: Event 0x2 reports an error for PCI function 0x61a is a reliable indicator of the stated function entering error state with PEC 2. Let me also point out that a kernel message like [ 1211.244325] zpci: 061a:00:00.0: The ism driver bound to the device does not support error recovery is a reliable indicator that the ISM function won't be auto-recovered because the ISM driver currently lacks support for it. On a technical level, without this synchronization, commands (inputs to the FW) may be partially or fully overwritten (corrupted) by another CPU trying to issue commands on the same function. There is hard evidence that this can lead to DMB token values being used as DMB IOVAs, leading to PEC 2 PCI events indicating invalid DMA. But this is only one of the failure modes imaginable. In theory even completely losing one command and executing another one twice and then trying to interpret the outputs as if the command we intended to execute was actually executed and not the other one is also possible. Frankly, I don't feel confident about providing an exhaustive list of possible consequences. Fixes: 684b89bc39ce ("s390/ism: add device driver for internal shared memory") Reported-by: Aliaksei Makarau Tested-by: Mahanta Jambigi Tested-by: Aliaksei Makarau Signed-off-by: Halil Pasic Reviewed-by: Alexandra Winter Signed-off-by: Alexandra Winter Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250722161817.1298473-1-wintera@linux.ibm.com Signed-off-by: Paolo Abeni --- drivers/s390/net/ism_drv.c | 3 +++ include/linux/ism.h | 1 + 2 files changed, 4 insertions(+) diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c index b7f15f303ea2..967dc4f9eea8 100644 --- a/drivers/s390/net/ism_drv.c +++ b/drivers/s390/net/ism_drv.c @@ -130,6 +130,7 @@ static int ism_cmd(struct ism_dev *ism, void *cmd) struct ism_req_hdr *req = cmd; struct ism_resp_hdr *resp = cmd; + spin_lock(&ism->cmd_lock); __ism_write_cmd(ism, req + 1, sizeof(*req), req->len - sizeof(*req)); __ism_write_cmd(ism, req, 0, sizeof(*req)); @@ -143,6 +144,7 @@ static int ism_cmd(struct ism_dev *ism, void *cmd) } __ism_read_cmd(ism, resp + 1, sizeof(*resp), resp->len - sizeof(*resp)); out: + spin_unlock(&ism->cmd_lock); return resp->ret; } @@ -606,6 +608,7 @@ static int ism_probe(struct pci_dev *pdev, const struct pci_device_id *id) return -ENOMEM; spin_lock_init(&ism->lock); + spin_lock_init(&ism->cmd_lock); dev_set_drvdata(&pdev->dev, ism); ism->pdev = pdev; ism->dev.parent = &pdev->dev; diff --git a/include/linux/ism.h b/include/linux/ism.h index 5428edd90982..8358b4cd7ba6 100644 --- a/include/linux/ism.h +++ b/include/linux/ism.h @@ -28,6 +28,7 @@ struct ism_dmb { struct ism_dev { spinlock_t lock; /* protects the ism device */ + spinlock_t cmd_lock; /* serializes cmds */ struct list_head list; struct pci_dev *pdev; From 4555f8f8b6aa46940f55feb6a07704c2935b6d6e Mon Sep 17 00:00:00 2001 From: Jian Shen Date: Tue, 22 Jul 2025 20:54:20 +0800 Subject: [PATCH 31/34] net: hns3: fix concurrent setting vlan filter issue The vport->req_vlan_fltr_en may be changed concurrently by function hclge_sync_vlan_fltr_state() called in periodic work task and function hclge_enable_vport_vlan_filter() called by user configuration. It may cause the user configuration inoperative. Fixes it by protect the vport->req_vlan_fltr by vport_lock. Fixes: 2ba306627f59 ("net: hns3: add support for modify VLAN filter state") Signed-off-by: Jian Shen Signed-off-by: Jijie Shao Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250722125423.1270673-2-shaojijie@huawei.com Signed-off-by: Paolo Abeni --- .../hisilicon/hns3/hns3pf/hclge_main.c | 36 +++++++++++-------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index a7de67699a01..30bdec1acb57 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -9576,33 +9576,36 @@ static bool hclge_need_enable_vport_vlan_filter(struct hclge_vport *vport) return false; } -int hclge_enable_vport_vlan_filter(struct hclge_vport *vport, bool request_en) +static int __hclge_enable_vport_vlan_filter(struct hclge_vport *vport, + bool request_en) { - struct hclge_dev *hdev = vport->back; bool need_en; int ret; - mutex_lock(&hdev->vport_lock); - - vport->req_vlan_fltr_en = request_en; - need_en = hclge_need_enable_vport_vlan_filter(vport); - if (need_en == vport->cur_vlan_fltr_en) { - mutex_unlock(&hdev->vport_lock); + if (need_en == vport->cur_vlan_fltr_en) return 0; - } ret = hclge_set_vport_vlan_filter(vport, need_en); - if (ret) { - mutex_unlock(&hdev->vport_lock); + if (ret) return ret; - } vport->cur_vlan_fltr_en = need_en; + return 0; +} + +int hclge_enable_vport_vlan_filter(struct hclge_vport *vport, bool request_en) +{ + struct hclge_dev *hdev = vport->back; + int ret; + + mutex_lock(&hdev->vport_lock); + vport->req_vlan_fltr_en = request_en; + ret = __hclge_enable_vport_vlan_filter(vport, request_en); mutex_unlock(&hdev->vport_lock); - return 0; + return ret; } static int hclge_enable_vlan_filter(struct hnae3_handle *handle, bool enable) @@ -10623,16 +10626,19 @@ static void hclge_sync_vlan_fltr_state(struct hclge_dev *hdev) &vport->state)) continue; - ret = hclge_enable_vport_vlan_filter(vport, - vport->req_vlan_fltr_en); + mutex_lock(&hdev->vport_lock); + ret = __hclge_enable_vport_vlan_filter(vport, + vport->req_vlan_fltr_en); if (ret) { dev_err(&hdev->pdev->dev, "failed to sync vlan filter state for vport%u, ret = %d\n", vport->vport_id, ret); set_bit(HCLGE_VPORT_STATE_VLAN_FLTR_CHANGE, &vport->state); + mutex_unlock(&hdev->vport_lock); return; } + mutex_unlock(&hdev->vport_lock); } } From cde304655f25d94a996c45b0f9956e7dcc2bc4c0 Mon Sep 17 00:00:00 2001 From: Yonglong Liu Date: Tue, 22 Jul 2025 20:54:21 +0800 Subject: [PATCH 32/34] net: hns3: disable interrupt when ptp init failed When ptp init failed, we'd better disable the interrupt and clear the flag, to avoid early report interrupt at next probe. Fixes: 0bf5eb788512 ("net: hns3: add support for PTP") Signed-off-by: Yonglong Liu Signed-off-by: Jijie Shao Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250722125423.1270673-3-shaojijie@huawei.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_ptp.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_ptp.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_ptp.c index ec581d4b696f..4bd52eab3914 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_ptp.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_ptp.c @@ -497,14 +497,14 @@ int hclge_ptp_init(struct hclge_dev *hdev) if (ret) { dev_err(&hdev->pdev->dev, "failed to init freq, ret = %d\n", ret); - goto out; + goto out_clear_int; } ret = hclge_ptp_set_ts_mode(hdev, &hdev->ptp->ts_cfg); if (ret) { dev_err(&hdev->pdev->dev, "failed to init ts mode, ret = %d\n", ret); - goto out; + goto out_clear_int; } ktime_get_real_ts64(&ts); @@ -512,7 +512,7 @@ int hclge_ptp_init(struct hclge_dev *hdev) if (ret) { dev_err(&hdev->pdev->dev, "failed to init ts time, ret = %d\n", ret); - goto out; + goto out_clear_int; } set_bit(HCLGE_STATE_PTP_EN, &hdev->state); @@ -520,6 +520,9 @@ int hclge_ptp_init(struct hclge_dev *hdev) return 0; +out_clear_int: + clear_bit(HCLGE_PTP_FLAG_EN, &hdev->ptp->flags); + hclge_ptp_int_en(hdev, false); out: hclge_ptp_destroy_clock(hdev); From b3e75c0bcc53f647311960bc1b0970b9b480ca5a Mon Sep 17 00:00:00 2001 From: Jian Shen Date: Tue, 22 Jul 2025 20:54:22 +0800 Subject: [PATCH 33/34] net: hns3: fixed vf get max channels bug Currently, the queried maximum of vf channels is the maximum of channels supported by each TC. However, the actual maximum of channels is the maximum of channels supported by the device. Fixes: 849e46077689 ("net: hns3: add ethtool_ops.get_channels support for VF") Signed-off-by: Jian Shen Signed-off-by: Hao Lan Signed-off-by: Jijie Shao Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250722125423.1270673-4-shaojijie@huawei.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c index c4f35e8e2177..4cf6ac04662a 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c @@ -3094,11 +3094,7 @@ static void hclgevf_uninit_ae_dev(struct hnae3_ae_dev *ae_dev) static u32 hclgevf_get_max_channels(struct hclgevf_dev *hdev) { - struct hnae3_handle *nic = &hdev->nic; - struct hnae3_knic_private_info *kinfo = &nic->kinfo; - - return min_t(u32, hdev->rss_size_max, - hdev->num_tqps / kinfo->tc_info.num_tc); + return min(hdev->rss_size_max, hdev->num_tqps); } /** From 49ade8630f36e9dca2395592cfb0b7deeb07e746 Mon Sep 17 00:00:00 2001 From: Jijie Shao Date: Tue, 22 Jul 2025 20:54:23 +0800 Subject: [PATCH 34/34] net: hns3: default enable tx bounce buffer when smmu enabled The SMMU engine on HIP09 chip has a hardware issue. SMMU pagetable prefetch features may prefetch and use a invalid PTE even the PTE is valid at that time. This will cause the device trigger fake pagefaults. The solution is to avoid prefetching by adding a SYNC command when smmu mapping a iova. But the performance of nic has a sharp drop. Then we do this workaround, always enable tx bounce buffer, avoid mapping/unmapping on TX path. This issue only affects HNS3, so we always enable tx bounce buffer when smmu enabled to improve performance. Fixes: 295ba232a8c3 ("net: hns3: add device version to replace pci revision") Signed-off-by: Jian Shen Signed-off-by: Jijie Shao Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250722125423.1270673-5-shaojijie@huawei.com Signed-off-by: Paolo Abeni --- .../net/ethernet/hisilicon/hns3/hns3_enet.c | 31 +++++++++++++++++++ .../net/ethernet/hisilicon/hns3/hns3_enet.h | 2 ++ 2 files changed, 33 insertions(+) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index b03b8758c777..aaa803563bd2 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -1039,6 +1040,8 @@ static bool hns3_can_use_tx_sgl(struct hns3_enet_ring *ring, static void hns3_init_tx_spare_buffer(struct hns3_enet_ring *ring) { u32 alloc_size = ring->tqp->handle->kinfo.tx_spare_buf_size; + struct net_device *netdev = ring_to_netdev(ring); + struct hns3_nic_priv *priv = netdev_priv(netdev); struct hns3_tx_spare *tx_spare; struct page *page; dma_addr_t dma; @@ -1080,6 +1083,7 @@ static void hns3_init_tx_spare_buffer(struct hns3_enet_ring *ring) tx_spare->buf = page_address(page); tx_spare->len = PAGE_SIZE << order; ring->tx_spare = tx_spare; + ring->tx_copybreak = priv->tx_copybreak; return; dma_mapping_error: @@ -4874,6 +4878,30 @@ static void hns3_nic_dealloc_vector_data(struct hns3_nic_priv *priv) devm_kfree(&pdev->dev, priv->tqp_vector); } +static void hns3_update_tx_spare_buf_config(struct hns3_nic_priv *priv) +{ +#define HNS3_MIN_SPARE_BUF_SIZE (2 * 1024 * 1024) +#define HNS3_MAX_PACKET_SIZE (64 * 1024) + + struct iommu_domain *domain = iommu_get_domain_for_dev(priv->dev); + struct hnae3_ae_dev *ae_dev = hns3_get_ae_dev(priv->ae_handle); + struct hnae3_handle *handle = priv->ae_handle; + + if (ae_dev->dev_version < HNAE3_DEVICE_VERSION_V3) + return; + + if (!(domain && iommu_is_dma_domain(domain))) + return; + + priv->min_tx_copybreak = HNS3_MAX_PACKET_SIZE; + priv->min_tx_spare_buf_size = HNS3_MIN_SPARE_BUF_SIZE; + + if (priv->tx_copybreak < priv->min_tx_copybreak) + priv->tx_copybreak = priv->min_tx_copybreak; + if (handle->kinfo.tx_spare_buf_size < priv->min_tx_spare_buf_size) + handle->kinfo.tx_spare_buf_size = priv->min_tx_spare_buf_size; +} + static void hns3_ring_get_cfg(struct hnae3_queue *q, struct hns3_nic_priv *priv, unsigned int ring_type) { @@ -5107,6 +5135,7 @@ int hns3_init_all_ring(struct hns3_nic_priv *priv) int i, j; int ret; + hns3_update_tx_spare_buf_config(priv); for (i = 0; i < ring_num; i++) { ret = hns3_alloc_ring_memory(&priv->ring[i]); if (ret) { @@ -5311,6 +5340,8 @@ static int hns3_client_init(struct hnae3_handle *handle) priv->ae_handle = handle; priv->tx_timeout_count = 0; priv->max_non_tso_bd_num = ae_dev->dev_specs.max_non_tso_bd_num; + priv->min_tx_copybreak = 0; + priv->min_tx_spare_buf_size = 0; set_bit(HNS3_NIC_STATE_DOWN, &priv->state); handle->msg_enable = netif_msg_init(debug, DEFAULT_MSG_LEVEL); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h index d36c4ed16d8d..caf7a4df8585 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h @@ -596,6 +596,8 @@ struct hns3_nic_priv { struct hns3_enet_coalesce rx_coal; u32 tx_copybreak; u32 rx_copybreak; + u32 min_tx_copybreak; + u32 min_tx_spare_buf_size; }; union l3_hdr_info {