From ba5cb47b56e5d89f0a5eb5163ffbfda1e3e7cef0 Mon Sep 17 00:00:00 2001 From: Subbaraya Sundeep Date: Wed, 21 May 2025 16:00:43 +0530 Subject: [PATCH 01/21] octeontx2-af: Send Link events one by one Send link events one after another otherwise new message is overwriting the message which is being processed by PF. Fixes: a88e0f936ba9 ("octeontx2: Detect the mbox up or down message via register") Signed-off-by: Subbaraya Sundeep Reviewed-by: Michal Swiatkowski Reviewed-by: Simon Horman Link: https://patch.msgid.link/1747823443-404-1-git-send-email-sbhatta@marvell.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/marvell/octeontx2/af/mcs_rvu_if.c | 2 ++ drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c | 2 ++ drivers/net/ethernet/marvell/octeontx2/af/rvu_rep.c | 2 ++ 3 files changed, 6 insertions(+) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mcs_rvu_if.c b/drivers/net/ethernet/marvell/octeontx2/af/mcs_rvu_if.c index 655dd4726d36..0277d226293e 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/mcs_rvu_if.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/mcs_rvu_if.c @@ -143,6 +143,8 @@ static int mcs_notify_pfvf(struct mcs_intr_event *event, struct rvu *rvu) otx2_mbox_msg_send_up(&rvu->afpf_wq_info.mbox_up, pf); + otx2_mbox_wait_for_rsp(&rvu->afpf_wq_info.mbox_up, pf); + mutex_unlock(&rvu->mbox_lock); return 0; diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c index 992fa0b82e8d..ebb56eb0d18c 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c @@ -272,6 +272,8 @@ static void cgx_notify_pfs(struct cgx_link_event *event, struct rvu *rvu) otx2_mbox_msg_send_up(&rvu->afpf_wq_info.mbox_up, pfid); + otx2_mbox_wait_for_rsp(&rvu->afpf_wq_info.mbox_up, pfid); + mutex_unlock(&rvu->mbox_lock); } while (pfmap); } diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_rep.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_rep.c index 052ae5923e3a..32953cca108c 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_rep.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_rep.c @@ -60,6 +60,8 @@ static int rvu_rep_up_notify(struct rvu *rvu, struct rep_event *event) otx2_mbox_msg_send_up(&rvu->afpf_wq_info.mbox_up, pf); + otx2_mbox_wait_for_rsp(&rvu->afpf_wq_info.mbox_up, pf); + mutex_unlock(&rvu->mbox_lock); return 0; } From 45ca7e9f0730ae36fc610e675b990e9cc9ca0714 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Wed, 21 May 2025 14:17:05 +0200 Subject: [PATCH 02/21] vsock/virtio: fix `rx_bytes` accounting for stream sockets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In `struct virtio_vsock_sock`, we maintain two counters: - `rx_bytes`: used internally to track how many bytes have been read. This supports mechanisms like .stream_has_data() and sock_rcvlowat(). - `fwd_cnt`: used for the credit mechanism to inform available receive buffer space to the remote peer. These counters are updated via virtio_transport_inc_rx_pkt() and virtio_transport_dec_rx_pkt(). Since the beginning with commit 06a8fc78367d ("VSOCK: Introduce virtio_vsock_common.ko"), we call virtio_transport_dec_rx_pkt() in virtio_transport_stream_do_dequeue() only when we consume the entire packet, so partial reads, do not update `rx_bytes` and `fwd_cnt`. This is fine for `fwd_cnt`, because we still have space used for the entire packet, and we don't want to update the credit for the other peer until we free the space of the entire packet. However, this causes `rx_bytes` to be stale on partial reads. Previously, this didn’t cause issues because `rx_bytes` was used only by .stream_has_data(), and any unread portion of a packet implied data was still available. However, since commit 93b808876682 ("virtio/vsock: fix logic which reduces credit update messages"), we now rely on `rx_bytes` to determine if a credit update should be sent when the data in the RX queue drops below SO_RCVLOWAT value. This patch fixes the accounting by updating `rx_bytes` with the number of bytes actually read, even on partial reads, while leaving `fwd_cnt` untouched until the packet is fully consumed. Also introduce a new `buf_used` counter to check that the remote peer is honoring the given credit; this was previously done via `rx_bytes`. Fixes: 93b808876682 ("virtio/vsock: fix logic which reduces credit update messages") Signed-off-by: Stefano Garzarella Link: https://patch.msgid.link/20250521121705.196379-1-sgarzare@redhat.com Signed-off-by: Paolo Abeni --- include/linux/virtio_vsock.h | 1 + net/vmw_vsock/virtio_transport_common.c | 26 +++++++++++++++---------- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h index 0387d64e2c66..36fb3edfa403 100644 --- a/include/linux/virtio_vsock.h +++ b/include/linux/virtio_vsock.h @@ -140,6 +140,7 @@ struct virtio_vsock_sock { u32 last_fwd_cnt; u32 rx_bytes; u32 buf_alloc; + u32 buf_used; struct sk_buff_head rx_queue; u32 msg_count; }; diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 7f7de6d88096..2c9b1011cdcc 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -441,18 +441,20 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk, static bool virtio_transport_inc_rx_pkt(struct virtio_vsock_sock *vvs, u32 len) { - if (vvs->rx_bytes + len > vvs->buf_alloc) + if (vvs->buf_used + len > vvs->buf_alloc) return false; vvs->rx_bytes += len; + vvs->buf_used += len; return true; } static void virtio_transport_dec_rx_pkt(struct virtio_vsock_sock *vvs, - u32 len) + u32 bytes_read, u32 bytes_dequeued) { - vvs->rx_bytes -= len; - vvs->fwd_cnt += len; + vvs->rx_bytes -= bytes_read; + vvs->buf_used -= bytes_dequeued; + vvs->fwd_cnt += bytes_dequeued; } void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock *vvs, struct sk_buff *skb) @@ -581,11 +583,11 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk, size_t len) { struct virtio_vsock_sock *vvs = vsk->trans; - size_t bytes, total = 0; struct sk_buff *skb; u32 fwd_cnt_delta; bool low_rx_bytes; int err = -EFAULT; + size_t total = 0; u32 free_space; spin_lock_bh(&vvs->rx_lock); @@ -597,6 +599,8 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk, } while (total < len && !skb_queue_empty(&vvs->rx_queue)) { + size_t bytes, dequeued = 0; + skb = skb_peek(&vvs->rx_queue); bytes = min_t(size_t, len - total, @@ -620,12 +624,12 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk, VIRTIO_VSOCK_SKB_CB(skb)->offset += bytes; if (skb->len == VIRTIO_VSOCK_SKB_CB(skb)->offset) { - u32 pkt_len = le32_to_cpu(virtio_vsock_hdr(skb)->len); - - virtio_transport_dec_rx_pkt(vvs, pkt_len); + dequeued = le32_to_cpu(virtio_vsock_hdr(skb)->len); __skb_unlink(skb, &vvs->rx_queue); consume_skb(skb); } + + virtio_transport_dec_rx_pkt(vvs, bytes, dequeued); } fwd_cnt_delta = vvs->fwd_cnt - vvs->last_fwd_cnt; @@ -781,7 +785,7 @@ static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk, msg->msg_flags |= MSG_EOR; } - virtio_transport_dec_rx_pkt(vvs, pkt_len); + virtio_transport_dec_rx_pkt(vvs, pkt_len, pkt_len); kfree_skb(skb); } @@ -1735,6 +1739,7 @@ int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t recv_acto struct sock *sk = sk_vsock(vsk); struct virtio_vsock_hdr *hdr; struct sk_buff *skb; + u32 pkt_len; int off = 0; int err; @@ -1752,7 +1757,8 @@ int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t recv_acto if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOM) vvs->msg_count--; - virtio_transport_dec_rx_pkt(vvs, le32_to_cpu(hdr->len)); + pkt_len = le32_to_cpu(hdr->len); + virtio_transport_dec_rx_pkt(vvs, pkt_len, pkt_len); spin_unlock_bh(&vvs->rx_lock); virtio_transport_send_credit_update(vsk); From 57ee9584fd8606deef66d7b65fa4dcf94f6843aa Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Wed, 21 May 2025 14:41:59 +0200 Subject: [PATCH 03/21] net: lan966x: Fix 1-step timestamping over ipv4 or ipv6 When enabling 1-step timestamping for ptp frames that are over udpv4 or udpv6 then the inserted timestamp is added at the wrong offset in the frame, meaning that will modify the frame at the wrong place, so the frame will be malformed. To fix this, the HW needs to know which kind of frame it is to know where to insert the timestamp. For that there is a field in the IFH that says the PDU_TYPE, which can be NONE which is the default value, IPV4 or IPV6. Therefore make sure to set the PDU_TYPE so the HW knows where to insert the timestamp. Like I mention before the issue is not seen with L2 frames because by default the PDU_TYPE has a value of 0, which represents the L2 frames. Fixes: 77eecf25bd9d2f ("net: lan966x: Update extraction/injection for timestamping") Signed-off-by: Horatiu Vultur Link: https://patch.msgid.link/20250521124159.2713525-1-horatiu.vultur@microchip.com Signed-off-by: Paolo Abeni --- .../ethernet/microchip/lan966x/lan966x_main.c | 6 +++ .../ethernet/microchip/lan966x/lan966x_main.h | 5 ++ .../ethernet/microchip/lan966x/lan966x_ptp.c | 49 ++++++++++++++----- 3 files changed, 47 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_main.c b/drivers/net/ethernet/microchip/lan966x/lan966x_main.c index 0af143ec0f86..427bdc0e4908 100644 --- a/drivers/net/ethernet/microchip/lan966x/lan966x_main.c +++ b/drivers/net/ethernet/microchip/lan966x/lan966x_main.c @@ -353,6 +353,11 @@ static void lan966x_ifh_set_rew_op(void *ifh, u64 rew_op) lan966x_ifh_set(ifh, rew_op, IFH_POS_REW_CMD, IFH_WID_REW_CMD); } +static void lan966x_ifh_set_oam_type(void *ifh, u64 oam_type) +{ + lan966x_ifh_set(ifh, oam_type, IFH_POS_PDU_TYPE, IFH_WID_PDU_TYPE); +} + static void lan966x_ifh_set_timestamp(void *ifh, u64 timestamp) { lan966x_ifh_set(ifh, timestamp, IFH_POS_TIMESTAMP, IFH_WID_TIMESTAMP); @@ -380,6 +385,7 @@ static netdev_tx_t lan966x_port_xmit(struct sk_buff *skb, return err; lan966x_ifh_set_rew_op(ifh, LAN966X_SKB_CB(skb)->rew_op); + lan966x_ifh_set_oam_type(ifh, LAN966X_SKB_CB(skb)->pdu_type); lan966x_ifh_set_timestamp(ifh, LAN966X_SKB_CB(skb)->ts_id); } diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_main.h b/drivers/net/ethernet/microchip/lan966x/lan966x_main.h index 1efa584e7107..1f9df67f0504 100644 --- a/drivers/net/ethernet/microchip/lan966x/lan966x_main.h +++ b/drivers/net/ethernet/microchip/lan966x/lan966x_main.h @@ -75,6 +75,10 @@ #define IFH_REW_OP_ONE_STEP_PTP 0x3 #define IFH_REW_OP_TWO_STEP_PTP 0x4 +#define IFH_PDU_TYPE_NONE 0 +#define IFH_PDU_TYPE_IPV4 7 +#define IFH_PDU_TYPE_IPV6 8 + #define FDMA_RX_DCB_MAX_DBS 1 #define FDMA_TX_DCB_MAX_DBS 1 @@ -254,6 +258,7 @@ struct lan966x_phc { struct lan966x_skb_cb { u8 rew_op; + u8 pdu_type; u16 ts_id; unsigned long jiffies; }; diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_ptp.c b/drivers/net/ethernet/microchip/lan966x/lan966x_ptp.c index 63905bb5a63a..87e5e81d40dc 100644 --- a/drivers/net/ethernet/microchip/lan966x/lan966x_ptp.c +++ b/drivers/net/ethernet/microchip/lan966x/lan966x_ptp.c @@ -322,34 +322,55 @@ void lan966x_ptp_hwtstamp_get(struct lan966x_port *port, *cfg = phc->hwtstamp_config; } -static int lan966x_ptp_classify(struct lan966x_port *port, struct sk_buff *skb) +static void lan966x_ptp_classify(struct lan966x_port *port, struct sk_buff *skb, + u8 *rew_op, u8 *pdu_type) { struct ptp_header *header; u8 msgtype; int type; - if (port->ptp_tx_cmd == IFH_REW_OP_NOOP) - return IFH_REW_OP_NOOP; + if (port->ptp_tx_cmd == IFH_REW_OP_NOOP) { + *rew_op = IFH_REW_OP_NOOP; + *pdu_type = IFH_PDU_TYPE_NONE; + return; + } type = ptp_classify_raw(skb); - if (type == PTP_CLASS_NONE) - return IFH_REW_OP_NOOP; + if (type == PTP_CLASS_NONE) { + *rew_op = IFH_REW_OP_NOOP; + *pdu_type = IFH_PDU_TYPE_NONE; + return; + } header = ptp_parse_header(skb, type); - if (!header) - return IFH_REW_OP_NOOP; + if (!header) { + *rew_op = IFH_REW_OP_NOOP; + *pdu_type = IFH_PDU_TYPE_NONE; + return; + } - if (port->ptp_tx_cmd == IFH_REW_OP_TWO_STEP_PTP) - return IFH_REW_OP_TWO_STEP_PTP; + if (type & PTP_CLASS_L2) + *pdu_type = IFH_PDU_TYPE_NONE; + if (type & PTP_CLASS_IPV4) + *pdu_type = IFH_PDU_TYPE_IPV4; + if (type & PTP_CLASS_IPV6) + *pdu_type = IFH_PDU_TYPE_IPV6; + + if (port->ptp_tx_cmd == IFH_REW_OP_TWO_STEP_PTP) { + *rew_op = IFH_REW_OP_TWO_STEP_PTP; + return; + } /* If it is sync and run 1 step then set the correct operation, * otherwise run as 2 step */ msgtype = ptp_get_msgtype(header, type); - if ((msgtype & 0xf) == 0) - return IFH_REW_OP_ONE_STEP_PTP; + if ((msgtype & 0xf) == 0) { + *rew_op = IFH_REW_OP_ONE_STEP_PTP; + return; + } - return IFH_REW_OP_TWO_STEP_PTP; + *rew_op = IFH_REW_OP_TWO_STEP_PTP; } static void lan966x_ptp_txtstamp_old_release(struct lan966x_port *port) @@ -374,10 +395,12 @@ int lan966x_ptp_txtstamp_request(struct lan966x_port *port, { struct lan966x *lan966x = port->lan966x; unsigned long flags; + u8 pdu_type; u8 rew_op; - rew_op = lan966x_ptp_classify(port, skb); + lan966x_ptp_classify(port, skb, &rew_op, &pdu_type); LAN966X_SKB_CB(skb)->rew_op = rew_op; + LAN966X_SKB_CB(skb)->pdu_type = pdu_type; if (rew_op != IFH_REW_OP_TWO_STEP_PTP) return 0; From f0b50730bdd8f2734e548de541e845c0d40dceb6 Mon Sep 17 00:00:00 2001 From: Wentao Liang Date: Wed, 21 May 2025 21:36:20 +0800 Subject: [PATCH 04/21] net/mlx5_core: Add error handling inmlx5_query_nic_vport_qkey_viol_cntr() The function mlx5_query_nic_vport_qkey_viol_cntr() calls the function mlx5_query_nic_vport_context() but does not check its return value. This could lead to undefined behavior if the query fails. A proper implementation can be found in mlx5_nic_vport_query_local_lb(). Add error handling for mlx5_query_nic_vport_context(). If it fails, free the out buffer via kvfree() and return error code. Fixes: 9efa75254593 ("net/mlx5_core: Introduce access functions to query vport RoCE fields") Cc: stable@vger.kernel.org # v4.5 Signed-off-by: Wentao Liang Reviewed-by: Tariq Toukan Link: https://patch.msgid.link/20250521133620.912-1-vulab@iscas.ac.cn Signed-off-by: Paolo Abeni --- drivers/net/ethernet/mellanox/mlx5/core/vport.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c index d10d4c396040..a3c57bb8b521 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c @@ -519,19 +519,22 @@ int mlx5_query_nic_vport_qkey_viol_cntr(struct mlx5_core_dev *mdev, { u32 *out; int outlen = MLX5_ST_SZ_BYTES(query_nic_vport_context_out); + int err; out = kvzalloc(outlen, GFP_KERNEL); if (!out) return -ENOMEM; - mlx5_query_nic_vport_context(mdev, 0, out); + err = mlx5_query_nic_vport_context(mdev, 0, out); + if (err) + goto out; *qkey_viol_cntr = MLX5_GET(query_nic_vport_context_out, out, nic_vport_context.qkey_violation_counter); - +out: kvfree(out); - return 0; + return err; } EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_qkey_viol_cntr); From 32374234ab0101881e7d0c6a8ef7ebce566c46c9 Mon Sep 17 00:00:00 2001 From: Suraj Gupta Date: Wed, 21 May 2025 23:46:08 +0530 Subject: [PATCH 05/21] net: xilinx: axienet: Fix Tx skb circular buffer occupancy check in dmaengine xmit In Dmaengine flow, driver maintains struct skbuf_dma_descriptor rings each element of which corresponds to a skb. In Tx datapath, compare available space in skb ring with number of skbs instead of skb fragments. Replace x * (MAX_SKB_FRAGS) in netif_txq_completed_wake() and netif_txq_maybe_stop() with x * (1 skb) to fix the comparison. Fixes: 6a91b846af85 ("net: axienet: Introduce dmaengine support") Signed-off-by: Suraj Gupta Reviewed-by: Sean Anderson Link: https://patch.msgid.link/20250521181608.669554-1-suraj.gupta2@amd.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/xilinx/xilinx_axienet_main.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c index 054abf283ab3..5f912b27bfd7 100644 --- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c +++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c @@ -880,7 +880,7 @@ static void axienet_dma_tx_cb(void *data, const struct dmaengine_result *result) dev_consume_skb_any(skbuf_dma->skb); netif_txq_completed_wake(txq, 1, len, CIRC_SPACE(lp->tx_ring_head, lp->tx_ring_tail, TX_BD_NUM_MAX), - 2 * MAX_SKB_FRAGS); + 2); } /** @@ -914,7 +914,7 @@ axienet_start_xmit_dmaengine(struct sk_buff *skb, struct net_device *ndev) dma_dev = lp->tx_chan->device; sg_len = skb_shinfo(skb)->nr_frags + 1; - if (CIRC_SPACE(lp->tx_ring_head, lp->tx_ring_tail, TX_BD_NUM_MAX) <= sg_len) { + if (CIRC_SPACE(lp->tx_ring_head, lp->tx_ring_tail, TX_BD_NUM_MAX) <= 1) { netif_stop_queue(ndev); if (net_ratelimit()) netdev_warn(ndev, "TX ring unexpectedly full\n"); @@ -964,7 +964,7 @@ axienet_start_xmit_dmaengine(struct sk_buff *skb, struct net_device *ndev) txq = skb_get_tx_queue(lp->ndev, skb); netdev_tx_sent_queue(txq, skb->len); netif_txq_maybe_stop(txq, CIRC_SPACE(lp->tx_ring_head, lp->tx_ring_tail, TX_BD_NUM_MAX), - MAX_SKB_FRAGS + 1, 2 * MAX_SKB_FRAGS); + 1, 2); dmaengine_submit(dma_tx_desc); dma_async_issue_pending(lp->tx_chan); From d8d85ef0a631df9127f202e6371bb33a0b589952 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 21 May 2025 20:11:28 -0700 Subject: [PATCH 06/21] af_packet: move notifier's packet_dev_mc out of rcu critical section Syzkaller reports the following issue: BUG: sleeping function called from invalid context at kernel/locking/mutex.c:578 __mutex_lock+0x106/0xe80 kernel/locking/mutex.c:746 team_change_rx_flags+0x38/0x220 drivers/net/team/team_core.c:1781 dev_change_rx_flags net/core/dev.c:9145 [inline] __dev_set_promiscuity+0x3f8/0x590 net/core/dev.c:9189 netif_set_promiscuity+0x50/0xe0 net/core/dev.c:9201 dev_set_promiscuity+0x126/0x260 net/core/dev_api.c:286 packet_dev_mc net/packet/af_packet.c:3698 [inline] packet_dev_mclist_delete net/packet/af_packet.c:3722 [inline] packet_notifier+0x292/0xa60 net/packet/af_packet.c:4247 notifier_call_chain+0x1b3/0x3e0 kernel/notifier.c:85 call_netdevice_notifiers_extack net/core/dev.c:2214 [inline] call_netdevice_notifiers net/core/dev.c:2228 [inline] unregister_netdevice_many_notify+0x15d8/0x2330 net/core/dev.c:11972 rtnl_delete_link net/core/rtnetlink.c:3522 [inline] rtnl_dellink+0x488/0x710 net/core/rtnetlink.c:3564 rtnetlink_rcv_msg+0x7cf/0xb70 net/core/rtnetlink.c:6955 netlink_rcv_skb+0x219/0x490 net/netlink/af_netlink.c:2534 Calling `PACKET_ADD_MEMBERSHIP` on an ops-locked device can trigger the `NETDEV_UNREGISTER` notifier, which may require disabling promiscuous and/or allmulti mode. Both of these operations require acquiring the netdev instance lock. Move the call to `packet_dev_mc` outside of the RCU critical section. The `mclist` modifications (add, del, flush, unregister) are protected by the RTNL, not the RCU. The RCU only protects the `sklist` and its associated `sks`. The delayed operation on the `mclist` entry remains within the RTNL. Reported-by: syzbot+b191b5ccad8d7a986286@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=b191b5ccad8d7a986286 Fixes: ad7c7b2172c3 ("net: hold netdev instance lock during sysfs operations") Signed-off-by: Stanislav Fomichev Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250522031129.3247266-1-stfomichev@gmail.com Signed-off-by: Paolo Abeni --- net/packet/af_packet.c | 21 ++++++++++++++++----- net/packet/internal.h | 1 + 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index d4dba06297c3..20be2c47cf41 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -3713,15 +3713,15 @@ static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i, } static void packet_dev_mclist_delete(struct net_device *dev, - struct packet_mclist **mlp) + struct packet_mclist **mlp, + struct list_head *list) { struct packet_mclist *ml; while ((ml = *mlp) != NULL) { if (ml->ifindex == dev->ifindex) { - packet_dev_mc(dev, ml, -1); + list_add(&ml->remove_list, list); *mlp = ml->next; - kfree(ml); } else mlp = &ml->next; } @@ -3769,6 +3769,7 @@ static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq) memcpy(i->addr, mreq->mr_address, i->alen); memset(i->addr + i->alen, 0, sizeof(i->addr) - i->alen); i->count = 1; + INIT_LIST_HEAD(&i->remove_list); i->next = po->mclist; po->mclist = i; err = packet_dev_mc(dev, i, 1); @@ -4233,9 +4234,11 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, static int packet_notifier(struct notifier_block *this, unsigned long msg, void *ptr) { - struct sock *sk; struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); + struct packet_mclist *ml, *tmp; + LIST_HEAD(mclist); + struct sock *sk; rcu_read_lock(); sk_for_each_rcu(sk, &net->packet.sklist) { @@ -4244,7 +4247,8 @@ static int packet_notifier(struct notifier_block *this, switch (msg) { case NETDEV_UNREGISTER: if (po->mclist) - packet_dev_mclist_delete(dev, &po->mclist); + packet_dev_mclist_delete(dev, &po->mclist, + &mclist); fallthrough; case NETDEV_DOWN: @@ -4277,6 +4281,13 @@ static int packet_notifier(struct notifier_block *this, } } rcu_read_unlock(); + + /* packet_dev_mc might grab instance locks so can't run under rcu */ + list_for_each_entry_safe(ml, tmp, &mclist, remove_list) { + packet_dev_mc(dev, ml, -1); + kfree(ml); + } + return NOTIFY_DONE; } diff --git a/net/packet/internal.h b/net/packet/internal.h index d5d70712007a..1e743d0316fd 100644 --- a/net/packet/internal.h +++ b/net/packet/internal.h @@ -11,6 +11,7 @@ struct packet_mclist { unsigned short type; unsigned short alen; unsigned char addr[MAX_ADDR_LEN]; + struct list_head remove_list; }; /* kbdq - kernel block descriptor queue */ From 0795b05a59b1371b18ffbf09d385296b12e9f5d5 Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Fri, 23 May 2025 16:37:59 +0800 Subject: [PATCH 07/21] net: phy: clear phydev->devlink when the link is deleted There is a potential crash issue when disabling and re-enabling the network port. When disabling the network port, phy_detach() calls device_link_del() to remove the device link, but it does not clear phydev->devlink, so phydev->devlink is not a NULL pointer. Then the network port is re-enabled, but if phy_attach_direct() fails before calling device_link_add(), the code jumps to the "error" label and calls phy_detach(). Since phydev->devlink retains the old value from the previous attach/detach cycle, device_link_del() uses the old value, which accesses a NULL pointer and causes a crash. The simplified crash log is as follows. [ 24.702421] Call trace: [ 24.704856] device_link_put_kref+0x20/0x120 [ 24.709124] device_link_del+0x30/0x48 [ 24.712864] phy_detach+0x24/0x168 [ 24.716261] phy_attach_direct+0x168/0x3a4 [ 24.720352] phylink_fwnode_phy_connect+0xc8/0x14c [ 24.725140] phylink_of_phy_connect+0x1c/0x34 Therefore, phydev->devlink needs to be cleared when the device link is deleted. Fixes: bc66fa87d4fd ("net: phy: Add link between phy dev and mac dev") Signed-off-by: Wei Fang Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Link: https://patch.msgid.link/20250523083759.3741168-1-wei.fang@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/phy_device.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index cc1bfd22fb81..7d5e76a3db0e 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -1727,8 +1727,10 @@ void phy_detach(struct phy_device *phydev) struct module *ndev_owner = NULL; struct mii_bus *bus; - if (phydev->devlink) + if (phydev->devlink) { device_link_del(phydev->devlink); + phydev->devlink = NULL; + } if (phydev->sysfs_links) { if (dev) From c59783780c8ad66f6076a9a7c74df3e006e29519 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 24 May 2025 09:29:11 +0200 Subject: [PATCH 08/21] net: airoha: Fix an error handling path in airoha_alloc_gdm_port() If register_netdev() fails, the error handling path of the probe will not free the memory allocated by the previous airoha_metadata_dst_alloc() call because port->dev->reg_state will not be NETREG_REGISTERED. So, an explicit airoha_metadata_dst_free() call is needed in this case to avoid a memory leak. Fixes: af3cf757d5c9 ("net: airoha: Move DSA tag in DMA descriptor") Signed-off-by: Christophe JAILLET Acked-by: Lorenzo Bianconi Reviewed-by: Simon Horman Link: https://patch.msgid.link/1b94b91345017429ed653e2f05d25620dc2823f9.1746715755.git.christophe.jaillet@wanadoo.fr Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/airoha/airoha_eth.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/airoha/airoha_eth.c b/drivers/net/ethernet/airoha/airoha_eth.c index 1e9ab65218ff..c169b8b411b4 100644 --- a/drivers/net/ethernet/airoha/airoha_eth.c +++ b/drivers/net/ethernet/airoha/airoha_eth.c @@ -2541,7 +2541,15 @@ static int airoha_alloc_gdm_port(struct airoha_eth *eth, if (err) return err; - return register_netdev(dev); + err = register_netdev(dev); + if (err) + goto free_metadata_dst; + + return 0; + +free_metadata_dst: + airoha_metadata_dst_free(port); + return err; } static int airoha_probe(struct platform_device *pdev) From c6bb8a21cdad8c975a3a646b9e5c8df01ad29783 Mon Sep 17 00:00:00 2001 From: Wentao Liang Date: Sun, 25 May 2025 00:34:25 +0800 Subject: [PATCH 09/21] net/mlx5: Add error handling in mlx5_query_nic_vport_node_guid() The function mlx5_query_nic_vport_node_guid() calls the function mlx5_query_nic_vport_context() but does not check its return value. A proper implementation can be found in mlx5_nic_vport_query_local_lb(). Add error handling for mlx5_query_nic_vport_context(). If it fails, free the out buffer via kvfree() and return error code. Fixes: 9efa75254593 ("net/mlx5_core: Introduce access functions to query vport RoCE fields") Cc: stable@vger.kernel.org # v4.5 Signed-off-by: Wentao Liang Reviewed-by: Tariq Toukan Link: https://patch.msgid.link/20250524163425.1695-1-vulab@iscas.ac.cn Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/vport.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c index a3c57bb8b521..da5c24fc7b30 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c @@ -465,19 +465,22 @@ int mlx5_query_nic_vport_node_guid(struct mlx5_core_dev *mdev, u64 *node_guid) { u32 *out; int outlen = MLX5_ST_SZ_BYTES(query_nic_vport_context_out); + int err; out = kvzalloc(outlen, GFP_KERNEL); if (!out) return -ENOMEM; - mlx5_query_nic_vport_context(mdev, 0, out); + err = mlx5_query_nic_vport_context(mdev, 0, out); + if (err) + goto out; *node_guid = MLX5_GET64(query_nic_vport_context_out, out, nic_vport_context.node_guid); - +out: kvfree(out); - return 0; + return err; } EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_node_guid); From 126cd7852a62c6fab11a4a4cb6fa96421929ab69 Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Mon, 26 May 2025 10:44:33 +0800 Subject: [PATCH 10/21] net: mctp: start tx queue on netdev open We stop queues in ndo_stop, so they need to be restarted in ndo_open. This allows us to resume tx after a link down/up cycle. Suggested-by: Nitin Singh Fixes: 0791c0327a6e ("net: mctp: Add MCTP USB transport driver") Signed-off-by: Jeremy Kerr Link: https://patch.msgid.link/20250526-dev-mctp-usb-v1-1-c7bd6cb75aa0@codeconstruct.com.au Signed-off-by: Jakub Kicinski --- drivers/net/mctp/mctp-usb.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/mctp/mctp-usb.c b/drivers/net/mctp/mctp-usb.c index e8d4b01c3f34..775a386d0aca 100644 --- a/drivers/net/mctp/mctp-usb.c +++ b/drivers/net/mctp/mctp-usb.c @@ -257,6 +257,8 @@ static int mctp_usb_open(struct net_device *dev) WRITE_ONCE(mctp_usb->stopped, false); + netif_start_queue(dev); + return mctp_usb_rx_queue(mctp_usb, GFP_KERNEL); } From 3920a758800762917177a6b5ab39707d8e376fe6 Mon Sep 17 00:00:00 2001 From: Sergio Perez Gonzalez Date: Sun, 25 May 2025 21:20:31 -0600 Subject: [PATCH 11/21] net: macb: Check return value of dma_set_mask_and_coherent() Issue flagged by coverity. Add a safety check for the return value of dma_set_mask_and_coherent, go to a safe exit if it returns error. Link: https://scan7.scan.coverity.com/#/project-view/53936/11354?selectedIssue=1643754 Signed-off-by: Sergio Perez Gonzalez Reviewed-by: Claudiu Beznea Link: https://patch.msgid.link/20250526032034.84900-1-sperezglz@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/cadence/macb_main.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index e1e8bd2ec155..d1f1ae5ea161 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -5283,7 +5283,11 @@ static int macb_probe(struct platform_device *pdev) #ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT if (GEM_BFEXT(DAW64, gem_readl(bp, DCFG6))) { - dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(44)); + err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(44)); + if (err) { + dev_err(&pdev->dev, "failed to set DMA mask\n"); + goto err_out_free_netdev; + } bp->hw_dma_cap |= HW_DMA_CAP_64B; } #endif From 68927eb52d0af04863584930db06075d2610e194 Mon Sep 17 00:00:00 2001 From: Thangaraj Samynathan Date: Mon, 26 May 2025 11:00:47 +0530 Subject: [PATCH 12/21] net: lan743x: rename lan743x_reset_phy to lan743x_hw_reset_phy rename the function to lan743x_hw_reset_phy to better describe it operation. Fixes: 23f0703c125be ("lan743x: Add main source files for new lan743x driver") Signed-off-by: Thangaraj Samynathan Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20250526053048.287095-2-thangaraj.s@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/microchip/lan743x_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/microchip/lan743x_main.c b/drivers/net/ethernet/microchip/lan743x_main.c index a70b88037a20..da5bd54208dd 100644 --- a/drivers/net/ethernet/microchip/lan743x_main.c +++ b/drivers/net/ethernet/microchip/lan743x_main.c @@ -1330,7 +1330,7 @@ static int lan743x_mac_set_mtu(struct lan743x_adapter *adapter, int new_mtu) } /* PHY */ -static int lan743x_phy_reset(struct lan743x_adapter *adapter) +static int lan743x_hw_reset_phy(struct lan743x_adapter *adapter) { u32 data; @@ -1348,7 +1348,7 @@ static int lan743x_phy_reset(struct lan743x_adapter *adapter) static int lan743x_phy_init(struct lan743x_adapter *adapter) { - return lan743x_phy_reset(adapter); + return lan743x_hw_reset_phy(adapter); } static void lan743x_phy_interface_select(struct lan743x_adapter *adapter) From 82d1096ca8b5dbb3158d707e6fb3ad21c3403a49 Mon Sep 17 00:00:00 2001 From: Thangaraj Samynathan Date: Mon, 26 May 2025 11:00:48 +0530 Subject: [PATCH 13/21] net: lan743x: Fix PHY reset handling during initialization and WOL Remove lan743x_phy_init from lan743x_hardware_init as it resets the PHY registers, causing WOL to fail on subsequent attempts. Add a call to lan743x_hw_reset_phy in the probe function to ensure the PHY is reset during device initialization. Fixes: 23f0703c125be ("lan743x: Add main source files for new lan743x driver") Signed-off-by: Thangaraj Samynathan Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20250526053048.287095-3-thangaraj.s@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/microchip/lan743x_main.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/microchip/lan743x_main.c b/drivers/net/ethernet/microchip/lan743x_main.c index da5bd54208dd..7f36443832ad 100644 --- a/drivers/net/ethernet/microchip/lan743x_main.c +++ b/drivers/net/ethernet/microchip/lan743x_main.c @@ -1346,11 +1346,6 @@ static int lan743x_hw_reset_phy(struct lan743x_adapter *adapter) 50000, 1000000); } -static int lan743x_phy_init(struct lan743x_adapter *adapter) -{ - return lan743x_hw_reset_phy(adapter); -} - static void lan743x_phy_interface_select(struct lan743x_adapter *adapter) { u32 id_rev; @@ -3534,10 +3529,6 @@ static int lan743x_hardware_init(struct lan743x_adapter *adapter, if (ret) return ret; - ret = lan743x_phy_init(adapter); - if (ret) - return ret; - ret = lan743x_ptp_init(adapter); if (ret) return ret; @@ -3674,6 +3665,10 @@ static int lan743x_pcidev_probe(struct pci_dev *pdev, if (ret) goto cleanup_pci; + ret = lan743x_hw_reset_phy(adapter); + if (ret) + goto cleanup_pci; + ret = lan743x_hardware_init(adapter, pdev); if (ret) goto cleanup_pci; From 846992645b25ec4253167e3f931e4597eb84af56 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Thu, 22 May 2025 13:57:22 +0200 Subject: [PATCH 14/21] net: phy: mscc: Fix memory leak when using one step timestamping Fix memory leak when running one-step timestamping. When running one-step sync timestamping, the HW is configured to insert the TX time into the frame, so there is no reason to keep the skb anymore. As in this case the HW will never generate an interrupt to say that the frame was timestamped, then the frame will never released. Fix this by freeing the frame in case of one-step timestamping. Fixes: 7d272e63e0979d ("net: phy: mscc: timestamping and PHC support") Signed-off-by: Horatiu Vultur Link: https://patch.msgid.link/20250522115722.2827199-1-horatiu.vultur@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/mscc/mscc_ptp.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/net/phy/mscc/mscc_ptp.c b/drivers/net/phy/mscc/mscc_ptp.c index ed8fb14a7f21..6f96f2679f0b 100644 --- a/drivers/net/phy/mscc/mscc_ptp.c +++ b/drivers/net/phy/mscc/mscc_ptp.c @@ -1166,18 +1166,24 @@ static void vsc85xx_txtstamp(struct mii_timestamper *mii_ts, container_of(mii_ts, struct vsc8531_private, mii_ts); if (!vsc8531->ptp->configured) - return; + goto out; - if (vsc8531->ptp->tx_type == HWTSTAMP_TX_OFF) { - kfree_skb(skb); - return; - } + if (vsc8531->ptp->tx_type == HWTSTAMP_TX_OFF) + goto out; + + if (vsc8531->ptp->tx_type == HWTSTAMP_TX_ONESTEP_SYNC) + if (ptp_msg_is_sync(skb, type)) + goto out; skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS; mutex_lock(&vsc8531->ts_lock); __skb_queue_tail(&vsc8531->ptp->tx_queue, skb); mutex_unlock(&vsc8531->ts_lock); + return; + +out: + kfree_skb(skb); } static bool vsc85xx_rxtstamp(struct mii_timestamper *mii_ts, From 479c58016099d19686e36f6c50f912360839a7fa Mon Sep 17 00:00:00 2001 From: Hariprasad Kelam Date: Thu, 22 May 2025 15:17:41 +0530 Subject: [PATCH 15/21] octeontx2-pf: QOS: Perform cache sync on send queue teardown QOS is designed to create a new send queue whenever a class is created, ensuring proper shaping and scheduling. However, when multiple send queues are created and deleted in a loop, SMMU errors are observed. This patch addresses the issue by performing an data cache sync during the teardown of QOS send queues. Fixes: ab6dddd2a669 ("octeontx2-pf: qos send queues management") Signed-off-by: Hariprasad Kelam Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250522094742.1498295-1-hkelam@marvell.com Signed-off-by: Paolo Abeni --- .../ethernet/marvell/octeontx2/nic/qos_sq.c | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/qos_sq.c b/drivers/net/ethernet/marvell/octeontx2/nic/qos_sq.c index c5dbae0e513b..58d572ce08ef 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/qos_sq.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/qos_sq.c @@ -256,6 +256,26 @@ out: return err; } +static int otx2_qos_nix_npa_ndc_sync(struct otx2_nic *pfvf) +{ + struct ndc_sync_op *req; + int rc; + + mutex_lock(&pfvf->mbox.lock); + + req = otx2_mbox_alloc_msg_ndc_sync_op(&pfvf->mbox); + if (!req) { + mutex_unlock(&pfvf->mbox.lock); + return -ENOMEM; + } + + req->nix_lf_tx_sync = true; + req->npa_lf_sync = true; + rc = otx2_sync_mbox_msg(&pfvf->mbox); + mutex_unlock(&pfvf->mbox.lock); + return rc; +} + void otx2_qos_disable_sq(struct otx2_nic *pfvf, int qidx) { struct otx2_qset *qset = &pfvf->qset; @@ -285,6 +305,8 @@ void otx2_qos_disable_sq(struct otx2_nic *pfvf, int qidx) otx2_qos_sqb_flush(pfvf, sq_idx); otx2_smq_flush(pfvf, otx2_get_smq_idx(pfvf, sq_idx)); + /* NIX/NPA NDC sync */ + otx2_qos_nix_npa_ndc_sync(pfvf); otx2_cleanup_tx_cqes(pfvf, cq); mutex_lock(&pfvf->mbox.lock); From 67af4ec948e8ce3ea53a9cf614d01fddf172e56d Mon Sep 17 00:00:00 2001 From: Hariprasad Kelam Date: Thu, 22 May 2025 17:28:42 +0530 Subject: [PATCH 16/21] octeontx2-pf: QOS: Refactor TC_HTB_LEAF_DEL_LAST callback This patch addresses below issues, 1. Active traffic on the leaf node must be stopped before its send queue is reassigned to the parent. This patch resolves the issue by marking the node as 'Inner'. 2. During a system reboot, the interface receives TC_HTB_LEAF_DEL and TC_HTB_LEAF_DEL_LAST callbacks to delete its HTB queues. In the case of TC_HTB_LEAF_DEL_LAST, although the same send queue is reassigned to the parent, the current logic still attempts to update the real number of queues, leadning to below warnings New queues can't be registered after device unregistration. WARNING: CPU: 0 PID: 6475 at net/core/net-sysfs.c:1714 netdev_queue_update_kobjects+0x1e4/0x200 Fixes: 5e6808b4c68d ("octeontx2-pf: Add support for HTB offload") Signed-off-by: Hariprasad Kelam Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250522115842.1499666-1-hkelam@marvell.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/marvell/octeontx2/nic/qos.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/qos.c b/drivers/net/ethernet/marvell/octeontx2/nic/qos.c index 35acc07bd964..5765bac119f0 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/qos.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/qos.c @@ -1638,6 +1638,7 @@ static int otx2_qos_leaf_del_last(struct otx2_nic *pfvf, u16 classid, bool force if (!node->is_static) dwrr_del_node = true; + WRITE_ONCE(node->qid, OTX2_QOS_QID_INNER); /* destroy the leaf node */ otx2_qos_disable_sq(pfvf, qid); otx2_qos_destroy_node(pfvf, node); @@ -1682,9 +1683,6 @@ static int otx2_qos_leaf_del_last(struct otx2_nic *pfvf, u16 classid, bool force } kfree(new_cfg); - /* update tx_real_queues */ - otx2_qos_update_tx_netdev_queues(pfvf); - return 0; } From ac9fe7dd8e730a103ae4481147395cc73492d786 Mon Sep 17 00:00:00 2001 From: Pedro Tammela Date: Thu, 22 May 2025 15:14:47 -0300 Subject: [PATCH 17/21] net_sched: hfsc: Address reentrant enqueue adding class to eltree twice Savino says: "We are writing to report that this recent patch (141d34391abbb315d68556b7c67ad97885407547) [1] can be bypassed, and a UAF can still occur when HFSC is utilized with NETEM. The patch only checks the cl->cl_nactive field to determine whether it is the first insertion or not [2], but this field is only incremented by init_vf [3]. By using HFSC_RSC (which uses init_ed) [4], it is possible to bypass the check and insert the class twice in the eltree. Under normal conditions, this would lead to an infinite loop in hfsc_dequeue for the reasons we already explained in this report [5]. However, if TBF is added as root qdisc and it is configured with a very low rate, it can be utilized to prevent packets from being dequeued. This behavior can be exploited to perform subsequent insertions in the HFSC eltree and cause a UAF." To fix both the UAF and the infinite loop, with netem as an hfsc child, check explicitly in hfsc_enqueue whether the class is already in the eltree whenever the HFSC_RSC flag is set. [1] https://web.git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=141d34391abbb315d68556b7c67ad97885407547 [2] https://elixir.bootlin.com/linux/v6.15-rc5/source/net/sched/sch_hfsc.c#L1572 [3] https://elixir.bootlin.com/linux/v6.15-rc5/source/net/sched/sch_hfsc.c#L677 [4] https://elixir.bootlin.com/linux/v6.15-rc5/source/net/sched/sch_hfsc.c#L1574 [5] https://lore.kernel.org/netdev/8DuRWwfqjoRDLDmBMlIfbrsZg9Gx50DHJc1ilxsEBNe2D6NMoigR_eIRIG0LOjMc3r10nUUZtArXx4oZBIdUfZQrwjcQhdinnMis_0G7VEk=@willsroot.io/T/#u Fixes: 37d9cf1a3ce3 ("sched: Fix detection of empty queues in child qdiscs") Reported-by: Savino Dicanosa Reported-by: William Liu Acked-by: Jamal Hadi Salim Tested-by: Victor Nogueira Signed-off-by: Pedro Tammela Link: https://patch.msgid.link/20250522181448.1439717-2-pctammela@mojatatu.com Signed-off-by: Paolo Abeni --- net/sched/sch_hfsc.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 7986145a527c..5a7745170e84 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -175,6 +175,11 @@ struct hfsc_sched { #define HT_INFINITY 0xffffffffffffffffULL /* infinite time value */ +static bool cl_in_el_or_vttree(struct hfsc_class *cl) +{ + return ((cl->cl_flags & HFSC_FSC) && cl->cl_nactive) || + ((cl->cl_flags & HFSC_RSC) && !RB_EMPTY_NODE(&cl->el_node)); +} /* * eligible tree holds backlogged classes being sorted by their eligible times. @@ -1040,6 +1045,8 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (cl == NULL) return -ENOBUFS; + RB_CLEAR_NODE(&cl->el_node); + err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack); if (err) { kfree(cl); @@ -1572,7 +1579,7 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) sch->qstats.backlog += len; sch->q.qlen++; - if (first && !cl->cl_nactive) { + if (first && !cl_in_el_or_vttree(cl)) { if (cl->cl_flags & HFSC_RSC) init_ed(cl, len); if (cl->cl_flags & HFSC_FSC) From 2945ff733dee951ed64d0f13cba22348bfc1f438 Mon Sep 17 00:00:00 2001 From: Pedro Tammela Date: Thu, 22 May 2025 15:14:48 -0300 Subject: [PATCH 18/21] selftests/tc-testing: Add a test for HFSC eltree double add with reentrant enqueue behaviour on netem Reproduce the UAF scenario where netem is a child of HFSC and HFSC is configured to use the eltree. In such case, this TDC test would cause the HFSC class to be added to the eltree twice resulting in a UAF. Reviewed-by: Victor Nogueira Signed-off-by: Pedro Tammela Link: https://patch.msgid.link/20250522181448.1439717-3-pctammela@mojatatu.com Signed-off-by: Paolo Abeni --- .../tc-testing/tc-tests/infra/qdiscs.json | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json index ddc97ecd8b39..9aa44d8176d9 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json +++ b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json @@ -600,5 +600,40 @@ "matchPattern": "qdisc hfsc", "matchCount": "1", "teardown": ["$TC qdisc del dev $DEV1 root handle 1: drr"] + }, + { + "id": "309e", + "name": "Test HFSC eltree double add with reentrant enqueue behaviour on netem", + "category": [ + "qdisc", + "hfsc" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$IP link set dev $DUMMY up || true", + "$IP addr add 10.10.11.10/24 dev $DUMMY || true", + "$TC qdisc add dev $DUMMY root handle 1: tbf rate 8bit burst 100b latency 1s", + "$TC qdisc add dev $DUMMY parent 1:0 handle 2:0 hfsc", + "ping -I $DUMMY -f -c10 -s48 -W0.001 10.10.11.1 || true", + "$TC class add dev $DUMMY parent 2:0 classid 2:1 hfsc rt m2 20Kbit", + "$TC qdisc add dev $DUMMY parent 2:1 handle 3:0 netem duplicate 100%", + "$TC class add dev $DUMMY parent 2:0 classid 2:2 hfsc rt m2 20Kbit", + "$TC filter add dev $DUMMY parent 2:0 protocol ip prio 1 u32 match ip dst 10.10.11.2/32 flowid 2:1", + "$TC filter add dev $DUMMY parent 2:0 protocol ip prio 2 u32 match ip dst 10.10.11.3/32 flowid 2:2", + "ping -c 1 10.10.11.2 -I$DUMMY > /dev/null || true", + "$TC filter del dev $DUMMY parent 2:0 protocol ip prio 1", + "$TC class del dev $DUMMY classid 2:1", + "ping -c 1 10.10.11.3 -I$DUMMY > /dev/null || true" + ], + "cmdUnderTest": "$TC class change dev $DUMMY parent 2:0 classid 2:2 hfsc sc m2 20Kbit", + "expExitCode": "0", + "verifyCmd": "$TC -j class ls dev $DUMMY classid 2:1", + "matchJSON": [], + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1:0 root", + "$IP addr del 10.10.10.10/24 dev $DUMMY || true" + ] } ] From 6e9f2df1c550ead7cecb3e450af1105735020c92 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 22 May 2025 15:18:56 -0700 Subject: [PATCH 19/21] calipso: Don't call calipso functions for AF_INET sk. syzkaller reported a null-ptr-deref in txopt_get(). [0] The offset 0x70 was of struct ipv6_txoptions in struct ipv6_pinfo, so struct ipv6_pinfo was NULL there. However, this never happens for IPv6 sockets as inet_sk(sk)->pinet6 is always set in inet6_create(), meaning the socket was not IPv6 one. The root cause is missing validation in netlbl_conn_setattr(). netlbl_conn_setattr() switches branches based on struct sockaddr.sa_family, which is passed from userspace. However, netlbl_conn_setattr() does not check if the address family matches the socket. The syzkaller must have called connect() for an IPv6 address on an IPv4 socket. We have a proper validation in tcp_v[46]_connect(), but security_socket_connect() is called in the earlier stage. Let's copy the validation to netlbl_conn_setattr(). [0]: Oops: general protection fault, probably for non-canonical address 0xdffffc000000000e: 0000 [#1] PREEMPT SMP KASAN NOPTI KASAN: null-ptr-deref in range [0x0000000000000070-0x0000000000000077] CPU: 2 UID: 0 PID: 12928 Comm: syz.9.1677 Not tainted 6.12.0 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 RIP: 0010:txopt_get include/net/ipv6.h:390 [inline] RIP: 0010: Code: 02 00 00 49 8b ac 24 f8 02 00 00 e8 84 69 2a fd e8 ff 00 16 fd 48 8d 7d 70 48 b8 00 00 00 00 00 fc ff df 48 89 fa 48 c1 ea 03 <80> 3c 02 00 0f 85 53 02 00 00 48 8b 6d 70 48 85 ed 0f 84 ab 01 00 RSP: 0018:ffff88811b8afc48 EFLAGS: 00010212 RAX: dffffc0000000000 RBX: 1ffff11023715f8a RCX: ffffffff841ab00c RDX: 000000000000000e RSI: ffffc90007d9e000 RDI: 0000000000000070 RBP: 0000000000000000 R08: ffffed1023715f9d R09: ffffed1023715f9e R10: ffffed1023715f9d R11: 0000000000000003 R12: ffff888123075f00 R13: ffff88810245bd80 R14: ffff888113646780 R15: ffff888100578a80 FS: 00007f9019bd7640(0000) GS:ffff8882d2d00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f901b927bac CR3: 0000000104788003 CR4: 0000000000770ef0 PKRU: 80000000 Call Trace: calipso_sock_setattr+0x56/0x80 net/netlabel/netlabel_calipso.c:557 netlbl_conn_setattr+0x10c/0x280 net/netlabel/netlabel_kapi.c:1177 selinux_netlbl_socket_connect_helper+0xd3/0x1b0 security/selinux/netlabel.c:569 selinux_netlbl_socket_connect_locked security/selinux/netlabel.c:597 [inline] selinux_netlbl_socket_connect+0xb6/0x100 security/selinux/netlabel.c:615 selinux_socket_connect+0x5f/0x80 security/selinux/hooks.c:4931 security_socket_connect+0x50/0xa0 security/security.c:4598 __sys_connect_file+0xa4/0x190 net/socket.c:2067 __sys_connect+0x12c/0x170 net/socket.c:2088 __do_sys_connect net/socket.c:2098 [inline] __se_sys_connect net/socket.c:2095 [inline] __x64_sys_connect+0x73/0xb0 net/socket.c:2095 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xaa/0x1b0 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f901b61a12d Code: 02 b8 ff ff ff ff c3 66 0f 1f 44 00 00 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 a8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f9019bd6fa8 EFLAGS: 00000246 ORIG_RAX: 000000000000002a RAX: ffffffffffffffda RBX: 00007f901b925fa0 RCX: 00007f901b61a12d RDX: 000000000000001c RSI: 0000200000000140 RDI: 0000000000000003 RBP: 00007f901b701505 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 0000000000000000 R14: 00007f901b5b62a0 R15: 00007f9019bb7000 Modules linked in: Fixes: ceba1832b1b2 ("calipso: Set the calipso socket label to match the secattr.") Reported-by: syzkaller Reported-by: John Cheung Closes: https://lore.kernel.org/netdev/CAP=Rh=M1LzunrcQB1fSGauMrJrhL6GGps5cPAKzHJXj6GQV+-g@mail.gmail.com/ Signed-off-by: Kuniyuki Iwashima Acked-by: Paul Moore Link: https://patch.msgid.link/20250522221858.91240-1-kuniyu@amazon.com Signed-off-by: Paolo Abeni --- net/netlabel/netlabel_kapi.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c index cd9160bbc919..6ea16138582c 100644 --- a/net/netlabel/netlabel_kapi.c +++ b/net/netlabel/netlabel_kapi.c @@ -1165,6 +1165,9 @@ int netlbl_conn_setattr(struct sock *sk, break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: + if (sk->sk_family != AF_INET6) + return -EAFNOSUPPORT; + addr6 = (struct sockaddr_in6 *)addr; entry = netlbl_domhsh_getentry_af6(secattr->domain, &addr6->sin6_addr); From 0bdc924bfb319fb10d1113cbf091fc26fb7b1f99 Mon Sep 17 00:00:00 2001 From: Faicker Mo Date: Fri, 23 May 2025 03:41:43 +0000 Subject: [PATCH 20/21] net: openvswitch: Fix the dead loop of MPLS parse The unexpected MPLS packet may not end with the bottom label stack. When there are many stacks, The label count value has wrapped around. A dead loop occurs, soft lockup/CPU stuck finally. stack backtrace: UBSAN: array-index-out-of-bounds in /build/linux-0Pa0xK/linux-5.15.0/net/openvswitch/flow.c:662:26 index -1 is out of range for type '__be32 [3]' CPU: 34 PID: 0 Comm: swapper/34 Kdump: loaded Tainted: G OE 5.15.0-121-generic #131-Ubuntu Hardware name: Dell Inc. PowerEdge C6420/0JP9TF, BIOS 2.12.2 07/14/2021 Call Trace: show_stack+0x52/0x5c dump_stack_lvl+0x4a/0x63 dump_stack+0x10/0x16 ubsan_epilogue+0x9/0x36 __ubsan_handle_out_of_bounds.cold+0x44/0x49 key_extract_l3l4+0x82a/0x840 [openvswitch] ? kfree_skbmem+0x52/0xa0 key_extract+0x9c/0x2b0 [openvswitch] ovs_flow_key_extract+0x124/0x350 [openvswitch] ovs_vport_receive+0x61/0xd0 [openvswitch] ? kernel_init_free_pages.part.0+0x4a/0x70 ? get_page_from_freelist+0x353/0x540 netdev_port_receive+0xc4/0x180 [openvswitch] ? netdev_port_receive+0x180/0x180 [openvswitch] netdev_frame_hook+0x1f/0x40 [openvswitch] __netif_receive_skb_core.constprop.0+0x23a/0xf00 __netif_receive_skb_list_core+0xfa/0x240 netif_receive_skb_list_internal+0x18e/0x2a0 napi_complete_done+0x7a/0x1c0 bnxt_poll+0x155/0x1c0 [bnxt_en] __napi_poll+0x30/0x180 net_rx_action+0x126/0x280 ? bnxt_msix+0x67/0x80 [bnxt_en] handle_softirqs+0xda/0x2d0 irq_exit_rcu+0x96/0xc0 common_interrupt+0x8e/0xa0 Fixes: fbdcdd78da7c ("Change in Openvswitch to support MPLS label depth of 3 in ingress direction") Signed-off-by: Faicker Mo Acked-by: Ilya Maximets Reviewed-by: Aaron Conole Link: https://patch.msgid.link/259D3404-575D-4A6D-B263-1DF59A67CF89@zenlayer.com Signed-off-by: Paolo Abeni --- net/openvswitch/flow.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 8a848ce72e29..b80bd3a90773 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -788,7 +788,7 @@ static int key_extract_l3l4(struct sk_buff *skb, struct sw_flow_key *key) memset(&key->ipv4, 0, sizeof(key->ipv4)); } } else if (eth_p_mpls(key->eth.type)) { - u8 label_count = 1; + size_t label_count = 1; memset(&key->mpls, 0, sizeof(key->mpls)); skb_set_inner_network_header(skb, skb->mac_len); From 57a92d14659df3e7e7e0052358c8cc68bbbc3b5e Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Fri, 23 May 2025 10:27:16 +0200 Subject: [PATCH 21/21] net: phy: mscc: Stop clearing the the UDPv4 checksum for L2 frames We have noticed that when PHY timestamping is enabled, L2 frames seems to be modified by changing two 2 bytes with a value of 0. The place were these 2 bytes seems to be random(or I couldn't find a pattern). In most of the cases the userspace can ignore these frames but if for example those 2 bytes are in the correction field there is nothing to do. This seems to happen when configuring the HW for IPv4 even that the flow is not enabled. These 2 bytes correspond to the UDPv4 checksum and once we don't enable clearing the checksum when using L2 frames then the frame doesn't seem to be changed anymore. Fixes: 7d272e63e0979d ("net: phy: mscc: timestamping and PHC support") Signed-off-by: Horatiu Vultur Link: https://patch.msgid.link/20250523082716.2935895-1-horatiu.vultur@microchip.com Signed-off-by: Paolo Abeni --- drivers/net/phy/mscc/mscc_ptp.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/phy/mscc/mscc_ptp.c b/drivers/net/phy/mscc/mscc_ptp.c index 6f96f2679f0b..6b800081eed5 100644 --- a/drivers/net/phy/mscc/mscc_ptp.c +++ b/drivers/net/phy/mscc/mscc_ptp.c @@ -946,7 +946,9 @@ static int vsc85xx_ip1_conf(struct phy_device *phydev, enum ts_blk blk, /* UDP checksum offset in IPv4 packet * according to: https://tools.ietf.org/html/rfc768 */ - val |= IP1_NXT_PROT_UDP_CHKSUM_OFF(26) | IP1_NXT_PROT_UDP_CHKSUM_CLEAR; + val |= IP1_NXT_PROT_UDP_CHKSUM_OFF(26); + if (enable) + val |= IP1_NXT_PROT_UDP_CHKSUM_CLEAR; vsc85xx_ts_write_csr(phydev, blk, MSCC_ANA_IP1_NXT_PROT_UDP_CHKSUM, val);