From 67bcf375d3b341f5aea7bb92fb7509ff439339e2 Mon Sep 17 00:00:00 2001
From: Alexander Lobakin
Date: Wed, 8 Mar 2023 17:47:46 +0100
Subject: [PATCH 01/40] net: intel: introduce Intel Ethernet common library

It's no secret that there's a ton of code duplication between two or
more Intel Ethernet modules. Before introducing new changes, which
would need to be copied over again, start decoupling the already
existing duplicate functionality into a new module, which will be
shared between several Intel Ethernet drivers.

Add the lookup table which converts the 8/10-bit hardware packet type
into a parsed bitfield structure, so that packet format parameters such
as payload level, IP version, etc. can be checked easily. This is
currently used by i40e, ice and iavf and is identical in all three
drivers. The only difference introduced in this implementation is that
instead of defining a 256-element (or 1024-element in the case of ice)
array, an unlikely() condition limits the input to 154 (the current
maximum non-reserved packet type). There's no reason to waste 600 (or
even 3600) bytes just to avoid penalizing very unlikely exception
packets.

The hash computation function now takes the payload level directly as
a pkt_hash_type. There are a couple of cases where non-IP ptypes are
marked as L3 payload, and in the previous versions their hash level
would be 2, not 3. But skb_set_hash() only distinguishes between L4 and
non-L4, so this won't change anything at all.

The module is behind a hidden Kconfig symbol, which the drivers select
when needed. The exports are placed in the 'LIBIE' namespace to limit
the scope of the functions.

Signed-off-by: Alexander Lobakin
---
 MAINTAINERS                                   |   3 +-
 drivers/net/ethernet/intel/Kconfig            |  11 +-
 drivers/net/ethernet/intel/Makefile           |   1 +
 drivers/net/ethernet/intel/i40e/i40e_common.c | 253 --------------
 drivers/net/ethernet/intel/i40e/i40e_main.c   |   1 +
 .../net/ethernet/intel/i40e/i40e_prototype.h  |   7 -
 drivers/net/ethernet/intel/i40e/i40e_txrx.c   |  74 +---
 drivers/net/ethernet/intel/i40e/i40e_type.h   |  88 -----
 drivers/net/ethernet/intel/iavf/iavf_common.c | 253 --------------
 drivers/net/ethernet/intel/iavf/iavf_main.c   |   1 +
 .../net/ethernet/intel/iavf/iavf_prototype.h  |   7 -
 drivers/net/ethernet/intel/iavf/iavf_txrx.c   |  70 +---
 drivers/net/ethernet/intel/iavf/iavf_type.h   |  88 -----
 .../net/ethernet/intel/ice/ice_lan_tx_rx.h    | 316 ------------------
 drivers/net/ethernet/intel/ice/ice_main.c     |   1 +
 drivers/net/ethernet/intel/ice/ice_txrx_lib.c |  74 +---
 drivers/net/ethernet/intel/libie/Makefile     |   6 +
 drivers/net/ethernet/intel/libie/rx.c         | 110 ++++++
 include/linux/net/intel/libie/rx.h            | 128 +++++++
 19 files changed, 312 insertions(+), 1180 deletions(-)
 create mode 100644 drivers/net/ethernet/intel/libie/Makefile
 create mode 100644 drivers/net/ethernet/intel/libie/rx.c
 create mode 100644 include/linux/net/intel/libie/rx.h

diff --git a/MAINTAINERS b/MAINTAINERS
index fbbda4671e734d..f0bb5ee1787068 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10260,7 +10260,8 @@ F: Documentation/networking/device_drivers/ethernet/intel/
 F: drivers/net/ethernet/intel/
 F: drivers/net/ethernet/intel/*/
 F: include/linux/avf/virtchnl.h
-F: include/linux/net/intel/iidc.h
+F: include/linux/net/intel/
+F: include/linux/net/intel/*/

 INTEL ETHERNET PROTOCOL DRIVER FOR RDMA
 M: Mustafa Ismail
diff --git a/drivers/net/ethernet/intel/Kconfig b/drivers/net/ethernet/intel/Kconfig
index 9bc0a951989964..cec4a938fbd0f6 100644
--- a/drivers/net/ethernet/intel/Kconfig
+++ b/drivers/net/ethernet/intel/Kconfig
@@ -84,6 +84,12 @@ config E1000E_HWTS
	  devices.
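For reference, a sketch of the library side described in the commit message above. This is hedged: the patch adds the real implementation in drivers/net/ethernet/intel/libie/rx.c, which is not reproduced in these hunks, so the table contents, the LIBIE_RX_PTYPE_NUM and libie_rx_ptype_lut names and the out-of-line export are assumptions; only libie_parse_rx_ptype(), the 154-entry bound and the 'LIBIE' namespace come from the patch itself. The parsed structure is sketched further below.

/* Sketch of drivers/net/ethernet/intel/libie/rx.c, not its actual contents. */
#include <linux/module.h>
#include <linux/net/intel/libie/rx.h>	/* header added by this patch */

/* 154 is the current maximum non-reserved packet type mentioned in the
 * commit message; the constant name here is illustrative.
 */
#define LIBIE_RX_PTYPE_NUM	154

/* The real table would be filled with macros similar to the per-driver
 * I40E_PTT()/ICE_PTT() ones removed below; it is left zero-initialized
 * here only to keep the sketch short.
 */
static const struct libie_rx_ptype_parsed libie_rx_ptype_lut[LIBIE_RX_PTYPE_NUM];

struct libie_rx_ptype_parsed libie_parse_rx_ptype(u32 ptype)
{
	/* Reserved ptypes (>= 154) are very unlikely, so a branch is cheaper
	 * than padding the table out to 256 (i40e/iavf) or 1024 (ice) slots.
	 */
	if (unlikely(ptype >= LIBIE_RX_PTYPE_NUM))
		ptype = 0;	/* index 0 is the reserved/unknown entry */

	return libie_rx_ptype_lut[ptype];
}
EXPORT_SYMBOL_NS_GPL(libie_parse_rx_ptype, LIBIE);

MODULE_DESCRIPTION("Intel(R) Ethernet common library");
MODULE_LICENSE("GPL");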
The cross-timestamp is available through the PTP clock driver precise cross-timestamp ioctl (PTP_SYS_OFFSET_PRECISE). +config LIBIE + tristate + help + libie (Intel Ethernet library) is a common library containing + routines shared by several Intel Ethernet drivers. + config IGB tristate "Intel(R) 82575/82576 PCI-Express Gigabit Ethernet support" depends on PCI @@ -225,6 +231,7 @@ config I40E depends on PTP_1588_CLOCK_OPTIONAL depends on PCI select AUXILIARY_BUS + select LIBIE help This driver supports Intel(R) Ethernet Controller XL710 Family of devices. For more information on how to identify your adapter, go @@ -254,8 +261,9 @@ config IAVF tristate config I40EVF tristate "Intel(R) Ethernet Adaptive Virtual Function support" - select IAVF depends on PCI_MSI + select IAVF + select LIBIE help This driver supports virtual functions for Intel XL710, X710, X722, XXV710, and all devices advertising support for @@ -282,6 +290,7 @@ config ICE depends on GNSS || GNSS = n select AUXILIARY_BUS select DIMLIB + select LIBIE select NET_DEVLINK select PLDMFW help diff --git a/drivers/net/ethernet/intel/Makefile b/drivers/net/ethernet/intel/Makefile index d80d04132073ca..ce622b4d825df7 100644 --- a/drivers/net/ethernet/intel/Makefile +++ b/drivers/net/ethernet/intel/Makefile @@ -15,3 +15,4 @@ obj-$(CONFIG_I40E) += i40e/ obj-$(CONFIG_IAVF) += iavf/ obj-$(CONFIG_FM10K) += fm10k/ obj-$(CONFIG_ICE) += ice/ +obj-$(CONFIG_LIBIE) += libie/ diff --git a/drivers/net/ethernet/intel/i40e/i40e_common.c b/drivers/net/ethernet/intel/i40e/i40e_common.c index ed88e38d488b2d..25bb858268fcd6 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_common.c +++ b/drivers/net/ethernet/intel/i40e/i40e_common.c @@ -383,259 +383,6 @@ int i40e_aq_set_rss_key(struct i40e_hw *hw, return i40e_aq_get_set_rss_key(hw, vsi_id, key, true); } -/* The i40e_ptype_lookup table is used to convert from the 8-bit ptype in the - * hardware to a bit-field that can be used by SW to more easily determine the - * packet type. - * - * Macros are used to shorten the table lines and make this table human - * readable. - * - * We store the PTYPE in the top byte of the bit field - this is just so that - * we can check that the table doesn't have a row missing, as the index into - * the table should be the PTYPE. 
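The comment block being removed above describes how the per-driver ptype lookup table is laid out; libie keeps the same idea in one shared place. A hedged sketch of the parsed-bitfield type from include/linux/net/intel/libie/rx.h follows: only LIBIE_RX_PTYPE_OUTER_IPV4/IPV6 and LIBIE_RX_PTYPE_TUNNEL_IP_GRENAT are confirmed by the converted code below, while the remaining enum values, the field widths and the inner-protocol names are assumptions modeled on the i40e/iavf/ice definitions this patch removes.

/* Sketch of include/linux/net/intel/libie/rx.h, not the full header. */
#include <linux/types.h>

enum libie_rx_ptype_outer_ip {
	LIBIE_RX_PTYPE_OUTER_L2	= 0,
	LIBIE_RX_PTYPE_OUTER_IPV4,
	LIBIE_RX_PTYPE_OUTER_IPV6,
};

enum libie_rx_ptype_tunnel_type {
	LIBIE_RX_PTYPE_TUNNEL_IP_NONE = 0,
	LIBIE_RX_PTYPE_TUNNEL_IP_IP,
	LIBIE_RX_PTYPE_TUNNEL_IP_GRENAT,
	LIBIE_RX_PTYPE_TUNNEL_IP_GRENAT_MAC,
	LIBIE_RX_PTYPE_TUNNEL_IP_GRENAT_MAC_VLAN,
};

enum libie_rx_ptype_inner_prot {
	LIBIE_RX_PTYPE_INNER_PROT_NONE = 0,
	LIBIE_RX_PTYPE_INNER_PROT_UDP,
	LIBIE_RX_PTYPE_INNER_PROT_TCP,
	LIBIE_RX_PTYPE_INNER_PROT_SCTP,
	LIBIE_RX_PTYPE_INNER_PROT_ICMP,
	LIBIE_RX_PTYPE_INNER_PROT_TIMESYNC,
};

/* The parsed bitfield the commit message refers to; the payload layer is
 * stored as a value which can be passed to skb_set_hash() directly.
 */
struct libie_rx_ptype_parsed {
	u16	outer_ip:2;
	u16	outer_frag:1;
	u16	tunnel_type:3;
	u16	tunnel_end_prot:2;
	u16	tunnel_end_frag:1;
	u16	inner_prot:3;
	u16	payload_layer:2;
};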
- * - * Typical work flow: - * - * IF NOT i40e_ptype_lookup[ptype].known - * THEN - * Packet is unknown - * ELSE IF i40e_ptype_lookup[ptype].outer_ip == I40E_RX_PTYPE_OUTER_IP - * Use the rest of the fields to look at the tunnels, inner protocols, etc - * ELSE - * Use the enum i40e_rx_l2_ptype to decode the packet type - * ENDIF - */ - -/* macro to make the table lines short, use explicit indexing with [PTYPE] */ -#define I40E_PTT(PTYPE, OUTER_IP, OUTER_IP_VER, OUTER_FRAG, T, TE, TEF, I, PL)\ - [PTYPE] = { \ - 1, \ - I40E_RX_PTYPE_OUTER_##OUTER_IP, \ - I40E_RX_PTYPE_OUTER_##OUTER_IP_VER, \ - I40E_RX_PTYPE_##OUTER_FRAG, \ - I40E_RX_PTYPE_TUNNEL_##T, \ - I40E_RX_PTYPE_TUNNEL_END_##TE, \ - I40E_RX_PTYPE_##TEF, \ - I40E_RX_PTYPE_INNER_PROT_##I, \ - I40E_RX_PTYPE_PAYLOAD_LAYER_##PL } - -#define I40E_PTT_UNUSED_ENTRY(PTYPE) [PTYPE] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 } - -/* shorter macros makes the table fit but are terse */ -#define I40E_RX_PTYPE_NOF I40E_RX_PTYPE_NOT_FRAG -#define I40E_RX_PTYPE_FRG I40E_RX_PTYPE_FRAG -#define I40E_RX_PTYPE_INNER_PROT_TS I40E_RX_PTYPE_INNER_PROT_TIMESYNC - -/* Lookup table mapping in the 8-bit HW PTYPE to the bit field for decoding */ -struct i40e_rx_ptype_decoded i40e_ptype_lookup[BIT(8)] = { - /* L2 Packet types */ - I40E_PTT_UNUSED_ENTRY(0), - I40E_PTT(1, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY2), - I40E_PTT(2, L2, NONE, NOF, NONE, NONE, NOF, TS, PAY2), - I40E_PTT(3, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY2), - I40E_PTT_UNUSED_ENTRY(4), - I40E_PTT_UNUSED_ENTRY(5), - I40E_PTT(6, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY2), - I40E_PTT(7, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY2), - I40E_PTT_UNUSED_ENTRY(8), - I40E_PTT_UNUSED_ENTRY(9), - I40E_PTT(10, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY2), - I40E_PTT(11, L2, NONE, NOF, NONE, NONE, NOF, NONE, NONE), - I40E_PTT(12, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), - I40E_PTT(13, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), - I40E_PTT(14, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), - I40E_PTT(15, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), - I40E_PTT(16, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), - I40E_PTT(17, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), - I40E_PTT(18, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), - I40E_PTT(19, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), - I40E_PTT(20, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), - I40E_PTT(21, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), - - /* Non Tunneled IPv4 */ - I40E_PTT(22, IP, IPV4, FRG, NONE, NONE, NOF, NONE, PAY3), - I40E_PTT(23, IP, IPV4, NOF, NONE, NONE, NOF, NONE, PAY3), - I40E_PTT(24, IP, IPV4, NOF, NONE, NONE, NOF, UDP, PAY4), - I40E_PTT_UNUSED_ENTRY(25), - I40E_PTT(26, IP, IPV4, NOF, NONE, NONE, NOF, TCP, PAY4), - I40E_PTT(27, IP, IPV4, NOF, NONE, NONE, NOF, SCTP, PAY4), - I40E_PTT(28, IP, IPV4, NOF, NONE, NONE, NOF, ICMP, PAY4), - - /* IPv4 --> IPv4 */ - I40E_PTT(29, IP, IPV4, NOF, IP_IP, IPV4, FRG, NONE, PAY3), - I40E_PTT(30, IP, IPV4, NOF, IP_IP, IPV4, NOF, NONE, PAY3), - I40E_PTT(31, IP, IPV4, NOF, IP_IP, IPV4, NOF, UDP, PAY4), - I40E_PTT_UNUSED_ENTRY(32), - I40E_PTT(33, IP, IPV4, NOF, IP_IP, IPV4, NOF, TCP, PAY4), - I40E_PTT(34, IP, IPV4, NOF, IP_IP, IPV4, NOF, SCTP, PAY4), - I40E_PTT(35, IP, IPV4, NOF, IP_IP, IPV4, NOF, ICMP, PAY4), - - /* IPv4 --> IPv6 */ - I40E_PTT(36, IP, IPV4, NOF, IP_IP, IPV6, FRG, NONE, PAY3), - I40E_PTT(37, IP, IPV4, NOF, IP_IP, IPV6, NOF, NONE, PAY3), - I40E_PTT(38, IP, IPV4, NOF, IP_IP, IPV6, NOF, UDP, PAY4), - I40E_PTT_UNUSED_ENTRY(39), - I40E_PTT(40, IP, IPV4, NOF, IP_IP, IPV6, NOF, TCP, PAY4), - 
I40E_PTT(41, IP, IPV4, NOF, IP_IP, IPV6, NOF, SCTP, PAY4), - I40E_PTT(42, IP, IPV4, NOF, IP_IP, IPV6, NOF, ICMP, PAY4), - - /* IPv4 --> GRE/NAT */ - I40E_PTT(43, IP, IPV4, NOF, IP_GRENAT, NONE, NOF, NONE, PAY3), - - /* IPv4 --> GRE/NAT --> IPv4 */ - I40E_PTT(44, IP, IPV4, NOF, IP_GRENAT, IPV4, FRG, NONE, PAY3), - I40E_PTT(45, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, NONE, PAY3), - I40E_PTT(46, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, UDP, PAY4), - I40E_PTT_UNUSED_ENTRY(47), - I40E_PTT(48, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, TCP, PAY4), - I40E_PTT(49, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, SCTP, PAY4), - I40E_PTT(50, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, ICMP, PAY4), - - /* IPv4 --> GRE/NAT --> IPv6 */ - I40E_PTT(51, IP, IPV4, NOF, IP_GRENAT, IPV6, FRG, NONE, PAY3), - I40E_PTT(52, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, NONE, PAY3), - I40E_PTT(53, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, UDP, PAY4), - I40E_PTT_UNUSED_ENTRY(54), - I40E_PTT(55, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, TCP, PAY4), - I40E_PTT(56, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, SCTP, PAY4), - I40E_PTT(57, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, ICMP, PAY4), - - /* IPv4 --> GRE/NAT --> MAC */ - I40E_PTT(58, IP, IPV4, NOF, IP_GRENAT_MAC, NONE, NOF, NONE, PAY3), - - /* IPv4 --> GRE/NAT --> MAC --> IPv4 */ - I40E_PTT(59, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, FRG, NONE, PAY3), - I40E_PTT(60, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, NONE, PAY3), - I40E_PTT(61, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, UDP, PAY4), - I40E_PTT_UNUSED_ENTRY(62), - I40E_PTT(63, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, TCP, PAY4), - I40E_PTT(64, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, SCTP, PAY4), - I40E_PTT(65, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, ICMP, PAY4), - - /* IPv4 --> GRE/NAT -> MAC --> IPv6 */ - I40E_PTT(66, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, FRG, NONE, PAY3), - I40E_PTT(67, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, NONE, PAY3), - I40E_PTT(68, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, UDP, PAY4), - I40E_PTT_UNUSED_ENTRY(69), - I40E_PTT(70, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, TCP, PAY4), - I40E_PTT(71, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, SCTP, PAY4), - I40E_PTT(72, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, ICMP, PAY4), - - /* IPv4 --> GRE/NAT --> MAC/VLAN */ - I40E_PTT(73, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, NONE, NOF, NONE, PAY3), - - /* IPv4 ---> GRE/NAT -> MAC/VLAN --> IPv4 */ - I40E_PTT(74, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, FRG, NONE, PAY3), - I40E_PTT(75, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, NONE, PAY3), - I40E_PTT(76, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, UDP, PAY4), - I40E_PTT_UNUSED_ENTRY(77), - I40E_PTT(78, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, TCP, PAY4), - I40E_PTT(79, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, SCTP, PAY4), - I40E_PTT(80, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, ICMP, PAY4), - - /* IPv4 -> GRE/NAT -> MAC/VLAN --> IPv6 */ - I40E_PTT(81, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, FRG, NONE, PAY3), - I40E_PTT(82, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, NONE, PAY3), - I40E_PTT(83, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, UDP, PAY4), - I40E_PTT_UNUSED_ENTRY(84), - I40E_PTT(85, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, TCP, PAY4), - I40E_PTT(86, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, SCTP, PAY4), - I40E_PTT(87, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, ICMP, PAY4), - - /* Non Tunneled IPv6 */ - I40E_PTT(88, IP, IPV6, FRG, NONE, NONE, NOF, NONE, PAY3), - I40E_PTT(89, IP, IPV6, NOF, NONE, NONE, NOF, NONE, PAY3), - I40E_PTT(90, IP, IPV6, NOF, NONE, NONE, NOF, UDP, PAY4), - 
I40E_PTT_UNUSED_ENTRY(91), - I40E_PTT(92, IP, IPV6, NOF, NONE, NONE, NOF, TCP, PAY4), - I40E_PTT(93, IP, IPV6, NOF, NONE, NONE, NOF, SCTP, PAY4), - I40E_PTT(94, IP, IPV6, NOF, NONE, NONE, NOF, ICMP, PAY4), - - /* IPv6 --> IPv4 */ - I40E_PTT(95, IP, IPV6, NOF, IP_IP, IPV4, FRG, NONE, PAY3), - I40E_PTT(96, IP, IPV6, NOF, IP_IP, IPV4, NOF, NONE, PAY3), - I40E_PTT(97, IP, IPV6, NOF, IP_IP, IPV4, NOF, UDP, PAY4), - I40E_PTT_UNUSED_ENTRY(98), - I40E_PTT(99, IP, IPV6, NOF, IP_IP, IPV4, NOF, TCP, PAY4), - I40E_PTT(100, IP, IPV6, NOF, IP_IP, IPV4, NOF, SCTP, PAY4), - I40E_PTT(101, IP, IPV6, NOF, IP_IP, IPV4, NOF, ICMP, PAY4), - - /* IPv6 --> IPv6 */ - I40E_PTT(102, IP, IPV6, NOF, IP_IP, IPV6, FRG, NONE, PAY3), - I40E_PTT(103, IP, IPV6, NOF, IP_IP, IPV6, NOF, NONE, PAY3), - I40E_PTT(104, IP, IPV6, NOF, IP_IP, IPV6, NOF, UDP, PAY4), - I40E_PTT_UNUSED_ENTRY(105), - I40E_PTT(106, IP, IPV6, NOF, IP_IP, IPV6, NOF, TCP, PAY4), - I40E_PTT(107, IP, IPV6, NOF, IP_IP, IPV6, NOF, SCTP, PAY4), - I40E_PTT(108, IP, IPV6, NOF, IP_IP, IPV6, NOF, ICMP, PAY4), - - /* IPv6 --> GRE/NAT */ - I40E_PTT(109, IP, IPV6, NOF, IP_GRENAT, NONE, NOF, NONE, PAY3), - - /* IPv6 --> GRE/NAT -> IPv4 */ - I40E_PTT(110, IP, IPV6, NOF, IP_GRENAT, IPV4, FRG, NONE, PAY3), - I40E_PTT(111, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, NONE, PAY3), - I40E_PTT(112, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, UDP, PAY4), - I40E_PTT_UNUSED_ENTRY(113), - I40E_PTT(114, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, TCP, PAY4), - I40E_PTT(115, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, SCTP, PAY4), - I40E_PTT(116, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, ICMP, PAY4), - - /* IPv6 --> GRE/NAT -> IPv6 */ - I40E_PTT(117, IP, IPV6, NOF, IP_GRENAT, IPV6, FRG, NONE, PAY3), - I40E_PTT(118, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, NONE, PAY3), - I40E_PTT(119, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, UDP, PAY4), - I40E_PTT_UNUSED_ENTRY(120), - I40E_PTT(121, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, TCP, PAY4), - I40E_PTT(122, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, SCTP, PAY4), - I40E_PTT(123, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, ICMP, PAY4), - - /* IPv6 --> GRE/NAT -> MAC */ - I40E_PTT(124, IP, IPV6, NOF, IP_GRENAT_MAC, NONE, NOF, NONE, PAY3), - - /* IPv6 --> GRE/NAT -> MAC -> IPv4 */ - I40E_PTT(125, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, FRG, NONE, PAY3), - I40E_PTT(126, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, NONE, PAY3), - I40E_PTT(127, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, UDP, PAY4), - I40E_PTT_UNUSED_ENTRY(128), - I40E_PTT(129, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, TCP, PAY4), - I40E_PTT(130, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, SCTP, PAY4), - I40E_PTT(131, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, ICMP, PAY4), - - /* IPv6 --> GRE/NAT -> MAC -> IPv6 */ - I40E_PTT(132, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, FRG, NONE, PAY3), - I40E_PTT(133, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, NONE, PAY3), - I40E_PTT(134, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, UDP, PAY4), - I40E_PTT_UNUSED_ENTRY(135), - I40E_PTT(136, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, TCP, PAY4), - I40E_PTT(137, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, SCTP, PAY4), - I40E_PTT(138, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, ICMP, PAY4), - - /* IPv6 --> GRE/NAT -> MAC/VLAN */ - I40E_PTT(139, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, NONE, NOF, NONE, PAY3), - - /* IPv6 --> GRE/NAT -> MAC/VLAN --> IPv4 */ - I40E_PTT(140, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, FRG, NONE, PAY3), - I40E_PTT(141, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, NONE, PAY3), - I40E_PTT(142, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, UDP, PAY4), - 
I40E_PTT_UNUSED_ENTRY(143), - I40E_PTT(144, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, TCP, PAY4), - I40E_PTT(145, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, SCTP, PAY4), - I40E_PTT(146, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, ICMP, PAY4), - - /* IPv6 --> GRE/NAT -> MAC/VLAN --> IPv6 */ - I40E_PTT(147, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, FRG, NONE, PAY3), - I40E_PTT(148, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, NONE, PAY3), - I40E_PTT(149, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, UDP, PAY4), - I40E_PTT_UNUSED_ENTRY(150), - I40E_PTT(151, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, TCP, PAY4), - I40E_PTT(152, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, SCTP, PAY4), - I40E_PTT(153, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, ICMP, PAY4), - - /* unused entries */ - [154 ... 255] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 } -}; - /** * i40e_init_shared_code - Initialize the shared code * @hw: pointer to hardware structure diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index c8ff5675b29d8b..d89a5fff15983d 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -97,6 +97,7 @@ MODULE_PARM_DESC(debug, "Debug level (0=none,...,16=all), Debug mask (0x8XXXXXXX MODULE_AUTHOR("Intel Corporation, "); MODULE_DESCRIPTION("Intel(R) Ethernet Connection XL710 Network Driver"); +MODULE_IMPORT_NS(LIBIE); MODULE_LICENSE("GPL v2"); static struct workqueue_struct *i40e_wq; diff --git a/drivers/net/ethernet/intel/i40e/i40e_prototype.h b/drivers/net/ethernet/intel/i40e/i40e_prototype.h index fe845987d99a55..5287d0ef32d5c7 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_prototype.h +++ b/drivers/net/ethernet/intel/i40e/i40e_prototype.h @@ -380,13 +380,6 @@ void i40e_set_pci_config_data(struct i40e_hw *hw, u16 link_status); int i40e_set_mac_type(struct i40e_hw *hw); -extern struct i40e_rx_ptype_decoded i40e_ptype_lookup[]; - -static inline struct i40e_rx_ptype_decoded decode_rx_desc_ptype(u8 ptype) -{ - return i40e_ptype_lookup[ptype]; -} - /** * i40e_virtchnl_link_speed - Convert AdminQ link_speed to virtchnl definition * @link_speed: the speed to convert diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index c8c2cbaa0ede6c..e4bfc7e3c076e1 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -1,8 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright(c) 2013 - 2018 Intel Corporation. */ -#include #include +#include +#include #include #include #include "i40e.h" @@ -1758,40 +1759,32 @@ static inline void i40e_rx_checksum(struct i40e_vsi *vsi, struct sk_buff *skb, union i40e_rx_desc *rx_desc) { - struct i40e_rx_ptype_decoded decoded; + struct libie_rx_ptype_parsed parsed; u32 rx_error, rx_status; bool ipv4, ipv6; u8 ptype; u64 qword; + skb->ip_summed = CHECKSUM_NONE; + qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len); ptype = (qword & I40E_RXD_QW1_PTYPE_MASK) >> I40E_RXD_QW1_PTYPE_SHIFT; + + parsed = libie_parse_rx_ptype(ptype); + if (!libie_has_rx_checksum(vsi->netdev, parsed)) + return; + rx_error = (qword & I40E_RXD_QW1_ERROR_MASK) >> I40E_RXD_QW1_ERROR_SHIFT; rx_status = (qword & I40E_RXD_QW1_STATUS_MASK) >> I40E_RXD_QW1_STATUS_SHIFT; - decoded = decode_rx_desc_ptype(ptype); - - skb->ip_summed = CHECKSUM_NONE; - - skb_checksum_none_assert(skb); - - /* Rx csum enabled and ip headers found? 
*/ - if (!(vsi->netdev->features & NETIF_F_RXCSUM)) - return; /* did the hardware decode the packet and checksum? */ if (!(rx_status & BIT(I40E_RX_DESC_STATUS_L3L4P_SHIFT))) return; - /* both known and outer_ip must be set for the below code to work */ - if (!(decoded.known && decoded.outer_ip)) - return; - - ipv4 = (decoded.outer_ip == I40E_RX_PTYPE_OUTER_IP) && - (decoded.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV4); - ipv6 = (decoded.outer_ip == I40E_RX_PTYPE_OUTER_IP) && - (decoded.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV6); + ipv4 = parsed.outer_ip == LIBIE_RX_PTYPE_OUTER_IPV4; + ipv6 = parsed.outer_ip == LIBIE_RX_PTYPE_OUTER_IPV6; if (ipv4 && (rx_error & (BIT(I40E_RX_DESC_ERROR_IPE_SHIFT) | @@ -1819,49 +1812,16 @@ static inline void i40e_rx_checksum(struct i40e_vsi *vsi, * we need to bump the checksum level by 1 to reflect the fact that * we are indicating we validated the inner checksum. */ - if (decoded.tunnel_type >= I40E_RX_PTYPE_TUNNEL_IP_GRENAT) + if (parsed.tunnel_type >= LIBIE_RX_PTYPE_TUNNEL_IP_GRENAT) skb->csum_level = 1; - /* Only report checksum unnecessary for TCP, UDP, or SCTP */ - switch (decoded.inner_prot) { - case I40E_RX_PTYPE_INNER_PROT_TCP: - case I40E_RX_PTYPE_INNER_PROT_UDP: - case I40E_RX_PTYPE_INNER_PROT_SCTP: - skb->ip_summed = CHECKSUM_UNNECESSARY; - fallthrough; - default: - break; - } - + skb->ip_summed = CHECKSUM_UNNECESSARY; return; checksum_fail: vsi->back->hw_csum_rx_error++; } -/** - * i40e_ptype_to_htype - get a hash type - * @ptype: the ptype value from the descriptor - * - * Returns a hash type to be used by skb_set_hash - **/ -static inline int i40e_ptype_to_htype(u8 ptype) -{ - struct i40e_rx_ptype_decoded decoded = decode_rx_desc_ptype(ptype); - - if (!decoded.known) - return PKT_HASH_TYPE_NONE; - - if (decoded.outer_ip == I40E_RX_PTYPE_OUTER_IP && - decoded.payload_layer == I40E_RX_PTYPE_PAYLOAD_LAYER_PAY4) - return PKT_HASH_TYPE_L4; - else if (decoded.outer_ip == I40E_RX_PTYPE_OUTER_IP && - decoded.payload_layer == I40E_RX_PTYPE_PAYLOAD_LAYER_PAY3) - return PKT_HASH_TYPE_L3; - else - return PKT_HASH_TYPE_L2; -} - /** * i40e_rx_hash - set the hash value in the skb * @ring: descriptor ring @@ -1874,17 +1834,19 @@ static inline void i40e_rx_hash(struct i40e_ring *ring, struct sk_buff *skb, u8 rx_ptype) { + struct libie_rx_ptype_parsed parsed; u32 hash; const __le64 rss_mask = cpu_to_le64((u64)I40E_RX_DESC_FLTSTAT_RSS_HASH << I40E_RX_DESC_STATUS_FLTSTAT_SHIFT); - if (!(ring->netdev->features & NETIF_F_RXHASH)) + parsed = libie_parse_rx_ptype(rx_ptype); + if (!libie_has_rx_hash(ring->netdev, parsed)) return; if ((rx_desc->wb.qword1.status_error_len & rss_mask) == rss_mask) { hash = le32_to_cpu(rx_desc->wb.qword0.hi_dword.rss); - skb_set_hash(skb, hash, i40e_ptype_to_htype(rx_ptype)); + libie_skb_set_hash(skb, hash, parsed); } } diff --git a/drivers/net/ethernet/intel/i40e/i40e_type.h b/drivers/net/ethernet/intel/i40e/i40e_type.h index 388c3d36d96a55..05b8510f99a930 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_type.h +++ b/drivers/net/ethernet/intel/i40e/i40e_type.h @@ -773,94 +773,6 @@ enum i40e_rx_desc_error_l3l4e_fcoe_masks { #define I40E_RXD_QW1_PTYPE_SHIFT 30 #define I40E_RXD_QW1_PTYPE_MASK (0xFFULL << I40E_RXD_QW1_PTYPE_SHIFT) -/* Packet type non-ip values */ -enum i40e_rx_l2_ptype { - I40E_RX_PTYPE_L2_RESERVED = 0, - I40E_RX_PTYPE_L2_MAC_PAY2 = 1, - I40E_RX_PTYPE_L2_TIMESYNC_PAY2 = 2, - I40E_RX_PTYPE_L2_FIP_PAY2 = 3, - I40E_RX_PTYPE_L2_OUI_PAY2 = 4, - I40E_RX_PTYPE_L2_MACCNTRL_PAY2 = 5, - I40E_RX_PTYPE_L2_LLDP_PAY2 = 6, - 
I40E_RX_PTYPE_L2_ECP_PAY2 = 7, - I40E_RX_PTYPE_L2_EVB_PAY2 = 8, - I40E_RX_PTYPE_L2_QCN_PAY2 = 9, - I40E_RX_PTYPE_L2_EAPOL_PAY2 = 10, - I40E_RX_PTYPE_L2_ARP = 11, - I40E_RX_PTYPE_L2_FCOE_PAY3 = 12, - I40E_RX_PTYPE_L2_FCOE_FCDATA_PAY3 = 13, - I40E_RX_PTYPE_L2_FCOE_FCRDY_PAY3 = 14, - I40E_RX_PTYPE_L2_FCOE_FCRSP_PAY3 = 15, - I40E_RX_PTYPE_L2_FCOE_FCOTHER_PA = 16, - I40E_RX_PTYPE_L2_FCOE_VFT_PAY3 = 17, - I40E_RX_PTYPE_L2_FCOE_VFT_FCDATA = 18, - I40E_RX_PTYPE_L2_FCOE_VFT_FCRDY = 19, - I40E_RX_PTYPE_L2_FCOE_VFT_FCRSP = 20, - I40E_RX_PTYPE_L2_FCOE_VFT_FCOTHER = 21, - I40E_RX_PTYPE_GRENAT4_MAC_PAY3 = 58, - I40E_RX_PTYPE_GRENAT4_MACVLAN_IPV6_ICMP_PAY4 = 87, - I40E_RX_PTYPE_GRENAT6_MAC_PAY3 = 124, - I40E_RX_PTYPE_GRENAT6_MACVLAN_IPV6_ICMP_PAY4 = 153 -}; - -struct i40e_rx_ptype_decoded { - u32 known:1; - u32 outer_ip:1; - u32 outer_ip_ver:1; - u32 outer_frag:1; - u32 tunnel_type:3; - u32 tunnel_end_prot:2; - u32 tunnel_end_frag:1; - u32 inner_prot:4; - u32 payload_layer:3; -}; - -enum i40e_rx_ptype_outer_ip { - I40E_RX_PTYPE_OUTER_L2 = 0, - I40E_RX_PTYPE_OUTER_IP = 1 -}; - -enum i40e_rx_ptype_outer_ip_ver { - I40E_RX_PTYPE_OUTER_NONE = 0, - I40E_RX_PTYPE_OUTER_IPV4 = 0, - I40E_RX_PTYPE_OUTER_IPV6 = 1 -}; - -enum i40e_rx_ptype_outer_fragmented { - I40E_RX_PTYPE_NOT_FRAG = 0, - I40E_RX_PTYPE_FRAG = 1 -}; - -enum i40e_rx_ptype_tunnel_type { - I40E_RX_PTYPE_TUNNEL_NONE = 0, - I40E_RX_PTYPE_TUNNEL_IP_IP = 1, - I40E_RX_PTYPE_TUNNEL_IP_GRENAT = 2, - I40E_RX_PTYPE_TUNNEL_IP_GRENAT_MAC = 3, - I40E_RX_PTYPE_TUNNEL_IP_GRENAT_MAC_VLAN = 4, -}; - -enum i40e_rx_ptype_tunnel_end_prot { - I40E_RX_PTYPE_TUNNEL_END_NONE = 0, - I40E_RX_PTYPE_TUNNEL_END_IPV4 = 1, - I40E_RX_PTYPE_TUNNEL_END_IPV6 = 2, -}; - -enum i40e_rx_ptype_inner_prot { - I40E_RX_PTYPE_INNER_PROT_NONE = 0, - I40E_RX_PTYPE_INNER_PROT_UDP = 1, - I40E_RX_PTYPE_INNER_PROT_TCP = 2, - I40E_RX_PTYPE_INNER_PROT_SCTP = 3, - I40E_RX_PTYPE_INNER_PROT_ICMP = 4, - I40E_RX_PTYPE_INNER_PROT_TIMESYNC = 5 -}; - -enum i40e_rx_ptype_payload_layer { - I40E_RX_PTYPE_PAYLOAD_LAYER_NONE = 0, - I40E_RX_PTYPE_PAYLOAD_LAYER_PAY2 = 1, - I40E_RX_PTYPE_PAYLOAD_LAYER_PAY3 = 2, - I40E_RX_PTYPE_PAYLOAD_LAYER_PAY4 = 3, -}; - #define I40E_RXD_QW1_LENGTH_PBUF_SHIFT 38 #define I40E_RXD_QW1_LENGTH_PBUF_MASK (0x3FFFULL << \ I40E_RXD_QW1_LENGTH_PBUF_SHIFT) diff --git a/drivers/net/ethernet/intel/iavf/iavf_common.c b/drivers/net/ethernet/intel/iavf/iavf_common.c index dd11dbbd5551a2..ba6c9f154d189a 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_common.c +++ b/drivers/net/ethernet/intel/iavf/iavf_common.c @@ -499,259 +499,6 @@ enum iavf_status iavf_aq_set_rss_key(struct iavf_hw *hw, u16 vsi_id, return iavf_aq_get_set_rss_key(hw, vsi_id, key, true); } -/* The iavf_ptype_lookup table is used to convert from the 8-bit ptype in the - * hardware to a bit-field that can be used by SW to more easily determine the - * packet type. - * - * Macros are used to shorten the table lines and make this table human - * readable. - * - * We store the PTYPE in the top byte of the bit field - this is just so that - * we can check that the table doesn't have a row missing, as the index into - * the table should be the PTYPE. 
- * - * Typical work flow: - * - * IF NOT iavf_ptype_lookup[ptype].known - * THEN - * Packet is unknown - * ELSE IF iavf_ptype_lookup[ptype].outer_ip == IAVF_RX_PTYPE_OUTER_IP - * Use the rest of the fields to look at the tunnels, inner protocols, etc - * ELSE - * Use the enum iavf_rx_l2_ptype to decode the packet type - * ENDIF - */ - -/* macro to make the table lines short, use explicit indexing with [PTYPE] */ -#define IAVF_PTT(PTYPE, OUTER_IP, OUTER_IP_VER, OUTER_FRAG, T, TE, TEF, I, PL)\ - [PTYPE] = { \ - 1, \ - IAVF_RX_PTYPE_OUTER_##OUTER_IP, \ - IAVF_RX_PTYPE_OUTER_##OUTER_IP_VER, \ - IAVF_RX_PTYPE_##OUTER_FRAG, \ - IAVF_RX_PTYPE_TUNNEL_##T, \ - IAVF_RX_PTYPE_TUNNEL_END_##TE, \ - IAVF_RX_PTYPE_##TEF, \ - IAVF_RX_PTYPE_INNER_PROT_##I, \ - IAVF_RX_PTYPE_PAYLOAD_LAYER_##PL } - -#define IAVF_PTT_UNUSED_ENTRY(PTYPE) [PTYPE] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 } - -/* shorter macros makes the table fit but are terse */ -#define IAVF_RX_PTYPE_NOF IAVF_RX_PTYPE_NOT_FRAG -#define IAVF_RX_PTYPE_FRG IAVF_RX_PTYPE_FRAG -#define IAVF_RX_PTYPE_INNER_PROT_TS IAVF_RX_PTYPE_INNER_PROT_TIMESYNC - -/* Lookup table mapping the 8-bit HW PTYPE to the bit field for decoding */ -struct iavf_rx_ptype_decoded iavf_ptype_lookup[BIT(8)] = { - /* L2 Packet types */ - IAVF_PTT_UNUSED_ENTRY(0), - IAVF_PTT(1, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY2), - IAVF_PTT(2, L2, NONE, NOF, NONE, NONE, NOF, TS, PAY2), - IAVF_PTT(3, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY2), - IAVF_PTT_UNUSED_ENTRY(4), - IAVF_PTT_UNUSED_ENTRY(5), - IAVF_PTT(6, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY2), - IAVF_PTT(7, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY2), - IAVF_PTT_UNUSED_ENTRY(8), - IAVF_PTT_UNUSED_ENTRY(9), - IAVF_PTT(10, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY2), - IAVF_PTT(11, L2, NONE, NOF, NONE, NONE, NOF, NONE, NONE), - IAVF_PTT(12, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), - IAVF_PTT(13, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), - IAVF_PTT(14, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), - IAVF_PTT(15, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), - IAVF_PTT(16, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), - IAVF_PTT(17, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), - IAVF_PTT(18, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), - IAVF_PTT(19, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), - IAVF_PTT(20, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), - IAVF_PTT(21, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), - - /* Non Tunneled IPv4 */ - IAVF_PTT(22, IP, IPV4, FRG, NONE, NONE, NOF, NONE, PAY3), - IAVF_PTT(23, IP, IPV4, NOF, NONE, NONE, NOF, NONE, PAY3), - IAVF_PTT(24, IP, IPV4, NOF, NONE, NONE, NOF, UDP, PAY4), - IAVF_PTT_UNUSED_ENTRY(25), - IAVF_PTT(26, IP, IPV4, NOF, NONE, NONE, NOF, TCP, PAY4), - IAVF_PTT(27, IP, IPV4, NOF, NONE, NONE, NOF, SCTP, PAY4), - IAVF_PTT(28, IP, IPV4, NOF, NONE, NONE, NOF, ICMP, PAY4), - - /* IPv4 --> IPv4 */ - IAVF_PTT(29, IP, IPV4, NOF, IP_IP, IPV4, FRG, NONE, PAY3), - IAVF_PTT(30, IP, IPV4, NOF, IP_IP, IPV4, NOF, NONE, PAY3), - IAVF_PTT(31, IP, IPV4, NOF, IP_IP, IPV4, NOF, UDP, PAY4), - IAVF_PTT_UNUSED_ENTRY(32), - IAVF_PTT(33, IP, IPV4, NOF, IP_IP, IPV4, NOF, TCP, PAY4), - IAVF_PTT(34, IP, IPV4, NOF, IP_IP, IPV4, NOF, SCTP, PAY4), - IAVF_PTT(35, IP, IPV4, NOF, IP_IP, IPV4, NOF, ICMP, PAY4), - - /* IPv4 --> IPv6 */ - IAVF_PTT(36, IP, IPV4, NOF, IP_IP, IPV6, FRG, NONE, PAY3), - IAVF_PTT(37, IP, IPV4, NOF, IP_IP, IPV6, NOF, NONE, PAY3), - IAVF_PTT(38, IP, IPV4, NOF, IP_IP, IPV6, NOF, UDP, PAY4), - IAVF_PTT_UNUSED_ENTRY(39), - IAVF_PTT(40, IP, IPV4, NOF, IP_IP, IPV6, NOF, TCP, PAY4), - IAVF_PTT(41, 
IP, IPV4, NOF, IP_IP, IPV6, NOF, SCTP, PAY4), - IAVF_PTT(42, IP, IPV4, NOF, IP_IP, IPV6, NOF, ICMP, PAY4), - - /* IPv4 --> GRE/NAT */ - IAVF_PTT(43, IP, IPV4, NOF, IP_GRENAT, NONE, NOF, NONE, PAY3), - - /* IPv4 --> GRE/NAT --> IPv4 */ - IAVF_PTT(44, IP, IPV4, NOF, IP_GRENAT, IPV4, FRG, NONE, PAY3), - IAVF_PTT(45, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, NONE, PAY3), - IAVF_PTT(46, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, UDP, PAY4), - IAVF_PTT_UNUSED_ENTRY(47), - IAVF_PTT(48, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, TCP, PAY4), - IAVF_PTT(49, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, SCTP, PAY4), - IAVF_PTT(50, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, ICMP, PAY4), - - /* IPv4 --> GRE/NAT --> IPv6 */ - IAVF_PTT(51, IP, IPV4, NOF, IP_GRENAT, IPV6, FRG, NONE, PAY3), - IAVF_PTT(52, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, NONE, PAY3), - IAVF_PTT(53, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, UDP, PAY4), - IAVF_PTT_UNUSED_ENTRY(54), - IAVF_PTT(55, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, TCP, PAY4), - IAVF_PTT(56, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, SCTP, PAY4), - IAVF_PTT(57, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, ICMP, PAY4), - - /* IPv4 --> GRE/NAT --> MAC */ - IAVF_PTT(58, IP, IPV4, NOF, IP_GRENAT_MAC, NONE, NOF, NONE, PAY3), - - /* IPv4 --> GRE/NAT --> MAC --> IPv4 */ - IAVF_PTT(59, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, FRG, NONE, PAY3), - IAVF_PTT(60, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, NONE, PAY3), - IAVF_PTT(61, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, UDP, PAY4), - IAVF_PTT_UNUSED_ENTRY(62), - IAVF_PTT(63, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, TCP, PAY4), - IAVF_PTT(64, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, SCTP, PAY4), - IAVF_PTT(65, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, ICMP, PAY4), - - /* IPv4 --> GRE/NAT -> MAC --> IPv6 */ - IAVF_PTT(66, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, FRG, NONE, PAY3), - IAVF_PTT(67, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, NONE, PAY3), - IAVF_PTT(68, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, UDP, PAY4), - IAVF_PTT_UNUSED_ENTRY(69), - IAVF_PTT(70, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, TCP, PAY4), - IAVF_PTT(71, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, SCTP, PAY4), - IAVF_PTT(72, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, ICMP, PAY4), - - /* IPv4 --> GRE/NAT --> MAC/VLAN */ - IAVF_PTT(73, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, NONE, NOF, NONE, PAY3), - - /* IPv4 ---> GRE/NAT -> MAC/VLAN --> IPv4 */ - IAVF_PTT(74, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, FRG, NONE, PAY3), - IAVF_PTT(75, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, NONE, PAY3), - IAVF_PTT(76, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, UDP, PAY4), - IAVF_PTT_UNUSED_ENTRY(77), - IAVF_PTT(78, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, TCP, PAY4), - IAVF_PTT(79, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, SCTP, PAY4), - IAVF_PTT(80, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, ICMP, PAY4), - - /* IPv4 -> GRE/NAT -> MAC/VLAN --> IPv6 */ - IAVF_PTT(81, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, FRG, NONE, PAY3), - IAVF_PTT(82, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, NONE, PAY3), - IAVF_PTT(83, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, UDP, PAY4), - IAVF_PTT_UNUSED_ENTRY(84), - IAVF_PTT(85, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, TCP, PAY4), - IAVF_PTT(86, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, SCTP, PAY4), - IAVF_PTT(87, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, ICMP, PAY4), - - /* Non Tunneled IPv6 */ - IAVF_PTT(88, IP, IPV6, FRG, NONE, NONE, NOF, NONE, PAY3), - IAVF_PTT(89, IP, IPV6, NOF, NONE, NONE, NOF, NONE, PAY3), - IAVF_PTT(90, IP, IPV6, NOF, NONE, NONE, NOF, UDP, PAY4), - 
IAVF_PTT_UNUSED_ENTRY(91), - IAVF_PTT(92, IP, IPV6, NOF, NONE, NONE, NOF, TCP, PAY4), - IAVF_PTT(93, IP, IPV6, NOF, NONE, NONE, NOF, SCTP, PAY4), - IAVF_PTT(94, IP, IPV6, NOF, NONE, NONE, NOF, ICMP, PAY4), - - /* IPv6 --> IPv4 */ - IAVF_PTT(95, IP, IPV6, NOF, IP_IP, IPV4, FRG, NONE, PAY3), - IAVF_PTT(96, IP, IPV6, NOF, IP_IP, IPV4, NOF, NONE, PAY3), - IAVF_PTT(97, IP, IPV6, NOF, IP_IP, IPV4, NOF, UDP, PAY4), - IAVF_PTT_UNUSED_ENTRY(98), - IAVF_PTT(99, IP, IPV6, NOF, IP_IP, IPV4, NOF, TCP, PAY4), - IAVF_PTT(100, IP, IPV6, NOF, IP_IP, IPV4, NOF, SCTP, PAY4), - IAVF_PTT(101, IP, IPV6, NOF, IP_IP, IPV4, NOF, ICMP, PAY4), - - /* IPv6 --> IPv6 */ - IAVF_PTT(102, IP, IPV6, NOF, IP_IP, IPV6, FRG, NONE, PAY3), - IAVF_PTT(103, IP, IPV6, NOF, IP_IP, IPV6, NOF, NONE, PAY3), - IAVF_PTT(104, IP, IPV6, NOF, IP_IP, IPV6, NOF, UDP, PAY4), - IAVF_PTT_UNUSED_ENTRY(105), - IAVF_PTT(106, IP, IPV6, NOF, IP_IP, IPV6, NOF, TCP, PAY4), - IAVF_PTT(107, IP, IPV6, NOF, IP_IP, IPV6, NOF, SCTP, PAY4), - IAVF_PTT(108, IP, IPV6, NOF, IP_IP, IPV6, NOF, ICMP, PAY4), - - /* IPv6 --> GRE/NAT */ - IAVF_PTT(109, IP, IPV6, NOF, IP_GRENAT, NONE, NOF, NONE, PAY3), - - /* IPv6 --> GRE/NAT -> IPv4 */ - IAVF_PTT(110, IP, IPV6, NOF, IP_GRENAT, IPV4, FRG, NONE, PAY3), - IAVF_PTT(111, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, NONE, PAY3), - IAVF_PTT(112, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, UDP, PAY4), - IAVF_PTT_UNUSED_ENTRY(113), - IAVF_PTT(114, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, TCP, PAY4), - IAVF_PTT(115, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, SCTP, PAY4), - IAVF_PTT(116, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, ICMP, PAY4), - - /* IPv6 --> GRE/NAT -> IPv6 */ - IAVF_PTT(117, IP, IPV6, NOF, IP_GRENAT, IPV6, FRG, NONE, PAY3), - IAVF_PTT(118, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, NONE, PAY3), - IAVF_PTT(119, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, UDP, PAY4), - IAVF_PTT_UNUSED_ENTRY(120), - IAVF_PTT(121, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, TCP, PAY4), - IAVF_PTT(122, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, SCTP, PAY4), - IAVF_PTT(123, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, ICMP, PAY4), - - /* IPv6 --> GRE/NAT -> MAC */ - IAVF_PTT(124, IP, IPV6, NOF, IP_GRENAT_MAC, NONE, NOF, NONE, PAY3), - - /* IPv6 --> GRE/NAT -> MAC -> IPv4 */ - IAVF_PTT(125, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, FRG, NONE, PAY3), - IAVF_PTT(126, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, NONE, PAY3), - IAVF_PTT(127, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, UDP, PAY4), - IAVF_PTT_UNUSED_ENTRY(128), - IAVF_PTT(129, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, TCP, PAY4), - IAVF_PTT(130, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, SCTP, PAY4), - IAVF_PTT(131, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, ICMP, PAY4), - - /* IPv6 --> GRE/NAT -> MAC -> IPv6 */ - IAVF_PTT(132, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, FRG, NONE, PAY3), - IAVF_PTT(133, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, NONE, PAY3), - IAVF_PTT(134, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, UDP, PAY4), - IAVF_PTT_UNUSED_ENTRY(135), - IAVF_PTT(136, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, TCP, PAY4), - IAVF_PTT(137, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, SCTP, PAY4), - IAVF_PTT(138, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, ICMP, PAY4), - - /* IPv6 --> GRE/NAT -> MAC/VLAN */ - IAVF_PTT(139, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, NONE, NOF, NONE, PAY3), - - /* IPv6 --> GRE/NAT -> MAC/VLAN --> IPv4 */ - IAVF_PTT(140, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, FRG, NONE, PAY3), - IAVF_PTT(141, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, NONE, PAY3), - IAVF_PTT(142, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, UDP, PAY4), - 
IAVF_PTT_UNUSED_ENTRY(143), - IAVF_PTT(144, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, TCP, PAY4), - IAVF_PTT(145, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, SCTP, PAY4), - IAVF_PTT(146, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, ICMP, PAY4), - - /* IPv6 --> GRE/NAT -> MAC/VLAN --> IPv6 */ - IAVF_PTT(147, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, FRG, NONE, PAY3), - IAVF_PTT(148, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, NONE, PAY3), - IAVF_PTT(149, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, UDP, PAY4), - IAVF_PTT_UNUSED_ENTRY(150), - IAVF_PTT(151, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, TCP, PAY4), - IAVF_PTT(152, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, SCTP, PAY4), - IAVF_PTT(153, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, ICMP, PAY4), - - /* unused entries */ - [154 ... 255] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 } -}; - /** * iavf_aq_send_msg_to_pf * @hw: pointer to the hardware structure diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 095201e83c9db0..9f2e67a6cde3db 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -46,6 +46,7 @@ MODULE_DEVICE_TABLE(pci, iavf_pci_tbl); MODULE_ALIAS("i40evf"); MODULE_AUTHOR("Intel Corporation, "); MODULE_DESCRIPTION("Intel(R) Ethernet Adaptive Virtual Function Network Driver"); +MODULE_IMPORT_NS(LIBIE); MODULE_LICENSE("GPL v2"); static const struct net_device_ops iavf_netdev_ops; diff --git a/drivers/net/ethernet/intel/iavf/iavf_prototype.h b/drivers/net/ethernet/intel/iavf/iavf_prototype.h index edebfbbcffdc2e..c2e5dbc0a75a35 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_prototype.h +++ b/drivers/net/ethernet/intel/iavf/iavf_prototype.h @@ -51,13 +51,6 @@ enum iavf_status iavf_aq_set_rss_key(struct iavf_hw *hw, u16 seid, enum iavf_status iavf_set_mac_type(struct iavf_hw *hw); -extern struct iavf_rx_ptype_decoded iavf_ptype_lookup[]; - -static inline struct iavf_rx_ptype_decoded decode_rx_desc_ptype(u8 ptype) -{ - return iavf_ptype_lookup[ptype]; -} - void iavf_vf_parse_hw_config(struct iavf_hw *hw, struct virtchnl_vf_resource *msg); enum iavf_status iavf_vf_reset(struct iavf_hw *hw); diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c index e989feda133c1e..a83b96e9b6fcf4 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright(c) 2013 - 2018 Intel Corporation. */ +#include #include #include "iavf.h" @@ -982,40 +983,32 @@ static inline void iavf_rx_checksum(struct iavf_vsi *vsi, struct sk_buff *skb, union iavf_rx_desc *rx_desc) { - struct iavf_rx_ptype_decoded decoded; + struct libie_rx_ptype_parsed parsed; u32 rx_error, rx_status; bool ipv4, ipv6; u8 ptype; u64 qword; + skb->ip_summed = CHECKSUM_NONE; + qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len); ptype = (qword & IAVF_RXD_QW1_PTYPE_MASK) >> IAVF_RXD_QW1_PTYPE_SHIFT; + + parsed = libie_parse_rx_ptype(ptype); + if (!libie_has_rx_checksum(vsi->netdev, parsed)) + return; + rx_error = (qword & IAVF_RXD_QW1_ERROR_MASK) >> IAVF_RXD_QW1_ERROR_SHIFT; rx_status = (qword & IAVF_RXD_QW1_STATUS_MASK) >> IAVF_RXD_QW1_STATUS_SHIFT; - decoded = decode_rx_desc_ptype(ptype); - - skb->ip_summed = CHECKSUM_NONE; - - skb_checksum_none_assert(skb); - - /* Rx csum enabled and ip headers found? */ - if (!(vsi->netdev->features & NETIF_F_RXCSUM)) - return; /* did the hardware decode the packet and checksum? 
*/ if (!(rx_status & BIT(IAVF_RX_DESC_STATUS_L3L4P_SHIFT))) return; - /* both known and outer_ip must be set for the below code to work */ - if (!(decoded.known && decoded.outer_ip)) - return; - - ipv4 = (decoded.outer_ip == IAVF_RX_PTYPE_OUTER_IP) && - (decoded.outer_ip_ver == IAVF_RX_PTYPE_OUTER_IPV4); - ipv6 = (decoded.outer_ip == IAVF_RX_PTYPE_OUTER_IP) && - (decoded.outer_ip_ver == IAVF_RX_PTYPE_OUTER_IPV6); + ipv4 = parsed.outer_ip == LIBIE_RX_PTYPE_OUTER_IPV4; + ipv6 = parsed.outer_ip == LIBIE_RX_PTYPE_OUTER_IPV6; if (ipv4 && (rx_error & (BIT(IAVF_RX_DESC_ERROR_IPE_SHIFT) | @@ -1039,46 +1032,13 @@ static inline void iavf_rx_checksum(struct iavf_vsi *vsi, if (rx_error & BIT(IAVF_RX_DESC_ERROR_PPRS_SHIFT)) return; - /* Only report checksum unnecessary for TCP, UDP, or SCTP */ - switch (decoded.inner_prot) { - case IAVF_RX_PTYPE_INNER_PROT_TCP: - case IAVF_RX_PTYPE_INNER_PROT_UDP: - case IAVF_RX_PTYPE_INNER_PROT_SCTP: - skb->ip_summed = CHECKSUM_UNNECESSARY; - fallthrough; - default: - break; - } - + skb->ip_summed = CHECKSUM_UNNECESSARY; return; checksum_fail: vsi->back->hw_csum_rx_error++; } -/** - * iavf_ptype_to_htype - get a hash type - * @ptype: the ptype value from the descriptor - * - * Returns a hash type to be used by skb_set_hash - **/ -static inline int iavf_ptype_to_htype(u8 ptype) -{ - struct iavf_rx_ptype_decoded decoded = decode_rx_desc_ptype(ptype); - - if (!decoded.known) - return PKT_HASH_TYPE_NONE; - - if (decoded.outer_ip == IAVF_RX_PTYPE_OUTER_IP && - decoded.payload_layer == IAVF_RX_PTYPE_PAYLOAD_LAYER_PAY4) - return PKT_HASH_TYPE_L4; - else if (decoded.outer_ip == IAVF_RX_PTYPE_OUTER_IP && - decoded.payload_layer == IAVF_RX_PTYPE_PAYLOAD_LAYER_PAY3) - return PKT_HASH_TYPE_L3; - else - return PKT_HASH_TYPE_L2; -} - /** * iavf_rx_hash - set the hash value in the skb * @ring: descriptor ring @@ -1091,17 +1051,19 @@ static inline void iavf_rx_hash(struct iavf_ring *ring, struct sk_buff *skb, u8 rx_ptype) { + struct libie_rx_ptype_parsed parsed; u32 hash; const __le64 rss_mask = cpu_to_le64((u64)IAVF_RX_DESC_FLTSTAT_RSS_HASH << IAVF_RX_DESC_STATUS_FLTSTAT_SHIFT); - if (!(ring->netdev->features & NETIF_F_RXHASH)) + parsed = libie_parse_rx_ptype(rx_ptype); + if (!libie_has_rx_hash(ring->netdev, parsed)) return; if ((rx_desc->wb.qword1.status_error_len & rss_mask) == rss_mask) { hash = le32_to_cpu(rx_desc->wb.qword0.hi_dword.rss); - skb_set_hash(skb, hash, iavf_ptype_to_htype(rx_ptype)); + libie_skb_set_hash(skb, hash, parsed); } } diff --git a/drivers/net/ethernet/intel/iavf/iavf_type.h b/drivers/net/ethernet/intel/iavf/iavf_type.h index 9f1f523807c4e6..3030ba33032603 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_type.h +++ b/drivers/net/ethernet/intel/iavf/iavf_type.h @@ -339,94 +339,6 @@ enum iavf_rx_desc_error_l3l4e_fcoe_masks { #define IAVF_RXD_QW1_PTYPE_SHIFT 30 #define IAVF_RXD_QW1_PTYPE_MASK (0xFFULL << IAVF_RXD_QW1_PTYPE_SHIFT) -/* Packet type non-ip values */ -enum iavf_rx_l2_ptype { - IAVF_RX_PTYPE_L2_RESERVED = 0, - IAVF_RX_PTYPE_L2_MAC_PAY2 = 1, - IAVF_RX_PTYPE_L2_TIMESYNC_PAY2 = 2, - IAVF_RX_PTYPE_L2_FIP_PAY2 = 3, - IAVF_RX_PTYPE_L2_OUI_PAY2 = 4, - IAVF_RX_PTYPE_L2_MACCNTRL_PAY2 = 5, - IAVF_RX_PTYPE_L2_LLDP_PAY2 = 6, - IAVF_RX_PTYPE_L2_ECP_PAY2 = 7, - IAVF_RX_PTYPE_L2_EVB_PAY2 = 8, - IAVF_RX_PTYPE_L2_QCN_PAY2 = 9, - IAVF_RX_PTYPE_L2_EAPOL_PAY2 = 10, - IAVF_RX_PTYPE_L2_ARP = 11, - IAVF_RX_PTYPE_L2_FCOE_PAY3 = 12, - IAVF_RX_PTYPE_L2_FCOE_FCDATA_PAY3 = 13, - IAVF_RX_PTYPE_L2_FCOE_FCRDY_PAY3 = 14, - IAVF_RX_PTYPE_L2_FCOE_FCRSP_PAY3 = 15, - 
IAVF_RX_PTYPE_L2_FCOE_FCOTHER_PA = 16, - IAVF_RX_PTYPE_L2_FCOE_VFT_PAY3 = 17, - IAVF_RX_PTYPE_L2_FCOE_VFT_FCDATA = 18, - IAVF_RX_PTYPE_L2_FCOE_VFT_FCRDY = 19, - IAVF_RX_PTYPE_L2_FCOE_VFT_FCRSP = 20, - IAVF_RX_PTYPE_L2_FCOE_VFT_FCOTHER = 21, - IAVF_RX_PTYPE_GRENAT4_MAC_PAY3 = 58, - IAVF_RX_PTYPE_GRENAT4_MACVLAN_IPV6_ICMP_PAY4 = 87, - IAVF_RX_PTYPE_GRENAT6_MAC_PAY3 = 124, - IAVF_RX_PTYPE_GRENAT6_MACVLAN_IPV6_ICMP_PAY4 = 153 -}; - -struct iavf_rx_ptype_decoded { - u32 known:1; - u32 outer_ip:1; - u32 outer_ip_ver:1; - u32 outer_frag:1; - u32 tunnel_type:3; - u32 tunnel_end_prot:2; - u32 tunnel_end_frag:1; - u32 inner_prot:4; - u32 payload_layer:3; -}; - -enum iavf_rx_ptype_outer_ip { - IAVF_RX_PTYPE_OUTER_L2 = 0, - IAVF_RX_PTYPE_OUTER_IP = 1 -}; - -enum iavf_rx_ptype_outer_ip_ver { - IAVF_RX_PTYPE_OUTER_NONE = 0, - IAVF_RX_PTYPE_OUTER_IPV4 = 0, - IAVF_RX_PTYPE_OUTER_IPV6 = 1 -}; - -enum iavf_rx_ptype_outer_fragmented { - IAVF_RX_PTYPE_NOT_FRAG = 0, - IAVF_RX_PTYPE_FRAG = 1 -}; - -enum iavf_rx_ptype_tunnel_type { - IAVF_RX_PTYPE_TUNNEL_NONE = 0, - IAVF_RX_PTYPE_TUNNEL_IP_IP = 1, - IAVF_RX_PTYPE_TUNNEL_IP_GRENAT = 2, - IAVF_RX_PTYPE_TUNNEL_IP_GRENAT_MAC = 3, - IAVF_RX_PTYPE_TUNNEL_IP_GRENAT_MAC_VLAN = 4, -}; - -enum iavf_rx_ptype_tunnel_end_prot { - IAVF_RX_PTYPE_TUNNEL_END_NONE = 0, - IAVF_RX_PTYPE_TUNNEL_END_IPV4 = 1, - IAVF_RX_PTYPE_TUNNEL_END_IPV6 = 2, -}; - -enum iavf_rx_ptype_inner_prot { - IAVF_RX_PTYPE_INNER_PROT_NONE = 0, - IAVF_RX_PTYPE_INNER_PROT_UDP = 1, - IAVF_RX_PTYPE_INNER_PROT_TCP = 2, - IAVF_RX_PTYPE_INNER_PROT_SCTP = 3, - IAVF_RX_PTYPE_INNER_PROT_ICMP = 4, - IAVF_RX_PTYPE_INNER_PROT_TIMESYNC = 5 -}; - -enum iavf_rx_ptype_payload_layer { - IAVF_RX_PTYPE_PAYLOAD_LAYER_NONE = 0, - IAVF_RX_PTYPE_PAYLOAD_LAYER_PAY2 = 1, - IAVF_RX_PTYPE_PAYLOAD_LAYER_PAY3 = 2, - IAVF_RX_PTYPE_PAYLOAD_LAYER_PAY4 = 3, -}; - #define IAVF_RXD_QW1_LENGTH_PBUF_SHIFT 38 #define IAVF_RXD_QW1_LENGTH_PBUF_MASK (0x3FFFULL << \ IAVF_RXD_QW1_LENGTH_PBUF_SHIFT) diff --git a/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h b/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h index 89f986a75cc855..611577ebc29d82 100644 --- a/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h +++ b/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h @@ -160,64 +160,6 @@ struct ice_fltr_desc { (0x1ULL << ICE_FXD_FLTR_WB_QW1_FAIL_PROF_S) #define ICE_FXD_FLTR_WB_QW1_FAIL_PROF_YES 0x1ULL -struct ice_rx_ptype_decoded { - u32 known:1; - u32 outer_ip:1; - u32 outer_ip_ver:2; - u32 outer_frag:1; - u32 tunnel_type:3; - u32 tunnel_end_prot:2; - u32 tunnel_end_frag:1; - u32 inner_prot:4; - u32 payload_layer:3; -}; - -enum ice_rx_ptype_outer_ip { - ICE_RX_PTYPE_OUTER_L2 = 0, - ICE_RX_PTYPE_OUTER_IP = 1, -}; - -enum ice_rx_ptype_outer_ip_ver { - ICE_RX_PTYPE_OUTER_NONE = 0, - ICE_RX_PTYPE_OUTER_IPV4 = 1, - ICE_RX_PTYPE_OUTER_IPV6 = 2, -}; - -enum ice_rx_ptype_outer_fragmented { - ICE_RX_PTYPE_NOT_FRAG = 0, - ICE_RX_PTYPE_FRAG = 1, -}; - -enum ice_rx_ptype_tunnel_type { - ICE_RX_PTYPE_TUNNEL_NONE = 0, - ICE_RX_PTYPE_TUNNEL_IP_IP = 1, - ICE_RX_PTYPE_TUNNEL_IP_GRENAT = 2, - ICE_RX_PTYPE_TUNNEL_IP_GRENAT_MAC = 3, - ICE_RX_PTYPE_TUNNEL_IP_GRENAT_MAC_VLAN = 4, -}; - -enum ice_rx_ptype_tunnel_end_prot { - ICE_RX_PTYPE_TUNNEL_END_NONE = 0, - ICE_RX_PTYPE_TUNNEL_END_IPV4 = 1, - ICE_RX_PTYPE_TUNNEL_END_IPV6 = 2, -}; - -enum ice_rx_ptype_inner_prot { - ICE_RX_PTYPE_INNER_PROT_NONE = 0, - ICE_RX_PTYPE_INNER_PROT_UDP = 1, - ICE_RX_PTYPE_INNER_PROT_TCP = 2, - ICE_RX_PTYPE_INNER_PROT_SCTP = 3, - ICE_RX_PTYPE_INNER_PROT_ICMP = 4, - ICE_RX_PTYPE_INNER_PROT_TIMESYNC = 5, -}; - 
-enum ice_rx_ptype_payload_layer { - ICE_RX_PTYPE_PAYLOAD_LAYER_NONE = 0, - ICE_RX_PTYPE_PAYLOAD_LAYER_PAY2 = 1, - ICE_RX_PTYPE_PAYLOAD_LAYER_PAY3 = 2, - ICE_RX_PTYPE_PAYLOAD_LAYER_PAY4 = 3, -}; - /* Rx Flex Descriptor * This descriptor is used instead of the legacy version descriptor when * ice_rlan_ctx.adv_desc is set @@ -651,262 +593,4 @@ struct ice_tlan_ctx { u8 int_q_state; /* width not needed - internal - DO NOT WRITE!!! */ }; -/* The ice_ptype_lkup table is used to convert from the 10-bit ptype in the - * hardware to a bit-field that can be used by SW to more easily determine the - * packet type. - * - * Macros are used to shorten the table lines and make this table human - * readable. - * - * We store the PTYPE in the top byte of the bit field - this is just so that - * we can check that the table doesn't have a row missing, as the index into - * the table should be the PTYPE. - * - * Typical work flow: - * - * IF NOT ice_ptype_lkup[ptype].known - * THEN - * Packet is unknown - * ELSE IF ice_ptype_lkup[ptype].outer_ip == ICE_RX_PTYPE_OUTER_IP - * Use the rest of the fields to look at the tunnels, inner protocols, etc - * ELSE - * Use the enum ice_rx_l2_ptype to decode the packet type - * ENDIF - */ - -/* macro to make the table lines short, use explicit indexing with [PTYPE] */ -#define ICE_PTT(PTYPE, OUTER_IP, OUTER_IP_VER, OUTER_FRAG, T, TE, TEF, I, PL)\ - [PTYPE] = { \ - 1, \ - ICE_RX_PTYPE_OUTER_##OUTER_IP, \ - ICE_RX_PTYPE_OUTER_##OUTER_IP_VER, \ - ICE_RX_PTYPE_##OUTER_FRAG, \ - ICE_RX_PTYPE_TUNNEL_##T, \ - ICE_RX_PTYPE_TUNNEL_END_##TE, \ - ICE_RX_PTYPE_##TEF, \ - ICE_RX_PTYPE_INNER_PROT_##I, \ - ICE_RX_PTYPE_PAYLOAD_LAYER_##PL } - -#define ICE_PTT_UNUSED_ENTRY(PTYPE) [PTYPE] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 } - -/* shorter macros makes the table fit but are terse */ -#define ICE_RX_PTYPE_NOF ICE_RX_PTYPE_NOT_FRAG -#define ICE_RX_PTYPE_FRG ICE_RX_PTYPE_FRAG - -/* Lookup table mapping in the 10-bit HW PTYPE to the bit field for decoding */ -static const struct ice_rx_ptype_decoded ice_ptype_lkup[BIT(10)] = { - /* L2 Packet types */ - ICE_PTT_UNUSED_ENTRY(0), - ICE_PTT(1, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY2), - ICE_PTT_UNUSED_ENTRY(2), - ICE_PTT_UNUSED_ENTRY(3), - ICE_PTT_UNUSED_ENTRY(4), - ICE_PTT_UNUSED_ENTRY(5), - ICE_PTT(6, L2, NONE, NOF, NONE, NONE, NOF, NONE, NONE), - ICE_PTT(7, L2, NONE, NOF, NONE, NONE, NOF, NONE, NONE), - ICE_PTT_UNUSED_ENTRY(8), - ICE_PTT_UNUSED_ENTRY(9), - ICE_PTT(10, L2, NONE, NOF, NONE, NONE, NOF, NONE, NONE), - ICE_PTT(11, L2, NONE, NOF, NONE, NONE, NOF, NONE, NONE), - ICE_PTT_UNUSED_ENTRY(12), - ICE_PTT_UNUSED_ENTRY(13), - ICE_PTT_UNUSED_ENTRY(14), - ICE_PTT_UNUSED_ENTRY(15), - ICE_PTT_UNUSED_ENTRY(16), - ICE_PTT_UNUSED_ENTRY(17), - ICE_PTT_UNUSED_ENTRY(18), - ICE_PTT_UNUSED_ENTRY(19), - ICE_PTT_UNUSED_ENTRY(20), - ICE_PTT_UNUSED_ENTRY(21), - - /* Non Tunneled IPv4 */ - ICE_PTT(22, IP, IPV4, FRG, NONE, NONE, NOF, NONE, PAY3), - ICE_PTT(23, IP, IPV4, NOF, NONE, NONE, NOF, NONE, PAY3), - ICE_PTT(24, IP, IPV4, NOF, NONE, NONE, NOF, UDP, PAY4), - ICE_PTT_UNUSED_ENTRY(25), - ICE_PTT(26, IP, IPV4, NOF, NONE, NONE, NOF, TCP, PAY4), - ICE_PTT(27, IP, IPV4, NOF, NONE, NONE, NOF, SCTP, PAY4), - ICE_PTT(28, IP, IPV4, NOF, NONE, NONE, NOF, ICMP, PAY4), - - /* IPv4 --> IPv4 */ - ICE_PTT(29, IP, IPV4, NOF, IP_IP, IPV4, FRG, NONE, PAY3), - ICE_PTT(30, IP, IPV4, NOF, IP_IP, IPV4, NOF, NONE, PAY3), - ICE_PTT(31, IP, IPV4, NOF, IP_IP, IPV4, NOF, UDP, PAY4), - ICE_PTT_UNUSED_ENTRY(32), - ICE_PTT(33, IP, IPV4, NOF, IP_IP, IPV4, NOF, TCP, PAY4), - ICE_PTT(34, IP, 
IPV4, NOF, IP_IP, IPV4, NOF, SCTP, PAY4), - ICE_PTT(35, IP, IPV4, NOF, IP_IP, IPV4, NOF, ICMP, PAY4), - - /* IPv4 --> IPv6 */ - ICE_PTT(36, IP, IPV4, NOF, IP_IP, IPV6, FRG, NONE, PAY3), - ICE_PTT(37, IP, IPV4, NOF, IP_IP, IPV6, NOF, NONE, PAY3), - ICE_PTT(38, IP, IPV4, NOF, IP_IP, IPV6, NOF, UDP, PAY4), - ICE_PTT_UNUSED_ENTRY(39), - ICE_PTT(40, IP, IPV4, NOF, IP_IP, IPV6, NOF, TCP, PAY4), - ICE_PTT(41, IP, IPV4, NOF, IP_IP, IPV6, NOF, SCTP, PAY4), - ICE_PTT(42, IP, IPV4, NOF, IP_IP, IPV6, NOF, ICMP, PAY4), - - /* IPv4 --> GRE/NAT */ - ICE_PTT(43, IP, IPV4, NOF, IP_GRENAT, NONE, NOF, NONE, PAY3), - - /* IPv4 --> GRE/NAT --> IPv4 */ - ICE_PTT(44, IP, IPV4, NOF, IP_GRENAT, IPV4, FRG, NONE, PAY3), - ICE_PTT(45, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, NONE, PAY3), - ICE_PTT(46, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, UDP, PAY4), - ICE_PTT_UNUSED_ENTRY(47), - ICE_PTT(48, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, TCP, PAY4), - ICE_PTT(49, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, SCTP, PAY4), - ICE_PTT(50, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, ICMP, PAY4), - - /* IPv4 --> GRE/NAT --> IPv6 */ - ICE_PTT(51, IP, IPV4, NOF, IP_GRENAT, IPV6, FRG, NONE, PAY3), - ICE_PTT(52, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, NONE, PAY3), - ICE_PTT(53, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, UDP, PAY4), - ICE_PTT_UNUSED_ENTRY(54), - ICE_PTT(55, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, TCP, PAY4), - ICE_PTT(56, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, SCTP, PAY4), - ICE_PTT(57, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, ICMP, PAY4), - - /* IPv4 --> GRE/NAT --> MAC */ - ICE_PTT(58, IP, IPV4, NOF, IP_GRENAT_MAC, NONE, NOF, NONE, PAY3), - - /* IPv4 --> GRE/NAT --> MAC --> IPv4 */ - ICE_PTT(59, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, FRG, NONE, PAY3), - ICE_PTT(60, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, NONE, PAY3), - ICE_PTT(61, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, UDP, PAY4), - ICE_PTT_UNUSED_ENTRY(62), - ICE_PTT(63, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, TCP, PAY4), - ICE_PTT(64, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, SCTP, PAY4), - ICE_PTT(65, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, ICMP, PAY4), - - /* IPv4 --> GRE/NAT -> MAC --> IPv6 */ - ICE_PTT(66, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, FRG, NONE, PAY3), - ICE_PTT(67, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, NONE, PAY3), - ICE_PTT(68, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, UDP, PAY4), - ICE_PTT_UNUSED_ENTRY(69), - ICE_PTT(70, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, TCP, PAY4), - ICE_PTT(71, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, SCTP, PAY4), - ICE_PTT(72, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, ICMP, PAY4), - - /* IPv4 --> GRE/NAT --> MAC/VLAN */ - ICE_PTT(73, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, NONE, NOF, NONE, PAY3), - - /* IPv4 ---> GRE/NAT -> MAC/VLAN --> IPv4 */ - ICE_PTT(74, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, FRG, NONE, PAY3), - ICE_PTT(75, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, NONE, PAY3), - ICE_PTT(76, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, UDP, PAY4), - ICE_PTT_UNUSED_ENTRY(77), - ICE_PTT(78, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, TCP, PAY4), - ICE_PTT(79, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, SCTP, PAY4), - ICE_PTT(80, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, ICMP, PAY4), - - /* IPv4 -> GRE/NAT -> MAC/VLAN --> IPv6 */ - ICE_PTT(81, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, FRG, NONE, PAY3), - ICE_PTT(82, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, NONE, PAY3), - ICE_PTT(83, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, UDP, PAY4), - ICE_PTT_UNUSED_ENTRY(84), - ICE_PTT(85, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, TCP, PAY4), - 
ICE_PTT(86, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, SCTP, PAY4), - ICE_PTT(87, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, ICMP, PAY4), - - /* Non Tunneled IPv6 */ - ICE_PTT(88, IP, IPV6, FRG, NONE, NONE, NOF, NONE, PAY3), - ICE_PTT(89, IP, IPV6, NOF, NONE, NONE, NOF, NONE, PAY3), - ICE_PTT(90, IP, IPV6, NOF, NONE, NONE, NOF, UDP, PAY4), - ICE_PTT_UNUSED_ENTRY(91), - ICE_PTT(92, IP, IPV6, NOF, NONE, NONE, NOF, TCP, PAY4), - ICE_PTT(93, IP, IPV6, NOF, NONE, NONE, NOF, SCTP, PAY4), - ICE_PTT(94, IP, IPV6, NOF, NONE, NONE, NOF, ICMP, PAY4), - - /* IPv6 --> IPv4 */ - ICE_PTT(95, IP, IPV6, NOF, IP_IP, IPV4, FRG, NONE, PAY3), - ICE_PTT(96, IP, IPV6, NOF, IP_IP, IPV4, NOF, NONE, PAY3), - ICE_PTT(97, IP, IPV6, NOF, IP_IP, IPV4, NOF, UDP, PAY4), - ICE_PTT_UNUSED_ENTRY(98), - ICE_PTT(99, IP, IPV6, NOF, IP_IP, IPV4, NOF, TCP, PAY4), - ICE_PTT(100, IP, IPV6, NOF, IP_IP, IPV4, NOF, SCTP, PAY4), - ICE_PTT(101, IP, IPV6, NOF, IP_IP, IPV4, NOF, ICMP, PAY4), - - /* IPv6 --> IPv6 */ - ICE_PTT(102, IP, IPV6, NOF, IP_IP, IPV6, FRG, NONE, PAY3), - ICE_PTT(103, IP, IPV6, NOF, IP_IP, IPV6, NOF, NONE, PAY3), - ICE_PTT(104, IP, IPV6, NOF, IP_IP, IPV6, NOF, UDP, PAY4), - ICE_PTT_UNUSED_ENTRY(105), - ICE_PTT(106, IP, IPV6, NOF, IP_IP, IPV6, NOF, TCP, PAY4), - ICE_PTT(107, IP, IPV6, NOF, IP_IP, IPV6, NOF, SCTP, PAY4), - ICE_PTT(108, IP, IPV6, NOF, IP_IP, IPV6, NOF, ICMP, PAY4), - - /* IPv6 --> GRE/NAT */ - ICE_PTT(109, IP, IPV6, NOF, IP_GRENAT, NONE, NOF, NONE, PAY3), - - /* IPv6 --> GRE/NAT -> IPv4 */ - ICE_PTT(110, IP, IPV6, NOF, IP_GRENAT, IPV4, FRG, NONE, PAY3), - ICE_PTT(111, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, NONE, PAY3), - ICE_PTT(112, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, UDP, PAY4), - ICE_PTT_UNUSED_ENTRY(113), - ICE_PTT(114, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, TCP, PAY4), - ICE_PTT(115, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, SCTP, PAY4), - ICE_PTT(116, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, ICMP, PAY4), - - /* IPv6 --> GRE/NAT -> IPv6 */ - ICE_PTT(117, IP, IPV6, NOF, IP_GRENAT, IPV6, FRG, NONE, PAY3), - ICE_PTT(118, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, NONE, PAY3), - ICE_PTT(119, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, UDP, PAY4), - ICE_PTT_UNUSED_ENTRY(120), - ICE_PTT(121, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, TCP, PAY4), - ICE_PTT(122, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, SCTP, PAY4), - ICE_PTT(123, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, ICMP, PAY4), - - /* IPv6 --> GRE/NAT -> MAC */ - ICE_PTT(124, IP, IPV6, NOF, IP_GRENAT_MAC, NONE, NOF, NONE, PAY3), - - /* IPv6 --> GRE/NAT -> MAC -> IPv4 */ - ICE_PTT(125, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, FRG, NONE, PAY3), - ICE_PTT(126, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, NONE, PAY3), - ICE_PTT(127, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, UDP, PAY4), - ICE_PTT_UNUSED_ENTRY(128), - ICE_PTT(129, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, TCP, PAY4), - ICE_PTT(130, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, SCTP, PAY4), - ICE_PTT(131, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, ICMP, PAY4), - - /* IPv6 --> GRE/NAT -> MAC -> IPv6 */ - ICE_PTT(132, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, FRG, NONE, PAY3), - ICE_PTT(133, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, NONE, PAY3), - ICE_PTT(134, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, UDP, PAY4), - ICE_PTT_UNUSED_ENTRY(135), - ICE_PTT(136, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, TCP, PAY4), - ICE_PTT(137, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, SCTP, PAY4), - ICE_PTT(138, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, ICMP, PAY4), - - /* IPv6 --> GRE/NAT -> MAC/VLAN */ - ICE_PTT(139, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, NONE, NOF, NONE, 
PAY3), - - /* IPv6 --> GRE/NAT -> MAC/VLAN --> IPv4 */ - ICE_PTT(140, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, FRG, NONE, PAY3), - ICE_PTT(141, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, NONE, PAY3), - ICE_PTT(142, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, UDP, PAY4), - ICE_PTT_UNUSED_ENTRY(143), - ICE_PTT(144, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, TCP, PAY4), - ICE_PTT(145, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, SCTP, PAY4), - ICE_PTT(146, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, ICMP, PAY4), - - /* IPv6 --> GRE/NAT -> MAC/VLAN --> IPv6 */ - ICE_PTT(147, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, FRG, NONE, PAY3), - ICE_PTT(148, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, NONE, PAY3), - ICE_PTT(149, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, UDP, PAY4), - ICE_PTT_UNUSED_ENTRY(150), - ICE_PTT(151, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, TCP, PAY4), - ICE_PTT(152, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, SCTP, PAY4), - ICE_PTT(153, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, ICMP, PAY4), - - /* unused entries */ - [154 ... 1023] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 } -}; - -static inline struct ice_rx_ptype_decoded ice_decode_rx_desc_ptype(u16 ptype) -{ - return ice_ptype_lkup[ptype]; -} - - #endif /* _ICE_LAN_TX_RX_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index a1f7c8edc22f34..f3d9c5ddef33e3 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -34,6 +34,7 @@ static const char ice_copyright[] = "Copyright (c) 2018, Intel Corporation."; MODULE_AUTHOR("Intel Corporation, "); MODULE_DESCRIPTION(DRV_SUMMARY); +MODULE_IMPORT_NS(LIBIE); MODULE_LICENSE("GPL v2"); MODULE_FIRMWARE(ICE_DDP_PKG_FILE); diff --git a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c index 7bc5aa340c7df7..3b3793428ab9ca 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c @@ -2,6 +2,7 @@ /* Copyright (c) 2019, Intel Corporation. */ #include +#include #include "ice_txrx_lib.h" #include "ice_eswitch.h" @@ -38,30 +39,6 @@ void ice_release_rx_desc(struct ice_rx_ring *rx_ring, u16 val) } } -/** - * ice_ptype_to_htype - get a hash type - * @ptype: the ptype value from the descriptor - * - * Returns appropriate hash type (such as PKT_HASH_TYPE_L2/L3/L4) to be used by - * skb_set_hash based on PTYPE as parsed by HW Rx pipeline and is part of - * Rx desc. 
- */ -static enum pkt_hash_types ice_ptype_to_htype(u16 ptype) -{ - struct ice_rx_ptype_decoded decoded = ice_decode_rx_desc_ptype(ptype); - - if (!decoded.known) - return PKT_HASH_TYPE_NONE; - if (decoded.payload_layer == ICE_RX_PTYPE_PAYLOAD_LAYER_PAY4) - return PKT_HASH_TYPE_L4; - if (decoded.payload_layer == ICE_RX_PTYPE_PAYLOAD_LAYER_PAY3) - return PKT_HASH_TYPE_L3; - if (decoded.outer_ip == ICE_RX_PTYPE_OUTER_L2) - return PKT_HASH_TYPE_L2; - - return PKT_HASH_TYPE_NONE; -} - /** * ice_rx_hash - set the hash value in the skb * @rx_ring: descriptor ring @@ -74,9 +51,11 @@ ice_rx_hash(struct ice_rx_ring *rx_ring, union ice_32b_rx_flex_desc *rx_desc, struct sk_buff *skb, u16 rx_ptype) { struct ice_32b_rx_flex_desc_nic *nic_mdid; + struct libie_rx_ptype_parsed parsed; u32 hash; - if (!(rx_ring->netdev->features & NETIF_F_RXHASH)) + parsed = libie_parse_rx_ptype(rx_ptype); + if (!libie_has_rx_hash(rx_ring->netdev, parsed)) return; if (rx_desc->wb.rxdid != ICE_RXDID_FLEX_NIC) @@ -84,7 +63,7 @@ ice_rx_hash(struct ice_rx_ring *rx_ring, union ice_32b_rx_flex_desc *rx_desc, nic_mdid = (struct ice_32b_rx_flex_desc_nic *)rx_desc; hash = le32_to_cpu(nic_mdid->rss_hash); - skb_set_hash(skb, hash, ice_ptype_to_htype(rx_ptype)); + libie_skb_set_hash(skb, hash, parsed); } /** @@ -92,7 +71,7 @@ ice_rx_hash(struct ice_rx_ring *rx_ring, union ice_32b_rx_flex_desc *rx_desc, * @ring: the ring we care about * @skb: skb currently being received and modified * @rx_desc: the receive descriptor - * @ptype: the packet type decoded by hardware + * @ptype: the packet type parsed by hardware * * skb->protocol must be set before this function is called */ @@ -100,34 +79,26 @@ static void ice_rx_csum(struct ice_rx_ring *ring, struct sk_buff *skb, union ice_32b_rx_flex_desc *rx_desc, u16 ptype) { - struct ice_rx_ptype_decoded decoded; + struct libie_rx_ptype_parsed parsed; u16 rx_status0, rx_status1; bool ipv4, ipv6; - rx_status0 = le16_to_cpu(rx_desc->wb.status_error0); - rx_status1 = le16_to_cpu(rx_desc->wb.status_error1); - - decoded = ice_decode_rx_desc_ptype(ptype); - /* Start with CHECKSUM_NONE and by default csum_level = 0 */ skb->ip_summed = CHECKSUM_NONE; - skb_checksum_none_assert(skb); - /* check if Rx checksum is enabled */ - if (!(ring->netdev->features & NETIF_F_RXCSUM)) + parsed = libie_parse_rx_ptype(ptype); + if (!libie_has_rx_checksum(ring->netdev, parsed)) return; - /* check if HW has decoded the packet and checksum */ - if (!(rx_status0 & BIT(ICE_RX_FLEX_DESC_STATUS0_L3L4P_S))) - return; + rx_status0 = le16_to_cpu(rx_desc->wb.status_error0); + rx_status1 = le16_to_cpu(rx_desc->wb.status_error1); - if (!(decoded.known && decoded.outer_ip)) + /* check if HW has parsed the packet and checksum */ + if (!(rx_status0 & BIT(ICE_RX_FLEX_DESC_STATUS0_L3L4P_S))) return; - ipv4 = (decoded.outer_ip == ICE_RX_PTYPE_OUTER_IP) && - (decoded.outer_ip_ver == ICE_RX_PTYPE_OUTER_IPV4); - ipv6 = (decoded.outer_ip == ICE_RX_PTYPE_OUTER_IP) && - (decoded.outer_ip_ver == ICE_RX_PTYPE_OUTER_IPV6); + ipv4 = parsed.outer_ip == LIBIE_RX_PTYPE_OUTER_IPV4; + ipv6 = parsed.outer_ip == LIBIE_RX_PTYPE_OUTER_IPV6; if (ipv4 && (rx_status0 & (BIT(ICE_RX_FLEX_DESC_STATUS0_XSUM_IPE_S) | BIT(ICE_RX_FLEX_DESC_STATUS0_XSUM_EIPE_S)))) @@ -151,19 +122,10 @@ ice_rx_csum(struct ice_rx_ring *ring, struct sk_buff *skb, * we need to bump the checksum level by 1 to reflect the fact that * we are indicating we validated the inner checksum. 
*/ - if (decoded.tunnel_type >= ICE_RX_PTYPE_TUNNEL_IP_GRENAT) + if (parsed.tunnel_type >= LIBIE_RX_PTYPE_TUNNEL_IP_GRENAT) skb->csum_level = 1; - /* Only report checksum unnecessary for TCP, UDP, or SCTP */ - switch (decoded.inner_prot) { - case ICE_RX_PTYPE_INNER_PROT_TCP: - case ICE_RX_PTYPE_INNER_PROT_UDP: - case ICE_RX_PTYPE_INNER_PROT_SCTP: - skb->ip_summed = CHECKSUM_UNNECESSARY; - break; - default: - break; - } + skb->ip_summed = CHECKSUM_UNNECESSARY; return; checksum_fail: @@ -175,7 +137,7 @@ ice_rx_csum(struct ice_rx_ring *ring, struct sk_buff *skb, * @rx_ring: Rx descriptor ring packet is being transacted on * @rx_desc: pointer to the EOP Rx descriptor * @skb: pointer to current skb being populated - * @ptype: the packet type decoded by hardware + * @ptype: the packet type parsed by hardware * * This function checks the ring, descriptor, and packet information in * order to populate the hash, checksum, VLAN, protocol, and diff --git a/drivers/net/ethernet/intel/libie/Makefile b/drivers/net/ethernet/intel/libie/Makefile new file mode 100644 index 00000000000000..95e81d09b4746c --- /dev/null +++ b/drivers/net/ethernet/intel/libie/Makefile @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: GPL-2.0-only +# Copyright(c) 2023 Intel Corporation. + +obj-$(CONFIG_LIBIE) += libie.o + +libie-objs += rx.o diff --git a/drivers/net/ethernet/intel/libie/rx.c b/drivers/net/ethernet/intel/libie/rx.c new file mode 100644 index 00000000000000..f503476d8eeff9 --- /dev/null +++ b/drivers/net/ethernet/intel/libie/rx.c @@ -0,0 +1,110 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2023 Intel Corporation. */ + +#include + +/* O(1) converting i40e/ice/iavf's 8/10-bit hardware packet type to a parsed + * bitfield struct. + */ + +#define LIBIE_RX_PTYPE(oip, ofrag, tun, tp, tefr, iprot, pl) { \ + .outer_ip = LIBIE_RX_PTYPE_OUTER_##oip, \ + .outer_frag = LIBIE_RX_PTYPE_##ofrag, \ + .tunnel_type = LIBIE_RX_PTYPE_TUNNEL_IP_##tun, \ + .tunnel_end_prot = LIBIE_RX_PTYPE_TUNNEL_END_##tp, \ + .tunnel_end_frag = LIBIE_RX_PTYPE_##tefr, \ + .inner_prot = LIBIE_RX_PTYPE_INNER_##iprot, \ + .payload_layer = LIBIE_RX_PTYPE_PAYLOAD_##pl, \ + } + +#define LIBIE_RX_PTYPE_UNUSED { } + +#define __LIBIE_RX_PTYPE_L2(iprot, pl) \ + LIBIE_RX_PTYPE(L2, NOT_FRAG, NONE, NONE, NOT_FRAG, iprot, pl) +#define LIBIE_RX_PTYPE_L2 __LIBIE_RX_PTYPE_L2(NONE, L2) +#define LIBIE_RX_PTYPE_TS __LIBIE_RX_PTYPE_L2(TIMESYNC, L2) +#define LIBIE_RX_PTYPE_L3 __LIBIE_RX_PTYPE_L2(NONE, L3) + +#define LIBIE_RX_PTYPE_IP_FRAG(oip) \ + LIBIE_RX_PTYPE(IPV##oip, FRAG, NONE, NONE, NOT_FRAG, NONE, L3) +#define LIBIE_RX_PTYPE_IP_L3(oip, tun, teprot, tefr) \ + LIBIE_RX_PTYPE(IPV##oip, NOT_FRAG, tun, teprot, tefr, NONE, L3) +#define LIBIE_RX_PTYPE_IP_L4(oip, tun, teprot, iprot) \ + LIBIE_RX_PTYPE(IPV##oip, NOT_FRAG, tun, teprot, NOT_FRAG, iprot, L4) + +#define LIBIE_RX_PTYPE_IP_NOF(oip, tun, ver) \ + LIBIE_RX_PTYPE_IP_L3(oip, tun, ver, NOT_FRAG), \ + LIBIE_RX_PTYPE_IP_L4(oip, tun, ver, UDP), \ + LIBIE_RX_PTYPE_UNUSED, \ + LIBIE_RX_PTYPE_IP_L4(oip, tun, ver, TCP), \ + LIBIE_RX_PTYPE_IP_L4(oip, tun, ver, SCTP), \ + LIBIE_RX_PTYPE_IP_L4(oip, tun, ver, ICMP) + +/* IPv oip --> tun --> IPv ver */ +#define LIBIE_RX_PTYPE_IP_TUN_VER(oip, tun, ver) \ + LIBIE_RX_PTYPE_IP_L3(oip, tun, ver, FRAG), \ + LIBIE_RX_PTYPE_IP_NOF(oip, tun, ver) + +/* Non Tunneled IPv oip */ +#define LIBIE_RX_PTYPE_IP_RAW(oip) \ + LIBIE_RX_PTYPE_IP_FRAG(oip), \ + LIBIE_RX_PTYPE_IP_NOF(oip, NONE, NONE) + +/* IPv oip --> tun --> { IPv4, IPv6 } */ +#define LIBIE_RX_PTYPE_IP_TUN(oip, tun) \ + 
LIBIE_RX_PTYPE_IP_TUN_VER(oip, tun, IPV4), \ + LIBIE_RX_PTYPE_IP_TUN_VER(oip, tun, IPV6) + +/* IPv oip --> GRE/NAT tun --> { x, IPv4, IPv6 } */ +#define LIBIE_RX_PTYPE_IP_GRE(oip, tun) \ + LIBIE_RX_PTYPE_IP_L3(oip, tun, NONE, NOT_FRAG), \ + LIBIE_RX_PTYPE_IP_TUN(oip, tun) + +/* Non Tunneled IPv oip + * IPv oip --> { IPv4, IPv6 } + * IPv oip --> GRE/NAT --> { x, IPv4, IPv6 } + * IPv oip --> GRE/NAT --> MAC --> { x, IPv4, IPv6 } + * IPv oip --> GRE/NAT --> MAC/VLAN --> { x, IPv4, IPv6 } + */ +#define LIBIE_RX_PTYPE_IP(oip) \ + LIBIE_RX_PTYPE_IP_RAW(oip), \ + LIBIE_RX_PTYPE_IP_TUN(oip, IP), \ + LIBIE_RX_PTYPE_IP_GRE(oip, GRENAT), \ + LIBIE_RX_PTYPE_IP_GRE(oip, GRENAT_MAC), \ + LIBIE_RX_PTYPE_IP_GRE(oip, GRENAT_MAC_VLAN) + +/* Lookup table mapping for O(1) parsing */ +const struct libie_rx_ptype_parsed libie_rx_ptype_lut[LIBIE_RX_PTYPE_NUM] = { + /* L2 packet types */ + LIBIE_RX_PTYPE_UNUSED, + LIBIE_RX_PTYPE_L2, + LIBIE_RX_PTYPE_TS, + LIBIE_RX_PTYPE_L2, + LIBIE_RX_PTYPE_UNUSED, + LIBIE_RX_PTYPE_UNUSED, + LIBIE_RX_PTYPE_L2, + LIBIE_RX_PTYPE_L2, + LIBIE_RX_PTYPE_UNUSED, + LIBIE_RX_PTYPE_UNUSED, + LIBIE_RX_PTYPE_L2, + LIBIE_RX_PTYPE_UNUSED, + + LIBIE_RX_PTYPE_L3, + LIBIE_RX_PTYPE_L3, + LIBIE_RX_PTYPE_L3, + LIBIE_RX_PTYPE_L3, + LIBIE_RX_PTYPE_L3, + LIBIE_RX_PTYPE_L3, + LIBIE_RX_PTYPE_L3, + LIBIE_RX_PTYPE_L3, + LIBIE_RX_PTYPE_L3, + LIBIE_RX_PTYPE_L3, + + LIBIE_RX_PTYPE_IP(4), + LIBIE_RX_PTYPE_IP(6), +}; +EXPORT_SYMBOL_NS_GPL(libie_rx_ptype_lut, LIBIE); + +MODULE_AUTHOR("Intel Corporation"); +MODULE_DESCRIPTION("Intel(R) Ethernet common library"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/net/intel/libie/rx.h b/include/linux/net/intel/libie/rx.h new file mode 100644 index 00000000000000..58bd0f35d0253f --- /dev/null +++ b/include/linux/net/intel/libie/rx.h @@ -0,0 +1,128 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright(c) 2023 Intel Corporation. */ + +#ifndef __LIBIE_RX_H +#define __LIBIE_RX_H + +#include + +/* O(1) converting i40e/ice/iavf's 8/10-bit hardware packet type to a parsed + * bitfield struct. 
+ */ + +struct libie_rx_ptype_parsed { + u16 outer_ip:2; + u16 outer_frag:1; + u16 tunnel_type:3; + u16 tunnel_end_prot:2; + u16 tunnel_end_frag:1; + u16 inner_prot:3; + u16 payload_layer:2; +}; + +enum libie_rx_ptype_outer_ip { + LIBIE_RX_PTYPE_OUTER_L2 = 0U, + LIBIE_RX_PTYPE_OUTER_IPV4, + LIBIE_RX_PTYPE_OUTER_IPV6, +}; + +enum libie_rx_ptype_outer_fragmented { + LIBIE_RX_PTYPE_NOT_FRAG = 0U, + LIBIE_RX_PTYPE_FRAG, +}; + +enum libie_rx_ptype_tunnel_type { + LIBIE_RX_PTYPE_TUNNEL_IP_NONE = 0U, + LIBIE_RX_PTYPE_TUNNEL_IP_IP, + LIBIE_RX_PTYPE_TUNNEL_IP_GRENAT, + LIBIE_RX_PTYPE_TUNNEL_IP_GRENAT_MAC, + LIBIE_RX_PTYPE_TUNNEL_IP_GRENAT_MAC_VLAN, +}; + +enum libie_rx_ptype_tunnel_end_prot { + LIBIE_RX_PTYPE_TUNNEL_END_NONE = 0U, + LIBIE_RX_PTYPE_TUNNEL_END_IPV4, + LIBIE_RX_PTYPE_TUNNEL_END_IPV6, +}; + +enum libie_rx_ptype_inner_prot { + LIBIE_RX_PTYPE_INNER_NONE = 0U, + LIBIE_RX_PTYPE_INNER_UDP, + LIBIE_RX_PTYPE_INNER_TCP, + LIBIE_RX_PTYPE_INNER_SCTP, + LIBIE_RX_PTYPE_INNER_ICMP, + LIBIE_RX_PTYPE_INNER_TIMESYNC, +}; + +enum libie_rx_ptype_payload_layer { + LIBIE_RX_PTYPE_PAYLOAD_NONE = PKT_HASH_TYPE_NONE, + LIBIE_RX_PTYPE_PAYLOAD_L2 = PKT_HASH_TYPE_L2, + LIBIE_RX_PTYPE_PAYLOAD_L3 = PKT_HASH_TYPE_L3, + LIBIE_RX_PTYPE_PAYLOAD_L4 = PKT_HASH_TYPE_L4, +}; + +#define LIBIE_RX_PTYPE_NUM 154 + +extern const struct libie_rx_ptype_parsed +libie_rx_ptype_lut[LIBIE_RX_PTYPE_NUM]; + +/** + * libie_parse_rx_ptype - convert HW packet type to software bitfield structure + * @ptype: 10-bit hardware packet type value from the descriptor + * + * @libie_rx_ptype_lut must be accessed only using this wrapper. + * + * Returns the parsed bitfield struct corresponding to the provided ptype. + */ +static inline struct libie_rx_ptype_parsed libie_parse_rx_ptype(u32 ptype) +{ + if (unlikely(ptype >= LIBIE_RX_PTYPE_NUM)) + ptype = 0; + + return libie_rx_ptype_lut[ptype]; +} + +/* libie_has_*() can be used to quickly check whether the HW metadata is + * available to avoid further expensive processing such as descriptor reads. + * They already check for the corresponding netdev feature to be enabled, + * thus can be used as drop-in replacements. + */ + +static inline bool libie_has_rx_checksum(const struct net_device *dev, + struct libie_rx_ptype_parsed parsed) +{ + /* _INNER_{SCTP,TCP,UDP} are possible only when _OUTER_IPV* is set, + * it is enough to check only for the L4 type. 
+ */ + switch (parsed.inner_prot) { + case LIBIE_RX_PTYPE_INNER_TCP: + case LIBIE_RX_PTYPE_INNER_UDP: + case LIBIE_RX_PTYPE_INNER_SCTP: + return dev->features & NETIF_F_RXCSUM; + default: + return false; + } +} + +static inline bool libie_has_rx_hash(const struct net_device *dev, + struct libie_rx_ptype_parsed parsed) +{ + if (parsed.payload_layer < LIBIE_RX_PTYPE_PAYLOAD_L2) + return false; + + return dev->features & NETIF_F_RXHASH; +} + +/** + * libie_skb_set_hash - fill in skb hash value basing on the parsed ptype + * @skb: skb to fill the hash in + * @hash: 32-bit hash value from the descriptor + * @parsed: parsed packet type + */ +static inline void libie_skb_set_hash(struct sk_buff *skb, u32 hash, + struct libie_rx_ptype_parsed parsed) +{ + skb_set_hash(skb, hash, parsed.payload_layer); +} + +#endif /* __LIBIE_RX_H */ From 0e490c5d12f78f30563e8e736143ebb16acaea10 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Wed, 1 Feb 2023 17:45:17 +0100 Subject: [PATCH 02/40] iavf: kill "legacy-rx" for good Ever since build_skb() became stable, the old way with allocating an skb for storing the headers separately, which will be then copied manually, was slower, less flexible and thus obsolete. * it had higher pressure on MM since it actually allocates new pages, which then get split and refcount-biased (NAPI page cache); * it implies memcpy() of packet headers (40+ bytes per each frame); * the actual header length was calculated via eth_get_headlen(), which invokes Flow Dissector and thus wastes a bunch of CPU cycles; * XDP makes it even more weird since it requires headroom for long and also tailroom for some time (since mbuf landed). Take a look at the ice driver, which is built around work-arounds to make XDP work with it. Even on some quite low-end hardware (not a common case for 100G NICs) it was performing worse. The only advantage "legacy-rx" had is that it didn't require any reserved headroom and tailroom. But iavf didn't use this, as it always splits pages into two halves of 2k, while that save would only be useful when striding. And again, XDP effectively removes that sole pro. There's a train of features to land in IAVF soon: Page Pool, XDP, XSk, multi-buffer etc. Each new would require adding more and more Danse Macabre for absolutely no reason, besides making hotpath less and less effective. Remove the "feature" with all the related code. This includes at least one very hot branch (typically hit on each new frame), which was either always-true or always-false at least for a complete NAPI bulk of 64 frames, the whole private flags cruft and so on. 
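To make the hot-path impact concrete, the per-frame skb path in iavf_clean_rx_irq() collapses from a three-way branch into a single one (condensed from the hunk below, not a verbatim copy):

	/* before: the ring flag was re-evaluated for every frame */
	if (skb)
		iavf_add_rx_frag(rx_ring, rx_buffer, skb, size);
	else if (ring_uses_build_skb(rx_ring))
		skb = iavf_build_skb(rx_ring, rx_buffer, size);
	else
		skb = iavf_construct_skb(rx_ring, rx_buffer, size);

	/* after: build_skb() is the only way left to create an skb */
	if (skb)
		iavf_add_rx_frag(rx_ring, rx_buffer, skb, size);
	else
		skb = iavf_build_skb(rx_ring, rx_buffer, size);

With only one path left, ring_uses_build_skb() and the set/clear helpers have no callers and go away as well.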
Some stats: Function: add/remove: 0/2 grow/shrink: 0/7 up/down: 0/-774 (-774) RO Data: add/remove: 0/1 grow/shrink: 0/0 up/down: 0/-40 (-40) Signed-off-by: Alexander Lobakin --- drivers/net/ethernet/intel/iavf/iavf.h | 2 +- .../net/ethernet/intel/iavf/iavf_ethtool.c | 140 ------------------ drivers/net/ethernet/intel/iavf/iavf_main.c | 10 +- drivers/net/ethernet/intel/iavf/iavf_txrx.c | 84 +---------- drivers/net/ethernet/intel/iavf/iavf_txrx.h | 18 +-- .../net/ethernet/intel/iavf/iavf_virtchnl.c | 3 +- 6 files changed, 8 insertions(+), 249 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf.h b/drivers/net/ethernet/intel/iavf/iavf.h index 2cdce251472c08..7dbec98d2a983f 100644 --- a/drivers/net/ethernet/intel/iavf/iavf.h +++ b/drivers/net/ethernet/intel/iavf/iavf.h @@ -294,7 +294,7 @@ struct iavf_adapter { #define IAVF_FLAG_CLIENT_NEEDS_L2_PARAMS BIT(12) #define IAVF_FLAG_PROMISC_ON BIT(13) #define IAVF_FLAG_ALLMULTI_ON BIT(14) -#define IAVF_FLAG_LEGACY_RX BIT(15) +/* BIT(15) is free, was IAVF_FLAG_LEGACY_RX */ #define IAVF_FLAG_REINIT_ITR_NEEDED BIT(16) #define IAVF_FLAG_QUEUES_DISABLED BIT(17) #define IAVF_FLAG_SETUP_NETDEV_FEATURES BIT(18) diff --git a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c index 6f171d1d85b75f..de3050c02b6ffc 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c +++ b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c @@ -239,29 +239,6 @@ static const struct iavf_stats iavf_gstrings_stats[] = { #define IAVF_QUEUE_STATS_LEN ARRAY_SIZE(iavf_gstrings_queue_stats) -/* For now we have one and only one private flag and it is only defined - * when we have support for the SKIP_CPU_SYNC DMA attribute. Instead - * of leaving all this code sitting around empty we will strip it unless - * our one private flag is actually available. 
- */ -struct iavf_priv_flags { - char flag_string[ETH_GSTRING_LEN]; - u32 flag; - bool read_only; -}; - -#define IAVF_PRIV_FLAG(_name, _flag, _read_only) { \ - .flag_string = _name, \ - .flag = _flag, \ - .read_only = _read_only, \ -} - -static const struct iavf_priv_flags iavf_gstrings_priv_flags[] = { - IAVF_PRIV_FLAG("legacy-rx", IAVF_FLAG_LEGACY_RX, 0), -}; - -#define IAVF_PRIV_FLAGS_STR_LEN ARRAY_SIZE(iavf_gstrings_priv_flags) - /** * iavf_get_link_ksettings - Get Link Speed and Duplex settings * @netdev: network interface device structure @@ -341,8 +318,6 @@ static int iavf_get_sset_count(struct net_device *netdev, int sset) return IAVF_STATS_LEN + (IAVF_QUEUE_STATS_LEN * 2 * netdev->real_num_tx_queues); - else if (sset == ETH_SS_PRIV_FLAGS) - return IAVF_PRIV_FLAGS_STR_LEN; else return -EINVAL; } @@ -384,24 +359,6 @@ static void iavf_get_ethtool_stats(struct net_device *netdev, rcu_read_unlock(); } -/** - * iavf_get_priv_flag_strings - Get private flag strings - * @netdev: network interface device structure - * @data: buffer for string data - * - * Builds the private flags string table - **/ -static void iavf_get_priv_flag_strings(struct net_device *netdev, u8 *data) -{ - unsigned int i; - - for (i = 0; i < IAVF_PRIV_FLAGS_STR_LEN; i++) { - snprintf(data, ETH_GSTRING_LEN, "%s", - iavf_gstrings_priv_flags[i].flag_string); - data += ETH_GSTRING_LEN; - } -} - /** * iavf_get_stat_strings - Get stat strings * @netdev: network interface device structure @@ -440,105 +397,11 @@ static void iavf_get_strings(struct net_device *netdev, u32 sset, u8 *data) case ETH_SS_STATS: iavf_get_stat_strings(netdev, data); break; - case ETH_SS_PRIV_FLAGS: - iavf_get_priv_flag_strings(netdev, data); - break; default: break; } } -/** - * iavf_get_priv_flags - report device private flags - * @netdev: network interface device structure - * - * The get string set count and the string set should be matched for each - * flag returned. Add new strings for each flag to the iavf_gstrings_priv_flags - * array. - * - * Returns a u32 bitmap of flags. - **/ -static u32 iavf_get_priv_flags(struct net_device *netdev) -{ - struct iavf_adapter *adapter = netdev_priv(netdev); - u32 i, ret_flags = 0; - - for (i = 0; i < IAVF_PRIV_FLAGS_STR_LEN; i++) { - const struct iavf_priv_flags *priv_flags; - - priv_flags = &iavf_gstrings_priv_flags[i]; - - if (priv_flags->flag & adapter->flags) - ret_flags |= BIT(i); - } - - return ret_flags; -} - -/** - * iavf_set_priv_flags - set private flags - * @netdev: network interface device structure - * @flags: bit flags to be set - **/ -static int iavf_set_priv_flags(struct net_device *netdev, u32 flags) -{ - struct iavf_adapter *adapter = netdev_priv(netdev); - u32 orig_flags, new_flags, changed_flags; - u32 i; - - orig_flags = READ_ONCE(adapter->flags); - new_flags = orig_flags; - - for (i = 0; i < IAVF_PRIV_FLAGS_STR_LEN; i++) { - const struct iavf_priv_flags *priv_flags; - - priv_flags = &iavf_gstrings_priv_flags[i]; - - if (flags & BIT(i)) - new_flags |= priv_flags->flag; - else - new_flags &= ~(priv_flags->flag); - - if (priv_flags->read_only && - ((orig_flags ^ new_flags) & ~BIT(i))) - return -EOPNOTSUPP; - } - - /* Before we finalize any flag changes, any checks which we need to - * perform to determine if the new flags will be supported should go - * here... - */ - - /* Compare and exchange the new flags into place. If we failed, that - * is if cmpxchg returns anything but the old value, this means - * something else must have modified the flags variable since we - * copied it. 
We'll just punt with an error and log something in the - * message buffer. - */ - if (cmpxchg(&adapter->flags, orig_flags, new_flags) != orig_flags) { - dev_warn(&adapter->pdev->dev, - "Unable to update adapter->flags as it was modified by another thread...\n"); - return -EAGAIN; - } - - changed_flags = orig_flags ^ new_flags; - - /* Process any additional changes needed as a result of flag changes. - * The changed_flags value reflects the list of bits that were changed - * in the code above. - */ - - /* issue a reset to force legacy-rx change to take effect */ - if (changed_flags & IAVF_FLAG_LEGACY_RX) { - if (netif_running(netdev)) { - adapter->flags |= IAVF_FLAG_RESET_NEEDED; - queue_work(adapter->wq, &adapter->reset_task); - } - } - - return 0; -} - /** * iavf_get_msglevel - Get debug message level * @netdev: network interface device structure @@ -584,7 +447,6 @@ static void iavf_get_drvinfo(struct net_device *netdev, strscpy(drvinfo->driver, iavf_driver_name, 32); strscpy(drvinfo->fw_version, "N/A", 4); strscpy(drvinfo->bus_info, pci_name(adapter->pdev), 32); - drvinfo->n_priv_flags = IAVF_PRIV_FLAGS_STR_LEN; } /** @@ -1969,8 +1831,6 @@ static const struct ethtool_ops iavf_ethtool_ops = { .get_strings = iavf_get_strings, .get_ethtool_stats = iavf_get_ethtool_stats, .get_sset_count = iavf_get_sset_count, - .get_priv_flags = iavf_get_priv_flags, - .set_priv_flags = iavf_set_priv_flags, .get_msglevel = iavf_get_msglevel, .set_msglevel = iavf_set_msglevel, .get_coalesce = iavf_get_coalesce, diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 9f2e67a6cde3db..8f387fa10b8d85 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -713,9 +713,7 @@ static void iavf_configure_rx(struct iavf_adapter *adapter) struct iavf_hw *hw = &adapter->hw; int i; - /* Legacy Rx will always default to a 2048 buffer size. */ -#if (PAGE_SIZE < 8192) - if (!(adapter->flags & IAVF_FLAG_LEGACY_RX)) { + if (PAGE_SIZE < 8192) { struct net_device *netdev = adapter->netdev; /* For jumbo frames on systems with 4K pages we have to use @@ -732,16 +730,10 @@ static void iavf_configure_rx(struct iavf_adapter *adapter) (netdev->mtu <= ETH_DATA_LEN)) rx_buf_len = IAVF_RXBUFFER_1536 - NET_IP_ALIGN; } -#endif for (i = 0; i < adapter->num_active_queues; i++) { adapter->rx_rings[i].tail = hw->hw_addr + IAVF_QRX_TAIL1(i); adapter->rx_rings[i].rx_buf_len = rx_buf_len; - - if (adapter->flags & IAVF_FLAG_LEGACY_RX) - clear_ring_build_skb_enabled(&adapter->rx_rings[i]); - else - set_ring_build_skb_enabled(&adapter->rx_rings[i]); } } diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c index a83b96e9b6fcf4..a7121dc5c32b3a 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c @@ -824,17 +824,6 @@ static inline void iavf_release_rx_desc(struct iavf_ring *rx_ring, u32 val) writel(val, rx_ring->tail); } -/** - * iavf_rx_offset - Return expected offset into page to access data - * @rx_ring: Ring we are requesting offset of - * - * Returns the offset value for ring into the data buffer. - */ -static inline unsigned int iavf_rx_offset(struct iavf_ring *rx_ring) -{ - return ring_uses_build_skb(rx_ring) ? 
IAVF_SKB_PAD : 0; -} - /** * iavf_alloc_mapped_page - recycle or make a new page * @rx_ring: ring to use @@ -879,7 +868,7 @@ static bool iavf_alloc_mapped_page(struct iavf_ring *rx_ring, bi->dma = dma; bi->page = page; - bi->page_offset = iavf_rx_offset(rx_ring); + bi->page_offset = IAVF_SKB_PAD; /* initialize pagecnt_bias to 1 representing we fully own page */ bi->pagecnt_bias = 1; @@ -1220,7 +1209,7 @@ static void iavf_add_rx_frag(struct iavf_ring *rx_ring, #if (PAGE_SIZE < 8192) unsigned int truesize = iavf_rx_pg_size(rx_ring) / 2; #else - unsigned int truesize = SKB_DATA_ALIGN(size + iavf_rx_offset(rx_ring)); + unsigned int truesize = SKB_DATA_ALIGN(size + IAVF_SKB_PAD); #endif if (!size) @@ -1268,71 +1257,6 @@ static struct iavf_rx_buffer *iavf_get_rx_buffer(struct iavf_ring *rx_ring, return rx_buffer; } -/** - * iavf_construct_skb - Allocate skb and populate it - * @rx_ring: rx descriptor ring to transact packets on - * @rx_buffer: rx buffer to pull data from - * @size: size of buffer to add to skb - * - * This function allocates an skb. It then populates it with the page - * data from the current receive descriptor, taking care to set up the - * skb correctly. - */ -static struct sk_buff *iavf_construct_skb(struct iavf_ring *rx_ring, - struct iavf_rx_buffer *rx_buffer, - unsigned int size) -{ - void *va; -#if (PAGE_SIZE < 8192) - unsigned int truesize = iavf_rx_pg_size(rx_ring) / 2; -#else - unsigned int truesize = SKB_DATA_ALIGN(size); -#endif - unsigned int headlen; - struct sk_buff *skb; - - if (!rx_buffer) - return NULL; - /* prefetch first cache line of first page */ - va = page_address(rx_buffer->page) + rx_buffer->page_offset; - net_prefetch(va); - - /* allocate a skb to store the frags */ - skb = __napi_alloc_skb(&rx_ring->q_vector->napi, - IAVF_RX_HDR_SIZE, - GFP_ATOMIC | __GFP_NOWARN); - if (unlikely(!skb)) - return NULL; - - /* Determine available headroom for copy */ - headlen = size; - if (headlen > IAVF_RX_HDR_SIZE) - headlen = eth_get_headlen(skb->dev, va, IAVF_RX_HDR_SIZE); - - /* align pull length to size of long to optimize memcpy performance */ - memcpy(__skb_put(skb, headlen), va, ALIGN(headlen, sizeof(long))); - - /* update all of the pointers */ - size -= headlen; - if (size) { - skb_add_rx_frag(skb, 0, rx_buffer->page, - rx_buffer->page_offset + headlen, - size, truesize); - - /* buffer is used by skb, update page_offset */ -#if (PAGE_SIZE < 8192) - rx_buffer->page_offset ^= truesize; -#else - rx_buffer->page_offset += truesize; -#endif - } else { - /* buffer is unused, reset bias back to rx_buffer */ - rx_buffer->pagecnt_bias++; - } - - return skb; -} - /** * iavf_build_skb - Build skb around an existing buffer * @rx_ring: Rx descriptor ring to transact packets on @@ -1505,10 +1429,8 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) /* retrieve a buffer from the ring */ if (skb) iavf_add_rx_frag(rx_ring, rx_buffer, skb, size); - else if (ring_uses_build_skb(rx_ring)) - skb = iavf_build_skb(rx_ring, rx_buffer, size); else - skb = iavf_construct_skb(rx_ring, rx_buffer, size); + skb = iavf_build_skb(rx_ring, rx_buffer, size); /* exit if we failed to retrieve a buffer */ if (!skb) { diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.h b/drivers/net/ethernet/intel/iavf/iavf_txrx.h index 2624bf6d009e36..234e189c198755 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.h +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.h @@ -362,7 +362,8 @@ struct iavf_ring { u16 flags; #define IAVF_TXR_FLAGS_WB_ON_ITR BIT(0) -#define 
IAVF_RXR_FLAGS_BUILD_SKB_ENABLED BIT(1) +/* BIT(1) is free, was IAVF_RXR_FLAGS_BUILD_SKB_ENABLED */ +/* BIT(2) is free */ #define IAVF_TXRX_FLAGS_VLAN_TAG_LOC_L2TAG1 BIT(3) #define IAVF_TXR_FLAGS_VLAN_TAG_LOC_L2TAG2 BIT(4) #define IAVF_RXR_FLAGS_VLAN_TAG_LOC_L2TAG2_2 BIT(5) @@ -393,21 +394,6 @@ struct iavf_ring { */ } ____cacheline_internodealigned_in_smp; -static inline bool ring_uses_build_skb(struct iavf_ring *ring) -{ - return !!(ring->flags & IAVF_RXR_FLAGS_BUILD_SKB_ENABLED); -} - -static inline void set_ring_build_skb_enabled(struct iavf_ring *ring) -{ - ring->flags |= IAVF_RXR_FLAGS_BUILD_SKB_ENABLED; -} - -static inline void clear_ring_build_skb_enabled(struct iavf_ring *ring) -{ - ring->flags &= ~IAVF_RXR_FLAGS_BUILD_SKB_ENABLED; -} - #define IAVF_ITR_ADAPTIVE_MIN_INC 0x0002 #define IAVF_ITR_ADAPTIVE_MIN_USECS 0x0002 #define IAVF_ITR_ADAPTIVE_MAX_USECS 0x007e diff --git a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c index 4e17d006c52d46..c2e328ec5af8f0 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c +++ b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c @@ -290,8 +290,7 @@ void iavf_configure_queues(struct iavf_adapter *adapter) return; /* Limit maximum frame size when jumbo frames is not enabled */ - if (!(adapter->flags & IAVF_FLAG_LEGACY_RX) && - (adapter->netdev->mtu <= ETH_DATA_LEN)) + if (adapter->netdev->mtu <= ETH_DATA_LEN) max_frame = IAVF_RXBUFFER_1536 - NET_IP_ALIGN; vqci->vsi_id = adapter->vsi_res->vsi_id; From 88798b949be6225358afb87a8b45cb0dfd1ddda9 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Thu, 2 Feb 2023 18:17:12 +0100 Subject: [PATCH 03/40] iavf: optimize Rx buffer allocation a bunch The Rx hotpath code of IAVF is not well-optimized TBH. Before doing any further buffer model changes, shake it up a bit. Notably: 1. Cache more variables on the stack. DMA device, Rx page size, NTC -- these are the most common things used all throughout the hotpath, often in loops on each iteration. Instead of fetching (or even calculating, as with the page size) them from the ring all the time, cache them on the stack at the beginning of the NAPI polling callback. NTC will be written back at the end, the rest are used read-only, so no sync needed. 2. Don't move the recycled buffers around the ring. The idea of passing the page of the right-now-recycled-buffer to a different buffer, in this case, the first one that needs to be allocated, moreover, on each new frame, is fundamentally wrong. It involves a few o' fetches, branches and then writes (and one Rx buffer struct is at least 32 bytes) where they're completely unneeded, but gives no good -- the result is the same as if we'd recycle it inplace, at the same position where it was used. So drop this and let the main refilling function take care of all the buffers, which were processed and now need to be recycled/refilled. 3. Don't allocate with %GPF_ATOMIC on ifup. This involved introducing the @gfp parameter to a couple functions. Doesn't change anything for Rx -> softirq. 4. 1 budget unit == 1 descriptor, not skb. There could be underflow when receiving a lot of fragmented frames. If each of them would consist of 2 frags, it means that we'd process 64 descriptors at the point where we pass the 32th skb to the stack. But the driver would count that only as a half, which could make NAPI re-enable interrupts prematurely and create unnecessary CPU load. 5. Shortcut !size case. 
It's super rare, but possible -- for example, if the last buffer of the fragmented frame contained only FCS, which was then stripped by the HW. Instead of checking for size several times when processing, quickly reuse the buffer and jump to the skb fields part. 6. Refill the ring after finishing the polling loop. Previously, the loop wasn't starting a new iteration after the 64th desc, meaning that we were always leaving 16 buffers non-refilled until the next NAPI poll. It's better to refill them while they're still hot, so do that right after exiting the loop as well. For a full cycle of 64 descs, there will be 4 refills of 16 descs from now on. Function: add/remove: 4/2 grow/shrink: 0/5 up/down: 473/-647 (-174) + up to 2% performance. Signed-off-by: Alexander Lobakin --- drivers/net/ethernet/intel/iavf/iavf_main.c | 2 +- drivers/net/ethernet/intel/iavf/iavf_txrx.c | 259 +++++++++----------- drivers/net/ethernet/intel/iavf/iavf_txrx.h | 3 +- 3 files changed, 114 insertions(+), 150 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 8f387fa10b8d85..a497acd96385de 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -1237,7 +1237,7 @@ static void iavf_configure(struct iavf_adapter *adapter) for (i = 0; i < adapter->num_active_queues; i++) { struct iavf_ring *ring = &adapter->rx_rings[i]; - iavf_alloc_rx_buffers(ring, IAVF_DESC_UNUSED(ring)); + iavf_alloc_rx_buffers(ring); } } diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c index a7121dc5c32b3a..fd08ce67380ee2 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c @@ -736,7 +736,6 @@ void iavf_clean_rx_ring(struct iavf_ring *rx_ring) /* Zero out the descriptor ring */ memset(rx_ring->desc, 0, rx_ring->size); - rx_ring->next_to_alloc = 0; rx_ring->next_to_clean = 0; rx_ring->next_to_use = 0; } @@ -792,7 +791,6 @@ int iavf_setup_rx_descriptors(struct iavf_ring *rx_ring) goto err; } - rx_ring->next_to_alloc = 0; rx_ring->next_to_clean = 0; rx_ring->next_to_use = 0; @@ -812,9 +810,6 @@ static inline void iavf_release_rx_desc(struct iavf_ring *rx_ring, u32 val) { rx_ring->next_to_use = val; - /* update next to alloc since we have filled the ring */ - rx_ring->next_to_alloc = val; - /* Force memory writes to complete before letting h/w * know there are new descriptors to fetch. (Only * applicable for weak-ordered memory model archs, @@ -828,12 +823,17 @@ static inline void iavf_release_rx_desc(struct iavf_ring *rx_ring, u32 val) * iavf_alloc_mapped_page - recycle or make a new page * @rx_ring: ring to use * @bi: rx_buffer struct to modify + * @dev: device used for DMA mapping + * @order: page order to allocate + * @gfp: GFP mask to allocate page * * Returns true if the page was successfully allocated or * reused. 
**/ static bool iavf_alloc_mapped_page(struct iavf_ring *rx_ring, - struct iavf_rx_buffer *bi) + struct iavf_rx_buffer *bi, + struct device *dev, u32 order, + gfp_t gfp) { struct page *page = bi->page; dma_addr_t dma; @@ -845,23 +845,21 @@ static bool iavf_alloc_mapped_page(struct iavf_ring *rx_ring, } /* alloc new page for storage */ - page = dev_alloc_pages(iavf_rx_pg_order(rx_ring)); + page = __dev_alloc_pages(gfp, order); if (unlikely(!page)) { rx_ring->rx_stats.alloc_page_failed++; return false; } /* map page for use */ - dma = dma_map_page_attrs(rx_ring->dev, page, 0, - iavf_rx_pg_size(rx_ring), - DMA_FROM_DEVICE, - IAVF_RX_DMA_ATTR); + dma = dma_map_page_attrs(dev, page, 0, PAGE_SIZE << order, + DMA_FROM_DEVICE, IAVF_RX_DMA_ATTR); /* if mapping failed free memory back to system since * there isn't much point in holding memory we can't use */ - if (dma_mapping_error(rx_ring->dev, dma)) { - __free_pages(page, iavf_rx_pg_order(rx_ring)); + if (dma_mapping_error(dev, dma)) { + __free_pages(page, order); rx_ring->rx_stats.alloc_page_failed++; return false; } @@ -898,32 +896,36 @@ static void iavf_receive_skb(struct iavf_ring *rx_ring, } /** - * iavf_alloc_rx_buffers - Replace used receive buffers + * __iavf_alloc_rx_buffers - Replace used receive buffers * @rx_ring: ring to place buffers on - * @cleaned_count: number of buffers to replace + * @to_refill: number of buffers to replace + * @gfp: GFP mask to allocate pages * - * Returns false if all allocations were successful, true if any fail + * Returns 0 if all allocations were successful or the number of buffers left + * to refill in case of an allocation failure. **/ -bool iavf_alloc_rx_buffers(struct iavf_ring *rx_ring, u16 cleaned_count) +static u32 __iavf_alloc_rx_buffers(struct iavf_ring *rx_ring, u32 to_refill, + gfp_t gfp) { - u16 ntu = rx_ring->next_to_use; + u32 order = iavf_rx_pg_order(rx_ring); + struct device *dev = rx_ring->dev; + u32 ntu = rx_ring->next_to_use; union iavf_rx_desc *rx_desc; struct iavf_rx_buffer *bi; /* do nothing if no valid netdev defined */ - if (!rx_ring->netdev || !cleaned_count) - return false; + if (unlikely(!rx_ring->netdev || !to_refill)) + return 0; rx_desc = IAVF_RX_DESC(rx_ring, ntu); bi = &rx_ring->rx_bi[ntu]; do { - if (!iavf_alloc_mapped_page(rx_ring, bi)) - goto no_buffers; + if (!iavf_alloc_mapped_page(rx_ring, bi, dev, order, gfp)) + break; /* sync the buffer for use by the device */ - dma_sync_single_range_for_device(rx_ring->dev, bi->dma, - bi->page_offset, + dma_sync_single_range_for_device(dev, bi->dma, bi->page_offset, rx_ring->rx_buf_len, DMA_FROM_DEVICE); @@ -943,23 +945,17 @@ bool iavf_alloc_rx_buffers(struct iavf_ring *rx_ring, u16 cleaned_count) /* clear the status bits for the next_to_use descriptor */ rx_desc->wb.qword1.status_error_len = 0; - - cleaned_count--; - } while (cleaned_count); + } while (--to_refill); if (rx_ring->next_to_use != ntu) iavf_release_rx_desc(rx_ring, ntu); - return false; - -no_buffers: - if (rx_ring->next_to_use != ntu) - iavf_release_rx_desc(rx_ring, ntu); + return to_refill; +} - /* make sure to come back via polling to try again after - * allocation failure - */ - return true; +void iavf_alloc_rx_buffers(struct iavf_ring *rxr) +{ + __iavf_alloc_rx_buffers(rxr, IAVF_DESC_UNUSED(rxr), GFP_KERNEL); } /** @@ -1104,32 +1100,6 @@ static bool iavf_cleanup_headers(struct iavf_ring *rx_ring, struct sk_buff *skb) return false; } -/** - * iavf_reuse_rx_page - page flip buffer and store it back on the ring - * @rx_ring: rx descriptor ring to store buffers on - 
* @old_buff: donor buffer to have page reused - * - * Synchronizes page for reuse by the adapter - **/ -static void iavf_reuse_rx_page(struct iavf_ring *rx_ring, - struct iavf_rx_buffer *old_buff) -{ - struct iavf_rx_buffer *new_buff; - u16 nta = rx_ring->next_to_alloc; - - new_buff = &rx_ring->rx_bi[nta]; - - /* update, and store next to alloc */ - nta++; - rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0; - - /* transfer page from old buffer to new buffer */ - new_buff->dma = old_buff->dma; - new_buff->page = old_buff->page; - new_buff->page_offset = old_buff->page_offset; - new_buff->pagecnt_bias = old_buff->pagecnt_bias; -} - /** * iavf_can_reuse_rx_page - Determine if this page can be reused by * the adapter for another receive @@ -1191,30 +1161,26 @@ static bool iavf_can_reuse_rx_page(struct iavf_rx_buffer *rx_buffer) /** * iavf_add_rx_frag - Add contents of Rx buffer to sk_buff - * @rx_ring: rx descriptor ring to transact packets on - * @rx_buffer: buffer containing page to add * @skb: sk_buff to place the data into + * @rx_buffer: buffer containing page to add * @size: packet length from rx_desc + * @pg_size: Rx buffer page size * * This function will add the data contained in rx_buffer->page to the skb. * It will just attach the page as a frag to the skb. * * The function will then update the page offset. **/ -static void iavf_add_rx_frag(struct iavf_ring *rx_ring, +static void iavf_add_rx_frag(struct sk_buff *skb, struct iavf_rx_buffer *rx_buffer, - struct sk_buff *skb, - unsigned int size) + u32 size, u32 pg_size) { #if (PAGE_SIZE < 8192) - unsigned int truesize = iavf_rx_pg_size(rx_ring) / 2; + unsigned int truesize = pg_size / 2; #else unsigned int truesize = SKB_DATA_ALIGN(size + IAVF_SKB_PAD); #endif - if (!size) - return; - skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, rx_buffer->page, rx_buffer->page_offset, size, truesize); @@ -1224,63 +1190,47 @@ static void iavf_add_rx_frag(struct iavf_ring *rx_ring, #else rx_buffer->page_offset += truesize; #endif + + /* We have pulled a buffer for use, so decrement pagecnt_bias */ + rx_buffer->pagecnt_bias--; } /** - * iavf_get_rx_buffer - Fetch Rx buffer and synchronize data for use - * @rx_ring: rx descriptor ring to transact packets on - * @size: size of buffer to add to skb + * iavf_sync_rx_buffer - Synchronize received data for use + * @dev: device used for DMA mapping + * @buf: Rx buffer containing the data + * @size: size of the received data * - * This function will pull an Rx buffer from the ring and synchronize it - * for use by the CPU. + * This function will synchronize the Rx buffer for use by the CPU. 
*/ -static struct iavf_rx_buffer *iavf_get_rx_buffer(struct iavf_ring *rx_ring, - const unsigned int size) +static void iavf_sync_rx_buffer(struct device *dev, struct iavf_rx_buffer *buf, + u32 size) { - struct iavf_rx_buffer *rx_buffer; - - rx_buffer = &rx_ring->rx_bi[rx_ring->next_to_clean]; - prefetchw(rx_buffer->page); - if (!size) - return rx_buffer; - - /* we are reusing so sync this buffer for CPU use */ - dma_sync_single_range_for_cpu(rx_ring->dev, - rx_buffer->dma, - rx_buffer->page_offset, - size, + dma_sync_single_range_for_cpu(dev, buf->dma, buf->page_offset, size, DMA_FROM_DEVICE); - - /* We have pulled a buffer for use, so decrement pagecnt_bias */ - rx_buffer->pagecnt_bias--; - - return rx_buffer; } /** * iavf_build_skb - Build skb around an existing buffer - * @rx_ring: Rx descriptor ring to transact packets on - * @rx_buffer: Rx buffer to pull data from - * @size: size of buffer to add to skb + * @rx_buffer: Rx buffer with the data + * @size: size of the data + * @pg_size: size of the Rx page * * This function builds an skb around an existing Rx buffer, taking care * to set up the skb correctly and avoid any memcpy overhead. */ -static struct sk_buff *iavf_build_skb(struct iavf_ring *rx_ring, - struct iavf_rx_buffer *rx_buffer, - unsigned int size) +static struct sk_buff *iavf_build_skb(struct iavf_rx_buffer *rx_buffer, + u32 size, u32 pg_size) { void *va; #if (PAGE_SIZE < 8192) - unsigned int truesize = iavf_rx_pg_size(rx_ring) / 2; + unsigned int truesize = pg_size / 2; #else unsigned int truesize = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) + SKB_DATA_ALIGN(IAVF_SKB_PAD + size); #endif struct sk_buff *skb; - if (!rx_buffer || !size) - return NULL; /* prefetch first cache line of first page */ va = page_address(rx_buffer->page) + rx_buffer->page_offset; net_prefetch(va); @@ -1301,36 +1251,33 @@ static struct sk_buff *iavf_build_skb(struct iavf_ring *rx_ring, rx_buffer->page_offset += truesize; #endif + rx_buffer->pagecnt_bias--; + return skb; } /** - * iavf_put_rx_buffer - Clean up used buffer and either recycle or free + * iavf_put_rx_buffer - Recycle or free used buffer * @rx_ring: rx descriptor ring to transact packets on - * @rx_buffer: rx buffer to pull data from + * @dev: device used for DMA mapping + * @rx_buffer: Rx buffer to handle + * @pg_size: Rx page size * - * This function will clean up the contents of the rx_buffer. It will - * either recycle the buffer or unmap it and free the associated resources. + * Either recycle the buffer if possible or unmap and free the page. 
*/ -static void iavf_put_rx_buffer(struct iavf_ring *rx_ring, - struct iavf_rx_buffer *rx_buffer) +static void iavf_put_rx_buffer(struct iavf_ring *rx_ring, struct device *dev, + struct iavf_rx_buffer *rx_buffer, u32 pg_size) { - if (!rx_buffer) - return; - if (iavf_can_reuse_rx_page(rx_buffer)) { - /* hand second half of page back to the ring */ - iavf_reuse_rx_page(rx_ring, rx_buffer); rx_ring->rx_stats.page_reuse_count++; - } else { - /* we are not reusing the buffer so unmap it */ - dma_unmap_page_attrs(rx_ring->dev, rx_buffer->dma, - iavf_rx_pg_size(rx_ring), - DMA_FROM_DEVICE, IAVF_RX_DMA_ATTR); - __page_frag_cache_drain(rx_buffer->page, - rx_buffer->pagecnt_bias); + return; } + /* we are not reusing the buffer so unmap it */ + dma_unmap_page_attrs(dev, rx_buffer->dma, pg_size, + DMA_FROM_DEVICE, IAVF_RX_DMA_ATTR); + __page_frag_cache_drain(rx_buffer->page, rx_buffer->pagecnt_bias); + /* clear contents of buffer_info */ rx_buffer->page = NULL; } @@ -1350,14 +1297,6 @@ static bool iavf_is_non_eop(struct iavf_ring *rx_ring, union iavf_rx_desc *rx_desc, struct sk_buff *skb) { - u32 ntc = rx_ring->next_to_clean + 1; - - /* fetch, update, and store next to clean */ - ntc = (ntc < rx_ring->count) ? ntc : 0; - rx_ring->next_to_clean = ntc; - - prefetch(IAVF_RX_DESC(rx_ring, ntc)); - /* if we are the last buffer then there is nothing else to do */ #define IAVF_RXD_EOF BIT(IAVF_RX_DESC_STATUS_EOF_SHIFT) if (likely(iavf_test_staterr(rx_desc, IAVF_RXD_EOF))) @@ -1383,11 +1322,16 @@ static bool iavf_is_non_eop(struct iavf_ring *rx_ring, static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) { unsigned int total_rx_bytes = 0, total_rx_packets = 0; + const gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; + u32 to_refill = IAVF_DESC_UNUSED(rx_ring); + u32 pg_size = iavf_rx_pg_size(rx_ring); struct sk_buff *skb = rx_ring->skb; - u16 cleaned_count = IAVF_DESC_UNUSED(rx_ring); - bool failure = false; + struct device *dev = rx_ring->dev; + u32 ntc = rx_ring->next_to_clean; + u32 ring_size = rx_ring->count; + u32 cleaned_count = 0; - while (likely(total_rx_packets < (unsigned int)budget)) { + while (likely(cleaned_count < budget)) { struct iavf_rx_buffer *rx_buffer; union iavf_rx_desc *rx_desc; unsigned int size; @@ -1396,13 +1340,11 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) u64 qword; /* return some buffers to hardware, one at a time is too slow */ - if (cleaned_count >= IAVF_RX_BUFFER_WRITE) { - failure = failure || - iavf_alloc_rx_buffers(rx_ring, cleaned_count); - cleaned_count = 0; - } + if (to_refill >= IAVF_RX_BUFFER_WRITE) + to_refill = __iavf_alloc_rx_buffers(rx_ring, to_refill, + gfp); - rx_desc = IAVF_RX_DESC(rx_ring, rx_ring->next_to_clean); + rx_desc = IAVF_RX_DESC(rx_ring, ntc); /* status_error_len will always be zero for unused descriptors * because it's cleared in cleanup, and overlaps with hdr_addr @@ -1424,24 +1366,38 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) IAVF_RXD_QW1_LENGTH_PBUF_SHIFT; iavf_trace(clean_rx_irq, rx_ring, rx_desc, skb); - rx_buffer = iavf_get_rx_buffer(rx_ring, size); + rx_buffer = &rx_ring->rx_bi[ntc]; + + /* Very rare, but possible case. The most common reason: + * the last fragment contained FCS only, which was then + * stripped by the HW. 
+ */ + if (unlikely(!size)) + goto skip_data; + + iavf_sync_rx_buffer(dev, rx_buffer, size); /* retrieve a buffer from the ring */ if (skb) - iavf_add_rx_frag(rx_ring, rx_buffer, skb, size); + iavf_add_rx_frag(skb, rx_buffer, size, pg_size); else - skb = iavf_build_skb(rx_ring, rx_buffer, size); + skb = iavf_build_skb(rx_buffer, size, pg_size); /* exit if we failed to retrieve a buffer */ if (!skb) { rx_ring->rx_stats.alloc_buff_failed++; - if (rx_buffer && size) - rx_buffer->pagecnt_bias++; break; } - iavf_put_rx_buffer(rx_ring, rx_buffer); +skip_data: + iavf_put_rx_buffer(rx_ring, dev, rx_buffer, pg_size); + cleaned_count++; + to_refill++; + if (unlikely(++ntc == ring_size)) + ntc = 0; + + prefetch(IAVF_RX_DESC(rx_ring, ntc)); if (iavf_is_non_eop(rx_ring, rx_desc, skb)) continue; @@ -1488,8 +1444,18 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) total_rx_packets++; } + rx_ring->next_to_clean = ntc; rx_ring->skb = skb; + if (to_refill >= IAVF_RX_BUFFER_WRITE) { + to_refill = __iavf_alloc_rx_buffers(rx_ring, to_refill, gfp); + /* guarantee a trip back through this routine if there was + * a failure + */ + if (unlikely(to_refill)) + cleaned_count = budget; + } + u64_stats_update_begin(&rx_ring->syncp); rx_ring->stats.packets += total_rx_packets; rx_ring->stats.bytes += total_rx_bytes; @@ -1497,8 +1463,7 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) rx_ring->q_vector->rx.total_packets += total_rx_packets; rx_ring->q_vector->rx.total_bytes += total_rx_bytes; - /* guarantee a trip back through this routine if there was a failure */ - return failure ? budget : (int)total_rx_packets; + return cleaned_count; } static inline u32 iavf_buildreg_itr(const int type, u16 itr) diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.h b/drivers/net/ethernet/intel/iavf/iavf_txrx.h index 234e189c198755..9c6661a6edf2f2 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.h +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.h @@ -383,7 +383,6 @@ struct iavf_ring { struct iavf_q_vector *q_vector; /* Backreference to associated vector */ struct rcu_head rcu; /* to avoid race on free */ - u16 next_to_alloc; struct sk_buff *skb; /* When iavf_clean_rx_ring_irq() must * return before it sees the EOP for * the current packet, we save that skb @@ -426,7 +425,7 @@ static inline unsigned int iavf_rx_pg_order(struct iavf_ring *ring) #define iavf_rx_pg_size(_ring) (PAGE_SIZE << iavf_rx_pg_order(_ring)) -bool iavf_alloc_rx_buffers(struct iavf_ring *rxr, u16 cleaned_count); +void iavf_alloc_rx_buffers(struct iavf_ring *rxr); netdev_tx_t iavf_xmit_frame(struct sk_buff *skb, struct net_device *netdev); void iavf_clean_tx_ring(struct iavf_ring *tx_ring); void iavf_clean_rx_ring(struct iavf_ring *rx_ring); From 46eb61c5cc1cdaadc5a0fa6eee73a5afef3cdfad Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Fri, 17 Feb 2023 18:03:26 +0100 Subject: [PATCH 04/40] iavf: remove page splitting/recycling As an intermediate step, remove all page splitting/recyclig code. Just always allocate a new page and don't touch its refcount, so that it gets freed by the core stack later. The change allows to greatly simplify certain parts of the code: Function: add/remove: 2/3 grow/shrink: 0/5 up/down: 543/-963 (-420) &iavf_rx_buf can even now retire in favor of just storing an array of pages used for Rx. Their DMA addresses can be stored in page::dma_addr -- use Page Pool's function for that. No surprise perf loses up to 30% here, but that regression will go away once PP lands. 
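For reference, the allocation/mapping step now boils down to stashing the DMA address in the page itself via the Page Pool accessors (a condensed sketch of the hunks below, combining the alloc and descriptor-write sites):

	page = __dev_alloc_pages(gfp, order);
	if (unlikely(!page))
		return NULL;

	dma = dma_map_page_attrs(dev, page, 0, PAGE_SIZE << order,
				 DMA_FROM_DEVICE, IAVF_RX_DMA_ATTR);
	if (dma_mapping_error(dev, dma)) {
		__free_pages(page, order);
		return NULL;
	}

	/* stash the DMA address in struct page itself */
	page_pool_set_dma_addr(page, dma);

	/* ...and read it back wherever the descriptor or a sync needs it */
	rx_desc->read.pkt_addr = cpu_to_le64(page_pool_get_dma_addr(page) +
					     IAVF_SKB_PAD);

With the address carried by the page, struct iavf_rx_buffer has nothing left to track and the ring only needs a plain array of page pointers.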
Signed-off-by: Alexander Lobakin --- drivers/net/ethernet/intel/iavf/iavf_main.c | 2 +- drivers/net/ethernet/intel/iavf/iavf_txrx.c | 279 ++++++-------------- drivers/net/ethernet/intel/iavf/iavf_txrx.h | 17 +- 3 files changed, 87 insertions(+), 211 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index a497acd96385de..f7c585d10834ce 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -1237,7 +1237,7 @@ static void iavf_configure(struct iavf_adapter *adapter) for (i = 0; i < adapter->num_active_queues; i++) { struct iavf_ring *ring = &adapter->rx_rings[i]; - iavf_alloc_rx_buffers(ring); + iavf_alloc_rx_pages(ring); } } diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c index fd08ce67380ee2..a761f3e3d7ccce 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c @@ -690,11 +690,10 @@ int iavf_setup_tx_descriptors(struct iavf_ring *tx_ring) **/ void iavf_clean_rx_ring(struct iavf_ring *rx_ring) { - unsigned long bi_size; u16 i; /* ring already cleared, nothing to do */ - if (!rx_ring->rx_bi) + if (!rx_ring->rx_pages) return; if (rx_ring->skb) { @@ -704,38 +703,30 @@ void iavf_clean_rx_ring(struct iavf_ring *rx_ring) /* Free all the Rx ring sk_buffs */ for (i = 0; i < rx_ring->count; i++) { - struct iavf_rx_buffer *rx_bi = &rx_ring->rx_bi[i]; + struct page *page = rx_ring->rx_pages[i]; + dma_addr_t dma; - if (!rx_bi->page) + if (!page) continue; + dma = page_pool_get_dma_addr(page); + /* Invalidate cache lines that may have been written to by * device so that we avoid corrupting memory. */ - dma_sync_single_range_for_cpu(rx_ring->dev, - rx_bi->dma, - rx_bi->page_offset, + dma_sync_single_range_for_cpu(rx_ring->dev, dma, IAVF_SKB_PAD, rx_ring->rx_buf_len, DMA_FROM_DEVICE); /* free resources associated with mapping */ - dma_unmap_page_attrs(rx_ring->dev, rx_bi->dma, + dma_unmap_page_attrs(rx_ring->dev, dma, iavf_rx_pg_size(rx_ring), DMA_FROM_DEVICE, IAVF_RX_DMA_ATTR); - __page_frag_cache_drain(rx_bi->page, rx_bi->pagecnt_bias); - - rx_bi->page = NULL; - rx_bi->page_offset = 0; + __free_pages(page, iavf_rx_pg_order(rx_ring)); } - bi_size = sizeof(struct iavf_rx_buffer) * rx_ring->count; - memset(rx_ring->rx_bi, 0, bi_size); - - /* Zero out the descriptor ring */ - memset(rx_ring->desc, 0, rx_ring->size); - rx_ring->next_to_clean = 0; rx_ring->next_to_use = 0; } @@ -749,8 +740,8 @@ void iavf_clean_rx_ring(struct iavf_ring *rx_ring) void iavf_free_rx_resources(struct iavf_ring *rx_ring) { iavf_clean_rx_ring(rx_ring); - kfree(rx_ring->rx_bi); - rx_ring->rx_bi = NULL; + kfree(rx_ring->rx_pages); + rx_ring->rx_pages = NULL; if (rx_ring->desc) { dma_free_coherent(rx_ring->dev, rx_ring->size, @@ -768,14 +759,13 @@ void iavf_free_rx_resources(struct iavf_ring *rx_ring) int iavf_setup_rx_descriptors(struct iavf_ring *rx_ring) { struct device *dev = rx_ring->dev; - int bi_size; /* warn if we are about to overwrite the pointer */ - WARN_ON(rx_ring->rx_bi); - bi_size = sizeof(struct iavf_rx_buffer) * rx_ring->count; - rx_ring->rx_bi = kzalloc(bi_size, GFP_KERNEL); - if (!rx_ring->rx_bi) - goto err; + WARN_ON(rx_ring->rx_pages); + rx_ring->rx_pages = kcalloc(rx_ring->count, sizeof(*rx_ring->rx_pages), + GFP_KERNEL); + if (!rx_ring->rx_pages) + return -ENOMEM; u64_stats_init(&rx_ring->syncp); @@ -796,8 +786,9 @@ int iavf_setup_rx_descriptors(struct iavf_ring *rx_ring) return 0; err: - 
kfree(rx_ring->rx_bi); - rx_ring->rx_bi = NULL; + kfree(rx_ring->rx_pages); + rx_ring->rx_pages = NULL; + return -ENOMEM; } @@ -820,36 +811,23 @@ static inline void iavf_release_rx_desc(struct iavf_ring *rx_ring, u32 val) } /** - * iavf_alloc_mapped_page - recycle or make a new page - * @rx_ring: ring to use - * @bi: rx_buffer struct to modify + * iavf_alloc_mapped_page - allocate and map a new page * @dev: device used for DMA mapping * @order: page order to allocate * @gfp: GFP mask to allocate page * - * Returns true if the page was successfully allocated or - * reused. + * Returns a new &page if the it was successfully allocated, %NULL otherwise. **/ -static bool iavf_alloc_mapped_page(struct iavf_ring *rx_ring, - struct iavf_rx_buffer *bi, - struct device *dev, u32 order, - gfp_t gfp) +static struct page *iavf_alloc_mapped_page(struct device *dev, u32 order, + gfp_t gfp) { - struct page *page = bi->page; + struct page *page; dma_addr_t dma; - /* since we are recycling buffers we should seldom need to alloc */ - if (likely(page)) { - rx_ring->rx_stats.page_reuse_count++; - return true; - } - /* alloc new page for storage */ page = __dev_alloc_pages(gfp, order); - if (unlikely(!page)) { - rx_ring->rx_stats.alloc_page_failed++; - return false; - } + if (unlikely(!page)) + return NULL; /* map page for use */ dma = dma_map_page_attrs(dev, page, 0, PAGE_SIZE << order, @@ -860,18 +838,12 @@ static bool iavf_alloc_mapped_page(struct iavf_ring *rx_ring, */ if (dma_mapping_error(dev, dma)) { __free_pages(page, order); - rx_ring->rx_stats.alloc_page_failed++; - return false; + return NULL; } - bi->dma = dma; - bi->page = page; - bi->page_offset = IAVF_SKB_PAD; - - /* initialize pagecnt_bias to 1 representing we fully own page */ - bi->pagecnt_bias = 1; + page_pool_set_dma_addr(page, dma); - return true; + return page; } /** @@ -896,7 +868,7 @@ static void iavf_receive_skb(struct iavf_ring *rx_ring, } /** - * __iavf_alloc_rx_buffers - Replace used receive buffers + * __iavf_alloc_rx_pages - Replace used receive pages * @rx_ring: ring to place buffers on * @to_refill: number of buffers to replace * @gfp: GFP mask to allocate pages @@ -904,42 +876,47 @@ static void iavf_receive_skb(struct iavf_ring *rx_ring, * Returns 0 if all allocations were successful or the number of buffers left * to refill in case of an allocation failure. **/ -static u32 __iavf_alloc_rx_buffers(struct iavf_ring *rx_ring, u32 to_refill, - gfp_t gfp) +static u32 __iavf_alloc_rx_pages(struct iavf_ring *rx_ring, u32 to_refill, + gfp_t gfp) { u32 order = iavf_rx_pg_order(rx_ring); struct device *dev = rx_ring->dev; u32 ntu = rx_ring->next_to_use; union iavf_rx_desc *rx_desc; - struct iavf_rx_buffer *bi; /* do nothing if no valid netdev defined */ if (unlikely(!rx_ring->netdev || !to_refill)) return 0; rx_desc = IAVF_RX_DESC(rx_ring, ntu); - bi = &rx_ring->rx_bi[ntu]; do { - if (!iavf_alloc_mapped_page(rx_ring, bi, dev, order, gfp)) + struct page *page; + dma_addr_t dma; + + page = iavf_alloc_mapped_page(dev, order, gfp); + if (!page) { + rx_ring->rx_stats.alloc_page_failed++; break; + } + + rx_ring->rx_pages[ntu] = page; + dma = page_pool_get_dma_addr(page); /* sync the buffer for use by the device */ - dma_sync_single_range_for_device(dev, bi->dma, bi->page_offset, + dma_sync_single_range_for_device(dev, dma, IAVF_SKB_PAD, rx_ring->rx_buf_len, DMA_FROM_DEVICE); /* Refresh the desc even if buffer_addrs didn't change * because each write-back erases this info. 
*/ - rx_desc->read.pkt_addr = cpu_to_le64(bi->dma + bi->page_offset); + rx_desc->read.pkt_addr = cpu_to_le64(dma + IAVF_SKB_PAD); rx_desc++; - bi++; ntu++; if (unlikely(ntu == rx_ring->count)) { rx_desc = IAVF_RX_DESC(rx_ring, 0); - bi = rx_ring->rx_bi; ntu = 0; } @@ -953,9 +930,9 @@ static u32 __iavf_alloc_rx_buffers(struct iavf_ring *rx_ring, u32 to_refill, return to_refill; } -void iavf_alloc_rx_buffers(struct iavf_ring *rxr) +void iavf_alloc_rx_pages(struct iavf_ring *rxr) { - __iavf_alloc_rx_buffers(rxr, IAVF_DESC_UNUSED(rxr), GFP_KERNEL); + __iavf_alloc_rx_pages(rxr, IAVF_DESC_UNUSED(rxr), GFP_KERNEL); } /** @@ -1100,80 +1077,20 @@ static bool iavf_cleanup_headers(struct iavf_ring *rx_ring, struct sk_buff *skb) return false; } -/** - * iavf_can_reuse_rx_page - Determine if this page can be reused by - * the adapter for another receive - * - * @rx_buffer: buffer containing the page - * - * If page is reusable, rx_buffer->page_offset is adjusted to point to - * an unused region in the page. - * - * For small pages, @truesize will be a constant value, half the size - * of the memory at page. We'll attempt to alternate between high and - * low halves of the page, with one half ready for use by the hardware - * and the other half being consumed by the stack. We use the page - * ref count to determine whether the stack has finished consuming the - * portion of this page that was passed up with a previous packet. If - * the page ref count is >1, we'll assume the "other" half page is - * still busy, and this page cannot be reused. - * - * For larger pages, @truesize will be the actual space used by the - * received packet (adjusted upward to an even multiple of the cache - * line size). This will advance through the page by the amount - * actually consumed by the received packets while there is still - * space for a buffer. Each region of larger pages will be used at - * most once, after which the page will not be reused. - * - * In either case, if the page is reusable its refcount is increased. - **/ -static bool iavf_can_reuse_rx_page(struct iavf_rx_buffer *rx_buffer) -{ - unsigned int pagecnt_bias = rx_buffer->pagecnt_bias; - struct page *page = rx_buffer->page; - - /* Is any reuse possible? */ - if (!dev_page_is_reusable(page)) - return false; - -#if (PAGE_SIZE < 8192) - /* if we are only owner of page we can reuse it */ - if (unlikely((page_count(page) - pagecnt_bias) > 1)) - return false; -#else -#define IAVF_LAST_OFFSET \ - (SKB_WITH_OVERHEAD(PAGE_SIZE) - IAVF_RXBUFFER_2048) - if (rx_buffer->page_offset > IAVF_LAST_OFFSET) - return false; -#endif - - /* If we have drained the page fragment pool we need to update - * the pagecnt_bias and page count so that we fully restock the - * number of references the driver holds. - */ - if (unlikely(!pagecnt_bias)) { - page_ref_add(page, USHRT_MAX); - rx_buffer->pagecnt_bias = USHRT_MAX; - } - - return true; -} - /** * iavf_add_rx_frag - Add contents of Rx buffer to sk_buff * @skb: sk_buff to place the data into - * @rx_buffer: buffer containing page to add + * @page: page containing data to add * @size: packet length from rx_desc * @pg_size: Rx buffer page size * - * This function will add the data contained in rx_buffer->page to the skb. + * This function will add the data contained in page to the skb. * It will just attach the page as a frag to the skb. * * The function will then update the page offset. 
**/ -static void iavf_add_rx_frag(struct sk_buff *skb, - struct iavf_rx_buffer *rx_buffer, - u32 size, u32 pg_size) +static void iavf_add_rx_frag(struct sk_buff *skb, struct page *page, u32 size, + u32 pg_size) { #if (PAGE_SIZE < 8192) unsigned int truesize = pg_size / 2; @@ -1181,46 +1098,34 @@ static void iavf_add_rx_frag(struct sk_buff *skb, unsigned int truesize = SKB_DATA_ALIGN(size + IAVF_SKB_PAD); #endif - skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, rx_buffer->page, - rx_buffer->page_offset, size, truesize); - - /* page is being used so we must update the page offset */ -#if (PAGE_SIZE < 8192) - rx_buffer->page_offset ^= truesize; -#else - rx_buffer->page_offset += truesize; -#endif - - /* We have pulled a buffer for use, so decrement pagecnt_bias */ - rx_buffer->pagecnt_bias--; + skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, IAVF_SKB_PAD, + size, truesize); } /** - * iavf_sync_rx_buffer - Synchronize received data for use + * iavf_sync_rx_page - Synchronize received data for use * @dev: device used for DMA mapping - * @buf: Rx buffer containing the data + * @page: Rx page containing the data * @size: size of the received data * * This function will synchronize the Rx buffer for use by the CPU. */ -static void iavf_sync_rx_buffer(struct device *dev, struct iavf_rx_buffer *buf, - u32 size) +static void iavf_sync_rx_page(struct device *dev, struct page *page, u32 size) { - dma_sync_single_range_for_cpu(dev, buf->dma, buf->page_offset, size, - DMA_FROM_DEVICE); + dma_sync_single_range_for_cpu(dev, page_pool_get_dma_addr(page), + IAVF_SKB_PAD, size, DMA_FROM_DEVICE); } /** * iavf_build_skb - Build skb around an existing buffer - * @rx_buffer: Rx buffer with the data + * @page: Rx page to with the data * @size: size of the data * @pg_size: size of the Rx page * * This function builds an skb around an existing Rx buffer, taking care * to set up the skb correctly and avoid any memcpy overhead. */ -static struct sk_buff *iavf_build_skb(struct iavf_rx_buffer *rx_buffer, - u32 size, u32 pg_size) +static struct sk_buff *iavf_build_skb(struct page *page, u32 size, u32 pg_size) { void *va; #if (PAGE_SIZE < 8192) @@ -1232,11 +1137,11 @@ static struct sk_buff *iavf_build_skb(struct iavf_rx_buffer *rx_buffer, struct sk_buff *skb; /* prefetch first cache line of first page */ - va = page_address(rx_buffer->page) + rx_buffer->page_offset; - net_prefetch(va); + va = page_address(page); + net_prefetch(va + IAVF_SKB_PAD); /* build an skb around the page buffer */ - skb = napi_build_skb(va - IAVF_SKB_PAD, truesize); + skb = napi_build_skb(va, truesize); if (unlikely(!skb)) return NULL; @@ -1244,42 +1149,21 @@ static struct sk_buff *iavf_build_skb(struct iavf_rx_buffer *rx_buffer, skb_reserve(skb, IAVF_SKB_PAD); __skb_put(skb, size); - /* buffer is used by skb, update page_offset */ -#if (PAGE_SIZE < 8192) - rx_buffer->page_offset ^= truesize; -#else - rx_buffer->page_offset += truesize; -#endif - - rx_buffer->pagecnt_bias--; - return skb; } /** - * iavf_put_rx_buffer - Recycle or free used buffer - * @rx_ring: rx descriptor ring to transact packets on + * iavf_unmap_rx_page - Unmap used page * @dev: device used for DMA mapping - * @rx_buffer: Rx buffer to handle + * @page: page to release * @pg_size: Rx page size - * - * Either recycle the buffer if possible or unmap and free the page. 
*/ -static void iavf_put_rx_buffer(struct iavf_ring *rx_ring, struct device *dev, - struct iavf_rx_buffer *rx_buffer, u32 pg_size) +static void iavf_unmap_rx_page(struct device *dev, struct page *page, + u32 pg_size) { - if (iavf_can_reuse_rx_page(rx_buffer)) { - rx_ring->rx_stats.page_reuse_count++; - return; - } - - /* we are not reusing the buffer so unmap it */ - dma_unmap_page_attrs(dev, rx_buffer->dma, pg_size, + dma_unmap_page_attrs(dev, page_pool_get_dma_addr(page), pg_size, DMA_FROM_DEVICE, IAVF_RX_DMA_ATTR); - __page_frag_cache_drain(rx_buffer->page, rx_buffer->pagecnt_bias); - - /* clear contents of buffer_info */ - rx_buffer->page = NULL; + page_pool_set_dma_addr(page, 0); } /** @@ -1332,8 +1216,8 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) u32 cleaned_count = 0; while (likely(cleaned_count < budget)) { - struct iavf_rx_buffer *rx_buffer; union iavf_rx_desc *rx_desc; + struct page *page; unsigned int size; u16 vlan_tag = 0; u8 rx_ptype; @@ -1341,8 +1225,8 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) /* return some buffers to hardware, one at a time is too slow */ if (to_refill >= IAVF_RX_BUFFER_WRITE) - to_refill = __iavf_alloc_rx_buffers(rx_ring, to_refill, - gfp); + to_refill = __iavf_alloc_rx_pages(rx_ring, to_refill, + gfp); rx_desc = IAVF_RX_DESC(rx_ring, ntc); @@ -1366,32 +1250,37 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) IAVF_RXD_QW1_LENGTH_PBUF_SHIFT; iavf_trace(clean_rx_irq, rx_ring, rx_desc, skb); - rx_buffer = &rx_ring->rx_bi[ntc]; + + page = rx_ring->rx_pages[ntc]; + rx_ring->rx_pages[ntc] = NULL; /* Very rare, but possible case. The most common reason: * the last fragment contained FCS only, which was then * stripped by the HW. */ - if (unlikely(!size)) + if (unlikely(!size)) { + iavf_unmap_rx_page(dev, page, pg_size); + __free_pages(page, get_order(pg_size)); goto skip_data; + } - iavf_sync_rx_buffer(dev, rx_buffer, size); + iavf_sync_rx_page(dev, page, size); + iavf_unmap_rx_page(dev, page, pg_size); /* retrieve a buffer from the ring */ if (skb) - iavf_add_rx_frag(skb, rx_buffer, size, pg_size); + iavf_add_rx_frag(skb, page, size, pg_size); else - skb = iavf_build_skb(rx_buffer, size, pg_size); + skb = iavf_build_skb(page, size, pg_size); /* exit if we failed to retrieve a buffer */ if (!skb) { + __free_pages(page, get_order(pg_size)); rx_ring->rx_stats.alloc_buff_failed++; break; } skip_data: - iavf_put_rx_buffer(rx_ring, dev, rx_buffer, pg_size); - cleaned_count++; to_refill++; if (unlikely(++ntc == ring_size)) @@ -1448,7 +1337,7 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) rx_ring->skb = skb; if (to_refill >= IAVF_RX_BUFFER_WRITE) { - to_refill = __iavf_alloc_rx_buffers(rx_ring, to_refill, gfp); + to_refill = __iavf_alloc_rx_pages(rx_ring, to_refill, gfp); /* guarantee a trip back through this routine if there was * a failure */ diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.h b/drivers/net/ethernet/intel/iavf/iavf_txrx.h index 9c6661a6edf2f2..c09ac580fe84cc 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.h +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.h @@ -272,17 +272,6 @@ struct iavf_tx_buffer { u32 tx_flags; }; -struct iavf_rx_buffer { - dma_addr_t dma; - struct page *page; -#if (BITS_PER_LONG > 32) || (PAGE_SIZE >= 65536) - __u32 page_offset; -#else - __u16 page_offset; -#endif - __u16 pagecnt_bias; -}; - struct iavf_queue_stats { u64 packets; u64 bytes; @@ -302,8 +291,6 @@ struct iavf_rx_queue_stats { u64 non_eop_descs; u64 
alloc_page_failed; u64 alloc_buff_failed; - u64 page_reuse_count; - u64 realloc_count; }; enum iavf_ring_state_t { @@ -331,7 +318,7 @@ struct iavf_ring { struct net_device *netdev; /* netdev ring maps to */ union { struct iavf_tx_buffer *tx_bi; - struct iavf_rx_buffer *rx_bi; + struct page **rx_pages; }; DECLARE_BITMAP(state, __IAVF_RING_STATE_NBITS); u16 queue_index; /* Queue number of ring */ @@ -425,7 +412,7 @@ static inline unsigned int iavf_rx_pg_order(struct iavf_ring *ring) #define iavf_rx_pg_size(_ring) (PAGE_SIZE << iavf_rx_pg_order(_ring)) -void iavf_alloc_rx_buffers(struct iavf_ring *rxr); +void iavf_alloc_rx_pages(struct iavf_ring *rxr); netdev_tx_t iavf_xmit_frame(struct sk_buff *skb, struct net_device *netdev); void iavf_clean_tx_ring(struct iavf_ring *tx_ring); void iavf_clean_rx_ring(struct iavf_ring *rx_ring); From 95a993b46b712af2304f5788aa7f691f07d05370 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Fri, 3 Feb 2023 17:18:57 +0100 Subject: [PATCH 05/40] iavf: always use a full order-0 page The current scheme with trying to pick the smallest buffer possible for the current MTU in order to flip/split pages is not very optimal. For example, on default MTU of 1500 it gives only 192 bytes of headroom, while XDP may require up to 258. But this also involves unnecessary code complication, which sometimes is even hard to follow. As page split is no more, always allocate order-0 pages. This optimizes performance a bit and drops some bytes off the object code. Next, always pick the maximum buffer length available for this %PAGE_SIZE to set it up in the hardware. This means it now becomes a constant value, which also has its positive impact. On x64 this means (without XDP): 4096 page 64 head, 320 tail 3712 HW buffer size 3686 max MTU w/o frags Previously, the maximum MTU w/o splitting a frame into several buffers was 3046. Increased buffer size allows us to reach the maximum frame size w/ frags supported by HW: 16382 bytes (MTU 16356). Reflect it in the netdev config as well. Relying on max single buffer size when calculating MTU was not correct. Move around a couple of fields in &iavf_ring after ::rx_buf_len removal to reduce holes and improve cache locality. Instead of providing the Rx definitions, which can and will be reused in rest of the drivers, exclusively for IAVF, do that in the libie header. Non-PP drivers could still use at least some of them and lose a couple copied lines. Function: add/remove: 0/0 grow/shrink: 3/9 up/down: 18/-265 (-247) + even reclaims a half percent of performance, nice. Signed-off-by: Alexander Lobakin --- drivers/net/ethernet/intel/iavf/iavf_main.c | 32 +----- drivers/net/ethernet/intel/iavf/iavf_txrx.c | 96 +++++++--------- drivers/net/ethernet/intel/iavf/iavf_txrx.h | 103 +----------------- drivers/net/ethernet/intel/iavf/iavf_type.h | 2 - .../net/ethernet/intel/iavf/iavf_virtchnl.c | 15 +-- include/linux/net/intel/libie/rx.h | 39 +++++++ 6 files changed, 89 insertions(+), 198 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index f7c585d10834ce..fb2bd1c423a158 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -1,6 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright(c) 2013 - 2018 Intel Corporation. 
*/ +#include + #include "iavf.h" #include "iavf_prototype.h" #include "iavf_client.h" @@ -709,32 +711,10 @@ static void iavf_configure_tx(struct iavf_adapter *adapter) **/ static void iavf_configure_rx(struct iavf_adapter *adapter) { - unsigned int rx_buf_len = IAVF_RXBUFFER_2048; struct iavf_hw *hw = &adapter->hw; - int i; - - if (PAGE_SIZE < 8192) { - struct net_device *netdev = adapter->netdev; - /* For jumbo frames on systems with 4K pages we have to use - * an order 1 page, so we might as well increase the size - * of our Rx buffer to make better use of the available space - */ - rx_buf_len = IAVF_RXBUFFER_3072; - - /* We use a 1536 buffer size for configurations with - * standard Ethernet mtu. On x86 this gives us enough room - * for shared info and 192 bytes of padding. - */ - if (!IAVF_2K_TOO_SMALL_WITH_PADDING && - (netdev->mtu <= ETH_DATA_LEN)) - rx_buf_len = IAVF_RXBUFFER_1536 - NET_IP_ALIGN; - } - - for (i = 0; i < adapter->num_active_queues; i++) { + for (u32 i = 0; i < adapter->num_active_queues; i++) adapter->rx_rings[i].tail = hw->hw_addr + IAVF_QRX_TAIL1(i); - adapter->rx_rings[i].rx_buf_len = rx_buf_len; - } } /** @@ -2583,11 +2563,7 @@ static void iavf_init_config_adapter(struct iavf_adapter *adapter) netdev->netdev_ops = &iavf_netdev_ops; iavf_set_ethtool_ops(netdev); - netdev->watchdog_timeo = 5 * HZ; - - /* MTU range: 68 - 9710 */ - netdev->min_mtu = ETH_MIN_MTU; - netdev->max_mtu = IAVF_MAX_RXBUFFER - IAVF_PACKET_HDR_PAD; + netdev->max_mtu = LIBIE_MAX_MTU; if (!is_valid_ether_addr(adapter->hw.mac.addr)) { dev_info(&pdev->dev, "Invalid MAC address %pM, using random\n", diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c index a761f3e3d7ccce..8e0e6d59cd3e10 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c @@ -301,7 +301,7 @@ static bool iavf_clean_tx_irq(struct iavf_vsi *vsi, ((j / WB_STRIDE) == 0) && (j > 0) && !test_bit(__IAVF_VSI_DOWN, vsi->state) && (IAVF_DESC_UNUSED(tx_ring) != tx_ring->count)) - tx_ring->arm_wb = true; + tx_ring->flags |= IAVF_TXRX_FLAGS_ARM_WB; } /* notify netdev of completed buffers */ @@ -714,17 +714,16 @@ void iavf_clean_rx_ring(struct iavf_ring *rx_ring) /* Invalidate cache lines that may have been written to by * device so that we avoid corrupting memory. */ - dma_sync_single_range_for_cpu(rx_ring->dev, dma, IAVF_SKB_PAD, - rx_ring->rx_buf_len, + dma_sync_single_range_for_cpu(rx_ring->dev, dma, + LIBIE_SKB_HEADROOM, + LIBIE_RX_BUF_LEN, DMA_FROM_DEVICE); /* free resources associated with mapping */ - dma_unmap_page_attrs(rx_ring->dev, dma, - iavf_rx_pg_size(rx_ring), - DMA_FROM_DEVICE, - IAVF_RX_DMA_ATTR); + dma_unmap_page_attrs(rx_ring->dev, dma, LIBIE_RX_TRUESIZE, + DMA_FROM_DEVICE, LIBIE_RX_DMA_ATTR); - __free_pages(page, iavf_rx_pg_order(rx_ring)); + __free_page(page); } rx_ring->next_to_clean = 0; @@ -813,31 +812,29 @@ static inline void iavf_release_rx_desc(struct iavf_ring *rx_ring, u32 val) /** * iavf_alloc_mapped_page - allocate and map a new page * @dev: device used for DMA mapping - * @order: page order to allocate * @gfp: GFP mask to allocate page * * Returns a new &page if the it was successfully allocated, %NULL otherwise. 
**/ -static struct page *iavf_alloc_mapped_page(struct device *dev, u32 order, - gfp_t gfp) +static struct page *iavf_alloc_mapped_page(struct device *dev, gfp_t gfp) { struct page *page; dma_addr_t dma; /* alloc new page for storage */ - page = __dev_alloc_pages(gfp, order); + page = __dev_alloc_page(gfp); if (unlikely(!page)) return NULL; /* map page for use */ - dma = dma_map_page_attrs(dev, page, 0, PAGE_SIZE << order, - DMA_FROM_DEVICE, IAVF_RX_DMA_ATTR); + dma = dma_map_page_attrs(dev, page, 0, PAGE_SIZE, DMA_FROM_DEVICE, + LIBIE_RX_DMA_ATTR); /* if mapping failed free memory back to system since * there isn't much point in holding memory we can't use */ if (dma_mapping_error(dev, dma)) { - __free_pages(page, order); + __free_page(page); return NULL; } @@ -879,7 +876,6 @@ static void iavf_receive_skb(struct iavf_ring *rx_ring, static u32 __iavf_alloc_rx_pages(struct iavf_ring *rx_ring, u32 to_refill, gfp_t gfp) { - u32 order = iavf_rx_pg_order(rx_ring); struct device *dev = rx_ring->dev; u32 ntu = rx_ring->next_to_use; union iavf_rx_desc *rx_desc; @@ -894,7 +890,7 @@ static u32 __iavf_alloc_rx_pages(struct iavf_ring *rx_ring, u32 to_refill, struct page *page; dma_addr_t dma; - page = iavf_alloc_mapped_page(dev, order, gfp); + page = iavf_alloc_mapped_page(dev, gfp); if (!page) { rx_ring->rx_stats.alloc_page_failed++; break; @@ -904,14 +900,14 @@ static u32 __iavf_alloc_rx_pages(struct iavf_ring *rx_ring, u32 to_refill, dma = page_pool_get_dma_addr(page); /* sync the buffer for use by the device */ - dma_sync_single_range_for_device(dev, dma, IAVF_SKB_PAD, - rx_ring->rx_buf_len, + dma_sync_single_range_for_device(dev, dma, LIBIE_SKB_HEADROOM, + LIBIE_RX_BUF_LEN, DMA_FROM_DEVICE); /* Refresh the desc even if buffer_addrs didn't change * because each write-back erases this info. */ - rx_desc->read.pkt_addr = cpu_to_le64(dma + IAVF_SKB_PAD); + rx_desc->read.pkt_addr = cpu_to_le64(dma + LIBIE_SKB_HEADROOM); rx_desc++; ntu++; @@ -1082,24 +1078,16 @@ static bool iavf_cleanup_headers(struct iavf_ring *rx_ring, struct sk_buff *skb) * @skb: sk_buff to place the data into * @page: page containing data to add * @size: packet length from rx_desc - * @pg_size: Rx buffer page size * * This function will add the data contained in page to the skb. * It will just attach the page as a frag to the skb. * * The function will then update the page offset. **/ -static void iavf_add_rx_frag(struct sk_buff *skb, struct page *page, u32 size, - u32 pg_size) +static void iavf_add_rx_frag(struct sk_buff *skb, struct page *page, u32 size) { -#if (PAGE_SIZE < 8192) - unsigned int truesize = pg_size / 2; -#else - unsigned int truesize = SKB_DATA_ALIGN(size + IAVF_SKB_PAD); -#endif - - skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, IAVF_SKB_PAD, - size, truesize); + skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, + LIBIE_SKB_HEADROOM, size, LIBIE_RX_TRUESIZE); } /** @@ -1113,40 +1101,34 @@ static void iavf_add_rx_frag(struct sk_buff *skb, struct page *page, u32 size, static void iavf_sync_rx_page(struct device *dev, struct page *page, u32 size) { dma_sync_single_range_for_cpu(dev, page_pool_get_dma_addr(page), - IAVF_SKB_PAD, size, DMA_FROM_DEVICE); + LIBIE_SKB_HEADROOM, size, + DMA_FROM_DEVICE); } /** * iavf_build_skb - Build skb around an existing buffer * @page: Rx page to with the data * @size: size of the data - * @pg_size: size of the Rx page * * This function builds an skb around an existing Rx buffer, taking care * to set up the skb correctly and avoid any memcpy overhead. 
*/ -static struct sk_buff *iavf_build_skb(struct page *page, u32 size, u32 pg_size) +static struct sk_buff *iavf_build_skb(struct page *page, u32 size) { - void *va; -#if (PAGE_SIZE < 8192) - unsigned int truesize = pg_size / 2; -#else - unsigned int truesize = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) + - SKB_DATA_ALIGN(IAVF_SKB_PAD + size); -#endif struct sk_buff *skb; + void *va; /* prefetch first cache line of first page */ va = page_address(page); - net_prefetch(va + IAVF_SKB_PAD); + net_prefetch(va + LIBIE_SKB_HEADROOM); /* build an skb around the page buffer */ - skb = napi_build_skb(va, truesize); + skb = napi_build_skb(va, LIBIE_RX_TRUESIZE); if (unlikely(!skb)) return NULL; /* update pointers within the skb to store the data */ - skb_reserve(skb, IAVF_SKB_PAD); + skb_reserve(skb, LIBIE_SKB_HEADROOM); __skb_put(skb, size); return skb; @@ -1156,13 +1138,12 @@ static struct sk_buff *iavf_build_skb(struct page *page, u32 size, u32 pg_size) * iavf_unmap_rx_page - Unmap used page * @dev: device used for DMA mapping * @page: page to release - * @pg_size: Rx page size */ -static void iavf_unmap_rx_page(struct device *dev, struct page *page, - u32 pg_size) +static void iavf_unmap_rx_page(struct device *dev, struct page *page) { - dma_unmap_page_attrs(dev, page_pool_get_dma_addr(page), pg_size, - DMA_FROM_DEVICE, IAVF_RX_DMA_ATTR); + dma_unmap_page_attrs(dev, page_pool_get_dma_addr(page), + LIBIE_RX_TRUESIZE, DMA_FROM_DEVICE, + LIBIE_RX_DMA_ATTR); page_pool_set_dma_addr(page, 0); } @@ -1208,7 +1189,6 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) unsigned int total_rx_bytes = 0, total_rx_packets = 0; const gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; u32 to_refill = IAVF_DESC_UNUSED(rx_ring); - u32 pg_size = iavf_rx_pg_size(rx_ring); struct sk_buff *skb = rx_ring->skb; struct device *dev = rx_ring->dev; u32 ntc = rx_ring->next_to_clean; @@ -1259,23 +1239,23 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) * stripped by the HW. 
*/ if (unlikely(!size)) { - iavf_unmap_rx_page(dev, page, pg_size); - __free_pages(page, get_order(pg_size)); + iavf_unmap_rx_page(dev, page); + __free_page(page); goto skip_data; } iavf_sync_rx_page(dev, page, size); - iavf_unmap_rx_page(dev, page, pg_size); + iavf_unmap_rx_page(dev, page); /* retrieve a buffer from the ring */ if (skb) - iavf_add_rx_frag(skb, page, size, pg_size); + iavf_add_rx_frag(skb, page, size); else - skb = iavf_build_skb(page, size, pg_size); + skb = iavf_build_skb(page, size); /* exit if we failed to retrieve a buffer */ if (!skb) { - __free_pages(page, get_order(pg_size)); + __free_page(page); rx_ring->rx_stats.alloc_buff_failed++; break; } @@ -1485,8 +1465,8 @@ int iavf_napi_poll(struct napi_struct *napi, int budget) clean_complete = false; continue; } - arm_wb |= ring->arm_wb; - ring->arm_wb = false; + arm_wb |= !!(ring->flags & IAVF_TXRX_FLAGS_ARM_WB); + ring->flags &= ~IAVF_TXRX_FLAGS_ARM_WB; } /* Handle case where we are called by netpoll with a budget of 0 */ diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.h b/drivers/net/ethernet/intel/iavf/iavf_txrx.h index c09ac580fe84cc..25459411000a66 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.h +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.h @@ -81,79 +81,8 @@ enum iavf_dyn_idx_t { BIT_ULL(IAVF_FILTER_PCTYPE_NONF_UNICAST_IPV6_UDP) | \ BIT_ULL(IAVF_FILTER_PCTYPE_NONF_MULTICAST_IPV6_UDP)) -/* Supported Rx Buffer Sizes (a multiple of 128) */ -#define IAVF_RXBUFFER_256 256 -#define IAVF_RXBUFFER_1536 1536 /* 128B aligned standard Ethernet frame */ -#define IAVF_RXBUFFER_2048 2048 -#define IAVF_RXBUFFER_3072 3072 /* Used for large frames w/ padding */ -#define IAVF_MAX_RXBUFFER 9728 /* largest size for single descriptor */ - -/* NOTE: netdev_alloc_skb reserves up to 64 bytes, NET_IP_ALIGN means we - * reserve 2 more, and skb_shared_info adds an additional 384 bytes more, - * this adds up to 512 bytes of extra data meaning the smallest allocation - * we could have is 1K. - * i.e. RXBUFFER_256 --> 960 byte skb (size-1024 slab) - * i.e. RXBUFFER_512 --> 1216 byte skb (size-2048 slab) - */ -#define IAVF_RX_HDR_SIZE IAVF_RXBUFFER_256 -#define IAVF_PACKET_HDR_PAD (ETH_HLEN + ETH_FCS_LEN + (VLAN_HLEN * 2)) #define iavf_rx_desc iavf_32byte_rx_desc -#define IAVF_RX_DMA_ATTR \ - (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING) - -/* Attempt to maximize the headroom available for incoming frames. We - * use a 2K buffer for receives and need 1536/1534 to store the data for - * the frame. This leaves us with 512 bytes of room. From that we need - * to deduct the space needed for the shared info and the padding needed - * to IP align the frame. - * - * Note: For cache line sizes 256 or larger this value is going to end - * up negative. In these cases we should fall back to the legacy - * receive path. - */ -#if (PAGE_SIZE < 8192) -#define IAVF_2K_TOO_SMALL_WITH_PADDING \ -((NET_SKB_PAD + IAVF_RXBUFFER_1536) > SKB_WITH_OVERHEAD(IAVF_RXBUFFER_2048)) - -static inline int iavf_compute_pad(int rx_buf_len) -{ - int page_size, pad_size; - - page_size = ALIGN(rx_buf_len, PAGE_SIZE / 2); - pad_size = SKB_WITH_OVERHEAD(page_size) - rx_buf_len; - - return pad_size; -} - -static inline int iavf_skb_pad(void) -{ - int rx_buf_len; - - /* If a 2K buffer cannot handle a standard Ethernet frame then - * optimize padding for a 3K buffer instead of a 1.5K buffer. - * - * For a 3K buffer we need to add enough padding to allow for - * tailroom due to NET_IP_ALIGN possibly shifting us out of - * cache-line alignment. 
- */ - if (IAVF_2K_TOO_SMALL_WITH_PADDING) - rx_buf_len = IAVF_RXBUFFER_3072 + SKB_DATA_ALIGN(NET_IP_ALIGN); - else - rx_buf_len = IAVF_RXBUFFER_1536; - - /* if needed make room for NET_IP_ALIGN */ - rx_buf_len -= NET_IP_ALIGN; - - return iavf_compute_pad(rx_buf_len); -} - -#define IAVF_SKB_PAD iavf_skb_pad() -#else -#define IAVF_2K_TOO_SMALL_WITH_PADDING false -#define IAVF_SKB_PAD (NET_SKB_PAD + NET_IP_ALIGN) -#endif - /** * iavf_test_staterr - tests bits in Rx descriptor status and error fields * @rx_desc: pointer to receive descriptor (in le64 format) @@ -293,12 +222,6 @@ struct iavf_rx_queue_stats { u64 alloc_buff_failed; }; -enum iavf_ring_state_t { - __IAVF_TX_FDIR_INIT_DONE, - __IAVF_TX_XPS_INIT_DONE, - __IAVF_RING_STATE_NBITS /* must be last */ -}; - /* some useful defines for virtchannel interface, which * is the only remaining user of header split */ @@ -320,10 +243,9 @@ struct iavf_ring { struct iavf_tx_buffer *tx_bi; struct page **rx_pages; }; - DECLARE_BITMAP(state, __IAVF_RING_STATE_NBITS); + u8 __iomem *tail; u16 queue_index; /* Queue number of ring */ u8 dcb_tc; /* Traffic class of ring */ - u8 __iomem *tail; /* high bit set means dynamic, use accessors routines to read/write. * hardware only supports 2us resolution for the ITR registers. @@ -332,24 +254,16 @@ struct iavf_ring { */ u16 itr_setting; - u16 count; /* Number of descriptors */ u16 reg_idx; /* HW register index of the ring */ - u16 rx_buf_len; + u16 count; /* Number of descriptors */ /* used in interrupt processing */ u16 next_to_use; u16 next_to_clean; - u8 atr_sample_rate; - u8 atr_count; - - bool ring_active; /* is ring online or not */ - bool arm_wb; /* do something to arm write back */ - u8 packet_stride; - u16 flags; #define IAVF_TXR_FLAGS_WB_ON_ITR BIT(0) -/* BIT(1) is free, was IAVF_RXR_FLAGS_BUILD_SKB_ENABLED */ +#define IAVF_TXRX_FLAGS_ARM_WB BIT(1) /* BIT(2) is free */ #define IAVF_TXRX_FLAGS_VLAN_TAG_LOC_L2TAG1 BIT(3) #define IAVF_TXR_FLAGS_VLAN_TAG_LOC_L2TAG2 BIT(4) @@ -401,17 +315,6 @@ struct iavf_ring_container { #define iavf_for_each_ring(pos, head) \ for (pos = (head).ring; pos != NULL; pos = pos->next) -static inline unsigned int iavf_rx_pg_order(struct iavf_ring *ring) -{ -#if (PAGE_SIZE < 8192) - if (ring->rx_buf_len > (PAGE_SIZE / 2)) - return 1; -#endif - return 0; -} - -#define iavf_rx_pg_size(_ring) (PAGE_SIZE << iavf_rx_pg_order(_ring)) - void iavf_alloc_rx_pages(struct iavf_ring *rxr); netdev_tx_t iavf_xmit_frame(struct sk_buff *skb, struct net_device *netdev); void iavf_clean_tx_ring(struct iavf_ring *tx_ring); diff --git a/drivers/net/ethernet/intel/iavf/iavf_type.h b/drivers/net/ethernet/intel/iavf/iavf_type.h index 3030ba33032603..bb90d8f3ad7efe 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_type.h +++ b/drivers/net/ethernet/intel/iavf/iavf_type.h @@ -10,8 +10,6 @@ #include "iavf_adminq.h" #include "iavf_devids.h" -#define IAVF_RXQ_CTX_DBUFF_SHIFT 7 - /* IAVF_MASK is a macro used on 32 bit registers */ #define IAVF_MASK(mask, shift) ((u32)(mask) << (shift)) diff --git a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c index c2e328ec5af8f0..3a031d8b9685e2 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c +++ b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c @@ -1,6 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright(c) 2013 - 2018 Intel Corporation. 
*/ +#include + #include "iavf.h" #include "iavf_prototype.h" #include "iavf_client.h" @@ -269,13 +271,12 @@ int iavf_get_vf_vlan_v2_caps(struct iavf_adapter *adapter) void iavf_configure_queues(struct iavf_adapter *adapter) { struct virtchnl_vsi_queue_config_info *vqci; - int i, max_frame = adapter->vf_res->max_mtu; + u32 i, max_frame = adapter->vf_res->max_mtu; int pairs = adapter->num_active_queues; struct virtchnl_queue_pair_info *vqpi; size_t len; - if (max_frame > IAVF_MAX_RXBUFFER || !max_frame) - max_frame = IAVF_MAX_RXBUFFER; + max_frame = min_not_zero(max_frame, LIBIE_MAX_RX_FRM_LEN); if (adapter->current_op != VIRTCHNL_OP_UNKNOWN) { /* bail because we already have a command pending */ @@ -289,10 +290,6 @@ void iavf_configure_queues(struct iavf_adapter *adapter) if (!vqci) return; - /* Limit maximum frame size when jumbo frames is not enabled */ - if (adapter->netdev->mtu <= ETH_DATA_LEN) - max_frame = IAVF_RXBUFFER_1536 - NET_IP_ALIGN; - vqci->vsi_id = adapter->vsi_res->vsi_id; vqci->num_queue_pairs = pairs; vqpi = vqci->qpair; @@ -309,9 +306,7 @@ void iavf_configure_queues(struct iavf_adapter *adapter) vqpi->rxq.ring_len = adapter->rx_rings[i].count; vqpi->rxq.dma_ring_addr = adapter->rx_rings[i].dma; vqpi->rxq.max_pkt_size = max_frame; - vqpi->rxq.databuffer_size = - ALIGN(adapter->rx_rings[i].rx_buf_len, - BIT_ULL(IAVF_RXQ_CTX_DBUFF_SHIFT)); + vqpi->rxq.databuffer_size = LIBIE_RX_BUF_LEN; vqpi++; } diff --git a/include/linux/net/intel/libie/rx.h b/include/linux/net/intel/libie/rx.h index 58bd0f35d0253f..9c9db68d3f3f61 100644 --- a/include/linux/net/intel/libie/rx.h +++ b/include/linux/net/intel/libie/rx.h @@ -4,6 +4,7 @@ #ifndef __LIBIE_RX_H #define __LIBIE_RX_H +#include #include /* O(1) converting i40e/ice/iavf's 8/10-bit hardware packet type to a parsed @@ -125,4 +126,42 @@ static inline void libie_skb_set_hash(struct sk_buff *skb, u32 hash, skb_set_hash(skb, hash, parsed.payload_layer); } +/* Rx MTU/buffer/truesize helpers. Mostly pure software-side; HW-defined values + * are valid for all Intel HW. + */ + +/* Space reserved in front of each frame */ +#define LIBIE_SKB_HEADROOM (NET_SKB_PAD + NET_IP_ALIGN) +/* Link layer / L2 overhead: Ethernet, 2 VLAN tags (C + S), FCS */ +#define LIBIE_RX_LL_LEN (ETH_HLEN + 2 * VLAN_HLEN + ETH_FCS_LEN) + +/* Truesize: total space wasted on each frame. Always use order-0 pages */ +#define LIBIE_RX_PAGE_ORDER 0 +#define LIBIE_RX_TRUESIZE (PAGE_SIZE << LIBIE_RX_PAGE_ORDER) +/* Rx buffer size config is a multiple of 128 */ +#define LIBIE_RX_BUF_LEN_ALIGN 128 +/* HW-writeable space in one buffer: truesize - headroom/tailroom, + * HW-aligned + */ +#define __LIBIE_RX_BUF_LEN \ + ALIGN_DOWN(SKB_MAX_ORDER(LIBIE_SKB_HEADROOM, LIBIE_RX_PAGE_ORDER), \ + LIBIE_RX_BUF_LEN_ALIGN) +/* The largest size for a single descriptor as per HW */ +#define LIBIE_MAX_RX_BUF_LEN 9728U +/* "True" HW-writeable space: minimum from SW and HW values */ +#define LIBIE_RX_BUF_LEN min_t(u32, __LIBIE_RX_BUF_LEN, \ + LIBIE_MAX_RX_BUF_LEN) + +/* The maximum frame size as per HW (S/G) */ +#define __LIBIE_MAX_RX_FRM_LEN 16382U +/* ATST, HW can chain up to 5 Rx descriptors */ +#define LIBIE_MAX_RX_FRM_LEN min_t(u32, __LIBIE_MAX_RX_FRM_LEN, \ + LIBIE_RX_BUF_LEN * 5) +/* Maximum frame size minus LL overhead */ +#define LIBIE_MAX_MTU (LIBIE_MAX_RX_FRM_LEN - LIBIE_RX_LL_LEN) + +/* DMA mapping attributes for Rx buffers: no impl. 
sync + relaxed on Sparc */ +#define LIBIE_RX_DMA_ATTR \ + (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING) + #endif /* __LIBIE_RX_H */ From 49b4e5c61faf7ac8d7df33165ba5fb6bcb001f04 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Fri, 3 Feb 2023 18:15:47 +0100 Subject: [PATCH 06/40] net: page_pool: allow DMA mapping with %DMA_ATTR_WEAK_ORDERING Add a new flag, %PP_FLAG_DMA_MAP_WEAK, whill will tell PP to map pages with %DMA_ATTR_WEAK_ORDERING. To keep the code simple and optimized, map the following PP flags to DMA map attr flags: %PP_FLAG_DMA_MAP => %DMA_ATTR_SKIP_CPU_SYNC %PP_FLAG_DMA_MAP_WEAK => %DMA_ATTR_WEAK_ORDERING The first pair is done to be able to just pass it directly to dma_map_page_attrs(). When a driver wants Page Pool to maintain DMA mappings, it always sets this flag. Page Pool always skips CPU syncs when mapping to do that separately later, so having those two 1:1 avoids introducing ifs and/or bit-ors and keeps the code more compact. Signed-off-by: Alexander Lobakin --- include/net/page_pool.h | 12 +++++++++--- net/core/page_pool.c | 20 ++++++++++++++++++-- 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/include/net/page_pool.h b/include/net/page_pool.h index ddfa0b32867776..dec5772e851030 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -34,10 +34,16 @@ #include #include -#define PP_FLAG_DMA_MAP BIT(0) /* Should page_pool do the DMA +#define PP_FLAG_DMA_MAP BIT(5) /* Should page_pool do the DMA * map/unmap */ -#define PP_FLAG_DMA_SYNC_DEV BIT(1) /* If set all pages that the driver gets +#define PP_FLAG_DMA_MAP_WEAK BIT(1) /* Map with %DMA_ATTR_WEAK_ORDERING */ +/* These flags correspond to the DMA map attributes to pass them directly to + * dma_map_page_attrs(), see page_pool_dma_map(). + */ +#define PP_FLAG_DMA_ATTR (PP_FLAG_DMA_MAP | \ + PP_FLAG_DMA_MAP_WEAK) +#define PP_FLAG_DMA_SYNC_DEV BIT(0) /* If set all pages that the driver gets * from page_pool will be * DMA-synced-for-device according to * the length provided by the device @@ -46,7 +52,7 @@ * device driver responsibility */ #define PP_FLAG_PAGE_FRAG BIT(2) /* for page frag feature */ -#define PP_FLAG_ALL (PP_FLAG_DMA_MAP |\ +#define PP_FLAG_ALL (PP_FLAG_DMA_ATTR |\ PP_FLAG_DMA_SYNC_DEV |\ PP_FLAG_PAGE_FRAG) diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 193c1879986503..74e25b55e2f062 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -161,6 +161,13 @@ static int page_pool_init(struct page_pool *pool, return -EINVAL; } + /* Passing DMA mapping attributes without asking PP to map pages + * makes no sense. + */ + if ((pool->p.flags & PP_FLAG_DMA_ATTR) && + !(pool->p.flags & PP_FLAG_DMA_MAP)) + return -EINVAL; + if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) { /* In order to request DMA-sync-for-device the page * needs to be mapped @@ -308,6 +315,14 @@ static bool page_pool_dma_map(struct page_pool *pool, struct page *page) { dma_addr_t dma; + /* Pages are always mapped with %DMA_ATTR_SKIP_CPU_SYNC, so its value + * corresponds to %PP_FLAG_DMA_MAP, which is always set when reaching + * this function. 
+ */ + static_assert(PP_FLAG_DMA_MAP == DMA_ATTR_SKIP_CPU_SYNC); + /* Drivers may set this for PP to map with weak ordering */ + static_assert(PP_FLAG_DMA_MAP_WEAK == DMA_ATTR_WEAK_ORDERING); + /* Setup DMA mapping: use 'struct page' area for storing DMA-addr * since dma_addr_t can be either 32 or 64 bits and does not always fit * into page private data (i.e 32bit cpu with 64bit DMA caps) @@ -315,7 +330,8 @@ static bool page_pool_dma_map(struct page_pool *pool, struct page *page) */ dma = dma_map_page_attrs(pool->p.dev, page, 0, (PAGE_SIZE << pool->p.order), - pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC); + pool->p.dma_dir, + pool->p.flags & PP_FLAG_DMA_ATTR); if (dma_mapping_error(pool->p.dev, dma)) return false; @@ -483,7 +499,7 @@ void page_pool_release_page(struct page_pool *pool, struct page *page) /* When page is unmapped, it cannot be returned to our pool */ dma_unmap_page_attrs(pool->p.dev, dma, PAGE_SIZE << pool->p.order, pool->p.dma_dir, - DMA_ATTR_SKIP_CPU_SYNC); + pool->p.flags & PP_FLAG_DMA_ATTR); page_pool_set_dma_addr(page, 0); skip_dma_unmap: page_pool_clear_pp_info(page); From b71da32186369685ebc909aec6fae6b2bbedfb5c Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Thu, 9 Mar 2023 21:26:57 +0100 Subject: [PATCH 07/40] net: page_pool: add DMA-sync-for-CPU inline helpers Each driver is responsible for syncing buffers written by HW for CPU before accessing them. Almost each PP-enabled driver uses the same pattern, which could be shorthanded into a static inline to make driver code a little bit more compact. Introduce a pair of such functions. The first one takes the actual size of the data written by HW and is the main one to be used on Rx. The second picks max_len from the PP params and is designed for more extreme cases when the size is unknown, but the buffer still needs to be synced. Also constify pointer arguments of page_pool_get_dma_dir() and page_pool_get_dma_addr() to give a bit more room for optimization, as both of them are read-only. Signed-off-by: Alexander Lobakin --- include/net/page_pool.h | 39 +++++++++++++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 4 deletions(-) diff --git a/include/net/page_pool.h b/include/net/page_pool.h index dec5772e851030..fb949d168e14c0 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -32,7 +32,7 @@ #include /* Needed by ptr_ring */ #include -#include +#include #define PP_FLAG_DMA_MAP BIT(5) /* Should page_pool do the DMA * map/unmap @@ -239,8 +239,8 @@ static inline struct page *page_pool_dev_alloc_frag(struct page_pool *pool, /* get the stored dma direction. 
A driver might decide to treat this locally and * avoid the extra cache line from page_pool to determine the direction */ -static -inline enum dma_data_direction page_pool_get_dma_dir(struct page_pool *pool) +static inline enum dma_data_direction +page_pool_get_dma_dir(const struct page_pool *pool) { return pool->p.dma_dir; } @@ -360,7 +360,7 @@ static inline void page_pool_recycle_direct(struct page_pool *pool, #define PAGE_POOL_DMA_USE_PP_FRAG_COUNT \ (sizeof(dma_addr_t) > sizeof(unsigned long)) -static inline dma_addr_t page_pool_get_dma_addr(struct page *page) +static inline dma_addr_t page_pool_get_dma_addr(const struct page *page) { dma_addr_t ret = page->dma_addr; @@ -377,6 +377,37 @@ static inline void page_pool_set_dma_addr(struct page *page, dma_addr_t addr) page->dma_addr_upper = upper_32_bits(addr); } +/** + * page_pool_dma_sync_for_cpu - sync Rx page for CPU after it's written by HW + * @pool: page_pool which this page belongs to + * @page: page to sync + * @dma_sync_size: size of the data written to the page + * + * Can be used as a shorthand to sync Rx pages before accessing them in the + * driver. The caller must ensure the pool was created with %PP_FLAG_DMA_MAP. + */ +static inline void page_pool_dma_sync_for_cpu(const struct page_pool *pool, + const struct page *page, + u32 dma_sync_size) +{ + dma_sync_single_range_for_cpu(pool->p.dev, + page_pool_get_dma_addr(page), + pool->p.offset, dma_sync_size, + page_pool_get_dma_dir(pool)); +} + +/** + * page_pool_dma_sync_for_cpu - sync full Rx page for CPU + * @pool: page_pool which this page belongs to + * @page: page to sync + */ +static inline void +page_pool_dma_sync_full_for_cpu(const struct page_pool *pool, + const struct page *page) +{ + page_pool_dma_sync_for_cpu(pool, page, pool->p.max_len); +} + static inline bool is_page_pool_compiled_in(void) { #ifdef CONFIG_PAGE_POOL From b71ce3cb9ecd183b2e8e41d53454b1bdc517ba27 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Thu, 9 Mar 2023 13:31:08 +0100 Subject: [PATCH 08/40] iavf: switch to Page Pool Now that the IAVF driver simply uses dev_alloc_page() + free_page() with no custom recycling logics and one whole page per frame, it can easily be switched to using Page Pool API instead. Introduce libie_rx_page_pool_create(), a wrapper for creating a PP with the default libie settings applicable to all Intel hardware, and replace the alloc/free calls with the corresponding PP functions, including the newly added sync-for-CPU helpers. Use skb_mark_for_recycle() to bring back the recycling and restore the initial performance. From the important object code changes, worth mentioning that __iavf_alloc_rx_pages() is now inlined due to the greatly reduced size. The resulting driver is on par with the pre-series code and 1-2% slower than the "optimized" version right before the recycling removal. But the number of locs and object code bytes slaughtered is much more important here after all, not speaking of that there's still a vast space for optimization and improvements. 
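A condensed, illustrative sketch of the driver-side pattern this conversion results in (the example_* helpers are hypothetical and merely condense the hunks below; ring bookkeeping, stats and error unwinding are omitted):

	/* Queue setup: one pool per Rx ring, sized to the ring length. */
	static int example_setup_rx_pool(struct iavf_ring *rx_ring)
	{
		struct page_pool *pool;

		pool = libie_rx_page_pool_create(rx_ring->netdev, rx_ring->count);
		if (IS_ERR(pool))
			return PTR_ERR(pool);

		rx_ring->pool = pool;

		return 0;
	}

	/* Refill: the pool hands out already-mapped pages, so no manual
	 * dma_map_page_attrs() / sync-for-device is needed here.
	 */
	static bool example_refill_one(struct iavf_ring *rx_ring, u32 ntu, gfp_t gfp)
	{
		union iavf_rx_desc *rx_desc = IAVF_RX_DESC(rx_ring, ntu);
		struct page *page;

		page = page_pool_alloc_pages(rx_ring->pool, gfp);
		if (unlikely(!page))
			return false;

		rx_ring->rx_pages[ntu] = page;
		rx_desc->read.pkt_addr = cpu_to_le64(page_pool_get_dma_addr(page) +
						     LIBIE_SKB_HEADROOM);

		return true;
	}

	/* Completion: sync only what HW actually wrote, build the skb and mark
	 * it for recycling so the page returns to the pool when consumed.
	 */
	static struct sk_buff *example_rx_build_skb(struct iavf_ring *rx_ring,
						    struct page *page, u32 size)
	{
		struct sk_buff *skb;

		page_pool_dma_sync_for_cpu(rx_ring->pool, page, size);

		skb = napi_build_skb(page_address(page), LIBIE_RX_TRUESIZE);
		if (unlikely(!skb)) {
			page_pool_put_full_page(rx_ring->pool, page, true);
			return NULL;
		}

		skb_mark_for_recycle(skb);
		skb_reserve(skb, LIBIE_SKB_HEADROOM);
		__skb_put(skb, size);

		return skb;
	}
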
Signed-off-by: Alexander Lobakin --- drivers/net/ethernet/intel/iavf/iavf_txrx.c | 124 +++++--------------- drivers/net/ethernet/intel/iavf/iavf_txrx.h | 5 +- drivers/net/ethernet/intel/libie/rx.c | 31 +++++ include/linux/net/intel/libie/rx.h | 3 + 4 files changed, 69 insertions(+), 94 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c index 8e0e6d59cd3e10..5d087f9b38ed47 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c @@ -690,8 +690,6 @@ int iavf_setup_tx_descriptors(struct iavf_ring *tx_ring) **/ void iavf_clean_rx_ring(struct iavf_ring *rx_ring) { - u16 i; - /* ring already cleared, nothing to do */ if (!rx_ring->rx_pages) return; @@ -702,28 +700,17 @@ void iavf_clean_rx_ring(struct iavf_ring *rx_ring) } /* Free all the Rx ring sk_buffs */ - for (i = 0; i < rx_ring->count; i++) { + for (u32 i = 0; i < rx_ring->count; i++) { struct page *page = rx_ring->rx_pages[i]; - dma_addr_t dma; if (!page) continue; - dma = page_pool_get_dma_addr(page); - /* Invalidate cache lines that may have been written to by * device so that we avoid corrupting memory. */ - dma_sync_single_range_for_cpu(rx_ring->dev, dma, - LIBIE_SKB_HEADROOM, - LIBIE_RX_BUF_LEN, - DMA_FROM_DEVICE); - - /* free resources associated with mapping */ - dma_unmap_page_attrs(rx_ring->dev, dma, LIBIE_RX_TRUESIZE, - DMA_FROM_DEVICE, LIBIE_RX_DMA_ATTR); - - __free_page(page); + page_pool_dma_sync_full_for_cpu(rx_ring->pool, page); + page_pool_put_full_page(rx_ring->pool, page, false); } rx_ring->next_to_clean = 0; @@ -738,10 +725,15 @@ void iavf_clean_rx_ring(struct iavf_ring *rx_ring) **/ void iavf_free_rx_resources(struct iavf_ring *rx_ring) { + struct device *dev = rx_ring->pool->p.dev; + iavf_clean_rx_ring(rx_ring); kfree(rx_ring->rx_pages); rx_ring->rx_pages = NULL; + page_pool_destroy(rx_ring->pool); + rx_ring->dev = dev; + if (rx_ring->desc) { dma_free_coherent(rx_ring->dev, rx_ring->size, rx_ring->desc, rx_ring->dma); @@ -758,13 +750,15 @@ void iavf_free_rx_resources(struct iavf_ring *rx_ring) int iavf_setup_rx_descriptors(struct iavf_ring *rx_ring) { struct device *dev = rx_ring->dev; + struct page_pool *pool; + int ret = -ENOMEM; /* warn if we are about to overwrite the pointer */ WARN_ON(rx_ring->rx_pages); rx_ring->rx_pages = kcalloc(rx_ring->count, sizeof(*rx_ring->rx_pages), GFP_KERNEL); if (!rx_ring->rx_pages) - return -ENOMEM; + return ret; u64_stats_init(&rx_ring->syncp); @@ -780,15 +774,26 @@ int iavf_setup_rx_descriptors(struct iavf_ring *rx_ring) goto err; } + pool = libie_rx_page_pool_create(rx_ring->netdev, rx_ring->count); + if (IS_ERR(pool)) { + ret = PTR_ERR(pool); + goto err_free_dma; + } + + rx_ring->pool = pool; + rx_ring->next_to_clean = 0; rx_ring->next_to_use = 0; return 0; + +err_free_dma: + dma_free_coherent(dev, rx_ring->size, rx_ring->desc, rx_ring->dma); err: kfree(rx_ring->rx_pages); rx_ring->rx_pages = NULL; - return -ENOMEM; + return ret; } /** @@ -809,40 +814,6 @@ static inline void iavf_release_rx_desc(struct iavf_ring *rx_ring, u32 val) writel(val, rx_ring->tail); } -/** - * iavf_alloc_mapped_page - allocate and map a new page - * @dev: device used for DMA mapping - * @gfp: GFP mask to allocate page - * - * Returns a new &page if the it was successfully allocated, %NULL otherwise. 
- **/ -static struct page *iavf_alloc_mapped_page(struct device *dev, gfp_t gfp) -{ - struct page *page; - dma_addr_t dma; - - /* alloc new page for storage */ - page = __dev_alloc_page(gfp); - if (unlikely(!page)) - return NULL; - - /* map page for use */ - dma = dma_map_page_attrs(dev, page, 0, PAGE_SIZE, DMA_FROM_DEVICE, - LIBIE_RX_DMA_ATTR); - - /* if mapping failed free memory back to system since - * there isn't much point in holding memory we can't use - */ - if (dma_mapping_error(dev, dma)) { - __free_page(page); - return NULL; - } - - page_pool_set_dma_addr(page, dma); - - return page; -} - /** * iavf_receive_skb - Send a completed packet up the stack * @rx_ring: rx ring in play @@ -876,7 +847,7 @@ static void iavf_receive_skb(struct iavf_ring *rx_ring, static u32 __iavf_alloc_rx_pages(struct iavf_ring *rx_ring, u32 to_refill, gfp_t gfp) { - struct device *dev = rx_ring->dev; + struct page_pool *pool = rx_ring->pool; u32 ntu = rx_ring->next_to_use; union iavf_rx_desc *rx_desc; @@ -890,7 +861,7 @@ static u32 __iavf_alloc_rx_pages(struct iavf_ring *rx_ring, u32 to_refill, struct page *page; dma_addr_t dma; - page = iavf_alloc_mapped_page(dev, gfp); + page = page_pool_alloc_pages(pool, gfp); if (!page) { rx_ring->rx_stats.alloc_page_failed++; break; @@ -899,11 +870,6 @@ static u32 __iavf_alloc_rx_pages(struct iavf_ring *rx_ring, u32 to_refill, rx_ring->rx_pages[ntu] = page; dma = page_pool_get_dma_addr(page); - /* sync the buffer for use by the device */ - dma_sync_single_range_for_device(dev, dma, LIBIE_SKB_HEADROOM, - LIBIE_RX_BUF_LEN, - DMA_FROM_DEVICE); - /* Refresh the desc even if buffer_addrs didn't change * because each write-back erases this info. */ @@ -1090,21 +1056,6 @@ static void iavf_add_rx_frag(struct sk_buff *skb, struct page *page, u32 size) LIBIE_SKB_HEADROOM, size, LIBIE_RX_TRUESIZE); } -/** - * iavf_sync_rx_page - Synchronize received data for use - * @dev: device used for DMA mapping - * @page: Rx page containing the data - * @size: size of the received data - * - * This function will synchronize the Rx buffer for use by the CPU. 
- */ -static void iavf_sync_rx_page(struct device *dev, struct page *page, u32 size) -{ - dma_sync_single_range_for_cpu(dev, page_pool_get_dma_addr(page), - LIBIE_SKB_HEADROOM, size, - DMA_FROM_DEVICE); -} - /** * iavf_build_skb - Build skb around an existing buffer * @page: Rx page to with the data @@ -1127,6 +1078,8 @@ static struct sk_buff *iavf_build_skb(struct page *page, u32 size) if (unlikely(!skb)) return NULL; + skb_mark_for_recycle(skb); + /* update pointers within the skb to store the data */ skb_reserve(skb, LIBIE_SKB_HEADROOM); __skb_put(skb, size); @@ -1134,19 +1087,6 @@ static struct sk_buff *iavf_build_skb(struct page *page, u32 size) return skb; } -/** - * iavf_unmap_rx_page - Unmap used page - * @dev: device used for DMA mapping - * @page: page to release - */ -static void iavf_unmap_rx_page(struct device *dev, struct page *page) -{ - dma_unmap_page_attrs(dev, page_pool_get_dma_addr(page), - LIBIE_RX_TRUESIZE, DMA_FROM_DEVICE, - LIBIE_RX_DMA_ATTR); - page_pool_set_dma_addr(page, 0); -} - /** * iavf_is_non_eop - process handling of non-EOP buffers * @rx_ring: Rx ring being processed @@ -1189,8 +1129,8 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) unsigned int total_rx_bytes = 0, total_rx_packets = 0; const gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; u32 to_refill = IAVF_DESC_UNUSED(rx_ring); + struct page_pool *pool = rx_ring->pool; struct sk_buff *skb = rx_ring->skb; - struct device *dev = rx_ring->dev; u32 ntc = rx_ring->next_to_clean; u32 ring_size = rx_ring->count; u32 cleaned_count = 0; @@ -1239,13 +1179,11 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) * stripped by the HW. */ if (unlikely(!size)) { - iavf_unmap_rx_page(dev, page); - __free_page(page); + page_pool_recycle_direct(pool, page); goto skip_data; } - iavf_sync_rx_page(dev, page, size); - iavf_unmap_rx_page(dev, page); + page_pool_dma_sync_for_cpu(pool, page, size); /* retrieve a buffer from the ring */ if (skb) @@ -1255,7 +1193,7 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) /* exit if we failed to retrieve a buffer */ if (!skb) { - __free_page(page); + page_pool_put_page(pool, page, size, true); rx_ring->rx_stats.alloc_buff_failed++; break; } diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.h b/drivers/net/ethernet/intel/iavf/iavf_txrx.h index 25459411000a66..8fbe549ce6a587 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.h +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.h @@ -237,7 +237,10 @@ struct iavf_rx_queue_stats { struct iavf_ring { struct iavf_ring *next; /* pointer to next ring in q_vector */ void *desc; /* Descriptor ring memory */ - struct device *dev; /* Used for DMA mapping */ + union { + struct page_pool *pool; /* Used for Rx page management */ + struct device *dev; /* Used for DMA mapping on Tx */ + }; struct net_device *netdev; /* netdev ring maps to */ union { struct iavf_tx_buffer *tx_bi; diff --git a/drivers/net/ethernet/intel/libie/rx.c b/drivers/net/ethernet/intel/libie/rx.c index f503476d8eeff9..85d024f0a88567 100644 --- a/drivers/net/ethernet/intel/libie/rx.c +++ b/drivers/net/ethernet/intel/libie/rx.c @@ -105,6 +105,37 @@ const struct libie_rx_ptype_parsed libie_rx_ptype_lut[LIBIE_RX_PTYPE_NUM] = { }; EXPORT_SYMBOL_NS_GPL(libie_rx_ptype_lut, LIBIE); +/* Page Pool */ + +/** + * libie_rx_page_pool_create - create a PP with the default libie settings + * @dev: &net_device which a PP will be created for + * @size: size of the PP, usually simply Rx queue len + * + * Returns &page_pool on success, casted 
-errno on failure. + */ +struct page_pool *libie_rx_page_pool_create(const struct net_device *dev, + u32 size) +{ + const struct page_pool_params pp = { + .flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_MAP_WEAK | + PP_FLAG_DMA_SYNC_DEV, + .order = LIBIE_RX_PAGE_ORDER, + .pool_size = size, + .nid = NUMA_NO_NODE, + .dev = dev->dev.parent, + .dma_dir = DMA_FROM_DEVICE, + .max_len = LIBIE_RX_BUF_LEN, + .offset = LIBIE_SKB_HEADROOM, + }; + + static_assert((PP_FLAG_DMA_MAP | PP_FLAG_DMA_MAP_WEAK) == + LIBIE_RX_DMA_ATTR); + + return page_pool_create(&pp); +} +EXPORT_SYMBOL_NS_GPL(libie_rx_page_pool_create, LIBIE); + MODULE_AUTHOR("Intel Corporation"); MODULE_DESCRIPTION("Intel(R) Ethernet common library"); MODULE_LICENSE("GPL"); diff --git a/include/linux/net/intel/libie/rx.h b/include/linux/net/intel/libie/rx.h index 9c9db68d3f3f61..44eafbd04a7c22 100644 --- a/include/linux/net/intel/libie/rx.h +++ b/include/linux/net/intel/libie/rx.h @@ -164,4 +164,7 @@ static inline void libie_skb_set_hash(struct sk_buff *skb, u32 hash, #define LIBIE_RX_DMA_ATTR \ (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING) +struct page_pool *libie_rx_page_pool_create(const struct net_device *dev, + u32 size); + #endif /* __LIBIE_RX_H */ From 3d884e36aea0e2defa3f17aee9537ac8e7d7e678 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Tue, 14 Mar 2023 17:40:57 +0100 Subject: [PATCH 09/40] libie: add common queue stats Next stop, per-queue private stats. They have only subtle differences from driver to driver and can easily be resolved. Define common structures, inline helpers and Ethtool helpers to collect, update and export the statistics. Use u64_stats_t right from the start, as well as the corresponding helpers to ensure tear-free operations. For the NAPI parts of both Rx and Tx, also define small onstack containers to update them in polling loops and then sync the actual containers once a loop ends. The drivers will be switched to use this API later on a per-driver basis, along with conversion to PP. Signed-off-by: Alexander Lobakin --- drivers/net/ethernet/intel/libie/Makefile | 1 + drivers/net/ethernet/intel/libie/stats.c | 119 ++++++++++++++ include/linux/net/intel/libie/stats.h | 179 ++++++++++++++++++++++ 3 files changed, 299 insertions(+) create mode 100644 drivers/net/ethernet/intel/libie/stats.c create mode 100644 include/linux/net/intel/libie/stats.h diff --git a/drivers/net/ethernet/intel/libie/Makefile b/drivers/net/ethernet/intel/libie/Makefile index 95e81d09b4746c..76f32253481b70 100644 --- a/drivers/net/ethernet/intel/libie/Makefile +++ b/drivers/net/ethernet/intel/libie/Makefile @@ -4,3 +4,4 @@ obj-$(CONFIG_LIBIE) += libie.o libie-objs += rx.o +libie-objs += stats.o diff --git a/drivers/net/ethernet/intel/libie/stats.c b/drivers/net/ethernet/intel/libie/stats.c new file mode 100644 index 00000000000000..61456842a36211 --- /dev/null +++ b/drivers/net/ethernet/intel/libie/stats.c @@ -0,0 +1,119 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2023 Intel Corporation. */ + +#include +#include + +/* Rx per-queue stats */ + +static const char * const libie_rq_stats_str[] = { +#define act(s) __stringify(s), + DECLARE_LIBIE_RQ_STATS(act) +#undef act +}; + +#define LIBIE_RQ_STATS_NUM ARRAY_SIZE(libie_rq_stats_str) + +/** + * libie_rq_stats_get_sset_count - get the number of Ethtool RQ stats provided + * + * Returns the number of per-queue Rx stats supported by the library. 
+ */ +u32 libie_rq_stats_get_sset_count(void) +{ + return LIBIE_RQ_STATS_NUM; +} +EXPORT_SYMBOL_NS_GPL(libie_rq_stats_get_sset_count, LIBIE); + +/** + * libie_rq_stats_get_strings - get the name strings of Ethtool RQ stats + * @data: reference to the cursor pointing to the output buffer + * @qid: RQ number to print in the prefix + */ +void libie_rq_stats_get_strings(u8 **data, u32 qid) +{ + for (u32 i = 0; i < LIBIE_RQ_STATS_NUM; i++) + ethtool_sprintf(data, "rq%u_%s", qid, libie_rq_stats_str[i]); +} +EXPORT_SYMBOL_NS_GPL(libie_rq_stats_get_strings, LIBIE); + +/** + * libie_rq_stats_get_data - get the RQ stats in Ethtool format + * @data: reference to the cursor pointing to the output array + * @stats: RQ stats container from the queue + */ +void libie_rq_stats_get_data(u64 **data, const struct libie_rq_stats *stats) +{ + u64 sarr[LIBIE_RQ_STATS_NUM]; + u32 start; + + do { + start = u64_stats_fetch_begin(&stats->syncp); + + for (u32 i = 0; i < LIBIE_RQ_STATS_NUM; i++) + sarr[i] = u64_stats_read(&stats->raw[i]); + } while (u64_stats_fetch_retry(&stats->syncp, start)); + + for (u32 i = 0; i < LIBIE_RQ_STATS_NUM; i++) + (*data)[i] += sarr[i]; + + *data += LIBIE_RQ_STATS_NUM; +} +EXPORT_SYMBOL_NS_GPL(libie_rq_stats_get_data, LIBIE); + +/* Tx per-queue stats */ + +static const char * const libie_sq_stats_str[] = { +#define act(s) __stringify(s), + DECLARE_LIBIE_SQ_STATS(act) +#undef act +}; + +#define LIBIE_SQ_STATS_NUM ARRAY_SIZE(libie_sq_stats_str) + +/** + * libie_sq_stats_get_sset_count - get the number of Ethtool SQ stats provided + * + * Returns the number of per-queue Tx stats supported by the library. + */ +u32 libie_sq_stats_get_sset_count(void) +{ + return LIBIE_SQ_STATS_NUM; +} +EXPORT_SYMBOL_NS_GPL(libie_sq_stats_get_sset_count, LIBIE); + +/** + * libie_sq_stats_get_strings - get the name strings of Ethtool SQ stats + * @data: reference to the cursor pointing to the output buffer + * @qid: SQ number to print in the prefix + */ +void libie_sq_stats_get_strings(u8 **data, u32 qid) +{ + for (u32 i = 0; i < LIBIE_SQ_STATS_NUM; i++) + ethtool_sprintf(data, "sq%u_%s", qid, libie_sq_stats_str[i]); +} +EXPORT_SYMBOL_NS_GPL(libie_sq_stats_get_strings, LIBIE); + +/** + * libie_sq_stats_get_data - get the SQ stats in Ethtool format + * @data: reference to the cursor pointing to the output array + * @stats: SQ stats container from the queue + */ +void libie_sq_stats_get_data(u64 **data, const struct libie_sq_stats *stats) +{ + u64 sarr[LIBIE_SQ_STATS_NUM]; + u32 start; + + do { + start = u64_stats_fetch_begin(&stats->syncp); + + for (u32 i = 0; i < LIBIE_SQ_STATS_NUM; i++) + sarr[i] = u64_stats_read(&stats->raw[i]); + } while (u64_stats_fetch_retry(&stats->syncp, start)); + + for (u32 i = 0; i < LIBIE_SQ_STATS_NUM; i++) + (*data)[i] += sarr[i]; + + *data += LIBIE_SQ_STATS_NUM; +} +EXPORT_SYMBOL_NS_GPL(libie_sq_stats_get_data, LIBIE); diff --git a/include/linux/net/intel/libie/stats.h b/include/linux/net/intel/libie/stats.h new file mode 100644 index 00000000000000..dbbc98bbd3a70c --- /dev/null +++ b/include/linux/net/intel/libie/stats.h @@ -0,0 +1,179 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright(c) 2023 Intel Corporation. 
*/ + +#ifndef __LIBIE_STATS_H +#define __LIBIE_STATS_H + +#include + +/* Common */ + +/* Use 32-byte alignment to reduce false sharing */ +#define __libie_stats_aligned __aligned(4 * sizeof(u64_stats_t)) + +/** + * libie_stats_add - update one structure counter from a local struct + * @qs: queue stats structure to update (&libie_rq_stats or &libie_sq_stats) + * @ss: local/onstack stats structure + * @f: name of the field to update + * + * If a local/onstack stats structure is used to collect statistics during + * hotpath loops, this macro can be used to shorthand updates, given that + * the fields have the same name. + * Must be guarded with u64_stats_update_{begin,end}(). + */ +#define libie_stats_add(qs, ss, f) \ + u64_stats_add(&(qs)->f, (ss)->f) + +/** + * __libie_stats_inc_one - safely increment one stats structure counter + * @s: queue stats structure to update (&libie_rq_stats or &libie_sq_stats) + * @f: name of the field to increment + * @n: name of the temporary variable, result of __UNIQUE_ID() + * + * To be used on exception or slow paths -- allocation fails, queue stops etc. + */ +#define __libie_stats_inc_one(s, f, n) ({ \ + typeof(*(s)) *n = (s); \ + \ + u64_stats_update_begin(&n->syncp); \ + u64_stats_inc(&n->f); \ + u64_stats_update_end(&n->syncp); \ +}) +#define libie_stats_inc_one(s, f) \ + __libie_stats_inc_one(s, f, __UNIQUE_ID(qs_)) + +/* Rx per-queue stats: + * packets: packets received on this queue + * bytes: bytes received on this queue + * fragments: number of processed descriptors carrying only a fragment + * alloc_page_fail: number of Rx page allocation fails + * build_skb_fail: number of build_skb() fails + */ + +#define DECLARE_LIBIE_RQ_NAPI_STATS(act) \ + act(packets) \ + act(bytes) \ + act(fragments) + +#define DECLARE_LIBIE_RQ_FAIL_STATS(act) \ + act(alloc_page_fail) \ + act(build_skb_fail) + +#define DECLARE_LIBIE_RQ_STATS(act) \ + DECLARE_LIBIE_RQ_NAPI_STATS(act) \ + DECLARE_LIBIE_RQ_FAIL_STATS(act) + +struct libie_rq_stats { + struct u64_stats_sync syncp; + + union { + struct { +#define act(s) u64_stats_t s; + DECLARE_LIBIE_RQ_NAPI_STATS(act); + DECLARE_LIBIE_RQ_FAIL_STATS(act); +#undef act + }; + DECLARE_FLEX_ARRAY(u64_stats_t, raw); + }; +} __libie_stats_aligned; + +/* Rx stats being modified frequently during the NAPI polling, to sync them + * with the queue stats once after the loop is finished. 
+ */ +struct libie_rq_onstack_stats { + union { + struct { +#define act(s) u32 s; + DECLARE_LIBIE_RQ_NAPI_STATS(act); +#undef act + }; + DECLARE_FLEX_ARRAY(u32, raw); + }; +}; + +/** + * libie_rq_napi_stats_add - add onstack Rx stats to the queue container + * @qs: Rx queue stats structure to update + * @ss: onstack structure to get the values from, updated during the NAPI loop + */ +static inline void +libie_rq_napi_stats_add(struct libie_rq_stats *qs, + const struct libie_rq_onstack_stats *ss) +{ + u64_stats_update_begin(&qs->syncp); + libie_stats_add(qs, ss, packets); + libie_stats_add(qs, ss, bytes); + libie_stats_add(qs, ss, fragments); + u64_stats_update_end(&qs->syncp); +} + +u32 libie_rq_stats_get_sset_count(void); +void libie_rq_stats_get_strings(u8 **data, u32 qid); +void libie_rq_stats_get_data(u64 **data, const struct libie_rq_stats *stats); + +/* Tx per-queue stats: + * packets: packets sent from this queue + * bytes: bytes sent from this queue + * busy: number of xmit failures due to the ring being full + * stops: number times the ring was stopped from the driver + * restarts: number times it was started after being stopped + * linearized: number of skbs linearized due to HW limits + */ + +#define DECLARE_LIBIE_SQ_NAPI_STATS(act) \ + act(packets) \ + act(bytes) + +#define DECLARE_LIBIE_SQ_XMIT_STATS(act) \ + act(busy) \ + act(stops) \ + act(restarts) \ + act(linearized) + +#define DECLARE_LIBIE_SQ_STATS(act) \ + DECLARE_LIBIE_SQ_NAPI_STATS(act) \ + DECLARE_LIBIE_SQ_XMIT_STATS(act) + +struct libie_sq_stats { + struct u64_stats_sync syncp; + + union { + struct { +#define act(s) u64_stats_t s; + DECLARE_LIBIE_SQ_STATS(act); +#undef act + }; + DECLARE_FLEX_ARRAY(u64_stats_t, raw); + }; +} __libie_stats_aligned; + +struct libie_sq_onstack_stats { +#define act(s) u32 s; + DECLARE_LIBIE_SQ_NAPI_STATS(act); +#undef act +}; + +/** + * libie_sq_napi_stats_add - add onstack Tx stats to the queue container + * @qs: Tx queue stats structure to update + * @ss: onstack structure to get the values from, updated during the NAPI loop + */ +static inline void +libie_sq_napi_stats_add(struct libie_sq_stats *qs, + const struct libie_sq_onstack_stats *ss) +{ + if (unlikely(!ss->packets)) + return; + + u64_stats_update_begin(&qs->syncp); + libie_stats_add(qs, ss, packets); + libie_stats_add(qs, ss, bytes); + u64_stats_update_end(&qs->syncp); +} + +u32 libie_sq_stats_get_sset_count(void); +void libie_sq_stats_get_strings(u8 **data, u32 qid); +void libie_sq_stats_get_data(u64 **data, const struct libie_sq_stats *stats); + +#endif /* __LIBIE_STATS_H */ From 16d126cbb22422dada94398928c76bb2e1571d75 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Thu, 16 Mar 2023 20:12:55 +0100 Subject: [PATCH 10/40] libie: add per-queue Page Pool stats Expand the libie generic per-queue stats with the generic Page Pool stats provided by the API itself, when CONFIG_PAGE_POOL is enable. When it's not, there'll be no such fields in the stats structure, so no space wasted. They are also a bit special in terms of how they are obtained. One &page_pool accumulates statistics until it's destroyed obviously, which happens on ifdown. So, in order to not lose any statistics, get the stats and store in the queue container before destroying a pool. This container survives ifups/downs, so it basically stores the statistics accumulated since the very first pool was allocated on this queue. 
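To make the intended driver-side flow concrete, here is a hedged sketch (the ring structure and its field names are made up for illustration; only libie_rx_page_pool_destroy(), added below in this patch, is real): on ifdown the pool is torn down through the libie helper so its counters get folded into the persistent per-queue container first.

/* Illustrative only: an imaginary driver's Rx ring teardown.
 * 'struct example_rx_ring' and its members are assumptions, not code
 * from any driver converted in this series.
 */
static void example_rx_ring_free(struct example_rx_ring *ring)
{
	/* Fold the pool's accumulated stats into ring->rq_stats, which
	 * has the same lifetime as the netdev, then destroy the pool.
	 */
	libie_rx_page_pool_destroy(ring->pool, &ring->rq_stats);
	ring->pool = NULL;
}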
When it's needed to export the stats, first get the numbers from this container and then add the "live" numbers -- the ones that the current active pool returns. The result values will always represent the actual device-lifetime* stats. There's a cast from &page_pool_stats to `u64 *` in a couple functions, but they are guarded with stats asserts to make sure it's safe to do. FWIW it saves a lot of object code. Signed-off-by: Alexander Lobakin --- drivers/net/ethernet/intel/libie/internal.h | 23 +++++++ drivers/net/ethernet/intel/libie/rx.c | 20 ++++++ drivers/net/ethernet/intel/libie/stats.c | 72 ++++++++++++++++++++- include/linux/net/intel/libie/rx.h | 4 ++ include/linux/net/intel/libie/stats.h | 39 ++++++++++- 5 files changed, 155 insertions(+), 3 deletions(-) create mode 100644 drivers/net/ethernet/intel/libie/internal.h diff --git a/drivers/net/ethernet/intel/libie/internal.h b/drivers/net/ethernet/intel/libie/internal.h new file mode 100644 index 00000000000000..083398dc37c63d --- /dev/null +++ b/drivers/net/ethernet/intel/libie/internal.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* libie internal declarations not to be used in drivers. + * + * Copyright(c) 2023 Intel Corporation. + */ + +#ifndef __LIBIE_INTERNAL_H +#define __LIBIE_INTERNAL_H + +struct libie_rq_stats; +struct page_pool; + +#ifdef CONFIG_PAGE_POOL_STATS +void libie_rq_stats_sync_pp(struct libie_rq_stats *stats, + struct page_pool *pool); +#else +static inline void libie_rq_stats_sync_pp(struct libie_rq_stats *stats, + struct page_pool *pool) +{ +} +#endif + +#endif /* __LIBIE_INTERNAL_H */ diff --git a/drivers/net/ethernet/intel/libie/rx.c b/drivers/net/ethernet/intel/libie/rx.c index 85d024f0a88567..10ef8741326ad2 100644 --- a/drivers/net/ethernet/intel/libie/rx.c +++ b/drivers/net/ethernet/intel/libie/rx.c @@ -3,6 +3,8 @@ #include +#include "internal.h" + /* O(1) converting i40e/ice/iavf's 8/10-bit hardware packet type to a parsed * bitfield struct. */ @@ -136,6 +138,24 @@ struct page_pool *libie_rx_page_pool_create(const struct net_device *dev, } EXPORT_SYMBOL_NS_GPL(libie_rx_page_pool_create, LIBIE); +/** + * libie_rx_page_pool_destroy - destroy a &page_pool created by libie + * @pool: pool to destroy + * @stats: RQ stats from the ring (or %NULL to skip updating PP stats) + * + * As the stats usually has the same lifetime as the device, but PP is usually + * created/destroyed on ifup/ifdown, in order to not lose the stats accumulated + * during the last ifup, the PP stats need to be added to the driver stats + * container. Then the PP gets destroyed. 
+ */ +void libie_rx_page_pool_destroy(struct page_pool *pool, + struct libie_rq_stats *stats) +{ + libie_rq_stats_sync_pp(stats, pool); + page_pool_destroy(pool); +} +EXPORT_SYMBOL_NS_GPL(libie_rx_page_pool_destroy, LIBIE); + MODULE_AUTHOR("Intel Corporation"); MODULE_DESCRIPTION("Intel(R) Ethernet common library"); MODULE_LICENSE("GPL"); diff --git a/drivers/net/ethernet/intel/libie/stats.c b/drivers/net/ethernet/intel/libie/stats.c index 61456842a36211..95bbb38c39e348 100644 --- a/drivers/net/ethernet/intel/libie/stats.c +++ b/drivers/net/ethernet/intel/libie/stats.c @@ -4,6 +4,8 @@ #include #include +#include "internal.h" + /* Rx per-queue stats */ static const char * const libie_rq_stats_str[] = { @@ -14,6 +16,70 @@ static const char * const libie_rq_stats_str[] = { #define LIBIE_RQ_STATS_NUM ARRAY_SIZE(libie_rq_stats_str) +#ifdef CONFIG_PAGE_POOL_STATS +/** + * libie_rq_stats_get_pp - get the current stats from a &page_pool + * @sarr: local array to add stats to + * @pool: pool to get the stats from + * + * Adds the current "live" stats from an online PP to the stats read from + * the RQ container, so that the actual totals will be returned. + */ +static void libie_rq_stats_get_pp(u64 *sarr, struct page_pool *pool) +{ + struct page_pool_stats *pps; + /* Used only to calculate pos below */ + struct libie_rq_stats tmp; + u32 pos; + + /* Validate the libie PP stats array can be casted <-> PP struct */ + static_assert(sizeof(tmp.pp) == sizeof(*pps)); + + if (!pool) + return; + + /* Position of the first Page Pool stats field */ + pos = (u64_stats_t *)&tmp.pp - tmp.raw; + pps = (typeof(pps))&sarr[pos]; + + page_pool_get_stats(pool, pps); +} + +/** + * libie_rq_stats_sync_pp - add the current PP stats to the RQ stats container + * @stats: stats structure to update + * @pool: pool to read the stats + * + * Called by libie_rx_page_pool_destroy() to save the stats before destroying + * the pool. 
+ */ +void libie_rq_stats_sync_pp(struct libie_rq_stats *stats, + struct page_pool *pool) +{ + u64_stats_t *qarr = (u64_stats_t *)&stats->pp; + struct page_pool_stats pps = { }; + u64 *sarr = (u64 *)&pps; + + if (!stats) + return; + + page_pool_get_stats(pool, &pps); + + u64_stats_update_begin(&stats->syncp); + + for (u32 i = 0; i < sizeof(pps) / sizeof(*sarr); i++) + u64_stats_add(&qarr[i], sarr[i]); + + u64_stats_update_end(&stats->syncp); +} +#else +static inline void libie_rq_stats_get_pp(u64 *sarr, struct page_pool *pool) +{ +} + +/* static inline void libie_rq_stats_sync_pp() is declared in "internal.h" */ +#endif + /** * libie_rq_stats_get_sset_count - get the number of Ethtool RQ stats provided * @@ -41,8 +107,10 @@ EXPORT_SYMBOL_NS_GPL(libie_rq_stats_get_strings, LIBIE); * libie_rq_stats_get_data - get the RQ stats in Ethtool format * @data: reference to the cursor pointing to the output array * @stats: RQ stats container from the queue + * @pool: &page_pool from the queue (%NULL to ignore PP "live" stats) */ -void libie_rq_stats_get_data(u64 **data, const struct libie_rq_stats *stats) +void libie_rq_stats_get_data(u64 **data, const struct libie_rq_stats *stats, + struct page_pool *pool) { u64 sarr[LIBIE_RQ_STATS_NUM]; u32 start; @@ -54,6 +122,8 @@ void libie_rq_stats_get_data(u64 **data, const struct libie_rq_stats *stats) sarr[i] = u64_stats_read(&stats->raw[i]); } while (u64_stats_fetch_retry(&stats->syncp, start)); + libie_rq_stats_get_pp(sarr, pool); + for (u32 i = 0; i < LIBIE_RQ_STATS_NUM; i++) (*data)[i] += sarr[i]; diff --git a/include/linux/net/intel/libie/rx.h b/include/linux/net/intel/libie/rx.h index 44eafbd04a7c22..f063a30f182ecb 100644 --- a/include/linux/net/intel/libie/rx.h +++ b/include/linux/net/intel/libie/rx.h @@ -164,7 +164,11 @@ static inline void libie_skb_set_hash(struct sk_buff *skb, u32 hash, #define LIBIE_RX_DMA_ATTR \ (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING) +struct libie_rq_stats; + struct page_pool *libie_rx_page_pool_create(const struct net_device *dev, u32 size); +void libie_rx_page_pool_destroy(struct page_pool *pool, + struct libie_rq_stats *stats); #endif /* __LIBIE_RX_H */ diff --git a/include/linux/net/intel/libie/stats.h b/include/linux/net/intel/libie/stats.h index dbbc98bbd3a70c..23ca0079a90586 100644 --- a/include/linux/net/intel/libie/stats.h +++ b/include/linux/net/intel/libie/stats.h @@ -49,6 +49,17 @@ * fragments: number of processed descriptors carrying only a fragment * alloc_page_fail: number of Rx page allocation fails * build_skb_fail: number of build_skb() fails + * pp_alloc_fast: pages taken from the cache or ring + * pp_alloc_slow: actual page allocations + * pp_alloc_slow_ho: non-order-0 page allocations + * pp_alloc_empty: number of times the pool was empty + * pp_alloc_refill: number of cache refills + * pp_alloc_waive: NUMA node mismatches during recycling + * pp_recycle_cached: direct recyclings into the cache + * pp_recycle_cache_full: number of times the cache was full + * pp_recycle_ring: recyclings into the ring + * pp_recycle_ring_full: number of times the ring was full + * pp_recycle_released_ref: pages released due to elevated refcnt */ #define DECLARE_LIBIE_RQ_NAPI_STATS(act) \ @@ -60,9 +71,29 @@ act(alloc_page_fail) \ act(build_skb_fail) +#ifdef CONFIG_PAGE_POOL_STATS +#define DECLARE_LIBIE_RQ_PP_STATS(act) \ + act(pp_alloc_fast) \ + act(pp_alloc_slow) \ + act(pp_alloc_slow_ho) \ + act(pp_alloc_empty) \ + act(pp_alloc_refill) \ + act(pp_alloc_waive) \ + act(pp_recycle_cached) \ + 
act(pp_recycle_cache_full) \ + act(pp_recycle_ring) \ + act(pp_recycle_ring_full) \ + act(pp_recycle_released_ref) +#else +#define DECLARE_LIBIE_RQ_PP_STATS(act) +#endif + #define DECLARE_LIBIE_RQ_STATS(act) \ DECLARE_LIBIE_RQ_NAPI_STATS(act) \ - DECLARE_LIBIE_RQ_FAIL_STATS(act) + DECLARE_LIBIE_RQ_FAIL_STATS(act) \ + DECLARE_LIBIE_RQ_PP_STATS(act) + +struct page_pool; struct libie_rq_stats { struct u64_stats_sync syncp; @@ -72,6 +103,9 @@ struct libie_rq_stats { #define act(s) u64_stats_t s; DECLARE_LIBIE_RQ_NAPI_STATS(act); DECLARE_LIBIE_RQ_FAIL_STATS(act); + struct_group(pp, + DECLARE_LIBIE_RQ_PP_STATS(act); + ); #undef act }; DECLARE_FLEX_ARRAY(u64_stats_t, raw); @@ -110,7 +144,8 @@ libie_rq_napi_stats_add(struct libie_rq_stats *qs, u32 libie_rq_stats_get_sset_count(void); void libie_rq_stats_get_strings(u8 **data, u32 qid); -void libie_rq_stats_get_data(u64 **data, const struct libie_rq_stats *stats); +void libie_rq_stats_get_data(u64 **data, const struct libie_rq_stats *stats, + struct page_pool *pool); /* Tx per-queue stats: * packets: packets sent from this queue From 1f934b6e9745525e418c394d2741d2cce434f152 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Wed, 15 Mar 2023 17:50:17 +0100 Subject: [PATCH 11/40] iavf: switch queue stats to libie iavf is pretty much ready for using the generic libie stats, so drop all the custom code and just use generic definitions. The only thing is that it previously lacked the counter of Tx queue stops. It's present in the other drivers, so add it here as well. The rest is straightforward. There were two fields in the Tx stats struct, which didn't belong there. The first one has never been used, wipe it; and move the other to the queue structure. Plus move around a couple fields in &iavf_ring to account stats structs' alignment. Signed-off-by: Alexander Lobakin --- .../net/ethernet/intel/iavf/iavf_ethtool.c | 87 ++++------------ drivers/net/ethernet/intel/iavf/iavf_main.c | 2 + drivers/net/ethernet/intel/iavf/iavf_txrx.c | 98 ++++++++++--------- drivers/net/ethernet/intel/iavf/iavf_txrx.h | 47 +++------ 4 files changed, 87 insertions(+), 147 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c index de3050c02b6ffc..0dcf50d75f8614 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c +++ b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c @@ -46,16 +46,6 @@ struct iavf_stats { .stat_offset = offsetof(_type, _stat) \ } -/* Helper macro for defining some statistics related to queues */ -#define IAVF_QUEUE_STAT(_name, _stat) \ - IAVF_STAT(struct iavf_ring, _name, _stat) - -/* Stats associated with a Tx or Rx ring */ -static const struct iavf_stats iavf_gstrings_queue_stats[] = { - IAVF_QUEUE_STAT("%s-%u.packets", stats.packets), - IAVF_QUEUE_STAT("%s-%u.bytes", stats.bytes), -}; - /** * iavf_add_one_ethtool_stat - copy the stat into the supplied buffer * @data: location to store the stat value @@ -141,43 +131,6 @@ __iavf_add_ethtool_stats(u64 **data, void *pointer, #define iavf_add_ethtool_stats(data, pointer, stats) \ __iavf_add_ethtool_stats(data, pointer, stats, ARRAY_SIZE(stats)) -/** - * iavf_add_queue_stats - copy queue statistics into supplied buffer - * @data: ethtool stats buffer - * @ring: the ring to copy - * - * Queue statistics must be copied while protected by - * u64_stats_fetch_begin, so we can't directly use iavf_add_ethtool_stats. - * Assumes that queue stats are defined in iavf_gstrings_queue_stats. 
If the - * ring pointer is null, zero out the queue stat values and update the data - * pointer. Otherwise safely copy the stats from the ring into the supplied - * buffer and update the data pointer when finished. - * - * This function expects to be called while under rcu_read_lock(). - **/ -static void -iavf_add_queue_stats(u64 **data, struct iavf_ring *ring) -{ - const unsigned int size = ARRAY_SIZE(iavf_gstrings_queue_stats); - const struct iavf_stats *stats = iavf_gstrings_queue_stats; - unsigned int start; - unsigned int i; - - /* To avoid invalid statistics values, ensure that we keep retrying - * the copy until we get a consistent value according to - * u64_stats_fetch_retry. But first, make sure our ring is - * non-null before attempting to access its syncp. - */ - do { - start = !ring ? 0 : u64_stats_fetch_begin(&ring->syncp); - for (i = 0; i < size; i++) - iavf_add_one_ethtool_stat(&(*data)[i], ring, &stats[i]); - } while (ring && u64_stats_fetch_retry(&ring->syncp, start)); - - /* Once we successfully copy the stats in, update the data pointer */ - *data += size; -} - /** * __iavf_add_stat_strings - copy stat strings into ethtool buffer * @p: ethtool supplied buffer @@ -237,8 +190,6 @@ static const struct iavf_stats iavf_gstrings_stats[] = { #define IAVF_STATS_LEN ARRAY_SIZE(iavf_gstrings_stats) -#define IAVF_QUEUE_STATS_LEN ARRAY_SIZE(iavf_gstrings_queue_stats) - /** * iavf_get_link_ksettings - Get Link Speed and Duplex settings * @netdev: network interface device structure @@ -308,18 +259,22 @@ static int iavf_get_link_ksettings(struct net_device *netdev, **/ static int iavf_get_sset_count(struct net_device *netdev, int sset) { - /* Report the maximum number queues, even if not every queue is - * currently configured. Since allocation of queues is in pairs, - * use netdev->real_num_tx_queues * 2. The real_num_tx_queues is set - * at device creation and never changes. - */ + u32 num; - if (sset == ETH_SS_STATS) - return IAVF_STATS_LEN + - (IAVF_QUEUE_STATS_LEN * 2 * - netdev->real_num_tx_queues); - else + switch (sset) { + case ETH_SS_STATS: + /* Per-queue */ + num = libie_rq_stats_get_sset_count(); + num += libie_sq_stats_get_sset_count(); + num *= netdev->real_num_tx_queues; + + /* Global */ + num += IAVF_STATS_LEN; + + return num; + default: return -EINVAL; + } } /** @@ -346,15 +301,15 @@ static void iavf_get_ethtool_stats(struct net_device *netdev, * it to iterate over rings' stats. */ for (i = 0; i < adapter->num_active_queues; i++) { - struct iavf_ring *ring; + const struct iavf_ring *ring; /* Tx rings stats */ - ring = &adapter->tx_rings[i]; - iavf_add_queue_stats(&data, ring); + libie_sq_stats_get_data(&data, &adapter->tx_rings[i].sq_stats); /* Rx rings stats */ ring = &adapter->rx_rings[i]; - iavf_add_queue_stats(&data, ring); + libie_rq_stats_get_data(&data, &ring->rq_stats, + ring->rx_pages ? ring->pool : NULL); } rcu_read_unlock(); } @@ -376,10 +331,8 @@ static void iavf_get_stat_strings(struct net_device *netdev, u8 *data) * real_num_tx_queues for both Tx and Rx queues. 
*/ for (i = 0; i < netdev->real_num_tx_queues; i++) { - iavf_add_stat_strings(&data, iavf_gstrings_queue_stats, - "tx", i); - iavf_add_stat_strings(&data, iavf_gstrings_queue_stats, - "rx", i); + libie_sq_stats_get_strings(&data, i); + libie_rq_stats_get_strings(&data, i); } } diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index fb2bd1c423a158..60463b3edfacf2 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -1587,6 +1587,7 @@ static int iavf_alloc_queues(struct iavf_adapter *adapter) tx_ring->itr_setting = IAVF_ITR_TX_DEF; if (adapter->flags & IAVF_FLAG_WB_ON_ITR_CAPABLE) tx_ring->flags |= IAVF_TXR_FLAGS_WB_ON_ITR; + u64_stats_init(&tx_ring->sq_stats.syncp); rx_ring = &adapter->rx_rings[i]; rx_ring->queue_index = i; @@ -1594,6 +1595,7 @@ static int iavf_alloc_queues(struct iavf_adapter *adapter) rx_ring->dev = &adapter->pdev->dev; rx_ring->count = adapter->rx_desc_count; rx_ring->itr_setting = IAVF_ITR_RX_DEF; + u64_stats_init(&rx_ring->rq_stats.syncp); } adapter->num_active_queues = num_active_queues; diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c index 5d087f9b38ed47..ab4863f86a3c3e 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c @@ -158,6 +158,9 @@ void iavf_detect_recover_hung(struct iavf_vsi *vsi) for (i = 0; i < vsi->back->num_active_queues; i++) { tx_ring = &vsi->back->tx_rings[i]; if (tx_ring && tx_ring->desc) { + const struct libie_sq_stats *st = &tx_ring->sq_stats; + u32 start; + /* If packet counter has not changed the queue is * likely stalled, so force an interrupt for this * queue. @@ -165,8 +168,13 @@ void iavf_detect_recover_hung(struct iavf_vsi *vsi) * prev_pkt_ctr would be negative if there was no * pending work. */ - packets = tx_ring->stats.packets & INT_MAX; - if (tx_ring->tx_stats.prev_pkt_ctr == packets) { + do { + start = u64_stats_fetch_begin(&st->syncp); + packets = u64_stats_read(&st->packets) & + INT_MAX; + } while (u64_stats_fetch_retry(&st->syncp, start)); + + if (tx_ring->prev_pkt_ctr == packets) { iavf_force_wb(vsi, tx_ring->q_vector); continue; } @@ -175,7 +183,7 @@ void iavf_detect_recover_hung(struct iavf_vsi *vsi) * to iavf_get_tx_pending() */ smp_rmb(); - tx_ring->tx_stats.prev_pkt_ctr = + tx_ring->prev_pkt_ctr = iavf_get_tx_pending(tx_ring, true) ? 
packets : -1; } } @@ -194,10 +202,10 @@ void iavf_detect_recover_hung(struct iavf_vsi *vsi) static bool iavf_clean_tx_irq(struct iavf_vsi *vsi, struct iavf_ring *tx_ring, int napi_budget) { + struct libie_sq_onstack_stats stats = { }; int i = tx_ring->next_to_clean; struct iavf_tx_buffer *tx_buf; struct iavf_tx_desc *tx_desc; - unsigned int total_bytes = 0, total_packets = 0; unsigned int budget = IAVF_DEFAULT_IRQ_WORK; tx_buf = &tx_ring->tx_bi[i]; @@ -224,8 +232,8 @@ static bool iavf_clean_tx_irq(struct iavf_vsi *vsi, tx_buf->next_to_watch = NULL; /* update the statistics for this packet */ - total_bytes += tx_buf->bytecount; - total_packets += tx_buf->gso_segs; + stats.bytes += tx_buf->bytecount; + stats.packets += tx_buf->gso_segs; /* free the skb */ napi_consume_skb(tx_buf->skb, napi_budget); @@ -282,12 +290,9 @@ static bool iavf_clean_tx_irq(struct iavf_vsi *vsi, i += tx_ring->count; tx_ring->next_to_clean = i; - u64_stats_update_begin(&tx_ring->syncp); - tx_ring->stats.bytes += total_bytes; - tx_ring->stats.packets += total_packets; - u64_stats_update_end(&tx_ring->syncp); - tx_ring->q_vector->tx.total_bytes += total_bytes; - tx_ring->q_vector->tx.total_packets += total_packets; + libie_sq_napi_stats_add(&tx_ring->sq_stats, &stats); + tx_ring->q_vector->tx.total_bytes += stats.bytes; + tx_ring->q_vector->tx.total_packets += stats.packets; if (tx_ring->flags & IAVF_TXR_FLAGS_WB_ON_ITR) { /* check to see if there are < 4 descriptors @@ -306,10 +311,10 @@ static bool iavf_clean_tx_irq(struct iavf_vsi *vsi, /* notify netdev of completed buffers */ netdev_tx_completed_queue(txring_txq(tx_ring), - total_packets, total_bytes); + stats.packets, stats.bytes); #define TX_WAKE_THRESHOLD ((s16)(DESC_NEEDED * 2)) - if (unlikely(total_packets && netif_carrier_ok(tx_ring->netdev) && + if (unlikely(stats.packets && netif_carrier_ok(tx_ring->netdev) && (IAVF_DESC_UNUSED(tx_ring) >= TX_WAKE_THRESHOLD))) { /* Make sure that anybody stopping the queue after this * sees the new next_to_clean. 
@@ -320,7 +325,7 @@ static bool iavf_clean_tx_irq(struct iavf_vsi *vsi, !test_bit(__IAVF_VSI_DOWN, vsi->state)) { netif_wake_subqueue(tx_ring->netdev, tx_ring->queue_index); - ++tx_ring->tx_stats.restart_queue; + libie_stats_inc_one(&tx_ring->sq_stats, restarts); } } @@ -675,7 +680,7 @@ int iavf_setup_tx_descriptors(struct iavf_ring *tx_ring) tx_ring->next_to_use = 0; tx_ring->next_to_clean = 0; - tx_ring->tx_stats.prev_pkt_ctr = -1; + tx_ring->prev_pkt_ctr = -1; return 0; err: @@ -731,7 +736,7 @@ void iavf_free_rx_resources(struct iavf_ring *rx_ring) kfree(rx_ring->rx_pages); rx_ring->rx_pages = NULL; - page_pool_destroy(rx_ring->pool); + libie_rx_page_pool_destroy(rx_ring->pool, &rx_ring->rq_stats); rx_ring->dev = dev; if (rx_ring->desc) { @@ -760,8 +765,6 @@ int iavf_setup_rx_descriptors(struct iavf_ring *rx_ring) if (!rx_ring->rx_pages) return ret; - u64_stats_init(&rx_ring->syncp); - /* Round up to nearest 4K */ rx_ring->size = rx_ring->count * sizeof(union iavf_32byte_rx_desc); rx_ring->size = ALIGN(rx_ring->size, 4096); @@ -862,10 +865,8 @@ static u32 __iavf_alloc_rx_pages(struct iavf_ring *rx_ring, u32 to_refill, dma_addr_t dma; page = page_pool_alloc_pages(pool, gfp); - if (!page) { - rx_ring->rx_stats.alloc_page_failed++; + if (!page) break; - } rx_ring->rx_pages[ntu] = page; dma = page_pool_get_dma_addr(page); @@ -1089,25 +1090,23 @@ static struct sk_buff *iavf_build_skb(struct page *page, u32 size) /** * iavf_is_non_eop - process handling of non-EOP buffers - * @rx_ring: Rx ring being processed * @rx_desc: Rx descriptor for current buffer - * @skb: Current socket buffer containing buffer in progress + * @stats: NAPI poll local stats to update * * This function updates next to clean. If the buffer is an EOP buffer * this function exits returning false, otherwise it will place the * sk_buff in the next buffer to be chained and return true indicating * that this is in fact a non-EOP buffer. 
**/ -static bool iavf_is_non_eop(struct iavf_ring *rx_ring, - union iavf_rx_desc *rx_desc, - struct sk_buff *skb) +static bool iavf_is_non_eop(union iavf_rx_desc *rx_desc, + struct libie_rq_onstack_stats *stats) { /* if we are the last buffer then there is nothing else to do */ #define IAVF_RXD_EOF BIT(IAVF_RX_DESC_STATUS_EOF_SHIFT) if (likely(iavf_test_staterr(rx_desc, IAVF_RXD_EOF))) return false; - rx_ring->rx_stats.non_eop_descs++; + stats->fragments++; return true; } @@ -1126,8 +1125,8 @@ static bool iavf_is_non_eop(struct iavf_ring *rx_ring, **/ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) { - unsigned int total_rx_bytes = 0, total_rx_packets = 0; const gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; + struct libie_rq_onstack_stats stats = { }; u32 to_refill = IAVF_DESC_UNUSED(rx_ring); struct page_pool *pool = rx_ring->pool; struct sk_buff *skb = rx_ring->skb; @@ -1144,9 +1143,13 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) u64 qword; /* return some buffers to hardware, one at a time is too slow */ - if (to_refill >= IAVF_RX_BUFFER_WRITE) + if (to_refill >= IAVF_RX_BUFFER_WRITE) { to_refill = __iavf_alloc_rx_pages(rx_ring, to_refill, gfp); + if (unlikely(to_refill)) + libie_stats_inc_one(&rx_ring->rq_stats, + alloc_page_fail); + } rx_desc = IAVF_RX_DESC(rx_ring, ntc); @@ -1194,7 +1197,8 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) /* exit if we failed to retrieve a buffer */ if (!skb) { page_pool_put_page(pool, page, size, true); - rx_ring->rx_stats.alloc_buff_failed++; + libie_stats_inc_one(&rx_ring->rq_stats, + build_skb_fail); break; } @@ -1206,7 +1210,7 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) prefetch(IAVF_RX_DESC(rx_ring, ntc)); - if (iavf_is_non_eop(rx_ring, rx_desc, skb)) + if (iavf_is_non_eop(rx_desc, &stats)) continue; /* ERR_MASK will only have valid bits if EOP set, and @@ -1226,7 +1230,7 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) } /* probably a little skewed due to removing CRC */ - total_rx_bytes += skb->len; + stats.bytes += skb->len; qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len); rx_ptype = (qword & IAVF_RXD_QW1_PTYPE_MASK) >> @@ -1248,7 +1252,7 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) skb = NULL; /* update budget accounting */ - total_rx_packets++; + stats.packets++; } rx_ring->next_to_clean = ntc; @@ -1259,16 +1263,16 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) /* guarantee a trip back through this routine if there was * a failure */ - if (unlikely(to_refill)) + if (unlikely(to_refill)) { + libie_stats_inc_one(&rx_ring->rq_stats, + alloc_page_fail); cleaned_count = budget; + } } - u64_stats_update_begin(&rx_ring->syncp); - rx_ring->stats.packets += total_rx_packets; - rx_ring->stats.bytes += total_rx_bytes; - u64_stats_update_end(&rx_ring->syncp); - rx_ring->q_vector->rx.total_packets += total_rx_packets; - rx_ring->q_vector->rx.total_bytes += total_rx_bytes; + libie_rq_napi_stats_add(&rx_ring->rq_stats, &stats); + rx_ring->q_vector->rx.total_packets += stats.packets; + rx_ring->q_vector->rx.total_bytes += stats.bytes; return cleaned_count; } @@ -1447,10 +1451,8 @@ int iavf_napi_poll(struct napi_struct *napi, int budget) return budget - 1; } tx_only: - if (arm_wb) { - q_vector->tx.ring[0].tx_stats.tx_force_wb++; + if (arm_wb) iavf_enable_wb_on_itr(vsi, q_vector); - } return budget; } @@ -1909,6 +1911,7 @@ bool __iavf_chk_linearize(struct sk_buff *skb) int 
__iavf_maybe_stop_tx(struct iavf_ring *tx_ring, int size) { netif_stop_subqueue(tx_ring->netdev, tx_ring->queue_index); + libie_stats_inc_one(&tx_ring->sq_stats, stops); /* Memory barrier before checking head and tail */ smp_mb(); @@ -1918,7 +1921,8 @@ int __iavf_maybe_stop_tx(struct iavf_ring *tx_ring, int size) /* A reprieve! - use start_queue because it doesn't call schedule */ netif_start_subqueue(tx_ring->netdev, tx_ring->queue_index); - ++tx_ring->tx_stats.restart_queue; + libie_stats_inc_one(&tx_ring->sq_stats, restarts); + return 0; } @@ -2099,7 +2103,7 @@ static netdev_tx_t iavf_xmit_frame_ring(struct sk_buff *skb, return NETDEV_TX_OK; } count = iavf_txd_use_count(skb->len); - tx_ring->tx_stats.tx_linearize++; + libie_stats_inc_one(&tx_ring->sq_stats, linearized); } /* need: 1 descriptor per page * PAGE_SIZE/IAVF_MAX_DATA_PER_TXD, @@ -2109,7 +2113,7 @@ static netdev_tx_t iavf_xmit_frame_ring(struct sk_buff *skb, * otherwise try next time */ if (iavf_maybe_stop_tx(tx_ring, count + 4 + 1)) { - tx_ring->tx_stats.tx_busy++; + libie_stats_inc_one(&tx_ring->sq_stats, busy); return NETDEV_TX_BUSY; } diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.h b/drivers/net/ethernet/intel/iavf/iavf_txrx.h index 8fbe549ce6a587..64c93d6fa54d09 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.h +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.h @@ -4,6 +4,8 @@ #ifndef _IAVF_TXRX_H_ #define _IAVF_TXRX_H_ +#include + /* Interrupt Throttling and Rate Limiting Goodies */ #define IAVF_DEFAULT_IRQ_WORK 256 @@ -201,27 +203,6 @@ struct iavf_tx_buffer { u32 tx_flags; }; -struct iavf_queue_stats { - u64 packets; - u64 bytes; -}; - -struct iavf_tx_queue_stats { - u64 restart_queue; - u64 tx_busy; - u64 tx_done_old; - u64 tx_linearize; - u64 tx_force_wb; - int prev_pkt_ctr; - u64 tx_lost_interrupt; -}; - -struct iavf_rx_queue_stats { - u64 non_eop_descs; - u64 alloc_page_failed; - u64 alloc_buff_failed; -}; - /* some useful defines for virtchannel interface, which * is the only remaining user of header split */ @@ -272,21 +253,9 @@ struct iavf_ring { #define IAVF_TXR_FLAGS_VLAN_TAG_LOC_L2TAG2 BIT(4) #define IAVF_RXR_FLAGS_VLAN_TAG_LOC_L2TAG2_2 BIT(5) - /* stats structs */ - struct iavf_queue_stats stats; - struct u64_stats_sync syncp; - union { - struct iavf_tx_queue_stats tx_stats; - struct iavf_rx_queue_stats rx_stats; - }; - - unsigned int size; /* length of descriptor ring in bytes */ - dma_addr_t dma; /* physical address of ring */ - struct iavf_vsi *vsi; /* Backreference to associated VSI */ struct iavf_q_vector *q_vector; /* Backreference to associated vector */ - struct rcu_head rcu; /* to avoid race on free */ struct sk_buff *skb; /* When iavf_clean_rx_ring_irq() must * return before it sees the EOP for * the current packet, we save that skb @@ -295,6 +264,18 @@ struct iavf_ring { * iavf_clean_rx_ring_irq() is called * for this ring. 
*/ + + /* stats structs */ + union { + struct libie_sq_stats sq_stats; + struct libie_rq_stats rq_stats; + }; + + int prev_pkt_ctr; /* For stall detection */ + unsigned int size; /* length of descriptor ring in bytes */ + dma_addr_t dma; /* physical address of ring */ + + struct rcu_head rcu; /* to avoid race on free */ } ____cacheline_internodealigned_in_smp; #define IAVF_ITR_ADAPTIVE_MIN_INC 0x0002 From 4e602363494a5f98dd3da00f7e5fba2a5a7d2ea9 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Mon, 13 Mar 2023 19:24:05 +0100 Subject: [PATCH 12/40] selftests/bpf: robustify test_xdp_do_redirect with more payload magics Currently, the test relies on that only dropped ("xmitted") frames will be recycled and if a frame became an skb, it will be freed later by the stack and never come back to its page_pool. So, it easily gets broken by trying to recycle skbs: test_xdp_do_redirect:PASS:pkt_count_xdp 0 nsec test_xdp_do_redirect:FAIL:pkt_count_zero unexpected pkt_count_zero: actual 9936 != expected 2 test_xdp_do_redirect:PASS:pkt_count_tc 0 nsec That huge mismatch happened because after the TC ingress hook zeroes the magic, the page gets recycled when skb is freed, not returned to the MM layer. "Live frames" mode initializes only new pages and keeps the recycled ones as is by design, so they appear with zeroed magic on the Rx path again. Expand the possible magic values from two: 0 (was "xmitted"/dropped or did hit the TC hook) and 0x42 (hit the input XDP prog) to three: the new one will mark frames hit the TC hook, so that they will elide both @pkt_count_zero and @pkt_count_xdp. They can then be recycled to their page_pool or returned to the page allocator, this won't affect the counters anyhow. Just make sure to mark them as "input" (0x42) when they appear on the Rx path again. Also make an enum from those magics, so that they will be always visible and can be changed in just one place anytime. This also eases adding any new marks later on. Signed-off-by: Alexander Lobakin --- .../bpf/progs/test_xdp_do_redirect.c | 36 +++++++++++++------ 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/test_xdp_do_redirect.c b/tools/testing/selftests/bpf/progs/test_xdp_do_redirect.c index 77a123071940ae..cd2d4e3258b899 100644 --- a/tools/testing/selftests/bpf/progs/test_xdp_do_redirect.c +++ b/tools/testing/selftests/bpf/progs/test_xdp_do_redirect.c @@ -4,6 +4,19 @@ #define ETH_ALEN 6 #define HDR_SZ (sizeof(struct ethhdr) + sizeof(struct ipv6hdr) + sizeof(struct udphdr)) + +/** + * enum frame_mark - magics to distinguish page/packet paths + * @MARK_XMIT: page was recycled due to the frame being "xmitted" by the NIC. + * @MARK_IN: frame is being processed by the input XDP prog. + * @MARK_SKB: frame did hit the TC ingress hook as an skb. 
+ */ +enum frame_mark { + MARK_XMIT = 0U, + MARK_IN = 0x42, + MARK_SKB = 0x45, +}; + const volatile int ifindex_out; const volatile int ifindex_in; const volatile __u8 expect_dst[ETH_ALEN]; @@ -34,10 +47,10 @@ int xdp_redirect(struct xdp_md *xdp) if (*metadata != 0x42) return XDP_ABORTED; - if (*payload == 0) { - *payload = 0x42; + if (*payload == MARK_XMIT) pkts_seen_zero++; - } + + *payload = MARK_IN; if (bpf_xdp_adjust_meta(xdp, 4)) return XDP_ABORTED; @@ -51,7 +64,7 @@ int xdp_redirect(struct xdp_md *xdp) return ret; } -static bool check_pkt(void *data, void *data_end) +static bool check_pkt(void *data, void *data_end, const __u32 mark) { struct ipv6hdr *iph = data + sizeof(struct ethhdr); __u8 *payload = data + HDR_SZ; @@ -59,13 +72,13 @@ static bool check_pkt(void *data, void *data_end) if (payload + 1 > data_end) return false; - if (iph->nexthdr != IPPROTO_UDP || *payload != 0x42) + if (iph->nexthdr != IPPROTO_UDP || *payload != MARK_IN) return false; /* reset the payload so the same packet doesn't get counted twice when * it cycles back through the kernel path and out the dst veth */ - *payload = 0; + *payload = mark; return true; } @@ -75,11 +88,11 @@ int xdp_count_pkts(struct xdp_md *xdp) void *data = (void *)(long)xdp->data; void *data_end = (void *)(long)xdp->data_end; - if (check_pkt(data, data_end)) + if (check_pkt(data, data_end, MARK_XMIT)) pkts_seen_xdp++; - /* Return XDP_DROP to make sure the data page is recycled, like when it - * exits a physical NIC. Recycled pages will be counted in the + /* Return %XDP_DROP to recycle the data page with %MARK_XMIT, like + * it exited a physical NIC. Those pages will be counted in the * pkts_seen_zero counter above. */ return XDP_DROP; @@ -91,9 +104,12 @@ int tc_count_pkts(struct __sk_buff *skb) void *data = (void *)(long)skb->data; void *data_end = (void *)(long)skb->data_end; - if (check_pkt(data, data_end)) + if (check_pkt(data, data_end, MARK_SKB)) pkts_seen_tc++; + /* Will be either recycled or freed, %MARK_SKB makes sure it won't + * hit any of the counters above. + */ return 0; } From 2e28dffa0f454f96f40e25c39e6e5c8c49017ec5 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Fri, 3 Mar 2023 13:25:11 +0100 Subject: [PATCH 13/40] net: page_pool, skbuff: make skb_mark_for_recycle() always available skb_mark_for_recycle() is guarded with CONFIG_PAGE_POOL, this creates unneeded complication when using it in the generic code. For now, it's only used in the drivers always selecting Page Pool, so this works. Move the guards so that preprocessor will cut out only the operation itself and the function will still be a noop on !PAGE_POOL systems, but available there as well. No functional changes. 
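For callers, the practical effect is that generic code no longer needs its own preprocessor guard around the call. A minimal before/after sketch (illustrative, not taken from this series):

/* Before: each generic call site had to carry its own guard. */
#ifdef CONFIG_PAGE_POOL
	skb_mark_for_recycle(skb);
#endif

/* After: the helper is always declared; with CONFIG_PAGE_POOL disabled
 * its body is empty, so the call compiles away to nothing.
 */
	skb_mark_for_recycle(skb);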
Reported-by: kernel test robot Link: https://lore.kernel.org/oe-kbuild-all/202303020342.Wi2PRFFH-lkp@intel.com Signed-off-by: Alexander Lobakin --- include/linux/skbuff.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index fe661011644b8f..3f3a2a82a86b30 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -5069,12 +5069,12 @@ static inline u64 skb_get_kcov_handle(struct sk_buff *skb) #endif } -#ifdef CONFIG_PAGE_POOL static inline void skb_mark_for_recycle(struct sk_buff *skb) { +#ifdef CONFIG_PAGE_POOL skb->pp_recycle = 1; -} #endif +} #endif /* __KERNEL__ */ #endif /* _LINUX_SKBUFF_H */ From df09c63ee0dbd772647bddbbb475050ddbeabad2 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Wed, 1 Mar 2023 16:21:58 +0100 Subject: [PATCH 14/40] xdp: recycle Page Pool backed skbs built from XDP frames __xdp_build_skb_from_frame() state(d): /* Until page_pool get SKB return path, release DMA here */ Page Pool got skb pages recycling in April 2021, but missed this function. xdp_release_frame() is relevant only for Page Pool backed frames and it detaches the page from the corresponding page_pool in order to make it freeable via page_frag_free(). It can instead just mark the output skb as eligible for recycling if the frame is backed by a pp. No change for other memory model types (the same condition check as before). cpumap redirect and veth on Page Pool drivers now become zero-alloc (or almost). Signed-off-by: Alexander Lobakin --- net/core/xdp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/core/xdp.c b/net/core/xdp.c index 528d4b37983df8..f9b9ffb6beb140 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -658,8 +658,8 @@ struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf, * - RX ring dev queue index (skb_record_rx_queue) */ - /* Until page_pool get SKB return path, release DMA here */ - xdp_release_frame(xdpf); + if (xdpf->mem.type == MEM_TYPE_PAGE_POOL) + skb_mark_for_recycle(skb); /* Allow SKB to reuse area used by xdp_frame */ xdp_scrub_frame(xdpf); From 4fcbfcde0dd707a786ebf0c0c587f72f996a8935 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Wed, 1 Mar 2023 16:29:40 +0100 Subject: [PATCH 15/40] xdp: remove unused {__,}xdp_release_frame() __xdp_build_skb_from_frame() was the last user of {__,}xdp_release_frame(), which detaches pages from the page_pool. All the consumers now recycle Page Pool skbs and page, except mlx5, stmmac and tsnep drivers, which use page_pool_release_page() directly (might change one day). It's safe to assume this functionality is not needed anymore and can be removed (in favor of recycling). Signed-off-by: Alexander Lobakin --- include/net/xdp.h | 29 ----------------------------- net/core/xdp.c | 15 --------------- 2 files changed, 44 deletions(-) diff --git a/include/net/xdp.h b/include/net/xdp.h index 41c57b8b167147..383b25b426a482 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -317,35 +317,6 @@ void xdp_flush_frame_bulk(struct xdp_frame_bulk *bq); void xdp_return_frame_bulk(struct xdp_frame *xdpf, struct xdp_frame_bulk *bq); -/* When sending xdp_frame into the network stack, then there is no - * return point callback, which is needed to release e.g. DMA-mapping - * resources with page_pool. Thus, have explicit function to release - * frame resources. 
- */ -void __xdp_release_frame(void *data, struct xdp_mem_info *mem); -static inline void xdp_release_frame(struct xdp_frame *xdpf) -{ - struct xdp_mem_info *mem = &xdpf->mem; - struct skb_shared_info *sinfo; - int i; - - /* Curr only page_pool needs this */ - if (mem->type != MEM_TYPE_PAGE_POOL) - return; - - if (likely(!xdp_frame_has_frags(xdpf))) - goto out; - - sinfo = xdp_get_shared_info_from_frame(xdpf); - for (i = 0; i < sinfo->nr_frags; i++) { - struct page *page = skb_frag_page(&sinfo->frags[i]); - - __xdp_release_frame(page_address(page), mem); - } -out: - __xdp_release_frame(xdpf->data, mem); -} - static __always_inline unsigned int xdp_get_frame_len(struct xdp_frame *xdpf) { struct skb_shared_info *sinfo; diff --git a/net/core/xdp.c b/net/core/xdp.c index f9b9ffb6beb140..018e0fe4e71405 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -531,21 +531,6 @@ void xdp_return_buff(struct xdp_buff *xdp) } EXPORT_SYMBOL_GPL(xdp_return_buff); -/* Only called for MEM_TYPE_PAGE_POOL see xdp.h */ -void __xdp_release_frame(void *data, struct xdp_mem_info *mem) -{ - struct xdp_mem_allocator *xa; - struct page *page; - - rcu_read_lock(); - xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); - page = virt_to_head_page(data); - if (xa) - page_pool_release_page(xa->page_pool, page); - rcu_read_unlock(); -} -EXPORT_SYMBOL_GPL(__xdp_release_frame); - void xdp_attachment_setup(struct xdp_attachment_info *info, struct netdev_bpf *bpf) { From 897dcfd701d2d097884ca1790a9daa5011c06e8f Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Fri, 17 Mar 2023 15:12:33 +0100 Subject: [PATCH 16/40] iavf: optimize Rx hotpath a bunch -- vol. 2 Signed-off-by: Alexander Lobakin --- drivers/net/ethernet/intel/iavf/iavf_txrx.c | 121 ++++++-------------- drivers/net/ethernet/intel/iavf/iavf_txrx.h | 10 +- 2 files changed, 41 insertions(+), 90 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c index ab4863f86a3c3e..07f558b6d47497 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright(c) 2013 - 2018 Intel Corporation. 
*/ +#include #include #include @@ -902,29 +903,21 @@ void iavf_alloc_rx_pages(struct iavf_ring *rxr) * iavf_rx_checksum - Indicate in skb if hw indicated a good cksum * @vsi: the VSI we care about * @skb: skb currently being received and modified - * @rx_desc: the receive descriptor + * @qword: `wb.qword1.status_error_len` from the descriptor **/ static inline void iavf_rx_checksum(struct iavf_vsi *vsi, struct sk_buff *skb, - union iavf_rx_desc *rx_desc) + u64 qword) { struct libie_rx_ptype_parsed parsed; - u32 rx_error, rx_status; - bool ipv4, ipv6; - u8 ptype; - u64 qword; + u32 ptype, rx_error, rx_status; - skb->ip_summed = CHECKSUM_NONE; - - qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len); ptype = (qword & IAVF_RXD_QW1_PTYPE_MASK) >> IAVF_RXD_QW1_PTYPE_SHIFT; parsed = libie_parse_rx_ptype(ptype); if (!libie_has_rx_checksum(vsi->netdev, parsed)) return; - rx_error = (qword & IAVF_RXD_QW1_ERROR_MASK) >> - IAVF_RXD_QW1_ERROR_SHIFT; rx_status = (qword & IAVF_RXD_QW1_STATUS_MASK) >> IAVF_RXD_QW1_STATUS_SHIFT; @@ -932,17 +925,17 @@ static inline void iavf_rx_checksum(struct iavf_vsi *vsi, if (!(rx_status & BIT(IAVF_RX_DESC_STATUS_L3L4P_SHIFT))) return; - ipv4 = parsed.outer_ip == LIBIE_RX_PTYPE_OUTER_IPV4; - ipv6 = parsed.outer_ip == LIBIE_RX_PTYPE_OUTER_IPV6; + rx_error = (qword & IAVF_RXD_QW1_ERROR_MASK) >> + IAVF_RXD_QW1_ERROR_SHIFT; - if (ipv4 && + if (parsed.outer_ip == LIBIE_RX_PTYPE_OUTER_IPV4 && (rx_error & (BIT(IAVF_RX_DESC_ERROR_IPE_SHIFT) | BIT(IAVF_RX_DESC_ERROR_EIPE_SHIFT)))) goto checksum_fail; /* likely incorrect csum if alternate IP extension headers found */ - if (ipv6 && - rx_status & BIT(IAVF_RX_DESC_STATUS_IPV6EXADD_SHIFT)) + else if (parsed.outer_ip == LIBIE_RX_PTYPE_OUTER_IPV6 && + (rx_status & BIT(IAVF_RX_DESC_STATUS_IPV6EXADD_SHIFT))) /* don't increment checksum err here, non-fatal err */ return; @@ -969,27 +962,26 @@ static inline void iavf_rx_checksum(struct iavf_vsi *vsi, * @ring: descriptor ring * @rx_desc: specific descriptor * @skb: skb currently being received and modified - * @rx_ptype: Rx packet type + * @qword: `wb.qword1.status_error_len` from the descriptor **/ static inline void iavf_rx_hash(struct iavf_ring *ring, union iavf_rx_desc *rx_desc, struct sk_buff *skb, - u8 rx_ptype) + u64 qword) { + const u64 rss_mask = (u64)IAVF_RX_DESC_FLTSTAT_RSS_HASH << + IAVF_RX_DESC_STATUS_FLTSTAT_SHIFT; struct libie_rx_ptype_parsed parsed; - u32 hash; - const __le64 rss_mask = - cpu_to_le64((u64)IAVF_RX_DESC_FLTSTAT_RSS_HASH << - IAVF_RX_DESC_STATUS_FLTSTAT_SHIFT); + u32 rx_ptype, hash; + + rx_ptype = FIELD_GET(IAVF_RXD_QW1_PTYPE_MASK, qword); parsed = libie_parse_rx_ptype(rx_ptype); - if (!libie_has_rx_hash(ring->netdev, parsed)) + if (!libie_has_rx_hash(ring->netdev, parsed) || + (qword & rss_mask) != rss_mask) return; - - if ((rx_desc->wb.qword1.status_error_len & rss_mask) == rss_mask) { - hash = le32_to_cpu(rx_desc->wb.qword0.hi_dword.rss); - libie_skb_set_hash(skb, hash, parsed); - } + hash = le32_to_cpu(rx_desc->wb.qword0.hi_dword.rss); + libie_skb_set_hash(skb, hash, parsed); } /** @@ -997,7 +989,7 @@ static inline void iavf_rx_hash(struct iavf_ring *ring, * @rx_ring: rx descriptor ring packet is being transacted on * @rx_desc: pointer to the EOP Rx descriptor * @skb: pointer to current skb being populated - * @rx_ptype: the packet type decoded by hardware + * @qword: `wb.qword1.status_error_len` from the descriptor * * This function checks the ring, descriptor, and packet information in * order to populate the hash, checksum, VLAN, protocol, and @@ 
-1006,11 +998,11 @@ static inline void iavf_rx_hash(struct iavf_ring *ring, static inline void iavf_process_skb_fields(struct iavf_ring *rx_ring, union iavf_rx_desc *rx_desc, struct sk_buff *skb, - u8 rx_ptype) + u64 qword) { - iavf_rx_hash(rx_ring, rx_desc, skb, rx_ptype); + iavf_rx_hash(rx_ring, rx_desc, skb, qword); - iavf_rx_checksum(rx_ring->vsi, skb, rx_desc); + iavf_rx_checksum(rx_ring->vsi, skb, qword); skb_record_rx_queue(skb, rx_ring->queue_index); @@ -1018,28 +1010,6 @@ void iavf_process_skb_fields(struct iavf_ring *rx_ring, skb->protocol = eth_type_trans(skb, rx_ring->netdev); } -/** - * iavf_cleanup_headers - Correct empty headers - * @rx_ring: rx descriptor ring packet is being transacted on - * @skb: pointer to current skb being fixed - * - * Also address the case where we are pulling data in on pages only - * and as such no data is present in the skb header. - * - * In addition if skb is not at least 60 bytes we need to pad it so that - * it is large enough to qualify as a valid Ethernet frame. - * - * Returns true if an error was encountered and skb was freed. - **/ -static bool iavf_cleanup_headers(struct iavf_ring *rx_ring, struct sk_buff *skb) -{ - /* if eth_skb_pad returns an error the skb was freed */ - if (eth_skb_pad(skb)) - return true; - - return false; -} - /** * iavf_add_rx_frag - Add contents of Rx buffer to sk_buff * @skb: sk_buff to place the data into @@ -1089,21 +1059,14 @@ static struct sk_buff *iavf_build_skb(struct page *page, u32 size) } /** - * iavf_is_non_eop - process handling of non-EOP buffers - * @rx_desc: Rx descriptor for current buffer + * iavf_is_non_eop - check whether a buffer is non-EOP + * @qword: `wb.qword1.status_error_len` from the descriptor * @stats: NAPI poll local stats to update - * - * This function updates next to clean. If the buffer is an EOP buffer - * this function exits returning false, otherwise it will place the - * sk_buff in the next buffer to be chained and return true indicating - * that this is in fact a non-EOP buffer. **/ -static bool iavf_is_non_eop(union iavf_rx_desc *rx_desc, - struct libie_rq_onstack_stats *stats) +static bool iavf_is_non_eop(u64 qword, struct libie_rq_onstack_stats *stats) { /* if we are the last buffer then there is nothing else to do */ -#define IAVF_RXD_EOF BIT(IAVF_RX_DESC_STATUS_EOF_SHIFT) - if (likely(iavf_test_staterr(rx_desc, IAVF_RXD_EOF))) + if (likely(iavf_test_staterr(qword, IAVF_RX_DESC_STATUS_EOF_SHIFT))) return false; stats->fragments++; @@ -1139,7 +1102,6 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) struct page *page; unsigned int size; u16 vlan_tag = 0; - u8 rx_ptype; u64 qword; /* return some buffers to hardware, one at a time is too slow */ @@ -1159,15 +1121,14 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) * hardware wrote DD then the length will be non-zero */ qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len); + if (!iavf_test_staterr(qword, IAVF_RX_DESC_STATUS_DD_SHIFT)) + break; /* This memory barrier is needed to keep us from reading * any other fields out of the rx_desc until we have * verified the descriptor has been written back. 
*/ dma_rmb(); -#define IAVF_RXD_DD BIT(IAVF_RX_DESC_STATUS_DD_SHIFT) - if (!iavf_test_staterr(rx_desc, IAVF_RXD_DD)) - break; size = (qword & IAVF_RXD_QW1_LENGTH_PBUF_MASK) >> IAVF_RXD_QW1_LENGTH_PBUF_SHIFT; @@ -1208,23 +1169,19 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) if (unlikely(++ntc == ring_size)) ntc = 0; - prefetch(IAVF_RX_DESC(rx_ring, ntc)); - - if (iavf_is_non_eop(rx_desc, &stats)) + if (iavf_is_non_eop(qword, &stats)) continue; + prefetch(rx_desc); + /* ERR_MASK will only have valid bits if EOP set, and * what we are doing here is actually checking * IAVF_RX_DESC_ERROR_RXE_SHIFT, since it is the zeroth bit in * the error field */ - if (unlikely(iavf_test_staterr(rx_desc, BIT(IAVF_RXD_QW1_ERROR_SHIFT)))) { - dev_kfree_skb_any(skb); - skb = NULL; - continue; - } - - if (iavf_cleanup_headers(rx_ring, skb)) { + if (unlikely(iavf_test_staterr(qword, + IAVF_RXD_QW1_ERROR_SHIFT))) { + dev_kfree_skb(skb); skb = NULL; continue; } @@ -1232,12 +1189,8 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) /* probably a little skewed due to removing CRC */ stats.bytes += skb->len; - qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len); - rx_ptype = (qword & IAVF_RXD_QW1_PTYPE_MASK) >> - IAVF_RXD_QW1_PTYPE_SHIFT; - /* populate checksum, VLAN, and protocol */ - iavf_process_skb_fields(rx_ring, rx_desc, skb, rx_ptype); + iavf_process_skb_fields(rx_ring, rx_desc, skb, qword); if (qword & BIT(IAVF_RX_DESC_STATUS_L2TAG1P_SHIFT) && rx_ring->flags & IAVF_TXRX_FLAGS_VLAN_TAG_LOC_L2TAG1) diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.h b/drivers/net/ethernet/intel/iavf/iavf_txrx.h index 64c93d6fa54d09..764b0ada0e6833 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.h +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.h @@ -87,19 +87,17 @@ enum iavf_dyn_idx_t { /** * iavf_test_staterr - tests bits in Rx descriptor status and error fields - * @rx_desc: pointer to receive descriptor (in le64 format) - * @stat_err_bits: value to mask + * @qword: `wb.qword1.status_error_len` from the descriptor + * @stat_err: bit number to mask * * This function does some fast chicanery in order to return the * value of the mask which is really only used for boolean tests. * The status_error_len doesn't need to be shifted because it begins * at offset zero. */ -static inline bool iavf_test_staterr(union iavf_rx_desc *rx_desc, - const u64 stat_err_bits) +static inline bool iavf_test_staterr(u64 qword, const u64 stat_err) { - return !!(rx_desc->wb.qword1.status_error_len & - cpu_to_le64(stat_err_bits)); + return !!(qword & BIT_ULL(stat_err)); } /* How many Rx Buffers do we bundle into one write to the hardware ? */ From d6ea05c110beb0add41bd5421af8f8da8fd8bf12 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Thu, 23 Mar 2023 16:03:52 +0100 Subject: [PATCH 17/40] iavf: fixup for optimize vol. 
2 Signed-off-by: Alexander Lobakin --- drivers/net/ethernet/intel/iavf/iavf_txrx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c index 07f558b6d47497..3b0a2ddf96ff6a 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c @@ -932,7 +932,6 @@ static inline void iavf_rx_checksum(struct iavf_vsi *vsi, (rx_error & (BIT(IAVF_RX_DESC_ERROR_IPE_SHIFT) | BIT(IAVF_RX_DESC_ERROR_EIPE_SHIFT)))) goto checksum_fail; - /* likely incorrect csum if alternate IP extension headers found */ else if (parsed.outer_ip == LIBIE_RX_PTYPE_OUTER_IPV6 && (rx_status & BIT(IAVF_RX_DESC_STATUS_IPV6EXADD_SHIFT))) @@ -980,6 +979,7 @@ static inline void iavf_rx_hash(struct iavf_ring *ring, if (!libie_has_rx_hash(ring->netdev, parsed) || (qword & rss_mask) != rss_mask) return; + hash = le32_to_cpu(rx_desc->wb.qword0.hi_dword.rss); libie_skb_set_hash(skb, hash, parsed); } From 8aca6e1e9bf77508d7de5f64e7ebda0837e585db Mon Sep 17 00:00:00 2001 From: Michal Kubiak Date: Fri, 7 Oct 2022 09:27:27 -0400 Subject: [PATCH 18/40] i40e: Unify handling of zero ring length in 'configure queue' The current version of Intel 'ice' driver allows for using zero for the ring lenghth in 'configure queue' VIRTCHNL message. Such a value indicates the ring should not be configured. Implement the same handling in i40e driver. Instead of returning an 'invalid parameter' error for zero-sized rings, just skip that ring during queue pair configuration. That unified handling is needed for AF_XDP implementation for 'iavf' driver. In that use case we sometimes need to configure Tx ring only for a given queue pair. Signed-off-by: Michal Kubiak --- drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index 8a4587585acde7..ee2a1e682a1c9b 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -612,6 +612,9 @@ static int i40e_config_vsi_tx_queue(struct i40e_vf *vf, u16 vsi_id, u32 qtx_ctl; int ret = 0; + if (info->ring_len == 0) + return 0; + if (!i40e_vc_isvalid_vsi_id(vf, info->vsi_id)) { ret = -ENOENT; goto error_context; @@ -688,6 +691,9 @@ static int i40e_config_vsi_rx_queue(struct i40e_vf *vf, u16 vsi_id, struct i40e_hmc_obj_rxq rx_ctx; int ret = 0; + if (info->ring_len == 0) + return 0; + /* clear the context structure first */ memset(&rx_ctx, 0, sizeof(struct i40e_hmc_obj_rxq)); From 557b3929a58a3d30dffe37a78a62ca86922d7f90 Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Wed, 22 Feb 2023 12:34:42 +0100 Subject: [PATCH 19/40] iavf: Remove IAVF_TX_FLAGS_FD_SB flag This flag was never set, so remove it and simplify buffer cleaning process. 
Signed-off-by: Larysa Zaremba --- drivers/net/ethernet/intel/iavf/iavf_txrx.c | 16 ++++------------ drivers/net/ethernet/intel/iavf/iavf_txrx.h | 2 +- 2 files changed, 5 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c index 3b0a2ddf96ff6a..aa0009c853c453 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c @@ -29,22 +29,14 @@ static inline __le64 build_ctob(u32 td_cmd, u32 td_offset, unsigned int size, static void iavf_unmap_and_free_tx_resource(struct iavf_ring *ring, struct iavf_tx_buffer *tx_buffer) { - if (tx_buffer->skb) { - if (tx_buffer->tx_flags & IAVF_TX_FLAGS_FD_SB) - kfree(tx_buffer->raw_buf); - else - dev_kfree_skb_any(tx_buffer->skb); - if (dma_unmap_len(tx_buffer, len)) - dma_unmap_single(ring->dev, - dma_unmap_addr(tx_buffer, dma), - dma_unmap_len(tx_buffer, len), - DMA_TO_DEVICE); - } else if (dma_unmap_len(tx_buffer, len)) { + if (tx_buffer->skb) + dev_kfree_skb_any(tx_buffer->skb); + + if (dma_unmap_len(tx_buffer, len)) dma_unmap_page(ring->dev, dma_unmap_addr(tx_buffer, dma), dma_unmap_len(tx_buffer, len), DMA_TO_DEVICE); - } tx_buffer->next_to_watch = NULL; tx_buffer->skb = NULL; diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.h b/drivers/net/ethernet/intel/iavf/iavf_txrx.h index 764b0ada0e6833..5826501c5df1c8 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.h +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.h @@ -179,7 +179,7 @@ static inline unsigned int iavf_txd_use_count(unsigned int size) #define IAVF_TX_FLAGS_IPV6 BIT(5) #define IAVF_TX_FLAGS_FCCRC BIT(6) #define IAVF_TX_FLAGS_FSO BIT(7) -#define IAVF_TX_FLAGS_FD_SB BIT(9) +/* BIT(9) is free, was IAVF_TX_FLAGS_FD_SB */ #define IAVF_TX_FLAGS_VXLAN_TUNNEL BIT(10) #define IAVF_TX_FLAGS_HW_OUTER_SINGLE_VLAN BIT(11) #define IAVF_TX_FLAGS_VLAN_MASK 0xffff0000 From 80f873cf7a50a2cdeb0e955eea76eef2b7da368a Mon Sep 17 00:00:00 2001 From: Michal Kubiak Date: Wed, 15 Feb 2023 14:35:01 +0100 Subject: [PATCH 20/40] iavf: Use separate ring masks for TX and RX in q_vector Replace the existing ring mask (common for RX and TX rings) in iavf_q_vector with two masks dedicated to handling RX and TX rings separately. The virtchnl interface allows separate masks to be used for different ring types, so there is no need to merge them into a single mask. Also, after adding XDP support to iavf, the number of RX and TX rings can be asymmetric. Therefore, this patch is a necessary preparation for XDP support. Signed-off-by: Michal Kubiak --- drivers/net/ethernet/intel/iavf/iavf.h | 3 ++- drivers/net/ethernet/intel/iavf/iavf_main.c | 6 ++++-- drivers/net/ethernet/intel/iavf/iavf_virtchnl.c | 4 ++-- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf.h b/drivers/net/ethernet/intel/iavf/iavf.h index 7dbec98d2a983f..1d0f79a2f53f9b 100644 --- a/drivers/net/ethernet/intel/iavf/iavf.h +++ b/drivers/net/ethernet/intel/iavf/iavf.h @@ -107,7 +107,8 @@ struct iavf_q_vector { struct napi_struct napi; struct iavf_ring_container rx; struct iavf_ring_container tx; - u32 ring_mask; + u32 rx_ring_mask; + u32 tx_ring_mask; u8 itr_countdown; /* when 0 should adjust adaptive ITR */ u8 num_ringpairs; /* total number of ring pairs in vector */ u16 v_idx; /* index in the vsi->q_vector array. 
*/ diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 60463b3edfacf2..174a702ceedf78 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -455,7 +455,7 @@ iavf_map_vector_to_rxq(struct iavf_adapter *adapter, int v_idx, int r_idx) q_vector->rx.count++; q_vector->rx.next_update = jiffies + 1; q_vector->rx.target_itr = ITR_TO_REG(rx_ring->itr_setting); - q_vector->ring_mask |= BIT(r_idx); + q_vector->rx_ring_mask |= BIT(r_idx); wr32(hw, IAVF_VFINT_ITRN1(IAVF_RX_ITR, q_vector->reg_idx), q_vector->rx.current_itr >> 1); q_vector->rx.current_itr = q_vector->rx.target_itr; @@ -481,7 +481,7 @@ iavf_map_vector_to_txq(struct iavf_adapter *adapter, int v_idx, int t_idx) q_vector->tx.count++; q_vector->tx.next_update = jiffies + 1; q_vector->tx.target_itr = ITR_TO_REG(tx_ring->itr_setting); - q_vector->num_ringpairs++; + q_vector->tx_ring_mask |= BIT(t_idx); wr32(hw, IAVF_VFINT_ITRN1(IAVF_TX_ITR, q_vector->reg_idx), q_vector->tx.target_itr >> 1); q_vector->tx.current_itr = q_vector->tx.target_itr; @@ -509,6 +509,8 @@ static void iavf_map_rings_to_vectors(struct iavf_adapter *adapter) iavf_map_vector_to_rxq(adapter, vidx, ridx); iavf_map_vector_to_txq(adapter, vidx, ridx); + adapter->q_vectors[vidx].num_ringpairs++; + /* In the case where we have more queues than vectors, continue * round-robin on vectors until all queues are mapped. */ diff --git a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c index 3a031d8b9685e2..7e662587d89cb6 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c +++ b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c @@ -404,8 +404,8 @@ void iavf_map_queues(struct iavf_adapter *adapter) vecmap->vsi_id = adapter->vsi_res->vsi_id; vecmap->vector_id = v_idx + NONQ_VECS; - vecmap->txq_map = q_vector->ring_mask; - vecmap->rxq_map = q_vector->ring_mask; + vecmap->txq_map = q_vector->tx_ring_mask; + vecmap->rxq_map = q_vector->rx_ring_mask; vecmap->rxitr_idx = IAVF_RX_ITR; vecmap->txitr_idx = IAVF_TX_ITR; } From 455dee1cc9a8e88adadd117d4f9d27682ef97d4e Mon Sep 17 00:00:00 2001 From: Michal Kubiak Date: Tue, 29 Nov 2022 08:41:22 -0500 Subject: [PATCH 21/40] iavf: Prepare VIRTCHNL functions to support XDP The XDP and AF_XDP feature is initialized using .ndo functions. Those functions are always synchronous and may require some serious queues reconfiguration including changing the number of queues. Performing such a reconfiguration implies sending a bunch of VIRTCHNL messages to the PF in order to disable queues, re-enable and re-configure them, or update the RSS LUT. By definition, those VIRTCHNL messages are sent asynchronously, so the result of each VIRTCHNL operation can be received from the PF via admin queue after some time. Moreover, the previous implementation of some VIRTCHNL functions (e.g. 'iavf_disable_queues()' or 'iavf_enable_queues()' does not allow to call them selectively for specific queues only. In order to addres those problems and cover all scenarios of XDP and AF_XDP initialization, implement a polling mechanism with a timeout for blocking the execution of XDP .ndo functions until the result of VIRTCHNL operation on PF is known to the driver. Also, refactor the existing VIRTCHNL API by adding functions for selective queue enabling, disabling and configuration. 
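The blocking mode boils down to re-reading the admin queue every few
milliseconds until the expected opcode shows up or the time budget is spent.
A greatly simplified, standalone sketch of that loop (generic C with the
admin-queue read reduced to a callback and purely illustrative names; the
driver's actual implementation is iavf_poll_virtchnl_msg_timeout() added
below):

#include <errno.h>
#include <stdbool.h>
#include <time.h>

/* poll() returns true and fills *opcode when a message has been received */
typedef bool (*demo_poll_fn)(void *ctx, int *opcode);

static int demo_poll_for_opcode(demo_poll_fn poll, void *ctx, int wanted,
				unsigned int timeout_ms)
{
	const struct timespec step = { .tv_nsec = 10 * 1000 * 1000 };	/* 10 ms */
	unsigned int waited;
	int opcode;

	for (waited = 0; waited < timeout_ms; waited += 10) {
		if (poll(ctx, &opcode) && opcode == wanted)
			return 0;	/* PF answered the pending request */

		nanosleep(&step, NULL);	/* msleep(10) in the driver */
	}

	return -EBUSY;			/* the driver also reports -EBUSY on timeout */
}

Keeping the wait bounded lets the synchronous XDP .ndo callbacks fail cleanly
instead of blocking forever when the PF never responds.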
Signed-off-by: Michal Kubiak --- drivers/net/ethernet/intel/iavf/iavf.h | 17 +- drivers/net/ethernet/intel/iavf/iavf_main.c | 8 +- .../net/ethernet/intel/iavf/iavf_virtchnl.c | 400 ++++++++++++++++-- 3 files changed, 374 insertions(+), 51 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf.h b/drivers/net/ethernet/intel/iavf/iavf.h index 1d0f79a2f53f9b..899fa1ef162263 100644 --- a/drivers/net/ethernet/intel/iavf/iavf.h +++ b/drivers/net/ethernet/intel/iavf/iavf.h @@ -538,11 +538,17 @@ int iavf_send_vf_offload_vlan_v2_msg(struct iavf_adapter *adapter); void iavf_set_queue_vlan_tag_loc(struct iavf_adapter *adapter); u16 iavf_get_num_vlans_added(struct iavf_adapter *adapter); void iavf_irq_enable(struct iavf_adapter *adapter, bool flush); -void iavf_configure_queues(struct iavf_adapter *adapter); +int iavf_configure_selected_queues(struct iavf_adapter *adapter, u32 qp_mask, + bool wait); +int iavf_configure_queues(struct iavf_adapter *adapter, bool wait); void iavf_deconfigure_queues(struct iavf_adapter *adapter); -void iavf_enable_queues(struct iavf_adapter *adapter); -void iavf_disable_queues(struct iavf_adapter *adapter); -void iavf_map_queues(struct iavf_adapter *adapter); +int iavf_enable_queues(struct iavf_adapter *adapter, bool wait); +int iavf_disable_queues(struct iavf_adapter *adapter, bool wait); +int iavf_enable_selected_queues(struct iavf_adapter *adapter, u32 rx_queues, + u32 tx_queues, bool wait); +int iavf_disable_selected_queues(struct iavf_adapter *adapter, u32 rx_queues, + u32 tx_queues, bool wait); +int iavf_map_queues(struct iavf_adapter *adapter, bool wait); int iavf_request_queues(struct iavf_adapter *adapter, int num); void iavf_add_ether_addrs(struct iavf_adapter *adapter); void iavf_del_ether_addrs(struct iavf_adapter *adapter); @@ -557,9 +563,12 @@ void iavf_set_rss_key(struct iavf_adapter *adapter); void iavf_set_rss_lut(struct iavf_adapter *adapter); void iavf_enable_vlan_stripping(struct iavf_adapter *adapter); void iavf_disable_vlan_stripping(struct iavf_adapter *adapter); +int iavf_poll_for_link_status(struct iavf_adapter *adapter, unsigned int msecs); void iavf_virtchnl_completion(struct iavf_adapter *adapter, enum virtchnl_ops v_opcode, enum iavf_status v_retval, u8 *msg, u16 msglen); +int iavf_process_pending_pf_msg(struct iavf_adapter *adapter, + unsigned int timeout_msecs); int iavf_config_rss(struct iavf_adapter *adapter); int iavf_lan_add_device(struct iavf_adapter *adapter); int iavf_lan_del_device(struct iavf_adapter *adapter); diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 174a702ceedf78..3737ea19e48efd 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -1978,12 +1978,12 @@ static int iavf_process_aq_command(struct iavf_adapter *adapter) if (adapter->aq_required & IAVF_FLAG_AQ_GET_OFFLOAD_VLAN_V2_CAPS) return iavf_send_vf_offload_vlan_v2_msg(adapter); if (adapter->aq_required & IAVF_FLAG_AQ_DISABLE_QUEUES) { - iavf_disable_queues(adapter); + iavf_disable_queues(adapter, false); return 0; } if (adapter->aq_required & IAVF_FLAG_AQ_MAP_VECTORS) { - iavf_map_queues(adapter); + iavf_map_queues(adapter, false); return 0; } @@ -2018,12 +2018,12 @@ static int iavf_process_aq_command(struct iavf_adapter *adapter) } if (adapter->aq_required & IAVF_FLAG_AQ_CONFIGURE_QUEUES) { - iavf_configure_queues(adapter); + iavf_configure_queues(adapter, false); return 0; } if (adapter->aq_required & IAVF_FLAG_AQ_ENABLE_QUEUES) { - 
iavf_enable_queues(adapter); + iavf_enable_queues(adapter, false); return 0; } diff --git a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c index 7e662587d89cb6..9f25de3e2b6e78 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c +++ b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c @@ -7,6 +7,8 @@ #include "iavf_prototype.h" #include "iavf_client.h" +#define IAVF_VC_MSG_TIMEOUT_MS 300 + /** * iavf_send_pf_msg * @adapter: adapter structure @@ -52,6 +54,59 @@ int iavf_send_api_ver(struct iavf_adapter *adapter) sizeof(vvi)); } +/** + * iavf_poll_virtchnl_msg_timeout + * @hw: HW configuration structure + * @event: event to populate on success + * @op_to_poll: requested virtchnl op to poll for + * @msecs: timeout in milliseconds + * + * Initialize poll for virtchnl msg matching the requested_op. Returns 0 + * if a message of the correct opcode is in the queue or an error code + * if no message matching the op code is waiting and other failures + * (including timeout). In case of timeout -EBUSY error is returned. + */ +static int +iavf_poll_virtchnl_msg_timeout(struct iavf_hw *hw, + struct iavf_arq_event_info *event, + enum virtchnl_ops op_to_poll, + unsigned int msecs) +{ + unsigned int wait, delay = 10; + enum virtchnl_ops received_op; + enum iavf_status status; + u32 v_retval; + + for (wait = 0; wait < msecs; wait += delay) { + /* When the AQ is empty, iavf_clean_arq_element will be + * nonzero and after some delay this loop will check again + * if any message is added to the AQ. + */ + status = iavf_clean_arq_element(hw, event, NULL); + if (status == IAVF_ERR_ADMIN_QUEUE_NO_WORK) + goto wait_for_msg; + else if (status != IAVF_SUCCESS) + break; + received_op = + (enum virtchnl_ops)le32_to_cpu(event->desc.cookie_high); + if (op_to_poll == received_op) + break; +wait_for_msg: + msleep(delay); + status = IAVF_ERR_NOT_READY; + } + + if (status == IAVF_SUCCESS) { + v_retval = le32_to_cpu(event->desc.cookie_low); + v_retval = virtchnl_status_to_errno((enum virtchnl_status_code) + v_retval); + } else { + v_retval = iavf_status_to_errno(status); + } + + return v_retval; +} + /** * iavf_poll_virtchnl_msg * @hw: HW configuration structure @@ -87,6 +142,83 @@ iavf_poll_virtchnl_msg(struct iavf_hw *hw, struct iavf_arq_event_info *event, return virtchnl_status_to_errno((enum virtchnl_status_code)v_retval); } +/** + * iavf_process_pending_pf_msg + * @adapter: adapter structure + * @timeout_msecs: timeout in milliseconds + * + * Check if any VIRTCHNL message is currently pending and process it + * if needed. + * Poll the admin queue for the PF response and process it using + * a standard handler. + * If no PF response has been received within a given timeout, exit + * with an error. 
+ */ +int +iavf_process_pending_pf_msg(struct iavf_adapter *adapter, + unsigned int timeout_msecs) +{ + enum virtchnl_ops current_op = adapter->current_op; + struct iavf_hw *hw = &adapter->hw; + struct iavf_arq_event_info event; + enum virtchnl_ops v_op; + enum iavf_status v_ret; + int err; + + if (current_op == VIRTCHNL_OP_UNKNOWN) + return 0; + + event.buf_len = IAVF_MAX_AQ_BUF_SIZE; + event.msg_buf = kzalloc(IAVF_MAX_AQ_BUF_SIZE, GFP_KERNEL); + if (!event.msg_buf) + return -ENOMEM; + + err = iavf_poll_virtchnl_msg_timeout(hw, &event, current_op, + timeout_msecs); + if (err) + goto free_exit; + + v_op = (enum virtchnl_ops)le32_to_cpu(event.desc.cookie_high); + v_ret = (enum iavf_status)le32_to_cpu(event.desc.cookie_low); + + iavf_virtchnl_completion(adapter, v_op, v_ret, event.msg_buf, + event.msg_len); + +free_exit: + kfree(event.msg_buf); + + return err; +} + +/** + * iavf_get_vf_op_result + * @adapter: adapter structure + * @op: virtchnl operation + * @msecs: timeout in milliseconds + * + * Return a result of a given operation returned by PF + * or exit with timeout. + */ +static int iavf_get_vf_op_result(struct iavf_adapter *adapter, + enum virtchnl_ops op, + unsigned int msecs) +{ + struct iavf_hw *hw = &adapter->hw; + struct iavf_arq_event_info event; + int err; + + event.buf_len = IAVF_MAX_AQ_BUF_SIZE; + event.msg_buf = kzalloc(IAVF_MAX_AQ_BUF_SIZE, GFP_KERNEL); + if (!event.msg_buf) + return -ENOMEM; + + err = iavf_poll_virtchnl_msg_timeout(hw, &event, op, msecs); + kfree(event.msg_buf); + adapter->current_op = VIRTCHNL_OP_UNKNOWN; + + return err; +} + /** * iavf_verify_api_ver * @adapter: adapter structure @@ -263,16 +395,53 @@ int iavf_get_vf_vlan_v2_caps(struct iavf_adapter *adapter) } /** - * iavf_configure_queues + * iavf_set_qp_config_info + * @vqpi: virtchannel structure for queue pair configuration * @adapter: adapter structure + * @queue_index: index of queue pair in the adapter structure + * @max_frame: maximal frame size supported by the adapter * - * Request that the PF set up our (previously allocated) queues. + * Fill virtchannel queue pair configuration structure + * with data for the Rx and Tx queues of a given index. **/ -void iavf_configure_queues(struct iavf_adapter *adapter) +static void iavf_set_qp_config_info(struct virtchnl_queue_pair_info *vqpi, + struct iavf_adapter *adapter, + int queue_index, int max_frame) +{ + struct iavf_ring *txq = &adapter->tx_rings[queue_index]; + struct iavf_ring *rxq = &adapter->rx_rings[queue_index]; + + vqpi->txq.vsi_id = adapter->vsi_res->vsi_id; + vqpi->txq.queue_id = queue_index; + vqpi->txq.ring_len = txq->count; + vqpi->txq.dma_ring_addr = txq->dma; + + vqpi->rxq.vsi_id = adapter->vsi_res->vsi_id; + vqpi->rxq.queue_id = queue_index; + vqpi->rxq.ring_len = rxq->count; + vqpi->rxq.dma_ring_addr = rxq->dma; + vqpi->rxq.max_pkt_size = max_frame; + vqpi->rxq.databuffer_size = LIBIE_RX_BUF_LEN; +} + +/** + * iavf_configure_selected_queues + * @adapter: adapter structure + * @qp_mask: mask of queue pairs to configure + * @wait: if true, wait until the request is completed + * + * Request PF to set up our selected (previously allocated) queues. + * Returns 0 if the command succeeds or negative value in case of error. + * + * Note: The caller must ensure that the calling context has taken + * 'adapter->crit_lock' mutex when 'wait' parameter is set to true. 
+ */ +int iavf_configure_selected_queues(struct iavf_adapter *adapter, u32 qp_mask, + bool wait) { + unsigned long num_qps_to_config, mask = qp_mask; + u32 idx, max_frame = adapter->vf_res->max_mtu; struct virtchnl_vsi_queue_config_info *vqci; - u32 i, max_frame = adapter->vf_res->max_mtu; - int pairs = adapter->num_active_queues; struct virtchnl_queue_pair_info *vqpi; size_t len; @@ -280,33 +449,26 @@ void iavf_configure_queues(struct iavf_adapter *adapter) if (adapter->current_op != VIRTCHNL_OP_UNKNOWN) { /* bail because we already have a command pending */ - dev_err(&adapter->pdev->dev, "Cannot configure queues, command %d pending\n", + dev_err(&adapter->pdev->dev, + "Cannot configure queues, command %d pending\n", adapter->current_op); - return; + return -EBUSY; } + num_qps_to_config = hweight_long(mask); adapter->current_op = VIRTCHNL_OP_CONFIG_VSI_QUEUES; - len = struct_size(vqci, qpair, pairs); + len = struct_size(vqci, qpair, num_qps_to_config); vqci = kzalloc(len, GFP_KERNEL); if (!vqci) - return; + return -ENOMEM; vqci->vsi_id = adapter->vsi_res->vsi_id; - vqci->num_queue_pairs = pairs; + vqci->num_queue_pairs = num_qps_to_config; vqpi = vqci->qpair; /* Size check is not needed here - HW max is 16 queue pairs, and we * can fit info for 31 of them into the AQ buffer before it overflows. */ - for (i = 0; i < pairs; i++) { - vqpi->txq.vsi_id = vqci->vsi_id; - vqpi->txq.queue_id = i; - vqpi->txq.ring_len = adapter->tx_rings[i].count; - vqpi->txq.dma_ring_addr = adapter->tx_rings[i].dma; - vqpi->rxq.vsi_id = vqci->vsi_id; - vqpi->rxq.queue_id = i; - vqpi->rxq.ring_len = adapter->rx_rings[i].count; - vqpi->rxq.dma_ring_addr = adapter->rx_rings[i].dma; - vqpi->rxq.max_pkt_size = max_frame; - vqpi->rxq.databuffer_size = LIBIE_RX_BUF_LEN; + for_each_set_bit(idx, &mask, adapter->num_active_queues) { + iavf_set_qp_config_info(vqpi, adapter, idx, max_frame); vqpi++; } @@ -314,66 +476,166 @@ void iavf_configure_queues(struct iavf_adapter *adapter) iavf_send_pf_msg(adapter, VIRTCHNL_OP_CONFIG_VSI_QUEUES, (u8 *)vqci, len); kfree(vqci); + + if (wait) + return iavf_get_vf_op_result(adapter, + VIRTCHNL_OP_CONFIG_VSI_QUEUES, + IAVF_VC_MSG_TIMEOUT_MS); + return 0; } /** - * iavf_enable_queues + * iavf_configure_queues * @adapter: adapter structure + * @wait: if true, wait until the request is completed * - * Request that the PF enable all of our queues. - **/ -void iavf_enable_queues(struct iavf_adapter *adapter) + * Send a request to PF to set up all allocated queues. + * Returns 0 if the command succeeds or negative value in case of error. + * + * Note: The caller must ensure that the calling context has taken + * 'adapter->crit_lock' mutex when 'wait' parameter is set to true. + */ +int iavf_configure_queues(struct iavf_adapter *adapter, bool wait) +{ + int pairs = adapter->num_active_queues; + u32 qpair_mask = BIT(pairs) - 1; + + return iavf_configure_selected_queues(adapter, qpair_mask, wait); +} + +/** + * iavf_enable_selected_queues + * @adapter: adapter structure + * @rx_queues: mask of Rx queues + * @tx_queues: mask of Tx queues + * @wait: if true, wait until the request is completed + * + * Send a request to PF to enable selected queues. + * Returns 0 if the command succeeds or negative value in case of error. + * + * Note: The caller must ensure that the calling context has taken + * 'adapter->crit_lock' mutex when 'wait' parameter is set to true. 
+ */ +int iavf_enable_selected_queues(struct iavf_adapter *adapter, u32 rx_queues, + u32 tx_queues, bool wait) { struct virtchnl_queue_select vqs; if (adapter->current_op != VIRTCHNL_OP_UNKNOWN) { /* bail because we already have a command pending */ - dev_err(&adapter->pdev->dev, "Cannot enable queues, command %d pending\n", + dev_err(&adapter->pdev->dev, + "Cannot enable queues, command %d pending\n", adapter->current_op); - return; + return -EBUSY; } adapter->current_op = VIRTCHNL_OP_ENABLE_QUEUES; vqs.vsi_id = adapter->vsi_res->vsi_id; - vqs.tx_queues = BIT(adapter->num_active_queues) - 1; - vqs.rx_queues = vqs.tx_queues; + vqs.tx_queues = tx_queues; + vqs.rx_queues = rx_queues; adapter->aq_required &= ~IAVF_FLAG_AQ_ENABLE_QUEUES; iavf_send_pf_msg(adapter, VIRTCHNL_OP_ENABLE_QUEUES, (u8 *)&vqs, sizeof(vqs)); + + if (wait) + return iavf_get_vf_op_result(adapter, VIRTCHNL_OP_ENABLE_QUEUES, + IAVF_VC_MSG_TIMEOUT_MS); + return 0; } /** - * iavf_disable_queues + * iavf_disable_selected_queues * @adapter: adapter structure + * @rx_queues: mask of Rx queues + * @tx_queues: mask of Tx queues + * @wait: if true, wait until the request is completed * - * Request that the PF disable all of our queues. - **/ -void iavf_disable_queues(struct iavf_adapter *adapter) + * Send a request to PF to disable selected queues. + * Returns 0 if the command succeeds or negative value in case of error. + * + * Note: The caller must ensure that the calling context has taken + * 'adapter->crit_lock' mutex when 'wait' parameter is set to true. + */ +int iavf_disable_selected_queues(struct iavf_adapter *adapter, u32 rx_queues, + u32 tx_queues, bool wait) { struct virtchnl_queue_select vqs; if (adapter->current_op != VIRTCHNL_OP_UNKNOWN) { /* bail because we already have a command pending */ - dev_err(&adapter->pdev->dev, "Cannot disable queues, command %d pending\n", + dev_err(&adapter->pdev->dev, + "Cannot disable queues, command %d pending\n", adapter->current_op); - return; + return -EBUSY; } adapter->current_op = VIRTCHNL_OP_DISABLE_QUEUES; vqs.vsi_id = adapter->vsi_res->vsi_id; - vqs.tx_queues = BIT(adapter->num_active_queues) - 1; - vqs.rx_queues = vqs.tx_queues; + vqs.tx_queues = tx_queues; + vqs.rx_queues = rx_queues; adapter->aq_required &= ~IAVF_FLAG_AQ_DISABLE_QUEUES; iavf_send_pf_msg(adapter, VIRTCHNL_OP_DISABLE_QUEUES, (u8 *)&vqs, sizeof(vqs)); + + if (wait) + return iavf_get_vf_op_result(adapter, + VIRTCHNL_OP_DISABLE_QUEUES, + IAVF_VC_MSG_TIMEOUT_MS); + return 0; +} + +/** + * iavf_enable_queues + * @adapter: adapter structure + * @wait: if true, wait until the request is completed + * + * Send a request to PF to enable all allocated queues. + * Returns 0 if the command succeeds or negative value in case of error. + * + * Note: The caller must ensure that the calling context has taken + * 'adapter->crit_lock' mutex when 'wait' parameter is set to true. + */ +int iavf_enable_queues(struct iavf_adapter *adapter, bool wait) +{ + u32 num_tx_queues = adapter->num_active_queues; + u32 rx_queues = BIT(adapter->num_active_queues) - 1; + u32 tx_queues = BIT(num_tx_queues) - 1; + + return iavf_enable_selected_queues(adapter, rx_queues, tx_queues, wait); +} + +/** + * iavf_disable_queues + * @adapter: adapter structure + * @wait: if true, wait until the request is completed + * + * Send a request to PF to disable all allocated queues. + * Returns 0 if the command succeeds or negative value in case of error. 
+ * + * Note: The caller must ensure that the calling context has taken + * 'adapter->crit_lock' mutex when 'wait' parameter is set to true. + */ +int iavf_disable_queues(struct iavf_adapter *adapter, bool wait) +{ + u32 num_tx_queues = adapter->num_active_queues; + u32 rx_queues = BIT(adapter->num_active_queues) - 1; + u32 tx_queues = BIT(num_tx_queues) - 1; + + return iavf_disable_selected_queues(adapter, rx_queues, tx_queues, + wait); } /** * iavf_map_queues * @adapter: adapter structure + * @wait: if true, wait until the request is completed * - * Request that the PF map queues to interrupt vectors. Misc causes, including - * admin queue, are always mapped to vector 0. - **/ -void iavf_map_queues(struct iavf_adapter *adapter) + * Send a request to PF to update the mapping queues to interrupt vectors. + * Misc causes, including admin queue, are always mapped to vector 0. + * Returns 0 if the command succeeds or negative value in case of error. + * + * Note: The caller must ensure that the calling context has taken + * 'adapter->crit_lock' mutex when 'wait' parameter is set to true. + */ +int iavf_map_queues(struct iavf_adapter *adapter, bool wait) { struct virtchnl_irq_map_info *vimi; struct virtchnl_vector_map *vecmap; @@ -385,7 +647,7 @@ void iavf_map_queues(struct iavf_adapter *adapter) /* bail because we already have a command pending */ dev_err(&adapter->pdev->dev, "Cannot map queues to vectors, command %d pending\n", adapter->current_op); - return; + return -EBUSY; } adapter->current_op = VIRTCHNL_OP_CONFIG_IRQ_MAP; @@ -394,7 +656,7 @@ void iavf_map_queues(struct iavf_adapter *adapter) len = struct_size(vimi, vecmap, adapter->num_msix_vectors); vimi = kzalloc(len, GFP_KERNEL); if (!vimi) - return; + return -ENOMEM; vimi->num_vectors = adapter->num_msix_vectors; /* Queue vectors first */ @@ -420,6 +682,12 @@ void iavf_map_queues(struct iavf_adapter *adapter) iavf_send_pf_msg(adapter, VIRTCHNL_OP_CONFIG_IRQ_MAP, (u8 *)vimi, len); kfree(vimi); + + if (wait) + return iavf_get_vf_op_result(adapter, + VIRTCHNL_OP_CONFIG_IRQ_MAP, + IAVF_VC_MSG_TIMEOUT_MS); + return 0; } /** @@ -1880,6 +2148,52 @@ static void iavf_netdev_features_vlan_strip_set(struct net_device *netdev, netdev->features &= ~NETIF_F_HW_VLAN_CTAG_RX; } +/** + * iavf_poll_for_link_status - poll for PF notification about link status + * @adapter: adapter structure + * @msecs: timeout in milliseconds + * + * Returns: + * 0 - if notification about link down was received, + * 1 - if notification about link up was received, + * or negative error code in case of error. + */ +int iavf_poll_for_link_status(struct iavf_adapter *adapter, unsigned int msecs) +{ + struct iavf_hw *hw = &adapter->hw; + struct iavf_arq_event_info event; + struct virtchnl_pf_event *vpe; + int ret; + + event.buf_len = IAVF_MAX_AQ_BUF_SIZE; + event.msg_buf = kzalloc(IAVF_MAX_AQ_BUF_SIZE, GFP_KERNEL); + if (!event.msg_buf) + return -ENOMEM; + + ret = iavf_poll_virtchnl_msg_timeout(hw, &event, VIRTCHNL_OP_EVENT, + msecs); + if (ret) + goto virtchnl_msg_err; + + vpe = (struct virtchnl_pf_event *)event.msg_buf; + if (vpe->event == VIRTCHNL_EVENT_LINK_CHANGE) { + bool link_up = iavf_get_vpe_link_status(adapter, vpe); + + iavf_set_adapter_link_speed_from_vpe(adapter, vpe); + + ret = link_up ? 
1 : 0; + } else { + iavf_virtchnl_completion(adapter, VIRTCHNL_OP_EVENT, 0, + event.msg_buf, event.msg_len); + ret = -EBUSY; + } + +virtchnl_msg_err: + kfree(event.msg_buf); + + return ret; +} + /** * iavf_virtchnl_completion * @adapter: adapter structure From d1aea70d76e35be5f62bc26c346ccd303d7b3aff Mon Sep 17 00:00:00 2001 From: Michal Kubiak Date: Tue, 29 Nov 2022 16:32:05 +0100 Subject: [PATCH 22/40] iavf: Refactor ring initialization functions to handle XDP Introduce modular functions to allocate and initialize Rx and Tx rings in order to prepare the initialization procedure to easily fit the XDP setup. Signed-off-by: Michal Kubiak --- drivers/net/ethernet/intel/iavf/iavf.h | 6 +- drivers/net/ethernet/intel/iavf/iavf_main.c | 237 ++++++++++++-------- drivers/net/ethernet/intel/iavf/iavf_txrx.c | 8 +- 3 files changed, 149 insertions(+), 102 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf.h b/drivers/net/ethernet/intel/iavf/iavf.h index 899fa1ef162263..4e44834ff9d8fa 100644 --- a/drivers/net/ethernet/intel/iavf/iavf.h +++ b/drivers/net/ethernet/intel/iavf/iavf.h @@ -264,8 +264,8 @@ struct iavf_adapter { /* Lock to protect accesses to MAC and VLAN lists */ spinlock_t mac_vlan_list_lock; char misc_vector_name[IFNAMSIZ + 9]; - int num_active_queues; - int num_req_queues; + u32 num_active_queues; + u32 num_req_queues; /* TX */ struct iavf_ring *tx_rings; @@ -569,6 +569,8 @@ void iavf_virtchnl_completion(struct iavf_adapter *adapter, enum iavf_status v_retval, u8 *msg, u16 msglen); int iavf_process_pending_pf_msg(struct iavf_adapter *adapter, unsigned int timeout_msecs); +void iavf_configure_rx_ring(struct iavf_adapter *adapter, + struct iavf_ring *rx_ring); int iavf_config_rss(struct iavf_adapter *adapter); int iavf_lan_add_device(struct iavf_adapter *adapter); int iavf_lan_del_device(struct iavf_adapter *adapter); diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 3737ea19e48efd..24c1a92998b177 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -705,18 +705,31 @@ static void iavf_configure_tx(struct iavf_adapter *adapter) adapter->tx_rings[i].tail = hw->hw_addr + IAVF_QTX_TAIL1(i); } +/** + * iavf_configure_rx_ring - Configure a single Rx ring + * @adapter: board private structure + * @rx_ring: Rx ring to be configured + * @rx_buf_len: buffer length that shall be used for the given Rx ring. + */ +void iavf_configure_rx_ring(struct iavf_adapter *adapter, + struct iavf_ring *rx_ring) +{ + u32 queue_idx = rx_ring->queue_index; + + rx_ring->tail = adapter->hw.hw_addr + IAVF_QRX_TAIL1(queue_idx); + iavf_alloc_rx_pages(rx_ring); +} + /** * iavf_configure_rx - Configure Receive Unit after Reset * @adapter: board private structure * * Configure the Rx unit of the MAC after a reset. 
- **/ + */ static void iavf_configure_rx(struct iavf_adapter *adapter) { - struct iavf_hw *hw = &adapter->hw; - for (u32 i = 0; i < adapter->num_active_queues; i++) - adapter->rx_rings[i].tail = hw->hw_addr + IAVF_QRX_TAIL1(i); + iavf_configure_rx_ring(adapter, &adapter->rx_rings[i]); } /** @@ -1208,19 +1221,12 @@ static void iavf_napi_disable_all(struct iavf_adapter *adapter) static void iavf_configure(struct iavf_adapter *adapter) { struct net_device *netdev = adapter->netdev; - int i; iavf_set_rx_mode(netdev); iavf_configure_tx(adapter); iavf_configure_rx(adapter); adapter->aq_required |= IAVF_FLAG_AQ_CONFIGURE_QUEUES; - - for (i = 0; i < adapter->num_active_queues; i++) { - struct iavf_ring *ring = &adapter->rx_rings[i]; - - iavf_alloc_rx_pages(ring); - } } /** @@ -1459,6 +1465,72 @@ static void iavf_free_queues(struct iavf_adapter *adapter) adapter->rx_rings = NULL; } +/** + * iavf_set_rx_queue_vlan_tag_loc - set location for VLAN tag offload in Rx + * @adapter: board private structure + * @rx_ring: Rx ring where VLAN tag offload for VLAN will be set + * + * Helper function for setting VLAN tag offload location in a given Rx ring. + */ +static void iavf_set_rx_queue_vlan_tag_loc(struct iavf_adapter *adapter, + struct iavf_ring *rx_ring) +{ + struct virtchnl_vlan_supported_caps *caps; + + /* prevent multiple L2TAG bits being set after VFR */ + rx_ring->flags &= + ~(IAVF_TXRX_FLAGS_VLAN_TAG_LOC_L2TAG1 | + IAVF_RXR_FLAGS_VLAN_TAG_LOC_L2TAG2_2); + + if (VLAN_ALLOWED(adapter)) { + rx_ring->flags |= IAVF_TXRX_FLAGS_VLAN_TAG_LOC_L2TAG1; + return; + } + + if (!VLAN_V2_ALLOWED(adapter)) + return; + + caps = &adapter->vlan_v2_caps.offloads.stripping_support; + + if ((caps->outer | caps->inner) & VIRTCHNL_VLAN_TAG_LOCATION_L2TAG1) + rx_ring->flags |= IAVF_TXRX_FLAGS_VLAN_TAG_LOC_L2TAG1; + else if ((caps->outer | caps->inner) & VIRTCHNL_VLAN_TAG_LOCATION_L2TAG2_2) + rx_ring->flags |= IAVF_RXR_FLAGS_VLAN_TAG_LOC_L2TAG2_2; +} + +/** + * iavf_set_tx_queue_vlan_tag_loc - set location for VLAN tag offload in Tx + * @adapter: board private structure + * @tx_ring: Tx ring where VLAN tag offload for VLAN will be set + * + * Helper function for setting VLAN tag offload location in a given Tx ring. 
+ */ +static void iavf_set_tx_queue_vlan_tag_loc(struct iavf_adapter *adapter, + struct iavf_ring *tx_ring) +{ + struct virtchnl_vlan_supported_caps *caps; + + /* prevent multiple L2TAG bits being set after VFR */ + tx_ring->flags &= + ~(IAVF_TXRX_FLAGS_VLAN_TAG_LOC_L2TAG1 | + IAVF_TXR_FLAGS_VLAN_TAG_LOC_L2TAG2); + + if (VLAN_ALLOWED(adapter)) { + tx_ring->flags |= IAVF_TXRX_FLAGS_VLAN_TAG_LOC_L2TAG1; + return; + } + + if (!VLAN_V2_ALLOWED(adapter)) + return; + + caps = &adapter->vlan_v2_caps.offloads.insertion_support; + + if ((caps->outer | caps->inner) & VIRTCHNL_VLAN_TAG_LOCATION_L2TAG1) + tx_ring->flags |= IAVF_TXRX_FLAGS_VLAN_TAG_LOC_L2TAG1; + else if ((caps->outer | caps->inner) & VIRTCHNL_VLAN_TAG_LOCATION_L2TAG2) + tx_ring->flags |= IAVF_TXR_FLAGS_VLAN_TAG_LOC_L2TAG2; +} + /** * iavf_set_queue_vlan_tag_loc - set location for VLAN tag offload * @adapter: board private structure @@ -1473,72 +1545,58 @@ void iavf_set_queue_vlan_tag_loc(struct iavf_adapter *adapter) int i; for (i = 0; i < adapter->num_active_queues; i++) { - struct iavf_ring *tx_ring = &adapter->tx_rings[i]; - struct iavf_ring *rx_ring = &adapter->rx_rings[i]; - - /* prevent multiple L2TAG bits being set after VFR */ - tx_ring->flags &= - ~(IAVF_TXRX_FLAGS_VLAN_TAG_LOC_L2TAG1 | - IAVF_TXR_FLAGS_VLAN_TAG_LOC_L2TAG2); - rx_ring->flags &= - ~(IAVF_TXRX_FLAGS_VLAN_TAG_LOC_L2TAG1 | - IAVF_RXR_FLAGS_VLAN_TAG_LOC_L2TAG2_2); - - if (VLAN_ALLOWED(adapter)) { - tx_ring->flags |= IAVF_TXRX_FLAGS_VLAN_TAG_LOC_L2TAG1; - rx_ring->flags |= IAVF_TXRX_FLAGS_VLAN_TAG_LOC_L2TAG1; - } else if (VLAN_V2_ALLOWED(adapter)) { - struct virtchnl_vlan_supported_caps *stripping_support; - struct virtchnl_vlan_supported_caps *insertion_support; - - stripping_support = - &adapter->vlan_v2_caps.offloads.stripping_support; - insertion_support = - &adapter->vlan_v2_caps.offloads.insertion_support; - - if (stripping_support->outer) { - if (stripping_support->outer & - VIRTCHNL_VLAN_TAG_LOCATION_L2TAG1) - rx_ring->flags |= - IAVF_TXRX_FLAGS_VLAN_TAG_LOC_L2TAG1; - else if (stripping_support->outer & - VIRTCHNL_VLAN_TAG_LOCATION_L2TAG2_2) - rx_ring->flags |= - IAVF_RXR_FLAGS_VLAN_TAG_LOC_L2TAG2_2; - } else if (stripping_support->inner) { - if (stripping_support->inner & - VIRTCHNL_VLAN_TAG_LOCATION_L2TAG1) - rx_ring->flags |= - IAVF_TXRX_FLAGS_VLAN_TAG_LOC_L2TAG1; - else if (stripping_support->inner & - VIRTCHNL_VLAN_TAG_LOCATION_L2TAG2_2) - rx_ring->flags |= - IAVF_RXR_FLAGS_VLAN_TAG_LOC_L2TAG2_2; - } - - if (insertion_support->outer) { - if (insertion_support->outer & - VIRTCHNL_VLAN_TAG_LOCATION_L2TAG1) - tx_ring->flags |= - IAVF_TXRX_FLAGS_VLAN_TAG_LOC_L2TAG1; - else if (insertion_support->outer & - VIRTCHNL_VLAN_TAG_LOCATION_L2TAG2) - tx_ring->flags |= - IAVF_TXR_FLAGS_VLAN_TAG_LOC_L2TAG2; - } else if (insertion_support->inner) { - if (insertion_support->inner & - VIRTCHNL_VLAN_TAG_LOCATION_L2TAG1) - tx_ring->flags |= - IAVF_TXRX_FLAGS_VLAN_TAG_LOC_L2TAG1; - else if (insertion_support->inner & - VIRTCHNL_VLAN_TAG_LOCATION_L2TAG2) - tx_ring->flags |= - IAVF_TXR_FLAGS_VLAN_TAG_LOC_L2TAG2; - } - } + iavf_set_rx_queue_vlan_tag_loc(adapter, &adapter->rx_rings[i]); + iavf_set_tx_queue_vlan_tag_loc(adapter, &adapter->tx_rings[i]); } } +/** + * iavf_init_rx_ring - Init pointers and flags for a given Rx ring + * @adapter: board private structure to initialize + * @ring_index: index of the ring to be initialized + * + * Init all basic pointers and flags in a newly allocated Rx ring. 
+ */ +static void iavf_init_rx_ring(struct iavf_adapter *adapter, + int ring_index) +{ + struct iavf_ring *rx_ring = &adapter->rx_rings[ring_index]; + + rx_ring->vsi = &adapter->vsi; + rx_ring->queue_index = ring_index; + rx_ring->netdev = adapter->netdev; + rx_ring->dev = &adapter->pdev->dev; + rx_ring->count = adapter->rx_desc_count; + rx_ring->itr_setting = IAVF_ITR_RX_DEF; +} + +/** + * iavf_init_tx_ring - Init pointers and flags for a given Tx ring + * @adapter: board private structure to initialize + * @ring_index: index of the ring to be initialized + * @xdp_ring: set to true if the ring is XDP Tx queue + * + * Init all basic pointers and flags in a newly allocated Tx ring. + */ +static void iavf_init_tx_ring(struct iavf_adapter *adapter, int ring_index) +{ + struct iavf_ring *tx_ring = &adapter->tx_rings[ring_index]; + + tx_ring->vsi = &adapter->vsi; + tx_ring->queue_index = ring_index; + tx_ring->netdev = adapter->netdev; + tx_ring->dev = &adapter->pdev->dev; + tx_ring->count = adapter->tx_desc_count; + tx_ring->itr_setting = IAVF_ITR_TX_DEF; + + tx_ring->flags = 0; + + if (adapter->flags & IAVF_FLAG_WB_ON_ITR_CAPABLE) + tx_ring->flags |= IAVF_TXR_FLAGS_WB_ON_ITR; + + u64_stats_init(&tx_ring->sq_stats.syncp); +} + /** * iavf_alloc_queues - Allocate memory for all rings * @adapter: board private structure to initialize @@ -1549,7 +1607,8 @@ void iavf_set_queue_vlan_tag_loc(struct iavf_adapter *adapter) **/ static int iavf_alloc_queues(struct iavf_adapter *adapter) { - int i, num_active_queues; + u32 num_active_queues; + int i; /* If we're in reset reallocating queues we don't actually know yet for * certain the PF gave us the number of queues we asked for but we'll @@ -1566,7 +1625,6 @@ static int iavf_alloc_queues(struct iavf_adapter *adapter) adapter->vsi_res->num_queue_pairs, (int)(num_online_cpus())); - adapter->tx_rings = kcalloc(num_active_queues, sizeof(struct iavf_ring), GFP_KERNEL); if (!adapter->tx_rings) @@ -1576,32 +1634,13 @@ static int iavf_alloc_queues(struct iavf_adapter *adapter) if (!adapter->rx_rings) goto err_out; - for (i = 0; i < num_active_queues; i++) { - struct iavf_ring *tx_ring; - struct iavf_ring *rx_ring; - - tx_ring = &adapter->tx_rings[i]; - - tx_ring->queue_index = i; - tx_ring->netdev = adapter->netdev; - tx_ring->dev = &adapter->pdev->dev; - tx_ring->count = adapter->tx_desc_count; - tx_ring->itr_setting = IAVF_ITR_TX_DEF; - if (adapter->flags & IAVF_FLAG_WB_ON_ITR_CAPABLE) - tx_ring->flags |= IAVF_TXR_FLAGS_WB_ON_ITR; - u64_stats_init(&tx_ring->sq_stats.syncp); + adapter->num_active_queues = num_active_queues; - rx_ring = &adapter->rx_rings[i]; - rx_ring->queue_index = i; - rx_ring->netdev = adapter->netdev; - rx_ring->dev = &adapter->pdev->dev; - rx_ring->count = adapter->rx_desc_count; - rx_ring->itr_setting = IAVF_ITR_RX_DEF; - u64_stats_init(&rx_ring->rq_stats.syncp); + for (i = 0; i < num_active_queues; i++) { + iavf_init_tx_ring(adapter, i); + iavf_init_rx_ring(adapter, i); } - adapter->num_active_queues = num_active_queues; - iavf_set_queue_vlan_tag_loc(adapter); return 0; diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c index aa0009c853c453..a7f8c9ad87eae2 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c @@ -648,7 +648,8 @@ static void iavf_update_itr(struct iavf_q_vector *q_vector, int iavf_setup_tx_descriptors(struct iavf_ring *tx_ring) { struct device *dev = tx_ring->dev; - int bi_size; + struct iavf_tx_desc *tx_desc; + int 
bi_size, j; if (!dev) return -ENOMEM; @@ -674,6 +675,11 @@ int iavf_setup_tx_descriptors(struct iavf_ring *tx_ring) tx_ring->next_to_use = 0; tx_ring->next_to_clean = 0; tx_ring->prev_pkt_ctr = -1; + for (j = 0; j < tx_ring->count; j++) { + tx_desc = IAVF_TX_DESC(tx_ring, j); + tx_desc->cmd_type_offset_bsz = 0; + } + return 0; err: From 76bf4988f53b6f2bfb507bd8578dfbbf40bc5b6f Mon Sep 17 00:00:00 2001 From: Michal Kubiak Date: Fri, 2 Dec 2022 14:34:16 +0100 Subject: [PATCH 23/40] iavf: Prepare rings to support XDP Extend basic structures of the driver (e.g. 'iavf_adapter', 'iavf_ring') by adding members necessary to support XDP. Register those members using required functions from BPF API. Implement a support for XDP_TX and XDP_REDIRECT actions by adding additional XDP Tx queues to transmit packets without interferring a regular Tx traffic. Finally, add required XDP setup and release calls to queue allocation and deallocation functions respectively. Signed-off-by: Michal Kubiak Signed-off-by: Larysa Zaremba --- drivers/net/ethernet/intel/iavf/iavf.h | 16 ++ drivers/net/ethernet/intel/iavf/iavf_main.c | 138 ++++++++++++++++-- drivers/net/ethernet/intel/iavf/iavf_txrx.c | 28 +++- drivers/net/ethernet/intel/iavf/iavf_txrx.h | 11 +- .../net/ethernet/intel/iavf/iavf_virtchnl.c | 43 +++++- 5 files changed, 203 insertions(+), 33 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf.h b/drivers/net/ethernet/intel/iavf/iavf.h index 4e44834ff9d8fa..338bad44358a51 100644 --- a/drivers/net/ethernet/intel/iavf/iavf.h +++ b/drivers/net/ethernet/intel/iavf/iavf.h @@ -4,6 +4,7 @@ #ifndef _IAVF_H_ #define _IAVF_H_ +#include #include #include #include @@ -33,6 +34,7 @@ #include #include #include +#include #include "iavf_type.h" #include @@ -265,10 +267,13 @@ struct iavf_adapter { spinlock_t mac_vlan_list_lock; char misc_vector_name[IFNAMSIZ + 9]; u32 num_active_queues; + u32 num_xdp_tx_queues; u32 num_req_queues; + struct bpf_prog *xdp_prog; /* TX */ struct iavf_ring *tx_rings; + struct iavf_ring *xdp_rings; u32 tx_timeout_count; u32 tx_desc_count; @@ -511,6 +516,17 @@ static inline void iavf_change_state(struct iavf_adapter *adapter, iavf_state_str(adapter->state)); } +/** + * iavf_adapter_xdp_active - Determine if XDP program is loaded + * @adapter: board private structure + * + * Returns true if XDP program is loaded on a given adapter. + */ +static inline bool iavf_adapter_xdp_active(struct iavf_adapter *adapter) +{ + return !!READ_ONCE(adapter->xdp_prog); +} + int iavf_up(struct iavf_adapter *adapter); void iavf_down(struct iavf_adapter *adapter); int iavf_process_config(struct iavf_adapter *adapter); diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 24c1a92998b177..6336ec35bd8605 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -466,12 +466,15 @@ iavf_map_vector_to_rxq(struct iavf_adapter *adapter, int v_idx, int r_idx) * @adapter: board private structure * @v_idx: interrupt number * @t_idx: queue number - **/ + * @xdpq: set to true if Tx queue is XDP Tx queue + */ static void -iavf_map_vector_to_txq(struct iavf_adapter *adapter, int v_idx, int t_idx) +iavf_map_vector_to_txq(struct iavf_adapter *adapter, int v_idx, int t_idx, + bool xdpq) { + struct iavf_ring *tx_ring = xdpq ? 
&adapter->xdp_rings[t_idx] + : &adapter->tx_rings[t_idx]; struct iavf_q_vector *q_vector = &adapter->q_vectors[v_idx]; - struct iavf_ring *tx_ring = &adapter->tx_rings[t_idx]; struct iavf_hw *hw = &adapter->hw; tx_ring->q_vector = q_vector; @@ -481,7 +484,7 @@ iavf_map_vector_to_txq(struct iavf_adapter *adapter, int v_idx, int t_idx) q_vector->tx.count++; q_vector->tx.next_update = jiffies + 1; q_vector->tx.target_itr = ITR_TO_REG(tx_ring->itr_setting); - q_vector->tx_ring_mask |= BIT(t_idx); + q_vector->tx_ring_mask |= BIT(tx_ring->queue_index); wr32(hw, IAVF_VFINT_ITRN1(IAVF_TX_ITR, q_vector->reg_idx), q_vector->tx.target_itr >> 1); q_vector->tx.current_itr = q_vector->tx.target_itr; @@ -507,7 +510,9 @@ static void iavf_map_rings_to_vectors(struct iavf_adapter *adapter) for (; ridx < rings_remaining; ridx++) { iavf_map_vector_to_rxq(adapter, vidx, ridx); - iavf_map_vector_to_txq(adapter, vidx, ridx); + iavf_map_vector_to_txq(adapter, vidx, ridx, false); + if (iavf_adapter_xdp_active(adapter)) + iavf_map_vector_to_txq(adapter, vidx, ridx, true); adapter->q_vectors[vidx].num_ringpairs++; @@ -699,10 +704,13 @@ static void iavf_free_misc_irq(struct iavf_adapter *adapter) static void iavf_configure_tx(struct iavf_adapter *adapter) { struct iavf_hw *hw = &adapter->hw; - int i; + int i, j; - for (i = 0; i < adapter->num_active_queues; i++) - adapter->tx_rings[i].tail = hw->hw_addr + IAVF_QTX_TAIL1(i); + for (i = 0, j = 0; i < adapter->num_active_queues; i++, j++) + adapter->tx_rings[i].tail = hw->hw_addr + IAVF_QTX_TAIL1(j); + + for (i = 0; i < adapter->num_xdp_tx_queues; i++, j++) + adapter->xdp_rings[i].tail = hw->hw_addr + IAVF_QTX_TAIL1(j); } /** @@ -715,8 +723,22 @@ void iavf_configure_rx_ring(struct iavf_adapter *adapter, struct iavf_ring *rx_ring) { u32 queue_idx = rx_ring->queue_index; + int err; rx_ring->tail = adapter->hw.hw_addr + IAVF_QRX_TAIL1(queue_idx); + + if (!xdp_rxq_info_is_reg(&rx_ring->xdp_rxq)) + err = xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev, + rx_ring->queue_index, + rx_ring->q_vector->napi.napi_id); + + err = xdp_rxq_info_reg_mem_model(&rx_ring->xdp_rxq, MEM_TYPE_PAGE_POOL, + rx_ring->pool); + if (err) + netdev_err(adapter->netdev, "Could not register XDP memory model for RX queue %u, error: %d\n", + queue_idx, err); + + RCU_INIT_POINTER(rx_ring->xdp_prog, adapter->xdp_prog); iavf_alloc_rx_pages(rx_ring); } @@ -1448,6 +1470,19 @@ iavf_acquire_msix_vectors(struct iavf_adapter *adapter, int vectors) return 0; } +/** + * iavf_free_xdp_queues - Free memory for XDP rings + * @adapter: board private structure to update + * + * Free all of the memory associated with XDP queues. 
+ */ +static void iavf_free_xdp_queues(struct iavf_adapter *adapter) +{ + adapter->num_xdp_tx_queues = 0; + kfree(adapter->xdp_rings); + adapter->xdp_rings = NULL; +} + /** * iavf_free_queues - Free memory for all rings * @adapter: board private structure to initialize @@ -1456,13 +1491,12 @@ iavf_acquire_msix_vectors(struct iavf_adapter *adapter, int vectors) **/ static void iavf_free_queues(struct iavf_adapter *adapter) { - if (!adapter->vsi_res) - return; adapter->num_active_queues = 0; kfree(adapter->tx_rings); adapter->tx_rings = NULL; kfree(adapter->rx_rings); adapter->rx_rings = NULL; + iavf_free_xdp_queues(adapter); } /** @@ -1531,6 +1565,20 @@ static void iavf_set_tx_queue_vlan_tag_loc(struct iavf_adapter *adapter, tx_ring->flags |= IAVF_TXR_FLAGS_VLAN_TAG_LOC_L2TAG2; } +/** + * iavf_set_xdp_queue_vlan_tag_loc - set location for VLAN tag on XDP ring + * @adapter: board private structure + * + * Variation of iavf_set_queue_vlan_tag_loc, which configures XDP rings only. + */ +static void iavf_set_xdp_queue_vlan_tag_loc(struct iavf_adapter *adapter) +{ + int i; + + for (i = 0; i < adapter->num_xdp_tx_queues; i++) + iavf_set_tx_queue_vlan_tag_loc(adapter, &adapter->xdp_rings[i]); +} + /** * iavf_set_queue_vlan_tag_loc - set location for VLAN tag offload * @adapter: board private structure @@ -1548,6 +1596,8 @@ void iavf_set_queue_vlan_tag_loc(struct iavf_adapter *adapter) iavf_set_rx_queue_vlan_tag_loc(adapter, &adapter->rx_rings[i]); iavf_set_tx_queue_vlan_tag_loc(adapter, &adapter->tx_rings[i]); } + + iavf_set_xdp_queue_vlan_tag_loc(adapter); } /** @@ -1578,9 +1628,12 @@ static void iavf_init_rx_ring(struct iavf_adapter *adapter, * * Init all basic pointers and flags in a newly allocated Tx ring. */ -static void iavf_init_tx_ring(struct iavf_adapter *adapter, int ring_index) +static void iavf_init_tx_ring(struct iavf_adapter *adapter, + int ring_index, + bool xdp_ring) { - struct iavf_ring *tx_ring = &adapter->tx_rings[ring_index]; + struct iavf_ring *tx_ring = xdp_ring ? &adapter->xdp_rings[ring_index] + : &adapter->tx_rings[ring_index]; tx_ring->vsi = &adapter->vsi; tx_ring->queue_index = ring_index; @@ -1595,6 +1648,38 @@ static void iavf_init_tx_ring(struct iavf_adapter *adapter, int ring_index) tx_ring->flags |= IAVF_TXR_FLAGS_WB_ON_ITR; u64_stats_init(&tx_ring->sq_stats.syncp); + + if (xdp_ring) { + tx_ring->queue_index += adapter->num_active_queues; + tx_ring->flags |= IAVF_TXRX_FLAGS_XDP; + } +} + +/** + * iavf_alloc_xdp_queues - Allocate memory for XDP rings + * @adapter: board private structure to initialize + * @num_active_queues: number of exposed queue pairs + * + * Variation of iavf_alloc_queues(), which configures XDP queues only. 
+ */ +static int iavf_alloc_xdp_queues(struct iavf_adapter *adapter, u32 num_active_queues) +{ + int i; + + adapter->xdp_rings = kcalloc(num_active_queues, + sizeof(struct iavf_ring), GFP_KERNEL); + if (!adapter->xdp_rings) + return -ENOMEM; + + adapter->num_xdp_tx_queues = num_active_queues; + + /* Setup extra XDP Tx queues if there are any */ + for (i = 0; i < adapter->num_xdp_tx_queues; i++) { + iavf_init_tx_ring(adapter, i, true); + adapter->rx_rings[i].xdp_ring = &adapter->xdp_rings[i]; + } + + return 0; } /** @@ -1637,10 +1722,14 @@ static int iavf_alloc_queues(struct iavf_adapter *adapter) adapter->num_active_queues = num_active_queues; for (i = 0; i < num_active_queues; i++) { - iavf_init_tx_ring(adapter, i); + iavf_init_tx_ring(adapter, i, false); iavf_init_rx_ring(adapter, i); } + if (iavf_adapter_xdp_active(adapter)) + if (iavf_alloc_xdp_queues(adapter, num_active_queues)) + goto err_out; + iavf_set_queue_vlan_tag_loc(adapter); return 0; @@ -3369,6 +3458,10 @@ void iavf_free_all_tx_resources(struct iavf_adapter *adapter) for (i = 0; i < adapter->num_active_queues; i++) if (adapter->tx_rings[i].desc) iavf_free_tx_resources(&adapter->tx_rings[i]); + + for (i = 0; i < adapter->num_xdp_tx_queues; i++) + if (adapter->xdp_rings[i].desc) + iavf_free_tx_resources(&adapter->xdp_rings[i]); } /** @@ -3380,14 +3473,16 @@ void iavf_free_all_tx_resources(struct iavf_adapter *adapter) * callers duty to clean those orphaned rings. * * Return 0 on success, negative on failure - **/ + */ static int iavf_setup_all_tx_resources(struct iavf_adapter *adapter) { + struct iavf_ring *ring; int i, err = 0; for (i = 0; i < adapter->num_active_queues; i++) { - adapter->tx_rings[i].count = adapter->tx_desc_count; - err = iavf_setup_tx_descriptors(&adapter->tx_rings[i]); + ring = &adapter->tx_rings[i]; + ring->count = adapter->tx_desc_count; + err = iavf_setup_tx_descriptors(ring); if (!err) continue; dev_err(&adapter->pdev->dev, @@ -3395,6 +3490,17 @@ static int iavf_setup_all_tx_resources(struct iavf_adapter *adapter) break; } + for (i = 0; i < adapter->num_xdp_tx_queues; i++) { + ring = &adapter->xdp_rings[i]; + ring->count = adapter->tx_desc_count; + err = iavf_setup_tx_descriptors(ring); + if (!err) + continue; + dev_err(&adapter->pdev->dev, + "Allocation for XDP Queue %u failed\n", i); + break; + } + return err; } diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c index a7f8c9ad87eae2..f37fb0c35fea94 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c @@ -44,6 +44,18 @@ static void iavf_unmap_and_free_tx_resource(struct iavf_ring *ring, /* tx_buffer must be completely set up in the transmit path */ } +/** + * iavf_release_tx_resources - Release all Tx buffers on ring + * @ring: TX or XDP ring + */ +static void iavf_release_tx_resources(struct iavf_ring *ring) +{ + u32 i; + + for (i = 0; i < ring->count; i++) + iavf_unmap_and_free_tx_resource(ring, &ring->tx_bi[i]); +} + /** * iavf_clean_tx_ring - Free any empty Tx buffers * @tx_ring: ring to be cleaned @@ -51,15 +63,13 @@ static void iavf_unmap_and_free_tx_resource(struct iavf_ring *ring, void iavf_clean_tx_ring(struct iavf_ring *tx_ring) { unsigned long bi_size; - u16 i; /* ring already cleared, nothing to do */ if (!tx_ring->tx_bi) return; /* Free all the Tx ring sk_buffs */ - for (i = 0; i < tx_ring->count; i++) - iavf_unmap_and_free_tx_resource(tx_ring, &tx_ring->tx_bi[i]); + iavf_release_tx_resources(tx_ring); bi_size = sizeof(struct 
iavf_tx_buffer) * tx_ring->count; memset(tx_ring->tx_bi, 0, bi_size); @@ -74,7 +84,8 @@ void iavf_clean_tx_ring(struct iavf_ring *tx_ring) return; /* cleanup Tx queue statistics */ - netdev_tx_reset_queue(txring_txq(tx_ring)); + if (!(tx_ring->flags & IAVF_TXRX_FLAGS_XDP)) + netdev_tx_reset_queue(txring_txq(tx_ring)); } /** @@ -303,8 +314,9 @@ static bool iavf_clean_tx_irq(struct iavf_vsi *vsi, } /* notify netdev of completed buffers */ - netdev_tx_completed_queue(txring_txq(tx_ring), - stats.packets, stats.bytes); + if (!(tx_ring->flags & IAVF_TXRX_FLAGS_XDP)) + netdev_tx_completed_queue(txring_txq(tx_ring), + stats.packets, stats.bytes); #define TX_WAKE_THRESHOLD ((s16)(DESC_NEEDED * 2)) if (unlikely(stats.packets && netif_carrier_ok(tx_ring->netdev) && @@ -735,6 +747,10 @@ void iavf_free_rx_resources(struct iavf_ring *rx_ring) kfree(rx_ring->rx_pages); rx_ring->rx_pages = NULL; + /* This also unregisters memory model */ + if (xdp_rxq_info_is_reg(&rx_ring->xdp_rxq)) + xdp_rxq_info_unreg(&rx_ring->xdp_rxq); + libie_rx_page_pool_destroy(rx_ring->pool, &rx_ring->rq_stats); rx_ring->dev = dev; diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.h b/drivers/net/ethernet/intel/iavf/iavf_txrx.h index 5826501c5df1c8..d6e4eba9492881 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.h +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.h @@ -246,14 +246,13 @@ struct iavf_ring { u16 flags; #define IAVF_TXR_FLAGS_WB_ON_ITR BIT(0) #define IAVF_TXRX_FLAGS_ARM_WB BIT(1) -/* BIT(2) is free */ +#define IAVF_TXRX_FLAGS_XDP BIT(2) #define IAVF_TXRX_FLAGS_VLAN_TAG_LOC_L2TAG1 BIT(3) #define IAVF_TXR_FLAGS_VLAN_TAG_LOC_L2TAG2 BIT(4) #define IAVF_RXR_FLAGS_VLAN_TAG_LOC_L2TAG2_2 BIT(5) - struct iavf_vsi *vsi; /* Backreference to associated VSI */ - struct iavf_q_vector *q_vector; /* Backreference to associated vector */ - + struct bpf_prog __rcu *xdp_prog; + struct iavf_ring *xdp_ring; struct sk_buff *skb; /* When iavf_clean_rx_ring_irq() must * return before it sees the EOP for * the current packet, we save that skb @@ -269,11 +268,15 @@ struct iavf_ring { struct libie_rq_stats rq_stats; }; + struct iavf_vsi *vsi; /* Backreference to associated VSI */ + struct iavf_q_vector *q_vector; /* Backreference to associated vector */ + int prev_pkt_ctr; /* For stall detection */ unsigned int size; /* length of descriptor ring in bytes */ dma_addr_t dma; /* physical address of ring */ struct rcu_head rcu; /* to avoid race on free */ + struct xdp_rxq_info xdp_rxq; } ____cacheline_internodealigned_in_smp; #define IAVF_ITR_ADAPTIVE_MIN_INC 0x0002 diff --git a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c index 9f25de3e2b6e78..781c006ade6d97 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c +++ b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c @@ -400,17 +400,30 @@ int iavf_get_vf_vlan_v2_caps(struct iavf_adapter *adapter) * @adapter: adapter structure * @queue_index: index of queue pair in the adapter structure * @max_frame: maximal frame size supported by the adapter + * @xdp_pair: true if the queue pair is assigned to XDP queues * * Fill virtchannel queue pair configuration structure * with data for the Rx and Tx queues of a given index. - **/ + * To handle XDP queues, only Tx part of vqpi structure is filled + * with data. Because of virtchnl protocol can operate on queue pairs only, + * associate each extra Tx queue with an empty Rx queue + * (with zero length). 
+ */ static void iavf_set_qp_config_info(struct virtchnl_queue_pair_info *vqpi, struct iavf_adapter *adapter, - int queue_index, int max_frame) + int queue_index, int max_frame, + bool xdp_pair) { - struct iavf_ring *txq = &adapter->tx_rings[queue_index]; struct iavf_ring *rxq = &adapter->rx_rings[queue_index]; + struct iavf_ring *txq; + int xdpq_idx; + if (xdp_pair) { + xdpq_idx = queue_index - adapter->num_xdp_tx_queues; + txq = &adapter->xdp_rings[xdpq_idx]; + } else { + txq = &adapter->tx_rings[queue_index]; + } vqpi->txq.vsi_id = adapter->vsi_res->vsi_id; vqpi->txq.queue_id = queue_index; vqpi->txq.ring_len = txq->count; @@ -418,6 +431,11 @@ static void iavf_set_qp_config_info(struct virtchnl_queue_pair_info *vqpi, vqpi->rxq.vsi_id = adapter->vsi_res->vsi_id; vqpi->rxq.queue_id = queue_index; + if (xdp_pair) { + vqpi->rxq.ring_len = 0; + return; + } + vqpi->rxq.ring_len = rxq->count; vqpi->rxq.dma_ring_addr = rxq->dma; vqpi->rxq.max_pkt_size = max_frame; @@ -439,6 +457,7 @@ static void iavf_set_qp_config_info(struct virtchnl_queue_pair_info *vqpi, int iavf_configure_selected_queues(struct iavf_adapter *adapter, u32 qp_mask, bool wait) { + int pairs = adapter->num_active_queues + adapter->num_xdp_tx_queues; unsigned long num_qps_to_config, mask = qp_mask; u32 idx, max_frame = adapter->vf_res->max_mtu; struct virtchnl_vsi_queue_config_info *vqci; @@ -468,7 +487,13 @@ int iavf_configure_selected_queues(struct iavf_adapter *adapter, u32 qp_mask, * can fit info for 31 of them into the AQ buffer before it overflows. */ for_each_set_bit(idx, &mask, adapter->num_active_queues) { - iavf_set_qp_config_info(vqpi, adapter, idx, max_frame); + iavf_set_qp_config_info(vqpi, adapter, idx, max_frame, false); + vqpi++; + } + + /* Set configuration info for XDP Tx queues. 
*/ + for_each_set_bit_from(idx, &mask, pairs) { + iavf_set_qp_config_info(vqpi, adapter, idx, max_frame, true); vqpi++; } @@ -497,7 +522,7 @@ int iavf_configure_selected_queues(struct iavf_adapter *adapter, u32 qp_mask, */ int iavf_configure_queues(struct iavf_adapter *adapter, bool wait) { - int pairs = adapter->num_active_queues; + int pairs = adapter->num_active_queues + adapter->num_xdp_tx_queues; u32 qpair_mask = BIT(pairs) - 1; return iavf_configure_selected_queues(adapter, qpair_mask, wait); @@ -595,7 +620,9 @@ int iavf_disable_selected_queues(struct iavf_adapter *adapter, u32 rx_queues, */ int iavf_enable_queues(struct iavf_adapter *adapter, bool wait) { - u32 num_tx_queues = adapter->num_active_queues; + u32 num_tx_queues = adapter->num_active_queues + + adapter->num_xdp_tx_queues; + u32 rx_queues = BIT(adapter->num_active_queues) - 1; u32 tx_queues = BIT(num_tx_queues) - 1; @@ -615,7 +642,9 @@ int iavf_enable_queues(struct iavf_adapter *adapter, bool wait) */ int iavf_disable_queues(struct iavf_adapter *adapter, bool wait) { - u32 num_tx_queues = adapter->num_active_queues; + u32 num_tx_queues = adapter->num_active_queues + + adapter->num_xdp_tx_queues; + u32 rx_queues = BIT(adapter->num_active_queues) - 1; u32 tx_queues = BIT(num_tx_queues) - 1; From 629e722c204f0ff9e545e9adeedf104ee7396417 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Wed, 22 Feb 2023 13:00:48 +0100 Subject: [PATCH 24/40] iavf: don't hardcode DMA direction, headroom and buffer len on Rx Signed-off-by: Alexander Lobakin --- drivers/net/ethernet/intel/iavf/iavf_txrx.c | 27 ++++++++++--------- .../net/ethernet/intel/iavf/iavf_virtchnl.c | 9 ++++--- drivers/net/ethernet/intel/libie/rx.c | 24 +++++++++++++++-- include/linux/net/intel/libie/rx.h | 15 ++++++----- 4 files changed, 51 insertions(+), 24 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c index f37fb0c35fea94..4c2fb1aef4dac6 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c @@ -868,6 +868,7 @@ static u32 __iavf_alloc_rx_pages(struct iavf_ring *rx_ring, u32 to_refill, struct page_pool *pool = rx_ring->pool; u32 ntu = rx_ring->next_to_use; union iavf_rx_desc *rx_desc; + u32 hr = pool->p.offset; /* do nothing if no valid netdev defined */ if (unlikely(!rx_ring->netdev || !to_refill)) @@ -889,7 +890,7 @@ static u32 __iavf_alloc_rx_pages(struct iavf_ring *rx_ring, u32 to_refill, /* Refresh the desc even if buffer_addrs didn't change * because each write-back erases this info. */ - rx_desc->read.pkt_addr = cpu_to_le64(dma + LIBIE_SKB_HEADROOM); + rx_desc->read.pkt_addr = cpu_to_le64(dma + hr); rx_desc++; ntu++; @@ -1028,35 +1029,36 @@ void iavf_process_skb_fields(struct iavf_ring *rx_ring, * iavf_add_rx_frag - Add contents of Rx buffer to sk_buff * @skb: sk_buff to place the data into * @page: page containing data to add + * @hr: headroom in front of the data * @size: packet length from rx_desc * * This function will add the data contained in page to the skb. * It will just attach the page as a frag to the skb. - * - * The function will then update the page offset. 
- **/ -static void iavf_add_rx_frag(struct sk_buff *skb, struct page *page, u32 size) + */ +static void iavf_add_rx_frag(struct sk_buff *skb, struct page *page, u32 hr, + u32 size) { - skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, - LIBIE_SKB_HEADROOM, size, LIBIE_RX_TRUESIZE); + skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, hr, size, + LIBIE_RX_TRUESIZE); } /** * iavf_build_skb - Build skb around an existing buffer * @page: Rx page to with the data + * @hr: headroom in front of the data * @size: size of the data * * This function builds an skb around an existing Rx buffer, taking care * to set up the skb correctly and avoid any memcpy overhead. */ -static struct sk_buff *iavf_build_skb(struct page *page, u32 size) +static struct sk_buff *iavf_build_skb(struct page *page, u32 hr, u32 size) { struct sk_buff *skb; void *va; /* prefetch first cache line of first page */ va = page_address(page); - net_prefetch(va + LIBIE_SKB_HEADROOM); + net_prefetch(va + hr); /* build an skb around the page buffer */ skb = napi_build_skb(va, LIBIE_RX_TRUESIZE); @@ -1066,7 +1068,7 @@ static struct sk_buff *iavf_build_skb(struct page *page, u32 size) skb_mark_for_recycle(skb); /* update pointers within the skb to store the data */ - skb_reserve(skb, LIBIE_SKB_HEADROOM); + skb_reserve(skb, hr); __skb_put(skb, size); return skb; @@ -1109,6 +1111,7 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) struct sk_buff *skb = rx_ring->skb; u32 ntc = rx_ring->next_to_clean; u32 ring_size = rx_ring->count; + u32 hr = pool->p.offset; u32 cleaned_count = 0; while (likely(cleaned_count < budget)) { @@ -1165,9 +1168,9 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) /* retrieve a buffer from the ring */ if (skb) - iavf_add_rx_frag(skb, page, size); + iavf_add_rx_frag(skb, page, hr, size); else - skb = iavf_build_skb(page, size); + skb = iavf_build_skb(page, hr, size); /* exit if we failed to retrieve a buffer */ if (!skb) { diff --git a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c index 781c006ade6d97..9a02a82e15343e 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c +++ b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c @@ -411,10 +411,11 @@ int iavf_get_vf_vlan_v2_caps(struct iavf_adapter *adapter) */ static void iavf_set_qp_config_info(struct virtchnl_queue_pair_info *vqpi, struct iavf_adapter *adapter, - int queue_index, int max_frame, + int queue_index, u32 max_frame, bool xdp_pair) { struct iavf_ring *rxq = &adapter->rx_rings[queue_index]; + const struct page_pool_params *pp = &rxq->pool->p; struct iavf_ring *txq; int xdpq_idx; @@ -436,10 +437,12 @@ static void iavf_set_qp_config_info(struct virtchnl_queue_pair_info *vqpi, return; } + max_frame = min_not_zero(max_frame, LIBIE_MAX_RX_FRM_LEN(pp->offset)); + vqpi->rxq.ring_len = rxq->count; vqpi->rxq.dma_ring_addr = rxq->dma; vqpi->rxq.max_pkt_size = max_frame; - vqpi->rxq.databuffer_size = LIBIE_RX_BUF_LEN; + vqpi->rxq.databuffer_size = pp->max_len; } /** @@ -464,8 +467,6 @@ int iavf_configure_selected_queues(struct iavf_adapter *adapter, u32 qp_mask, struct virtchnl_queue_pair_info *vqpi; size_t len; - max_frame = min_not_zero(max_frame, LIBIE_MAX_RX_FRM_LEN); - if (adapter->current_op != VIRTCHNL_OP_UNKNOWN) { /* bail because we already have a command pending */ dev_err(&adapter->pdev->dev, diff --git a/drivers/net/ethernet/intel/libie/rx.c b/drivers/net/ethernet/intel/libie/rx.c index 10ef8741326ad2..293c2cc19a0ec3 100644 --- 
a/drivers/net/ethernet/intel/libie/rx.c +++ b/drivers/net/ethernet/intel/libie/rx.c @@ -109,6 +109,25 @@ EXPORT_SYMBOL_NS_GPL(libie_rx_ptype_lut, LIBIE); /* Page Pool */ +/** + * libie_rx_sync_len - get the actual buffer size to be synced and passed to HW + * @dev: &net_device to calculate the size for + * @hr: headroom in front of each frame + * + * Returns the buffer size to pass it to HW and use for DMA synchronization + * for the MTU the @dev has. + */ +static u32 libie_rx_sync_len(const struct net_device *dev, u32 hr) +{ + u32 len; + + len = READ_ONCE(dev->mtu) + LIBIE_RX_LL_LEN; + len = ALIGN(len, LIBIE_RX_BUF_LEN_ALIGN); + len = min(len, LIBIE_RX_BUF_LEN(hr)); + + return len; +} + /** * libie_rx_page_pool_create - create a PP with the default libie settings * @dev: &net_device which a PP will be created for @@ -119,6 +138,7 @@ EXPORT_SYMBOL_NS_GPL(libie_rx_ptype_lut, LIBIE); struct page_pool *libie_rx_page_pool_create(const struct net_device *dev, u32 size) { + u32 hr = LIBIE_SKB_HEADROOM; const struct page_pool_params pp = { .flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_MAP_WEAK | PP_FLAG_DMA_SYNC_DEV, @@ -127,8 +147,8 @@ struct page_pool *libie_rx_page_pool_create(const struct net_device *dev, .nid = NUMA_NO_NODE, .dev = dev->dev.parent, .dma_dir = DMA_FROM_DEVICE, - .max_len = LIBIE_RX_BUF_LEN, - .offset = LIBIE_SKB_HEADROOM, + .max_len = libie_rx_sync_len(dev, hr), + .offset = hr, }; static_assert((PP_FLAG_DMA_MAP | PP_FLAG_DMA_MAP_WEAK) == diff --git a/include/linux/net/intel/libie/rx.h b/include/linux/net/intel/libie/rx.h index f063a30f182ecb..ca601d8e4b8f8f 100644 --- a/include/linux/net/intel/libie/rx.h +++ b/include/linux/net/intel/libie/rx.h @@ -132,6 +132,8 @@ static inline void libie_skb_set_hash(struct sk_buff *skb, u32 hash, /* Space reserved in front of each frame */ #define LIBIE_SKB_HEADROOM (NET_SKB_PAD + NET_IP_ALIGN) +/* Maximum headroom to calculate max MTU below */ +#define LIBIE_MAX_HEADROOM LIBIE_SKB_HEADROOM /* Link layer / L2 overhead: Ethernet, 2 VLAN tags (C + S), FCS */ #define LIBIE_RX_LL_LEN (ETH_HLEN + 2 * VLAN_HLEN + ETH_FCS_LEN) @@ -143,22 +145,23 @@ static inline void libie_skb_set_hash(struct sk_buff *skb, u32 hash, /* HW-writeable space in one buffer: truesize - headroom/tailroom, * HW-aligned */ -#define __LIBIE_RX_BUF_LEN \ - ALIGN_DOWN(SKB_MAX_ORDER(LIBIE_SKB_HEADROOM, LIBIE_RX_PAGE_ORDER), \ +#define __LIBIE_RX_BUF_LEN(hr) \ + ALIGN_DOWN(SKB_MAX_ORDER(hr, LIBIE_RX_PAGE_ORDER), \ LIBIE_RX_BUF_LEN_ALIGN) /* The largest size for a single descriptor as per HW */ #define LIBIE_MAX_RX_BUF_LEN 9728U /* "True" HW-writeable space: minimum from SW and HW values */ -#define LIBIE_RX_BUF_LEN min_t(u32, __LIBIE_RX_BUF_LEN, \ +#define LIBIE_RX_BUF_LEN(hr) min_t(u32, __LIBIE_RX_BUF_LEN(hr), \ LIBIE_MAX_RX_BUF_LEN) /* The maximum frame size as per HW (S/G) */ #define __LIBIE_MAX_RX_FRM_LEN 16382U /* ATST, HW can chain up to 5 Rx descriptors */ -#define LIBIE_MAX_RX_FRM_LEN min_t(u32, __LIBIE_MAX_RX_FRM_LEN, \ - LIBIE_RX_BUF_LEN * 5) +#define LIBIE_MAX_RX_FRM_LEN(hr) \ + min_t(u32, __LIBIE_MAX_RX_FRM_LEN, LIBIE_RX_BUF_LEN(hr) * 5) /* Maximum frame size minus LL overhead */ -#define LIBIE_MAX_MTU (LIBIE_MAX_RX_FRM_LEN - LIBIE_RX_LL_LEN) +#define LIBIE_MAX_MTU (LIBIE_MAX_RX_FRM_LEN(LIBIE_MAX_HEADROOM) - \ + LIBIE_RX_LL_LEN) /* DMA mapping attributes for Rx buffers: no impl. 
sync + relaxed on Sparc */ #define LIBIE_RX_DMA_ATTR \ From 9f85b9dbd42d49a98401291b1446ef0bb49131a8 Mon Sep 17 00:00:00 2001 From: Michal Kubiak Date: Fri, 2 Dec 2022 07:17:18 -0500 Subject: [PATCH 25/40] iavf: Handle XDP_SETUP_PROG command in .ndo_bpf Add the .ndo_bpf callback to handle the XDP_SETUP_PROG command. In order to avoid synchronization issues, implement functions dedicated to re-initializing only those parts of the interface which are really necessary to set up the XDP program. Such an approach is much lighter than performing a full reset of the driver, and thanks to it we immediately know the result of traffic initialization, unlike with the reset task, which triggers some asynchronous events (e.g. link speed negotiation). Signed-off-by: Michal Kubiak Signed-off-by: Larysa Zaremba --- drivers/net/ethernet/intel/iavf/iavf_main.c | 459 +++++++++++++++++++- include/linux/net/intel/libie/rx.h | 5 +- 2 files changed, 455 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 6336ec35bd8605..d654b5e4d43b13 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -526,6 +526,54 @@ static void iavf_map_rings_to_vectors(struct iavf_adapter *adapter) adapter->aq_required |= IAVF_FLAG_AQ_MAP_VECTORS; } +/** + * iavf_unmap_rings_from_vectors - Clear existing mapping for queues and vectors + * @adapter: board private structure + * + */ +static void iavf_unmap_rings_from_vectors(struct iavf_adapter *adapter) +{ + struct iavf_ring *rx_ring, *tx_ring; + struct iavf_q_vector *q_vector; + int num_q_vectors, i; + + num_q_vectors = adapter->num_msix_vectors - NONQ_VECS; + for (i = 0; i < num_q_vectors; i++) { + q_vector = &adapter->q_vectors[i]; + q_vector->tx.ring = NULL; + q_vector->tx.count = 0; + q_vector->tx.next_update = 0; + q_vector->tx.target_itr = 0; + q_vector->tx.current_itr = 0; + q_vector->num_ringpairs = 0; + + q_vector->rx.ring = NULL; + q_vector->rx.count = 0; + q_vector->rx.next_update = 0; + q_vector->rx.target_itr = 0; + q_vector->rx.current_itr = 0; + q_vector->rx_ring_mask = 0; + q_vector->tx_ring_mask = 0; + } + + for (i = 0; i < adapter->num_active_queues; i++) { + rx_ring = &adapter->rx_rings[i]; + tx_ring = &adapter->tx_rings[i]; + + rx_ring->q_vector = NULL; + rx_ring->next = NULL; + tx_ring->q_vector = NULL; + tx_ring->next = NULL; + } + + for (i = 0; i < adapter->num_xdp_tx_queues; i++) { + tx_ring = &adapter->xdp_rings[i]; + + tx_ring->q_vector = NULL; + tx_ring->next = NULL; + } +} + /** * iavf_irq_affinity_notify - Callback for affinity changes * @notify: context as to what irq was changed @@ -1381,23 +1429,47 @@ static void iavf_clear_adv_rss_conf(struct iavf_adapter *adapter) } /** - * iavf_down - Shutdown the connection processing + * iavf_stop_traffic - Stop NAPI and interrupts before link down * @adapter: board private structure - * - * Expects to be called while holding the __IAVF_IN_CRITICAL_TASK bit lock.
- **/ -void iavf_down(struct iavf_adapter *adapter) + */ +void iavf_stop_traffic(struct iavf_adapter *adapter) { struct net_device *netdev = adapter->netdev; - if (adapter->state <= __IAVF_DOWN_PENDING) - return; - netif_carrier_off(netdev); netif_tx_disable(netdev); adapter->link_up = false; iavf_napi_disable_all(adapter); iavf_irq_disable(adapter); +} + +/** + * iavf_start_traffic - Start NAPI and interrupts after link up + * @adapter: board private structure + */ +void iavf_start_traffic(struct iavf_adapter *adapter) +{ + struct net_device *netdev = adapter->netdev; + + iavf_napi_enable_all(adapter); + iavf_irq_enable(adapter, true); + adapter->link_up = true; + netif_tx_start_all_queues(netdev); + netif_carrier_on(netdev); +} + +/** + * iavf_down - Shutdown the connection processing + * @adapter: board private structure + * + * Expects to be called while holding the __IAVF_IN_CRITICAL_TASK bit lock. + */ +void iavf_down(struct iavf_adapter *adapter) +{ + if (adapter->state <= __IAVF_DOWN_PENDING) + return; + + iavf_stop_traffic(adapter); iavf_clear_mac_vlan_filters(adapter); iavf_clear_cloud_filters(adapter); @@ -2972,6 +3044,36 @@ static void iavf_watchdog_task(struct work_struct *work) HZ * 2); } +/** + * iavf_xchg_xdp_prog - set new prog and get an old one + * @adapter: board private structure + * @prog: new XDP program + * + * Returns pointer to the old XDP program. + * adapter->xdp_prog is not used in packet processing, so it can be + * safely set kinda like a flag before resource re-configuration (reset) + */ +static struct bpf_prog *iavf_xchg_xdp_prog(struct iavf_adapter *adapter, + struct bpf_prog *prog) +{ + return xchg(&adapter->xdp_prog, prog); +} + +/** + * iavf_free_xdp_prog - Release XDP program, if present + * @adapter: board private structure + * + * Should be used when adapter is being removed. 
+ */ +static void iavf_free_xdp_prog(struct iavf_adapter *adapter) +{ + struct bpf_prog *old_xdp_prog; + + old_xdp_prog = iavf_xchg_xdp_prog(adapter, NULL); + if (old_xdp_prog) + bpf_prog_put(old_xdp_prog); +} + /** * iavf_disable_vf - disable VF * @adapter: board private structure @@ -3003,6 +3105,8 @@ static void iavf_disable_vf(struct iavf_adapter *adapter) iavf_free_all_rx_resources(adapter); } + iavf_free_xdp_prog(adapter); + spin_lock_bh(&adapter->mac_vlan_list_lock); /* Delete all of the filters */ @@ -4793,6 +4897,342 @@ static netdev_features_t iavf_fix_features(struct net_device *netdev, return iavf_fix_netdev_vlan_features(adapter, features); } +/** + * iavf_copy_xdp_prog_to_rings - update XDP prog references in rings + * @adapter: board private structure + * + * If program change also requires XDP resources reconfiguration, + * schedule a reset instead + */ +static void iavf_copy_xdp_prog_to_rings(const struct iavf_adapter *adapter) +{ + for (u32 i = 0; i < adapter->num_active_queues; i++) + rcu_assign_pointer(adapter->rx_rings[i].xdp_prog, + adapter->xdp_prog); + + /* No queue changes are needed, but running RX processing must finish */ + synchronize_net(); +} + +/** + * iavf_assign_bpf_prog - Assign a given BPF program to adapter + * @adapter: board private structure + * @prog: BPF program to be assigned to adapter + * + * Returns 0 on success, negative on failure + */ +static void iavf_assign_bpf_prog(struct iavf_adapter *adapter, + struct bpf_prog *prog) +{ + struct bpf_prog *old_prog; + + old_prog = iavf_xchg_xdp_prog(adapter, prog); + if (old_prog) + bpf_prog_put(old_prog); +} + +#define IAVF_XDP_LINK_TIMEOUT_MS 1000 +#define IAVF_XDP_LOCK_TIMEOUT_MS 5000 + +/** + * iavf_close_sync - Synchronous version of 'iavf_close', dedicated to XDP setup + * @adapter: board private structure + * + * Caller of this function needs to lock 'adapter->crit_lock' in order + * to prevent race conditions with 'reset_task' and VIRTCHNL communication. + * + * Returns 0 on success, negative on failure + */ +static int iavf_close_sync(struct iavf_adapter *adapter) +{ + int err; + + iavf_stop_traffic(adapter); + + err = iavf_disable_queues(adapter, true); + if (err) { + dev_err(&adapter->pdev->dev, "cannot disable queues for XDP setup, error: %d\n", + err); + goto err_virtchnl; + } + + iavf_free_all_tx_resources(adapter); + iavf_free_all_rx_resources(adapter); + + iavf_free_traffic_irqs(adapter); + + return 0; + +err_virtchnl: + iavf_start_traffic(adapter); + + return err; +} + +/** + * iavf_open_sync - Synchronous version of 'iavf_open', dedicated to XDP setup + * @adapter: board private structure + * + * Caller of this function needs to lock 'adapter->crit_lock' in order + * to prevent race conditions with 'reset_task' and VIRTCHNL communication. 
+ * + * Returns 0 on success, negative on failure + */ +static int iavf_open_sync(struct iavf_adapter *adapter) +{ + int err, ret; + + err = iavf_setup_all_tx_resources(adapter); + if (err) { + dev_err(&adapter->pdev->dev, + "cannot setup Tx resources, error: %d\n", err); + goto err_setup_tx_resources; + } + + err = iavf_setup_all_rx_resources(adapter); + if (err) { + dev_err(&adapter->pdev->dev, + "cannot setup Rx resources, error: %d\n", err); + goto err_setup_rx_resources; + } + + err = iavf_request_traffic_irqs(adapter, adapter->netdev->name); + if (err) { + dev_err(&adapter->pdev->dev, + "cannot request interrupts, error: %d\n", err); + goto err_request_irq; + } + + iavf_configure_tx(adapter); + iavf_configure_rx(adapter); + + err = iavf_configure_queues(adapter, true); + if (err) { + dev_err(&adapter->pdev->dev, + "cannot configure queues in PF, error: %d\n", err); + goto err_virtchnl_req; + } + + err = iavf_map_queues(adapter, true); + if (err) { + dev_err(&adapter->pdev->dev, + "cannot map queues to vectors in PF, error: %d\n", err); + goto err_virtchnl_req; + } + + err = iavf_enable_queues(adapter, true); + if (err) { + dev_err(&adapter->pdev->dev, + "cannot enable queues in PF, error: %d\n", err); + goto err_virtchnl_req; + } + + ret = iavf_poll_for_link_status(adapter, IAVF_XDP_LINK_TIMEOUT_MS); + if (ret < 0) { + err = ret; + dev_err(&adapter->pdev->dev, + "cannot bring the link up, error: %d\n", err); + goto err_wrong_link_status; + } else if (!ret) { + err = -EBUSY; + dev_err(&adapter->pdev->dev, + "pf returned link down status, error: %d\n", err); + goto err_wrong_link_status; + } + + iavf_start_traffic(adapter); + + return 0; + +err_wrong_link_status: + iavf_close_sync(adapter); +err_virtchnl_req: +err_request_irq: + iavf_free_traffic_irqs(adapter); +err_setup_rx_resources: + iavf_free_all_rx_resources(adapter); +err_setup_tx_resources: + iavf_free_all_tx_resources(adapter); + + return err; +} + +/** + * iavf_destroy_xdp_rings - remove XDP program from adapter and release + * XDP rings related to that program. + * @adapter: board private structure + */ +static void iavf_destroy_xdp_rings(struct iavf_adapter *adapter) +{ + iavf_unmap_rings_from_vectors(adapter); + iavf_free_xdp_queues(adapter); + iavf_assign_bpf_prog(adapter, NULL); + iavf_map_rings_to_vectors(adapter); +} + +/** + * iavf_prepare_xdp_rings - add XDP program to adapter and setup XDP rings + * to handle that program. 
+ * @adapter: board private structure + * @prog: XDP program + */ +static int iavf_prepare_xdp_rings(struct iavf_adapter *adapter, + struct bpf_prog *prog) +{ + int err; + + iavf_unmap_rings_from_vectors(adapter); + iavf_assign_bpf_prog(adapter, prog); + + err = iavf_alloc_xdp_queues(adapter, adapter->num_active_queues); + if (err) { + dev_err(&adapter->pdev->dev, + "cannot allocate memory for queues, error: %d\n", err); + goto err_alloc_queues; + } + + iavf_set_xdp_queue_vlan_tag_loc(adapter); + + iavf_map_rings_to_vectors(adapter); + + return 0; + +err_alloc_queues: + iavf_assign_bpf_prog(adapter, NULL); + + return err; +} + +/** + * iavf_xdp_can_create_queues - check if queue number is appropriate for XDP + * @adapter: board private structure + * @extack: netlink extended ack + */ +static bool iavf_xdp_can_create_queues(struct iavf_adapter *adapter, + struct netlink_ext_ack *extack) +{ + u32 max_qp_num = adapter->vsi_res->num_queue_pairs; + u32 num_active_queues = adapter->num_active_queues; + + if (num_active_queues * 2 <= max_qp_num) + return true; + + netdev_warn(adapter->netdev, + "Current number of queue pairs (%u) set on adapter is too high to enable XDP, please configure queue number through ethtool to be no bigger than %u", + num_active_queues, max_qp_num); + + NL_SET_ERR_MSG_MOD(extack, + "XDP cannot be enabled due to configured queue number being too large, please check dmesg for more info"); + + return false; +} + +/** + * iavf_setup_xdp - handle xdp program change + * @adapter: board private structure + * @prog: XDP program + * @extack: netlink extended ack + */ +static int iavf_setup_xdp(struct iavf_adapter *adapter, struct bpf_prog *prog, + struct netlink_ext_ack *extack) +{ + u32 frame_size = READ_ONCE(adapter->netdev->mtu) + LIBIE_RX_LL_LEN; + bool needs_reconfig = !!prog != iavf_adapter_xdp_active(adapter); + bool was_running = netif_running(adapter->netdev); + int err; + + if (prog && frame_size > LIBIE_RX_BUF_LEN(LIBIE_XDP_HEADROOM)) { + NL_SET_ERR_MSG_MOD(extack, "MTU too large to enable XDP"); + return -EOPNOTSUPP; + } + + if (needs_reconfig) { + if (!iavf_xdp_can_create_queues(adapter, extack)) { + err = -EOPNOTSUPP; + goto err_no_queues; + } + + if (iavf_lock_timeout(&adapter->crit_lock, + IAVF_XDP_LOCK_TIMEOUT_MS)) { + err = -EBUSY; + dev_err(&adapter->pdev->dev, + "failed to acquire crit_lock in %s\n", + __func__); + goto err_crit_lock; + } + err = iavf_process_pending_pf_msg(adapter, + IAVF_XDP_LOCK_TIMEOUT_MS); + if (err) + goto err_pending_pf_msg; + + if (was_running) { + err = iavf_close_sync(adapter); + if (err) { + dev_err(&adapter->pdev->dev, + "cannot close the interface to setup XDP, error: %d\n", + err); + goto err_close_if; + } + } + + if (prog) { + err = iavf_prepare_xdp_rings(adapter, prog); + if (err) { + dev_err(&adapter->pdev->dev, + "cannot prepare rings to support XDP, error: %d\n", + err); + goto err_prepare_xdp_rings; + } + } else { + iavf_destroy_xdp_rings(adapter); + } + + if (was_running) { + err = iavf_open_sync(adapter); + if (err) { + dev_err(&adapter->pdev->dev, + "cannot open the interface after XDP setup, error: %d\n", + err); + goto err_open_if; + } + } + mutex_unlock(&adapter->crit_lock); + } else { + iavf_assign_bpf_prog(adapter, prog); + iavf_copy_xdp_prog_to_rings(adapter); + } + + return 0; + +err_open_if: +err_prepare_xdp_rings: + iavf_destroy_xdp_rings(adapter); + iavf_open_sync(adapter); +err_close_if: +err_pending_pf_msg: + mutex_unlock(&adapter->crit_lock); +err_crit_lock: +err_no_queues: + return err; +} + +/** + * 
iavf_xdp - XDP command handler + * @netdev: netdevice + * @xdp: XDP command + */ +static int iavf_xdp(struct net_device *netdev, struct netdev_bpf *xdp) +{ + struct iavf_adapter *adapter = netdev_priv(netdev); + + switch (xdp->command) { + case XDP_SETUP_PROG: + return iavf_setup_xdp(adapter, xdp->prog, xdp->extack); + default: + return -EINVAL; + } +} + static const struct net_device_ops iavf_netdev_ops = { .ndo_open = iavf_open, .ndo_stop = iavf_close, @@ -4808,6 +5248,7 @@ static const struct net_device_ops iavf_netdev_ops = { .ndo_fix_features = iavf_fix_features, .ndo_set_features = iavf_set_features, .ndo_setup_tc = iavf_setup_tc, + .ndo_bpf = iavf_xdp, }; /** @@ -5242,6 +5683,8 @@ static void iavf_remove(struct pci_dev *pdev) iavf_free_all_rx_resources(adapter); iavf_free_misc_irq(adapter); + iavf_free_xdp_prog(adapter); + iavf_reset_interrupt_capability(adapter); iavf_free_q_vectors(adapter); diff --git a/include/linux/net/intel/libie/rx.h b/include/linux/net/intel/libie/rx.h index ca601d8e4b8f8f..72dd85f789fce1 100644 --- a/include/linux/net/intel/libie/rx.h +++ b/include/linux/net/intel/libie/rx.h @@ -6,6 +6,7 @@ #include #include +#include /* O(1) converting i40e/ice/iavf's 8/10-bit hardware packet type to a parsed * bitfield struct. @@ -132,8 +133,10 @@ static inline void libie_skb_set_hash(struct sk_buff *skb, u32 hash, /* Space reserved in front of each frame */ #define LIBIE_SKB_HEADROOM (NET_SKB_PAD + NET_IP_ALIGN) +#define LIBIE_XDP_HEADROOM (max(XDP_PACKET_HEADROOM, NET_SKB_PAD) + \ + NET_IP_ALIGN) /* Maximum headroom to calculate max MTU below */ -#define LIBIE_MAX_HEADROOM LIBIE_SKB_HEADROOM +#define LIBIE_MAX_HEADROOM LIBIE_XDP_HEADROOM /* Link layer / L2 overhead: Ethernet, 2 VLAN tags (C + S), FCS */ #define LIBIE_RX_LL_LEN (ETH_HLEN + 2 * VLAN_HLEN + ETH_FCS_LEN) From c59cad5b11f95d32794cac8c03ba1a685c8b5117 Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Tue, 29 Nov 2022 17:25:21 +0100 Subject: [PATCH 26/40] iavf: Add XDP_PASS and XDP_DROP support Implement basic XDP program setup, refactor data path to use xdp_buff, implement XDP_PASS and XDP_DROP actions. Signed-off-by: Larysa Zaremba --- drivers/net/ethernet/intel/iavf/iavf.h | 1 + drivers/net/ethernet/intel/iavf/iavf_txrx.c | 113 ++++++++++++++++---- drivers/net/ethernet/intel/libie/rx.c | 7 +- include/linux/net/intel/libie/rx.h | 2 +- 4 files changed, 101 insertions(+), 22 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf.h b/drivers/net/ethernet/intel/iavf/iavf.h index 338bad44358a51..5b91380060dc8c 100644 --- a/drivers/net/ethernet/intel/iavf/iavf.h +++ b/drivers/net/ethernet/intel/iavf/iavf.h @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c index 4c2fb1aef4dac6..ceedbf52f1045a 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c @@ -2,6 +2,8 @@ /* Copyright(c) 2013 - 2018 Intel Corporation. */ #include +#include +#include #include #include @@ -761,6 +763,17 @@ void iavf_free_rx_resources(struct iavf_ring *rx_ring) } } +/** + * iavf_is_xdp_enabled - Check if XDP is enabled on the RX ring + * @rx_ring: Rx descriptor ring + * + * Returns true, if the ring has been configured for XDP. 
+ */ +static bool iavf_is_xdp_enabled(const struct iavf_ring *rx_ring) +{ + return !!rcu_access_pointer(rx_ring->xdp_prog); +} + /** * iavf_setup_rx_descriptors - Allocate Rx descriptors * @rx_ring: Rx descriptor ring (for a specific queue) to setup @@ -792,7 +805,8 @@ int iavf_setup_rx_descriptors(struct iavf_ring *rx_ring) goto err; } - pool = libie_rx_page_pool_create(rx_ring->netdev, rx_ring->count); + pool = libie_rx_page_pool_create(rx_ring->netdev, rx_ring->count, + iavf_is_xdp_enabled(rx_ring)); if (IS_ERR(pool)) { ret = PTR_ERR(pool); goto err_free_dma; @@ -1044,32 +1058,32 @@ static void iavf_add_rx_frag(struct sk_buff *skb, struct page *page, u32 hr, /** * iavf_build_skb - Build skb around an existing buffer - * @page: Rx page to with the data - * @hr: headroom in front of the data - * @size: size of the data + * @xdp: initialized XDP buffer * * This function builds an skb around an existing Rx buffer, taking care * to set up the skb correctly and avoid any memcpy overhead. */ -static struct sk_buff *iavf_build_skb(struct page *page, u32 hr, u32 size) +static struct sk_buff *iavf_build_skb(const struct xdp_buff *xdp) { struct sk_buff *skb; - void *va; + u32 metasize; - /* prefetch first cache line of first page */ - va = page_address(page); - net_prefetch(va + hr); + net_prefetch(xdp->data_meta); /* build an skb around the page buffer */ - skb = napi_build_skb(va, LIBIE_RX_TRUESIZE); + skb = napi_build_skb(xdp->data_hard_start, LIBIE_RX_TRUESIZE); if (unlikely(!skb)) return NULL; skb_mark_for_recycle(skb); /* update pointers within the skb to store the data */ - skb_reserve(skb, hr); - __skb_put(skb, size); + skb_reserve(skb, xdp->data - xdp->data_hard_start); + __skb_put(skb, xdp->data_end - xdp->data); + + metasize = xdp->data - xdp->data_meta; + if (metasize) + skb_metadata_set(skb, metasize); return skb; } @@ -1090,6 +1104,39 @@ static bool iavf_is_non_eop(u64 qword, struct libie_rq_onstack_stats *stats) return true; } +/** + * iavf_run_xdp - Run XDP program and perform resulting action + * @rx_ring: RX descriptor ring to transact packets on + * @xdp: a prepared XDP buffer + * @xdp_prog: an XDP program assigned to the interface + * + * Returns resulting XDP action. 
+ */ +static unsigned int +iavf_run_xdp(struct iavf_ring *rx_ring, struct xdp_buff *xdp, + struct bpf_prog *xdp_prog) +{ + unsigned int xdp_act; + + xdp_act = bpf_prog_run_xdp(xdp_prog, xdp); + + switch (xdp_act) { + case XDP_PASS: + case XDP_DROP: + break; + default: + bpf_warn_invalid_xdp_action(rx_ring->netdev, xdp_prog, xdp_act); + + fallthrough; + case XDP_ABORTED: + trace_xdp_exception(rx_ring->netdev, xdp_prog, xdp_act); + + return XDP_DROP; + } + + return xdp_act; +} + /** * iavf_clean_rx_irq - Clean completed descriptors from Rx ring - bounce buf * @rx_ring: rx descriptor ring to transact packets on @@ -1111,13 +1158,19 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) struct sk_buff *skb = rx_ring->skb; u32 ntc = rx_ring->next_to_clean; u32 ring_size = rx_ring->count; + struct bpf_prog *xdp_prog; u32 hr = pool->p.offset; u32 cleaned_count = 0; + unsigned int xdp_act; + struct xdp_buff xdp; + + xdp_prog = rcu_dereference(rx_ring->xdp_prog); + xdp_init_buff(&xdp, PAGE_SIZE, &rx_ring->xdp_rxq); while (likely(cleaned_count < budget)) { union iavf_rx_desc *rx_desc; + u32 size, put_size; struct page *page; - unsigned int size; u16 vlan_tag = 0; u64 qword; @@ -1161,32 +1214,52 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) */ if (unlikely(!size)) { page_pool_recycle_direct(pool, page); - goto skip_data; + goto no_skb; } page_pool_dma_sync_for_cpu(pool, page, size); + put_size = size; + + xdp_prepare_buff(&xdp, page_address(page), hr, size, true); + if (!xdp_prog) + goto construct_skb; + xdp_act = iavf_run_xdp(rx_ring, &xdp, xdp_prog); + put_size = max_t(u32, xdp.data_end - xdp.data_hard_start - hr, + put_size); + + if (xdp_act != XDP_PASS) { + page_pool_put_page(pool, page, put_size, true); + + stats.bytes += size; + stats.packets++; + + skb = NULL; + goto no_skb; + } + +construct_skb: /* retrieve a buffer from the ring */ if (skb) iavf_add_rx_frag(skb, page, hr, size); else - skb = iavf_build_skb(page, hr, size); + skb = iavf_build_skb(&xdp); /* exit if we failed to retrieve a buffer */ if (!skb) { - page_pool_put_page(pool, page, size, true); + page_pool_put_page(pool, page, put_size, true); libie_stats_inc_one(&rx_ring->rq_stats, build_skb_fail); break; } -skip_data: +no_skb: cleaned_count++; to_refill++; if (unlikely(++ntc == ring_size)) ntc = 0; - if (iavf_is_non_eop(qword, &stats)) + if (iavf_is_non_eop(qword, &stats) || !skb) continue; prefetch(rx_desc); @@ -1390,6 +1463,8 @@ int iavf_napi_poll(struct napi_struct *napi, int budget) */ budget_per_ring = max(budget/q_vector->num_ringpairs, 1); + rcu_read_lock(); + iavf_for_each_ring(ring, q_vector->rx) { int cleaned = iavf_clean_rx_irq(ring, budget_per_ring); @@ -1399,6 +1474,8 @@ int iavf_napi_poll(struct napi_struct *napi, int budget) clean_complete = false; } + rcu_read_unlock(); + /* If work not completed, return budget and polling will return */ if (!clean_complete) { int cpu_id = smp_processor_id(); diff --git a/drivers/net/ethernet/intel/libie/rx.c b/drivers/net/ethernet/intel/libie/rx.c index 293c2cc19a0ec3..65475bf6d2d27f 100644 --- a/drivers/net/ethernet/intel/libie/rx.c +++ b/drivers/net/ethernet/intel/libie/rx.c @@ -132,13 +132,14 @@ static u32 libie_rx_sync_len(const struct net_device *dev, u32 hr) * libie_rx_page_pool_create - create a PP with the default libie settings * @dev: &net_device which a PP will be created for * @size: size of the PP, usually simply Rx queue len + * @xdp: whether XDP is enabled on the device * * Returns &page_pool on success, casted -errno on 
failure. */ struct page_pool *libie_rx_page_pool_create(const struct net_device *dev, - u32 size) + u32 size, bool xdp) { - u32 hr = LIBIE_SKB_HEADROOM; + u32 hr = xdp ? LIBIE_XDP_HEADROOM : LIBIE_SKB_HEADROOM; const struct page_pool_params pp = { .flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_MAP_WEAK | PP_FLAG_DMA_SYNC_DEV, @@ -146,7 +147,7 @@ struct page_pool *libie_rx_page_pool_create(const struct net_device *dev, .pool_size = size, .nid = NUMA_NO_NODE, .dev = dev->dev.parent, - .dma_dir = DMA_FROM_DEVICE, + .dma_dir = xdp ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE, .max_len = libie_rx_sync_len(dev, hr), .offset = hr, }; diff --git a/include/linux/net/intel/libie/rx.h b/include/linux/net/intel/libie/rx.h index 72dd85f789fce1..d73efd721ffc6f 100644 --- a/include/linux/net/intel/libie/rx.h +++ b/include/linux/net/intel/libie/rx.h @@ -173,7 +173,7 @@ static inline void libie_skb_set_hash(struct sk_buff *skb, u32 hash, struct libie_rq_stats; struct page_pool *libie_rx_page_pool_create(const struct net_device *dev, - u32 size); + u32 size, bool xdp); void libie_rx_page_pool_destroy(struct page_pool *pool, struct libie_rq_stats *stats); From 19cfad94443e0f54c4a9111fbb1b3c0ccc9e348d Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Wed, 30 Nov 2022 14:42:21 +0100 Subject: [PATCH 27/40] iavf: Implement XDP_TX action Implement sending the packet from an XDP ring. XDP path functions are separate from the general TX routines, because this allows to simplify and therefore speedup the process. It also makes code more friendly to future XDP-specific optimizations. Signed-off-by: Larysa Zaremba --- drivers/net/ethernet/intel/iavf/iavf.h | 4 - drivers/net/ethernet/intel/iavf/iavf_txrx.c | 182 ++++++++++++++++++-- drivers/net/ethernet/intel/iavf/iavf_txrx.h | 94 +++++++++- 3 files changed, 262 insertions(+), 18 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf.h b/drivers/net/ethernet/intel/iavf/iavf.h index 5b91380060dc8c..d10fb0f2606545 100644 --- a/drivers/net/ethernet/intel/iavf/iavf.h +++ b/drivers/net/ethernet/intel/iavf/iavf.h @@ -86,10 +86,6 @@ struct iavf_vsi { #define MAXIMUM_ETHERNET_VLAN_SIZE (VLAN_ETH_FRAME_LEN + ETH_FCS_LEN) -#define IAVF_RX_DESC(R, i) (&(((union iavf_32byte_rx_desc *)((R)->desc))[i])) -#define IAVF_TX_DESC(R, i) (&(((struct iavf_tx_desc *)((R)->desc))[i])) -#define IAVF_TX_CTXTDESC(R, i) \ - (&(((struct iavf_tx_context_desc *)((R)->desc))[i])) #define IAVF_MAX_REQ_QUEUES 16 #define IAVF_HKEY_ARRAY_SIZE ((IAVF_VFQF_HKEY_MAX_INDEX + 1) * 4) diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c index ceedbf52f1045a..c61a7b09356afa 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c @@ -11,6 +11,10 @@ #include "iavf_trace.h" #include "iavf_prototype.h" +static int iavf_xmit_xdp_buff(const struct xdp_buff *xdp, + struct iavf_ring *xdp_ring, + bool map); + static inline __le64 build_ctob(u32 td_cmd, u32 td_offset, unsigned int size, u32 td_tag) { @@ -46,16 +50,34 @@ static void iavf_unmap_and_free_tx_resource(struct iavf_ring *ring, /* tx_buffer must be completely set up in the transmit path */ } +/** + * iavf_free_xdp_resource - Correctly free XDP TX buffer + * @tx_buffer: the buffer being released + */ +static void iavf_free_xdp_resource(struct iavf_tx_buffer *tx_buffer) +{ + struct page *page; + u32 put_size; + + page = tx_buffer->page; + put_size = dma_unmap_len(tx_buffer, len); + page_pool_put_page(page->pp, page, put_size, true); +} + /** * iavf_release_tx_resources 
- Release all Tx buffers on ring * @ring: TX or XDP ring */ static void iavf_release_tx_resources(struct iavf_ring *ring) { + bool is_xdp = iavf_ring_is_xdp(ring); u32 i; for (i = 0; i < ring->count; i++) - iavf_unmap_and_free_tx_resource(ring, &ring->tx_bi[i]); + if (is_xdp) + iavf_free_xdp_resource(&ring->tx_bi[i]); + else + iavf_unmap_and_free_tx_resource(ring, &ring->tx_bi[i]); } /** @@ -296,9 +318,7 @@ static bool iavf_clean_tx_irq(struct iavf_vsi *vsi, i += tx_ring->count; tx_ring->next_to_clean = i; - libie_sq_napi_stats_add(&tx_ring->sq_stats, &stats); - tx_ring->q_vector->tx.total_bytes += stats.bytes; - tx_ring->q_vector->tx.total_packets += stats.packets; + iavf_update_tx_ring_stats(tx_ring, &stats); if (tx_ring->flags & IAVF_TXR_FLAGS_WB_ON_ITR) { /* check to see if there are < 4 descriptors @@ -1109,12 +1129,15 @@ static bool iavf_is_non_eop(u64 qword, struct libie_rq_onstack_stats *stats) * @rx_ring: RX descriptor ring to transact packets on * @xdp: a prepared XDP buffer * @xdp_prog: an XDP program assigned to the interface + * @xdp_ring: XDP TX queue assigned to the RX ring + * @rxq_xdp_act: Logical OR of flags of XDP actions that require finalization * * Returns resulting XDP action. */ static unsigned int iavf_run_xdp(struct iavf_ring *rx_ring, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog) + struct bpf_prog *xdp_prog, struct iavf_ring *xdp_ring, + u32 *rxq_xdp_act) { unsigned int xdp_act; @@ -1124,11 +1147,18 @@ iavf_run_xdp(struct iavf_ring *rx_ring, struct xdp_buff *xdp, case XDP_PASS: case XDP_DROP: break; + case XDP_TX: + if (unlikely(iavf_xmit_xdp_buff(xdp, xdp_ring, false))) + goto xdp_err; + + *rxq_xdp_act |= IAVF_RXQ_XDP_ACT_FINALIZE_TX; + break; default: bpf_warn_invalid_xdp_action(rx_ring->netdev, xdp_prog, xdp_act); fallthrough; case XDP_ABORTED: +xdp_err: trace_xdp_exception(rx_ring->netdev, xdp_prog, xdp_act); return XDP_DROP; @@ -1158,13 +1188,20 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) struct sk_buff *skb = rx_ring->skb; u32 ntc = rx_ring->next_to_clean; u32 ring_size = rx_ring->count; + struct iavf_ring *xdp_ring; struct bpf_prog *xdp_prog; u32 hr = pool->p.offset; u32 cleaned_count = 0; unsigned int xdp_act; struct xdp_buff xdp; + u32 rxq_xdp_act = 0; + u16 cached_ntu; xdp_prog = rcu_dereference(rx_ring->xdp_prog); + if (xdp_prog) { + xdp_ring = rx_ring->xdp_ring; + cached_ntu = xdp_ring->next_to_use; + } xdp_init_buff(&xdp, PAGE_SIZE, &rx_ring->xdp_rxq); while (likely(cleaned_count < budget)) { @@ -1224,19 +1261,21 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) if (!xdp_prog) goto construct_skb; - xdp_act = iavf_run_xdp(rx_ring, &xdp, xdp_prog); + xdp_act = iavf_run_xdp(rx_ring, &xdp, xdp_prog, xdp_ring, + &rxq_xdp_act); put_size = max_t(u32, xdp.data_end - xdp.data_hard_start - hr, put_size); - if (xdp_act != XDP_PASS) { + if (xdp_act == XDP_PASS) + goto construct_skb; + else if (xdp_act == XDP_DROP) page_pool_put_page(pool, page, put_size, true); - stats.bytes += size; - stats.packets++; + stats.bytes += size; + stats.packets++; - skb = NULL; - goto no_skb; - } + skb = NULL; + goto no_skb; construct_skb: /* retrieve a buffer from the ring */ @@ -1301,6 +1340,8 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) rx_ring->next_to_clean = ntc; rx_ring->skb = skb; + iavf_finalize_xdp_rx(xdp_ring, rxq_xdp_act, cached_ntu); + if (to_refill >= IAVF_RX_BUFFER_WRITE) { to_refill = __iavf_alloc_rx_pages(rx_ring, to_refill, gfp); /* guarantee a trip back through this routine if there was 
@@ -2243,3 +2284,120 @@ netdev_tx_t iavf_xmit_frame(struct sk_buff *skb, struct net_device *netdev) return iavf_xmit_frame_ring(skb, tx_ring); } + +/** + * iavf_clean_xdp_irq - Reclaim a batch of TX resources from completed XDP_TX + * @xdp_ring: XDP Tx ring + * + * Returns number of cleaned descriptors. + */ +static u32 iavf_clean_xdp_irq(struct iavf_ring *xdp_ring) +{ + struct libie_sq_onstack_stats stats = { }; + struct iavf_tx_desc *last_rs_desc; + u32 ntc = xdp_ring->next_to_clean; + u32 cnt = xdp_ring->count; + u16 done_frames = 0; + u16 rs_idx; + u32 i; + + rs_idx = xdp_ring->tx_bi[ntc].rs_desc_idx; + last_rs_desc = IAVF_TX_DESC(xdp_ring, rs_idx); + if (last_rs_desc->cmd_type_offset_bsz & + cpu_to_le64(IAVF_TX_DESC_DTYPE_DESC_DONE)) { + done_frames = rs_idx >= ntc ? rs_idx - ntc + 1 : + rs_idx + cnt - ntc + 1; + last_rs_desc->cmd_type_offset_bsz = 0; + } + + for (i = 0; i < done_frames; i++) { + struct iavf_tx_buffer *tx_buf = &xdp_ring->tx_bi[ntc]; + + stats.bytes += tx_buf->bytecount; + /* normally tx_buf->gso_segs was taken but at this point + * it's always 1 for us + */ + stats.packets++; + + iavf_free_xdp_resource(tx_buf); + + ntc++; + if (ntc >= xdp_ring->count) + ntc = 0; + } + + xdp_ring->next_to_clean = ntc; + iavf_update_tx_ring_stats(xdp_ring, &stats); + + return i; +} + +/** + * iavf_xmit_xdp_buff - submit single buffer to XDP ring for transmission + * @xdp: XDP buffer pointer + * @xdp_ring: XDP ring for transmission + * @map: whether to map the buffer + * + * Returns negative on failure, 0 on success. + */ +static int iavf_xmit_xdp_buff(const struct xdp_buff *xdp, + struct iavf_ring *xdp_ring, + bool map) +{ + u32 batch_sz = IAVF_RING_QUARTER(xdp_ring); + u32 size = xdp->data_end - xdp->data; + u32 ntu = xdp_ring->next_to_use; + struct iavf_tx_buffer *tx_buff; + struct iavf_tx_desc *tx_desc; + void *data = xdp->data; + dma_addr_t dma; + u32 free; + + free = IAVF_DESC_UNUSED(xdp_ring); + if (unlikely(free < batch_sz)) + free += iavf_clean_xdp_irq(xdp_ring); + if (unlikely(!free)) { + libie_stats_inc_one(&xdp_ring->sq_stats, busy); + return -EBUSY; + } + + if (map) { + dma = dma_map_single(xdp_ring->dev, data, size, DMA_TO_DEVICE); + if (dma_mapping_error(xdp_ring->dev, dma)) + return -ENOMEM; + } else { + struct page *page = virt_to_page(data); + u32 hr = data - xdp->data_hard_start; + + dma = page_pool_get_dma_addr(page) + hr; + dma_sync_single_for_device(xdp_ring->dev, dma, size, + DMA_BIDIRECTIONAL); + } + + tx_buff = &xdp_ring->tx_bi[ntu]; + tx_buff->bytecount = size; + tx_buff->gso_segs = 1; + /* TODO: set type to XDP_TX or XDP_XMIT depending on @map and assign + * either ->data_hard_start (which is pointer to xdp_frame) or @page + * above. 
+ */ + tx_buff->page = virt_to_page(data); + + /* record length, and DMA address */ + dma_unmap_len_set(tx_buff, len, size); + dma_unmap_addr_set(tx_buff, dma, dma); + + tx_desc = IAVF_TX_DESC(xdp_ring, ntu); + tx_desc->buffer_addr = cpu_to_le64(dma); + tx_desc->cmd_type_offset_bsz = build_ctob(IAVF_TX_DESC_CMD_EOP, 0, + size, 0); + + ntu++; + + if (ntu == xdp_ring->count) + ntu = 0; + + xdp_ring->next_to_use = ntu; + + return 0; +} diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.h b/drivers/net/ethernet/intel/iavf/iavf_txrx.h index d6e4eba9492881..31a44352dfbd40 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.h +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.h @@ -188,10 +188,15 @@ static inline unsigned int iavf_txd_use_count(unsigned int size) #define IAVF_TX_FLAGS_VLAN_SHIFT 16 struct iavf_tx_buffer { - struct iavf_tx_desc *next_to_watch; + + /* Track the last frame in batch/packet */ + union { + struct iavf_tx_desc *next_to_watch; /* on skb TX queue */ + u16 rs_desc_idx; /* on XDP queue */ + }; union { struct sk_buff *skb; - void *raw_buf; + struct page *page; }; unsigned int bytecount; unsigned short gso_segs; @@ -279,6 +284,12 @@ struct iavf_ring { struct xdp_rxq_info xdp_rxq; } ____cacheline_internodealigned_in_smp; +#define IAVF_RING_QUARTER(R) ((R)->count >> 2) +#define IAVF_RX_DESC(R, i) (&(((union iavf_32byte_rx_desc *)((R)->desc))[i])) +#define IAVF_TX_DESC(R, i) (&(((struct iavf_tx_desc *)((R)->desc))[i])) +#define IAVF_TX_CTXTDESC(R, i) \ + (&(((struct iavf_tx_context_desc *)((R)->desc))[i])) + #define IAVF_ITR_ADAPTIVE_MIN_INC 0x0002 #define IAVF_ITR_ADAPTIVE_MIN_USECS 0x0002 #define IAVF_ITR_ADAPTIVE_MAX_USECS 0x007e @@ -384,4 +395,83 @@ static inline struct netdev_queue *txring_txq(const struct iavf_ring *ring) { return netdev_get_tx_queue(ring->netdev, ring->queue_index); } + +/** + * iavf_xdp_ring_update_tail - Updates the XDP Tx ring tail register + * @xdp_ring: XDP Tx ring + * + * Notify hardware the new descriptor is ready to be transmitted + */ +static inline void iavf_xdp_ring_update_tail(const struct iavf_ring *xdp_ring) +{ + /* Force memory writes to complete before letting h/w + * know there are new descriptors to fetch. + */ + wmb(); + writel_relaxed(xdp_ring->next_to_use, xdp_ring->tail); +} + +/** + * __iavf_update_tx_ring_stats - Update TX ring stats after transmit completes + * @tx_ring: TX descriptor ring + * @tc: ring container to account the completed packets to + * @stats: number of packets and bytes completed since the last update, + * added to both the ring stats and the container totals + **/ +static inline void +__iavf_update_tx_ring_stats(struct iavf_ring *tx_ring, + struct iavf_ring_container *tc, + const struct libie_sq_onstack_stats *stats) +{ + libie_sq_napi_stats_add(&tx_ring->sq_stats, stats); + tc->total_bytes += stats->bytes; + tc->total_packets += stats->packets; +} + +#define iavf_update_tx_ring_stats(r, s) \ + __iavf_update_tx_ring_stats(r, &(r)->q_vector->tx, s) + +#define IAVF_RXQ_XDP_ACT_FINALIZE_TX BIT(0) + +/** + * iavf_set_rs_bit - set RS bit on last produced descriptor. + * @xdp_ring: XDP ring to produce the HW Tx descriptors on + * + * Returns the index of the descriptor the RS bit was set on (one behind the current NTU). + */ +static inline u16 iavf_set_rs_bit(struct iavf_ring *xdp_ring) +{ + u16 rs_idx = xdp_ring->next_to_use ?
xdp_ring->next_to_use - 1 : + xdp_ring->count - 1; + struct iavf_tx_desc *tx_desc; + + tx_desc = IAVF_TX_DESC(xdp_ring, rs_idx); + tx_desc->cmd_type_offset_bsz |= + cpu_to_le64(IAVF_TX_DESC_CMD_RS << IAVF_TXD_QW1_CMD_SHIFT); + + return rs_idx; +} + +/** + * iavf_finalize_xdp_rx - Finalize XDP actions once per RX ring clean + * @xdp_ring: XDP TX queue assigned to a given RX ring + * @rxq_xdp_act: Logical OR of flags of XDP actions that require finalization + * @first_idx: index of the first frame in the transmitted batch on XDP queue + **/ +static inline void iavf_finalize_xdp_rx(struct iavf_ring *xdp_ring, + u32 rxq_xdp_act, u32 first_idx) +{ + if (rxq_xdp_act & IAVF_RXQ_XDP_ACT_FINALIZE_TX) { + struct iavf_tx_buffer *tx_buf = &xdp_ring->tx_bi[first_idx]; + + tx_buf->rs_desc_idx = iavf_set_rs_bit(xdp_ring); + iavf_xdp_ring_update_tail(xdp_ring); + } +} + +static inline bool iavf_ring_is_xdp(struct iavf_ring *ring) +{ + return !!(ring->flags & IAVF_TXRX_FLAGS_XDP); +} + #endif /* _IAVF_TXRX_H_ */ From ed6e94299e56c42c557c39527b7b50cb69b1839c Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Wed, 30 Nov 2022 09:10:04 +0100 Subject: [PATCH 28/40] iavf: Implement XDP redirect path Implement the XDP_REDIRECT action and the ndo_xdp_xmit() callback. For now, packets redirected from a CPU whose index is greater than the number of XDP queues are simply dropped with an error. This is a rather common situation, especially when the VF is configured to run on the host, and it will be addressed in later patches. The patch also refactors Rx XDP handling to use a switch statement due to the increased number of actions. Signed-off-by: Larysa Zaremba --- drivers/net/ethernet/intel/iavf/iavf_main.c | 1 + drivers/net/ethernet/intel/iavf/iavf_txrx.c | 101 +++++++++++++++++--- drivers/net/ethernet/intel/iavf/iavf_txrx.h | 24 ++++- 3 files changed, 109 insertions(+), 17 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index d654b5e4d43b13..2968e8541979a1 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -5249,6 +5249,7 @@ static const struct net_device_ops iavf_netdev_ops = { .ndo_set_features = iavf_set_features, .ndo_setup_tc = iavf_setup_tc, .ndo_bpf = iavf_xdp, + .ndo_xdp_xmit = iavf_xdp_xmit, }; /** diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c index c61a7b09356afa..2f6ccdace782fe 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c @@ -52,16 +52,31 @@ static void iavf_unmap_and_free_tx_resource(struct iavf_ring *ring, /** * iavf_free_xdp_resource - Correctly free XDP TX buffer - * @tx_buffer: the buffer being released + * @ring: XDP ring + * @tx_buffer: the buffer being released */ -static void iavf_free_xdp_resource(struct iavf_tx_buffer *tx_buffer) +static void iavf_free_xdp_resource(struct iavf_ring *ring, + struct iavf_tx_buffer *tx_buffer) { struct page *page; u32 put_size; - page = tx_buffer->page; - put_size = dma_unmap_len(tx_buffer, len); - page_pool_put_page(page->pp, page, put_size, true); + switch (tx_buffer->xdp_type) { + case IAVF_XDP_BUFFER_TX: + page = tx_buffer->page; + put_size = dma_unmap_len(tx_buffer, len); + page_pool_put_page(page->pp, page, put_size, true); + break; + case IAVF_XDP_BUFFER_FRAME: + dma_unmap_page(ring->dev, + dma_unmap_addr(tx_buffer, dma), + dma_unmap_len(tx_buffer, len), + DMA_TO_DEVICE); + xdp_return_frame(tx_buffer->xdpf); + break; + } + + tx_buffer->xdp_type =
IAVF_XDP_BUFFER_NONE; } /** @@ -75,7 +90,7 @@ static void iavf_release_tx_resources(struct iavf_ring *ring) for (i = 0; i < ring->count; i++) if (is_xdp) - iavf_free_xdp_resource(&ring->tx_bi[i]); + iavf_free_xdp_resource(ring, &ring->tx_bi[i]); else iavf_unmap_and_free_tx_resource(ring, &ring->tx_bi[i]); } @@ -1153,6 +1168,12 @@ iavf_run_xdp(struct iavf_ring *rx_ring, struct xdp_buff *xdp, *rxq_xdp_act |= IAVF_RXQ_XDP_ACT_FINALIZE_TX; break; + case XDP_REDIRECT: + if (unlikely(xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog))) + goto xdp_err; + + *rxq_xdp_act |= IAVF_RXQ_XDP_ACT_FINALIZE_REDIR; + break; default: bpf_warn_invalid_xdp_action(rx_ring->netdev, xdp_prog, xdp_act); @@ -2319,7 +2340,7 @@ static u32 iavf_clean_xdp_irq(struct iavf_ring *xdp_ring) */ stats.packets++; - iavf_free_xdp_resource(tx_buf); + iavf_free_xdp_resource(xdp_ring, tx_buf); ntc++; if (ntc >= xdp_ring->count) @@ -2336,13 +2357,13 @@ static u32 iavf_clean_xdp_irq(struct iavf_ring *xdp_ring) * iavf_xmit_xdp_buff - submit single buffer to XDP ring for transmission * @xdp: XDP buffer pointer * @xdp_ring: XDP ring for transmission - * @map: whether to map the buffer + * @frame: whether the function is called from .ndo_xdp_xmit() * * Returns negative on failure, 0 on success. */ static int iavf_xmit_xdp_buff(const struct xdp_buff *xdp, struct iavf_ring *xdp_ring, - bool map) + bool frame) { u32 batch_sz = IAVF_RING_QUARTER(xdp_ring); u32 size = xdp->data_end - xdp->data; @@ -2361,7 +2382,7 @@ static int iavf_xmit_xdp_buff(const struct xdp_buff *xdp, return -EBUSY; } - if (map) { + if (frame) { dma = dma_map_single(xdp_ring->dev, data, size, DMA_TO_DEVICE); if (dma_mapping_error(xdp_ring->dev, dma)) return -ENOMEM; @@ -2377,11 +2398,13 @@ static int iavf_xmit_xdp_buff(const struct xdp_buff *xdp, tx_buff = &xdp_ring->tx_bi[ntu]; tx_buff->bytecount = size; tx_buff->gso_segs = 1; - /* TODO: set type to XDP_TX or XDP_XMIT depending on @map and assign - * either ->data_hard_start (which is pointer to xdp_frame) or @page - * above. 
- */ - tx_buff->page = virt_to_page(data); + if (frame) { + tx_buff->xdp_type = IAVF_XDP_BUFFER_FRAME; + tx_buff->xdpf = xdp->data_hard_start; + } else { + tx_buff->xdp_type = IAVF_XDP_BUFFER_TX; + tx_buff->page = virt_to_page(data); + } /* record length, and DMA address */ dma_unmap_len_set(tx_buff, len, size); @@ -2401,3 +2424,51 @@ static int iavf_xmit_xdp_buff(const struct xdp_buff *xdp, return 0; } + +int iavf_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, + u32 flags) +{ + struct iavf_adapter *adapter = netdev_priv(dev); + struct iavf_tx_buffer *tx_buf; + struct iavf_ring *xdp_ring; + u32 queue_index, nxmit = 0; + int err = 0; + + if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) + return -EINVAL; + + if (unlikely(adapter->state == __IAVF_DOWN)) + return -ENETDOWN; + + if (!iavf_adapter_xdp_active(adapter)) + return -ENXIO; + + queue_index = smp_processor_id(); + if (queue_index >= adapter->num_active_queues) + return -ENXIO; + + xdp_ring = &adapter->xdp_rings[queue_index]; + + tx_buf = &xdp_ring->tx_bi[xdp_ring->next_to_use]; + for (u32 i = 0; i < n; i++) { + struct xdp_frame *xdpf = frames[i]; + struct xdp_buff xdp; + + xdp_convert_frame_to_buff(xdpf, &xdp); + err = iavf_xmit_xdp_buff(&xdp, xdp_ring, true); + if (unlikely(err)) { + netdev_err(dev, "XDP frame TX failed, error: %d\n", + err); + break; + } + + nxmit++; + } + + if (likely(nxmit)) + tx_buf->rs_desc_idx = iavf_set_rs_bit(xdp_ring); + if (flags & XDP_XMIT_FLUSH) + iavf_xdp_ring_update_tail(xdp_ring); + + return nxmit; +} diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.h b/drivers/net/ethernet/intel/iavf/iavf_txrx.h index 31a44352dfbd40..d446f19dc446d5 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.h +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.h @@ -187,6 +187,18 @@ static inline unsigned int iavf_txd_use_count(unsigned int size) #define IAVF_TX_FLAGS_VLAN_PRIO_SHIFT 29 #define IAVF_TX_FLAGS_VLAN_SHIFT 16 +/** + * enum iavf_xdp_buffer_type - type of &iavf_tx_buffer on XDP queue + * @IAVF_XDP_BUFFER_NONE: unused, no action required + * @IAVF_XDP_BUFFER_TX: free according to our memory model + * @IAVF_XDP_BUFFER_FRAME: use xdp_return_frame() + */ +enum iavf_xdp_buffer_type { + IAVF_XDP_BUFFER_NONE = 0U, + IAVF_XDP_BUFFER_TX, + IAVF_XDP_BUFFER_FRAME, +}; + struct iavf_tx_buffer { /* Track the last frame in batch/packet */ @@ -195,11 +207,13 @@ struct iavf_tx_buffer { u16 rs_desc_idx; /* on XDP queue */ }; union { - struct sk_buff *skb; - struct page *page; + struct sk_buff *skb; /* used for .ndo_start_xmit() */ + struct page *page; /* used for XDP_TX */ + struct xdp_frame *xdpf; /* used for .ndo_xdp_xmit() */ }; unsigned int bytecount; unsigned short gso_segs; + unsigned short xdp_type; DEFINE_DMA_UNMAP_ADDR(dma); DEFINE_DMA_UNMAP_LEN(len); @@ -326,6 +340,9 @@ void iavf_detect_recover_hung(struct iavf_vsi *vsi); int __iavf_maybe_stop_tx(struct iavf_ring *tx_ring, int size); bool __iavf_chk_linearize(struct sk_buff *skb); +int iavf_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, + u32 flags); + /** * iavf_xmit_descriptor_count - calculate number of Tx descriptors needed * @skb: send buffer @@ -432,6 +449,7 @@ __iavf_update_tx_ring_stats(struct iavf_ring *tx_ring, __iavf_update_tx_ring_stats(r, &(r)->q_vector->tx, s) #define IAVF_RXQ_XDP_ACT_FINALIZE_TX BIT(0) +#define IAVF_RXQ_XDP_ACT_FINALIZE_REDIR BIT(1) /** * iavf_set_rs_bit - set RS bit on last produced descriptor. 
@@ -461,6 +479,8 @@ static inline u16 iavf_set_rs_bit(struct iavf_ring *xdp_ring) static inline void iavf_finalize_xdp_rx(struct iavf_ring *xdp_ring, u32 rxq_xdp_act, u32 first_idx) { + if (rxq_xdp_act & IAVF_RXQ_XDP_ACT_FINALIZE_REDIR) + xdp_do_flush_map(); if (rxq_xdp_act & IAVF_RXQ_XDP_ACT_FINALIZE_TX) { struct iavf_tx_buffer *tx_buf = &xdp_ring->tx_bi[first_idx]; From 55adf9514f6061736684bbd1ed80f79274da43db Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Wed, 30 Nov 2022 15:12:21 +0100 Subject: [PATCH 29/40] iavf: Allow XDP TxQ sharing Port of commit 22bf877 ("ice: introduce XDP_TX fallback path"). The patch handles the case when the number of queues is not sufficient for the current number of CPUs. To avoid dropping some packets redirected from other interfaces, XDP TxQs are allowed to be shared between CPUs, which imposes a locking requirement. The static key approach has little to no performance penalty when sharing is not needed. This mechanism is much more applicable when dealing with VFs. In fact, the maximum number of queue pairs that the ice PF can give to an iavf VF is 16, which allows up to 8 XDP TxQs, so without XDP TxQ sharing, some redirected packets can be dropped even on a 10-CPU system. Signed-off-by: Larysa Zaremba --- drivers/net/ethernet/intel/iavf/iavf_main.c | 27 ++++++++++++++++ drivers/net/ethernet/intel/iavf/iavf_txrx.c | 34 ++++++++++++++++++--- drivers/net/ethernet/intel/iavf/iavf_txrx.h | 11 ++++++- 3 files changed, 67 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 2968e8541979a1..d1a7617db6fb2a 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -1724,9 +1724,31 @@ static void iavf_init_tx_ring(struct iavf_adapter *adapter, if (xdp_ring) { tx_ring->queue_index += adapter->num_active_queues; tx_ring->flags |= IAVF_TXRX_FLAGS_XDP; + spin_lock_init(&tx_ring->tx_lock); } } +/** + * iavf_xdp_cfg_tx_sharing - Enable XDP TxQ sharing, if needed + * @adapter: board private structure + * + * If there are more CPUs than rings, sharing XDP TxQs allows us + * to handle XDP_REDIRECT from other interfaces.
+ */ +static void iavf_xdp_cfg_tx_sharing(struct iavf_adapter *adapter) +{ + u32 num_active_queues = adapter->num_active_queues; + u32 num_cpus = num_online_cpus(); + + if (!iavf_adapter_xdp_active(adapter) || num_active_queues >= num_cpus) + return; + + netdev_warn(adapter->netdev, + "System has %u CPUs, but only %u XDP queues can be configured, entering XDP TxQ sharing mode, performance is decreased\n", + num_cpus, num_active_queues); + static_branch_inc(&iavf_xdp_locking_key); +} + /** * iavf_alloc_xdp_queues - Allocate memory for XDP rings * @adapter: board private structure to initialize @@ -1751,6 +1773,8 @@ static int iavf_alloc_xdp_queues(struct iavf_adapter *adapter, u32 num_active_qu adapter->rx_rings[i].xdp_ring = &adapter->xdp_rings[i]; } + iavf_xdp_cfg_tx_sharing(adapter); + return 0; } @@ -3559,6 +3583,9 @@ void iavf_free_all_tx_resources(struct iavf_adapter *adapter) if (!adapter->tx_rings) return; + if (static_key_enabled(&iavf_xdp_locking_key)) + static_branch_dec(&iavf_xdp_locking_key); + for (i = 0; i < adapter->num_active_queues; i++) if (adapter->tx_rings[i].desc) iavf_free_tx_resources(&adapter->tx_rings[i]); diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c index 2f6ccdace782fe..85ee238d2fe50f 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c @@ -11,9 +11,10 @@ #include "iavf_trace.h" #include "iavf_prototype.h" -static int iavf_xmit_xdp_buff(const struct xdp_buff *xdp, - struct iavf_ring *xdp_ring, - bool map); +DEFINE_STATIC_KEY_FALSE(iavf_xdp_locking_key); + +static bool iavf_xdp_xmit_back(const struct xdp_buff *buff, + struct iavf_ring *xdp_ring); static inline __le64 build_ctob(u32 td_cmd, u32 td_offset, unsigned int size, u32 td_tag) @@ -1163,7 +1164,7 @@ iavf_run_xdp(struct iavf_ring *rx_ring, struct xdp_buff *xdp, case XDP_DROP: break; case XDP_TX: - if (unlikely(iavf_xmit_xdp_buff(xdp, xdp_ring, false))) + if (unlikely(!iavf_xdp_xmit_back(xdp, xdp_ring))) goto xdp_err; *rxq_xdp_act |= IAVF_RXQ_XDP_ACT_FINALIZE_TX; @@ -2425,6 +2426,23 @@ static int iavf_xmit_xdp_buff(const struct xdp_buff *xdp, return 0; } +static bool iavf_xdp_xmit_back(const struct xdp_buff *buff, + struct iavf_ring *xdp_ring) +{ + bool ret; + + if (static_branch_unlikely(&iavf_xdp_locking_key)) + spin_lock(&xdp_ring->tx_lock); + + /* TODO: improve XDP_TX by batching */ + ret = !iavf_xmit_xdp_buff(buff, xdp_ring, false); + + if (static_branch_unlikely(&iavf_xdp_locking_key)) + spin_unlock(&xdp_ring->tx_lock); + + return ret; +} + int iavf_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, u32 flags) { @@ -2444,11 +2462,16 @@ int iavf_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, return -ENXIO; queue_index = smp_processor_id(); + if (static_branch_unlikely(&iavf_xdp_locking_key)) + queue_index %= adapter->num_xdp_tx_queues; if (queue_index >= adapter->num_active_queues) return -ENXIO; xdp_ring = &adapter->xdp_rings[queue_index]; + if (static_branch_unlikely(&iavf_xdp_locking_key)) + spin_lock(&xdp_ring->tx_lock); + tx_buf = &xdp_ring->tx_bi[xdp_ring->next_to_use]; for (u32 i = 0; i < n; i++) { struct xdp_frame *xdpf = frames[i]; @@ -2470,5 +2493,8 @@ int iavf_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, if (flags & XDP_XMIT_FLUSH) iavf_xdp_ring_update_tail(xdp_ring); + if (static_branch_unlikely(&iavf_xdp_locking_key)) + spin_unlock(&xdp_ring->tx_lock); + return nxmit; } diff --git 
a/drivers/net/ethernet/intel/iavf/iavf_txrx.h b/drivers/net/ethernet/intel/iavf/iavf_txrx.h index d446f19dc446d5..6d09fbb234cae3 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.h +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.h @@ -272,7 +272,8 @@ struct iavf_ring { struct bpf_prog __rcu *xdp_prog; struct iavf_ring *xdp_ring; - struct sk_buff *skb; /* When iavf_clean_rx_ring_irq() must + union { + struct sk_buff *skb; /* When iavf_clean_rx_ring_irq() must * return before it sees the EOP for * the current packet, we save that skb * here and resume receiving this @@ -280,6 +281,8 @@ struct iavf_ring { * iavf_clean_rx_ring_irq() is called * for this ring. */ + spinlock_t tx_lock; /* Protect XDP TX ring, when shared */ + }; /* stats structs */ union { @@ -340,6 +343,8 @@ void iavf_detect_recover_hung(struct iavf_vsi *vsi); int __iavf_maybe_stop_tx(struct iavf_ring *tx_ring, int size); bool __iavf_chk_linearize(struct sk_buff *skb); +DECLARE_STATIC_KEY_FALSE(iavf_xdp_locking_key); + int iavf_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, u32 flags); @@ -484,8 +489,12 @@ static inline void iavf_finalize_xdp_rx(struct iavf_ring *xdp_ring, if (rxq_xdp_act & IAVF_RXQ_XDP_ACT_FINALIZE_TX) { struct iavf_tx_buffer *tx_buf = &xdp_ring->tx_bi[first_idx]; + if (static_branch_unlikely(&iavf_xdp_locking_key)) + spin_lock(&xdp_ring->tx_lock); tx_buf->rs_desc_idx = iavf_set_rs_bit(xdp_ring); iavf_xdp_ring_update_tail(xdp_ring); + if (static_branch_unlikely(&iavf_xdp_locking_key)) + spin_unlock(&xdp_ring->tx_lock); } } From 8f9b548ab0e64310217143435bdfafa45d8987a3 Mon Sep 17 00:00:00 2001 From: Michal Kubiak Date: Thu, 9 Mar 2023 16:07:14 +0100 Subject: [PATCH 30/40] iavf: Enable XDP netdev features Enable NETDEV_XDP_ACT_BASIC and NETDEV_XDP_ACT_REDIRECT XDP features in netdev. Signed-off-by: Michal Kubiak --- drivers/net/ethernet/intel/iavf/iavf_main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index d1a7617db6fb2a..5c252b2566681e 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -2793,6 +2793,8 @@ static void iavf_init_config_adapter(struct iavf_adapter *adapter) iavf_set_ethtool_ops(netdev); netdev->max_mtu = LIBIE_MAX_MTU; + netdev->xdp_features = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT; + if (!is_valid_ether_addr(adapter->hw.mac.addr)) { dev_info(&pdev->dev, "Invalid MAC address %pM, using random\n", adapter->hw.mac.addr); From 7910142f57c8c0e1c52842752f0bde6ebb7624c1 Mon Sep 17 00:00:00 2001 From: Michal Kubiak Date: Wed, 30 Nov 2022 12:01:26 +0100 Subject: [PATCH 31/40] iavf: Add AF_XDP initialization Add necessary functions and data structures to support AF_XDP feature. Implement handling of 'XDP_SETUP_XSK_POOL' in .ndo_bpf(). Also, implement functions for selectively stopping only those queues which take part in XDP socket creation. 
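The expected shape of the XDP_SETUP_XSK_POOL handling is sketched below. This is an illustrative outline only, not part of the diff: iavf_xsk_pool_setup(), iavf_qp_dis(), iavf_max_xdp_queues_count() and the af_xdp_zc_qps bitmap come from this patch, while iavf_qp_ena() and the exact error handling are assumed placeholders; xsk_pool_dma_map()/xsk_pool_dma_unmap() and xsk_get_pool_from_qid() are the standard XSK pool API.

  /* Quiesce one queue pair, map/unmap the XSK pool for DMA, track the
   * zero-copy state in af_xdp_zc_qps, then bring the pair back up.
   */
  static int iavf_xsk_pool_setup(struct iavf_adapter *adapter,
                                 struct xsk_buff_pool *pool, u32 qid)
  {
          bool if_running, enable = !!pool;
          int err = 0;

          if (qid >= iavf_max_xdp_queues_count(adapter))
                  return -EINVAL;

          if_running = netif_running(adapter->netdev) &&
                       iavf_adapter_xdp_active(adapter);

          /* Stop only the affected queue pair, not the whole interface */
          if (if_running) {
                  err = iavf_qp_dis(adapter, qid);
                  if (err)
                          return err;
          }

          if (enable) {
                  err = xsk_pool_dma_map(pool, &adapter->pdev->dev,
                                         LIBIE_RX_DMA_ATTR);
                  if (!err)
                          set_bit(qid, adapter->af_xdp_zc_qps);
          } else {
                  struct xsk_buff_pool *old;

                  old = xsk_get_pool_from_qid(adapter->netdev, qid);
                  if (old)
                          xsk_pool_dma_unmap(old, LIBIE_RX_DMA_ATTR);
                  clear_bit(qid, adapter->af_xdp_zc_qps);
          }

          /* Restart the pair and report the first error encountered */
          if (if_running) {
                  int ret = iavf_qp_ena(adapter, qid);

                  if (!err)
                          err = ret;
          }

          return err;
  }

The diff below adds the per-queue-pair quiesce helpers such a flow relies on (iavf_qp_clean_rings(), iavf_qvec_toggle_napi(), iavf_qvec_dis_irq()/iavf_qvec_ena_irq() and iavf_qp_dis()).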
Signed-off-by: Michal Kubiak --- drivers/net/ethernet/intel/iavf/Makefile | 3 +- drivers/net/ethernet/intel/iavf/iavf.h | 5 + drivers/net/ethernet/intel/iavf/iavf_main.c | 17 +- drivers/net/ethernet/intel/iavf/iavf_xsk.c | 315 ++++++++++++++++++++ drivers/net/ethernet/intel/iavf/iavf_xsk.h | 15 + 5 files changed, 352 insertions(+), 3 deletions(-) create mode 100644 drivers/net/ethernet/intel/iavf/iavf_xsk.c create mode 100644 drivers/net/ethernet/intel/iavf/iavf_xsk.h diff --git a/drivers/net/ethernet/intel/iavf/Makefile b/drivers/net/ethernet/intel/iavf/Makefile index 9c3e45c54d0133..19eb29005e7a06 100644 --- a/drivers/net/ethernet/intel/iavf/Makefile +++ b/drivers/net/ethernet/intel/iavf/Makefile @@ -13,4 +13,5 @@ obj-$(CONFIG_IAVF) += iavf.o iavf-objs := iavf_main.o iavf_ethtool.o iavf_virtchnl.o iavf_fdir.o \ iavf_adv_rss.o \ - iavf_txrx.o iavf_common.o iavf_adminq.o iavf_client.o + iavf_txrx.o iavf_common.o iavf_adminq.o iavf_client.o \ + iavf_xsk.o diff --git a/drivers/net/ethernet/intel/iavf/iavf.h b/drivers/net/ethernet/intel/iavf/iavf.h index d10fb0f2606545..75e66a53064d1c 100644 --- a/drivers/net/ethernet/intel/iavf/iavf.h +++ b/drivers/net/ethernet/intel/iavf/iavf.h @@ -36,7 +36,9 @@ #include #include #include +#include +#include "iavf_xsk.h" #include "iavf_type.h" #include #include "iavf_txrx.h" @@ -243,6 +245,8 @@ struct iavf_cloud_filter { bool add; /* filter needs to be added */ }; +#define IAVF_XDP_LINK_TIMEOUT_MS 1000 + #define IAVF_RESET_WAIT_MS 10 #define IAVF_RESET_WAIT_DETECTED_COUNT 500 #define IAVF_RESET_WAIT_COMPLETE_COUNT 2000 @@ -267,6 +271,7 @@ struct iavf_adapter { u32 num_xdp_tx_queues; u32 num_req_queues; struct bpf_prog *xdp_prog; + unsigned long *af_xdp_zc_qps; /* TX */ struct iavf_ring *tx_rings; diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 5c252b2566681e..d7f7fefe3afa92 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -2851,6 +2851,11 @@ static void iavf_init_config_adapter(struct iavf_adapter *adapter) set_bit(__IAVF_VSI_DOWN, adapter->vsi.state); rtnl_unlock(); + adapter->af_xdp_zc_qps = bitmap_zalloc(adapter->num_active_queues, + GFP_KERNEL); + if (!adapter->af_xdp_zc_qps) + goto err_zc_qps; + iavf_misc_irq_enable(adapter); wake_up(&adapter->down_waitqueue); @@ -2872,6 +2877,8 @@ static void iavf_init_config_adapter(struct iavf_adapter *adapter) return; err_mem: iavf_free_rss(adapter); +err_zc_qps: + bitmap_free(adapter->af_xdp_zc_qps); err_register: iavf_free_misc_irq(adapter); err_sw_init: @@ -3132,6 +3139,7 @@ static void iavf_disable_vf(struct iavf_adapter *adapter) } iavf_free_xdp_prog(adapter); + bitmap_free(adapter->af_xdp_zc_qps); spin_lock_bh(&adapter->mac_vlan_list_lock); @@ -4960,7 +4968,6 @@ static void iavf_assign_bpf_prog(struct iavf_adapter *adapter, bpf_prog_put(old_prog); } -#define IAVF_XDP_LINK_TIMEOUT_MS 1000 #define IAVF_XDP_LOCK_TIMEOUT_MS 5000 /** @@ -5108,7 +5115,7 @@ static void iavf_destroy_xdp_rings(struct iavf_adapter *adapter) static int iavf_prepare_xdp_rings(struct iavf_adapter *adapter, struct bpf_prog *prog) { - int err; + int i, err; iavf_unmap_rings_from_vectors(adapter); iavf_assign_bpf_prog(adapter, prog); @@ -5124,6 +5131,9 @@ static int iavf_prepare_xdp_rings(struct iavf_adapter *adapter, iavf_map_rings_to_vectors(adapter); + for_each_set_bit(i, adapter->af_xdp_zc_qps, adapter->num_active_queues) + napi_schedule(&adapter->rx_rings[i].q_vector->napi); + return 0; err_alloc_queues: @@ -5257,6 
+5267,9 @@ static int iavf_xdp(struct net_device *netdev, struct netdev_bpf *xdp) switch (xdp->command) { case XDP_SETUP_PROG: return iavf_setup_xdp(adapter, xdp->prog, xdp->extack); + case XDP_SETUP_XSK_POOL: + return iavf_xsk_pool_setup(adapter, xdp->xsk.pool, + xdp->xsk.queue_id); default: return -EINVAL; } diff --git a/drivers/net/ethernet/intel/iavf/iavf_xsk.c b/drivers/net/ethernet/intel/iavf/iavf_xsk.c new file mode 100644 index 00000000000000..6d0c40ef002d40 --- /dev/null +++ b/drivers/net/ethernet/intel/iavf/iavf_xsk.c @@ -0,0 +1,315 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2022 Intel Corporation. */ + +#include +#include +#include +#include "iavf.h" +#include "iavf_xsk.h" + +#define IAVF_CRIT_LOCK_WAIT_TIMEOUT_MS 1000 +#define IAVF_VC_MSG_TIMEOUT_MS 3000 + +/** + * iavf_max_xdp_queues_count - Returns the maximal number of XDP queues + * that can be created for current configuration + * of a given adapter. + * @adapter: adapter where XDP socket will be set up + */ +static u32 +iavf_max_xdp_queues_count(struct iavf_adapter *adapter) +{ + u32 max_qp_num = adapter->vsi_res->num_queue_pairs; + u32 num_active_queues = adapter->num_active_queues; + + return num_active_queues * 2 > max_qp_num ? max_qp_num / 2 : + num_active_queues; +} + +/** + * iavf_qp_clean_rings - Cleans all the rings of a given index + * @adapter: adapter that contains rings of interest + * @q_idx: ring index in array + */ +static void +iavf_qp_clean_rings(struct iavf_adapter *adapter, u16 q_idx) +{ + iavf_clean_tx_ring(&adapter->tx_rings[q_idx]); + if (iavf_adapter_xdp_active(adapter)) { + synchronize_rcu(); + iavf_clean_tx_ring(&adapter->xdp_rings[q_idx]); + } + iavf_clean_rx_ring(&adapter->rx_rings[q_idx]); +} + +/** + * iavf_qvec_toggle_napi - Enables/disables NAPI for a given q_vector + * @adapter: adapter that has netdev + * @q_vector: q_vector that has NAPI context + * @enable: true for enable, false for disable + */ +static void +iavf_qvec_toggle_napi(struct iavf_adapter *adapter, + struct iavf_q_vector *q_vector, bool enable) +{ + if (!adapter->vsi.netdev || !q_vector) + return; + + if (enable) + napi_enable(&q_vector->napi); + else + napi_disable(&q_vector->napi); +} + +/** + * iavf_qvec_dis_irq - Mask off queue interrupt generation on given ring + * @adapter: the adapter that contains queue vector being un-configured + * @q_vector: queue vector + */ +static void +iavf_qvec_dis_irq(struct iavf_adapter *adapter, struct iavf_q_vector *q_vector) +{ + int base = adapter->vsi.base_vector; + struct iavf_hw *hw = &adapter->hw; + u16 reg = q_vector->reg_idx; + + wr32(hw, IAVF_VFINT_DYN_CTLN1(reg), 0); + synchronize_irq(adapter->msix_entries[reg + base].vector); + iavf_flush(hw); +} + +/** + * iavf_qvec_ena_irq - Enable IRQ for given queue vector + * @adapter: the adapter that contains queue vector + * @q_vector: queue vector + */ +static void +iavf_qvec_ena_irq(struct iavf_adapter *adapter, struct iavf_q_vector *q_vector) +{ + struct iavf_hw *hw = &adapter->hw; + + if (adapter) + if (adapter->state == __IAVF_DOWN) + return; + + wr32(hw, IAVF_VFINT_DYN_CTLN1(q_vector->reg_idx), + IAVF_VFINT_DYN_CTLN1_INTENA_MASK | + IAVF_VFINT_DYN_CTLN1_ITR_INDX_MASK); + + iavf_flush(hw); +} + +/** + * iavf_qp_dis - Disables a queue pair + * @adapter: adapter of interest + * @q_idx: ring index in array + * + * Returns 0 on success, negative on failure. 
+ */ +static int iavf_qp_dis(struct iavf_adapter *adapter, u16 q_idx) +{ + struct iavf_vsi *vsi = &adapter->vsi; + struct iavf_ring *rx_ring, *xdp_ring; + struct iavf_q_vector *q_vector; + u32 rx_queues, tx_queues; + int err; + + if (q_idx >= adapter->num_active_queues) + return -EINVAL; + + rx_ring = &adapter->rx_rings[q_idx]; + q_vector = rx_ring->q_vector; + + rx_queues = BIT(q_idx); + tx_queues = rx_queues; + + netif_tx_stop_queue(netdev_get_tx_queue(vsi->netdev, q_idx)); + + iavf_qvec_toggle_napi(adapter, q_vector, false); + iavf_qvec_dis_irq(adapter, q_vector); + + xdp_ring = &adapter->xdp_rings[q_idx]; + + tx_queues |= BIT(xdp_ring->queue_index); + + err = iavf_disable_selected_queues(adapter, rx_queues, tx_queues, true); + if (err) + goto dis_exit; + + iavf_qp_clean_rings(adapter, q_idx); +dis_exit: + return err; +} + +/** + * iavf_qp_ena - Enables a queue pair + * @adapter: adapter of interest + * @q_idx: ring index in array + * + * Returns 0 on success, negative on failure. + */ +static int iavf_qp_ena(struct iavf_adapter *adapter, u16 q_idx) +{ + struct iavf_vsi *vsi = &adapter->vsi; + struct iavf_ring *rx_ring, *xdp_ring; + struct iavf_q_vector *q_vector; + u32 rx_queues, tx_queues; + int ret, err = 0; + + if (q_idx >= adapter->num_active_queues) + return -EINVAL; + + xdp_ring = &adapter->xdp_rings[q_idx]; + rx_ring = &adapter->rx_rings[q_idx]; + q_vector = rx_ring->q_vector; + + rx_queues = BIT(q_idx); + tx_queues = rx_queues; + tx_queues |= BIT(xdp_ring->queue_index); + + iavf_configure_rx_ring(adapter, rx_ring); + + /* Use 'tx_queues' mask as a queue pair mask to configure + * also an extra XDP Tx queue. + */ + err = iavf_configure_selected_queues(adapter, tx_queues, true); + if (err) + goto ena_exit; + + err = iavf_enable_selected_queues(adapter, rx_queues, tx_queues, true); + if (err) + goto ena_exit; + + ret = iavf_poll_for_link_status(adapter, IAVF_XDP_LINK_TIMEOUT_MS); + if (ret < 0) { + err = ret; + dev_err(&adapter->pdev->dev, + "cannot bring the link up, error: %d\n", err); + goto ena_exit; + } else if (!ret) { + err = -EBUSY; + dev_err(&adapter->pdev->dev, + "pf returned link down status, error: %d\n", err); + goto ena_exit; + } + + iavf_qvec_toggle_napi(adapter, q_vector, true); + iavf_qvec_ena_irq(adapter, q_vector); + + netif_tx_start_queue(netdev_get_tx_queue(vsi->netdev, q_idx)); +ena_exit: + return err; +} + +/** + * iavf_xsk_pool_disable - disable a buffer pool region + * @adapter: Current adapter + * @qid: queue ID + * + * Returns 0 on success, negative on failure + */ +static int iavf_xsk_pool_disable(struct iavf_adapter *adapter, u16 qid) +{ + struct xsk_buff_pool *pool = xsk_get_pool_from_qid(adapter->vsi.netdev, + qid); + if (!pool) + return -EINVAL; + + clear_bit(qid, adapter->af_xdp_zc_qps); + xsk_pool_dma_unmap(pool, LIBIE_RX_DMA_ATTR); + + return 0; +} + +/** + * iavf_xsk_pool_enable - enable a buffer pool region + * @adapter: Current adapter + * @pool: pointer to a requested buffer pool region + * @qid: queue ID + * + * Returns 0 on success, negative on failure + */ +static int +iavf_xsk_pool_enable(struct iavf_adapter *adapter, struct xsk_buff_pool *pool, + u16 qid) +{ + struct iavf_vsi *vsi = &adapter->vsi; + int err; + + if (qid >= vsi->netdev->real_num_rx_queues || + qid >= vsi->netdev->real_num_tx_queues) + return -EINVAL; + + err = xsk_pool_dma_map(pool, &adapter->pdev->dev, LIBIE_RX_DMA_ATTR); + if (err) + return err; + + set_bit(qid, adapter->af_xdp_zc_qps); + + return 0; +} + +/** + * iavf_xsk_pool_setup - enable/disable a buffer pool 
region depending + * on its state + * @adapter: Current adapter + * @pool: buffer pool to enable/associate to a ring, NULL to disable + * @qid: queue ID + * + * Returns 0 on success, negative on failure + */ +int iavf_xsk_pool_setup(struct iavf_adapter *adapter, + struct xsk_buff_pool *pool, u32 qid) +{ + bool if_running, pool_present = !!pool; + struct iavf_vsi *vsi = &adapter->vsi; + int ret = 0, pool_failure = 0; + + if (qid >= iavf_max_xdp_queues_count(adapter)) { + netdev_err(vsi->netdev, "Wrong queue index for XDP.\n"); + pool_failure = -EINVAL; + goto failure; + } + + if_running = netif_running(vsi->netdev) && + iavf_adapter_xdp_active(adapter); + + if (if_running) { + if (iavf_lock_timeout(&adapter->crit_lock, + IAVF_CRIT_LOCK_WAIT_TIMEOUT_MS)) + return -EBUSY; + + ret = iavf_process_pending_pf_msg(adapter, + IAVF_VC_MSG_TIMEOUT_MS); + if (ret) + goto xsk_pool_if_up; + + ret = iavf_qp_dis(adapter, qid); + if (ret) { + netdev_err(vsi->netdev, "iavf_qp_dis error = %d\n", ret); + goto xsk_pool_if_up; + } + } + + pool_failure = pool_present ? iavf_xsk_pool_enable(adapter, pool, qid) : + iavf_xsk_pool_disable(adapter, qid); + +xsk_pool_if_up: + if (if_running) { + ret = iavf_qp_ena(adapter, qid); + mutex_unlock(&adapter->crit_lock); + if (!ret && pool_present) + napi_schedule(&adapter->rx_rings[qid].q_vector->napi); + else if (ret) + netdev_err(vsi->netdev, "iavf_qp_ena error = %d\n", ret); + } + +failure: + if (pool_failure) { + netdev_err(vsi->netdev, "Could not %sable buffer pool, error = %d\n", + pool_present ? "en" : "dis", pool_failure); + return pool_failure; + } + + return ret; +} diff --git a/drivers/net/ethernet/intel/iavf/iavf_xsk.h b/drivers/net/ethernet/intel/iavf/iavf_xsk.h new file mode 100644 index 00000000000000..c09cde98e36bc1 --- /dev/null +++ b/drivers/net/ethernet/intel/iavf/iavf_xsk.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright(c) 2022 Intel Corporation. */ + +#ifndef _IAVF_XSK_H_ +#define _IAVF_XSK_H_ + +#include + +struct iavf_adapter; +struct xsk_buff_pool; + +int iavf_xsk_pool_setup(struct iavf_adapter *adapter, + struct xsk_buff_pool *pool, u32 qid); + +#endif /* !_IAVF_XSK_H_ */ From af588dde9714316dc35577869207667a0b19a2d9 Mon Sep 17 00:00:00 2001 From: Michal Kubiak Date: Wed, 30 Nov 2022 12:45:53 +0100 Subject: [PATCH 32/40] iavf: Implement Tx path for AF_XDP Implement Tx handling for AF_XDP feature in zero-copy mode. Add '.ndo_xdp_xmit()' and '.ndo_xsk_wakeup()' implementations to support AF_XDP Tx path. Also, add Tx interrupt handling function for zero-copy mode. 
Signed-off-by: Michal Kubiak --- drivers/net/ethernet/intel/iavf/iavf_main.c | 1 + drivers/net/ethernet/intel/iavf/iavf_txrx.c | 51 ++-- drivers/net/ethernet/intel/iavf/iavf_txrx.h | 17 +- drivers/net/ethernet/intel/iavf/iavf_xsk.c | 322 ++++++++++++++++++++ drivers/net/ethernet/intel/iavf/iavf_xsk.h | 18 ++ 5 files changed, 388 insertions(+), 21 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index d7f7fefe3afa92..5058af8a1f7c48 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -5292,6 +5292,7 @@ static const struct net_device_ops iavf_netdev_ops = { .ndo_setup_tc = iavf_setup_tc, .ndo_bpf = iavf_xdp, .ndo_xdp_xmit = iavf_xdp_xmit, + .ndo_xsk_wakeup = iavf_xsk_wakeup }; /** diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c index 85ee238d2fe50f..c27c4bb8dc12c0 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c @@ -16,16 +16,6 @@ DEFINE_STATIC_KEY_FALSE(iavf_xdp_locking_key); static bool iavf_xdp_xmit_back(const struct xdp_buff *buff, struct iavf_ring *xdp_ring); -static inline __le64 build_ctob(u32 td_cmd, u32 td_offset, unsigned int size, - u32 td_tag) -{ - return cpu_to_le64(IAVF_TX_DESC_DTYPE_DATA | - ((u64)td_cmd << IAVF_TXD_QW1_CMD_SHIFT) | - ((u64)td_offset << IAVF_TXD_QW1_OFFSET_SHIFT) | - ((u64)size << IAVF_TXD_QW1_TX_BUF_SZ_SHIFT) | - ((u64)td_tag << IAVF_TXD_QW1_L2TAG1_SHIFT)); -} - #define IAVF_TXD_CMD (IAVF_TX_DESC_CMD_EOP | IAVF_TX_DESC_CMD_RS) /** @@ -108,8 +98,12 @@ void iavf_clean_tx_ring(struct iavf_ring *tx_ring) if (!tx_ring->tx_bi) return; - /* Free all the Tx ring sk_buffs */ - iavf_release_tx_resources(tx_ring); + if (tx_ring->flags & IAVF_TXRX_FLAGS_XSK) { + iavf_xsk_clean_xdp_ring(tx_ring); + } else { + /* Free all the Tx ring sk_buffs */ + iavf_release_tx_resources(tx_ring); + } bi_size = sizeof(struct iavf_tx_buffer) * tx_ring->count; memset(tx_ring->tx_bi, 0, bi_size); @@ -730,6 +724,8 @@ int iavf_setup_tx_descriptors(struct iavf_ring *tx_ring) tx_desc->cmd_type_offset_bsz = 0; } + iavf_xsk_setup_xdp_ring(tx_ring); + return 0; err: @@ -1509,7 +1505,16 @@ int iavf_napi_poll(struct napi_struct *napi, int budget) * budget and be more aggressive about cleaning up the Tx descriptors. 
*/ iavf_for_each_ring(ring, q_vector->tx) { - if (!iavf_clean_tx_irq(vsi, ring, budget)) { + bool wd; + + if (ring->flags & IAVF_TXRX_FLAGS_XSK) + wd = iavf_xmit_zc(ring); + else if (ring->flags & IAVF_TXRX_FLAGS_XDP) + wd = true; + else + wd = iavf_clean_tx_irq(vsi, ring, budget); + + if (!wd) { clean_complete = false; continue; } @@ -2088,8 +2093,8 @@ static inline void iavf_tx_map(struct iavf_ring *tx_ring, struct sk_buff *skb, while (unlikely(size > IAVF_MAX_DATA_PER_TXD)) { tx_desc->cmd_type_offset_bsz = - build_ctob(td_cmd, td_offset, - max_data, td_tag); + iavf_build_ctob(td_cmd, td_offset, + max_data, td_tag); tx_desc++; i++; @@ -2109,8 +2114,9 @@ static inline void iavf_tx_map(struct iavf_ring *tx_ring, struct sk_buff *skb, if (likely(!data_len)) break; - tx_desc->cmd_type_offset_bsz = build_ctob(td_cmd, td_offset, - size, td_tag); + tx_desc->cmd_type_offset_bsz = iavf_build_ctob(td_cmd, + td_offset, + size, td_tag); tx_desc++; i++; @@ -2142,7 +2148,7 @@ static inline void iavf_tx_map(struct iavf_ring *tx_ring, struct sk_buff *skb, /* write last descriptor with RS and EOP bits */ td_cmd |= IAVF_TXD_CMD; tx_desc->cmd_type_offset_bsz = - build_ctob(td_cmd, td_offset, size, td_tag); + iavf_build_ctob(td_cmd, td_offset, size, td_tag); skb_tx_timestamp(skb); @@ -2323,6 +2329,10 @@ static u32 iavf_clean_xdp_irq(struct iavf_ring *xdp_ring) u16 rs_idx; u32 i; + /* Last RS index is invalid in xsk frames */ + if (!xdp_ring->tx_bi[ntc].page) + return 0; + rs_idx = xdp_ring->tx_bi[ntc].rs_desc_idx; last_rs_desc = IAVF_TX_DESC(xdp_ring, rs_idx); if (last_rs_desc->cmd_type_offset_bsz & @@ -2413,9 +2423,10 @@ static int iavf_xmit_xdp_buff(const struct xdp_buff *xdp, tx_desc = IAVF_TX_DESC(xdp_ring, ntu); tx_desc->buffer_addr = cpu_to_le64(dma); - tx_desc->cmd_type_offset_bsz = build_ctob(IAVF_TX_DESC_CMD_EOP, 0, - size, 0); + tx_desc->cmd_type_offset_bsz = iavf_build_ctob(IAVF_TX_DESC_CMD_EOP, 0, + size, 0); + xdp_ring->xdp_tx_active++; ntu++; if (ntu == xdp_ring->count) diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.h b/drivers/net/ethernet/intel/iavf/iavf_txrx.h index 6d09fbb234cae3..868a840c2a0287 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.h +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.h @@ -236,6 +236,7 @@ struct iavf_ring { struct iavf_ring *next; /* pointer to next ring in q_vector */ void *desc; /* Descriptor ring memory */ union { + struct xsk_buff_pool *xsk_pool; /* Used on XSk queue pairs */ struct page_pool *pool; /* Used for Rx page management */ struct device *dev; /* Used for DMA mapping on Tx */ }; @@ -269,8 +270,12 @@ struct iavf_ring { #define IAVF_TXRX_FLAGS_VLAN_TAG_LOC_L2TAG1 BIT(3) #define IAVF_TXR_FLAGS_VLAN_TAG_LOC_L2TAG2 BIT(4) #define IAVF_RXR_FLAGS_VLAN_TAG_LOC_L2TAG2_2 BIT(5) +#define IAVF_TXRX_FLAGS_XSK BIT(6) - struct bpf_prog __rcu *xdp_prog; + union { + struct bpf_prog __rcu *xdp_prog; + u32 xdp_tx_active; /* TODO: comment */ + }; struct iavf_ring *xdp_ring; union { struct sk_buff *skb; /* When iavf_clean_rx_ring_irq() must @@ -348,6 +353,16 @@ DECLARE_STATIC_KEY_FALSE(iavf_xdp_locking_key); int iavf_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, u32 flags); +static inline __le64 iavf_build_ctob(u32 td_cmd, u32 td_offset, + unsigned int size, u32 td_tag) +{ + return cpu_to_le64(IAVF_TX_DESC_DTYPE_DATA | + ((u64)td_cmd << IAVF_TXD_QW1_CMD_SHIFT) | + ((u64)td_offset << IAVF_TXD_QW1_OFFSET_SHIFT) | + ((u64)size << IAVF_TXD_QW1_TX_BUF_SZ_SHIFT) | + ((u64)td_tag << IAVF_TXD_QW1_L2TAG1_SHIFT)); +} + /** * 
iavf_xmit_descriptor_count - calculate number of Tx descriptors needed * @skb: send buffer diff --git a/drivers/net/ethernet/intel/iavf/iavf_xsk.c b/drivers/net/ethernet/intel/iavf/iavf_xsk.c index 6d0c40ef002d40..876aab2e76e111 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_xsk.c +++ b/drivers/net/ethernet/intel/iavf/iavf_xsk.c @@ -61,6 +61,26 @@ iavf_qvec_toggle_napi(struct iavf_adapter *adapter, napi_disable(&q_vector->napi); } +/** + * iavf_trigger_sw_intr - trigger a software interrupt + * @adapter: adapter of interest + * @q_vector: interrupt vector to trigger the software interrupt for + */ +static void +iavf_trigger_sw_intr(struct iavf_adapter *adapter, + struct iavf_q_vector *q_vector) +{ + struct iavf_hw *hw = &adapter->hw; + + wr32(hw, IAVF_VFINT_DYN_CTLN1(q_vector->reg_idx), + (IAVF_VFINT_DYN_CTLN1_INTENA_MASK | + IAVF_VFINT_DYN_CTLN1_ITR_INDX_MASK | + IAVF_VFINT_DYN_CTLN1_SWINT_TRIG_MASK | + IAVF_VFINT_DYN_CTLN1_SW_ITR_INDX_ENA_MASK)); + + iavf_flush(hw); +} + /** * iavf_qvec_dis_irq - Mask off queue interrupt generation on given ring * @adapter: the adapter that contains queue vector being un-configured @@ -167,6 +187,8 @@ static int iavf_qp_ena(struct iavf_adapter *adapter, u16 q_idx) tx_queues = rx_queues; tx_queues |= BIT(xdp_ring->queue_index); + iavf_xsk_setup_xdp_ring(xdp_ring); + iavf_configure_rx_ring(adapter, rx_ring); /* Use 'tx_queues' mask as a queue pair mask to configure @@ -313,3 +335,303 @@ int iavf_xsk_pool_setup(struct iavf_adapter *adapter, return ret; } + +/** + * iavf_clean_xdp_tx_buf - Free and unmap XDP Tx buffer + * @xdp_ring: XDP Tx ring + * @tx_buf: Tx buffer to clean + */ +static void +iavf_clean_xdp_tx_buf(struct iavf_ring *xdp_ring, struct iavf_tx_buffer *tx_buf) +{ + switch (tx_buf->xdp_type) { + case IAVF_XDP_BUFFER_FRAME: + dma_unmap_single(xdp_ring->dev, dma_unmap_addr(tx_buf, dma), + dma_unmap_len(tx_buf, len), DMA_TO_DEVICE); + dma_unmap_len_set(tx_buf, len, 0); + xdp_return_frame(tx_buf->xdpf); + tx_buf->xdpf = NULL; + break; + } + + xdp_ring->xdp_tx_active--; + tx_buf->xdp_type = IAVF_XDP_BUFFER_NONE; +} + +/** + * iavf_clean_xdp_irq_zc - produce AF_XDP descriptors to CQ + * @xdp_ring: XDP Tx ring + */ +static void iavf_clean_xdp_irq_zc(struct iavf_ring *xdp_ring) +{ + u16 ntc = xdp_ring->next_to_clean; + struct iavf_tx_buffer *tx_buf; + struct iavf_tx_desc *tx_desc; + u16 cnt = xdp_ring->count; + u16 done_frames = 0; + u16 xsk_frames = 0; + u16 last_rs; + int i; + + last_rs = xdp_ring->next_to_use ? 
xdp_ring->next_to_use - 1 : cnt - 1; + tx_desc = IAVF_TX_DESC(xdp_ring, last_rs); + if ((tx_desc->cmd_type_offset_bsz & + cpu_to_le64(IAVF_TX_DESC_DTYPE_DESC_DONE))) { + if (last_rs >= ntc) + done_frames = last_rs - ntc + 1; + else + done_frames = last_rs + cnt - ntc + 1; + } + + if (!done_frames) + return; + + if (likely(!xdp_ring->xdp_tx_active)) { + xsk_frames = done_frames; + goto skip; + } + + ntc = xdp_ring->next_to_clean; + for (i = 0; i < done_frames; i++) { + tx_buf = &xdp_ring->tx_bi[ntc]; + + if (tx_buf->xdp_type) + iavf_clean_xdp_tx_buf(xdp_ring, tx_buf); + else + xsk_frames++; + + ntc++; + if (ntc >= xdp_ring->count) + ntc = 0; + } +skip: + tx_desc->cmd_type_offset_bsz = 0; + xdp_ring->next_to_clean += done_frames; + if (xdp_ring->next_to_clean >= cnt) + xdp_ring->next_to_clean -= cnt; + if (xsk_frames) + xsk_tx_completed(xdp_ring->xsk_pool, xsk_frames); +} + +/** + * iavf_xmit_pkt - produce a single HW Tx descriptor out of AF_XDP descriptor + * @xdp_ring: XDP ring to produce the HW Tx descriptor on + * @desc: AF_XDP descriptor to pull the DMA address and length from + * @total_bytes: bytes accumulator that will be used for stats update + */ +static void iavf_xmit_pkt(struct iavf_ring *xdp_ring, struct xdp_desc *desc, + unsigned int *total_bytes) +{ + struct iavf_tx_desc *tx_desc; + dma_addr_t dma; + + dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, desc->addr); + xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_pool, dma, desc->len); + + tx_desc = IAVF_TX_DESC(xdp_ring, xdp_ring->next_to_use++); + tx_desc->buffer_addr = cpu_to_le64(dma); + tx_desc->cmd_type_offset_bsz = iavf_build_ctob(IAVF_TX_DESC_CMD_EOP, + 0, desc->len, 0); + + *total_bytes += desc->len; +} + +/** + * iavf_xmit_pkt_batch - produce a batch of HW Tx descriptors out + * of AF_XDP descriptors + * @xdp_ring: XDP ring to produce the HW Tx descriptors on + * @descs: AF_XDP descriptors to pull the DMA addresses and lengths from + * @total_bytes: bytes accumulator that will be used for stats update + */ +static void iavf_xmit_pkt_batch(struct iavf_ring *xdp_ring, + struct xdp_desc *descs, + unsigned int *total_bytes) +{ + u16 ntu = xdp_ring->next_to_use; + struct iavf_tx_desc *tx_desc; + u32 i; + + loop_unrolled_for(i = 0; i < PKTS_PER_BATCH; i++) { + dma_addr_t dma; + + dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, descs[i].addr); + xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_pool, dma, + descs[i].len); + + tx_desc = IAVF_TX_DESC(xdp_ring, ntu++); + tx_desc->buffer_addr = cpu_to_le64(dma); + tx_desc->cmd_type_offset_bsz = + iavf_build_ctob(IAVF_TX_DESC_CMD_EOP, 0, + descs[i].len, 0); + + *total_bytes += descs[i].len; + } + + xdp_ring->next_to_use = ntu; +} + +/** + * iavf_fill_tx_hw_ring - produce the number of Tx descriptors onto ring + * @xdp_ring: XDP ring to produce the HW Tx descriptors on + * @descs: AF_XDP descriptors to pull the DMA addresses and lengths from + * @nb_pkts: count of packets to be send + * @total_bytes: bytes accumulator that will be used for stats update + */ +static void iavf_fill_tx_hw_ring(struct iavf_ring *xdp_ring, + struct xdp_desc *descs, u32 nb_pkts, + unsigned int *total_bytes) +{ + u32 batched, leftover, i; + + batched = ALIGN_DOWN(nb_pkts, PKTS_PER_BATCH); + leftover = nb_pkts & (PKTS_PER_BATCH - 1); + + for (i = 0; i < batched; i += PKTS_PER_BATCH) + iavf_xmit_pkt_batch(xdp_ring, &descs[i], total_bytes); + for (; i < batched + leftover; i++) + iavf_xmit_pkt(xdp_ring, &descs[i], total_bytes); +} + +/** + * iavf_xmit_zc - take entries from XSK Tx ring and place them onto HW Tx 
ring + * @xdp_ring: XDP ring to produce the HW Tx descriptors on + * + * Returns true if there is no more work that needs to be done, false otherwise + */ +bool iavf_xmit_zc(struct iavf_ring *xdp_ring) +{ + struct xdp_desc *descs = xdp_ring->xsk_pool->tx_descs; + struct libie_sq_onstack_stats stats = { }; + u32 nb_processed = 0; + int budget; + + iavf_clean_xdp_irq_zc(xdp_ring); + + budget = IAVF_DESC_UNUSED(xdp_ring); + budget = min_t(u16, budget, IAVF_RING_QUARTER(xdp_ring)); + + stats.packets = xsk_tx_peek_release_desc_batch(xdp_ring->xsk_pool, + budget); + if (!stats.packets) + return true; + + if (xdp_ring->next_to_use + stats.packets >= xdp_ring->count) { + nb_processed = xdp_ring->count - xdp_ring->next_to_use; + iavf_fill_tx_hw_ring(xdp_ring, descs, nb_processed, + &stats.bytes); + xdp_ring->next_to_use = 0; + } + + iavf_fill_tx_hw_ring(xdp_ring, &descs[nb_processed], + stats.packets - nb_processed, &stats.bytes); + + iavf_set_rs_bit(xdp_ring); + iavf_xdp_ring_update_tail(xdp_ring); + iavf_update_tx_ring_stats(xdp_ring, &stats); + + if (xsk_uses_need_wakeup(xdp_ring->xsk_pool)) + xsk_set_tx_need_wakeup(xdp_ring->xsk_pool); + + return stats.packets < budget; +} + +/** + * iavf_xsk_wakeup - Implements ndo_xsk_wakeup + * @netdev: net_device + * @queue_id: queue to wake up + * @flags: ignored in our case, since we have Rx and Tx in the same NAPI + * + * Returns negative on error, zero otherwise. + */ +int iavf_xsk_wakeup(struct net_device *netdev, u32 queue_id, u32 flags) +{ + struct iavf_adapter *adapter = netdev_priv(netdev); + struct iavf_q_vector *q_vector; + struct iavf_ring *ring; + + if (adapter->state == __IAVF_DOWN || + adapter->state == __IAVF_RESETTING) + return -ENETDOWN; + + if (!iavf_adapter_xdp_active(adapter)) + return -EINVAL; + + if (queue_id >= adapter->num_active_queues) + return -EINVAL; + + ring = &adapter->rx_rings[queue_id]; + + if (!(ring->xdp_ring->flags & IAVF_TXRX_FLAGS_XSK)) + return -EINVAL; + + q_vector = ring->q_vector; + if (!napi_if_scheduled_mark_missed(&q_vector->napi)) + iavf_trigger_sw_intr(adapter, q_vector); + + return 0; +} + +static u32 iavf_get_xdp_tx_qid(struct iavf_ring *ring) +{ + struct iavf_adapter *adapter = ring->vsi->back; + + return ring->queue_index - adapter->num_active_queues; +} + +static struct xsk_buff_pool *iavf_tx_xsk_pool(struct iavf_ring *ring) +{ + struct iavf_adapter *adapter = ring->vsi->back; + u32 qid; + + if (!iavf_adapter_xdp_active(adapter) || + !(ring->flags & IAVF_TXRX_FLAGS_XDP)) + return NULL; + + qid = iavf_get_xdp_tx_qid(ring); + if (!test_bit(qid, adapter->af_xdp_zc_qps)) + return NULL; + + return xsk_get_pool_from_qid(adapter->netdev, qid); +} + +void iavf_xsk_setup_xdp_ring(struct iavf_ring *xdp_ring) +{ + struct xsk_buff_pool *pool; + + pool = iavf_tx_xsk_pool(xdp_ring); + if (pool) { + xdp_ring->xsk_pool = pool; + xdp_ring->flags |= IAVF_TXRX_FLAGS_XSK; + } else { + xdp_ring->dev = &xdp_ring->vsi->back->pdev->dev; + xdp_ring->flags &= ~IAVF_TXRX_FLAGS_XSK; + } +} + +/** + * iavf_xsk_clean_xdp_ring - Clean the XDP Tx ring and its buffer pool queues + * @xdp_ring: XDP_Tx ring + */ +void iavf_xsk_clean_xdp_ring(struct iavf_ring *xdp_ring) +{ + u16 ntc = xdp_ring->next_to_clean, ntu = xdp_ring->next_to_use; + u32 xsk_frames = 0; + + while (ntc != ntu) { + struct iavf_tx_buffer *tx_buf = &xdp_ring->tx_bi[ntc]; + + if (tx_buf->xdp_type) + iavf_clean_xdp_tx_buf(xdp_ring, tx_buf); + else + xsk_frames++; + + tx_buf->page = NULL; + + ntc++; + if (ntc >= xdp_ring->count) + ntc = 0; + } + + if (xsk_frames) + 
xsk_tx_completed(xdp_ring->xsk_pool, xsk_frames); +} diff --git a/drivers/net/ethernet/intel/iavf/iavf_xsk.h b/drivers/net/ethernet/intel/iavf/iavf_xsk.h index c09cde98e36bc1..2c3c103ddd7781 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_xsk.h +++ b/drivers/net/ethernet/intel/iavf/iavf_xsk.h @@ -6,10 +6,28 @@ #include +#define PKTS_PER_BATCH 8 + +#ifdef __clang__ +#define loop_unrolled_for _Pragma("clang loop unroll_count(8)") for +#elif __GNUC__ >= 8 +#define loop_unrolled_for _Pragma("GCC unroll 8") for +#else +#define loop_unrolled_for for +#endif + struct iavf_adapter; +struct iavf_ring; +struct net_device; struct xsk_buff_pool; int iavf_xsk_pool_setup(struct iavf_adapter *adapter, struct xsk_buff_pool *pool, u32 qid); +int iavf_xsk_wakeup(struct net_device *netdev, u32 queue_id, u32 flags); +bool iavf_xmit_zc(struct iavf_ring *xdp_ring); +void iavf_xsk_clean_xdp_ring(struct iavf_ring *xdp_ring); + +void iavf_xsk_setup_xdp_ring(struct iavf_ring *xdp_ring); + #endif /* !_IAVF_XSK_H_ */ From af38da2cc69ed5f5bf0930242e158f6b79b88995 Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Wed, 30 Nov 2022 13:18:30 +0100 Subject: [PATCH 33/40] iavf: Implement AF_XDP RX processing Implement RX packet processing specific to AF_XDP ZC. All actions except XDP_PASS are supported, the skb path will be implemented in later patches. Signed-off-by: Larysa Zaremba --- drivers/net/ethernet/intel/iavf/iavf.h | 13 + drivers/net/ethernet/intel/iavf/iavf_main.c | 32 +- drivers/net/ethernet/intel/iavf/iavf_trace.h | 8 + drivers/net/ethernet/intel/iavf/iavf_txrx.c | 82 ++-- drivers/net/ethernet/intel/iavf/iavf_txrx.h | 53 ++- .../net/ethernet/intel/iavf/iavf_virtchnl.c | 14 +- drivers/net/ethernet/intel/iavf/iavf_xsk.c | 428 ++++++++++++++++++ drivers/net/ethernet/intel/iavf/iavf_xsk.h | 6 + 8 files changed, 583 insertions(+), 53 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf.h b/drivers/net/ethernet/intel/iavf/iavf.h index 75e66a53064d1c..a91da041b43fa6 100644 --- a/drivers/net/ethernet/intel/iavf/iavf.h +++ b/drivers/net/ethernet/intel/iavf/iavf.h @@ -529,6 +529,19 @@ static inline bool iavf_adapter_xdp_active(struct iavf_adapter *adapter) return !!READ_ONCE(adapter->xdp_prog); } +static inline struct xsk_buff_pool *iavf_xsk_pool(struct iavf_ring *ring) +{ + struct iavf_adapter *adapter = ring->vsi->back; + struct iavf_vsi *vsi = ring->vsi; + u16 qid = ring->queue_index; + + if (!iavf_adapter_xdp_active(adapter) || + !test_bit(qid, adapter->af_xdp_zc_qps)) + return NULL; + + return xsk_get_pool_from_qid(vsi->netdev, qid); +} + int iavf_up(struct iavf_adapter *adapter); void iavf_down(struct iavf_adapter *adapter); int iavf_process_config(struct iavf_adapter *adapter); diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 5058af8a1f7c48..b4abed2e2c3df1 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -780,14 +780,29 @@ void iavf_configure_rx_ring(struct iavf_adapter *adapter, rx_ring->queue_index, rx_ring->q_vector->napi.napi_id); - err = xdp_rxq_info_reg_mem_model(&rx_ring->xdp_rxq, MEM_TYPE_PAGE_POOL, - rx_ring->pool); - if (err) - netdev_err(adapter->netdev, "Could not register XDP memory model for RX queue %u, error: %d\n", - queue_idx, err); + if (rx_ring->flags & IAVF_TXRX_FLAGS_XSK) { + err = xdp_rxq_info_reg_mem_model(&rx_ring->xdp_rxq, + MEM_TYPE_XSK_BUFF_POOL, + NULL); + if (err) + netdev_err(adapter->netdev, "xdp_rxq_info_reg_mem_model returned %d\n", + err); + + 
xsk_pool_set_rxq_info(rx_ring->xsk_pool, &rx_ring->xdp_rxq); + + iavf_check_alloc_rx_buffers_zc(adapter, rx_ring); + } else { + err = xdp_rxq_info_reg_mem_model(&rx_ring->xdp_rxq, + MEM_TYPE_PAGE_POOL, + rx_ring->pool); + if (err) + netdev_err(adapter->netdev, "Could not register XDP memory model for RX queue %u, error: %d\n", + queue_idx, err); + + iavf_alloc_rx_pages(rx_ring); + } RCU_INIT_POINTER(rx_ring->xdp_prog, adapter->xdp_prog); - iavf_alloc_rx_pages(rx_ring); } /** @@ -3657,10 +3672,13 @@ static int iavf_setup_all_tx_resources(struct iavf_adapter *adapter) **/ static int iavf_setup_all_rx_resources(struct iavf_adapter *adapter) { + struct iavf_ring *rx_ring; int i, err = 0; for (i = 0; i < adapter->num_active_queues; i++) { - adapter->rx_rings[i].count = adapter->rx_desc_count; + rx_ring = &adapter->rx_rings[i]; + rx_ring->count = adapter->rx_desc_count; + err = iavf_setup_rx_descriptors(&adapter->rx_rings[i]); if (!err) continue; diff --git a/drivers/net/ethernet/intel/iavf/iavf_trace.h b/drivers/net/ethernet/intel/iavf/iavf_trace.h index 82fda6f5abf043..ac46fbe55bd2e5 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_trace.h +++ b/drivers/net/ethernet/intel/iavf/iavf_trace.h @@ -145,6 +145,14 @@ DEFINE_EVENT( TP_ARGS(ring, desc, skb)); +DEFINE_EVENT( + iavf_rx_template, iavf_clean_rx_irq_zc, + TP_PROTO(struct iavf_ring *ring, + union iavf_32byte_rx_desc *desc, + struct sk_buff *skb), + + TP_ARGS(ring, desc, skb)); + DEFINE_EVENT( iavf_rx_template, iavf_clean_rx_irq_rx, TP_PROTO(struct iavf_ring *ring, diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c index c27c4bb8dc12c0..28d72bf3d9a5a5 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c @@ -134,6 +134,11 @@ void iavf_free_tx_resources(struct iavf_ring *tx_ring) kfree(tx_ring->tx_bi); tx_ring->tx_bi = NULL; + if (tx_ring->flags & IAVF_TXRX_FLAGS_XSK) { + tx_ring->dev = tx_ring->xsk_pool->dev; + tx_ring->flags &= ~IAVF_TXRX_FLAGS_XSK; + } + if (tx_ring->desc) { dma_free_coherent(tx_ring->dev, tx_ring->size, tx_ring->desc, tx_ring->dma); @@ -734,6 +739,22 @@ int iavf_setup_tx_descriptors(struct iavf_ring *tx_ring) return -ENOMEM; } +static void iavf_clean_rx_pages(struct iavf_ring *rx_ring) +{ + for (u32 i = 0; i < rx_ring->count; i++) { + struct page *page = rx_ring->rx_pages[i]; + + if (!page) + continue; + + /* Invalidate cache lines that may have been written to by + * device so that we avoid corrupting memory. + */ + page_pool_dma_sync_full_for_cpu(rx_ring->pool, page); + page_pool_put_full_page(rx_ring->pool, page, false); + } +} + /** * iavf_clean_rx_ring - Free Rx buffers * @rx_ring: ring to be cleaned @@ -749,19 +770,10 @@ void iavf_clean_rx_ring(struct iavf_ring *rx_ring) rx_ring->skb = NULL; } - /* Free all the Rx ring sk_buffs */ - for (u32 i = 0; i < rx_ring->count; i++) { - struct page *page = rx_ring->rx_pages[i]; - - if (!page) - continue; - - /* Invalidate cache lines that may have been written to by - * device so that we avoid corrupting memory. 
- */ - page_pool_dma_sync_full_for_cpu(rx_ring->pool, page); - page_pool_put_full_page(rx_ring->pool, page, false); - } + if (rx_ring->flags & IAVF_TXRX_FLAGS_XSK) + iavf_xsk_clean_rx_ring(rx_ring); + else + iavf_clean_rx_pages(rx_ring); rx_ring->next_to_clean = 0; rx_ring->next_to_use = 0; @@ -775,7 +787,7 @@ void iavf_clean_rx_ring(struct iavf_ring *rx_ring) **/ void iavf_free_rx_resources(struct iavf_ring *rx_ring) { - struct device *dev = rx_ring->pool->p.dev; + struct device *dev; iavf_clean_rx_ring(rx_ring); kfree(rx_ring->rx_pages); @@ -785,7 +797,14 @@ void iavf_free_rx_resources(struct iavf_ring *rx_ring) if (xdp_rxq_info_is_reg(&rx_ring->xdp_rxq)) xdp_rxq_info_unreg(&rx_ring->xdp_rxq); - libie_rx_page_pool_destroy(rx_ring->pool, &rx_ring->rq_stats); + if (rx_ring->flags & IAVF_TXRX_FLAGS_XSK) { + dev = rx_ring->xsk_pool->dev; + rx_ring->flags &= ~IAVF_TXRX_FLAGS_XSK; + } else { + dev = rx_ring->pool->p.dev; + libie_rx_page_pool_destroy(rx_ring->pool, &rx_ring->rq_stats); + } + rx_ring->dev = dev; if (rx_ring->desc) { @@ -820,6 +839,8 @@ int iavf_setup_rx_descriptors(struct iavf_ring *rx_ring) /* warn if we are about to overwrite the pointer */ WARN_ON(rx_ring->rx_pages); + + /* Both iavf_ring::rx_pages and ::xdp_buff are arrays of pointers */ rx_ring->rx_pages = kcalloc(rx_ring->count, sizeof(*rx_ring->rx_pages), GFP_KERNEL); if (!rx_ring->rx_pages) @@ -837,6 +858,10 @@ int iavf_setup_rx_descriptors(struct iavf_ring *rx_ring) goto err; } + iavf_xsk_setup_rx_ring(rx_ring); + if (rx_ring->flags & IAVF_TXRX_FLAGS_XSK) + goto finish; + pool = libie_rx_page_pool_create(rx_ring->netdev, rx_ring->count, iavf_is_xdp_enabled(rx_ring)); if (IS_ERR(pool)) { @@ -846,6 +871,7 @@ int iavf_setup_rx_descriptors(struct iavf_ring *rx_ring) rx_ring->pool = pool; +finish: rx_ring->next_to_clean = 0; rx_ring->next_to_use = 0; @@ -860,24 +886,6 @@ int iavf_setup_rx_descriptors(struct iavf_ring *rx_ring) return ret; } -/** - * iavf_release_rx_desc - Store the new tail and head values - * @rx_ring: ring to bump - * @val: new head index - **/ -static inline void iavf_release_rx_desc(struct iavf_ring *rx_ring, u32 val) -{ - rx_ring->next_to_use = val; - - /* Force memory writes to complete before letting h/w - * know there are new descriptors to fetch. (Only - * applicable for weak-ordered memory model archs, - * such as IA-64). - */ - wmb(); - writel(val, rx_ring->tail); -} - /** * iavf_receive_skb - Send a completed packet up the stack * @rx_ring: rx ring in play @@ -1372,9 +1380,7 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) } } - libie_rq_napi_stats_add(&rx_ring->rq_stats, &stats); - rx_ring->q_vector->rx.total_packets += stats.packets; - rx_ring->q_vector->rx.total_bytes += stats.bytes; + iavf_update_rx_ring_stats(rx_ring, &stats); return cleaned_count; } @@ -1534,7 +1540,9 @@ int iavf_napi_poll(struct napi_struct *napi, int budget) rcu_read_lock(); iavf_for_each_ring(ring, q_vector->rx) { - int cleaned = iavf_clean_rx_irq(ring, budget_per_ring); + int cleaned = !!(ring->flags & IAVF_TXRX_FLAGS_XSK) ? 
+ iavf_clean_rx_irq_zc(ring, budget_per_ring) : + iavf_clean_rx_irq(ring, budget_per_ring); work_done += cleaned; /* if we clean as many as budgeted, we must not be done */ diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.h b/drivers/net/ethernet/intel/iavf/iavf_txrx.h index 868a840c2a0287..15f9dcceab3b9c 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.h +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.h @@ -210,6 +210,7 @@ struct iavf_tx_buffer { struct sk_buff *skb; /* used for .ndo_start_xmit() */ struct page *page; /* used for XDP_TX */ struct xdp_frame *xdpf; /* used for .ndo_xdp_xmit() */ + struct xdp_buff *xdp; /* used for XDP_TX in ZC mode */ }; unsigned int bytecount; unsigned short gso_segs; @@ -243,6 +244,7 @@ struct iavf_ring { struct net_device *netdev; /* netdev ring maps to */ union { struct iavf_tx_buffer *tx_bi; + struct xdp_buff **xdp_buff; struct page **rx_pages; }; u8 __iomem *tail; @@ -370,7 +372,7 @@ static inline __le64 iavf_build_ctob(u32 td_cmd, u32 td_offset, * Returns number of data descriptors needed for this skb. Returns 0 to indicate * there is not enough descriptors available in this ring since we need at least * one descriptor. - **/ + */ static inline int iavf_xmit_descriptor_count(struct sk_buff *skb) { const skb_frag_t *frag = &skb_shinfo(skb)->frags[0]; @@ -395,7 +397,7 @@ static inline int iavf_xmit_descriptor_count(struct sk_buff *skb) * @size: the size buffer we want to assure is available * * Returns 0 if stop is not needed - **/ + */ static inline int iavf_maybe_stop_tx(struct iavf_ring *tx_ring, int size) { if (likely(IAVF_DESC_UNUSED(tx_ring) >= size)) @@ -411,7 +413,7 @@ static inline int iavf_maybe_stop_tx(struct iavf_ring *tx_ring, int size) * Note: Our HW can't scatter-gather more than 8 fragments to build * a packet on the wire and so we need to figure out the cases where we * need to linearize the skb. 
- **/ + */ static inline bool iavf_chk_linearize(struct sk_buff *skb, int count) { /* Both TSO and single send will work if count is less than 8 */ @@ -427,7 +429,7 @@ static inline bool iavf_chk_linearize(struct sk_buff *skb, int count) /** * txring_txq - helper to convert from a ring to a queue * @ring: Tx ring to find the netdev equivalent of - **/ + */ static inline struct netdev_queue *txring_txq(const struct iavf_ring *ring) { return netdev_get_tx_queue(ring->netdev, ring->queue_index); @@ -454,7 +456,7 @@ static inline void iavf_xdp_ring_update_tail(const struct iavf_ring *xdp_ring) * @tc: TODO * @total_pkts: Number of packets transmitted since the last update * @total_bytes: Number of bytes transmitted since the last update - **/ + */ static inline void __iavf_update_tx_ring_stats(struct iavf_ring *tx_ring, struct iavf_ring_container *tc, @@ -468,8 +470,47 @@ __iavf_update_tx_ring_stats(struct iavf_ring *tx_ring, #define iavf_update_tx_ring_stats(r, s) \ __iavf_update_tx_ring_stats(r, &(r)->q_vector->tx, s) +/** + * iavf_update_rx_ring_stats - Update RX ring stats + * @rx_ring: ring to bump + * @rc: TODO + * @rx_bytes: number of bytes processed since last update + * @rx_packets: number of packets processed since last update + */ +static inline void +__iavf_update_rx_ring_stats(struct iavf_ring *rx_ring, + struct iavf_ring_container *rc, + const struct libie_rq_onstack_stats *stats) +{ + libie_rq_napi_stats_add(&rx_ring->rq_stats, stats); + rc->total_packets += stats->packets; + rc->total_bytes += stats->bytes; +} + +#define iavf_update_rx_ring_stats(r, s) \ + __iavf_update_rx_ring_stats(r, &(r)->q_vector->rx, s) + +/** + * iavf_release_rx_desc - Store the new tail and head values + * @rx_ring: ring to bump + * @val: new head index + */ +static inline void iavf_release_rx_desc(struct iavf_ring *rx_ring, u32 val) +{ + rx_ring->next_to_use = val; + + /* Force memory writes to complete before letting h/w + * know there are new descriptors to fetch. (Only + * applicable for weak-ordered memory model archs, + * such as IA-64). + */ + wmb(); + writel(val, rx_ring->tail); +} + #define IAVF_RXQ_XDP_ACT_FINALIZE_TX BIT(0) #define IAVF_RXQ_XDP_ACT_FINALIZE_REDIR BIT(1) +#define IAVF_RXQ_XDP_ACT_STOP_NOW BIT(2) /** * iavf_set_rs_bit - set RS bit on last produced descriptor. 
@@ -495,7 +536,7 @@ static inline u16 iavf_set_rs_bit(struct iavf_ring *xdp_ring) * @xdp_ring: XDP TX queue assigned to a given RX ring * @rxq_xdp_act: Logical OR of flags of XDP actions that require finalization * @first_idx: index of the first frame in the transmitted batch on XDP queue - **/ + */ static inline void iavf_finalize_xdp_rx(struct iavf_ring *xdp_ring, u32 rxq_xdp_act, u32 first_idx) { diff --git a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c index 9a02a82e15343e..d23d5097db97ad 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c +++ b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c @@ -415,8 +415,8 @@ static void iavf_set_qp_config_info(struct virtchnl_queue_pair_info *vqpi, bool xdp_pair) { struct iavf_ring *rxq = &adapter->rx_rings[queue_index]; - const struct page_pool_params *pp = &rxq->pool->p; struct iavf_ring *txq; + u32 hr, max_len; int xdpq_idx; if (xdp_pair) { @@ -437,12 +437,20 @@ static void iavf_set_qp_config_info(struct virtchnl_queue_pair_info *vqpi, return; } - max_frame = min_not_zero(max_frame, LIBIE_MAX_RX_FRM_LEN(pp->offset)); + if (rxq->flags & IAVF_TXRX_FLAGS_XSK) { + hr = xsk_pool_get_headroom(rxq->xsk_pool); + max_len = xsk_pool_get_rx_frame_size(rxq->xsk_pool); + } else { + hr = rxq->pool->p.offset; + max_len = rxq->pool->p.max_len; + } + + max_frame = min_not_zero(max_frame, LIBIE_MAX_RX_FRM_LEN(hr)); vqpi->rxq.ring_len = rxq->count; vqpi->rxq.dma_ring_addr = rxq->dma; vqpi->rxq.max_pkt_size = max_frame; - vqpi->rxq.databuffer_size = pp->max_len; + vqpi->rxq.databuffer_size = max_len; } /** diff --git a/drivers/net/ethernet/intel/iavf/iavf_xsk.c b/drivers/net/ethernet/intel/iavf/iavf_xsk.c index 876aab2e76e111..edddad1abe2c7e 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_xsk.c +++ b/drivers/net/ethernet/intel/iavf/iavf_xsk.c @@ -1,10 +1,13 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright(c) 2022 Intel Corporation. 
*/ +#include +#include #include #include #include #include "iavf.h" +#include "iavf_trace.h" #include "iavf_xsk.h" #define IAVF_CRIT_LOCK_WAIT_TIMEOUT_MS 1000 @@ -157,6 +160,12 @@ static int iavf_qp_dis(struct iavf_adapter *adapter, u16 q_idx) goto dis_exit; iavf_qp_clean_rings(adapter, q_idx); + if (!(rx_ring->flags & IAVF_TXRX_FLAGS_XSK)) { + struct device *dev = rx_ring->pool->p.dev; + + libie_rx_page_pool_destroy(rx_ring->pool, &rx_ring->rq_stats); + rx_ring->dev = dev; + } dis_exit: return err; } @@ -188,6 +197,17 @@ static int iavf_qp_ena(struct iavf_adapter *adapter, u16 q_idx) tx_queues |= BIT(xdp_ring->queue_index); iavf_xsk_setup_xdp_ring(xdp_ring); + iavf_xsk_setup_rx_ring(rx_ring); + + if (!(rx_ring->flags & IAVF_TXRX_FLAGS_XSK)) { + rx_ring->pool = libie_rx_page_pool_create(rx_ring->netdev, + rx_ring->count, + true); + if (IS_ERR(rx_ring->pool)) { + err = PTR_ERR(rx_ring->pool); + goto ena_exit; + } + } iavf_configure_rx_ring(adapter, rx_ring); @@ -352,6 +372,9 @@ iavf_clean_xdp_tx_buf(struct iavf_ring *xdp_ring, struct iavf_tx_buffer *tx_buf) xdp_return_frame(tx_buf->xdpf); tx_buf->xdpf = NULL; break; + case IAVF_XDP_BUFFER_TX: + xsk_buff_free(tx_buf->xdp); + break; } xdp_ring->xdp_tx_active--; @@ -635,3 +658,408 @@ void iavf_xsk_clean_xdp_ring(struct iavf_ring *xdp_ring) if (xsk_frames) xsk_tx_completed(xdp_ring->xsk_pool, xsk_frames); } + +/** + * iavf_init_rx_descs_zc - pick buffers from XSK buffer pool and use it + * @pool: XSK Buffer pool to pull the buffers from + * @xdp: SW ring of xdp_buff that will hold the buffers + * @rx_desc: Pointer to Rx descriptors that will be filled + * @count: The number of buffers to allocate + * + * This function allocates a number of Rx buffers from the fill ring + * or the internal recycle mechanism and places them on the Rx ring. + * + * Note that ring wrap should be handled by caller of this function. + * + * Returns the amount of allocated Rx descriptors + */ +static u16 iavf_init_rx_descs_zc(struct xsk_buff_pool *pool, + struct xdp_buff **xdp, + union iavf_rx_desc *rx_desc, u16 count) +{ + dma_addr_t dma; + u16 num_buffs; + u16 i; + + num_buffs = xsk_buff_alloc_batch(pool, xdp, count); + for (i = 0; i < num_buffs; i++) { + dma = xsk_buff_xdp_get_dma(*xdp); + rx_desc->read.pkt_addr = cpu_to_le64(dma); + rx_desc->wb.qword1.status_error_len = 0; + + rx_desc++; + xdp++; + } + + return num_buffs; +} + +static struct xdp_buff **iavf_get_xdp_buff(struct iavf_ring *ring, u32 idx) +{ + return &ring->xdp_buff[idx]; +} + +/** + * __iavf_alloc_rx_buffers_zc - allocate a number of Rx buffers + * @rx_ring: Rx ring + * @count: The number of buffers to allocate + * + * Place the @count of descriptors onto Rx ring. Handle the ring wrap + * for case where space from next_to_use up to the end of ring is less + * than @count. Finally do a tail bump. + * + * Returns true if all allocations were successful, false if any fail. 
+ */ +static bool __iavf_alloc_rx_buffers_zc(struct iavf_ring *rx_ring, u16 count) +{ + u32 nb_buffs_extra = 0, nb_buffs = 0; + u16 ntu = rx_ring->next_to_use; + union iavf_rx_desc *rx_desc; + u16 total_count = count; + struct xdp_buff **xdp; + + rx_desc = IAVF_RX_DESC(rx_ring, ntu); + xdp = iavf_get_xdp_buff(rx_ring, ntu); + + if (ntu + count >= rx_ring->count) { + nb_buffs_extra = iavf_init_rx_descs_zc(rx_ring->xsk_pool, xdp, + rx_desc, + rx_ring->count - ntu); + if (nb_buffs_extra != rx_ring->count - ntu) { + ntu += nb_buffs_extra; + goto exit; + } + rx_desc = IAVF_RX_DESC(rx_ring, 0); + xdp = iavf_get_xdp_buff(rx_ring, 0); + ntu = 0; + count -= nb_buffs_extra; + iavf_release_rx_desc(rx_ring, 0); + + if (!count) + goto exit; + } + + nb_buffs = iavf_init_rx_descs_zc(rx_ring->xsk_pool, xdp, rx_desc, count); + + ntu += nb_buffs; + if (ntu == rx_ring->count) + ntu = 0; + +exit: + if (rx_ring->next_to_use != ntu) + iavf_release_rx_desc(rx_ring, ntu); + + return total_count == (nb_buffs_extra + nb_buffs); +} + +/** + * iavf_alloc_rx_buffers_zc - allocate a number of Rx buffers + * @rx_ring: Rx ring + * @count: The number of buffers to allocate + * + * Wrapper for internal allocation routine; figure out how many tail + * bumps should take place based on the given threshold + * + * Returns true if all calls to internal alloc routine succeeded + */ +static bool iavf_alloc_rx_buffers_zc(struct iavf_ring *rx_ring, u16 count) +{ + u16 rx_thresh = IAVF_RING_QUARTER(rx_ring); + u16 leftover, i, tail_bumps; + + tail_bumps = count / rx_thresh; + leftover = count - (tail_bumps * rx_thresh); + + for (i = 0; i < tail_bumps; i++) + if (!__iavf_alloc_rx_buffers_zc(rx_ring, rx_thresh)) + return false; + return __iavf_alloc_rx_buffers_zc(rx_ring, leftover); +} + +/** + * iavf_check_alloc_rx_buffers_zc - allocate a number of Rx buffers with logs + * @adapter: board private structure + * @rx_ring: Rx ring + * + * Wrapper for internal allocation routine; Prints out logs, if allocation + * did not go as expected + */ +void iavf_check_alloc_rx_buffers_zc(struct iavf_adapter *adapter, + struct iavf_ring *rx_ring) +{ + u32 count = IAVF_DESC_UNUSED(rx_ring); + + if (!xsk_buff_can_alloc(rx_ring->xsk_pool, count)) { + netdev_warn(adapter->netdev, + "XSK buffer pool does not provide enough addresses to fill %d buffers on Rx ring %d\n", + count, rx_ring->queue_index); + netdev_warn(adapter->netdev, + "Change Rx ring/fill queue size to avoid performance issues\n"); + } + + if (!iavf_alloc_rx_buffers_zc(rx_ring, count)) + netdev_warn(adapter->netdev, + "Failed to allocate some buffers on XSK buffer pool enabled Rx ring %d\n", + rx_ring->queue_index); +} + +/** + * iavf_rx_xsk_pool - Get a valid xsk pool for RX ring + * @ring: Rx ring being configured + * + * Do not return a xsk pool, if socket is TX-only + **/ +static struct xsk_buff_pool *iavf_rx_xsk_pool(struct iavf_ring *ring) +{ + struct iavf_adapter *adapter = ring->vsi->back; + u16 qid = ring->queue_index; + struct xsk_buff_pool *pool; + + if (!iavf_adapter_xdp_active(adapter) || + !test_bit(qid, adapter->af_xdp_zc_qps)) + return NULL; + + pool = xsk_get_pool_from_qid(adapter->netdev, qid); + if (!pool || !xsk_buff_can_alloc(pool, 1)) + return NULL; + + return pool; +} + +void iavf_xsk_setup_rx_ring(struct iavf_ring *rx_ring) +{ + struct xsk_buff_pool *pool; + + pool = iavf_rx_xsk_pool(rx_ring); + if (pool) { + rx_ring->xsk_pool = pool; + rx_ring->flags |= IAVF_TXRX_FLAGS_XSK; + } else { + rx_ring->dev = &rx_ring->vsi->back->pdev->dev; + rx_ring->flags &= 
~IAVF_TXRX_FLAGS_XSK; + } +} + +/** + * iavf_xsk_clean_rx_ring - clean buffer pool queues connected to a given Rx ring + * @rx_ring: ring to be cleaned + */ +void iavf_xsk_clean_rx_ring(struct iavf_ring *rx_ring) +{ + u16 ntc = rx_ring->next_to_clean; + u16 ntu = rx_ring->next_to_use; + + while (ntc != ntu) { + struct xdp_buff *xdp = *iavf_get_xdp_buff(rx_ring, ntc); + + xsk_buff_free(xdp); + ntc++; + if (ntc >= rx_ring->count) + ntc = 0; + } +} + +/** + * iavf_xmit_xdp_tx_zc - AF_XDP ZC handler for XDP_TX + * @xdp: XDP buffer to xmit + * @xdp_ring: XDP ring to produce descriptor onto + * + * Returns 0 for successfully produced desc, + * -EBUSY if there was not enough space on XDP ring. + */ +static int iavf_xmit_xdp_tx_zc(struct xdp_buff *xdp, + struct iavf_ring *xdp_ring) +{ + u32 size = xdp->data_end - xdp->data; + u32 ntu = xdp_ring->next_to_use; + struct iavf_tx_buffer *tx_buf; + struct iavf_tx_desc *tx_desc; + dma_addr_t dma; + + if (IAVF_DESC_UNUSED(xdp_ring) < IAVF_RING_QUARTER(xdp_ring)) + iavf_clean_xdp_irq_zc(xdp_ring); + + if (unlikely(!IAVF_DESC_UNUSED(xdp_ring))) { + libie_stats_inc_one(&xdp_ring->sq_stats, busy); + return -EBUSY; + } + + dma = xsk_buff_xdp_get_dma(xdp); + xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_pool, dma, size); + + tx_buf = &xdp_ring->tx_bi[ntu]; + tx_buf->bytecount = size; + tx_buf->gso_segs = 1; + tx_buf->xdp_type = IAVF_XDP_BUFFER_TX; + tx_buf->xdp = xdp; + + tx_desc = IAVF_TX_DESC(xdp_ring, ntu); + tx_desc->buffer_addr = cpu_to_le64(dma); + tx_desc->cmd_type_offset_bsz = iavf_build_ctob(IAVF_TX_DESC_CMD_EOP, + 0, size, 0); + + xdp_ring->xdp_tx_active++; + + if (++ntu == xdp_ring->count) + ntu = 0; + xdp_ring->next_to_use = ntu; + + return 0; +} + +/** + * iavf_run_xdp_zc - Run XDP program and perform resulting action for ZC + * @rx_ring: RX descriptor ring to transact packets on + * @xdp: a prepared XDP buffer + * @xdp_prog: an XDP program assigned to the interface + * @xdp_ring: XDP TX queue assigned to the RX ring + * @rxq_xdp_act: Logical OR of flags of XDP actions that require finalization + * + * Returns resulting XDP action. + */ +static unsigned int +iavf_run_xdp_zc(struct iavf_ring *rx_ring, struct xdp_buff *xdp, + struct bpf_prog *xdp_prog, struct iavf_ring *xdp_ring, + u32 *rxq_xdp_act) +{ + unsigned int xdp_act; + int err; + + xdp_act = bpf_prog_run_xdp(xdp_prog, xdp); + + if (likely(xdp_act == XDP_REDIRECT)) { + err = xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog); + if (likely(!err)) { + *rxq_xdp_act |= IAVF_RXQ_XDP_ACT_FINALIZE_REDIR; + return XDP_REDIRECT; + } + + if (xsk_uses_need_wakeup(rx_ring->xsk_pool) && err == -ENOBUFS) + *rxq_xdp_act |= IAVF_RXQ_XDP_ACT_STOP_NOW; + + goto xdp_err; + } + + switch (xdp_act) { + case XDP_TX: + err = iavf_xmit_xdp_tx_zc(xdp, xdp_ring); + if (unlikely(err)) + goto xdp_err; + + *rxq_xdp_act |= IAVF_RXQ_XDP_ACT_FINALIZE_TX; + break; + default: + bpf_warn_invalid_xdp_action(rx_ring->netdev, xdp_prog, xdp_act); + + fallthrough; + case XDP_ABORTED: +xdp_err: + trace_xdp_exception(rx_ring->netdev, xdp_prog, xdp_act); + + fallthrough; + case XDP_DROP: + xsk_buff_free(xdp); + + return XDP_DROP; + } + + return xdp_act; +} + +/** + * iavf_clean_rx_irq_zc - consumes packets from the hardware ring + * @rx_ring: AF_XDP Rx ring + * @budget: NAPI budget + * + * Returns number of processed packets on success, remaining budget on failure. 
+ */ +int iavf_clean_rx_irq_zc(struct iavf_ring *rx_ring, int budget) +{ + struct libie_rq_onstack_stats stats = { }; + u32 ntc = rx_ring->next_to_clean; + u32 ring_size = rx_ring->count; + struct iavf_ring *xdp_ring; + struct bpf_prog *xdp_prog; + u32 cleaned_count = 0; + bool failure = false; + u32 rxq_xdp_act = 0; + u32 to_refill; + + xdp_prog = rcu_dereference(rx_ring->xdp_prog); + xdp_ring = rx_ring->xdp_ring; + + while (likely(cleaned_count < budget)) { + union iavf_rx_desc *rx_desc; + struct xdp_buff *xdp; + unsigned int size; + u64 qword; + + rx_desc = IAVF_RX_DESC(rx_ring, ntc); + + /* status_error_len will always be zero for unused descriptors + * because it's cleared in cleanup, and overlaps with hdr_addr + * which is always zero because packet split isn't used, if the + * hardware wrote DD then the length will be non-zero + */ + qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len); + if (!iavf_test_staterr(qword, IAVF_RX_DESC_STATUS_DD_SHIFT)) + break; + + /* This memory barrier is needed to keep us from reading + * any other fields out of the rx_desc until we have + * verified the descriptor has been written back. + */ + dma_rmb(); + + size = (qword & IAVF_RXD_QW1_LENGTH_PBUF_MASK) >> + IAVF_RXD_QW1_LENGTH_PBUF_SHIFT; + + xdp = *iavf_get_xdp_buff(rx_ring, ntc); + iavf_trace(clean_rx_irq_zc, rx_ring, rx_desc, NULL); + + if (unlikely(!size)) { + xsk_buff_free(xdp); + goto next; + } + + xsk_buff_set_size(xdp, size); + xsk_buff_dma_sync_for_cpu(xdp, rx_ring->xsk_pool); + + iavf_run_xdp_zc(rx_ring, xdp, xdp_prog, xdp_ring, + &rxq_xdp_act); + + if (unlikely(rxq_xdp_act & IAVF_RXQ_XDP_ACT_STOP_NOW)) { + failure = true; + break; + } + + stats.bytes += size; + stats.packets++; + +next: + cleaned_count++; + if (unlikely(++ntc == ring_size)) + ntc = 0; + } + + rx_ring->next_to_clean = ntc; + + iavf_finalize_xdp_rx(xdp_ring, rxq_xdp_act, 0); + + to_refill = IAVF_DESC_UNUSED(rx_ring); + if (to_refill > IAVF_RING_QUARTER(rx_ring)) + failure |= !iavf_alloc_rx_buffers_zc(rx_ring, to_refill); + + iavf_update_rx_ring_stats(rx_ring, &stats); + + if (xsk_uses_need_wakeup(rx_ring->xsk_pool)) { + if (failure || rx_ring->next_to_clean == rx_ring->next_to_use) + xsk_set_rx_need_wakeup(rx_ring->xsk_pool); + else + xsk_clear_rx_need_wakeup(rx_ring->xsk_pool); + + return cleaned_count; + } + + return unlikely(failure) ? budget : cleaned_count; +} diff --git a/drivers/net/ethernet/intel/iavf/iavf_xsk.h b/drivers/net/ethernet/intel/iavf/iavf_xsk.h index 2c3c103ddd7781..65aae299db4c0a 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_xsk.h +++ b/drivers/net/ethernet/intel/iavf/iavf_xsk.h @@ -28,6 +28,12 @@ int iavf_xsk_wakeup(struct net_device *netdev, u32 queue_id, u32 flags); bool iavf_xmit_zc(struct iavf_ring *xdp_ring); void iavf_xsk_clean_xdp_ring(struct iavf_ring *xdp_ring); +void iavf_xsk_clean_rx_ring(struct iavf_ring *rx_ring); +int iavf_clean_rx_irq_zc(struct iavf_ring *rx_ring, int budget); +void iavf_check_alloc_rx_buffers_zc(struct iavf_adapter *adapter, + struct iavf_ring *rx_ring); + void iavf_xsk_setup_xdp_ring(struct iavf_ring *xdp_ring); +void iavf_xsk_setup_rx_ring(struct iavf_ring *rx_ring); #endif /* !_IAVF_XSK_H_ */ From f0576804ed3d88c3fe620f6974ac27fa089eb335 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Thu, 23 Feb 2023 19:02:22 +0100 Subject: [PATCH 34/40] iavf: consolidate skb fields processing For now, filling the skb fields on Rx is a bit scattered across RQ polling function. 
This makes it harder to reuse the code on the XSk Rx path and also sometimes costs some CPU (e.g. looking up the decoded packet type twice). Make it consistent and do everything in iavf_process_skb_fields(). First of all, get the packet type and decode it. Then move on to the hash, csum and VLAN handling, the last of which is also moved here. iavf_receive_skb() then becomes the classic eth_type_trans() + napi_gro_receive() pair. Finally, make the fields processing function global and the skb receive function static inline in order to call them from a different file later on. Signed-off-by: Alexander Lobakin --- drivers/net/ethernet/intel/iavf/iavf_txrx.c | 108 +++++++++----------- drivers/net/ethernet/intel/iavf/iavf_txrx.h | 3 + 2 files changed, 54 insertions(+), 57 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c index 28d72bf3d9a5a5..cca9907bee5a1a 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c @@ -886,27 +886,6 @@ int iavf_setup_rx_descriptors(struct iavf_ring *rx_ring) return ret; } -/** - * iavf_receive_skb - Send a completed packet up the stack - * @rx_ring: rx ring in play - * @skb: packet to send up - * @vlan_tag: vlan tag for packet - **/ -static void iavf_receive_skb(struct iavf_ring *rx_ring, - struct sk_buff *skb, u16 vlan_tag) -{ - struct iavf_q_vector *q_vector = rx_ring->q_vector; - - if ((rx_ring->netdev->features & NETIF_F_HW_VLAN_CTAG_RX) && - (vlan_tag & VLAN_VID_MASK)) - __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlan_tag); - else if ((rx_ring->netdev->features & NETIF_F_HW_VLAN_STAG_RX) && - vlan_tag & VLAN_VID_MASK) - __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021AD), vlan_tag); - - napi_gro_receive(&q_vector->napi, skb); -} - /** * __iavf_alloc_rx_pages - Replace used receive pages * @rx_ring: ring to place buffers on @@ -973,17 +952,13 @@ void iavf_alloc_rx_pages(struct iavf_ring *rxr) * @vsi: the VSI we care about * @skb: skb currently being received and modified * @qword: `wb.qword1.status_error_len` from the descriptor + * @parsed: parsed Rx packet type (decoded from the descriptor ptype field) **/ -static inline void iavf_rx_checksum(struct iavf_vsi *vsi, - struct sk_buff *skb, - u64 qword) +static void iavf_rx_checksum(struct iavf_vsi *vsi, struct sk_buff *skb, + u64 qword, struct libie_rx_ptype_parsed parsed) { - struct libie_rx_ptype_parsed parsed; - u32 ptype, rx_error, rx_status; - - ptype = (qword & IAVF_RXD_QW1_PTYPE_MASK) >> IAVF_RXD_QW1_PTYPE_SHIFT; + u32 rx_error, rx_status; - parsed = libie_parse_rx_ptype(ptype); if (!libie_has_rx_checksum(vsi->netdev, parsed)) return; @@ -1031,20 +1006,17 @@ static inline void iavf_rx_checksum(struct iavf_vsi *vsi, * @rx_desc: specific descriptor * @skb: skb currently being received and modified * @qword: `wb.qword1.status_error_len` from the descriptor + * @parsed: parsed Rx packet type (decoded from the descriptor ptype field) **/ -static inline void iavf_rx_hash(struct iavf_ring *ring, - union iavf_rx_desc *rx_desc, - struct sk_buff *skb, - u64 qword) +static void iavf_rx_hash(const struct iavf_ring *ring, + const union iavf_rx_desc *rx_desc, + struct sk_buff *skb, u64 qword, + struct libie_rx_ptype_parsed parsed) { const u64 rss_mask = (u64)IAVF_RX_DESC_FLTSTAT_RSS_HASH << IAVF_RX_DESC_STATUS_FLTSTAT_SHIFT; - struct libie_rx_ptype_parsed parsed; - u32 rx_ptype, hash; - - rx_ptype = FIELD_GET(IAVF_RXD_QW1_PTYPE_MASK, qword); + u32 hash; - parsed = libie_parse_rx_ptype(rx_ptype); if (!libie_has_rx_hash(ring->netdev, parsed) || (qword & rss_mask) != rss_mask) return; @@ -1053,6 +1025,34 @@ static inline void
iavf_rx_hash(struct iavf_ring *ring, libie_skb_set_hash(skb, hash, parsed); } +static void iavf_rx_vlan(const struct iavf_ring *rx_ring, + const union iavf_rx_desc *rx_desc, + struct sk_buff *skb, u64 qword) +{ + u16 vlan_tag; + __be16 prot; + + if (rx_ring->netdev->features & NETIF_F_HW_VLAN_CTAG_RX) + prot = htons(ETH_P_8021Q); + else if (rx_ring->netdev->features & NETIF_F_HW_VLAN_STAG_RX) + prot = htons(ETH_P_8021AD); + else + return; + + if ((qword & BIT(IAVF_RX_DESC_STATUS_L2TAG1P_SHIFT)) && + (rx_ring->flags & IAVF_TXRX_FLAGS_VLAN_TAG_LOC_L2TAG1)) + vlan_tag = le16_to_cpu(rx_desc->wb.qword0.lo_dword.l2tag1); + else if ((rx_ring->flags & IAVF_RXR_FLAGS_VLAN_TAG_LOC_L2TAG2_2) && + (rx_desc->wb.qword2.ext_status & + cpu_to_le16(BIT(IAVF_RX_DESC_EXT_STATUS_L2TAG2P_SHIFT)))) + vlan_tag = le16_to_cpu(rx_desc->wb.qword2.l2tag2_2); + else + vlan_tag = 0; + + if (vlan_tag & VLAN_VID_MASK) + __vlan_hwaccel_put_tag(skb, prot, vlan_tag); +} + /** * iavf_process_skb_fields - Populate skb header fields from Rx descriptor * @rx_ring: rx descriptor ring packet is being transacted on @@ -1064,19 +1064,21 @@ static inline void iavf_rx_hash(struct iavf_ring *ring, * order to populate the hash, checksum, VLAN, protocol, and * other fields within the skb. **/ -static inline -void iavf_process_skb_fields(struct iavf_ring *rx_ring, - union iavf_rx_desc *rx_desc, struct sk_buff *skb, - u64 qword) +void iavf_process_skb_fields(const struct iavf_ring *rx_ring, + const union iavf_rx_desc *rx_desc, + struct sk_buff *skb, u64 qword) { - iavf_rx_hash(rx_ring, rx_desc, skb, qword); + struct libie_rx_ptype_parsed parsed; + u32 ptype; + + ptype = FIELD_GET(IAVF_RXD_QW1_PTYPE_MASK, qword); + parsed = libie_parse_rx_ptype(ptype); - iavf_rx_checksum(rx_ring->vsi, skb, qword); + iavf_rx_hash(rx_ring, rx_desc, skb, qword, parsed); + iavf_rx_checksum(rx_ring->vsi, skb, qword, parsed); + iavf_rx_vlan(rx_ring, rx_desc, skb, qword); skb_record_rx_queue(skb, rx_ring->queue_index); - - /* modifies the skb - consumes the enet header */ - skb->protocol = eth_type_trans(skb, rx_ring->netdev); } /** @@ -1234,7 +1236,6 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) union iavf_rx_desc *rx_desc; u32 size, put_size; struct page *page; - u16 vlan_tag = 0; u64 qword; /* return some buffers to hardware, one at a time is too slow */ @@ -1347,16 +1348,9 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) /* populate checksum, VLAN, and protocol */ iavf_process_skb_fields(rx_ring, rx_desc, skb, qword); - if (qword & BIT(IAVF_RX_DESC_STATUS_L2TAG1P_SHIFT) && - rx_ring->flags & IAVF_TXRX_FLAGS_VLAN_TAG_LOC_L2TAG1) - vlan_tag = le16_to_cpu(rx_desc->wb.qword0.lo_dword.l2tag1); - if (rx_desc->wb.qword2.ext_status & - cpu_to_le16(BIT(IAVF_RX_DESC_EXT_STATUS_L2TAG2P_SHIFT)) && - rx_ring->flags & IAVF_RXR_FLAGS_VLAN_TAG_LOC_L2TAG2_2) - vlan_tag = le16_to_cpu(rx_desc->wb.qword2.l2tag2_2); - iavf_trace(clean_rx_irq_rx, rx_ring, rx_desc, skb); - iavf_receive_skb(rx_ring, skb, vlan_tag); + skb->protocol = eth_type_trans(skb, rx_ring->netdev); + napi_gro_receive(&rx_ring->q_vector->napi, skb); skb = NULL; /* update budget accounting */ diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.h b/drivers/net/ethernet/intel/iavf/iavf_txrx.h index 15f9dcceab3b9c..775eda86f05b54 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.h +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.h @@ -352,6 +352,9 @@ bool __iavf_chk_linearize(struct sk_buff *skb); DECLARE_STATIC_KEY_FALSE(iavf_xdp_locking_key); +void 
iavf_process_skb_fields(const struct iavf_ring *rx_ring, + const union iavf_rx_desc *rx_desc, + struct sk_buff *skb, u64 qword); int iavf_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, u32 flags); From 7f53f9553cef32b711c1270dbe8fb80f519233cb Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Wed, 30 Nov 2022 13:52:19 +0100 Subject: [PATCH 35/40] iavf: Implement XDP_PASS path in AF_XDP processing Construct skb and fill in its fields, when AF_XDP is enabled on the ring, if XDP program returns XDP_PASS. (will be fixed up). Signed-off-by: Larysa Zaremba --- drivers/net/ethernet/intel/iavf/iavf_trace.h | 8 +++ drivers/net/ethernet/intel/iavf/iavf_xsk.c | 74 +++++++++++++++++++- 2 files changed, 80 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_trace.h b/drivers/net/ethernet/intel/iavf/iavf_trace.h index ac46fbe55bd2e5..383a5375392a20 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_trace.h +++ b/drivers/net/ethernet/intel/iavf/iavf_trace.h @@ -161,6 +161,14 @@ DEFINE_EVENT( TP_ARGS(ring, desc, skb)); +DEFINE_EVENT( + iavf_rx_template, iavf_clean_rx_irq_zc_rx, + TP_PROTO(struct iavf_ring *ring, + union iavf_32byte_rx_desc *desc, + struct sk_buff *skb), + + TP_ARGS(ring, desc, skb)); + DECLARE_EVENT_CLASS( iavf_xmit_template, diff --git a/drivers/net/ethernet/intel/iavf/iavf_xsk.c b/drivers/net/ethernet/intel/iavf/iavf_xsk.c index edddad1abe2c7e..d6d3096401c153 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_xsk.c +++ b/drivers/net/ethernet/intel/iavf/iavf_xsk.c @@ -941,6 +941,8 @@ iavf_run_xdp_zc(struct iavf_ring *rx_ring, struct xdp_buff *xdp, } switch (xdp_act) { + case XDP_PASS: + break; case XDP_TX: err = iavf_xmit_xdp_tx_zc(xdp, xdp_ring); if (unlikely(err)) @@ -966,6 +968,42 @@ iavf_run_xdp_zc(struct iavf_ring *rx_ring, struct xdp_buff *xdp, return xdp_act; } +/** + * iavf_construct_skb_zc - Create an sk_buff from zero-copy buffer + * @rx_ring: Rx ring + * @xdp: Pointer to XDP buffer + * + * This function allocates a new skb from a zero-copy Rx buffer. + * + * Returns the skb on success, NULL on failure. 
+ */ +static struct sk_buff * +iavf_construct_skb_zc(struct iavf_ring *rx_ring, struct xdp_buff *xdp) +{ + unsigned int totalsize = xdp->data_end - xdp->data_meta; + unsigned int metasize = xdp->data - xdp->data_meta; + struct sk_buff *skb; + + net_prefetch(xdp->data_meta); + + skb = __napi_alloc_skb(&rx_ring->q_vector->napi, totalsize, + GFP_ATOMIC | __GFP_NOWARN); + if (unlikely(!skb)) + return NULL; + + memcpy(__skb_put(skb, totalsize), xdp->data_meta, + ALIGN(totalsize, sizeof(long))); + + if (metasize) { + skb_metadata_set(skb, metasize); + __skb_pull(skb, metasize); + } + + xsk_buff_free(xdp); + + return skb; +} + /** * iavf_clean_rx_irq_zc - consumes packets from the hardware ring * @rx_ring: AF_XDP Rx ring @@ -991,6 +1029,8 @@ int iavf_clean_rx_irq_zc(struct iavf_ring *rx_ring, int budget) while (likely(cleaned_count < budget)) { union iavf_rx_desc *rx_desc; struct xdp_buff *xdp; + unsigned int xdp_act; + struct sk_buff *skb; unsigned int size; u64 qword; @@ -1025,8 +1065,10 @@ int iavf_clean_rx_irq_zc(struct iavf_ring *rx_ring, int budget) xsk_buff_set_size(xdp, size); xsk_buff_dma_sync_for_cpu(xdp, rx_ring->xsk_pool); - iavf_run_xdp_zc(rx_ring, xdp, xdp_prog, xdp_ring, - &rxq_xdp_act); + xdp_act = iavf_run_xdp_zc(rx_ring, xdp, xdp_prog, xdp_ring, + &rxq_xdp_act); + if (xdp_act == XDP_PASS) + goto construct_skb; if (unlikely(rxq_xdp_act & IAVF_RXQ_XDP_ACT_STOP_NOW)) { failure = true; @@ -1040,6 +1082,34 @@ int iavf_clean_rx_irq_zc(struct iavf_ring *rx_ring, int budget) cleaned_count++; if (unlikely(++ntc == ring_size)) ntc = 0; + + continue; + +construct_skb: + skb = iavf_construct_skb_zc(rx_ring, xdp); + if (!skb) { + libie_stats_inc_one(&rx_ring->rq_stats, + build_skb_fail); + break; + } + + cleaned_count++; + if (unlikely(++ntc == ring_size)) + ntc = 0; + + prefetch(rx_desc); + + /* probably a little skewed due to removing CRC */ + stats.bytes += skb->len; + + /* populate checksum, VLAN, and protocol */ + iavf_process_skb_fields(rx_ring, rx_desc, skb, qword); + + iavf_trace(clean_rx_irq_zc_rx, rx_ring, rx_desc, skb); + skb->protocol = eth_type_trans(skb, rx_ring->netdev); + napi_gro_receive(&rx_ring->q_vector->napi, skb); + + stats.packets++; } rx_ring->next_to_clean = ntc; From 3e2fafcc43ad07a008dbde166d82c2e3a4e35d0d Mon Sep 17 00:00:00 2001 From: Michal Kubiak Date: Tue, 6 Dec 2022 15:32:48 +0100 Subject: [PATCH 36/40] iavf: Make request and free traffic irqs symmetric The existing implementation of the 'iavf_request_traffic_irqs()' function does not request any interrupt for q_vectors that have neither Tx nor Rx queues assigned to them. However, the function 'iavf_free_traffic_irqs()' releases interrupts for all q_vectors unconditionally. Such an approach may trigger a kernel warning about an attempt to release an interrupt that was never requested. In order to solve that potential issue, make both functions fully symmetric. Therefore, add logic to 'iavf_free_traffic_irqs()' to skip unused q_vectors.
Signed-off-by: Michal Kubiak --- drivers/net/ethernet/intel/iavf/iavf_main.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index b4abed2e2c3df1..e6807f1f5c937b 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -712,6 +712,7 @@ static int iavf_request_misc_irq(struct iavf_adapter *adapter) **/ static void iavf_free_traffic_irqs(struct iavf_adapter *adapter) { + struct iavf_q_vector *q_vector; int vector, irq_num, q_vectors; if (!adapter->msix_entries) @@ -720,10 +721,14 @@ static void iavf_free_traffic_irqs(struct iavf_adapter *adapter) q_vectors = adapter->num_msix_vectors - NONQ_VECS; for (vector = 0; vector < q_vectors; vector++) { + q_vector = &adapter->q_vectors[vector]; + if (!q_vector->tx.ring && !q_vector->rx.ring) + continue; + irq_num = adapter->msix_entries[vector + NONQ_VECS].vector; irq_set_affinity_notifier(irq_num, NULL); irq_update_affinity_hint(irq_num, NULL); - free_irq(irq_num, &adapter->q_vectors[vector]); + free_irq(irq_num, q_vector); } } From a3e00d6b17afd70b4e09db0c80f2f7d1264f7ae3 Mon Sep 17 00:00:00 2001 From: Michal Kubiak Date: Tue, 13 Dec 2022 12:31:06 +0100 Subject: [PATCH 37/40] iavf: Do not reset the number of requested queues When the user changes the number of queues, the new queue count is kept in the adapter structure member (num_req_queues). That information was always reset to zero right after the queue number change request had been processed. However, that structure member should always reflect the user's preference regarding the requested queue count, so it should be preserved across future driver reinitializations and when setting up the adapter for an XDP program. Stop resetting the number of requested queues to zero and give that value priority during subsequent reinitializations of the adapter, so that the queue count cannot be changed automatically outside of the user's control. Signed-off-by: Michal Kubiak --- drivers/net/ethernet/intel/iavf/iavf_main.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index e6807f1f5c937b..e8c69c5bf151fc 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -2617,7 +2617,6 @@ int iavf_parse_vf_resource_msg(struct iavf_adapter *adapter) return -EAGAIN; } - adapter->num_req_queues = 0; adapter->vsi.id = adapter->vsi_res->vsi_id; adapter->vsi.back = adapter; From c7845625fe7f54b4441225b0f0b6f2923d74d0de Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Tue, 24 Jan 2023 12:35:08 +0100 Subject: [PATCH 38/40] iavf: Limit number of channels in ethtool when XDP is enabled When XDP is enabled, the true maximum number of channels can be cut in half. For example, on a system with 10 CPUs and 16 queue pairs allowed by the PF, the maximum would normally be 10 channels, but XDP requires 2 queue pairs per channel, so only up to 8 channels can be used while a program is attached. This limit has to be reflected in ethtool.
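To illustrate the arithmetic above, here is a minimal sketch (not part of the patch; the helper name is made up for illustration, while iavf_adapter_xdp_active() and vsi_res->num_queue_pairs are existing driver symbols used elsewhere in this series):

/* Hypothetical sketch, not part of the series: the usable channel count
 * is capped by the CPU count and by the queue pairs granted by the PF;
 * with an XDP program attached, each channel consumes two queue pairs,
 * so only half of the granted pairs remain usable. E.g. 10 CPUs and
 * 16 pairs -> 10 channels without XDP, 8 channels with XDP.
 */
static u32 iavf_max_combined_channels(struct iavf_adapter *adapter)
{
	u32 pairs = adapter->vsi_res->num_queue_pairs;

	if (iavf_adapter_xdp_active(adapter))
		pairs /= 2;

	return min_t(u32, num_online_cpus(), pairs);
}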
Signed-off-by: Larysa Zaremba --- drivers/net/ethernet/intel/iavf/iavf_ethtool.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c index 0dcf50d75f8614..e14e5f84f6ebce 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c +++ b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c @@ -1644,6 +1644,7 @@ static int iavf_set_channels(struct net_device *netdev, struct ethtool_channels *ch) { struct iavf_adapter *adapter = netdev_priv(netdev); + u32 num_allowed = adapter->vsi_res->num_queue_pairs; u32 num_req = ch->combined_count; int i; @@ -1656,9 +1657,15 @@ static int iavf_set_channels(struct net_device *netdev, /* All of these should have already been checked by ethtool before this * even gets to us, but just to be sure. */ - if (num_req == 0 || num_req > adapter->vsi_res->num_queue_pairs) + if (num_req == 0 || num_req > num_allowed) return -EINVAL; + if (iavf_adapter_xdp_active(adapter) && num_req * 2 > num_allowed) { + netdev_err(netdev, "XDP is enabled, so the maximum allowed queue number is reduced to %u, but %u queues were requested\n", + num_allowed / 2, num_req); + return -EINVAL; + } + if (num_req == adapter->num_active_queues) return 0; From b7fbed3e98c2e638ff6a43a8603c2b0e9fa4e994 Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Fri, 24 Feb 2023 15:32:11 +0100 Subject: [PATCH 39/40] iavf: lock XDP queue while using in ZC mode To avoid races between .ndo_xdp_xmit(), normal ZC Tx processing and XDP_TX in ZC mode when queues are also shared, add locking to the latter two. Locking in .ndo_xdp_xmit() is already present. Signed-off-by: Larysa Zaremba --- drivers/net/ethernet/intel/iavf/iavf_xsk.c | 29 +++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_xsk.c b/drivers/net/ethernet/intel/iavf/iavf_xsk.c index d6d3096401c153..f0f88f25e4e011 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_xsk.c +++ b/drivers/net/ethernet/intel/iavf/iavf_xsk.c @@ -526,8 +526,12 @@ bool iavf_xmit_zc(struct iavf_ring *xdp_ring) struct xdp_desc *descs = xdp_ring->xsk_pool->tx_descs; struct libie_sq_onstack_stats stats = { }; u32 nb_processed = 0; + bool ret = true; int budget; + if (static_branch_unlikely(&iavf_xdp_locking_key)) + spin_lock(&xdp_ring->tx_lock); + iavf_clean_xdp_irq_zc(xdp_ring); budget = IAVF_DESC_UNUSED(xdp_ring); @@ -536,7 +540,7 @@ bool iavf_xmit_zc(struct iavf_ring *xdp_ring) stats.packets = xsk_tx_peek_release_desc_batch(xdp_ring->xsk_pool, budget); if (!stats.packets) - return true; + goto unlock; if (xdp_ring->next_to_use + stats.packets >= xdp_ring->count) { nb_processed = xdp_ring->count - xdp_ring->next_to_use; @@ -555,7 +559,12 @@ bool iavf_xmit_zc(struct iavf_ring *xdp_ring) if (xsk_uses_need_wakeup(xdp_ring->xsk_pool)) xsk_set_tx_need_wakeup(xdp_ring->xsk_pool); - return stats.packets < budget; + ret = stats.packets < budget; +unlock: + if (static_branch_unlikely(&iavf_xdp_locking_key)) + spin_unlock(&xdp_ring->tx_lock); + + return ret; } /** @@ -907,6 +916,20 @@ static int iavf_xmit_xdp_tx_zc(struct xdp_buff *xdp, return 0; } +static int iavf_xmit_xdp_tx_zc_locked(struct xdp_buff *xdp, + struct iavf_ring *xdp_ring) +{ + int ret; + + if (static_branch_unlikely(&iavf_xdp_locking_key)) + spin_lock(&xdp_ring->tx_lock); + ret = iavf_xmit_xdp_tx_zc(xdp, xdp_ring); + if (static_branch_unlikely(&iavf_xdp_locking_key)) + spin_unlock(&xdp_ring->tx_lock); + + return ret; +} + /** * iavf_run_xdp_zc - Run XDP
program and perform resulting action for ZC * @rx_ring: RX descriptor ring to transact packets on @@ -944,7 +967,7 @@ iavf_run_xdp_zc(struct iavf_ring *rx_ring, struct xdp_buff *xdp, case XDP_PASS: break; case XDP_TX: - err = iavf_xmit_xdp_tx_zc(xdp, xdp_ring); + err = iavf_xmit_xdp_tx_zc_locked(xdp, xdp_ring); if (unlikely(err)) goto xdp_err; From f71cafea4b538c1b76bcd74d720368a12d0d5624 Mon Sep 17 00:00:00 2001 From: Michal Kubiak Date: Thu, 9 Mar 2023 16:11:40 +0100 Subject: [PATCH 40/40] iavf: Enable AF_XDP zero-copy feature in netdev Enable NETDEV_XDP_ACT_XSK_ZEROCOPY feature in netdev structure. Signed-off-by: Michal Kubiak --- drivers/net/ethernet/intel/iavf/iavf_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index e8c69c5bf151fc..59e47968ad264a 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -2812,7 +2812,8 @@ static void iavf_init_config_adapter(struct iavf_adapter *adapter) iavf_set_ethtool_ops(netdev); netdev->max_mtu = LIBIE_MAX_MTU; - netdev->xdp_features = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT; + netdev->xdp_features = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT | + NETDEV_XDP_ACT_XSK_ZEROCOPY; if (!is_valid_ether_addr(adapter->hw.mac.addr)) { dev_info(&pdev->dev, "Invalid MAC address %pM, using random\n",