From ec131bf4749024b05470f6bb72f5273366b2f2c7 Mon Sep 17 00:00:00 2001 From: neuron-code-sharing-robot Date: Sat, 20 Dec 2025 01:35:32 +0000 Subject: [PATCH] Extracted contents of aws-neuronx-dkms-2.25.4.0.noarch.rpm --- Kbuild | 3 +- README.md | 3 - dkms.conf | 2 +- neuron_arch.c | 35 ++ neuron_arch.h | 25 +- neuron_cdev.c | 441 +++++++++++++++- neuron_core.c | 2 +- neuron_device.h | 8 +- neuron_dhal.c | 16 +- neuron_dhal.h | 36 +- neuron_dma.c | 503 +++++++++---------- neuron_dma.h | 93 +++- neuron_dmabuf.c | 9 +- neuron_fw_io.c | 218 +++++++- neuron_fw_io.h | 119 ++++- neuron_ioctl.h | 61 ++- neuron_metrics.c | 240 ++++++--- neuron_metrics.h | 78 ++- neuron_module.c | 8 +- neuron_pci.c | 21 +- neuron_power.c | 8 + neuron_reset.c | 11 +- neuron_ring.c | 266 +++++++++- neuron_ring.h | 67 ++- neuron_topsp.c | 2 +- share/neuron_driver_shared.h | 22 + share/neuron_driver_shared_tensor_batch_op.h | 24 + udma/udma.h | 6 +- udma/udma_m2m.c | 33 +- udma/udma_main.c | 50 +- v2/neuron_dhal_v2.c | 172 +++---- v3/neuron_dhal_v3.c | 356 +++++++------ v3/neuron_pelect.c | 85 +++- v4/address_map.h | 230 +++++++++ v4/neuron_dhal_v4.c | 468 +++++++++++++++++ vc/neuron_dhal_vc.c | 17 +- 36 files changed, 2874 insertions(+), 864 deletions(-) create mode 100644 share/neuron_driver_shared_tensor_batch_op.h create mode 100644 v4/address_map.h create mode 100644 v4/neuron_dhal_v4.c diff --git a/Kbuild b/Kbuild index 5eda22f..94f4589 100644 --- a/Kbuild +++ b/Kbuild @@ -14,10 +14,9 @@ neuron-objs += neuron_dmabuf.o neuron-objs += neuron_log.o neuron-objs += neuron_power.o neuron-objs += vc/neuron_dhal_vc.o -neuron-objs += v1/fw_io.o v1/putils.o v1/neuron_dhal_v1.o neuron-objs += v2/notific.o v2/neuron_dhal_v2.o neuron-objs += v3/notific.o v3/neuron_dhal_v3.o v3/neuron_pelect.o - +neuron-objs += v4/neuron_dhal_v4.o ccflags-y += -O3 -Wall -Werror -Wno-declaration-after-statement -Wunused-macros -Wunused-local-typedefs ccflags-y += -I$(src)/ ccflags-y += $(call cc-option,-march=armv8.2-a) 
diff --git a/README.md b/README.md index e00b7ec..f4525f5 100644 --- a/README.md +++ b/README.md @@ -46,9 +46,6 @@ Neuron Devices implement a communication channel (FWIO) that allows the driver a * neuron_cdev.c - char device interface. * fw_io.[ch] - Communication channel * udma/* - DMA engines and queues HAL -* v1/address_map.h - Neuron Device address space -* v1/putils.h - Notification HAL -* v1/tdma.h - Additional DMA HAL functionality # Compiling and Installing diff --git a/dkms.conf b/dkms.conf index f087853..03f894c 100644 --- a/dkms.conf +++ b/dkms.conf @@ -1,5 +1,5 @@ PACKAGE_NAME=aws-neuronx -PACKAGE_VERSION=2.24.7.0 +PACKAGE_VERSION=2.25.4.0 BUILT_MODULE_NAME[0]="neuron" MAKE[0]="make -C ${kernel_source_dir} M=${dkms_tree}/${PACKAGE_NAME}/${PACKAGE_VERSION}/build" CLEAN="make -C ${kernel_source_dir} M=${dkms_tree}/${PACKAGE_NAME}/${PACKAGE_VERSION}/build clean" diff --git a/neuron_arch.c b/neuron_arch.c index 69a7991..f3a6763 100644 --- a/neuron_arch.c +++ b/neuron_arch.c @@ -10,6 +10,13 @@ #define pr_fmt(fmt) "%s:%s: " fmt, KBUILD_MODNAME, __func__ #include "neuron_arch.h" +#include +#include +#include +#include +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 10, 0) +#include +#endif struct neuron_arch_info { enum neuron_arch arch; @@ -55,3 +62,31 @@ bool narch_is_emu(void) BUG_ON(arch_info.arch == NEURON_ARCH_INVALID); return arch_info.revision == REVID_EMU; } + +int narch_get_instance_type_name(char *instance_type_name, size_t instance_type_name_size) { +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 10, 0) + ssize_t len; + ssize_t file_size; + void *buf = kzalloc(PAGE_SIZE, GFP_KERNEL); + + if (buf == NULL) { + pr_err("failed to allocate buffer to read instance type"); + return -ENOMEM; + } + + len = kernel_read_file_from_path("/sys/class/dmi/id/product_name", + 0, &buf, 64, &file_size, READING_UNKNOWN); + if (!len) { + pr_err("read instance type failed"); + kfree(buf); + return -EIO; + } + + snprintf(instance_type_name, instance_type_name_size, "%s", 
(char *)buf); + + kfree(buf); + return 0; +#else + return -ENOSYS; +#endif +} diff --git a/neuron_arch.h b/neuron_arch.h index 6e438f5..c27e5b9 100644 --- a/neuron_arch.h +++ b/neuron_arch.h @@ -11,12 +11,19 @@ enum neuron_arch { NEURON_ARCH_INVALID, - NEURON_ARCH_V1 = 1, NEURON_ARCH_V2 = 2, NEURON_ARCH_V3 = 3, + NEURON_ARCH_V4 = 4, NEURON_ARCH_NUM }; +enum neuron_platform_type { + NEURON_PLATFORM_TYPE_STD = 0, + NEURON_PLATFORM_TYPE_ULTRASERVER = 1, + NEURON_PLATFORM_TYPE_PDS = 2, + NEURON_PLATFORM_TYPE_INVALID, +}; + /** * narch_init() - Set neuron devices architecture and revision. * @@ -55,4 +62,20 @@ bool narch_is_qemu(void); */ bool narch_is_emu(void); +/** + * narch_get_instance_type_name() - Reads instance type name from device DMI data. + * + * @instance_type_name: Buffer to store the instance type name string. + * @instance_type_name_size: Size of the instance_type_name buffer. + * + * Note: This function is only available on kernel versions 5.10.0 and above. + * + * Return: + * * 0 if read succeeds, + * * -ENOMEM - Failed to allocate temporary buffer for reading. + * * -EIO - Failed to read the DMI product_name file. + * * -ENOSYS - Kernel version is below 5.10.0, function not supported. 
+ */ +int narch_get_instance_type_name(char *instance_type_name, size_t instance_type_name_size); + #endif diff --git a/neuron_cdev.c b/neuron_cdev.c index 043e0a7..b8c3458 100644 --- a/neuron_cdev.c +++ b/neuron_cdev.c @@ -11,6 +11,7 @@ #define pr_fmt(fmt) "%s:%s: " fmt, KBUILD_MODNAME, __func__ #include +#include #include #include #include @@ -40,6 +41,7 @@ #include "neuron_cdev.h" #include "neuron_fw_io.h" #include "neuron_log.h" +#include "neuron_metrics.h" static dev_t neuron_dev; static int major; @@ -1143,7 +1145,8 @@ static int ncdev_mem_buf_copy(struct neuron_device *nd, unsigned int cmd, void * } } -static int ncdev_mem_buf_zerocopy64(struct neuron_device *nd, void *param) +#define BAR4_WR_THRESHOLD_MAX (PAGE_SIZE*2) +static int ncdev_mem_buf_zerocopy64(struct neuron_device *nd, unsigned int cmd, void *param) { void *buffer; struct mem_chunk *mc; @@ -1151,17 +1154,27 @@ static int ncdev_mem_buf_zerocopy64(struct neuron_device *nd, void *param) u32 copy_to_mem_handle; u64 offset; u64 size; + u32 bar4_wr_threshold; + int h2t_qid; int ret; + struct neuron_ioctl_mem_buf_copy64zc arg; + bool use_bar4_wr; + + // TODO remove at some point + if (_IOC_SIZE(cmd) != sizeof(arg)) { + pr_err_once("error experimental zerocopy API is now obsolete. Please upgrade to latest driver"); + return -EINVAL; + } - struct neuron_ioctl_mem_buf_copy64 arg; ret = neuron_copy_from_user(__func__, &arg, (struct neuron_ioctl_mem_buf_copy64 *)param, sizeof(arg)); if (ret) return ret; mem_handle = arg.mem_handle; buffer = arg.buffer; - copy_to_mem_handle = arg.copy_to_mem_handle; + copy_to_mem_handle = arg.is_copy_to_device; offset = arg.offset; size = arg.size; + h2t_qid = arg.h2t_qid; mc = ncdev_mem_handle_to_mem_chunk(nd, mem_handle); if (!mc) @@ -1172,7 +1185,241 @@ static int ncdev_mem_buf_zerocopy64(struct neuron_device *nd, void *param) return -EINVAL; } - return ndma_memcpy_zero_copy_mc(nd, buffer, mc, offset, size, copy_to_mem_handle ? 
true : false); + if (unlikely(!access_ok(buffer, size))) { + return -EFAULT; + } + + // limit to internal threshold to prevent DoS attack + bar4_wr_threshold = (arg.bar4_wr_threshold < BAR4_WR_THRESHOLD_MAX) ? arg.bar4_wr_threshold : BAR4_WR_THRESHOLD_MAX; + use_bar4_wr = !narch_is_qemu() && + (size <= bar4_wr_threshold) && + copy_to_mem_handle && + nd->npdev.bar4_pa && + (mc->mem_location == MEM_LOC_DEVICE) && + IS_ALIGNED(size, 4) && + IS_ALIGNED(offset, 4); + + // For smallish transfers, just do "copy from" directly to bar4 + // simulation (inkling) does not have bar4 mapped to the actual memory, don't do it + if (use_bar4_wr) { + u64 cpy_offset; + ndhal->ndhal_mmap.mmap_get_bar4_offset(mc->pa + offset, size, &cpy_offset); + // copy from user is slow, try fast copy and fall back if fails + pagefault_disable(); + ret = __copy_from_user_inatomic(nd->npdev.bar4 + cpy_offset, buffer, size); + pagefault_enable(); + if (unlikely(ret)) { + ret = neuron_copy_from_user(__func__, nd->npdev.bar4 + cpy_offset, buffer, size); + } + } else { + nrt_tensor_batch_op_t op; + + u32 nc_id = ndma_mc_pair_to_nc(mc, mc); + int qid = (h2t_qid == NEURON_DMA_H2T_DEFAULT_QID) ? ndhal->ndhal_ndmar.ndmar_get_h2t_def_qid(nc_id) : h2t_qid; + dma_addr_t dev_base = ndma_mc_to_pa(mc); // the caller already does the range check for dev_base+offset + + if (!ndmar_qid_valid(qid)) { + pr_err("nd%02d: invalid h2t queue index %d", nd->device_index, qid); + return -ENOENT; + } + + if (!ndma_zerocopy_supported()) { + pr_err_once("nd%02d: zero copy is not supported for architectures requiring DMA retry", nd->device_index); + return -EINVAL; + } + + op.offset = offset; + op.buffer = buffer; + op.size = size; + + ret = ndma_memcpy_zerocopy(nd, nc_id, &op, 1, dev_base, qid, copy_to_mem_handle ? 
true : false); + } + + return ret; +} + +static int ncdev_mem_buf_zerocopy64_batch(struct neuron_device *nd, void *param) +{ + int ret = 0; + u32 i, j = 0; + + struct neuron_ioctl_mem_buf_copy64zc_batches arg = {0}; + neuron_memcpy_batch_t *batches = NULL; + nrt_tensor_batch_op_t *ops_buffer = NULL; + struct mem_chunk *mc = NULL; + size_t total_ops = 0; + size_t ops_buffer_offset = 0; + const size_t op_size = sizeof(*ops_buffer); + u32 bar4_wr_threshold = 0; + bool use_bar4_wr = false; + + // copy IOCTL struct from user space + ret = neuron_copy_from_user(__func__, &arg, (struct neuron_ioctl_mem_buf_copy64zc_batches *)param, sizeof(arg)); + if (ret) + return ret; + + // validate batches + if (!arg.batches_ptr) { + pr_err("invalid batches pointer\n"); + return -EINVAL; + } + if (arg.num_batches == 0) { + pr_err("the number of batches is 0\n"); + return -EINVAL; + } + + // allocate and copy the batches array from user space + batches = kzalloc(arg.num_batches * sizeof(neuron_memcpy_batch_t), GFP_KERNEL); + if (!batches) { + pr_err("failed to allocate memory for batches\n"); + return -ENOMEM; + } + ret = neuron_copy_from_user(__func__, batches, arg.batches_ptr, arg.num_batches * sizeof(neuron_memcpy_batch_t)); + if (ret) { + pr_err("failed to copy batches from user space\n"); + goto cleanup; + } + + for (i = 0; i < arg.num_batches; i++) { + neuron_memcpy_batch_t batch = batches[i]; + size_t num_ops = batch.num_ops; + + if (num_ops == 0) { + pr_err("the number of operations is 0 for batch %u\n", i); + ret = -EINVAL; + goto cleanup; + } + if (!batch.ops_ptr) { + pr_err("the ops pointer is NULL for batch %u\n", i); + ret = -EINVAL; + goto cleanup; + } + if (num_ops > SIZE_MAX - total_ops) { + pr_err("too many operations requested across batches\n"); + ret = -EINVAL; + goto cleanup; + } + + total_ops += num_ops; + } + + // Holds the ops across batches + ops_buffer = kzalloc(total_ops * op_size, GFP_KERNEL); + if (!ops_buffer) { + pr_err("failed to allocate memory for ops 
across batches\n"); + ret = -ENOMEM; + goto cleanup; + } + + ops_buffer_offset = 0; + + for (i = 0; i < arg.num_batches; i++) { + neuron_memcpy_batch_t *batch = &batches[i]; + size_t num_ops = batch->num_ops; + void __user *user_ops_ptr = (void __user *)batch->ops_ptr; + nrt_tensor_batch_op_t *ops = ops_buffer + ops_buffer_offset; + + // copy the ops array from user space into the ops buffer + ret = neuron_copy_from_user(__func__, ops, user_ops_ptr, num_ops * op_size); + if (ret) { + pr_err("failed to copy ops from user space\n"); + goto cleanup; + } + batch->ops_ptr = ops; + ops_buffer_offset += num_ops; + + mc = ncdev_mem_handle_to_mem_chunk(nd, batch->mem_handle); + if (!mc) { + pr_err("invalid mem handle %llx for batch %u\n", batch->mem_handle, i); + ret = -EINVAL; + goto cleanup; + } + + bar4_wr_threshold = (batch->bar4_wr_threshold < BAR4_WR_THRESHOLD_MAX) ? batch->bar4_wr_threshold : BAR4_WR_THRESHOLD_MAX; + use_bar4_wr = !narch_is_qemu() && arg.is_copy_to_device && nd->npdev.bar4_pa && mc->mem_location == MEM_LOC_DEVICE; + + for (j = 0; j < batch->num_ops; j++) { + nrt_tensor_batch_op_t *op = &ops[j]; + // validate each operation + if (op->size == 0) { + pr_err("op %u of batch %u: the transfer size is 0\n", j, i); + ret = -EINVAL; + goto cleanup; + } + if (op->buffer == NULL) { + pr_err("op %u of batch %u: buffer is NULL\n", j, i); + ret = -EINVAL; + goto cleanup; + } + // validate and update offset + op->offset += batch->mem_handle_offset; + if (!mc_access_is_within_bounds(mc, op->offset, op->size)) { + pr_err("op %u of batch %u: device offset+size out of bounds\n", j, i); + ret = -EINVAL; + goto cleanup; + } + // validate buffer + if (unlikely(!access_ok(op->buffer, op->size))) { + pr_err("op %u of batch %u: invalid host buffer\n", j, i); + ret = -EFAULT; + goto cleanup; + } + + if (op->size > bar4_wr_threshold || !IS_ALIGNED(op->size, 4) || !IS_ALIGNED(op->offset, 4)) { + use_bar4_wr = false; + } + } + + // For smallish transfers, just do "copy from" 
directly to bar4 + // simulation (inkling) does not have bar4 mapped to the actual memory, don't do it + if (use_bar4_wr) { + for (j = 0; j < batch->num_ops; j++) { + const nrt_tensor_batch_op_t op = batch->ops_ptr[j]; + + u64 cpy_offset = 0; + ndhal->ndhal_mmap.mmap_get_bar4_offset(mc->pa + op.offset, op.size, &cpy_offset); + // copy from user is slow, try fast copy and fall back if fails + pagefault_disable(); + ret = __copy_from_user_inatomic(nd->npdev.bar4 + cpy_offset, op.buffer, op.size); + pagefault_enable(); + if (unlikely(ret)) { + ret = neuron_copy_from_user(__func__, nd->npdev.bar4 + cpy_offset, op.buffer, op.size); + if (ret) { + pr_err("failed to do bar4 write on batch %d op %d on nd%02d: %d\n", i, j, nd->device_index, ret); + goto cleanup; + } + } + } + } else { + u32 nc_id = ndma_mc_pair_to_nc(mc, mc); + int qid = (arg.h2t_qid == NEURON_DMA_H2T_DEFAULT_QID) ? ndhal->ndhal_ndmar.ndmar_get_h2t_def_qid(nc_id) : arg.h2t_qid; + dma_addr_t dev_base = ndma_mc_to_pa(mc); // the caller already does the range check for dev_base+offset + + if (!ndmar_qid_valid(qid)) { + pr_err("nd%02d: invalid h2t queue index %d", nd->device_index, qid); + return -ENOENT; + } + + if (!ndma_zerocopy_supported()) { + pr_err_once("nd%02d: zero copy is not supported for architectures requiring DMA retry", nd->device_index); + return -EINVAL; + } + + // use the zero-copy batch function for ops within a single batch + ret = ndma_memcpy_zerocopy(nd, nc_id, batch->ops_ptr, batch->num_ops, dev_base, qid, arg.is_copy_to_device); + if (ret) { + pr_err("batch zero-copy DMA failed on batch %d on nd%02d: %d\n", i, nd->device_index, ret); + goto cleanup; + } + } + } + +cleanup: + if (ops_buffer) + kfree(ops_buffer); + if (batches) + kfree(batches); + return ret; } static long ncdev_semaphore_ioctl(struct neuron_device *nd, unsigned int cmd, void *param) @@ -1279,6 +1526,55 @@ static long ncdev_bar_read(struct neuron_device *nd, u8 bar, u64 *reg_addresses, return ret; } +/** + * 
ncdev_bar_write_data() - write data to bar + * + * @param nd: neuron device + * @param bar: the BAR to write to + * @param reg_addresses + * @param data: the data to be written into the bar + * @param data_count: the number of data to be written + * @return 0 on success, otherwise failure +*/ +static int ncdev_bar_write_data(struct neuron_device *nd, u8 bar, u64 *reg_addresses, u32 *data, u32 data_count) +{ + if (bar == 0) { + int i; + for (i = 0; i < data_count; i++) { + u64 off = reg_addresses[i] - (u64)nd->npdev.bar0; + if (off > nd->npdev.bar0_size) { + return -EINVAL; + } + if (ndhal->ndhal_ndma.ndma_is_bar0_write_blocked(off)) { + return -EINVAL; + } + writel(data[i], nd->npdev.bar0 + off); + trace_bar_write(nd, bar, off, data[i]); + } + } else if (bar == 4) { + // TODO: we don't have any use case for r/w memory over the BAR right now. Disabling. + // + // We'd like to use DMA for r/w of BAR4 because we might expect access to large amounts of data. + // Access via DMA requires an application to own a TPB because it determines which of the h2t DMAs + // are safe to use, otherwise a TPB along with its DMA could be reset while that DMA is used here. + // Don't want/need to solve it now. 
+ return -EINVAL; + + /* + dma_addr_t dst_addr = reg_addresses[0] - (u64)nd->npdev.bar0; + + ret = ndma_memcpy(nd, 0, virt_to_phys(data) | ndhal->ndhal_address_map.pci_host_base, dst_addr, data_size); + if (ret) + return ret; + */ + } else { + pr_err("direct BAR%d write is not supported.\n", bar); + return -EINVAL; + } + + return 0; +} + static long ncdev_bar_write(struct neuron_device *nd, u8 bar, u64 *reg_addresses, void *user_va, u32 data_count) { @@ -1293,7 +1589,7 @@ static long ncdev_bar_write(struct neuron_device *nd, u8 bar, u64 *reg_addresses if (ret) goto done; - ret = ndhal->ndhal_cdev.ncdev_bar_write_data(nd, bar, reg_addresses, data, data_count); + ret = ncdev_bar_write_data(nd, bar, reg_addresses, data, data_count); if (ret) goto done; done: @@ -1356,12 +1652,24 @@ static long ncdev_post_metric(struct neuron_device *nd, void *param) ret = neuron_copy_from_user(__func__, data, arg.data, arg.data_size); if (ret) goto done; - ret = fw_io_post_metric(nd->fw_io_ctx, (u8 *)data, arg.data_size); + ret = ndhal->ndhal_fw_io.fw_io_post_metric(nd->fw_io_ctx, (u8 *)data, arg.data_size); done: kfree(data); return ret; } +static long ncdev_metric_ctrl(struct neuron_device *nd, void *param) +{ + int ret; + struct neuron_ioctl_metrics_ctrl arg; + ret = neuron_copy_from_user(__func__, &arg, (struct neuron_ioctl_metrics_ctrl *)param, sizeof(arg)); + if (ret) + return ret; + + nmetric_set_mode(nd, arg.mode); + return 0; +} + static long ncdev_read_hw_counters(struct neuron_device *nd, void *param) { int ret; @@ -1583,7 +1891,8 @@ static long ncdev_driver_info(unsigned int cmd, void *param) driver_info.feature_flags1 = NEURON_DRIVER_FEATURE_DMABUF | NEURON_DRIVER_FEATURE_ASYNC_DMA | NEURON_DRIVER_FEATURE_BATCH_DMAQ_INIT | NEURON_DRIVER_FEATURE_BIG_CORE_MAPS | NEURON_DRIVER_FEATURE_MEM_ALLOC_TYPE | NEURON_DRIVER_FEATURE_HBM_SCRUB | - NEURON_DRIVER_FEATURE_MEM_ALLOC64 | NEURON_DRIVER_FEATURE_CONTIGUOUS_SCRATCHPAD; + NEURON_DRIVER_FEATURE_MEM_ALLOC64 | 
NEURON_DRIVER_FEATURE_CONTIGUOUS_SCRATCHPAD | + NEURON_DRIVER_FEATURE_ZEROCOPY; return copy_to_user(param, &driver_info, sizeof(driver_info)); } @@ -2311,7 +2620,7 @@ static long ncdev_hbm_scrub_start(struct neuron_device *nd, void *param) { struct ndma_eng *eng = &nd->ndma_engine[eng_id]; struct ndma_queue *queue = &eng->queues[qid]; struct ndma_ring *ring = &queue->ring_info; - ret = ndma_memcpy_add_completion_desc(eng, ring, completion_bufs[i]); + ret = ndma_memcpy_add_completion_desc(eng, ring, completion_bufs[i], UDMA_M2M_BARRIER_NONE); if (ret) { goto scrub_init_fail; } @@ -2587,6 +2896,109 @@ static int ncdev_pod_ctrl(struct file *filep, unsigned int cmd, void *param) return ret; } +static int ncdev_h2t_dma_alloc_queues(struct neuron_device *nd, unsigned int cmd, void *param) +{ + int ret; + int i; + int qid; + struct neuron_ioctl_h2t_dma_alloc_queues arg; + + ret = neuron_copy_from_user(__func__, &arg, (struct neuron_ioctl_h2t_dma_alloc_queues*)param, sizeof(arg)); + if (ret) + return ret; + + if (arg.nc_id >= ndhal->ndhal_address_map.nc_per_device) { + pr_err("nd%02d: invalid nc %d provided", nd->device_index, arg.nc_id); + return -E2BIG; + } + + if (arg.copy_queue_cnt + arg.service_queue_cnt >= DMA_MAX_Q_MAX) { + pr_err("nd%02d: invalid total queue count %d provided", nd->device_index, arg.copy_queue_cnt + arg.service_queue_cnt); + return -E2BIG; + } + + arg.copy_queue_bmap = 0; + arg.service_queue_bmap = 0; + + for (i=0; i < arg.copy_queue_cnt; i++) { + ret = ndmar_h2t_ring_request(nd, arg.nc_id, true, &qid); + if (ret) { + goto done; + } + arg.copy_queue_bmap |= (1<ndhal_ndmar.ndmar_get_h2t_def_qid(arg.nc_id); + + ret = copy_to_user(param, &arg, sizeof(arg)); + +done: + if (ret) { + u32 combined_queue_bmap = arg.copy_queue_bmap | arg.service_queue_bmap; + for (i=0; i < DMA_MAX_Q_V4; i++) { + if ((1<= ndhal->ndhal_address_map.nc_per_device) { + pr_err("nd%02d: invalid nc %d provided", nd->device_index, arg.nc_id); + return -E2BIG; + } + + for (i=0; i < 
DMA_MAX_Q_V4; i++) { + int lret; + if ((1<ndhal_perf.perf_set_profile(nd, arg.profile); +} + inline static long ncdev_misc_ioctl(struct file *filep, unsigned int cmd, unsigned long param) { if ((cmd == NEURON_IOCTL_CRWL_NC_RANGE_MARK) || (cmd == NEURON_IOCTL_CRWL_NC_RANGE_MARK_EXT0)) { return ncdev_crwl_nc_range_mark(filep, cmd, (void *)param); @@ -2741,7 +3153,9 @@ static long ncdev_ioctl(struct file *filep, unsigned int cmd, unsigned long para } else if (_IOC_NR(cmd) == _IOC_NR(NEURON_IOCTL_MEM_BUF_COPY)) { return ncdev_mem_buf_copy(nd, cmd, (void *)param); } else if (_IOC_NR(cmd) == _IOC_NR(NEURON_IOCTL_MEM_BUF_ZEROCOPY64)) { - return ncdev_mem_buf_zerocopy64(nd, (void *)param); + return ncdev_mem_buf_zerocopy64(nd, cmd, (void *)param); + } else if (cmd == NEURON_IOCTL_MEM_BUF_ZEROCOPY64_BATCHES) { + return ncdev_mem_buf_zerocopy64_batch(nd, (void *)param); } else if (cmd == NEURON_IOCTL_PROGRAM_ENGINE) { return ncdev_program_engine(nd, (void *)param); } else if (_IOC_NR(cmd) == _IOC_NR(NEURON_IOCTL_PROGRAM_ENGINE_NC)) { @@ -2766,6 +3180,8 @@ static long ncdev_ioctl(struct file *filep, unsigned int cmd, unsigned long para return ncdev_bar_rw(nd, (void *)param, false); } else if (cmd == NEURON_IOCTL_POST_METRIC) { return ncdev_post_metric(nd, (void *)param); + } else if (cmd == NEURON_IOCTL_METRICS_CTRL) { + return ncdev_metric_ctrl(nd, (void *)param); } else if (cmd == NEURON_IOCTL_NOTIFICATIONS_INIT_V1) { return ncdev_nc_nq_init_deprecated(nd, (void *)param); } else if (cmd == NEURON_IOCTL_NOTIFICATIONS_INIT_V2) { @@ -2804,7 +3220,14 @@ static long ncdev_ioctl(struct file *filep, unsigned int cmd, unsigned long para return ncdev_hbm_scrub_start(nd, (void*)param); } else if (cmd == NEURON_IOCTL_HBM_SCRUB_WAIT) { return ncdev_hbm_scrub_wait_for_cmpl(nd, (void*)param); + } else if (cmd == NEURON_IOCTL_H2T_DMA_ALLOC_QUEUES) { + return ncdev_h2t_dma_alloc_queues(nd, cmd, (void*)param); + } else if (cmd == NEURON_IOCTL_H2T_DMA_FREE_QUEUES) { + return 
ncdev_h2t_dma_free_queues(nd, cmd, (void*)param); + } else if (cmd == NEURON_IOCTL_POWER_PROFILE) { + return ncdev_power_profile_set(nd, (void*)param); } + // B/W compatibility return ncdev_misc_ioctl(filep, cmd, param); } diff --git a/neuron_core.c b/neuron_core.c index 477985e..89c3d4a 100644 --- a/neuron_core.c +++ b/neuron_core.c @@ -3,7 +3,7 @@ * Copyright 2020, Amazon.com, Inc. or its affiliates. All Rights Reserved */ -/** Each neuron device has N number of neuron cores. (v1 has 4 neuron cores; v2 has 2 neuron cores). +/** Each neuron device has N number of neuron cores. * * Engines: * ------- diff --git a/neuron_device.h b/neuron_device.h index 46b2b67..93fd781 100644 --- a/neuron_device.h +++ b/neuron_device.h @@ -33,13 +33,11 @@ /* Vendor / Device ID for all devices supported by the driver */ #define AMZN_VENDOR_ID 0x1D0F -#define INF1_DEVICE_ID0 0x7064 -#define INF1_DEVICE_ID1 0x7065 -#define INF1_DEVICE_ID2 0x7066 -#define INF1_DEVICE_ID3 0x7067 #define INF2_DEVICE_ID0 0x7264 #define TRN1_DEVICE_ID0 0x7164 #define TRN2_DEVICE_ID0 0x7364 +#define TRN3_DEVICE_ID0 0x7564 +#define TRN3_DEVICE_ID1 0x7565 // Global host memory buf size used for memset the device memory #define MEMSET_HOST_BUF_SIZE MAX_DMA_DESC_SIZE // guessed optimal DMA transfer and PCIe TLP size. 
@@ -116,7 +114,7 @@ struct neuron_device { u64 nc_model_started_count[MAX_NC_PER_DEVICE]; // number of times the NCs has started model struct nsysfsmetric_metrics sysfs_metrics; - + struct neuron_log_obj log_obj; // logging object struct neuron_hbm_scrub_ctx hbm_scrub_ctx; diff --git a/neuron_dhal.c b/neuron_dhal.c index 83ce1a1..bb269c2 100644 --- a/neuron_dhal.c +++ b/neuron_dhal.c @@ -29,21 +29,23 @@ int neuron_dhal_init(unsigned int pci_device_id) { } mutex_unlock(&ndhal_init_lock); - ndhal->arch = narch_get_arch(); + ndhal->ndhal_arch.arch = narch_get_arch(); ndhal->pci_device_id = pci_device_id; - ret = ndhal_register_funcs_vc(); - switch (ndhal->arch) { - case NEURON_ARCH_V1: - ret = ndhal_register_funcs_v1(); - break; + ndhal_register_funcs_vc(); + + switch (ndhal->ndhal_arch.arch) { case NEURON_ARCH_V2: ret = ndhal_register_funcs_v2(); break; case NEURON_ARCH_V3: ret = ndhal_register_funcs_v3(); break; + case NEURON_ARCH_V4: + ret = ndhal_register_funcs_v3(); // use v3 as base + ret = ndhal_register_funcs_v4(); // apply v4 overrides + break; default: - pr_err("Unknown HW architecture: %d. Can't init neuron_dhal.\n", ndhal->arch); + pr_err("Unknown HW architecture: %d. Can't init neuron_dhal.\n", ndhal->ndhal_arch.arch); return -EINVAL; } diff --git a/neuron_dhal.h b/neuron_dhal.h index a7089ad..bbdbbe5 100644 --- a/neuron_dhal.h +++ b/neuron_dhal.h @@ -12,8 +12,21 @@ #define dhal_sysfs_emit(buf, ...) 
sysfs_emit((buf), __VA_ARGS__) #endif +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 14, 0) +#define dhal_atomic_fetch_add(v,a) atomic_fetch_add(v, (a)) +#else +#define dhal_atomic_fetch_add(v,a) (atomic_add_return(v, (a)) -1) +#endif + + + extern int force_die_flip; +struct ndhal_arch { + int arch; + enum neuron_platform_type platform_type; + u32 server_id; +}; struct ndhal_address_map { // addresses uint64_t pci_host_base; @@ -44,7 +57,6 @@ struct ndhal_address_map { struct ndhal_reset { uint64_t reset_poll_interval; uint64_t reset_tpb_initial_poll_delay; - uint64_t reset_device_initial_poll_delay; uint64_t initiate_max_wait_time; uint32_t retry_count; int (*nr_initiate_reset) (struct neuron_device *nd, uint32_t nc_map); @@ -82,8 +94,8 @@ struct ndhal_mpset { struct ndhal_ndmar { uint32_t (*ndmar_get_h2t_eng_id) (struct neuron_device *nd, uint32_t nc_id); - int (*ndmar_get_h2t_qid) (uint32_t nc_id); - bool (*ndmar_is_h2t_q) (struct neuron_device *nd, uint32_t eng_id, uint32_t q_id); + int (*ndmar_get_h2t_def_qid) (uint32_t nc_id); + bool (*ndmar_is_h2t_def_q) (struct neuron_device *nd, uint32_t eng_id, uint32_t q_id); bool (*nr_init_h2t_eng) ( int nc_idx, uint32_t nc_map); bool (*ndmar_is_nx_ring) (uint32_t eng_id, uint32_t q_id); int (*ndmar_quiesce_queues) (struct neuron_device *nd, u32 nc_id, u32 engine_count, u32 *queue_mask); @@ -94,6 +106,8 @@ struct ndhal_fw_io { int (*fw_io_topology) (struct fw_io_ctx *ctx, int pdev_index, int device_id, u32 *connected_device_ids, int *count); int (*fw_io_register_readless_read_region) (struct fw_io_ctx *ctx, void __iomem *bar0, u64 bar0_size, void __iomem *bar2, u64 bar2_size); int (*fw_io_read_csr_array) (void **addrs, u32 *values, u32 num_csrs, bool operational); + int (*fw_io_execute_request) (struct fw_io_ctx *ctx, u8 command_id, const u8 *req, u32 req_size, u8 *resp, u32 resp_size); + int (*fw_io_post_metric) (struct fw_io_ctx *ctx, u8 *data, u32 size); }; struct ndhal_reg_access { @@ -153,15 +167,12 @@ struct 
ndhal_cdev { void (*ncdev_compatible_version) (struct neuron_ioctl_compatible_version *arg); void (*ncdev_quiesce_exec_on_proc_exit) (void); - int (*ncdev_bar_write_data) (struct neuron_device *nd, u8 bar, u64 *reg_addresses, u32 *data, u32 data_count); int (*ncdev_logical_to_physical_nc_map)(struct neuron_ioctl_nc_map *map, uint32_t max_num_entries, enum neuron_ioctl_nc_mapping_type mapping_type); void (*ncdev_get_default_tpbs_for_hbm) (u32 hbm_index, u32 tpbs[MAX_NC_PER_DEVICE], u32 *tpb_count); }; struct ndhal_udma { unsigned int num_beats; - void (*udma_m2s_data_rd_cfg_boundaries_set) (struct udma *udma); - void (*udma_q_config) (struct udma_q *udma_q); }; struct ndhal_ndma { @@ -183,6 +194,7 @@ struct ndhal_npe { ssize_t (*npe_class_node_id_show_data)(char *buf, u32 sz); ssize_t (*npe_class_server_id_show_data)(char *buf, u32 sz); ssize_t (*npe_class_ultraserver_mode_show_data)(char *buf); + u32 (*npe_neighbor_eng_ids)[2]; }; struct ndhal_tpb { @@ -201,10 +213,15 @@ struct ndhal_tpb { int (*pe_format_activity_stats)(struct neuron_device *nd, int nc_id, char buffer[], unsigned int bufflen); }; +struct ndhal_perf { + int current_performance_profile; + int (*perf_set_profile) (struct neuron_device *nd, uint32_t profile); +}; + struct neuron_dhal { - int arch; unsigned int pci_device_id; + struct ndhal_arch ndhal_arch; struct ndhal_address_map ndhal_address_map; struct ndhal_reset ndhal_reset; struct ndhal_topsp ndhal_topsp; @@ -222,6 +239,7 @@ struct neuron_dhal { struct ndhal_ndma ndhal_ndma; struct ndhal_npe ndhal_npe; struct ndhal_tpb ndhal_tpb; + struct ndhal_perf ndhal_perf; void (*ndhal_ext_cleanup) (void); }; @@ -253,13 +271,13 @@ void neuron_dhal_cleanup(void); void neuron_dhal_free(void); /** - * ndhal_register_funcs() - Register functions v1 (or inf1) v2 (or trn1 inf2) to the ndhal + * ndhal_register_funcs() - Register functions based on hardward arch to the ndhal * * @return int 0 on success, negative for failures */ int ndhal_register_funcs_vc(void); 
-int ndhal_register_funcs_v1(void); int ndhal_register_funcs_v2(void); int ndhal_register_funcs_v3(void); +int ndhal_register_funcs_v4(void); #endif diff --git a/neuron_dma.c b/neuron_dma.c index 04fb3a4..5f7cbc0 100644 --- a/neuron_dma.c +++ b/neuron_dma.c @@ -22,6 +22,9 @@ DECLARE_FAULT_ATTR(neuron_fail_dma_wait); #endif +int zerocopy_trn1_override = 0; +module_param(zerocopy_trn1_override, int, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP); +MODULE_PARM_DESC(zerocopy_trn1_override, "override zerocopy for trn1"); //#define NUNUSED __attribute__ ((unused)) @@ -37,7 +40,7 @@ static void ndma_ack_completed_desc(struct ndma_eng *eng, struct ndma_ring *ring udma_cdesc_ack(txq, count); } -static inline u32 ndma_mc_pair_to_nc( struct mem_chunk *src_mc, struct mem_chunk *dst_mc) +u32 ndma_mc_pair_to_nc(struct mem_chunk *src_mc, struct mem_chunk *dst_mc) { if (src_mc->mem_location != MEM_LOC_HOST) return src_mc->nc_id; @@ -88,7 +91,7 @@ static inline int ndma_dma_ctx_get_next_handle( int pdma_ctx_handle, int * dma_c * memchunk to dma phy addr * */ -static inline dma_addr_t ndma_mc_to_pa( struct mem_chunk *mc) +dma_addr_t ndma_mc_to_pa(struct mem_chunk *mc) { if (mc->mem_location == MEM_LOC_HOST) return virt_to_phys(mc->va) | ndhal->ndhal_address_map.pci_host_base; // why isn't this already set??? 
@@ -166,7 +169,7 @@ static inline struct ndma_h2t_dma_context * ndma_get_dma_ctx( struct ndma_eng *e if (eng->used_for_h2t) return &ring->h2t_dma_ctx[dma_ctx_handle]; else { - pr_info("allocating descriptor for non-h2t\n"); // FIXME remove at some point + pr_info_once("allocating descriptor for non-h2t\n"); return kmalloc( sizeof(struct ndma_h2t_dma_context), GFP_KERNEL); } } @@ -191,7 +194,7 @@ static inline void ndma_release_dma_ctx( struct ndma_eng *eng, struct ndma_ring * add a completion entry to the ring * */ -int ndma_memcpy_add_completion_desc( struct ndma_eng *eng, struct ndma_ring *ring, void * completion_buffer) +int ndma_memcpy_add_completion_desc( struct ndma_eng *eng, struct ndma_ring *ring, void * completion_buffer, int barrier_type) { int ret = 0; struct udma_ring_ptr completion; @@ -210,7 +213,7 @@ int ndma_memcpy_add_completion_desc( struct ndma_eng *eng, struct ndma_ring *rin completion.addr = virt_to_phys(completion.ptr) | ndhal->ndhal_address_map.pci_host_base; ret = udma_m2m_copy_prepare_one(&eng->udma, ring->qid, completion.addr, completion.addr + DMA_COMPLETION_MARKER_SIZE, - DMA_COMPLETION_MARKER_SIZE, UDMA_M2M_BARRIER_NONE, false); + DMA_COMPLETION_MARKER_SIZE, barrier_type, false); if (ret) { pr_err("failed to prepare DMA descriptor on nd%02d for %s q%d\n", eng->nd->device_index, eng->udma.name, ring->qid); ret = -1; @@ -342,12 +345,12 @@ static int ndma_memcpy_chunks( struct ndma_eng *eng, struct ndma_ring *ring, str remaining -= chunk_size; pending_transfers++; - //FIXME trace_dma_memcpy(nd, nc_id, src_offset, dst_offset, chunk_size, pending_transfers); + //TODO trace_dma_memcpy(nd, nc_id, src_offset, dst_offset, chunk_size, pending_transfers); } // write completion descriptor, kick off DMAs, record pending xfers and data outstanding and prefetch if requested // - ret = ndma_memcpy_add_completion_desc( eng, ring, dma_ctx->completion_ptr); + ret = ndma_memcpy_add_completion_desc( eng, ring, dma_ctx->completion_ptr, 
UDMA_M2M_BARRIER_NONE); if (ret) { return ret; } @@ -432,7 +435,7 @@ static int ndma_memcpy_offset_move(struct neuron_device *nd, u32 nc_id, dma_addr const int eng_id = ndhal->ndhal_ndmar.ndmar_get_h2t_eng_id(nd, nc_id); // for v2 the last one is reserved for collectives - const int qid = ndhal->ndhal_ndmar.ndmar_get_h2t_qid(nc_id); + const int qid = ndhal->ndhal_ndmar.ndmar_get_h2t_def_qid(nc_id); struct ndma_eng *eng = &nd->ndma_engine[eng_id]; struct ndma_queue *queue = &eng->queues[qid]; @@ -447,7 +450,7 @@ static int ndma_memcpy_offset_move(struct neuron_device *nd, u32 nc_id, dma_addr // 2. usage of the SYNC dma context (basically even though we specify we are using the SYNC ctxt handle outside this routine // the SYNC dma context itself is only used within this routine. // - mutex_lock(&eng->h2t_ring_lock); + mutex_lock(&ring->h2t_ring_lock); // initialize the DMA context dma_ctx->inuse = true; @@ -461,7 +464,12 @@ static int ndma_memcpy_offset_move(struct neuron_device *nd, u32 nc_id, dma_addr dma_ctx->size = size; dma_ctx->smove = smove; dma_ctx->dmove = dmove; - dma_ctx->completion_ptr = ndma_memcpy_get_completion_buf( eng, ring, wait_handle); + dma_ctx->completion_ptr = ndma_memcpy_get_completion_buf( eng, ring, wait_handle); + + if (dma_ctx->completion_ptr == NULL) { + ret = -ENOMEM; + goto fail; + } // Sanity check if ((pdma_ctx != NULL) && (!pdma_ctx->inuse)) { @@ -525,7 +533,7 @@ static int ndma_memcpy_offset_move(struct neuron_device *nd, u32 nc_id, dma_addr ndma_release_dma_ctx( eng, ring, pdma_ctx); - mutex_unlock(&eng->h2t_ring_lock); + mutex_unlock(&ring->h2t_ring_lock); return ret; } @@ -610,7 +618,7 @@ int ndma_memcpy_mc(struct neuron_device *nd, struct mem_chunk *src_mc, struct me } dst_pa += dst_offset; - // FIXME: H2H memcpy's src and dst mc should have dedicated nc_id such as -1 + // TODO: H2H memcpy's src and dst mc should have dedicated nc_id such as -1 if (src_mc->mem_location == MEM_LOC_HOST && dst_mc->mem_location == MEM_LOC_HOST) { 
nc_id = dst_mc->nc_id; } @@ -631,7 +639,7 @@ int ndma_memcpy_mc_wait( struct neuron_device *nd, struct mem_chunk *src_mc, str int ret; const u32 nc_id = ndma_mc_pair_to_nc( src_mc, dst_mc); const int eng_id = ndhal->ndhal_ndmar.ndmar_get_h2t_eng_id(nd, nc_id); - const int qid = ndhal->ndhal_ndmar.ndmar_get_h2t_qid(nc_id); + const int qid = ndhal->ndhal_ndmar.ndmar_get_h2t_def_qid(nc_id); struct ndma_eng *eng = &nd->ndma_engine[eng_id]; struct ndma_queue *queue = &eng->queues[qid]; struct ndma_ring *ring = &queue->ring_info; @@ -895,6 +903,7 @@ struct ndma_h2t_zcdma_context { dma_addr_t dev_addr; // device address u64 size; // size for this transfer bool direction; // direction. true = to device + bool last; // last transfer for the entire request. u64 start_time; // start time for this transfer int nr_pages; // number of pages for this transfer int nr_desc; // number of descriptors which is equal to pending transfers -1 @@ -909,7 +918,34 @@ struct ndma_h2t_zcdma_context { // dma time > (pin time + setup time + completion update + initial poll wait) // That's the simple explanation. It's a tad more complicated in trading off smaller // transfers where even if that equation doesn't hold, the overlap can be beneficial. - // Right now the sweet spot looks to be ~ 32 pages + // Right now the sweet spot looks to be ~ 64 pages. More tuning is required. + // +#define NDMA_ZC_MIN_PAGES_PER_XFER 64 + +/** ndma_calc_zc_pin_size() + * + * determine how many pages to pin per step for zercopy dma pipelining. 
+ */ +static size_t ndma_calc_zc_pin_size(size_t size) +{ + if (size > NDMA_ZC_PAGES_PER_XFER * PAGE_SIZE * 2) { + return NDMA_ZC_PAGES_PER_XFER * PAGE_SIZE; + } else if (size <= NDMA_ZC_MIN_PAGES_PER_XFER * PAGE_SIZE) { + return size; + } + return (size/2 + PAGE_SIZE-1) & ~(PAGE_SIZE-1); +} + +/** + * ndma_zerocopy_supported() + * + * zero copy is not support for platforms that require retry + * + */ +bool ndma_zerocopy_supported(void) +{ + return !ndhal->ndhal_ndma.ndma_retry_memcpy || zerocopy_trn1_override; +} /** * ndma_build_n_issue_zc_descs() @@ -919,7 +955,7 @@ struct ndma_h2t_zcdma_context { * * explain how alignment is handled. * - * Todo: + * TODO: * go i=0 to nr_pages * Think about using some permanent location in HBM as source for completion descriptor update. Like * why are we reading across the PCIe bus to fetch completion data. @@ -934,6 +970,7 @@ static int ndma_build_n_issue_zc_descs( struct ndma_h2t_zcdma_context * dma_ctx) int i = 0; u64 chunk_size; int pending_transfers = 0; + int barrier_type; while (i < dma_ctx->nr_pages) { dma_addr_t src_addr; @@ -951,10 +988,10 @@ static int ndma_build_n_issue_zc_descs( struct ndma_h2t_zcdma_context * dma_ctx) contig_size += PAGE_SIZE; } - if (dma_ctx->direction) { + if (dma_ctx->direction) { // write to device src_addr = (contig_start + offset) | pci_host_base; dst_addr = dev_addr; - } else { + } else { // read from device src_addr = dev_addr; dst_addr = (contig_start + offset) | pci_host_base; } @@ -966,8 +1003,25 @@ static int ndma_build_n_issue_zc_descs( struct ndma_h2t_zcdma_context * dma_ctx) chunk_size = (remaining < contig_size) ? 
remaining : contig_size; if (chunk_size > MAX_DMA_DESC_SIZE) chunk_size = MAX_DMA_DESC_SIZE; + // on the read path completion write follows data writes in order, that means when the completion write finishes + // it's guaranteed that all the data has been written, no need for a barrier + + // on the write path we only need the barrier for the last transfer (the last set of pinned pages), why? + // HBM writes (data) and host write (completion) take different path through data fabric. That means w/o a barrier + // it's possible for the completion to be written before the data. + + // We don't need the barrier to ensure it's safe to unpin. + // s2m descriptors are executed in order, that means when s2m completion write is executed all s2m data writes + // have been executed as well, that means all m2s data reads have been executed, that means it's safe to unpin - ret = udma_m2m_copy_prepare_one(&dma_ctx->eng->udma, dma_ctx->ring->qid, src_addr, dst_addr, chunk_size, remaining == chunk_size, false); // set the barrier if the last descriptor + // use WRITE_BARRIER on V2 (set on the last data descriptor) + // use SOW on V3+ (set on completion descriptor below) + if (narch_get_arch() == NEURON_ARCH_V2) + barrier_type = (remaining == chunk_size && dma_ctx->direction && dma_ctx->last) ? UDMA_M2M_BARRIER_WRITE_BARRIER : UDMA_M2M_BARRIER_NONE; + else + barrier_type = UDMA_M2M_BARRIER_NONE; + + ret = udma_m2m_copy_prepare_one(&dma_ctx->eng->udma, dma_ctx->ring->qid, src_addr, dst_addr, chunk_size, barrier_type, false); if (ret) { pr_err("failed to prepare DMA descriptor for %s q%d\n", dma_ctx->eng->udma.name, dma_ctx->ring->qid); goto error; @@ -983,7 +1037,11 @@ static int ndma_build_n_issue_zc_descs( struct ndma_h2t_zcdma_context * dma_ctx) dma_ctx->nr_desc = pending_transfers; - ret = ndma_memcpy_add_completion_desc( dma_ctx->eng, dma_ctx->ring, dma_ctx->completion_ptr); + if (narch_get_arch() != NEURON_ARCH_V2) + barrier_type = (dma_ctx->direction && dma_ctx->last) ? 
UDMA_M2M_BARRIER_SOW: UDMA_M2M_BARRIER_NONE; + else + barrier_type = UDMA_M2M_BARRIER_NONE; + ret = ndma_memcpy_add_completion_desc( dma_ctx->eng, dma_ctx->ring, dma_ctx->completion_ptr, barrier_type); if (ret) { goto error; } @@ -1001,279 +1059,192 @@ static int ndma_build_n_issue_zc_descs( struct ndma_h2t_zcdma_context * dma_ctx) } /** - * ndma_zero_copy_wait_for_completion() + * ndma_zerocopy_wait_for_completion() * * * */ -static int ndma_zero_copy_wait_for_completion( struct neuron_device *nd, u32 nc_id, struct ndma_eng *eng, struct ndma_ring *ring, - struct ndma_h2t_zcdma_context * dma_ctx, struct ndma_h2t_zcdma_context * ndma_ctx) +static int ndma_zerocopy_wait_for_completion( struct neuron_device *nd, u32 nc_id, struct ndma_eng *eng, struct ndma_ring *ring, + struct ndma_h2t_zcdma_context * dma_ctx, struct ndma_h2t_zcdma_context * ndma_ctx) { int ret; - bool async = true; - - while(true) { - ret = ndma_memcpy_wait_for_completion(eng, ring, dma_ctx->nr_desc+1, dma_ctx->completion_ptr, async, false); // FIXM we shouldn't even be waiting 1usec here - - if (ret == 0) { - if (dma_ctx->direction) - unpin_user_pages( dma_ctx->page_list, dma_ctx->nr_pages); - else - unpin_user_pages_dirty_lock( dma_ctx->page_list, dma_ctx->nr_pages, true); - return ret; - } - - // if the memcpy starts within a NeuronCore reset window, - // the timeout is possible due to DMA hanging caused by hardware issue. - // if so, restart DMA and retry the memcpy - if (narch_get_arch() != NEURON_ARCH_V2) { // FIXME - this should be if (!ndhal.tpb_reset_dma_retry) or part of dma_ctx - break; - } - - if (!nr_op_in_reset_wnd(dma_ctx->start_time, nd)) { - break; - } - - pr_info( "Failed to copy memory during a NeuronCore reset: nd %d, host %#llx, dev %#llx, size %llu. 
Retrying the copy.\n", - nd->device_index, (dma_addr_t)dma_ctx->host_addr, dma_ctx->dev_addr, dma_ctx->size); - dma_ctx->start_time = get_jiffies_64(); - if (ndma_ctx != NULL) - ndma_ctx->start_time = get_jiffies_64(); - - ret = ndmar_h2t_ring_init(eng, ring->qid); - - if (ret) { - pr_err("H2T ring init failed on nd %d: ret %d\n", nd->device_index, ret); - break; - } + ret = ndma_memcpy_wait_for_completion(eng, ring, dma_ctx->nr_desc+1, dma_ctx->completion_ptr, true, false); + //atomic_sub(dma_ctx->nr_desc+1, &dma_ctx->ring->h2t_outstanding_desc); - // restart dmas - // - ret = ndma_build_n_issue_zc_descs( dma_ctx); - if (ret) - break; - - if (ndma_ctx != NULL) { - ret = ndma_build_n_issue_zc_descs( ndma_ctx); - if (ret) { - ndma_memcpy_wait_for_completion(eng, ring, dma_ctx->nr_desc+1, dma_ctx->completion_ptr, false, false); - break; - } - } - - async = false; + if (ret == 0) { + if (dma_ctx->direction) + unpin_user_pages(dma_ctx->page_list, dma_ctx->nr_pages); + else + unpin_user_pages_dirty_lock(dma_ctx->page_list, dma_ctx->nr_pages, true); + return ret; } // If we are exiting here, we've failed so unpin pages associated with the DMA. If the next DMA // context is valid, do an obligatory wait for the DMA operation so we don't splat data on someone // else's memory just in case the physical pages are reassigned after unpinning. // - unpin_user_pages( dma_ctx->page_list, dma_ctx->nr_pages); + unpin_user_pages(dma_ctx->page_list, dma_ctx->nr_pages); - // blindly wait - + // blindly wait if (ndma_ctx != NULL) { ndma_memcpy_wait_for_completion(eng, ring, ndma_ctx->nr_desc+1, ndma_ctx->completion_ptr, false, false); - unpin_user_pages( ndma_ctx->page_list, ndma_ctx->nr_pages); + unpin_user_pages(ndma_ctx->page_list, ndma_ctx->nr_pages); } - + return ret; } -/** - * ndma_memcpy_zero_copy() - * - * dma data between a user space virtual address range and a contiguous location in device memory. 
- * In order to do this, we need to know the physical pages are associated with - * the user virtual address range and we need to make sure those physical pages stay - * associated with the user virtual address range while the DMA is happening. - * - * How do we do this? By asking the kernel to pin the physical pages in memory until we are - * done with them. But our transaction could be large, the physical pages won't be contiguous, - * and pinning takes CPU cycles, so we break the dma transfer up into a series of smaller transfers - * where we pipeline the pinning of physical pages with dma transfers. - * - * We use pin_user_pages_fast() to reduce pinning overhead because we know the process can't go - * away while we are down here doing our thing in the kernel within a single IOCTL call. - * - * We ping pong back and forth between two dma contexts. So while dma for context A is in progress, - * we are pinning pages and starting dmas for context B. - * - * Algorithm goes like this: - * initial a pair of dma contexts - * prev dma ctx = null - * lock() - * while still more data remaining - * current dma ctx = next available context - * init current dma context - * calc size of the transfer for this dma context. We want to transfer up to page boundaries - * calc number of pages that need to be pinned for this dma - * pin host pages in memory - * generate descriptors for - * if prev dma ctx != NULL, wait for the prev dma to complete - * update host address, device address and ammount remaining - * wait for the last dma ctx to complete - * unlock() - * free resources - * - * Notes: - * unpinning responsibilities. Up until a dma is successfully launched, this routine is responsible for unpinning - * host memory. After that ndma_zero_copy_wait_for_completion() owns responsibility for unpinning pages. - * - * We don't do this here, but pinning user pages across system (IOCTL) calls has a number of additional requirements. 
- * We would have to cleanup any pinned pages when the process goes away, so any pinned pages have to get tracked in - * process context. - * - * direction == true means write from host to device - * - */ - -static int ndma_memcpy_zero_copy(struct neuron_device *nd, u32 nc_id, void * host_addr, dma_addr_t dev_addr, u64 size, bool direction) +int ndma_memcpy_zerocopy(struct neuron_device *nd, + u32 nc_id, + const nrt_tensor_batch_op_t *ops, + u32 num_ops, + dma_addr_t dev_base, + int qid, + bool direction) { - int ret = 0; - - const int eng_id = ndhal->ndhal_ndmar.ndmar_get_h2t_eng_id(nd, nc_id); - const int qid = ndhal->ndhal_ndmar.ndmar_get_h2t_qid(nc_id); // TODO - this needs direction or transfer type to select qid - struct ndma_eng *eng = &nd->ndma_engine[eng_id]; - struct ndma_queue *queue = &eng->queues[qid]; - struct ndma_ring *ring = &queue->ring_info; - struct ndma_h2t_zcdma_context dma_ctx_tbl[2] = {0}; - struct ndma_h2t_zcdma_context * dma_ctx; - struct ndma_h2t_zcdma_context * pdma_ctx = NULL; - int next_dma_idx = 0; - int i; - u64 remaining = size; - u64 cpy_size = (NDMA_ZC_PAGES_PER_XFER*PAGE_SIZE < size) ? 
NDMA_ZC_PAGES_PER_XFER*PAGE_SIZE : size; - int nr_pinned; - - // initialize the static fields in the dma contexts that are the same for every operation - // - for (i=0;i< 2;i++) { - dma_ctx_tbl[i].eng = eng; - dma_ctx_tbl[i].ring = ring; - dma_ctx_tbl[i].direction = direction; - dma_ctx_tbl[i].page_list = kcalloc( NDMA_ZC_PAGES_PER_XFER, sizeof(struct page *), GFP_KERNEL); - dma_ctx_tbl[i].completion_ptr = kmalloc(DMA_COMPLETION_MARKER_SIZE * 2, GFP_KERNEL); - - if ((dma_ctx_tbl[i].page_list == NULL) || (dma_ctx_tbl[i].completion_ptr == NULL)) { - pr_err("could not allocate memory for dma contexts on nd %d\n", nd->device_index); - goto fail; - } - } - pdma_ctx = NULL; - - mutex_lock(&eng->h2t_ring_lock); - - while (remaining) { - unsigned long offset = (unsigned long)(host_addr) & (PAGE_SIZE-1); - dma_ctx = &dma_ctx_tbl[next_dma_idx]; - dma_ctx->start_time = get_jiffies_64(); - dma_ctx->host_addr = host_addr; - dma_ctx->dev_addr = dev_addr; - dma_ctx->size = (cpy_size == remaining) ? cpy_size : cpy_size - offset; // slightly non-obvious, we are setting up xfer size - // that only the first xfer has its starting address - // not aligned to the page boundary. First time around - // offset >= 0 and cpy_size <= xfer size. Other times - // host_addr is aligned, offset = 0 and cpy_size = xfer_size - dma_ctx->nr_pages = DIV_ROUND_UP(offset + dma_ctx->size, PAGE_SIZE); - - //__GFP_SKIP_ZERO - nr_pinned = pin_user_pages_fast((unsigned long)dma_ctx->host_addr & PAGE_MASK, dma_ctx->nr_pages, direction ? 
0 : FOLL_WRITE, dma_ctx->page_list); - if (nr_pinned != dma_ctx->nr_pages) { - // if failed pin_fast because of page fault, do the regular pinning - if (nr_pinned > 0) - unpin_user_pages( dma_ctx->page_list, nr_pinned); + int ret = 0; + const int eng_id = ndhal->ndhal_ndmar.ndmar_get_h2t_eng_id(nd, nc_id); + struct ndma_eng *eng = &nd->ndma_engine[eng_id]; + struct ndma_queue *queue = &eng->queues[qid]; + struct ndma_ring *ring = &queue->ring_info; + struct ndma_h2t_zcdma_context dma_ctx_tbl[2] = {0}; + struct ndma_h2t_zcdma_context *pdma_ctx = NULL; + int next_dma_idx = 0; + int i = 0; + bool locked = false; + + // sanity check ring is owned by nc_id + if (!ndmar_h2t_ring_is_owner(ring, nc_id)) { + pr_err("nd%02d: attempting to use qid %d that was not assigned to nc %d\n", nd->device_index, qid, nc_id); + return -ENOENT; + } + + // initialize the static fields in the dma contexts that are the same for every operation + for (i=0;i< 2;i++) { + dma_ctx_tbl[i].eng = eng; + dma_ctx_tbl[i].ring = ring; + dma_ctx_tbl[i].direction = direction; + dma_ctx_tbl[i].page_list = kcalloc( NDMA_ZC_PAGES_PER_XFER, sizeof(struct page *), GFP_KERNEL); + dma_ctx_tbl[i].completion_ptr = kmalloc(DMA_COMPLETION_MARKER_SIZE * 2, GFP_KERNEL); + + if ((dma_ctx_tbl[i].page_list == NULL) || (dma_ctx_tbl[i].completion_ptr == NULL)) { + pr_err("could not allocate memory for dma contexts on nd %d\n", nd->device_index); + ret = -ENOMEM; + goto fail; + } + } + pdma_ctx = NULL; + + mutex_lock(&ring->h2t_ring_lock); + locked = true; + + // Process all operations with pipelining + for (i = 0; i < num_ops; i++) { + const nrt_tensor_batch_op_t *op = &ops[i]; + u64 remaining = op->size; + void *host_addr = op->buffer; + dma_addr_t dev_addr = dev_base + op->offset; + u64 offset = (unsigned long)host_addr & (PAGE_SIZE - 1); + u64 pin_size = ndma_calc_zc_pin_size(op->size + offset); // pin size is in page units, so include the page offset in size calc + + while (remaining) { + struct ndma_h2t_zcdma_context 
*dma_ctx = &dma_ctx_tbl[next_dma_idx]; + dma_ctx->start_time = get_jiffies_64(); + dma_ctx->host_addr = host_addr; + dma_ctx->dev_addr = dev_addr; + dma_ctx->size = pin_size - offset; // first chunk might not be aligned on the page boundary, all subsequent chunk will be aligned + // and the offset will be 0 + dma_ctx->last = (dma_ctx->size == remaining && i == num_ops - 1); + dma_ctx->nr_pages = DIV_ROUND_UP(pin_size, PAGE_SIZE); + if (dma_ctx->nr_pages > NDMA_ZC_PAGES_PER_XFER) { + pr_err_once("page count too large: %u\n", dma_ctx->nr_pages); + } + + //__GFP_SKIP_ZERO + int nr_pinned = pin_user_pages_fast((unsigned long)dma_ctx->host_addr & PAGE_MASK, dma_ctx->nr_pages, + direction ? 0 : FOLL_WRITE, dma_ctx->page_list); + if (nr_pinned != dma_ctx->nr_pages) { + // if failed pin_fast because of page fault, do the regular pinning + if (nr_pinned > 0) { + unpin_user_pages( dma_ctx->page_list, nr_pinned); + } #if (!defined(RHEL_RELEASE_CODE) && (LINUX_VERSION_CODE >= KERNEL_VERSION(6, 5, 0))) || (defined(RHEL_RELEASE_CODE) && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 6))) - nr_pinned = pin_user_pages((unsigned long)dma_ctx->host_addr & PAGE_MASK, dma_ctx->nr_pages, direction ? 0 : FOLL_WRITE, dma_ctx->page_list); + nr_pinned = pin_user_pages((unsigned long)dma_ctx->host_addr & PAGE_MASK, dma_ctx->nr_pages, direction ? 0 : FOLL_WRITE, dma_ctx->page_list); #else - nr_pinned = pin_user_pages((unsigned long)dma_ctx->host_addr & PAGE_MASK, dma_ctx->nr_pages, direction ? 0 : FOLL_WRITE, dma_ctx->page_list, NULL); + nr_pinned = pin_user_pages((unsigned long)dma_ctx->host_addr & PAGE_MASK, dma_ctx->nr_pages, direction ? 
0 : FOLL_WRITE, dma_ctx->page_list, NULL); #endif - if (nr_pinned != dma_ctx->nr_pages) { - ret = -ENOMEM; // could use -EBUSY instead - pr_err("could not pin host pages for zero copy dma on nd %d: nr_pinned %d\n", nd->device_index, nr_pinned); - - if (nr_pinned > 0) - unpin_user_pages( dma_ctx->page_list, nr_pinned); - // cleanup: wait for prev dma to complete (which also unpins pages) - if (pdma_ctx != NULL) - ndma_zero_copy_wait_for_completion( nd, nc_id, eng, ring, pdma_ctx, NULL); - goto fail; - } - } - - // TODO need to have this for other architectures - // for (i=0; i < dma_ctx->nr_pages; i++) { - // struct device - // dma_ctx->addr[i] = dma_map_page( nd->pdev->dev, dma_ctx_page_list[i], 0, PAGE_SIZE, DMA_TO_DEVICE/DMA_FROM_DEVICE); - // ret = dma_mapping_error(dev->dev, dma_ctx->addr[i]); - // if (ret) { } - // } - // flush_cache_range(vma, - // - // TODO - may need a callback here to check descriptors - - ret = ndma_build_n_issue_zc_descs( dma_ctx); - if (ret) { - unpin_user_pages( dma_ctx->page_list, dma_ctx->nr_pages); - // cleanup: wait for prev dma to complete (which also unpins pages) - if (pdma_ctx != NULL) ndma_zero_copy_wait_for_completion( nd, nc_id, eng, ring, pdma_ctx, NULL); - goto fail; - } - - if (pdma_ctx != NULL) { - ret = ndma_zero_copy_wait_for_completion( nd, nc_id, eng, ring, pdma_ctx, dma_ctx); - if (ret) - goto fail; - } - pdma_ctx = dma_ctx; - next_dma_idx = (next_dma_idx+1) % 2; - - remaining -= dma_ctx->size; - host_addr += dma_ctx->size; - dev_addr += dma_ctx->size; - cpy_size = (remaining < cpy_size) ? 
remaining : cpy_size; - } - - ret = ndma_zero_copy_wait_for_completion( nd, nc_id, eng, ring, pdma_ctx, NULL); + if (nr_pinned != dma_ctx->nr_pages) { + ret = -ENOMEM; // could use -EBUSY instead + pr_err("could not pin host pages for zero copy dma on nd %d: nr_pinned %d\n", nd->device_index, nr_pinned); + + if (nr_pinned > 0) { + unpin_user_pages( dma_ctx->page_list, nr_pinned); + } + // cleanup: wait for prev dma to complete (which also unpins pages) + if (pdma_ctx != NULL) { + ndma_zerocopy_wait_for_completion( nd, nc_id, eng, ring, pdma_ctx, NULL); + } + goto fail; + } + } + + // TODO need to have this for other architectures + // for (i=0; i < dma_ctx->nr_pages; i++) { + // struct device + // dma_ctx->addr[i] = dma_map_page( nd->pdev->dev, dma_ctx_page_list[i], 0, PAGE_SIZE, DMA_TO_DEVICE/DMA_FROM_DEVICE); + // ret = dma_mapping_error(dev->dev, dma_ctx->addr[i]); + // if (ret) { } + // } + // flush_cache_range(vma, + + ret = ndma_build_n_issue_zc_descs(dma_ctx); + if (ret) { + unpin_user_pages( dma_ctx->page_list, dma_ctx->nr_pages); + // cleanup: wait for prev dma to complete (which also unpins pages) + if (pdma_ctx != NULL) { + ndma_zerocopy_wait_for_completion(nd, nc_id, eng, ring, pdma_ctx, NULL); + } + goto fail; + } + + if (pdma_ctx != NULL) { + ret = ndma_zerocopy_wait_for_completion(nd, nc_id, eng, ring, pdma_ctx, dma_ctx); + if (ret) { + goto fail; + } + } + + pdma_ctx = dma_ctx; + next_dma_idx = (next_dma_idx+1) % 2; + + remaining -= dma_ctx->size; + host_addr += dma_ctx->size; + dev_addr += dma_ctx->size; + pin_size = (remaining < pin_size) ? 
remaining : pin_size; + offset = 0; + } + } + + + // Wait for the last chunk + if (pdma_ctx) { + ret = ndma_zerocopy_wait_for_completion( nd, nc_id, eng, ring, pdma_ctx, NULL); + } fail: - - // release resources - // - for (i=0;i< 2;i++) { - if (dma_ctx_tbl[i].page_list != NULL) - kfree(dma_ctx_tbl[i].page_list); - if (dma_ctx_tbl[i].completion_ptr != NULL) { - kfree(dma_ctx_tbl[i].completion_ptr); - } - } - mutex_unlock(&eng->h2t_ring_lock); - - return ret; -} - -/** - * ndma_memcpy_zero_copy_mc() - * - * Wrapper around ndma_memcpy_zero_copy() that pulls nc_id and device phyical address from - * the mem chunk. - * - * Todo: - * Range check the device address here. - * - * Assumptions: - * caller has done access_ok() check on the host address - * if (!access_ok(blah) return -EFAULT; - * or check_copy_size() - */ -int ndma_memcpy_zero_copy_mc( struct neuron_device *nd, void * host_addr, struct mem_chunk *dev_mc, u64 dev_offset, u64 size, bool direction) -{ - dma_addr_t dev_addr; - u32 nc_id; - - nc_id = ndma_mc_pair_to_nc( dev_mc, dev_mc); - dev_addr = ndma_mc_to_pa( dev_mc) + dev_offset; // range has been checked by the caller - - return ndma_memcpy_zero_copy(nd, nc_id, host_addr, dev_addr, size, direction); + // release resources + for (i = 0; i < 2; i++) { + if (dma_ctx_tbl[i].page_list != NULL) + kfree(dma_ctx_tbl[i].page_list); + if (dma_ctx_tbl[i].completion_ptr != NULL) { + kfree(dma_ctx_tbl[i].completion_ptr); + } + } + if (locked) { + mutex_unlock(&ring->h2t_ring_lock); + } + + return ret; } diff --git a/neuron_dma.h b/neuron_dma.h index 79f8254..012bef4 100644 --- a/neuron_dma.h +++ b/neuron_dma.h @@ -165,7 +165,7 @@ int ndma_memcpy64k(struct ndma_eng *eng, struct ndma_ring *ring, dma_addr_t src, * add a completion entry to the ring * */ -int ndma_memcpy_add_completion_desc( struct ndma_eng *eng, struct ndma_ring *ring, void * completion_buffer); +int ndma_memcpy_add_completion_desc( struct ndma_eng *eng, struct ndma_ring *ring, void * completion_buffer, 
int barrier_type); /** * Wait for completion by start transfer of a DMA between two host memory locations and polling @@ -174,18 +174,89 @@ int ndma_memcpy_add_completion_desc( struct ndma_eng *eng, struct ndma_ring *rin int ndma_memcpy_wait_for_completion(struct ndma_eng *eng, struct ndma_ring *ring, u32 count, void * ptr, bool async, bool is_d2d); /** - * ndma_memcpy_zero_copy_mc() + * ndma_mc_pair_to_nc - Resolve the neuron core id for two memory chunks. + * @src_mc: Source memory chunk participating in the transfer. + * @dst_mc: Destination memory chunk participating in the transfer. * - * Wrapper around ndma_memcpyzero_copy() that pulls nc_id and device phyical address from - * the mem chunk. + * Returns the NC identifier that owns the DMA engine, favoring the device-side + * chunk when one side resides in host memory. + */ +u32 ndma_mc_pair_to_nc(struct mem_chunk *src_mc, struct mem_chunk *dst_mc); + +/** + * ndma_mc_to_pa - Translate a memory chunk into a DMA-usable physical address. + * @mc: Memory chunk to translate. + * + * Host chunks map through the PCI host BAR, while device chunks already carry + * their physical base address. + */ +dma_addr_t ndma_mc_to_pa(struct mem_chunk *mc); + +/** + * ndma_zerocopy_supported - Check whether zero-copy DMA is permitted. + * + * Architectures that require DMA retry disable the zero-copy pipeline. + */ +bool ndma_zerocopy_supported(void); + +/** + * ndma_memcpy_zerocopy - Perform a pipelined zero-copy DMA transfer. + * @nd: Neuron device whose DMA engine is used. + * @nc_id: Neuron core identifier owning the queue. + * @ops: Array of host buffer descriptors. + * @num_ops: Number of descriptors in @ops. + * @dev_base: Base device physical address for the transfer. + * @qid: Queue identifier to submit descriptors on. + * @direction: true for host-to-device, false for device-to-host. + * + * DMA data between a user space virtual address range and a contiguous location in device memory. 
+ * In order to do this, we need to know the physical pages are associated with + * the user virtual address range and we need to make sure those physical pages stay + * associated with the user virtual address range while the DMA is happening. + * + * How do we do this? By asking the kernel to pin the physical pages in memory until we are + * done with them. But our transaction could be large, the physical pages won't be contiguous, + * and pinning takes CPU cycles, so we break the dma transfer up into a series of smaller transfers + * where we pipeline the pinning of physical pages with dma transfers. * - * Todo: - * Range check the device address here. + * We use pin_user_pages_fast() to reduce pinning overhead because we know the process can't go + * away while we are down here doing our thing in the kernel within a single IOCTL call. + * + * We ping pong back and forth between two dma contexts. So while dma for context A is in progress, + * we are pinning pages and starting dmas for context B. + * + * Algorithm goes like this: + * initial a pair of dma contexts + * prev dma ctx = null + * lock() + * while still more data remaining + * current dma ctx = next available context + * init current dma context + * calc size of the transfer for this dma context. We want to transfer up to page boundaries + * calc number of pages that need to be pinned for this dma + * pin host pages in memory + * generate descriptors for + * if prev dma ctx != NULL, wait for the prev dma to complete + * update host address, device address and ammount remaining + * wait for the last dma ctx to complete + * unlock() + * free resources + * + * Notes: + * unpinning responsibilities. Up until a dma is successfully launched, this routine is responsible for unpinning + * host memory. After that ndma_zerocopy_wait_for_completion() owns responsibility for unpinning pages. + * + * We don't do this here, but pinning user pages across system (IOCTL) calls has a number of additional requirements. 
+ * We would have to cleanup any pinned pages when the process goes away, so any pinned pages have to get tracked in + * process context. * - * Assumptions: - * caller has done access_ok() check on the host address - * if (!access_ok(blah) return -EFAULT; - * or check_copy_size() */ -int ndma_memcpy_zero_copy_mc( struct neuron_device *nd, void * host_addr, struct mem_chunk *dev_mc, u64 dev_offset, u64 size, bool direction); +int ndma_memcpy_zerocopy(struct neuron_device *nd, + u32 nc_id, + const nrt_tensor_batch_op_t *ops, + u32 num_ops, + dma_addr_t dev_base, + int qid, + bool direction); + #endif diff --git a/neuron_dmabuf.c b/neuron_dmabuf.c index e7457ed..6a510e1 100644 --- a/neuron_dmabuf.c +++ b/neuron_dmabuf.c @@ -344,8 +344,13 @@ int ndmabuf_get_fd(u64 va, u64 size, int *dmabuf_fd) fd = dma_buf_fd(dmabuf, exp_info.flags); if (fd < 0) { - pr_err("error %d while installing a file descriptor for dma-buf\n", ret); - ret = -EINVAL; + if (fd == -EMFILE) { + pr_err("dma_buf_fd failed: too many open files\n"); + ret = -EMFILE; + } else { + pr_err("error %d while installing a file descriptor for dma-buf\n", ret); + ret = -EINVAL; + } goto err_dma_buf_put; } diff --git a/neuron_fw_io.c b/neuron_fw_io.c index 44f4a18..8632d04 100644 --- a/neuron_fw_io.c +++ b/neuron_fw_io.c @@ -25,7 +25,6 @@ #include "neuron_reg_access.h" #include "neuron_device.h" #include "neuron_arch.h" -#include "v1/fw_io.h" #include "neuron_fw_io.h" #include "neuron_dhal.h" @@ -46,7 +45,7 @@ int fw_io_ecc_read(void *bar0, uint64_t ecc_offset, uint32_t *ecc_err_count) } void *addr = bar0 + ndhal->ndhal_address_map.bar0_misc_ram_offset + ecc_offset; - int ret = ndhal->ndhal_fw_io.fw_io_read_csr_array(&addr, ecc_err_count, 1, false); + int ret = ndhal->ndhal_fw_io.fw_io_read_csr_array(&addr, ecc_err_count, 1, true); if (ret) { pr_err("failed to get ecc error count from the device for ecc_offset=%llu\n", ecc_offset); return -EIO; @@ -60,7 +59,7 @@ int fw_io_hbm_uecc_repair_state_read(void *bar0, 
uint32_t *hbm_repair_state) int ret; void *addr = bar0 + ndhal->ndhal_address_map.bar0_misc_ram_offset + FW_IO_REG_HBM_REPAIR_STATE_OFFSET; - ret = ndhal->ndhal_fw_io.fw_io_read_csr_array(&addr, hbm_repair_state, 1, false); + ret = ndhal->ndhal_fw_io.fw_io_read_csr_array(&addr, hbm_repair_state, 1, true); if (ret) { pr_err("failed to get hbm reapirable state\n"); return -EIO; @@ -82,7 +81,7 @@ int fw_io_serial_number_read(void *bar0, uint64_t *serial_number) uint32_t serial_number_lo = 0; void *addr_lo = bar0 + ndhal->ndhal_address_map.bar0_misc_ram_offset + FW_IO_REG_SERIAL_NUMBER_LO_OFFSET; - ret = ndhal->ndhal_fw_io.fw_io_read_csr_array(&addr_lo, &serial_number_lo, 1, false); + ret = ndhal->ndhal_fw_io.fw_io_read_csr_array(&addr_lo, &serial_number_lo, 1, true); if (ret) { pr_err("failed to get the lower 32 bits of the serial number from the device\n"); return -EIO; @@ -90,7 +89,7 @@ int fw_io_serial_number_read(void *bar0, uint64_t *serial_number) uint32_t serial_number_hi = 0; void *addr_hi = bar0 + ndhal->ndhal_address_map.bar0_misc_ram_offset + FW_IO_REG_SERIAL_NUMBER_HI_OFFSET; - ret = ndhal->ndhal_fw_io.fw_io_read_csr_array(&addr_hi, &serial_number_hi, 1, false); + ret = ndhal->ndhal_fw_io.fw_io_read_csr_array(&addr_hi, &serial_number_hi, 1, true); if (ret) { pr_err("failed to get the higher 32 bits of the serial number from the device\n"); return -EIO; @@ -113,7 +112,7 @@ int fw_io_device_power_read(void *bar0, u32 *power, unsigned die) // Read power utilization from MiscRAM. The power utilization for each die are set up in contiguous 32 bit // miscram registers, so we can treat it like an array of uint32s for our purposes. 
void *addr = bar0 + ndhal->ndhal_address_map.bar0_misc_ram_offset + FW_IO_REG_POWER_UTIL_D0_OFFSET + 4*die; - ret = ndhal->ndhal_fw_io.fw_io_read_csr_array(&addr, power, 1, false); + ret = ndhal->ndhal_fw_io.fw_io_read_csr_array(&addr, power, 1, true); if (ret) { pr_err("failed to get device power from the device, ret = %d\n", ret); } @@ -126,7 +125,7 @@ int fw_io_api_version_read(void * bar0, u32 *version) int ret; void *addr = bar0 + ndhal->ndhal_address_map.bar0_misc_ram_offset + FW_IO_REG_API_VERSION_OFFSET; - ret = ndhal->ndhal_fw_io.fw_io_read_csr_array(&addr, version, 1, false); + ret = ndhal->ndhal_fw_io.fw_io_read_csr_array(&addr, version, 1, true); if (ret) { pr_err("failed to get api version from the device, ret = %d\n", ret); } @@ -134,6 +133,19 @@ int fw_io_api_version_read(void * bar0, u32 *version) return ret; } +int fw_io_server_info_read(void *bar0, u32 *server_info) +{ + int ret; + + void *addr = bar0 + ndhal->ndhal_address_map.bar0_misc_ram_offset + FW_IO_REG_SERVER_RACK_ID_OFFSET; + ret = ndhal->ndhal_fw_io.fw_io_read_csr_array(&addr, server_info, 1, true); + if (ret) { + pr_err("failed to get server info from the device, ret = %d\n", ret); + } + + return ret; +} + int fw_io_device_id_read(void *bar0, u32 *device_id) { void * addr = bar0 + ndhal->ndhal_address_map.bar0_misc_ram_offset + FW_IO_REG_DEVICE_ID_OFFSET; @@ -217,14 +229,30 @@ static void dx_crc32c_add(const u8 *data, size_t len, u32 *csum) } } -static u32 crc32c(const u8 *data, size_t len) +static const u32 fw_io_cmd_timeout_tbl[FW_IO_CMD_MAX] = { + 0, // cmd 0 + (1000 * 1000 * 1), // cmd 1 (FW_IO_CMD_READ) + (1000 * 1000 * 1), // cmd 2 (FW_IO_CMD_POST_TO_CW) + (1000 * 1000 * 60) // cmd 3 (FW_IO_CMD_SET_POWER_PROFILE) +}; + +static const u32 fw_io_cmd_retry_tbl[FW_IO_CMD_MAX] = { + 0, // cmd 0 + 15, // cmd 1 (FW_IO_CMD_READ) + 15, // cmd 2 (FW_IO_CMD_POST_TO_CW) + 3 // cmd 3 (FW_IO_CMD_SET_POWER_PROFILE) +}; + +static u32 crc32c(const u8 *hdr, const u8 *data, size_t len) { u32 csum = 
0xffffffff; + if (hdr != NULL) + dx_crc32c_add(hdr, 8, &csum); dx_crc32c_add(data, len, &csum); return csum ^ 0xffffffff; } -static int fw_io_execute_request(struct fw_io_ctx *ctx, u8 command_id, const u8 *req, u32 req_size, +int fw_io_execute_request(struct fw_io_ctx *ctx, u8 command_id, const u8 *req, u32 req_size, u8 *resp, u32 resp_size) { int ret; @@ -240,7 +268,11 @@ static int fw_io_execute_request(struct fw_io_ctx *ctx, u8 command_id, const u8 return -EINVAL; } - mutex_lock(&ctx->lock); + // HACK: Skip mutex lock for POST_TO_CW commands as fw_io_post_metric() already holds the lock + // TODO: Remove this hack implementation when legacy API is deprecated + if (command_id != FW_IO_CMD_POST_TO_CW) { + mutex_lock(&ctx->lock); + } int i; for (i=0; i < FW_IO_RD_RETRY; i++){ @@ -250,26 +282,27 @@ static int fw_io_execute_request(struct fw_io_ctx *ctx, u8 command_id, const u8 ctx->next_seq_num = 1; memcpy(ctx->request->data, req, req_size); - ctx->request->sequence_number = ctx->next_seq_num; - ctx->request->command_id = command_id; - ctx->request->size = req_size + sizeof(struct fw_io_request); - ctx->request->crc32 = 0; - ctx->request->crc32 = crc32c((const u8 *)ctx->request, ctx->request->size); + ctx->request->request_hdr.hdr.sequence_number = ctx->next_seq_num; + ctx->request->request_hdr.hdr.command_id = command_id; + ctx->request->request_hdr.hdr.size = req_size + sizeof(struct fw_io_request); + ctx->request->request_hdr.hdr.crc32 = 0; + ctx->request->request_hdr.hdr.crc32 = crc32c((const u8 *)&ctx->request->request_hdr, ctx->request->data, ctx->request->request_hdr.hdr.size - sizeof(ctx->request->request_hdr.hdr)); // make sure the sequence number we will wait on is not the same - ctx->response->sequence_number = 0; + ctx->response->response_hdr.hdr.sequence_number = 0; dma_rmb(); fw_io_trigger(ctx->bar0); // now wait for resp->seq == req->seq which indicates that request has been completed and // we have a response ktime_t start_time = ktime_get(); - 
volatile u8 *fwio_seq = (volatile u8 *)&ctx->response->sequence_number; + volatile u8 *fwio_seq = (volatile u8 *)&ctx->response->response_hdr.hdr.sequence_number; + do { resp_seq = READ_ONCE(*fwio_seq); if (resp_seq == ctx->next_seq_num) break; msleep(1); - } while ( ktime_to_us(ktime_sub(ktime_get(), start_time)) < FW_IO_RD_TIMEOUT); + } while ( ktime_to_us(ktime_sub(ktime_get(), start_time)) < FW_IO_RD_TIMEOUT); ret = -1; if (resp_seq != ctx->next_seq_num) { @@ -277,29 +310,131 @@ static int fw_io_execute_request(struct fw_io_ctx *ctx, u8 command_id, const u8 pr_err("seq: %u, cmd: %u timed out\n", ctx->next_seq_num, command_id); continue; } - if (ctx->response->error_code == FW_IO_SUCCESS) { - if ((ctx->response->size - sizeof(struct fw_io_response)) > resp_size) { + if (ctx->response->response_hdr.hdr.error_code == FW_IO_SUCCESS) { + if ((ctx->response->response_hdr.hdr.size - sizeof(struct fw_io_response)) > resp_size) { // this is probably not possible pr_err("seq: %u, cmd: %u response too large (%u)\n", ctx->next_seq_num, - command_id, ctx->response->size); + command_id, ctx->response->response_hdr.hdr.size); goto done; } memcpy(resp, ctx->response->data, - ctx->response->size - sizeof(struct fw_io_response)); + ctx->response->response_hdr.hdr.size - sizeof(struct fw_io_response)); ret = 0; goto done; } ctx->fw_io_err_count++; pr_err(KERN_ERR "seq: %u, cmd: %u failed %u\n", ctx->next_seq_num, command_id, - ctx->response->error_code); + ctx->response->response_hdr.hdr.error_code); // if we get an unsupported command response, don't retry - if (ctx->response->error_code == FW_IO_UNKNOWN_COMMAND) { + if (ctx->response->response_hdr.hdr.error_code == FW_IO_UNKNOWN_COMMAND) { ret = -1; goto done; } } done: + // HACK: Only unlock if we locked (not FW_IO_CMD_POST_TO_CW) + // TODO: Remove this hack implementation when legacy API is deprecated + if (command_id != FW_IO_CMD_POST_TO_CW) { + mutex_unlock(&ctx->lock); + } + return ret; +} + +int 
fw_io_execute_request_new(struct fw_io_ctx *ctx, u8 command_id, const u8 *req, u32 req_size, u8 *resp, u32 resp_size) +{ + int ret; + int i, j; + union fw_io_request_hdr req_header; + + u32 api_version_num = 0; + + ret = fw_io_api_version_read(ctx->bar0, &api_version_num); + + if ((ret != 0) || (api_version_num < FW_IO_NEW_READLESS_READ_MIN_API_VERSION)) { + pr_info_once("Pacific version %d, using legacy Pacific/Runtime comm framework", api_version_num); + return -ENOTSUPP; + } + + mutex_lock(&ctx->lock); + + u32 retry_count = (command_id < FW_IO_CMD_MAX) ? fw_io_cmd_retry_tbl[command_id] : FW_IO_RD_RETRY; + for (i=0; i < retry_count; i++){ + if (++ctx->next_seq_num == 0) + ctx->next_seq_num = 1; + + req_header.hdr.sequence_number = ctx->next_seq_num; + req_header.hdr.command_id = command_id; + req_header.hdr.size = req_size + sizeof(req_header); + req_header.hdr.crc32 = 0; + req_header.hdr.crc32 = crc32c((const u8 *)&req_header, req, req_size); + + // Write data + if (req_size > 0) { + u32 *data = (u32*)req; + for (j=0; j < (req_size + 3) / 4; j++) { + reg_write32(ctx->bar0 + ndhal->ndhal_address_map.bar0_misc_ram_offset + FW_IO_REG_DATA_OFFSET + j*4, data[j]); + } + } + + // Write header + reg_write32(ctx->bar0 + ndhal->ndhal_address_map.bar0_misc_ram_offset + FW_IO_REG_REQUEST_BASE_ADDR_HIG_OFFSET, req_header.reg.dw0); + reg_write32(ctx->bar0 + ndhal->ndhal_address_map.bar0_misc_ram_offset + FW_IO_REG_REQUEST_BASE_ADDR_LOW_OFFSET, req_header.reg.dw1); + + // Zero response header + reg_write32(ctx->bar0 + ndhal->ndhal_address_map.bar0_misc_ram_offset + FW_IO_REG_RESPONSE_BASE_ADDR_HIGH_OFFSET, 0); + reg_write32(ctx->bar0 + ndhal->ndhal_address_map.bar0_misc_ram_offset + FW_IO_REG_RESPONSE_BASE_ADDR_LOW_OFFSET, 0); + + // Set ack and trigger + reg_write32(ctx->bar0 + ndhal->ndhal_address_map.bar0_misc_ram_offset + FW_IO_REG_ACK_OFFSET, 1); + reg_write32(ctx->bar0 + ndhal->ndhal_address_map.bar0_misc_ram_offset + FW_IO_REG_TRIGGER_INT_NOSEC_OFFSET, 1); + + // Poll 
for completion
+		ktime_t start_time = ktime_get();
+		u32 trigger;
+		u32 timeout = (command_id < FW_IO_CMD_MAX) ? fw_io_cmd_timeout_tbl[command_id] : FW_IO_RD_TIMEOUT;
+		do {
+			reg_read32(ctx->bar0 + ndhal->ndhal_address_map.bar0_misc_ram_offset + FW_IO_REG_TRIGGER_INT_NOSEC_OFFSET, &trigger);
+			if (!trigger) break;
+			msleep(1);
+		} while (ktime_to_us(ktime_sub(ktime_get(), start_time)) < timeout);
+		if (trigger) {
+			if (command_id != FW_IO_CMD_POST_TO_CW)
+				pr_err("seq: %u, cmd: %u timed out\n", ctx->next_seq_num, command_id);
+			continue;
+		}
+
+		// Read response header.  NOTE(review): this reads the REQUEST header register
+		// (FW_IO_REG_REQUEST_BASE_ADDR_HIG_OFFSET); presumably firmware posts the
+		// response header back over it — confirm against firmware protocol docs.
+		union fw_io_response_hdr resp_header;
+		reg_read32(ctx->bar0 + ndhal->ndhal_address_map.bar0_misc_ram_offset + FW_IO_REG_REQUEST_BASE_ADDR_HIG_OFFSET, &resp_header.reg.dw0);
+
+		if (resp_header.hdr.sequence_number != ctx->next_seq_num) {
+			if (command_id != FW_IO_CMD_POST_TO_CW)
+				pr_err("seq: %u, cmd: %u seq mismatch\n", ctx->next_seq_num, command_id);
+			continue;
+		}
+
+		if (resp_header.hdr.error_code == FW_IO_SUCCESS) {
+			u32 data_size = resp_header.hdr.size - sizeof(resp_header);
+			if (data_size > 0 && resp != NULL) {
+				u32 copy_size = min(resp_size, data_size);
+				u32 *resp_data = (u32*)resp;
+				for (j = 0; j < (copy_size + 3) / 4; j++) {
+					reg_read32(ctx->bar0 + ndhal->ndhal_address_map.bar0_misc_ram_offset + FW_IO_REG_DATA_OFFSET + j*4, &resp_data[j]);
+				}
+			}
+			ret = 0;
+			break;
+		}
+
+		ctx->fw_io_err_count++;
+		// pr_err() already prepends KERN_ERR via pr_fmt(); passing KERN_ERR in the
+		// format string would embed the raw level-prefix bytes in the message.
+		pr_err("seq: %u, cmd: %u failed %u\n", ctx->next_seq_num, command_id, resp_header.hdr.error_code);
+		if (resp_header.hdr.error_code == FW_IO_UNKNOWN_COMMAND) {
+			ret = -1;
+			break;
+		}
+	}
+	mutex_unlock(&ctx->lock);
 	return ret;
 }
 
@@ -319,7 +454,7 @@ int fw_io_read(struct fw_io_ctx *ctx, u64 addr_in[], u32 val_out[], u32 num_req)
 	if (should_fail(&neuron_fail_fwio_read, 1))
 		return -ETIMEDOUT;
 #endif
-	return fw_io_execute_request(ctx, FW_IO_CMD_READ, (u8 *)addr_in, sizeof(u64) * num_req,
+	return ndhal->ndhal_fw_io.fw_io_execute_request(ctx, FW_IO_CMD_READ, (u8 *)addr_in, 
sizeof(u64) * num_req, (u8 *)val_out, sizeof(u32) * num_req); } @@ -453,6 +588,7 @@ int fw_io_post_metric(struct fw_io_ctx *ctx, u8 *data, u32 size) u32 padded_u32 = 0; u32 *m = (u32 *)data; int i; + int ret; #ifdef CONFIG_FAULT_INJECTION if (should_fail(&neuron_fail_fwio_post_metric, 1)) @@ -462,8 +598,11 @@ int fw_io_post_metric(struct fw_io_ctx *ctx, u8 *data, u32 size) return -E2BIG; } + // Lock mutex to prevent race condition with new interface + mutex_lock(&ctx->lock); + // Write the data in the misc ram first - void * offset = (void *) (ctx->bar0 + ndhal->ndhal_address_map.bar0_misc_ram_offset + FW_IO_REG_METRIC_OFFSET); + void * offset = (void *) (ctx->bar0 + ndhal->ndhal_address_map.bar0_misc_ram_offset + FW_IO_REG_DATA_OFFSET); for (i = 0; i < (size / 4); i++) { reg_write32(offset + (i * 4), m[i]); } @@ -474,9 +613,19 @@ int fw_io_post_metric(struct fw_io_ctx *ctx, u8 *data, u32 size) reg_write32(offset + size_aligned, padded_u32); } - return fw_io_execute_request(ctx, FW_IO_CMD_POST_TO_CW, data, size, data, size); + ret = ndhal->ndhal_fw_io.fw_io_execute_request(ctx, FW_IO_CMD_POST_TO_CW, data, size, data, size); + + mutex_unlock(&ctx->lock); + return ret; } +int fw_io_post_metric_new(struct fw_io_ctx *ctx, u8 *data, u32 size) +{ + if (size > FW_IO_REG_METRIC_BUF_SZ) { + return -E2BIG; + } + return fw_io_execute_request_new(ctx, FW_IO_CMD_POST_TO_CW, data, size, NULL, 0); +} int fw_io_read_counters(struct fw_io_ctx *ctx, uint64_t addr_in[], uint32_t val_out[], uint32_t num_counters) @@ -575,3 +724,18 @@ uint32_t fw_io_get_total_uecc_err_count(void *bar0) { } return total_uncorrected_ecc_err_count; } + +int fw_io_set_power_profile(struct fw_io_ctx *ctx, uint32_t profile) +{ + union fw_io_req_perfprofile_data data = {0}; + data.rec.profile = (uint8_t)profile; + data.rec.voltage_margin = 0; + data.rec.frequency_index = 0; + data.rec.ocw_index = 0; + + if (!ctx) { + return -EINVAL; + } + + return fw_io_execute_request_new(ctx, FW_IO_CMD_SET_POWER_PROFILE, 
(u8 *)&data, sizeof(data), NULL, 0); +} diff --git a/neuron_fw_io.h b/neuron_fw_io.h index f915d55..6c60d19 100644 --- a/neuron_fw_io.h +++ b/neuron_fw_io.h @@ -7,24 +7,55 @@ #include +union fw_io_request_hdr { + struct { + u8 sequence_number; // sequence number to be copied in the next response. + u8 command_id; // command to hw. + u16 size; // request size in bytes including the header. + u32 crc32; // crc32 of the entire request, crc32 must be set to 0 before calculating + } hdr; + struct { + u32 dw0; // bytes 0-3: sequence_number, command_id, size + u32 dw1; // bytes 4-7: crc32 + } reg; +}; + +union fw_io_response_hdr { + struct { + u8 sequence_number; // request sequence number + u8 error_code; // 0 means request was successfully completed + u16 size; // response size in bytes including this header + } hdr; + struct { + u32 dw0; + } reg; +}; + struct fw_io_request { - u8 sequence_number; // sequence number to be copied in the next response. - u8 command_id; // command to hw. - u16 size; // request size in bytes including the header. 
- u32 crc32; // crc32 of the entire request, crc32 must be set to 0 before calculating - u8 data[0]; + union fw_io_request_hdr request_hdr; + u8 data[]; }; struct fw_io_response { - u8 sequence_number; // request sequence number - u8 error_code; // 0 means request was successfully completed - u16 size; // response size in bytes including this header - u8 data[0]; // response data if any + union fw_io_response_hdr response_hdr; + u8 data[]; +}; + +union fw_io_req_perfprofile_data { + struct { + uint32_t reserved; + uint8_t profile; + uint8_t voltage_margin; + uint8_t frequency_index; + uint8_t ocw_index; + } rec; + uint32_t raw[2]; }; enum { FW_IO_CMD_READ = 1, // read a register value - FW_IO_CMD_POST_TO_CW = 2 // post given blob as metrics to CloudWatch + FW_IO_CMD_POST_TO_CW = 2, // post given blob as metrics to CloudWatch + FW_IO_CMD_SET_POWER_PROFILE = 3 // set power profile }; enum { @@ -42,6 +73,7 @@ enum { // offsets in MISC RAM for FWIO enum { FW_IO_REG_DEVICE_ID_OFFSET = 0x24, + FW_IO_REG_SERVER_RACK_ID_OFFSET = 0x70, // MISC RAM register for API version // - This register is used to determine the API version of the firmware. @@ -77,7 +109,7 @@ enum { FW_IO_REG_RUNTIME_RESERVED0 = 0xC0, // 0xC0 to 0xF0 - FW_IO_REG_METRIC_OFFSET = 0x100, // 0x100 to 0x17F, 128 bytes + FW_IO_REG_DATA_OFFSET = 0x100, // 0x100 to 0x17F, 128 bytes FW_IO_REG_LH_NEIGHBOR_SERNUM_HI = 0x180, // LH/RH neighbors FW_IO_REG_LH_NEIGHBOR_SERNUM_LO = 0x184, FW_IO_REG_RH_NEIGHBOR_SERNUM_HI = 0x188, @@ -94,6 +126,7 @@ enum { FW_IO_REG_RESPONSE_BASE_ADDR_LOW_OFFSET = 0x1fc, FW_IO_REG_RESPONSE_BASE_ADDR_HIGH_OFFSET = 0x1f8, FW_IO_REG_TRIGGER_INT_NOSEC_OFFSET = 0x800, + FW_IO_REG_ACK_OFFSET = 0xf0, }; #define FW_IO_REG_METRIC_BUF_SZ 128 @@ -113,13 +146,24 @@ struct fw_io_ctx { #define UINT64_LOW(x) ((u32)(((u64)(x)) & 0xffffffffULL)) #define UINT64_HIGH(x) ((u32)((x) >> 32)) -// Hardware might take up to 15 seconds in worst case. 
+#define FW_IO_CMD_MAX 4
+
+// FW_IO_CMD_MAX bounds the per-command tables fw_io_cmd_timeout_tbl / fw_io_cmd_retry_tbl
+
+// Wait up to 30 seconds in worst case.
+// Hardware can in some cases take longer to come out of reset but for some reads
+// (like reading device ID before creating the device) we cannot wait too long
+// because it's confusing if driver load appears to hang and kernel may complain
 #define FW_IO_RD_TIMEOUT (1000 * 1000 * 1)
-#define FW_IO_RD_RETRY 15
+#define FW_IO_RD_RETRY 30
 
 // max number of registers can be read in single function call
 #define FW_IO_MAX_READLESS_READ_REGISTER_COUNT 100
 
+// Min Pacific API version for new readless read framework
+#define FW_IO_NEW_READLESS_READ_MIN_API_VERSION 7
+#define FW_IO_POWER_MIN_API_VERSION 3
+
 
 /**
  * fw_io_register_read_region - Read a BAR region
@@ -212,6 +256,15 @@ void fw_io_destroy(struct fw_io_ctx *ctx);
  */
 int fw_io_post_metric(struct fw_io_ctx *ctx, u8 *data, u32 size);
 
+/**
+ * fw_io_post_metric_new() - Post given block data as metric using new framework
+ * @param ctx: FWIO context
+ * @param data: data to post
+ * @param size: size of the data
+ * Return: 0 if metric is successfully posted, negative on failure
+ */
+int fw_io_post_metric_new(struct fw_io_ctx *ctx, u8 *data, u32 size);
+
 /**
  * fw_io_initiate_reset() - Initiate device local reset.
  *
@@ -246,6 +299,14 @@ bool fw_io_is_reset_initiated(void __iomem *bar0);
 int fw_io_read_counters(struct fw_io_ctx *ctx, uint64_t addr_in[], uint32_t val_out[],
 			uint32_t num_counters);
 
+/**
+ * fw_io_server_info_read() - Read server info
+ * @param bar - from bar
+ * @param server_info - server info containing rack & server ids
+ * @return 0 on success. 
+ */ +int fw_io_server_info_read(void *bar0, u32 *server_info); + /** * fw_io_device_id_read() - Read device id * @param bar - from bar @@ -321,4 +382,36 @@ uint32_t fw_io_get_total_uecc_err_count(void *bar0); * @param bar0: from bar */ int fw_io_hbm_uecc_repair_state_read(void *bar0, uint32_t *hbm_repair_state); + +/** + * fw_io_execute_request() - Execute request (original protocols) + * @param ctx: FWIO context + * @param command_id: Command ID + * @param req: Request data + * @param req_size: Request size + * @param resp: Response buffer + * @param resp_size: Response buffer size + * @return 0 on success, negative on failure + */ +int fw_io_execute_request(struct fw_io_ctx *ctx, u8 command_id, const u8 *req, u32 req_size, u8 *resp, u32 resp_size); + +/** + * fw_io_execute_request_new() - Execute request (new protocols) + * @param ctx: FWIO context + * @param command_id: Command ID + * @param req: Request data + * @param req_size: Request size + * @param resp: Response buffer + * @param resp_size: Response buffer size + * @return 0 on success, negative on failure + */ +int fw_io_execute_request_new(struct fw_io_ctx *ctx, u8 command_id, const u8 *req, u32 req_size, u8 *resp, u32 resp_size); + +/** + * fw_io_set_power_profile() - Set power profile + * @param ctx: FWIO context + * @param profile: Power profile value + * @return 0 on success, negative on failure + */ +int fw_io_set_power_profile(struct fw_io_ctx *ctx, uint32_t profile); #endif diff --git a/neuron_ioctl.h b/neuron_ioctl.h index e34c1b0..80989d9 100644 --- a/neuron_ioctl.h +++ b/neuron_ioctl.h @@ -186,6 +186,26 @@ struct neuron_ioctl_mem_buf_copy64 { __u32 copy_to_mem_handle; // [in] if set to True copies from buffer to memhandle else copies from memhandle to buffer. }; +struct neuron_ioctl_mem_buf_copy64zc { + __u64 mem_handle; // [in] Source or Destination memory handle from/to data needs to be copied. + void *buffer; // [in] Buffer from/to where data to be copied. 
+	__u64 size; // [in] Size of the data to be copied.
+	__u64 offset; // [in] Offset in the memory handle where the data to be written/read.
+	__u32 is_copy_to_device; // [in] if set to True copies to device
+	__u32 bar4_wr_threshold; // [in] threshold below which we will use bar4 direct write vs. DMA. Subject to driver limits.
+	__s32 h2t_qid; // [in] h2t queue to use for the transfer. -1 = use default
+	__u32 dummy; // [na] pad to change size of struct to version ioctl
+};
+
+struct neuron_ioctl_mem_buf_copy64zc_batches {
+	__u64 sequence_num; // [in] The sequence number that uniquely identifies each async I/O.
+	neuron_memcpy_batch_t *batches_ptr; // [in] Pointer to an array of memory copy batches.
+	__u32 num_batches; // [in] Number of batches in the batches_ptr array.
+	__u16 is_copy_to_device; // [in] If true, then copies from host to device.
+	__s32 h2t_qid; // [in] H2T queue ID. -1 for default.
+	__u16 flags; // [in] TBD
+};
+
 struct neuron_ioctl_program_engine {
 	__u64 dst; // [in] Destination engine address
 	void *buffer; // [in] Buffer from/to where data to be copied.
@@ -581,6 +601,34 @@ struct neuron_ioctl_pod_ctrl_v2 {
 	__u32 mode; // [in] operating mode
 };
 
+struct neuron_ioctl_h2t_dma_alloc_queues {
+	__u16 sz; // [in] structure size for versioning.
+	__u16 nc_id; // [in] neuron core to allocate h2t queues for
+	__u16 copy_queue_cnt; // [in] number of copy queues requested
+	__u16 service_queue_cnt; // [in] number of service queues requested
+	__u32 copy_queue_bmap; // [out] return bitmap of copy queues allocated
+	__u32 service_queue_bmap; // [out] return bitmap of service queues allocated
+	__u32 copy_default_queue; // [out] return default h2t copy queue
+};
+
+struct neuron_ioctl_h2t_dma_free_queues {
+	__u16 sz; // [in] structure size for versioning. 
+ __u8 nc_id; // [in] neuron core to free h2t queues for + __u8 fill0; // [in] padding + __u32 queue_bmap; // [in] bitmap of queues to free +}; + +struct neuron_ioctl_power_profile { + __u16 sz; // [in] size of the structure + __u16 ctrl; // [in] control 0 = set; 1 = get + __u32 profile; // [in] power profile to set +}; + +struct neuron_ioctl_metrics_ctrl { + __u32 mode; // [in] modifications to metric behavior (neuron_metrics_mode) +}; + + #define NEURON_IOCTL_BASE 'N' /* Deprecated reset related IOCTLs. Now it would always return success. */ @@ -775,9 +823,18 @@ struct neuron_ioctl_pod_ctrl_v2 { #define NEURON_IOCTL_POD_CTRL _IOWR(NEURON_IOCTL_BASE, 123, struct neuron_ioctl_pod_ctrl) #define NEURON_IOCTL_POD_CTRL_V2 _IOWR(NEURON_IOCTL_BASE, 123, struct neuron_ioctl_pod_ctrl_v2) -#define NEURON_IOCTL_MEM_BUF_ZEROCOPY64 _IOWR(NEURON_IOCTL_BASE, 124, struct neuron_ioctl_mem_buf_copy64) +#define NEURON_IOCTL_MEM_BUF_ZEROCOPY64 _IOWR(NEURON_IOCTL_BASE, 124, struct neuron_ioctl_mem_buf_copy64zc) + +#define NEURON_IOCTL_H2T_DMA_ALLOC_QUEUES _IOWR(NEURON_IOCTL_BASE, 125, struct neuron_ioctl_h2t_dma_alloc_queues) +#define NEURON_IOCTL_H2T_DMA_FREE_QUEUES _IOWR(NEURON_IOCTL_BASE, 126, struct neuron_ioctl_h2t_dma_free_queues) + +#define NEURON_IOCTL_POWER_PROFILE _IOW(NEURON_IOCTL_BASE, 127, struct neuron_ioctl_power_profile) + +#define NEURON_IOCTL_METRICS_CTRL _IOW(NEURON_IOCTL_BASE, 128, struct neuron_ioctl_metrics_ctrl) + +#define NEURON_IOCTL_MEM_BUF_ZEROCOPY64_BATCHES _IOWR(NEURON_IOCTL_BASE, 129, struct neuron_ioctl_mem_buf_copy64zc_batches) // Note: 133 is taken by NEURON_IOCTL_DMA_QUEUE_INIT_BATCH -#define NEURON_IOCTL_MAX 125 +#define NEURON_IOCTL_MAX 130 #endif diff --git a/neuron_metrics.c b/neuron_metrics.c index f9b9085..6fbc28d 100644 --- a/neuron_metrics.c +++ b/neuron_metrics.c @@ -51,6 +51,7 @@ enum nmetric_cw_id { NMETRIC_CW_ID_AVG_TPB_RESET_TIME_MS = 53, NMETRIC_CW_ID_DEVICE_RESET_FAILURE_COUNT = 54, NMETRIC_CW_ID_TPB_RESET_FAILURE_COUNT = 55, + 
NMETRIC_CW_ID_PERFORMANCE_PROFILE_ID = 56, // Extra versions // extra space for reporting multiple versions of the same type in one post @@ -112,6 +113,7 @@ static const nmetric_def_t nmetric_defs[] = { // constant metrics NMETRIC_CONSTANT_DEF(0, POST_TIME_ALWAYS, NMETRIC_CW_ID_INSTANCE_ID), // instance id NMETRIC_CONSTANT_DEF(1, POST_TIME_ALWAYS, NMETRIC_CW_ID_DRIVER_VERSION), // driver version + NMETRIC_CONSTANT_DEF(2, POST_TIME_TICK_0, NMETRIC_CW_ID_PERFORMANCE_PROFILE_ID), // performance profile id // version metrics NMETRIC_VERSION_DEF(0, POST_TIME_ALWAYS, NMETRIC_CW_ID_RT_VERSION_BASE, NDS_ND_COUNTER_RUNTIME_VERSION, 0), // rt version @@ -178,6 +180,7 @@ static const int nmetric_count = sizeof(nmetric_defs) / sizeof(nmetric_def_t); // AND don't forget to increase the NMETRIC_..._COUNT in neuron_metrics.h #define NMETRIC_INSTANCE_ID_IDX 0 #define NMETRIC_DRIVER_VERS_IDX 1 +#define NMETRIC_PROFILE_ID_IDX 2 #define NMETRIC_FW_IO_ERR_IDX 17 struct nmetric_cw_metric { @@ -197,6 +200,7 @@ void nmetric_init_constants_metrics() int driver_ver_str_len; int instance_id_idx = nmetric_defs[NMETRIC_INSTANCE_ID_IDX].index; int driver_vers_idx = nmetric_defs[NMETRIC_DRIVER_VERS_IDX].index; + int profile_id_idx = nmetric_defs[NMETRIC_PROFILE_ID_IDX].index; loff_t offset = 0; // initiate buffer to 0 @@ -215,6 +219,9 @@ void nmetric_init_constants_metrics() driver_ver_str_len = strlen(driver_version); BUG_ON(driver_ver_str_len > NEURON_METRICS_VERSION_STRING_MAX_LEN); // check for buffer overflow memcpy(nmetric_constant_metrics[driver_vers_idx], driver_version, min(driver_ver_str_len, (int)NEURON_METRICS_VERSION_STRING_MAX_LEN)); + + // record performance profile + snprintf(nmetric_constant_metrics[profile_id_idx], NEURON_METRICS_VERSION_STRING_MAX_LEN + 1, "%d", 0); } /** @@ -578,27 +585,38 @@ static inline int nmetric_post_constant_u64(const nmetric_def_t *metric, struct return nmetric_post_u64(metric, metric_value, dest, available_size); } -static inline int 
nmetric_post_driver_metrics(const nmetric_def_t *metric, - u64 *curr_metrics, - u64 *prev_metrics, - u64 *freed_metrics, - struct nmetric_cw_metric *dest, - u64 *driver_metrics, - int available_size) +// TODO: This function is a quick workaround to post and reset the driver metrics: +// 1. it uses atomics to protect driver metrics from race conditions; +// 2. it resets the driver metric and its correspondingly intermediate metrics immediately after posting. +// A better long term solution is needed. +static inline int nmetric_post_and_reset_driver_metrics(const nmetric_def_t *driver_final_metric, + struct nmetric_cw_metric *dest, + struct nmetric_driver_metrics *driver_metrics, + int available_size) { - u64 metric_value = driver_metrics[metric->index]; - - if (metric->index == NMETRIC_DRIVER_METRICS_IDX_MAX_DEVICE_RESET_TIME_MS - || metric->index == NMETRIC_DRIVER_METRICS_IDX_MAX_TPB_RESET_TIME_MS - || metric->index == NMETRIC_DRIVER_METRICS_IDX_AVG_DEVICE_RESET_TIME_MS - || metric->index == NMETRIC_DRIVER_METRICS_IDX_AVG_TPB_RESET_TIME_MS) { - return nmetric_post_u64(metric, metric_value, dest, available_size); - } else if (metric->index == NMETRIC_DRIVER_METRICS_IDX_DEVICE_RESET_FAILURE_COUNT - || metric->index == NMETRIC_DRIVER_METRICS_IDX_TPB_RESET_FAILURE_COUNT){ - return nmetric_post_counter(curr_metrics, prev_metrics, freed_metrics, metric, dest, available_size); + int metric_index = driver_final_metric->index; + u64 metric_value = 0; + + if (metric_index < 0 || metric_index >= NMETRIC_FINAL_DRIVER_METRICS_COUNT) { + pr_err("invalid final driver metric with index %d\n", driver_final_metric->index); + return 0; } - return 0; + if (metric_index == NMETRIC_DRIVER_METRICS_IDX_AVG_DEVICE_RESET_TIME_MS) { + u64 total_time = atomic64_xchg(&driver_metrics->intermediate_metrics[NMETRIC_DRIVER_METRICS_IDX_TOTAL_DEVICE_RESET_TIME_MS], 0); + u64 total_count = atomic64_xchg(&driver_metrics->intermediate_metrics[NMETRIC_DRIVER_METRICS_IDX_TOTAL_DEVICE_RESET_COUNT], 0); + 
+ if (total_count != 0) + metric_value = total_time / total_count; + } else if (metric_index == NMETRIC_DRIVER_METRICS_IDX_AVG_TPB_RESET_TIME_MS) { + u64 total_time = atomic64_xchg(&driver_metrics->intermediate_metrics[NMETRIC_DRIVER_METRICS_IDX_TOTAL_TPB_RESET_TIME_MS], 0); + u64 total_count = atomic64_xchg(&driver_metrics->intermediate_metrics[NMETRIC_DRIVER_METRICS_IDX_TOTAL_TPB_RESET_COUNT], 0); + + if (total_count != 0) + metric_value = total_time / total_count; + } + + return nmetric_post_u64(driver_final_metric, metric_value, dest, available_size); } /** @@ -632,6 +650,9 @@ static void nmetric_post_metrics(struct neuron_device *nd, u64 *curr_metrics, u6 if (!nmetric_check_post_tick(tick, curr_metric)) continue; available_size = NEURON_METRICS_MAX_POSTING_BUF_SIZE - data_size; + if (available_size <= 0) { + pr_err_once("ran out of metrics posting space for tick %d on metric %d", tick, nmetric_index); + } dest = (struct nmetric_cw_metric *)&nd->metrics.posting_buffer[data_size]; switch(curr_metric->type) { case NMETRIC_TYPE_CONSTANT: @@ -652,7 +673,7 @@ static void nmetric_post_metrics(struct neuron_device *nd, u64 *curr_metrics, u6 data_size += nmetric_post_constant_u64(curr_metric, dest, const_u64_metrics, freed_const_u64_metrics, available_size); break; case NMETRIC_TYPE_DRIVER: - data_size += nmetric_post_driver_metrics(curr_metric, curr_metrics, prev_metrics, freed_metrics, dest, nd->metrics.driver_metrics, available_size); + data_size += nmetric_post_and_reset_driver_metrics(curr_metric, dest, &nd->metrics.driver_metrics, available_size); break; } } @@ -663,7 +684,7 @@ static void nmetric_post_metrics(struct neuron_device *nd, u64 *curr_metrics, u6 nmetric_mock_fw_io_post_metric(nd->metrics.posting_buffer, data_size); } if (data_size && (nmetric_log_posts & (1<<0))) { - int ret = fw_io_post_metric(nd->fw_io_ctx, nd->metrics.posting_buffer, data_size); + int ret = ndhal->ndhal_fw_io.fw_io_post_metric(nd->fw_io_ctx, nd->metrics.posting_buffer, data_size); 
if (ret < 0) pr_err("Metric posting failed with error code: %d\n", ret); } @@ -715,6 +736,11 @@ static void nmetric_cache_shared_bufs(struct neuron_device *nd, u64 *freed_metri freed_const_u64_metrics[curr_metric->index] = nd->metrics.ds_freed_const_u64_buf[curr_metric->index]; nd->metrics.ds_freed_const_u64_buf[curr_metric->index] = 0; break; + case NMETRIC_TYPE_CONSTANT: + if (curr_metric->cw_id == NMETRIC_CW_ID_PERFORMANCE_PROFILE_ID) { + snprintf(nmetric_constant_metrics[curr_metric->index], NEURON_METRICS_VERSION_STRING_MAX_LEN + 1, "%d", ndhal->ndhal_perf.current_performance_profile); + } + break; } } } @@ -753,9 +779,6 @@ static void nmetric_start_new_session(struct neuron_device *nd, u64 *curr_metric const_u64_metrics[curr_metric->index] = 0; freed_const_u64_metrics[curr_metric->index] = 0; break; - case NMETRIC_TYPE_DRIVER: - nd->metrics.driver_metrics[curr_metric->index] = 0; - break; } } @@ -776,6 +799,30 @@ static void nmetric_sample_high_freq(struct neuron_device *nd) npower_sample_utilization(nd); } +static void nmetric_aggregate_and_post_tick(struct neuron_device *nd, struct nmetric_versions *component_versions, u64 *curr_feature_bitmap, u64 *freed_feature_bitmap, u64 *const_u64_metrics, u64 *freed_const_u64_metrics, u8 tick) +{ + neuron_ds_acquire_lock(&nd->datastore); + nmetric_full_aggregate(nd, nd->metrics.neuron_aggregation.curr, + curr_feature_bitmap, const_u64_metrics, tick); + nmetric_cache_shared_bufs(nd, nd->metrics.neuron_aggregation.freed, + component_versions, freed_feature_bitmap, + freed_const_u64_metrics, tick); + neuron_ds_release_lock(&nd->datastore); + + nmetric_post_metrics(nd, nd->metrics.neuron_aggregation.curr, + nd->metrics.neuron_aggregation.prev, + nd->metrics.neuron_aggregation.freed, + component_versions, *curr_feature_bitmap, + *freed_feature_bitmap, const_u64_metrics, + freed_const_u64_metrics, tick); + nmetric_start_new_session(nd, nd->metrics.neuron_aggregation.curr, + nd->metrics.neuron_aggregation.prev, + 
nd->metrics.neuron_aggregation.freed, + curr_feature_bitmap, freed_feature_bitmap, + const_u64_metrics, freed_const_u64_metrics, + tick); // reset all current metrics for this tick +} + /** * nmetric_thread_fn() - periodically aggregates and posts metric at rate specified by module parameter * @@ -795,8 +842,8 @@ static int nmetric_thread_fn(void *arg) u64 post_delay_in_jiffies; u64 last_metric_post_time; u64 start_jiffies = jiffies; - u64 last_logged_slow_tick = 0; u64 current_slow_tick; + u8 tick_budget = 0; // how many ticks can be posted in a certain iteration of the loop // initialize all aggregation buffers memset(nd->metrics.neuron_aggregation.prev, 0, nmetric_counters_buf_size); @@ -816,11 +863,14 @@ static int nmetric_thread_fn(void *arg) sample_delay_in_jiffies, nmetric_metric_post_delay, HZ); // metrics are only sent once at rate specified by module param, new metric data may be saved without being immediately sent - while (!kthread_should_stop() && nd->metrics.neuron_aggregation.running) { - long wait_return; - wait_return = wait_event_interruptible_timeout(nd->metrics.neuron_aggregation.wait_queue, !nd->metrics.neuron_aggregation.running,sample_delay_in_jiffies); - - if (kthread_should_stop() || !nd->metrics.neuron_aggregation.running || (wait_return < 0)) { + while (!kthread_should_stop() && nd->metrics.neuron_aggregation.state != NMETRIC_STATE_STOPPED) { + long wait_return; + int flush_tick; + wait_return = wait_event_interruptible_timeout(nd->metrics.neuron_aggregation.wait_queue, + nd->metrics.neuron_aggregation.state == NMETRIC_STATE_STOPPED || nd->metrics.neuron_aggregation.state == NMETRIC_STATE_RESUMING, + sample_delay_in_jiffies); + + if (kthread_should_stop() || nd->metrics.neuron_aggregation.state == NMETRIC_STATE_STOPPED || (wait_return < 0)) { break; }; @@ -831,31 +881,29 @@ static int nmetric_thread_fn(void *arg) // We track this by keeping track of the number of intervals since this thread started // up so that we don't introduce drift 
due to the latency of other loop operations. current_slow_tick = (jiffies - start_jiffies)/post_delay_in_jiffies; - if (current_slow_tick != last_logged_slow_tick) { - last_logged_slow_tick = current_slow_tick; - - // aggregate and post metrics - neuron_ds_acquire_lock(&nd->datastore); - nmetric_full_aggregate(nd, nd->metrics.neuron_aggregation.curr, - &curr_feature_bitmap, const_u64_metrics, tick); - nmetric_cache_shared_bufs(nd, nd->metrics.neuron_aggregation.freed, - component_versions, &freed_feature_bitmap, - freed_const_u64_metrics, tick); - neuron_ds_release_lock(&nd->datastore); - - nmetric_post_metrics(nd, nd->metrics.neuron_aggregation.curr, - nd->metrics.neuron_aggregation.prev, - nd->metrics.neuron_aggregation.freed, - component_versions, curr_feature_bitmap, - freed_feature_bitmap, const_u64_metrics, - freed_const_u64_metrics, tick); - nmetric_start_new_session(nd, nd->metrics.neuron_aggregation.curr, - nd->metrics.neuron_aggregation.prev, - nd->metrics.neuron_aggregation.freed, - &curr_feature_bitmap, &freed_feature_bitmap, - const_u64_metrics, freed_const_u64_metrics, - tick); // reset all current metrics for this tick - tick = (tick + 1) % POST_TICK_COUNT; + + // periodic metrics posting on a timer + if (nd->metrics.neuron_aggregation.state == NMETRIC_STATE_PAUSED) { + // skip metrics post when paused via metrics_ctrl ioctl + continue; + } else if (current_slow_tick != nd->metrics.neuron_aggregation.last_logged_slow_tick) { + // post up to tick_budget ticks of metrics + // in the normal case, this will post one tick per iteration of the loop + // if the was paused then resumed, post up to the number of ticks that would have been posted if it had not been paused + + if (nd->metrics.neuron_aggregation.state == NMETRIC_STATE_RESUMING) { + nd->metrics.neuron_aggregation.state = NMETRIC_STATE_RUNNING; + tick_budget = current_slow_tick - nd->metrics.neuron_aggregation.last_logged_slow_tick; + tick_budget = (tick_budget > POST_TICK_COUNT) ? 
POST_TICK_COUNT : tick_budget; + } else { + tick_budget = 1; + } + + for (flush_tick = 0; flush_tick < tick_budget; flush_tick++) { + nmetric_aggregate_and_post_tick(nd, component_versions, &curr_feature_bitmap, &freed_feature_bitmap, const_u64_metrics, freed_const_u64_metrics, tick); + tick = (tick + 1) % POST_TICK_COUNT; + } + nd->metrics.neuron_aggregation.last_logged_slow_tick = current_slow_tick; } } @@ -865,7 +913,7 @@ static int nmetric_thread_fn(void *arg) static int nmetric_create_thread(struct neuron_device *nd) { init_waitqueue_head(&nd->metrics.neuron_aggregation.wait_queue); - nd->metrics.neuron_aggregation.running = true; + nd->metrics.neuron_aggregation.state = NMETRIC_STATE_RUNNING; nd->metrics.neuron_aggregation.thread = kthread_run(nmetric_thread_fn, nd, "nd%d metrics", nd->device_index); if (IS_ERR_OR_NULL(nd->metrics.neuron_aggregation.thread)) { pr_err("nd%d metrics aggregation thread creation failed\n", nd->device_index); @@ -878,15 +926,34 @@ void nmetric_stop_thread(struct neuron_device *nd) { if (nd->metrics.neuron_aggregation.thread == NULL) return; - nd->metrics.neuron_aggregation.running = false; + nd->metrics.neuron_aggregation.state = NMETRIC_STATE_STOPPED; wake_up(&nd->metrics.neuron_aggregation.wait_queue); kthread_stop(nd->metrics.neuron_aggregation.thread); //blocks till the thread exits nd->metrics.neuron_aggregation.thread = NULL; } +// if periodic posting from metrics thread is paused, need to request an explicit flush +void nmetric_set_mode(struct neuron_device *nd, enum neuron_metrics_mode mode) +{ + switch (mode) { + case NEURON_METRICS_MODE_PERIODIC_ENABLE: + nd->metrics.neuron_aggregation.state = NMETRIC_STATE_RESUMING; + break; + case NEURON_METRICS_MODE_PERIODIC_DISABLE: + nd->metrics.neuron_aggregation.state = NMETRIC_STATE_PAUSED; + break; + } +} + void nmetric_init_driver_metrics(struct neuron_device *nd) { - memset(nd->metrics.driver_metrics, 0, NMETRIC_DRIVER_METRICS_COUNT * sizeof(u64)); + int i; + + for (i = 0; i < 
NMETRIC_FINAL_DRIVER_METRICS_COUNT; i++) + atomic64_set(&nd->metrics.driver_metrics.final_metrics[i], 0); + + for (i = 0; i < NMETRIC_INTERMEDIATE_DRIVER_METRICS_COUNT; i++) + atomic64_set(&nd->metrics.driver_metrics.intermediate_metrics[i], 0); } int nmetric_init(struct neuron_device *nd) @@ -895,7 +962,8 @@ int nmetric_init(struct neuron_device *nd) memset(nd->metrics.ds_freed_metrics_buf, 0, nmetric_counters_buf_size); memset(nd->metrics.ds_freed_const_u64_buf, 0, NMETRIC_CONSTANT_U64_COUNT * sizeof(u64)); - npower_init_stats(nd); + nd->metrics.neuron_aggregation.state = NMETRIC_STATE_STOPPED; + nd->metrics.neuron_aggregation.last_logged_slow_tick = 0; // initiate metric aggregator thread ret = nmetric_create_thread(nd); @@ -903,37 +971,55 @@ int nmetric_init(struct neuron_device *nd) return ret; } -void nmetric_set_reset_time_metrics(struct neuron_device *nd, uint64_t cur_reset_time_ms, bool is_device_reset) { +void nmetric_set_reset_time_metrics(struct neuron_device *nd, s64 cur_reset_time_ms, bool is_device_reset) { + struct nmetric_driver_metrics *driver_metrics = &nd->metrics.driver_metrics; + atomic64_t *max_time_metric; + atomic64_t *total_time_metric; + atomic64_t *total_count_metric; + s64 max_time; + int max_time_index; + int total_time_index; + int total_count_index; + if (cur_reset_time_ms <= 0) { return; } if (is_device_reset) { - if (nd->metrics.driver_metrics[NMETRIC_DRIVER_METRICS_IDX_MAX_DEVICE_RESET_TIME_MS] < cur_reset_time_ms) { - nd->metrics.driver_metrics[NMETRIC_DRIVER_METRICS_IDX_MAX_DEVICE_RESET_TIME_MS] = cur_reset_time_ms; - } - nd->metrics.driver_metrics[NMETRIC_DRIVER_METRICS_IDX_TOTAL_DEVICE_RESET_TIME_MS] += cur_reset_time_ms; - nd->metrics.driver_metrics[NMETRIC_DRIVER_METRICS_IDX_TOTAL_DEVICE_RESET_COUNT]++; - nd->metrics.driver_metrics[NMETRIC_DRIVER_METRICS_IDX_AVG_DEVICE_RESET_TIME_MS] = - nd->metrics.driver_metrics[NMETRIC_DRIVER_METRICS_IDX_TOTAL_DEVICE_RESET_TIME_MS] / - 
nd->metrics.driver_metrics[NMETRIC_DRIVER_METRICS_IDX_TOTAL_DEVICE_RESET_COUNT]; + max_time_index = NMETRIC_DRIVER_METRICS_IDX_MAX_DEVICE_RESET_TIME_MS; + total_time_index = NMETRIC_DRIVER_METRICS_IDX_TOTAL_DEVICE_RESET_TIME_MS; + total_count_index = NMETRIC_DRIVER_METRICS_IDX_TOTAL_DEVICE_RESET_COUNT; } else { - if (nd->metrics.driver_metrics[NMETRIC_DRIVER_METRICS_IDX_MAX_TPB_RESET_TIME_MS] < cur_reset_time_ms) { - nd->metrics.driver_metrics[NMETRIC_DRIVER_METRICS_IDX_MAX_TPB_RESET_TIME_MS] = cur_reset_time_ms; - } - nd->metrics.driver_metrics[NMETRIC_DRIVER_METRICS_IDX_TOTAL_TPB_RESET_TIME_MS] += cur_reset_time_ms; - nd->metrics.driver_metrics[NMETRIC_DRIVER_METRICS_IDX_TOTAL_TPB_RESET_COUNT]++; - nd->metrics.driver_metrics[NMETRIC_DRIVER_METRICS_IDX_AVG_TPB_RESET_TIME_MS] = - nd->metrics.driver_metrics[NMETRIC_DRIVER_METRICS_IDX_TOTAL_TPB_RESET_TIME_MS] / - nd->metrics.driver_metrics[NMETRIC_DRIVER_METRICS_IDX_TOTAL_TPB_RESET_COUNT]; + max_time_index = NMETRIC_DRIVER_METRICS_IDX_MAX_TPB_RESET_TIME_MS; + total_time_index = NMETRIC_DRIVER_METRICS_IDX_TOTAL_TPB_RESET_TIME_MS; + total_count_index = NMETRIC_DRIVER_METRICS_IDX_TOTAL_TPB_RESET_COUNT; } + + max_time_metric = &driver_metrics->final_metrics[max_time_index]; + total_time_metric = &driver_metrics->intermediate_metrics[total_time_index]; + total_count_metric = &driver_metrics->intermediate_metrics[total_count_index]; + + max_time = atomic64_read(max_time_metric); + + while (max_time < cur_reset_time_ms && + !atomic64_try_cmpxchg(max_time_metric, &max_time, cur_reset_time_ms)); + + atomic64_add(cur_reset_time_ms, total_time_metric); + atomic64_inc(total_count_metric); } void nmetric_increment_reset_failure_count(struct neuron_device *nd, bool is_device_reset) { + struct nmetric_driver_metrics *driver_metrics = &nd->metrics.driver_metrics; + if (is_device_reset) { - nd->metrics.driver_metrics[NMETRIC_DRIVER_METRICS_IDX_DEVICE_RESET_FAILURE_COUNT]++; + 
atomic64_inc(&driver_metrics->final_metrics[NMETRIC_DRIVER_METRICS_IDX_DEVICE_RESET_FAILURE_COUNT]); } else { - nd->metrics.driver_metrics[NMETRIC_DRIVER_METRICS_IDX_TPB_RESET_FAILURE_COUNT]++; + atomic64_inc(&driver_metrics->final_metrics[NMETRIC_DRIVER_METRICS_IDX_TPB_RESET_FAILURE_COUNT]); } } + +void nmetric_set_performance_profile(struct neuron_device *nd, int profile) +{ + snprintf(nmetric_constant_metrics[NMETRIC_PROFILE_ID_IDX], NEURON_METRICS_VERSION_STRING_MAX_LEN + 1, "%d", ndhal->ndhal_perf.current_performance_profile); +} diff --git a/neuron_metrics.h b/neuron_metrics.h index f79f2d9..a0df6e4 100644 --- a/neuron_metrics.h +++ b/neuron_metrics.h @@ -2,7 +2,10 @@ /* * Copyright 2021, Amazon.com, Inc. or its affiliates. All Rights Reserved */ +#include + #include "neuron_ds.h" +#include "share/neuron_driver_shared.h" #ifndef _NEURON_METRICS_H #define _NEURON_METRICS_H @@ -30,23 +33,36 @@ #define NMETRIC_CONST_U64_FLAG_SKIP_ZERO (0x1ull << 0) #define NMETRIC_CONST_U64_FLAG_PREFER_FREED (0x1ull << 1) -enum driver_metrics_idx { - NMETRIC_DRIVER_METRICS_IDX_MAX_DEVICE_RESET_TIME_MS = 0, - NMETRIC_DRIVER_METRICS_IDX_MAX_TPB_RESET_TIME_MS = 1, +// The final driver metrics to be posted to CR. +// They are not stored in datastore +enum driver_final_metrics_idx { + NMETRIC_DRIVER_METRICS_IDX_MAX_DEVICE_RESET_TIME_MS = 0, + NMETRIC_DRIVER_METRICS_IDX_MAX_TPB_RESET_TIME_MS = 1, + + NMETRIC_DRIVER_METRICS_IDX_AVG_DEVICE_RESET_TIME_MS = 2, + NMETRIC_DRIVER_METRICS_IDX_AVG_TPB_RESET_TIME_MS = 3, - NMETRIC_DRIVER_METRICS_IDX_AVG_DEVICE_RESET_TIME_MS = 2, - NMETRIC_DRIVER_METRICS_IDX_AVG_TPB_RESET_TIME_MS = 3, + NMETRIC_DRIVER_METRICS_IDX_DEVICE_RESET_FAILURE_COUNT = 4, + NMETRIC_DRIVER_METRICS_IDX_TPB_RESET_FAILURE_COUNT = 5, - // Intermediate metrics. do not post to CW directly. 
- NMETRIC_DRIVER_METRICS_IDX_TOTAL_DEVICE_RESET_TIME_MS = 4, - NMETRIC_DRIVER_METRICS_IDX_TOTAL_TPB_RESET_TIME_MS = 5, - NMETRIC_DRIVER_METRICS_IDX_TOTAL_DEVICE_RESET_COUNT = 6, - NMETRIC_DRIVER_METRICS_IDX_TOTAL_TPB_RESET_COUNT = 7, + NMETRIC_FINAL_DRIVER_METRICS_COUNT = 6, +}; - NMETRIC_DRIVER_METRICS_IDX_DEVICE_RESET_FAILURE_COUNT = 8, - NMETRIC_DRIVER_METRICS_IDX_TPB_RESET_FAILURE_COUNT = 9, +// The intermediate driver metrics are not posted to CW. +// They are used to calculate the final driver metrics above. +// They are not stored in datastore +enum driver_intermediate_metrics_idx { + NMETRIC_DRIVER_METRICS_IDX_TOTAL_DEVICE_RESET_TIME_MS = 0, + NMETRIC_DRIVER_METRICS_IDX_TOTAL_TPB_RESET_TIME_MS = 1, + NMETRIC_DRIVER_METRICS_IDX_TOTAL_DEVICE_RESET_COUNT = 2, + NMETRIC_DRIVER_METRICS_IDX_TOTAL_TPB_RESET_COUNT = 3, + + NMETRIC_INTERMEDIATE_DRIVER_METRICS_COUNT = 4, +}; - NMETRIC_DRIVER_METRICS_IDX_COUNT = 10, +struct nmetric_driver_metrics { + atomic64_t final_metrics[NMETRIC_FINAL_DRIVER_METRICS_COUNT]; // final driver metrics to be posted to CR. Not in datastore + atomic64_t intermediate_metrics[NMETRIC_INTERMEDIATE_DRIVER_METRICS_COUNT]; // intermediate driver metrics, and not posted to CW. 
Not in datastore }; // Sadly, the 3 #defines below need to be updated when adding new metrics to nmetric_defs @@ -54,7 +70,7 @@ enum driver_metrics_idx { #define NMETRIC_VERSION_COUNT 3 // Number of metrics of type NMETRIC_TYPE_CONSTANT -#define NMETRIC_CONSTANTS_COUNT 2 +#define NMETRIC_CONSTANTS_COUNT 3 // Number of metrics of type NMETRIC_TYPE_COUNTER + the special case (type NMETRIC_TYPE_FW_IO_ERR) #define NMETRIC_COUNTER_COUNT 29 @@ -65,9 +81,6 @@ enum driver_metrics_idx { // Number of metrics of type NMETRIC_CONSTANT_U64 #define NMETRIC_CONSTANT_U64_COUNT 1 -// Number of metrics of type NMETRIC_DRIVER -#define NMETRIC_DRIVER_METRICS_COUNT NMETRIC_DRIVER_METRICS_IDX_COUNT - typedef struct { u8 index; // metric specific index u8 type; // metric type @@ -93,10 +106,18 @@ struct nmetric_versions { u64 version_metrics[NEURON_METRICS_VERSION_MAX_CAPACITY]; }; +enum nmetric_state { + NMETRIC_STATE_STOPPED = 0, // thread not active/signaled to exit the loop + NMETRIC_STATE_RUNNING = 1, // thread is active and periodically posting metrics + NMETRIC_STATE_PAUSED = 2, // thread is active, but periodic posting is skipped + NMETRIC_STATE_RESUMING = 3, // immediately wakes thread and transitions to NMETRIC_STATE_RUNNING +}; + struct nmetric_aggregation_thread { struct task_struct *thread; // aggregation thread that sends metrics every ~5 minutes wait_queue_head_t wait_queue; - volatile bool running; // if cleared, thread would exit the loop + volatile enum nmetric_state state; + u64 last_logged_slow_tick; // when the last metric request was posted u64 curr[NMETRIC_COUNTER_COUNT]; // metrics for the current session so far u64 prev[NMETRIC_COUNTER_COUNT]; // recorded metrics from the last post u64 freed[NMETRIC_COUNTER_COUNT]; // cache holding metrics that were freed before the posting period was reached @@ -109,7 +130,7 @@ struct neuron_metrics { u64 ds_freed_const_u64_buf[NMETRIC_CONSTANT_U64_COUNT]; // stores unsent constant u64 values about to be freed from datastore 
struct nmetric_aggregation_thread neuron_aggregation; // aggregation thread that periodically aggregates and posts metrics u8 posting_buffer[NEURON_METRICS_MAX_POSTING_BUF_SIZE + 1]; - u64 driver_metrics[NMETRIC_DRIVER_METRICS_COUNT]; // stores driver internal metrics that is not in datastore + struct nmetric_driver_metrics driver_metrics; // driver metrics. not in datastore }; /** @@ -138,7 +159,7 @@ void nmetric_partial_aggregate(struct neuron_device *nd, struct neuron_datastore void nmetric_stop_thread(struct neuron_device *nd); /** - * nmetric_init_driver_metrics() - Initializes the driver metrics to 0 + * nmetric_init_driver_metrics() - Initializes the driver metrics lock and values to 0 * * @param nd - the neuron device */ @@ -158,7 +179,7 @@ int nmetric_init(struct neuron_device *nd); * @param cur_reset_time_ms: the current TPB or device reset time in milliseconds. * @param is_device_reset: whether it is TPB or device reset. */ -void nmetric_set_reset_time_metrics(struct neuron_device *nd, uint64_t cur_reset_time_ms, bool is_device_reset); +void nmetric_set_reset_time_metrics(struct neuron_device *nd, s64 cur_reset_time_ms, bool is_device_reset); /** * nmetric_increment_reset_failure_count() - Increment the reset failure count by 1 for a device or TPB reset failure. @@ -168,4 +189,19 @@ void nmetric_set_reset_time_metrics(struct neuron_device *nd, uint64_t cur_reset */ void nmetric_increment_reset_failure_count(struct neuron_device *nd, bool is_device_reset); +/** + * nmetric_set_performance_profile() - Set the current performance profile value for metrics posting. + * + * @param nd: neuron device + * @param profile: performance profile value + */ +void nmetric_set_performance_profile(struct neuron_device *nd, int profile); + +/** + * nmetric_set_mode() - Enable or disable periodic posting of metrics. 
+ * + * @param nd: neuron device + * @param mode: whether to change or maintain current behavior + */ +void nmetric_set_mode(struct neuron_device *nd, enum neuron_metrics_mode mode); #endif diff --git a/neuron_module.c b/neuron_module.c index e865d32..e6eb69a 100644 --- a/neuron_module.c +++ b/neuron_module.c @@ -18,13 +18,13 @@ #include "neuron_cdev.h" #include "neuron_pci.h" -MODULE_DESCRIPTION("Neuron Driver, built from SHA: bab563e32c62d9dd615a42079e5bbd8e1a6327b1"); +MODULE_DESCRIPTION("Neuron Driver, built from SHA: 5ebb67d2e5be7052dcf1774cff03c69ab40d21ee"); MODULE_LICENSE("GPL"); -MODULE_VERSION("2.24.7.0"); +MODULE_VERSION("2.25.4.0"); MODULE_ALIAS("pci:v00001d0fd00007064sv*sd*bc*sc*i*"); -const char driver_version[] = "2.24.7.0"; -const char driver_revision[] = "bab563e32c62d9dd615a42079e5bbd8e1a6327b1"; +const char driver_version[] = "2.25.4.0"; +const char driver_revision[] = "5ebb67d2e5be7052dcf1774cff03c69ab40d21ee"; #ifdef CONFIG_FAULT_INJECTION diff --git a/neuron_pci.c b/neuron_pci.c index 3233932..dbb1b14 100644 --- a/neuron_pci.c +++ b/neuron_pci.c @@ -19,7 +19,6 @@ #include "neuron_ds.h" #include "neuron_reg_access.h" #include "neuron_metrics.h" -#include "v1/fw_io.h" #include "neuron_dma.h" #include "neuron_dhal.h" #include "neuron_nq.h" @@ -29,13 +28,11 @@ static struct pci_device_id neuron_pci_dev_ids[] = { - { PCI_DEVICE(AMZN_VENDOR_ID, INF1_DEVICE_ID0) }, - { PCI_DEVICE(AMZN_VENDOR_ID, INF1_DEVICE_ID1) }, - { PCI_DEVICE(AMZN_VENDOR_ID, INF1_DEVICE_ID2) }, - { PCI_DEVICE(AMZN_VENDOR_ID, INF1_DEVICE_ID3) }, { PCI_DEVICE(AMZN_VENDOR_ID, TRN1_DEVICE_ID0) }, { PCI_DEVICE(AMZN_VENDOR_ID, INF2_DEVICE_ID0) }, { PCI_DEVICE(AMZN_VENDOR_ID, TRN2_DEVICE_ID0) }, + { PCI_DEVICE(AMZN_VENDOR_ID, TRN3_DEVICE_ID0) }, + { PCI_DEVICE(AMZN_VENDOR_ID, TRN3_DEVICE_ID1) }, { 0, }, @@ -101,7 +98,7 @@ static int neuron_pci_device_init(struct neuron_device *nd) // Initialize the mc handle map ret = nmch_handle_init(nd); - if (ret) + if (ret) goto fail_mch; // 
Initialize the device mpset @@ -166,7 +163,7 @@ static int neuron_pci_device_close(struct neuron_device *nd) static void neuron_pci_set_device_architecture(struct neuron_device *nd) { unsigned short device = nd->pdev->device; - enum neuron_arch arch; + enum neuron_arch arch = NEURON_ARCH_INVALID; u8 revision; pci_read_config_byte(nd->pdev, PCI_REVISION_ID, &revision); @@ -178,8 +175,12 @@ static void neuron_pci_set_device_architecture(struct neuron_device *nd) case TRN2_DEVICE_ID0: arch = NEURON_ARCH_V3; break; + case TRN3_DEVICE_ID0: + case TRN3_DEVICE_ID1: + arch = NEURON_ARCH_V4; + break; default: - arch = NEURON_ARCH_V1; + return; } narch_init(arch, revision); } @@ -196,7 +197,7 @@ static int neuron_pci_probe(struct pci_dev *dev, const struct pci_device_id *id) } nmetric_init_driver_metrics(nd); - + if (neuron_log_init(nd)) { pci_warn(dev, "Warning: Can't allocate memory for neuron log\n"); } @@ -256,7 +257,7 @@ static int neuron_pci_probe(struct pci_dev *dev, const struct pci_device_id *id) nd->device_index = atomic_fetch_add(1, &device_count); #else nd->device_index = atomic_add_return(1, &device_count) - 1; -#endif +#endif nd->fw_io_ctx = fw_io_setup(nd->npdev.bar0, nd->npdev.bar0_size, nd->npdev.bar2, nd->npdev.bar2_size); if (nd->fw_io_ctx == NULL) { diff --git a/neuron_power.c b/neuron_power.c index 81c3072..47f2d25 100644 --- a/neuron_power.c +++ b/neuron_power.c @@ -49,6 +49,14 @@ bool npower_enabled_in_fw(struct neuron_device *nd) int ret = 0; u32 api_version_num = 0; + extern unsigned int nmetric_log_posts; + + // If metric log posting is disabled, don't do power reporting - we're likely on + // bringup hardware or simulation + if (!nmetric_log_posts) { + return false; + } + // Just read the API version from firmware. We could try to be smart here and cache // this, but we need to protect ourselves from rollbacks in the Pacific version or // other changes. Plus, this is just a simple MMIO read, so it's cheap. 
diff --git a/neuron_reset.c b/neuron_reset.c index 7bef5d4..2c908e3 100644 --- a/neuron_reset.c +++ b/neuron_reset.c @@ -379,7 +379,6 @@ int nr_initiate_reset_via_fw(struct neuron_device *nd, uint32_t nc_map, uint32_t uint32_t reset_retry_interval; ktime_t start_time; ktime_t next_reset_retry_time; - uint32_t initial_poll_delay; ktime_t cur_time; s64 reset_time; @@ -394,14 +393,6 @@ int nr_initiate_reset_via_fw(struct neuron_device *nd, uint32_t nc_map, uint32_t fw_io_initiate_reset(nd->npdev.bar0, is_device_reset, tpb_reset_map); next_reset_retry_time = ktime_add_ms(start_time, reset_retry_interval); - /* V1 only. Sleep extra time before polling */ - initial_poll_delay = (nc_map == NEURON_NC_MAP_DEVICE ? - ndhal->ndhal_reset.reset_device_initial_poll_delay : - ndhal->ndhal_reset.reset_tpb_initial_poll_delay); - if (nr_msleep_stoppable(nd, initial_poll_delay)) { - return -1; - } - do { /* * After reset initiation, firmware becomes unresponsive until @@ -416,7 +407,7 @@ int nr_initiate_reset_via_fw(struct neuron_device *nd, uint32_t nc_map, uint32_t // Reset is done. Record the time to metrics. reset_time = ktime_ms_delta(ktime_get(), start_time); if (reset_time > 0) { - nmetric_set_reset_time_metrics(nd, (uint64_t)reset_time, is_device_reset); + nmetric_set_reset_time_metrics(nd, reset_time, is_device_reset); } else { return -1; } diff --git a/neuron_ring.c b/neuron_ring.c index 0483d9e..165e0c9 100644 --- a/neuron_ring.c +++ b/neuron_ring.c @@ -26,6 +26,9 @@ int dev_nc_map = 1; module_param(dev_nc_map, int, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP); MODULE_PARM_DESC(dev_nc_map, "Map of active neuron cores"); +// forward +static void ndmar_h2t_ring_free(struct ndma_ring *ring); + struct ndma_eng *ndmar_acquire_engine(struct neuron_device *nd, u32 eng_id) { if (eng_id >= NUM_DMA_ENG_PER_DEVICE) @@ -34,6 +37,14 @@ struct ndma_eng *ndmar_acquire_engine(struct neuron_device *nd, u32 eng_id) return &nd->ndma_engine[eng_id]; } +// acquire dma engine w/o locking. 
Use for scanning. +static struct ndma_eng *ndmar_acquire_engine_nl(struct neuron_device *nd, u32 eng_id) +{ + if (eng_id >= NUM_DMA_ENG_PER_DEVICE) + return NULL; + return &nd->ndma_engine[eng_id]; +} + void ndmar_release_engine(struct ndma_eng *eng) { mutex_unlock(&eng->nd->ndma_engine[eng->eng_id].lock); @@ -143,10 +154,18 @@ int ndmar_queue_init(struct neuron_device *nd, u32 eng_id, u32 qid, u32 tx_desc_ queue = ndmar_get_queue(eng, qid); ring = ndmar_get_ring(queue); + // check if the ring has been allocated to h2t and fail + if (ndmar_h2t_ring_is_h2t(ring) && (tx_mc != nd->ndma_q_dummy_mc)) { + pr_err("nd%02d: DMA ring allocation collision with h2t dma on eng: %d, queue: %d", nd->device_index, eng_id, qid); + ret = -EALREADY; + goto done; + } + queue->eng_id = eng_id; queue->qid = qid; queue->owner = task_tgid_nr(current); ring->qid = qid; + ring->h2t_completion_mc = NULL; trace_dma_queue_init(nd, eng_id, qid, tx_desc_count, rx_desc_count, tx_mc, rx_mc, rxc_mc, port); @@ -193,15 +212,39 @@ void ndmar_handle_process_exit(struct neuron_device *nd, pid_t pid) const int desc_count = NDMA_QUEUE_DUMMY_RING_DESC_COUNT; for (eng_id = 0; eng_id < ndhal->ndhal_address_map.dma_eng_per_nd; eng_id++) { for (qid = 0; qid < DMA_MAX_Q_MAX; qid++) { - if (nd->ndma_engine[eng_id].queues[qid].owner != pid) { + struct ndma_eng *eng = ndmar_acquire_engine_nl(nd, eng_id); + struct ndma_queue *queue; + struct ndma_ring *ring; + + if (eng == NULL) { + // continue; } + queue = ndmar_get_queue(eng, qid); + ring = ndmar_get_ring(queue); - // h2t rings are maintained by the driver so dont reset. + if (queue->owner != pid) { + continue; + } + + // default h2t rings are maintained by the driver so dont reset. // there cant be any outstanding DMA transaction in h2t since it is a // synchronous system call(which will block till finished when a process crashes). - if (ndhal->ndhal_ndmar.ndmar_is_h2t_q(nd, eng_id, qid)) + // TODO: async h2t will need to get cleaned up here. 
+ + if (ndhal->ndhal_ndmar.ndmar_is_h2t_def_q(nd, eng_id, qid)) { + pr_err_once("nd%02d: unexpected pid associated with default h2t ring", nd->device_index); continue; + } + + // h2t rings owned by driver are freed up on different path + if (ndmar_h2t_ring_is_h2t(ring)) { + pr_err_once("h2t ring should not be bound to process"); + continue; + } + + ndmar_h2t_ring_state_clr(ring); + queue->owner = 0; // rings owned by the nx should not be reset by us // ok since they should never be interacting with host mem @@ -280,30 +323,33 @@ int ndmar_queue_copy_start(struct neuron_device *nd, u32 eng_id, u32 qid, u32 tx int ndmar_queue_release(struct neuron_device *nd, u32 eng_id, u32 qid) { trace_dma_queue_release(nd, eng_id, qid); - // inf1 does not need any special handling return 0; } -static int ndmar_h2t_ring_alloc(struct neuron_device *nd, int nc_id) +static int ndmar_h2t_ring_alloc(struct neuron_device *nd, int nc_id, int qid) { int ret = 0; struct mem_chunk *rx_mc = NULL, *tx_mc = NULL, *h2t_completion_mc = NULL; + struct ndma_queue *queue; + struct ndma_ring *ring; const int eng_id = ndhal->ndhal_ndmar.ndmar_get_h2t_eng_id(nd, nc_id); const int ndesc = DMA_H2T_DESC_COUNT; const u32 ring_size = ndmar_ring_get_desc_count(ndesc) * sizeof(union udma_desc); - const int qid = ndhal->ndhal_ndmar.ndmar_get_h2t_qid(nc_id); struct ndma_eng *eng = ndmar_acquire_engine(nd, eng_id); if (eng == NULL) return -EINVAL; + queue = ndmar_get_queue(eng, qid); + ring = ndmar_get_ring(queue); + eng->used_for_h2t = true; - struct ndma_queue *queue = &eng->queues[qid]; - queue->qid = qid; queue->eng_id = eng_id; - struct ndma_ring *ring = &queue->ring_info; + queue->qid = qid; + queue->owner = 0; ring->qid = qid; + ring->h2t_nc_id = nc_id; ring->size = ring_size; ring->has_compl = false; @@ -332,13 +378,18 @@ static int ndmar_h2t_ring_alloc(struct neuron_device *nd, int nc_id) ring->h2t_completion.ptr = h2t_completion_mc->va; ring->h2t_completion.addr = virt_to_phys(ring->h2t_completion.ptr) 
| ndhal->ndhal_address_map.pci_host_base; - mutex_init(&eng->h2t_ring_lock); + mutex_init(&ring->h2t_ring_lock); ndmar_release_engine(eng); return 0; error: + ring->h2t_nc_id = -1; + ring->tx_mc = NULL; + ring->rx_mc = NULL; + ring->h2t_completion_mc = NULL; + ndmar_release_engine(eng); if (rx_mc) @@ -347,7 +398,7 @@ static int ndmar_h2t_ring_alloc(struct neuron_device *nd, int nc_id) mc_free(&tx_mc); if (h2t_completion_mc) mc_free(&h2t_completion_mc); - + return ret; } @@ -359,13 +410,134 @@ int ndmar_h2t_ring_init(struct ndma_eng *eng, int qid) int ndesc = DMA_H2T_DESC_COUNT; u32 alloced_desc = ndmar_ring_get_desc_count(ndesc); - queue = &eng->queues[qid]; - ring = &queue->ring_info; + queue = ndmar_get_queue(eng, qid); + ring = ndmar_get_ring(queue); ret = udma_m2m_init_queue(&eng->udma, qid, eng->eng_id, alloced_desc, alloced_desc, true, &ring->tx, &ring->rx, NULL); return ret; } +static bool ndmar_h2t_ring_claim(struct neuron_device *nd, u32 eng_id, struct ndma_ring *ring) +{ + bool claimed = false; + struct ndma_eng *eng = ndmar_acquire_engine(nd, eng_id); + if (eng == NULL) + return false; + if (!ring->h2t_allocated) { + ring->h2t_nc_id = -1; + ring->h2t_allocated = true; + claimed = true; + } + ndmar_release_engine(eng); + return claimed; +} + +/* ndmar_h2t_ring_request() + * + * Ask the driver to dynamically allocate an h2t ring. + * + * caveats: + * - Other runtime allocated rings are not tracked by the driver, so h2t rings have to be requested + * prior to any rings allocated for a model. So basically we only track h2t ring allocations in the + * driver. 
+ */ +int ndmar_h2t_ring_request(struct neuron_device *nd, int nc_id, bool h2t, int *rqid) +{ + int ret = -1; + const int eng_id = ndhal->ndhal_ndmar.ndmar_get_h2t_eng_id(nd, nc_id); + struct ndma_eng *eng; + struct ndma_queue *queue; + struct ndma_ring *ring; + int qid; + + eng = ndmar_acquire_engine_nl(nd, eng_id); + if (eng == NULL) + return -EINVAL; + + for (qid = 0; qid < DMA_MAX_Q_MAX; qid++) { + if (ndhal->ndhal_ndmar.ndmar_is_h2t_def_q(nd, eng_id, qid)) + continue; + queue = ndmar_get_queue(eng, qid); + ring = ndmar_get_ring(queue); + + // ring is unallocated, we can use it + if (ndmar_h2t_ring_claim(nd, eng_id, ring)) { + // For historical reasons, there are two ways we keep track of and manage queue ownership. DMAs that + // are managed by the driver on behalf of a process are tagged by nc_id that is used by the process, + // DMAs managed elsewhere (such as DMAs used NX refill) are tagged with PID. The tagging determines + // where the DMA resources are cleaned up. + if (h2t) { + // queue is used for driver memcopy + ret = ndmar_h2t_ring_alloc(nd, nc_id, qid); + if (ret) { + pr_err("nd%d:nc%d H2T ring allocation for qid:%d failed - %d\n", nd->device_index, nc_id, qid, ret); + ring->h2t_allocated = false; + goto done; + } + ret = ndmar_h2t_ring_init(eng, qid); + if (ret) { + ndmar_h2t_ring_free(ring); + pr_err("nd%d:nc%d H2T ring init for qid:%d failed - %d\n", nd->device_index, nc_id, qid, ret); + ring->h2t_allocated = false; + goto done; + } + } else { + // queue is use by HW or the RT directly + queue->owner = task_tgid_nr(current); + ring->h2t_nc_id = nc_id; + ret = 0; + } + *rqid = qid; + break; + } + } + +done: + return ret; +} + +int ndmar_h2t_ring_release(struct neuron_device *nd, int nc_id, int qid) +{ + int ret = 0; + const int eng_id = ndhal->ndhal_ndmar.ndmar_get_h2t_eng_id(nd, nc_id); + struct ndma_eng *eng; + struct ndma_queue *queue; + struct ndma_ring *ring; + + if (qid >= DMA_MAX_Q_MAX) { + return -EINVAL; + } + + if 
(ndhal->ndhal_ndmar.ndmar_is_h2t_def_q(nd, eng_id, qid)) { + return 0; + } + + eng = ndmar_acquire_engine(nd, eng_id); + if (eng == NULL) { + return -EINVAL; + } + + queue = ndmar_get_queue(eng, qid); + ring = ndmar_get_ring(queue); + + if (!ndmar_h2t_ring_is_allocated(ring) || (ring->h2t_nc_id != nc_id)) { + pr_err("nd%02d: attempting to release ring %d on nc: %d that is not allocated as h2t ring", nd->device_index, qid, nc_id); + ret = -ENXIO; + goto done; + } + + if (ndmar_h2t_ring_is_h2t(ring)) { + ndmar_h2t_ring_free(ring); + } else { + ndmar_h2t_ring_state_clr(ring); + queue->owner = 0; + } + +done: + ndmar_release_engine(eng); + return ret; +} + int ndmar_eng_set_state(struct neuron_device *nd, int eng_id, u32 state) { struct ndma_eng *eng; @@ -532,7 +704,9 @@ static int ndmar_init_nc(struct neuron_device *nd, int nc_idx, bool init_h2t_eng } } - ret = ndmar_h2t_ring_alloc(nd, nc_idx); + const int qid = ndhal->ndhal_ndmar.ndmar_get_h2t_def_qid(nc_idx); + + ret = ndmar_h2t_ring_alloc(nd, nc_idx, qid); if (ret) { pr_err("nd%d:nc%d H2T ring allocation failed - %d\n", nd->device_index, nc_idx, ret); return ret; @@ -541,7 +715,7 @@ static int ndmar_init_nc(struct neuron_device *nd, int nc_idx, bool init_h2t_eng struct ndma_eng *eng = ndmar_acquire_engine(nd, eng_id); if (eng == NULL) return -EINVAL; - const int qid = ndhal->ndhal_ndmar.ndmar_get_h2t_qid(nc_idx); + ret = ndmar_h2t_ring_init(eng, qid); ndmar_release_engine(eng); if (ret) { @@ -579,22 +753,61 @@ int ndmar_init(struct neuron_device *nd) return ndmar_init_ncs(nd, -1); } -static void ndmar_h2t_ring_free(struct neuron_device *nd, int nc_idx, int eng_id) +static void ndmar_h2t_ring_free(struct ndma_ring *ring) { - const int qid = ndhal->ndhal_ndmar.ndmar_get_h2t_qid(nc_idx); - struct ndma_eng *eng = ndmar_acquire_engine(nd, eng_id); - BUG_ON(eng == NULL); - struct ndma_queue *queue = &eng->queues[qid]; - struct ndma_ring *ring = &queue->ring_info; - - if (ring->tx_mc) + if (ring->tx_mc) { 
mc_free(&ring->tx_mc); + ring->tx_mc = NULL; + } - if (ring->rx_mc) + if (ring->rx_mc) { mc_free(&ring->rx_mc); + ring->rx_mc = NULL; + } - if (ring->rxc_mc) + if (ring->rxc_mc) { mc_free(&ring->rxc_mc); + ring->rxc_mc = NULL; + } + + if (ring->h2t_completion_mc) { + mc_free(&ring->h2t_completion_mc); + ring->h2t_completion_mc = NULL; + } + + ndmar_h2t_ring_state_clr(ring); +} + +/* ndmar_h2t_ring_free_all() + * + */ +static void ndmar_h2t_ring_free_all(struct neuron_device *nd, int nc_idx) +{ + struct ndma_eng *eng; + struct ndma_queue *queue; + struct ndma_ring *ring; + const int eng_id = ndhal->ndhal_ndmar.ndmar_get_h2t_eng_id(nd, nc_idx); + int qid; + + eng = ndmar_acquire_engine(nd, eng_id); + if (eng == NULL) { + pr_err("nd%02d: fatal error unable to acquire engine %d", nd->device_index, eng_id); + return; + } + + for (qid = 0; qid < DMA_MAX_Q_MAX; qid++) { + queue = ndmar_get_queue(eng, qid); + ring = ndmar_get_ring(queue); + if (ndmar_h2t_ring_is_allocated(ring) && ring->h2t_nc_id == nc_idx) { + if (ndmar_h2t_ring_is_h2t(ring)) { + // h2t queue free all resources + ndmar_h2t_ring_free(ring); + } else { + // service queue only clear state + ndmar_h2t_ring_state_clr(ring); + } + } + } ndmar_release_engine(eng); } @@ -604,8 +817,7 @@ static void ndmar_close_nc(struct neuron_device *nd, int nc_idx) if (!nd->dmar_init_done[nc_idx]) { return; } - const int eng_id = ndhal->ndhal_ndmar.ndmar_get_h2t_eng_id(nd, nc_idx); - ndmar_h2t_ring_free(nd, nc_idx, eng_id); + ndmar_h2t_ring_free_all(nd, nc_idx); nd->dmar_init_done[nc_idx] = false; } diff --git a/neuron_ring.h b/neuron_ring.h index 726f83a..f031be7 100644 --- a/neuron_ring.h +++ b/neuron_ring.h @@ -44,18 +44,22 @@ struct ndma_h2t_dma_context { }; struct ndma_ring { + // TODO combine all the h2t stuff in a sub structure + struct mutex h2t_ring_lock; + struct udma_ring_ptr h2t_completion; // TODO why are we using udma_ring_ptr... 
+ struct mem_chunk *h2t_completion_mc; + struct ndma_h2t_dma_context h2t_dma_ctx[NEURON_DMA_H2T_CTX_HANDLE_CNT]; + u32 h2t_nc_id; + bool h2t_allocated; // ring can be allocated for standard use or h2t u32 qid; u32 size; //total size - num desc * desc size bool has_compl; struct udma_ring_ptr tx; struct udma_ring_ptr rx; struct udma_ring_ptr rxc; - struct udma_ring_ptr h2t_completion; struct mem_chunk *tx_mc; struct mem_chunk *rx_mc; struct mem_chunk *rxc_mc; - struct mem_chunk *h2t_completion_mc; - struct ndma_h2t_dma_context h2t_dma_ctx[NEURON_DMA_H2T_CTX_HANDLE_CNT]; u32 dram_channel; }; @@ -73,7 +77,6 @@ struct ndma_eng { struct ndma_queue queues[DMA_MAX_Q_MAX]; struct udma udma; bool used_for_h2t; - struct mutex h2t_ring_lock; }; /** @@ -287,4 +290,60 @@ int ndmar_h2t_ring_init(struct ndma_eng *eng, int qid); u32 ndmar_ring_get_desc_count(u32 v); +/** + * ndmar_h2t_ring_request() - request a h2t ring + * + * @nd: Neuron device which contains the DMA engine + * @nc_id: neuron core id + * @h2t: initialize ring to be used for h2t traffic + * @rqid: returned id of the queue + * + */ +int ndmar_h2t_ring_request(struct neuron_device *nd, int nc_id, bool h2t, int *rqid); + +/** + * ndmar_h2t_ring_release() + * + * @nd: Neuron device which contains the DMA engine + * @nc_id: neuron core id + * @qid: id the h2t queue to release + * + */ +int ndmar_h2t_ring_release(struct neuron_device *nd, int nc_id, int qid); + +/** + * ndmar_h2t_ring_is_h2t() - return true if this is an h2t ring + */ +static inline bool ndmar_h2t_ring_is_h2t(struct ndma_ring *ring) +{ + return (ring->h2t_completion_mc != NULL); +} + +/** + * ndmar_h2t_ring_is_owner - return true if this h2t ring is owned by nc_id + * + */ +static inline bool ndmar_h2t_ring_is_owner(struct ndma_ring *ring, int nc_id) +{ + return (nc_id == ring->h2t_nc_id) && ndmar_h2t_ring_is_h2t(ring); +} + +static inline bool ndmar_h2t_ring_is_allocated(struct ndma_ring *ring) +{ + return ring->h2t_allocated; +} + +static inline void 
ndmar_h2t_ring_state_clr(struct ndma_ring *ring) +{ + ring->h2t_nc_id = -1; + ring->h2t_allocated = false; +} + +/** + * ndmar_qid_valid() - return true if a queue is valid + */ +static inline bool ndmar_qid_valid(int qid) +{ + return ((qid >= 0) && (qid < DMA_MAX_Q_MAX)); +} #endif diff --git a/neuron_topsp.c b/neuron_topsp.c index 339c610..7638fbf 100644 --- a/neuron_topsp.c +++ b/neuron_topsp.c @@ -3,7 +3,7 @@ * Copyright 2021, Amazon.com, Inc. or its affiliates. All Rights Reserved */ -/** Each neuron device has N number of TOP_SPs. (inf1 does not have it). +/** Each neuron device has N number of TOP_SPs. TOP_SPs are only on V2 and after. * * Engine: * ------- diff --git a/share/neuron_driver_shared.h b/share/neuron_driver_shared.h index ecb987e..b1f716c 100644 --- a/share/neuron_driver_shared.h +++ b/share/neuron_driver_shared.h @@ -6,6 +6,7 @@ #define NEURON_DRIVER_SHARED_H #include +#include "neuron_driver_shared_tensor_batch_op.h" enum neuron_driver_feature_flag { NEURON_DRIVER_FEATURE_DMABUF = 1ull << 0, @@ -16,6 +17,7 @@ enum neuron_driver_feature_flag { NEURON_DRIVER_FEATURE_HBM_SCRUB = 1ull << 5, NEURON_DRIVER_FEATURE_MEM_ALLOC64 = 1ull << 6, NEURON_DRIVER_FEATURE_CONTIGUOUS_SCRATCHPAD = 1ull << 7, + NEURON_DRIVER_FEATURE_ZEROCOPY = 1ull << 8, }; // FIXME this should be more generic - like node type. 
@@ -48,6 +50,11 @@ enum neuron_ultraserver_mode { NEURON_ULTRASERVER_MODE_X1 = 4, // 1 node US configuration (standalone) }; +enum neuron_metrics_mode { + NEURON_METRICS_MODE_PERIODIC_ENABLE = 0, // enable periodic posting + NEURON_METRICS_MODE_PERIODIC_DISABLE = 1, // disable periodic posting +}; + #define NEURON_NC_MAP_DEVICE (0xffffffff) enum neuron_dma_queue_type { @@ -90,6 +97,10 @@ enum neuron_dma_h2t_ctx_handle_type { NEURON_DMA_H2T_CTX_HANDLE_CNT = 3 // number of dma }; +/* + * H2T DMA Default Queue id + */ +#define NEURON_DMA_H2T_DEFAULT_QID (-1) /* * NOTE: In runtime version 5, this enum was passed in as a bool instead - @@ -188,6 +199,17 @@ struct neuron_ioctl_nc_map { struct neuron_ioctl_nc_map_entry mappings[]; }; +/* A batch of copy operations */ +typedef struct neuron_memcpy_batch { + __u64 mem_handle; // [in] Source or Destination memory handle from/to data needs to be copied. + __u64 mem_handle_offset; // [in] Memory offset of the memory handle + const nrt_tensor_batch_op_t *ops_ptr; // [in] Pointer to array of operations + __u32 num_ops; // [in] Number of neuron_memcpy_op operations. + __u16 bar4_wr_threshold; // [in] Threshold below which we will use bar4 direct write vs. DMA. Subject to driver limits. + __u16 flags; // [in] TBD. + void *context; // [in] TBD. opaque context pointer passed back in completion queue +} neuron_memcpy_batch_t; + /* * Memory allocation categories for sysfs counters */ diff --git a/share/neuron_driver_shared_tensor_batch_op.h b/share/neuron_driver_shared_tensor_batch_op.h new file mode 100644 index 0000000..59de68e --- /dev/null +++ b/share/neuron_driver_shared_tensor_batch_op.h @@ -0,0 +1,24 @@ +/* + * Shared tensor batch operation between runtime and driver. 
+ */ + +#ifndef NEURON_DRIVER_SHARED_TENSOR_BATCH_OP_H +#define NEURON_DRIVER_SHARED_TENSOR_BATCH_OP_H + +#ifdef __KERNEL__ +#include +typedef __u64 nrt_tensor_batch_offset_t; +typedef __u64 nrt_tensor_batch_size_t; +#else +#include +typedef uint64_t nrt_tensor_batch_offset_t; +typedef uint64_t nrt_tensor_batch_size_t; +#endif + +typedef struct nrt_tensor_batch_op { + nrt_tensor_batch_offset_t offset; + nrt_tensor_batch_size_t size; + void *buffer; +} nrt_tensor_batch_op_t; + +#endif // NEURON_DRIVER_SHARED_TENSOR_BATCH_OP_H diff --git a/udma/udma.h b/udma/udma.h index a14c8ee..ce76549 100644 --- a/udma/udma.h +++ b/udma/udma.h @@ -66,7 +66,8 @@ union udma_desc { #define M2S_DESC_LEN_SHIFT 0 #define M2S_DESC_LEN_MASK (0xffff << M2S_DESC_LEN_SHIFT) /* Data length */ -#define S2M_DESC_INT_EN BIT(28) /* Enable Interrupt on completion */ +#define S2M_DESC_INT_EN BIT(28) /* Enable Interrupt on completion */ +#define S2M_DESC_STRONG_ORDER_WR BIT(29) /* Enable strong order write on this descriptor, V3+ only */ #define S2M_DESC_RING_ID_SHIFT 24 #define S2M_DESC_RING_ID_MASK (0x3 << S2M_DESC_RING_ID_SHIFT) /* Ring ID bits in s2m */ #define S2M_DESC_RING_SHIFT UDMA_S2M_Q_RDRBP_LOW_ADDR_SHIFT @@ -195,7 +196,8 @@ struct udma { enum { UDMA_M2M_BARRIER_NONE = 0, UDMA_M2M_BARRIER_DMB = 1, - UDMA_M2M_BARRIER_WRITE_BARRIER = 2 + UDMA_M2M_BARRIER_WRITE_BARRIER = 2, + UDMA_M2M_BARRIER_SOW = 3 }; /** diff --git a/udma/udma_m2m.c b/udma/udma_m2m.c index 8d0812f..0b0a4eb 100644 --- a/udma/udma_m2m.c +++ b/udma/udma_m2m.c @@ -126,22 +126,18 @@ static int udma_set_max_descs_and_prefetch(struct udma *udma, u8 max_descs) (1 << UDMA_M2S_RD_DESC_PREF_CFG_3_MIN_BURST_BELOW_THR_SHIFT); reg_write32(&udma->udma_regs_m2s->m2s_rd.desc_pref_cfg_3, value); - // likely harmless, but just in case, keep the old V1 behavior where - // we did not change default for s2m. 
V1 support is on the way out, - // once it's deprecated just remove this comment and the "if" - if (narch_get_arch() != NEURON_ARCH_V1) { - value = (pref_thr << UDMA_S2M_RD_DESC_PREF_CFG_3_PREF_THR_SHIFT) | - (min_burst_above_thr << UDMA_S2M_RD_DESC_PREF_CFG_3_MIN_BURST_ABOVE_THR_SHIFT) | - (1 << UDMA_S2M_RD_DESC_PREF_CFG_3_MIN_BURST_BELOW_THR_SHIFT); - reg_write32(&udma->udma_regs_s2m->s2m_rd.desc_pref_cfg_3, value); - // configure max_burst for both m2s and s2m - value = (max_burst << UDMA_AXI_M2S_DESC_RD_CFG_3_MAX_AXI_BEATS_SHIFT) | - (always_break_on_max_boundary << UDMA_AXI_M2S_DESC_RD_CFG_3_ALWAYS_BREAK_ON_MAX_BOUNDARY_SHIFT); - reg_write32(&udma->udma_regs_m2s->axi_m2s.desc_rd_cfg_3, value); - value = (max_burst << UDMA_AXI_S2M_DESC_RD_CFG_3_MAX_AXI_BEATS_SHIFT) | - (always_break_on_max_boundary << UDMA_AXI_S2M_DESC_RD_CFG_3_ALWAYS_BREAK_ON_MAX_BOUNDARY_SHIFT); - reg_write32(&udma->udma_regs_s2m->axi_s2m.desc_rd_cfg_3, value); - } + value = (pref_thr << UDMA_S2M_RD_DESC_PREF_CFG_3_PREF_THR_SHIFT) | + (min_burst_above_thr << UDMA_S2M_RD_DESC_PREF_CFG_3_MIN_BURST_ABOVE_THR_SHIFT) | + (1 << UDMA_S2M_RD_DESC_PREF_CFG_3_MIN_BURST_BELOW_THR_SHIFT); + reg_write32(&udma->udma_regs_s2m->s2m_rd.desc_pref_cfg_3, value); + // configure max_burst for both m2s and s2m + value = (max_burst << UDMA_AXI_M2S_DESC_RD_CFG_3_MAX_AXI_BEATS_SHIFT) | + (always_break_on_max_boundary << UDMA_AXI_M2S_DESC_RD_CFG_3_ALWAYS_BREAK_ON_MAX_BOUNDARY_SHIFT); + reg_write32(&udma->udma_regs_m2s->axi_m2s.desc_rd_cfg_3, value); + value = (max_burst << UDMA_AXI_S2M_DESC_RD_CFG_3_MAX_AXI_BEATS_SHIFT) | + (always_break_on_max_boundary << UDMA_AXI_S2M_DESC_RD_CFG_3_ALWAYS_BREAK_ON_MAX_BOUNDARY_SHIFT); + reg_write32(&udma->udma_regs_s2m->axi_s2m.desc_rd_cfg_3, value); + return 0; } @@ -341,6 +337,9 @@ static int udma_m2m_build_descriptor(union udma_desc *rx_desc_ptr, union udma_de case UDMA_M2M_BARRIER_WRITE_BARRIER: sdma_m2s_set_write_barrier(&meta_ctrl); break; + case UDMA_M2M_BARRIER_SOW: + 
rx_flags |= S2M_DESC_STRONG_ORDER_WR; + break; case UDMA_M2M_BARRIER_NONE: break; default: @@ -355,7 +354,7 @@ static int udma_m2m_build_descriptor(union udma_desc *rx_desc_ptr, union udma_de /* if rx should generate an interrupt make it so */ if (unlikely(set_dst_int)) - rx_flags = S2M_DESC_INT_EN; + rx_flags |= S2M_DESC_INT_EN; return udma_m2m_build_rx_descriptor(rx_desc_ptr, rx_ring_id, d_addr, size, rx_flags); } diff --git a/udma/udma_main.c b/udma/udma_main.c index f13bbb9..710da33 100644 --- a/udma/udma_main.c +++ b/udma/udma_main.c @@ -29,12 +29,13 @@ struct udma_m2s_pkt_len_conf { bool encode_64k_as_zero; }; +#define UDMA_M2S_Q_RATE_LIMIT_MASK_INTERNAL_PAUSE_DMB (1 << 2) /* dma_q flags */ #define UDMA_Q_FLAGS_NO_COMP_UPDATE BIT(1) -/* M2S packet len configuration, configure maximum DMA packets size, i.e. - * the max size of the sum of all descriptors in a packet. Configure +/* M2S packet len configuration, configure maximum DMA packets size, i.e. + * the max size of the sum of all descriptors in a packet. 
Configure * whether len=0 encodes len=64k */ static int udma_m2s_packet_size_cfg_set(struct udma *udma, struct udma_m2s_pkt_len_conf *conf) @@ -60,6 +61,8 @@ static int udma_m2s_packet_size_cfg_set(struct udma *udma, struct udma_m2s_pkt_l return 0; } +#define UDMA_AXI_M2S_DATA_RD_CFG_ALWAYS_BREAK_ON_MAX_BOUDRY (1 << 16) + /* set default configuration of one DMA engine */ static int udma_set_defaults(struct udma *udma) { @@ -117,7 +120,8 @@ static int udma_set_defaults(struct udma *udma) reg_write32(&gen_ex_regs->vmpr_v4[i].tx_sel, 0xffffffff); /* Set M2S data read master configuration */ - ndhal->ndhal_udma.udma_m2s_data_rd_cfg_boundaries_set(udma); + reg_write32(&udma->udma_regs_m2s->axi_m2s.data_rd_cfg, + UDMA_AXI_M2S_DATA_RD_CFG_ALWAYS_BREAK_ON_MAX_BOUDRY | 0x8); /* Ack time out */ reg_write32(&udma->udma_regs_s2m->s2m_comp.cfg_application_ack, 0); @@ -142,19 +146,26 @@ static int udma_set_defaults(struct udma *udma) (0x40 << UDMA_AXI_S2M_OSTAND_CFG_WR_MAX_COMP_DATA_WR_SHIFT); reg_write32(&udma->udma_regs_s2m->axi_s2m.ostand_cfg_wr, value); - // Enable the completion ring head reporting by disabling bit0 - struct udma_gen_regs_v4 __iomem *gen_regs = udma->gen_regs; - if (ndhal->arch == NEURON_ARCH_V1) { - // Keep completion disabled for V1 - // V1 requires this fix to avoid race-condition when resetting the NC instruction buffers - value = 0x1ul; - } else { - ret = reg_read32(&gen_regs->spare_reg.zeroes0, &value); + /* Use ostand_cfg_wr_2 to program max outstanding data writes in v4 to 256+ values + * ostand_cfg_wr is not effective in v4 by default. 
+ */ + if (ndhal->ndhal_arch.arch == NEURON_ARCH_V4) { + ret = reg_read32(&udma->udma_regs_s2m->axi_s2m.ostand_cfg_wr_2, &value); if (ret) { return ret; } - value &= (~0x1ul); - } + value = ((value & (~UDMA_AXI_S2M_OSTAND_CFG_WR_2_MAX_DATA_WR_OSTAND_MASK)) | + (256 << UDMA_AXI_S2M_OSTAND_CFG_WR_2_MAX_DATA_WR_OSTAND_SHIFT)); + reg_write32(&udma->udma_regs_s2m->axi_s2m.ostand_cfg_wr_2, value); + } + + // Enable the completion ring head reporting by disabling bit0 + struct udma_gen_regs_v4 __iomem *gen_regs = udma->gen_regs; + ret = reg_read32(&gen_regs->spare_reg.zeroes0, &value); + if (ret) { + return ret; + } + value &= (~0x1ul); reg_write32(&gen_regs->spare_reg.zeroes0, value); return 0; @@ -422,7 +433,6 @@ static int udma_q_reset(struct udma_q *udma_q) return 0; } - /** Initializes the udma queue data structure. */ static void udma_q_init_internal(struct udma *udma, u32 qid, struct udma_q_params *q_params) @@ -458,7 +468,17 @@ static void udma_q_init_internal(struct udma *udma, u32 qid, struct udma_q_param udma_q->udma = udma; udma_q->qid = qid; - ndhal->ndhal_udma.udma_q_config(udma_q); + if (udma_q->type == UDMA_TX) { + uint32_t *reg_addr; + uint32_t val; + + reg_addr = &udma_q->q_regs->m2s_q.rlimit.mask; + val = udma_q->rlimit_mask; + // enable DMB + val &= ~UDMA_M2S_Q_RATE_LIMIT_MASK_INTERNAL_PAUSE_DMB; + reg_write32(reg_addr, val); + } + /* clear all queue ptrs */ udma_q_reset(udma_q); diff --git a/v2/neuron_dhal_v2.c b/v2/neuron_dhal_v2.c index 61d40ff..465c811 100644 --- a/v2/neuron_dhal_v2.c +++ b/v2/neuron_dhal_v2.c @@ -464,7 +464,6 @@ static void mpset_set_dram_and_mpset_info_v2(struct mempool_set *mpset, u64 *dev /** * mpset_block_carveout_regions() * - in v2, block carve out regions: Upper 16 MB is used internally by firmware - * - in v1, do nothing and just return 0 * * @param nd: neuron device * @param mpset: pointer to mpset @@ -531,25 +530,25 @@ static uint32_t ndmar_get_h2t_eng_id_v2(struct neuron_device *nd, uint32_t nc_id } /** - * 
ndmar_get_h2t_qid() - return the H2T engine's queue id for this core + * ndmar_get_h2t_def_qid() - return the H2T engine's default queue id for this core * * @param nc_id: Neuron core corresponding to H2T engine * Return DMA queue id */ -static int ndmar_get_h2t_qid_v2(uint32_t nc_id) +static int ndmar_get_h2t_def_qid_v2(uint32_t nc_id) { return 0; } /** - * ndmar_is_h2t_q() - return true + * ndmar_is_h2t_def_q() - return true * * @param nd: Neuron device which contains the DMA engine * @param eng_id: engine id * @param q_id: queue id - * Return true if this is an h2t queue + * Return true if this is a default h2t queue */ -static bool ndmar_is_h2t_q_v2(struct neuron_device *nd, uint32_t eng_id, uint32_t q_id) +static bool ndmar_is_h2t_def_q_v2(struct neuron_device *nd, uint32_t eng_id, uint32_t q_id) { return (nd->ndma_engine[eng_id].used_for_h2t && (q_id == 0)); } @@ -699,7 +698,7 @@ const int *inf2_24xl_neighbor_ids[6] = { */ static int fw_io_topology_v2(struct fw_io_ctx *ctx, int pdev_index, int device_id, u32 *connected_device_ids, int *count) { - // V2 does not have the device support to detect east/west/south/north neighbors like V1, + // V2 does not have the device support to detect east/west/south/north neighbors, // so its topology is hardcoded based on instance type. 
*count = 0; @@ -771,6 +770,39 @@ static int fw_io_read_csr_array_v2(void **ptrs, u32 *values, u32 num_csrs, bool } +/** + * fw_io_execute_request() - Execute commands + * + * @param ctx: FWIO context + * @param command_id: command ID + * @param req: Request data + * @param req_size: Request size + * @param resp: Response buffer + * @param resp_size: Response buffer size + * + * @return int: 0 on success, -1 on failure + */ +static int fw_io_execute_request_v2(struct fw_io_ctx *ctx, u8 command_id, const u8 *req, u32 req_size, u8 *resp, u32 resp_size) +{ + return fw_io_execute_request(ctx, command_id, req, req_size, resp, resp_size); +} + + +/** + * fw_io_post_metric() - Post metrics to CW + * + * @param ctx: FWIO context + * @param data: Data to write + * @param size: Size of data + * + * @return int: 0 on success, -1 on failure + */ +static int fw_io_post_metric_v2(struct fw_io_ctx *ctx, u8 *data, u32 size) +{ + return fw_io_post_metric(ctx, data, size); +} + + /* Register Access (read and write) Functions */ /** * reg_read32_array() - read an array of 32bit registers. @@ -841,7 +873,7 @@ static int root_info_node_attrs_info_tbl_cnt_v2 = sizeof(root_info_node_attrs_in * @param attr_info_tbl: the ecc attributes as an array * @return int 0 on success; otherwise on failure * - * Note: ecc errors are only supported by sysfs for V2. TODO: V1 support will be added + * Note: ecc errors are only supported by sysfs for V2. 
*/ static int nsysfsmetric_add_ecc_nodes_v2(struct nsysfsmetric_metrics *metrics, struct nsysfsmetric_node *stats_node, @@ -1064,8 +1096,6 @@ static u32 neuron_pci_routing_id_to_user_id(u32 routing_id) * @param dev: PCI device * @param nd: neuron device * @return int: 0 on success, otherwise on failure - * - * for V1, this function is dummy */ static int neuron_pci_get_device_id_v2(struct neuron_device *nd, struct pci_dev *dev) { @@ -1093,7 +1123,7 @@ static int neuron_pci_get_device_id_v2(struct neuron_device *nd, struct pci_dev return -ENODEV; } - // TODO - TRN1 and INF2 mappings are different - likely all of this and the INF1 should be encapsulated. + // TODO - TRN1 and INF2 mappings are different if (nd->pdev->device == TRN1_DEVICE_ID0) nd->device_index = neuron_pci_routing_id_to_user_id(routing_id); else @@ -1153,7 +1183,7 @@ neuron_pci_device_id_to_rid_map_v2(uint32_t * count, uint32_t * did_to_rid_map) * * - Version 3 of runtime requires 1) aligned memory allocation support 2) SPROT. * - Version 4 of the runtime requires support for DMA queue init w/o already allocated rings (2.7). - * - Version 5 of the runtime requires V2 device renumbering (don't care for V1). + * - Version 5 of the runtime requires V2 device renumbering (don't care before V2). * - Version 6 of the runtime requires ham notification support, * + new V2 reset api for single-tpb reset + new notification init API with force mem realloc/resize. * - Version 7 of the runtime requires udma queue size support for non power of 2 rings + dmabuf support. 
@@ -1172,7 +1202,7 @@ static void ncdev_compatible_version_v2(struct neuron_ioctl_compatible_version * } /** - * ncdev_quiesce_exec_on_proc_exit() - for V1, before resetting DMA, allow current NeuronCore execution to finish and settle + * ncdev_quiesce_exec_on_proc_exit() * * Note: * When a process is killed, the driver resets DMA but there is no @@ -1196,110 +1226,18 @@ static void ncdev_quiesce_exec_on_proc_exit_v2(void) return; } -/** - * ncdev_bar_write_data() - write data to bar - * - * @param nd: neuron device - * @param bar: the BAR to write to - * @param reg_addresses - * @param data: the data to be written into the bar - * @param data_count: the number of data to be written - * @return 0 on success, otherwise failure - * - * V1: - * For BAR0 the addresses are passed as array(random access). - * For BAR2 a single address is provided and driver does sequential writes. - * V2: - * Only BAR0 is used right now. TODO: change runtime ioctl -*/ -static int ncdev_bar_write_data_v2(struct neuron_device *nd, u8 bar, u64 *reg_addresses, u32 *data, u32 data_count) -{ - if (bar == 0) { - int i; - for (i = 0; i < data_count; i++) { - u64 off = reg_addresses[i] - (u64)nd->npdev.bar0; - if (off > nd->npdev.bar0_size) { - return -EINVAL; - } - if (ndhal->ndhal_ndma.ndma_is_bar0_write_blocked(off)) { - return -EINVAL; - } - writel(data[i], nd->npdev.bar0 + off); - trace_bar_write(nd, bar, off, data[i]); - } - } else if (bar == 4) { - // TODO: we don't have any use case for r/w memory over the BAR right now. Disabling. - // - // We'd like to use DMA for r/w of BAR4 because we might expect access to large amounts of data. - // Access via DMA requires an application to own a TPB because it determines which of the h2t DMAs - // are safe to use, otherwise a TPB along with its DMA could be reset while that DMA is used here. - // Don't want/need to solve it now. 
- return -EINVAL; - - /* - dma_addr_t dst_addr = reg_addresses[0] - (u64)nd->npdev.bar0; - - ret = ndma_memcpy(nd, 0, virt_to_phys(data) | ndhal->ndhal_address_map.pci_host_base, dst_addr, data_size); - if (ret) - return ret; - */ - } else { - pr_err("direct BAR%d write is not supported.\n", bar); - return -EINVAL; - } - - return 0; -} - static void ncdev_get_default_tpbs_for_hbm_v2(u32 hbm_index, u32 tpbs[MAX_NC_PER_DEVICE], u32 *tpb_count) { tpbs[0] = hbm_index; *tpb_count = 1; } -/* UDMA Functions */ -#define UDMA_AXI_M2S_DATA_RD_CFG_ALWAYS_BREAK_ON_MAX_BOUDRY (1 << 16) -/** - * udma_m2s_data_rd_cfg_boundaries_set(): set data_rd_cfg to break at 256B boundaries - * - * @param udma: the UDMA structure - * - * for V1, this function is dummy - */ -static void udma_m2s_data_rd_cfg_boundaries_set_v2(struct udma *udma) -{ - reg_write32(&udma->udma_regs_m2s->axi_m2s.data_rd_cfg, - UDMA_AXI_M2S_DATA_RD_CFG_ALWAYS_BREAK_ON_MAX_BOUDRY | 0x8); -} - -#define UDMA_M2S_Q_RATE_LIMIT_MASK_INTERNAL_PAUSE_DMB (1 << 2) -/** - * udma_q_config() - set misc queue configurations - * - * @param udma_q udma_q: the queue data structure - * - * for V1, this function is dummy - */ -static void udma_q_config_v2(struct udma_q *udma_q) -{ - if (udma_q->type != UDMA_TX) { - return; - } - - uint32_t *reg_addr = &udma_q->q_regs->m2s_q.rlimit.mask; - uint32_t val = udma_q->rlimit_mask; - - // enable DMB - val &= ~UDMA_M2S_Q_RATE_LIMIT_MASK_INTERNAL_PAUSE_DMB; - reg_write32(reg_addr, val); -} - /* NDMA Functions */ /** * ndma_get_wait_for_completion_time() - calculate the first and the following wait times for a DMA tranfer completion * - * One full descriptor takes ~4 usec to transfer (64K at 16G/sec) on V2 and ~16 usec to transfer on V1. + * One full descriptor takes ~4 usec to transfer (64K at 16G/sec) on V2. * The last descriptor may be partial, so wait 1/4 64K transfer time for that descriptor. * Also, count includes the completion descriptor so don't include that in the count. 
* @@ -1332,14 +1270,12 @@ static void ndma_get_wait_for_completion_time_v2_emu(u32 count, bool async, u64 /** * ndma_validate_pa() - check the validity of the desc physical addresses - * V1: * west side: PCIEX4_1_BASE: 0x00c00000000000 host: PCIEX8_0_BASE: 0x00400000000000 * If west side is set then even host bit is set. When mc_alloc is called we set only the host bit * and insert into tree.. If some one sets the west side on that PA, then there is no way to check that, * since there could be a tdram address that could have the west side set * (that will look as though host is also set) - * V2: - * similar idea. Just check for valid address allocated in host memory + * Just check for valid address allocated in host memory * * @param nd: the neuron device * @param pa: the desc physical addresses @@ -1594,6 +1530,12 @@ static int npe_pod_ctrl_v2(struct neuron_device *nd, u32 pod_ctrl, enum neuron_u return 0; } +static int perf_set_profile_v2(struct neuron_device *nd, uint32_t profile) +{ + // NOP implementation for v2 + return 0; +} + /** * npe_class_node_id_show_data() - return sysfs class node_id * @@ -1708,8 +1650,6 @@ int ndhal_register_funcs_v2(void) { ndhal->ndhal_address_map.dma_eng_per_nc = V2_DMA_ENG_PER_NC; ndhal->ndhal_address_map.dram_channels = V2_MAX_DRAM_CHANNELS; ndhal->ndhal_reset.reset_poll_interval = V2_NR_RESET_POLL_INTERVAL; - ndhal->ndhal_reset.reset_device_initial_poll_delay = 0; - ndhal->ndhal_reset.reset_tpb_initial_poll_delay = 0; ndhal->ndhal_reset.initiate_max_wait_time = V2_NR_RESET_INIT_MAX_TOTAL_WAIT_TIME_MS; ndhal->ndhal_reset.retry_count = NR_RESET_RETRY_COUNT; ndhal->ndhal_reset.nr_post_reset_config = nr_post_reset_config_v2; @@ -1726,8 +1666,8 @@ int ndhal_register_funcs_v2(void) { ndhal->ndhal_mpset.mpset_set_dram_and_mpset_info = mpset_set_dram_and_mpset_info_v2; ndhal->ndhal_mpset.mpset_block_carveout_regions = mpset_block_carveout_regions_v2; ndhal->ndhal_ndmar.ndmar_get_h2t_eng_id = ndmar_get_h2t_eng_id_v2; - 
ndhal->ndhal_ndmar.ndmar_get_h2t_qid = ndmar_get_h2t_qid_v2; - ndhal->ndhal_ndmar.ndmar_is_h2t_q = ndmar_is_h2t_q_v2; + ndhal->ndhal_ndmar.ndmar_get_h2t_def_qid = ndmar_get_h2t_def_qid_v2; + ndhal->ndhal_ndmar.ndmar_is_h2t_def_q = ndmar_is_h2t_def_q_v2; ndhal->ndhal_ndmar.nr_init_h2t_eng = nr_init_h2t_eng_v2; ndhal->ndhal_ndmar.ndmar_is_nx_ring = ndmar_is_nx_ring_v2; ndhal->ndhal_ndmar.ndmar_quiesce_queues = ndmar_quiesce_queues_v2; @@ -1735,6 +1675,8 @@ int ndhal_register_funcs_v2(void) { ndhal->ndhal_fw_io.fw_io_topology = fw_io_topology_v2; ndhal->ndhal_fw_io.fw_io_register_readless_read_region = fw_io_register_readless_read_region_v2; ndhal->ndhal_fw_io.fw_io_read_csr_array = fw_io_read_csr_array_v2; + ndhal->ndhal_fw_io.fw_io_execute_request = fw_io_execute_request_v2; + ndhal->ndhal_fw_io.fw_io_post_metric = fw_io_post_metric_v2; ndhal->ndhal_mmap.dm_mmap_special = dm_mmap_special_v2; ndhal->ndhal_mmap.mmap_get_bar4_offset = mmap_get_bar4_offset_v2; ndhal->ndhal_sysfs_metrics.root_info_node_attrs_info_tbl_cnt = root_info_node_attrs_info_tbl_cnt_v2; @@ -1753,12 +1695,9 @@ int ndhal_register_funcs_v2(void) { ndhal->ndhal_cdev.ncdev_bar0_write_blocked_addrs = ncdev_bar0_write_blocked_addrs_v2; ndhal->ndhal_cdev.ncdev_compatible_version = ncdev_compatible_version_v2; ndhal->ndhal_cdev.ncdev_quiesce_exec_on_proc_exit = ncdev_quiesce_exec_on_proc_exit_v2; - ndhal->ndhal_cdev.ncdev_bar_write_data = ncdev_bar_write_data_v2; ndhal->ndhal_cdev.ncdev_logical_to_physical_nc_map = NULL; ndhal->ndhal_cdev.ncdev_get_default_tpbs_for_hbm = ncdev_get_default_tpbs_for_hbm_v2; ndhal->ndhal_udma.num_beats = 1024; // >= UDMA_REV_ID_4 - ndhal->ndhal_udma.udma_m2s_data_rd_cfg_boundaries_set = udma_m2s_data_rd_cfg_boundaries_set_v2; - ndhal->ndhal_udma.udma_q_config = udma_q_config_v2; ndhal->ndhal_ndma.ndma_retry_memcpy = true; ndhal->ndhal_ndma.ndma_get_wait_for_completion_time = ndma_get_wait_for_completion_time_v2; ndhal->ndhal_ndma.ndma_validate_pa = ndma_validate_pa_v2; @@ 
-1773,6 +1712,7 @@ int ndhal_register_funcs_v2(void) { ndhal->ndhal_npe.npe_class_node_id_show_data = npe_class_node_id_show_data_v2; ndhal->ndhal_npe.npe_class_server_id_show_data = npe_class_server_id_show_data_v2; ndhal->ndhal_npe.npe_class_ultraserver_mode_show_data = npe_class_ultraserver_mode_show_data_v2; + ndhal->ndhal_perf.perf_set_profile = perf_set_profile_v2; ndhal->ndhal_tpb.pe_xbus_count = 5; ndhal->ndhal_tpb.pe_row_grp_count = 4; ndhal->ndhal_tpb.pe_col_grp_count = 4; diff --git a/v3/neuron_dhal_v3.c b/v3/neuron_dhal_v3.c index c138c30..c6682b4 100644 --- a/v3/neuron_dhal_v3.c +++ b/v3/neuron_dhal_v3.c @@ -6,10 +6,6 @@ #include #include -#include -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 10, 0) -#include -#endif #include "sdma.h" #include "notific.h" @@ -215,6 +211,12 @@ u64 ntpb_pe_idle_cntr_offsets_v3[V3_NC_PER_DEVICE] = V3_TPB_PE_ACTIVITY_COUNTER_OFFSET(V3_TPB_PE_SEQ_QUEUE_PERF_OFFSET(V3_APB_IO_1_USER_SE_1_TPB_1_PE_SEQ_CLUSTER_HOST_VISIBLE_BASE, V3_APB_IO_1_BASE, V3_PCIE_BAR0_APB_IO_1_OFFSET), 4, V3_TPB_ARR_SEQ_QUEUE_PERF_IDLE_CYCLE_CNT_LSB_OFFSET), }; +u32 npe_neighbor_eng_ids_v3[2][2] = +{ + {36, 68}, // Left + {4, 100} // Right +}; + static int ndhal_register_funcs_trn2(void) { if (!ndhal) { pr_err("ndhal is null. 
Can't register functions for trn2."); @@ -227,59 +229,39 @@ static int ndhal_register_funcs_trn2(void) { return 0; } -static bool ndhal_instance_type_pod(void) -{ - static bool instance_type_is_pod = false; -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 10, 0) +/* Instance names + */ #define NEURON_TRN2P_INSTANCE_NAME "trn2p.48xlarge" #define NEURON_TRN2EU_INSTANCE_NAME "trn2eu.48xlarge" #define NEURON_TRN2U_INSTANCE_NAME "trn2u.48xlarge" - static bool initialized = false; - ssize_t len; - ssize_t file_size; - void *buf = NULL; - - if (initialized) { - return instance_type_is_pod; - } - - initialized = true; - - buf = kzalloc(PAGE_SIZE, GFP_KERNEL); - - if (buf == NULL) { - pr_err("failed to allocate buffer to read instance type"); - goto done; - } - - len = kernel_read_file_from_path("/sys/class/dmi/id/product_name", - 0, &buf, 64, &file_size, READING_UNKNOWN); +#define NEURON_TRN2UAC_INSTANCE_NAME "trn2u-ac.24xlarge" +#define NEURON_TRN2PDS_INSTANCE_NAME "trn2es.48xlarge" - if (!len) { - pr_err("read instance type failed"); - goto done; - } +static enum neuron_platform_type ndhal_platform_type_v3(void) +{ + enum neuron_platform_type platform_type = NEURON_PLATFORM_TYPE_INVALID; + char buf[128]; + if (narch_get_instance_type_name(buf, sizeof(buf))) goto done; if ((strncmp(buf, NEURON_TRN2P_INSTANCE_NAME, sizeof(NEURON_TRN2P_INSTANCE_NAME)-1) == 0) || (strncmp(buf, NEURON_TRN2EU_INSTANCE_NAME, sizeof(NEURON_TRN2EU_INSTANCE_NAME)-1) == 0) || - (strncmp(buf, NEURON_TRN2U_INSTANCE_NAME, sizeof(NEURON_TRN2U_INSTANCE_NAME)-1) == 0)) { - instance_type_is_pod = true; + (strncmp(buf, NEURON_TRN2U_INSTANCE_NAME, sizeof(NEURON_TRN2U_INSTANCE_NAME)-1) == 0) || + (strncmp(buf, NEURON_TRN2UAC_INSTANCE_NAME, sizeof(NEURON_TRN2UAC_INSTANCE_NAME)-1) == 0)) { + platform_type = NEURON_PLATFORM_TYPE_ULTRASERVER; + } else if ((strncmp(buf, NEURON_TRN2PDS_INSTANCE_NAME, sizeof(NEURON_TRN2PDS_INSTANCE_NAME)-1) == 0)) { + platform_type = NEURON_PLATFORM_TYPE_PDS; + } else { + platform_type 
= NEURON_PLATFORM_TYPE_STD; } done: - if (buf) { - kfree(buf); - } -#endif if (force_userver) { - instance_type_is_pod = true; + platform_type = NEURON_PLATFORM_TYPE_ULTRASERVER; } - return instance_type_is_pod; + return platform_type; } - - /* Device Reset Functions */ /** * nr_get_tpb_reset_map() - generates a the reset map of all resources associated with resetting a particular TPB @@ -416,7 +398,7 @@ static int nr_wait_for_reset_completion_v3_emu(struct neuron_device *nd) */ static int nr_post_reset_config_v3(struct neuron_device *nd, bool reset_successful) { - if (!ndhal_instance_type_pod()) { + if (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_STD) { return 0; } @@ -744,26 +726,26 @@ static uint32_t ndmar_get_h2t_eng_id_v3(struct neuron_device *nd, uint32_t nc_id } /** - * ndmar_get_h2t_qid() - return the H2T engine's queue id for this core + * ndmar_get_h2t_def_qid() - return the H2T engine's default queue id for this core * * @param nc_id: Neuron core corresponding to H2T engine * Return DMA queue id */ -static int ndmar_get_h2t_qid_v3(uint32_t nc_id) +static int ndmar_get_h2t_def_qid_v3(uint32_t nc_id) { // on V3 on h2t engines are shared between 2 cores so we give the even core queue 0 and the odd core queue 1 return nc_id % V3_NC_PER_SENG; } /** - * ndmar_is_h2t_q() - return true + * ndmar_is_h2t_def_q() - return true * * @param nd: Neuron device which contains the DMA engine * @param eng_id: engine id * @param q_id: queue id - * Return true if this is an h2t queue + * Return true if this is a default h2t queue */ -static bool ndmar_is_h2t_q_v3(struct neuron_device *nd, uint32_t eng_id, uint32_t q_id) +static bool ndmar_is_h2t_def_q_v3(struct neuron_device *nd, uint32_t eng_id, uint32_t q_id) { return (nd->ndma_engine[eng_id].used_for_h2t && ((q_id == 0) || (q_id == 1))); } @@ -915,7 +897,7 @@ const int trn2_32xl_neigbor_ids[16][4] = { */ static int fw_io_topology_v3(struct fw_io_ctx *ctx, int pdev_index, int device_id, u32 
*connected_device_ids, int *count) { - // V3 does not have Pacific support to detect east/west/south/north neighbors like V1, + // V3 does not have Pacific support to detect east/west/south/north neighbors, // so its topology is hardcoded based on instance type. *count = 0; @@ -924,6 +906,9 @@ static int fw_io_topology_v3(struct fw_io_ctx *ctx, int pdev_index, int device_i switch (pdev_index) { case TRN2_DEVICE_ID0: // Trn2 + case TRN3_DEVICE_ID0: // Trn3 + case TRN3_DEVICE_ID1: // Trn3 + // TODO PDS update if (total_neuron_devices % 16 == 0) { int i; *count = 4; @@ -977,6 +962,39 @@ static int fw_io_read_csr_array_v3(void **ptrs, u32 *values, u32 num_csrs, bool } +/** + * fw_io_execute_request() - Execute commands + * + * @param ctx: FWIO context + * @param command_id: command ID + * @param req: Request data + * @param req_size: Request size + * @param resp: Response buffer + * @param resp_size: Response buffer size + * + * @return int: 0 on success, -1 on failure + */ +static int fw_io_execute_request_v3(struct fw_io_ctx *ctx, u8 command_id, const u8 *req, u32 req_size, u8 *resp, u32 resp_size) +{ + return fw_io_execute_request(ctx, command_id, req, req_size, resp, resp_size); +} + + +/** + * fw_io_post_metric() - Post metrics to CW + * + * @param ctx: FWIO context + * @param data: Data to write + * @param size: Size of data + * + * @return int: 0 on success, -1 on failure + */ +static int fw_io_post_metric_v3(struct fw_io_ctx *ctx, u8 *data, u32 size) +{ + return fw_io_post_metric(ctx, data, size); +} + + /* Register Access (read and write) Functions */ /** * reg_read32_array() - read an array of 32bit registers. @@ -1052,7 +1070,7 @@ static int root_info_node_attrs_info_tbl_cnt_v3 = sizeof(root_info_node_attrs_in * @param attr_info_tbl: the ecc attributes as an array * @return int 0 on success; otherwise on failure * - * Note: ecc errors are only supported by sysfs for V2/3. 
TODO: V1 support will be added + * Note: ecc errors are only supported by sysfs for V2/3. */ static int nsysfsmetric_add_ecc_nodes_v3(struct nsysfsmetric_metrics *metrics, struct nsysfsmetric_node *stats_node, @@ -1297,18 +1315,34 @@ static int neuron_pci_handle_dup_routing_id(void) // for V3 rename Neuron devices for better customer experience. // see internal documentation: TRN2-Discovery // map routing id to user id: -static const u32 v3_routing_id_to_user_id[] = { +static const u32 v3_torus_routing_id_to_user_id[] = { 0, 3, 4, 7, 12, 15, 8, 11, 1, 2, 5, 6, 13, 14, 9, 10 }; -#define V3_ROUTING_ID_TBL_SZ (sizeof(v3_routing_id_to_user_id) / sizeof(v3_routing_id_to_user_id[0])) +// map routing id to user id for trn2pds instance type. +// the only hard rule this map needs to follow is +// rid (i*2) and rid (i*2)+1 map to did (n*2) and did (n*2)+1 +// since rid (i*2) and rid (i*2)+1 are on the same JBOG. +static const u32 v3_pds_routing_id_to_user_id[] = { + 0, 1, + 2, 3, + 4, 5, + 6, 7, + 8, 9, + 10, 11, + 12, 13, + 14, 15 }; + +#define V3_ROUTING_ID_TBL_SZ (sizeof(v3_torus_routing_id_to_user_id) / sizeof(v3_torus_routing_id_to_user_id[0])) static u32 neuron_pci_routing_id_to_user_id(u32 routing_id) { - u32 user_id_base = v3_routing_id_to_user_id[ routing_id % V3_ROUTING_ID_TBL_SZ]; - return user_id_base + (routing_id / V3_ROUTING_ID_TBL_SZ) * V3_ROUTING_ID_TBL_SZ; + if (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_PDS) { + return v3_pds_routing_id_to_user_id[ routing_id % V3_ROUTING_ID_TBL_SZ]; + } + return v3_torus_routing_id_to_user_id[ routing_id % V3_ROUTING_ID_TBL_SZ]; } /** @@ -1317,14 +1351,13 @@ static u32 neuron_pci_routing_id_to_user_id(u32 routing_id) * @param dev: PCI device * @param nd: neuron device * @return int: 0 on success, otherwise on failure - * - * for V1, this function is dummy */ static int neuron_pci_get_device_id_v3(struct neuron_device *nd, struct pci_dev *dev) { int ret = 0; int i; u32 routing_id = (u32)-1; + u32 
routing_id_max = MAX_NEURON_DEVICE_COUNT; // Poll the device id until the device is ready for (i = 0; i < 20; i++) { @@ -1340,8 +1373,28 @@ static int neuron_pci_get_device_id_v3(struct neuron_device *nd, struct pci_dev return -ENODEV; } + if (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_PDS) { + u32 server_info = 0; + bool server_id_valid = 0; + u32 server_id = 0; + ret = fw_io_server_info_read(nd->npdev.bar0, &server_info); + if (ret) { + return -ENODEV; + } + + server_id_valid = (server_info >> 15) & 0x1; // TODO PDS we probably need const shift value or macro + if (server_id_valid) { + server_id = server_info & 0x7fff; // TODO PDS we probably need constant mask for this + } else { + pr_err("Could not retrieve valid server id, ret = %d\n", ret); + return -ENODEV; + } + ndhal->ndhal_arch.server_id = server_id; + routing_id_max = MAX_NEURON_DEVICE_COUNT * 2; + } + // TODO - this should be a "valid routing_id check for TRN2 - if (routing_id < 0 || routing_id >= MAX_NEURON_DEVICE_COUNT) { + if (routing_id < 0 || routing_id >= routing_id_max) { pr_err("Invalid device index %u", routing_id); return -ENODEV; } @@ -1374,8 +1427,16 @@ neuron_pci_device_id_to_rid_map_v3(uint32_t * count, uint32_t * did_to_rid_map) switch (ndhal->pci_device_id) { case TRN2_DEVICE_ID0: + case TRN3_DEVICE_ID0: + case TRN3_DEVICE_ID1: for (i = 0; i < total_neuron_devices; i++) { - did_to_rid_map[neuron_pci_routing_id_to_user_id(i)] = i; + u32 routing_id; + if (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_PDS) { + routing_id = i + ndhal->ndhal_arch.server_id * total_neuron_devices; + } else { + routing_id = i; + } + did_to_rid_map[neuron_pci_routing_id_to_user_id(routing_id)] = routing_id; } break; @@ -1408,7 +1469,7 @@ neuron_pci_device_id_to_rid_map_v3(uint32_t * count, uint32_t * did_to_rid_map) * * - Version 3 of runtime requires 1) aligned memory allocation support 2) SPROT. 
* - Version 4 of the runtime requires support for DMA queue init w/o already allocated rings (2.7). - * - Version 5 of the runtime requires V2 device renumbering (don't care for V1). + * - Version 5 of the runtime requires V2 device renumbering (don't care before V2). * - Version 6 of the runtime requires ham notification support, * + new V2 reset api for single-tpb reset + new notification init API with force mem realloc/resize. * - Version 7 of the runtime requires udma queue size support for non power of 2 rings + dmabuf support. @@ -1431,7 +1492,7 @@ static void ncdev_compatible_version_v3(struct neuron_ioctl_compatible_version * } /** - * ncdev_quiesce_exec_on_proc_exit() - for V1, before resetting DMA, allow current NeuronCore execution to finish and settle + * ncdev_quiesce_exec_on_proc_exit() * * Note: * When a process is killed, the driver resets DMA but there is no @@ -1455,61 +1516,6 @@ static void ncdev_quiesce_exec_on_proc_exit_v3(void) return; } -/** - * ncdev_bar_write_data() - write data to bar - * - * @param nd: neuron device - * @param bar: the BAR to write to - * @param reg_addresses - * @param data: the data to be written into the bar - * @param data_count: the number of data to be written - * @return 0 on success, otherwise failure - * - * V1: - * For BAR0 the addresses are passed as array(random access). - * For BAR2 a single address is provided and driver does sequential writes. - * V2/3: - * Only BAR0 is used right now. 
TODO: change runtime ioctl -*/ -static int ncdev_bar_write_data_v3(struct neuron_device *nd, u8 bar, u64 *reg_addresses, u32 *data, u32 data_count) -{ - if (bar == 0) { - int i; - for (i = 0; i < data_count; i++) { - u64 off = reg_addresses[i] - (u64)nd->npdev.bar0; - if (off > nd->npdev.bar0_size) { - return -EINVAL; - } - if (ndhal->ndhal_ndma.ndma_is_bar0_write_blocked(off)) { - return -EINVAL; - } - writel(data[i], nd->npdev.bar0 + off); - trace_bar_write(nd, bar, off, data[i]); - } - } else if (bar == 4) { - // TODO: we don't have any use case for r/w memory over the BAR right now. Disabling. - // - // We'd like to use DMA for r/w of BAR4 because we might expect access to large amounts of data. - // Access via DMA requires an application to own a TPB because it determines which of the h2t DMAs - // are safe to use, otherwise a TPB along with its DMA could be reset while that DMA is used here. - // Don't want/need to solve it now. - return -EINVAL; - - /* - dma_addr_t dst_addr = reg_addresses[0] - (u64)nd->npdev.bar0; - - ret = ndma_memcpy(nd, 0, virt_to_phys(data) | ndhal->ndhal_address_map.pci_host_base, dst_addr, data_size); - if (ret) - return ret; - */ - } else { - pr_err("direct BAR%d write is not supported.\n", bar); - return -EINVAL; - } - - return 0; -} - static void ncdev_get_default_tpbs_for_hbm_v3(u32 hbm_index, u32 tpbs[MAX_NC_PER_DEVICE], u32 *tpb_count) { tpbs[0] = hbm_index * 2; @@ -1517,43 +1523,6 @@ static void ncdev_get_default_tpbs_for_hbm_v3(u32 hbm_index, u32 tpbs[MAX_NC_PER *tpb_count = 2; } -/* UDMA Functions */ -#define UDMA_AXI_M2S_DATA_RD_CFG_ALWAYS_BREAK_ON_MAX_BOUDRY (1 << 16) -/** - * udma_m2s_data_rd_cfg_boundaries_set(): set data_rd_cfg to break at 256B boundaries - * - * @param udma: the UDMA structure - * - * for V1, this function is dummy - */ -static void udma_m2s_data_rd_cfg_boundaries_set_v3(struct udma *udma) -{ - reg_write32(&udma->udma_regs_m2s->axi_m2s.data_rd_cfg, - UDMA_AXI_M2S_DATA_RD_CFG_ALWAYS_BREAK_ON_MAX_BOUDRY | 
0x8); -} - -#define UDMA_M2S_Q_RATE_LIMIT_MASK_INTERNAL_PAUSE_DMB (1 << 2) -/** - * udma_q_config() - set misc queue configurations - * - * @param udma_q udma_q: the queue data structure - * - * for V1, this function is dummy - */ -static void udma_q_config_v3(struct udma_q *udma_q) -{ - if (udma_q->type != UDMA_TX) { - return; - } - - uint32_t *reg_addr = &udma_q->q_regs->m2s_q.rlimit.mask; - uint32_t val = udma_q->rlimit_mask; - - // enable DMB - val &= ~UDMA_M2S_Q_RATE_LIMIT_MASK_INTERNAL_PAUSE_DMB; - reg_write32(reg_addr, val); -} - /* NDMA Functions */ /** @@ -1570,7 +1539,7 @@ static void ndma_get_wait_for_completion_time_v3(u32 count, bool async, u64 *fir { u64 est_wait_time = 2 * (count -1); *first_wait_time = async ? 1 : (est_wait_time - 1); // FIXME - need to adjust for zerocopy - *following_wait_time = (est_wait_time * 100) - *first_wait_time; + *following_wait_time = (est_wait_time * 400) - *first_wait_time; // for some reason getting a timeout when staging some of BERT training graphs. // https://tiny.amazon.com/8jw7wl18 @@ -1592,16 +1561,12 @@ static void ndma_get_wait_for_completion_time_v3_emu(u32 count, bool async, u64 /** * ndma_validate_pa() - check the validity of the desc physical addresses - * V1: * west side: PCIEX4_1_BASE: 0x00c00000000000 host: PCIEX8_0_BASE: 0x00400000000000 * If west side is set then even host bit is set. When mc_alloc is called we set only the host bit * and insert into tree.. If some one sets the west side on that PA, then there is no way to check that, * since there could be a tdram address that could have the west side set * (that will look as though host is also set) - * V2: - * similar idea. Just check for valid address allocated in host memory - * V3: - * similar idea. 
Just check for valid address allocated in host memory + * Just check for valid address allocated in host memory * * @param nd: the neuron device * @param pa: the desc physical addresses @@ -1838,7 +1803,7 @@ static bool ndhal_die_flipped(void) if (force_die_flip) { return true; } - if (!ndhal_instance_type_pod()) { + if (!(ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_ULTRASERVER)) { return false; } @@ -1897,6 +1862,7 @@ static void ndma_get_engines_with_host_connectivity_v3(u32 hbm_index, u32 engine /* POD Functions */ + /** * npe_notify_mark() - api for crwl to notify range marking (core claiming) activities * @@ -1906,7 +1872,7 @@ static void ndma_get_engines_with_host_connectivity_v3(u32 hbm_index, u32 engine */ static void npe_notify_mark_v3(int mark_cnt, bool mark) { - if (ndhal_instance_type_pod()) { + if (ndhal->ndhal_arch.platform_type != NEURON_PLATFORM_TYPE_STD) { npe_notify_mark(mark_cnt, mark); } } @@ -1923,17 +1889,26 @@ static void npe_notify_mark_v3(int mark_cnt, bool mark) */ static int npe_pod_info_v3(u8 *pod_type, u8 *pod_id, u8 *pod_sz, enum neuron_ultraserver_mode *mode, u32 *modes_supported) { - if (!ndhal_instance_type_pod()) { + if ((ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_STD)) { *pod_type = NEURON_POD_TYPE_NONE; *pod_sz = 0; *mode = NEURON_ULTRASERVER_MODE_UNSET; *modes_supported = 0; - } else { + } else if (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_PDS) { + *pod_type = NEURON_POD_TYPE_SWITCH; + npe_get_pod_sz(pod_sz); + npe_get_pod_id(pod_id); + npe_get_pod_mode(mode); + npe_get_pod_modes_supported(modes_supported); + } else if (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_ULTRASERVER) { *pod_type = NEURON_POD_TYPE_P2P; npe_get_pod_sz(pod_sz); npe_get_pod_id(pod_id); npe_get_pod_mode(mode); npe_get_pod_modes_supported(modes_supported); + } else { + pr_err("invalid platform type"); + return -EINVAL; } return 0; } @@ -1947,7 +1922,7 @@ static int npe_pod_info_v3(u8 *pod_type, u8 *pod_id, u8 
*pod_sz, enum neuron_ult */ static int npe_pod_status_v3(u32 *pod_state, s8 *node_id) { - if (!ndhal_instance_type_pod()) { + if (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_STD) { *pod_state = NEURON_POD_E_STATE_SINGLE_NODE; *node_id = -1; return 0; @@ -1967,12 +1942,23 @@ static int npe_pod_status_v3(u32 *pod_state, s8 *node_id) */ static int npe_pod_ctrl_v3(struct neuron_device *nd, u32 pod_ctrl, enum neuron_ultraserver_mode mode, u32 timeout, u32 *pod_state) { - if (!ndhal_instance_type_pod()) { + if (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_STD) { return 0; } return npe_pod_ctrl(nd, pod_ctrl, mode, timeout, pod_state); } +static int perf_set_profile_v3(struct neuron_device *nd, uint32_t profile) +{ + int ret; + ret = fw_io_set_power_profile(nd->fw_io_ctx, profile); + if (ret == 0) { + ndhal->ndhal_perf.current_performance_profile = profile; + nmetric_set_performance_profile(nd, profile); + } + return ret; +} + /** * npe_class_node_id_show_data() - return sysfs class node_id * @@ -1982,7 +1968,7 @@ static int npe_pod_ctrl_v3(struct neuron_device *nd, u32 pod_ctrl, enum neuron_u */ static ssize_t npe_class_node_id_show_data_v3(char *buf, u32 sz) { - if (!ndhal_instance_type_pod()) { + if (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_STD) { return dhal_sysfs_emit(buf, "-1\n"); } return npe_class_node_id_show_data(buf, sz); @@ -1997,7 +1983,7 @@ static ssize_t npe_class_node_id_show_data_v3(char *buf, u32 sz) */ static ssize_t npe_class_server_id_show_data_v3(char *buf, u32 sz) { - if (!ndhal_instance_type_pod()) { + if (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_STD) { return dhal_sysfs_emit(buf, "0000000000000000\n"); } return npe_class_server_id_show_data(buf, sz); @@ -2011,7 +1997,7 @@ static ssize_t npe_class_server_id_show_data_v3(char *buf, u32 sz) */ static ssize_t npe_class_ultraserver_mode_show_data_v3(char *buf) { - if (!ndhal_instance_type_pod()) { + if (ndhal->ndhal_arch.platform_type == 
NEURON_PLATFORM_TYPE_STD) { return dhal_sysfs_emit(buf, "\n"); } return npe_class_ultraserver_mode_show_data(buf); @@ -2053,7 +2039,7 @@ static int ntpb_pe_get_aggregated_wl_cycle_cnt_v3(struct neuron_device *nd, int */ static void ndhal_ext_cleanup_v3(void) { - if (ndhal_instance_type_pod()) { + if (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_ULTRASERVER) { npe_cleanup(); } return; @@ -2083,6 +2069,7 @@ int ndhal_register_funcs_v3(void) { return -EINVAL; } + ndhal->ndhal_arch.platform_type = ndhal_platform_type_v3(); ndhal->ndhal_address_map.pci_host_base = V3_PCIE_A0_BASE; ndhal->ndhal_address_map.mmap_p_offset = V3_MMAP_P_OFFSET; ndhal->ndhal_address_map.mmap_nc_event_offset = V3_MMAP_NC_EVENT_OFFSET; @@ -2102,8 +2089,6 @@ int ndhal_register_funcs_v3(void) { ndhal->ndhal_address_map.dma_eng_per_nc = V3_DMA_ENG_PER_NC; ndhal->ndhal_address_map.dram_channels = V3_MAX_DRAM_CHANNELS; ndhal->ndhal_reset.reset_poll_interval = V3_NR_RESET_POLL_INTERVAL; - ndhal->ndhal_reset.reset_device_initial_poll_delay = 0; - ndhal->ndhal_reset.reset_tpb_initial_poll_delay = 0; ndhal->ndhal_reset.initiate_max_wait_time = V3_NR_RESET_INIT_MAX_TOTAL_WAIT_TIME_MS; ndhal->ndhal_reset.retry_count = NR_RESET_RETRY_COUNT; ndhal->ndhal_reset.nr_post_reset_config = nr_post_reset_config_v3; @@ -2120,8 +2105,8 @@ int ndhal_register_funcs_v3(void) { ndhal->ndhal_mpset.mpset_set_dram_and_mpset_info = mpset_set_dram_and_mpset_info_v3; ndhal->ndhal_mpset.mpset_block_carveout_regions = mpset_block_carveout_regions_v3; ndhal->ndhal_ndmar.ndmar_get_h2t_eng_id = ndmar_get_h2t_eng_id_v3; - ndhal->ndhal_ndmar.ndmar_get_h2t_qid = ndmar_get_h2t_qid_v3; - ndhal->ndhal_ndmar.ndmar_is_h2t_q = ndmar_is_h2t_q_v3; + ndhal->ndhal_ndmar.ndmar_get_h2t_def_qid = ndmar_get_h2t_def_qid_v3; + ndhal->ndhal_ndmar.ndmar_is_h2t_def_q = ndmar_is_h2t_def_q_v3; ndhal->ndhal_ndmar.nr_init_h2t_eng = nr_init_h2t_eng_v3; ndhal->ndhal_ndmar.ndmar_is_nx_ring = ndmar_is_nx_ring_v3; 
ndhal->ndhal_ndmar.ndmar_quiesce_queues = ndmar_quiesce_queues_v3; @@ -2129,6 +2114,8 @@ int ndhal_register_funcs_v3(void) { ndhal->ndhal_fw_io.fw_io_topology = fw_io_topology_v3; ndhal->ndhal_fw_io.fw_io_register_readless_read_region = fw_io_register_readless_read_region_v3; ndhal->ndhal_fw_io.fw_io_read_csr_array = fw_io_read_csr_array_v3; + ndhal->ndhal_fw_io.fw_io_execute_request = fw_io_execute_request_v3; + ndhal->ndhal_fw_io.fw_io_post_metric = fw_io_post_metric_v3; ndhal->ndhal_mmap.dm_mmap_special = dm_mmap_special_v3; ndhal->ndhal_mmap.mmap_get_bar4_offset = mmap_get_bar4_offset_v3; ndhal->ndhal_sysfs_metrics.root_info_node_attrs_info_tbl_cnt = root_info_node_attrs_info_tbl_cnt_v3; @@ -2137,6 +2124,7 @@ int ndhal_register_funcs_v3(void) { ndhal->ndhal_sysfs_metrics.nsysfsmetric_get_hbm_error_count = nsysfsmetric_get_hbm_error_count_v3; ndhal->ndhal_sysfs_metrics.nsysfsmetric_add_tensor_engine_node = nsysfsmetric_add_tensor_engine_node_v3; ndhal->ndhal_pci.axi_bar = BAR_UNUSED; + ndhal->ndhal_pci.apb_bar = 0; ndhal->ndhal_pci.dram_bar = 4; ndhal->ndhal_pci.neuron_pci_release_bar = neuron_pci_release_bar_v3; ndhal->ndhal_pci.neuron_pci_reserve_bar = neuron_pci_reserve_bar_v3; @@ -2147,12 +2135,9 @@ int ndhal_register_funcs_v3(void) { ndhal->ndhal_cdev.ncdev_bar0_write_blocked_addrs = ncdev_bar0_write_blocked_addrs_v3; ndhal->ndhal_cdev.ncdev_compatible_version = ncdev_compatible_version_v3; ndhal->ndhal_cdev.ncdev_quiesce_exec_on_proc_exit = ncdev_quiesce_exec_on_proc_exit_v3; - ndhal->ndhal_cdev.ncdev_bar_write_data = ncdev_bar_write_data_v3; ndhal->ndhal_cdev.ncdev_logical_to_physical_nc_map = ncdev_logical_to_physical_nc_map_v3; ndhal->ndhal_cdev.ncdev_get_default_tpbs_for_hbm = ncdev_get_default_tpbs_for_hbm_v3; ndhal->ndhal_udma.num_beats = 2296; // allow up to 288 outstanding writes - ndhal->ndhal_udma.udma_m2s_data_rd_cfg_boundaries_set = udma_m2s_data_rd_cfg_boundaries_set_v3; - ndhal->ndhal_udma.udma_q_config = udma_q_config_v3; 
ndhal->ndhal_ndma.ndma_retry_memcpy = false; ndhal->ndhal_ndma.ndma_get_wait_for_completion_time = ndma_get_wait_for_completion_time_v3; ndhal->ndhal_ndma.ndma_validate_pa = ndma_validate_pa_v3; @@ -2167,6 +2152,8 @@ int ndhal_register_funcs_v3(void) { ndhal->ndhal_npe.npe_class_node_id_show_data = npe_class_node_id_show_data_v3; ndhal->ndhal_npe.npe_class_server_id_show_data = npe_class_server_id_show_data_v3; ndhal->ndhal_npe.npe_class_ultraserver_mode_show_data = npe_class_ultraserver_mode_show_data_v3; + ndhal->ndhal_npe.npe_neighbor_eng_ids = npe_neighbor_eng_ids_v3; + ndhal->ndhal_perf.perf_set_profile = perf_set_profile_v3; ndhal->ndhal_tpb.pe_xbus_count = 9; ndhal->ndhal_tpb.pe_row_grp_count = 4; ndhal->ndhal_tpb.pe_col_grp_count = 4; @@ -2176,6 +2163,7 @@ int ndhal_register_funcs_v3(void) { ndhal->ndhal_tpb.pe_fast_wl_cntr_offsets = ntpb_pe_fast_wl_cntr_offsets_v3; ndhal->ndhal_tpb.pe_idle_cntr_offsets = ntpb_pe_idle_cntr_offsets_v3; ndhal->ndhal_tpb.pe_get_aggregated_wl_cycle_cnt = ntpb_pe_get_aggregated_wl_cycle_cnt_v3; + ndhal->ndhal_perf.current_performance_profile = 0; ndhal->ndhal_ext_cleanup = ndhal_ext_cleanup_v3; extern unsigned int nmetric_log_posts; @@ -2185,7 +2173,6 @@ int ndhal_register_funcs_v3(void) { ndhal->ndhal_reset.nr_wait_for_reset_completion = nr_wait_for_reset_completion_v3_qemu; ndhal->ndhal_address_map.dma_eng_per_nd = V3_NC_PER_DEVICE * V3_DMA_ENG_PER_NC; ndhal->ndhal_reg_access.reg_read32_array = reg_read32_array_v3_qemu_emu; - ndhal->ndhal_pci.apb_bar = 2; ndhal->ndhal_ndma.ndma_get_wait_for_completion_time = ndma_get_wait_for_completion_time_v3_qemu; ndhal->ndhal_address_map.dice_per_device = 1; @@ -2199,24 +2186,27 @@ int ndhal_register_funcs_v3(void) { ndhal->ndhal_address_map.nc_per_device = nc_per_dev_param; ndhal->ndhal_address_map.dev_nc_map = dev_nc_map; ndhal->ndhal_reg_access.reg_read32_array = reg_read32_array_v3_qemu_emu; - ndhal->ndhal_pci.apb_bar = 0; ndhal->ndhal_ndma.ndma_get_wait_for_completion_time = 
ndma_get_wait_for_completion_time_v3_emu; ndhal->ndhal_address_map.dice_per_device = 1; // Disable metrics on emulation nmetric_log_posts = 0; - } else { ndhal->ndhal_reset.nr_initiate_reset = nr_initiate_reset_v3; ndhal->ndhal_reset.nr_wait_for_reset_completion = nr_wait_for_reset_completion_v3; ndhal->ndhal_address_map.dma_eng_per_nd = V3_NC_PER_DEVICE * V3_DMA_ENG_PER_NC; ndhal->ndhal_reg_access.reg_read32_array = reg_read32_array_v3; - ndhal->ndhal_pci.apb_bar = 0; } - if (ndhal_instance_type_pod()) { + if (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_ULTRASERVER) { ret = npe_init(); - } + if (ret) { + pr_err("failed to initialize pod election on V3\n"); + return ret; + } + } else if (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_PDS) { + // TODO PDS + } switch (ndhal->pci_device_id) { case TRN2_DEVICE_ID0: @@ -2226,6 +2216,10 @@ int ndhal_register_funcs_v3(void) { return ret; } break; + // TODO: remove once v4 dhal stops re-using v3 + case TRN3_DEVICE_ID0: + case TRN3_DEVICE_ID1: + break; default: pr_err("Unknown HW architecture. 
Can't init neuron_dhal.\n"); return -EINVAL; diff --git a/v3/neuron_pelect.c b/v3/neuron_pelect.c index 7bcb027..a9d1b0d 100644 --- a/v3/neuron_pelect.c +++ b/v3/neuron_pelect.c @@ -157,6 +157,16 @@ #include "../neuron_crwl.h" #include "neuron_pelect.h" +int userver_pds_node_cnt = 2; +module_param(userver_pds_node_cnt, int, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP); +MODULE_PARM_DESC(userver_pds_node_cnt, "pds ultraserver node count"); + +int userver_pds_server_id = 0x0001; +module_param(userver_pds_server_id, int, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP); +MODULE_PARM_DESC(userver_pds_server_id, "pds ultraserver id"); + + +/* Enable ultraserver auto election (4 node configuration) by default */ /* * UltraServer ctl to * - control of when election is triggered @@ -283,6 +293,8 @@ typedef struct pod_neighbor_io { struct mem_chunk *data_mc; } pod_neighbor_io_t; +static void npe_pds_spoof(void); + static bool npe_pod_ctl_is_set(int value) { return (pod_ctl & value); @@ -844,11 +856,11 @@ static int npe_primary_device_do_election(struct neuron_device *nd, int secondar // Initialize neighbor io structures // Left - ret = npe_pod_neighbor_io_init(&(pnio[0][0]), nd, 36); - ret |= npe_pod_neighbor_io_init(&(pnio[0][1]), nd, 68); + ret = npe_pod_neighbor_io_init(&(pnio[0][0]), nd, ndhal->ndhal_npe.npe_neighbor_eng_ids[0][0]); + ret |= npe_pod_neighbor_io_init(&(pnio[0][1]), nd, ndhal->ndhal_npe.npe_neighbor_eng_ids[0][1]); // Right - ret |= npe_pod_neighbor_io_init(&(pnio[1][0]), nd, 4); - ret |= npe_pod_neighbor_io_init(&(pnio[1][1]), nd, 100); + ret |= npe_pod_neighbor_io_init(&(pnio[1][0]), nd, ndhal->ndhal_npe.npe_neighbor_eng_ids[1][0]); + ret |= npe_pod_neighbor_io_init(&(pnio[1][1]), nd, ndhal->ndhal_npe.npe_neighbor_eng_ids[1][1]); if (ret) { pr_err("neighbor io initialization failed"); goto done; @@ -914,7 +926,6 @@ static int npe_primary_device_do_election(struct neuron_device *nd, int secondar // determine our node id node cnt and pod serial number // node_id = 
npe_get_node_id(serial_number, nbr_serial_number[0], nbr_serial_number[1], diagonal, &node_cnt, &pod_serial_number); - ret = 0; // set election status, with bad node id // @@ -988,11 +999,11 @@ static int npe_secondary_device_vet(struct neuron_device *nd, volatile long unsi // Initialize neighbor io structures // Left - ret = npe_pod_neighbor_io_init(&(pnio[0][0]), nd, 36); - ret |= npe_pod_neighbor_io_init(&(pnio[0][1]), nd, 68); + ret = npe_pod_neighbor_io_init(&(pnio[0][0]), nd, ndhal->ndhal_npe.npe_neighbor_eng_ids[0][0]); + ret |= npe_pod_neighbor_io_init(&(pnio[0][1]), nd, ndhal->ndhal_npe.npe_neighbor_eng_ids[0][1]); // Right - ret |= npe_pod_neighbor_io_init(&(pnio[1][0]), nd, 4); - ret |= npe_pod_neighbor_io_init(&(pnio[1][1]), nd, 100); + ret |= npe_pod_neighbor_io_init(&(pnio[1][0]), nd, ndhal->ndhal_npe.npe_neighbor_eng_ids[1][0]); + ret |= npe_pod_neighbor_io_init(&(pnio[1][1]), nd, ndhal->ndhal_npe.npe_neighbor_eng_ids[1][1]); if (ret) { pr_err("nd%02d: neighbor io initialization failed", nd->device_index); @@ -1041,6 +1052,9 @@ static int npe_secondary_device_vet(struct neuron_device *nd, volatile long unsi ret = -EPIPE; } } + if (ret) { + goto done; + } // set election status, check neighbor's election status, and // clear election data (but not election status) from miscram. @@ -1154,7 +1168,7 @@ int npe_election_exec_on_rst(struct neuron_device *nd, bool reset_successful) // Device 0 is the primary actor in the election/topology discovery process, so // when we process Device 0 reset completions, we need to do some bookkeeping. // - if (nd->device_index == 0) { + if ((nd->device_index == 0) && (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_ULTRASERVER)) { // Prior election results are cached in miscram, for testing purposes, // we can clear the results through a module parameter, allowing us // to ignore the cached results. 
@@ -1188,6 +1202,13 @@ int npe_election_exec_on_rst(struct neuron_device *nd, bool reset_successful) goto done; } } + + // spoof PDS topology/election data + // + if (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_PDS) { + npe_pds_spoof(); + goto done; + } // if we aren't kicking off election on first driver reset (testing) or // if we aren't in init state then we've already made an election decision. @@ -1201,7 +1222,7 @@ int npe_election_exec_on_rst(struct neuron_device *nd, bool reset_successful) if (!npe_all_rst_complete()) { goto done; } - + npe_initiate_election(ndhal_pelect_data.nbr_data_read_timeout); done: @@ -1563,6 +1584,12 @@ int npe_pod_ctrl(struct neuron_device *nd, u32 ctrl, enum neuron_ultraserver_mod } else if (ctrl == NEURON_NPE_POD_CTRL_REQ_POD) { int mark_cnt = ncrwl_range_mark_cnt_get(); + // no election required on PDS, return success + if (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_PDS) { + ret = 0; + goto done; + } + if ((mark_cnt == 0) && npe_all_rst_complete()) { npe_initiate_election(timeout * 1000); ret = 0; @@ -1774,6 +1801,42 @@ ssize_t npe_class_ultraserver_mode_show_data(char *buf) return dhal_sysfs_emit(buf, "%s\n", output); } +/* npe_pds_spoof(void) + * + * temp spoof of PDS platform data + * + */ +static void npe_pds_spoof(void) +{ + static bool initialized = false; + pr_info("spoofing pds data"); + + if (initialized) { + return; + } + + ndhal_pelect_data.node_cnt = userver_pds_node_cnt; + + if (ndhal_pelect_data.node_cnt == 0) { + ndhal_pelect_data.node_id = -1; + } else if (ndhal_pelect_data.node_cnt == 2) { + // node_cnt of 2 uses V-links + ndhal_pelect_data.lr_mask = 0x1; + ndhal_pelect_data.node_id = ndhal->ndhal_arch.server_id; + } else if (ndhal_pelect_data.node_cnt == 4) { + // TODO PDS add in rack id + ndhal_pelect_data.node_id = ndhal->ndhal_arch.server_id; + } else { + ndhal_pelect_data.node_cnt = 0; + pr_err("invalid PDS node count of %d", ndhal_pelect_data.node_cnt); + } + + 
ndhal_pelect_data.pod_serial_num = userver_pds_server_id; + ndhal_pelect_data.pod_state_internal = NEURON_NPE_POD_ST_ELECTION_SUCCESS; + + initialized = true; +} + int npe_init(void) { // force election to use right or left link only diff --git a/v4/address_map.h b/v4/address_map.h new file mode 100644 index 0000000..4391557 --- /dev/null +++ b/v4/address_map.h @@ -0,0 +1,230 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2025 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#ifndef __V4_ADDR_MAP_H__ + +// Host memory access +#define V4_PCIE_A0_BASE 0x00400000000000ull + +// Neighbor memory access +#define V4_PCIE_B0_0_BASE 0x10000000000000ull +#define V4_PCIE_B0_1_BASE 0x14000000000000ull +#define V4_PCIE_B0_2_BASE 0x18000000000000ull +#define V4_PCIE_B0_3_BASE 0x1c000000000000ull + +// relative to nc +#define V4_MMAP_P_OFFSET 0x0000000d0000000ull +#define V4_MMAP_NC_EVENT_OFFSET 0x00000002700000ull +#define V4_MMAP_NC_SEMA_READ_OFFSET V4_MMAP_NC_EVENT_OFFSET + 0x00000000001000ull +#define V4_MMAP_NC_SEMA_SET_OFFSET V4_MMAP_NC_EVENT_OFFSET + 0x00000000001400ull +#define V4_MMAP_NC_SEMA_INCR_OFFSET V4_MMAP_NC_EVENT_OFFSET + 0x00000000001800ull +#define V4_MMAP_NC_SEMA_DECR_OFFSET V4_MMAP_NC_EVENT_OFFSET + 0x00000000001c00ull +#define V4_MMAP_NC_SEMA_SIZE 0x00000000002000ull + + +#define V4_PCIE_BAR0_APB_IO_0_OFFSET 0x000000000000000ull +#define V4_PCIE_BAR0_APB_IO_1_OFFSET 0x000000040000000ull +#define V4_PCIE_BAR0_APB_SE_0_OFFSET 0x000000080000000ull +#define V4_PCIE_BAR0_APB_SE_1_OFFSET 0x000000090000000ull +#define V4_PCIE_BAR0_APB_SE_2_OFFSET 0x0000000a0000000ull +#define V4_PCIE_BAR0_APB_SE_3_OFFSET 0x0000000b0000000ull +#define V4_PCIE_BAR0_APB_IO_0_SIZE 0x000000020000000ull + + +// relative to bar0 +#define V4_APB_IO_0_USER_IO_RELBASE 0x00000006800000ull +#define V4_APB_IO_0_USER_IO_MISC_RAM_RELBASE 0x00000000484000ull + +// relative to address space +#define V4_APB_MISC_RAM_OFFSET 0x00008006c84000ull + +#define V4_MMAP_NC_SIZE 
0x00000804000000ull + +// +#define V4_NUM_DIE_PER_DEVICE 2 +#define V4_NUM_SENG_PER_DIE 2 +#define V4_SENG_PER_DEVICE (V4_NUM_DIE_PER_DEVICE * V4_NUM_SENG_PER_DIE) + +// Number of Neuron Core per device +#define V4_NC_PER_SENG 2 +#define V4_NC_PER_DIE (V4_NUM_SENG_PER_DIE * V4_NC_PER_SENG) +#define V4_NC_PER_DEVICE (V4_NC_PER_SENG * V4_SENG_PER_DEVICE) +// Number of DMA engines per NC +#define V4_DMA_ENG_PER_NC 16 + +// Number of DMA queues in each engine +#define V4_DMA_QUEUE_PER_ENG 16 + +#define V4_NUM_DMA_ENG_PER_SENG (V4_NC_PER_SENG * V4_DMA_ENG_PER_NC) +#define V4_NUM_SENG_DMA_PER_DEVICE (V4_NC_PER_DEVICE * V4_DMA_ENG_PER_NC) +#define V4_NUM_H2D_DMA_PER_DEVICE 4 +#define V4_NUM_DMA_ENG_PER_DEVICE (V4_NUM_SENG_DMA_PER_DEVICE + V4_NUM_H2D_DMA_PER_DEVICE) +#define V4_MAX_DMA_RINGS 16 + +// Number of TPB engines per NC +#define V4_TPB_ENG_PER_NC 5 + +// Number of TOP_SP +#define V4_TS_PER_NC 2 +#define V4_TS_PER_DIE (V4_NC_PER_DIE * V4_TS_PER_NC) +#define V4_TS_PER_DEVICE (V4_NC_PER_DEVICE * V4_TS_PER_NC) + +// max channels supported by V4 device +#define V4_MAX_DRAM_CHANNELS 4 + +#define V4_SEMAPHORE_COUNT 256 +#define V4_EVENTS_COUNT 256 + +#define V4_ALLOWED_DESC_PER_PACKET 64 + +#define V4_MAX_NQ_QUEUES 16 +#define V4_MAX_NQ_TYPE 5 +#define V4_MAX_NQ_SUPPORTED (V4_MAX_NQ_TYPE * V4_MAX_NQ_QUEUES) + +#define V4_APB_IO_0_BASE 0x00008000000000ull +#define V4_APB_IO_1_BASE 0x00808000000000ull +#define V4_HBM_0_BASE 0x00000000000000ull +#define V4_HBM_1_BASE 0x00004000000000ull +#define V4_HBM_2_BASE 0x00800000000000ull +#define V4_HBM_3_BASE 0x00804000000000ull +#define V4_HBM_SIZE 0x001000000000ull +#define V4_HBM_ACTIVE_SIZE 0x900000000ull /* Each HBM stack is actually 36GB not 64GB as above define suggests */ + +#define V4_PREPROC_0_BASE 0x00001200000000ull +#define V4_PREPROC_1_BASE 0x00005200000000ull +#define V4_PREPROC_2_BASE 0x00801200000000ull +#define V4_PREPROC_3_BASE 0x00805200000000ull +#define V4_PREPROC_SIZE 0x000000034c0000ull + +#define 
V4_MMAP_TPB_0_BASE 0x00002000000000ull +#define V4_MMAP_TPB_1_BASE 0x00003000000000ull +#define V4_MMAP_TPB_2_BASE 0x00006000000000ull +#define V4_MMAP_TPB_3_BASE 0x00007000000000ull +#define V4_MMAP_TPB_4_BASE 0x00802000000000ull +#define V4_MMAP_TPB_5_BASE 0x00803000000000ull +#define V4_MMAP_TPB_6_BASE 0x00806000000000ull +#define V4_MMAP_TPB_7_BASE 0x00807000000000ull + +#define V4_PCIE_BAR0_TPB_0_OFFSET 0x0000000d0000000ull +#define V4_PCIE_BAR0_TPB_1_OFFSET 0x0000000d4000000ull +#define V4_PCIE_BAR0_TPB_2_OFFSET 0x0000000d8000000ull +#define V4_PCIE_BAR0_TPB_3_OFFSET 0x0000000dc000000ull +#define V4_PCIE_BAR0_TPB_4_OFFSET 0x0000000e0000000ull +#define V4_PCIE_BAR0_TPB_5_OFFSET 0x0000000e4000000ull +#define V4_PCIE_BAR0_TPB_6_OFFSET 0x0000000e8000000ull +#define V4_PCIE_BAR0_TPB_7_OFFSET 0x0000000ec000000ull +#define V4_PCIE_BAR0_TPB_SIZE 0x000000004000000ull +#define V4_PCIE_BAR0_TPB_DIST (V4_PCIE_BAR0_TPB_1_OFFSET - V4_PCIE_BAR0_TPB_0_OFFSET) +#define V4_PCIE_BAR0_TPB_SBUF_SIZE 0x000000002000000ull + +#define V4_PCIE_BAR0_APB_IO_0_OFFSET 0x000000000000000ull +#define V4_PCIE_BAR0_APB_IO_1_OFFSET 0x000000040000000ull +#define V4_PCIE_BAR0_APB_IO_DIST (V4_PCIE_BAR0_APB_IO_1_OFFSET - V4_PCIE_BAR0_APB_IO_0_OFFSET) + +#define V4_PCIE_BAR0_APB_SE_0_OFFSET 0x000000080000000ull +#define V4_PCIE_BAR0_APB_SE_1_OFFSET 0x000000090000000ull +#define V4_PCIE_BAR0_APB_SE_2_OFFSET 0x0000000a0000000ull +#define V4_PCIE_BAR0_APB_SE_3_OFFSET 0x0000000b0000000ull +#define V4_PCIE_BAR0_APB_SE_DIST (V4_PCIE_BAR0_APB_SE_1_OFFSET - V4_PCIE_BAR0_APB_SE_0_OFFSET) + +#define V4_PCIE_BAR4_HBM_0_OFFSET 0x000000000000000 +#define V4_PCIE_BAR4_HBM_1_OFFSET 0x000001000000000 +#define V4_PCIE_BAR4_HBM_2_OFFSET 0x000002000000000 +#define V4_PCIE_BAR4_HBM_3_OFFSET 0x000003000000000 + +#define V4_APB_SE_0_USER_FIS_SDMA_0_OFFSET 0x0000000c000000ull +#define V4_APB_SE_1_USER_FIS_SDMA_0_OFFSET 0x0000000c400000ull +#define V4_APB_SE_2_USER_FIS_SDMA_0_OFFSET 0x0000000c000000ull +#define 
V4_APB_SE_3_USER_FIS_SDMA_0_OFFSET 0x0000000c400000ull +#define V4_APB_SE_USER_FIS_SDMA_0_FIS_0_USER_ERRTRIG_OFFSET 0x00000000000000ull +#define V4_APB_SE_USER_FIS_SDMA_0_SIZE 0x00000000020000ull + +#define V4_TOP_SP_SIZE 0x00000000400000ull +#define V4_TOP_SP_0_BASE 0x00008280000000ull +#define V4_TOP_SP_1_BASE 0x000082c0000000ull +#define V4_TOP_SP_2_BASE 0x00008300000000ull +#define V4_TOP_SP_3_BASE 0x00008340000000ull +#define V4_TOP_SP_4_BASE 0x00008380000000ull +#define V4_TOP_SP_5_BASE 0x000083c0000000ull +#define V4_TOP_SP_6_BASE 0x00008400000000ull +#define V4_TOP_SP_7_BASE 0x00008440000000ull +#define V4_TOP_SP_8_BASE 0x00008480000000ull +#define V4_TOP_SP_9_BASE 0x000084c0000000ull +#define V4_TOP_SP_10_BASE 0x00808280000000ull +#define V4_TOP_SP_11_BASE 0x008082c0000000ull +#define V4_TOP_SP_12_BASE 0x00808300000000ull +#define V4_TOP_SP_13_BASE 0x00808340000000ull +#define V4_TOP_SP_14_BASE 0x00808380000000ull +#define V4_TOP_SP_15_BASE 0x008083c0000000ull +#define V4_TOP_SP_16_BASE 0x00808400000000ull +#define V4_TOP_SP_17_BASE 0x00808440000000ull +#define V4_TOP_SP_18_BASE 0x00808480000000ull +#define V4_TOP_SP_19_BASE 0x008084c0000000ull +#define V4_TOP_SP_DIST (V4_TOP_SP_1_BASE - V4_TOP_SP_0_BASE) + +#define V4_PCIE_BAR0_TOP_SP_0_OFFSET 0x0000000f0000000ull +#define V4_PCIE_BAR0_TOP_SP_10_OFFSET 0x0000000f2800000ull +#define V4_PCIE_BAR0_TOP_SP_0_SIZE 0x000000000400000ull + +#define V4_MMAP_TPB_COUNT 8 +#define V4_NUM_DMA_ENGINES_PER_TPB 16 + +#define V4_D2H_0_IDX 128 +#define V4_H2D_0_IDX 129 +#define V4_D2H_1_IDX 130 +#define V4_H2D_1_IDX 131 + +#define V4_APB_SE_0_BASE 0x00001000000000ull +#define V4_APB_SE_1_BASE 0x00005000000000ull +#define V4_APB_SE_2_BASE 0x00801000000000ull +#define V4_APB_SE_3_BASE 0x00805000000000ull +#define V4_APB_SDMA_DIST 0x00000000100000ull + +#define V4_APB_SE_0_SDMA_0_BASE 0x00001002000000ull +#define V4_APB_SE_1_SDMA_0_BASE 0x00005004000000ull +#define V4_APB_SE_2_SDMA_0_BASE 0x00801002000000ull +#define 
V4_APB_SE_3_SDMA_0_BASE 0x00805004000000ull +#define V4_APB_SDMA_MISC_OFFSET 0x00000000040000ull + +#define V4_APB_IO_0_D2H_UDMA_BASE 0x00008006800000ull +#define V4_APB_IO_0_H2D_UDMA_BASE 0x00008006900000ull +#define V4_APB_IO_1_D2H_UDMA_BASE 0x00808006800000ull +#define V4_APB_IO_1_H2D_UDMA_BASE 0x00808006900000ull + +#define V4_APB_SENG_0_SDMA_0_NOTIFIC_RELBASE 0x00000000001000ull +#define V4_APB_IO_0_SE_0_RELBASE 0x00000006000000ull +#define V4_APB_IO_0_SE_1_RELBASE 0x00000006400000ull +#define V4_APB_IO_1_SE_0_RELBASE 0x00000006000000ull +#define V4_APB_IO_1_SE_1_RELBASE 0x00000006400000ull +#define V4_APB_IO_0_SE_0_TPB_0_SIZE 0x00000000180000ull +#define V4_APB_IO_0_SE_0_TPB_TOP_RELBASE 0x00000000000000ull +#define V4_APB_IO_0_SE_0_TPB_TOP_NOTIFIC_RELBASE 0x00000000001000ull +#define V4_APB_IO_0_SE_0_TPB_NOTIFIC_SIZE 0x00000000001000ull +#define V4_APB_SENG_0_SDMA_0_APP_RELBASE 0x00000000000000ull + +#define V4_APB_IO_0_USER_IO_TOP_SP_0_RELBASE 0x00000000200000ull +#define V4_APB_IO_0_USER_IO_TOP_SP_0_SIZE 0x00000000040000ull +#define V4_APB_IO_0_USER_IO_TOP_SP_0_NOTIFIC_RELBASE 0x00000000000000ull + +#define V4_MMAP_BAR0_APB_IO_0_MISC_RAM_OFFSET V4_PCIE_BAR0_APB_IO_0_OFFSET + V4_APB_IO_0_USER_IO_RELBASE + V4_APB_IO_0_USER_IO_MISC_RAM_RELBASE + +#define V4_APB_IO_0_USER_SE_0_RESERVED2_RELBASE 0x00000006300000ull + +#define V4_APB_IO_0_USER_SE_0_TPB_0_PE_SEQ_CLUSTER_HOST_VISIBLE_BASE 0x00008006141000ull +#define V4_APB_IO_0_USER_SE_0_TPB_1_PE_SEQ_CLUSTER_HOST_VISIBLE_BASE 0x000080062c1000ull +#define V4_APB_IO_0_USER_SE_1_TPB_0_PE_SEQ_CLUSTER_HOST_VISIBLE_BASE 0x00008006541000ull +#define V4_APB_IO_0_USER_SE_1_TPB_1_PE_SEQ_CLUSTER_HOST_VISIBLE_BASE 0x000080066c1000ull +#define V4_APB_IO_1_USER_SE_0_TPB_0_PE_SEQ_CLUSTER_HOST_VISIBLE_BASE 0x00808006141000ull +#define V4_APB_IO_1_USER_SE_0_TPB_1_PE_SEQ_CLUSTER_HOST_VISIBLE_BASE 0x008080062c1000ull +#define V4_APB_IO_1_USER_SE_1_TPB_0_PE_SEQ_CLUSTER_HOST_VISIBLE_BASE 0x00808006541000ull +#define 
V4_APB_IO_1_USER_SE_1_TPB_1_PE_SEQ_CLUSTER_HOST_VISIBLE_BASE 0x008080066c1000ull + +#define V4_TPB_ARR_SEQ_QUEUE_PERF_BASE_OFFSET 0x00000000000100ull +#define V4_TPB_ARR_SEQ_QUEUE_PERF_SIZE 0x00000000000030ull +#define V4_TPB_ARR_SEQ_QUEUE_PERF_MATMUL_ACTIVE_CYCLE_CNT_LSB_OFFSET 0x00000000000000ull +#define V4_TPB_ARR_SEQ_QUEUE_PERF_WL_ACTIVE_CYCLE_CNT_LSB_OFFSET 0x00000000000008ull +#define V4_TPB_ARR_SEQ_QUEUE_PERF_IDLE_CYCLE_CNT_LSB_OFFSET 0x00000000000010ull + +#endif diff --git a/v4/neuron_dhal_v4.c b/v4/neuron_dhal_v4.c new file mode 100644 index 0000000..798719b --- /dev/null +++ b/v4/neuron_dhal_v4.c @@ -0,0 +1,468 @@ +// SPDX-License-Identifier: GPL-2.0 +/* +* Copyright 2025, Amazon.com, Inc. or its affiliates. All Rights Reserved +*/ + +#define pr_fmt(fmt) "%s:%s: " fmt, KBUILD_MODNAME, __func__ + +#include +#include + +#include "address_map.h" +#include "../neuron_dhal.h" +#include "../neuron_reset.h" +#include "../neuron_arch.h" +#include "../neuron_cdev.h" +#include "../neuron_pci.h" +#include "../v3/neuron_pelect.h" + + +// TOP SP addresses are sparse on chip adjust to accommodate the table macro +// +#define V4_TOP_SP_GRP1_BASE V4_TOP_SP_0_BASE +#define V4_TOP_SP_GRP2_BASE (V4_TOP_SP_10_BASE - 8 * V4_TOP_SP_DIST) + +#define V4_PCIE_BAR0_TOP_SP_GRP1_OFFSET V4_PCIE_BAR0_TOP_SP_0_OFFSET +#define V4_PCIE_BAR0_TOP_SP_GRP2_OFFSET (V4_PCIE_BAR0_TOP_SP_10_OFFSET - 8 * V4_TOP_SP_SIZE) + +struct neuron_dm_special_mmap_ent dm_mmap_special_v4[] = { + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TPB, 0, NEURON_DM_RESOURCE_SEMAPHORE, V4_MMAP_TPB_0_BASE, V4_PCIE_BAR0_TPB_0_OFFSET, V4_PCIE_BAR0_TPB_DIST, V4_PCIE_BAR0_TPB_SIZE, V4_MMAP_NC_EVENT_OFFSET, V4_MMAP_NC_SEMA_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TPB, 1, NEURON_DM_RESOURCE_SEMAPHORE, V4_MMAP_TPB_0_BASE, V4_PCIE_BAR0_TPB_0_OFFSET, V4_PCIE_BAR0_TPB_DIST, V4_PCIE_BAR0_TPB_SIZE, V4_MMAP_NC_EVENT_OFFSET, V4_MMAP_NC_SEMA_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TPB, 2, NEURON_DM_RESOURCE_SEMAPHORE, 
V4_MMAP_TPB_0_BASE, V4_PCIE_BAR0_TPB_0_OFFSET, V4_PCIE_BAR0_TPB_DIST, V4_PCIE_BAR0_TPB_SIZE, V4_MMAP_NC_EVENT_OFFSET, V4_MMAP_NC_SEMA_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TPB, 3, NEURON_DM_RESOURCE_SEMAPHORE, V4_MMAP_TPB_0_BASE, V4_PCIE_BAR0_TPB_0_OFFSET, V4_PCIE_BAR0_TPB_DIST, V4_PCIE_BAR0_TPB_SIZE, V4_MMAP_NC_EVENT_OFFSET, V4_MMAP_NC_SEMA_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TPB, 4, NEURON_DM_RESOURCE_SEMAPHORE, V4_MMAP_TPB_0_BASE, V4_PCIE_BAR0_TPB_0_OFFSET, V4_PCIE_BAR0_TPB_DIST, V4_PCIE_BAR0_TPB_SIZE, V4_MMAP_NC_EVENT_OFFSET, V4_MMAP_NC_SEMA_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TPB, 5, NEURON_DM_RESOURCE_SEMAPHORE, V4_MMAP_TPB_0_BASE, V4_PCIE_BAR0_TPB_0_OFFSET, V4_PCIE_BAR0_TPB_DIST, V4_PCIE_BAR0_TPB_SIZE, V4_MMAP_NC_EVENT_OFFSET, V4_MMAP_NC_SEMA_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TPB, 6, NEURON_DM_RESOURCE_SEMAPHORE, V4_MMAP_TPB_0_BASE, V4_PCIE_BAR0_TPB_0_OFFSET, V4_PCIE_BAR0_TPB_DIST, V4_PCIE_BAR0_TPB_SIZE, V4_MMAP_NC_EVENT_OFFSET, V4_MMAP_NC_SEMA_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TPB, 7, NEURON_DM_RESOURCE_SEMAPHORE, V4_MMAP_TPB_0_BASE, V4_PCIE_BAR0_TPB_0_OFFSET, V4_PCIE_BAR0_TPB_DIST, V4_PCIE_BAR0_TPB_SIZE, V4_MMAP_NC_EVENT_OFFSET, V4_MMAP_NC_SEMA_SIZE, 0), + + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TPB, 0, NEURON_DM_RESOURCE_SBUF, V4_MMAP_TPB_0_BASE, V4_PCIE_BAR0_TPB_0_OFFSET, V4_PCIE_BAR0_TPB_DIST, V4_PCIE_BAR0_TPB_SIZE, 0, V4_PCIE_BAR0_TPB_SBUF_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TPB, 1, NEURON_DM_RESOURCE_SBUF, V4_MMAP_TPB_0_BASE, V4_PCIE_BAR0_TPB_0_OFFSET, V4_PCIE_BAR0_TPB_DIST, V4_PCIE_BAR0_TPB_SIZE, 0, V4_PCIE_BAR0_TPB_SBUF_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TPB, 2, NEURON_DM_RESOURCE_SBUF, V4_MMAP_TPB_0_BASE, V4_PCIE_BAR0_TPB_0_OFFSET, V4_PCIE_BAR0_TPB_DIST, V4_PCIE_BAR0_TPB_SIZE, 0, V4_PCIE_BAR0_TPB_SBUF_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TPB, 3, NEURON_DM_RESOURCE_SBUF, V4_MMAP_TPB_0_BASE, V4_PCIE_BAR0_TPB_0_OFFSET, V4_PCIE_BAR0_TPB_DIST, 
V4_PCIE_BAR0_TPB_SIZE, 0, V4_PCIE_BAR0_TPB_SBUF_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TPB, 4, NEURON_DM_RESOURCE_SBUF, V4_MMAP_TPB_0_BASE, V4_PCIE_BAR0_TPB_0_OFFSET, V4_PCIE_BAR0_TPB_DIST, V4_PCIE_BAR0_TPB_SIZE, 0, V4_PCIE_BAR0_TPB_SBUF_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TPB, 5, NEURON_DM_RESOURCE_SBUF, V4_MMAP_TPB_0_BASE, V4_PCIE_BAR0_TPB_0_OFFSET, V4_PCIE_BAR0_TPB_DIST, V4_PCIE_BAR0_TPB_SIZE, 0, V4_PCIE_BAR0_TPB_SBUF_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TPB, 6, NEURON_DM_RESOURCE_SBUF, V4_MMAP_TPB_0_BASE, V4_PCIE_BAR0_TPB_0_OFFSET, V4_PCIE_BAR0_TPB_DIST, V4_PCIE_BAR0_TPB_SIZE, 0, V4_PCIE_BAR0_TPB_SBUF_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TPB, 7, NEURON_DM_RESOURCE_SBUF, V4_MMAP_TPB_0_BASE, V4_PCIE_BAR0_TPB_0_OFFSET, V4_PCIE_BAR0_TPB_DIST, V4_PCIE_BAR0_TPB_SIZE, 0, V4_PCIE_BAR0_TPB_SBUF_SIZE, 0), + + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TOPSP, 0, NEURON_DM_RESOURCE_SEMAPHORE, V4_TOP_SP_GRP1_BASE, V4_PCIE_BAR0_TOP_SP_GRP1_OFFSET, V4_TOP_SP_DIST, V4_TOP_SP_SIZE, 0, V4_MMAP_NC_SEMA_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TOPSP, 1, NEURON_DM_RESOURCE_SEMAPHORE, V4_TOP_SP_GRP1_BASE, V4_PCIE_BAR0_TOP_SP_GRP1_OFFSET, V4_TOP_SP_DIST, V4_TOP_SP_SIZE, 0, V4_MMAP_NC_SEMA_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TOPSP, 2, NEURON_DM_RESOURCE_SEMAPHORE, V4_TOP_SP_GRP1_BASE, V4_PCIE_BAR0_TOP_SP_GRP1_OFFSET, V4_TOP_SP_DIST, V4_TOP_SP_SIZE, 0, V4_MMAP_NC_SEMA_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TOPSP, 3, NEURON_DM_RESOURCE_SEMAPHORE, V4_TOP_SP_GRP1_BASE, V4_PCIE_BAR0_TOP_SP_GRP1_OFFSET, V4_TOP_SP_DIST, V4_TOP_SP_SIZE, 0, V4_MMAP_NC_SEMA_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TOPSP, 4, NEURON_DM_RESOURCE_SEMAPHORE, V4_TOP_SP_GRP1_BASE, V4_PCIE_BAR0_TOP_SP_GRP1_OFFSET, V4_TOP_SP_DIST, V4_TOP_SP_SIZE, 0, V4_MMAP_NC_SEMA_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TOPSP, 5, NEURON_DM_RESOURCE_SEMAPHORE, V4_TOP_SP_GRP1_BASE, V4_PCIE_BAR0_TOP_SP_GRP1_OFFSET, V4_TOP_SP_DIST, V4_TOP_SP_SIZE, 0, 
V4_MMAP_NC_SEMA_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TOPSP, 6, NEURON_DM_RESOURCE_SEMAPHORE, V4_TOP_SP_GRP1_BASE, V4_PCIE_BAR0_TOP_SP_GRP1_OFFSET, V4_TOP_SP_DIST, V4_TOP_SP_SIZE, 0, V4_MMAP_NC_SEMA_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TOPSP, 7, NEURON_DM_RESOURCE_SEMAPHORE, V4_TOP_SP_GRP1_BASE, V4_PCIE_BAR0_TOP_SP_GRP1_OFFSET, V4_TOP_SP_DIST, V4_TOP_SP_SIZE, 0, V4_MMAP_NC_SEMA_SIZE, 0), + + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TOPSP, 8, NEURON_DM_RESOURCE_SEMAPHORE, V4_TOP_SP_GRP2_BASE, V4_PCIE_BAR0_TOP_SP_GRP2_OFFSET, V4_TOP_SP_DIST, V4_TOP_SP_SIZE, 0, V4_MMAP_NC_SEMA_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TOPSP, 9, NEURON_DM_RESOURCE_SEMAPHORE, V4_TOP_SP_GRP2_BASE, V4_PCIE_BAR0_TOP_SP_GRP2_OFFSET, V4_TOP_SP_DIST, V4_TOP_SP_SIZE, 0, V4_MMAP_NC_SEMA_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TOPSP, 10, NEURON_DM_RESOURCE_SEMAPHORE, V4_TOP_SP_GRP2_BASE, V4_PCIE_BAR0_TOP_SP_GRP2_OFFSET, V4_TOP_SP_DIST, V4_TOP_SP_SIZE, 0, V4_MMAP_NC_SEMA_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TOPSP, 11, NEURON_DM_RESOURCE_SEMAPHORE, V4_TOP_SP_GRP2_BASE, V4_PCIE_BAR0_TOP_SP_GRP2_OFFSET, V4_TOP_SP_DIST, V4_TOP_SP_SIZE, 0, V4_MMAP_NC_SEMA_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TOPSP, 12, NEURON_DM_RESOURCE_SEMAPHORE, V4_TOP_SP_GRP2_BASE, V4_PCIE_BAR0_TOP_SP_GRP2_OFFSET, V4_TOP_SP_DIST, V4_TOP_SP_SIZE, 0, V4_MMAP_NC_SEMA_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TOPSP, 13, NEURON_DM_RESOURCE_SEMAPHORE, V4_TOP_SP_GRP2_BASE, V4_PCIE_BAR0_TOP_SP_GRP2_OFFSET, V4_TOP_SP_DIST, V4_TOP_SP_SIZE, 0, V4_MMAP_NC_SEMA_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TOPSP, 14, NEURON_DM_RESOURCE_SEMAPHORE, V4_TOP_SP_GRP2_BASE, V4_PCIE_BAR0_TOP_SP_GRP2_OFFSET, V4_TOP_SP_DIST, V4_TOP_SP_SIZE, 0, V4_MMAP_NC_SEMA_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TOPSP, 15, NEURON_DM_RESOURCE_SEMAPHORE, V4_TOP_SP_GRP2_BASE, V4_PCIE_BAR0_TOP_SP_GRP2_OFFSET, V4_TOP_SP_DIST, V4_TOP_SP_SIZE, 0, V4_MMAP_NC_SEMA_SIZE, 0), + + DM_SPECIAL_MM_ENT_( 
NEURON_DM_BLOCK_TOPSP, 0, NEURON_DM_RESOURCE_ALL, V4_TOP_SP_GRP1_BASE, V4_PCIE_BAR0_TOP_SP_GRP1_OFFSET, V4_TOP_SP_DIST, V4_TOP_SP_SIZE, 0, V4_TOP_SP_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TOPSP, 1, NEURON_DM_RESOURCE_ALL, V4_TOP_SP_GRP1_BASE, V4_PCIE_BAR0_TOP_SP_GRP1_OFFSET, V4_TOP_SP_DIST, V4_TOP_SP_SIZE, 0, V4_TOP_SP_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TOPSP, 2, NEURON_DM_RESOURCE_ALL, V4_TOP_SP_GRP1_BASE, V4_PCIE_BAR0_TOP_SP_GRP1_OFFSET, V4_TOP_SP_DIST, V4_TOP_SP_SIZE, 0, V4_TOP_SP_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TOPSP, 3, NEURON_DM_RESOURCE_ALL, V4_TOP_SP_GRP1_BASE, V4_PCIE_BAR0_TOP_SP_GRP1_OFFSET, V4_TOP_SP_DIST, V4_TOP_SP_SIZE, 0, V4_TOP_SP_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TOPSP, 4, NEURON_DM_RESOURCE_ALL, V4_TOP_SP_GRP1_BASE, V4_PCIE_BAR0_TOP_SP_GRP1_OFFSET, V4_TOP_SP_DIST, V4_TOP_SP_SIZE, 0, V4_TOP_SP_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TOPSP, 5, NEURON_DM_RESOURCE_ALL, V4_TOP_SP_GRP1_BASE, V4_PCIE_BAR0_TOP_SP_GRP1_OFFSET, V4_TOP_SP_DIST, V4_TOP_SP_SIZE, 0, V4_TOP_SP_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TOPSP, 6, NEURON_DM_RESOURCE_ALL, V4_TOP_SP_GRP1_BASE, V4_PCIE_BAR0_TOP_SP_GRP1_OFFSET, V4_TOP_SP_DIST, V4_TOP_SP_SIZE, 0, V4_TOP_SP_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TOPSP, 7, NEURON_DM_RESOURCE_ALL, V4_TOP_SP_GRP1_BASE, V4_PCIE_BAR0_TOP_SP_GRP1_OFFSET, V4_TOP_SP_DIST, V4_TOP_SP_SIZE, 0, V4_TOP_SP_SIZE, 0), + + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TOPSP, 8, NEURON_DM_RESOURCE_ALL, V4_TOP_SP_GRP2_BASE, V4_PCIE_BAR0_TOP_SP_GRP2_OFFSET, V4_TOP_SP_DIST, V4_TOP_SP_SIZE, 0, V4_TOP_SP_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TOPSP, 9, NEURON_DM_RESOURCE_ALL, V4_TOP_SP_GRP2_BASE, V4_PCIE_BAR0_TOP_SP_GRP2_OFFSET, V4_TOP_SP_DIST, V4_TOP_SP_SIZE, 0, V4_TOP_SP_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TOPSP, 10, NEURON_DM_RESOURCE_ALL, V4_TOP_SP_GRP2_BASE, V4_PCIE_BAR0_TOP_SP_GRP2_OFFSET, V4_TOP_SP_DIST, V4_TOP_SP_SIZE, 0, V4_TOP_SP_SIZE, 0), + DM_SPECIAL_MM_ENT_( 
NEURON_DM_BLOCK_TOPSP, 11, NEURON_DM_RESOURCE_ALL, V4_TOP_SP_GRP2_BASE, V4_PCIE_BAR0_TOP_SP_GRP2_OFFSET, V4_TOP_SP_DIST, V4_TOP_SP_SIZE, 0, V4_TOP_SP_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TOPSP, 12, NEURON_DM_RESOURCE_ALL, V4_TOP_SP_GRP2_BASE, V4_PCIE_BAR0_TOP_SP_GRP2_OFFSET, V4_TOP_SP_DIST, V4_TOP_SP_SIZE, 0, V4_TOP_SP_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TOPSP, 13, NEURON_DM_RESOURCE_ALL, V4_TOP_SP_GRP2_BASE, V4_PCIE_BAR0_TOP_SP_GRP2_OFFSET, V4_TOP_SP_DIST, V4_TOP_SP_SIZE, 0, V4_TOP_SP_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TOPSP, 14, NEURON_DM_RESOURCE_ALL, V4_TOP_SP_GRP2_BASE, V4_PCIE_BAR0_TOP_SP_GRP2_OFFSET, V4_TOP_SP_DIST, V4_TOP_SP_SIZE, 0, V4_TOP_SP_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TOPSP, 15, NEURON_DM_RESOURCE_ALL, V4_TOP_SP_GRP2_BASE, V4_PCIE_BAR0_TOP_SP_GRP2_OFFSET, V4_TOP_SP_DIST, V4_TOP_SP_SIZE, 0, V4_TOP_SP_SIZE, 0), + + {.block = NEURON_DM_BLOCK_HBM, .block_id = 0, .resource = NEURON_DM_RESOURCE_DMEM, .offset = V4_HBM_0_BASE, .size = V4_HBM_ACTIVE_SIZE, .bar_offset = V4_HBM_SIZE * 0, .bar_num = 4}, + {.block = NEURON_DM_BLOCK_HBM, .block_id = 1, .resource = NEURON_DM_RESOURCE_DMEM, .offset = V4_HBM_1_BASE, .size = V4_HBM_ACTIVE_SIZE, .bar_offset = V4_HBM_SIZE * 1, .bar_num = 4}, + {.block = NEURON_DM_BLOCK_HBM, .block_id = 2, .resource = NEURON_DM_RESOURCE_DMEM, .offset = V4_HBM_2_BASE, .size = V4_HBM_ACTIVE_SIZE, .bar_offset = V4_HBM_SIZE * 2, .bar_num = 4}, + {.block = NEURON_DM_BLOCK_HBM, .block_id = 3, .resource = NEURON_DM_RESOURCE_DMEM, .offset = V4_HBM_3_BASE, .size = V4_HBM_ACTIVE_SIZE, .bar_offset = V4_HBM_SIZE * 3, .bar_num = 4}, + + {NEURON_DM_BLOCK_INVALID, 0, 0, 0, 0, 0}, +}; + +struct ncdev_mem_region ncdev_mem_regions_v4[] = { + { V4_MMAP_TPB_0_BASE, V4_MMAP_NC_SIZE }, // FIXME this is inefficient this may need a routine to slice and range check + { V4_MMAP_TPB_1_BASE, V4_MMAP_NC_SIZE }, + { V4_MMAP_TPB_2_BASE, V4_MMAP_NC_SIZE }, + { V4_MMAP_TPB_3_BASE, V4_MMAP_NC_SIZE }, + { 
V4_MMAP_TPB_4_BASE, V4_MMAP_NC_SIZE }, + { V4_MMAP_TPB_5_BASE, V4_MMAP_NC_SIZE }, + { V4_MMAP_TPB_6_BASE, V4_MMAP_NC_SIZE }, + { V4_MMAP_TPB_7_BASE, V4_MMAP_NC_SIZE }, + { V4_TOP_SP_0_BASE, V4_TOP_SP_SIZE }, // could flatten TOP_SP + { V4_TOP_SP_1_BASE, V4_TOP_SP_SIZE }, + { V4_TOP_SP_2_BASE, V4_TOP_SP_SIZE }, + { V4_TOP_SP_3_BASE, V4_TOP_SP_SIZE }, + { V4_TOP_SP_4_BASE, V4_TOP_SP_SIZE }, + { V4_TOP_SP_5_BASE, V4_TOP_SP_SIZE }, + { V4_TOP_SP_6_BASE, V4_TOP_SP_SIZE }, + { V4_TOP_SP_7_BASE, V4_TOP_SP_SIZE }, + { V4_TOP_SP_8_BASE, V4_TOP_SP_SIZE }, + { V4_TOP_SP_9_BASE, V4_TOP_SP_SIZE }, + { V4_TOP_SP_10_BASE, V4_TOP_SP_SIZE }, + { V4_TOP_SP_11_BASE, V4_TOP_SP_SIZE }, + { V4_TOP_SP_12_BASE, V4_TOP_SP_SIZE }, + { V4_TOP_SP_13_BASE, V4_TOP_SP_SIZE }, + { V4_TOP_SP_14_BASE, V4_TOP_SP_SIZE }, + { V4_TOP_SP_15_BASE, V4_TOP_SP_SIZE }, + { V4_TOP_SP_16_BASE, V4_TOP_SP_SIZE }, + { V4_TOP_SP_17_BASE, V4_TOP_SP_SIZE }, + { V4_TOP_SP_18_BASE, V4_TOP_SP_SIZE }, + { V4_TOP_SP_19_BASE, V4_TOP_SP_SIZE }, + { V4_HBM_0_BASE, V4_HBM_ACTIVE_SIZE }, + { V4_HBM_1_BASE, V4_HBM_ACTIVE_SIZE }, + { V4_HBM_2_BASE, V4_HBM_ACTIVE_SIZE }, + { V4_HBM_3_BASE, V4_HBM_ACTIVE_SIZE }, + { V4_PREPROC_0_BASE, V4_PREPROC_SIZE}, + { V4_PREPROC_1_BASE, V4_PREPROC_SIZE}, + { V4_PREPROC_2_BASE, V4_PREPROC_SIZE}, + { V4_PREPROC_3_BASE, V4_PREPROC_SIZE}, + { NCDEV_MEM_REGION_INVALID, 0 }, +}; + + +u32 npe_neighbor_eng_ids_v4[2][2] = +{ + {40, 72}, // Left + {8, 104} // Right +}; + +static int ndhal_register_funcs_trn3(void) { + if (!ndhal) { + pr_err("ndhal is null. 
Can't register functions for trn3."); + return -EINVAL; + } + ndhal->ndhal_sysfs_metrics.arch_nd_type_suffix = "v4"; + ndhal->ndhal_sysfs_metrics.arch_nc_type_suffix = "v4"; + ndhal->ndhal_sysfs_metrics.arch_instance_suffix = "Trn3"; + ndhal->ndhal_sysfs_metrics.arch_device_name_suffix = "Trainium3"; + return 0; +} + +/* Instance names + */ +#define NEURON_TRN3PDS_INSTANCE_NAME "trn3s.48xlarge" +#define NEURON_TRN3PDS0_INSTANCE_NAME "trn3-dev0.48xlarge" + +static enum neuron_platform_type ndhal_platform_type_v4(void) +{ + enum neuron_platform_type platform_type = NEURON_PLATFORM_TYPE_INVALID; + char buf[128]; + + if (narch_get_instance_type_name(buf, sizeof(buf))) goto done; + if ((strncmp(buf, NEURON_TRN3PDS_INSTANCE_NAME, sizeof(NEURON_TRN3PDS_INSTANCE_NAME)-1) == 0)) { + platform_type = NEURON_PLATFORM_TYPE_PDS; + } else if ((strncmp(buf, NEURON_TRN3PDS0_INSTANCE_NAME, sizeof(NEURON_TRN3PDS_INSTANCE_NAME)-1) == 0)) { + platform_type = NEURON_PLATFORM_TYPE_PDS; + } else { + platform_type = NEURON_PLATFORM_TYPE_STD; + } + +done: + return platform_type; +} + +static bool ndhal_instance_type_3xl(void) +{ + static bool instance_type_is_3xl = false; +#define NEURON_TRN3PD98_3XL_INSTANCE_NAME "trn3pd98.3xlarge" + char buf[128]; + if (narch_get_instance_type_name(buf, sizeof(buf))) goto done; + if (strncmp(buf, NEURON_TRN3PD98_3XL_INSTANCE_NAME, sizeof(NEURON_TRN3PD98_3XL_INSTANCE_NAME)-1) == 0) { + instance_type_is_3xl = true; + } + +done: + return instance_type_is_3xl; +} + + +/* Memory Pool Functions */ +/** + * mpset_set_dram_and_mpset_info() + * - set the address and size of device dram + * - set mpset's num_channels and number of regions in the device pool + * + * @param mpset: pointer to mpset + * @param device_dram_addr: DRAM Channel 0 and 1's addresses + * @param device_dram_size: DRAM Channel 0 and 1's sizes + */ +static void mpset_set_dram_and_mpset_info_v4(struct mempool_set *mpset, u64 *device_dram_addr, u64 *device_dram_size) +{ + mpset->num_channels = 
V4_MAX_DRAM_CHANNELS; + mpset->mp_device_num_regions = 1; + device_dram_addr[0] = V4_HBM_0_BASE; + device_dram_addr[1] = V4_HBM_1_BASE; + device_dram_addr[2] = V4_HBM_2_BASE; + device_dram_addr[3] = V4_HBM_3_BASE; + + if (narch_is_qemu()) { + // Allow qemu setups to dynamically allocate their HBM sizes + const u64 msize = ndhal->ndhal_pci.dram_bar_size / 4; + device_dram_size[0] = msize; + device_dram_size[1] = msize; + device_dram_size[2] = msize; + device_dram_size[3] = msize; + + u32 mem_regions = sizeof(dm_mmap_special_v4) / sizeof(dm_mmap_special_v4[0]); + int i = 0; + for (; i < mem_regions; ++i) { + if ((dm_mmap_special_v4[i].offset == V4_HBM_0_BASE) || + (dm_mmap_special_v4[i].offset == V4_HBM_1_BASE) || + (dm_mmap_special_v4[i].offset == V4_HBM_2_BASE) || + (dm_mmap_special_v4[i].offset == V4_HBM_3_BASE)) { + dm_mmap_special_v4[i].size = msize; + } + } + pr_info("overriding hbm size to %llu bytes", msize); + } else { + device_dram_size[0] = V4_HBM_ACTIVE_SIZE; + device_dram_size[1] = V4_HBM_ACTIVE_SIZE; + device_dram_size[2] = V4_HBM_ACTIVE_SIZE; + device_dram_size[3] = V4_HBM_ACTIVE_SIZE; + } + int i; + for (i = 0; i < mpset->num_channels; i++) { + ndhal->ndhal_mpset.device_dram_end_addr[i] = device_dram_addr[i] + device_dram_size[i]; + } +} + + +/* Memory Map Functions */ +/** + * mmap_get_bar4_offset() - calculate the offset of BAR4 + * + * @param start_addr: start address + * @param size: size of memory + * @param offset: offset of BAR4 + * @return int: 0 on success; negative on failure + */ +static int mmap_get_bar4_offset_v4(u64 start_addr, u64 size, u64 *offset) +{ + u64 hbm_dist = narch_is_qemu() ? 
(ndhal->ndhal_pci.dram_bar_size / 4) : V4_HBM_SIZE; + + if (start_addr >= V4_HBM_0_BASE && start_addr + size < V4_HBM_0_BASE + V4_HBM_ACTIVE_SIZE) + *offset = start_addr; + else if (start_addr >= V4_HBM_1_BASE && start_addr + size < V4_HBM_1_BASE + V4_HBM_ACTIVE_SIZE) + *offset = start_addr - V4_HBM_1_BASE + hbm_dist; + else if (start_addr >= V4_HBM_2_BASE && start_addr + size < V4_HBM_2_BASE + V4_HBM_ACTIVE_SIZE) + *offset = start_addr - V4_HBM_2_BASE + hbm_dist * 2; + else if (start_addr >= V4_HBM_3_BASE && start_addr + size < V4_HBM_3_BASE + V4_HBM_ACTIVE_SIZE) + *offset = start_addr - V4_HBM_3_BASE + hbm_dist * 3; + else + return -EINVAL; + return 0; +} + +extern int dup_helper_enable; +static atomic_t dup_rid_cnt = ATOMIC_INIT(0); // count of duplicate routing IDs encountered +static int neuron_pci_handle_dup_routing_id(void) +{ + int ret = -ENODEV; + int dup_cnt; + char cmd[256]; + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 14, 0) + dup_cnt = atomic_fetch_add(1, &dup_rid_cnt); +#else + dup_cnt = atomic_add_return(1, &dup_rid_cnt) - 1; +#endif + + // If this is the first dup encounted, unload the driver + if ((dup_cnt == 0) && dup_helper_enable) { + pr_err("scheduling unload of %s due to duplicate routing id\n", module_name(THIS_MODULE)); + + int n = snprintf(cmd, sizeof(cmd), "sleep 10;/sbin/modprobe -r %s", module_name(THIS_MODULE)); + if (n > sizeof(cmd)) { + pr_err("unable to schedule driver unload cmd buffer len exceeded\n"); + return -EINVAL; + } + char *argv[] = { "/bin/sh", + "-c", + cmd, + NULL}; + static char *envp[] = { "HOME=/", + "TERM=linux", + "PATH=/sbin:/usr/sbin:/bin:/usr/bin", + NULL}; + + ret = call_usermodehelper( argv[0], argv, envp, UMH_WAIT_EXEC); + if (ret) + pr_err("unable to schedule driver unload. Error: %d\n", ret); + } + + return ret; +} + +// for V4 rename Neuron devices for better customer experience. 
+// see internal documentation: TRN2-Discovery +// map routing id to user id: +static const u32 v4_torus_routing_id_to_user_id[] = { + 0, 3, 4, 7, + 12, 15, 8, 11, + 1, 2, 5, 6, + 13, 14, 9, 10 }; + +// map routing id to user id for trn2pds instance type. +// the only hard rule this map needs to follow is +// rid (i*2) and rid (i*2)+1 map to did (n*2) and did (n*2)+1 +// since rid (i*2) and rid (i*2)+1 are on the same JBOG. +static const u32 v4_pds_routing_id_to_user_id[] = { + 0, 1, + 2, 3, + 4, 5, + 6, 7, + 8, 9, + 10, 11, + 12, 13, + 14, 15 }; + +#define V4_ROUTING_ID_TBL_SZ (sizeof(v4_torus_routing_id_to_user_id) / sizeof(v4_torus_routing_id_to_user_id[0])) + +static u32 neuron_pci_routing_id_to_user_id(u32 routing_id) +{ + if (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_PDS) { + return v4_pds_routing_id_to_user_id[ routing_id % V4_ROUTING_ID_TBL_SZ]; + } + return v4_torus_routing_id_to_user_id[routing_id % V4_ROUTING_ID_TBL_SZ]; +} + +/** + * neuron_pci_get_device_id() - get device id from pacific and set nd->device_index + * + * @param dev: PCI device + * @param nd: neuron device + * @return int: 0 on success, otherwise on failure + */ +static int neuron_pci_get_device_id_v4(struct neuron_device *nd, struct pci_dev *dev) +{ + int ret = 0; + int i; + u32 routing_id = (u32)-1; + u32 routing_id_max = MAX_NEURON_DEVICE_COUNT; + + if (ndhal_instance_type_3xl()) { + // Temporarily auto-assign routing_id to 0 for 3xl instances, since they + // only have 1 device anyways + routing_id = 0; + } else { + // Poll the device id until the device is ready + for (i = 0; i < 20; i++) { + ret = fw_io_device_id_read(nd->npdev.bar0, &routing_id); + if (!ret && routing_id != 0xdeadbeef) { + break; + } + msleep(1000); + } + } + + if (ret) { + pr_err("Could not retrieve device index (read timeout)"); + return -ENODEV; + } + + if (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_PDS) { + u32 server_info = 0; + bool server_id_valid = 0; + u32 server_id = 0; + ret = 
fw_io_server_info_read(nd->npdev.bar0, &server_info); + if (ret) { + return -ENODEV; + } + + server_id_valid = (server_info >> 15) & 0x1; // TODO we probably need const shift value or macro + if (server_id_valid) { + server_id = server_info & 0x7fff; // TODO we probably need constant mask for this + } else { + pr_err("Could not retrieve valid server id, ret = %d\n", ret); + return -ENODEV; + } + ndhal->ndhal_arch.server_id = server_id; + routing_id_max = MAX_NEURON_DEVICE_COUNT * 2; + } + + // TODO - this should be a "valid routing_id check for TRN3 + if (routing_id < 0 || routing_id >= routing_id_max) { + pr_err("Invalid device index %u", routing_id); + return -ENODEV; + } + + nd->device_index = neuron_pci_routing_id_to_user_id(routing_id); + + pr_err("** BDF: %2.2x:%2.2x.%x => nd[%d] (routing id: %u)\n", dev->bus->number, PCI_SLOT(dev->devfn), PCI_FUNC(dev->devfn), nd->device_index, routing_id); + + // protection against duplicate IDs - doesn't provide 100% protection in multi-threaded device discovery + if (neuron_devices[nd->device_index] != NULL) { + pr_err("duplicate routing id %u found\n", routing_id); + neuron_pci_handle_dup_routing_id(); + return -ENODEV; + } + + return 0; +} + +/** + * ndhal_register_funcs_v4() - initialize the dhal for v4 chips + * + * This function only overrides the functions and + * constants that are different from v3 in v4. + */ +int ndhal_register_funcs_v4(void) { + int ret = 0; + + if (!ndhal) { + pr_err("ndhal is null. 
Can't register functions for V4."); + return -EINVAL; + } + + ndhal->ndhal_arch.platform_type = ndhal_platform_type_v4(); + ndhal->ndhal_pci.neuron_pci_get_device_id = neuron_pci_get_device_id_v4; + ndhal->ndhal_npe.npe_neighbor_eng_ids = npe_neighbor_eng_ids_v4; + ndhal->ndhal_mpset.mpset_set_dram_and_mpset_info = mpset_set_dram_and_mpset_info_v4; + ndhal->ndhal_mmap.dm_mmap_special = dm_mmap_special_v4; + ndhal->ndhal_mmap.mmap_get_bar4_offset = mmap_get_bar4_offset_v4; + ndhal->ndhal_cdev.ncdev_mem_regions = ncdev_mem_regions_v4; + + if (narch_is_emu()) { + // Temporarily disable resets on mariana emulation until pacific is ready + extern int no_reset; + no_reset = 1; + } + + // TODO initialization needs refactoring because V4 is piggybacking on V3 + // which risks double calling any hal init functions + // + if (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_ULTRASERVER) { + ret = npe_init(); + if (ret) { + pr_err("failed to initialize pod election on V4\n"); + return ret; + } + } else if (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_PDS) { + //TODO PDS + } + + switch (ndhal->pci_device_id) { + case TRN3_DEVICE_ID0: + case TRN3_DEVICE_ID1: + ret = ndhal_register_funcs_trn3(); + if (ret) { + pr_err("failed to register ndhal funcs for trn3.\n"); + return ret; + } + break; + default: + pr_err("Unknown HW architecture. Can't init neuron_dhal.\n"); + return -EINVAL; + } + + return ret; +} diff --git a/vc/neuron_dhal_vc.c b/vc/neuron_dhal_vc.c index 0f33e6c..e550d48 100644 --- a/vc/neuron_dhal_vc.c +++ b/vc/neuron_dhal_vc.c @@ -136,17 +136,16 @@ static int ntpb_pe_format_activity_stats_vc(struct neuron_device *nd, int nc_id, * ndhal_register_funcs_vc() - initialize the common dhal for all chips * */ -int ndhal_register_funcs_vc(void) { - int ret = 0; - - if (!ndhal) { - pr_err("ndhal is null. Can't register functions for VC."); - return -EINVAL; - } +int ndhal_register_funcs_vc(void) +{ + if (!ndhal) { + pr_err("ndhal is null. 
Can't register functions for VC."); + return -EINVAL; + } ndhal->ndhal_tpb.pe_format_activity_stats = ntpb_pe_format_activity_stats_vc; ndhal->ndhal_tpb.pe_get_counter_val = ntpb_pe_get_counter_val_vc; ndhal->ndhal_tpb.pe_get_row_grp_activity_counter_offset = ntpb_pe_get_row_grp_activity_counter_offset_vc; ndhal->ndhal_tpb.pe_get_fast_wl_cycle_cnt = ntpb_pe_get_fast_wl_cycle_cnt_vc; - return ret; -} \ No newline at end of file + return 0; +}