From ec131bf4749024b05470f6bb72f5273366b2f2c7 Mon Sep 17 00:00:00 2001 From: neuron-code-sharing-robot Date: Sat, 20 Dec 2025 01:35:32 +0000 Subject: [PATCH] Extracted contents of aws-neuronx-dkms-2.25.4.0.noarch.rpm --- Kbuild | 3 +- README.md | 3 - dkms.conf | 2 +- neuron_arch.c | 35 ++ neuron_arch.h | 25 +- neuron_cdev.c | 441 +++++++++++++++- neuron_core.c | 2 +- neuron_device.h | 8 +- neuron_dhal.c | 16 +- neuron_dhal.h | 36 +- neuron_dma.c | 503 +++++++++---------- neuron_dma.h | 93 +++- neuron_dmabuf.c | 9 +- neuron_fw_io.c | 218 +++++++- neuron_fw_io.h | 119 ++++- neuron_ioctl.h | 61 ++- neuron_metrics.c | 240 ++++++--- neuron_metrics.h | 78 ++- neuron_module.c | 8 +- neuron_pci.c | 21 +- neuron_power.c | 8 + neuron_reset.c | 11 +- neuron_ring.c | 266 +++++++++- neuron_ring.h | 67 ++- neuron_topsp.c | 2 +- share/neuron_driver_shared.h | 22 + share/neuron_driver_shared_tensor_batch_op.h | 24 + udma/udma.h | 6 +- udma/udma_m2m.c | 33 +- udma/udma_main.c | 50 +- v2/neuron_dhal_v2.c | 172 +++---- v3/neuron_dhal_v3.c | 356 +++++++------ v3/neuron_pelect.c | 85 +++- v4/address_map.h | 230 +++++++++ v4/neuron_dhal_v4.c | 468 +++++++++++++++++ vc/neuron_dhal_vc.c | 17 +- 36 files changed, 2874 insertions(+), 864 deletions(-) create mode 100644 share/neuron_driver_shared_tensor_batch_op.h create mode 100644 v4/address_map.h create mode 100644 v4/neuron_dhal_v4.c diff --git a/Kbuild b/Kbuild index 5eda22f..94f4589 100644 --- a/Kbuild +++ b/Kbuild @@ -14,10 +14,9 @@ neuron-objs += neuron_dmabuf.o neuron-objs += neuron_log.o neuron-objs += neuron_power.o neuron-objs += vc/neuron_dhal_vc.o -neuron-objs += v1/fw_io.o v1/putils.o v1/neuron_dhal_v1.o neuron-objs += v2/notific.o v2/neuron_dhal_v2.o neuron-objs += v3/notific.o v3/neuron_dhal_v3.o v3/neuron_pelect.o - +neuron-objs += v4/neuron_dhal_v4.o ccflags-y += -O3 -Wall -Werror -Wno-declaration-after-statement -Wunused-macros -Wunused-local-typedefs ccflags-y += -I$(src)/ ccflags-y += $(call cc-option,-march=armv8.2-a) 
diff --git a/README.md b/README.md index e00b7ec..f4525f5 100644 --- a/README.md +++ b/README.md @@ -46,9 +46,6 @@ Neuron Devices implement a communication channel (FWIO) that allows the driver a * neuron_cdev.c - char device interface. * fw_io.[ch] - Communication channel * udma/* - DMA engines and queues HAL -* v1/address_map.h - Neuron Device address space -* v1/putils.h - Notification HAL -* v1/tdma.h - Additional DMA HAL functionality # Compiling and Installing diff --git a/dkms.conf b/dkms.conf index f087853..03f894c 100644 --- a/dkms.conf +++ b/dkms.conf @@ -1,5 +1,5 @@ PACKAGE_NAME=aws-neuronx -PACKAGE_VERSION=2.24.7.0 +PACKAGE_VERSION=2.25.4.0 BUILT_MODULE_NAME[0]="neuron" MAKE[0]="make -C ${kernel_source_dir} M=${dkms_tree}/${PACKAGE_NAME}/${PACKAGE_VERSION}/build" CLEAN="make -C ${kernel_source_dir} M=${dkms_tree}/${PACKAGE_NAME}/${PACKAGE_VERSION}/build clean" diff --git a/neuron_arch.c b/neuron_arch.c index 69a7991..f3a6763 100644 --- a/neuron_arch.c +++ b/neuron_arch.c @@ -10,6 +10,13 @@ #define pr_fmt(fmt) "%s:%s: " fmt, KBUILD_MODNAME, __func__ #include "neuron_arch.h" +#include +#include +#include +#include +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 10, 0) +#include +#endif struct neuron_arch_info { enum neuron_arch arch; @@ -55,3 +62,31 @@ bool narch_is_emu(void) BUG_ON(arch_info.arch == NEURON_ARCH_INVALID); return arch_info.revision == REVID_EMU; } + +int narch_get_instance_type_name(char *instance_type_name, size_t instance_type_name_size) { +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 10, 0) + ssize_t len; + ssize_t file_size; + void *buf = kzalloc(PAGE_SIZE, GFP_KERNEL); + + if (buf == NULL) { + pr_err("failed to allocate buffer to read instance type"); + return -ENOMEM; + } + + len = kernel_read_file_from_path("/sys/class/dmi/id/product_name", + 0, &buf, 64, &file_size, READING_UNKNOWN); + if (!len) { + pr_err("read instance type failed"); + kfree(buf); + return -EIO; + } + + snprintf(instance_type_name, instance_type_name_size, "%s", 
(char *)buf); + + kfree(buf); + return 0; +#else + return -ENOSYS; +#endif +} diff --git a/neuron_arch.h b/neuron_arch.h index 6e438f5..c27e5b9 100644 --- a/neuron_arch.h +++ b/neuron_arch.h @@ -11,12 +11,19 @@ enum neuron_arch { NEURON_ARCH_INVALID, - NEURON_ARCH_V1 = 1, NEURON_ARCH_V2 = 2, NEURON_ARCH_V3 = 3, + NEURON_ARCH_V4 = 4, NEURON_ARCH_NUM }; +enum neuron_platform_type { + NEURON_PLATFORM_TYPE_STD = 0, + NEURON_PLATFORM_TYPE_ULTRASERVER = 1, + NEURON_PLATFORM_TYPE_PDS = 2, + NEURON_PLATFORM_TYPE_INVALID, +}; + /** * narch_init() - Set neuron devices architecture and revision. * @@ -55,4 +62,20 @@ bool narch_is_qemu(void); */ bool narch_is_emu(void); +/** + * narch_get_instance_type_name() - Reads instance type name from device DMI data. + * + * @instance_type_name: Buffer to store the instance type name string. + * @instance_type_name_size: Size of the instance_type_name buffer. + * + * Note: This function is only available on kernel versions 5.10.0 and above. + * + * Return: + * * 0 if read succeeds, + * * -ENOMEM - Failed to allocate temporary buffer for reading. + * * -EIO - Failed to read the DMI product_name file. + * * -ENOSYS - Kernel version is below 5.10.0, function not supported. 
+ */ +int narch_get_instance_type_name(char *instance_type_name, size_t instance_type_name_size); + #endif diff --git a/neuron_cdev.c b/neuron_cdev.c index 043e0a7..b8c3458 100644 --- a/neuron_cdev.c +++ b/neuron_cdev.c @@ -11,6 +11,7 @@ #define pr_fmt(fmt) "%s:%s: " fmt, KBUILD_MODNAME, __func__ #include +#include #include #include #include @@ -40,6 +41,7 @@ #include "neuron_cdev.h" #include "neuron_fw_io.h" #include "neuron_log.h" +#include "neuron_metrics.h" static dev_t neuron_dev; static int major; @@ -1143,7 +1145,8 @@ static int ncdev_mem_buf_copy(struct neuron_device *nd, unsigned int cmd, void * } } -static int ncdev_mem_buf_zerocopy64(struct neuron_device *nd, void *param) +#define BAR4_WR_THRESHOLD_MAX (PAGE_SIZE*2) +static int ncdev_mem_buf_zerocopy64(struct neuron_device *nd, unsigned int cmd, void *param) { void *buffer; struct mem_chunk *mc; @@ -1151,17 +1154,27 @@ static int ncdev_mem_buf_zerocopy64(struct neuron_device *nd, void *param) u32 copy_to_mem_handle; u64 offset; u64 size; + u32 bar4_wr_threshold; + int h2t_qid; int ret; + struct neuron_ioctl_mem_buf_copy64zc arg; + bool use_bar4_wr; + + // TODO remove at some point + if (_IOC_SIZE(cmd) != sizeof(arg)) { + pr_err_once("error experimental zerocopy API is now obsolete. Please upgrade to latest driver"); + return -EINVAL; + } - struct neuron_ioctl_mem_buf_copy64 arg; ret = neuron_copy_from_user(__func__, &arg, (struct neuron_ioctl_mem_buf_copy64 *)param, sizeof(arg)); if (ret) return ret; mem_handle = arg.mem_handle; buffer = arg.buffer; - copy_to_mem_handle = arg.copy_to_mem_handle; + copy_to_mem_handle = arg.is_copy_to_device; offset = arg.offset; size = arg.size; + h2t_qid = arg.h2t_qid; mc = ncdev_mem_handle_to_mem_chunk(nd, mem_handle); if (!mc) @@ -1172,7 +1185,241 @@ static int ncdev_mem_buf_zerocopy64(struct neuron_device *nd, void *param) return -EINVAL; } - return ndma_memcpy_zero_copy_mc(nd, buffer, mc, offset, size, copy_to_mem_handle ? 
true : false); + if (unlikely(!access_ok(buffer, size))) { + return -EFAULT; + } + + // limit to internal threshold to prevent DoS attack + bar4_wr_threshold = (arg.bar4_wr_threshold < BAR4_WR_THRESHOLD_MAX) ? arg.bar4_wr_threshold : BAR4_WR_THRESHOLD_MAX; + use_bar4_wr = !narch_is_qemu() && + (size <= bar4_wr_threshold) && + copy_to_mem_handle && + nd->npdev.bar4_pa && + (mc->mem_location == MEM_LOC_DEVICE) && + IS_ALIGNED(size, 4) && + IS_ALIGNED(offset, 4); + + // For smallish transfers, just do "copy from" directly to bar4 + // simulation (inkling) does not have bar4 mapped to the actual memory, don't do it + if (use_bar4_wr) { + u64 cpy_offset; + ndhal->ndhal_mmap.mmap_get_bar4_offset(mc->pa + offset, size, &cpy_offset); + // copy from user is slow, try fast copy and fall back if fails + pagefault_disable(); + ret = __copy_from_user_inatomic(nd->npdev.bar4 + cpy_offset, buffer, size); + pagefault_enable(); + if (unlikely(ret)) { + ret = neuron_copy_from_user(__func__, nd->npdev.bar4 + cpy_offset, buffer, size); + } + } else { + nrt_tensor_batch_op_t op; + + u32 nc_id = ndma_mc_pair_to_nc(mc, mc); + int qid = (h2t_qid == NEURON_DMA_H2T_DEFAULT_QID) ? ndhal->ndhal_ndmar.ndmar_get_h2t_def_qid(nc_id) : h2t_qid; + dma_addr_t dev_base = ndma_mc_to_pa(mc); // the caller already does the range check for dev_base+offset + + if (!ndmar_qid_valid(qid)) { + pr_err("nd%02d: invalid h2t queue index %d", nd->device_index, qid); + return -ENOENT; + } + + if (!ndma_zerocopy_supported()) { + pr_err_once("nd%02d: zero copy is not supported for architectures requiring DMA retry", nd->device_index); + return -EINVAL; + } + + op.offset = offset; + op.buffer = buffer; + op.size = size; + + ret = ndma_memcpy_zerocopy(nd, nc_id, &op, 1, dev_base, qid, copy_to_mem_handle ? 
true : false); + } + + return ret; +} + +static int ncdev_mem_buf_zerocopy64_batch(struct neuron_device *nd, void *param) +{ + int ret = 0; + u32 i, j = 0; + + struct neuron_ioctl_mem_buf_copy64zc_batches arg = {0}; + neuron_memcpy_batch_t *batches = NULL; + nrt_tensor_batch_op_t *ops_buffer = NULL; + struct mem_chunk *mc = NULL; + size_t total_ops = 0; + size_t ops_buffer_offset = 0; + const size_t op_size = sizeof(*ops_buffer); + u32 bar4_wr_threshold = 0; + bool use_bar4_wr = false; + + // copy IOCTL struct from user space + ret = neuron_copy_from_user(__func__, &arg, (struct neuron_ioctl_mem_buf_copy64zc_batches *)param, sizeof(arg)); + if (ret) + return ret; + + // validate batches + if (!arg.batches_ptr) { + pr_err("invalid batches pointer\n"); + return -EINVAL; + } + if (arg.num_batches == 0) { + pr_err("the number of batches is 0\n"); + return -EINVAL; + } + + // allocate and copy the batches array from user space + batches = kzalloc(arg.num_batches * sizeof(neuron_memcpy_batch_t), GFP_KERNEL); + if (!batches) { + pr_err("failed to allocate memory for batches\n"); + return -ENOMEM; + } + ret = neuron_copy_from_user(__func__, batches, arg.batches_ptr, arg.num_batches * sizeof(neuron_memcpy_batch_t)); + if (ret) { + pr_err("failed to copy batches from user space\n"); + goto cleanup; + } + + for (i = 0; i < arg.num_batches; i++) { + neuron_memcpy_batch_t batch = batches[i]; + size_t num_ops = batch.num_ops; + + if (num_ops == 0) { + pr_err("the number of operations is 0 for batch %u\n", i); + ret = -EINVAL; + goto cleanup; + } + if (!batch.ops_ptr) { + pr_err("the ops pointer is NULL for batch %u\n", i); + ret = -EINVAL; + goto cleanup; + } + if (num_ops > SIZE_MAX - total_ops) { + pr_err("too many operations requested across batches\n"); + ret = -EINVAL; + goto cleanup; + } + + total_ops += num_ops; + } + + // Holds the ops across batches + ops_buffer = kzalloc(total_ops * op_size, GFP_KERNEL); + if (!ops_buffer) { + pr_err("failed to allocate memory for ops 
across batches\n"); + ret = -ENOMEM; + goto cleanup; + } + + ops_buffer_offset = 0; + + for (i = 0; i < arg.num_batches; i++) { + neuron_memcpy_batch_t *batch = &batches[i]; + size_t num_ops = batch->num_ops; + void __user *user_ops_ptr = (void __user *)batch->ops_ptr; + nrt_tensor_batch_op_t *ops = ops_buffer + ops_buffer_offset; + + // copy the ops array from user space into the ops buffer + ret = neuron_copy_from_user(__func__, ops, user_ops_ptr, num_ops * op_size); + if (ret) { + pr_err("failed to copy ops from user space\n"); + goto cleanup; + } + batch->ops_ptr = ops; + ops_buffer_offset += num_ops; + + mc = ncdev_mem_handle_to_mem_chunk(nd, batch->mem_handle); + if (!mc) { + pr_err("invalid mem handle %llx for batch %u\n", batch->mem_handle, i); + ret = -EINVAL; + goto cleanup; + } + + bar4_wr_threshold = (batch->bar4_wr_threshold < BAR4_WR_THRESHOLD_MAX) ? batch->bar4_wr_threshold : BAR4_WR_THRESHOLD_MAX; + use_bar4_wr = !narch_is_qemu() && arg.is_copy_to_device && nd->npdev.bar4_pa && mc->mem_location == MEM_LOC_DEVICE; + + for (j = 0; j < batch->num_ops; j++) { + nrt_tensor_batch_op_t *op = &ops[j]; + // validate each operation + if (op->size == 0) { + pr_err("op %u of batch %u: the transfer size is 0\n", j, i); + ret = -EINVAL; + goto cleanup; + } + if (op->buffer == NULL) { + pr_err("op %u of batch %u: buffer is NULL\n", j, i); + ret = -EINVAL; + goto cleanup; + } + // validate and update offset + op->offset += batch->mem_handle_offset; + if (!mc_access_is_within_bounds(mc, op->offset, op->size)) { + pr_err("op %u of batch %u: device offset+size out of bounds\n", j, i); + ret = -EINVAL; + goto cleanup; + } + // validate buffer + if (unlikely(!access_ok(op->buffer, op->size))) { + pr_err("op %u of batch %u: invalid host buffer\n", j, i); + ret = -EFAULT; + goto cleanup; + } + + if (op->size > bar4_wr_threshold || !IS_ALIGNED(op->size, 4) || !IS_ALIGNED(op->offset, 4)) { + use_bar4_wr = false; + } + } + + // For smallish transfers, just do "copy from" 
directly to bar4 + // simulation (inkling) does not have bar4 mapped to the actual memory, don't do it + if (use_bar4_wr) { + for (j = 0; j < batch->num_ops; j++) { + const nrt_tensor_batch_op_t op = batch->ops_ptr[j]; + + u64 cpy_offset = 0; + ndhal->ndhal_mmap.mmap_get_bar4_offset(mc->pa + op.offset, op.size, &cpy_offset); + // copy from user is slow, try fast copy and fall back if fails + pagefault_disable(); + ret = __copy_from_user_inatomic(nd->npdev.bar4 + cpy_offset, op.buffer, op.size); + pagefault_enable(); + if (unlikely(ret)) { + ret = neuron_copy_from_user(__func__, nd->npdev.bar4 + cpy_offset, op.buffer, op.size); + if (ret) { + pr_err("failed to do bar4 write on batch %d op %d on nd%02d: %d\n", i, j, nd->device_index, ret); + goto cleanup; + } + } + } + } else { + u32 nc_id = ndma_mc_pair_to_nc(mc, mc); + int qid = (arg.h2t_qid == NEURON_DMA_H2T_DEFAULT_QID) ? ndhal->ndhal_ndmar.ndmar_get_h2t_def_qid(nc_id) : arg.h2t_qid; + dma_addr_t dev_base = ndma_mc_to_pa(mc); // the caller already does the range check for dev_base+offset + + if (!ndmar_qid_valid(qid)) { + pr_err("nd%02d: invalid h2t queue index %d", nd->device_index, qid); + return -ENOENT; + } + + if (!ndma_zerocopy_supported()) { + pr_err_once("nd%02d: zero copy is not supported for architectures requiring DMA retry", nd->device_index); + return -EINVAL; + } + + // use the zero-copy batch function for ops within a single batch + ret = ndma_memcpy_zerocopy(nd, nc_id, batch->ops_ptr, batch->num_ops, dev_base, qid, arg.is_copy_to_device); + if (ret) { + pr_err("batch zero-copy DMA failed on batch %d on nd%02d: %d\n", i, nd->device_index, ret); + goto cleanup; + } + } + } + +cleanup: + if (ops_buffer) + kfree(ops_buffer); + if (batches) + kfree(batches); + return ret; } static long ncdev_semaphore_ioctl(struct neuron_device *nd, unsigned int cmd, void *param) @@ -1279,6 +1526,55 @@ static long ncdev_bar_read(struct neuron_device *nd, u8 bar, u64 *reg_addresses, return ret; } +/** + * 
ncdev_bar_write_data() - write data to bar + * + * @param nd: neuron device + * @param bar: the BAR to write to + * @param reg_addresses + * @param data: the data to be written into the bar + * @param data_count: the number of data to be written + * @return 0 on success, otherwise failure +*/ +static int ncdev_bar_write_data(struct neuron_device *nd, u8 bar, u64 *reg_addresses, u32 *data, u32 data_count) +{ + if (bar == 0) { + int i; + for (i = 0; i < data_count; i++) { + u64 off = reg_addresses[i] - (u64)nd->npdev.bar0; + if (off > nd->npdev.bar0_size) { + return -EINVAL; + } + if (ndhal->ndhal_ndma.ndma_is_bar0_write_blocked(off)) { + return -EINVAL; + } + writel(data[i], nd->npdev.bar0 + off); + trace_bar_write(nd, bar, off, data[i]); + } + } else if (bar == 4) { + // TODO: we don't have any use case for r/w memory over the BAR right now. Disabling. + // + // We'd like to use DMA for r/w of BAR4 because we might expect access to large amounts of data. + // Access via DMA requires an application to own a TPB because it determines which of the h2t DMAs + // are safe to use, otherwise a TPB along with its DMA could be reset while that DMA is used here. + // Don't want/need to solve it now. 
+ return -EINVAL; + + /* + dma_addr_t dst_addr = reg_addresses[0] - (u64)nd->npdev.bar0; + + ret = ndma_memcpy(nd, 0, virt_to_phys(data) | ndhal->ndhal_address_map.pci_host_base, dst_addr, data_size); + if (ret) + return ret; + */ + } else { + pr_err("direct BAR%d write is not supported.\n", bar); + return -EINVAL; + } + + return 0; +} + static long ncdev_bar_write(struct neuron_device *nd, u8 bar, u64 *reg_addresses, void *user_va, u32 data_count) { @@ -1293,7 +1589,7 @@ static long ncdev_bar_write(struct neuron_device *nd, u8 bar, u64 *reg_addresses if (ret) goto done; - ret = ndhal->ndhal_cdev.ncdev_bar_write_data(nd, bar, reg_addresses, data, data_count); + ret = ncdev_bar_write_data(nd, bar, reg_addresses, data, data_count); if (ret) goto done; done: @@ -1356,12 +1652,24 @@ static long ncdev_post_metric(struct neuron_device *nd, void *param) ret = neuron_copy_from_user(__func__, data, arg.data, arg.data_size); if (ret) goto done; - ret = fw_io_post_metric(nd->fw_io_ctx, (u8 *)data, arg.data_size); + ret = ndhal->ndhal_fw_io.fw_io_post_metric(nd->fw_io_ctx, (u8 *)data, arg.data_size); done: kfree(data); return ret; } +static long ncdev_metric_ctrl(struct neuron_device *nd, void *param) +{ + int ret; + struct neuron_ioctl_metrics_ctrl arg; + ret = neuron_copy_from_user(__func__, &arg, (struct neuron_ioctl_metrics_ctrl *)param, sizeof(arg)); + if (ret) + return ret; + + nmetric_set_mode(nd, arg.mode); + return 0; +} + static long ncdev_read_hw_counters(struct neuron_device *nd, void *param) { int ret; @@ -1583,7 +1891,8 @@ static long ncdev_driver_info(unsigned int cmd, void *param) driver_info.feature_flags1 = NEURON_DRIVER_FEATURE_DMABUF | NEURON_DRIVER_FEATURE_ASYNC_DMA | NEURON_DRIVER_FEATURE_BATCH_DMAQ_INIT | NEURON_DRIVER_FEATURE_BIG_CORE_MAPS | NEURON_DRIVER_FEATURE_MEM_ALLOC_TYPE | NEURON_DRIVER_FEATURE_HBM_SCRUB | - NEURON_DRIVER_FEATURE_MEM_ALLOC64 | NEURON_DRIVER_FEATURE_CONTIGUOUS_SCRATCHPAD; + NEURON_DRIVER_FEATURE_MEM_ALLOC64 | 
NEURON_DRIVER_FEATURE_CONTIGUOUS_SCRATCHPAD | + NEURON_DRIVER_FEATURE_ZEROCOPY; return copy_to_user(param, &driver_info, sizeof(driver_info)); } @@ -2311,7 +2620,7 @@ static long ncdev_hbm_scrub_start(struct neuron_device *nd, void *param) { struct ndma_eng *eng = &nd->ndma_engine[eng_id]; struct ndma_queue *queue = &eng->queues[qid]; struct ndma_ring *ring = &queue->ring_info; - ret = ndma_memcpy_add_completion_desc(eng, ring, completion_bufs[i]); + ret = ndma_memcpy_add_completion_desc(eng, ring, completion_bufs[i], UDMA_M2M_BARRIER_NONE); if (ret) { goto scrub_init_fail; } @@ -2587,6 +2896,109 @@ static int ncdev_pod_ctrl(struct file *filep, unsigned int cmd, void *param) return ret; } +static int ncdev_h2t_dma_alloc_queues(struct neuron_device *nd, unsigned int cmd, void *param) +{ + int ret; + int i; + int qid; + struct neuron_ioctl_h2t_dma_alloc_queues arg; + + ret = neuron_copy_from_user(__func__, &arg, (struct neuron_ioctl_h2t_dma_alloc_queues*)param, sizeof(arg)); + if (ret) + return ret; + + if (arg.nc_id >= ndhal->ndhal_address_map.nc_per_device) { + pr_err("nd%02d: invalid nc %d provided", nd->device_index, arg.nc_id); + return -E2BIG; + } + + if (arg.copy_queue_cnt + arg.service_queue_cnt >= DMA_MAX_Q_MAX) { + pr_err("nd%02d: invalid total queue count %d provided", nd->device_index, arg.copy_queue_cnt + arg.service_queue_cnt); + return -E2BIG; + } + + arg.copy_queue_bmap = 0; + arg.service_queue_bmap = 0; + + for (i=0; i < arg.copy_queue_cnt; i++) { + ret = ndmar_h2t_ring_request(nd, arg.nc_id, true, &qid); + if (ret) { + goto done; + } + arg.copy_queue_bmap |= (1<ndhal_ndmar.ndmar_get_h2t_def_qid(arg.nc_id); + + ret = copy_to_user(param, &arg, sizeof(arg)); + +done: + if (ret) { + u32 combined_queue_bmap = arg.copy_queue_bmap | arg.service_queue_bmap; + for (i=0; i < DMA_MAX_Q_V4; i++) { + if ((1<= ndhal->ndhal_address_map.nc_per_device) { + pr_err("nd%02d: invalid nc %d provided", nd->device_index, arg.nc_id); + return -E2BIG; + } + + for (i=0; i < 
DMA_MAX_Q_V4; i++) { + int lret; + if ((1<ndhal_perf.perf_set_profile(nd, arg.profile); +} + inline static long ncdev_misc_ioctl(struct file *filep, unsigned int cmd, unsigned long param) { if ((cmd == NEURON_IOCTL_CRWL_NC_RANGE_MARK) || (cmd == NEURON_IOCTL_CRWL_NC_RANGE_MARK_EXT0)) { return ncdev_crwl_nc_range_mark(filep, cmd, (void *)param); @@ -2741,7 +3153,9 @@ static long ncdev_ioctl(struct file *filep, unsigned int cmd, unsigned long para } else if (_IOC_NR(cmd) == _IOC_NR(NEURON_IOCTL_MEM_BUF_COPY)) { return ncdev_mem_buf_copy(nd, cmd, (void *)param); } else if (_IOC_NR(cmd) == _IOC_NR(NEURON_IOCTL_MEM_BUF_ZEROCOPY64)) { - return ncdev_mem_buf_zerocopy64(nd, (void *)param); + return ncdev_mem_buf_zerocopy64(nd, cmd, (void *)param); + } else if (cmd == NEURON_IOCTL_MEM_BUF_ZEROCOPY64_BATCHES) { + return ncdev_mem_buf_zerocopy64_batch(nd, (void *)param); } else if (cmd == NEURON_IOCTL_PROGRAM_ENGINE) { return ncdev_program_engine(nd, (void *)param); } else if (_IOC_NR(cmd) == _IOC_NR(NEURON_IOCTL_PROGRAM_ENGINE_NC)) { @@ -2766,6 +3180,8 @@ static long ncdev_ioctl(struct file *filep, unsigned int cmd, unsigned long para return ncdev_bar_rw(nd, (void *)param, false); } else if (cmd == NEURON_IOCTL_POST_METRIC) { return ncdev_post_metric(nd, (void *)param); + } else if (cmd == NEURON_IOCTL_METRICS_CTRL) { + return ncdev_metric_ctrl(nd, (void *)param); } else if (cmd == NEURON_IOCTL_NOTIFICATIONS_INIT_V1) { return ncdev_nc_nq_init_deprecated(nd, (void *)param); } else if (cmd == NEURON_IOCTL_NOTIFICATIONS_INIT_V2) { @@ -2804,7 +3220,14 @@ static long ncdev_ioctl(struct file *filep, unsigned int cmd, unsigned long para return ncdev_hbm_scrub_start(nd, (void*)param); } else if (cmd == NEURON_IOCTL_HBM_SCRUB_WAIT) { return ncdev_hbm_scrub_wait_for_cmpl(nd, (void*)param); + } else if (cmd == NEURON_IOCTL_H2T_DMA_ALLOC_QUEUES) { + return ncdev_h2t_dma_alloc_queues(nd, cmd, (void*)param); + } else if (cmd == NEURON_IOCTL_H2T_DMA_FREE_QUEUES) { + return 
ncdev_h2t_dma_free_queues(nd, cmd, (void*)param); + } else if (cmd == NEURON_IOCTL_POWER_PROFILE) { + return ncdev_power_profile_set(nd, (void*)param); } + // B/W compatibility return ncdev_misc_ioctl(filep, cmd, param); } diff --git a/neuron_core.c b/neuron_core.c index 477985e..89c3d4a 100644 --- a/neuron_core.c +++ b/neuron_core.c @@ -3,7 +3,7 @@ * Copyright 2020, Amazon.com, Inc. or its affiliates. All Rights Reserved */ -/** Each neuron device has N number of neuron cores. (v1 has 4 neuron cores; v2 has 2 neuron cores). +/** Each neuron device has N number of neuron cores. * * Engines: * ------- diff --git a/neuron_device.h b/neuron_device.h index 46b2b67..93fd781 100644 --- a/neuron_device.h +++ b/neuron_device.h @@ -33,13 +33,11 @@ /* Vendor / Device ID for all devices supported by the driver */ #define AMZN_VENDOR_ID 0x1D0F -#define INF1_DEVICE_ID0 0x7064 -#define INF1_DEVICE_ID1 0x7065 -#define INF1_DEVICE_ID2 0x7066 -#define INF1_DEVICE_ID3 0x7067 #define INF2_DEVICE_ID0 0x7264 #define TRN1_DEVICE_ID0 0x7164 #define TRN2_DEVICE_ID0 0x7364 +#define TRN3_DEVICE_ID0 0x7564 +#define TRN3_DEVICE_ID1 0x7565 // Global host memory buf size used for memset the device memory #define MEMSET_HOST_BUF_SIZE MAX_DMA_DESC_SIZE // guessed optimal DMA transfer and PCIe TLP size. 
@@ -116,7 +114,7 @@ struct neuron_device { u64 nc_model_started_count[MAX_NC_PER_DEVICE]; // number of times the NCs has started model struct nsysfsmetric_metrics sysfs_metrics; - + struct neuron_log_obj log_obj; // logging object struct neuron_hbm_scrub_ctx hbm_scrub_ctx; diff --git a/neuron_dhal.c b/neuron_dhal.c index 83ce1a1..bb269c2 100644 --- a/neuron_dhal.c +++ b/neuron_dhal.c @@ -29,21 +29,23 @@ int neuron_dhal_init(unsigned int pci_device_id) { } mutex_unlock(&ndhal_init_lock); - ndhal->arch = narch_get_arch(); + ndhal->ndhal_arch.arch = narch_get_arch(); ndhal->pci_device_id = pci_device_id; - ret = ndhal_register_funcs_vc(); - switch (ndhal->arch) { - case NEURON_ARCH_V1: - ret = ndhal_register_funcs_v1(); - break; + ndhal_register_funcs_vc(); + + switch (ndhal->ndhal_arch.arch) { case NEURON_ARCH_V2: ret = ndhal_register_funcs_v2(); break; case NEURON_ARCH_V3: ret = ndhal_register_funcs_v3(); break; + case NEURON_ARCH_V4: + ret = ndhal_register_funcs_v3(); // use v3 as base + ret = ndhal_register_funcs_v4(); // apply v4 overrides + break; default: - pr_err("Unknown HW architecture: %d. Can't init neuron_dhal.\n", ndhal->arch); + pr_err("Unknown HW architecture: %d. Can't init neuron_dhal.\n", ndhal->ndhal_arch.arch); return -EINVAL; } diff --git a/neuron_dhal.h b/neuron_dhal.h index a7089ad..bbdbbe5 100644 --- a/neuron_dhal.h +++ b/neuron_dhal.h @@ -12,8 +12,21 @@ #define dhal_sysfs_emit(buf, ...) 
sysfs_emit((buf), __VA_ARGS__) #endif +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 14, 0) +#define dhal_atomic_fetch_add(v,a) atomic_fetch_add(v, (a)) +#else +#define dhal_atomic_fetch_add(v,a) (atomic_add_return(v, (a)) -1) +#endif + + + extern int force_die_flip; +struct ndhal_arch { + int arch; + enum neuron_platform_type platform_type; + u32 server_id; +}; struct ndhal_address_map { // addresses uint64_t pci_host_base; @@ -44,7 +57,6 @@ struct ndhal_address_map { struct ndhal_reset { uint64_t reset_poll_interval; uint64_t reset_tpb_initial_poll_delay; - uint64_t reset_device_initial_poll_delay; uint64_t initiate_max_wait_time; uint32_t retry_count; int (*nr_initiate_reset) (struct neuron_device *nd, uint32_t nc_map); @@ -82,8 +94,8 @@ struct ndhal_mpset { struct ndhal_ndmar { uint32_t (*ndmar_get_h2t_eng_id) (struct neuron_device *nd, uint32_t nc_id); - int (*ndmar_get_h2t_qid) (uint32_t nc_id); - bool (*ndmar_is_h2t_q) (struct neuron_device *nd, uint32_t eng_id, uint32_t q_id); + int (*ndmar_get_h2t_def_qid) (uint32_t nc_id); + bool (*ndmar_is_h2t_def_q) (struct neuron_device *nd, uint32_t eng_id, uint32_t q_id); bool (*nr_init_h2t_eng) ( int nc_idx, uint32_t nc_map); bool (*ndmar_is_nx_ring) (uint32_t eng_id, uint32_t q_id); int (*ndmar_quiesce_queues) (struct neuron_device *nd, u32 nc_id, u32 engine_count, u32 *queue_mask); @@ -94,6 +106,8 @@ struct ndhal_fw_io { int (*fw_io_topology) (struct fw_io_ctx *ctx, int pdev_index, int device_id, u32 *connected_device_ids, int *count); int (*fw_io_register_readless_read_region) (struct fw_io_ctx *ctx, void __iomem *bar0, u64 bar0_size, void __iomem *bar2, u64 bar2_size); int (*fw_io_read_csr_array) (void **addrs, u32 *values, u32 num_csrs, bool operational); + int (*fw_io_execute_request) (struct fw_io_ctx *ctx, u8 command_id, const u8 *req, u32 req_size, u8 *resp, u32 resp_size); + int (*fw_io_post_metric) (struct fw_io_ctx *ctx, u8 *data, u32 size); }; struct ndhal_reg_access { @@ -153,15 +167,12 @@ struct 
ndhal_cdev { void (*ncdev_compatible_version) (struct neuron_ioctl_compatible_version *arg); void (*ncdev_quiesce_exec_on_proc_exit) (void); - int (*ncdev_bar_write_data) (struct neuron_device *nd, u8 bar, u64 *reg_addresses, u32 *data, u32 data_count); int (*ncdev_logical_to_physical_nc_map)(struct neuron_ioctl_nc_map *map, uint32_t max_num_entries, enum neuron_ioctl_nc_mapping_type mapping_type); void (*ncdev_get_default_tpbs_for_hbm) (u32 hbm_index, u32 tpbs[MAX_NC_PER_DEVICE], u32 *tpb_count); }; struct ndhal_udma { unsigned int num_beats; - void (*udma_m2s_data_rd_cfg_boundaries_set) (struct udma *udma); - void (*udma_q_config) (struct udma_q *udma_q); }; struct ndhal_ndma { @@ -183,6 +194,7 @@ struct ndhal_npe { ssize_t (*npe_class_node_id_show_data)(char *buf, u32 sz); ssize_t (*npe_class_server_id_show_data)(char *buf, u32 sz); ssize_t (*npe_class_ultraserver_mode_show_data)(char *buf); + u32 (*npe_neighbor_eng_ids)[2]; }; struct ndhal_tpb { @@ -201,10 +213,15 @@ struct ndhal_tpb { int (*pe_format_activity_stats)(struct neuron_device *nd, int nc_id, char buffer[], unsigned int bufflen); }; +struct ndhal_perf { + int current_performance_profile; + int (*perf_set_profile) (struct neuron_device *nd, uint32_t profile); +}; + struct neuron_dhal { - int arch; unsigned int pci_device_id; + struct ndhal_arch ndhal_arch; struct ndhal_address_map ndhal_address_map; struct ndhal_reset ndhal_reset; struct ndhal_topsp ndhal_topsp; @@ -222,6 +239,7 @@ struct neuron_dhal { struct ndhal_ndma ndhal_ndma; struct ndhal_npe ndhal_npe; struct ndhal_tpb ndhal_tpb; + struct ndhal_perf ndhal_perf; void (*ndhal_ext_cleanup) (void); }; @@ -253,13 +271,13 @@ void neuron_dhal_cleanup(void); void neuron_dhal_free(void); /** - * ndhal_register_funcs() - Register functions v1 (or inf1) v2 (or trn1 inf2) to the ndhal + * ndhal_register_funcs() - Register functions based on hardward arch to the ndhal * * @return int 0 on success, negative for failures */ int ndhal_register_funcs_vc(void); 
-int ndhal_register_funcs_v1(void); int ndhal_register_funcs_v2(void); int ndhal_register_funcs_v3(void); +int ndhal_register_funcs_v4(void); #endif diff --git a/neuron_dma.c b/neuron_dma.c index 04fb3a4..5f7cbc0 100644 --- a/neuron_dma.c +++ b/neuron_dma.c @@ -22,6 +22,9 @@ DECLARE_FAULT_ATTR(neuron_fail_dma_wait); #endif +int zerocopy_trn1_override = 0; +module_param(zerocopy_trn1_override, int, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP); +MODULE_PARM_DESC(zerocopy_trn1_override, "override zerocopy for trn1"); //#define NUNUSED __attribute__ ((unused)) @@ -37,7 +40,7 @@ static void ndma_ack_completed_desc(struct ndma_eng *eng, struct ndma_ring *ring udma_cdesc_ack(txq, count); } -static inline u32 ndma_mc_pair_to_nc( struct mem_chunk *src_mc, struct mem_chunk *dst_mc) +u32 ndma_mc_pair_to_nc(struct mem_chunk *src_mc, struct mem_chunk *dst_mc) { if (src_mc->mem_location != MEM_LOC_HOST) return src_mc->nc_id; @@ -88,7 +91,7 @@ static inline int ndma_dma_ctx_get_next_handle( int pdma_ctx_handle, int * dma_c * memchunk to dma phy addr * */ -static inline dma_addr_t ndma_mc_to_pa( struct mem_chunk *mc) +dma_addr_t ndma_mc_to_pa(struct mem_chunk *mc) { if (mc->mem_location == MEM_LOC_HOST) return virt_to_phys(mc->va) | ndhal->ndhal_address_map.pci_host_base; // why isn't this already set??? 
@@ -166,7 +169,7 @@ static inline struct ndma_h2t_dma_context * ndma_get_dma_ctx( struct ndma_eng *e if (eng->used_for_h2t) return &ring->h2t_dma_ctx[dma_ctx_handle]; else { - pr_info("allocating descriptor for non-h2t\n"); // FIXME remove at some point + pr_info_once("allocating descriptor for non-h2t\n"); return kmalloc( sizeof(struct ndma_h2t_dma_context), GFP_KERNEL); } } @@ -191,7 +194,7 @@ static inline void ndma_release_dma_ctx( struct ndma_eng *eng, struct ndma_ring * add a completion entry to the ring * */ -int ndma_memcpy_add_completion_desc( struct ndma_eng *eng, struct ndma_ring *ring, void * completion_buffer) +int ndma_memcpy_add_completion_desc( struct ndma_eng *eng, struct ndma_ring *ring, void * completion_buffer, int barrier_type) { int ret = 0; struct udma_ring_ptr completion; @@ -210,7 +213,7 @@ int ndma_memcpy_add_completion_desc( struct ndma_eng *eng, struct ndma_ring *rin completion.addr = virt_to_phys(completion.ptr) | ndhal->ndhal_address_map.pci_host_base; ret = udma_m2m_copy_prepare_one(&eng->udma, ring->qid, completion.addr, completion.addr + DMA_COMPLETION_MARKER_SIZE, - DMA_COMPLETION_MARKER_SIZE, UDMA_M2M_BARRIER_NONE, false); + DMA_COMPLETION_MARKER_SIZE, barrier_type, false); if (ret) { pr_err("failed to prepare DMA descriptor on nd%02d for %s q%d\n", eng->nd->device_index, eng->udma.name, ring->qid); ret = -1; @@ -342,12 +345,12 @@ static int ndma_memcpy_chunks( struct ndma_eng *eng, struct ndma_ring *ring, str remaining -= chunk_size; pending_transfers++; - //FIXME trace_dma_memcpy(nd, nc_id, src_offset, dst_offset, chunk_size, pending_transfers); + //TODO trace_dma_memcpy(nd, nc_id, src_offset, dst_offset, chunk_size, pending_transfers); } // write completion descriptor, kick off DMAs, record pending xfers and data outstanding and prefetch if requested // - ret = ndma_memcpy_add_completion_desc( eng, ring, dma_ctx->completion_ptr); + ret = ndma_memcpy_add_completion_desc( eng, ring, dma_ctx->completion_ptr, 
UDMA_M2M_BARRIER_NONE); if (ret) { return ret; } @@ -432,7 +435,7 @@ static int ndma_memcpy_offset_move(struct neuron_device *nd, u32 nc_id, dma_addr const int eng_id = ndhal->ndhal_ndmar.ndmar_get_h2t_eng_id(nd, nc_id); // for v2 the last one is reserved for collectives - const int qid = ndhal->ndhal_ndmar.ndmar_get_h2t_qid(nc_id); + const int qid = ndhal->ndhal_ndmar.ndmar_get_h2t_def_qid(nc_id); struct ndma_eng *eng = &nd->ndma_engine[eng_id]; struct ndma_queue *queue = &eng->queues[qid]; @@ -447,7 +450,7 @@ static int ndma_memcpy_offset_move(struct neuron_device *nd, u32 nc_id, dma_addr // 2. usage of the SYNC dma context (basically even though we specify we are using the SYNC ctxt handle outside this routine // the SYNC dma context itself is only used within this routine. // - mutex_lock(&eng->h2t_ring_lock); + mutex_lock(&ring->h2t_ring_lock); // initialize the DMA context dma_ctx->inuse = true; @@ -461,7 +464,12 @@ static int ndma_memcpy_offset_move(struct neuron_device *nd, u32 nc_id, dma_addr dma_ctx->size = size; dma_ctx->smove = smove; dma_ctx->dmove = dmove; - dma_ctx->completion_ptr = ndma_memcpy_get_completion_buf( eng, ring, wait_handle); + dma_ctx->completion_ptr = ndma_memcpy_get_completion_buf( eng, ring, wait_handle); + + if (dma_ctx->completion_ptr == NULL) { + ret = -ENOMEM; + goto fail; + } // Sanity check if ((pdma_ctx != NULL) && (!pdma_ctx->inuse)) { @@ -525,7 +533,7 @@ static int ndma_memcpy_offset_move(struct neuron_device *nd, u32 nc_id, dma_addr ndma_release_dma_ctx( eng, ring, pdma_ctx); - mutex_unlock(&eng->h2t_ring_lock); + mutex_unlock(&ring->h2t_ring_lock); return ret; } @@ -610,7 +618,7 @@ int ndma_memcpy_mc(struct neuron_device *nd, struct mem_chunk *src_mc, struct me } dst_pa += dst_offset; - // FIXME: H2H memcpy's src and dst mc should have dedicated nc_id such as -1 + // TODO: H2H memcpy's src and dst mc should have dedicated nc_id such as -1 if (src_mc->mem_location == MEM_LOC_HOST && dst_mc->mem_location == MEM_LOC_HOST) { 
nc_id = dst_mc->nc_id; } @@ -631,7 +639,7 @@ int ndma_memcpy_mc_wait( struct neuron_device *nd, struct mem_chunk *src_mc, str int ret; const u32 nc_id = ndma_mc_pair_to_nc( src_mc, dst_mc); const int eng_id = ndhal->ndhal_ndmar.ndmar_get_h2t_eng_id(nd, nc_id); - const int qid = ndhal->ndhal_ndmar.ndmar_get_h2t_qid(nc_id); + const int qid = ndhal->ndhal_ndmar.ndmar_get_h2t_def_qid(nc_id); struct ndma_eng *eng = &nd->ndma_engine[eng_id]; struct ndma_queue *queue = &eng->queues[qid]; struct ndma_ring *ring = &queue->ring_info; @@ -895,6 +903,7 @@ struct ndma_h2t_zcdma_context { dma_addr_t dev_addr; // device address u64 size; // size for this transfer bool direction; // direction. true = to device + bool last; // last transfer for the entire request. u64 start_time; // start time for this transfer int nr_pages; // number of pages for this transfer int nr_desc; // number of descriptors which is equal to pending transfers -1 @@ -909,7 +918,34 @@ struct ndma_h2t_zcdma_context { // dma time > (pin time + setup time + completion update + initial poll wait) // That's the simple explanation. It's a tad more complicated in trading off smaller // transfers where even if that equation doesn't hold, the overlap can be beneficial. - // Right now the sweet spot looks to be ~ 32 pages + // Right now the sweet spot looks to be ~ 64 pages. More tuning is required. + // +#define NDMA_ZC_MIN_PAGES_PER_XFER 64 + +/** ndma_calc_zc_pin_size() + * + * determine how many pages to pin per step for zercopy dma pipelining. 
+ */ +static size_t ndma_calc_zc_pin_size(size_t size) +{ + if (size > NDMA_ZC_PAGES_PER_XFER * PAGE_SIZE * 2) { + return NDMA_ZC_PAGES_PER_XFER * PAGE_SIZE; + } else if (size <= NDMA_ZC_MIN_PAGES_PER_XFER * PAGE_SIZE) { + return size; + } + return (size/2 + PAGE_SIZE-1) & ~(PAGE_SIZE-1); +} + +/** + * ndma_zerocopy_supported() + * + * zero copy is not support for platforms that require retry + * + */ +bool ndma_zerocopy_supported(void) +{ + return !ndhal->ndhal_ndma.ndma_retry_memcpy || zerocopy_trn1_override; +} /** * ndma_build_n_issue_zc_descs() @@ -919,7 +955,7 @@ struct ndma_h2t_zcdma_context { * * explain how alignment is handled. * - * Todo: + * TODO: * go i=0 to nr_pages * Think about using some permanent location in HBM as source for completion descriptor update. Like * why are we reading across the PCIe bus to fetch completion data. @@ -934,6 +970,7 @@ static int ndma_build_n_issue_zc_descs( struct ndma_h2t_zcdma_context * dma_ctx) int i = 0; u64 chunk_size; int pending_transfers = 0; + int barrier_type; while (i < dma_ctx->nr_pages) { dma_addr_t src_addr; @@ -951,10 +988,10 @@ static int ndma_build_n_issue_zc_descs( struct ndma_h2t_zcdma_context * dma_ctx) contig_size += PAGE_SIZE; } - if (dma_ctx->direction) { + if (dma_ctx->direction) { // write to device src_addr = (contig_start + offset) | pci_host_base; dst_addr = dev_addr; - } else { + } else { // read from device src_addr = dev_addr; dst_addr = (contig_start + offset) | pci_host_base; } @@ -966,8 +1003,25 @@ static int ndma_build_n_issue_zc_descs( struct ndma_h2t_zcdma_context * dma_ctx) chunk_size = (remaining < contig_size) ? 
remaining : contig_size; if (chunk_size > MAX_DMA_DESC_SIZE) chunk_size = MAX_DMA_DESC_SIZE; + // on the read path completion write follows data writes in order, that means when the completion write finishes + // it's guaranteed that all the data has been written, no need for a barrier + + // on the write path we only need the barrier for the last transfer (the last set of pinned pages), why? + // HBM writes (data) and host write (completion) take different path through data fabric. That means w/o a barrier + // it's possible for the completion to be written before the data. + + // We don't need the barrier to ensure it's safe to unpin. + // s2m descriptors are executed in order, that means when s2m completion write is executed all s2m data writes + // have been executed as well, that means all m2s data reads have been executed, that means it's safe to unpin - ret = udma_m2m_copy_prepare_one(&dma_ctx->eng->udma, dma_ctx->ring->qid, src_addr, dst_addr, chunk_size, remaining == chunk_size, false); // set the barrier if the last descriptor + // use WRITE_BARRIER on V2 (set on the last data descriptor) + // use SOW on V3+ (set on completion descriptor below) + if (narch_get_arch() == NEURON_ARCH_V2) + barrier_type = (remaining == chunk_size && dma_ctx->direction && dma_ctx->last) ? UDMA_M2M_BARRIER_WRITE_BARRIER : UDMA_M2M_BARRIER_NONE; + else + barrier_type = UDMA_M2M_BARRIER_NONE; + + ret = udma_m2m_copy_prepare_one(&dma_ctx->eng->udma, dma_ctx->ring->qid, src_addr, dst_addr, chunk_size, barrier_type, false); if (ret) { pr_err("failed to prepare DMA descriptor for %s q%d\n", dma_ctx->eng->udma.name, dma_ctx->ring->qid); goto error; @@ -983,7 +1037,11 @@ static int ndma_build_n_issue_zc_descs( struct ndma_h2t_zcdma_context * dma_ctx) dma_ctx->nr_desc = pending_transfers; - ret = ndma_memcpy_add_completion_desc( dma_ctx->eng, dma_ctx->ring, dma_ctx->completion_ptr); + if (narch_get_arch() != NEURON_ARCH_V2) + barrier_type = (dma_ctx->direction && dma_ctx->last) ? 
UDMA_M2M_BARRIER_SOW: UDMA_M2M_BARRIER_NONE; + else + barrier_type = UDMA_M2M_BARRIER_NONE; + ret = ndma_memcpy_add_completion_desc( dma_ctx->eng, dma_ctx->ring, dma_ctx->completion_ptr, barrier_type); if (ret) { goto error; } @@ -1001,279 +1059,192 @@ static int ndma_build_n_issue_zc_descs( struct ndma_h2t_zcdma_context * dma_ctx) } /** - * ndma_zero_copy_wait_for_completion() + * ndma_zerocopy_wait_for_completion() * * * */ -static int ndma_zero_copy_wait_for_completion( struct neuron_device *nd, u32 nc_id, struct ndma_eng *eng, struct ndma_ring *ring, - struct ndma_h2t_zcdma_context * dma_ctx, struct ndma_h2t_zcdma_context * ndma_ctx) +static int ndma_zerocopy_wait_for_completion( struct neuron_device *nd, u32 nc_id, struct ndma_eng *eng, struct ndma_ring *ring, + struct ndma_h2t_zcdma_context * dma_ctx, struct ndma_h2t_zcdma_context * ndma_ctx) { int ret; - bool async = true; - - while(true) { - ret = ndma_memcpy_wait_for_completion(eng, ring, dma_ctx->nr_desc+1, dma_ctx->completion_ptr, async, false); // FIXM we shouldn't even be waiting 1usec here - - if (ret == 0) { - if (dma_ctx->direction) - unpin_user_pages( dma_ctx->page_list, dma_ctx->nr_pages); - else - unpin_user_pages_dirty_lock( dma_ctx->page_list, dma_ctx->nr_pages, true); - return ret; - } - - // if the memcpy starts within a NeuronCore reset window, - // the timeout is possible due to DMA hanging caused by hardware issue. - // if so, restart DMA and retry the memcpy - if (narch_get_arch() != NEURON_ARCH_V2) { // FIXME - this should be if (!ndhal.tpb_reset_dma_retry) or part of dma_ctx - break; - } - - if (!nr_op_in_reset_wnd(dma_ctx->start_time, nd)) { - break; - } - - pr_info( "Failed to copy memory during a NeuronCore reset: nd %d, host %#llx, dev %#llx, size %llu. 
Retrying the copy.\n", - nd->device_index, (dma_addr_t)dma_ctx->host_addr, dma_ctx->dev_addr, dma_ctx->size); - dma_ctx->start_time = get_jiffies_64(); - if (ndma_ctx != NULL) - ndma_ctx->start_time = get_jiffies_64(); - - ret = ndmar_h2t_ring_init(eng, ring->qid); - - if (ret) { - pr_err("H2T ring init failed on nd %d: ret %d\n", nd->device_index, ret); - break; - } + ret = ndma_memcpy_wait_for_completion(eng, ring, dma_ctx->nr_desc+1, dma_ctx->completion_ptr, true, false); + //atomic_sub(dma_ctx->nr_desc+1, &dma_ctx->ring->h2t_outstanding_desc); - // restart dmas - // - ret = ndma_build_n_issue_zc_descs( dma_ctx); - if (ret) - break; - - if (ndma_ctx != NULL) { - ret = ndma_build_n_issue_zc_descs( ndma_ctx); - if (ret) { - ndma_memcpy_wait_for_completion(eng, ring, dma_ctx->nr_desc+1, dma_ctx->completion_ptr, false, false); - break; - } - } - - async = false; + if (ret == 0) { + if (dma_ctx->direction) + unpin_user_pages(dma_ctx->page_list, dma_ctx->nr_pages); + else + unpin_user_pages_dirty_lock(dma_ctx->page_list, dma_ctx->nr_pages, true); + return ret; } // If we are exiting here, we've failed so unpin pages associated with the DMA. If the next DMA // context is valid, do an obligatory wait for the DMA operation so we don't splat data on someone // else's memory just in case the physical pages are reassigned after unpinning. // - unpin_user_pages( dma_ctx->page_list, dma_ctx->nr_pages); + unpin_user_pages(dma_ctx->page_list, dma_ctx->nr_pages); - // blindly wait - + // blindly wait if (ndma_ctx != NULL) { ndma_memcpy_wait_for_completion(eng, ring, ndma_ctx->nr_desc+1, ndma_ctx->completion_ptr, false, false); - unpin_user_pages( ndma_ctx->page_list, ndma_ctx->nr_pages); + unpin_user_pages(ndma_ctx->page_list, ndma_ctx->nr_pages); } - + return ret; } -/** - * ndma_memcpy_zero_copy() - * - * dma data between a user space virtual address range and a contiguous location in device memory. 
- * In order to do this, we need to know the physical pages are associated with - * the user virtual address range and we need to make sure those physical pages stay - * associated with the user virtual address range while the DMA is happening. - * - * How do we do this? By asking the kernel to pin the physical pages in memory until we are - * done with them. But our transaction could be large, the physical pages won't be contiguous, - * and pinning takes CPU cycles, so we break the dma transfer up into a series of smaller transfers - * where we pipeline the pinning of physical pages with dma transfers. - * - * We use pin_user_pages_fast() to reduce pinning overhead because we know the process can't go - * away while we are down here doing our thing in the kernel within a single IOCTL call. - * - * We ping pong back and forth between two dma contexts. So while dma for context A is in progress, - * we are pinning pages and starting dmas for context B. - * - * Algorithm goes like this: - * initial a pair of dma contexts - * prev dma ctx = null - * lock() - * while still more data remaining - * current dma ctx = next available context - * init current dma context - * calc size of the transfer for this dma context. We want to transfer up to page boundaries - * calc number of pages that need to be pinned for this dma - * pin host pages in memory - * generate descriptors for - * if prev dma ctx != NULL, wait for the prev dma to complete - * update host address, device address and ammount remaining - * wait for the last dma ctx to complete - * unlock() - * free resources - * - * Notes: - * unpinning responsibilities. Up until a dma is successfully launched, this routine is responsible for unpinning - * host memory. After that ndma_zero_copy_wait_for_completion() owns responsibility for unpinning pages. - * - * We don't do this here, but pinning user pages across system (IOCTL) calls has a number of additional requirements. 
- * We would have to cleanup any pinned pages when the process goes away, so any pinned pages have to get tracked in - * process context. - * - * direction == true means write from host to device - * - */ - -static int ndma_memcpy_zero_copy(struct neuron_device *nd, u32 nc_id, void * host_addr, dma_addr_t dev_addr, u64 size, bool direction) +int ndma_memcpy_zerocopy(struct neuron_device *nd, + u32 nc_id, + const nrt_tensor_batch_op_t *ops, + u32 num_ops, + dma_addr_t dev_base, + int qid, + bool direction) { - int ret = 0; - - const int eng_id = ndhal->ndhal_ndmar.ndmar_get_h2t_eng_id(nd, nc_id); - const int qid = ndhal->ndhal_ndmar.ndmar_get_h2t_qid(nc_id); // TODO - this needs direction or transfer type to select qid - struct ndma_eng *eng = &nd->ndma_engine[eng_id]; - struct ndma_queue *queue = &eng->queues[qid]; - struct ndma_ring *ring = &queue->ring_info; - struct ndma_h2t_zcdma_context dma_ctx_tbl[2] = {0}; - struct ndma_h2t_zcdma_context * dma_ctx; - struct ndma_h2t_zcdma_context * pdma_ctx = NULL; - int next_dma_idx = 0; - int i; - u64 remaining = size; - u64 cpy_size = (NDMA_ZC_PAGES_PER_XFER*PAGE_SIZE < size) ? 
NDMA_ZC_PAGES_PER_XFER*PAGE_SIZE : size; - int nr_pinned; - - // initialize the static fields in the dma contexts that are the same for every operation - // - for (i=0;i< 2;i++) { - dma_ctx_tbl[i].eng = eng; - dma_ctx_tbl[i].ring = ring; - dma_ctx_tbl[i].direction = direction; - dma_ctx_tbl[i].page_list = kcalloc( NDMA_ZC_PAGES_PER_XFER, sizeof(struct page *), GFP_KERNEL); - dma_ctx_tbl[i].completion_ptr = kmalloc(DMA_COMPLETION_MARKER_SIZE * 2, GFP_KERNEL); - - if ((dma_ctx_tbl[i].page_list == NULL) || (dma_ctx_tbl[i].completion_ptr == NULL)) { - pr_err("could not allocate memory for dma contexts on nd %d\n", nd->device_index); - goto fail; - } - } - pdma_ctx = NULL; - - mutex_lock(&eng->h2t_ring_lock); - - while (remaining) { - unsigned long offset = (unsigned long)(host_addr) & (PAGE_SIZE-1); - dma_ctx = &dma_ctx_tbl[next_dma_idx]; - dma_ctx->start_time = get_jiffies_64(); - dma_ctx->host_addr = host_addr; - dma_ctx->dev_addr = dev_addr; - dma_ctx->size = (cpy_size == remaining) ? cpy_size : cpy_size - offset; // slightly non-obvious, we are setting up xfer size - // that only the first xfer has its starting address - // not aligned to the page boundary. First time around - // offset >= 0 and cpy_size <= xfer size. Other times - // host_addr is aligned, offset = 0 and cpy_size = xfer_size - dma_ctx->nr_pages = DIV_ROUND_UP(offset + dma_ctx->size, PAGE_SIZE); - - //__GFP_SKIP_ZERO - nr_pinned = pin_user_pages_fast((unsigned long)dma_ctx->host_addr & PAGE_MASK, dma_ctx->nr_pages, direction ? 
0 : FOLL_WRITE, dma_ctx->page_list); - if (nr_pinned != dma_ctx->nr_pages) { - // if failed pin_fast because of page fault, do the regular pinning - if (nr_pinned > 0) - unpin_user_pages( dma_ctx->page_list, nr_pinned); + int ret = 0; + const int eng_id = ndhal->ndhal_ndmar.ndmar_get_h2t_eng_id(nd, nc_id); + struct ndma_eng *eng = &nd->ndma_engine[eng_id]; + struct ndma_queue *queue = &eng->queues[qid]; + struct ndma_ring *ring = &queue->ring_info; + struct ndma_h2t_zcdma_context dma_ctx_tbl[2] = {0}; + struct ndma_h2t_zcdma_context *pdma_ctx = NULL; + int next_dma_idx = 0; + int i = 0; + bool locked = false; + + // sanity check ring is owned by nc_id + if (!ndmar_h2t_ring_is_owner(ring, nc_id)) { + pr_err("nd%02d: attempting to use qid %d that was not assigned to nc %d\n", nd->device_index, qid, nc_id); + return -ENOENT; + } + + // initialize the static fields in the dma contexts that are the same for every operation + for (i=0;i< 2;i++) { + dma_ctx_tbl[i].eng = eng; + dma_ctx_tbl[i].ring = ring; + dma_ctx_tbl[i].direction = direction; + dma_ctx_tbl[i].page_list = kcalloc( NDMA_ZC_PAGES_PER_XFER, sizeof(struct page *), GFP_KERNEL); + dma_ctx_tbl[i].completion_ptr = kmalloc(DMA_COMPLETION_MARKER_SIZE * 2, GFP_KERNEL); + + if ((dma_ctx_tbl[i].page_list == NULL) || (dma_ctx_tbl[i].completion_ptr == NULL)) { + pr_err("could not allocate memory for dma contexts on nd %d\n", nd->device_index); + ret = -ENOMEM; + goto fail; + } + } + pdma_ctx = NULL; + + mutex_lock(&ring->h2t_ring_lock); + locked = true; + + // Process all operations with pipelining + for (i = 0; i < num_ops; i++) { + const nrt_tensor_batch_op_t *op = &ops[i]; + u64 remaining = op->size; + void *host_addr = op->buffer; + dma_addr_t dev_addr = dev_base + op->offset; + u64 offset = (unsigned long)host_addr & (PAGE_SIZE - 1); + u64 pin_size = ndma_calc_zc_pin_size(op->size + offset); // pin size is in page units, so include the page offset in size calc + + while (remaining) { + struct ndma_h2t_zcdma_context 
*dma_ctx = &dma_ctx_tbl[next_dma_idx]; + dma_ctx->start_time = get_jiffies_64(); + dma_ctx->host_addr = host_addr; + dma_ctx->dev_addr = dev_addr; + dma_ctx->size = pin_size - offset; // first chunk might not be aligned on the page boundary, all subsequent chunk will be aligned + // and the offset will be 0 + dma_ctx->last = (dma_ctx->size == remaining && i == num_ops - 1); + dma_ctx->nr_pages = DIV_ROUND_UP(pin_size, PAGE_SIZE); + if (dma_ctx->nr_pages > NDMA_ZC_PAGES_PER_XFER) { + pr_err_once("page count too large: %u\n", dma_ctx->nr_pages); + } + + //__GFP_SKIP_ZERO + int nr_pinned = pin_user_pages_fast((unsigned long)dma_ctx->host_addr & PAGE_MASK, dma_ctx->nr_pages, + direction ? 0 : FOLL_WRITE, dma_ctx->page_list); + if (nr_pinned != dma_ctx->nr_pages) { + // if failed pin_fast because of page fault, do the regular pinning + if (nr_pinned > 0) { + unpin_user_pages( dma_ctx->page_list, nr_pinned); + } #if (!defined(RHEL_RELEASE_CODE) && (LINUX_VERSION_CODE >= KERNEL_VERSION(6, 5, 0))) || (defined(RHEL_RELEASE_CODE) && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 6))) - nr_pinned = pin_user_pages((unsigned long)dma_ctx->host_addr & PAGE_MASK, dma_ctx->nr_pages, direction ? 0 : FOLL_WRITE, dma_ctx->page_list); + nr_pinned = pin_user_pages((unsigned long)dma_ctx->host_addr & PAGE_MASK, dma_ctx->nr_pages, direction ? 0 : FOLL_WRITE, dma_ctx->page_list); #else - nr_pinned = pin_user_pages((unsigned long)dma_ctx->host_addr & PAGE_MASK, dma_ctx->nr_pages, direction ? 0 : FOLL_WRITE, dma_ctx->page_list, NULL); + nr_pinned = pin_user_pages((unsigned long)dma_ctx->host_addr & PAGE_MASK, dma_ctx->nr_pages, direction ? 
0 : FOLL_WRITE, dma_ctx->page_list, NULL); #endif - if (nr_pinned != dma_ctx->nr_pages) { - ret = -ENOMEM; // could use -EBUSY instead - pr_err("could not pin host pages for zero copy dma on nd %d: nr_pinned %d\n", nd->device_index, nr_pinned); - - if (nr_pinned > 0) - unpin_user_pages( dma_ctx->page_list, nr_pinned); - // cleanup: wait for prev dma to complete (which also unpins pages) - if (pdma_ctx != NULL) - ndma_zero_copy_wait_for_completion( nd, nc_id, eng, ring, pdma_ctx, NULL); - goto fail; - } - } - - // TODO need to have this for other architectures - // for (i=0; i < dma_ctx->nr_pages; i++) { - // struct device - // dma_ctx->addr[i] = dma_map_page( nd->pdev->dev, dma_ctx_page_list[i], 0, PAGE_SIZE, DMA_TO_DEVICE/DMA_FROM_DEVICE); - // ret = dma_mapping_error(dev->dev, dma_ctx->addr[i]); - // if (ret) { } - // } - // flush_cache_range(vma, - // - // TODO - may need a callback here to check descriptors - - ret = ndma_build_n_issue_zc_descs( dma_ctx); - if (ret) { - unpin_user_pages( dma_ctx->page_list, dma_ctx->nr_pages); - // cleanup: wait for prev dma to complete (which also unpins pages) - if (pdma_ctx != NULL) ndma_zero_copy_wait_for_completion( nd, nc_id, eng, ring, pdma_ctx, NULL); - goto fail; - } - - if (pdma_ctx != NULL) { - ret = ndma_zero_copy_wait_for_completion( nd, nc_id, eng, ring, pdma_ctx, dma_ctx); - if (ret) - goto fail; - } - pdma_ctx = dma_ctx; - next_dma_idx = (next_dma_idx+1) % 2; - - remaining -= dma_ctx->size; - host_addr += dma_ctx->size; - dev_addr += dma_ctx->size; - cpy_size = (remaining < cpy_size) ? 
remaining : cpy_size; - } - - ret = ndma_zero_copy_wait_for_completion( nd, nc_id, eng, ring, pdma_ctx, NULL); + if (nr_pinned != dma_ctx->nr_pages) { + ret = -ENOMEM; // could use -EBUSY instead + pr_err("could not pin host pages for zero copy dma on nd %d: nr_pinned %d\n", nd->device_index, nr_pinned); + + if (nr_pinned > 0) { + unpin_user_pages( dma_ctx->page_list, nr_pinned); + } + // cleanup: wait for prev dma to complete (which also unpins pages) + if (pdma_ctx != NULL) { + ndma_zerocopy_wait_for_completion( nd, nc_id, eng, ring, pdma_ctx, NULL); + } + goto fail; + } + } + + // TODO need to have this for other architectures + // for (i=0; i < dma_ctx->nr_pages; i++) { + // struct device + // dma_ctx->addr[i] = dma_map_page( nd->pdev->dev, dma_ctx_page_list[i], 0, PAGE_SIZE, DMA_TO_DEVICE/DMA_FROM_DEVICE); + // ret = dma_mapping_error(dev->dev, dma_ctx->addr[i]); + // if (ret) { } + // } + // flush_cache_range(vma, + + ret = ndma_build_n_issue_zc_descs(dma_ctx); + if (ret) { + unpin_user_pages( dma_ctx->page_list, dma_ctx->nr_pages); + // cleanup: wait for prev dma to complete (which also unpins pages) + if (pdma_ctx != NULL) { + ndma_zerocopy_wait_for_completion(nd, nc_id, eng, ring, pdma_ctx, NULL); + } + goto fail; + } + + if (pdma_ctx != NULL) { + ret = ndma_zerocopy_wait_for_completion(nd, nc_id, eng, ring, pdma_ctx, dma_ctx); + if (ret) { + goto fail; + } + } + + pdma_ctx = dma_ctx; + next_dma_idx = (next_dma_idx+1) % 2; + + remaining -= dma_ctx->size; + host_addr += dma_ctx->size; + dev_addr += dma_ctx->size; + pin_size = (remaining < pin_size) ? 
remaining : pin_size; + offset = 0; + } + } + + + // Wait for the last chunk + if (pdma_ctx) { + ret = ndma_zerocopy_wait_for_completion( nd, nc_id, eng, ring, pdma_ctx, NULL); + } fail: - - // release resources - // - for (i=0;i< 2;i++) { - if (dma_ctx_tbl[i].page_list != NULL) - kfree(dma_ctx_tbl[i].page_list); - if (dma_ctx_tbl[i].completion_ptr != NULL) { - kfree(dma_ctx_tbl[i].completion_ptr); - } - } - mutex_unlock(&eng->h2t_ring_lock); - - return ret; -} - -/** - * ndma_memcpy_zero_copy_mc() - * - * Wrapper around ndma_memcpy_zero_copy() that pulls nc_id and device phyical address from - * the mem chunk. - * - * Todo: - * Range check the device address here. - * - * Assumptions: - * caller has done access_ok() check on the host address - * if (!access_ok(blah) return -EFAULT; - * or check_copy_size() - */ -int ndma_memcpy_zero_copy_mc( struct neuron_device *nd, void * host_addr, struct mem_chunk *dev_mc, u64 dev_offset, u64 size, bool direction) -{ - dma_addr_t dev_addr; - u32 nc_id; - - nc_id = ndma_mc_pair_to_nc( dev_mc, dev_mc); - dev_addr = ndma_mc_to_pa( dev_mc) + dev_offset; // range has been checked by the caller - - return ndma_memcpy_zero_copy(nd, nc_id, host_addr, dev_addr, size, direction); + // release resources + for (i = 0; i < 2; i++) { + if (dma_ctx_tbl[i].page_list != NULL) + kfree(dma_ctx_tbl[i].page_list); + if (dma_ctx_tbl[i].completion_ptr != NULL) { + kfree(dma_ctx_tbl[i].completion_ptr); + } + } + if (locked) { + mutex_unlock(&ring->h2t_ring_lock); + } + + return ret; } diff --git a/neuron_dma.h b/neuron_dma.h index 79f8254..012bef4 100644 --- a/neuron_dma.h +++ b/neuron_dma.h @@ -165,7 +165,7 @@ int ndma_memcpy64k(struct ndma_eng *eng, struct ndma_ring *ring, dma_addr_t src, * add a completion entry to the ring * */ -int ndma_memcpy_add_completion_desc( struct ndma_eng *eng, struct ndma_ring *ring, void * completion_buffer); +int ndma_memcpy_add_completion_desc( struct ndma_eng *eng, struct ndma_ring *ring, void * completion_buffer, 
int barrier_type); /** * Wait for completion by start transfer of a DMA between two host memory locations and polling @@ -174,18 +174,89 @@ int ndma_memcpy_add_completion_desc( struct ndma_eng *eng, struct ndma_ring *rin int ndma_memcpy_wait_for_completion(struct ndma_eng *eng, struct ndma_ring *ring, u32 count, void * ptr, bool async, bool is_d2d); /** - * ndma_memcpy_zero_copy_mc() + * ndma_mc_pair_to_nc - Resolve the neuron core id for two memory chunks. + * @src_mc: Source memory chunk participating in the transfer. + * @dst_mc: Destination memory chunk participating in the transfer. * - * Wrapper around ndma_memcpyzero_copy() that pulls nc_id and device phyical address from - * the mem chunk. + * Returns the NC identifier that owns the DMA engine, favoring the device-side + * chunk when one side resides in host memory. + */ +u32 ndma_mc_pair_to_nc(struct mem_chunk *src_mc, struct mem_chunk *dst_mc); + +/** + * ndma_mc_to_pa - Translate a memory chunk into a DMA-usable physical address. + * @mc: Memory chunk to translate. + * + * Host chunks map through the PCI host BAR, while device chunks already carry + * their physical base address. + */ +dma_addr_t ndma_mc_to_pa(struct mem_chunk *mc); + +/** + * ndma_zerocopy_supported - Check whether zero-copy DMA is permitted. + * + * Architectures that require DMA retry disable the zero-copy pipeline. + */ +bool ndma_zerocopy_supported(void); + +/** + * ndma_memcpy_zerocopy - Perform a pipelined zero-copy DMA transfer. + * @nd: Neuron device whose DMA engine is used. + * @nc_id: Neuron core identifier owning the queue. + * @ops: Array of host buffer descriptors. + * @num_ops: Number of descriptors in @ops. + * @dev_base: Base device physical address for the transfer. + * @qid: Queue identifier to submit descriptors on. + * @direction: true for host-to-device, false for device-to-host. + * + * DMA data between a user space virtual address range and a contiguous location in device memory. 
+ * In order to do this, we need to know the physical pages are associated with + * the user virtual address range and we need to make sure those physical pages stay + * associated with the user virtual address range while the DMA is happening. + * + * How do we do this? By asking the kernel to pin the physical pages in memory until we are + * done with them. But our transaction could be large, the physical pages won't be contiguous, + * and pinning takes CPU cycles, so we break the dma transfer up into a series of smaller transfers + * where we pipeline the pinning of physical pages with dma transfers. * - * Todo: - * Range check the device address here. + * We use pin_user_pages_fast() to reduce pinning overhead because we know the process can't go + * away while we are down here doing our thing in the kernel within a single IOCTL call. + * + * We ping pong back and forth between two dma contexts. So while dma for context A is in progress, + * we are pinning pages and starting dmas for context B. + * + * Algorithm goes like this: + * initial a pair of dma contexts + * prev dma ctx = null + * lock() + * while still more data remaining + * current dma ctx = next available context + * init current dma context + * calc size of the transfer for this dma context. We want to transfer up to page boundaries + * calc number of pages that need to be pinned for this dma + * pin host pages in memory + * generate descriptors for + * if prev dma ctx != NULL, wait for the prev dma to complete + * update host address, device address and ammount remaining + * wait for the last dma ctx to complete + * unlock() + * free resources + * + * Notes: + * unpinning responsibilities. Up until a dma is successfully launched, this routine is responsible for unpinning + * host memory. After that ndma_zerocopy_wait_for_completion() owns responsibility for unpinning pages. + * + * We don't do this here, but pinning user pages across system (IOCTL) calls has a number of additional requirements. 
+ * We would have to cleanup any pinned pages when the process goes away, so any pinned pages have to get tracked in + * process context. * - * Assumptions: - * caller has done access_ok() check on the host address - * if (!access_ok(blah) return -EFAULT; - * or check_copy_size() */ -int ndma_memcpy_zero_copy_mc( struct neuron_device *nd, void * host_addr, struct mem_chunk *dev_mc, u64 dev_offset, u64 size, bool direction); +int ndma_memcpy_zerocopy(struct neuron_device *nd, + u32 nc_id, + const nrt_tensor_batch_op_t *ops, + u32 num_ops, + dma_addr_t dev_base, + int qid, + bool direction); + #endif diff --git a/neuron_dmabuf.c b/neuron_dmabuf.c index e7457ed..6a510e1 100644 --- a/neuron_dmabuf.c +++ b/neuron_dmabuf.c @@ -344,8 +344,13 @@ int ndmabuf_get_fd(u64 va, u64 size, int *dmabuf_fd) fd = dma_buf_fd(dmabuf, exp_info.flags); if (fd < 0) { - pr_err("error %d while installing a file descriptor for dma-buf\n", ret); - ret = -EINVAL; + if (fd == -EMFILE) { + pr_err("dma_buf_fd failed: too many open files\n"); + ret = -EMFILE; + } else { + pr_err("error %d while installing a file descriptor for dma-buf\n", ret); + ret = -EINVAL; + } goto err_dma_buf_put; } diff --git a/neuron_fw_io.c b/neuron_fw_io.c index 44f4a18..8632d04 100644 --- a/neuron_fw_io.c +++ b/neuron_fw_io.c @@ -25,7 +25,6 @@ #include "neuron_reg_access.h" #include "neuron_device.h" #include "neuron_arch.h" -#include "v1/fw_io.h" #include "neuron_fw_io.h" #include "neuron_dhal.h" @@ -46,7 +45,7 @@ int fw_io_ecc_read(void *bar0, uint64_t ecc_offset, uint32_t *ecc_err_count) } void *addr = bar0 + ndhal->ndhal_address_map.bar0_misc_ram_offset + ecc_offset; - int ret = ndhal->ndhal_fw_io.fw_io_read_csr_array(&addr, ecc_err_count, 1, false); + int ret = ndhal->ndhal_fw_io.fw_io_read_csr_array(&addr, ecc_err_count, 1, true); if (ret) { pr_err("failed to get ecc error count from the device for ecc_offset=%llu\n", ecc_offset); return -EIO; @@ -60,7 +59,7 @@ int fw_io_hbm_uecc_repair_state_read(void *bar0, 
uint32_t *hbm_repair_state) int ret; void *addr = bar0 + ndhal->ndhal_address_map.bar0_misc_ram_offset + FW_IO_REG_HBM_REPAIR_STATE_OFFSET; - ret = ndhal->ndhal_fw_io.fw_io_read_csr_array(&addr, hbm_repair_state, 1, false); + ret = ndhal->ndhal_fw_io.fw_io_read_csr_array(&addr, hbm_repair_state, 1, true); if (ret) { pr_err("failed to get hbm reapirable state\n"); return -EIO; @@ -82,7 +81,7 @@ int fw_io_serial_number_read(void *bar0, uint64_t *serial_number) uint32_t serial_number_lo = 0; void *addr_lo = bar0 + ndhal->ndhal_address_map.bar0_misc_ram_offset + FW_IO_REG_SERIAL_NUMBER_LO_OFFSET; - ret = ndhal->ndhal_fw_io.fw_io_read_csr_array(&addr_lo, &serial_number_lo, 1, false); + ret = ndhal->ndhal_fw_io.fw_io_read_csr_array(&addr_lo, &serial_number_lo, 1, true); if (ret) { pr_err("failed to get the lower 32 bits of the serial number from the device\n"); return -EIO; @@ -90,7 +89,7 @@ int fw_io_serial_number_read(void *bar0, uint64_t *serial_number) uint32_t serial_number_hi = 0; void *addr_hi = bar0 + ndhal->ndhal_address_map.bar0_misc_ram_offset + FW_IO_REG_SERIAL_NUMBER_HI_OFFSET; - ret = ndhal->ndhal_fw_io.fw_io_read_csr_array(&addr_hi, &serial_number_hi, 1, false); + ret = ndhal->ndhal_fw_io.fw_io_read_csr_array(&addr_hi, &serial_number_hi, 1, true); if (ret) { pr_err("failed to get the higher 32 bits of the serial number from the device\n"); return -EIO; @@ -113,7 +112,7 @@ int fw_io_device_power_read(void *bar0, u32 *power, unsigned die) // Read power utilization from MiscRAM. The power utilization for each die are set up in contiguous 32 bit // miscram registers, so we can treat it like an array of uint32s for our purposes. 
void *addr = bar0 + ndhal->ndhal_address_map.bar0_misc_ram_offset + FW_IO_REG_POWER_UTIL_D0_OFFSET + 4*die; - ret = ndhal->ndhal_fw_io.fw_io_read_csr_array(&addr, power, 1, false); + ret = ndhal->ndhal_fw_io.fw_io_read_csr_array(&addr, power, 1, true); if (ret) { pr_err("failed to get device power from the device, ret = %d\n", ret); } @@ -126,7 +125,7 @@ int fw_io_api_version_read(void * bar0, u32 *version) int ret; void *addr = bar0 + ndhal->ndhal_address_map.bar0_misc_ram_offset + FW_IO_REG_API_VERSION_OFFSET; - ret = ndhal->ndhal_fw_io.fw_io_read_csr_array(&addr, version, 1, false); + ret = ndhal->ndhal_fw_io.fw_io_read_csr_array(&addr, version, 1, true); if (ret) { pr_err("failed to get api version from the device, ret = %d\n", ret); } @@ -134,6 +133,19 @@ int fw_io_api_version_read(void * bar0, u32 *version) return ret; } +int fw_io_server_info_read(void *bar0, u32 *server_info) +{ + int ret; + + void *addr = bar0 + ndhal->ndhal_address_map.bar0_misc_ram_offset + FW_IO_REG_SERVER_RACK_ID_OFFSET; + ret = ndhal->ndhal_fw_io.fw_io_read_csr_array(&addr, server_info, 1, true); + if (ret) { + pr_err("failed to get server info from the device, ret = %d\n", ret); + } + + return ret; +} + int fw_io_device_id_read(void *bar0, u32 *device_id) { void * addr = bar0 + ndhal->ndhal_address_map.bar0_misc_ram_offset + FW_IO_REG_DEVICE_ID_OFFSET; @@ -217,14 +229,30 @@ static void dx_crc32c_add(const u8 *data, size_t len, u32 *csum) } } -static u32 crc32c(const u8 *data, size_t len) +static const u32 fw_io_cmd_timeout_tbl[FW_IO_CMD_MAX] = { + 0, // cmd 0 + (1000 * 1000 * 1), // cmd 1 (FW_IO_CMD_READ) + (1000 * 1000 * 1), // cmd 2 (FW_IO_CMD_POST_TO_CW) + (1000 * 1000 * 60) // cmd 3 (FW_IO_CMD_SET_POWER_PROFILE) +}; + +static const u32 fw_io_cmd_retry_tbl[FW_IO_CMD_MAX] = { + 0, // cmd 0 + 15, // cmd 1 (FW_IO_CMD_READ) + 15, // cmd 2 (FW_IO_CMD_POST_TO_CW) + 3 // cmd 3 (FW_IO_CMD_SET_POWER_PROFILE) +}; + +static u32 crc32c(const u8 *hdr, const u8 *data, size_t len) { u32 csum = 
0xffffffff; + if (hdr != NULL) + dx_crc32c_add(hdr, 8, &csum); dx_crc32c_add(data, len, &csum); return csum ^ 0xffffffff; } -static int fw_io_execute_request(struct fw_io_ctx *ctx, u8 command_id, const u8 *req, u32 req_size, +int fw_io_execute_request(struct fw_io_ctx *ctx, u8 command_id, const u8 *req, u32 req_size, u8 *resp, u32 resp_size) { int ret; @@ -240,7 +268,11 @@ static int fw_io_execute_request(struct fw_io_ctx *ctx, u8 command_id, const u8 return -EINVAL; } - mutex_lock(&ctx->lock); + // HACK: Skip mutex lock for POST_TO_CW commands as fw_io_post_metric() already holds the lock + // TODO: Remove this hack implementation when legacy API is deprecated + if (command_id != FW_IO_CMD_POST_TO_CW) { + mutex_lock(&ctx->lock); + } int i; for (i=0; i < FW_IO_RD_RETRY; i++){ @@ -250,26 +282,27 @@ static int fw_io_execute_request(struct fw_io_ctx *ctx, u8 command_id, const u8 ctx->next_seq_num = 1; memcpy(ctx->request->data, req, req_size); - ctx->request->sequence_number = ctx->next_seq_num; - ctx->request->command_id = command_id; - ctx->request->size = req_size + sizeof(struct fw_io_request); - ctx->request->crc32 = 0; - ctx->request->crc32 = crc32c((const u8 *)ctx->request, ctx->request->size); + ctx->request->request_hdr.hdr.sequence_number = ctx->next_seq_num; + ctx->request->request_hdr.hdr.command_id = command_id; + ctx->request->request_hdr.hdr.size = req_size + sizeof(struct fw_io_request); + ctx->request->request_hdr.hdr.crc32 = 0; + ctx->request->request_hdr.hdr.crc32 = crc32c((const u8 *)&ctx->request->request_hdr, ctx->request->data, ctx->request->request_hdr.hdr.size - sizeof(ctx->request->request_hdr.hdr)); // make sure the sequence number we will wait on is not the same - ctx->response->sequence_number = 0; + ctx->response->response_hdr.hdr.sequence_number = 0; dma_rmb(); fw_io_trigger(ctx->bar0); // now wait for resp->seq == req->seq which indicates that request has been completed and // we have a response ktime_t start_time = ktime_get(); - 
volatile u8 *fwio_seq = (volatile u8 *)&ctx->response->sequence_number; + volatile u8 *fwio_seq = (volatile u8 *)&ctx->response->response_hdr.hdr.sequence_number; + do { resp_seq = READ_ONCE(*fwio_seq); if (resp_seq == ctx->next_seq_num) break; msleep(1); - } while ( ktime_to_us(ktime_sub(ktime_get(), start_time)) < FW_IO_RD_TIMEOUT); + } while ( ktime_to_us(ktime_sub(ktime_get(), start_time)) < FW_IO_RD_TIMEOUT); ret = -1; if (resp_seq != ctx->next_seq_num) { @@ -277,29 +310,131 @@ static int fw_io_execute_request(struct fw_io_ctx *ctx, u8 command_id, const u8 pr_err("seq: %u, cmd: %u timed out\n", ctx->next_seq_num, command_id); continue; } - if (ctx->response->error_code == FW_IO_SUCCESS) { - if ((ctx->response->size - sizeof(struct fw_io_response)) > resp_size) { + if (ctx->response->response_hdr.hdr.error_code == FW_IO_SUCCESS) { + if ((ctx->response->response_hdr.hdr.size - sizeof(struct fw_io_response)) > resp_size) { // this is probably not possible pr_err("seq: %u, cmd: %u response too large (%u)\n", ctx->next_seq_num, - command_id, ctx->response->size); + command_id, ctx->response->response_hdr.hdr.size); goto done; } memcpy(resp, ctx->response->data, - ctx->response->size - sizeof(struct fw_io_response)); + ctx->response->response_hdr.hdr.size - sizeof(struct fw_io_response)); ret = 0; goto done; } ctx->fw_io_err_count++; pr_err(KERN_ERR "seq: %u, cmd: %u failed %u\n", ctx->next_seq_num, command_id, - ctx->response->error_code); + ctx->response->response_hdr.hdr.error_code); // if we get an unsupported command response, don't retry - if (ctx->response->error_code == FW_IO_UNKNOWN_COMMAND) { + if (ctx->response->response_hdr.hdr.error_code == FW_IO_UNKNOWN_COMMAND) { ret = -1; goto done; } } done: + // HACK: Only unlock if we locked (not FW_IO_CMD_POST_TO_CW) + // TODO: Remove this hack implementation when legacy API is deprecated + if (command_id != FW_IO_CMD_POST_TO_CW) { + mutex_unlock(&ctx->lock); + } + return ret; +} + +int 
fw_io_execute_request_new(struct fw_io_ctx *ctx, u8 command_id, const u8 *req, u32 req_size, u8 *resp, u32 resp_size) +{ + int ret; + int i, j; + union fw_io_request_hdr req_header; + + u32 api_version_num = 0; + + ret = fw_io_api_version_read(ctx->bar0, &api_version_num); + + if ((ret != 0) || (api_version_num < FW_IO_NEW_READLESS_READ_MIN_API_VERSION)) { + pr_info_once("Pacific version %d, using legacy Pacific/Runtime comm framework", api_version_num); + return -ENOTSUPP; + } + + mutex_lock(&ctx->lock); + + u32 retry_count = (command_id < FW_IO_CMD_MAX) ? fw_io_cmd_retry_tbl[command_id] : FW_IO_RD_RETRY; + for (i=0; i < retry_count; i++){ + if (++ctx->next_seq_num == 0) + ctx->next_seq_num = 1; + + req_header.hdr.sequence_number = ctx->next_seq_num; + req_header.hdr.command_id = command_id; + req_header.hdr.size = req_size + sizeof(req_header); + req_header.hdr.crc32 = 0; + req_header.hdr.crc32 = crc32c((const u8 *)&req_header, req, req_size); + + // Write data + if (req_size > 0) { + u32 *data = (u32*)req; + for (j=0; j < (req_size + 3) / 4; j++) { + reg_write32(ctx->bar0 + ndhal->ndhal_address_map.bar0_misc_ram_offset + FW_IO_REG_DATA_OFFSET + j*4, data[j]); + } + } + + // Write header + reg_write32(ctx->bar0 + ndhal->ndhal_address_map.bar0_misc_ram_offset + FW_IO_REG_REQUEST_BASE_ADDR_HIG_OFFSET, req_header.reg.dw0); + reg_write32(ctx->bar0 + ndhal->ndhal_address_map.bar0_misc_ram_offset + FW_IO_REG_REQUEST_BASE_ADDR_LOW_OFFSET, req_header.reg.dw1); + + // Zero response header + reg_write32(ctx->bar0 + ndhal->ndhal_address_map.bar0_misc_ram_offset + FW_IO_REG_RESPONSE_BASE_ADDR_HIGH_OFFSET, 0); + reg_write32(ctx->bar0 + ndhal->ndhal_address_map.bar0_misc_ram_offset + FW_IO_REG_RESPONSE_BASE_ADDR_LOW_OFFSET, 0); + + // Set ack and trigger + reg_write32(ctx->bar0 + ndhal->ndhal_address_map.bar0_misc_ram_offset + FW_IO_REG_ACK_OFFSET, 1); + reg_write32(ctx->bar0 + ndhal->ndhal_address_map.bar0_misc_ram_offset + FW_IO_REG_TRIGGER_INT_NOSEC_OFFSET, 1); + + // Poll 
for completion
+		ktime_t start_time = ktime_get();
+		u32 trigger;
+		u32 timeout = (command_id < FW_IO_CMD_MAX) ? fw_io_cmd_timeout_tbl[command_id] : FW_IO_RD_TIMEOUT;
+		do {
+			reg_read32(ctx->bar0 + ndhal->ndhal_address_map.bar0_misc_ram_offset + FW_IO_REG_TRIGGER_INT_NOSEC_OFFSET, &trigger);
+			if (!trigger) break;
+			msleep(1);
+		} while (ktime_to_us(ktime_sub(ktime_get(), start_time)) < timeout);
+		if (trigger) {
+			if (command_id != FW_IO_CMD_POST_TO_CW)
+				pr_err("seq: %u, cmd: %u timed out\n", ctx->next_seq_num, command_id);
+			continue;
+		}
+
+		// Read response header.  NOTE(review): this reads the REQUEST header register
+		// (FW_IO_REG_REQUEST_BASE_ADDR_HIG_OFFSET); presumably firmware posts the
+		// response header back over it — confirm against firmware protocol docs.
+		union fw_io_response_hdr resp_header;
+		reg_read32(ctx->bar0 + ndhal->ndhal_address_map.bar0_misc_ram_offset + FW_IO_REG_REQUEST_BASE_ADDR_HIG_OFFSET, &resp_header.reg.dw0);
+
+		if (resp_header.hdr.sequence_number != ctx->next_seq_num) {
+			if (command_id != FW_IO_CMD_POST_TO_CW)
+				pr_err("seq: %u, cmd: %u seq mismatch\n", ctx->next_seq_num, command_id);
+			continue;
+		}
+
+		if (resp_header.hdr.error_code == FW_IO_SUCCESS) {
+			u32 data_size = resp_header.hdr.size - sizeof(resp_header);
+			if (data_size > 0 && resp != NULL) {
+				u32 copy_size = min(resp_size, data_size);
+				u32 *resp_data = (u32*)resp;
+				for (j = 0; j < (copy_size + 3) / 4; j++) {
+					reg_read32(ctx->bar0 + ndhal->ndhal_address_map.bar0_misc_ram_offset + FW_IO_REG_DATA_OFFSET + j*4, &resp_data[j]);
+				}
+			}
+			ret = 0;
+			break;
+		}
+
+		ctx->fw_io_err_count++;
+		// pr_err() already prepends KERN_ERR via pr_fmt(); passing KERN_ERR in the
+		// format string would embed the raw level-prefix bytes in the message.
+		pr_err("seq: %u, cmd: %u failed %u\n", ctx->next_seq_num, command_id, resp_header.hdr.error_code);
+		if (resp_header.hdr.error_code == FW_IO_UNKNOWN_COMMAND) {
+			ret = -1;
+			break;
+		}
+	}
+	mutex_unlock(&ctx->lock);
 	return ret;
 }
 
@@ -319,7 +454,7 @@ int fw_io_read(struct fw_io_ctx *ctx, u64 addr_in[], u32 val_out[], u32 num_req)
 	if (should_fail(&neuron_fail_fwio_read, 1))
 		return -ETIMEDOUT;
 #endif
-	return fw_io_execute_request(ctx, FW_IO_CMD_READ, (u8 *)addr_in, sizeof(u64) * num_req,
+	return ndhal->ndhal_fw_io.fw_io_execute_request(ctx, FW_IO_CMD_READ, (u8 *)addr_in, 
sizeof(u64) * num_req, (u8 *)val_out, sizeof(u32) * num_req); } @@ -453,6 +588,7 @@ int fw_io_post_metric(struct fw_io_ctx *ctx, u8 *data, u32 size) u32 padded_u32 = 0; u32 *m = (u32 *)data; int i; + int ret; #ifdef CONFIG_FAULT_INJECTION if (should_fail(&neuron_fail_fwio_post_metric, 1)) @@ -462,8 +598,11 @@ int fw_io_post_metric(struct fw_io_ctx *ctx, u8 *data, u32 size) return -E2BIG; } + // Lock mutex to prevent race condition with new interface + mutex_lock(&ctx->lock); + // Write the data in the misc ram first - void * offset = (void *) (ctx->bar0 + ndhal->ndhal_address_map.bar0_misc_ram_offset + FW_IO_REG_METRIC_OFFSET); + void * offset = (void *) (ctx->bar0 + ndhal->ndhal_address_map.bar0_misc_ram_offset + FW_IO_REG_DATA_OFFSET); for (i = 0; i < (size / 4); i++) { reg_write32(offset + (i * 4), m[i]); } @@ -474,9 +613,19 @@ int fw_io_post_metric(struct fw_io_ctx *ctx, u8 *data, u32 size) reg_write32(offset + size_aligned, padded_u32); } - return fw_io_execute_request(ctx, FW_IO_CMD_POST_TO_CW, data, size, data, size); + ret = ndhal->ndhal_fw_io.fw_io_execute_request(ctx, FW_IO_CMD_POST_TO_CW, data, size, data, size); + + mutex_unlock(&ctx->lock); + return ret; } +int fw_io_post_metric_new(struct fw_io_ctx *ctx, u8 *data, u32 size) +{ + if (size > FW_IO_REG_METRIC_BUF_SZ) { + return -E2BIG; + } + return fw_io_execute_request_new(ctx, FW_IO_CMD_POST_TO_CW, data, size, NULL, 0); +} int fw_io_read_counters(struct fw_io_ctx *ctx, uint64_t addr_in[], uint32_t val_out[], uint32_t num_counters) @@ -575,3 +724,18 @@ uint32_t fw_io_get_total_uecc_err_count(void *bar0) { } return total_uncorrected_ecc_err_count; } + +int fw_io_set_power_profile(struct fw_io_ctx *ctx, uint32_t profile) +{ + union fw_io_req_perfprofile_data data = {0}; + data.rec.profile = (uint8_t)profile; + data.rec.voltage_margin = 0; + data.rec.frequency_index = 0; + data.rec.ocw_index = 0; + + if (!ctx) { + return -EINVAL; + } + + return fw_io_execute_request_new(ctx, FW_IO_CMD_SET_POWER_PROFILE, 
(u8 *)&data, sizeof(data), NULL, 0); +} diff --git a/neuron_fw_io.h b/neuron_fw_io.h index f915d55..6c60d19 100644 --- a/neuron_fw_io.h +++ b/neuron_fw_io.h @@ -7,24 +7,55 @@ #include +union fw_io_request_hdr { + struct { + u8 sequence_number; // sequence number to be copied in the next response. + u8 command_id; // command to hw. + u16 size; // request size in bytes including the header. + u32 crc32; // crc32 of the entire request, crc32 must be set to 0 before calculating + } hdr; + struct { + u32 dw0; // bytes 0-3: sequence_number, command_id, size + u32 dw1; // bytes 4-7: crc32 + } reg; +}; + +union fw_io_response_hdr { + struct { + u8 sequence_number; // request sequence number + u8 error_code; // 0 means request was successfully completed + u16 size; // response size in bytes including this header + } hdr; + struct { + u32 dw0; + } reg; +}; + struct fw_io_request { - u8 sequence_number; // sequence number to be copied in the next response. - u8 command_id; // command to hw. - u16 size; // request size in bytes including the header. 
- u32 crc32; // crc32 of the entire request, crc32 must be set to 0 before calculating - u8 data[0]; + union fw_io_request_hdr request_hdr; + u8 data[]; }; struct fw_io_response { - u8 sequence_number; // request sequence number - u8 error_code; // 0 means request was successfully completed - u16 size; // response size in bytes including this header - u8 data[0]; // response data if any + union fw_io_response_hdr response_hdr; + u8 data[]; +}; + +union fw_io_req_perfprofile_data { + struct { + uint32_t reserved; + uint8_t profile; + uint8_t voltage_margin; + uint8_t frequency_index; + uint8_t ocw_index; + } rec; + uint32_t raw[2]; }; enum { FW_IO_CMD_READ = 1, // read a register value - FW_IO_CMD_POST_TO_CW = 2 // post given blob as metrics to CloudWatch + FW_IO_CMD_POST_TO_CW = 2, // post given blob as metrics to CloudWatch + FW_IO_CMD_SET_POWER_PROFILE = 3 // set power profile }; enum { @@ -42,6 +73,7 @@ enum { // offsets in MISC RAM for FWIO enum { FW_IO_REG_DEVICE_ID_OFFSET = 0x24, + FW_IO_REG_SERVER_RACK_ID_OFFSET = 0x70, // MISC RAM register for API version // - This register is used to determine the API version of the firmware. @@ -77,7 +109,7 @@ enum { FW_IO_REG_RUNTIME_RESERVED0 = 0xC0, // 0xC0 to 0xF0 - FW_IO_REG_METRIC_OFFSET = 0x100, // 0x100 to 0x17F, 128 bytes + FW_IO_REG_DATA_OFFSET = 0x100, // 0x100 to 0x17F, 128 bytes FW_IO_REG_LH_NEIGHBOR_SERNUM_HI = 0x180, // LH/RH neighbors FW_IO_REG_LH_NEIGHBOR_SERNUM_LO = 0x184, FW_IO_REG_RH_NEIGHBOR_SERNUM_HI = 0x188, @@ -94,6 +126,7 @@ enum { FW_IO_REG_RESPONSE_BASE_ADDR_LOW_OFFSET = 0x1fc, FW_IO_REG_RESPONSE_BASE_ADDR_HIGH_OFFSET = 0x1f8, FW_IO_REG_TRIGGER_INT_NOSEC_OFFSET = 0x800, + FW_IO_REG_ACK_OFFSET = 0xf0, }; #define FW_IO_REG_METRIC_BUF_SZ 128 @@ -113,13 +146,24 @@ struct fw_io_ctx { #define UINT64_LOW(x) ((u32)(((u64)(x)) & 0xffffffffULL)) #define UINT64_HIGH(x) ((u32)((x) >> 32)) -// Hardware might take up to 15 seconds in worst case. 
+#define FW_IO_CMD_MAX 4
+
+// FW_IO_CMD_MAX bounds the per-command tables fw_io_cmd_timeout_tbl / fw_io_cmd_retry_tbl
+
+// Wait up to 30 seconds in worst case.
+// Hardware can in some cases take longer to come out of reset but for some reads
+// (like reading device ID before creating the device) we cannot wait too long
+// because it's confusing if driver load appears to hang and kernel may complain
 #define FW_IO_RD_TIMEOUT (1000 * 1000 * 1)
-#define FW_IO_RD_RETRY 15
+#define FW_IO_RD_RETRY 30
 
 // max number of registers can be read in single function call
 #define FW_IO_MAX_READLESS_READ_REGISTER_COUNT 100
 
+// Min Pacific API version for new readless read framework
+#define FW_IO_NEW_READLESS_READ_MIN_API_VERSION 7
+#define FW_IO_POWER_MIN_API_VERSION 3
+
 
 /**
  * fw_io_register_read_region - Read a BAR region
@@ -212,6 +256,15 @@ void fw_io_destroy(struct fw_io_ctx *ctx);
  */
 int fw_io_post_metric(struct fw_io_ctx *ctx, u8 *data, u32 size);
 
+/**
+ * fw_io_post_metric_new() - Post given block data as metric using new framework
+ * @param ctx: FWIO context
+ * @param data: data to post
+ * @param size: size of the data
+ * Return: 0 if metric is successfully posted, negative on failure
+ */
+int fw_io_post_metric_new(struct fw_io_ctx *ctx, u8 *data, u32 size);
+
 /**
  * fw_io_initiate_reset() - Initiate device local reset.
  *
@@ -246,6 +299,14 @@ bool fw_io_is_reset_initiated(void __iomem *bar0);
 int fw_io_read_counters(struct fw_io_ctx *ctx, uint64_t addr_in[], uint32_t val_out[],
 			uint32_t num_counters);
 
+/**
+ * fw_io_server_info_read() - Read server info
+ * @param bar - from bar
+ * @param server_info - server info containing rack & server ids
+ * @return 0 on success. 
+ */ +int fw_io_server_info_read(void *bar0, u32 *server_info); + /** * fw_io_device_id_read() - Read device id * @param bar - from bar @@ -321,4 +382,36 @@ uint32_t fw_io_get_total_uecc_err_count(void *bar0); * @param bar0: from bar */ int fw_io_hbm_uecc_repair_state_read(void *bar0, uint32_t *hbm_repair_state); + +/** + * fw_io_execute_request() - Execute request (original protocols) + * @param ctx: FWIO context + * @param command_id: Command ID + * @param req: Request data + * @param req_size: Request size + * @param resp: Response buffer + * @param resp_size: Response buffer size + * @return 0 on success, negative on failure + */ +int fw_io_execute_request(struct fw_io_ctx *ctx, u8 command_id, const u8 *req, u32 req_size, u8 *resp, u32 resp_size); + +/** + * fw_io_execute_request_new() - Execute request (new protocols) + * @param ctx: FWIO context + * @param command_id: Command ID + * @param req: Request data + * @param req_size: Request size + * @param resp: Response buffer + * @param resp_size: Response buffer size + * @return 0 on success, negative on failure + */ +int fw_io_execute_request_new(struct fw_io_ctx *ctx, u8 command_id, const u8 *req, u32 req_size, u8 *resp, u32 resp_size); + +/** + * fw_io_set_power_profile() - Set power profile + * @param ctx: FWIO context + * @param profile: Power profile value + * @return 0 on success, negative on failure + */ +int fw_io_set_power_profile(struct fw_io_ctx *ctx, uint32_t profile); #endif diff --git a/neuron_ioctl.h b/neuron_ioctl.h index e34c1b0..80989d9 100644 --- a/neuron_ioctl.h +++ b/neuron_ioctl.h @@ -186,6 +186,26 @@ struct neuron_ioctl_mem_buf_copy64 { __u32 copy_to_mem_handle; // [in] if set to True copies from buffer to memhandle else copies from memhandle to buffer. }; +struct neuron_ioctl_mem_buf_copy64zc { + __u64 mem_handle; // [in] Source or Destination memory handle from/to data needs to be copied. + void *buffer; // [in] Buffer from/to where data to be copied. 
+	__u64 size; // [in] Size of the data to be copied.
+	__u64 offset; // [in] Offset in the memory handle where the data to be written/read.
+	__u32 is_copy_to_device; // [in] if set to True copies to device
+	__u32 bar4_wr_threshold; // [in] threshold below which we will use bar4 direct write vs. DMA. Subject to driver limits.
+	__s32 h2t_qid; // [in] h2t queue to use for the transfer. -1 = use default
+	__u32 dummy; // [na] pad to change size of struct to version ioctl
+};
+
+struct neuron_ioctl_mem_buf_copy64zc_batches {
+	__u64 sequence_num; // [in] The sequence number that uniquely identifies each async I/O.
+	neuron_memcpy_batch_t *batches_ptr; // [in] Pointer to an array of memory copy batches.
+	__u32 num_batches; // [in] Number of batches in the batches_ptr array.
+	__u16 is_copy_to_device; // [in] If true, then copies from host to device.
+	__s32 h2t_qid; // [in] H2T queue ID. -1 for default.
+	__u16 flags; // [in] TBD
+};
+
 struct neuron_ioctl_program_engine {
 	__u64 dst; // [in] Destination engine address
 	void *buffer; // [in] Buffer from/to where data to be copied.
@@ -581,6 +601,34 @@ struct neuron_ioctl_pod_ctrl_v2 {
 	__u32 mode; // [in] operating mode
 };
 
+struct neuron_ioctl_h2t_dma_alloc_queues {
+	__u16 sz; // [in] structure size for versioning.
+	__u16 nc_id; // [in] neuron core to allocate h2t queues for
+	__u16 copy_queue_cnt; // [in] number of copy queues requested
+	__u16 service_queue_cnt; // [in] number of service queues requested
+	__u32 copy_queue_bmap; // [out] return bitmap of copy queues allocated
+	__u32 service_queue_bmap; // [out] return bitmap of service queues allocated
+	__u32 copy_default_queue; // [out] return default h2t copy queue
+};
+
+struct neuron_ioctl_h2t_dma_free_queues {
+	__u16 sz; // [in] structure size for versioning. 
+ __u8 nc_id; // [in] neuron core to free h2t queues for + __u8 fill0; // [in] padding + __u32 queue_bmap; // [in] bitmap of queues to free +}; + +struct neuron_ioctl_power_profile { + __u16 sz; // [in] size of the structure + __u16 ctrl; // [in] control 0 = set; 1 = get + __u32 profile; // [in] power profile to set +}; + +struct neuron_ioctl_metrics_ctrl { + __u32 mode; // [in] modifications to metric behavior (neuron_metrics_mode) +}; + + #define NEURON_IOCTL_BASE 'N' /* Deprecated reset related IOCTLs. Now it would always return success. */ @@ -775,9 +823,18 @@ struct neuron_ioctl_pod_ctrl_v2 { #define NEURON_IOCTL_POD_CTRL _IOWR(NEURON_IOCTL_BASE, 123, struct neuron_ioctl_pod_ctrl) #define NEURON_IOCTL_POD_CTRL_V2 _IOWR(NEURON_IOCTL_BASE, 123, struct neuron_ioctl_pod_ctrl_v2) -#define NEURON_IOCTL_MEM_BUF_ZEROCOPY64 _IOWR(NEURON_IOCTL_BASE, 124, struct neuron_ioctl_mem_buf_copy64) +#define NEURON_IOCTL_MEM_BUF_ZEROCOPY64 _IOWR(NEURON_IOCTL_BASE, 124, struct neuron_ioctl_mem_buf_copy64zc) + +#define NEURON_IOCTL_H2T_DMA_ALLOC_QUEUES _IOWR(NEURON_IOCTL_BASE, 125, struct neuron_ioctl_h2t_dma_alloc_queues) +#define NEURON_IOCTL_H2T_DMA_FREE_QUEUES _IOWR(NEURON_IOCTL_BASE, 126, struct neuron_ioctl_h2t_dma_free_queues) + +#define NEURON_IOCTL_POWER_PROFILE _IOW(NEURON_IOCTL_BASE, 127, struct neuron_ioctl_power_profile) + +#define NEURON_IOCTL_METRICS_CTRL _IOW(NEURON_IOCTL_BASE, 128, struct neuron_ioctl_metrics_ctrl) + +#define NEURON_IOCTL_MEM_BUF_ZEROCOPY64_BATCHES _IOWR(NEURON_IOCTL_BASE, 129, struct neuron_ioctl_mem_buf_copy64zc_batches) // Note: 133 is taken by NEURON_IOCTL_DMA_QUEUE_INIT_BATCH -#define NEURON_IOCTL_MAX 125 +#define NEURON_IOCTL_MAX 130 #endif diff --git a/neuron_metrics.c b/neuron_metrics.c index f9b9085..6fbc28d 100644 --- a/neuron_metrics.c +++ b/neuron_metrics.c @@ -51,6 +51,7 @@ enum nmetric_cw_id { NMETRIC_CW_ID_AVG_TPB_RESET_TIME_MS = 53, NMETRIC_CW_ID_DEVICE_RESET_FAILURE_COUNT = 54, NMETRIC_CW_ID_TPB_RESET_FAILURE_COUNT = 55, + 
NMETRIC_CW_ID_PERFORMANCE_PROFILE_ID = 56, // Extra versions // extra space for reporting multiple versions of the same type in one post @@ -112,6 +113,7 @@ static const nmetric_def_t nmetric_defs[] = { // constant metrics NMETRIC_CONSTANT_DEF(0, POST_TIME_ALWAYS, NMETRIC_CW_ID_INSTANCE_ID), // instance id NMETRIC_CONSTANT_DEF(1, POST_TIME_ALWAYS, NMETRIC_CW_ID_DRIVER_VERSION), // driver version + NMETRIC_CONSTANT_DEF(2, POST_TIME_TICK_0, NMETRIC_CW_ID_PERFORMANCE_PROFILE_ID), // performance profile id // version metrics NMETRIC_VERSION_DEF(0, POST_TIME_ALWAYS, NMETRIC_CW_ID_RT_VERSION_BASE, NDS_ND_COUNTER_RUNTIME_VERSION, 0), // rt version @@ -178,6 +180,7 @@ static const int nmetric_count = sizeof(nmetric_defs) / sizeof(nmetric_def_t); // AND don't forget to increase the NMETRIC_..._COUNT in neuron_metrics.h #define NMETRIC_INSTANCE_ID_IDX 0 #define NMETRIC_DRIVER_VERS_IDX 1 +#define NMETRIC_PROFILE_ID_IDX 2 #define NMETRIC_FW_IO_ERR_IDX 17 struct nmetric_cw_metric { @@ -197,6 +200,7 @@ void nmetric_init_constants_metrics() int driver_ver_str_len; int instance_id_idx = nmetric_defs[NMETRIC_INSTANCE_ID_IDX].index; int driver_vers_idx = nmetric_defs[NMETRIC_DRIVER_VERS_IDX].index; + int profile_id_idx = nmetric_defs[NMETRIC_PROFILE_ID_IDX].index; loff_t offset = 0; // initiate buffer to 0 @@ -215,6 +219,9 @@ void nmetric_init_constants_metrics() driver_ver_str_len = strlen(driver_version); BUG_ON(driver_ver_str_len > NEURON_METRICS_VERSION_STRING_MAX_LEN); // check for buffer overflow memcpy(nmetric_constant_metrics[driver_vers_idx], driver_version, min(driver_ver_str_len, (int)NEURON_METRICS_VERSION_STRING_MAX_LEN)); + + // record performance profile + snprintf(nmetric_constant_metrics[profile_id_idx], NEURON_METRICS_VERSION_STRING_MAX_LEN + 1, "%d", 0); } /** @@ -578,27 +585,38 @@ static inline int nmetric_post_constant_u64(const nmetric_def_t *metric, struct return nmetric_post_u64(metric, metric_value, dest, available_size); } -static inline int 
nmetric_post_driver_metrics(const nmetric_def_t *metric, - u64 *curr_metrics, - u64 *prev_metrics, - u64 *freed_metrics, - struct nmetric_cw_metric *dest, - u64 *driver_metrics, - int available_size) +// TODO: This function is a quick workaround to post and reset the driver metrics: +// 1. it uses atomics to protect driver metrics from race conditions; +// 2. it resets the driver metric and its correspondingly intermediate metrics immediately after posting. +// A better long term solution is needed. +static inline int nmetric_post_and_reset_driver_metrics(const nmetric_def_t *driver_final_metric, + struct nmetric_cw_metric *dest, + struct nmetric_driver_metrics *driver_metrics, + int available_size) { - u64 metric_value = driver_metrics[metric->index]; - - if (metric->index == NMETRIC_DRIVER_METRICS_IDX_MAX_DEVICE_RESET_TIME_MS - || metric->index == NMETRIC_DRIVER_METRICS_IDX_MAX_TPB_RESET_TIME_MS - || metric->index == NMETRIC_DRIVER_METRICS_IDX_AVG_DEVICE_RESET_TIME_MS - || metric->index == NMETRIC_DRIVER_METRICS_IDX_AVG_TPB_RESET_TIME_MS) { - return nmetric_post_u64(metric, metric_value, dest, available_size); - } else if (metric->index == NMETRIC_DRIVER_METRICS_IDX_DEVICE_RESET_FAILURE_COUNT - || metric->index == NMETRIC_DRIVER_METRICS_IDX_TPB_RESET_FAILURE_COUNT){ - return nmetric_post_counter(curr_metrics, prev_metrics, freed_metrics, metric, dest, available_size); + int metric_index = driver_final_metric->index; + u64 metric_value = 0; + + if (metric_index < 0 || metric_index >= NMETRIC_FINAL_DRIVER_METRICS_COUNT) { + pr_err("invalid final driver metric with index %d\n", driver_final_metric->index); + return 0; } - return 0; + if (metric_index == NMETRIC_DRIVER_METRICS_IDX_AVG_DEVICE_RESET_TIME_MS) { + u64 total_time = atomic64_xchg(&driver_metrics->intermediate_metrics[NMETRIC_DRIVER_METRICS_IDX_TOTAL_DEVICE_RESET_TIME_MS], 0); + u64 total_count = atomic64_xchg(&driver_metrics->intermediate_metrics[NMETRIC_DRIVER_METRICS_IDX_TOTAL_DEVICE_RESET_COUNT], 0); + 
+ if (total_count != 0) + metric_value = total_time / total_count; + } else if (metric_index == NMETRIC_DRIVER_METRICS_IDX_AVG_TPB_RESET_TIME_MS) { + u64 total_time = atomic64_xchg(&driver_metrics->intermediate_metrics[NMETRIC_DRIVER_METRICS_IDX_TOTAL_TPB_RESET_TIME_MS], 0); + u64 total_count = atomic64_xchg(&driver_metrics->intermediate_metrics[NMETRIC_DRIVER_METRICS_IDX_TOTAL_TPB_RESET_COUNT], 0); + + if (total_count != 0) + metric_value = total_time / total_count; + } + + return nmetric_post_u64(driver_final_metric, metric_value, dest, available_size); } /** @@ -632,6 +650,9 @@ static void nmetric_post_metrics(struct neuron_device *nd, u64 *curr_metrics, u6 if (!nmetric_check_post_tick(tick, curr_metric)) continue; available_size = NEURON_METRICS_MAX_POSTING_BUF_SIZE - data_size; + if (available_size <= 0) { + pr_err_once("ran out of metrics posting space for tick %d on metric %d", tick, nmetric_index); + } dest = (struct nmetric_cw_metric *)&nd->metrics.posting_buffer[data_size]; switch(curr_metric->type) { case NMETRIC_TYPE_CONSTANT: @@ -652,7 +673,7 @@ static void nmetric_post_metrics(struct neuron_device *nd, u64 *curr_metrics, u6 data_size += nmetric_post_constant_u64(curr_metric, dest, const_u64_metrics, freed_const_u64_metrics, available_size); break; case NMETRIC_TYPE_DRIVER: - data_size += nmetric_post_driver_metrics(curr_metric, curr_metrics, prev_metrics, freed_metrics, dest, nd->metrics.driver_metrics, available_size); + data_size += nmetric_post_and_reset_driver_metrics(curr_metric, dest, &nd->metrics.driver_metrics, available_size); break; } } @@ -663,7 +684,7 @@ static void nmetric_post_metrics(struct neuron_device *nd, u64 *curr_metrics, u6 nmetric_mock_fw_io_post_metric(nd->metrics.posting_buffer, data_size); } if (data_size && (nmetric_log_posts & (1<<0))) { - int ret = fw_io_post_metric(nd->fw_io_ctx, nd->metrics.posting_buffer, data_size); + int ret = ndhal->ndhal_fw_io.fw_io_post_metric(nd->fw_io_ctx, nd->metrics.posting_buffer, data_size); 
if (ret < 0) pr_err("Metric posting failed with error code: %d\n", ret); } @@ -715,6 +736,11 @@ static void nmetric_cache_shared_bufs(struct neuron_device *nd, u64 *freed_metri freed_const_u64_metrics[curr_metric->index] = nd->metrics.ds_freed_const_u64_buf[curr_metric->index]; nd->metrics.ds_freed_const_u64_buf[curr_metric->index] = 0; break; + case NMETRIC_TYPE_CONSTANT: + if (curr_metric->cw_id == NMETRIC_CW_ID_PERFORMANCE_PROFILE_ID) { + snprintf(nmetric_constant_metrics[curr_metric->index], NEURON_METRICS_VERSION_STRING_MAX_LEN + 1, "%d", ndhal->ndhal_perf.current_performance_profile); + } + break; } } } @@ -753,9 +779,6 @@ static void nmetric_start_new_session(struct neuron_device *nd, u64 *curr_metric const_u64_metrics[curr_metric->index] = 0; freed_const_u64_metrics[curr_metric->index] = 0; break; - case NMETRIC_TYPE_DRIVER: - nd->metrics.driver_metrics[curr_metric->index] = 0; - break; } } @@ -776,6 +799,30 @@ static void nmetric_sample_high_freq(struct neuron_device *nd) npower_sample_utilization(nd); } +static void nmetric_aggregate_and_post_tick(struct neuron_device *nd, struct nmetric_versions *component_versions, u64 *curr_feature_bitmap, u64 *freed_feature_bitmap, u64 *const_u64_metrics, u64 *freed_const_u64_metrics, u8 tick) +{ + neuron_ds_acquire_lock(&nd->datastore); + nmetric_full_aggregate(nd, nd->metrics.neuron_aggregation.curr, + curr_feature_bitmap, const_u64_metrics, tick); + nmetric_cache_shared_bufs(nd, nd->metrics.neuron_aggregation.freed, + component_versions, freed_feature_bitmap, + freed_const_u64_metrics, tick); + neuron_ds_release_lock(&nd->datastore); + + nmetric_post_metrics(nd, nd->metrics.neuron_aggregation.curr, + nd->metrics.neuron_aggregation.prev, + nd->metrics.neuron_aggregation.freed, + component_versions, *curr_feature_bitmap, + *freed_feature_bitmap, const_u64_metrics, + freed_const_u64_metrics, tick); + nmetric_start_new_session(nd, nd->metrics.neuron_aggregation.curr, + nd->metrics.neuron_aggregation.prev, + 
nd->metrics.neuron_aggregation.freed, + curr_feature_bitmap, freed_feature_bitmap, + const_u64_metrics, freed_const_u64_metrics, + tick); // reset all current metrics for this tick +} + /** * nmetric_thread_fn() - periodically aggregates and posts metric at rate specified by module parameter * @@ -795,8 +842,8 @@ static int nmetric_thread_fn(void *arg) u64 post_delay_in_jiffies; u64 last_metric_post_time; u64 start_jiffies = jiffies; - u64 last_logged_slow_tick = 0; u64 current_slow_tick; + u8 tick_budget = 0; // how many ticks can be posted in a certain iteration of the loop // initialize all aggregation buffers memset(nd->metrics.neuron_aggregation.prev, 0, nmetric_counters_buf_size); @@ -816,11 +863,14 @@ static int nmetric_thread_fn(void *arg) sample_delay_in_jiffies, nmetric_metric_post_delay, HZ); // metrics are only sent once at rate specified by module param, new metric data may be saved without being immediately sent - while (!kthread_should_stop() && nd->metrics.neuron_aggregation.running) { - long wait_return; - wait_return = wait_event_interruptible_timeout(nd->metrics.neuron_aggregation.wait_queue, !nd->metrics.neuron_aggregation.running,sample_delay_in_jiffies); - - if (kthread_should_stop() || !nd->metrics.neuron_aggregation.running || (wait_return < 0)) { + while (!kthread_should_stop() && nd->metrics.neuron_aggregation.state != NMETRIC_STATE_STOPPED) { + long wait_return; + int flush_tick; + wait_return = wait_event_interruptible_timeout(nd->metrics.neuron_aggregation.wait_queue, + nd->metrics.neuron_aggregation.state == NMETRIC_STATE_STOPPED || nd->metrics.neuron_aggregation.state == NMETRIC_STATE_RESUMING, + sample_delay_in_jiffies); + + if (kthread_should_stop() || nd->metrics.neuron_aggregation.state == NMETRIC_STATE_STOPPED || (wait_return < 0)) { break; }; @@ -831,31 +881,29 @@ static int nmetric_thread_fn(void *arg) // We track this by keeping track of the number of intervals since this thread started // up so that we don't introduce drift 
due to the latency of other loop operations. current_slow_tick = (jiffies - start_jiffies)/post_delay_in_jiffies; - if (current_slow_tick != last_logged_slow_tick) { - last_logged_slow_tick = current_slow_tick; - - // aggregate and post metrics - neuron_ds_acquire_lock(&nd->datastore); - nmetric_full_aggregate(nd, nd->metrics.neuron_aggregation.curr, - &curr_feature_bitmap, const_u64_metrics, tick); - nmetric_cache_shared_bufs(nd, nd->metrics.neuron_aggregation.freed, - component_versions, &freed_feature_bitmap, - freed_const_u64_metrics, tick); - neuron_ds_release_lock(&nd->datastore); - - nmetric_post_metrics(nd, nd->metrics.neuron_aggregation.curr, - nd->metrics.neuron_aggregation.prev, - nd->metrics.neuron_aggregation.freed, - component_versions, curr_feature_bitmap, - freed_feature_bitmap, const_u64_metrics, - freed_const_u64_metrics, tick); - nmetric_start_new_session(nd, nd->metrics.neuron_aggregation.curr, - nd->metrics.neuron_aggregation.prev, - nd->metrics.neuron_aggregation.freed, - &curr_feature_bitmap, &freed_feature_bitmap, - const_u64_metrics, freed_const_u64_metrics, - tick); // reset all current metrics for this tick - tick = (tick + 1) % POST_TICK_COUNT; + + // periodic metrics posting on a timer + if (nd->metrics.neuron_aggregation.state == NMETRIC_STATE_PAUSED) { + // skip metrics post when paused via metrics_ctrl ioctl + continue; + } else if (current_slow_tick != nd->metrics.neuron_aggregation.last_logged_slow_tick) { + // post up to tick_budget ticks of metrics + // in the normal case, this will post one tick per iteration of the loop + // if the was paused then resumed, post up to the number of ticks that would have been posted if it had not been paused + + if (nd->metrics.neuron_aggregation.state == NMETRIC_STATE_RESUMING) { + nd->metrics.neuron_aggregation.state = NMETRIC_STATE_RUNNING; + tick_budget = current_slow_tick - nd->metrics.neuron_aggregation.last_logged_slow_tick; + tick_budget = (tick_budget > POST_TICK_COUNT) ? 
POST_TICK_COUNT : tick_budget; + } else { + tick_budget = 1; + } + + for (flush_tick = 0; flush_tick < tick_budget; flush_tick++) { + nmetric_aggregate_and_post_tick(nd, component_versions, &curr_feature_bitmap, &freed_feature_bitmap, const_u64_metrics, freed_const_u64_metrics, tick); + tick = (tick + 1) % POST_TICK_COUNT; + } + nd->metrics.neuron_aggregation.last_logged_slow_tick = current_slow_tick; } } @@ -865,7 +913,7 @@ static int nmetric_thread_fn(void *arg) static int nmetric_create_thread(struct neuron_device *nd) { init_waitqueue_head(&nd->metrics.neuron_aggregation.wait_queue); - nd->metrics.neuron_aggregation.running = true; + nd->metrics.neuron_aggregation.state = NMETRIC_STATE_RUNNING; nd->metrics.neuron_aggregation.thread = kthread_run(nmetric_thread_fn, nd, "nd%d metrics", nd->device_index); if (IS_ERR_OR_NULL(nd->metrics.neuron_aggregation.thread)) { pr_err("nd%d metrics aggregation thread creation failed\n", nd->device_index); @@ -878,15 +926,34 @@ void nmetric_stop_thread(struct neuron_device *nd) { if (nd->metrics.neuron_aggregation.thread == NULL) return; - nd->metrics.neuron_aggregation.running = false; + nd->metrics.neuron_aggregation.state = NMETRIC_STATE_STOPPED; wake_up(&nd->metrics.neuron_aggregation.wait_queue); kthread_stop(nd->metrics.neuron_aggregation.thread); //blocks till the thread exits nd->metrics.neuron_aggregation.thread = NULL; } +// if periodic posting from metrics thread is paused, need to request an explicit flush +void nmetric_set_mode(struct neuron_device *nd, enum neuron_metrics_mode mode) +{ + switch (mode) { + case NEURON_METRICS_MODE_PERIODIC_ENABLE: + nd->metrics.neuron_aggregation.state = NMETRIC_STATE_RESUMING; + break; + case NEURON_METRICS_MODE_PERIODIC_DISABLE: + nd->metrics.neuron_aggregation.state = NMETRIC_STATE_PAUSED; + break; + } +} + void nmetric_init_driver_metrics(struct neuron_device *nd) { - memset(nd->metrics.driver_metrics, 0, NMETRIC_DRIVER_METRICS_COUNT * sizeof(u64)); + int i; + + for (i = 0; i < 
NMETRIC_FINAL_DRIVER_METRICS_COUNT; i++) + atomic64_set(&nd->metrics.driver_metrics.final_metrics[i], 0); + + for (i = 0; i < NMETRIC_INTERMEDIATE_DRIVER_METRICS_COUNT; i++) + atomic64_set(&nd->metrics.driver_metrics.intermediate_metrics[i], 0); } int nmetric_init(struct neuron_device *nd) @@ -895,7 +962,8 @@ int nmetric_init(struct neuron_device *nd) memset(nd->metrics.ds_freed_metrics_buf, 0, nmetric_counters_buf_size); memset(nd->metrics.ds_freed_const_u64_buf, 0, NMETRIC_CONSTANT_U64_COUNT * sizeof(u64)); - npower_init_stats(nd); + nd->metrics.neuron_aggregation.state = NMETRIC_STATE_STOPPED; + nd->metrics.neuron_aggregation.last_logged_slow_tick = 0; // initiate metric aggregator thread ret = nmetric_create_thread(nd); @@ -903,37 +971,55 @@ int nmetric_init(struct neuron_device *nd) return ret; } -void nmetric_set_reset_time_metrics(struct neuron_device *nd, uint64_t cur_reset_time_ms, bool is_device_reset) { +void nmetric_set_reset_time_metrics(struct neuron_device *nd, s64 cur_reset_time_ms, bool is_device_reset) { + struct nmetric_driver_metrics *driver_metrics = &nd->metrics.driver_metrics; + atomic64_t *max_time_metric; + atomic64_t *total_time_metric; + atomic64_t *total_count_metric; + s64 max_time; + int max_time_index; + int total_time_index; + int total_count_index; + if (cur_reset_time_ms <= 0) { return; } if (is_device_reset) { - if (nd->metrics.driver_metrics[NMETRIC_DRIVER_METRICS_IDX_MAX_DEVICE_RESET_TIME_MS] < cur_reset_time_ms) { - nd->metrics.driver_metrics[NMETRIC_DRIVER_METRICS_IDX_MAX_DEVICE_RESET_TIME_MS] = cur_reset_time_ms; - } - nd->metrics.driver_metrics[NMETRIC_DRIVER_METRICS_IDX_TOTAL_DEVICE_RESET_TIME_MS] += cur_reset_time_ms; - nd->metrics.driver_metrics[NMETRIC_DRIVER_METRICS_IDX_TOTAL_DEVICE_RESET_COUNT]++; - nd->metrics.driver_metrics[NMETRIC_DRIVER_METRICS_IDX_AVG_DEVICE_RESET_TIME_MS] = - nd->metrics.driver_metrics[NMETRIC_DRIVER_METRICS_IDX_TOTAL_DEVICE_RESET_TIME_MS] / - 
nd->metrics.driver_metrics[NMETRIC_DRIVER_METRICS_IDX_TOTAL_DEVICE_RESET_COUNT]; + max_time_index = NMETRIC_DRIVER_METRICS_IDX_MAX_DEVICE_RESET_TIME_MS; + total_time_index = NMETRIC_DRIVER_METRICS_IDX_TOTAL_DEVICE_RESET_TIME_MS; + total_count_index = NMETRIC_DRIVER_METRICS_IDX_TOTAL_DEVICE_RESET_COUNT; } else { - if (nd->metrics.driver_metrics[NMETRIC_DRIVER_METRICS_IDX_MAX_TPB_RESET_TIME_MS] < cur_reset_time_ms) { - nd->metrics.driver_metrics[NMETRIC_DRIVER_METRICS_IDX_MAX_TPB_RESET_TIME_MS] = cur_reset_time_ms; - } - nd->metrics.driver_metrics[NMETRIC_DRIVER_METRICS_IDX_TOTAL_TPB_RESET_TIME_MS] += cur_reset_time_ms; - nd->metrics.driver_metrics[NMETRIC_DRIVER_METRICS_IDX_TOTAL_TPB_RESET_COUNT]++; - nd->metrics.driver_metrics[NMETRIC_DRIVER_METRICS_IDX_AVG_TPB_RESET_TIME_MS] = - nd->metrics.driver_metrics[NMETRIC_DRIVER_METRICS_IDX_TOTAL_TPB_RESET_TIME_MS] / - nd->metrics.driver_metrics[NMETRIC_DRIVER_METRICS_IDX_TOTAL_TPB_RESET_COUNT]; + max_time_index = NMETRIC_DRIVER_METRICS_IDX_MAX_TPB_RESET_TIME_MS; + total_time_index = NMETRIC_DRIVER_METRICS_IDX_TOTAL_TPB_RESET_TIME_MS; + total_count_index = NMETRIC_DRIVER_METRICS_IDX_TOTAL_TPB_RESET_COUNT; } + + max_time_metric = &driver_metrics->final_metrics[max_time_index]; + total_time_metric = &driver_metrics->intermediate_metrics[total_time_index]; + total_count_metric = &driver_metrics->intermediate_metrics[total_count_index]; + + max_time = atomic64_read(max_time_metric); + + while (max_time < cur_reset_time_ms && + !atomic64_try_cmpxchg(max_time_metric, &max_time, cur_reset_time_ms)); + + atomic64_add(cur_reset_time_ms, total_time_metric); + atomic64_inc(total_count_metric); } void nmetric_increment_reset_failure_count(struct neuron_device *nd, bool is_device_reset) { + struct nmetric_driver_metrics *driver_metrics = &nd->metrics.driver_metrics; + if (is_device_reset) { - nd->metrics.driver_metrics[NMETRIC_DRIVER_METRICS_IDX_DEVICE_RESET_FAILURE_COUNT]++; + 
atomic64_inc(&driver_metrics->final_metrics[NMETRIC_DRIVER_METRICS_IDX_DEVICE_RESET_FAILURE_COUNT]); } else { - nd->metrics.driver_metrics[NMETRIC_DRIVER_METRICS_IDX_TPB_RESET_FAILURE_COUNT]++; + atomic64_inc(&driver_metrics->final_metrics[NMETRIC_DRIVER_METRICS_IDX_TPB_RESET_FAILURE_COUNT]); } } + +void nmetric_set_performance_profile(struct neuron_device *nd, int profile) +{ + snprintf(nmetric_constant_metrics[NMETRIC_PROFILE_ID_IDX], NEURON_METRICS_VERSION_STRING_MAX_LEN + 1, "%d", ndhal->ndhal_perf.current_performance_profile); +} diff --git a/neuron_metrics.h b/neuron_metrics.h index f79f2d9..a0df6e4 100644 --- a/neuron_metrics.h +++ b/neuron_metrics.h @@ -2,7 +2,10 @@ /* * Copyright 2021, Amazon.com, Inc. or its affiliates. All Rights Reserved */ +#include + #include "neuron_ds.h" +#include "share/neuron_driver_shared.h" #ifndef _NEURON_METRICS_H #define _NEURON_METRICS_H @@ -30,23 +33,36 @@ #define NMETRIC_CONST_U64_FLAG_SKIP_ZERO (0x1ull << 0) #define NMETRIC_CONST_U64_FLAG_PREFER_FREED (0x1ull << 1) -enum driver_metrics_idx { - NMETRIC_DRIVER_METRICS_IDX_MAX_DEVICE_RESET_TIME_MS = 0, - NMETRIC_DRIVER_METRICS_IDX_MAX_TPB_RESET_TIME_MS = 1, +// The final driver metrics to be posted to CR. +// They are not stored in datastore +enum driver_final_metrics_idx { + NMETRIC_DRIVER_METRICS_IDX_MAX_DEVICE_RESET_TIME_MS = 0, + NMETRIC_DRIVER_METRICS_IDX_MAX_TPB_RESET_TIME_MS = 1, + + NMETRIC_DRIVER_METRICS_IDX_AVG_DEVICE_RESET_TIME_MS = 2, + NMETRIC_DRIVER_METRICS_IDX_AVG_TPB_RESET_TIME_MS = 3, - NMETRIC_DRIVER_METRICS_IDX_AVG_DEVICE_RESET_TIME_MS = 2, - NMETRIC_DRIVER_METRICS_IDX_AVG_TPB_RESET_TIME_MS = 3, + NMETRIC_DRIVER_METRICS_IDX_DEVICE_RESET_FAILURE_COUNT = 4, + NMETRIC_DRIVER_METRICS_IDX_TPB_RESET_FAILURE_COUNT = 5, - // Intermediate metrics. do not post to CW directly. 
- NMETRIC_DRIVER_METRICS_IDX_TOTAL_DEVICE_RESET_TIME_MS = 4, - NMETRIC_DRIVER_METRICS_IDX_TOTAL_TPB_RESET_TIME_MS = 5, - NMETRIC_DRIVER_METRICS_IDX_TOTAL_DEVICE_RESET_COUNT = 6, - NMETRIC_DRIVER_METRICS_IDX_TOTAL_TPB_RESET_COUNT = 7, + NMETRIC_FINAL_DRIVER_METRICS_COUNT = 6, +}; - NMETRIC_DRIVER_METRICS_IDX_DEVICE_RESET_FAILURE_COUNT = 8, - NMETRIC_DRIVER_METRICS_IDX_TPB_RESET_FAILURE_COUNT = 9, +// The intermediate driver metrics are not posted to CW. +// They are used to calculate the final driver metrics above. +// They are not stored in datastore +enum driver_intermediate_metrics_idx { + NMETRIC_DRIVER_METRICS_IDX_TOTAL_DEVICE_RESET_TIME_MS = 0, + NMETRIC_DRIVER_METRICS_IDX_TOTAL_TPB_RESET_TIME_MS = 1, + NMETRIC_DRIVER_METRICS_IDX_TOTAL_DEVICE_RESET_COUNT = 2, + NMETRIC_DRIVER_METRICS_IDX_TOTAL_TPB_RESET_COUNT = 3, + + NMETRIC_INTERMEDIATE_DRIVER_METRICS_COUNT = 4, +}; - NMETRIC_DRIVER_METRICS_IDX_COUNT = 10, +struct nmetric_driver_metrics { + atomic64_t final_metrics[NMETRIC_FINAL_DRIVER_METRICS_COUNT]; // final driver metrics to be posted to CR. Not in datastore + atomic64_t intermediate_metrics[NMETRIC_INTERMEDIATE_DRIVER_METRICS_COUNT]; // intermediate driver metrics, and not posted to CW. 
Not in datastore }; // Sadly, the 3 #defines below need to be updated when adding new metrics to nmetric_defs @@ -54,7 +70,7 @@ enum driver_metrics_idx { #define NMETRIC_VERSION_COUNT 3 // Number of metrics of type NMETRIC_TYPE_CONSTANT -#define NMETRIC_CONSTANTS_COUNT 2 +#define NMETRIC_CONSTANTS_COUNT 3 // Number of metrics of type NMETRIC_TYPE_COUNTER + the special case (type NMETRIC_TYPE_FW_IO_ERR) #define NMETRIC_COUNTER_COUNT 29 @@ -65,9 +81,6 @@ enum driver_metrics_idx { // Number of metrics of type NMETRIC_CONSTANT_U64 #define NMETRIC_CONSTANT_U64_COUNT 1 -// Number of metrics of type NMETRIC_DRIVER -#define NMETRIC_DRIVER_METRICS_COUNT NMETRIC_DRIVER_METRICS_IDX_COUNT - typedef struct { u8 index; // metric specific index u8 type; // metric type @@ -93,10 +106,18 @@ struct nmetric_versions { u64 version_metrics[NEURON_METRICS_VERSION_MAX_CAPACITY]; }; +enum nmetric_state { + NMETRIC_STATE_STOPPED = 0, // thread not active/signaled to exit the loop + NMETRIC_STATE_RUNNING = 1, // thread is active and periodically posting metrics + NMETRIC_STATE_PAUSED = 2, // thread is active, but periodic posting is skipped + NMETRIC_STATE_RESUMING = 3, // immediately wakes thread and transitions to NMETRIC_STATE_RUNNING +}; + struct nmetric_aggregation_thread { struct task_struct *thread; // aggregation thread that sends metrics every ~5 minutes wait_queue_head_t wait_queue; - volatile bool running; // if cleared, thread would exit the loop + volatile enum nmetric_state state; + u64 last_logged_slow_tick; // when the last metric request was posted u64 curr[NMETRIC_COUNTER_COUNT]; // metrics for the current session so far u64 prev[NMETRIC_COUNTER_COUNT]; // recorded metrics from the last post u64 freed[NMETRIC_COUNTER_COUNT]; // cache holding metrics that were freed before the posting period was reached @@ -109,7 +130,7 @@ struct neuron_metrics { u64 ds_freed_const_u64_buf[NMETRIC_CONSTANT_U64_COUNT]; // stores unsent constant u64 values about to be freed from datastore 
struct nmetric_aggregation_thread neuron_aggregation; // aggregation thread that periodically aggregates and posts metrics u8 posting_buffer[NEURON_METRICS_MAX_POSTING_BUF_SIZE + 1]; - u64 driver_metrics[NMETRIC_DRIVER_METRICS_COUNT]; // stores driver internal metrics that is not in datastore + struct nmetric_driver_metrics driver_metrics; // driver metrics. not in datastore }; /** @@ -138,7 +159,7 @@ void nmetric_partial_aggregate(struct neuron_device *nd, struct neuron_datastore void nmetric_stop_thread(struct neuron_device *nd); /** - * nmetric_init_driver_metrics() - Initializes the driver metrics to 0 + * nmetric_init_driver_metrics() - Initializes the driver metrics lock and values to 0 * * @param nd - the neuron device */ @@ -158,7 +179,7 @@ int nmetric_init(struct neuron_device *nd); * @param cur_reset_time_ms: the current TPB or device reset time in milliseconds. * @param is_device_reset: whether it is TPB or device reset. */ -void nmetric_set_reset_time_metrics(struct neuron_device *nd, uint64_t cur_reset_time_ms, bool is_device_reset); +void nmetric_set_reset_time_metrics(struct neuron_device *nd, s64 cur_reset_time_ms, bool is_device_reset); /** * nmetric_increment_reset_failure_count() - Increment the reset failure count by 1 for a device or TPB reset failure. @@ -168,4 +189,19 @@ void nmetric_set_reset_time_metrics(struct neuron_device *nd, uint64_t cur_reset */ void nmetric_increment_reset_failure_count(struct neuron_device *nd, bool is_device_reset); +/** + * nmetric_set_performance_profile() - Set the current performance profile value for metrics posting. + * + * @param nd: neuron device + * @param profile: performance profile value + */ +void nmetric_set_performance_profile(struct neuron_device *nd, int profile); + +/** + * nmetric_set_mode() - Enable or disable periodic posting of metrics. 
+ * + * @param nd: neuron device + * @param mode: whether to change or maintain current behavior + */ +void nmetric_set_mode(struct neuron_device *nd, enum neuron_metrics_mode mode); #endif diff --git a/neuron_module.c b/neuron_module.c index e865d32..e6eb69a 100644 --- a/neuron_module.c +++ b/neuron_module.c @@ -18,13 +18,13 @@ #include "neuron_cdev.h" #include "neuron_pci.h" -MODULE_DESCRIPTION("Neuron Driver, built from SHA: bab563e32c62d9dd615a42079e5bbd8e1a6327b1"); +MODULE_DESCRIPTION("Neuron Driver, built from SHA: 5ebb67d2e5be7052dcf1774cff03c69ab40d21ee"); MODULE_LICENSE("GPL"); -MODULE_VERSION("2.24.7.0"); +MODULE_VERSION("2.25.4.0"); MODULE_ALIAS("pci:v00001d0fd00007064sv*sd*bc*sc*i*"); -const char driver_version[] = "2.24.7.0"; -const char driver_revision[] = "bab563e32c62d9dd615a42079e5bbd8e1a6327b1"; +const char driver_version[] = "2.25.4.0"; +const char driver_revision[] = "5ebb67d2e5be7052dcf1774cff03c69ab40d21ee"; #ifdef CONFIG_FAULT_INJECTION diff --git a/neuron_pci.c b/neuron_pci.c index 3233932..dbb1b14 100644 --- a/neuron_pci.c +++ b/neuron_pci.c @@ -19,7 +19,6 @@ #include "neuron_ds.h" #include "neuron_reg_access.h" #include "neuron_metrics.h" -#include "v1/fw_io.h" #include "neuron_dma.h" #include "neuron_dhal.h" #include "neuron_nq.h" @@ -29,13 +28,11 @@ static struct pci_device_id neuron_pci_dev_ids[] = { - { PCI_DEVICE(AMZN_VENDOR_ID, INF1_DEVICE_ID0) }, - { PCI_DEVICE(AMZN_VENDOR_ID, INF1_DEVICE_ID1) }, - { PCI_DEVICE(AMZN_VENDOR_ID, INF1_DEVICE_ID2) }, - { PCI_DEVICE(AMZN_VENDOR_ID, INF1_DEVICE_ID3) }, { PCI_DEVICE(AMZN_VENDOR_ID, TRN1_DEVICE_ID0) }, { PCI_DEVICE(AMZN_VENDOR_ID, INF2_DEVICE_ID0) }, { PCI_DEVICE(AMZN_VENDOR_ID, TRN2_DEVICE_ID0) }, + { PCI_DEVICE(AMZN_VENDOR_ID, TRN3_DEVICE_ID0) }, + { PCI_DEVICE(AMZN_VENDOR_ID, TRN3_DEVICE_ID1) }, { 0, }, @@ -101,7 +98,7 @@ static int neuron_pci_device_init(struct neuron_device *nd) // Initialize the mc handle map ret = nmch_handle_init(nd); - if (ret) + if (ret) goto fail_mch; // 
Initialize the device mpset @@ -166,7 +163,7 @@ static int neuron_pci_device_close(struct neuron_device *nd) static void neuron_pci_set_device_architecture(struct neuron_device *nd) { unsigned short device = nd->pdev->device; - enum neuron_arch arch; + enum neuron_arch arch = NEURON_ARCH_INVALID; u8 revision; pci_read_config_byte(nd->pdev, PCI_REVISION_ID, &revision); @@ -178,8 +175,12 @@ static void neuron_pci_set_device_architecture(struct neuron_device *nd) case TRN2_DEVICE_ID0: arch = NEURON_ARCH_V3; break; + case TRN3_DEVICE_ID0: + case TRN3_DEVICE_ID1: + arch = NEURON_ARCH_V4; + break; default: - arch = NEURON_ARCH_V1; + return; } narch_init(arch, revision); } @@ -196,7 +197,7 @@ static int neuron_pci_probe(struct pci_dev *dev, const struct pci_device_id *id) } nmetric_init_driver_metrics(nd); - + if (neuron_log_init(nd)) { pci_warn(dev, "Warning: Can't allocate memory for neuron log\n"); } @@ -256,7 +257,7 @@ static int neuron_pci_probe(struct pci_dev *dev, const struct pci_device_id *id) nd->device_index = atomic_fetch_add(1, &device_count); #else nd->device_index = atomic_add_return(1, &device_count) - 1; -#endif +#endif nd->fw_io_ctx = fw_io_setup(nd->npdev.bar0, nd->npdev.bar0_size, nd->npdev.bar2, nd->npdev.bar2_size); if (nd->fw_io_ctx == NULL) { diff --git a/neuron_power.c b/neuron_power.c index 81c3072..47f2d25 100644 --- a/neuron_power.c +++ b/neuron_power.c @@ -49,6 +49,14 @@ bool npower_enabled_in_fw(struct neuron_device *nd) int ret = 0; u32 api_version_num = 0; + extern unsigned int nmetric_log_posts; + + // If metric log posting is disabled, don't do power reporting - we're likely on + // bringup hardware or simulation + if (!nmetric_log_posts) { + return false; + } + // Just read the API version from firmware. We could try to be smart here and cache // this, but we need to protect ourselves from rollbacks in the Pacific version or // other changes. Plus, this is just a simple MMIO read, so it's cheap. 
diff --git a/neuron_reset.c b/neuron_reset.c index 7bef5d4..2c908e3 100644 --- a/neuron_reset.c +++ b/neuron_reset.c @@ -379,7 +379,6 @@ int nr_initiate_reset_via_fw(struct neuron_device *nd, uint32_t nc_map, uint32_t uint32_t reset_retry_interval; ktime_t start_time; ktime_t next_reset_retry_time; - uint32_t initial_poll_delay; ktime_t cur_time; s64 reset_time; @@ -394,14 +393,6 @@ int nr_initiate_reset_via_fw(struct neuron_device *nd, uint32_t nc_map, uint32_t fw_io_initiate_reset(nd->npdev.bar0, is_device_reset, tpb_reset_map); next_reset_retry_time = ktime_add_ms(start_time, reset_retry_interval); - /* V1 only. Sleep extra time before polling */ - initial_poll_delay = (nc_map == NEURON_NC_MAP_DEVICE ? - ndhal->ndhal_reset.reset_device_initial_poll_delay : - ndhal->ndhal_reset.reset_tpb_initial_poll_delay); - if (nr_msleep_stoppable(nd, initial_poll_delay)) { - return -1; - } - do { /* * After reset initiation, firmware becomes unresponsive until @@ -416,7 +407,7 @@ int nr_initiate_reset_via_fw(struct neuron_device *nd, uint32_t nc_map, uint32_t // Reset is done. Record the time to metrics. reset_time = ktime_ms_delta(ktime_get(), start_time); if (reset_time > 0) { - nmetric_set_reset_time_metrics(nd, (uint64_t)reset_time, is_device_reset); + nmetric_set_reset_time_metrics(nd, reset_time, is_device_reset); } else { return -1; } diff --git a/neuron_ring.c b/neuron_ring.c index 0483d9e..165e0c9 100644 --- a/neuron_ring.c +++ b/neuron_ring.c @@ -26,6 +26,9 @@ int dev_nc_map = 1; module_param(dev_nc_map, int, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP); MODULE_PARM_DESC(dev_nc_map, "Map of active neuron cores"); +// forward +static void ndmar_h2t_ring_free(struct ndma_ring *ring); + struct ndma_eng *ndmar_acquire_engine(struct neuron_device *nd, u32 eng_id) { if (eng_id >= NUM_DMA_ENG_PER_DEVICE) @@ -34,6 +37,14 @@ struct ndma_eng *ndmar_acquire_engine(struct neuron_device *nd, u32 eng_id) return &nd->ndma_engine[eng_id]; } +// acquire dma engine w/o locking. 
Use for scanning. +static struct ndma_eng *ndmar_acquire_engine_nl(struct neuron_device *nd, u32 eng_id) +{ + if (eng_id >= NUM_DMA_ENG_PER_DEVICE) + return NULL; + return &nd->ndma_engine[eng_id]; +} + void ndmar_release_engine(struct ndma_eng *eng) { mutex_unlock(&eng->nd->ndma_engine[eng->eng_id].lock); @@ -143,10 +154,18 @@ int ndmar_queue_init(struct neuron_device *nd, u32 eng_id, u32 qid, u32 tx_desc_ queue = ndmar_get_queue(eng, qid); ring = ndmar_get_ring(queue); + // check if the ring has been allocated to h2t and fail + if (ndmar_h2t_ring_is_h2t(ring) && (tx_mc != nd->ndma_q_dummy_mc)) { + pr_err("nd%02d: DMA ring allocation collision with h2t dma on eng: %d, queue: %d", nd->device_index, eng_id, qid); + ret = -EALREADY; + goto done; + } + queue->eng_id = eng_id; queue->qid = qid; queue->owner = task_tgid_nr(current); ring->qid = qid; + ring->h2t_completion_mc = NULL; trace_dma_queue_init(nd, eng_id, qid, tx_desc_count, rx_desc_count, tx_mc, rx_mc, rxc_mc, port); @@ -193,15 +212,39 @@ void ndmar_handle_process_exit(struct neuron_device *nd, pid_t pid) const int desc_count = NDMA_QUEUE_DUMMY_RING_DESC_COUNT; for (eng_id = 0; eng_id < ndhal->ndhal_address_map.dma_eng_per_nd; eng_id++) { for (qid = 0; qid < DMA_MAX_Q_MAX; qid++) { - if (nd->ndma_engine[eng_id].queues[qid].owner != pid) { + struct ndma_eng *eng = ndmar_acquire_engine_nl(nd, eng_id); + struct ndma_queue *queue; + struct ndma_ring *ring; + + if (eng == NULL) { + // continue; } + queue = ndmar_get_queue(eng, qid); + ring = ndmar_get_ring(queue); - // h2t rings are maintained by the driver so dont reset. + if (queue->owner != pid) { + continue; + } + + // default h2t rings are maintained by the driver so dont reset. // there cant be any outstanding DMA transaction in h2t since it is a // synchronous system call(which will block till finished when a process crashes). - if (ndhal->ndhal_ndmar.ndmar_is_h2t_q(nd, eng_id, qid)) + // TODO: async h2t will need to get cleaned up here. 
+ + if (ndhal->ndhal_ndmar.ndmar_is_h2t_def_q(nd, eng_id, qid)) { + pr_err_once("nd%02d: unexpected pid associated with default h2t ring", nd->device_index); continue; + } + + // h2t rings owned by driver are freed up on different path + if (ndmar_h2t_ring_is_h2t(ring)) { + pr_err_once("h2t ring should not be bound to process"); + continue; + } + + ndmar_h2t_ring_state_clr(ring); + queue->owner = 0; // rings owned by the nx should not be reset by us // ok since they should never be interacting with host mem @@ -280,30 +323,33 @@ int ndmar_queue_copy_start(struct neuron_device *nd, u32 eng_id, u32 qid, u32 tx int ndmar_queue_release(struct neuron_device *nd, u32 eng_id, u32 qid) { trace_dma_queue_release(nd, eng_id, qid); - // inf1 does not need any special handling return 0; } -static int ndmar_h2t_ring_alloc(struct neuron_device *nd, int nc_id) +static int ndmar_h2t_ring_alloc(struct neuron_device *nd, int nc_id, int qid) { int ret = 0; struct mem_chunk *rx_mc = NULL, *tx_mc = NULL, *h2t_completion_mc = NULL; + struct ndma_queue *queue; + struct ndma_ring *ring; const int eng_id = ndhal->ndhal_ndmar.ndmar_get_h2t_eng_id(nd, nc_id); const int ndesc = DMA_H2T_DESC_COUNT; const u32 ring_size = ndmar_ring_get_desc_count(ndesc) * sizeof(union udma_desc); - const int qid = ndhal->ndhal_ndmar.ndmar_get_h2t_qid(nc_id); struct ndma_eng *eng = ndmar_acquire_engine(nd, eng_id); if (eng == NULL) return -EINVAL; + queue = ndmar_get_queue(eng, qid); + ring = ndmar_get_ring(queue); + eng->used_for_h2t = true; - struct ndma_queue *queue = &eng->queues[qid]; - queue->qid = qid; queue->eng_id = eng_id; - struct ndma_ring *ring = &queue->ring_info; + queue->qid = qid; + queue->owner = 0; ring->qid = qid; + ring->h2t_nc_id = nc_id; ring->size = ring_size; ring->has_compl = false; @@ -332,13 +378,18 @@ static int ndmar_h2t_ring_alloc(struct neuron_device *nd, int nc_id) ring->h2t_completion.ptr = h2t_completion_mc->va; ring->h2t_completion.addr = virt_to_phys(ring->h2t_completion.ptr) 
| ndhal->ndhal_address_map.pci_host_base; - mutex_init(&eng->h2t_ring_lock); + mutex_init(&ring->h2t_ring_lock); ndmar_release_engine(eng); return 0; error: + ring->h2t_nc_id = -1; + ring->tx_mc = NULL; + ring->rx_mc = NULL; + ring->h2t_completion_mc = NULL; + ndmar_release_engine(eng); if (rx_mc) @@ -347,7 +398,7 @@ static int ndmar_h2t_ring_alloc(struct neuron_device *nd, int nc_id) mc_free(&tx_mc); if (h2t_completion_mc) mc_free(&h2t_completion_mc); - + return ret; } @@ -359,13 +410,134 @@ int ndmar_h2t_ring_init(struct ndma_eng *eng, int qid) int ndesc = DMA_H2T_DESC_COUNT; u32 alloced_desc = ndmar_ring_get_desc_count(ndesc); - queue = &eng->queues[qid]; - ring = &queue->ring_info; + queue = ndmar_get_queue(eng, qid); + ring = ndmar_get_ring(queue); ret = udma_m2m_init_queue(&eng->udma, qid, eng->eng_id, alloced_desc, alloced_desc, true, &ring->tx, &ring->rx, NULL); return ret; } +static bool ndmar_h2t_ring_claim(struct neuron_device *nd, u32 eng_id, struct ndma_ring *ring) +{ + bool claimed = false; + struct ndma_eng *eng = ndmar_acquire_engine(nd, eng_id); + if (eng == NULL) + return false; + if (!ring->h2t_allocated) { + ring->h2t_nc_id = -1; + ring->h2t_allocated = true; + claimed = true; + } + ndmar_release_engine(eng); + return claimed; +} + +/* ndmar_h2t_ring_request() + * + * Ask the driver to dynamically allocate an h2t ring. + * + * caveats: + * - Other runtime allocated rings are not tracked by the driver, so h2t rings have to be requested + * prior to any rings allocated for a model. So basically we only track h2t ring allocations in the + * driver. 
+ */ +int ndmar_h2t_ring_request(struct neuron_device *nd, int nc_id, bool h2t, int *rqid) +{ + int ret = -1; + const int eng_id = ndhal->ndhal_ndmar.ndmar_get_h2t_eng_id(nd, nc_id); + struct ndma_eng *eng; + struct ndma_queue *queue; + struct ndma_ring *ring; + int qid; + + eng = ndmar_acquire_engine_nl(nd, eng_id); + if (eng == NULL) + return -EINVAL; + + for (qid = 0; qid < DMA_MAX_Q_MAX; qid++) { + if (ndhal->ndhal_ndmar.ndmar_is_h2t_def_q(nd, eng_id, qid)) + continue; + queue = ndmar_get_queue(eng, qid); + ring = ndmar_get_ring(queue); + + // ring is unallocated, we can use it + if (ndmar_h2t_ring_claim(nd, eng_id, ring)) { + // For historical reasons, there are two ways we keep track of and manage queue ownership. DMAs that + // are managed by the driver on behalf of a process are tagged by nc_id that is used by the process, + // DMAs managed elsewhere (such as DMAs used NX refill) are tagged with PID. The tagging determines + // where the DMA resources are cleaned up. + if (h2t) { + // queue is used for driver memcopy + ret = ndmar_h2t_ring_alloc(nd, nc_id, qid); + if (ret) { + pr_err("nd%d:nc%d H2T ring allocation for qid:%d failed - %d\n", nd->device_index, nc_id, qid, ret); + ring->h2t_allocated = false; + goto done; + } + ret = ndmar_h2t_ring_init(eng, qid); + if (ret) { + ndmar_h2t_ring_free(ring); + pr_err("nd%d:nc%d H2T ring init for qid:%d failed - %d\n", nd->device_index, nc_id, qid, ret); + ring->h2t_allocated = false; + goto done; + } + } else { + // queue is use by HW or the RT directly + queue->owner = task_tgid_nr(current); + ring->h2t_nc_id = nc_id; + ret = 0; + } + *rqid = qid; + break; + } + } + +done: + return ret; +} + +int ndmar_h2t_ring_release(struct neuron_device *nd, int nc_id, int qid) +{ + int ret = 0; + const int eng_id = ndhal->ndhal_ndmar.ndmar_get_h2t_eng_id(nd, nc_id); + struct ndma_eng *eng; + struct ndma_queue *queue; + struct ndma_ring *ring; + + if (qid >= DMA_MAX_Q_MAX) { + return -EINVAL; + } + + if 
(ndhal->ndhal_ndmar.ndmar_is_h2t_def_q(nd, eng_id, qid)) { + return 0; + } + + eng = ndmar_acquire_engine(nd, eng_id); + if (eng == NULL) { + return -EINVAL; + } + + queue = ndmar_get_queue(eng, qid); + ring = ndmar_get_ring(queue); + + if (!ndmar_h2t_ring_is_allocated(ring) || (ring->h2t_nc_id != nc_id)) { + pr_err("nd%02d: attempting to release ring %d on nc: %d that is not allocated as h2t ring", nd->device_index, qid, nc_id); + ret = -ENXIO; + goto done; + } + + if (ndmar_h2t_ring_is_h2t(ring)) { + ndmar_h2t_ring_free(ring); + } else { + ndmar_h2t_ring_state_clr(ring); + queue->owner = 0; + } + +done: + ndmar_release_engine(eng); + return ret; +} + int ndmar_eng_set_state(struct neuron_device *nd, int eng_id, u32 state) { struct ndma_eng *eng; @@ -532,7 +704,9 @@ static int ndmar_init_nc(struct neuron_device *nd, int nc_idx, bool init_h2t_eng } } - ret = ndmar_h2t_ring_alloc(nd, nc_idx); + const int qid = ndhal->ndhal_ndmar.ndmar_get_h2t_def_qid(nc_idx); + + ret = ndmar_h2t_ring_alloc(nd, nc_idx, qid); if (ret) { pr_err("nd%d:nc%d H2T ring allocation failed - %d\n", nd->device_index, nc_idx, ret); return ret; @@ -541,7 +715,7 @@ static int ndmar_init_nc(struct neuron_device *nd, int nc_idx, bool init_h2t_eng struct ndma_eng *eng = ndmar_acquire_engine(nd, eng_id); if (eng == NULL) return -EINVAL; - const int qid = ndhal->ndhal_ndmar.ndmar_get_h2t_qid(nc_idx); + ret = ndmar_h2t_ring_init(eng, qid); ndmar_release_engine(eng); if (ret) { @@ -579,22 +753,61 @@ int ndmar_init(struct neuron_device *nd) return ndmar_init_ncs(nd, -1); } -static void ndmar_h2t_ring_free(struct neuron_device *nd, int nc_idx, int eng_id) +static void ndmar_h2t_ring_free(struct ndma_ring *ring) { - const int qid = ndhal->ndhal_ndmar.ndmar_get_h2t_qid(nc_idx); - struct ndma_eng *eng = ndmar_acquire_engine(nd, eng_id); - BUG_ON(eng == NULL); - struct ndma_queue *queue = &eng->queues[qid]; - struct ndma_ring *ring = &queue->ring_info; - - if (ring->tx_mc) + if (ring->tx_mc) { 
mc_free(&ring->tx_mc); + ring->tx_mc = NULL; + } - if (ring->rx_mc) + if (ring->rx_mc) { mc_free(&ring->rx_mc); + ring->rx_mc = NULL; + } - if (ring->rxc_mc) + if (ring->rxc_mc) { mc_free(&ring->rxc_mc); + ring->rxc_mc = NULL; + } + + if (ring->h2t_completion_mc) { + mc_free(&ring->h2t_completion_mc); + ring->h2t_completion_mc = NULL; + } + + ndmar_h2t_ring_state_clr(ring); +} + +/* ndmar_h2t_ring_free_all() + * + */ +static void ndmar_h2t_ring_free_all(struct neuron_device *nd, int nc_idx) +{ + struct ndma_eng *eng; + struct ndma_queue *queue; + struct ndma_ring *ring; + const int eng_id = ndhal->ndhal_ndmar.ndmar_get_h2t_eng_id(nd, nc_idx); + int qid; + + eng = ndmar_acquire_engine(nd, eng_id); + if (eng == NULL) { + pr_err("nd%02d: fatal error unable to acquire engine %d", nd->device_index, eng_id); + return; + } + + for (qid = 0; qid < DMA_MAX_Q_MAX; qid++) { + queue = ndmar_get_queue(eng, qid); + ring = ndmar_get_ring(queue); + if (ndmar_h2t_ring_is_allocated(ring) && ring->h2t_nc_id == nc_idx) { + if (ndmar_h2t_ring_is_h2t(ring)) { + // h2t queue free all resources + ndmar_h2t_ring_free(ring); + } else { + // service queue only clear state + ndmar_h2t_ring_state_clr(ring); + } + } + } ndmar_release_engine(eng); } @@ -604,8 +817,7 @@ static void ndmar_close_nc(struct neuron_device *nd, int nc_idx) if (!nd->dmar_init_done[nc_idx]) { return; } - const int eng_id = ndhal->ndhal_ndmar.ndmar_get_h2t_eng_id(nd, nc_idx); - ndmar_h2t_ring_free(nd, nc_idx, eng_id); + ndmar_h2t_ring_free_all(nd, nc_idx); nd->dmar_init_done[nc_idx] = false; } diff --git a/neuron_ring.h b/neuron_ring.h index 726f83a..f031be7 100644 --- a/neuron_ring.h +++ b/neuron_ring.h @@ -44,18 +44,22 @@ struct ndma_h2t_dma_context { }; struct ndma_ring { + // TODO combine all the h2t stuff in a sub structure + struct mutex h2t_ring_lock; + struct udma_ring_ptr h2t_completion; // TODO why are we using udma_ring_ptr... 
+ struct mem_chunk *h2t_completion_mc; + struct ndma_h2t_dma_context h2t_dma_ctx[NEURON_DMA_H2T_CTX_HANDLE_CNT]; + u32 h2t_nc_id; + bool h2t_allocated; // ring can be allocated for standard use or h2t u32 qid; u32 size; //total size - num desc * desc size bool has_compl; struct udma_ring_ptr tx; struct udma_ring_ptr rx; struct udma_ring_ptr rxc; - struct udma_ring_ptr h2t_completion; struct mem_chunk *tx_mc; struct mem_chunk *rx_mc; struct mem_chunk *rxc_mc; - struct mem_chunk *h2t_completion_mc; - struct ndma_h2t_dma_context h2t_dma_ctx[NEURON_DMA_H2T_CTX_HANDLE_CNT]; u32 dram_channel; }; @@ -73,7 +77,6 @@ struct ndma_eng { struct ndma_queue queues[DMA_MAX_Q_MAX]; struct udma udma; bool used_for_h2t; - struct mutex h2t_ring_lock; }; /** @@ -287,4 +290,60 @@ int ndmar_h2t_ring_init(struct ndma_eng *eng, int qid); u32 ndmar_ring_get_desc_count(u32 v); +/** + * ndmar_h2t_ring_request() - request a h2t ring + * + * @nd: Neuron device which contains the DMA engine + * @nc_id: neuron core id + * @h2t: initialize ring to be used for h2t traffic + * @rqid: returned id of the queue + * + */ +int ndmar_h2t_ring_request(struct neuron_device *nd, int nc_id, bool h2t, int *rqid); + +/** + * ndmar_h2t_ring_release() + * + * @nd: Neuron device which contains the DMA engine + * @nc_id: neuron core id + * @qid: id the h2t queue to release + * + */ +int ndmar_h2t_ring_release(struct neuron_device *nd, int nc_id, int qid); + +/** + * ndmar_h2t_ring_is_h2t() - return true if this is an h2t ring + */ +static inline bool ndmar_h2t_ring_is_h2t(struct ndma_ring *ring) +{ + return (ring->h2t_completion_mc != NULL); +} + +/** + * ndmar_h2t_ring_is_owner - return true if this h2t ring is owned by nc_id + * + */ +static inline bool ndmar_h2t_ring_is_owner(struct ndma_ring *ring, int nc_id) +{ + return (nc_id == ring->h2t_nc_id) && ndmar_h2t_ring_is_h2t(ring); +} + +static inline bool ndmar_h2t_ring_is_allocated(struct ndma_ring *ring) +{ + return ring->h2t_allocated; +} + +static inline void 
ndmar_h2t_ring_state_clr(struct ndma_ring *ring) +{ + ring->h2t_nc_id = -1; + ring->h2t_allocated = false; +} + +/** + * ndmar_qid_valid() - return true if a queue is valid + */ +static inline bool ndmar_qid_valid(int qid) +{ + return ((qid >= 0) && (qid < DMA_MAX_Q_MAX)); +} #endif diff --git a/neuron_topsp.c b/neuron_topsp.c index 339c610..7638fbf 100644 --- a/neuron_topsp.c +++ b/neuron_topsp.c @@ -3,7 +3,7 @@ * Copyright 2021, Amazon.com, Inc. or its affiliates. All Rights Reserved */ -/** Each neuron device has N number of TOP_SPs. (inf1 does not have it). +/** Each neuron device has N number of TOP_SPs. TOP_SPs are only on V2 and after. * * Engine: * ------- diff --git a/share/neuron_driver_shared.h b/share/neuron_driver_shared.h index ecb987e..b1f716c 100644 --- a/share/neuron_driver_shared.h +++ b/share/neuron_driver_shared.h @@ -6,6 +6,7 @@ #define NEURON_DRIVER_SHARED_H #include +#include "neuron_driver_shared_tensor_batch_op.h" enum neuron_driver_feature_flag { NEURON_DRIVER_FEATURE_DMABUF = 1ull << 0, @@ -16,6 +17,7 @@ enum neuron_driver_feature_flag { NEURON_DRIVER_FEATURE_HBM_SCRUB = 1ull << 5, NEURON_DRIVER_FEATURE_MEM_ALLOC64 = 1ull << 6, NEURON_DRIVER_FEATURE_CONTIGUOUS_SCRATCHPAD = 1ull << 7, + NEURON_DRIVER_FEATURE_ZEROCOPY = 1ull << 8, }; // FIXME this should be more generic - like node type. 
@@ -48,6 +50,11 @@ enum neuron_ultraserver_mode { NEURON_ULTRASERVER_MODE_X1 = 4, // 1 node US configuration (standalone) }; +enum neuron_metrics_mode { + NEURON_METRICS_MODE_PERIODIC_ENABLE = 0, // enable periodic posting + NEURON_METRICS_MODE_PERIODIC_DISABLE = 1, // disable periodic posting +}; + #define NEURON_NC_MAP_DEVICE (0xffffffff) enum neuron_dma_queue_type { @@ -90,6 +97,10 @@ enum neuron_dma_h2t_ctx_handle_type { NEURON_DMA_H2T_CTX_HANDLE_CNT = 3 // number of dma }; +/* + * H2T DMA Default Queue id + */ +#define NEURON_DMA_H2T_DEFAULT_QID (-1) /* * NOTE: In runtime version 5, this enum was passed in as a bool instead - @@ -188,6 +199,17 @@ struct neuron_ioctl_nc_map { struct neuron_ioctl_nc_map_entry mappings[]; }; +/* A batch of copy operations */ +typedef struct neuron_memcpy_batch { + __u64 mem_handle; // [in] Source or Destination memory handle from/to data needs to be copied. + __u64 mem_handle_offset; // [in] Memory offset of the memory handle + const nrt_tensor_batch_op_t *ops_ptr; // [in] Pointer to array of operations + __u32 num_ops; // [in] Number of neuron_memcpy_op operations. + __u16 bar4_wr_threshold; // [in] Threshold below which we will use bar4 direct write vs. DMA. Subject to driver limits. + __u16 flags; // [in] TBD. + void *context; // [in] TBD. opaque context pointer passed back in completion queue +} neuron_memcpy_batch_t; + /* * Memory allocation categories for sysfs counters */ diff --git a/share/neuron_driver_shared_tensor_batch_op.h b/share/neuron_driver_shared_tensor_batch_op.h new file mode 100644 index 0000000..59de68e --- /dev/null +++ b/share/neuron_driver_shared_tensor_batch_op.h @@ -0,0 +1,24 @@ +/* + * Shared tensor batch operation between runtime and driver. 
+ */ + +#ifndef NEURON_DRIVER_SHARED_TENSOR_BATCH_OP_H +#define NEURON_DRIVER_SHARED_TENSOR_BATCH_OP_H + +#ifdef __KERNEL__ +#include +typedef __u64 nrt_tensor_batch_offset_t; +typedef __u64 nrt_tensor_batch_size_t; +#else +#include +typedef uint64_t nrt_tensor_batch_offset_t; +typedef uint64_t nrt_tensor_batch_size_t; +#endif + +typedef struct nrt_tensor_batch_op { + nrt_tensor_batch_offset_t offset; + nrt_tensor_batch_size_t size; + void *buffer; +} nrt_tensor_batch_op_t; + +#endif // NEURON_DRIVER_SHARED_TENSOR_BATCH_OP_H diff --git a/udma/udma.h b/udma/udma.h index a14c8ee..ce76549 100644 --- a/udma/udma.h +++ b/udma/udma.h @@ -66,7 +66,8 @@ union udma_desc { #define M2S_DESC_LEN_SHIFT 0 #define M2S_DESC_LEN_MASK (0xffff << M2S_DESC_LEN_SHIFT) /* Data length */ -#define S2M_DESC_INT_EN BIT(28) /* Enable Interrupt on completion */ +#define S2M_DESC_INT_EN BIT(28) /* Enable Interrupt on completion */ +#define S2M_DESC_STRONG_ORDER_WR BIT(29) /* Enable strong order write on this descriptor, V3+ only */ #define S2M_DESC_RING_ID_SHIFT 24 #define S2M_DESC_RING_ID_MASK (0x3 << S2M_DESC_RING_ID_SHIFT) /* Ring ID bits in s2m */ #define S2M_DESC_RING_SHIFT UDMA_S2M_Q_RDRBP_LOW_ADDR_SHIFT @@ -195,7 +196,8 @@ struct udma { enum { UDMA_M2M_BARRIER_NONE = 0, UDMA_M2M_BARRIER_DMB = 1, - UDMA_M2M_BARRIER_WRITE_BARRIER = 2 + UDMA_M2M_BARRIER_WRITE_BARRIER = 2, + UDMA_M2M_BARRIER_SOW = 3 }; /** diff --git a/udma/udma_m2m.c b/udma/udma_m2m.c index 8d0812f..0b0a4eb 100644 --- a/udma/udma_m2m.c +++ b/udma/udma_m2m.c @@ -126,22 +126,18 @@ static int udma_set_max_descs_and_prefetch(struct udma *udma, u8 max_descs) (1 << UDMA_M2S_RD_DESC_PREF_CFG_3_MIN_BURST_BELOW_THR_SHIFT); reg_write32(&udma->udma_regs_m2s->m2s_rd.desc_pref_cfg_3, value); - // likely harmless, but just in case, keep the old V1 behavior where - // we did not change default for s2m. 
V1 support is on the way out, - // once it's deprecated just remove this comment and the "if" - if (narch_get_arch() != NEURON_ARCH_V1) { - value = (pref_thr << UDMA_S2M_RD_DESC_PREF_CFG_3_PREF_THR_SHIFT) | - (min_burst_above_thr << UDMA_S2M_RD_DESC_PREF_CFG_3_MIN_BURST_ABOVE_THR_SHIFT) | - (1 << UDMA_S2M_RD_DESC_PREF_CFG_3_MIN_BURST_BELOW_THR_SHIFT); - reg_write32(&udma->udma_regs_s2m->s2m_rd.desc_pref_cfg_3, value); - // configure max_burst for both m2s and s2m - value = (max_burst << UDMA_AXI_M2S_DESC_RD_CFG_3_MAX_AXI_BEATS_SHIFT) | - (always_break_on_max_boundary << UDMA_AXI_M2S_DESC_RD_CFG_3_ALWAYS_BREAK_ON_MAX_BOUNDARY_SHIFT); - reg_write32(&udma->udma_regs_m2s->axi_m2s.desc_rd_cfg_3, value); - value = (max_burst << UDMA_AXI_S2M_DESC_RD_CFG_3_MAX_AXI_BEATS_SHIFT) | - (always_break_on_max_boundary << UDMA_AXI_S2M_DESC_RD_CFG_3_ALWAYS_BREAK_ON_MAX_BOUNDARY_SHIFT); - reg_write32(&udma->udma_regs_s2m->axi_s2m.desc_rd_cfg_3, value); - } + value = (pref_thr << UDMA_S2M_RD_DESC_PREF_CFG_3_PREF_THR_SHIFT) | + (min_burst_above_thr << UDMA_S2M_RD_DESC_PREF_CFG_3_MIN_BURST_ABOVE_THR_SHIFT) | + (1 << UDMA_S2M_RD_DESC_PREF_CFG_3_MIN_BURST_BELOW_THR_SHIFT); + reg_write32(&udma->udma_regs_s2m->s2m_rd.desc_pref_cfg_3, value); + // configure max_burst for both m2s and s2m + value = (max_burst << UDMA_AXI_M2S_DESC_RD_CFG_3_MAX_AXI_BEATS_SHIFT) | + (always_break_on_max_boundary << UDMA_AXI_M2S_DESC_RD_CFG_3_ALWAYS_BREAK_ON_MAX_BOUNDARY_SHIFT); + reg_write32(&udma->udma_regs_m2s->axi_m2s.desc_rd_cfg_3, value); + value = (max_burst << UDMA_AXI_S2M_DESC_RD_CFG_3_MAX_AXI_BEATS_SHIFT) | + (always_break_on_max_boundary << UDMA_AXI_S2M_DESC_RD_CFG_3_ALWAYS_BREAK_ON_MAX_BOUNDARY_SHIFT); + reg_write32(&udma->udma_regs_s2m->axi_s2m.desc_rd_cfg_3, value); + return 0; } @@ -341,6 +337,9 @@ static int udma_m2m_build_descriptor(union udma_desc *rx_desc_ptr, union udma_de case UDMA_M2M_BARRIER_WRITE_BARRIER: sdma_m2s_set_write_barrier(&meta_ctrl); break; + case UDMA_M2M_BARRIER_SOW: + 
rx_flags |= S2M_DESC_STRONG_ORDER_WR; + break; case UDMA_M2M_BARRIER_NONE: break; default: @@ -355,7 +354,7 @@ static int udma_m2m_build_descriptor(union udma_desc *rx_desc_ptr, union udma_de /* if rx should generate an interrupt make it so */ if (unlikely(set_dst_int)) - rx_flags = S2M_DESC_INT_EN; + rx_flags |= S2M_DESC_INT_EN; return udma_m2m_build_rx_descriptor(rx_desc_ptr, rx_ring_id, d_addr, size, rx_flags); } diff --git a/udma/udma_main.c b/udma/udma_main.c index f13bbb9..710da33 100644 --- a/udma/udma_main.c +++ b/udma/udma_main.c @@ -29,12 +29,13 @@ struct udma_m2s_pkt_len_conf { bool encode_64k_as_zero; }; +#define UDMA_M2S_Q_RATE_LIMIT_MASK_INTERNAL_PAUSE_DMB (1 << 2) /* dma_q flags */ #define UDMA_Q_FLAGS_NO_COMP_UPDATE BIT(1) -/* M2S packet len configuration, configure maximum DMA packets size, i.e. - * the max size of the sum of all descriptors in a packet. Configure +/* M2S packet len configuration, configure maximum DMA packets size, i.e. + * the max size of the sum of all descriptors in a packet. 
Configure * whether len=0 encodes len=64k */ static int udma_m2s_packet_size_cfg_set(struct udma *udma, struct udma_m2s_pkt_len_conf *conf) @@ -60,6 +61,8 @@ static int udma_m2s_packet_size_cfg_set(struct udma *udma, struct udma_m2s_pkt_l return 0; } +#define UDMA_AXI_M2S_DATA_RD_CFG_ALWAYS_BREAK_ON_MAX_BOUDRY (1 << 16) + /* set default configuration of one DMA engine */ static int udma_set_defaults(struct udma *udma) { @@ -117,7 +120,8 @@ static int udma_set_defaults(struct udma *udma) reg_write32(&gen_ex_regs->vmpr_v4[i].tx_sel, 0xffffffff); /* Set M2S data read master configuration */ - ndhal->ndhal_udma.udma_m2s_data_rd_cfg_boundaries_set(udma); + reg_write32(&udma->udma_regs_m2s->axi_m2s.data_rd_cfg, + UDMA_AXI_M2S_DATA_RD_CFG_ALWAYS_BREAK_ON_MAX_BOUDRY | 0x8); /* Ack time out */ reg_write32(&udma->udma_regs_s2m->s2m_comp.cfg_application_ack, 0); @@ -142,19 +146,26 @@ static int udma_set_defaults(struct udma *udma) (0x40 << UDMA_AXI_S2M_OSTAND_CFG_WR_MAX_COMP_DATA_WR_SHIFT); reg_write32(&udma->udma_regs_s2m->axi_s2m.ostand_cfg_wr, value); - // Enable the completion ring head reporting by disabling bit0 - struct udma_gen_regs_v4 __iomem *gen_regs = udma->gen_regs; - if (ndhal->arch == NEURON_ARCH_V1) { - // Keep completion disabled for V1 - // V1 requires this fix to avoid race-condition when resetting the NC instruction buffers - value = 0x1ul; - } else { - ret = reg_read32(&gen_regs->spare_reg.zeroes0, &value); + /* Use ostand_cfg_wr_2 to program max outstanding data writes in v4 to 256+ values + * ostand_cfg_wr is not effective in v4 by default. 
+ */ + if (ndhal->ndhal_arch.arch == NEURON_ARCH_V4) { + ret = reg_read32(&udma->udma_regs_s2m->axi_s2m.ostand_cfg_wr_2, &value); if (ret) { return ret; } - value &= (~0x1ul); - } + value = ((value & (~UDMA_AXI_S2M_OSTAND_CFG_WR_2_MAX_DATA_WR_OSTAND_MASK)) | + (256 << UDMA_AXI_S2M_OSTAND_CFG_WR_2_MAX_DATA_WR_OSTAND_SHIFT)); + reg_write32(&udma->udma_regs_s2m->axi_s2m.ostand_cfg_wr_2, value); + } + + // Enable the completion ring head reporting by disabling bit0 + struct udma_gen_regs_v4 __iomem *gen_regs = udma->gen_regs; + ret = reg_read32(&gen_regs->spare_reg.zeroes0, &value); + if (ret) { + return ret; + } + value &= (~0x1ul); reg_write32(&gen_regs->spare_reg.zeroes0, value); return 0; @@ -422,7 +433,6 @@ static int udma_q_reset(struct udma_q *udma_q) return 0; } - /** Initializes the udma queue data structure. */ static void udma_q_init_internal(struct udma *udma, u32 qid, struct udma_q_params *q_params) @@ -458,7 +468,17 @@ static void udma_q_init_internal(struct udma *udma, u32 qid, struct udma_q_param udma_q->udma = udma; udma_q->qid = qid; - ndhal->ndhal_udma.udma_q_config(udma_q); + if (udma_q->type == UDMA_TX) { + uint32_t *reg_addr; + uint32_t val; + + reg_addr = &udma_q->q_regs->m2s_q.rlimit.mask; + val = udma_q->rlimit_mask; + // enable DMB + val &= ~UDMA_M2S_Q_RATE_LIMIT_MASK_INTERNAL_PAUSE_DMB; + reg_write32(reg_addr, val); + } + /* clear all queue ptrs */ udma_q_reset(udma_q); diff --git a/v2/neuron_dhal_v2.c b/v2/neuron_dhal_v2.c index 61d40ff..465c811 100644 --- a/v2/neuron_dhal_v2.c +++ b/v2/neuron_dhal_v2.c @@ -464,7 +464,6 @@ static void mpset_set_dram_and_mpset_info_v2(struct mempool_set *mpset, u64 *dev /** * mpset_block_carveout_regions() * - in v2, block carve out regions: Upper 16 MB is used internally by firmware - * - in v1, do nothing and just return 0 * * @param nd: neuron device * @param mpset: pointer to mpset @@ -531,25 +530,25 @@ static uint32_t ndmar_get_h2t_eng_id_v2(struct neuron_device *nd, uint32_t nc_id } /** - * 
ndmar_get_h2t_qid() - return the H2T engine's queue id for this core + * ndmar_get_h2t_def_qid() - return the H2T engine's default queue id for this core * * @param nc_id: Neuron core corresponding to H2T engine * Return DMA queue id */ -static int ndmar_get_h2t_qid_v2(uint32_t nc_id) +static int ndmar_get_h2t_def_qid_v2(uint32_t nc_id) { return 0; } /** - * ndmar_is_h2t_q() - return true + * ndmar_is_h2t_def_q() - return true * * @param nd: Neuron device which contains the DMA engine * @param eng_id: engine id * @param q_id: queue id - * Return true if this is an h2t queue + * Return true if this is a default h2t queue */ -static bool ndmar_is_h2t_q_v2(struct neuron_device *nd, uint32_t eng_id, uint32_t q_id) +static bool ndmar_is_h2t_def_q_v2(struct neuron_device *nd, uint32_t eng_id, uint32_t q_id) { return (nd->ndma_engine[eng_id].used_for_h2t && (q_id == 0)); } @@ -699,7 +698,7 @@ const int *inf2_24xl_neighbor_ids[6] = { */ static int fw_io_topology_v2(struct fw_io_ctx *ctx, int pdev_index, int device_id, u32 *connected_device_ids, int *count) { - // V2 does not have the device support to detect east/west/south/north neighbors like V1, + // V2 does not have the device support to detect east/west/south/north neighbors, // so its topology is hardcoded based on instance type. 
*count = 0; @@ -771,6 +770,39 @@ static int fw_io_read_csr_array_v2(void **ptrs, u32 *values, u32 num_csrs, bool } +/** + * fw_io_execute_request() - Execute commands + * + * @param ctx: FWIO context + * @param command_id: command ID + * @param req: Request data + * @param req_size: Request size + * @param resp: Response buffer + * @param resp_size: Response buffer size + * + * @return int: 0 on success, -1 on failure + */ +static int fw_io_execute_request_v2(struct fw_io_ctx *ctx, u8 command_id, const u8 *req, u32 req_size, u8 *resp, u32 resp_size) +{ + return fw_io_execute_request(ctx, command_id, req, req_size, resp, resp_size); +} + + +/** + * fw_io_post_metric() - Post metrics to CW + * + * @param ctx: FWIO context + * @param data: Data to write + * @param size: Size of data + * + * @return int: 0 on success, -1 on failure + */ +static int fw_io_post_metric_v2(struct fw_io_ctx *ctx, u8 *data, u32 size) +{ + return fw_io_post_metric(ctx, data, size); +} + + /* Register Access (read and write) Functions */ /** * reg_read32_array() - read an array of 32bit registers. @@ -841,7 +873,7 @@ static int root_info_node_attrs_info_tbl_cnt_v2 = sizeof(root_info_node_attrs_in * @param attr_info_tbl: the ecc attributes as an array * @return int 0 on success; otherwise on failure * - * Note: ecc errors are only supported by sysfs for V2. TODO: V1 support will be added + * Note: ecc errors are only supported by sysfs for V2. 
*/ static int nsysfsmetric_add_ecc_nodes_v2(struct nsysfsmetric_metrics *metrics, struct nsysfsmetric_node *stats_node, @@ -1064,8 +1096,6 @@ static u32 neuron_pci_routing_id_to_user_id(u32 routing_id) * @param dev: PCI device * @param nd: neuron device * @return int: 0 on success, otherwise on failure - * - * for V1, this function is dummy */ static int neuron_pci_get_device_id_v2(struct neuron_device *nd, struct pci_dev *dev) { @@ -1093,7 +1123,7 @@ static int neuron_pci_get_device_id_v2(struct neuron_device *nd, struct pci_dev return -ENODEV; } - // TODO - TRN1 and INF2 mappings are different - likely all of this and the INF1 should be encapsulated. + // TODO - TRN1 and INF2 mappings are different if (nd->pdev->device == TRN1_DEVICE_ID0) nd->device_index = neuron_pci_routing_id_to_user_id(routing_id); else @@ -1153,7 +1183,7 @@ neuron_pci_device_id_to_rid_map_v2(uint32_t * count, uint32_t * did_to_rid_map) * * - Version 3 of runtime requires 1) aligned memory allocation support 2) SPROT. * - Version 4 of the runtime requires support for DMA queue init w/o already allocated rings (2.7). - * - Version 5 of the runtime requires V2 device renumbering (don't care for V1). + * - Version 5 of the runtime requires V2 device renumbering (don't care before V2). * - Version 6 of the runtime requires ham notification support, * + new V2 reset api for single-tpb reset + new notification init API with force mem realloc/resize. * - Version 7 of the runtime requires udma queue size support for non power of 2 rings + dmabuf support. 
@@ -1172,7 +1202,7 @@ static void ncdev_compatible_version_v2(struct neuron_ioctl_compatible_version * } /** - * ncdev_quiesce_exec_on_proc_exit() - for V1, before resetting DMA, allow current NeuronCore execution to finish and settle + * ncdev_quiesce_exec_on_proc_exit() * * Note: * When a process is killed, the driver resets DMA but there is no @@ -1196,110 +1226,18 @@ static void ncdev_quiesce_exec_on_proc_exit_v2(void) return; } -/** - * ncdev_bar_write_data() - write data to bar - * - * @param nd: neuron device - * @param bar: the BAR to write to - * @param reg_addresses - * @param data: the data to be written into the bar - * @param data_count: the number of data to be written - * @return 0 on success, otherwise failure - * - * V1: - * For BAR0 the addresses are passed as array(random access). - * For BAR2 a single address is provided and driver does sequential writes. - * V2: - * Only BAR0 is used right now. TODO: change runtime ioctl -*/ -static int ncdev_bar_write_data_v2(struct neuron_device *nd, u8 bar, u64 *reg_addresses, u32 *data, u32 data_count) -{ - if (bar == 0) { - int i; - for (i = 0; i < data_count; i++) { - u64 off = reg_addresses[i] - (u64)nd->npdev.bar0; - if (off > nd->npdev.bar0_size) { - return -EINVAL; - } - if (ndhal->ndhal_ndma.ndma_is_bar0_write_blocked(off)) { - return -EINVAL; - } - writel(data[i], nd->npdev.bar0 + off); - trace_bar_write(nd, bar, off, data[i]); - } - } else if (bar == 4) { - // TODO: we don't have any use case for r/w memory over the BAR right now. Disabling. - // - // We'd like to use DMA for r/w of BAR4 because we might expect access to large amounts of data. - // Access via DMA requires an application to own a TPB because it determines which of the h2t DMAs - // are safe to use, otherwise a TPB along with its DMA could be reset while that DMA is used here. - // Don't want/need to solve it now. 
- return -EINVAL; - - /* - dma_addr_t dst_addr = reg_addresses[0] - (u64)nd->npdev.bar0; - - ret = ndma_memcpy(nd, 0, virt_to_phys(data) | ndhal->ndhal_address_map.pci_host_base, dst_addr, data_size); - if (ret) - return ret; - */ - } else { - pr_err("direct BAR%d write is not supported.\n", bar); - return -EINVAL; - } - - return 0; -} - static void ncdev_get_default_tpbs_for_hbm_v2(u32 hbm_index, u32 tpbs[MAX_NC_PER_DEVICE], u32 *tpb_count) { tpbs[0] = hbm_index; *tpb_count = 1; } -/* UDMA Functions */ -#define UDMA_AXI_M2S_DATA_RD_CFG_ALWAYS_BREAK_ON_MAX_BOUDRY (1 << 16) -/** - * udma_m2s_data_rd_cfg_boundaries_set(): set data_rd_cfg to break at 256B boundaries - * - * @param udma: the UDMA structure - * - * for V1, this function is dummy - */ -static void udma_m2s_data_rd_cfg_boundaries_set_v2(struct udma *udma) -{ - reg_write32(&udma->udma_regs_m2s->axi_m2s.data_rd_cfg, - UDMA_AXI_M2S_DATA_RD_CFG_ALWAYS_BREAK_ON_MAX_BOUDRY | 0x8); -} - -#define UDMA_M2S_Q_RATE_LIMIT_MASK_INTERNAL_PAUSE_DMB (1 << 2) -/** - * udma_q_config() - set misc queue configurations - * - * @param udma_q udma_q: the queue data structure - * - * for V1, this function is dummy - */ -static void udma_q_config_v2(struct udma_q *udma_q) -{ - if (udma_q->type != UDMA_TX) { - return; - } - - uint32_t *reg_addr = &udma_q->q_regs->m2s_q.rlimit.mask; - uint32_t val = udma_q->rlimit_mask; - - // enable DMB - val &= ~UDMA_M2S_Q_RATE_LIMIT_MASK_INTERNAL_PAUSE_DMB; - reg_write32(reg_addr, val); -} - /* NDMA Functions */ /** * ndma_get_wait_for_completion_time() - calculate the first and the following wait times for a DMA tranfer completion * - * One full descriptor takes ~4 usec to transfer (64K at 16G/sec) on V2 and ~16 usec to transfer on V1. + * One full descriptor takes ~4 usec to transfer (64K at 16G/sec) on V2. * The last descriptor may be partial, so wait 1/4 64K transfer time for that descriptor. * Also, count includes the completion descriptor so don't include that in the count. 
* @@ -1332,14 +1270,12 @@ static void ndma_get_wait_for_completion_time_v2_emu(u32 count, bool async, u64 /** * ndma_validate_pa() - check the validity of the desc physical addresses - * V1: * west side: PCIEX4_1_BASE: 0x00c00000000000 host: PCIEX8_0_BASE: 0x00400000000000 * If west side is set then even host bit is set. When mc_alloc is called we set only the host bit * and insert into tree.. If some one sets the west side on that PA, then there is no way to check that, * since there could be a tdram address that could have the west side set * (that will look as though host is also set) - * V2: - * similar idea. Just check for valid address allocated in host memory + * Just check for valid address allocated in host memory * * @param nd: the neuron device * @param pa: the desc physical addresses @@ -1594,6 +1530,12 @@ static int npe_pod_ctrl_v2(struct neuron_device *nd, u32 pod_ctrl, enum neuron_u return 0; } +static int perf_set_profile_v2(struct neuron_device *nd, uint32_t profile) +{ + // NOP implementation for v2 + return 0; +} + /** * npe_class_node_id_show_data() - return sysfs class node_id * @@ -1708,8 +1650,6 @@ int ndhal_register_funcs_v2(void) { ndhal->ndhal_address_map.dma_eng_per_nc = V2_DMA_ENG_PER_NC; ndhal->ndhal_address_map.dram_channels = V2_MAX_DRAM_CHANNELS; ndhal->ndhal_reset.reset_poll_interval = V2_NR_RESET_POLL_INTERVAL; - ndhal->ndhal_reset.reset_device_initial_poll_delay = 0; - ndhal->ndhal_reset.reset_tpb_initial_poll_delay = 0; ndhal->ndhal_reset.initiate_max_wait_time = V2_NR_RESET_INIT_MAX_TOTAL_WAIT_TIME_MS; ndhal->ndhal_reset.retry_count = NR_RESET_RETRY_COUNT; ndhal->ndhal_reset.nr_post_reset_config = nr_post_reset_config_v2; @@ -1726,8 +1666,8 @@ int ndhal_register_funcs_v2(void) { ndhal->ndhal_mpset.mpset_set_dram_and_mpset_info = mpset_set_dram_and_mpset_info_v2; ndhal->ndhal_mpset.mpset_block_carveout_regions = mpset_block_carveout_regions_v2; ndhal->ndhal_ndmar.ndmar_get_h2t_eng_id = ndmar_get_h2t_eng_id_v2; - 
ndhal->ndhal_ndmar.ndmar_get_h2t_qid = ndmar_get_h2t_qid_v2; - ndhal->ndhal_ndmar.ndmar_is_h2t_q = ndmar_is_h2t_q_v2; + ndhal->ndhal_ndmar.ndmar_get_h2t_def_qid = ndmar_get_h2t_def_qid_v2; + ndhal->ndhal_ndmar.ndmar_is_h2t_def_q = ndmar_is_h2t_def_q_v2; ndhal->ndhal_ndmar.nr_init_h2t_eng = nr_init_h2t_eng_v2; ndhal->ndhal_ndmar.ndmar_is_nx_ring = ndmar_is_nx_ring_v2; ndhal->ndhal_ndmar.ndmar_quiesce_queues = ndmar_quiesce_queues_v2; @@ -1735,6 +1675,8 @@ int ndhal_register_funcs_v2(void) { ndhal->ndhal_fw_io.fw_io_topology = fw_io_topology_v2; ndhal->ndhal_fw_io.fw_io_register_readless_read_region = fw_io_register_readless_read_region_v2; ndhal->ndhal_fw_io.fw_io_read_csr_array = fw_io_read_csr_array_v2; + ndhal->ndhal_fw_io.fw_io_execute_request = fw_io_execute_request_v2; + ndhal->ndhal_fw_io.fw_io_post_metric = fw_io_post_metric_v2; ndhal->ndhal_mmap.dm_mmap_special = dm_mmap_special_v2; ndhal->ndhal_mmap.mmap_get_bar4_offset = mmap_get_bar4_offset_v2; ndhal->ndhal_sysfs_metrics.root_info_node_attrs_info_tbl_cnt = root_info_node_attrs_info_tbl_cnt_v2; @@ -1753,12 +1695,9 @@ int ndhal_register_funcs_v2(void) { ndhal->ndhal_cdev.ncdev_bar0_write_blocked_addrs = ncdev_bar0_write_blocked_addrs_v2; ndhal->ndhal_cdev.ncdev_compatible_version = ncdev_compatible_version_v2; ndhal->ndhal_cdev.ncdev_quiesce_exec_on_proc_exit = ncdev_quiesce_exec_on_proc_exit_v2; - ndhal->ndhal_cdev.ncdev_bar_write_data = ncdev_bar_write_data_v2; ndhal->ndhal_cdev.ncdev_logical_to_physical_nc_map = NULL; ndhal->ndhal_cdev.ncdev_get_default_tpbs_for_hbm = ncdev_get_default_tpbs_for_hbm_v2; ndhal->ndhal_udma.num_beats = 1024; // >= UDMA_REV_ID_4 - ndhal->ndhal_udma.udma_m2s_data_rd_cfg_boundaries_set = udma_m2s_data_rd_cfg_boundaries_set_v2; - ndhal->ndhal_udma.udma_q_config = udma_q_config_v2; ndhal->ndhal_ndma.ndma_retry_memcpy = true; ndhal->ndhal_ndma.ndma_get_wait_for_completion_time = ndma_get_wait_for_completion_time_v2; ndhal->ndhal_ndma.ndma_validate_pa = ndma_validate_pa_v2; @@ 
-1773,6 +1712,7 @@ int ndhal_register_funcs_v2(void) { ndhal->ndhal_npe.npe_class_node_id_show_data = npe_class_node_id_show_data_v2; ndhal->ndhal_npe.npe_class_server_id_show_data = npe_class_server_id_show_data_v2; ndhal->ndhal_npe.npe_class_ultraserver_mode_show_data = npe_class_ultraserver_mode_show_data_v2; + ndhal->ndhal_perf.perf_set_profile = perf_set_profile_v2; ndhal->ndhal_tpb.pe_xbus_count = 5; ndhal->ndhal_tpb.pe_row_grp_count = 4; ndhal->ndhal_tpb.pe_col_grp_count = 4; diff --git a/v3/neuron_dhal_v3.c b/v3/neuron_dhal_v3.c index c138c30..c6682b4 100644 --- a/v3/neuron_dhal_v3.c +++ b/v3/neuron_dhal_v3.c @@ -6,10 +6,6 @@ #include #include -#include -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 10, 0) -#include -#endif #include "sdma.h" #include "notific.h" @@ -215,6 +211,12 @@ u64 ntpb_pe_idle_cntr_offsets_v3[V3_NC_PER_DEVICE] = V3_TPB_PE_ACTIVITY_COUNTER_OFFSET(V3_TPB_PE_SEQ_QUEUE_PERF_OFFSET(V3_APB_IO_1_USER_SE_1_TPB_1_PE_SEQ_CLUSTER_HOST_VISIBLE_BASE, V3_APB_IO_1_BASE, V3_PCIE_BAR0_APB_IO_1_OFFSET), 4, V3_TPB_ARR_SEQ_QUEUE_PERF_IDLE_CYCLE_CNT_LSB_OFFSET), }; +u32 npe_neighbor_eng_ids_v3[2][2] = +{ + {36, 68}, // Left + {4, 100} // Right +}; + static int ndhal_register_funcs_trn2(void) { if (!ndhal) { pr_err("ndhal is null. 
Can't register functions for trn2."); @@ -227,59 +229,39 @@ static int ndhal_register_funcs_trn2(void) { return 0; } -static bool ndhal_instance_type_pod(void) -{ - static bool instance_type_is_pod = false; -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 10, 0) +/* Instance names + */ #define NEURON_TRN2P_INSTANCE_NAME "trn2p.48xlarge" #define NEURON_TRN2EU_INSTANCE_NAME "trn2eu.48xlarge" #define NEURON_TRN2U_INSTANCE_NAME "trn2u.48xlarge" - static bool initialized = false; - ssize_t len; - ssize_t file_size; - void *buf = NULL; - - if (initialized) { - return instance_type_is_pod; - } - - initialized = true; - - buf = kzalloc(PAGE_SIZE, GFP_KERNEL); - - if (buf == NULL) { - pr_err("failed to allocate buffer to read instance type"); - goto done; - } - - len = kernel_read_file_from_path("/sys/class/dmi/id/product_name", - 0, &buf, 64, &file_size, READING_UNKNOWN); +#define NEURON_TRN2UAC_INSTANCE_NAME "trn2u-ac.24xlarge" +#define NEURON_TRN2PDS_INSTANCE_NAME "trn2es.48xlarge" - if (!len) { - pr_err("read instance type failed"); - goto done; - } +static enum neuron_platform_type ndhal_platform_type_v3(void) +{ + enum neuron_platform_type platform_type = NEURON_PLATFORM_TYPE_INVALID; + char buf[128]; + if (narch_get_instance_type_name(buf, sizeof(buf))) goto done; if ((strncmp(buf, NEURON_TRN2P_INSTANCE_NAME, sizeof(NEURON_TRN2P_INSTANCE_NAME)-1) == 0) || (strncmp(buf, NEURON_TRN2EU_INSTANCE_NAME, sizeof(NEURON_TRN2EU_INSTANCE_NAME)-1) == 0) || - (strncmp(buf, NEURON_TRN2U_INSTANCE_NAME, sizeof(NEURON_TRN2U_INSTANCE_NAME)-1) == 0)) { - instance_type_is_pod = true; + (strncmp(buf, NEURON_TRN2U_INSTANCE_NAME, sizeof(NEURON_TRN2U_INSTANCE_NAME)-1) == 0) || + (strncmp(buf, NEURON_TRN2UAC_INSTANCE_NAME, sizeof(NEURON_TRN2UAC_INSTANCE_NAME)-1) == 0)) { + platform_type = NEURON_PLATFORM_TYPE_ULTRASERVER; + } else if ((strncmp(buf, NEURON_TRN2PDS_INSTANCE_NAME, sizeof(NEURON_TRN2PDS_INSTANCE_NAME)-1) == 0)) { + platform_type = NEURON_PLATFORM_TYPE_PDS; + } else { + platform_type 
= NEURON_PLATFORM_TYPE_STD; } done: - if (buf) { - kfree(buf); - } -#endif if (force_userver) { - instance_type_is_pod = true; + platform_type = NEURON_PLATFORM_TYPE_ULTRASERVER; } - return instance_type_is_pod; + return platform_type; } - - /* Device Reset Functions */ /** * nr_get_tpb_reset_map() - generates a the reset map of all resources associated with resetting a particular TPB @@ -416,7 +398,7 @@ static int nr_wait_for_reset_completion_v3_emu(struct neuron_device *nd) */ static int nr_post_reset_config_v3(struct neuron_device *nd, bool reset_successful) { - if (!ndhal_instance_type_pod()) { + if (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_STD) { return 0; } @@ -744,26 +726,26 @@ static uint32_t ndmar_get_h2t_eng_id_v3(struct neuron_device *nd, uint32_t nc_id } /** - * ndmar_get_h2t_qid() - return the H2T engine's queue id for this core + * ndmar_get_h2t_def_qid() - return the H2T engine's default queue id for this core * * @param nc_id: Neuron core corresponding to H2T engine * Return DMA queue id */ -static int ndmar_get_h2t_qid_v3(uint32_t nc_id) +static int ndmar_get_h2t_def_qid_v3(uint32_t nc_id) { // on V3 on h2t engines are shared between 2 cores so we give the even core queue 0 and the odd core queue 1 return nc_id % V3_NC_PER_SENG; } /** - * ndmar_is_h2t_q() - return true + * ndmar_is_h2t_def_q() - return true * * @param nd: Neuron device which contains the DMA engine * @param eng_id: engine id * @param q_id: queue id - * Return true if this is an h2t queue + * Return true if this is a default h2t queue */ -static bool ndmar_is_h2t_q_v3(struct neuron_device *nd, uint32_t eng_id, uint32_t q_id) +static bool ndmar_is_h2t_def_q_v3(struct neuron_device *nd, uint32_t eng_id, uint32_t q_id) { return (nd->ndma_engine[eng_id].used_for_h2t && ((q_id == 0) || (q_id == 1))); } @@ -915,7 +897,7 @@ const int trn2_32xl_neigbor_ids[16][4] = { */ static int fw_io_topology_v3(struct fw_io_ctx *ctx, int pdev_index, int device_id, u32 
*connected_device_ids, int *count) { - // V3 does not have Pacific support to detect east/west/south/north neighbors like V1, + // V3 does not have Pacific support to detect east/west/south/north neighbors, // so its topology is hardcoded based on instance type. *count = 0; @@ -924,6 +906,9 @@ static int fw_io_topology_v3(struct fw_io_ctx *ctx, int pdev_index, int device_i switch (pdev_index) { case TRN2_DEVICE_ID0: // Trn2 + case TRN3_DEVICE_ID0: // Trn3 + case TRN3_DEVICE_ID1: // Trn3 + // TODO PDS update if (total_neuron_devices % 16 == 0) { int i; *count = 4; @@ -977,6 +962,39 @@ static int fw_io_read_csr_array_v3(void **ptrs, u32 *values, u32 num_csrs, bool } +/** + * fw_io_execute_request() - Execute commands + * + * @param ctx: FWIO context + * @param command_id: command ID + * @param req: Request data + * @param req_size: Request size + * @param resp: Response buffer + * @param resp_size: Response buffer size + * + * @return int: 0 on success, -1 on failure + */ +static int fw_io_execute_request_v3(struct fw_io_ctx *ctx, u8 command_id, const u8 *req, u32 req_size, u8 *resp, u32 resp_size) +{ + return fw_io_execute_request(ctx, command_id, req, req_size, resp, resp_size); +} + + +/** + * fw_io_post_metric() - Post metrics to CW + * + * @param ctx: FWIO context + * @param data: Data to write + * @param size: Size of data + * + * @return int: 0 on success, -1 on failure + */ +static int fw_io_post_metric_v3(struct fw_io_ctx *ctx, u8 *data, u32 size) +{ + return fw_io_post_metric(ctx, data, size); +} + + /* Register Access (read and write) Functions */ /** * reg_read32_array() - read an array of 32bit registers. @@ -1052,7 +1070,7 @@ static int root_info_node_attrs_info_tbl_cnt_v3 = sizeof(root_info_node_attrs_in * @param attr_info_tbl: the ecc attributes as an array * @return int 0 on success; otherwise on failure * - * Note: ecc errors are only supported by sysfs for V2/3. 
TODO: V1 support will be added + * Note: ecc errors are only supported by sysfs for V2/3. */ static int nsysfsmetric_add_ecc_nodes_v3(struct nsysfsmetric_metrics *metrics, struct nsysfsmetric_node *stats_node, @@ -1297,18 +1315,34 @@ static int neuron_pci_handle_dup_routing_id(void) // for V3 rename Neuron devices for better customer experience. // see internal documentation: TRN2-Discovery // map routing id to user id: -static const u32 v3_routing_id_to_user_id[] = { +static const u32 v3_torus_routing_id_to_user_id[] = { 0, 3, 4, 7, 12, 15, 8, 11, 1, 2, 5, 6, 13, 14, 9, 10 }; -#define V3_ROUTING_ID_TBL_SZ (sizeof(v3_routing_id_to_user_id) / sizeof(v3_routing_id_to_user_id[0])) +// map routing id to user id for trn2pds instance type. +// the only hard rule this map needs to follow is +// rid (i*2) and rid (i*2)+1 map to did (n*2) and did (n*2)+1 +// since rid (i*2) and rid (i*2)+1 are on the same JBOG. +static const u32 v3_pds_routing_id_to_user_id[] = { + 0, 1, + 2, 3, + 4, 5, + 6, 7, + 8, 9, + 10, 11, + 12, 13, + 14, 15 }; + +#define V3_ROUTING_ID_TBL_SZ (sizeof(v3_torus_routing_id_to_user_id) / sizeof(v3_torus_routing_id_to_user_id[0])) static u32 neuron_pci_routing_id_to_user_id(u32 routing_id) { - u32 user_id_base = v3_routing_id_to_user_id[ routing_id % V3_ROUTING_ID_TBL_SZ]; - return user_id_base + (routing_id / V3_ROUTING_ID_TBL_SZ) * V3_ROUTING_ID_TBL_SZ; + if (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_PDS) { + return v3_pds_routing_id_to_user_id[ routing_id % V3_ROUTING_ID_TBL_SZ]; + } + return v3_torus_routing_id_to_user_id[ routing_id % V3_ROUTING_ID_TBL_SZ]; } /** @@ -1317,14 +1351,13 @@ static u32 neuron_pci_routing_id_to_user_id(u32 routing_id) * @param dev: PCI device * @param nd: neuron device * @return int: 0 on success, otherwise on failure - * - * for V1, this function is dummy */ static int neuron_pci_get_device_id_v3(struct neuron_device *nd, struct pci_dev *dev) { int ret = 0; int i; u32 routing_id = (u32)-1; + u32 
routing_id_max = MAX_NEURON_DEVICE_COUNT; // Poll the device id until the device is ready for (i = 0; i < 20; i++) { @@ -1340,8 +1373,28 @@ static int neuron_pci_get_device_id_v3(struct neuron_device *nd, struct pci_dev return -ENODEV; } + if (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_PDS) { + u32 server_info = 0; + bool server_id_valid = 0; + u32 server_id = 0; + ret = fw_io_server_info_read(nd->npdev.bar0, &server_info); + if (ret) { + return -ENODEV; + } + + server_id_valid = (server_info >> 15) & 0x1; // TODO PDS we probably need const shift value or macro + if (server_id_valid) { + server_id = server_info & 0x7fff; // TODO PDS we probably need constant mask for this + } else { + pr_err("Could not retrieve valid server id, ret = %d\n", ret); + return -ENODEV; + } + ndhal->ndhal_arch.server_id = server_id; + routing_id_max = MAX_NEURON_DEVICE_COUNT * 2; + } + // TODO - this should be a "valid routing_id check for TRN2 - if (routing_id < 0 || routing_id >= MAX_NEURON_DEVICE_COUNT) { + if (routing_id < 0 || routing_id >= routing_id_max) { pr_err("Invalid device index %u", routing_id); return -ENODEV; } @@ -1374,8 +1427,16 @@ neuron_pci_device_id_to_rid_map_v3(uint32_t * count, uint32_t * did_to_rid_map) switch (ndhal->pci_device_id) { case TRN2_DEVICE_ID0: + case TRN3_DEVICE_ID0: + case TRN3_DEVICE_ID1: for (i = 0; i < total_neuron_devices; i++) { - did_to_rid_map[neuron_pci_routing_id_to_user_id(i)] = i; + u32 routing_id; + if (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_PDS) { + routing_id = i + ndhal->ndhal_arch.server_id * total_neuron_devices; + } else { + routing_id = i; + } + did_to_rid_map[neuron_pci_routing_id_to_user_id(routing_id)] = routing_id; } break; @@ -1408,7 +1469,7 @@ neuron_pci_device_id_to_rid_map_v3(uint32_t * count, uint32_t * did_to_rid_map) * * - Version 3 of runtime requires 1) aligned memory allocation support 2) SPROT. 
* - Version 4 of the runtime requires support for DMA queue init w/o already allocated rings (2.7). - * - Version 5 of the runtime requires V2 device renumbering (don't care for V1). + * - Version 5 of the runtime requires V2 device renumbering (don't care before V2). * - Version 6 of the runtime requires ham notification support, * + new V2 reset api for single-tpb reset + new notification init API with force mem realloc/resize. * - Version 7 of the runtime requires udma queue size support for non power of 2 rings + dmabuf support. @@ -1431,7 +1492,7 @@ static void ncdev_compatible_version_v3(struct neuron_ioctl_compatible_version * } /** - * ncdev_quiesce_exec_on_proc_exit() - for V1, before resetting DMA, allow current NeuronCore execution to finish and settle + * ncdev_quiesce_exec_on_proc_exit() * * Note: * When a process is killed, the driver resets DMA but there is no @@ -1455,61 +1516,6 @@ static void ncdev_quiesce_exec_on_proc_exit_v3(void) return; } -/** - * ncdev_bar_write_data() - write data to bar - * - * @param nd: neuron device - * @param bar: the BAR to write to - * @param reg_addresses - * @param data: the data to be written into the bar - * @param data_count: the number of data to be written - * @return 0 on success, otherwise failure - * - * V1: - * For BAR0 the addresses are passed as array(random access). - * For BAR2 a single address is provided and driver does sequential writes. - * V2/3: - * Only BAR0 is used right now. 
TODO: change runtime ioctl -*/ -static int ncdev_bar_write_data_v3(struct neuron_device *nd, u8 bar, u64 *reg_addresses, u32 *data, u32 data_count) -{ - if (bar == 0) { - int i; - for (i = 0; i < data_count; i++) { - u64 off = reg_addresses[i] - (u64)nd->npdev.bar0; - if (off > nd->npdev.bar0_size) { - return -EINVAL; - } - if (ndhal->ndhal_ndma.ndma_is_bar0_write_blocked(off)) { - return -EINVAL; - } - writel(data[i], nd->npdev.bar0 + off); - trace_bar_write(nd, bar, off, data[i]); - } - } else if (bar == 4) { - // TODO: we don't have any use case for r/w memory over the BAR right now. Disabling. - // - // We'd like to use DMA for r/w of BAR4 because we might expect access to large amounts of data. - // Access via DMA requires an application to own a TPB because it determines which of the h2t DMAs - // are safe to use, otherwise a TPB along with its DMA could be reset while that DMA is used here. - // Don't want/need to solve it now. - return -EINVAL; - - /* - dma_addr_t dst_addr = reg_addresses[0] - (u64)nd->npdev.bar0; - - ret = ndma_memcpy(nd, 0, virt_to_phys(data) | ndhal->ndhal_address_map.pci_host_base, dst_addr, data_size); - if (ret) - return ret; - */ - } else { - pr_err("direct BAR%d write is not supported.\n", bar); - return -EINVAL; - } - - return 0; -} - static void ncdev_get_default_tpbs_for_hbm_v3(u32 hbm_index, u32 tpbs[MAX_NC_PER_DEVICE], u32 *tpb_count) { tpbs[0] = hbm_index * 2; @@ -1517,43 +1523,6 @@ static void ncdev_get_default_tpbs_for_hbm_v3(u32 hbm_index, u32 tpbs[MAX_NC_PER *tpb_count = 2; } -/* UDMA Functions */ -#define UDMA_AXI_M2S_DATA_RD_CFG_ALWAYS_BREAK_ON_MAX_BOUDRY (1 << 16) -/** - * udma_m2s_data_rd_cfg_boundaries_set(): set data_rd_cfg to break at 256B boundaries - * - * @param udma: the UDMA structure - * - * for V1, this function is dummy - */ -static void udma_m2s_data_rd_cfg_boundaries_set_v3(struct udma *udma) -{ - reg_write32(&udma->udma_regs_m2s->axi_m2s.data_rd_cfg, - UDMA_AXI_M2S_DATA_RD_CFG_ALWAYS_BREAK_ON_MAX_BOUDRY | 
0x8); -} - -#define UDMA_M2S_Q_RATE_LIMIT_MASK_INTERNAL_PAUSE_DMB (1 << 2) -/** - * udma_q_config() - set misc queue configurations - * - * @param udma_q udma_q: the queue data structure - * - * for V1, this function is dummy - */ -static void udma_q_config_v3(struct udma_q *udma_q) -{ - if (udma_q->type != UDMA_TX) { - return; - } - - uint32_t *reg_addr = &udma_q->q_regs->m2s_q.rlimit.mask; - uint32_t val = udma_q->rlimit_mask; - - // enable DMB - val &= ~UDMA_M2S_Q_RATE_LIMIT_MASK_INTERNAL_PAUSE_DMB; - reg_write32(reg_addr, val); -} - /* NDMA Functions */ /** @@ -1570,7 +1539,7 @@ static void ndma_get_wait_for_completion_time_v3(u32 count, bool async, u64 *fir { u64 est_wait_time = 2 * (count -1); *first_wait_time = async ? 1 : (est_wait_time - 1); // FIXME - need to adjust for zerocopy - *following_wait_time = (est_wait_time * 100) - *first_wait_time; + *following_wait_time = (est_wait_time * 400) - *first_wait_time; // for some reason getting a timeout when staging some of BERT training graphs. // https://tiny.amazon.com/8jw7wl18 @@ -1592,16 +1561,12 @@ static void ndma_get_wait_for_completion_time_v3_emu(u32 count, bool async, u64 /** * ndma_validate_pa() - check the validity of the desc physical addresses - * V1: * west side: PCIEX4_1_BASE: 0x00c00000000000 host: PCIEX8_0_BASE: 0x00400000000000 * If west side is set then even host bit is set. When mc_alloc is called we set only the host bit * and insert into tree.. If some one sets the west side on that PA, then there is no way to check that, * since there could be a tdram address that could have the west side set * (that will look as though host is also set) - * V2: - * similar idea. Just check for valid address allocated in host memory - * V3: - * similar idea. 
Just check for valid address allocated in host memory + * Just check for valid address allocated in host memory * * @param nd: the neuron device * @param pa: the desc physical addresses @@ -1838,7 +1803,7 @@ static bool ndhal_die_flipped(void) if (force_die_flip) { return true; } - if (!ndhal_instance_type_pod()) { + if (!(ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_ULTRASERVER)) { return false; } @@ -1897,6 +1862,7 @@ static void ndma_get_engines_with_host_connectivity_v3(u32 hbm_index, u32 engine /* POD Functions */ + /** * npe_notify_mark() - api for crwl to notify range marking (core claiming) activities * @@ -1906,7 +1872,7 @@ static void ndma_get_engines_with_host_connectivity_v3(u32 hbm_index, u32 engine */ static void npe_notify_mark_v3(int mark_cnt, bool mark) { - if (ndhal_instance_type_pod()) { + if (ndhal->ndhal_arch.platform_type != NEURON_PLATFORM_TYPE_STD) { npe_notify_mark(mark_cnt, mark); } } @@ -1923,17 +1889,26 @@ static void npe_notify_mark_v3(int mark_cnt, bool mark) */ static int npe_pod_info_v3(u8 *pod_type, u8 *pod_id, u8 *pod_sz, enum neuron_ultraserver_mode *mode, u32 *modes_supported) { - if (!ndhal_instance_type_pod()) { + if ((ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_STD)) { *pod_type = NEURON_POD_TYPE_NONE; *pod_sz = 0; *mode = NEURON_ULTRASERVER_MODE_UNSET; *modes_supported = 0; - } else { + } else if (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_PDS) { + *pod_type = NEURON_POD_TYPE_SWITCH; + npe_get_pod_sz(pod_sz); + npe_get_pod_id(pod_id); + npe_get_pod_mode(mode); + npe_get_pod_modes_supported(modes_supported); + } else if (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_ULTRASERVER) { *pod_type = NEURON_POD_TYPE_P2P; npe_get_pod_sz(pod_sz); npe_get_pod_id(pod_id); npe_get_pod_mode(mode); npe_get_pod_modes_supported(modes_supported); + } else { + pr_err("invalid platform type"); + return -EINVAL; } return 0; } @@ -1947,7 +1922,7 @@ static int npe_pod_info_v3(u8 *pod_type, u8 *pod_id, u8 
*pod_sz, enum neuron_ult */ static int npe_pod_status_v3(u32 *pod_state, s8 *node_id) { - if (!ndhal_instance_type_pod()) { + if (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_STD) { *pod_state = NEURON_POD_E_STATE_SINGLE_NODE; *node_id = -1; return 0; @@ -1967,12 +1942,23 @@ static int npe_pod_status_v3(u32 *pod_state, s8 *node_id) */ static int npe_pod_ctrl_v3(struct neuron_device *nd, u32 pod_ctrl, enum neuron_ultraserver_mode mode, u32 timeout, u32 *pod_state) { - if (!ndhal_instance_type_pod()) { + if (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_STD) { return 0; } return npe_pod_ctrl(nd, pod_ctrl, mode, timeout, pod_state); } +static int perf_set_profile_v3(struct neuron_device *nd, uint32_t profile) +{ + int ret; + ret = fw_io_set_power_profile(nd->fw_io_ctx, profile); + if (ret == 0) { + ndhal->ndhal_perf.current_performance_profile = profile; + nmetric_set_performance_profile(nd, profile); + } + return ret; +} + /** * npe_class_node_id_show_data() - return sysfs class node_id * @@ -1982,7 +1968,7 @@ static int npe_pod_ctrl_v3(struct neuron_device *nd, u32 pod_ctrl, enum neuron_u */ static ssize_t npe_class_node_id_show_data_v3(char *buf, u32 sz) { - if (!ndhal_instance_type_pod()) { + if (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_STD) { return dhal_sysfs_emit(buf, "-1\n"); } return npe_class_node_id_show_data(buf, sz); @@ -1997,7 +1983,7 @@ static ssize_t npe_class_node_id_show_data_v3(char *buf, u32 sz) */ static ssize_t npe_class_server_id_show_data_v3(char *buf, u32 sz) { - if (!ndhal_instance_type_pod()) { + if (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_STD) { return dhal_sysfs_emit(buf, "0000000000000000\n"); } return npe_class_server_id_show_data(buf, sz); @@ -2011,7 +1997,7 @@ static ssize_t npe_class_server_id_show_data_v3(char *buf, u32 sz) */ static ssize_t npe_class_ultraserver_mode_show_data_v3(char *buf) { - if (!ndhal_instance_type_pod()) { + if (ndhal->ndhal_arch.platform_type == 
NEURON_PLATFORM_TYPE_STD) { return dhal_sysfs_emit(buf, "\n"); } return npe_class_ultraserver_mode_show_data(buf); @@ -2053,7 +2039,7 @@ static int ntpb_pe_get_aggregated_wl_cycle_cnt_v3(struct neuron_device *nd, int */ static void ndhal_ext_cleanup_v3(void) { - if (ndhal_instance_type_pod()) { + if (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_ULTRASERVER) { npe_cleanup(); } return; @@ -2083,6 +2069,7 @@ int ndhal_register_funcs_v3(void) { return -EINVAL; } + ndhal->ndhal_arch.platform_type = ndhal_platform_type_v3(); ndhal->ndhal_address_map.pci_host_base = V3_PCIE_A0_BASE; ndhal->ndhal_address_map.mmap_p_offset = V3_MMAP_P_OFFSET; ndhal->ndhal_address_map.mmap_nc_event_offset = V3_MMAP_NC_EVENT_OFFSET; @@ -2102,8 +2089,6 @@ int ndhal_register_funcs_v3(void) { ndhal->ndhal_address_map.dma_eng_per_nc = V3_DMA_ENG_PER_NC; ndhal->ndhal_address_map.dram_channels = V3_MAX_DRAM_CHANNELS; ndhal->ndhal_reset.reset_poll_interval = V3_NR_RESET_POLL_INTERVAL; - ndhal->ndhal_reset.reset_device_initial_poll_delay = 0; - ndhal->ndhal_reset.reset_tpb_initial_poll_delay = 0; ndhal->ndhal_reset.initiate_max_wait_time = V3_NR_RESET_INIT_MAX_TOTAL_WAIT_TIME_MS; ndhal->ndhal_reset.retry_count = NR_RESET_RETRY_COUNT; ndhal->ndhal_reset.nr_post_reset_config = nr_post_reset_config_v3; @@ -2120,8 +2105,8 @@ int ndhal_register_funcs_v3(void) { ndhal->ndhal_mpset.mpset_set_dram_and_mpset_info = mpset_set_dram_and_mpset_info_v3; ndhal->ndhal_mpset.mpset_block_carveout_regions = mpset_block_carveout_regions_v3; ndhal->ndhal_ndmar.ndmar_get_h2t_eng_id = ndmar_get_h2t_eng_id_v3; - ndhal->ndhal_ndmar.ndmar_get_h2t_qid = ndmar_get_h2t_qid_v3; - ndhal->ndhal_ndmar.ndmar_is_h2t_q = ndmar_is_h2t_q_v3; + ndhal->ndhal_ndmar.ndmar_get_h2t_def_qid = ndmar_get_h2t_def_qid_v3; + ndhal->ndhal_ndmar.ndmar_is_h2t_def_q = ndmar_is_h2t_def_q_v3; ndhal->ndhal_ndmar.nr_init_h2t_eng = nr_init_h2t_eng_v3; ndhal->ndhal_ndmar.ndmar_is_nx_ring = ndmar_is_nx_ring_v3; 
ndhal->ndhal_ndmar.ndmar_quiesce_queues = ndmar_quiesce_queues_v3; @@ -2129,6 +2114,8 @@ int ndhal_register_funcs_v3(void) { ndhal->ndhal_fw_io.fw_io_topology = fw_io_topology_v3; ndhal->ndhal_fw_io.fw_io_register_readless_read_region = fw_io_register_readless_read_region_v3; ndhal->ndhal_fw_io.fw_io_read_csr_array = fw_io_read_csr_array_v3; + ndhal->ndhal_fw_io.fw_io_execute_request = fw_io_execute_request_v3; + ndhal->ndhal_fw_io.fw_io_post_metric = fw_io_post_metric_v3; ndhal->ndhal_mmap.dm_mmap_special = dm_mmap_special_v3; ndhal->ndhal_mmap.mmap_get_bar4_offset = mmap_get_bar4_offset_v3; ndhal->ndhal_sysfs_metrics.root_info_node_attrs_info_tbl_cnt = root_info_node_attrs_info_tbl_cnt_v3; @@ -2137,6 +2124,7 @@ int ndhal_register_funcs_v3(void) { ndhal->ndhal_sysfs_metrics.nsysfsmetric_get_hbm_error_count = nsysfsmetric_get_hbm_error_count_v3; ndhal->ndhal_sysfs_metrics.nsysfsmetric_add_tensor_engine_node = nsysfsmetric_add_tensor_engine_node_v3; ndhal->ndhal_pci.axi_bar = BAR_UNUSED; + ndhal->ndhal_pci.apb_bar = 0; ndhal->ndhal_pci.dram_bar = 4; ndhal->ndhal_pci.neuron_pci_release_bar = neuron_pci_release_bar_v3; ndhal->ndhal_pci.neuron_pci_reserve_bar = neuron_pci_reserve_bar_v3; @@ -2147,12 +2135,9 @@ int ndhal_register_funcs_v3(void) { ndhal->ndhal_cdev.ncdev_bar0_write_blocked_addrs = ncdev_bar0_write_blocked_addrs_v3; ndhal->ndhal_cdev.ncdev_compatible_version = ncdev_compatible_version_v3; ndhal->ndhal_cdev.ncdev_quiesce_exec_on_proc_exit = ncdev_quiesce_exec_on_proc_exit_v3; - ndhal->ndhal_cdev.ncdev_bar_write_data = ncdev_bar_write_data_v3; ndhal->ndhal_cdev.ncdev_logical_to_physical_nc_map = ncdev_logical_to_physical_nc_map_v3; ndhal->ndhal_cdev.ncdev_get_default_tpbs_for_hbm = ncdev_get_default_tpbs_for_hbm_v3; ndhal->ndhal_udma.num_beats = 2296; // allow up to 288 outstanding writes - ndhal->ndhal_udma.udma_m2s_data_rd_cfg_boundaries_set = udma_m2s_data_rd_cfg_boundaries_set_v3; - ndhal->ndhal_udma.udma_q_config = udma_q_config_v3; 
ndhal->ndhal_ndma.ndma_retry_memcpy = false; ndhal->ndhal_ndma.ndma_get_wait_for_completion_time = ndma_get_wait_for_completion_time_v3; ndhal->ndhal_ndma.ndma_validate_pa = ndma_validate_pa_v3; @@ -2167,6 +2152,8 @@ int ndhal_register_funcs_v3(void) { ndhal->ndhal_npe.npe_class_node_id_show_data = npe_class_node_id_show_data_v3; ndhal->ndhal_npe.npe_class_server_id_show_data = npe_class_server_id_show_data_v3; ndhal->ndhal_npe.npe_class_ultraserver_mode_show_data = npe_class_ultraserver_mode_show_data_v3; + ndhal->ndhal_npe.npe_neighbor_eng_ids = npe_neighbor_eng_ids_v3; + ndhal->ndhal_perf.perf_set_profile = perf_set_profile_v3; ndhal->ndhal_tpb.pe_xbus_count = 9; ndhal->ndhal_tpb.pe_row_grp_count = 4; ndhal->ndhal_tpb.pe_col_grp_count = 4; @@ -2176,6 +2163,7 @@ int ndhal_register_funcs_v3(void) { ndhal->ndhal_tpb.pe_fast_wl_cntr_offsets = ntpb_pe_fast_wl_cntr_offsets_v3; ndhal->ndhal_tpb.pe_idle_cntr_offsets = ntpb_pe_idle_cntr_offsets_v3; ndhal->ndhal_tpb.pe_get_aggregated_wl_cycle_cnt = ntpb_pe_get_aggregated_wl_cycle_cnt_v3; + ndhal->ndhal_perf.current_performance_profile = 0; ndhal->ndhal_ext_cleanup = ndhal_ext_cleanup_v3; extern unsigned int nmetric_log_posts; @@ -2185,7 +2173,6 @@ int ndhal_register_funcs_v3(void) { ndhal->ndhal_reset.nr_wait_for_reset_completion = nr_wait_for_reset_completion_v3_qemu; ndhal->ndhal_address_map.dma_eng_per_nd = V3_NC_PER_DEVICE * V3_DMA_ENG_PER_NC; ndhal->ndhal_reg_access.reg_read32_array = reg_read32_array_v3_qemu_emu; - ndhal->ndhal_pci.apb_bar = 2; ndhal->ndhal_ndma.ndma_get_wait_for_completion_time = ndma_get_wait_for_completion_time_v3_qemu; ndhal->ndhal_address_map.dice_per_device = 1; @@ -2199,24 +2186,27 @@ int ndhal_register_funcs_v3(void) { ndhal->ndhal_address_map.nc_per_device = nc_per_dev_param; ndhal->ndhal_address_map.dev_nc_map = dev_nc_map; ndhal->ndhal_reg_access.reg_read32_array = reg_read32_array_v3_qemu_emu; - ndhal->ndhal_pci.apb_bar = 0; ndhal->ndhal_ndma.ndma_get_wait_for_completion_time = 
ndma_get_wait_for_completion_time_v3_emu; ndhal->ndhal_address_map.dice_per_device = 1; // Disable metrics on emulation nmetric_log_posts = 0; - } else { ndhal->ndhal_reset.nr_initiate_reset = nr_initiate_reset_v3; ndhal->ndhal_reset.nr_wait_for_reset_completion = nr_wait_for_reset_completion_v3; ndhal->ndhal_address_map.dma_eng_per_nd = V3_NC_PER_DEVICE * V3_DMA_ENG_PER_NC; ndhal->ndhal_reg_access.reg_read32_array = reg_read32_array_v3; - ndhal->ndhal_pci.apb_bar = 0; } - if (ndhal_instance_type_pod()) { + if (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_ULTRASERVER) { ret = npe_init(); - } + if (ret) { + pr_err("failed to initialize pod election on V3\n"); + return ret; + } + } else if (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_PDS) { + // TODO PDS + } switch (ndhal->pci_device_id) { case TRN2_DEVICE_ID0: @@ -2226,6 +2216,10 @@ int ndhal_register_funcs_v3(void) { return ret; } break; + // TODO: remove once v4 dhal stops re-using v3 + case TRN3_DEVICE_ID0: + case TRN3_DEVICE_ID1: + break; default: pr_err("Unknown HW architecture. 
Can't init neuron_dhal.\n"); return -EINVAL; diff --git a/v3/neuron_pelect.c b/v3/neuron_pelect.c index 7bcb027..a9d1b0d 100644 --- a/v3/neuron_pelect.c +++ b/v3/neuron_pelect.c @@ -157,6 +157,16 @@ #include "../neuron_crwl.h" #include "neuron_pelect.h" +int userver_pds_node_cnt = 2; +module_param(userver_pds_node_cnt, int, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP); +MODULE_PARM_DESC(userver_pds_node_cnt, "pds ultraserver node count"); + +int userver_pds_server_id = 0x0001; +module_param(userver_pds_server_id, int, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP); +MODULE_PARM_DESC(userver_pds_server_id, "pds ultraserver id"); + + +/* Enable ultraserver auto election (4 node configuration) by default */ /* * UltraServer ctl to * - control of when election is triggered @@ -283,6 +293,8 @@ typedef struct pod_neighbor_io { struct mem_chunk *data_mc; } pod_neighbor_io_t; +static void npe_pds_spoof(void); + static bool npe_pod_ctl_is_set(int value) { return (pod_ctl & value); @@ -844,11 +856,11 @@ static int npe_primary_device_do_election(struct neuron_device *nd, int secondar // Initialize neighbor io structures // Left - ret = npe_pod_neighbor_io_init(&(pnio[0][0]), nd, 36); - ret |= npe_pod_neighbor_io_init(&(pnio[0][1]), nd, 68); + ret = npe_pod_neighbor_io_init(&(pnio[0][0]), nd, ndhal->ndhal_npe.npe_neighbor_eng_ids[0][0]); + ret |= npe_pod_neighbor_io_init(&(pnio[0][1]), nd, ndhal->ndhal_npe.npe_neighbor_eng_ids[0][1]); // Right - ret |= npe_pod_neighbor_io_init(&(pnio[1][0]), nd, 4); - ret |= npe_pod_neighbor_io_init(&(pnio[1][1]), nd, 100); + ret |= npe_pod_neighbor_io_init(&(pnio[1][0]), nd, ndhal->ndhal_npe.npe_neighbor_eng_ids[1][0]); + ret |= npe_pod_neighbor_io_init(&(pnio[1][1]), nd, ndhal->ndhal_npe.npe_neighbor_eng_ids[1][1]); if (ret) { pr_err("neighbor io initialization failed"); goto done; @@ -914,7 +926,6 @@ static int npe_primary_device_do_election(struct neuron_device *nd, int secondar // determine our node id node cnt and pod serial number // node_id = 
npe_get_node_id(serial_number, nbr_serial_number[0], nbr_serial_number[1], diagonal, &node_cnt, &pod_serial_number); - ret = 0; // set election status, with bad node id // @@ -988,11 +999,11 @@ static int npe_secondary_device_vet(struct neuron_device *nd, volatile long unsi // Initialize neighbor io structures // Left - ret = npe_pod_neighbor_io_init(&(pnio[0][0]), nd, 36); - ret |= npe_pod_neighbor_io_init(&(pnio[0][1]), nd, 68); + ret = npe_pod_neighbor_io_init(&(pnio[0][0]), nd, ndhal->ndhal_npe.npe_neighbor_eng_ids[0][0]); + ret |= npe_pod_neighbor_io_init(&(pnio[0][1]), nd, ndhal->ndhal_npe.npe_neighbor_eng_ids[0][1]); // Right - ret |= npe_pod_neighbor_io_init(&(pnio[1][0]), nd, 4); - ret |= npe_pod_neighbor_io_init(&(pnio[1][1]), nd, 100); + ret |= npe_pod_neighbor_io_init(&(pnio[1][0]), nd, ndhal->ndhal_npe.npe_neighbor_eng_ids[1][0]); + ret |= npe_pod_neighbor_io_init(&(pnio[1][1]), nd, ndhal->ndhal_npe.npe_neighbor_eng_ids[1][1]); if (ret) { pr_err("nd%02d: neighbor io initialization failed", nd->device_index); @@ -1041,6 +1052,9 @@ static int npe_secondary_device_vet(struct neuron_device *nd, volatile long unsi ret = -EPIPE; } } + if (ret) { + goto done; + } // set election status, check neighbor's election status, and // clear election data (but not election status) from miscram. @@ -1154,7 +1168,7 @@ int npe_election_exec_on_rst(struct neuron_device *nd, bool reset_successful) // Device 0 is the primary actor in the election/topology discovery process, so // when we process Device 0 reset completions, we need to do some bookkeeping. // - if (nd->device_index == 0) { + if ((nd->device_index == 0) && (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_ULTRASERVER)) { // Prior election results are cached in miscram, for testing purposes, // we can clear the results through a module parameter, allowing us // to ignore the cached results. 
@@ -1188,6 +1202,13 @@ int npe_election_exec_on_rst(struct neuron_device *nd, bool reset_successful) goto done; } } + + // spoof PDS topology/election data + // + if (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_PDS) { + npe_pds_spoof(); + goto done; + } // if we aren't kicking off election on first driver reset (testing) or // if we aren't in init state then we've already made an election decision. @@ -1201,7 +1222,7 @@ int npe_election_exec_on_rst(struct neuron_device *nd, bool reset_successful) if (!npe_all_rst_complete()) { goto done; } - + npe_initiate_election(ndhal_pelect_data.nbr_data_read_timeout); done: @@ -1563,6 +1584,12 @@ int npe_pod_ctrl(struct neuron_device *nd, u32 ctrl, enum neuron_ultraserver_mod } else if (ctrl == NEURON_NPE_POD_CTRL_REQ_POD) { int mark_cnt = ncrwl_range_mark_cnt_get(); + // no election required on PDS, return success + if (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_PDS) { + ret = 0; + goto done; + } + if ((mark_cnt == 0) && npe_all_rst_complete()) { npe_initiate_election(timeout * 1000); ret = 0; @@ -1774,6 +1801,42 @@ ssize_t npe_class_ultraserver_mode_show_data(char *buf) return dhal_sysfs_emit(buf, "%s\n", output); } +/* npe_pds_spoof(void) + * + * temp spoof of PDS platform data + * + */ +static void npe_pds_spoof(void) +{ + static bool initialized = false; + pr_info("spoofing pds data"); + + if (initialized) { + return; + } + + ndhal_pelect_data.node_cnt = userver_pds_node_cnt; + + if (ndhal_pelect_data.node_cnt == 0) { + ndhal_pelect_data.node_id = -1; + } else if (ndhal_pelect_data.node_cnt == 2) { + // node_cnt of 2 uses V-links + ndhal_pelect_data.lr_mask = 0x1; + ndhal_pelect_data.node_id = ndhal->ndhal_arch.server_id; + } else if (ndhal_pelect_data.node_cnt == 4) { + // TODO PDS add in rack id + ndhal_pelect_data.node_id = ndhal->ndhal_arch.server_id; + } else { + ndhal_pelect_data.node_cnt = 0; + pr_err("invalid PDS node count of %d", ndhal_pelect_data.node_cnt); + } + + 
ndhal_pelect_data.pod_serial_num = userver_pds_server_id; + ndhal_pelect_data.pod_state_internal = NEURON_NPE_POD_ST_ELECTION_SUCCESS; + + initialized = true; +} + int npe_init(void) { // force election to use right or left link only diff --git a/v4/address_map.h b/v4/address_map.h new file mode 100644 index 0000000..4391557 --- /dev/null +++ b/v4/address_map.h @@ -0,0 +1,230 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2025 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#ifndef __V4_ADDR_MAP_H__ + +// Host memory access +#define V4_PCIE_A0_BASE 0x00400000000000ull + +// Neighbor memory access +#define V4_PCIE_B0_0_BASE 0x10000000000000ull +#define V4_PCIE_B0_1_BASE 0x14000000000000ull +#define V4_PCIE_B0_2_BASE 0x18000000000000ull +#define V4_PCIE_B0_3_BASE 0x1c000000000000ull + +// relative to nc +#define V4_MMAP_P_OFFSET 0x0000000d0000000ull +#define V4_MMAP_NC_EVENT_OFFSET 0x00000002700000ull +#define V4_MMAP_NC_SEMA_READ_OFFSET V4_MMAP_NC_EVENT_OFFSET + 0x00000000001000ull +#define V4_MMAP_NC_SEMA_SET_OFFSET V4_MMAP_NC_EVENT_OFFSET + 0x00000000001400ull +#define V4_MMAP_NC_SEMA_INCR_OFFSET V4_MMAP_NC_EVENT_OFFSET + 0x00000000001800ull +#define V4_MMAP_NC_SEMA_DECR_OFFSET V4_MMAP_NC_EVENT_OFFSET + 0x00000000001c00ull +#define V4_MMAP_NC_SEMA_SIZE 0x00000000002000ull + + +#define V4_PCIE_BAR0_APB_IO_0_OFFSET 0x000000000000000ull +#define V4_PCIE_BAR0_APB_IO_1_OFFSET 0x000000040000000ull +#define V4_PCIE_BAR0_APB_SE_0_OFFSET 0x000000080000000ull +#define V4_PCIE_BAR0_APB_SE_1_OFFSET 0x000000090000000ull +#define V4_PCIE_BAR0_APB_SE_2_OFFSET 0x0000000a0000000ull +#define V4_PCIE_BAR0_APB_SE_3_OFFSET 0x0000000b0000000ull +#define V4_PCIE_BAR0_APB_IO_0_SIZE 0x000000020000000ull + + +// relative to bar0 +#define V4_APB_IO_0_USER_IO_RELBASE 0x00000006800000ull +#define V4_APB_IO_0_USER_IO_MISC_RAM_RELBASE 0x00000000484000ull + +// relative to address space +#define V4_APB_MISC_RAM_OFFSET 0x00008006c84000ull + +#define V4_MMAP_NC_SIZE 
0x00000804000000ull + +// +#define V4_NUM_DIE_PER_DEVICE 2 +#define V4_NUM_SENG_PER_DIE 2 +#define V4_SENG_PER_DEVICE (V4_NUM_DIE_PER_DEVICE * V4_NUM_SENG_PER_DIE) + +// Number of Neuron Core per device +#define V4_NC_PER_SENG 2 +#define V4_NC_PER_DIE (V4_NUM_SENG_PER_DIE * V4_NC_PER_SENG) +#define V4_NC_PER_DEVICE (V4_NC_PER_SENG * V4_SENG_PER_DEVICE) +// Number of DMA engines per NC +#define V4_DMA_ENG_PER_NC 16 + +// Number of DMA queues in each engine +#define V4_DMA_QUEUE_PER_ENG 16 + +#define V4_NUM_DMA_ENG_PER_SENG (V4_NC_PER_SENG * V4_DMA_ENG_PER_NC) +#define V4_NUM_SENG_DMA_PER_DEVICE (V4_NC_PER_DEVICE * V4_DMA_ENG_PER_NC) +#define V4_NUM_H2D_DMA_PER_DEVICE 4 +#define V4_NUM_DMA_ENG_PER_DEVICE (V4_NUM_SENG_DMA_PER_DEVICE + V4_NUM_H2D_DMA_PER_DEVICE) +#define V4_MAX_DMA_RINGS 16 + +// Number of TPB engines per NC +#define V4_TPB_ENG_PER_NC 5 + +// Number of TOP_SP +#define V4_TS_PER_NC 2 +#define V4_TS_PER_DIE (V4_NC_PER_DIE * V4_TS_PER_NC) +#define V4_TS_PER_DEVICE (V4_NC_PER_DEVICE * V4_TS_PER_NC) + +// max channels supported by V4 device +#define V4_MAX_DRAM_CHANNELS 4 + +#define V4_SEMAPHORE_COUNT 256 +#define V4_EVENTS_COUNT 256 + +#define V4_ALLOWED_DESC_PER_PACKET 64 + +#define V4_MAX_NQ_QUEUES 16 +#define V4_MAX_NQ_TYPE 5 +#define V4_MAX_NQ_SUPPORTED (V4_MAX_NQ_TYPE * V4_MAX_NQ_QUEUES) + +#define V4_APB_IO_0_BASE 0x00008000000000ull +#define V4_APB_IO_1_BASE 0x00808000000000ull +#define V4_HBM_0_BASE 0x00000000000000ull +#define V4_HBM_1_BASE 0x00004000000000ull +#define V4_HBM_2_BASE 0x00800000000000ull +#define V4_HBM_3_BASE 0x00804000000000ull +#define V4_HBM_SIZE 0x001000000000ull +#define V4_HBM_ACTIVE_SIZE 0x900000000ull /* Each HBM stack is actually 36GB not 64GB as above define suggests */ + +#define V4_PREPROC_0_BASE 0x00001200000000ull +#define V4_PREPROC_1_BASE 0x00005200000000ull +#define V4_PREPROC_2_BASE 0x00801200000000ull +#define V4_PREPROC_3_BASE 0x00805200000000ull +#define V4_PREPROC_SIZE 0x000000034c0000ull + +#define 
V4_MMAP_TPB_0_BASE 0x00002000000000ull +#define V4_MMAP_TPB_1_BASE 0x00003000000000ull +#define V4_MMAP_TPB_2_BASE 0x00006000000000ull +#define V4_MMAP_TPB_3_BASE 0x00007000000000ull +#define V4_MMAP_TPB_4_BASE 0x00802000000000ull +#define V4_MMAP_TPB_5_BASE 0x00803000000000ull +#define V4_MMAP_TPB_6_BASE 0x00806000000000ull +#define V4_MMAP_TPB_7_BASE 0x00807000000000ull + +#define V4_PCIE_BAR0_TPB_0_OFFSET 0x0000000d0000000ull +#define V4_PCIE_BAR0_TPB_1_OFFSET 0x0000000d4000000ull +#define V4_PCIE_BAR0_TPB_2_OFFSET 0x0000000d8000000ull +#define V4_PCIE_BAR0_TPB_3_OFFSET 0x0000000dc000000ull +#define V4_PCIE_BAR0_TPB_4_OFFSET 0x0000000e0000000ull +#define V4_PCIE_BAR0_TPB_5_OFFSET 0x0000000e4000000ull +#define V4_PCIE_BAR0_TPB_6_OFFSET 0x0000000e8000000ull +#define V4_PCIE_BAR0_TPB_7_OFFSET 0x0000000ec000000ull +#define V4_PCIE_BAR0_TPB_SIZE 0x000000004000000ull +#define V4_PCIE_BAR0_TPB_DIST (V4_PCIE_BAR0_TPB_1_OFFSET - V4_PCIE_BAR0_TPB_0_OFFSET) +#define V4_PCIE_BAR0_TPB_SBUF_SIZE 0x000000002000000ull + +#define V4_PCIE_BAR0_APB_IO_0_OFFSET 0x000000000000000ull +#define V4_PCIE_BAR0_APB_IO_1_OFFSET 0x000000040000000ull +#define V4_PCIE_BAR0_APB_IO_DIST (V4_PCIE_BAR0_APB_IO_1_OFFSET - V4_PCIE_BAR0_APB_IO_0_OFFSET) + +#define V4_PCIE_BAR0_APB_SE_0_OFFSET 0x000000080000000ull +#define V4_PCIE_BAR0_APB_SE_1_OFFSET 0x000000090000000ull +#define V4_PCIE_BAR0_APB_SE_2_OFFSET 0x0000000a0000000ull +#define V4_PCIE_BAR0_APB_SE_3_OFFSET 0x0000000b0000000ull +#define V4_PCIE_BAR0_APB_SE_DIST (V4_PCIE_BAR0_APB_SE_1_OFFSET - V4_PCIE_BAR0_APB_SE_0_OFFSET) + +#define V4_PCIE_BAR4_HBM_0_OFFSET 0x000000000000000 +#define V4_PCIE_BAR4_HBM_1_OFFSET 0x000001000000000 +#define V4_PCIE_BAR4_HBM_2_OFFSET 0x000002000000000 +#define V4_PCIE_BAR4_HBM_3_OFFSET 0x000003000000000 + +#define V4_APB_SE_0_USER_FIS_SDMA_0_OFFSET 0x0000000c000000ull +#define V4_APB_SE_1_USER_FIS_SDMA_0_OFFSET 0x0000000c400000ull +#define V4_APB_SE_2_USER_FIS_SDMA_0_OFFSET 0x0000000c000000ull +#define 
V4_APB_SE_3_USER_FIS_SDMA_0_OFFSET 0x0000000c400000ull +#define V4_APB_SE_USER_FIS_SDMA_0_FIS_0_USER_ERRTRIG_OFFSET 0x00000000000000ull +#define V4_APB_SE_USER_FIS_SDMA_0_SIZE 0x00000000020000ull + +#define V4_TOP_SP_SIZE 0x00000000400000ull +#define V4_TOP_SP_0_BASE 0x00008280000000ull +#define V4_TOP_SP_1_BASE 0x000082c0000000ull +#define V4_TOP_SP_2_BASE 0x00008300000000ull +#define V4_TOP_SP_3_BASE 0x00008340000000ull +#define V4_TOP_SP_4_BASE 0x00008380000000ull +#define V4_TOP_SP_5_BASE 0x000083c0000000ull +#define V4_TOP_SP_6_BASE 0x00008400000000ull +#define V4_TOP_SP_7_BASE 0x00008440000000ull +#define V4_TOP_SP_8_BASE 0x00008480000000ull +#define V4_TOP_SP_9_BASE 0x000084c0000000ull +#define V4_TOP_SP_10_BASE 0x00808280000000ull +#define V4_TOP_SP_11_BASE 0x008082c0000000ull +#define V4_TOP_SP_12_BASE 0x00808300000000ull +#define V4_TOP_SP_13_BASE 0x00808340000000ull +#define V4_TOP_SP_14_BASE 0x00808380000000ull +#define V4_TOP_SP_15_BASE 0x008083c0000000ull +#define V4_TOP_SP_16_BASE 0x00808400000000ull +#define V4_TOP_SP_17_BASE 0x00808440000000ull +#define V4_TOP_SP_18_BASE 0x00808480000000ull +#define V4_TOP_SP_19_BASE 0x008084c0000000ull +#define V4_TOP_SP_DIST (V4_TOP_SP_1_BASE - V4_TOP_SP_0_BASE) + +#define V4_PCIE_BAR0_TOP_SP_0_OFFSET 0x0000000f0000000ull +#define V4_PCIE_BAR0_TOP_SP_10_OFFSET 0x0000000f2800000ull +#define V4_PCIE_BAR0_TOP_SP_0_SIZE 0x000000000400000ull + +#define V4_MMAP_TPB_COUNT 8 +#define V4_NUM_DMA_ENGINES_PER_TPB 16 + +#define V4_D2H_0_IDX 128 +#define V4_H2D_0_IDX 129 +#define V4_D2H_1_IDX 130 +#define V4_H2D_1_IDX 131 + +#define V4_APB_SE_0_BASE 0x00001000000000ull +#define V4_APB_SE_1_BASE 0x00005000000000ull +#define V4_APB_SE_2_BASE 0x00801000000000ull +#define V4_APB_SE_3_BASE 0x00805000000000ull +#define V4_APB_SDMA_DIST 0x00000000100000ull + +#define V4_APB_SE_0_SDMA_0_BASE 0x00001002000000ull +#define V4_APB_SE_1_SDMA_0_BASE 0x00005004000000ull +#define V4_APB_SE_2_SDMA_0_BASE 0x00801002000000ull +#define 
V4_APB_SE_3_SDMA_0_BASE 0x00805004000000ull +#define V4_APB_SDMA_MISC_OFFSET 0x00000000040000ull + +#define V4_APB_IO_0_D2H_UDMA_BASE 0x00008006800000ull +#define V4_APB_IO_0_H2D_UDMA_BASE 0x00008006900000ull +#define V4_APB_IO_1_D2H_UDMA_BASE 0x00808006800000ull +#define V4_APB_IO_1_H2D_UDMA_BASE 0x00808006900000ull + +#define V4_APB_SENG_0_SDMA_0_NOTIFIC_RELBASE 0x00000000001000ull +#define V4_APB_IO_0_SE_0_RELBASE 0x00000006000000ull +#define V4_APB_IO_0_SE_1_RELBASE 0x00000006400000ull +#define V4_APB_IO_1_SE_0_RELBASE 0x00000006000000ull +#define V4_APB_IO_1_SE_1_RELBASE 0x00000006400000ull +#define V4_APB_IO_0_SE_0_TPB_0_SIZE 0x00000000180000ull +#define V4_APB_IO_0_SE_0_TPB_TOP_RELBASE 0x00000000000000ull +#define V4_APB_IO_0_SE_0_TPB_TOP_NOTIFIC_RELBASE 0x00000000001000ull +#define V4_APB_IO_0_SE_0_TPB_NOTIFIC_SIZE 0x00000000001000ull +#define V4_APB_SENG_0_SDMA_0_APP_RELBASE 0x00000000000000ull + +#define V4_APB_IO_0_USER_IO_TOP_SP_0_RELBASE 0x00000000200000ull +#define V4_APB_IO_0_USER_IO_TOP_SP_0_SIZE 0x00000000040000ull +#define V4_APB_IO_0_USER_IO_TOP_SP_0_NOTIFIC_RELBASE 0x00000000000000ull + +#define V4_MMAP_BAR0_APB_IO_0_MISC_RAM_OFFSET V4_PCIE_BAR0_APB_IO_0_OFFSET + V4_APB_IO_0_USER_IO_RELBASE + V4_APB_IO_0_USER_IO_MISC_RAM_RELBASE + +#define V4_APB_IO_0_USER_SE_0_RESERVED2_RELBASE 0x00000006300000ull + +#define V4_APB_IO_0_USER_SE_0_TPB_0_PE_SEQ_CLUSTER_HOST_VISIBLE_BASE 0x00008006141000ull +#define V4_APB_IO_0_USER_SE_0_TPB_1_PE_SEQ_CLUSTER_HOST_VISIBLE_BASE 0x000080062c1000ull +#define V4_APB_IO_0_USER_SE_1_TPB_0_PE_SEQ_CLUSTER_HOST_VISIBLE_BASE 0x00008006541000ull +#define V4_APB_IO_0_USER_SE_1_TPB_1_PE_SEQ_CLUSTER_HOST_VISIBLE_BASE 0x000080066c1000ull +#define V4_APB_IO_1_USER_SE_0_TPB_0_PE_SEQ_CLUSTER_HOST_VISIBLE_BASE 0x00808006141000ull +#define V4_APB_IO_1_USER_SE_0_TPB_1_PE_SEQ_CLUSTER_HOST_VISIBLE_BASE 0x008080062c1000ull +#define V4_APB_IO_1_USER_SE_1_TPB_0_PE_SEQ_CLUSTER_HOST_VISIBLE_BASE 0x00808006541000ull +#define 
V4_APB_IO_1_USER_SE_1_TPB_1_PE_SEQ_CLUSTER_HOST_VISIBLE_BASE 0x008080066c1000ull + +#define V4_TPB_ARR_SEQ_QUEUE_PERF_BASE_OFFSET 0x00000000000100ull +#define V4_TPB_ARR_SEQ_QUEUE_PERF_SIZE 0x00000000000030ull +#define V4_TPB_ARR_SEQ_QUEUE_PERF_MATMUL_ACTIVE_CYCLE_CNT_LSB_OFFSET 0x00000000000000ull +#define V4_TPB_ARR_SEQ_QUEUE_PERF_WL_ACTIVE_CYCLE_CNT_LSB_OFFSET 0x00000000000008ull +#define V4_TPB_ARR_SEQ_QUEUE_PERF_IDLE_CYCLE_CNT_LSB_OFFSET 0x00000000000010ull + +#endif diff --git a/v4/neuron_dhal_v4.c b/v4/neuron_dhal_v4.c new file mode 100644 index 0000000..798719b --- /dev/null +++ b/v4/neuron_dhal_v4.c @@ -0,0 +1,468 @@ +// SPDX-License-Identifier: GPL-2.0 +/* +* Copyright 2025, Amazon.com, Inc. or its affiliates. All Rights Reserved +*/ + +#define pr_fmt(fmt) "%s:%s: " fmt, KBUILD_MODNAME, __func__ + +#include +#include + +#include "address_map.h" +#include "../neuron_dhal.h" +#include "../neuron_reset.h" +#include "../neuron_arch.h" +#include "../neuron_cdev.h" +#include "../neuron_pci.h" +#include "../v3/neuron_pelect.h" + + +// TOP SP addresses are sparse on chip adjust to accommodate the table macro +// +#define V4_TOP_SP_GRP1_BASE V4_TOP_SP_0_BASE +#define V4_TOP_SP_GRP2_BASE (V4_TOP_SP_10_BASE - 8 * V4_TOP_SP_DIST) + +#define V4_PCIE_BAR0_TOP_SP_GRP1_OFFSET V4_PCIE_BAR0_TOP_SP_0_OFFSET +#define V4_PCIE_BAR0_TOP_SP_GRP2_OFFSET (V4_PCIE_BAR0_TOP_SP_10_OFFSET - 8 * V4_TOP_SP_SIZE) + +struct neuron_dm_special_mmap_ent dm_mmap_special_v4[] = { + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TPB, 0, NEURON_DM_RESOURCE_SEMAPHORE, V4_MMAP_TPB_0_BASE, V4_PCIE_BAR0_TPB_0_OFFSET, V4_PCIE_BAR0_TPB_DIST, V4_PCIE_BAR0_TPB_SIZE, V4_MMAP_NC_EVENT_OFFSET, V4_MMAP_NC_SEMA_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TPB, 1, NEURON_DM_RESOURCE_SEMAPHORE, V4_MMAP_TPB_0_BASE, V4_PCIE_BAR0_TPB_0_OFFSET, V4_PCIE_BAR0_TPB_DIST, V4_PCIE_BAR0_TPB_SIZE, V4_MMAP_NC_EVENT_OFFSET, V4_MMAP_NC_SEMA_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TPB, 2, NEURON_DM_RESOURCE_SEMAPHORE, 
V4_MMAP_TPB_0_BASE, V4_PCIE_BAR0_TPB_0_OFFSET, V4_PCIE_BAR0_TPB_DIST, V4_PCIE_BAR0_TPB_SIZE, V4_MMAP_NC_EVENT_OFFSET, V4_MMAP_NC_SEMA_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TPB, 3, NEURON_DM_RESOURCE_SEMAPHORE, V4_MMAP_TPB_0_BASE, V4_PCIE_BAR0_TPB_0_OFFSET, V4_PCIE_BAR0_TPB_DIST, V4_PCIE_BAR0_TPB_SIZE, V4_MMAP_NC_EVENT_OFFSET, V4_MMAP_NC_SEMA_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TPB, 4, NEURON_DM_RESOURCE_SEMAPHORE, V4_MMAP_TPB_0_BASE, V4_PCIE_BAR0_TPB_0_OFFSET, V4_PCIE_BAR0_TPB_DIST, V4_PCIE_BAR0_TPB_SIZE, V4_MMAP_NC_EVENT_OFFSET, V4_MMAP_NC_SEMA_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TPB, 5, NEURON_DM_RESOURCE_SEMAPHORE, V4_MMAP_TPB_0_BASE, V4_PCIE_BAR0_TPB_0_OFFSET, V4_PCIE_BAR0_TPB_DIST, V4_PCIE_BAR0_TPB_SIZE, V4_MMAP_NC_EVENT_OFFSET, V4_MMAP_NC_SEMA_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TPB, 6, NEURON_DM_RESOURCE_SEMAPHORE, V4_MMAP_TPB_0_BASE, V4_PCIE_BAR0_TPB_0_OFFSET, V4_PCIE_BAR0_TPB_DIST, V4_PCIE_BAR0_TPB_SIZE, V4_MMAP_NC_EVENT_OFFSET, V4_MMAP_NC_SEMA_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TPB, 7, NEURON_DM_RESOURCE_SEMAPHORE, V4_MMAP_TPB_0_BASE, V4_PCIE_BAR0_TPB_0_OFFSET, V4_PCIE_BAR0_TPB_DIST, V4_PCIE_BAR0_TPB_SIZE, V4_MMAP_NC_EVENT_OFFSET, V4_MMAP_NC_SEMA_SIZE, 0), + + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TPB, 0, NEURON_DM_RESOURCE_SBUF, V4_MMAP_TPB_0_BASE, V4_PCIE_BAR0_TPB_0_OFFSET, V4_PCIE_BAR0_TPB_DIST, V4_PCIE_BAR0_TPB_SIZE, 0, V4_PCIE_BAR0_TPB_SBUF_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TPB, 1, NEURON_DM_RESOURCE_SBUF, V4_MMAP_TPB_0_BASE, V4_PCIE_BAR0_TPB_0_OFFSET, V4_PCIE_BAR0_TPB_DIST, V4_PCIE_BAR0_TPB_SIZE, 0, V4_PCIE_BAR0_TPB_SBUF_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TPB, 2, NEURON_DM_RESOURCE_SBUF, V4_MMAP_TPB_0_BASE, V4_PCIE_BAR0_TPB_0_OFFSET, V4_PCIE_BAR0_TPB_DIST, V4_PCIE_BAR0_TPB_SIZE, 0, V4_PCIE_BAR0_TPB_SBUF_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TPB, 3, NEURON_DM_RESOURCE_SBUF, V4_MMAP_TPB_0_BASE, V4_PCIE_BAR0_TPB_0_OFFSET, V4_PCIE_BAR0_TPB_DIST, 
V4_PCIE_BAR0_TPB_SIZE, 0, V4_PCIE_BAR0_TPB_SBUF_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TPB, 4, NEURON_DM_RESOURCE_SBUF, V4_MMAP_TPB_0_BASE, V4_PCIE_BAR0_TPB_0_OFFSET, V4_PCIE_BAR0_TPB_DIST, V4_PCIE_BAR0_TPB_SIZE, 0, V4_PCIE_BAR0_TPB_SBUF_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TPB, 5, NEURON_DM_RESOURCE_SBUF, V4_MMAP_TPB_0_BASE, V4_PCIE_BAR0_TPB_0_OFFSET, V4_PCIE_BAR0_TPB_DIST, V4_PCIE_BAR0_TPB_SIZE, 0, V4_PCIE_BAR0_TPB_SBUF_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TPB, 6, NEURON_DM_RESOURCE_SBUF, V4_MMAP_TPB_0_BASE, V4_PCIE_BAR0_TPB_0_OFFSET, V4_PCIE_BAR0_TPB_DIST, V4_PCIE_BAR0_TPB_SIZE, 0, V4_PCIE_BAR0_TPB_SBUF_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TPB, 7, NEURON_DM_RESOURCE_SBUF, V4_MMAP_TPB_0_BASE, V4_PCIE_BAR0_TPB_0_OFFSET, V4_PCIE_BAR0_TPB_DIST, V4_PCIE_BAR0_TPB_SIZE, 0, V4_PCIE_BAR0_TPB_SBUF_SIZE, 0), + + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TOPSP, 0, NEURON_DM_RESOURCE_SEMAPHORE, V4_TOP_SP_GRP1_BASE, V4_PCIE_BAR0_TOP_SP_GRP1_OFFSET, V4_TOP_SP_DIST, V4_TOP_SP_SIZE, 0, V4_MMAP_NC_SEMA_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TOPSP, 1, NEURON_DM_RESOURCE_SEMAPHORE, V4_TOP_SP_GRP1_BASE, V4_PCIE_BAR0_TOP_SP_GRP1_OFFSET, V4_TOP_SP_DIST, V4_TOP_SP_SIZE, 0, V4_MMAP_NC_SEMA_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TOPSP, 2, NEURON_DM_RESOURCE_SEMAPHORE, V4_TOP_SP_GRP1_BASE, V4_PCIE_BAR0_TOP_SP_GRP1_OFFSET, V4_TOP_SP_DIST, V4_TOP_SP_SIZE, 0, V4_MMAP_NC_SEMA_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TOPSP, 3, NEURON_DM_RESOURCE_SEMAPHORE, V4_TOP_SP_GRP1_BASE, V4_PCIE_BAR0_TOP_SP_GRP1_OFFSET, V4_TOP_SP_DIST, V4_TOP_SP_SIZE, 0, V4_MMAP_NC_SEMA_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TOPSP, 4, NEURON_DM_RESOURCE_SEMAPHORE, V4_TOP_SP_GRP1_BASE, V4_PCIE_BAR0_TOP_SP_GRP1_OFFSET, V4_TOP_SP_DIST, V4_TOP_SP_SIZE, 0, V4_MMAP_NC_SEMA_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TOPSP, 5, NEURON_DM_RESOURCE_SEMAPHORE, V4_TOP_SP_GRP1_BASE, V4_PCIE_BAR0_TOP_SP_GRP1_OFFSET, V4_TOP_SP_DIST, V4_TOP_SP_SIZE, 0, 
V4_MMAP_NC_SEMA_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TOPSP, 6, NEURON_DM_RESOURCE_SEMAPHORE, V4_TOP_SP_GRP1_BASE, V4_PCIE_BAR0_TOP_SP_GRP1_OFFSET, V4_TOP_SP_DIST, V4_TOP_SP_SIZE, 0, V4_MMAP_NC_SEMA_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TOPSP, 7, NEURON_DM_RESOURCE_SEMAPHORE, V4_TOP_SP_GRP1_BASE, V4_PCIE_BAR0_TOP_SP_GRP1_OFFSET, V4_TOP_SP_DIST, V4_TOP_SP_SIZE, 0, V4_MMAP_NC_SEMA_SIZE, 0), + + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TOPSP, 8, NEURON_DM_RESOURCE_SEMAPHORE, V4_TOP_SP_GRP2_BASE, V4_PCIE_BAR0_TOP_SP_GRP2_OFFSET, V4_TOP_SP_DIST, V4_TOP_SP_SIZE, 0, V4_MMAP_NC_SEMA_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TOPSP, 9, NEURON_DM_RESOURCE_SEMAPHORE, V4_TOP_SP_GRP2_BASE, V4_PCIE_BAR0_TOP_SP_GRP2_OFFSET, V4_TOP_SP_DIST, V4_TOP_SP_SIZE, 0, V4_MMAP_NC_SEMA_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TOPSP, 10, NEURON_DM_RESOURCE_SEMAPHORE, V4_TOP_SP_GRP2_BASE, V4_PCIE_BAR0_TOP_SP_GRP2_OFFSET, V4_TOP_SP_DIST, V4_TOP_SP_SIZE, 0, V4_MMAP_NC_SEMA_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TOPSP, 11, NEURON_DM_RESOURCE_SEMAPHORE, V4_TOP_SP_GRP2_BASE, V4_PCIE_BAR0_TOP_SP_GRP2_OFFSET, V4_TOP_SP_DIST, V4_TOP_SP_SIZE, 0, V4_MMAP_NC_SEMA_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TOPSP, 12, NEURON_DM_RESOURCE_SEMAPHORE, V4_TOP_SP_GRP2_BASE, V4_PCIE_BAR0_TOP_SP_GRP2_OFFSET, V4_TOP_SP_DIST, V4_TOP_SP_SIZE, 0, V4_MMAP_NC_SEMA_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TOPSP, 13, NEURON_DM_RESOURCE_SEMAPHORE, V4_TOP_SP_GRP2_BASE, V4_PCIE_BAR0_TOP_SP_GRP2_OFFSET, V4_TOP_SP_DIST, V4_TOP_SP_SIZE, 0, V4_MMAP_NC_SEMA_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TOPSP, 14, NEURON_DM_RESOURCE_SEMAPHORE, V4_TOP_SP_GRP2_BASE, V4_PCIE_BAR0_TOP_SP_GRP2_OFFSET, V4_TOP_SP_DIST, V4_TOP_SP_SIZE, 0, V4_MMAP_NC_SEMA_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TOPSP, 15, NEURON_DM_RESOURCE_SEMAPHORE, V4_TOP_SP_GRP2_BASE, V4_PCIE_BAR0_TOP_SP_GRP2_OFFSET, V4_TOP_SP_DIST, V4_TOP_SP_SIZE, 0, V4_MMAP_NC_SEMA_SIZE, 0), + + DM_SPECIAL_MM_ENT_( 
NEURON_DM_BLOCK_TOPSP, 0, NEURON_DM_RESOURCE_ALL, V4_TOP_SP_GRP1_BASE, V4_PCIE_BAR0_TOP_SP_GRP1_OFFSET, V4_TOP_SP_DIST, V4_TOP_SP_SIZE, 0, V4_TOP_SP_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TOPSP, 1, NEURON_DM_RESOURCE_ALL, V4_TOP_SP_GRP1_BASE, V4_PCIE_BAR0_TOP_SP_GRP1_OFFSET, V4_TOP_SP_DIST, V4_TOP_SP_SIZE, 0, V4_TOP_SP_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TOPSP, 2, NEURON_DM_RESOURCE_ALL, V4_TOP_SP_GRP1_BASE, V4_PCIE_BAR0_TOP_SP_GRP1_OFFSET, V4_TOP_SP_DIST, V4_TOP_SP_SIZE, 0, V4_TOP_SP_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TOPSP, 3, NEURON_DM_RESOURCE_ALL, V4_TOP_SP_GRP1_BASE, V4_PCIE_BAR0_TOP_SP_GRP1_OFFSET, V4_TOP_SP_DIST, V4_TOP_SP_SIZE, 0, V4_TOP_SP_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TOPSP, 4, NEURON_DM_RESOURCE_ALL, V4_TOP_SP_GRP1_BASE, V4_PCIE_BAR0_TOP_SP_GRP1_OFFSET, V4_TOP_SP_DIST, V4_TOP_SP_SIZE, 0, V4_TOP_SP_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TOPSP, 5, NEURON_DM_RESOURCE_ALL, V4_TOP_SP_GRP1_BASE, V4_PCIE_BAR0_TOP_SP_GRP1_OFFSET, V4_TOP_SP_DIST, V4_TOP_SP_SIZE, 0, V4_TOP_SP_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TOPSP, 6, NEURON_DM_RESOURCE_ALL, V4_TOP_SP_GRP1_BASE, V4_PCIE_BAR0_TOP_SP_GRP1_OFFSET, V4_TOP_SP_DIST, V4_TOP_SP_SIZE, 0, V4_TOP_SP_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TOPSP, 7, NEURON_DM_RESOURCE_ALL, V4_TOP_SP_GRP1_BASE, V4_PCIE_BAR0_TOP_SP_GRP1_OFFSET, V4_TOP_SP_DIST, V4_TOP_SP_SIZE, 0, V4_TOP_SP_SIZE, 0), + + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TOPSP, 8, NEURON_DM_RESOURCE_ALL, V4_TOP_SP_GRP2_BASE, V4_PCIE_BAR0_TOP_SP_GRP2_OFFSET, V4_TOP_SP_DIST, V4_TOP_SP_SIZE, 0, V4_TOP_SP_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TOPSP, 9, NEURON_DM_RESOURCE_ALL, V4_TOP_SP_GRP2_BASE, V4_PCIE_BAR0_TOP_SP_GRP2_OFFSET, V4_TOP_SP_DIST, V4_TOP_SP_SIZE, 0, V4_TOP_SP_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TOPSP, 10, NEURON_DM_RESOURCE_ALL, V4_TOP_SP_GRP2_BASE, V4_PCIE_BAR0_TOP_SP_GRP2_OFFSET, V4_TOP_SP_DIST, V4_TOP_SP_SIZE, 0, V4_TOP_SP_SIZE, 0), + DM_SPECIAL_MM_ENT_( 
NEURON_DM_BLOCK_TOPSP, 11, NEURON_DM_RESOURCE_ALL, V4_TOP_SP_GRP2_BASE, V4_PCIE_BAR0_TOP_SP_GRP2_OFFSET, V4_TOP_SP_DIST, V4_TOP_SP_SIZE, 0, V4_TOP_SP_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TOPSP, 12, NEURON_DM_RESOURCE_ALL, V4_TOP_SP_GRP2_BASE, V4_PCIE_BAR0_TOP_SP_GRP2_OFFSET, V4_TOP_SP_DIST, V4_TOP_SP_SIZE, 0, V4_TOP_SP_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TOPSP, 13, NEURON_DM_RESOURCE_ALL, V4_TOP_SP_GRP2_BASE, V4_PCIE_BAR0_TOP_SP_GRP2_OFFSET, V4_TOP_SP_DIST, V4_TOP_SP_SIZE, 0, V4_TOP_SP_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TOPSP, 14, NEURON_DM_RESOURCE_ALL, V4_TOP_SP_GRP2_BASE, V4_PCIE_BAR0_TOP_SP_GRP2_OFFSET, V4_TOP_SP_DIST, V4_TOP_SP_SIZE, 0, V4_TOP_SP_SIZE, 0), + DM_SPECIAL_MM_ENT_( NEURON_DM_BLOCK_TOPSP, 15, NEURON_DM_RESOURCE_ALL, V4_TOP_SP_GRP2_BASE, V4_PCIE_BAR0_TOP_SP_GRP2_OFFSET, V4_TOP_SP_DIST, V4_TOP_SP_SIZE, 0, V4_TOP_SP_SIZE, 0), + + {.block = NEURON_DM_BLOCK_HBM, .block_id = 0, .resource = NEURON_DM_RESOURCE_DMEM, .offset = V4_HBM_0_BASE, .size = V4_HBM_ACTIVE_SIZE, .bar_offset = V4_HBM_SIZE * 0, .bar_num = 4}, + {.block = NEURON_DM_BLOCK_HBM, .block_id = 1, .resource = NEURON_DM_RESOURCE_DMEM, .offset = V4_HBM_1_BASE, .size = V4_HBM_ACTIVE_SIZE, .bar_offset = V4_HBM_SIZE * 1, .bar_num = 4}, + {.block = NEURON_DM_BLOCK_HBM, .block_id = 2, .resource = NEURON_DM_RESOURCE_DMEM, .offset = V4_HBM_2_BASE, .size = V4_HBM_ACTIVE_SIZE, .bar_offset = V4_HBM_SIZE * 2, .bar_num = 4}, + {.block = NEURON_DM_BLOCK_HBM, .block_id = 3, .resource = NEURON_DM_RESOURCE_DMEM, .offset = V4_HBM_3_BASE, .size = V4_HBM_ACTIVE_SIZE, .bar_offset = V4_HBM_SIZE * 3, .bar_num = 4}, + + {NEURON_DM_BLOCK_INVALID, 0, 0, 0, 0, 0}, +}; + +struct ncdev_mem_region ncdev_mem_regions_v4[] = { + { V4_MMAP_TPB_0_BASE, V4_MMAP_NC_SIZE }, // FIXME this is inefficient this may need a routine to slice and range check + { V4_MMAP_TPB_1_BASE, V4_MMAP_NC_SIZE }, + { V4_MMAP_TPB_2_BASE, V4_MMAP_NC_SIZE }, + { V4_MMAP_TPB_3_BASE, V4_MMAP_NC_SIZE }, + { 
V4_MMAP_TPB_4_BASE, V4_MMAP_NC_SIZE }, + { V4_MMAP_TPB_5_BASE, V4_MMAP_NC_SIZE }, + { V4_MMAP_TPB_6_BASE, V4_MMAP_NC_SIZE }, + { V4_MMAP_TPB_7_BASE, V4_MMAP_NC_SIZE }, + { V4_TOP_SP_0_BASE, V4_TOP_SP_SIZE }, // could flatten TOP_SP + { V4_TOP_SP_1_BASE, V4_TOP_SP_SIZE }, + { V4_TOP_SP_2_BASE, V4_TOP_SP_SIZE }, + { V4_TOP_SP_3_BASE, V4_TOP_SP_SIZE }, + { V4_TOP_SP_4_BASE, V4_TOP_SP_SIZE }, + { V4_TOP_SP_5_BASE, V4_TOP_SP_SIZE }, + { V4_TOP_SP_6_BASE, V4_TOP_SP_SIZE }, + { V4_TOP_SP_7_BASE, V4_TOP_SP_SIZE }, + { V4_TOP_SP_8_BASE, V4_TOP_SP_SIZE }, + { V4_TOP_SP_9_BASE, V4_TOP_SP_SIZE }, + { V4_TOP_SP_10_BASE, V4_TOP_SP_SIZE }, + { V4_TOP_SP_11_BASE, V4_TOP_SP_SIZE }, + { V4_TOP_SP_12_BASE, V4_TOP_SP_SIZE }, + { V4_TOP_SP_13_BASE, V4_TOP_SP_SIZE }, + { V4_TOP_SP_14_BASE, V4_TOP_SP_SIZE }, + { V4_TOP_SP_15_BASE, V4_TOP_SP_SIZE }, + { V4_TOP_SP_16_BASE, V4_TOP_SP_SIZE }, + { V4_TOP_SP_17_BASE, V4_TOP_SP_SIZE }, + { V4_TOP_SP_18_BASE, V4_TOP_SP_SIZE }, + { V4_TOP_SP_19_BASE, V4_TOP_SP_SIZE }, + { V4_HBM_0_BASE, V4_HBM_ACTIVE_SIZE }, + { V4_HBM_1_BASE, V4_HBM_ACTIVE_SIZE }, + { V4_HBM_2_BASE, V4_HBM_ACTIVE_SIZE }, + { V4_HBM_3_BASE, V4_HBM_ACTIVE_SIZE }, + { V4_PREPROC_0_BASE, V4_PREPROC_SIZE}, + { V4_PREPROC_1_BASE, V4_PREPROC_SIZE}, + { V4_PREPROC_2_BASE, V4_PREPROC_SIZE}, + { V4_PREPROC_3_BASE, V4_PREPROC_SIZE}, + { NCDEV_MEM_REGION_INVALID, 0 }, +}; + + +u32 npe_neighbor_eng_ids_v4[2][2] = +{ + {40, 72}, // Left + {8, 104} // Right +}; + +static int ndhal_register_funcs_trn3(void) { + if (!ndhal) { + pr_err("ndhal is null. 
Can't register functions for trn3."); + return -EINVAL; + } + ndhal->ndhal_sysfs_metrics.arch_nd_type_suffix = "v4"; + ndhal->ndhal_sysfs_metrics.arch_nc_type_suffix = "v4"; + ndhal->ndhal_sysfs_metrics.arch_instance_suffix = "Trn3"; + ndhal->ndhal_sysfs_metrics.arch_device_name_suffix = "Trainium3"; + return 0; +} + +/* Instance names + */ +#define NEURON_TRN3PDS_INSTANCE_NAME "trn3s.48xlarge" +#define NEURON_TRN3PDS0_INSTANCE_NAME "trn3-dev0.48xlarge" + +static enum neuron_platform_type ndhal_platform_type_v4(void) +{ + enum neuron_platform_type platform_type = NEURON_PLATFORM_TYPE_INVALID; + char buf[128]; + + if (narch_get_instance_type_name(buf, sizeof(buf))) goto done; + if ((strncmp(buf, NEURON_TRN3PDS_INSTANCE_NAME, sizeof(NEURON_TRN3PDS_INSTANCE_NAME)-1) == 0)) { + platform_type = NEURON_PLATFORM_TYPE_PDS; + } else if ((strncmp(buf, NEURON_TRN3PDS0_INSTANCE_NAME, sizeof(NEURON_TRN3PDS_INSTANCE_NAME)-1) == 0)) { + platform_type = NEURON_PLATFORM_TYPE_PDS; + } else { + platform_type = NEURON_PLATFORM_TYPE_STD; + } + +done: + return platform_type; +} + +static bool ndhal_instance_type_3xl(void) +{ + static bool instance_type_is_3xl = false; +#define NEURON_TRN3PD98_3XL_INSTANCE_NAME "trn3pd98.3xlarge" + char buf[128]; + if (narch_get_instance_type_name(buf, sizeof(buf))) goto done; + if (strncmp(buf, NEURON_TRN3PD98_3XL_INSTANCE_NAME, sizeof(NEURON_TRN3PD98_3XL_INSTANCE_NAME)-1) == 0) { + instance_type_is_3xl = true; + } + +done: + return instance_type_is_3xl; +} + + +/* Memory Pool Functions */ +/** + * mpset_set_dram_and_mpset_info() + * - set the address and size of device dram + * - set mpset's num_channels and number of regions in the device pool + * + * @param mpset: pointer to mpset + * @param device_dram_addr: DRAM Channel 0 and 1's addresses + * @param device_dram_size: DRAM Channel 0 and 1's sizes + */ +static void mpset_set_dram_and_mpset_info_v4(struct mempool_set *mpset, u64 *device_dram_addr, u64 *device_dram_size) +{ + mpset->num_channels = 
V4_MAX_DRAM_CHANNELS; + mpset->mp_device_num_regions = 1; + device_dram_addr[0] = V4_HBM_0_BASE; + device_dram_addr[1] = V4_HBM_1_BASE; + device_dram_addr[2] = V4_HBM_2_BASE; + device_dram_addr[3] = V4_HBM_3_BASE; + + if (narch_is_qemu()) { + // Allow qemu setups to dynamically allocate their HBM sizes + const u64 msize = ndhal->ndhal_pci.dram_bar_size / 4; + device_dram_size[0] = msize; + device_dram_size[1] = msize; + device_dram_size[2] = msize; + device_dram_size[3] = msize; + + u32 mem_regions = sizeof(dm_mmap_special_v4) / sizeof(dm_mmap_special_v4[0]); + int i = 0; + for (; i < mem_regions; ++i) { + if ((dm_mmap_special_v4[i].offset == V4_HBM_0_BASE) || + (dm_mmap_special_v4[i].offset == V4_HBM_1_BASE) || + (dm_mmap_special_v4[i].offset == V4_HBM_2_BASE) || + (dm_mmap_special_v4[i].offset == V4_HBM_3_BASE)) { + dm_mmap_special_v4[i].size = msize; + } + } + pr_info("overriding hbm size to %llu bytes", msize); + } else { + device_dram_size[0] = V4_HBM_ACTIVE_SIZE; + device_dram_size[1] = V4_HBM_ACTIVE_SIZE; + device_dram_size[2] = V4_HBM_ACTIVE_SIZE; + device_dram_size[3] = V4_HBM_ACTIVE_SIZE; + } + int i; + for (i = 0; i < mpset->num_channels; i++) { + ndhal->ndhal_mpset.device_dram_end_addr[i] = device_dram_addr[i] + device_dram_size[i]; + } +} + + +/* Memory Map Functions */ +/** + * mmap_get_bar4_offset() - calculate the offset of BAR4 + * + * @param start_addr: start address + * @param size: size of memory + * @param offset: offset of BAR4 + * @return int: 0 on success; negative on failure + */ +static int mmap_get_bar4_offset_v4(u64 start_addr, u64 size, u64 *offset) +{ + u64 hbm_dist = narch_is_qemu() ? 
(ndhal->ndhal_pci.dram_bar_size / 4) : V4_HBM_SIZE; + + if (start_addr >= V4_HBM_0_BASE && start_addr + size < V4_HBM_0_BASE + V4_HBM_ACTIVE_SIZE) + *offset = start_addr; + else if (start_addr >= V4_HBM_1_BASE && start_addr + size < V4_HBM_1_BASE + V4_HBM_ACTIVE_SIZE) + *offset = start_addr - V4_HBM_1_BASE + hbm_dist; + else if (start_addr >= V4_HBM_2_BASE && start_addr + size < V4_HBM_2_BASE + V4_HBM_ACTIVE_SIZE) + *offset = start_addr - V4_HBM_2_BASE + hbm_dist * 2; + else if (start_addr >= V4_HBM_3_BASE && start_addr + size < V4_HBM_3_BASE + V4_HBM_ACTIVE_SIZE) + *offset = start_addr - V4_HBM_3_BASE + hbm_dist * 3; + else + return -EINVAL; + return 0; +} + +extern int dup_helper_enable; +static atomic_t dup_rid_cnt = ATOMIC_INIT(0); // count of duplicate routing IDs encountered +static int neuron_pci_handle_dup_routing_id(void) +{ + int ret = -ENODEV; + int dup_cnt; + char cmd[256]; + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 14, 0) + dup_cnt = atomic_fetch_add(1, &dup_rid_cnt); +#else + dup_cnt = atomic_add_return(1, &dup_rid_cnt) - 1; +#endif + + // If this is the first dup encounted, unload the driver + if ((dup_cnt == 0) && dup_helper_enable) { + pr_err("scheduling unload of %s due to duplicate routing id\n", module_name(THIS_MODULE)); + + int n = snprintf(cmd, sizeof(cmd), "sleep 10;/sbin/modprobe -r %s", module_name(THIS_MODULE)); + if (n > sizeof(cmd)) { + pr_err("unable to schedule driver unload cmd buffer len exceeded\n"); + return -EINVAL; + } + char *argv[] = { "/bin/sh", + "-c", + cmd, + NULL}; + static char *envp[] = { "HOME=/", + "TERM=linux", + "PATH=/sbin:/usr/sbin:/bin:/usr/bin", + NULL}; + + ret = call_usermodehelper( argv[0], argv, envp, UMH_WAIT_EXEC); + if (ret) + pr_err("unable to schedule driver unload. Error: %d\n", ret); + } + + return ret; +} + +// for V4 rename Neuron devices for better customer experience. 
+// see internal documentation: TRN2-Discovery +// map routing id to user id: +static const u32 v4_torus_routing_id_to_user_id[] = { + 0, 3, 4, 7, + 12, 15, 8, 11, + 1, 2, 5, 6, + 13, 14, 9, 10 }; + +// map routing id to user id for trn2pds instance type. +// the only hard rule this map needs to follow is +// rid (i*2) and rid (i*2)+1 map to did (n*2) and did (n*2)+1 +// since rid (i*2) and rid (i*2)+1 are on the same JBOG. +static const u32 v4_pds_routing_id_to_user_id[] = { + 0, 1, + 2, 3, + 4, 5, + 6, 7, + 8, 9, + 10, 11, + 12, 13, + 14, 15 }; + +#define V4_ROUTING_ID_TBL_SZ (sizeof(v4_torus_routing_id_to_user_id) / sizeof(v4_torus_routing_id_to_user_id[0])) + +static u32 neuron_pci_routing_id_to_user_id(u32 routing_id) +{ + if (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_PDS) { + return v4_pds_routing_id_to_user_id[ routing_id % V4_ROUTING_ID_TBL_SZ]; + } + return v4_torus_routing_id_to_user_id[routing_id % V4_ROUTING_ID_TBL_SZ]; +} + +/** + * neuron_pci_get_device_id() - get device id from pacific and set nd->device_index + * + * @param dev: PCI device + * @param nd: neuron device + * @return int: 0 on success, otherwise on failure + */ +static int neuron_pci_get_device_id_v4(struct neuron_device *nd, struct pci_dev *dev) +{ + int ret = 0; + int i; + u32 routing_id = (u32)-1; + u32 routing_id_max = MAX_NEURON_DEVICE_COUNT; + + if (ndhal_instance_type_3xl()) { + // Temporarily auto-assign routing_id to 0 for 3xl instances, since they + // only have 1 device anyways + routing_id = 0; + } else { + // Poll the device id until the device is ready + for (i = 0; i < 20; i++) { + ret = fw_io_device_id_read(nd->npdev.bar0, &routing_id); + if (!ret && routing_id != 0xdeadbeef) { + break; + } + msleep(1000); + } + } + + if (ret) { + pr_err("Could not retrieve device index (read timeout)"); + return -ENODEV; + } + + if (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_PDS) { + u32 server_info = 0; + bool server_id_valid = 0; + u32 server_id = 0; + ret = 
fw_io_server_info_read(nd->npdev.bar0, &server_info); + if (ret) { + return -ENODEV; + } + + server_id_valid = (server_info >> 15) & 0x1; // TODO we probably need const shift value or macro + if (server_id_valid) { + server_id = server_info & 0x7fff; // TODO we probably need constant mask for this + } else { + pr_err("Could not retrieve valid server id, ret = %d\n", ret); + return -ENODEV; + } + ndhal->ndhal_arch.server_id = server_id; + routing_id_max = MAX_NEURON_DEVICE_COUNT * 2; + } + + // TODO - this should be a "valid routing_id check for TRN3 + if (routing_id < 0 || routing_id >= routing_id_max) { + pr_err("Invalid device index %u", routing_id); + return -ENODEV; + } + + nd->device_index = neuron_pci_routing_id_to_user_id(routing_id); + + pr_err("** BDF: %2.2x:%2.2x.%x => nd[%d] (routing id: %u)\n", dev->bus->number, PCI_SLOT(dev->devfn), PCI_FUNC(dev->devfn), nd->device_index, routing_id); + + // protection against duplicate IDs - doesn't provide 100% protection in multi-threaded device discovery + if (neuron_devices[nd->device_index] != NULL) { + pr_err("duplicate routing id %u found\n", routing_id); + neuron_pci_handle_dup_routing_id(); + return -ENODEV; + } + + return 0; +} + +/** + * ndhal_register_funcs_v4() - initialize the dhal for v4 chips + * + * This function only overrides the functions and + * constants that are different from v3 in v4. + */ +int ndhal_register_funcs_v4(void) { + int ret = 0; + + if (!ndhal) { + pr_err("ndhal is null. 
Can't register functions for V4."); + return -EINVAL; + } + + ndhal->ndhal_arch.platform_type = ndhal_platform_type_v4(); + ndhal->ndhal_pci.neuron_pci_get_device_id = neuron_pci_get_device_id_v4; + ndhal->ndhal_npe.npe_neighbor_eng_ids = npe_neighbor_eng_ids_v4; + ndhal->ndhal_mpset.mpset_set_dram_and_mpset_info = mpset_set_dram_and_mpset_info_v4; + ndhal->ndhal_mmap.dm_mmap_special = dm_mmap_special_v4; + ndhal->ndhal_mmap.mmap_get_bar4_offset = mmap_get_bar4_offset_v4; + ndhal->ndhal_cdev.ncdev_mem_regions = ncdev_mem_regions_v4; + + if (narch_is_emu()) { + // Temporarily disable resets on mariana emulation until pacific is ready + extern int no_reset; + no_reset = 1; + } + + // TODO initialization needs refactoring because V4 is piggybacking on V3 + // which risks double calling any hal init functions + // + if (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_ULTRASERVER) { + ret = npe_init(); + if (ret) { + pr_err("failed to initialize pod election on V4\n"); + return ret; + } + } else if (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_PDS) { + //TODO PDS + } + + switch (ndhal->pci_device_id) { + case TRN3_DEVICE_ID0: + case TRN3_DEVICE_ID1: + ret = ndhal_register_funcs_trn3(); + if (ret) { + pr_err("failed to register ndhal funcs for trn3.\n"); + return ret; + } + break; + default: + pr_err("Unknown HW architecture. Can't init neuron_dhal.\n"); + return -EINVAL; + } + + return ret; +} diff --git a/vc/neuron_dhal_vc.c b/vc/neuron_dhal_vc.c index 0f33e6c..e550d48 100644 --- a/vc/neuron_dhal_vc.c +++ b/vc/neuron_dhal_vc.c @@ -136,17 +136,16 @@ static int ntpb_pe_format_activity_stats_vc(struct neuron_device *nd, int nc_id, * ndhal_register_funcs_vc() - initialize the common dhal for all chips * */ -int ndhal_register_funcs_vc(void) { - int ret = 0; - - if (!ndhal) { - pr_err("ndhal is null. Can't register functions for VC."); - return -EINVAL; - } +int ndhal_register_funcs_vc(void) +{ + if (!ndhal) { + pr_err("ndhal is null. 
Can't register functions for VC."); + return -EINVAL; + } ndhal->ndhal_tpb.pe_format_activity_stats = ntpb_pe_format_activity_stats_vc; ndhal->ndhal_tpb.pe_get_counter_val = ntpb_pe_get_counter_val_vc; ndhal->ndhal_tpb.pe_get_row_grp_activity_counter_offset = ntpb_pe_get_row_grp_activity_counter_offset_vc; ndhal->ndhal_tpb.pe_get_fast_wl_cycle_cnt = ntpb_pe_get_fast_wl_cycle_cnt_vc; - return ret; -} \ No newline at end of file + return 0; +}