From 4bce1a19fcfac0c9f22c11278daa510546ccd3f2 Mon Sep 17 00:00:00 2001 From: Michael Tuexen Date: Sat, 4 Jan 2025 21:08:43 +0100 Subject: [PATCH 001/143] TCP BBR: remove code which is not needed rc_bbr_substate is a 3-bit unsigned int, so it can't be larger than or equal to 8. The wrap around already happens. No functional change intended. Reviewed by: rrs CID: 1523795 MFC after: 1 week Sponsored by: Netflix, Inc. Differential Revision: https://reviews.freebsd.org/D48320 --- sys/netinet/tcp_stacks/bbr.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/sys/netinet/tcp_stacks/bbr.c b/sys/netinet/tcp_stacks/bbr.c index ed7e07861ebdef..535766a0a1b004 100644 --- a/sys/netinet/tcp_stacks/bbr.c +++ b/sys/netinet/tcp_stacks/bbr.c @@ -10313,10 +10313,6 @@ bbr_substate_change(struct tcp_bbr *bbr, uint32_t cts, int32_t line, int dolog) bbr->r_ctl.bbr_smallest_srtt_state2 = bbr->r_ctl.bbr_smallest_srtt_this_state; } bbr->rc_bbr_substate++; - if (bbr->rc_bbr_substate >= BBR_SUBSTATE_COUNT) { - /* Cycle back to first state-> gain */ - bbr->rc_bbr_substate = 0; - } if (bbr_state_val(bbr) == BBR_SUB_GAIN) { /* * We enter the gain(5/4) cycle (possibly less if From 84e894ce1309b426aa5d1a20ec194401f35dc478 Mon Sep 17 00:00:00 2001 From: Michael Tuexen Date: Sat, 4 Jan 2025 21:11:26 +0100 Subject: [PATCH 002/143] TCP RACK: remove variable which is only initialized and not changed minslot is initialized to 0 and never changed. It is not clear to me under which condition minslot should be set to which value. Therefore, remove it and the code checking that it is not zero. No functional change intended. Reviewed by: rrs CID: 1523812 MFC after: 1 week Sponsored by: Netflix, Inc. 
Differential Revision: https://reviews.freebsd.org/D48321 --- sys/netinet/tcp_stacks/rack.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c index 6774acb9d5e62f..cc07253247609c 100644 --- a/sys/netinet/tcp_stacks/rack.c +++ b/sys/netinet/tcp_stacks/rack.c @@ -17456,7 +17456,6 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str { uint64_t srtt; int32_t slot = 0; - int32_t minslot = 0; int can_start_hw_pacing = 1; int err; int pace_one; @@ -17788,11 +17787,6 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str } } } - if (minslot && (minslot > slot)) { - rack_log_pacing_delay_calc(rack, minslot, slot, rack->r_ctl.crte->rate, bw_est, lentim, - 98, __LINE__, NULL, 0); - slot = minslot; - } done_w_hdwr: if (rack_limit_time_with_srtt && (rack->use_fixed_rate == 0) && From 0f7d8b71b45b0a86b25e1005e83140ee6cbdff45 Mon Sep 17 00:00:00 2001 From: Ed Maste Date: Thu, 2 Jan 2025 02:26:00 +0000 Subject: [PATCH 003/143] Makefile.inc1: Set DISTDIR in stagekernel target The distributekernel target expects DESTDIR and DISTDIR to be set. The stagekernel target invokes `make distributekernel`, and previously left DISTDIR unset, resulting in a path with a "//" component. Instead, set DISTDIR to . to make the way we're (ab)using the distributekernel target more explicit. Reviewed by: bapt Sponsored by: The FreeBSD Foundation Differential Revision: https://reviews.freebsd.org/D48288 --- Makefile.inc1 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.inc1 b/Makefile.inc1 index 5c3d190e4c3ede..bad747a0e551e9 100644 --- a/Makefile.inc1 +++ b/Makefile.inc1 @@ -1991,7 +1991,7 @@ packagekernel: .PHONY .endif stagekernel: .PHONY - ${_+_}${MAKE} -C ${.CURDIR} ${.MAKEFLAGS} distributekernel + ${_+_}${MAKE} -C ${.CURDIR} ${.MAKEFLAGS} DISTDIR=. 
distributekernel PORTSDIR?= /usr/ports WSTAGEDIR?= ${OBJTOP}/worldstage From 48b9d78a0a9d795cfdeb56895a27309aadd50c77 Mon Sep 17 00:00:00 2001 From: Daniel Schaefer Date: Sat, 4 Jan 2025 23:53:37 +0800 Subject: [PATCH 004/143] hda: Add patch for Framework Laptop Intel 13th gen It uses the same audio codec as 12th gen (PCI ID 0x0002). Actually everything is the same, except the CPU. Signed-off-by: Daniel Schaefer --- sys/dev/sound/pci/hda/hdaa_patches.c | 3 ++- sys/dev/sound/pci/hda/hdac.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/sys/dev/sound/pci/hda/hdaa_patches.c b/sys/dev/sound/pci/hda/hdaa_patches.c index d425e3c0080b13..3c063deb0822d5 100644 --- a/sys/dev/sound/pci/hda/hdaa_patches.c +++ b/sys/dev/sound/pci/hda/hdaa_patches.c @@ -330,7 +330,8 @@ hdac_pin_patch(struct hdaa_widget *w) } } else if (id == HDA_CODEC_IDT92HD95B && (subid == FRAMEWORK_LAPTOP_0001_SUBVENDOR || - subid == FRAMEWORK_LAPTOP_0002_SUBVENDOR)) { + subid == FRAMEWORK_LAPTOP_0002_SUBVENDOR || + subid == FRAMEWORK_LAPTOP_0003_SUBVENDOR)) { switch (nid) { case 10: patch_str = "as=1 seq=15 color=Black loc=Left"; diff --git a/sys/dev/sound/pci/hda/hdac.h b/sys/dev/sound/pci/hda/hdac.h index d00ad4e4705c59..f0e72f091a85c2 100644 --- a/sys/dev/sound/pci/hda/hdac.h +++ b/sys/dev/sound/pci/hda/hdac.h @@ -528,6 +528,7 @@ #define FRAMEWORK_VENDORID 0xf111 #define FRAMEWORK_LAPTOP_0001_SUBVENDOR HDA_MODEL_CONSTRUCT(FRAMEWORK, 0x0001) #define FRAMEWORK_LAPTOP_0002_SUBVENDOR HDA_MODEL_CONSTRUCT(FRAMEWORK, 0x0002) +#define FRAMEWORK_LAPTOP_0003_SUBVENDOR HDA_MODEL_CONSTRUCT(FRAMEWORK, 0x0003) /* All codecs you can eat... */ #define HDA_CODEC_CONSTRUCT(vendor, id) \ From 93411b39fff24ab4c9bf2b0395c7789b1a1c7a42 Mon Sep 17 00:00:00 2001 From: Adrian Chadd Date: Sun, 15 Dec 2024 18:05:13 -0800 Subject: [PATCH 005/143] rtwn: calculate control rate for VHT rate frames If the passed in rate is a VHT rate, use rtwn_ctl_vhtrate() to find a suitable rate for RTS/CTS. 
Differential Revision: https://reviews.freebsd.org/D48295 Reviewed by: bz, cy, emaste --- sys/dev/rtwn/rtl8812a/r12a_tx.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/sys/dev/rtwn/rtl8812a/r12a_tx.c b/sys/dev/rtwn/rtl8812a/r12a_tx.c index cc686668e4a221..582e6e0ddaf464 100644 --- a/sys/dev/rtwn/rtl8812a/r12a_tx.c +++ b/sys/dev/rtwn/rtl8812a/r12a_tx.c @@ -103,11 +103,17 @@ r12a_tx_protection(struct rtwn_softc *sc, struct r12a_tx_desc *txd, if (mode == IEEE80211_PROT_CTSONLY || mode == IEEE80211_PROT_RTSCTS) { - /* TODO: VHT */ - if (RTWN_RATE_IS_HT(ridx)) + /* + * Note: this code assumes basic rates for protection for + * both 802.11abg and 802.11n rates. + */ + if (RTWN_RATE_IS_VHT(ridx)) + rate = rtwn_ctl_vhtrate(ic->ic_rt, ridx); + else if (RTWN_RATE_IS_HT(ridx)) rate = rtwn_ctl_mcsrate(ic->ic_rt, ridx); else rate = ieee80211_ctl_rate(ic->ic_rt, ridx2rate[ridx]); + /* Map basic rate back to ridx */ ridx = rate2ridx(IEEE80211_RV(rate)); txd->txdw4 |= htole32(SM(R12A_TXDW4_RTSRATE, ridx)); From 50c1e179b584f43ba82e9afc91b25ec4831b58ef Mon Sep 17 00:00:00 2001 From: Kristof Provost Date: Sun, 5 Jan 2025 17:09:08 +0100 Subject: [PATCH 006/143] umtx: handle allocation failure in umtx_pi_alloc() Don't assume that this allocation will succeed. We may have been passed M_NOWAIT. The calling code already handles allocation failures, but the function itself did not. 
PR: 283807 MFC after: 1 week --- sys/kern/kern_umtx.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sys/kern/kern_umtx.c b/sys/kern/kern_umtx.c index c4a820f41bc367..a9294c324cb422 100644 --- a/sys/kern/kern_umtx.c +++ b/sys/kern/kern_umtx.c @@ -1740,6 +1740,9 @@ umtx_pi_alloc(int flags) struct umtx_pi *pi; pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags); + if (pi == NULL) + return (NULL); + TAILQ_INIT(&pi->pi_blocked); atomic_add_int(&umtx_pi_allocated, 1); return (pi); From d830cac9fed1882a192b0ec5c96fb3ac9bfbc2ee Mon Sep 17 00:00:00 2001 From: Poul-Henning Kamp Date: Sun, 5 Jan 2025 20:14:37 +0000 Subject: [PATCH 007/143] recoverdisk: Fix comparison between stripesize and sectorsize Discovered trying to read a 360KB floppy disk :-) --- sbin/recoverdisk/recoverdisk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sbin/recoverdisk/recoverdisk.c b/sbin/recoverdisk/recoverdisk.c index 91f42c904c52b3..446266c36d5055 100644 --- a/sbin/recoverdisk/recoverdisk.c +++ b/sbin/recoverdisk/recoverdisk.c @@ -482,7 +482,7 @@ main(int argc, char * const argv[]) err(1, "DIOCGSECTORSIZE failed"); error = ioctl(fdr, DIOCGSTRIPESIZE, &stripesize); - if (error == 0 && stripesize > sectorsize) + if (error == 0 && stripesize < sectorsize) sectorsize = stripesize; minsize = sectorsize; From a0f06dfb0d188966bee7265ec7d9f20093186bb6 Mon Sep 17 00:00:00 2001 From: Emmanuel Vadot Date: Mon, 6 Jan 2025 08:34:02 +0100 Subject: [PATCH 008/143] loader: Add a list of firmware name mapping Since we started to ship raw firmware for iwm(4), users who loads the driver from loader are having problems as loader don't know that the firmwares are now raw files and not kernel modules anymore. Start a list of default entry for iwm(4) firmwares name mapping so it will still works when loaded from loader. Differential Revision: https://reviews.freebsd.org/D48211 Reviewed by: bz, imp, kevans Sponsored by: Beckhoff Automation GmbH & Co. 
KG --- stand/defaults/loader.conf | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/stand/defaults/loader.conf b/stand/defaults/loader.conf index d266c240955afa..b1e87520a2d41e 100644 --- a/stand/defaults/loader.conf +++ b/stand/defaults/loader.conf @@ -109,7 +109,7 @@ kernels_autodetect="YES" # Auto-detect kernel directories in /boot #comconsole_speed="115200" # Set the current serial console speed #console="vidconsole" # A comma separated list of console(s) #currdev="disk1s1a" # Set the current device -module_path="/boot/modules;/boot/dtb;/boot/dtb/overlays" # Set the module search path +module_path="/boot/modules;/boot/firmware;/boot/dtb;/boot/dtb/overlays" # Set the module search path module_blacklist="drm drm2 radeonkms i915kms amdgpu" # Loader module blacklist #prompt="\\${interpret}" # Set the command prompt #root_disk_unit="0" # Force the root disk unit number @@ -182,3 +182,14 @@ module_blacklist="drm drm2 radeonkms i915kms amdgpu" # Loader module blacklist #module_before="cmd" # executes "cmd" before loading the module #module_after="cmd" # executes "cmd" after loading the module #module_error="cmd" # executes "cmd" if load fails + +### Firmware names mapping list +iwm3160fw_type="firmware" +iwm7260fw_type="firmware" +iwm7265fw_type="firmware" +iwm8265fw_type="firmware" +iwm9260fw_type="firmware" +iwm3168fw_type="firmware" +iwm7265Dfw_type="firmware" +iwm8000C_type="firmware" +iwm9000fw_type="firmware" From 48db612d8939fab6081fc2bf16f0d42aef3c682d Mon Sep 17 00:00:00 2001 From: Emmanuel Vadot Date: Thu, 2 Jan 2025 13:46:30 +0100 Subject: [PATCH 009/143] arm64: Add a new SOC_ROCKCHIP option A lot of drivers are shared between all rockchip SoCs, each time we add support for a new SoC we need to add the options in the files.arm64 lines. Add a new option SOC_ROCKCHIP that will help simplify this file. 
Reviewed by: andrew Differential Revision: https://reviews.freebsd.org/D48286 --- sys/arm64/conf/std.rockchip | 1 + sys/conf/files.arm64 | 30 +++++++++++++++--------------- sys/conf/options.arm64 | 7 ++++--- 3 files changed, 20 insertions(+), 18 deletions(-) diff --git a/sys/arm64/conf/std.rockchip b/sys/arm64/conf/std.rockchip index 3733ddc4eeae4a..d32de4e4fe7950 100644 --- a/sys/arm64/conf/std.rockchip +++ b/sys/arm64/conf/std.rockchip @@ -3,6 +3,7 @@ # # SoC support +options SOC_ROCKCHIP options SOC_ROCKCHIP_RK3328 options SOC_ROCKCHIP_RK3399 options SOC_ROCKCHIP_RK3568 diff --git a/sys/conf/files.arm64 b/sys/conf/files.arm64 index 3335dfe6cab176..b1f3aefadbe140 100644 --- a/sys/conf/files.arm64 +++ b/sys/conf/files.arm64 @@ -720,18 +720,18 @@ arm64/rockchip/rk3568_pciephy.c optional fdt pci soc_rockchip_rk3568 arm64/rockchip/rk_i2s.c optional fdt sound soc_rockchip_rk3328 | fdt sound soc_rockchip_rk3399 arm64/rockchip/rk_otp.c optional fdt soc_rockchip_rk3568 arm64/rockchip/rk_otp_if.m optional fdt soc_rockchip_rk3568 -dev/iicbus/pmic/rockchip/rk8xx.c optional fdt rk805 soc_rockchip_rk3328 | fdt rk805 soc_rockchip_rk3399 | fdt soc_rockchip_rk3568 -dev/iicbus/pmic/rockchip/rk8xx_clocks.c optional fdt rk805 soc_rockchip_rk3328 | fdt rk805 soc_rockchip_rk3399 | fdt soc_rockchip_rk3568 -dev/iicbus/pmic/rockchip/rk8xx_regulators.c optional fdt rk805 soc_rockchip_rk3328 | fdt rk805 soc_rockchip_rk3399 | fdt soc_rockchip_rk3568 -dev/iicbus/pmic/rockchip/rk8xx_rtc.c optional fdt rk805 soc_rockchip_rk3328 | fdt rk805 soc_rockchip_rk3399 | fdt soc_rockchip_rk3568 +dev/iicbus/pmic/rockchip/rk8xx.c optional fdt rk805 soc_rockchip +dev/iicbus/pmic/rockchip/rk8xx_clocks.c optional fdt rk805 soc_rockchip +dev/iicbus/pmic/rockchip/rk8xx_regulators.c optional fdt rk805 soc_rockchip +dev/iicbus/pmic/rockchip/rk8xx_rtc.c optional fdt rk805 soc_rockchip dev/iicbus/pmic/rockchip/rk805.c optional fdt rk805 soc_rockchip_rk3328 dev/iicbus/pmic/rockchip/rk808.c optional fdt rk805 
soc_rockchip_rk3399 dev/iicbus/pmic/rockchip/rk817.c optional fdt rk817 soc_rockchip_rk3568 -arm64/rockchip/rk_grf.c optional fdt soc_rockchip_rk3328 | fdt soc_rockchip_rk3399 | fdt soc_rockchip_rk3568 -arm64/rockchip/rk_pinctrl.c optional fdt rk_pinctrl soc_rockchip_rk3328 | fdt rk_pinctrl soc_rockchip_rk3399 | fdt soc_rockchip_rk3568 -arm64/rockchip/rk_gpio.c optional fdt rk_gpio soc_rockchip_rk3328 | fdt rk_gpio soc_rockchip_rk3399 | fdt soc_rockchip_rk3568 +arm64/rockchip/rk_grf.c optional fdt soc_rockchip +arm64/rockchip/rk_pinctrl.c optional fdt rk_pinctrl soc_rockchip +arm64/rockchip/rk_gpio.c optional fdt rk_gpio soc_rockchip arm64/rockchip/rk_iodomain.c optional fdt rk_iodomain -arm64/rockchip/rk_usb2phy.c optional fdt rk_usb2phy soc_rockchip_rk3328 | fdt rk_usb2phy soc_rockchip_rk3399 | fdt rk_usb2phy soc_rockchip_rk3568 +arm64/rockchip/rk_usb2phy.c optional fdt rk_usb2phy soc_rockchip arm64/rockchip/rk_typec_phy.c optional fdt rk_typec_phy soc_rockchip_rk3399 arm64/rockchip/rk_tsadc_if.m optional fdt soc_rockchip_rk3399 | fdt soc_rockchip_rk3568 arm64/rockchip/rk_tsadc.c optional fdt soc_rockchip_rk3399 | fdt soc_rockchip_rk3568 @@ -739,13 +739,13 @@ arm64/rockchip/rk_pcie.c optional fdt pci soc_rockchip_rk3399 arm64/rockchip/rk_pcie_phy.c optional fdt pci soc_rockchip_rk3399 # RockChip Clock support -dev/clk/rockchip/rk_cru.c optional fdt soc_rockchip_rk3328 | fdt soc_rockchip_rk3399 | fdt soc_rockchip_rk3568 -dev/clk/rockchip/rk_clk_armclk.c optional fdt soc_rockchip_rk3328 | fdt soc_rockchip_rk3399 | fdt soc_rockchip_rk3568 -dev/clk/rockchip/rk_clk_composite.c optional fdt soc_rockchip_rk3328 | fdt soc_rockchip_rk3399 | fdt soc_rockchip_rk3568 -dev/clk/rockchip/rk_clk_fract.c optional fdt soc_rockchip_rk3328 | fdt soc_rockchip_rk3399 | fdt soc_rockchip_rk3568 -dev/clk/rockchip/rk_clk_gate.c optional fdt soc_rockchip_rk3328 | fdt soc_rockchip_rk3399 | fdt soc_rockchip_rk3568 -dev/clk/rockchip/rk_clk_mux.c optional fdt soc_rockchip_rk3328 | fdt 
soc_rockchip_rk3399 | fdt soc_rockchip_rk3568 -dev/clk/rockchip/rk_clk_pll.c optional fdt soc_rockchip_rk3328 | fdt soc_rockchip_rk3399 | fdt soc_rockchip_rk3568 +dev/clk/rockchip/rk_cru.c optional fdt soc_rockchip +dev/clk/rockchip/rk_clk_armclk.c optional fdt soc_rockchip +dev/clk/rockchip/rk_clk_composite.c optional fdt soc_rockchip +dev/clk/rockchip/rk_clk_fract.c optional fdt soc_rockchip +dev/clk/rockchip/rk_clk_gate.c optional fdt soc_rockchip +dev/clk/rockchip/rk_clk_mux.c optional fdt soc_rockchip +dev/clk/rockchip/rk_clk_pll.c optional fdt soc_rockchip dev/clk/rockchip/rk3328_cru.c optional fdt soc_rockchip_rk3328 dev/clk/rockchip/rk3399_cru.c optional fdt soc_rockchip_rk3399 dev/clk/rockchip/rk3399_pmucru.c optional fdt soc_rockchip_rk3399 diff --git a/sys/conf/options.arm64 b/sys/conf/options.arm64 index e36f856ecb04ad..4bdd408f46512d 100644 --- a/sys/conf/options.arm64 +++ b/sys/conf/options.arm64 @@ -37,7 +37,8 @@ SOC_INTEL_STRATIX10 opt_soc.h SOC_MARVELL_8K opt_soc.h SOC_NVIDIA_TEGRA210 opt_soc.h SOC_NXP_LS opt_soc.h -SOC_ROCKCHIP_RK3328 opt_soc.h -SOC_ROCKCHIP_RK3399 opt_soc.h -SOC_ROCKCHIP_RK3568 opt_soc.h +SOC_ROCKCHIP opt_soc.h +SOC_ROCKCHIP_RK3328 opt_soc.h # Depends on SOC_ROCKCHIP +SOC_ROCKCHIP_RK3399 opt_soc.h # Depends on SOC_ROCKCHIP +SOC_ROCKCHIP_RK3568 opt_soc.h # Depends on SOC_ROCKCHIP SOC_XILINX_ZYNQ opt_soc.h From ad1bf74705e4bbf116bf9e285088c4dfb31247e2 Mon Sep 17 00:00:00 2001 From: Emmanuel Vadot Date: Thu, 2 Jan 2025 13:51:09 +0100 Subject: [PATCH 010/143] arm64: rockchip: Add a new rk8xx device This device will select the base driver for Rockchip PMIC. 
While here also add a new rk808 device which selects the PMIC used for RK3399 Reviewed by: andrew Differential Revision: https://reviews.freebsd.org/D48287 --- sys/arm64/conf/std.rockchip | 2 ++ sys/conf/files.arm64 | 10 +++++----- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/sys/arm64/conf/std.rockchip b/sys/arm64/conf/std.rockchip index d32de4e4fe7950..dc4dfadfde3999 100644 --- a/sys/arm64/conf/std.rockchip +++ b/sys/arm64/conf/std.rockchip @@ -21,7 +21,9 @@ device rk_i2c # RockChip I2C controller device fan53555 # Fairchild Semi FAN53555/SYR82x Regulator # Power management controllers +device rk8xx # RockChip RK8XX base support device rk805 # RockChip RK805 PMIC +device rk808 # RockChip RK808 PMIC device rk817 # RockChip RK817 PMIC device syr827 # Silergy SYR827 PMIC device tcs4525 # TCS 4525 PMIC diff --git a/sys/conf/files.arm64 b/sys/conf/files.arm64 index b1f3aefadbe140..ed251b7ae0d72c 100644 --- a/sys/conf/files.arm64 +++ b/sys/conf/files.arm64 @@ -720,12 +720,12 @@ arm64/rockchip/rk3568_pciephy.c optional fdt pci soc_rockchip_rk3568 arm64/rockchip/rk_i2s.c optional fdt sound soc_rockchip_rk3328 | fdt sound soc_rockchip_rk3399 arm64/rockchip/rk_otp.c optional fdt soc_rockchip_rk3568 arm64/rockchip/rk_otp_if.m optional fdt soc_rockchip_rk3568 -dev/iicbus/pmic/rockchip/rk8xx.c optional fdt rk805 soc_rockchip -dev/iicbus/pmic/rockchip/rk8xx_clocks.c optional fdt rk805 soc_rockchip -dev/iicbus/pmic/rockchip/rk8xx_regulators.c optional fdt rk805 soc_rockchip -dev/iicbus/pmic/rockchip/rk8xx_rtc.c optional fdt rk805 soc_rockchip +dev/iicbus/pmic/rockchip/rk8xx.c optional fdt rk8xx soc_rockchip +dev/iicbus/pmic/rockchip/rk8xx_clocks.c optional fdt rk8xx soc_rockchip +dev/iicbus/pmic/rockchip/rk8xx_regulators.c optional fdt rk8xx soc_rockchip +dev/iicbus/pmic/rockchip/rk8xx_rtc.c optional fdt rk8xx soc_rockchip dev/iicbus/pmic/rockchip/rk805.c optional fdt rk805 soc_rockchip_rk3328 -dev/iicbus/pmic/rockchip/rk808.c optional fdt rk805 
soc_rockchip_rk3399 +dev/iicbus/pmic/rockchip/rk808.c optional fdt rk808 soc_rockchip_rk3399 dev/iicbus/pmic/rockchip/rk817.c optional fdt rk817 soc_rockchip_rk3568 arm64/rockchip/rk_grf.c optional fdt soc_rockchip arm64/rockchip/rk_pinctrl.c optional fdt rk_pinctrl soc_rockchip From a071c76746f6f10ac2254ae1d9d2c34beb21a981 Mon Sep 17 00:00:00 2001 From: Emmanuel Vadot Date: Mon, 6 Jan 2025 08:39:22 +0100 Subject: [PATCH 011/143] UPDATING: Document recent Rockchip options and device --- UPDATING | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/UPDATING b/UPDATING index 4fef13073431ba..412eb446f4fd3e 100644 --- a/UPDATING +++ b/UPDATING @@ -27,6 +27,12 @@ NOTE TO PEOPLE WHO THINK THAT FreeBSD 15.x IS SLOW: world, or to merely disable the most expensive debugging functionality at runtime, run "ln -s 'abort:false,junk:false' /etc/malloc.conf".) +20250106: + A new SOC_ROCKCHIP option appeared, so if you have a custom kernel configuration + targeting Rockchip SoC you need to add it so shared and mandatory drivers for + this SoC family will be selected. + Also a new rk8xx device was added, this selects the base driver for Rockchip PMIC. + 20241216: The iwm(4) firmwares are no longer compiled as kernel modules but instead shipped as raw files. For pkgbase users if you use iwm(4) you will need From 73ad5af7d90894cdf9f829cecb45af506bbe2dba Mon Sep 17 00:00:00 2001 From: Emmanuel Vadot Date: Mon, 6 Jan 2025 12:50:51 +0100 Subject: [PATCH 012/143] arm64: Unbreak LINT build Add the recently added options SOC_ROCKCHIP and device rk8xx. While here add options SOC_ROCKCHIP_RK3568 and device rk808/rk817 which were never added. 
Fixes: 48db612d8939 ("arm64: Add a new SOC_ROCKCHIP option") Fixes: ad1bf74705e4 ("arm64: rockchip: Add a new rk8xx device") --- sys/arm64/conf/NOTES | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sys/arm64/conf/NOTES b/sys/arm64/conf/NOTES index b3bf2fedd5f7ca..54bc7dcf1f0926 100644 --- a/sys/arm64/conf/NOTES +++ b/sys/arm64/conf/NOTES @@ -43,8 +43,10 @@ options SOC_INTEL_STRATIX10 options SOC_MARVELL_8K options SOC_NVIDIA_TEGRA210 options SOC_NXP_LS +options SOC_ROCKCHIP options SOC_ROCKCHIP_RK3328 options SOC_ROCKCHIP_RK3399 +options SOC_ROCKCHIP_RK3568 options SOC_XILINX_ZYNQ # Timer drivers @@ -161,7 +163,10 @@ device aw_wdog # Allwinner Watchdog # Power management controllers device axp81x # X-Powers AXP81x PMIC +device rk8xx # RockChip RK8XX base support device rk805 # RockChip RK805 PMIC +device rk808 # RockChip RK808 PMIC +device rk817 # RockChip RK817 PMIC # EFUSE device aw_sid # Allwinner Secure ID EFUSE From 7f39f03c4d9a138f84a08931b2a6c016521cacf5 Mon Sep 17 00:00:00 2001 From: Gleb Smirnoff Date: Mon, 6 Jan 2025 08:22:14 -0800 Subject: [PATCH 013/143] libc/xdr: remove bogus lseek(2) for xdr streams Doing some debugging I noticed that applications using rpc(3) would often make lseek(2) on a totally bogus file descriptor, that looks more like a pointer. So, what happens here is that xdrrec type xdr doesn't keep a track of how many bytes were sent/received on the stream and tries to obtain this number via lseek(2). Then it adds/subtracts the offset in the internal buffer from the obtained number. This code originates from the original Sun RPC import in 1994. However, it was not a working code even if Solaris would support lseek(2) on a socket, because it was passing not the file descriptor, but a pointer to opaque data from upper RPC layer. It could be that previously (before import to FreeBSD) code was correct, but the Solaris 8 documentation says that lseek(2) on socket isn't supported [1]. Maybe supported on older Solaris? 
Anyway, this lseek(2) never worked and xdr_getpos() would always fail on xdrrec object, until 8f55a568f69c5 in 2008 it was slightly fixed to tolerate failure of lseek(2) and return a correct value within the small internal buffer for XDR_ENCODE mode and an incorrect (negative to unsigned) result for XDR_DECODE. It seems no consumer ever calls xdr_getpos()/xdr_setpos() on this kind of descriptor when in XDR_DECODE mode. So, remove this lseek(2) and preserve operation within the small buffer only. Supposedly fix the operation for XDR_DECODE mode. Note that there is no use and no test coverage for the XDR_DECODE. Note that xdr(3) manual page already documents limitations for xdr_getpos() and xdr_setpos() for the stream type objects. [1] https://docs.oracle.com/cd/E19109-01/tsolaris8/835-8003/6ruu1b0or/index.html Reviewed by: asomers, markj Differential Revision: https://reviews.freebsd.org/D48205 --- lib/libc/xdr/xdr_rec.c | 69 +++++++++++++++++++++--------------------- 1 file changed, 35 insertions(+), 34 deletions(-) diff --git a/lib/libc/xdr/xdr_rec.c b/lib/libc/xdr/xdr_rec.c index f1167fdeaa65d6..7dc9bbb31ec393 100644 --- a/lib/libc/xdr/xdr_rec.c +++ b/lib/libc/xdr/xdr_rec.c @@ -318,27 +318,30 @@ xdrrec_putbytes(XDR *xdrs, const char *addr, u_int len) return (TRUE); } +/* + * XXX: xdrrec operates on a TCP stream and doesn't keep record of how many + * bytes were sent/received overall. Thus, the XDR_GETPOS() and XDR_SETPOS() + * can operate only within small internal buffer. So far, the limited set of + * consumers of this xdr are fine with that. It also seems that methods are + * never called in the XDR_DECODE mode. 
+ */ static u_int xdrrec_getpos(XDR *xdrs) { RECSTREAM *rstrm = (RECSTREAM *)xdrs->x_private; - off_t pos; + ptrdiff_t pos; - pos = lseek((int)(u_long)rstrm->tcp_handle, (off_t)0, 1); - if (pos == -1) - pos = 0; switch (xdrs->x_op) { - case XDR_ENCODE: - pos += rstrm->out_finger - rstrm->out_base; + pos = rstrm->out_finger - rstrm->out_base; break; case XDR_DECODE: - pos -= rstrm->in_boundry - rstrm->in_finger; + pos = rstrm->in_finger - rstrm->in_base; break; - default: - pos = (off_t) -1; + case XDR_FREE: + pos = -1; break; } return ((u_int) pos); @@ -352,32 +355,30 @@ xdrrec_setpos(XDR *xdrs, u_int pos) int delta = currpos - pos; char *newpos; - if ((int)currpos != -1) - switch (xdrs->x_op) { - - case XDR_ENCODE: - newpos = rstrm->out_finger - delta; - if ((newpos > (char *)(void *)(rstrm->frag_header)) && - (newpos < rstrm->out_boundry)) { - rstrm->out_finger = newpos; - return (TRUE); - } - break; - - case XDR_DECODE: - newpos = rstrm->in_finger - delta; - if ((delta < (int)(rstrm->fbtbc)) && - (newpos <= rstrm->in_boundry) && - (newpos >= rstrm->in_base)) { - rstrm->in_finger = newpos; - rstrm->fbtbc -= delta; - return (TRUE); - } - break; - - case XDR_FREE: - break; + switch (xdrs->x_op) { + case XDR_ENCODE: + newpos = rstrm->out_finger - delta; + if ((newpos > (char *)(void *)(rstrm->frag_header)) && + (newpos < rstrm->out_boundry)) { + rstrm->out_finger = newpos; + return (TRUE); } + break; + + case XDR_DECODE: + newpos = rstrm->in_finger - delta; + if ((delta < (int)(rstrm->fbtbc)) && + (newpos <= rstrm->in_boundry) && + (newpos >= rstrm->in_base)) { + rstrm->in_finger = newpos; + rstrm->fbtbc -= delta; + return (TRUE); + } + break; + + case XDR_FREE: + break; + } return (FALSE); } From b5c1f7d479de02f2c886b7453adbea312418ee7a Mon Sep 17 00:00:00 2001 From: Gleb Smirnoff Date: Mon, 6 Jan 2025 08:22:15 -0800 Subject: [PATCH 014/143] xdr: use C99 initializers for xdr_ops No functional change. 
--- sys/xdr/xdr_mbuf.c | 16 ++++++++-------- sys/xdr/xdr_mem.c | 36 ++++++++++++++++++------------------ 2 files changed, 26 insertions(+), 26 deletions(-) diff --git a/sys/xdr/xdr_mbuf.c b/sys/xdr/xdr_mbuf.c index 0ed807de903e76..896e317f552629 100644 --- a/sys/xdr/xdr_mbuf.c +++ b/sys/xdr/xdr_mbuf.c @@ -46,14 +46,14 @@ static bool_t xdrmbuf_setpos(XDR *, u_int); static int32_t *xdrmbuf_inline(XDR *, u_int); static const struct xdr_ops xdrmbuf_ops = { - xdrmbuf_getlong, - xdrmbuf_putlong, - xdrmbuf_getbytes, - xdrmbuf_putbytes, - xdrmbuf_getpos, - xdrmbuf_setpos, - xdrmbuf_inline, - xdrmbuf_destroy + .x_getlong = xdrmbuf_getlong, + .x_putlong = xdrmbuf_putlong, + .x_getbytes = xdrmbuf_getbytes, + .x_putbytes = xdrmbuf_putbytes, + .x_getpostn = xdrmbuf_getpos, + .x_setpostn = xdrmbuf_setpos, + .x_inline = xdrmbuf_inline, + .x_destroy = xdrmbuf_destroy, }; /* diff --git a/sys/xdr/xdr_mem.c b/sys/xdr/xdr_mem.c index 1489aadf53a0f5..65a74836b7b301 100644 --- a/sys/xdr/xdr_mem.c +++ b/sys/xdr/xdr_mem.c @@ -63,27 +63,27 @@ static int32_t *xdrmem_inline_unaligned(XDR *, u_int); static bool_t xdrmem_control(XDR *xdrs, int request, void *info); static const struct xdr_ops xdrmem_ops_aligned = { - xdrmem_getlong_aligned, - xdrmem_putlong_aligned, - xdrmem_getbytes, - xdrmem_putbytes, - xdrmem_getpos, - xdrmem_setpos, - xdrmem_inline_aligned, - xdrmem_destroy, - xdrmem_control + .x_getlong = xdrmem_getlong_aligned, + .x_putlong = xdrmem_putlong_aligned, + .x_getbytes = xdrmem_getbytes, + .x_putbytes = xdrmem_putbytes, + .x_getpostn = xdrmem_getpos, + .x_setpostn = xdrmem_setpos, + .x_inline = xdrmem_inline_aligned, + .x_destroy = xdrmem_destroy, + .x_control = xdrmem_control, }; static const struct xdr_ops xdrmem_ops_unaligned = { - xdrmem_getlong_unaligned, - xdrmem_putlong_unaligned, - xdrmem_getbytes, - xdrmem_putbytes, - xdrmem_getpos, - xdrmem_setpos, - xdrmem_inline_unaligned, - xdrmem_destroy, - xdrmem_control + .x_getlong = xdrmem_getlong_unaligned, + .x_putlong = 
xdrmem_putlong_unaligned, + .x_getbytes = xdrmem_getbytes, + .x_putbytes = xdrmem_putbytes, + .x_getpostn = xdrmem_getpos, + .x_setpostn = xdrmem_setpos, + .x_inline = xdrmem_inline_unaligned, + .x_destroy = xdrmem_destroy, + .x_control = xdrmem_control }; /* From c2153a533ffb9691848a072c7628dcf56e0e6442 Mon Sep 17 00:00:00 2001 From: Alan Somers Date: Fri, 27 Dec 2024 14:24:17 -0700 Subject: [PATCH 015/143] fusefs: minor cleanup in the tests Delete some unused includes and member variables. MFC after: 2 weeks Sponsored by: ConnectWise --- tests/sys/fs/fusefs/allow_other.cc | 3 --- tests/sys/fs/fusefs/forget.cc | 1 - tests/sys/fs/fusefs/io.cc | 1 - tests/sys/fs/fusefs/notify.cc | 1 - 4 files changed, 6 deletions(-) diff --git a/tests/sys/fs/fusefs/allow_other.cc b/tests/sys/fs/fusefs/allow_other.cc index dae6290ea8e5da..24a161166a909c 100644 --- a/tests/sys/fs/fusefs/allow_other.cc +++ b/tests/sys/fs/fusefs/allow_other.cc @@ -52,9 +52,6 @@ const static char RELPATH[] = "some_file.txt"; class NoAllowOther: public FuseTest { public: -/* Unprivileged user id */ -int m_uid; - virtual void SetUp() { if (geteuid() != 0) { GTEST_SKIP() << "This test must be run as root"; diff --git a/tests/sys/fs/fusefs/forget.cc b/tests/sys/fs/fusefs/forget.cc index 846198e7592577..1e7764ac478215 100644 --- a/tests/sys/fs/fusefs/forget.cc +++ b/tests/sys/fs/fusefs/forget.cc @@ -31,7 +31,6 @@ extern "C" { #include #include -#include #include #include diff --git a/tests/sys/fs/fusefs/io.cc b/tests/sys/fs/fusefs/io.cc index f8684ee02100e5..ced291836da046 100644 --- a/tests/sys/fs/fusefs/io.cc +++ b/tests/sys/fs/fusefs/io.cc @@ -31,7 +31,6 @@ extern "C" { #include #include -#include #include #include diff --git a/tests/sys/fs/fusefs/notify.cc b/tests/sys/fs/fusefs/notify.cc index e3f539f57599b2..1e22bde13db72f 100644 --- a/tests/sys/fs/fusefs/notify.cc +++ b/tests/sys/fs/fusefs/notify.cc @@ -30,7 +30,6 @@ extern "C" { #include -#include #include #include From 
58610d1e0fbbd1a49927559ee3970d8e4594cc86 Mon Sep 17 00:00:00 2001 From: Pat Maddox Date: Fri, 13 Dec 2024 11:57:30 -0800 Subject: [PATCH 016/143] build: Sort distributekernel METALOG when using -DNO_ROOT The metalog is produced by install -M, which is not inherently sorted. This results in non-deterministic file ordering in kernel.txz. Order the files in kernel.txz to support reproducible builds. PR: 283214 Reviewed by: emaste Signed-off-by: Pat Maddox --- Makefile.inc1 | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/Makefile.inc1 b/Makefile.inc1 index bad747a0e551e9..fe799218ab0a40 100644 --- a/Makefile.inc1 +++ b/Makefile.inc1 @@ -1903,9 +1903,7 @@ distributekernel distributekernel.debug: .PHONY false .endif mkdir -p ${DESTDIR}/${DISTDIR} -.if defined(NO_ROOT) - @echo "#${MTREE_MAGIC}" > ${DESTDIR}/${DISTDIR}/kernel.premeta -.endif + rm -f ${DESTDIR}/${DISTDIR}/kernel.premeta ${_+_}cd ${KRNLOBJDIR}/${INSTALLKERNEL}; \ ${IMAKEENV} ${IMAKE_INSTALL:S/METALOG/kernel.premeta/} \ ${IMAKE_MTREE} PATH=${TMPPATH:Q} ${MAKE} KERNEL=${INSTKERNNAME} \ @@ -1913,15 +1911,14 @@ distributekernel distributekernel.debug: .PHONY METALOG=${METALOG:S/METALOG/kernel.premeta/} \ ${.TARGET:S/distributekernel/install/} .if defined(NO_ROOT) - @sed -e 's|^./kernel|.|' ${DESTDIR}/${DISTDIR}/kernel.premeta > \ - ${DESTDIR}/${DISTDIR}/kernel.meta + echo "#${MTREE_MAGIC}" > ${DESTDIR}/${DISTDIR}/kernel.meta + sed -e 's|^./kernel|.|' ${DESTDIR}/${DISTDIR}/kernel.premeta | \ + ${METALOG_SORT_CMD} >> ${DESTDIR}/${DISTDIR}/kernel.meta .endif .endif .if ${BUILDKERNELS:[#]} > 1 && ${NO_INSTALLEXTRAKERNELS} != "yes" .for _kernel in ${BUILDKERNELS:[2..-1]} -.if defined(NO_ROOT) - @echo "#${MTREE_MAGIC}" > ${DESTDIR}/${DISTDIR}/kernel.${_kernel}.premeta -.endif + rm -f ${DESTDIR}/${DISTDIR}/kernel.${_kernel}.premeta ${_+_}cd ${KRNLOBJDIR}/${_kernel}; \ ${IMAKEENV} ${IMAKE_INSTALL:S/METALOG/kernel.${_kernel}.premeta/} \ ${IMAKE_MTREE} PATH=${TMPPATH:Q} ${MAKE} \ @@ 
-1930,9 +1927,10 @@ distributekernel distributekernel.debug: .PHONY METALOG=${METALOG:S/METALOG/kernel.${_kernel}.premeta/} \ ${.TARGET:S/distributekernel/install/} .if defined(NO_ROOT) - @sed -e "s|^./kernel.${_kernel}|.|" \ - ${DESTDIR}/${DISTDIR}/kernel.${_kernel}.premeta > \ - ${DESTDIR}/${DISTDIR}/kernel.${_kernel}.meta + echo "#${MTREE_MAGIC}" > ${DESTDIR}/${DISTDIR}/kernel.${_kernel}.meta + sed -e "s|^./kernel.${_kernel}|.|" \ + ${DESTDIR}/${DISTDIR}/kernel.${_kernel}.premeta | \ + ${METALOG_SORT_CMD} >> ${DESTDIR}/${DISTDIR}/kernel.${_kernel}.meta .endif .endfor .endif From f415b2ef30f7bf0db753f09fbba7b0910475b0d2 Mon Sep 17 00:00:00 2001 From: Alan Somers Date: Mon, 6 Jan 2025 12:21:29 -0700 Subject: [PATCH 017/143] fusefs: Coverity cleanup in the lseek tests Always check the return value of open(). Reported by: Coverity Scan CID: 1471118 1471133 1471215 1471896 1471901 1472116 1473799 CID: 1473879 1473996 1555269 1558044 MFC after: 2 weeks Sponsored by: ConnectWise --- tests/sys/fs/fusefs/lseek.cc | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tests/sys/fs/fusefs/lseek.cc b/tests/sys/fs/fusefs/lseek.cc index 2a1cb198bccee0..12d41f7af1b26f 100644 --- a/tests/sys/fs/fusefs/lseek.cc +++ b/tests/sys/fs/fusefs/lseek.cc @@ -71,6 +71,7 @@ TEST_F(LseekPathconf, already_enosys) ).WillOnce(Invoke(ReturnErrno(ENOSYS))); fd = open(FULLPATH, O_RDONLY); + ASSERT_LE(0, fd); EXPECT_EQ(offset_in, lseek(fd, offset_in, SEEK_DATA)); EXPECT_EQ(-1, fpathconf(fd, _PC_MIN_HOLE_SIZE)); @@ -105,6 +106,7 @@ TEST_F(LseekPathconf, already_seeked) out.body.lseek.offset = i.body.lseek.offset; }))); fd = open(FULLPATH, O_RDONLY); + ASSERT_LE(0, fd); EXPECT_EQ(offset, lseek(fd, offset, SEEK_DATA)); EXPECT_EQ(1, fpathconf(fd, _PC_MIN_HOLE_SIZE)); @@ -171,6 +173,7 @@ TEST_F(LseekPathconf, eio) .WillRepeatedly(Invoke(ReturnErrno(EIO))); fd = open(FULLPATH, O_RDONLY); + ASSERT_LE(0, fd); EXPECT_EQ(-1, fpathconf(fd, _PC_MIN_HOLE_SIZE)); EXPECT_EQ(EIO, errno); @@ -203,6 +206,7 
@@ TEST_F(LseekPathconf, enosys_now) ).WillOnce(Invoke(ReturnErrno(ENOSYS))); fd = open(FULLPATH, O_RDONLY); + ASSERT_LE(0, fd); EXPECT_EQ(-1, fpathconf(fd, _PC_MIN_HOLE_SIZE)); EXPECT_EQ(EINVAL, errno); @@ -266,6 +270,7 @@ TEST_F(LseekPathconf, seek_now) }))); fd = open(FULLPATH, O_RDONLY); + ASSERT_LE(0, fd); EXPECT_EQ(offset_initial, lseek(fd, offset_initial, SEEK_SET)); EXPECT_EQ(1, fpathconf(fd, _PC_MIN_HOLE_SIZE)); /* And check that the file pointer hasn't changed */ @@ -299,6 +304,7 @@ TEST_F(LseekPathconf, zerolength) ).WillOnce(Invoke(ReturnErrno(ENXIO))); fd = open(FULLPATH, O_RDONLY); + ASSERT_LE(0, fd); EXPECT_EQ(1, fpathconf(fd, _PC_MIN_HOLE_SIZE)); /* Check again, to ensure that the kernel recorded the response */ EXPECT_EQ(1, fpathconf(fd, _PC_MIN_HOLE_SIZE)); @@ -327,6 +333,7 @@ TEST_F(LseekPathconf_7_23, already_enosys) ).Times(0); fd = open(FULLPATH, O_RDONLY); + ASSERT_LE(0, fd); EXPECT_EQ(-1, fpathconf(fd, _PC_MIN_HOLE_SIZE)); EXPECT_EQ(EINVAL, errno); @@ -391,6 +398,7 @@ TEST_F(LseekSeekData, enosys) _) ).WillOnce(Invoke(ReturnErrno(ENOSYS))); fd = open(FULLPATH, O_RDONLY); + ASSERT_LE(0, fd); /* * Default behavior: ENXIO if offset is < 0 or >= fsize, offset @@ -431,6 +439,7 @@ TEST_F(LseekSeekHole, ok) out.body.lseek.offset = offset_out; }))); fd = open(FULLPATH, O_RDONLY); + ASSERT_LE(0, fd); EXPECT_EQ(offset_out, lseek(fd, offset_in, SEEK_HOLE)); EXPECT_EQ(offset_out, lseek(fd, 0, SEEK_CUR)); @@ -463,6 +472,7 @@ TEST_F(LseekSeekHole, enosys) _) ).WillOnce(Invoke(ReturnErrno(ENOSYS))); fd = open(FULLPATH, O_RDONLY); + ASSERT_LE(0, fd); /* * Default behavior: ENXIO if offset is < 0 or >= fsize, fsize @@ -500,6 +510,7 @@ TEST_F(LseekSeekHole, enxio) _) ).WillOnce(Invoke(ReturnErrno(ENXIO))); fd = open(FULLPATH, O_RDONLY); + ASSERT_LE(0, fd); EXPECT_EQ(-1, lseek(fd, offset_in, SEEK_HOLE)); EXPECT_EQ(ENXIO, errno); From bb9525f30214e8b6c53c6cccd9e8f02e8f8e8c42 Mon Sep 17 00:00:00 2001 From: Michael Tuexen Date: Mon, 6 Jan 2025 20:35:11 +0100 
Subject: [PATCH 018/143] TCP RACK: fix TCP fast open Do not jump to a place in the code, which requires several variables to be set (segsize, minseg, idle, len, sb_offset), which is not true. To avoid using these variables, start the HPTS timer explicitly. This fix only applies to the client side using TCP fast open. Approved by: rrs CID: 1523766 CID: 1523770 CID: 1523786 CID: 1523801 CID: 1523809 MFC after: 1 week Sponsored by: Netflix, Inc. Differential Revision: https://reviews.freebsd.org/D48322 --- sys/netinet/tcp_stacks/rack.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c index cc07253247609c..7baf1a6267875b 100644 --- a/sys/netinet/tcp_stacks/rack.c +++ b/sys/netinet/tcp_stacks/rack.c @@ -19943,10 +19943,11 @@ rack_output(struct tcpcb *tp) (tp->t_state == TCPS_SYN_SENT)) && SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN or SYN|ACK sent */ (tp->t_rxtshift == 0)) { /* not a retransmit */ - cwnd_to_use = rack->r_ctl.cwnd_to_use = tp->snd_cwnd; - so = inp->inp_socket; - sb = &so->so_snd; - goto just_return_nolock; + rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); +#ifdef TCP_ACCOUNTING + sched_unpin(); +#endif + return (0); } /* * Determine length of data that should be transmitted, and flags From c28fefe1dc44b69743dd18d038440da38a2867a7 Mon Sep 17 00:00:00 2001 From: Michael Tuexen Date: Mon, 6 Jan 2025 20:40:33 +0100 Subject: [PATCH 019/143] TCP BBR: remove dead code bw is unsigned and not zero. So it cannot be smaller than 1. No functional change intended. Reviewed by: rrs, cc CID: 1523791 MFC after: 1 week Sponsored by: Netflix, Inc. 
Differential Revision: https://reviews.freebsd.org/D48323 --- sys/netinet/tcp_stacks/bbr.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/sys/netinet/tcp_stacks/bbr.c b/sys/netinet/tcp_stacks/bbr.c index 535766a0a1b004..97ff46f0b96b8f 100644 --- a/sys/netinet/tcp_stacks/bbr.c +++ b/sys/netinet/tcp_stacks/bbr.c @@ -2993,9 +2993,6 @@ __bbr_get_bw(struct tcp_bbr *bbr) /* We should not be at 0, go to the initial window then */ goto use_initial_window; } - if (bw < 1) - /* Probably should panic */ - bw = 1; if (bw < min_bw) bw = min_bw; return (bw); From 061727efe1e355fb2fde1b05e92718543d05bfe7 Mon Sep 17 00:00:00 2001 From: Michael Tuexen Date: Mon, 6 Jan 2025 21:25:58 +0100 Subject: [PATCH 020/143] TCP BBR: remove dead code No functional change intended. Reviewed by: rrs CID: 1523808 MFC after: 1 week Sponsored by: Netflix, Inc. Differential Revision: https://reviews.freebsd.org/D48338 --- sys/netinet/tcp_stacks/bbr.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/sys/netinet/tcp_stacks/bbr.c b/sys/netinet/tcp_stacks/bbr.c index 97ff46f0b96b8f..4600088bd1a119 100644 --- a/sys/netinet/tcp_stacks/bbr.c +++ b/sys/netinet/tcp_stacks/bbr.c @@ -978,14 +978,6 @@ bbr_timer_audit(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts, struct sock * and we do */ return; - } else if (sbavail(&inp->inp_socket->so_snd) && - (tmr_up == PACE_TMR_RXT)) { - /* - * if we hit enobufs then we would expect the - * possibility of nothing outstanding and the RXT up - * (and the hptsi timer). - */ - return; } else if (((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && (tp->t_state <= TCPS_CLOSING)) && From e8ec28047df5185582a95c5211ed75682fad5ec5 Mon Sep 17 00:00:00 2001 From: Michael Tuexen Date: Mon, 6 Jan 2025 21:38:34 +0100 Subject: [PATCH 021/143] TCP RACK: fix TCP_RACK_PACING_BETA socket option Bring back the code, which was accidentally removed. While there, indent a comment correctly. 
Reviewed by: rrs CID: 1540026 Fixes: e18b97bd63a8 ("Update to bring the rack stack with all its fixes in.") MFC after: 1 week Sponsored by: Netflix, Inc. Differential Revision: https://reviews.freebsd.org/D48340 --- sys/netinet/tcp_stacks/rack.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c index 7baf1a6267875b..f590edd71d9d3c 100644 --- a/sys/netinet/tcp_stacks/rack.c +++ b/sys/netinet/tcp_stacks/rack.c @@ -24486,15 +24486,29 @@ rack_get_sockopt(struct tcpcb *tp, struct sockopt *sopt) * when you exit recovery. */ case TCP_RACK_PACING_BETA: + if (strcmp(tp->t_cc->name, CCALGONAME_NEWRENO) != 0) + error = EINVAL; + else if (rack->rc_pacing_cc_set == 0) + optval = rack->r_ctl.rc_saved_beta.beta; + else { + /* + * Reach out into the CC data and report back what + * I have previously set. Yeah it looks hackish but + * we don't want to report the saved values. + */ + if (tp->t_ccv.cc_data) + optval = ((struct newreno *)tp->t_ccv.cc_data)->beta; + else + error = EINVAL; + } break; - /* - * Beta_ecn is the congestion control value for NewReno that influences how - * much of a backoff happens when a ECN mark is detected. It is normally set - * to 80 for 80% i.e. the cwnd is reduced by 20% of its previous value when - * you exit recovery. Note that classic ECN has a beta of 50, it is only - * ABE Ecn that uses this "less" value, but we do too with pacing :) - */ - + /* + * Beta_ecn is the congestion control value for NewReno that influences how + * much of a backoff happens when a ECN mark is detected. It is normally set + * to 80 for 80% i.e. the cwnd is reduced by 20% of its previous value when + * you exit recovery. 
Note that classic ECN has a beta of 50, it is only + * ABE Ecn that uses this "less" value, but we do too with pacing :) + */ case TCP_RACK_PACING_BETA_ECN: if (strcmp(tp->t_cc->name, CCALGONAME_NEWRENO) != 0) error = EINVAL; From 9743e9efdf5f0d2338d7cfeed8f09d89d889bac4 Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Mon, 6 Jan 2025 20:53:52 +0000 Subject: [PATCH 022/143] SO_SPLICE tests: Fix a comment typo MFC after: 1 week Sponsored by: Klara, Inc. --- tests/sys/kern/socket_splice.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/sys/kern/socket_splice.c b/tests/sys/kern/socket_splice.c index 3970f16b34dce1..3a85ae91ecc79d 100644 --- a/tests/sys/kern/socket_splice.c +++ b/tests/sys/kern/socket_splice.c @@ -330,7 +330,7 @@ ATF_TC_BODY(splice_capsicum, tc) tcp4_socketpair(right); /* - * Make sure that we splice a socket that's missing recv rights. + * Make sure that we can't splice a socket that's missing recv rights. */ remove_rights(left[1], cap_rights_init(&rights, CAP_RECV)); splice_init(&sp, right[0], 0, NULL); From 8c75c15d43e4123bc51f24f5bf99319289c45a6c Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Mon, 6 Jan 2025 22:53:38 +0000 Subject: [PATCH 023/143] jail: Avoid a potential use-after-free when destroying jails prison_deref() and prison_deref_kill() have to handle the case where destruction of a jail will release the final reference on the jail's parent, resulting in destruction of the parent jail. They thus maintain a list of jails whose references have gone away; the loop at the end of prison_deref() then goes through the list and deallocates resources associated with each jail. In particular, if a jail's VNET is not the same as that of its parent, this loop destroys the VNET. Suppose prison_deref() removes the last reference on a jail, releasing a reference to its parent and causing the jail to be placed in the "freeprison" list. Suppose then that the parent jail is destroyed before the "freeprison" list is processed. 
When destroying the now-orphaned child jail, prison_deref() dereferences its parent to see whether the child jail's VNET needs to be freed, but if this race occurs, this is a use-after-free. Fix the problem by using PR_VNET to decide whether the jail's VNET is to be destroyed, rather than dereferencing the parent jail pointer. Set it earlier so that a subsequent failure in kern_jail_set() cleans up the nascent VNET. Reviewed by: zlei (previous version), jamie MFC after: 2 weeks Differential Revision: https://reviews.freebsd.org/D47992 --- sys/kern/kern_jail.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/sys/kern/kern_jail.c b/sys/kern/kern_jail.c index ad6483ed374d68..6ffeab59112b47 100644 --- a/sys/kern/kern_jail.c +++ b/sys/kern/kern_jail.c @@ -1687,9 +1687,18 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) sizeof(pr->pr_osrelease)); #ifdef VIMAGE - /* Allocate a new vnet if specified. */ - pr->pr_vnet = (pr_flags & PR_VNET) - ? vnet_alloc() : ppr->pr_vnet; + /* + * Allocate a new vnet if specified. + * + * Set PR_VNET now if so, so that the vnet is disposed of + * properly when the jail is destroyed. + */ + if (pr_flags & PR_VNET) { + pr->pr_flags |= PR_VNET; + pr->pr_vnet = vnet_alloc(); + } else { + pr->pr_vnet = ppr->pr_vnet; + } #endif /* * Allocate a dedicated cpuset for each jail. @@ -3207,9 +3216,12 @@ prison_deref(struct prison *pr, int flags) * Removing a prison frees references * from its parent.
*/ + ppr = pr->pr_parent; + pr->pr_parent = NULL; mtx_unlock(&pr->pr_mtx); + + pr = ppr; flags &= ~PD_LOCKED; - pr = pr->pr_parent; flags |= PD_DEREF | PD_DEUREF; continue; } @@ -3236,7 +3248,7 @@ prison_deref(struct prison *pr, int flags) */ TAILQ_FOREACH_SAFE(rpr, &freeprison, pr_list, tpr) { #ifdef VIMAGE - if (rpr->pr_vnet != rpr->pr_parent->pr_vnet) + if (rpr->pr_flags & PR_VNET) vnet_destroy(rpr->pr_vnet); #endif if (rpr->pr_root != NULL) From 1c933f464fdbb630f9663751f04c29cdcda38902 Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Mon, 6 Jan 2025 22:55:38 +0000 Subject: [PATCH 024/143] unix: Be consistent about error handling for unconnected sockets SOCK_STREAM and SOCK_SEQPACKET sockets should get the same treatment here. PR: 176420 MFC after: 2 weeks --- sys/kern/uipc_usrreq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sys/kern/uipc_usrreq.c b/sys/kern/uipc_usrreq.c index 7af73a1d344be3..3d7e5bcc5ad014 100644 --- a/sys/kern/uipc_usrreq.c +++ b/sys/kern/uipc_usrreq.c @@ -1796,7 +1796,7 @@ uipc_ctloutput(struct socket *so, struct sockopt *sopt) if (unp->unp_flags & UNP_HAVEPC) xu = unp->unp_peercred; else { - if (so->so_type == SOCK_STREAM) + if (so->so_proto->pr_flags & PR_CONNREQUIRED) error = ENOTCONN; else error = EINVAL; From 5bf3ac7ae219f126cf3965be97a2d718007c1be4 Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Mon, 6 Jan 2025 22:56:07 +0000 Subject: [PATCH 025/143] bsdinstall: Fix a typo in a comment PR: 283507 MFC after: 1 week --- usr.sbin/bsdinstall/scripts/zfsboot | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/usr.sbin/bsdinstall/scripts/zfsboot b/usr.sbin/bsdinstall/scripts/zfsboot index 45c023b065133e..6f2244a918bc6e 100755 --- a/usr.sbin/bsdinstall/scripts/zfsboot +++ b/usr.sbin/bsdinstall/scripts/zfsboot @@ -139,7 +139,7 @@ f_include $BSDCFG_SHARE/variable.subr # # Default ZFS datasets for root zpool # -# NOTE: Requires /tmp, /var/tmp, /$ZFSBOOT_BOOTFS_NAME/$ZFSBOOT_BOOTFS_NAME +# NOTE: Requires 
/tmp, /var/tmp, /$ZFSBOOT_BEROOT_NAME/$ZFSBOOT_BOOTFS_NAME # NOTE: Anything after pound/hash character [#] is ignored as a comment. # f_isset ZFSBOOT_DATASETS || ZFSBOOT_DATASETS=" From 872686b17e70636f031436b458262eb7dacc5832 Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Mon, 6 Jan 2025 23:20:00 +0000 Subject: [PATCH 026/143] rc: Document rtadvd_flags PR: 283696 MFC after: 1 week --- libexec/rc/rc.conf | 1 + share/man/man5/rc.conf.5 | 10 +++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/libexec/rc/rc.conf b/libexec/rc/rc.conf index 62756fece2016f..9540cca6c2f104 100644 --- a/libexec/rc/rc.conf +++ b/libexec/rc/rc.conf @@ -551,6 +551,7 @@ rtadvd_enable="NO" # Set to YES to enable an IPv6 router # advertisement daemon. If set to YES, # this router becomes a possible candidate # IPv6 default router for local subnets. +rtadvd_flags="" # Flags to the IPv6 router advertisement daemon. rtadvd_interfaces="" # Interfaces rtadvd sends RA packets. stf_interface_ipv4addr="" # Local IPv4 addr for 6to4 IPv6 over IPv4 # tunneling interface. Specify this entry diff --git a/share/man/man5/rc.conf.5 b/share/man/man5/rc.conf.5 index a66477ae4510de..8ad503f792e899 100644 --- a/share/man/man5/rc.conf.5 +++ b/share/man/man5/rc.conf.5 @@ -22,7 +22,7 @@ .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" -.Dd October 14, 2024 +.Dd January 6, 2025 .Dt RC.CONF 5 .Os .Sh NAME @@ -3133,6 +3133,14 @@ the interfaces specified in This should only be enabled with great care. You may want to fine-tune .Xr rtadvd.conf 5 . +.It Va rtadvd_flags +.Pq Vt str +If +.Va rtadvd_enable +is set to +.Dq Li YES , +these are the flags to pass to +.Xr rtadvd 8 . 
.It Va rtadvd_interfaces .Pq Vt str If From 378a2b155aaf853933df5b53e174b3880826488c Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Mon, 6 Jan 2025 23:20:08 +0000 Subject: [PATCH 027/143] netipsec: Pass the right mbuf up Note that key_spdacquire() is dead code, as the SADB_X_SPDACQUIRE message handler is not set. PR: 243057 MFC after: 2 weeks --- sys/netipsec/key.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sys/netipsec/key.c b/sys/netipsec/key.c index ad1d6164f15815..3c64a65f024d38 100644 --- a/sys/netipsec/key.c +++ b/sys/netipsec/key.c @@ -2595,7 +2595,7 @@ key_spdacquire(struct secpolicy *sp) mtod(result, struct sadb_msg *)->sadb_msg_len = PFKEY_UNIT64(result->m_pkthdr.len); - return key_sendup_mbuf(NULL, m, KEY_SENDUP_REGISTERED); + return key_sendup_mbuf(NULL, result, KEY_SENDUP_REGISTERED); } /* From 02ebbc781f082df9714e74775700d8c08bac7850 Mon Sep 17 00:00:00 2001 From: Warner Losh Date: Mon, 6 Jan 2025 16:44:21 -0700 Subject: [PATCH 028/143] swab: Fix implementation to support overlapping copies A number of image processing packages assume that swab() can handle to and from being the same. However, POSIX.1 states that overlapping buffers produces undefined results. Our old implementation would produce coherent results, but the recent change to the musl-inspired code does not. Since there's complaints in the forums for these image processing packages for musl and now FreeBSD, update the algorithm to just read a word at a time and bswap16 the results. All FreeBSD's architectures support unaligned access in userland, and swab is not used in the kernel (g_part_apm has its own copy), so opt for even simpler code that's easier to understand. This makes the overlapping behavior match i386 again, since its assembler routine for swab handles overlapping correctly.
PR: 283698 Sponsored by: Netflix Reviewed by: nwhitehorn Differential Revision: https://reviews.freebsd.org/D48259 --- lib/libc/string/swab.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/lib/libc/string/swab.c b/lib/libc/string/swab.c index 2b044d68ca4603..ed4436a4981066 100644 --- a/lib/libc/string/swab.c +++ b/lib/libc/string/swab.c @@ -4,19 +4,22 @@ */ #include +#include void swab(const void * __restrict from, void * __restrict to, ssize_t len) { - const unsigned char *f = from; - unsigned char *t = to; + const uint16_t *f __aligned(1) = from; + uint16_t *t __aligned(1) = to; + /* + * POSIX says overlapping copy behavior is undefined, however many + * applications assume the old FreeBSD and current GNU libc behavior + * that will swap the bytes correctly when from == to. Reading both bytes + * and swapping them before writing them back accomplishes this. + */ while (len > 1) { - t[0] = f[1]; - t[1] = f[0]; - - f += 2; - t += 2; + *t++ = bswap16(*f++); len -= 2; } } From 6fc164c7775a5dc7a4277969870abd50eb62cd1e Mon Sep 17 00:00:00 2001 From: Warner Losh Date: Mon, 6 Jan 2025 16:45:47 -0700 Subject: [PATCH 029/143] cdefs.9: Note only one programming environment at a time Only one programming environment can be defined at a time. Posix states that when defining _POSIX_C_SOURCE, the system headers must define only the macros, variables, and functions that a given standard level defines. Selecting a different macro along with this is fundamentally incompatible with that.
Sponsored by: Netflix --- share/man/man9/cdefs.9 | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/share/man/man9/cdefs.9 b/share/man/man9/cdefs.9 index b9821c70f2a124..2e6f1b440efbd4 100644 --- a/share/man/man9/cdefs.9 +++ b/share/man/man9/cdefs.9 @@ -3,7 +3,7 @@ .\" .\" SPDX-License-Identifier: BSD-2-Clause .\" -.Dd December 6, 2024 +.Dd January 6, 2025 .Dt CDEFS 9 .Os .Sh NAME @@ -352,6 +352,7 @@ Defining the macros outlined below requests that the system header files provide only the functions, structures and macros (symbols) defined by the appropriate standard, while suppressing all extensions. However, system headers not defined by that standard may define extensions. +You may only define one of the following for any compilation unit. .Bl -column "---------------" .It Sy Macro Ta Sy Environment .It Dv _POSIX_SOURCE Ta St -p1003.1-88 including St -ansiC From cfd8866818abb68fbfbffb925298c0b457cb32b5 Mon Sep 17 00:00:00 2001 From: Warner Losh Date: Mon, 6 Jan 2025 16:45:51 -0700 Subject: [PATCH 030/143] cdefs.h: Add warning about defining __BSD_VISIBLE and friends Undefined things happen if users define these macros, be more explicit about documenting that. Sponsored by: Netflix --- share/man/man9/cdefs.9 | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/share/man/man9/cdefs.9 b/share/man/man9/cdefs.9 index 2e6f1b440efbd4..4efce132d3932c 100644 --- a/share/man/man9/cdefs.9 +++ b/share/man/man9/cdefs.9 @@ -401,7 +401,8 @@ are also included. These macros are set by .Nm to control the visibility of different standards. -Users should not use these, but they are documented here for developers. +Users must not define these, and doing so will produced undefined results. +They are documented here for developers working on system's header files. .Bl -column "---------------" .It Dv __XSI_VISIBLE Ta Restricts the visibility of XOPEN Single Unix Standard version. 
Possible values are 500, 600, 700 or 800, corresponding to Issue 5, 6, 7, or 8 From 080f68d0ab0c87950ecd9b393a156b1e4d12c825 Mon Sep 17 00:00:00 2001 From: Ariel Ehrenberg Date: Wed, 4 Dec 2024 11:32:54 +0200 Subject: [PATCH 031/143] mlx5_core: Add steering support for IPsec with IPv6 ipv6 flow tables were not connected to previous FS tables. Created an additional table to serve as IPsec RX root. This table has 2 rules for redirecting the received packets to ipv4/ipv6 based on the IP family in the packet header. Sponsored by: NVidia networking --- sys/dev/mlx5/mlx5_accel/ipsec.h | 2 + sys/dev/mlx5/mlx5_accel/mlx5_ipsec_fs.c | 157 ++++++++++++++++++++-- sys/dev/mlx5/mlx5_en/mlx5_en_flow_table.c | 4 +- 3 files changed, 149 insertions(+), 14 deletions(-) diff --git a/sys/dev/mlx5/mlx5_accel/ipsec.h b/sys/dev/mlx5/mlx5_accel/ipsec.h index 95742c4099f192..361b9f72d873af 100644 --- a/sys/dev/mlx5/mlx5_accel/ipsec.h +++ b/sys/dev/mlx5/mlx5_accel/ipsec.h @@ -43,6 +43,7 @@ struct mlx5e_priv; struct mlx5e_tx_wqe; struct mlx5e_ipsec_tx; struct mlx5e_ipsec_rx; +struct mlx5e_ipsec_rx_ip_type; struct aes_gcm_keymat { u64 seq_iv; @@ -128,6 +129,7 @@ struct mlx5e_ipsec { struct mlx5e_ipsec_tx *tx; struct mlx5e_ipsec_rx *rx_ipv4; struct mlx5e_ipsec_rx *rx_ipv6; + struct mlx5e_ipsec_rx_ip_type *rx_ip_type; struct mlx5e_ipsec_aso *aso; u32 pdn; u32 mkey; diff --git a/sys/dev/mlx5/mlx5_accel/mlx5_ipsec_fs.c b/sys/dev/mlx5/mlx5_accel/mlx5_ipsec_fs.c index f7950bf612698a..fb9ca94278db24 100644 --- a/sys/dev/mlx5/mlx5_accel/mlx5_ipsec_fs.c +++ b/sys/dev/mlx5/mlx5_accel/mlx5_ipsec_fs.c @@ -138,6 +138,14 @@ struct mlx5e_ipsec_rx_roce { struct mlx5_flow_namespace *ns_rdma; }; +struct mlx5e_ipsec_rx_ip_type { + struct mlx5_flow_table *ft; + struct mlx5_flow_namespace *ns; + struct mlx5_flow_handle *ipv4_rule; + struct mlx5_flow_handle *ipv6_rule; + struct mlx5e_ipsec_miss miss; +}; + struct mlx5e_ipsec_rx { struct mlx5e_ipsec_ft ft; struct mlx5e_ipsec_miss pol; @@ -497,6 +505,16 @@ static 
void setup_fte_addr6(struct mlx5_flow_spec *spec, __be32 *saddr, outer_headers.dst_ipv4_dst_ipv6.ipv6_layout.ipv6), 0xff, 16); } +static void +setup_fte_ip_version(struct mlx5_flow_spec *spec, u8 family) +{ + spec->match_criteria_enable |= MLX5_MATCH_OUTER_HEADERS; + + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.ip_version); + MLX5_SET(fte_match_param, spec->match_value, outer_headers.ip_version, + family == AF_INET ? 4 : 6); +} + static int rx_add_rule(struct mlx5e_ipsec_sa_entry *sa_entry) { struct mlx5e_ipsec_rule *ipsec_rule = &sa_entry->ipsec_rule; @@ -1598,9 +1616,18 @@ static void ipsec_fs_rx_roce_table_destroy(struct mlx5e_ipsec_rx_roce *rx_roce) mlx5_destroy_flow_table(rx_roce->ft); } +static void +ipsec_fs_rx_ip_type_catchall_rule_destroy(struct mlx5e_ipsec_rx_ip_type* rx_ip_type) +{ + mlx5_del_flow_rules(&rx_ip_type->ipv4_rule); + mlx5_del_flow_rules(&rx_ip_type->ipv6_rule); + mlx5_del_flow_rules(&rx_ip_type->miss.rule); + mlx5_destroy_flow_group(rx_ip_type->miss.group); + rx_ip_type->miss.group = NULL; +} + static void ipsec_fs_rx_table_destroy(struct mlx5_core_dev *mdev, struct mlx5e_ipsec_rx *rx) { - mutex_lock(&rx->ft.mutex); if (rx->chains) { ipsec_chains_destroy(rx->chains); } else { @@ -1610,7 +1637,6 @@ static void ipsec_fs_rx_table_destroy(struct mlx5_core_dev *mdev, struct mlx5e_i mlx5_destroy_flow_table(rx->ft.sa); mlx5_destroy_flow_table(rx->ft.status); ipsec_fs_rx_roce_table_destroy(&rx->roce); - mutex_unlock(&rx->ft.mutex); } static void ipsec_roce_setup_udp_dport(struct mlx5_flow_spec *spec, u16 dport) @@ -1831,6 +1857,90 @@ static int ipsec_fs_rx_roce_tables_create(struct mlx5e_ipsec_rx *rx, return err; } +static int +ipsec_fs_rx_ip_type_catchall_rules_create(struct mlx5e_priv *priv, + struct mlx5_flow_destination *defdst) +{ + struct mlx5_core_dev *mdev = priv->mdev; + struct mlx5e_ipsec *ipsec = priv->ipsec; + struct mlx5_flow_destination dst = {}; + struct mlx5_flow_act flow_act = {}; + struct mlx5_flow_handle 
*rule; + struct mlx5_flow_spec *spec; + int err = 0; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) { + return -ENOMEM; + } + dst.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + + /* Set rule for ipv4 packets */ + dst.ft = ipsec->rx_ipv4->ft.pol; + setup_fte_ip_version(spec, AF_INET); + rule = mlx5_add_flow_rules(ipsec->rx_ip_type->ft, spec, &flow_act, &dst, 1); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + mlx5_core_err(mdev, "Failed to add ipv4 rule to ip_type table err=%d\n", + err); + goto out; + } + ipsec->rx_ip_type->ipv4_rule = rule; + + /* Set rule for ipv6 packets */ + dst.ft = ipsec->rx_ipv6->ft.pol; + setup_fte_ip_version(spec, AF_INET6); + rule = mlx5_add_flow_rules(ipsec->rx_ip_type->ft, spec, &flow_act, &dst, 1); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + mlx5_core_err(mdev, "Failed to add ipv6 rule to ip_type table err=%d\n", + err); + goto fail_add_ipv6_rule; + } + ipsec->rx_ip_type->ipv6_rule = rule; + + /* set miss rule */ + err = ipsec_miss_create(mdev, ipsec->rx_ip_type->ft, &ipsec->rx_ip_type->miss, defdst); + if (err) { + mlx5_core_err(mdev, "Failed to add miss rule to ip_type table err=%d\n", + err); + goto fail_miss_rule; + } + + goto out; + +fail_miss_rule: + mlx5_del_flow_rules(&ipsec->rx_ip_type->ipv6_rule); +fail_add_ipv6_rule: + mlx5_del_flow_rules(&ipsec->rx_ip_type->ipv4_rule); +out: + kvfree(spec); + return err; +} + +static int +ipsec_fs_rx_ip_type_table_create(struct mlx5e_priv *priv, + int level) +{ + struct mlx5e_ipsec *ipsec = priv->ipsec; + struct mlx5_flow_table *ft; + int err = 0; + + /* Create rx ip type table */ + ft = ipsec_rx_ft_create(ipsec->rx_ip_type->ns, level, 0, 1); + if (IS_ERR(ft)) { + err = PTR_ERR(ft); + goto out; + } + ipsec->rx_ip_type->ft = ft; + + priv->fts.ipsec_ft = priv->ipsec->rx_ip_type->ft; + +out: + return err; +} + static int ipsec_fs_rx_table_create(struct mlx5_core_dev *mdev, struct mlx5e_ipsec_rx *rx, int rx_init_level, int 
rdma_init_level) { @@ -1996,6 +2106,7 @@ void mlx5e_accel_ipsec_fs_rx_catchall_rules_destroy(struct mlx5e_priv *priv) if (!priv->ipsec) return; + ipsec_fs_rx_ip_type_catchall_rule_destroy(priv->ipsec->rx_ip_type); ipsec_fs_rx_catchall_rules_destroy(priv->mdev, priv->ipsec->rx_ipv4); ipsec_fs_rx_catchall_rules_destroy(priv->mdev, priv->ipsec->rx_ipv6); } @@ -2019,6 +2130,13 @@ int mlx5e_accel_ipsec_fs_rx_catchall_rules(struct mlx5e_priv *priv) err = ipsec_fs_rx_catchall_rules(priv, ipsec->rx_ipv4, &dest); if (err) ipsec_fs_rx_catchall_rules_destroy(priv->mdev, priv->ipsec->rx_ipv6); + + err = ipsec_fs_rx_ip_type_catchall_rules_create(priv, &dest); + if (err) { + ipsec_fs_rx_catchall_rules_destroy(priv->mdev, priv->ipsec->rx_ipv6); + ipsec_fs_rx_catchall_rules_destroy(priv->mdev, priv->ipsec->rx_ipv4); + } + out: return err; } @@ -2032,6 +2150,7 @@ void mlx5e_accel_ipsec_fs_rx_tables_destroy(struct mlx5e_priv *priv) if (!ipsec) return; + mlx5_destroy_flow_table(ipsec->rx_ip_type->ft); ipsec_fs_rx_table_destroy(mdev, ipsec->rx_ipv6); ipsec_fs_rx_table_destroy(mdev, ipsec->rx_ipv4); } @@ -2045,18 +2164,24 @@ int mlx5e_accel_ipsec_fs_rx_tables_create(struct mlx5e_priv *priv) if (!ipsec) return 0; - err = ipsec_fs_rx_table_create(ipsec->mdev, ipsec->rx_ipv4, 0, 0); + err = ipsec_fs_rx_ip_type_table_create(priv, 0); if (err) - goto out; + return err; - err = ipsec_fs_rx_table_create(ipsec->mdev, ipsec->rx_ipv6, 4, 1); - if (err) { - ipsec_fs_rx_table_destroy(priv->mdev, ipsec->rx_ipv4); - goto out; - } + err = ipsec_fs_rx_table_create(ipsec->mdev, ipsec->rx_ipv4, 1, 0); + if (err) + goto err_ipv4_table; - priv->fts.ipsec_ft = priv->ipsec->rx_ipv4->ft.pol; -out: + err = ipsec_fs_rx_table_create(ipsec->mdev, ipsec->rx_ipv6, 5, 1); + if (err) + goto err_ipv6_table; + + return 0; + +err_ipv6_table: + ipsec_fs_rx_table_destroy(priv->mdev, ipsec->rx_ipv4); +err_ipv4_table: + mlx5_destroy_flow_table(ipsec->rx_ip_type->ft); return err; } @@ -2067,6 +2192,7 @@ void 
mlx5e_accel_ipsec_fs_cleanup(struct mlx5e_ipsec *ipsec) mutex_destroy(&ipsec->rx_ipv4->ft.mutex); mutex_destroy(&ipsec->tx->ft.mutex); ipsec_fs_destroy_counters(ipsec); + kfree(ipsec->rx_ip_type); kfree(ipsec->rx_ipv6); kfree(ipsec->rx_ipv4); kfree(ipsec->tx); @@ -2089,9 +2215,13 @@ int mlx5e_accel_ipsec_fs_init(struct mlx5e_ipsec *ipsec) if (!ipsec->tx) return -ENOMEM; + ipsec->rx_ip_type = kzalloc(sizeof(*ipsec->rx_ip_type), GFP_KERNEL); + if (!ipsec->rx_ip_type) + goto err_tx; + ipsec->rx_ipv4 = kzalloc(sizeof(*ipsec->rx_ipv4), GFP_KERNEL); if (!ipsec->rx_ipv4) - goto err_tx; + goto err_ip_type; ipsec->rx_ipv6 = kzalloc(sizeof(*ipsec->rx_ipv6), GFP_KERNEL); if (!ipsec->rx_ipv6) @@ -2103,6 +2233,7 @@ int mlx5e_accel_ipsec_fs_init(struct mlx5e_ipsec *ipsec) ipsec->tx->ns = tns; mutex_init(&ipsec->tx->ft.mutex); + ipsec->rx_ip_type->ns = rns; ipsec->rx_ipv4->ns = rns; ipsec->rx_ipv6->ns = rns; mutex_init(&ipsec->rx_ipv4->ft.mutex); @@ -2116,6 +2247,8 @@ int mlx5e_accel_ipsec_fs_init(struct mlx5e_ipsec *ipsec) kfree(ipsec->rx_ipv6); err_rx_ipv4: kfree(ipsec->rx_ipv4); +err_ip_type: + kfree(ipsec->rx_ip_type); err_tx: kfree(ipsec->tx); return err; diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_flow_table.c b/sys/dev/mlx5/mlx5_en/mlx5_en_flow_table.c index f8be5b9e881c60..6e24395b5577ad 100644 --- a/sys/dev/mlx5/mlx5_en/mlx5_en_flow_table.c +++ b/sys/dev/mlx5/mlx5_en/mlx5_en_flow_table.c @@ -1626,7 +1626,7 @@ mlx5e_create_vlan_flow_table(struct mlx5e_priv *priv) ft->num_groups = 0; ft_attr.max_fte = MLX5E_VLAN_TABLE_SIZE; - ft_attr.level = (priv->ipsec) ? 8 : 0; + ft_attr.level = (priv->ipsec) ? 9 : 0; ft->t = mlx5_create_flow_table(priv->fts.ns, &ft_attr); if (IS_ERR(ft->t)) { @@ -2014,7 +2014,7 @@ mlx5e_create_vxlan_flow_table(struct mlx5e_priv *priv) ft->num_groups = 0; ft_attr.max_fte = MLX5E_VXLAN_TABLE_SIZE; - ft_attr.level = (priv->ipsec) ? 9 : 1; + ft_attr.level = (priv->ipsec) ? 
10 : 1; ft->t = mlx5_create_flow_table(priv->fts.ns, &ft_attr); if (IS_ERR(ft->t)) { From 215c8b79c498f647afcbe9fa9076c2c3329e09b4 Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Tue, 7 Jan 2025 02:15:23 +0000 Subject: [PATCH 032/143] riscv/vmm: Make vcpu sleep periods consistent with other platforms There's no apparent reason for the difference here, so let's be consistent to make merging easier. Tested by: br --- sys/riscv/vmm/vmm.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sys/riscv/vmm/vmm.c b/sys/riscv/vmm/vmm.c index 0596e0de2e436b..f7cbfc1dfea580 100644 --- a/sys/riscv/vmm/vmm.c +++ b/sys/riscv/vmm/vmm.c @@ -1125,8 +1125,7 @@ vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate, if (from_idle) { while (vcpu->state != VCPU_IDLE) { vcpu_notify_event_locked(vcpu); - msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", - hz / 1000); + msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz); } } else { KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from " @@ -1425,7 +1424,7 @@ vm_handle_wfi(struct vcpu *vcpu, struct vm_exit *vme, bool *retu) * XXX msleep_spin() cannot be interrupted by signals so * wake up periodically to check pending signals. */ - msleep_spin(vcpu, &vcpu->mtx, "vmidle", hz / 1000); + msleep_spin(vcpu, &vcpu->mtx, "vmidle", hz); vcpu_require_state_locked(vcpu, VCPU_FROZEN); } vcpu_unlock(vcpu); From 19cb383dc03a80e1651d80e0f0e3d4e9cbd20e04 Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Tue, 7 Jan 2025 02:15:36 +0000 Subject: [PATCH 033/143] vmm.4: Update to mention non-amd64 platforms Most of the text here relates to PCI passthrough, which is still amd64-only, but we should still document supported platforms. While here, remove the comment that vmm.ko has to be loaded at boot-time for passthrough, as devctl makes it possible to detach host drivers on the fly. I'm not aware of any other reason to require vmm.ko to be loaded at boot. 
Reviewed by: br, andrew Differential Revision: https://reviews.freebsd.org/D48263 --- share/man/man4/vmm.4 | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/share/man/man4/vmm.4 b/share/man/man4/vmm.4 index 7e4c9050021a45..07c40541f404a4 100644 --- a/share/man/man4/vmm.4 +++ b/share/man/man4/vmm.4 @@ -22,7 +22,7 @@ .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" -.Dd September 27, 2024 +.Dd December 30, 2024 .Dt VMM 4 .Os .Sh NAME @@ -45,15 +45,22 @@ kldload vmm provides the kernel portion of the .Xr bhyve 4 hypervisor. -.Pp -An Intel CPU with VT-x/EPT or AMD CPU with SVM support is required. +The following platforms are supported: +.Bl -bullet -compat +.It +amd64: An Intel CPU with VT-x/EPT or AMD CPU with SVM support is required. +.It +arm64: The boot CPU must start in EL2 and the system must have a GICv3 interrupt +controller. +VHE support will be used if available. +.It +riscv: The CPUs must implement the H (hypervisor) RISC-V ISA extension. +.El .Pp PCI device passthrough to a virtual machine requires -hardware with VT-d support. +hardware with VT-d support and is available only on amd64. .Sh PCI PASSTHROUGH -When the hardware supports VT-d, and -.Nm -has been loaded at boot time, +On amd64 where the hardware supports VT-d, PCI devices can be reserved for use by the hypervisor. Entries consisting of the PCI .Ar bus Ns / Ns Ar slot Ns / Ns Ar function @@ -143,6 +150,8 @@ back: .Nm vmm.ko first appeared in .Fx 10.0 . +arm64 and riscv support first appeared in +.Fx 15.0 . .Sh AUTHORS .An Neel Natu Aq neel@freebsd.org .An Peter Grehan Aq grehan@freebsd.org From b09fe08ede8f90ab40f09eac49700698dd4d02b3 Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Tue, 7 Jan 2025 02:15:51 +0000 Subject: [PATCH 034/143] amd64/vmm: Remove vmm_mem_init() It is a no-op and doesn't exist on other platforms. As part of some work to deduplicate vmm code, just remove it. 
No functional change intended. Reviewed by: corvink Differential Revision: https://reviews.freebsd.org/D48267 --- sys/amd64/vmm/vmm.c | 6 ------ sys/amd64/vmm/vmm_mem.c | 7 ------- sys/amd64/vmm/vmm_mem.h | 1 - 3 files changed, 14 deletions(-) diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index d1f57a717fdf78..0e3ab2845d58fd 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -427,8 +427,6 @@ vm_exitinfo_cpuset(struct vcpu *vcpu) static int vmm_init(void) { - int error; - if (!vmm_is_hw_supported()) return (ENXIO); @@ -449,10 +447,6 @@ vmm_init(void) if (vmm_ipinum < 0) vmm_ipinum = IPI_AST; - error = vmm_mem_init(); - if (error) - return (error); - vmm_suspend_p = vmmops_modsuspend; vmm_resume_p = vmmops_modresume; diff --git a/sys/amd64/vmm/vmm_mem.c b/sys/amd64/vmm/vmm_mem.c index 0e953b6af534d0..e96c9e4bdc66bb 100644 --- a/sys/amd64/vmm/vmm_mem.c +++ b/sys/amd64/vmm/vmm_mem.c @@ -45,13 +45,6 @@ #include "vmm_mem.h" -int -vmm_mem_init(void) -{ - - return (0); -} - vm_object_t vmm_mmio_alloc(struct vmspace *vmspace, vm_paddr_t gpa, size_t len, vm_paddr_t hpa) diff --git a/sys/amd64/vmm/vmm_mem.h b/sys/amd64/vmm/vmm_mem.h index b237e08ccc053d..41b9bf07c4fc61 100644 --- a/sys/amd64/vmm/vmm_mem.h +++ b/sys/amd64/vmm/vmm_mem.h @@ -32,7 +32,6 @@ struct vmspace; struct vm_object; -int vmm_mem_init(void); struct vm_object *vmm_mmio_alloc(struct vmspace *, vm_paddr_t gpa, size_t len, vm_paddr_t hpa); void vmm_mmio_free(struct vmspace *, vm_paddr_t gpa, size_t size); From c945c9ddca8d6db162334127a43ef8a0be1d3db1 Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Tue, 7 Jan 2025 02:16:04 +0000 Subject: [PATCH 035/143] amd64/vmm: Rename vm_get_vmspace() to vm_vmspace() For consistency with other vm accessors. No functional change intended. 
Reviewed by: corvink Differential Revision: https://reviews.freebsd.org/D48268 --- sys/amd64/include/vmm.h | 2 +- sys/amd64/vmm/vmm.c | 3 +-- sys/amd64/vmm/vmm_dev_machdep.c | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h index dd8e76962cafc4..6501baa455daaa 100644 --- a/sys/amd64/include/vmm.h +++ b/sys/amd64/include/vmm.h @@ -401,7 +401,7 @@ vcpu_should_yield(struct vcpu *vcpu) void *vcpu_stats(struct vcpu *vcpu); void vcpu_notify_event(struct vcpu *vcpu, bool lapic_intr); -struct vmspace *vm_get_vmspace(struct vm *vm); +struct vmspace *vm_vmspace(struct vm *vm); struct vatpic *vm_atpic(struct vm *vm); struct vatpit *vm_atpit(struct vm *vm); struct vpmtmr *vm_pmtmr(struct vm *vm); diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index 0e3ab2845d58fd..d05d979a531a2b 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -2677,9 +2677,8 @@ vcpu_notify_event(struct vcpu *vcpu, bool lapic_intr) } struct vmspace * -vm_get_vmspace(struct vm *vm) +vm_vmspace(struct vm *vm) { - return (vm->vmspace); } diff --git a/sys/amd64/vmm/vmm_dev_machdep.c b/sys/amd64/vmm/vmm_dev_machdep.c index 2d0ceadaedfed0..d8d2b460404c42 100644 --- a/sys/amd64/vmm/vmm_dev_machdep.c +++ b/sys/amd64/vmm/vmm_dev_machdep.c @@ -441,7 +441,7 @@ vmmdev_machdep_ioctl(struct vm *vm, struct vcpu *vcpu, u_long cmd, caddr_t data, break; case VM_GET_GPA_PMAP: gpapte = (struct vm_gpa_pte *)data; - pmap_get_mapping(vmspace_pmap(vm_get_vmspace(vm)), + pmap_get_mapping(vmspace_pmap(vm_vmspace(vm)), gpapte->gpa, gpapte->pte, &gpapte->ptenum); error = 0; break; From fd94571ccf0c9c6521063c08a72a760873e87897 Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Tue, 7 Jan 2025 02:20:05 +0000 Subject: [PATCH 036/143] rawip: Take the inpcb lock when appropriate in rip_ctloutput() Reviewed by: glebius MFC after: 1 week Sponsored by: Klara, Inc. 
Sponsored by: Stormshield Differential Revision: https://reviews.freebsd.org/D48344 --- sys/netinet/raw_ip.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sys/netinet/raw_ip.c b/sys/netinet/raw_ip.c index a6bef1c7e2752f..3a0b9f632fb456 100644 --- a/sys/netinet/raw_ip.c +++ b/sys/netinet/raw_ip.c @@ -625,8 +625,6 @@ rip_send(struct socket *so, int pruflags, struct mbuf *m, struct sockaddr *nam, * * When adding new socket options here, make sure to add access control * checks here as necessary. - * - * XXX-BZ inp locking? */ int rip_ctloutput(struct socket *so, struct sockopt *sopt) @@ -637,7 +635,9 @@ rip_ctloutput(struct socket *so, struct sockopt *sopt) if (sopt->sopt_level != IPPROTO_IP) { if ((sopt->sopt_level == SOL_SOCKET) && (sopt->sopt_name == SO_SETFIB)) { + INP_WLOCK(inp); inp->inp_inc.inc_fibnum = so->so_fibnum; + INP_WUNLOCK(inp); return (0); } return (EINVAL); @@ -707,10 +707,12 @@ rip_ctloutput(struct socket *so, struct sockopt *sopt) sizeof optval); if (error) break; + INP_WLOCK(inp); if (optval) inp->inp_flags |= INP_HDRINCL; else inp->inp_flags &= ~INP_HDRINCL; + INP_WUNLOCK(inp); break; case IP_FW3: /* generic ipfw v.3 functions */ From cc0d806f63e833b9e011c0665905b2208b436c8b Mon Sep 17 00:00:00 2001 From: Konstantin Belousov Date: Mon, 6 Jan 2025 13:01:11 +0200 Subject: [PATCH 037/143] open(2): allow O_PATH | O_CREAT There is no reason to disallow creating the file opened for path. More, it might be a useful feature together with O_EXCL. Reviewed by: markj Sponsored by: The FreeBSD Foundation MFC after: 1 week Differential revision: https://reviews.freebsd.org/D48332 --- sys/kern/vfs_syscalls.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c index 7a1677c945e316..bbd67c2b032697 100644 --- a/sys/kern/vfs_syscalls.c +++ b/sys/kern/vfs_syscalls.c @@ -1185,7 +1185,7 @@ openatfp(struct thread *td, int dirfd, const char *path, * except O_EXEC is ignored. 
*/ if ((flags & O_PATH) != 0) { - flags &= ~(O_CREAT | O_ACCMODE); + flags &= ~O_ACCMODE; } else if ((flags & O_EXEC) != 0) { if (flags & O_ACCMODE) return (EINVAL); From 749b3b2c0629f44f6b0044992dfb2ce5ac7e562b Mon Sep 17 00:00:00 2001 From: Konstantin Belousov Date: Tue, 7 Jan 2025 00:07:07 +0200 Subject: [PATCH 038/143] path_test: adjust test for open(O_PATH | O_CREAT) Instead of failing, it must succeed now. Reviewed by: markj Sponsored by: The FreeBSD Foundation MFC after: 1 week Differential revision: https://reviews.freebsd.org/D48332 --- tests/sys/file/path_test.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/sys/file/path_test.c b/tests/sys/file/path_test.c index 911c7c7075f0d4..b3b8b7cebd4db1 100644 --- a/tests/sys/file/path_test.c +++ b/tests/sys/file/path_test.c @@ -684,10 +684,14 @@ ATF_TC_BODY(path_io, tc) size_t page_size; int error, fd, pathfd, sd[2]; - /* It shouldn't be possible to create new files with O_PATH. */ + /* It is allowed to create new files with O_PATH. */ snprintf(path, sizeof(path), "path_io.XXXXXX"); ATF_REQUIRE_MSG(mktemp(path) == path, FMT_ERR("mktemp")); - ATF_REQUIRE_ERRNO(ENOENT, open(path, O_PATH | O_CREAT, 0600) < 0); + pathfd = open(path, O_PATH | O_CREAT, 0600); + ATF_REQUIRE_MSG(pathfd >= 0, FMT_ERR("open(O_PATH|O_CREAT)")); + /* Ensure that this is indeed O_PATH fd */ + ATF_REQUIRE_ERRNO(EBADF, write(pathfd, path, strlen(path)) == -1); + CHECKED_CLOSE(pathfd); /* Create a non-empty file for use in the rest of the tests. */ mktfile(path, "path_io.XXXXXX"); From 2372f8cc640c2f4ab82831e6ac0e27ab6c18321a Mon Sep 17 00:00:00 2001 From: "Bjoern A. Zeeb" Date: Fri, 20 Dec 2024 14:23:50 +0000 Subject: [PATCH 039/143] LinuxKPI 802.11 / rtw88: make packets flow again In 886653492945f we added checks for packets to only go out if the station is known to the firmware (amongst others) as there are implications in drivers. 
Unfortunately rtw88 does not support the mac80211 (*sta_state)() KPI but only the fallback (*sta_add/remove)() in which case the station is only added to firmware when going from AUTH to ASSOC. That means we had no chance to get authenticated anymore. The problem has existed since June in main and stable/14 but only now was noticed in December with 14.2-R which makes me wonder. I am still not entirely sure what implications the missing checks have on all the other drivers using (*sta_state)() (or if they were really needed in first place beyond txq_ready) but I have run a few days of iwlwifi with this without extra trouble but I was not always able to reproduce problems in the past. Also people are occasionally still reporting the original "Invalid TXQ" error which indicates there is another lingering case somewhere. For the moment make rtw88 work again and expose the change to a wider audience. PR: 283142, 274382 Fixes: 886653492945f (make sure we can send DISASSOC or DEAUTH frames) Tested by: imb protected-networks.net, oleg.nauman gmail.com Sponsored by: The FreeBSD Foundation Differential Revision: https://reviews.freebsd.org/D48161 --- sys/compat/linuxkpi/common/src/linux_80211.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/sys/compat/linuxkpi/common/src/linux_80211.c b/sys/compat/linuxkpi/common/src/linux_80211.c index edc3131286f5f2..d66bc40f40f68d 100644 --- a/sys/compat/linuxkpi/common/src/linux_80211.c +++ b/sys/compat/linuxkpi/common/src/linux_80211.c @@ -3704,7 +3704,16 @@ lkpi_ic_raw_xmit(struct ieee80211_node *ni, struct mbuf *m, lsta = ni->ni_drv_data; LKPI_80211_LSTA_TXQ_LOCK(lsta); +#if 0 if (!lsta->added_to_drv || !lsta->txq_ready) { +#else + /* + * Backout this part of 886653492945f which breaks rtw88 or + * in general drivers without (*sta_state)() but only the + * legacy fallback to (*sta_add)(). 
+ */ + if (!lsta->txq_ready) { +#endif LKPI_80211_LSTA_TXQ_UNLOCK(lsta); /* * Free the mbuf (do NOT release ni ref for the m_pkthdr.rcvif! @@ -3952,7 +3961,16 @@ lkpi_80211_txq_task(void *ctx, int pending) * We also use txq_ready as a semaphore and will drain the txq manually * if needed on our way towards SCAN/INIT in the state machine. */ +#if 0 shall_tx = lsta->added_to_drv && lsta->txq_ready; +#else + /* + * Backout this part of 886653492945f which breaks rtw88 or + * in general drivers without (*sta_state)() but only the + * legacy fallback to (*sta_add)(). + */ + shall_tx = lsta->txq_ready; +#endif if (__predict_true(shall_tx)) mbufq_concat(&mq, &lsta->txq); /* From 91a4107d6d3028acd96df96de33b8a7665d3eb03 Mon Sep 17 00:00:00 2001 From: "Bjoern A. Zeeb" Date: Sat, 28 Dec 2024 10:00:09 +0000 Subject: [PATCH 040/143] ifconfig: remove debug printfs from set80211vhtconf() Anyone testing VHT options would wonder about these extra two printfs by now. Remove them from the tree before I have to do so locally again in another branch. 
Sponsored by: The FreeBSD Foundation Fixes: e9bb7f9aa1b4f MFC after: 1 week Reviewed by: adrian, emaste Differential Revision: https://reviews.freebsd.org/D48319 --- sbin/ifconfig/ifieee80211.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/sbin/ifconfig/ifieee80211.c b/sbin/ifconfig/ifieee80211.c index 25de7fb1363256..396368798da344 100644 --- a/sbin/ifconfig/ifieee80211.c +++ b/sbin/ifconfig/ifieee80211.c @@ -1978,13 +1978,11 @@ set80211vhtconf(if_ctx *ctx, const char *val __unused, int d) { if (get80211val(ctx, IEEE80211_IOC_VHTCONF, &vhtconf) < 0) errx(-1, "cannot set VHT setting"); - printf("%s: vhtconf=0x%08x, d=%d\n", __func__, vhtconf, d); if (d < 0) { d = -d; vhtconf &= ~d; } else vhtconf |= d; - printf("%s: vhtconf is now 0x%08x\n", __func__, vhtconf); set80211(ctx, IEEE80211_IOC_VHTCONF, vhtconf, 0, NULL); } From 2be86b6cc168615e19350710347b77616c4b7f19 Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Tue, 7 Jan 2025 14:30:53 +0000 Subject: [PATCH 041/143] makefs: Remove dead code in inode_type() No functional change intended. MFC after: 1 week --- usr.sbin/makefs/walk.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/usr.sbin/makefs/walk.c b/usr.sbin/makefs/walk.c index 4018652299576f..fe1fe8df80dbcf 100644 --- a/usr.sbin/makefs/walk.c +++ b/usr.sbin/makefs/walk.c @@ -603,8 +603,6 @@ inode_type(mode_t mode) return ("symlink"); if (S_ISDIR(mode)) return ("dir"); - if (S_ISLNK(mode)) - return ("link"); if (S_ISFIFO(mode)) return ("fifo"); if (S_ISSOCK(mode)) From ce878284318e71217d8d8f43f7d590b6c338d3aa Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Tue, 7 Jan 2025 14:31:02 +0000 Subject: [PATCH 042/143] makefs: Handle special file types when creating a zpool Previously, anything other than a regular file, directory or symlink would cause makefs to exit with an assertion failure. Make it a bit more resilient to user error: print a warning and skip the file. Add a regression test wherein we create an image from a devfs mount. 
PR: 283583 MFC after: 2 weeks --- usr.sbin/makefs/tests/makefs_zfs_tests.sh | 22 +++++++++++++ usr.sbin/makefs/zfs/fs.c | 39 +++++++++++++++++++---- 2 files changed, 54 insertions(+), 7 deletions(-) diff --git a/usr.sbin/makefs/tests/makefs_zfs_tests.sh b/usr.sbin/makefs/tests/makefs_zfs_tests.sh index aeda889d9a5c1d..3d5819439a733e 100644 --- a/usr.sbin/makefs/tests/makefs_zfs_tests.sh +++ b/usr.sbin/makefs/tests/makefs_zfs_tests.sh @@ -148,6 +148,27 @@ dataset_removal_cleanup() common_cleanup } +# +# Make sure that we can handle some special file types. Anything other than +# regular files, symlinks and directories are ignored. +# +atf_test_case devfs cleanup +devfs_body() +{ + atf_check mkdir dev + atf_check mount -t devfs none ./dev + + atf_check -e match:"skipping unhandled" $MAKEFS -s 1g -o rootpath=/ \ + -o poolname=$ZFS_POOL_NAME $TEST_IMAGE ./dev + + import_image +} +devfs_cleanup() +{ + common_cleanup + umount -f ./dev +} + # # Make sure that we can create and remove an empty directory. # @@ -842,6 +863,7 @@ atf_init_test_cases() atf_add_test_case autoexpand atf_add_test_case basic atf_add_test_case dataset_removal + atf_add_test_case devfs atf_add_test_case empty_dir atf_add_test_case empty_fs atf_add_test_case file_extend diff --git a/usr.sbin/makefs/zfs/fs.c b/usr.sbin/makefs/zfs/fs.c index 9413241da0c7d5..073dce3ce6978b 100644 --- a/usr.sbin/makefs/zfs/fs.c +++ b/usr.sbin/makefs/zfs/fs.c @@ -177,6 +177,13 @@ fsnode_isroot(const fsnode *cur) return (strcmp(cur->name, ".") == 0); } +static bool +fsnode_valid(const fsnode *cur) +{ + return (cur->type == S_IFREG || cur->type == S_IFDIR || + cur->type == S_IFLNK); +} + /* * Visit each node in a directory hierarchy, in pre-order depth-first order. 
*/ @@ -186,9 +193,11 @@ fsnode_foreach(fsnode *root, int (*cb)(fsnode *, void *), void *arg) assert(root->type == S_IFDIR); for (fsnode *cur = root; cur != NULL; cur = cur->next) { - assert(cur->type == S_IFREG || cur->type == S_IFDIR || - cur->type == S_IFLNK); - + if (!fsnode_valid(cur)) { + warnx("skipping unhandled %s %s/%s", + inode_type(cur->type), cur->path, cur->name); + continue; + } if (cb(cur, arg) == 0) continue; if (cur->type == S_IFDIR && cur->child != NULL) @@ -381,9 +390,15 @@ fs_populate_sattrs(struct fs_populate_arg *arg, const fsnode *cur, */ for (fsnode *c = fsnode_isroot(cur) ? cur->next : cur->child; c != NULL; c = c->next) { - if (c->type == S_IFDIR) + switch (c->type) { + case S_IFDIR: links++; - objsize++; + /* FALLTHROUGH */ + case S_IFREG: + case S_IFLNK: + objsize++; + break; + } } /* The root directory is its own parent. */ @@ -652,6 +667,16 @@ fs_populate_symlink(fsnode *cur, struct fs_populate_arg *arg) fs_populate_sattrs(arg, cur, dnode); } +static fsnode * +fsnode_next(fsnode *cur) +{ + for (cur = cur->next; cur != NULL; cur = cur->next) { + if (fsnode_valid(cur)) + return (cur); + } + return (NULL); +} + static int fs_foreach_populate(fsnode *cur, void *_arg) { @@ -678,7 +703,7 @@ fs_foreach_populate(fsnode *cur, void *_arg) ret = (cur->inode->flags & FI_ROOT) != 0 ? 0 : 1; - if (cur->next == NULL && + if (fsnode_next(cur) == NULL && (cur->child == NULL || (cur->inode->flags & FI_ROOT) != 0)) { /* * We reached a terminal node in a subtree. 
Walk back up and @@ -694,7 +719,7 @@ fs_foreach_populate(fsnode *cur, void *_arg) eclose(dir->dirfd); free(dir); cur = cur->parent; - } while (cur != NULL && cur->next == NULL && + } while (cur != NULL && fsnode_next(cur) == NULL && (cur->inode->flags & FI_ROOT) == 0); } From 596ee234ef4537e71f030e13598ecbe73ee697bb Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Tue, 7 Jan 2025 14:31:15 +0000 Subject: [PATCH 043/143] ktrace: Make -t t trace struct arrays as well as structs Otherwise there is no specific -t option which captures struct arrays. MFC after: 1 week --- usr.bin/ktrace/ktrace.1 | 2 +- usr.bin/ktrace/subr.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/usr.bin/ktrace/ktrace.1 b/usr.bin/ktrace/ktrace.1 index 6542bb88dffd8c..c2f046dc52872c 100644 --- a/usr.bin/ktrace/ktrace.1 +++ b/usr.bin/ktrace/ktrace.1 @@ -132,7 +132,7 @@ trace capability check failures .It Cm s trace signal processing .It Cm t -trace various structures +trace various structures and arrays of structures .It Cm u userland traces generated by .Xr utrace 2 diff --git a/usr.bin/ktrace/subr.c b/usr.bin/ktrace/subr.c index 6762fe9620cbd8..1db4c214414b08 100644 --- a/usr.bin/ktrace/subr.c +++ b/usr.bin/ktrace/subr.c @@ -70,7 +70,7 @@ getpoints(char *s) facs |= KTRFAC_PSIG; break; case 't': - facs |= KTRFAC_STRUCT; + facs |= KTRFAC_STRUCT | KTRFAC_STRUCT_ARRAY; break; case 'u': facs |= KTRFAC_USER; From d3bdfa583044dbfb76ef777939b86bb68baebee7 Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Tue, 7 Jan 2025 14:33:06 +0000 Subject: [PATCH 044/143] bhyve: Use a non-blocking read in slirp_recv() When using the slirp backend with the e1000 frontend, I otherwise get hangs in readv(), caused by the e1000 emulation not checking whether bytes are available before trying to read them. In particular, that device model expects the recv callback to return 0 if no bytes are available, and with slirp it would end up blocking forever. 
The virtio device model uses the peek_recvlen to check first, so I didn't notice the problem when implementing the slirp backend. Make the slirp backend more flexible to accommodate e1000. MFC after: 1 month Differential Revision: https://reviews.freebsd.org/D48164 --- usr.sbin/bhyve/net_backend_slirp.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/usr.sbin/bhyve/net_backend_slirp.c b/usr.sbin/bhyve/net_backend_slirp.c index 5ae33801387cd2..d070d2cdfdb6fb 100644 --- a/usr.sbin/bhyve/net_backend_slirp.c +++ b/usr.sbin/bhyve/net_backend_slirp.c @@ -609,11 +609,22 @@ static ssize_t slirp_recv(struct net_backend *be, const struct iovec *iov, int iovcnt) { struct slirp_priv *priv = NET_BE_PRIV(be); + struct msghdr hdr; ssize_t n; - n = readv(priv->pipe[0], iov, iovcnt); - if (n < 0) + hdr.msg_name = NULL; + hdr.msg_namelen = 0; + hdr.msg_iov = __DECONST(struct iovec *, iov); + hdr.msg_iovlen = iovcnt; + hdr.msg_control = NULL; + hdr.msg_controllen = 0; + hdr.msg_flags = 0; + n = recvmsg(priv->pipe[0], &hdr, MSG_DONTWAIT); + if (n < 0) { + if (errno == EWOULDBLOCK) + return (0); return (-1); + } assert(n <= SLIRP_MTU); return (n); } From 20a51e6073f488440e108c7c628231cd6ae6757e Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Tue, 7 Jan 2025 14:33:45 +0000 Subject: [PATCH 045/143] bhyve: Implement the libslirp notify callback libslirp can invoke a callback when received data is removed from a socket buffer, generally because the guest ACKed some data. Previously it didn't do anything, but it needs to wake up the poll thread to get reasonable throughput. Suppose one is using scp to copy data into a guest filesystem via the slirp backend. Data is received on libslirp's socket, which we poll for data in slirp_pollfd_td_loop(). That data gets buffered in priv->pipe, and eventually is placed in the device model's RX rings by the backend's mevent handler. 
When implementing TCP, libslirp holds on to a copy of data until it's ACKed by the guest via slirp_send(), at which point it drops that data and invokes the notify callback. The initial implementation of this backend didn't take into account the fact that slirp_pollfds_fill() will not add libslirp's socket to the pollfd set if more than a threshold amount of data is already buffered. Then poll() needs to time out before the backend sends more data to the guest. With a default timeout of 500ms, this kills throughput. Use a pipe to implement a simple in-band signal to the poll thread so that it reacts quickly when more buffer space becomes available. MFC after: 1 month Differential Revision: https://reviews.freebsd.org/D48192 --- usr.sbin/bhyve/net_backend_slirp.c | 90 ++++++++++++++++++++++-------- 1 file changed, 68 insertions(+), 22 deletions(-) diff --git a/usr.sbin/bhyve/net_backend_slirp.c b/usr.sbin/bhyve/net_backend_slirp.c index d070d2cdfdb6fb..171c5b5bdbbdb7 100644 --- a/usr.sbin/bhyve/net_backend_slirp.c +++ b/usr.sbin/bhyve/net_backend_slirp.c @@ -84,6 +84,18 @@ static slirp_new_p_t slirp_new_p; static slirp_pollfds_fill_p_t slirp_pollfds_fill_p; static slirp_pollfds_poll_p_t slirp_pollfds_poll_p; +static void +checked_close(int *fdp) +{ + int error; + + if (*fdp != -1) { + error = close(*fdp); + assert(error == 0); + *fdp = -1; + } +} + static int slirp_init_once(void) { @@ -134,7 +146,8 @@ struct slirp_priv { #define SLIRP_MTU 2048 struct mevent *mevp; - int pipe[2]; + int pipe[2]; /* used to buffer data sent to the guest */ + int wakeup[2]; /* used to wake up the pollfd thread */ pthread_t pollfd_td; struct pollfd *pollfds; @@ -151,6 +164,7 @@ slirp_priv_init(struct slirp_priv *priv) memset(priv, 0, sizeof(*priv)); priv->pipe[0] = priv->pipe[1] = -1; + priv->wakeup[0] = priv->wakeup[1] = -1; error = pthread_mutex_init(&priv->mtx, NULL); assert(error == 0); } @@ -160,14 +174,10 @@ slirp_priv_cleanup(struct slirp_priv *priv) { int error; - if 
(priv->pipe[0] != -1) { - error = close(priv->pipe[0]); - assert(error == 0); - } - if (priv->pipe[1] != -1) { - error = close(priv->pipe[1]); - assert(error == 0); - } + checked_close(&priv->pipe[0]); + checked_close(&priv->pipe[1]); + checked_close(&priv->wakeup[0]); + checked_close(&priv->wakeup[1]); if (priv->mevp) mevent_delete(priv->mevp); if (priv->slirp != NULL) @@ -188,8 +198,13 @@ slirp_cb_clock_get_ns(void *param __unused) } static void -slirp_cb_notify(void *param __unused) +slirp_cb_notify(void *param) { + struct slirp_priv *priv; + + /* Wake up the poll thread. We assume that priv->mtx is held here. */ + priv = param; + (void)write(priv->wakeup[1], "M", 1); } static void @@ -310,11 +325,19 @@ slirp_poll_revents(int idx, void *param) { struct slirp_priv *priv; struct pollfd *pollfd; + short revents; priv = param; + assert(idx >= 0); + assert((unsigned int)idx < priv->npollfds); pollfd = &priv->pollfds[idx]; assert(pollfd->fd != -1); - return (pollev2slirpev(pollfd->revents)); + + /* The kernel may report POLLHUP even if we didn't ask for it. */ + revents = pollfd->revents; + if ((pollfd->events & POLLHUP) == 0) + revents &= ~POLLHUP; + return (pollev2slirpev(revents)); } static void * @@ -331,9 +354,14 @@ slirp_pollfd_td_loop(void *param) pthread_mutex_lock(&priv->mtx); for (;;) { + int wakeup; + for (size_t i = 0; i < priv->npollfds; i++) priv->pollfds[i].fd = -1; + /* Register for notifications from slirp_cb_notify(). 
*/ + wakeup = slirp_addpoll_cb(priv->wakeup[0], POLLIN, priv); + timeout = UINT32_MAX; slirp_pollfds_fill_p(priv->slirp, &timeout, slirp_addpoll_cb, priv); @@ -341,20 +369,32 @@ slirp_pollfd_td_loop(void *param) pollfds = priv->pollfds; npollfds = priv->npollfds; pthread_mutex_unlock(&priv->mtx); - for (;;) { - error = poll(pollfds, npollfds, timeout); - if (error == -1) { - if (errno != EINTR) { - EPRINTLN("poll: %s", strerror(errno)); - exit(1); - } - continue; - } - break; + error = poll(pollfds, npollfds, timeout); + if (error == -1 && errno != EINTR) { + EPRINTLN("poll: %s", strerror(errno)); + exit(1); } pthread_mutex_lock(&priv->mtx); slirp_pollfds_poll_p(priv->slirp, error == -1, slirp_poll_revents, priv); + + /* + * If we were woken up by the notify callback, mask the + * interrupt. + */ + if ((pollfds[wakeup].revents & POLLIN) != 0) { + ssize_t n; + + do { + uint8_t b; + + n = read(priv->wakeup[0], &b, 1); + } while (n == 1); + if (n != -1 || errno != EAGAIN) { + EPRINTLN("read(wakeup): %s", strerror(errno)); + exit(1); + } + } } } @@ -510,12 +550,18 @@ _slirp_init(struct net_backend *be, const char *devname __unused, free(tofree); } - error = socketpair(PF_LOCAL, SOCK_DGRAM, 0, priv->pipe); + error = socketpair(PF_LOCAL, SOCK_DGRAM | SOCK_CLOEXEC, 0, priv->pipe); if (error != 0) { EPRINTLN("Unable to create pipe: %s", strerror(errno)); goto err; } + error = pipe2(priv->wakeup, O_CLOEXEC | O_NONBLOCK); + if (error != 0) { + EPRINTLN("Unable to create wakeup pipe: %s", strerror(errno)); + goto err; + } + /* * Try to avoid dropping buffered packets in slirp_cb_send_packet(). */ From f1aeb5d850cf26418fb70a16d1304b92c45b5f1d Mon Sep 17 00:00:00 2001 From: "Bjoern A. Zeeb" Date: Tue, 7 Jan 2025 12:16:57 +0000 Subject: [PATCH 046/143] LinuxKPI: 802.11: add a print mask for ieee80211_rx_status_flags bits Add a print mask for use with %b to aid debugging. It is a lot easier to read names than numbers. 
Sponsored by: The FreeBSD Foundation MFC after: 3 days --- sys/compat/linuxkpi/common/include/net/mac80211.h | 11 +++++++++++ sys/compat/linuxkpi/common/src/linux_80211.c | 4 ++-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/sys/compat/linuxkpi/common/include/net/mac80211.h b/sys/compat/linuxkpi/common/include/net/mac80211.h index dff152caf14094..3aa383554e9310 100644 --- a/sys/compat/linuxkpi/common/include/net/mac80211.h +++ b/sys/compat/linuxkpi/common/include/net/mac80211.h @@ -625,6 +625,17 @@ enum ieee80211_rx_status_flags { RX_FLAG_FAILED_PLCP_CRC = BIT(31), }; +#define IEEE80211_RX_STATUS_FLAGS_BITS \ + "\20\1ALLOW_SAME_PN\2AMPDU_DETAILS\3AMPDU_EOF_BIT\4AMPDU_EOF_BIT_KNOWN" \ + "\5DECRYPTED\6DUP_VALIDATED\7FAILED_FCS_CRC\10ICV_STRIPPED" \ + "\11MACTIME_PLCP_START\12MACTIME_START\13MIC_STRIPPED" \ + "\14MMIC_ERROR\15MMIC_STRIPPED\16NO_PSDU\17PN_VALIDATED" \ + "\20RADIOTAP_HE\21RADIOTAP_HE_MU\22RADIOTAP_LSIG\23RADIOTAP_VENDOR_DATA" \ + "\24NO_SIGNAL_VAL\25IV_STRIPPED\26AMPDU_IS_LAST\27AMPDU_LAST_KNOWN" \ + "\30AMSDU_MORE\31MACTIME_END\32ONLY_MONITOR\33SKIP_MONITOR" \ + "\348023\35RADIOTAP_TLV_AT_END\36MACTIME\37MACTIME_IS_RTAP_TS64" \ + "\40FAILED_PLCP_CRC" + enum mac80211_rx_encoding { RX_ENC_LEGACY = 0, RX_ENC_HT, diff --git a/sys/compat/linuxkpi/common/src/linux_80211.c b/sys/compat/linuxkpi/common/src/linux_80211.c index d66bc40f40f68d..77e23775ba1951 100644 --- a/sys/compat/linuxkpi/common/src/linux_80211.c +++ b/sys/compat/linuxkpi/common/src/linux_80211.c @@ -5333,13 +5333,13 @@ linuxkpi_ieee80211_rx(struct ieee80211_hw *hw, struct sk_buff *skb, /* Implement a dump_rxcb() !!! 
*/ if (linuxkpi_debug_80211 & D80211_TRACE_RX) - printf("TRACE-RX: %s: RXCB: %ju %ju %u, %#0x, %u, %#0x, %#0x, " + printf("TRACE-RX: %s: RXCB: %ju %ju %u, %b, %u, %#0x, %#0x, " "%u band %u, %u { %d %d %d %d }, %d, %#x %#x %#x %#x %u %u %u\n", __func__, (uintmax_t)rx_status->boottime_ns, (uintmax_t)rx_status->mactime, rx_status->device_timestamp, - rx_status->flag, + rx_status->flag, IEEE80211_RX_STATUS_FLAGS_BITS, rx_status->freq, rx_status->bw, rx_status->encoding, From cf71349a23f02b55bd3bb4973decac87f7a7d2b8 Mon Sep 17 00:00:00 2001 From: "Bjoern A. Zeeb" Date: Sat, 4 Jan 2025 08:01:24 +0000 Subject: [PATCH 047/143] ifconfig: 802.11: fix indentation of a line No functional changes. Sponsored by: The FreeBSD Foundation MFC after: 3 days --- sbin/ifconfig/ifieee80211.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sbin/ifconfig/ifieee80211.c b/sbin/ifconfig/ifieee80211.c index 396368798da344..77f7bdabb0b2fc 100644 --- a/sbin/ifconfig/ifieee80211.c +++ b/sbin/ifconfig/ifieee80211.c @@ -2296,7 +2296,7 @@ regdomain_addchans(if_ctx *ctx, struct ieee80211req_chaninfo *ci, memset(c, 0, sizeof(*c)); c->ic_freq = freq; c->ic_flags = flags; - if (c->ic_flags & IEEE80211_CHAN_DFS) + if (c->ic_flags & IEEE80211_CHAN_DFS) c->ic_maxregpower = nb->maxPowerDFS; else c->ic_maxregpower = nb->maxPower; From e6d40f90110ad8026f1af3fa68f836463936ea78 Mon Sep 17 00:00:00 2001 From: "Bjoern A. Zeeb" Date: Tue, 7 Jan 2025 11:56:07 +0000 Subject: [PATCH 048/143] net80211: correct typo s/Insure/Ensure/ No functional changes. 
Sponsored by: The FreeBSD Foundation MFC after: 3 days Reviewed by: emaste Differential Revision: https://reviews.freebsd.org/D48358 --- sys/net80211/ieee80211_crypto.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sys/net80211/ieee80211_crypto.c b/sys/net80211/ieee80211_crypto.c index d70b3aa4a24a12..e1fac3a624e8e3 100644 --- a/sys/net80211/ieee80211_crypto.c +++ b/sys/net80211/ieee80211_crypto.c @@ -741,7 +741,7 @@ ieee80211_crypto_decap(struct ieee80211_node *ni, struct mbuf *m, int hdrlen, k = &ni->ni_ucastkey; /* - * Insure crypto header is contiguous and long enough for all + * Ensure crypto header is contiguous and long enough for all * decap work. */ cip = k->wk_cipher; From 6ba2c036a0117ac02f9979b7dc49f15e9c1ea9c9 Mon Sep 17 00:00:00 2001 From: Konstantin Belousov Date: Tue, 7 Jan 2025 01:29:18 +0200 Subject: [PATCH 049/143] pci_find_cap_method(): limit number of iterations for finding a capability Powered down device might return 0xff of extended config registers reads, causing loop. PR: 283815 Reviewed by: imp Sponsored by: The FreeBSD Foundation MFC after: 1 week Differential revision: https://reviews.freebsd.org/D48348 --- sys/dev/pci/pci.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sys/dev/pci/pci.c b/sys/dev/pci/pci.c index cf01115581e16b..0b02d873c5a190 100644 --- a/sys/dev/pci/pci.c +++ b/sys/dev/pci/pci.c @@ -1519,6 +1519,7 @@ pci_find_cap_method(device_t dev, device_t child, int capability, pcicfgregs *cfg = &dinfo->cfg; uint32_t status; uint8_t ptr; + int cnt; /* * Check the CAP_LIST bit of the PCI status register first. @@ -1545,9 +1546,11 @@ pci_find_cap_method(device_t dev, device_t child, int capability, ptr = pci_read_config(child, ptr, 1); /* - * Traverse the capabilities list. + * Traverse the capabilities list. Limit by total theoretical + * maximum number of caps: capability needs at least id and + * next registers, and any type X header cannot contain caps.
*/ - while (ptr != 0) { + for (cnt = 0; ptr != 0 && cnt < (PCIE_REGMAX - 0x40) / 2; cnt++) { if (pci_read_config(child, ptr + PCICAP_ID, 1) == capability) { if (capreg != NULL) *capreg = ptr; From f73c9b5da190954a81e9e70e2caa8e9168623bfd Mon Sep 17 00:00:00 2001 From: Ed Maste Date: Tue, 7 Jan 2025 11:04:00 -0500 Subject: [PATCH 050/143] mi_switch.9: Remove cpu_switch, cpu_throw cpu_machdep.9 was added to document cpu_*, but cpu_switch and cpu_throw were already documented in mi_switch.9, and MLINKed. cpu_machdep.9 seems like the correct place for this, so remove them from mi_switch.9. Some of the removed text was stale, although there are few notes that ought to be added to cpu_machdep.9 in a future commit. Reported by: tools/pkgbase/metalog_reader.lua Reviewed by: jhb Sponsored by: The FreeBSD Foundation Fixes: 9c87cbbcaaed ("cpu_machdep.9: New manpage describing the semantics of several cpu_*") Differential Revision: https://reviews.freebsd.org/D48360 --- share/man/man9/Makefile | 2 -- share/man/man9/mi_switch.9 | 68 +++----------------------------------- 2 files changed, 4 insertions(+), 66 deletions(-) diff --git a/share/man/man9/Makefile b/share/man/man9/Makefile index 91a7bbe294fa1d..c09d3aa554a115 100644 --- a/share/man/man9/Makefile +++ b/share/man/man9/Makefile @@ -1594,8 +1594,6 @@ MLINKS+=microuptime.9 binuptime.9 \ microuptime.9 getsbinuptime.9 \ microuptime.9 nanouptime.9 \ microuptime.9 sbinuptime.9 -MLINKS+=mi_switch.9 cpu_switch.9 \ - mi_switch.9 cpu_throw.9 MLINKS+=mod_cc.9 CCV.9 \ mod_cc.9 DECLARE_CC_MODULE.9 MLINKS+=mtx_pool.9 mtx_pool_alloc.9 \ diff --git a/share/man/man9/mi_switch.9 b/share/man/man9/mi_switch.9 index 549ec497434cc6..e04c2ee35acebe 100644 --- a/share/man/man9/mi_switch.9 +++ b/share/man/man9/mi_switch.9 @@ -31,23 +31,17 @@ .\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE .\" POSSIBILITY OF SUCH DAMAGE. 
.\" -.Dd January 9, 2023 +.Dd January 7, 2025 .Dt MI_SWITCH 9 .Os .Sh NAME -.Nm mi_switch , -.Nm cpu_switch , -.Nm cpu_throw +.Nm mi_switch .Nd switch to another thread context .Sh SYNOPSIS .In sys/param.h .In sys/proc.h .Ft void .Fn mi_switch "int flags" -.Ft void -.Fn cpu_switch "struct thread *oldtd" "struct thread *newtd" "struct mtx *lock" -.Ft void -.Fn cpu_throw "struct thread *oldtd" "struct thread *newtd" .Sh DESCRIPTION The .Fn mi_switch @@ -168,63 +162,9 @@ running thread .Fa oldtd to the chosen thread .Fa newtd . -First, it saves the context of -.Fa oldtd -to its Process Control Block -.Po -PCB, -.Vt struct pcb -.Pc , -pointed at by -.Va oldtd->td_pcb . -The function then updates important per-CPU state such as the -.Dv curthread -variable, and activates -.Fa newtd\&'s -virtual address space using its associated -.Xr pmap 9 -structure. -Finally, it reads in the saved context from -.Fa newtd\&'s -PCB. -CPU instruction flow continues in the new thread context, on -.Fa newtd\&'s -kernel stack. -The return from -.Fn cpu_switch -can be understood as a completion of the function call initiated by -.Fa newtd -when it was previously switched out, at some point in the distant (relative to -CPU time) past. -.Pp -The -.Fa mtx -argument to -.Fn cpu_switch -is used to pass the mutex which will be stored as -.Fa oldtd\&'s -thread lock at the moment that -.Fa oldtd -is completely switched out. -This is an implementation detail of -.Fn sched_switch . -.Pp -.Fn cpu_throw -is similar to -.Fn cpu_switch -except that it does not save the context of the old thread. -This function is useful when the kernel does not have an old thread -context to save, such as when CPUs other than the boot CPU perform their -first task switch, or when the kernel does not care about the state of the -old thread, such as in -.Xr thread_exit 9 -when the kernel terminates the current thread and switches into a new -thread, -.Fa newtd . -The -.Fa oldtd -argument is unused. 
.Sh SEE ALSO +.Xr cpu_switch 9 , +.Xr cpu_throw 9 , .Xr critical_exit 9 , .Xr issignal 9 , .Xr kern_yield 9 , From 07f6575585bf69ae48dffe87c4578057ae4782d8 Mon Sep 17 00:00:00 2001 From: "Bjoern A. Zeeb" Date: Sat, 28 Dec 2024 09:52:45 +0000 Subject: [PATCH 051/143] LinuxKPI: 802.11: turn on debugfs for iwlwifi and rtw88 Make iwlwifi compile with debugfs after the last updates and turn it on for both iwlwifi and rtw88 in order to be able to get at least some useful information on driver/firmware state. Sponsored by: The FreeBSD Foundation MFC after: 10 days --- sys/compat/linuxkpi/common/include/net/mac80211.h | 4 ++++ sys/contrib/dev/iwlwifi/mvm/debugfs-vif.c | 2 ++ sys/modules/iwlwifi/Makefile | 2 +- sys/modules/rtw88/Makefile | 5 ++++- 4 files changed, 11 insertions(+), 2 deletions(-) diff --git a/sys/compat/linuxkpi/common/include/net/mac80211.h b/sys/compat/linuxkpi/common/include/net/mac80211.h index 3aa383554e9310..fe36f1adf28ade 100644 --- a/sys/compat/linuxkpi/common/include/net/mac80211.h +++ b/sys/compat/linuxkpi/common/include/net/mac80211.h @@ -737,6 +737,7 @@ struct ieee80211_sta_agg { }; struct ieee80211_link_sta { + struct ieee80211_sta *sta; uint8_t addr[ETH_ALEN]; uint8_t link_id; uint32_t supp_rates[NUM_NL80211_BANDS]; @@ -1121,6 +1122,9 @@ struct ieee80211_ops { /* #ifdef CONFIG_MAC80211_DEBUGFS */ /* Do not change depending on compile-time option.
*/ void (*sta_add_debugfs)(struct ieee80211_hw *, struct ieee80211_vif *, struct ieee80211_sta *, struct dentry *); + void (*vif_add_debugfs)(struct ieee80211_hw *, struct ieee80211_vif *); + void (*link_sta_add_debugfs)(struct ieee80211_hw *, struct ieee80211_vif *, struct ieee80211_link_sta *, struct dentry *); + void (*link_add_debugfs)(struct ieee80211_hw *, struct ieee80211_vif *, struct ieee80211_bss_conf *, struct dentry *); /* #endif */ }; diff --git a/sys/contrib/dev/iwlwifi/mvm/debugfs-vif.c b/sys/contrib/dev/iwlwifi/mvm/debugfs-vif.c index aa505895532349..0b3bc62f39a707 100644 --- a/sys/contrib/dev/iwlwifi/mvm/debugfs-vif.c +++ b/sys/contrib/dev/iwlwifi/mvm/debugfs-vif.c @@ -888,10 +888,12 @@ void iwl_mvm_vif_add_debugfs(struct ieee80211_hw *hw, struct ieee80211_vif *vif) void iwl_mvm_vif_dbgfs_add_link(struct iwl_mvm *mvm, struct ieee80211_vif *vif) { struct dentry *dbgfs_dir = vif->debugfs_dir; +#if defined(__linux__) struct iwl_mvm_vif *mvmvif = iwl_mvm_vif_from_mac80211(vif); char buf[3 * 3 + 11 + (NL80211_WIPHY_NAME_MAXLEN + 1) + (7 + IFNAMSIZ + 1) + 6 + 1]; char name[7 + IFNAMSIZ + 1]; +#endif /* this will happen in monitor mode */ if (!dbgfs_dir) diff --git a/sys/modules/iwlwifi/Makefile b/sys/modules/iwlwifi/Makefile index a8176383a13c04..c41a1a1757c1ab 100644 --- a/sys/modules/iwlwifi/Makefile +++ b/sys/modules/iwlwifi/Makefile @@ -3,7 +3,7 @@ DEVIWLWIFIDIR= ${SRCTOP}/sys/contrib/dev/iwlwifi .PATH: ${DEVIWLWIFIDIR} WITH_CONFIG_PM= 0 -WITH_DEBUGFS= 0 +WITH_DEBUGFS= 1 KMOD= if_iwlwifi diff --git a/sys/modules/rtw88/Makefile b/sys/modules/rtw88/Makefile index 19e77b271c3722..486197d8c95257 100644 --- a/sys/modules/rtw88/Makefile +++ b/sys/modules/rtw88/Makefile @@ -3,6 +3,7 @@ DEVRTW88DIR= ${SRCTOP}/sys/contrib/dev/rtw88 .PATH: ${DEVRTW88DIR} WITH_CONFIG_PM= 0 +WITH_DEBUGFS= 1 KMOD= if_rtw88 @@ -39,6 +40,8 @@ CFLAGS+= -DLINUXKPI_VERSION=60800 CFLAGS+= -I${DEVRTW88DIR} CFLAGS+= ${LINUXKPI_INCLUDES} CFLAGS+= -DCONFIG_RTW88_DEBUG -#CFLAGS+= 
-DCONFIG_RTW88_DEBUGFS +.if defined(WITH_DEBUGFS) && ${WITH_DEBUGFS} > 0 +CFLAGS+= -DCONFIG_RTW88_DEBUGFS +.endif .include From 42410c6d682c4e00ce6147f99b51a55f6f3fe075 Mon Sep 17 00:00:00 2001 From: "Bjoern A. Zeeb" Date: Sat, 4 Jan 2025 16:58:49 +0000 Subject: [PATCH 052/143] ifconfig: make -vht work Also hide the other vht options on -vht and only show vht40/80/160/80p80 when vht is enabled. While here fix some whitespace and comments. Sponsored by: The FreeBSD Foundation MFC after: 3 days Reviewed by: adrian, emaste Differential Revision: https://reviews.freebsd.org/D48326 --- sbin/ifconfig/ifieee80211.c | 45 ++++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/sbin/ifconfig/ifieee80211.c b/sbin/ifconfig/ifieee80211.c index 77f7bdabb0b2fc..d4dcfdf63254ff 100644 --- a/sbin/ifconfig/ifieee80211.c +++ b/sbin/ifconfig/ifieee80211.c @@ -198,8 +198,10 @@ static int gottxparams = 0; static struct ieee80211_channel curchan; static int gotcurchan = 0; static struct ifmediareq *global_ifmr; + +/* HT */ static int htconf = 0; -static int gothtconf = 0; +static int gothtconf = 0; static void gethtconf(if_ctx *ctx) @@ -213,7 +215,7 @@ gethtconf(if_ctx *ctx) /* VHT */ static int vhtconf = 0; -static int gotvhtconf = 0; +static int gotvhtconf = 0; static void getvhtconf(if_ctx *ctx) @@ -5416,26 +5418,27 @@ ieee80211_status(if_ctx *ctx) if (IEEE80211_IS_CHAN_VHT(c) || verbose) { getvhtconf(ctx); - if (vhtconf & IEEE80211_FVHT_VHT) + if (vhtconf & IEEE80211_FVHT_VHT) { LINE_CHECK("vht"); - else + + if (vhtconf & IEEE80211_FVHT_USEVHT40) + LINE_CHECK("vht40"); + else + LINE_CHECK("-vht40"); + if (vhtconf & IEEE80211_FVHT_USEVHT80) + LINE_CHECK("vht80"); + else + LINE_CHECK("-vht80"); + if (vhtconf & IEEE80211_FVHT_USEVHT160) + LINE_CHECK("vht160"); + else + LINE_CHECK("-vht160"); + if (vhtconf & IEEE80211_FVHT_USEVHT80P80) + LINE_CHECK("vht80p80"); + else + LINE_CHECK("-vht80p80"); + } else if (verbose) LINE_CHECK("-vht"); - if 
(vhtconf & IEEE80211_FVHT_USEVHT40) - LINE_CHECK("vht40"); - else - LINE_CHECK("-vht40"); - if (vhtconf & IEEE80211_FVHT_USEVHT80) - LINE_CHECK("vht80"); - else - LINE_CHECK("-vht80"); - if (vhtconf & IEEE80211_FVHT_USEVHT160) - LINE_CHECK("vht160"); - else - LINE_CHECK("-vht160"); - if (vhtconf & IEEE80211_FVHT_USEVHT80P80) - LINE_CHECK("vht80p80"); - else - LINE_CHECK("-vht80p80"); } if (get80211val(ctx, IEEE80211_IOC_WME, &wme) != -1) { @@ -6029,7 +6032,7 @@ static struct cmd ieee80211_cmds[] = { DEF_CMD("ht", 3, set80211htconf), /* NB: 20+40 */ DEF_CMD("-ht", 0, set80211htconf), DEF_CMD("vht", IEEE80211_FVHT_VHT, set80211vhtconf), - DEF_CMD("-vht", 0, set80211vhtconf), + DEF_CMD("-vht", -IEEE80211_FVHT_VHT, set80211vhtconf), DEF_CMD("vht40", IEEE80211_FVHT_USEVHT40, set80211vhtconf), DEF_CMD("-vht40", -IEEE80211_FVHT_USEVHT40, set80211vhtconf), DEF_CMD("vht80", IEEE80211_FVHT_USEVHT80, set80211vhtconf), From 1832eb102e10c7f2891c032ecf7b265b75d3cd50 Mon Sep 17 00:00:00 2001 From: "Bjoern A. Zeeb" Date: Sat, 4 Jan 2025 08:02:19 +0000 Subject: [PATCH 053/143] net80211: add missing 80Mhz and 160Mhz channel ranges We have two arrays, one for 80Mhz and one for 160Mhz. Both were lacking frequency ranges for more possibly available configurations (the other bits of what is valid are for regdomain to set right). 
Sponsored by: The FreeBSD Foundation MFC after: 3 days Fixes: 67f4aa3878efa, 04e7bb08a5750 Reviewed by: adrian Differential Revision: https://reviews.freebsd.org/D48357 --- sys/net80211/ieee80211.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sys/net80211/ieee80211.c b/sys/net80211/ieee80211.c index ccb7efaa4df584..49d313e5077d2b 100644 --- a/sys/net80211/ieee80211.c +++ b/sys/net80211/ieee80211.c @@ -1196,12 +1196,14 @@ struct vht_chan_range vht80_chan_ranges[] = { { 5570, 5650 }, { 5650, 5730 }, { 5735, 5815 }, + { 5815, 5895 }, { 0, 0 } }; struct vht_chan_range vht160_chan_ranges[] = { { 5170, 5330 }, { 5490, 5650 }, + { 5735, 5895 }, { 0, 0 } }; From 254a2b767f9a39f1541e0a07a70bbe269e86ad70 Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Tue, 7 Jan 2025 17:58:58 +0000 Subject: [PATCH 054/143] x86: Short-circuit ipi_all_but_self() on UP systems Apparently this is required on old intel hw, see https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=275086#c3 PR: 275086 Reviewed by: mav, kib Fixes: 279cd05b7e4d ("Use APIC_IPI_DEST_OTHERS for bitmapped IPIs too.") MFC after: 1 week Diagnosed by: Ben Wilber Differential Revision: https://reviews.freebsd.org/D48361 --- sys/x86/x86/mp_x86.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sys/x86/x86/mp_x86.c b/sys/x86/x86/mp_x86.c index 493017e303e3d9..c0da41a4d2229b 100644 --- a/sys/x86/x86/mp_x86.c +++ b/sys/x86/x86/mp_x86.c @@ -1425,6 +1425,9 @@ ipi_all_but_self(u_int ipi) cpuset_t other_cpus; int cpu, c; + if (mp_ncpus == 1) + return; + /* * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit * of help in order to understand what is the source. From a3a308f0f29b14d522c56dd88231e27fdf206104 Mon Sep 17 00:00:00 2001 From: "Bjoern A. Zeeb" Date: Sat, 4 Jan 2025 07:58:48 +0000 Subject: [PATCH 055/143] lib80211: regdomain: add the two other 160MHz bands ETSI had one 160Mhz band in regdomain but the other two were missing. Add them. 
I am always confused that the bands use the center frequency of the 20Mhz edge channels rather than the actual edges so it seems we are only configuring 140Mhz instead of 160Mhz. We will have to go through the entire regdomain file one day and make sure to verify all the power levels. Sponsored by: The FreeBSD Foundation MFC after: 3 days Reviewed by: adrian Differential Revision: https://reviews.freebsd.org/D48356 --- lib/lib80211/regdomain.xml | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/lib/lib80211/regdomain.xml b/lib/lib80211/regdomain.xml index 5a432f39ccf644..557af0349cb05a 100644 --- a/lib/lib80211/regdomain.xml +++ b/lib/lib80211/regdomain.xml @@ -572,6 +572,13 @@ IEEE80211_CHAN_VHT80 INDOOR + + + 22 + IEEE80211_CHAN_HT40 + IEEE80211_CHAN_VHT160 + IEEE80211_CHAN_DFS + @@ -651,6 +658,13 @@ IEEE80211_CHAN_VHT80 IEEE80211_CHAN_DFS + + + 13 + IEEE80211_CHAN_HT40 + IEEE80211_CHAN_VHT160 + IEEE80211_CHAN_DFS + @@ -1905,6 +1919,11 @@ 80 20 IEEE80211_CHAN_A + + 5180 5320 + 160 20 + IEEE80211_CHAN_A + 5260 5340 @@ -1958,6 +1977,11 @@ 80 20 IEEE80211_CHAN_A + + 5745 5885 + 160 20 + IEEE80211_CHAN_A + 5180 5240 40 20 From 5fdc4824a5e2646a07c0638eca9f5c81b0b85fd5 Mon Sep 17 00:00:00 2001 From: "Bjoern A. Zeeb" Date: Sat, 4 Jan 2025 08:06:58 +0000 Subject: [PATCH 056/143] net80211: (v)ht: use macros at hand Rather than duplicating the manual logic here and leaving a comment, use the self-explanatory macros we already have. No functional changes intended.
Sponsored by: The FreeBSD Foundation MFC after: 3 days Reviewed by: adrian Differential Revision: https://reviews.freebsd.org/D48359 --- sys/net80211/ieee80211_ht.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/sys/net80211/ieee80211_ht.c b/sys/net80211/ieee80211_ht.c index e2506c1e0ce09b..2ec5ffb1a2afad 100644 --- a/sys/net80211/ieee80211_ht.c +++ b/sys/net80211/ieee80211_ht.c @@ -1934,9 +1934,7 @@ ieee80211_vht_get_vhtflags(struct ieee80211_node *ni, uint32_t htflags) vhtflags = 0; if (ni->ni_flags & IEEE80211_NODE_VHT && vap->iv_vht_flags & IEEE80211_FVHT_VHT) { if ((ni->ni_vht_chanwidth == IEEE80211_VHT_CHANWIDTH_160MHZ) && - /* XXX 2 means "160MHz and 80+80MHz", 1 means "160MHz" */ - (_IEEE80211_MASKSHIFT(vap->iv_vht_cap.vht_cap_info, - IEEE80211_VHTCAP_SUPP_CHAN_WIDTH_MASK) >= 1) && + IEEE80211_VHTCAP_SUPP_CHAN_WIDTH_IS_160MHZ(vap->iv_vht_cap.vht_cap_info) && (vap->iv_vht_flags & IEEE80211_FVHT_USEVHT160)) { vhtflags = IEEE80211_CHAN_VHT160; /* Mirror the HT40 flags */ @@ -1946,9 +1944,7 @@ ieee80211_vht_get_vhtflags(struct ieee80211_node *ni, uint32_t htflags) vhtflags |= IEEE80211_CHAN_HT40D; } } else if ((ni->ni_vht_chanwidth == IEEE80211_VHT_CHANWIDTH_80P80MHZ) && - /* XXX 2 means "160MHz and 80+80MHz" */ - (_IEEE80211_MASKSHIFT(vap->iv_vht_cap.vht_cap_info, - IEEE80211_VHTCAP_SUPP_CHAN_WIDTH_MASK) == 2) && + IEEE80211_VHTCAP_SUPP_CHAN_WIDTH_IS_160_80P80MHZ(vap->iv_vht_cap.vht_cap_info) && (vap->iv_vht_flags & IEEE80211_FVHT_USEVHT80P80)) { vhtflags = IEEE80211_CHAN_VHT80P80; /* Mirror the HT40 flags */ From 2c8b0d6205f6f98855773e3a82640b50abb2f2f6 Mon Sep 17 00:00:00 2001 From: "Bjoern A. Zeeb" Date: Sun, 29 Dec 2024 08:07:48 +0000 Subject: [PATCH 057/143] net80211 / LinuxKPI 802.11: correct enum ieee80211_sta_rx_bw When moving the enum from LinuxKPI to net80211 it got adjusted to be used in net80211 style in order to use it with a print_mask (%b). 
Turns out that change broke assumptions given the minimum value of BW_20 no longer was 0. Adjust it back to a plain enum starting at 0 and use an inline function to convert to value names. Pointy hat to: bz Fixes: ca389486a9599768e0ba69dca13c208020623083 MFC after: 3 days Sponsored by: The FreeBSD Foundation Reviewed by: adrian Differential Revision: https://reviews.freebsd.org/D48375 --- sys/net80211/ieee80211_ddb.c | 4 ++-- sys/net80211/ieee80211_ht.c | 4 ++-- sys/net80211/ieee80211_node.c | 4 ++-- sys/net80211/ieee80211_node.h | 26 +++++++++++++++++++------- 4 files changed, 25 insertions(+), 13 deletions(-) diff --git a/sys/net80211/ieee80211_ddb.c b/sys/net80211/ieee80211_ddb.c index 0050038457c787..05b370eafa3871 100644 --- a/sys/net80211/ieee80211_ddb.c +++ b/sys/net80211/ieee80211_ddb.c @@ -294,9 +294,9 @@ _db_show_sta(const struct ieee80211_node *ni) db_printf("\thtcap %b htparam 0x%x htctlchan %u ht2ndchan %u\n", ni->ni_htcap, IEEE80211_HTCAP_BITS, ni->ni_htparam, ni->ni_htctlchan, ni->ni_ht2ndchan); - db_printf("\thtopmode 0x%x htstbc 0x%x chw %b\n", + db_printf("\thtopmode 0x%x htstbc 0x%x chw %d (%s)\n", ni->ni_htopmode, ni->ni_htstbc, - ni->ni_chw, IEEE80211_NI_CHW_BITS); + ni->ni_chw, ieee80211_ni_chw_to_str(ni->ni_chw)); /* XXX ampdu state */ for (i = 0; i < WME_NUM_TID; i++) diff --git a/sys/net80211/ieee80211_ht.c b/sys/net80211/ieee80211_ht.c index 2ec5ffb1a2afad..9e047244cc3b47 100644 --- a/sys/net80211/ieee80211_ht.c +++ b/sys/net80211/ieee80211_ht.c @@ -2604,8 +2604,8 @@ ht_recv_action_ht_txchwidth(struct ieee80211_node *ni, IEEE80211_STA_RX_BW_40 : IEEE80211_STA_RX_BW_20; IEEE80211_NOTE(ni->ni_vap, IEEE80211_MSG_ACTION | IEEE80211_MSG_11N, ni, - "%s: HT txchwidth, width %b%s", - __func__, chw, IEEE80211_NI_CHW_BITS, ni->ni_chw != chw ? "*" : ""); + "%s: HT txchwidth, width %d%s (%s)", __func__, + chw, ni->ni_chw != chw ? "*" : "", ieee80211_ni_chw_to_str(chw)); if (chw != ni->ni_chw) { /* XXX does this need to change the ht40 station count? 
*/ ni->ni_chw = chw; diff --git a/sys/net80211/ieee80211_node.c b/sys/net80211/ieee80211_node.c index d2a4558970f925..17ddc8533e41cc 100644 --- a/sys/net80211/ieee80211_node.c +++ b/sys/net80211/ieee80211_node.c @@ -2672,9 +2672,9 @@ ieee80211_dump_node(struct ieee80211_node_table *nt __unused, printf("\thtcap %x htparam %x htctlchan %u ht2ndchan %u\n", ni->ni_htcap, ni->ni_htparam, ni->ni_htctlchan, ni->ni_ht2ndchan); - printf("\thtopmode %x htstbc %x htchw %b\n", + printf("\thtopmode %x htstbc %x htchw %d (%s)\n", ni->ni_htopmode, ni->ni_htstbc, - ni->ni_chw, IEEE80211_NI_CHW_BITS); + ni->ni_chw, ieee80211_ni_chw_to_str(ni->ni_chw)); printf("\tvhtcap %x freq1 %d freq2 %d vhtbasicmcs %x\n", ni->ni_vhtcap, (int) ni->ni_vht_chan1, (int) ni->ni_vht_chan2, (int) ni->ni_vht_basicmcs); diff --git a/sys/net80211/ieee80211_node.h b/sys/net80211/ieee80211_node.h index 1f36ceb368b93e..0039c743544ce0 100644 --- a/sys/net80211/ieee80211_node.h +++ b/sys/net80211/ieee80211_node.h @@ -115,17 +115,29 @@ enum ieee80211_mesh_mlstate { * flags. This allows us to keep the uint8_t slot for ni_chw in * struct ieee80211_node and means we do not have to sync to the value for * LinuxKPI. + * + * NB: BW_20 needs to 0 and values need to be sorted! Cannot make it + * bitfield-alike for use with %b. 
*/ enum ieee80211_sta_rx_bw { - IEEE80211_STA_RX_BW_20 = 0x01, - IEEE80211_STA_RX_BW_40 = 0x02, - IEEE80211_STA_RX_BW_80 = 0x04, - IEEE80211_STA_RX_BW_160 = 0x08, - IEEE80211_STA_RX_BW_320 = 0x10, + IEEE80211_STA_RX_BW_20 = 0x00, + IEEE80211_STA_RX_BW_40, + IEEE80211_STA_RX_BW_80, + IEEE80211_STA_RX_BW_160, + IEEE80211_STA_RX_BW_320, } __packed; -#define IEEE80211_NI_CHW_BITS \ - "\20\1BW_20\2BW_40\3BW_80\4BW_160\5BW_320" +static inline const char * +ieee80211_ni_chw_to_str(enum ieee80211_sta_rx_bw bw) +{ + switch (bw) { + case IEEE80211_STA_RX_BW_20: return ("BW_20"); + case IEEE80211_STA_RX_BW_40: return ("BW_40"); + case IEEE80211_STA_RX_BW_80: return ("BW_80"); + case IEEE80211_STA_RX_BW_160: return ("BW_160"); + case IEEE80211_STA_RX_BW_320: return ("BW_320"); + } +} /* * Node specific information. Note that drivers are expected From a4cdb785bbd7e26cc3f2ed0bb4e5cf7ea83c400b Mon Sep 17 00:00:00 2001 From: "Bjoern A. Zeeb" Date: Sat, 28 Dec 2024 10:13:12 +0000 Subject: [PATCH 058/143] LinuxKPI: 802.11: improve the IMPROVE_HT() macro Let the macro take a format string and arguments and add __func__, __LINE__ to the output. Sponsored by: The FreeBSD Foundation MFC after: 3 days --- sys/compat/linuxkpi/common/src/linux_80211.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sys/compat/linuxkpi/common/src/linux_80211.h b/sys/compat/linuxkpi/common/src/linux_80211.h index 8605ec86ad1bcb..0c4c615d82e515 100644 --- a/sys/compat/linuxkpi/common/src/linux_80211.h +++ b/sys/compat/linuxkpi/common/src/linux_80211.h @@ -78,9 +78,10 @@ if (linuxkpi_debug_80211 & D80211_IMPROVE_TXQ) \ printf("%s:%d: XXX LKPI80211 IMPROVE_TXQ\n", __func__, __LINE__) -#define IMPROVE_HT(...) \ +#define IMPROVE_HT(fmt, ...) 
\ if (linuxkpi_debug_80211 & D80211_TRACE_MODE_HT) \ - printf("%s:%d: XXX LKPI80211 IMPROVE_HT\n", __func__, __LINE__) + printf("%s:%d: XXX LKPI80211 IMPROVE_HT " fmt "\n", \ + __func__, __LINE__, ##__VA_ARGS__); #define MTAG_ABI_LKPI80211 1707696513 /* LinuxKPI 802.11 KBI */ From fd27f86dd71b7ff1df6981297095b88d1d29652e Mon Sep 17 00:00:00 2001 From: "Bjoern A. Zeeb" Date: Sat, 28 Dec 2024 09:57:56 +0000 Subject: [PATCH 059/143] LinuxKPI: switch jiffies and timer->expire to unsigned long It seems these functions work with unsigned long and not int in Linux. Start simply replacing the int where I came across it while debugging a wireless driver timer modification. Also sprinkle in some "const". Sponsored by: The FreeBSD Foundation MFC after: 2 weeks Reviewed by: emaste Differential Revision: https://reviews.freebsd.org/D48318 --- .../linuxkpi/common/include/linux/jiffies.h | 28 +++++++++---------- .../linuxkpi/common/include/linux/timer.h | 4 +-- sys/compat/linuxkpi/common/src/linux_compat.c | 2 +- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/sys/compat/linuxkpi/common/include/linux/jiffies.h b/sys/compat/linuxkpi/common/include/linux/jiffies.h index bd05a0db076703..8346e74fb830f6 100644 --- a/sys/compat/linuxkpi/common/include/linux/jiffies.h +++ b/sys/compat/linuxkpi/common/include/linux/jiffies.h @@ -38,7 +38,7 @@ #define jiffies ticks #define jiffies_64 ticks -#define jiffies_to_msecs(x) ((unsigned int)(((int64_t)(int)(x)) * 1000 / hz)) +#define jiffies_to_msecs(x) ((unsigned int)(((int64_t)(unsigned long)(x)) * 1000 / hz)) #define MAX_JIFFY_OFFSET ((INT_MAX >> 1) - 1) @@ -68,7 +68,7 @@ extern uint64_t lkpi_msec2hz_rem; extern uint64_t lkpi_msec2hz_div; extern uint64_t lkpi_msec2hz_max; -static inline int +static inline unsigned long timespec_to_jiffies(const struct timespec *ts) { u64 result; @@ -78,10 +78,10 @@ timespec_to_jiffies(const struct timespec *ts) if (result > MAX_JIFFY_OFFSET) result = MAX_JIFFY_OFFSET; - return ((int)result); + 
return ((unsigned long)result); } -static inline int +static inline unsigned long msecs_to_jiffies(uint64_t msec) { uint64_t result; @@ -92,10 +92,10 @@ msecs_to_jiffies(uint64_t msec) if (result > MAX_JIFFY_OFFSET) result = MAX_JIFFY_OFFSET; - return ((int)result); + return ((unsigned long)result); } -static inline int +static inline unsigned long usecs_to_jiffies(uint64_t usec) { uint64_t result; @@ -106,7 +106,7 @@ usecs_to_jiffies(uint64_t usec) if (result > MAX_JIFFY_OFFSET) result = MAX_JIFFY_OFFSET; - return ((int)result); + return ((unsigned long)result); } static inline uint64_t @@ -133,17 +133,17 @@ nsecs_to_jiffies(uint64_t nsec) } static inline uint64_t -jiffies_to_nsecs(int j) +jiffies_to_nsecs(const unsigned long j) { - return ((1000000000ULL / hz) * (uint64_t)(unsigned int)j); + return ((1000000000ULL / hz) * (const uint64_t)j); } static inline uint64_t -jiffies_to_usecs(int j) +jiffies_to_usecs(const unsigned long j) { - return ((1000000ULL / hz) * (uint64_t)(unsigned int)j); + return ((1000000ULL / hz) * (const uint64_t)j); } static inline uint64_t @@ -153,10 +153,10 @@ get_jiffies_64(void) return ((uint64_t)(unsigned int)ticks); } -static inline int -linux_timer_jiffies_until(int expires) +static inline unsigned long +linux_timer_jiffies_until(unsigned long expires) { - int delta = expires - jiffies; + unsigned long delta = expires - jiffies; /* guard against already expired values */ if (delta < 1) delta = 1; diff --git a/sys/compat/linuxkpi/common/include/linux/timer.h b/sys/compat/linuxkpi/common/include/linux/timer.h index 8bea082c3e6c71..f9c76222795c8f 100644 --- a/sys/compat/linuxkpi/common/include/linux/timer.h +++ b/sys/compat/linuxkpi/common/include/linux/timer.h @@ -42,7 +42,7 @@ struct timer_list { void (*function_415) (struct timer_list *); }; unsigned long data; - int expires; + unsigned long expires; }; extern unsigned long linux_timer_hz_mask; @@ -76,7 +76,7 @@ extern unsigned long linux_timer_hz_mask; 
callout_init(&(timer)->callout, 1); \ } while (0) -extern int mod_timer(struct timer_list *, int); +extern int mod_timer(struct timer_list *, unsigned long); extern void add_timer(struct timer_list *); extern void add_timer_on(struct timer_list *, int cpu); extern int del_timer(struct timer_list *); diff --git a/sys/compat/linuxkpi/common/src/linux_compat.c b/sys/compat/linuxkpi/common/src/linux_compat.c index ec3ccb16b47d63..35cb2fc2f3d742 100644 --- a/sys/compat/linuxkpi/common/src/linux_compat.c +++ b/sys/compat/linuxkpi/common/src/linux_compat.c @@ -1938,7 +1938,7 @@ linux_timer_callback_wrapper(void *context) } int -mod_timer(struct timer_list *timer, int expires) +mod_timer(struct timer_list *timer, unsigned long expires) { int ret; From d82bfe73a3f4f3f38757c2e064047f09629ec7b7 Mon Sep 17 00:00:00 2001 From: Adrian Chadd Date: Mon, 16 Dec 2024 12:11:16 -0800 Subject: [PATCH 060/143] rtwn: don't set the RTS/CTS primary channel field for RTL8812AU/RTL8821AU According to the rtl8812au reference driver, this seems to control the bandwidth used by lower-bandwidth frames when transmitted in a higher bandwidth channel. For example, transmitting a 20MHz frame on an 80MHz channel (eg in hostap mode) is doable, but you may want to at least duplicate the RTS/CTS exchange across all four 20MHz subchannels, AND perhaps duplicate the 20MHz frame. I haven't fired this up with a spectrum analyser to see what the result is. The vendor driver doesn't bother with this and it doesn't change performance. My guess is that for modes like AP mode we MAY want to be able to control the RTS/CTS bandwidth choices rather than letting the firmware do it, but we're not there yet. The rtl8812au code in hal/rtl8812a_xmit.c:SCMapping_8812() has the gory details, but then the one place it's used just has it commented out and 0 (ie "do not care") is always programmed in.
Differential Revision: https://reviews.freebsd.org/D48113 Obtained from: https://github.com/lwfinger/rtl8812au Reviewed by: bz --- sys/dev/rtwn/rtl8812a/r12a_tx.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/sys/dev/rtwn/rtl8812a/r12a_tx.c b/sys/dev/rtwn/rtl8812a/r12a_tx.c index 582e6e0ddaf464..336ad75a0b1f45 100644 --- a/sys/dev/rtwn/rtl8812a/r12a_tx.c +++ b/sys/dev/rtwn/rtl8812a/r12a_tx.c @@ -56,14 +56,35 @@ #include #include +/* + * This function actually handles the secondary channel mapping, + * not the primary channel mapping. It hints to the MAC where + * to handle duplicate transmission of the RTS/CTS and payload + * frames when the requested transmit channel width is less than + * the configured channel width. + * + * Note: the vendor driver and linux rtw88 driver both leave this + * field currently set to 0. + * + * See the rtl8812au vendor driver, hal/rtl8812a_xmit.c:SCMapping_8812() + * and where it's used (and ignored.) + */ static int r12a_get_primary_channel(struct rtwn_softc *sc, struct ieee80211_channel *c) { +#if 0 /* XXX VHT80; VHT40 */ if (IEEE80211_IS_CHAN_HT40U(c)) return (R12A_TXDW5_PRIM_CHAN_20_80_2); else return (R12A_TXDW5_PRIM_CHAN_20_80_3); +#endif + + /* + * For now just return the VHT_DATA_SC_DONOT_CARE value + * from the reference driver. + */ + return (0); } static void From dd58d03a2a46dddf2ce661d623224a947751beff Mon Sep 17 00:00:00 2001 From: Adrian Chadd Date: Mon, 16 Dec 2024 21:36:02 -0800 Subject: [PATCH 061/143] rtwn: set the maximum A-MPDU size correctly for RTL8812AU/RTL8821AU The vendor driver sets it to 64k or 128k depending upon chipset, along with bit 31 being set in hal/rtl8812a_hal_init.c:SetHwReg8812A(). 
Differential Revision: https://reviews.freebsd.org/D48118 Obtained from: https://github.com/lwfinger/rtl8812au Reviewed by: bz --- sys/dev/rtwn/rtl8812a/r12a_var.h | 1 + sys/dev/rtwn/rtl8812a/usb/r12au_attach.c | 1 + sys/dev/rtwn/rtl8812a/usb/r12au_init.c | 12 +++++++++++- sys/dev/rtwn/rtl8821a/usb/r21au_attach.c | 1 + 4 files changed, 14 insertions(+), 1 deletion(-) diff --git a/sys/dev/rtwn/rtl8812a/r12a_var.h b/sys/dev/rtwn/rtl8812a/r12a_var.h index 182e6b90275867..0a76e013b6a733 100644 --- a/sys/dev/rtwn/rtl8812a/r12a_var.h +++ b/sys/dev/rtwn/rtl8812a/r12a_var.h @@ -99,6 +99,7 @@ struct r12a_softc { int ac_usb_dma_size; int ac_usb_dma_time; int ampdu_max_time; + int ampdu_max_size; }; #define R12A_SOFTC(_sc) ((struct r12a_softc *)((_sc)->sc_priv)) diff --git a/sys/dev/rtwn/rtl8812a/usb/r12au_attach.c b/sys/dev/rtwn/rtl8812a/usb/r12au_attach.c index 84bfcfbda0e8f6..c87bffb4db197d 100644 --- a/sys/dev/rtwn/rtl8812a/usb/r12au_attach.c +++ b/sys/dev/rtwn/rtl8812a/usb/r12au_attach.c @@ -141,6 +141,7 @@ r12a_attach_private(struct rtwn_softc *sc) rs->rs_iq_calib_sw = r12a_iq_calib_sw; rs->ampdu_max_time = 0x70; + rs->ampdu_max_size = 0x1ffff; /* 128k */ sc->sc_priv = rs; } diff --git a/sys/dev/rtwn/rtl8812a/usb/r12au_init.c b/sys/dev/rtwn/rtl8812a/usb/r12au_init.c index ac6a599895acbb..1bee2c66565742 100644 --- a/sys/dev/rtwn/rtl8812a/usb/r12au_init.c +++ b/sys/dev/rtwn/rtl8812a/usb/r12au_init.c @@ -142,7 +142,17 @@ r12au_init_ampdu(struct rtwn_softc *sc) /* Setup AMPDU aggregation. */ rtwn_write_1(sc, R12A_AMPDU_MAX_TIME, rs->ampdu_max_time); - rtwn_write_4(sc, R12A_AMPDU_MAX_LENGTH, 0xffffffff); + /* + * Note: The vendor driver (hal/rtl8812a_hal_init.c:SetHwReg8812A()) + * also sets bit 31. + */ + /* + * TODO: this should be limited to the peer in STA mode, + * and perhaps the minimum A-MPDU of all VAPs/peers in + * multi-STA / other operating modes. + */ + rtwn_write_4(sc, R12A_AMPDU_MAX_LENGTH, + rs->ampdu_max_size | (1<<31)); /* 80 MHz clock (again?) 
*/ rtwn_write_1(sc, R92C_USTIME_TSF, 0x50); diff --git a/sys/dev/rtwn/rtl8821a/usb/r21au_attach.c b/sys/dev/rtwn/rtl8821a/usb/r21au_attach.c index 9f0e2c950a1e46..175bac8f6fc9a9 100644 --- a/sys/dev/rtwn/rtl8821a/usb/r21au_attach.c +++ b/sys/dev/rtwn/rtl8821a/usb/r21au_attach.c @@ -141,6 +141,7 @@ r21a_attach_private(struct rtwn_softc *sc) rs->rs_iq_calib_sw = r21a_iq_calib_sw; rs->ampdu_max_time = 0x5e; + rs->ampdu_max_size = 0xffff; /* 64k */ rs->ac_usb_dma_size = 0x01; rs->ac_usb_dma_time = 0x10; From 7f8f120439b77e60a1070d87f4dc6cb9a43d0335 Mon Sep 17 00:00:00 2001 From: Xin LI Date: Tue, 7 Jan 2025 20:42:16 -0800 Subject: [PATCH 062/143] libmagic: Unbreak for older FreeBSD releases. byteswap.h is introduced in FreeBSD 13.2 but was not available in earlier versions. In order to support upgrading from an earlier FreeBSD release we would need to tell the build system that fact. PR: bin/273736 Reported by: philip MFC after: 3 days --- lib/libmagic/config.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lib/libmagic/config.h b/lib/libmagic/config.h index 12cd382ab9bb77..d642880d609629 100644 --- a/lib/libmagic/config.h +++ b/lib/libmagic/config.h @@ -1,6 +1,9 @@ /* config.h. Generated from config.h.in by configure. */ /* config.h.in. Generated from configure.ac by autoheader. */ +/* FreeBSD */ +#include + /* Define if building universal (internal helper macro) */ /* #undef AC_APPLE_UNIVERSAL_BUILD */ @@ -21,8 +24,10 @@ /* Define to 1 if you have the header file. */ #ifndef __APPLE__ /* Cross building tools on macOS */ +#if __FreeBSD_version >= 1400079 || (__FreeBSD_version < 1400000 && __FreeBSD_version >= 1302500) #define HAVE_BYTESWAP_H 1 #endif +#endif /* Define to 1 if you have the header file. */ /* #undef HAVE_BZLIB_H */ From 2bb0efbb7b64fa957d46d4f443b000f375fc03d4 Mon Sep 17 00:00:00 2001 From: "Bjoern A. 
Zeeb" Date: Wed, 8 Jan 2025 08:30:00 +0000 Subject: [PATCH 063/143] Revert: LinuxKPI: switch jiffies and timer->expire to unsigned long There are possible problems when jiffies (ticks) which still are int wrap around. Also given this did not touch every single place some checks may be broken now. Reported by: markj This reverts commit fd27f86dd71b7ff1df6981297095b88d1d29652e. --- .../linuxkpi/common/include/linux/jiffies.h | 28 +++++++++---------- .../linuxkpi/common/include/linux/timer.h | 4 +-- sys/compat/linuxkpi/common/src/linux_compat.c | 2 +- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/sys/compat/linuxkpi/common/include/linux/jiffies.h b/sys/compat/linuxkpi/common/include/linux/jiffies.h index 8346e74fb830f6..bd05a0db076703 100644 --- a/sys/compat/linuxkpi/common/include/linux/jiffies.h +++ b/sys/compat/linuxkpi/common/include/linux/jiffies.h @@ -38,7 +38,7 @@ #define jiffies ticks #define jiffies_64 ticks -#define jiffies_to_msecs(x) ((unsigned int)(((int64_t)(unsigned long)(x)) * 1000 / hz)) +#define jiffies_to_msecs(x) ((unsigned int)(((int64_t)(int)(x)) * 1000 / hz)) #define MAX_JIFFY_OFFSET ((INT_MAX >> 1) - 1) @@ -68,7 +68,7 @@ extern uint64_t lkpi_msec2hz_rem; extern uint64_t lkpi_msec2hz_div; extern uint64_t lkpi_msec2hz_max; -static inline unsigned long +static inline int timespec_to_jiffies(const struct timespec *ts) { u64 result; @@ -78,10 +78,10 @@ timespec_to_jiffies(const struct timespec *ts) if (result > MAX_JIFFY_OFFSET) result = MAX_JIFFY_OFFSET; - return ((unsigned long)result); + return ((int)result); } -static inline unsigned long +static inline int msecs_to_jiffies(uint64_t msec) { uint64_t result; @@ -92,10 +92,10 @@ msecs_to_jiffies(uint64_t msec) if (result > MAX_JIFFY_OFFSET) result = MAX_JIFFY_OFFSET; - return ((unsigned long)result); + return ((int)result); } -static inline unsigned long +static inline int usecs_to_jiffies(uint64_t usec) { uint64_t result; @@ -106,7 +106,7 @@ usecs_to_jiffies(uint64_t usec) if 
(result > MAX_JIFFY_OFFSET) result = MAX_JIFFY_OFFSET; - return ((unsigned long)result); + return ((int)result); } static inline uint64_t @@ -133,17 +133,17 @@ nsecs_to_jiffies(uint64_t nsec) } static inline uint64_t -jiffies_to_nsecs(const unsigned long j) +jiffies_to_nsecs(int j) { - return ((1000000000ULL / hz) * (const uint64_t)j); + return ((1000000000ULL / hz) * (uint64_t)(unsigned int)j); } static inline uint64_t -jiffies_to_usecs(const unsigned long j) +jiffies_to_usecs(int j) { - return ((1000000ULL / hz) * (const uint64_t)j); + return ((1000000ULL / hz) * (uint64_t)(unsigned int)j); } static inline uint64_t @@ -153,10 +153,10 @@ get_jiffies_64(void) return ((uint64_t)(unsigned int)ticks); } -static inline unsigned long -linux_timer_jiffies_until(unsigned long expires) +static inline int +linux_timer_jiffies_until(int expires) { - unsigned long delta = expires - jiffies; + int delta = expires - jiffies; /* guard against already expired values */ if (delta < 1) delta = 1; diff --git a/sys/compat/linuxkpi/common/include/linux/timer.h b/sys/compat/linuxkpi/common/include/linux/timer.h index f9c76222795c8f..8bea082c3e6c71 100644 --- a/sys/compat/linuxkpi/common/include/linux/timer.h +++ b/sys/compat/linuxkpi/common/include/linux/timer.h @@ -42,7 +42,7 @@ struct timer_list { void (*function_415) (struct timer_list *); }; unsigned long data; - unsigned long expires; + int expires; }; extern unsigned long linux_timer_hz_mask; @@ -76,7 +76,7 @@ extern unsigned long linux_timer_hz_mask; callout_init(&(timer)->callout, 1); \ } while (0) -extern int mod_timer(struct timer_list *, unsigned long); +extern int mod_timer(struct timer_list *, int); extern void add_timer(struct timer_list *); extern void add_timer_on(struct timer_list *, int cpu); extern int del_timer(struct timer_list *); diff --git a/sys/compat/linuxkpi/common/src/linux_compat.c b/sys/compat/linuxkpi/common/src/linux_compat.c index 35cb2fc2f3d742..ec3ccb16b47d63 100644 --- 
a/sys/compat/linuxkpi/common/src/linux_compat.c +++ b/sys/compat/linuxkpi/common/src/linux_compat.c @@ -1938,7 +1938,7 @@ linux_timer_callback_wrapper(void *context) } int -mod_timer(struct timer_list *timer, unsigned long expires) +mod_timer(struct timer_list *timer, int expires) { int ret; From cad1d13af789b82d829923795cccbf37cdf93b51 Mon Sep 17 00:00:00 2001 From: Emmanuel Vadot Date: Wed, 8 Jan 2025 10:34:06 +0100 Subject: [PATCH 064/143] dts: Bump the freebsd branding version to 6.12 --- sys/dts/freebsd-compatible.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sys/dts/freebsd-compatible.dts b/sys/dts/freebsd-compatible.dts index 7d602f70a83bbb..ffdc2e542a22d0 100644 --- a/sys/dts/freebsd-compatible.dts +++ b/sys/dts/freebsd-compatible.dts @@ -1,3 +1,3 @@ / { - freebsd,dts-version = "6.8"; + freebsd,dts-version = "6.12"; }; From 1f7c379c07168029694a9a33bc437b05cdee623e Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Wed, 8 Jan 2025 13:04:34 -0500 Subject: [PATCH 065/143] isp: Fix abort issue introduced by previous commit Aborting ATIO while its CTIOs are in progress makes it impossible to handle their completions, making them stuck forever. Detect this case by checking ctcnt counter and if so instead of aborting just mark the ATIO as dead to block any new CTIOs. It is not perfect since the task id can not be reused for some more time, but not as bad as the task stuck forever. MFC after: 1 week --- etc/mtree/BSD.include.dist | 2 ++ include/Makefile | 2 +- sys/conf/files.amd64 | 2 ++ sys/dev/isp/isp_freebsd.c | 19 +++++++++++++++++-- sys/dev/isp/isp_freebsd.h | 3 ++- sys/modules/Makefile | 2 ++ usr.sbin/Makefile | 1 + 7 files changed, 27 insertions(+), 4 deletions(-) diff --git a/etc/mtree/BSD.include.dist b/etc/mtree/BSD.include.dist index ad1b8a5f741350..0a2dbea23d5a6a 100644 --- a/etc/mtree/BSD.include.dist +++ b/etc/mtree/BSD.include.dist @@ -149,6 +149,8 @@ .. io .. + ixnvdimm + .. mfi ..
mlx5 diff --git a/include/Makefile b/include/Makefile index 16d641b42a908c..0c71f1518a914f 100644 --- a/include/Makefile +++ b/include/Makefile @@ -50,7 +50,7 @@ LDIRS= geom net net80211 netgraph netinet netinet6 \ LSUBDIRS= dev/acpica dev/agp dev/ciss dev/filemon dev/firewire \ dev/hwpmc dev/hyperv \ - dev/ic dev/iicbus dev/io dev/mfi dev/mmc \ + dev/ic dev/iicbus dev/io dev/ixnvdimm dev/mfi dev/mmc \ dev/ofw dev/pbio dev/pci ${_dev_powermac_nvram} dev/ppbus dev/pwm \ dev/smbus dev/speaker dev/tcp_log dev/veriexec dev/vkbd dev/wg \ fs/devfs fs/fdescfs fs/msdosfs fs/nfs fs/nullfs \ diff --git a/sys/conf/files.amd64 b/sys/conf/files.amd64 index 0584fc29d03963..571e61f6b26428 100644 --- a/sys/conf/files.amd64 +++ b/sys/conf/files.amd64 @@ -235,6 +235,8 @@ dev/ixl/i40e_adminq.c optional ixl pci \ compile-with "${NORMAL_C} -I$S/dev/ixl" dev/ixl/i40e_dcb.c optional ixl pci \ compile-with "${NORMAL_C} -I$S/dev/ixl" +dev/ixnvdimm/ixnvdimm.c optional ixnvdimm +dev/ixnvdimm/ixnvdimm_copy.S optional ixnvdimm dev/ncthwm/ncthwm.c optional ncthwm superio dev/qlxge/qls_dbg.c optional qlxge pci dev/qlxge/qls_dump.c optional qlxge pci diff --git a/sys/dev/isp/isp_freebsd.c b/sys/dev/isp/isp_freebsd.c index d5aa7a54142eaf..b496eae1b466ea 100644 --- a/sys/dev/isp/isp_freebsd.c +++ b/sys/dev/isp/isp_freebsd.c @@ -986,6 +986,16 @@ isp_target_start_ctio(ispsoftc_t *isp, union ccb *ccb, enum Start_Ctio_How how) continue; } + /* + * Is this command a dead duck? + */ + if (atp->dead) { + isp_prt(isp, ISP_LOGERR, "%s: [0x%x] not sending a CTIO for a dead command", __func__, cso->tag_id); + ccb->ccb_h.status = CAM_REQ_ABORTED; + xpt_done(ccb); + continue; + } + /* * Check to make sure we're still in target mode. */ @@ -2503,14 +2513,19 @@ isp_action(struct cam_sim *sim, union ccb *ccb) } /* - * Target should abort all affected CCBs before ACK-ing INOT, + * Target should abort all affected tasks before ACK-ing INOT, * but if/since it doesn't, add this hack to allow tag reuse. 
+ * We can not do it if some CTIOs are in progress, or we won't + * handle the completions. In such case just block new ones. */ uint32_t rsp = (ccb->ccb_h.flags & CAM_SEND_STATUS) ? ccb->cna2.arg : 0; if (ntp->nt.nt_ncode == NT_ABORT_TASK && (rsp & 0xff) == 0 && (atp = isp_find_atpd(isp, XS_CHANNEL(ccb), ccb->cna2.seq_id)) != NULL) { - if (isp_abort_atpd(isp, XS_CHANNEL(ccb), atp) == 0) + if (atp->ctcnt == 0 && + isp_abort_atpd(isp, XS_CHANNEL(ccb), atp) == 0) isp_put_atpd(isp, XS_CHANNEL(ccb), atp); + else + atp->dead = 1; } if (isp_handle_platform_target_notify_ack(isp, &ntp->nt, rsp)) { diff --git a/sys/dev/isp/isp_freebsd.h b/sys/dev/isp/isp_freebsd.h index 5bb3dd43b6de41..73390fa14769fb 100644 --- a/sys/dev/isp/isp_freebsd.h +++ b/sys/dev/isp/isp_freebsd.h @@ -104,8 +104,9 @@ typedef struct atio_private_data { uint16_t ctcnt; /* number of CTIOs currently active */ uint8_t seqno; /* CTIO sequence number */ uint8_t cdb0; - uint8_t srr_notify_rcvd : 1, + uint16_t srr_notify_rcvd : 1, sendst : 1, + dead : 1, tattr : 3, state : 3; void * ests; diff --git a/sys/modules/Makefile b/sys/modules/Makefile index 294cb5a224de34..3451928e2e5324 100644 --- a/sys/modules/Makefile +++ b/sys/modules/Makefile @@ -203,6 +203,7 @@ SUBDIR= \ ${_ix} \ ${_ixv} \ ${_ixl} \ + ${_ixnvdimm} \ jme \ kbdmux \ kgssapi \ @@ -834,6 +835,7 @@ _enic= enic _iavf= iavf _ioat= ioat _ixl= ixl +_ixnvdimm= ixnvdimm _nvdimm= nvdimm _pms= pms _qat= qat diff --git a/usr.sbin/Makefile b/usr.sbin/Makefile index e039cea6ee167a..2eeff698083618 100644 --- a/usr.sbin/Makefile +++ b/usr.sbin/Makefile @@ -34,6 +34,7 @@ SUBDIR= adduser \ ifmcstat \ iostat \ iovctl \ + ixnvdimm \ kldxref \ mailwrapper \ makefs \ From e6c96c7af717b459aea4126590ba413d29f283bf Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Wed, 8 Jan 2025 13:20:09 -0500 Subject: [PATCH 066/143] Revert "isp: Fix abort issue introduced by previous commit" This reverts commit 1f7c379c07168029694a9a33bc437b05cdee623e. Leaked unintended changes. 
I'm sorry. --- etc/mtree/BSD.include.dist | 2 -- include/Makefile | 2 +- sys/conf/files.amd64 | 2 -- sys/dev/isp/isp_freebsd.c | 19 ++----------------- sys/dev/isp/isp_freebsd.h | 3 +-- sys/modules/Makefile | 2 -- usr.sbin/Makefile | 1 - 7 files changed, 4 insertions(+), 27 deletions(-) diff --git a/etc/mtree/BSD.include.dist b/etc/mtree/BSD.include.dist index 0a2dbea23d5a6a..ad1b8a5f741350 100644 --- a/etc/mtree/BSD.include.dist +++ b/etc/mtree/BSD.include.dist @@ -149,8 +149,6 @@ .. io .. - ixnvdimm - .. mfi .. mlx5 diff --git a/include/Makefile b/include/Makefile index 0c71f1518a914f..16d641b42a908c 100644 --- a/include/Makefile +++ b/include/Makefile @@ -50,7 +50,7 @@ LDIRS= geom net net80211 netgraph netinet netinet6 \ LSUBDIRS= dev/acpica dev/agp dev/ciss dev/filemon dev/firewire \ dev/hwpmc dev/hyperv \ - dev/ic dev/iicbus dev/io dev/ixnvdimm dev/mfi dev/mmc \ + dev/ic dev/iicbus dev/io dev/mfi dev/mmc \ dev/ofw dev/pbio dev/pci ${_dev_powermac_nvram} dev/ppbus dev/pwm \ dev/smbus dev/speaker dev/tcp_log dev/veriexec dev/vkbd dev/wg \ fs/devfs fs/fdescfs fs/msdosfs fs/nfs fs/nullfs \ diff --git a/sys/conf/files.amd64 b/sys/conf/files.amd64 index 571e61f6b26428..0584fc29d03963 100644 --- a/sys/conf/files.amd64 +++ b/sys/conf/files.amd64 @@ -235,8 +235,6 @@ dev/ixl/i40e_adminq.c optional ixl pci \ compile-with "${NORMAL_C} -I$S/dev/ixl" dev/ixl/i40e_dcb.c optional ixl pci \ compile-with "${NORMAL_C} -I$S/dev/ixl" -dev/ixnvdimm/ixnvdimm.c optional ixnvdimm -dev/ixnvdimm/ixnvdimm_copy.S optional ixnvdimm dev/ncthwm/ncthwm.c optional ncthwm superio dev/qlxge/qls_dbg.c optional qlxge pci dev/qlxge/qls_dump.c optional qlxge pci diff --git a/sys/dev/isp/isp_freebsd.c b/sys/dev/isp/isp_freebsd.c index b496eae1b466ea..d5aa7a54142eaf 100644 --- a/sys/dev/isp/isp_freebsd.c +++ b/sys/dev/isp/isp_freebsd.c @@ -986,16 +986,6 @@ isp_target_start_ctio(ispsoftc_t *isp, union ccb *ccb, enum Start_Ctio_How how) continue; } - /* - * Is this command a dead duck? 
- */ - if (atp->dead) { - isp_prt(isp, ISP_LOGERR, "%s: [0x%x] not sending a CTIO for a dead command", __func__, cso->tag_id); - ccb->ccb_h.status = CAM_REQ_ABORTED; - xpt_done(ccb); - continue; - } - /* * Check to make sure we're still in target mode. */ @@ -2513,19 +2503,14 @@ isp_action(struct cam_sim *sim, union ccb *ccb) } /* - * Target should abort all affected tasks before ACK-ing INOT, + * Target should abort all affected CCBs before ACK-ing INOT, * but if/since it doesn't, add this hack to allow tag reuse. - * We can not do it if some CTIOs are in progress, or we won't - * handle the completions. In such case just block new ones. */ uint32_t rsp = (ccb->ccb_h.flags & CAM_SEND_STATUS) ? ccb->cna2.arg : 0; if (ntp->nt.nt_ncode == NT_ABORT_TASK && (rsp & 0xff) == 0 && (atp = isp_find_atpd(isp, XS_CHANNEL(ccb), ccb->cna2.seq_id)) != NULL) { - if (atp->ctcnt == 0 && - isp_abort_atpd(isp, XS_CHANNEL(ccb), atp) == 0) + if (isp_abort_atpd(isp, XS_CHANNEL(ccb), atp) == 0) isp_put_atpd(isp, XS_CHANNEL(ccb), atp); - else - atp->dead = 1; } if (isp_handle_platform_target_notify_ack(isp, &ntp->nt, rsp)) { diff --git a/sys/dev/isp/isp_freebsd.h b/sys/dev/isp/isp_freebsd.h index 73390fa14769fb..5bb3dd43b6de41 100644 --- a/sys/dev/isp/isp_freebsd.h +++ b/sys/dev/isp/isp_freebsd.h @@ -104,9 +104,8 @@ typedef struct atio_private_data { uint16_t ctcnt; /* number of CTIOs currently active */ uint8_t seqno; /* CTIO sequence number */ uint8_t cdb0; - uint16_t srr_notify_rcvd : 1, + uint8_t srr_notify_rcvd : 1, sendst : 1, - dead : 1, tattr : 3, state : 3; void * ests; diff --git a/sys/modules/Makefile b/sys/modules/Makefile index 3451928e2e5324..294cb5a224de34 100644 --- a/sys/modules/Makefile +++ b/sys/modules/Makefile @@ -203,7 +203,6 @@ SUBDIR= \ ${_ix} \ ${_ixv} \ ${_ixl} \ - ${_ixnvdimm} \ jme \ kbdmux \ kgssapi \ @@ -835,7 +834,6 @@ _enic= enic _iavf= iavf _ioat= ioat _ixl= ixl -_ixnvdimm= ixnvdimm _nvdimm= nvdimm _pms= pms _qat= qat diff --git a/usr.sbin/Makefile 
b/usr.sbin/Makefile index 2eeff698083618..e039cea6ee167a 100644 --- a/usr.sbin/Makefile +++ b/usr.sbin/Makefile @@ -34,7 +34,6 @@ SUBDIR= adduser \ ifmcstat \ iostat \ iovctl \ - ixnvdimm \ kldxref \ mailwrapper \ makefs \ From 2c48a8f161c91bf7020122697d064a25287097a3 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Wed, 8 Jan 2025 13:23:26 -0500 Subject: [PATCH 067/143] isp: Fix abort issue introduced by previous commit Aborting ATIO while its CTIOs are in progress makes it impossible to handle their completions, making them stuck forever. Detect this case by checking ctcnt counter and if so instead of aborting just mark the ATIO as dead to block any new CTIOs. It is not perfect since the task id can not be reused for some more time, but not as bad as the task stuck forever. MFC after: 1 week --- sys/dev/isp/isp_freebsd.c | 19 +++++++++++++++++-- sys/dev/isp/isp_freebsd.h | 3 ++- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/sys/dev/isp/isp_freebsd.c b/sys/dev/isp/isp_freebsd.c index d5aa7a54142eaf..b496eae1b466ea 100644 --- a/sys/dev/isp/isp_freebsd.c +++ b/sys/dev/isp/isp_freebsd.c @@ -986,6 +986,16 @@ isp_target_start_ctio(ispsoftc_t *isp, union ccb *ccb, enum Start_Ctio_How how) continue; } + /* + * Is this command a dead duck? + */ + if (atp->dead) { + isp_prt(isp, ISP_LOGERR, "%s: [0x%x] not sending a CTIO for a dead command", __func__, cso->tag_id); + ccb->ccb_h.status = CAM_REQ_ABORTED; + xpt_done(ccb); + continue; + } + /* * Check to make sure we're still in target mode. */ @@ -2503,14 +2513,19 @@ isp_action(struct cam_sim *sim, union ccb *ccb) } /* - * Target should abort all affected CCBs before ACK-ing INOT, + * Target should abort all affected tasks before ACK-ing INOT, * but if/since it doesn't, add this hack to allow tag reuse. + * We can not do it if some CTIOs are in progress, or we won't + * handle the completions. In such case just block new ones. */ uint32_t rsp = (ccb->ccb_h.flags & CAM_SEND_STATUS) ?
ccb->cna2.arg : 0; if (ntp->nt.nt_ncode == NT_ABORT_TASK && (rsp & 0xff) == 0 && (atp = isp_find_atpd(isp, XS_CHANNEL(ccb), ccb->cna2.seq_id)) != NULL) { - if (isp_abort_atpd(isp, XS_CHANNEL(ccb), atp) == 0) + if (atp->ctcnt == 0 && + isp_abort_atpd(isp, XS_CHANNEL(ccb), atp) == 0) isp_put_atpd(isp, XS_CHANNEL(ccb), atp); + else + atp->dead = 1; } if (isp_handle_platform_target_notify_ack(isp, &ntp->nt, rsp)) { diff --git a/sys/dev/isp/isp_freebsd.h b/sys/dev/isp/isp_freebsd.h index 5bb3dd43b6de41..73390fa14769fb 100644 --- a/sys/dev/isp/isp_freebsd.h +++ b/sys/dev/isp/isp_freebsd.h @@ -104,8 +104,9 @@ typedef struct atio_private_data { uint16_t ctcnt; /* number of CTIOs currently active */ uint8_t seqno; /* CTIO sequence number */ uint8_t cdb0; - uint8_t srr_notify_rcvd : 1, + uint16_t srr_notify_rcvd : 1, sendst : 1, + dead : 1, tattr : 3, state : 3; void * ests; From 4c89d59e0cdac4d83fb5841aefae9214545b2273 Mon Sep 17 00:00:00 2001 From: Michael Tuexen Date: Wed, 8 Jan 2025 23:15:54 +0100 Subject: [PATCH 068/143] TCP RACK: don't log an uninitialized value reduce is uninitialized, if the code path for logging is reached via goto old_method;. Reviewed by: rrs, Peter Lei CID: 1557359 MFC after: 1 week Sponsored by: Netflix, Inc. Differential Revision: https://reviews.freebsd.org/D48346 --- sys/netinet/tcp_stacks/rack.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c index f590edd71d9d3c..902845ad34f6d1 100644 --- a/sys/netinet/tcp_stacks/rack.c +++ b/sys/netinet/tcp_stacks/rack.c @@ -17474,7 +17474,7 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str * the peer to have a gap in data sending. 
*/ uint64_t cwnd, tr_perms = 0; - int32_t reduce = 0; + int32_t reduce; old_method: /* @@ -17511,7 +17511,8 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str slot -= reduce; } else slot = 0; - } + } else + reduce = 0; slot *= HPTS_USEC_IN_MSEC; if (rack->rc_pace_to_cwnd) { uint64_t rate_wanted = 0; From 912a05670ed9545a1d1b010eedafb819e14eb1b8 Mon Sep 17 00:00:00 2001 From: Adrian Chadd Date: Sun, 15 Dec 2024 18:29:33 -0800 Subject: [PATCH 069/143] net80211: add helper functions for VHT transmit * Add static helper functions for VHT TX for 20MHz, 40MHz and 80MHz. * Add a public function to check if the given VHT width is available for transmit. Differential Revision: https://reviews.freebsd.org/D48101 --- sys/net80211/ieee80211_vht.c | 137 +++++++++++++++++++++++++++++++++++ sys/net80211/ieee80211_vht.h | 4 + 2 files changed, 141 insertions(+) diff --git a/sys/net80211/ieee80211_vht.c b/sys/net80211/ieee80211_vht.c index a05beb91216f48..82879f90c67b5e 100644 --- a/sys/net80211/ieee80211_vht.c +++ b/sys/net80211/ieee80211_vht.c @@ -873,3 +873,140 @@ ieee80211_vht_get_vhtinfo_ie(struct ieee80211_node *ni, { printf("%s: called; TODO!\n", __func__); } + +/* + * Return true if VHT rates can be used for the given node. + */ +bool +ieee80211_vht_check_tx_vht(const struct ieee80211_node *ni) +{ + const struct ieee80211vap *vap; + const struct ieee80211_channel *bss_chan; + + if (ni == NULL || ni->ni_chan == IEEE80211_CHAN_ANYC || + ni->ni_vap == NULL || ni->ni_vap->iv_bss == NULL) + return (false); + + vap = ni->ni_vap; + bss_chan = vap->iv_bss->ni_chan; + + if (bss_chan == IEEE80211_CHAN_ANYC) + return (false); + + return (IEEE80211_IS_CHAN_VHT(ni->ni_chan)); +} + +/* + * Return true if VHT40 rates can be transmitted to the given node. + * + * This verifies that the BSS is VHT40 capable and the current + * node channel width is 40MHz. 
+ */ +static bool +ieee80211_vht_check_tx_vht40(const struct ieee80211_node *ni) +{ + struct ieee80211vap *vap; + struct ieee80211_channel *bss_chan; + + if (!ieee80211_vht_check_tx_vht(ni)) + return (false); + + vap = ni->ni_vap; + bss_chan = vap->iv_bss->ni_chan; + + return (IEEE80211_IS_CHAN_VHT40(bss_chan) && + IEEE80211_IS_CHAN_VHT40(ni->ni_chan) && + (ni->ni_chw == IEEE80211_STA_RX_BW_40)); +} + +/* + * Return true if VHT80 rates can be transmitted to the given node. + * + * This verifies that the BSS is VHT80 capable and the current + * node channel width is 80MHz. + */ +static bool +ieee80211_vht_check_tx_vht80(const struct ieee80211_node *ni) +{ + struct ieee80211vap *vap; + struct ieee80211_channel *bss_chan; + + if (!ieee80211_vht_check_tx_vht(ni)) + return (false); + + vap = ni->ni_vap; + bss_chan = vap->iv_bss->ni_chan; + + return (IEEE80211_IS_CHAN_VHT80(bss_chan) && + IEEE80211_IS_CHAN_VHT80(ni->ni_chan) && + (ni->ni_chw == IEEE80211_STA_RX_BW_80)); +} + +/* + * Return true if VHT 160 rates can be transmitted to the given node. + * + * This verifies that the BSS is VHT80+80 or VHT160 capable and the current + * node channel width is 80+80MHz or 160MHz. 
+ */ +static bool +ieee80211_vht_check_tx_vht160(const struct ieee80211_node *ni) +{ + struct ieee80211vap *vap; + struct ieee80211_channel *bss_chan; + + if (!ieee80211_vht_check_tx_vht(ni)) + return (false); + + vap = ni->ni_vap; + bss_chan = vap->iv_bss->ni_chan; + + if (ni->ni_chw != IEEE80211_STA_RX_BW_160) + return (false); + + if (IEEE80211_IS_CHAN_VHT160(bss_chan) && + IEEE80211_IS_CHAN_VHT160(ni->ni_chan)) + return (true); + + if (IEEE80211_IS_CHAN_VHT80P80(bss_chan) && + IEEE80211_IS_CHAN_VHT80P80(ni->ni_chan)) + return (true); + + return (false); +} + +/** + * @brief Check if the given transmit bandwidth is available to the given node + * + * This checks that the node and BSS both allow the given bandwidth, + * and that the current node bandwidth (which can dynamically change) + * also allows said bandwidth. + * + * This relies on the channels having the flags for the narrower + * channels as well - eg a VHT160 channel will have the CHAN_VHT80, + * CHAN_VHT40, CHAN_VHT flags also set. 
+ * + * @param ni the ieee80211_node to check + * @param bw the required bandwidth to check + * + * @returns true if it is allowed, false otherwise + */ +bool +ieee80211_vht_check_tx_bw(const struct ieee80211_node *ni, + enum ieee80211_sta_rx_bw bw) +{ + + switch (bw) { + case IEEE80211_STA_RX_BW_20: + return (ieee80211_vht_check_tx_vht(ni)); + case IEEE80211_STA_RX_BW_40: + return (ieee80211_vht_check_tx_vht40(ni)); + case IEEE80211_STA_RX_BW_80: + return (ieee80211_vht_check_tx_vht80(ni)); + case IEEE80211_STA_RX_BW_160: + return (ieee80211_vht_check_tx_vht160(ni)); + case IEEE80211_STA_RX_BW_320: + return (false); + default: + return (false); + } +} diff --git a/sys/net80211/ieee80211_vht.h b/sys/net80211/ieee80211_vht.h index f2d1706ea0f29a..bcb61020c5a1e8 100644 --- a/sys/net80211/ieee80211_vht.h +++ b/sys/net80211/ieee80211_vht.h @@ -65,4 +65,8 @@ void ieee80211_vht_get_vhtcap_ie(struct ieee80211_node *ni, void ieee80211_vht_get_vhtinfo_ie(struct ieee80211_node *ni, struct ieee80211_vht_operation *, int); +bool ieee80211_vht_check_tx_vht(const struct ieee80211_node *); +bool ieee80211_vht_check_tx_bw(const struct ieee80211_node *, + enum ieee80211_sta_rx_bw); + #endif /* _NET80211_IEEE80211_VHT_H_ */ From 82182587bcc3adf39d6b3b6347f052865c3a34e2 Mon Sep 17 00:00:00 2001 From: Adrian Chadd Date: Sun, 15 Dec 2024 18:35:28 -0800 Subject: [PATCH 070/143] rtwn: add VHT20/VHT40/VHT80 bandwidth configuration for transmit. Add a separate function and path for VHT 20/40/80MHz bandwidth transmission. 
Differential Revision: https://reviews.freebsd.org/D48102 --- sys/dev/rtwn/rtl8812a/r12a_tx.c | 41 ++++++++++++++++++++++++++++++--- 1 file changed, 38 insertions(+), 3 deletions(-) diff --git a/sys/dev/rtwn/rtl8812a/r12a_tx.c b/sys/dev/rtwn/rtl8812a/r12a_tx.c index 336ad75a0b1f45..acb23831655983 100644 --- a/sys/dev/rtwn/rtl8812a/r12a_tx.c +++ b/sys/dev/rtwn/rtl8812a/r12a_tx.c @@ -47,6 +47,7 @@ #include #include +#include #include #include @@ -87,12 +88,42 @@ r12a_get_primary_channel(struct rtwn_softc *sc, struct ieee80211_channel *c) return (0); } +/* + * Configure VHT20/VHT40/VHT80 as appropriate. + * + * This is only called for VHT, not for HT. + */ +static void +r12a_tx_set_vht_bw(struct rtwn_softc *sc, void *buf, struct ieee80211_node *ni) +{ + struct r12a_tx_desc *txd = (struct r12a_tx_desc *)buf; + int prim_chan; + + prim_chan = r12a_get_primary_channel(sc, ni->ni_chan); + + if (ieee80211_vht_check_tx_bw(ni, IEEE80211_STA_RX_BW_80)) { + txd->txdw5 |= htole32(SM(R12A_TXDW5_DATA_BW, + R12A_TXDW5_DATA_BW80)); + txd->txdw5 |= htole32(SM(R12A_TXDW5_DATA_PRIM_CHAN, + prim_chan)); + } else if (ieee80211_vht_check_tx_bw(ni, IEEE80211_STA_RX_BW_40)) { + txd->txdw5 |= htole32(SM(R12A_TXDW5_DATA_BW, + R12A_TXDW5_DATA_BW40)); + txd->txdw5 |= htole32(SM(R12A_TXDW5_DATA_PRIM_CHAN, + prim_chan)); + } +} + +/* + * Configure HT20/HT40 as appropriate. + * + * This is only called for HT, not for VHT. 
+ */ static void r12a_tx_set_ht40(struct rtwn_softc *sc, void *buf, struct ieee80211_node *ni) { struct r12a_tx_desc *txd = (struct r12a_tx_desc *)buf; - /* XXX VHT80; VHT40; VHT20 */ if (ieee80211_ht_check_tx_ht40(ni)) { int prim_chan; @@ -353,8 +384,12 @@ r12a_fill_tx_desc(struct rtwn_softc *sc, struct ieee80211_node *ni, txd->txdw5 |= htole32(R12A_TXDW5_DATA_SHORT); prot = IEEE80211_PROT_NONE; - /* TODO: VHT */ - if (RTWN_RATE_IS_HT(ridx)) { + if (RTWN_RATE_IS_VHT(ridx)) { + r12a_tx_set_vht_bw(sc, txd, ni); + /* XXX TODO: sgi */ + /* XXX TODO: ldpc */ + prot = ic->ic_htprotmode; + } else if (RTWN_RATE_IS_HT(ridx)) { r12a_tx_set_ht40(sc, txd, ni); r12a_tx_set_sgi(sc, txd, ni); r12a_tx_set_ldpc(sc, txd, ni); From ec07af2a3d494de36a20a541efdd24874c841db5 Mon Sep 17 00:00:00 2001 From: Adrian Chadd Date: Sun, 15 Dec 2024 20:15:46 -0800 Subject: [PATCH 071/143] rtwn: announce VHT support for RTL8812AU/RTL8821AU. Although the transmit path doesn't yet support VHT rates (because the rate control and rate representation in net80211 doesn't yet know about VHT rates) the NIC will receive VHT frames but only transmit HT frames. 
Locally tested: * RTL8812AU, STA mode Differential Revision: https://reviews.freebsd.org/D48103 --- sys/dev/rtwn/if_rtwn.c | 23 +++++++++++++++++++++++ sys/dev/rtwn/if_rtwnvar.h | 2 ++ sys/dev/rtwn/rtl8812a/usb/r12au_attach.c | 19 ++++++++++++++++--- sys/dev/rtwn/rtl8821a/usb/r21au_attach.c | 17 ++++++++++++++--- 4 files changed, 55 insertions(+), 6 deletions(-) diff --git a/sys/dev/rtwn/if_rtwn.c b/sys/dev/rtwn/if_rtwn.c index ed84950b1a944c..f9950c5acf4dbd 100644 --- a/sys/dev/rtwn/if_rtwn.c +++ b/sys/dev/rtwn/if_rtwn.c @@ -436,6 +436,29 @@ rtwn_resume(struct rtwn_softc *sc) ieee80211_resume_all(ic); } +void +rtwn_attach_vht_cap_info_mcs(struct rtwn_softc *sc) +{ + struct ieee80211com *ic = &sc->sc_ic; + uint32_t rx_mcs = 0, tx_mcs = 0; + + for (int i = 0 ; i < 8; i++) { + if (i < sc->ntxchains) + tx_mcs |= (IEEE80211_VHT_MCS_SUPPORT_0_9 << (i*2)); + else + tx_mcs |= (IEEE80211_VHT_MCS_NOT_SUPPORTED << (i*2)); + + if (i < sc->nrxchains) + rx_mcs |= (IEEE80211_VHT_MCS_SUPPORT_0_9 << (i*2)); + else + rx_mcs |= (IEEE80211_VHT_MCS_NOT_SUPPORTED << (i*2)); + } + ic->ic_vht_cap.supp_mcs.rx_mcs_map = rx_mcs; + ic->ic_vht_cap.supp_mcs.rx_highest = 0; + ic->ic_vht_cap.supp_mcs.tx_mcs_map = tx_mcs; + ic->ic_vht_cap.supp_mcs.tx_highest = 0; +} + static void rtwn_vap_decrement_counters(struct rtwn_softc *sc, enum ieee80211_opmode opmode, int id) diff --git a/sys/dev/rtwn/if_rtwnvar.h b/sys/dev/rtwn/if_rtwnvar.h index fa4b6d0a5df7af..aa42715b1674e8 100644 --- a/sys/dev/rtwn/if_rtwnvar.h +++ b/sys/dev/rtwn/if_rtwnvar.h @@ -436,6 +436,8 @@ void rtwn_detach(struct rtwn_softc *); void rtwn_resume(struct rtwn_softc *); void rtwn_suspend(struct rtwn_softc *); +void rtwn_attach_vht_cap_info_mcs(struct rtwn_softc *); + /* Interface-specific. 
*/ #define rtwn_write_1(_sc, _addr, _val) \ (((_sc)->sc_write_1)((_sc), (_addr), (_val))) diff --git a/sys/dev/rtwn/rtl8812a/usb/r12au_attach.c b/sys/dev/rtwn/rtl8812a/usb/r12au_attach.c index c87bffb4db197d..b6850eb9fa23eb 100644 --- a/sys/dev/rtwn/rtl8812a/usb/r12au_attach.c +++ b/sys/dev/rtwn/rtl8812a/usb/r12au_attach.c @@ -175,11 +175,24 @@ r12au_adj_devcaps(struct rtwn_softc *sc) } ic->ic_htcaps |= - IEEE80211_HTCAP_CHWIDTH40 /* 40 MHz channel width */ - | IEEE80211_HTCAP_SHORTGI40 /* short GI in 40MHz */ + IEEE80211_HTCAP_CHWIDTH40 | /* 40 MHz channel width */ + IEEE80211_HTCAP_SHORTGI40 /* short GI in 40MHz */ ; - /* TODO: STBC, VHT etc */ + /* TODO: STBC */ + + /* VHT config */ + ic->ic_flags_ext |= IEEE80211_FEXT_VHT; + ic->ic_vht_cap.vht_cap_info = + IEEE80211_VHTCAP_MAX_MPDU_LENGTH_11454 | + IEEE80211_VHTCAP_SHORT_GI_80 | + IEEE80211_VHTCAP_TXSTBC | + IEEE80211_VHTCAP_RXSTBC_1 | + IEEE80211_VHTCAP_HTC_VHT | + _IEEE80211_SHIFTMASK(7, + IEEE80211_VHTCAP_MAX_A_MPDU_LENGTH_EXPONENT_MASK); + + rtwn_attach_vht_cap_info_mcs(sc); } void diff --git a/sys/dev/rtwn/rtl8821a/usb/r21au_attach.c b/sys/dev/rtwn/rtl8821a/usb/r21au_attach.c index 175bac8f6fc9a9..60cb6d3fc61d40 100644 --- a/sys/dev/rtwn/rtl8821a/usb/r21au_attach.c +++ b/sys/dev/rtwn/rtl8821a/usb/r21au_attach.c @@ -160,11 +160,22 @@ r21au_adj_devcaps(struct rtwn_softc *sc) ic->ic_caps |= IEEE80211_C_DFS; ic->ic_htcaps |= - IEEE80211_HTCAP_CHWIDTH40 /* 40 MHz channel width */ - | IEEE80211_HTCAP_SHORTGI40 /* short GI in 40MHz */ + IEEE80211_HTCAP_CHWIDTH40 | /* 40 MHz channel width */ + IEEE80211_HTCAP_SHORTGI40 /* short GI in 40MHz */ ; - /* TODO: VHT */ + /* VHT config */ + ic->ic_flags_ext |= IEEE80211_FEXT_VHT; + ic->ic_vht_cap.vht_cap_info = + IEEE80211_VHTCAP_MAX_MPDU_LENGTH_11454 | + IEEE80211_VHTCAP_SHORT_GI_80 | + IEEE80211_VHTCAP_TXSTBC | + IEEE80211_VHTCAP_RXSTBC_1 | + IEEE80211_VHTCAP_HTC_VHT | + _IEEE80211_SHIFTMASK(7, + IEEE80211_VHTCAP_MAX_A_MPDU_LENGTH_EXPONENT_MASK); + + 
rtwn_attach_vht_cap_info_mcs(sc); } void From 5aac61c5d2bcbd3358b3d74d46827a8bfdeff86c Mon Sep 17 00:00:00 2001 From: Gleb Smirnoff Date: Wed, 8 Jan 2025 20:00:12 -0800 Subject: [PATCH 072/143] rpc: delete disabled code from rpcb_clnt.c The code was disabled since the initial bulk check-in from Isilon in 2008. Its existence poisoned grep(1) results when one would try to learn what are the actual RPC mechanisms that are used by the modern NFS client and server. --- sys/rpc/rpcb_clnt.c | 1142 +------------------------------------------ 1 file changed, 1 insertion(+), 1141 deletions(-) diff --git a/sys/rpc/rpcb_clnt.c b/sys/rpc/rpcb_clnt.c index 5f7d13020af07e..ee2253596f8589 100644 --- a/sys/rpc/rpcb_clnt.c +++ b/sys/rpc/rpcb_clnt.c @@ -62,363 +62,6 @@ static const char nullstring[] = "\000"; static CLIENT *local_rpcb(void); -#if 0 - -static const struct timeval rmttimeout = { 3, 0 }; -static struct timeval rpcbrmttime = { 15, 0 }; - -#define CACHESIZE 6 - -struct address_cache { - char *ac_host; - char *ac_netid; - char *ac_uaddr; - struct netbuf *ac_taddr; - struct address_cache *ac_next; -}; - -static struct address_cache *front; -static int cachesize; - -#define CLCR_GET_RPCB_TIMEOUT 1 -#define CLCR_SET_RPCB_TIMEOUT 2 - - -extern int __rpc_lowvers; - -static struct address_cache *check_cache(const char *, const char *); -static void delete_cache(struct netbuf *); -static void add_cache(const char *, const char *, struct netbuf *, char *); -static CLIENT *getclnthandle(const char *, const struct netconfig *, char **); -static CLIENT *local_rpcb(void); -static struct netbuf *got_entry(rpcb_entry_list_ptr, const struct netconfig *); - -/* - * This routine adjusts the timeout used for calls to the remote rpcbind. - * Also, this routine can be used to set the use of portmapper version 2 - * only when doing rpc_broadcasts - * These are private routines that may not be provided in future releases.
- */ -bool_t -__rpc_control(request, info) - int request; - void *info; -{ - switch (request) { - case CLCR_GET_RPCB_TIMEOUT: - *(struct timeval *)info = tottimeout; - break; - case CLCR_SET_RPCB_TIMEOUT: - tottimeout = *(struct timeval *)info; - break; - case CLCR_SET_LOWVERS: - __rpc_lowvers = *(int *)info; - break; - case CLCR_GET_LOWVERS: - *(int *)info = __rpc_lowvers; - break; - default: - return (FALSE); - } - return (TRUE); -} - -/* - * It might seem that a reader/writer lock would be more reasonable here. - * However because getclnthandle(), the only user of the cache functions, - * may do a delete_cache() operation if a check_cache() fails to return an - * address useful to clnt_tli_create(), we may as well use a mutex. - */ -/* - * As it turns out, if the cache lock is *not* a reader/writer lock, we will - * block all clnt_create's if we are trying to connect to a host that's down, - * since the lock will be held all during that time. - */ - -/* - * The routines check_cache(), add_cache(), delete_cache() manage the - * cache of rpcbind addresses for (host, netid). 
- */ - -static struct address_cache * -check_cache(host, netid) - const char *host, *netid; -{ - struct address_cache *cptr; - - /* READ LOCK HELD ON ENTRY: rpcbaddr_cache_lock */ - - for (cptr = front; cptr != NULL; cptr = cptr->ac_next) { - if (!strcmp(cptr->ac_host, host) && - !strcmp(cptr->ac_netid, netid)) { -#ifdef ND_DEBUG - fprintf(stderr, "Found cache entry for %s: %s\n", - host, netid); -#endif - return (cptr); - } - } - return ((struct address_cache *) NULL); -} - -static void -delete_cache(addr) - struct netbuf *addr; -{ - struct address_cache *cptr, *prevptr = NULL; - - /* WRITE LOCK HELD ON ENTRY: rpcbaddr_cache_lock */ - for (cptr = front; cptr != NULL; cptr = cptr->ac_next) { - if (!memcmp(cptr->ac_taddr->buf, addr->buf, addr->len)) { - free(cptr->ac_host); - free(cptr->ac_netid); - free(cptr->ac_taddr->buf); - free(cptr->ac_taddr); - if (cptr->ac_uaddr) - free(cptr->ac_uaddr); - if (prevptr) - prevptr->ac_next = cptr->ac_next; - else - front = cptr->ac_next; - free(cptr); - cachesize--; - break; - } - prevptr = cptr; - } -} - -static void -add_cache(host, netid, taddr, uaddr) - const char *host, *netid; - char *uaddr; - struct netbuf *taddr; -{ - struct address_cache *ad_cache, *cptr, *prevptr; - - ad_cache = (struct address_cache *) - malloc(sizeof (struct address_cache)); - if (!ad_cache) { - return; - } - ad_cache->ac_host = strdup(host); - ad_cache->ac_netid = strdup(netid); - ad_cache->ac_uaddr = uaddr ? 
strdup(uaddr) : NULL; - ad_cache->ac_taddr = (struct netbuf *)malloc(sizeof (struct netbuf)); - if (!ad_cache->ac_host || !ad_cache->ac_netid || !ad_cache->ac_taddr || - (uaddr && !ad_cache->ac_uaddr)) { - goto out; - } - ad_cache->ac_taddr->len = ad_cache->ac_taddr->maxlen = taddr->len; - ad_cache->ac_taddr->buf = (char *) malloc(taddr->len); - if (ad_cache->ac_taddr->buf == NULL) { -out: - if (ad_cache->ac_host) - free(ad_cache->ac_host); - if (ad_cache->ac_netid) - free(ad_cache->ac_netid); - if (ad_cache->ac_uaddr) - free(ad_cache->ac_uaddr); - if (ad_cache->ac_taddr) - free(ad_cache->ac_taddr); - free(ad_cache); - return; - } - memcpy(ad_cache->ac_taddr->buf, taddr->buf, taddr->len); -#ifdef ND_DEBUG - fprintf(stderr, "Added to cache: %s : %s\n", host, netid); -#endif - -/* VARIABLES PROTECTED BY rpcbaddr_cache_lock: cptr */ - - rwlock_wrlock(&rpcbaddr_cache_lock); - if (cachesize < CACHESIZE) { - ad_cache->ac_next = front; - front = ad_cache; - cachesize++; - } else { - /* Free the last entry */ - cptr = front; - prevptr = NULL; - while (cptr->ac_next) { - prevptr = cptr; - cptr = cptr->ac_next; - } - -#ifdef ND_DEBUG - fprintf(stderr, "Deleted from cache: %s : %s\n", - cptr->ac_host, cptr->ac_netid); -#endif - free(cptr->ac_host); - free(cptr->ac_netid); - free(cptr->ac_taddr->buf); - free(cptr->ac_taddr); - if (cptr->ac_uaddr) - free(cptr->ac_uaddr); - - if (prevptr) { - prevptr->ac_next = NULL; - ad_cache->ac_next = front; - front = ad_cache; - } else { - front = ad_cache; - ad_cache->ac_next = NULL; - } - free(cptr); - } - rwlock_unlock(&rpcbaddr_cache_lock); -} - -/* - * This routine will return a client handle that is connected to the - * rpcbind. If targaddr is non-NULL, the "universal address" of the - * host will be stored in *targaddr; the caller is responsible for - * freeing this string. - * On error, returns NULL and free's everything. 
- */ -static CLIENT * -getclnthandle(host, nconf, targaddr) - const char *host; - const struct netconfig *nconf; - char **targaddr; -{ - CLIENT *client; - struct netbuf *addr, taddr; - struct netbuf addr_to_delete; - struct __rpc_sockinfo si; - struct addrinfo hints, *res, *tres; - struct address_cache *ad_cache; - char *tmpaddr; - -/* VARIABLES PROTECTED BY rpcbaddr_cache_lock: ad_cache */ - - /* Get the address of the rpcbind. Check cache first */ - client = NULL; - addr_to_delete.len = 0; - rwlock_rdlock(&rpcbaddr_cache_lock); - ad_cache = NULL; - if (host != NULL) - ad_cache = check_cache(host, nconf->nc_netid); - if (ad_cache != NULL) { - addr = ad_cache->ac_taddr; - client = clnt_tli_create(RPC_ANYFD, nconf, addr, - (rpcprog_t)RPCBPROG, (rpcvers_t)RPCBVERS4, 0, 0); - if (client != NULL) { - if (targaddr) - *targaddr = strdup(ad_cache->ac_uaddr); - rwlock_unlock(&rpcbaddr_cache_lock); - return (client); - } - addr_to_delete.len = addr->len; - addr_to_delete.buf = (char *)malloc(addr->len); - if (addr_to_delete.buf == NULL) { - addr_to_delete.len = 0; - } else { - memcpy(addr_to_delete.buf, addr->buf, addr->len); - } - } - rwlock_unlock(&rpcbaddr_cache_lock); - if (addr_to_delete.len != 0) { - /* - * Assume this may be due to cache data being - * outdated - */ - rwlock_wrlock(&rpcbaddr_cache_lock); - delete_cache(&addr_to_delete); - rwlock_unlock(&rpcbaddr_cache_lock); - free(addr_to_delete.buf); - } - if (!__rpc_nconf2sockinfo(nconf, &si)) { - rpc_createerr.cf_stat = RPC_UNKNOWNPROTO; - return NULL; - } - - memset(&hints, 0, sizeof hints); - hints.ai_family = si.si_af; - hints.ai_socktype = si.si_socktype; - hints.ai_protocol = si.si_proto; - -#ifdef CLNT_DEBUG - printf("trying netid %s family %d proto %d socktype %d\n", - nconf->nc_netid, si.si_af, si.si_proto, si.si_socktype); -#endif - - if (nconf->nc_protofmly != NULL && strcmp(nconf->nc_protofmly, NC_LOOPBACK) == 0) { - client = local_rpcb(); - if (! 
client) { -#ifdef ND_DEBUG - clnt_pcreateerror("rpcbind clnt interface"); -#endif - return (NULL); - } else { - struct sockaddr_un sun; - if (targaddr) { - *targaddr = malloc(sizeof(sun.sun_path)); - if (*targaddr == NULL) { - CLNT_DESTROY(client); - return (NULL); - } - strncpy(*targaddr, _PATH_RPCBINDSOCK, - sizeof(sun.sun_path)); - } - return (client); - } - } else { - if (getaddrinfo(host, "sunrpc", &hints, &res) != 0) { - rpc_createerr.cf_stat = RPC_UNKNOWNHOST; - return NULL; - } - } - - for (tres = res; tres != NULL; tres = tres->ai_next) { - taddr.buf = tres->ai_addr; - taddr.len = taddr.maxlen = tres->ai_addrlen; - -#ifdef ND_DEBUG - { - char *ua; - - ua = taddr2uaddr(nconf, &taddr); - fprintf(stderr, "Got it [%s]\n", ua); - free(ua); - } -#endif - -#ifdef ND_DEBUG - { - int i; - - fprintf(stderr, "\tnetbuf len = %d, maxlen = %d\n", - taddr.len, taddr.maxlen); - fprintf(stderr, "\tAddress is "); - for (i = 0; i < taddr.len; i++) - fprintf(stderr, "%u.", ((char *)(taddr.buf))[i]); - fprintf(stderr, "\n"); - } -#endif - client = clnt_tli_create(RPC_ANYFD, nconf, &taddr, - (rpcprog_t)RPCBPROG, (rpcvers_t)RPCBVERS4, 0, 0); -#ifdef ND_DEBUG - if (! client) { - clnt_pcreateerror("rpcbind clnt interface"); - } -#endif - - if (client) { - tmpaddr = targaddr ? taddr2uaddr(nconf, &taddr) : NULL; - add_cache(host, nconf->nc_netid, &taddr, tmpaddr); - if (targaddr) - *targaddr = tmpaddr; - break; - } - } - if (res) - freeaddrinfo(res); - return (client); -} - -#endif - /* XXX */ #define IN4_LOCALHOST_STRING "127.0.0.1" #define IN6_LOCALHOST_STRING "::1" @@ -446,7 +89,7 @@ local_rpcb(void) error = socreate(AF_LOCAL, &so, SOCK_STREAM, 0, curthread->td_ucred, curthread); if (error) - goto try_nconf; + return (NULL); sun.sun_family = AF_LOCAL; strcpy(sun.sun_path, _PATH_RPCBINDSOCK); sun.sun_len = SUN_LEN(&sun); @@ -464,65 +107,7 @@ local_rpcb(void) /* Nobody needs this socket anymore; free the descriptor. 
*/ soclose(so); -try_nconf: - -#if 0 - static struct netconfig *loopnconf; - static char *localhostname; - -/* VARIABLES PROTECTED BY loopnconf_lock: loopnconf */ - mutex_lock(&loopnconf_lock); - if (loopnconf == NULL) { - struct netconfig *nconf, *tmpnconf = NULL; - void *nc_handle; - int fd; - - nc_handle = setnetconfig(); - if (nc_handle == NULL) { - /* fails to open netconfig file */ - syslog (LOG_ERR, "rpc: failed to open " NETCONFIG); - rpc_createerr.cf_stat = RPC_UNKNOWNPROTO; - mutex_unlock(&loopnconf_lock); - return (NULL); - } - while ((nconf = getnetconfig(nc_handle)) != NULL) { - if (( -#ifdef INET6 - strcmp(nconf->nc_protofmly, NC_INET6) == 0 || -#endif - strcmp(nconf->nc_protofmly, NC_INET) == 0) && - (nconf->nc_semantics == NC_TPI_COTS || - nconf->nc_semantics == NC_TPI_COTS_ORD)) { - fd = __rpc_nconf2fd(nconf); - /* - * Can't create a socket, assume that - * this family isn't configured in the kernel. - */ - if (fd < 0) - continue; - _close(fd); - tmpnconf = nconf; - if (!strcmp(nconf->nc_protofmly, NC_INET)) - localhostname = IN4_LOCALHOST_STRING; - else - localhostname = IN6_LOCALHOST_STRING; - } - } - if (tmpnconf == NULL) { - rpc_createerr.cf_stat = RPC_UNKNOWNPROTO; - mutex_unlock(&loopnconf_lock); - return (NULL); - } - loopnconf = getnetconfigent(tmpnconf->nc_netid); - /* loopnconf is never freed */ - endnetconfig(nc_handle); - } - mutex_unlock(&loopnconf_lock); - client = getclnthandle(localhostname, loopnconf, NULL); - return (client); -#else return (NULL); -#endif } /* @@ -636,728 +221,3 @@ rpcb_unset(rpcprog_t program, rpcvers_t version, const struct netconfig *nconf) CLNT_DESTROY(client); return (rslt); } - -#if 0 - -/* - * From the merged list, find the appropriate entry - */ -static struct netbuf * -got_entry(relp, nconf) - rpcb_entry_list_ptr relp; - const struct netconfig *nconf; -{ - struct netbuf *na = NULL; - rpcb_entry_list_ptr sp; - rpcb_entry *rmap; - - for (sp = relp; sp != NULL; sp = sp->rpcb_entry_next) { - rmap = 
&sp->rpcb_entry_map; - if ((strcmp(nconf->nc_proto, rmap->r_nc_proto) == 0) && - (strcmp(nconf->nc_protofmly, rmap->r_nc_protofmly) == 0) && - (nconf->nc_semantics == rmap->r_nc_semantics) && - (rmap->r_maddr != NULL) && (rmap->r_maddr[0] != 0)) { - na = uaddr2taddr(nconf, rmap->r_maddr); -#ifdef ND_DEBUG - fprintf(stderr, "\tRemote address is [%s].\n", - rmap->r_maddr); - if (!na) - fprintf(stderr, - "\tCouldn't resolve remote address!\n"); -#endif - break; - } - } - return (na); -} - -/* - * Quick check to see if rpcbind is up. Tries to connect over - * local transport. - */ -static bool_t -__rpcbind_is_up() -{ - struct netconfig *nconf; - struct sockaddr_un sun; - void *localhandle; - int sock; - - nconf = NULL; - localhandle = setnetconfig(); - while ((nconf = getnetconfig(localhandle)) != NULL) { - if (nconf->nc_protofmly != NULL && - strcmp(nconf->nc_protofmly, NC_LOOPBACK) == 0) - break; - } - if (nconf == NULL) - return (FALSE); - - endnetconfig(localhandle); - - memset(&sun, 0, sizeof sun); - sock = _socket(AF_LOCAL, SOCK_STREAM, 0); - if (sock < 0) - return (FALSE); - sun.sun_family = AF_LOCAL; - strncpy(sun.sun_path, _PATH_RPCBINDSOCK, sizeof(sun.sun_path)); - sun.sun_len = SUN_LEN(&sun); - - if (_connect(sock, (struct sockaddr *)&sun, sun.sun_len) < 0) { - _close(sock); - return (FALSE); - } - - _close(sock); - return (TRUE); -} - -/* - * An internal function which optimizes rpcb_getaddr function. It also - * returns the client handle that it uses to contact the remote rpcbind. - * - * The algorithm used: If the transports is TCP or UDP, it first tries - * version 2 (portmap), 4 and then 3 (svr4). This order should be - * changed in the next OS release to 4, 2 and 3. We are assuming that by - * that time, version 4 would be available on many machines on the network. - * With this algorithm, we get performance as well as a plan for - * obsoleting version 2. - * - * For all other transports, the algorithm remains as 4 and then 3. 
- * - * XXX: Due to some problems with t_connect(), we do not reuse the same client - * handle for COTS cases and hence in these cases we do not return the - * client handle. This code will change if t_connect() ever - * starts working properly. Also look under clnt_vc.c. - */ -struct netbuf * -__rpcb_findaddr_timed(program, version, nconf, host, clpp, tp) - rpcprog_t program; - rpcvers_t version; - const struct netconfig *nconf; - const char *host; - CLIENT **clpp; - struct timeval *tp; -{ - static bool_t check_rpcbind = TRUE; - CLIENT *client = NULL; - RPCB parms; - enum clnt_stat clnt_st; - char *ua = NULL; - rpcvers_t vers; - struct netbuf *address = NULL; - rpcvers_t start_vers = RPCBVERS4; - struct netbuf servaddr; - - /* parameter checking */ - if (nconf == NULL) { - rpc_createerr.cf_stat = RPC_UNKNOWNPROTO; - return (NULL); - } - - parms.r_addr = NULL; - - /* - * Use default total timeout if no timeout is specified. - */ - if (tp == NULL) - tp = &tottimeout; - -#ifdef PORTMAP - /* Try version 2 for TCP or UDP */ - if (strcmp(nconf->nc_protofmly, NC_INET) == 0) { - u_short port = 0; - struct netbuf remote; - rpcvers_t pmapvers = 2; - struct pmap pmapparms; - - /* - * Try UDP only - there are some portmappers out - * there that use UDP only. - */ - if (strcmp(nconf->nc_proto, NC_TCP) == 0) { - struct netconfig *newnconf; - - if ((newnconf = getnetconfigent("udp")) == NULL) { - rpc_createerr.cf_stat = RPC_UNKNOWNPROTO; - return (NULL); - } - client = getclnthandle(host, newnconf, &parms.r_addr); - freenetconfigent(newnconf); - } else { - client = getclnthandle(host, nconf, &parms.r_addr); - } - if (client == NULL) - return (NULL); - - /* - * Set version and retry timeout. - */ - CLNT_CONTROL(client, CLSET_RETRY_TIMEOUT, (char *)&rpcbrmttime); - CLNT_CONTROL(client, CLSET_VERS, (char *)&pmapvers); - - pmapparms.pm_prog = program; - pmapparms.pm_vers = version; - pmapparms.pm_prot = strcmp(nconf->nc_proto, NC_TCP) ? 
- IPPROTO_UDP : IPPROTO_TCP; - pmapparms.pm_port = 0; /* not needed */ - clnt_st = CLNT_CALL(client, (rpcproc_t)PMAPPROC_GETPORT, - (xdrproc_t) xdr_pmap, (caddr_t)(void *)&pmapparms, - (xdrproc_t) xdr_u_short, (caddr_t)(void *)&port, - *tp); - if (clnt_st != RPC_SUCCESS) { - if ((clnt_st == RPC_PROGVERSMISMATCH) || - (clnt_st == RPC_PROGUNAVAIL)) - goto try_rpcbind; /* Try different versions */ - rpc_createerr.cf_stat = RPC_PMAPFAILURE; - clnt_geterr(client, &rpc_createerr.cf_error); - goto error; - } else if (port == 0) { - address = NULL; - rpc_createerr.cf_stat = RPC_PROGNOTREGISTERED; - goto error; - } - port = htons(port); - CLNT_CONTROL(client, CLGET_SVC_ADDR, (char *)&remote); - if (((address = (struct netbuf *) - malloc(sizeof (struct netbuf))) == NULL) || - ((address->buf = (char *) - malloc(remote.len)) == NULL)) { - rpc_createerr.cf_stat = RPC_SYSTEMERROR; - clnt_geterr(client, &rpc_createerr.cf_error); - if (address) { - free(address); - address = NULL; - } - goto error; - } - memcpy(address->buf, remote.buf, remote.len); - memcpy(&((char *)address->buf)[sizeof (short)], - (char *)(void *)&port, sizeof (short)); - address->len = address->maxlen = remote.len; - goto done; - } -#endif /* PORTMAP */ - -try_rpcbind: - /* - * Check if rpcbind is up. This prevents needless delays when - * accessing applications such as the keyserver while booting - * disklessly. - */ - if (check_rpcbind && strcmp(nconf->nc_protofmly, NC_LOOPBACK) == 0) { - if (!__rpcbind_is_up()) { - rpc_createerr.cf_stat = RPC_PMAPFAILURE; - rpc_createerr.cf_error.re_errno = 0; - goto error; - } - check_rpcbind = FALSE; - } - - /* - * Now we try version 4 and then 3. 
- * We also send the remote system the address we used to - * contact it in case it can help to connect back with us - */ - parms.r_prog = program; - parms.r_vers = version; - /*LINTED const castaway*/ - parms.r_owner = (char *) &nullstring[0]; /* not needed; */ - /* just for xdring */ - parms.r_netid = nconf->nc_netid; /* not really needed */ - - /* - * If a COTS transport is being used, try getting address via CLTS - * transport. This works only with version 4. - */ - if (nconf->nc_semantics == NC_TPI_COTS_ORD || - nconf->nc_semantics == NC_TPI_COTS) { - - void *handle; - struct netconfig *nconf_clts; - rpcb_entry_list_ptr relp = NULL; - - if (client == NULL) { - /* This did not go through the above PORTMAP/TCP code */ - if ((handle = __rpc_setconf("datagram_v")) != NULL) { - while ((nconf_clts = __rpc_getconf(handle)) - != NULL) { - if (strcmp(nconf_clts->nc_protofmly, - nconf->nc_protofmly) != 0) { - continue; - } - client = getclnthandle(host, nconf_clts, - &parms.r_addr); - break; - } - __rpc_endconf(handle); - } - if (client == NULL) - goto regular_rpcbind; /* Go the regular way */ - } else { - /* This is a UDP PORTMAP handle. 
Change to version 4 */ - vers = RPCBVERS4; - CLNT_CONTROL(client, CLSET_VERS, (char *)(void *)&vers); - } - /* - * We also send the remote system the address we used to - * contact it in case it can help it connect back with us - */ - if (parms.r_addr == NULL) { - /*LINTED const castaway*/ - parms.r_addr = (char *) &nullstring[0]; /* for XDRing */ - } - - CLNT_CONTROL(client, CLSET_RETRY_TIMEOUT, (char *)&rpcbrmttime); - - clnt_st = CLNT_CALL(client, (rpcproc_t)RPCBPROC_GETADDRLIST, - (xdrproc_t) xdr_rpcb, (char *)(void *)&parms, - (xdrproc_t) xdr_rpcb_entry_list_ptr, - (char *)(void *)&relp, *tp); - if (clnt_st == RPC_SUCCESS) { - if ((address = got_entry(relp, nconf)) != NULL) { - xdr_free((xdrproc_t) xdr_rpcb_entry_list_ptr, - (char *)(void *)&relp); - CLNT_CONTROL(client, CLGET_SVC_ADDR, - (char *)(void *)&servaddr); - __rpc_fixup_addr(address, &servaddr); - goto done; - } - /* Entry not found for this transport */ - xdr_free((xdrproc_t) xdr_rpcb_entry_list_ptr, - (char *)(void *)&relp); - /* - * XXX: should have perhaps returned with error but - * since the remote machine might not always be able - * to send the address on all transports, we try the - * regular way with regular_rpcbind - */ - goto regular_rpcbind; - } else if ((clnt_st == RPC_PROGVERSMISMATCH) || - (clnt_st == RPC_PROGUNAVAIL)) { - start_vers = RPCBVERS; /* Try version 3 now */ - goto regular_rpcbind; /* Try different versions */ - } else { - rpc_createerr.cf_stat = RPC_PMAPFAILURE; - clnt_geterr(client, &rpc_createerr.cf_error); - goto error; - } - } - -regular_rpcbind: - - /* Now the same transport is to be used to get the address */ - if (client && ((nconf->nc_semantics == NC_TPI_COTS_ORD) || - (nconf->nc_semantics == NC_TPI_COTS))) { - /* A CLTS type of client - destroy it */ - CLNT_DESTROY(client); - client = NULL; - } - - if (client == NULL) { - client = getclnthandle(host, nconf, &parms.r_addr); - if (client == NULL) { - goto error; - } - } - if (parms.r_addr == NULL) { - /*LINTED const 
castaway*/ - parms.r_addr = (char *) &nullstring[0]; - } - - /* First try from start_vers and then version 3 (RPCBVERS) */ - - CLNT_CONTROL(client, CLSET_RETRY_TIMEOUT, (char *) &rpcbrmttime); - for (vers = start_vers; vers >= RPCBVERS; vers--) { - /* Set the version */ - CLNT_CONTROL(client, CLSET_VERS, (char *)(void *)&vers); - clnt_st = CLNT_CALL(client, (rpcproc_t)RPCBPROC_GETADDR, - (xdrproc_t) xdr_rpcb, (char *)(void *)&parms, - (xdrproc_t) xdr_wrapstring, (char *)(void *) &ua, *tp); - if (clnt_st == RPC_SUCCESS) { - if ((ua == NULL) || (ua[0] == 0)) { - /* address unknown */ - rpc_createerr.cf_stat = RPC_PROGNOTREGISTERED; - goto error; - } - address = uaddr2taddr(nconf, ua); -#ifdef ND_DEBUG - fprintf(stderr, "\tRemote address is [%s]\n", ua); - if (!address) - fprintf(stderr, - "\tCouldn't resolve remote address!\n"); -#endif - xdr_free((xdrproc_t)xdr_wrapstring, - (char *)(void *)&ua); - - if (! address) { - /* We don't know about your universal address */ - rpc_createerr.cf_stat = RPC_N2AXLATEFAILURE; - goto error; - } - CLNT_CONTROL(client, CLGET_SVC_ADDR, - (char *)(void *)&servaddr); - __rpc_fixup_addr(address, &servaddr); - goto done; - } else if (clnt_st == RPC_PROGVERSMISMATCH) { - struct rpc_err rpcerr; - - clnt_geterr(client, &rpcerr); - if (rpcerr.re_vers.low > RPCBVERS4) - goto error; /* a new version, can't handle */ - } else if (clnt_st != RPC_PROGUNAVAIL) { - /* Cant handle this error */ - rpc_createerr.cf_stat = clnt_st; - clnt_geterr(client, &rpc_createerr.cf_error); - goto error; - } - } - -error: - if (client) { - CLNT_DESTROY(client); - client = NULL; - } -done: - if (nconf->nc_semantics != NC_TPI_CLTS) { - /* This client is the connectionless one */ - if (client) { - CLNT_DESTROY(client); - client = NULL; - } - } - if (clpp) { - *clpp = client; - } else if (client) { - CLNT_DESTROY(client); - } - if (parms.r_addr != NULL && parms.r_addr != nullstring) - free(parms.r_addr); - return (address); -} - - -/* - * Find the mapped address for 
program, version. - * Calls the rpcbind service remotely to do the lookup. - * Uses the transport specified in nconf. - * Returns FALSE (0) if no map exists, else returns 1. - * - * Assuming that the address is all properly allocated - */ -bool_t -rpcb_getaddr(program, version, nconf, address, host) - rpcprog_t program; - rpcvers_t version; - const struct netconfig *nconf; - struct netbuf *address; - const char *host; -{ - struct netbuf *na; - - if ((na = __rpcb_findaddr_timed(program, version, - (struct netconfig *) nconf, (char *) host, - (CLIENT **) NULL, (struct timeval *) NULL)) == NULL) - return (FALSE); - - if (na->len > address->maxlen) { - /* Too long address */ - free(na->buf); - free(na); - rpc_createerr.cf_stat = RPC_FAILED; - return (FALSE); - } - memcpy(address->buf, na->buf, (size_t)na->len); - address->len = na->len; - free(na->buf); - free(na); - return (TRUE); -} - -/* - * Get a copy of the current maps. - * Calls the rpcbind service remotely to get the maps. - * - * It returns only a list of the services - * It returns NULL on failure. 
- */ -rpcblist * -rpcb_getmaps(nconf, host) - const struct netconfig *nconf; - const char *host; -{ - rpcblist_ptr head = NULL; - CLIENT *client; - enum clnt_stat clnt_st; - rpcvers_t vers = 0; - - client = getclnthandle(host, nconf, NULL); - if (client == NULL) { - return (head); - } - clnt_st = CLNT_CALL(client, (rpcproc_t)RPCBPROC_DUMP, - (xdrproc_t) xdr_void, NULL, (xdrproc_t) xdr_rpcblist_ptr, - (char *)(void *)&head, tottimeout); - if (clnt_st == RPC_SUCCESS) - goto done; - - if ((clnt_st != RPC_PROGVERSMISMATCH) && - (clnt_st != RPC_PROGUNAVAIL)) { - rpc_createerr.cf_stat = RPC_RPCBFAILURE; - clnt_geterr(client, &rpc_createerr.cf_error); - goto done; - } - - /* fall back to earlier version */ - CLNT_CONTROL(client, CLGET_VERS, (char *)(void *)&vers); - if (vers == RPCBVERS4) { - vers = RPCBVERS; - CLNT_CONTROL(client, CLSET_VERS, (char *)(void *)&vers); - if (CLNT_CALL(client, (rpcproc_t)RPCBPROC_DUMP, - (xdrproc_t) xdr_void, NULL, (xdrproc_t) xdr_rpcblist_ptr, - (char *)(void *)&head, tottimeout) == RPC_SUCCESS) - goto done; - } - rpc_createerr.cf_stat = RPC_RPCBFAILURE; - clnt_geterr(client, &rpc_createerr.cf_error); - -done: - CLNT_DESTROY(client); - return (head); -} - -/* - * rpcbinder remote-call-service interface. - * This routine is used to call the rpcbind remote call service - * which will look up a service program in the address maps, and then - * remotely call that routine with the given parameters. This allows - * programs to do a lookup and call in one step. 
-*/ -enum clnt_stat -rpcb_rmtcall(nconf, host, prog, vers, proc, xdrargs, argsp, - xdrres, resp, tout, addr_ptr) - const struct netconfig *nconf; /* Netconfig structure */ - const char *host; /* Remote host name */ - rpcprog_t prog; - rpcvers_t vers; - rpcproc_t proc; /* Remote proc identifiers */ - xdrproc_t xdrargs, xdrres; /* XDR routines */ - caddr_t argsp, resp; /* Argument and Result */ - struct timeval tout; /* Timeout value for this call */ - const struct netbuf *addr_ptr; /* Preallocated netbuf address */ -{ - CLIENT *client; - enum clnt_stat stat; - struct r_rpcb_rmtcallargs a; - struct r_rpcb_rmtcallres r; - rpcvers_t rpcb_vers; - - stat = 0; - client = getclnthandle(host, nconf, NULL); - if (client == NULL) { - return (RPC_FAILED); - } - /*LINTED const castaway*/ - CLNT_CONTROL(client, CLSET_RETRY_TIMEOUT, (char *)(void *)&rmttimeout); - a.prog = prog; - a.vers = vers; - a.proc = proc; - a.args.args_val = argsp; - a.xdr_args = xdrargs; - r.addr = NULL; - r.results.results_val = resp; - r.xdr_res = xdrres; - - for (rpcb_vers = RPCBVERS4; rpcb_vers >= RPCBVERS; rpcb_vers--) { - CLNT_CONTROL(client, CLSET_VERS, (char *)(void *)&rpcb_vers); - stat = CLNT_CALL(client, (rpcproc_t)RPCBPROC_CALLIT, - (xdrproc_t) xdr_rpcb_rmtcallargs, (char *)(void *)&a, - (xdrproc_t) xdr_rpcb_rmtcallres, (char *)(void *)&r, tout); - if ((stat == RPC_SUCCESS) && (addr_ptr != NULL)) { - struct netbuf *na; - /*LINTED const castaway*/ - na = uaddr2taddr((struct netconfig *) nconf, r.addr); - if (!na) { - stat = RPC_N2AXLATEFAILURE; - /*LINTED const castaway*/ - ((struct netbuf *) addr_ptr)->len = 0; - goto error; - } - if (na->len > addr_ptr->maxlen) { - /* Too long address */ - stat = RPC_FAILED; /* XXX A better error no */ - free(na->buf); - free(na); - /*LINTED const castaway*/ - ((struct netbuf *) addr_ptr)->len = 0; - goto error; - } - memcpy(addr_ptr->buf, na->buf, (size_t)na->len); - /*LINTED const castaway*/ - ((struct netbuf *)addr_ptr)->len = na->len; - free(na->buf); - 
free(na); - break; - } else if ((stat != RPC_PROGVERSMISMATCH) && - (stat != RPC_PROGUNAVAIL)) { - goto error; - } - } -error: - CLNT_DESTROY(client); - if (r.addr) - xdr_free((xdrproc_t) xdr_wrapstring, (char *)(void *)&r.addr); - return (stat); -} - -/* - * Gets the time on the remote host. - * Returns 1 if succeeds else 0. - */ -bool_t -rpcb_gettime(host, timep) - const char *host; - time_t *timep; -{ - CLIENT *client = NULL; - void *handle; - struct netconfig *nconf; - rpcvers_t vers; - enum clnt_stat st; - - - if ((host == NULL) || (host[0] == 0)) { - time(timep); - return (TRUE); - } - - if ((handle = __rpc_setconf("netpath")) == NULL) { - rpc_createerr.cf_stat = RPC_UNKNOWNPROTO; - return (FALSE); - } - rpc_createerr.cf_stat = RPC_SUCCESS; - while (client == NULL) { - if ((nconf = __rpc_getconf(handle)) == NULL) { - if (rpc_createerr.cf_stat == RPC_SUCCESS) - rpc_createerr.cf_stat = RPC_UNKNOWNPROTO; - break; - } - client = getclnthandle(host, nconf, NULL); - if (client) - break; - } - __rpc_endconf(handle); - if (client == (CLIENT *) NULL) { - return (FALSE); - } - - st = CLNT_CALL(client, (rpcproc_t)RPCBPROC_GETTIME, - (xdrproc_t) xdr_void, NULL, - (xdrproc_t) xdr_int, (char *)(void *)timep, tottimeout); - - if ((st == RPC_PROGVERSMISMATCH) || (st == RPC_PROGUNAVAIL)) { - CLNT_CONTROL(client, CLGET_VERS, (char *)(void *)&vers); - if (vers == RPCBVERS4) { - /* fall back to earlier version */ - vers = RPCBVERS; - CLNT_CONTROL(client, CLSET_VERS, (char *)(void *)&vers); - st = CLNT_CALL(client, (rpcproc_t)RPCBPROC_GETTIME, - (xdrproc_t) xdr_void, NULL, - (xdrproc_t) xdr_int, (char *)(void *)timep, - tottimeout); - } - } - CLNT_DESTROY(client); - return (st == RPC_SUCCESS? 
TRUE: FALSE); -} - -static bool_t -xdr_netbuf(XDR *xdrs, struct netbuf *objp) -{ - bool_t dummy; - void **pp; - - if (!xdr_uint32_t(xdrs, (uint32_t *) &objp->maxlen)) { - return (FALSE); - } - pp = &objp->buf; - - if (objp->maxlen > RPC_MAXDATASIZE) { - return (FALSE); - } - - dummy = xdr_bytes(xdrs, (char **) pp, - (u_int *)&(objp->len), objp->maxlen); - return (dummy); -} - -/* - * Converts taddr to universal address. This routine should never - * really be called because local n2a libraries are always provided. - */ -char * -rpcb_taddr2uaddr(struct netconfig *nconf, struct netbuf *taddr) -{ - CLIENT *client; - char *uaddr = NULL; - - - /* parameter checking */ - if (nconf == NULL) { - rpc_createerr.cf_stat = RPC_UNKNOWNPROTO; - return (NULL); - } - if (taddr == NULL) { - rpc_createerr.cf_stat = RPC_UNKNOWNADDR; - return (NULL); - } - client = local_rpcb(); - if (! client) { - return (NULL); - } - - CLNT_CALL(client, (rpcproc_t)RPCBPROC_TADDR2UADDR, - (xdrproc_t) xdr_netbuf, (char *)(void *)taddr, - (xdrproc_t) xdr_wrapstring, (char *)(void *)&uaddr, tottimeout); - CLNT_DESTROY(client); - return (uaddr); -} - -/* - * Converts universal address to netbuf. This routine should never - * really be called because local n2a libraries are always provided. - */ -struct netbuf * -rpcb_uaddr2taddr(struct netconfig *nconf, char *uaddr) -{ - CLIENT *client; - struct netbuf *taddr; - - - /* parameter checking */ - if (nconf == NULL) { - rpc_createerr.cf_stat = RPC_UNKNOWNPROTO; - return (NULL); - } - if (uaddr == NULL) { - rpc_createerr.cf_stat = RPC_UNKNOWNADDR; - return (NULL); - } - client = local_rpcb(); - if (! 
client) { - return (NULL); - } - - taddr = (struct netbuf *)malloc(sizeof (struct netbuf), M_RPC, M_WAITOK|M_ZERO); - if (CLNT_CALL(client, (rpcproc_t)RPCBPROC_UADDR2TADDR, - (xdrproc_t) xdr_wrapstring, (char *)(void *)&uaddr, - (xdrproc_t) xdr_netbuf, (char *)(void *)taddr, - tottimeout) != RPC_SUCCESS) { - free(taddr); - taddr = NULL; - } - CLNT_DESTROY(client); - return (taddr); -} - -#endif From 2834fd2ad58b42c45aa02d0cd21fc1c04b3c278a Mon Sep 17 00:00:00 2001 From: Gleb Smirnoff Date: Wed, 8 Jan 2025 20:00:12 -0800 Subject: [PATCH 073/143] kgssapi: remove the debug module Its build was disabled since original bulk check-in in 2008. Today it fails to compile due to multiple errors. I also tried to build it on stable/10, and that failed, too. I guess it wasn't buildable since initial check-in. --- sys/conf/files | 1 - sys/conf/options | 1 - sys/kgssapi/gsstest.c | 1145 ----------------------------------------- 3 files changed, 1147 deletions(-) delete mode 100644 sys/kgssapi/gsstest.c diff --git a/sys/conf/files b/sys/conf/files index 428a2805768c50..d358737c561320 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -4038,7 +4038,6 @@ kgssapi/krb5/krb5_mech.c optional kgssapi kgssapi/krb5/kcrypto.c optional kgssapi kgssapi/krb5/kcrypto_aes.c optional kgssapi kgssapi/kgss_if.m optional kgssapi -kgssapi/gsstest.c optional kgssapi_debug # These files in libkern/ are those needed by all architectures. Some # of the files in libkern/ are only needed on some architectures, e.g., # libkern/divdi3.c is needed by i386 but not alpha. Also, some of these diff --git a/sys/conf/options b/sys/conf/options index 438d0e81889c8a..c467dc9995c254 100644 --- a/sys/conf/options +++ b/sys/conf/options @@ -290,7 +290,6 @@ TARFS_DEBUG opt_tarfs.h # In-kernel GSS-API KGSSAPI opt_kgssapi.h -KGSSAPI_DEBUG opt_kgssapi.h # These static filesystems have one slightly bogus static dependency in # sys/i386/i386/autoconf.c. 
If any of these filesystems are diff --git a/sys/kgssapi/gsstest.c b/sys/kgssapi/gsstest.c deleted file mode 100644 index e47b25042d1c64..00000000000000 --- a/sys/kgssapi/gsstest.c +++ /dev/null @@ -1,1145 +0,0 @@ -/*- - * SPDX-License-Identifier: BSD-2-Clause - * - * Copyright (c) 2008 Isilon Inc http://www.isilon.com/ - * Authors: Doug Rabson - * Developed with Red Inc: Alfred Perlstein - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -static void -report_error(gss_OID mech, OM_uint32 maj, OM_uint32 min) -{ - OM_uint32 maj_stat, min_stat; - OM_uint32 message_context; - gss_buffer_desc buf; - - uprintf("major_stat=%d, minor_stat=%d\n", maj, min); - message_context = 0; - do { - maj_stat = gss_display_status(&min_stat, maj, - GSS_C_GSS_CODE, GSS_C_NO_OID, &message_context, &buf); - if (GSS_ERROR(maj_stat)) - break; - uprintf("%.*s\n", (int)buf.length, (char *) buf.value); - gss_release_buffer(&min_stat, &buf); - } while (message_context); - if (mech && min) { - message_context = 0; - do { - maj_stat = gss_display_status(&min_stat, min, - GSS_C_MECH_CODE, mech, &message_context, &buf); - if (GSS_ERROR(maj_stat)) - break; - uprintf("%.*s\n", (int)buf.length, (char *) buf.value); - gss_release_buffer(&min_stat, &buf); - } while (message_context); - } -} - -#if 0 -static void -send_token_to_peer(const gss_buffer_t token) -{ - const uint8_t *p; - size_t i; - - printf("send token:\n"); - printf("%d ", (int) token->length); - p = (const uint8_t *) token->value; - for (i = 0; i < token->length; i++) - printf("%02x", *p++); - printf("\n"); -} - -static void -receive_token_from_peer(gss_buffer_t token) -{ - char line[8192]; - char *p; - uint8_t *q; - int len, val; - - printf("receive token:\n"); - fgets(line, sizeof(line), stdin); - if (line[strlen(line) - 1] != '\n') { - printf("token truncated\n"); - exit(1); - } - p = line; - if (sscanf(line, "%d ", &len) != 1) { - printf("bad token\n"); - exit(1); - } - p = strchr(p, ' ') + 1; - token->length = len; - token->value = malloc(len); - q = (uint8_t *) token->value; - while (len) { - if (sscanf(p, "%02x", &val) != 1) { - printf("bad token\n"); - exit(1); - } - *q++ = val; - p += 2; - len--; - } -} -#endif - -#if 0 -void -server(int argc, char** argv) -{ - OM_uint32 maj_stat, 
min_stat; - gss_buffer_desc input_token, output_token; - gss_ctx_id_t context_hdl = GSS_C_NO_CONTEXT; - gss_name_t client_name; - gss_OID mech_type; - - if (argc != 1) - usage(); - - do { - receive_token_from_peer(&input_token); - maj_stat = gss_accept_sec_context(&min_stat, - &context_hdl, - GSS_C_NO_CREDENTIAL, - &input_token, - GSS_C_NO_CHANNEL_BINDINGS, - &client_name, - &mech_type, - &output_token, - NULL, - NULL, - NULL); - if (GSS_ERROR(maj_stat)) { - report_error(mech_type, maj_stat, min_stat); - } - if (output_token.length != 0) { - send_token_to_peer(&output_token); - gss_release_buffer(&min_stat, &output_token); - } - if (GSS_ERROR(maj_stat)) { - if (context_hdl != GSS_C_NO_CONTEXT) - gss_delete_sec_context(&min_stat, - &context_hdl, - GSS_C_NO_BUFFER); - break; - } - } while (maj_stat & GSS_S_CONTINUE_NEEDED); - - if (client_name) { - gss_buffer_desc name_desc; - char buf[512]; - - gss_display_name(&min_stat, client_name, &name_desc, NULL); - memcpy(buf, name_desc.value, name_desc.length); - buf[name_desc.length] = 0; - gss_release_buffer(&min_stat, &name_desc); - printf("client name is %s\n", buf); - } - - receive_token_from_peer(&input_token); - gss_unwrap(&min_stat, context_hdl, &input_token, &output_token, - NULL, NULL); - printf("%.*s\n", (int)output_token.length, (char *) output_token.value); - gss_release_buffer(&min_stat, &output_token); -} -#endif - -/* 1.2.752.43.13.14 */ -static gss_OID_desc gss_krb5_set_allowable_enctypes_x_desc = -{6, (void *) "\x2a\x85\x70\x2b\x0d\x0e"}; - -gss_OID GSS_KRB5_SET_ALLOWABLE_ENCTYPES_X = &gss_krb5_set_allowable_enctypes_x_desc; -#define ETYPE_DES_CBC_CRC 1 - -/* - * Create an initiator context and acceptor context in the kernel and - * use them to exchange signed and sealed messages. 
- */ -static int -gsstest_1(struct thread *td) -{ - OM_uint32 maj_stat, min_stat; - OM_uint32 smaj_stat, smin_stat; - int context_established = 0; - gss_ctx_id_t client_context = GSS_C_NO_CONTEXT; - gss_ctx_id_t server_context = GSS_C_NO_CONTEXT; - gss_cred_id_t client_cred = GSS_C_NO_CREDENTIAL; - gss_cred_id_t server_cred = GSS_C_NO_CREDENTIAL; - gss_name_t name = GSS_C_NO_NAME; - gss_name_t received_name = GSS_C_NO_NAME; - gss_buffer_desc name_desc; - gss_buffer_desc client_token, server_token, message_buf; - gss_OID mech, actual_mech, mech_type; - static gss_OID_desc krb5_desc = - {9, (void *)"\x2a\x86\x48\x86\xf7\x12\x01\x02\x02"}; -#if 0 - static gss_OID_desc spnego_desc = - {6, (void *)"\x2b\x06\x01\x05\x05\x02"}; - static gss_OID_desc ntlm_desc = - {10, (void *)"\x2b\x06\x01\x04\x01\x82\x37\x02\x02\x0a"}; -#endif - char enctype[sizeof(uint32_t)]; - - mech = GSS_C_NO_OID; - - { - static char sbuf[512]; - memcpy(sbuf, "nfs@", 4); - getcredhostname(td->td_ucred, sbuf + 4, sizeof(sbuf) - 4); - name_desc.value = sbuf; - } - - name_desc.length = strlen((const char *) name_desc.value); - maj_stat = gss_import_name(&min_stat, &name_desc, - GSS_C_NT_HOSTBASED_SERVICE, &name); - if (GSS_ERROR(maj_stat)) { - printf("gss_import_name failed\n"); - report_error(mech, maj_stat, min_stat); - goto out; - } - - maj_stat = gss_acquire_cred(&min_stat, GSS_C_NO_NAME, - 0, GSS_C_NO_OID_SET, GSS_C_INITIATE, &client_cred, - NULL, NULL); - if (GSS_ERROR(maj_stat)) { - printf("gss_acquire_cred (client) failed\n"); - report_error(mech, maj_stat, min_stat); - goto out; - } - - enctype[0] = (ETYPE_DES_CBC_CRC >> 24) & 0xff; - enctype[1] = (ETYPE_DES_CBC_CRC >> 16) & 0xff; - enctype[2] = (ETYPE_DES_CBC_CRC >> 8) & 0xff; - enctype[3] = ETYPE_DES_CBC_CRC & 0xff; - message_buf.length = sizeof(enctype); - message_buf.value = enctype; - maj_stat = gss_set_cred_option(&min_stat, &client_cred, - GSS_KRB5_SET_ALLOWABLE_ENCTYPES_X, &message_buf); - if (GSS_ERROR(maj_stat)) { - 
printf("gss_set_cred_option failed\n"); - report_error(mech, maj_stat, min_stat); - goto out; - } - - server_token.length = 0; - server_token.value = NULL; - while (!context_established) { - client_token.length = 0; - client_token.value = NULL; - maj_stat = gss_init_sec_context(&min_stat, - client_cred, - &client_context, - name, - mech, - GSS_C_MUTUAL_FLAG|GSS_C_CONF_FLAG|GSS_C_INTEG_FLAG, - 0, - GSS_C_NO_CHANNEL_BINDINGS, - &server_token, - &actual_mech, - &client_token, - NULL, - NULL); - if (server_token.length) - gss_release_buffer(&smin_stat, &server_token); - if (GSS_ERROR(maj_stat)) { - printf("gss_init_sec_context failed\n"); - report_error(mech, maj_stat, min_stat); - goto out; - } - - if (client_token.length != 0) { - if (!server_cred) { - gss_OID_set_desc oid_set; - oid_set.count = 1; - oid_set.elements = &krb5_desc; - smaj_stat = gss_acquire_cred(&smin_stat, - name, 0, &oid_set, GSS_C_ACCEPT, &server_cred, - NULL, NULL); - if (GSS_ERROR(smaj_stat)) { - printf("gss_acquire_cred (server) failed\n"); - report_error(mech_type, smaj_stat, smin_stat); - goto out; - } - } - smaj_stat = gss_accept_sec_context(&smin_stat, - &server_context, - server_cred, - &client_token, - GSS_C_NO_CHANNEL_BINDINGS, - &received_name, - &mech_type, - &server_token, - NULL, - NULL, - NULL); - if (GSS_ERROR(smaj_stat)) { - printf("gss_accept_sec_context failed\n"); - report_error(mech_type, smaj_stat, smin_stat); - goto out; - } - gss_release_buffer(&min_stat, &client_token); - } - if (GSS_ERROR(maj_stat)) { - if (client_context != GSS_C_NO_CONTEXT) - gss_delete_sec_context(&min_stat, - &client_context, - GSS_C_NO_BUFFER); - break; - } - - if (maj_stat == GSS_S_COMPLETE) { - context_established = 1; - } - } - - message_buf.length = strlen("Hello world"); - message_buf.value = (void *) "Hello world"; - - maj_stat = gss_get_mic(&min_stat, client_context, - GSS_C_QOP_DEFAULT, &message_buf, &client_token); - if (GSS_ERROR(maj_stat)) { - printf("gss_get_mic failed\n"); - 
report_error(mech_type, maj_stat, min_stat); - goto out; - } - maj_stat = gss_verify_mic(&min_stat, server_context, - &message_buf, &client_token, NULL); - if (GSS_ERROR(maj_stat)) { - printf("gss_verify_mic failed\n"); - report_error(mech_type, maj_stat, min_stat); - goto out; - } - gss_release_buffer(&min_stat, &client_token); - - maj_stat = gss_wrap(&min_stat, client_context, - TRUE, GSS_C_QOP_DEFAULT, &message_buf, NULL, &client_token); - if (GSS_ERROR(maj_stat)) { - printf("gss_wrap failed\n"); - report_error(mech_type, maj_stat, min_stat); - goto out; - } - maj_stat = gss_unwrap(&min_stat, server_context, - &client_token, &server_token, NULL, NULL); - if (GSS_ERROR(maj_stat)) { - printf("gss_unwrap failed\n"); - report_error(mech_type, maj_stat, min_stat); - goto out; - } - - if (message_buf.length != server_token.length - || memcmp(message_buf.value, server_token.value, - message_buf.length)) - printf("unwrap result corrupt\n"); - - gss_release_buffer(&min_stat, &client_token); - gss_release_buffer(&min_stat, &server_token); - -out: - if (client_context) - gss_delete_sec_context(&min_stat, &client_context, - GSS_C_NO_BUFFER); - if (server_context) - gss_delete_sec_context(&min_stat, &server_context, - GSS_C_NO_BUFFER); - if (client_cred) - gss_release_cred(&min_stat, &client_cred); - if (server_cred) - gss_release_cred(&min_stat, &server_cred); - if (name) - gss_release_name(&min_stat, &name); - if (received_name) - gss_release_name(&min_stat, &received_name); - - return (0); -} - -/* - * Interoperability with userland. This takes several steps: - * - * 1. Accept an initiator token from userland, return acceptor - * token. Repeat this step until both userland and kernel return - * GSS_S_COMPLETE. - * - * 2. Receive a signed message from userland and verify the - * signature. Return a signed reply to userland for it to verify. - * - * 3. Receive a wrapped message from userland and unwrap it. Return a - * wrapped reply to userland. 
- */ -static int -gsstest_2(struct thread *td, int step, const gss_buffer_t input_token, - OM_uint32 *maj_stat_res, OM_uint32 *min_stat_res, gss_buffer_t output_token) -{ - OM_uint32 maj_stat, min_stat; - static int context_established = 0; - static gss_ctx_id_t server_context = GSS_C_NO_CONTEXT; - static gss_cred_id_t server_cred = GSS_C_NO_CREDENTIAL; - static gss_name_t name = GSS_C_NO_NAME; - gss_buffer_desc name_desc; - gss_buffer_desc message_buf; - gss_OID mech_type = GSS_C_NO_OID; - char enctype[sizeof(uint32_t)]; - int error = EINVAL; - - maj_stat = GSS_S_FAILURE; - min_stat = 0; - switch (step) { - case 1: - if (server_context == GSS_C_NO_CONTEXT) { - static char sbuf[512]; - memcpy(sbuf, "nfs@", 4); - getcredhostname(td->td_ucred, sbuf + 4, - sizeof(sbuf) - 4); - name_desc.value = sbuf; - name_desc.length = strlen((const char *) - name_desc.value); - maj_stat = gss_import_name(&min_stat, &name_desc, - GSS_C_NT_HOSTBASED_SERVICE, &name); - if (GSS_ERROR(maj_stat)) { - printf("gss_import_name failed\n"); - report_error(mech_type, maj_stat, min_stat); - goto out; - } - - maj_stat = gss_acquire_cred(&min_stat, - name, 0, GSS_C_NO_OID_SET, GSS_C_ACCEPT, - &server_cred, NULL, NULL); - if (GSS_ERROR(maj_stat)) { - printf("gss_acquire_cred (server) failed\n"); - report_error(mech_type, maj_stat, min_stat); - goto out; - } - - enctype[0] = (ETYPE_DES_CBC_CRC >> 24) & 0xff; - enctype[1] = (ETYPE_DES_CBC_CRC >> 16) & 0xff; - enctype[2] = (ETYPE_DES_CBC_CRC >> 8) & 0xff; - enctype[3] = ETYPE_DES_CBC_CRC & 0xff; - message_buf.length = sizeof(enctype); - message_buf.value = enctype; - maj_stat = gss_set_cred_option(&min_stat, &server_cred, - GSS_KRB5_SET_ALLOWABLE_ENCTYPES_X, &message_buf); - if (GSS_ERROR(maj_stat)) { - printf("gss_set_cred_option failed\n"); - report_error(mech_type, maj_stat, min_stat); - goto out; - } - } - - maj_stat = gss_accept_sec_context(&min_stat, - &server_context, - server_cred, - input_token, - GSS_C_NO_CHANNEL_BINDINGS, - NULL, - 
&mech_type, - output_token, - NULL, - NULL, - NULL); - if (GSS_ERROR(maj_stat)) { - printf("gss_accept_sec_context failed\n"); - report_error(mech_type, maj_stat, min_stat); - goto out; - } - - if (maj_stat == GSS_S_COMPLETE) { - context_established = 1; - } - *maj_stat_res = maj_stat; - *min_stat_res = min_stat; - break; - - case 2: - message_buf.length = strlen("Hello world"); - message_buf.value = (void *) "Hello world"; - - maj_stat = gss_verify_mic(&min_stat, server_context, - &message_buf, input_token, NULL); - if (GSS_ERROR(maj_stat)) { - printf("gss_verify_mic failed\n"); - report_error(mech_type, maj_stat, min_stat); - goto out; - } - - maj_stat = gss_get_mic(&min_stat, server_context, - GSS_C_QOP_DEFAULT, &message_buf, output_token); - if (GSS_ERROR(maj_stat)) { - printf("gss_get_mic failed\n"); - report_error(mech_type, maj_stat, min_stat); - goto out; - } - break; - - case 3: - maj_stat = gss_unwrap(&min_stat, server_context, - input_token, &message_buf, NULL, NULL); - if (GSS_ERROR(maj_stat)) { - printf("gss_unwrap failed\n"); - report_error(mech_type, maj_stat, min_stat); - goto out; - } - gss_release_buffer(&min_stat, &message_buf); - - message_buf.length = strlen("Hello world"); - message_buf.value = (void *) "Hello world"; - maj_stat = gss_wrap(&min_stat, server_context, - TRUE, GSS_C_QOP_DEFAULT, &message_buf, NULL, output_token); - if (GSS_ERROR(maj_stat)) { - printf("gss_wrap failed\n"); - report_error(mech_type, maj_stat, min_stat); - goto out; - } - break; - - case 4: - maj_stat = gss_unwrap(&min_stat, server_context, - input_token, &message_buf, NULL, NULL); - if (GSS_ERROR(maj_stat)) { - printf("gss_unwrap failed\n"); - report_error(mech_type, maj_stat, min_stat); - goto out; - } - gss_release_buffer(&min_stat, &message_buf); - - message_buf.length = strlen("Hello world"); - message_buf.value = (void *) "Hello world"; - maj_stat = gss_wrap(&min_stat, server_context, - FALSE, GSS_C_QOP_DEFAULT, &message_buf, NULL, output_token); - if 
(GSS_ERROR(maj_stat)) { - printf("gss_wrap failed\n"); - report_error(mech_type, maj_stat, min_stat); - goto out; - } - break; - - case 5: - error = 0; - goto out; - } - *maj_stat_res = maj_stat; - *min_stat_res = min_stat; - return (0); - -out: - *maj_stat_res = maj_stat; - *min_stat_res = min_stat; - if (server_context) - gss_delete_sec_context(&min_stat, &server_context, - GSS_C_NO_BUFFER); - if (server_cred) - gss_release_cred(&min_stat, &server_cred); - if (name) - gss_release_name(&min_stat, &name); - - return (error); -} - -/* - * Create an RPC client handle for the given (address,prog,vers) - * triple using UDP. - */ -static CLIENT * -gsstest_get_rpc(struct sockaddr *sa, rpcprog_t prog, rpcvers_t vers) -{ - struct thread *td = curthread; - const char* protofmly; - struct sockaddr_storage ss; - struct socket *so; - CLIENT *rpcb; - struct timeval timo; - RPCB parms; - char *uaddr; - enum clnt_stat stat = RPC_SUCCESS; - int rpcvers = RPCBVERS4; - bool_t do_tcp = FALSE; - struct portmap mapping; - u_short port = 0; - - /* - * First we need to contact the remote RPCBIND service to find - * the right port. - */ - memcpy(&ss, sa, sa->sa_len); - switch (ss.ss_family) { - case AF_INET: - ((struct sockaddr_in *)&ss)->sin_port = htons(111); - protofmly = "inet"; - socreate(AF_INET, &so, SOCK_DGRAM, 0, td->td_ucred, td); - break; - -#ifdef INET6 - case AF_INET6: - ((struct sockaddr_in6 *)&ss)->sin6_port = htons(111); - protofmly = "inet6"; - socreate(AF_INET6, &so, SOCK_DGRAM, 0, td->td_ucred, td); - break; -#endif - - default: - /* - * Unsupported address family - fail. - */ - return (NULL); - } - - rpcb = clnt_dg_create(so, (struct sockaddr *)&ss, - RPCBPROG, rpcvers, 0, 0); - if (!rpcb) - return (NULL); - -try_tcp: - parms.r_prog = prog; - parms.r_vers = vers; - if (do_tcp) - parms.r_netid = "tcp"; - else - parms.r_netid = "udp"; - parms.r_addr = ""; - parms.r_owner = ""; - - /* - * Use the default timeout. 
- */ - timo.tv_sec = 25; - timo.tv_usec = 0; -again: - switch (rpcvers) { - case RPCBVERS4: - case RPCBVERS: - /* - * Try RPCBIND 4 then 3. - */ - uaddr = NULL; - stat = CLNT_CALL(rpcb, (rpcprog_t) RPCBPROC_GETADDR, - (xdrproc_t) xdr_rpcb, &parms, - (xdrproc_t) xdr_wrapstring, &uaddr, timo); - if (stat == RPC_PROGVERSMISMATCH) { - if (rpcvers == RPCBVERS4) - rpcvers = RPCBVERS; - else if (rpcvers == RPCBVERS) - rpcvers = PMAPVERS; - CLNT_CONTROL(rpcb, CLSET_VERS, &rpcvers); - goto again; - } else if (stat == RPC_SUCCESS) { - /* - * We have a reply from the remote RPCBIND - turn it - * into an appropriate address and make a new client - * that can talk to the remote service. - * - * XXX fixup IPv6 scope ID. - */ - struct netbuf *a; - a = __rpc_uaddr2taddr_af(ss.ss_family, uaddr); - xdr_free((xdrproc_t) xdr_wrapstring, &uaddr); - if (!a) { - CLNT_DESTROY(rpcb); - return (NULL); - } - memcpy(&ss, a->buf, a->len); - free(a->buf, M_RPC); - free(a, M_RPC); - } - break; - case PMAPVERS: - /* - * Try portmap. - */ - mapping.pm_prog = parms.r_prog; - mapping.pm_vers = parms.r_vers; - mapping.pm_prot = do_tcp ? IPPROTO_TCP : IPPROTO_UDP; - mapping.pm_port = 0; - - stat = CLNT_CALL(rpcb, (rpcprog_t) PMAPPROC_GETPORT, - (xdrproc_t) xdr_portmap, &mapping, - (xdrproc_t) xdr_u_short, &port, timo); - - if (stat == RPC_SUCCESS) { - switch (ss.ss_family) { - case AF_INET: - ((struct sockaddr_in *)&ss)->sin_port = - htons(port); - break; - -#ifdef INET6 - case AF_INET6: - ((struct sockaddr_in6 *)&ss)->sin6_port = - htons(port); - break; -#endif - } - } - break; - default: - panic("invalid rpcvers %d", rpcvers); - } - /* - * We may have a positive response from the portmapper, but - * the requested service was not found. Make sure we received - * a valid port. 
- */ - switch (ss.ss_family) { - case AF_INET: - port = ((struct sockaddr_in *)&ss)->sin_port; - break; -#ifdef INET6 - case AF_INET6: - port = ((struct sockaddr_in6 *)&ss)->sin6_port; - break; -#endif - } - if (stat != RPC_SUCCESS || !port) { - /* - * If we were able to talk to rpcbind or portmap, but the udp - * variant wasn't available, ask about tcp. - * - * XXX - We could also check for a TCP portmapper, but - * if the host is running a portmapper at all, we should be able - * to hail it over UDP. - */ - if (stat == RPC_SUCCESS && !do_tcp) { - do_tcp = TRUE; - goto try_tcp; - } - - /* Otherwise, bad news. */ - printf("gsstest_get_rpc: failed to contact remote rpcbind, " - "stat = %d, port = %d\n", - (int) stat, port); - CLNT_DESTROY(rpcb); - return (NULL); - } - - if (do_tcp) { - /* - * Destroy the UDP client we used to speak to rpcbind and - * recreate as a TCP client. - */ - struct netconfig *nconf = NULL; - - CLNT_DESTROY(rpcb); - - switch (ss.ss_family) { - case AF_INET: - nconf = getnetconfigent("tcp"); - break; -#ifdef INET6 - case AF_INET6: - nconf = getnetconfigent("tcp6"); - break; -#endif - } - - rpcb = clnt_reconnect_create(nconf, (struct sockaddr *)&ss, - prog, vers, 0, 0); - } else { - /* - * Re-use the client we used to speak to rpcbind. 
- */ - CLNT_CONTROL(rpcb, CLSET_SVC_ADDR, &ss); - CLNT_CONTROL(rpcb, CLSET_PROG, &prog); - CLNT_CONTROL(rpcb, CLSET_VERS, &vers); - } - - return (rpcb); -} - -/* - * RPCSEC_GSS client - */ -static int -gsstest_3(struct thread *td) -{ - struct sockaddr_in sin; - char service[128]; - CLIENT *client; - AUTH *auth; - rpc_gss_options_ret_t options_ret; - enum clnt_stat stat; - struct timeval tv; - rpc_gss_service_t svc; - int i; - - sin.sin_len = sizeof(sin); - sin.sin_family = AF_INET; - sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK); - sin.sin_port = 0; - - client = gsstest_get_rpc((struct sockaddr *) &sin, 123456, 1); - if (!client) { - uprintf("Can't connect to service\n"); - return(1); - } - - memcpy(service, "host@", 5); - getcredhostname(td->td_ucred, service + 5, sizeof(service) - 5); - - auth = rpc_gss_seccreate(client, curthread->td_ucred, - service, "kerberosv5", rpc_gss_svc_privacy, - NULL, NULL, &options_ret); - if (!auth) { - gss_OID oid; - uprintf("Can't authorize to service (mech=%s)\n", - options_ret.actual_mechanism); - oid = GSS_C_NO_OID; - rpc_gss_mech_to_oid(options_ret.actual_mechanism, &oid); - report_error(oid, options_ret.major_status, - options_ret.minor_status); - CLNT_DESTROY(client); - return (1); - } - - for (svc = rpc_gss_svc_none; svc <= rpc_gss_svc_privacy; svc++) { - const char *svc_names[] = { - "rpc_gss_svc_default", - "rpc_gss_svc_none", - "rpc_gss_svc_integrity", - "rpc_gss_svc_privacy" - }; - int num; - - rpc_gss_set_defaults(auth, svc, NULL); - - client->cl_auth = auth; - tv.tv_sec = 5; - tv.tv_usec = 0; - for (i = 42; i < 142; i++) { - num = i; - stat = CLNT_CALL(client, 1, - (xdrproc_t) xdr_int, (char *) &num, - (xdrproc_t) xdr_int, (char *) &num, tv); - if (stat == RPC_SUCCESS) { - if (num != i + 100) - uprintf("unexpected reply %d\n", num); - } else { - uprintf("call failed, stat=%d\n", (int) stat); - break; - } - } - if (i == 142) - uprintf("call succeeded with %s\n", svc_names[svc]); - } - - AUTH_DESTROY(auth); - 
CLNT_RELEASE(client); - - return (0); -} - -/* - * RPCSEC_GSS server - */ -static rpc_gss_principal_t server_acl = NULL; -static bool_t server_new_context(struct svc_req *req, gss_cred_id_t deleg, - gss_ctx_id_t gss_context, rpc_gss_lock_t *lock, void **cookie); -static void server_program_1(struct svc_req *rqstp, register SVCXPRT *transp); - -static int -gsstest_4(struct thread *td) -{ - SVCPOOL *pool; - char principal[128 + 5]; - const char **mechs; - static rpc_gss_callback_t cb; - - memcpy(principal, "host@", 5); - getcredhostname(td->td_ucred, principal + 5, sizeof(principal) - 5); - - mechs = rpc_gss_get_mechanisms(); - while (*mechs) { - if (!rpc_gss_set_svc_name(principal, *mechs, GSS_C_INDEFINITE, - 123456, 1)) { - rpc_gss_error_t e; - - rpc_gss_get_error(&e); - printf("setting name for %s for %s failed: %d, %d\n", - principal, *mechs, - e.rpc_gss_error, e.system_error); - } - mechs++; - } - - cb.program = 123456; - cb.version = 1; - cb.callback = server_new_context; - rpc_gss_set_callback(&cb); - - pool = svcpool_create("gsstest", NULL); - - svc_create(pool, server_program_1, 123456, 1, NULL); - svc_run(pool); - - rpc_gss_clear_svc_name(123456, 1); - rpc_gss_clear_callback(&cb); - - svcpool_destroy(pool); - - return (0); -} - -static void -server_program_1(struct svc_req *rqstp, register SVCXPRT *transp) -{ - rpc_gss_rawcred_t *rcred; - rpc_gss_ucred_t *ucred; - int i, num; - - if (rqstp->rq_cred.oa_flavor != RPCSEC_GSS) { - svcerr_weakauth(rqstp); - return; - } - - if (!rpc_gss_getcred(rqstp, &rcred, &ucred, NULL)) { - svcerr_systemerr(rqstp); - return; - } - - printf("svc=%d, mech=%s, uid=%d, gid=%d, gids={", - rcred->service, rcred->mechanism, ucred->uid, ucred->gid); - for (i = 0; i < ucred->gidlen; i++) { - if (i > 0) printf(","); - printf("%d", ucred->gidlist[i]); - } - printf("}\n"); - - switch (rqstp->rq_proc) { - case 0: - if (!svc_getargs(rqstp, (xdrproc_t) xdr_void, 0)) { - svcerr_decode(rqstp); - goto out; - } - if (!svc_sendreply(rqstp, 
(xdrproc_t) xdr_void, 0)) { - svcerr_systemerr(rqstp); - } - goto out; - - case 1: - if (!svc_getargs(rqstp, (xdrproc_t) xdr_int, - (char *) &num)) { - svcerr_decode(rqstp); - goto out; - } - num += 100; - if (!svc_sendreply(rqstp, (xdrproc_t) xdr_int, - (char *) &num)) { - svcerr_systemerr(rqstp); - } - goto out; - - default: - svcerr_noproc(rqstp); - goto out; - } - -out: - svc_freereq(rqstp); - return; -} - -static void -print_principal(rpc_gss_principal_t principal) -{ - int i, len, n; - uint8_t *p; - - len = principal->len; - p = (uint8_t *) principal->name; - while (len > 0) { - n = len; - if (n > 16) - n = 16; - for (i = 0; i < n; i++) - printf("%02x ", p[i]); - for (; i < 16; i++) - printf(" "); - printf("|"); - for (i = 0; i < n; i++) - printf("%c", isprint(p[i]) ? p[i] : '.'); - printf("|\n"); - len -= n; - p += n; - } -} - -static bool_t -server_new_context(__unused struct svc_req *req, - gss_cred_id_t deleg, - __unused gss_ctx_id_t gss_context, - rpc_gss_lock_t *lock, - __unused void **cookie) -{ - rpc_gss_rawcred_t *rcred = lock->raw_cred; - OM_uint32 junk; - - printf("new security context version=%d, mech=%s, qop=%s:\n", - rcred->version, rcred->mechanism, rcred->qop); - print_principal(rcred->client_principal); - - if (server_acl) { - if (rcred->client_principal->len != server_acl->len - || memcmp(rcred->client_principal->name, server_acl->name, - server_acl->len)) { - return (FALSE); - } - } - gss_release_cred(&junk, &deleg); - - return (TRUE); -} - -/* - * Hook up a syscall for gssapi testing. 
- */ - -struct gsstest_args { - int a_op; - void *a_args; - void *a_res; -}; - -struct gsstest_2_args { - int step; /* test step number */ - gss_buffer_desc input_token; /* token from userland */ - gss_buffer_desc output_token; /* buffer to receive reply token */ -}; -struct gsstest_2_res { - OM_uint32 maj_stat; /* maj_stat from kernel */ - OM_uint32 min_stat; /* min_stat from kernel */ - gss_buffer_desc output_token; /* reply token (using space from gsstest_2_args.output) */ -}; - -static int -gsstest(struct thread *td, struct gsstest_args *uap) -{ - int error; - - switch (uap->a_op) { - case 1: - return (gsstest_1(td)); - - case 2: { - struct gsstest_2_args args; - struct gsstest_2_res res; - gss_buffer_desc input_token, output_token; - OM_uint32 junk; - - error = copyin(uap->a_args, &args, sizeof(args)); - if (error) - return (error); - input_token.length = args.input_token.length; - input_token.value = malloc(input_token.length, M_GSSAPI, - M_WAITOK); - error = copyin(args.input_token.value, input_token.value, - input_token.length); - if (error) { - gss_release_buffer(&junk, &input_token); - return (error); - } - output_token.length = 0; - output_token.value = NULL; - gsstest_2(td, args.step, &input_token, - &res.maj_stat, &res.min_stat, &output_token); - gss_release_buffer(&junk, &input_token); - if (output_token.length > args.output_token.length) { - gss_release_buffer(&junk, &output_token); - return (EOVERFLOW); - } - res.output_token.length = output_token.length; - res.output_token.value = args.output_token.value; - error = copyout(output_token.value, res.output_token.value, - output_token.length); - gss_release_buffer(&junk, &output_token); - if (error) - return (error); - - return (copyout(&res, uap->a_res, sizeof(res))); - - break; - } - case 3: - return (gsstest_3(td)); - case 4: - return (gsstest_4(td)); - } - - return (EINVAL); -} - -/* - * The `sysent' for the new syscall - */ -static struct sysent gsstest_sysent = { - 3, /* sy_narg */ - (sy_call_t 
*) gsstest /* sy_call */ -}; - -/* - * The offset in sysent where the syscall is allocated. - */ -static int gsstest_offset = NO_SYSCALL; - -/* - * The function called at load/unload. - */ - -static int -gsstest_load(struct module *module, int cmd, void *arg) -{ - int error = 0; - - switch (cmd) { - case MOD_LOAD : - break; - case MOD_UNLOAD : - break; - default : - error = EOPNOTSUPP; - break; - } - return error; -} - -SYSCALL_MODULE(gsstest_syscall, &gsstest_offset, &gsstest_sysent, - gsstest_load, NULL); From 3a0cdb2675e64460be17e640fd871907163342e8 Mon Sep 17 00:00:00 2001 From: Gleb Smirnoff Date: Wed, 8 Jan 2025 20:00:12 -0800 Subject: [PATCH 074/143] rpc: clean kernel RPC internal headers of non-kernel declarations The files svc.h and clnt.h derive from the same files that live in /usr/include, however there is nothing really shared between the kernel and libc RPC implementations. The kernel side files are not installed and there is no reason to pollute them with the old definititions. --- sys/rpc/clnt.h | 285 ------------------------------------------------- sys/rpc/svc.h | 273 ---------------------------------------------- 2 files changed, 558 deletions(-) diff --git a/sys/rpc/clnt.h b/sys/rpc/clnt.h index da02137397f1dd..d9fc372709cf1e 100644 --- a/sys/rpc/clnt.h +++ b/sys/rpc/clnt.h @@ -41,12 +41,8 @@ #define _RPC_CLNT_H_ #include #include -#ifdef _KERNEL #include #include -#else -#include -#endif #include /* @@ -90,7 +86,6 @@ struct rpc_err { #define re_lb ru.RE_lb }; -#ifdef _KERNEL /* * Functions of this type may be used to receive notification when RPC * calls have to be re-transmitted etc. @@ -117,7 +112,6 @@ struct rpc_callextra { struct rpc_timers *rc_timers; /* optional RTT timers */ struct rpc_err rc_err; /* detailed call status */ }; -#endif /* * Client rpc handle. @@ -125,7 +119,6 @@ struct rpc_callextra { * Client is responsible for initializing auth, see e.g. auth_none.c. 
*/ typedef struct __rpc_client { -#ifdef _KERNEL volatile u_int cl_refs; /* reference count */ AUTH *cl_auth; /* authenticator */ const struct clnt_ops { @@ -149,28 +142,6 @@ typedef struct __rpc_client { bool_t (*cl_control)(struct __rpc_client *, u_int, void *); } *cl_ops; -#else - AUTH *cl_auth; /* authenticator */ - struct clnt_ops { - /* call remote procedure */ - enum clnt_stat (*cl_call)(struct __rpc_client *, - rpcproc_t, xdrproc_t, void *, xdrproc_t, - void *, struct timeval); - /* abort a call */ - void (*cl_abort)(struct __rpc_client *); - /* get specific error code */ - void (*cl_geterr)(struct __rpc_client *, - struct rpc_err *); - /* frees results */ - bool_t (*cl_freeres)(struct __rpc_client *, - xdrproc_t, void *); - /* destroy this structure */ - void (*cl_destroy)(struct __rpc_client *); - /* the ioctl() of rpc */ - bool_t (*cl_control)(struct __rpc_client *, u_int, - void *); - } *cl_ops; -#endif void *cl_private; /* private stuff */ char *cl_netid; /* network token */ char *cl_tp; /* device name */ @@ -198,7 +169,6 @@ typedef struct __rpc_client { * */ -#ifdef _KERNEL #define CLNT_ACQUIRE(rh) \ refcount_acquire(&(rh)->cl_refs) #define CLNT_RELEASE(rh) \ @@ -246,7 +216,6 @@ enum clnt_stat clnt_call_private(CLIENT *, struct rpc_callextra *, rpcproc_t, #define CLNT_CALL_EXT(rh, ext, proc, xargs, argsp, xres, resp, secs) \ clnt_call_private(rh, ext, proc, xargs, \ argsp, xres, resp, secs) -#endif /* * enum clnt_stat @@ -259,21 +228,12 @@ enum clnt_stat clnt_call_private(CLIENT *, struct rpc_callextra *, rpcproc_t, * void *resp; * struct timeval timeout; */ -#ifdef _KERNEL #define CLNT_CALL(rh, proc, xargs, argsp, xres, resp, secs) \ clnt_call_private(rh, NULL, proc, xargs, \ argsp, xres, resp, secs) #define clnt_call(rh, proc, xargs, argsp, xres, resp, secs) \ clnt_call_private(rh, NULL, proc, xargs, \ argsp, xres, resp, secs) -#else -#define CLNT_CALL(rh, proc, xargs, argsp, xres, resp, secs) \ - ((*(rh)->cl_ops->cl_call)(rh, proc, xargs, \ - 
argsp, xres, resp, secs)) -#define clnt_call(rh, proc, xargs, argsp, xres, resp, secs) \ - ((*(rh)->cl_ops->cl_call)(rh, proc, xargs, \ - argsp, xres, resp, secs)) -#endif /* * void @@ -339,7 +299,6 @@ enum clnt_stat clnt_call_private(CLIENT *, struct rpc_callextra *, rpcproc_t, #define CLSET_ASYNC 19 #define CLSET_CONNECT 20 /* Use connect() for UDP. (int) */ -#ifdef _KERNEL /* * Kernel control operations. The default msleep string is "rpcrecv", * and sleeps are non-interruptible by default. @@ -362,8 +321,6 @@ struct rpc_reconupcall { void *arg; }; #define CLSET_RECONUPCALL 33 /* Reconnect upcall */ -#endif - /* * void @@ -402,8 +359,6 @@ struct rpc_reconupcall { * belong to the nettype namespace (/etc/netconfig). */ __BEGIN_DECLS -#ifdef _KERNEL - /* * struct socket *so; -- socket * struct sockaddr *svcaddr; -- servers address @@ -440,156 +395,6 @@ extern CLIENT *clnt_vc_create(struct socket *so, extern CLIENT *clnt_reconnect_create(struct netconfig *nconf, struct sockaddr *svcaddr, rpcprog_t program, rpcvers_t version, size_t sendsz, size_t recvsz); - -#else - -extern CLIENT *clnt_create(const char *, const rpcprog_t, const rpcvers_t, - const char *); -/* - * - * const char *hostname; -- hostname - * const rpcprog_t prog; -- program number - * const rpcvers_t vers; -- version number - * const char *nettype; -- network type - */ - - /* - * Generic client creation routine. Just like clnt_create(), except - * it takes an additional timeout parameter. - */ -extern CLIENT * clnt_create_timed(const char *, const rpcprog_t, - const rpcvers_t, const char *, const struct timeval *); -/* - * - * const char *hostname; -- hostname - * const rpcprog_t prog; -- program number - * const rpcvers_t vers; -- version number - * const char *nettype; -- network type - * const struct timeval *tp; -- timeout - */ - -/* - * Generic client creation routine. Supported protocols are which belong - * to the nettype name space. 
- */ -extern CLIENT *clnt_create_vers(const char *, const rpcprog_t, rpcvers_t *, - const rpcvers_t, const rpcvers_t, - const char *); -/* - * const char *host; -- hostname - * const rpcprog_t prog; -- program number - * rpcvers_t *vers_out; -- servers highest available version - * const rpcvers_t vers_low; -- low version number - * const rpcvers_t vers_high; -- high version number - * const char *nettype; -- network type - */ - -/* - * Generic client creation routine. Supported protocols are which belong - * to the nettype name space. - */ -extern CLIENT * clnt_create_vers_timed(const char *, const rpcprog_t, - rpcvers_t *, const rpcvers_t, const rpcvers_t, const char *, - const struct timeval *); -/* - * const char *host; -- hostname - * const rpcprog_t prog; -- program number - * rpcvers_t *vers_out; -- servers highest available version - * const rpcvers_t vers_low; -- low version number - * const rpcvers_t vers_high; -- high version number - * const char *nettype; -- network type - * const struct timeval *tp -- timeout - */ - -/* - * Generic client creation routine. It takes a netconfig structure - * instead of nettype - */ -extern CLIENT *clnt_tp_create(const char *, const rpcprog_t, - const rpcvers_t, const struct netconfig *); -/* - * const char *hostname; -- hostname - * const rpcprog_t prog; -- program number - * const rpcvers_t vers; -- version number - * const struct netconfig *netconf; -- network config structure - */ - -/* - * Generic client creation routine. Just like clnt_tp_create(), except - * it takes an additional timeout parameter. - */ -extern CLIENT * clnt_tp_create_timed(const char *, const rpcprog_t, - const rpcvers_t, const struct netconfig *, const struct timeval *); -/* - * const char *hostname; -- hostname - * const rpcprog_t prog; -- program number - * const rpcvers_t vers; -- version number - * const struct netconfig *netconf; -- network config structure - * const struct timeval *tp -- timeout - */ - -/* - * Generic TLI create routine. 
Only provided for compatibility. - */ - -extern CLIENT *clnt_tli_create(const int, const struct netconfig *, - struct netbuf *, const rpcprog_t, - const rpcvers_t, const u_int, const u_int); -/* - * const int fd; -- fd - * const struct netconfig *nconf; -- netconfig structure - * struct netbuf *svcaddr; -- servers address - * const u_long prog; -- program number - * const u_long vers; -- version number - * const u_int sendsz; -- send size - * const u_int recvsz; -- recv size - */ - -/* - * Low level clnt create routine for connectionful transports, e.g. tcp. - */ -extern CLIENT *clnt_vc_create(const int, const struct netbuf *, - const rpcprog_t, const rpcvers_t, - u_int, u_int); -/* - * Added for compatibility to old rpc 4.0. Obsoleted by clnt_vc_create(). - */ -extern CLIENT *clntunix_create(struct sockaddr_un *, - u_long, u_long, int *, u_int, u_int); -/* - * const int fd; -- open file descriptor - * const struct netbuf *svcaddr; -- servers address - * const rpcprog_t prog; -- program number - * const rpcvers_t vers; -- version number - * const u_int sendsz; -- buffer recv size - * const u_int recvsz; -- buffer send size - */ - -/* - * Low level clnt create routine for connectionless transports, e.g. udp. 
- */ -extern CLIENT *clnt_dg_create(const int, const struct netbuf *, - const rpcprog_t, const rpcvers_t, - const u_int, const u_int); -/* - * const int fd; -- open file descriptor - * const struct netbuf *svcaddr; -- servers address - * const rpcprog_t program; -- program number - * const rpcvers_t version; -- version number - * const u_int sendsz; -- buffer recv size - * const u_int recvsz; -- buffer send size - */ - -/* - * Memory based rpc (for speed check and testing) - * CLIENT * - * clnt_raw_create(prog, vers) - * u_long prog; - * u_long vers; - */ -extern CLIENT *clnt_raw_create(rpcprog_t, rpcvers_t); -#endif - __END_DECLS @@ -626,96 +431,6 @@ struct rpc_createerr { struct rpc_err cf_error; /* useful when cf_stat == RPC_PMAPFAILURE */ }; -#ifdef _KERNEL extern struct rpc_createerr rpc_createerr; -#else -__BEGIN_DECLS -extern struct rpc_createerr *__rpc_createerr(void); -__END_DECLS -#define rpc_createerr (*(__rpc_createerr())) -#endif - -#ifndef _KERNEL -/* - * The simplified interface: - * enum clnt_stat - * rpc_call(host, prognum, versnum, procnum, inproc, in, outproc, out, nettype) - * const char *host; - * const rpcprog_t prognum; - * const rpcvers_t versnum; - * const rpcproc_t procnum; - * const xdrproc_t inproc, outproc; - * const char *in; - * char *out; - * const char *nettype; - */ -__BEGIN_DECLS -extern enum clnt_stat rpc_call(const char *, const rpcprog_t, - const rpcvers_t, const rpcproc_t, - const xdrproc_t, const char *, - const xdrproc_t, char *, const char *); -__END_DECLS - -/* - * RPC broadcast interface - * The call is broadcasted to all locally connected nets. 
- * - * extern enum clnt_stat - * rpc_broadcast(prog, vers, proc, xargs, argsp, xresults, resultsp, - * eachresult, nettype) - * const rpcprog_t prog; -- program number - * const rpcvers_t vers; -- version number - * const rpcproc_t proc; -- procedure number - * const xdrproc_t xargs; -- xdr routine for args - * caddr_t argsp; -- pointer to args - * const xdrproc_t xresults; -- xdr routine for results - * caddr_t resultsp; -- pointer to results - * const resultproc_t eachresult; -- call with each result - * const char *nettype; -- Transport type - * - * For each valid response received, the procedure eachresult is called. - * Its form is: - * done = eachresult(resp, raddr, nconf) - * bool_t done; - * caddr_t resp; - * struct netbuf *raddr; - * struct netconfig *nconf; - * where resp points to the results of the call and raddr is the - * address if the responder to the broadcast. nconf is the transport - * on which the response was received. - * - * extern enum clnt_stat - * rpc_broadcast_exp(prog, vers, proc, xargs, argsp, xresults, resultsp, - * eachresult, inittime, waittime, nettype) - * const rpcprog_t prog; -- program number - * const rpcvers_t vers; -- version number - * const rpcproc_t proc; -- procedure number - * const xdrproc_t xargs; -- xdr routine for args - * caddr_t argsp; -- pointer to args - * const xdrproc_t xresults; -- xdr routine for results - * caddr_t resultsp; -- pointer to results - * const resultproc_t eachresult; -- call with each result - * const int inittime; -- how long to wait initially - * const int waittime; -- maximum time to wait - * const char *nettype; -- Transport type - */ - -typedef bool_t (*resultproc_t)(caddr_t, ...); - -__BEGIN_DECLS -extern enum clnt_stat rpc_broadcast(const rpcprog_t, const rpcvers_t, - const rpcproc_t, const xdrproc_t, - caddr_t, const xdrproc_t, caddr_t, - const resultproc_t, const char *); -extern enum clnt_stat rpc_broadcast_exp(const rpcprog_t, const rpcvers_t, - const rpcproc_t, const xdrproc_t, - 
caddr_t, const xdrproc_t, caddr_t, - const resultproc_t, const int, - const int, const char *); -__END_DECLS - -/* For backward compatibility */ -#include -#endif #endif /* !_RPC_CLNT_H_ */ diff --git a/sys/rpc/svc.h b/sys/rpc/svc.h index 43a388984c0070..d8a8d0139cc42d 100644 --- a/sys/rpc/svc.h +++ b/sys/rpc/svc.h @@ -40,14 +40,12 @@ #define _RPC_SVC_H #include -#ifdef _KERNEL #include #include #include #include #include #include -#endif /* * This interface must manage two items concerning remote procedure calling: @@ -95,7 +93,6 @@ struct __rpc_svcxprt; struct mbuf; struct xp_ops { -#ifdef _KERNEL /* receive incoming requests */ bool_t (*xp_recv)(struct __rpc_svcxprt *, struct rpc_msg *, struct sockaddr **, struct mbuf **); @@ -110,34 +107,11 @@ struct xp_ops { void (*xp_destroy)(struct __rpc_svcxprt *); /* catch-all function */ bool_t (*xp_control)(struct __rpc_svcxprt *, const u_int, void *); -#else - /* receive incoming requests */ - bool_t (*xp_recv)(struct __rpc_svcxprt *, struct rpc_msg *); - /* get transport status */ - enum xprt_stat (*xp_stat)(struct __rpc_svcxprt *); - /* get arguments */ - bool_t (*xp_getargs)(struct __rpc_svcxprt *, xdrproc_t, void *); - /* send reply */ - bool_t (*xp_reply)(struct __rpc_svcxprt *, struct rpc_msg *); - /* free mem allocated for args */ - bool_t (*xp_freeargs)(struct __rpc_svcxprt *, xdrproc_t, void *); - /* destroy this struct */ - void (*xp_destroy)(struct __rpc_svcxprt *); -#endif }; -#ifndef _KERNEL -struct xp_ops2 { - /* catch-all function */ - bool_t (*xp_control)(struct __rpc_svcxprt *, const u_int, void *); -}; -#endif - -#ifdef _KERNEL struct __rpc_svcpool; struct __rpc_svcgroup; struct __rpc_svcthread; -#endif /* * Server side transport handle. In the kernel, transports have a @@ -151,7 +125,6 @@ struct __rpc_svcthread; * end for callbacks). 
*/ typedef struct __rpc_svcxprt { -#ifdef _KERNEL volatile u_int xp_refs; struct sx xp_lock; struct __rpc_svcpool *xp_pool; /* owning pool (see below) */ @@ -186,24 +159,6 @@ typedef struct __rpc_svcxprt { uid_t xp_uid; gid_t *xp_gidp; int xp_doneddp; -#else - int xp_fd; - u_short xp_port; /* associated port number */ - const struct xp_ops *xp_ops; - int xp_addrlen; /* length of remote address */ - struct sockaddr_in xp_raddr; /* remote addr. (backward ABI compat) */ - /* XXX - fvdl stick this here for ABI backward compat reasons */ - const struct xp_ops2 *xp_ops2; - char *xp_tp; /* transport provider device name */ - char *xp_netid; /* network token */ - struct netbuf xp_ltaddr; /* local transport address */ - struct netbuf xp_rtaddr; /* remote transport address */ - struct opaque_auth xp_verf; /* raw response verifier */ - void *xp_p1; /* private: for use by svc ops */ - void *xp_p2; /* private: for use by svc ops */ - void *xp_p3; /* private: for use by svc lib */ - int xp_type; /* transport type */ -#endif } SVCXPRT; /* @@ -211,16 +166,9 @@ typedef struct __rpc_svcxprt { */ typedef struct __rpc_svcauth { const struct svc_auth_ops { -#ifdef _KERNEL int (*svc_ah_wrap)(struct __rpc_svcauth *, struct mbuf **); int (*svc_ah_unwrap)(struct __rpc_svcauth *, struct mbuf **); void (*svc_ah_release)(struct __rpc_svcauth *); -#else - int (*svc_ah_wrap)(struct __rpc_svcauth *, XDR *, - xdrproc_t, caddr_t); - int (*svc_ah_unwrap)(struct __rpc_svcauth *, XDR *, - xdrproc_t, caddr_t); -#endif } *svc_ah_ops; void *svc_ah_private; } SVCAUTH; @@ -233,8 +181,6 @@ typedef struct __rpc_svcxprt_ext { SVCAUTH xp_auth; /* interface to auth methods */ } SVCXPRT_EXT; -#ifdef _KERNEL - /* * The services list * Each entry represents a set of procedures (an rpc program). @@ -399,27 +345,6 @@ typedef struct __rpc_svcpool { SVCGROUP sp_groups[SVC_MAXGROUPS]; /* Thread/port groups. 
*/ } SVCPOOL; -#else - -/* - * Service request - */ -struct svc_req { - uint32_t rq_prog; /* service program number */ - uint32_t rq_vers; /* service protocol version */ - uint32_t rq_proc; /* the desired procedure */ - struct opaque_auth rq_cred; /* raw creds from the wire */ - void *rq_clntcred; /* read only cooked cred */ - SVCXPRT *rq_xprt; /* associated transport */ -}; - -/* - * Approved way of getting address of caller - */ -#define svc_getrpccaller(x) (&(x)->xp_rtaddr) - -#endif - /* * Operations defined on an SVCXPRT handle * @@ -428,8 +353,6 @@ struct svc_req { * xdrproc_t xargs; * void * argsp; */ -#ifdef _KERNEL - #define SVC_ACQUIRE(xprt) \ refcount_acquire(&(xprt)->xp_refs) @@ -456,43 +379,6 @@ struct svc_req { #define SVC_CONTROL(xprt, rq, in) \ (*(xprt)->xp_ops->xp_control)((xprt), (rq), (in)) -#else - -#define SVC_RECV(xprt, msg) \ - (*(xprt)->xp_ops->xp_recv)((xprt), (msg)) -#define svc_recv(xprt, msg) \ - (*(xprt)->xp_ops->xp_recv)((xprt), (msg)) - -#define SVC_STAT(xprt) \ - (*(xprt)->xp_ops->xp_stat)(xprt) -#define svc_stat(xprt) \ - (*(xprt)->xp_ops->xp_stat)(xprt) - -#define SVC_GETARGS(xprt, xargs, argsp) \ - (*(xprt)->xp_ops->xp_getargs)((xprt), (xargs), (argsp)) -#define svc_getargs(xprt, xargs, argsp) \ - (*(xprt)->xp_ops->xp_getargs)((xprt), (xargs), (argsp)) - -#define SVC_REPLY(xprt, msg) \ - (*(xprt)->xp_ops->xp_reply) ((xprt), (msg)) -#define svc_reply(xprt, msg) \ - (*(xprt)->xp_ops->xp_reply) ((xprt), (msg)) - -#define SVC_FREEARGS(xprt, xargs, argsp) \ - (*(xprt)->xp_ops->xp_freeargs)((xprt), (xargs), (argsp)) -#define svc_freeargs(xprt, xargs, argsp) \ - (*(xprt)->xp_ops->xp_freeargs)((xprt), (xargs), (argsp)) - -#define SVC_DESTROY(xprt) \ - (*(xprt)->xp_ops->xp_destroy)(xprt) -#define svc_destroy(xprt) \ - (*(xprt)->xp_ops->xp_destroy)(xprt) - -#define SVC_CONTROL(xprt, rq, in) \ - (*(xprt)->xp_ops2->xp_control)((xprt), (rq), (in)) - -#endif - #define SVC_EXT(xprt) \ ((SVCXPRT_EXT *) xprt->xp_p3) @@ -502,19 +388,12 @@ struct 
svc_req { /* * Operations defined on an SVCAUTH handle */ -#ifdef _KERNEL #define SVCAUTH_WRAP(auth, mp) \ ((auth)->svc_ah_ops->svc_ah_wrap(auth, mp)) #define SVCAUTH_UNWRAP(auth, mp) \ ((auth)->svc_ah_ops->svc_ah_unwrap(auth, mp)) #define SVCAUTH_RELEASE(auth) \ ((auth)->svc_ah_ops->svc_ah_release(auth)) -#else -#define SVCAUTH_WRAP(auth, xdrs, xfunc, xwhere) \ - ((auth)->svc_ah_ops->svc_ah_wrap(auth, xdrs, xfunc, xwhere)) -#define SVCAUTH_UNWRAP(auth, xdrs, xfunc, xwhere) \ - ((auth)->svc_ah_ops->svc_ah_unwrap(auth, xdrs, xfunc, xwhere)) -#endif /* * Service registration @@ -542,14 +421,9 @@ __END_DECLS */ __BEGIN_DECLS -#ifdef _KERNEL extern void svc_unreg(SVCPOOL *, const rpcprog_t, const rpcvers_t); -#else -extern void svc_unreg(const rpcprog_t, const rpcvers_t); -#endif __END_DECLS -#ifdef _KERNEL /* * Service connection loss registration * @@ -573,7 +447,6 @@ __END_DECLS __BEGIN_DECLS extern void svc_loss_unreg(SVCPOOL *, void (*)(SVCXPRT *)); __END_DECLS -#endif /* * Transport registration. @@ -596,8 +469,6 @@ extern void xprt_unregister(SVCXPRT *); extern void __xprt_unregister_unlocked(SVCXPRT *); __END_DECLS -#ifdef _KERNEL - /* * Called when a transport has pending requests. 
*/ @@ -608,8 +479,6 @@ extern void xprt_inactive_locked(SVCXPRT *); extern void xprt_inactive_self(SVCXPRT *); __END_DECLS -#endif - /* * When the service routine is called, it must first check to see if it * knows about the procedure; if not, it should call svcerr_noproc @@ -637,7 +506,6 @@ __END_DECLS */ __BEGIN_DECLS -#ifdef _KERNEL extern bool_t svc_sendreply(struct svc_req *, xdrproc_t, void *); extern bool_t svc_sendreply_mbuf(struct svc_req *, struct mbuf *); extern void svcerr_decode(struct svc_req *); @@ -647,16 +515,6 @@ extern void svcerr_progvers(struct svc_req *, rpcvers_t, rpcvers_t); extern void svcerr_auth(struct svc_req *, enum auth_stat); extern void svcerr_noprog(struct svc_req *); extern void svcerr_systemerr(struct svc_req *); -#else -extern bool_t svc_sendreply(SVCXPRT *, xdrproc_t, void *); -extern void svcerr_decode(SVCXPRT *); -extern void svcerr_weakauth(SVCXPRT *); -extern void svcerr_noproc(SVCXPRT *); -extern void svcerr_progvers(SVCXPRT *, rpcvers_t, rpcvers_t); -extern void svcerr_auth(SVCXPRT *, enum auth_stat); -extern void svcerr_noprog(SVCXPRT *); -extern void svcerr_systemerr(SVCXPRT *); -#endif extern int rpc_reg(rpcprog_t, rpcvers_t, rpcproc_t, char *(*)(char *), xdrproc_t, xdrproc_t, char *); @@ -673,20 +531,6 @@ __END_DECLS * "in-place" results of a select system call (see select, section 2). */ -#ifndef _KERNEL -/* - * Global keeper of rpc service descriptors in use - * dynamic; must be inspected before each call to select - */ -extern int svc_maxfd; -#ifdef FD_SETSIZE -extern fd_set svc_fdset; -#define svc_fds svc_fdset.fds_bits[0] /* compatibility */ -#else -extern int svc_fds; -#endif /* def FD_SETSIZE */ -#endif - /* * a small program implemented by the svc_rpc implementation itself; * also see clnt.h for protocol numbers. 
@@ -698,22 +542,11 @@ __END_DECLS __BEGIN_DECLS extern SVCXPRT *svc_xprt_alloc(void); extern void svc_xprt_free(SVCXPRT *); -#ifndef _KERNEL -extern void svc_getreq(int); -extern void svc_getreqset(fd_set *); -extern void svc_getreq_common(int); -struct pollfd; -extern void svc_getreq_poll(struct pollfd *, int); -extern void svc_run(void); -extern void svc_exit(void); -#else extern void svc_run(SVCPOOL *); extern void svc_exit(SVCPOOL *); extern bool_t svc_getargs(struct svc_req *, xdrproc_t, void *); extern bool_t svc_freeargs(struct svc_req *, xdrproc_t, void *); extern void svc_freereq(struct svc_req *); - -#endif __END_DECLS /* @@ -728,8 +561,6 @@ __END_DECLS __BEGIN_DECLS -#ifdef _KERNEL - /* * Create a new service pool. */ @@ -812,110 +643,6 @@ extern SVCXPRT *svc_tli_create(SVCPOOL *, const struct netconfig *, * const size_t sendsz; -- max sendsize * const size_t recvsz; -- max recvsize */ - -#else /* !_KERNEL */ - -/* - * Transport independent svc_create routine. - */ -extern int svc_create(void (*)(struct svc_req *, SVCXPRT *), - const rpcprog_t, const rpcvers_t, const char *); -/* - * void (*dispatch)(); -- dispatch routine - * const rpcprog_t prognum; -- program number - * const rpcvers_t versnum; -- version number - * const char *nettype; -- network type - */ - - -/* - * Generic server creation routine. It takes a netconfig structure - * instead of a nettype. 
- */ - -extern SVCXPRT *svc_tp_create(void (*)(struct svc_req *, SVCXPRT *), - const rpcprog_t, const rpcvers_t, - const struct netconfig *); - /* - * void (*dispatch)(); -- dispatch routine - * const rpcprog_t prognum; -- program number - * const rpcvers_t versnum; -- version number - * const struct netconfig *nconf; -- netconfig structure - */ - -/* - * Generic TLI create routine - */ -extern SVCXPRT *svc_tli_create(const int, const struct netconfig *, - const struct t_bind *, const u_int, - const u_int); -/* - * const int fd; -- connection end point - * const struct netconfig *nconf; -- netconfig structure for network - * const struct t_bind *bindaddr; -- local bind address - * const u_int sendsz; -- max sendsize - * const u_int recvsz; -- max recvsize - */ - -/* - * Connectionless and connectionful create routines - */ - -extern SVCXPRT *svc_vc_create(const int, const u_int, const u_int); -/* - * const int fd; -- open connection end point - * const u_int sendsize; -- max send size - * const u_int recvsize; -- max recv size - */ - -/* - * Added for compatibility to old rpc 4.0. Obsoleted by svc_vc_create(). - */ -extern SVCXPRT *svcunix_create(int, u_int, u_int, char *); - -extern SVCXPRT *svc_dg_create(const int, const u_int, const u_int); - /* - * const int fd; -- open connection - * const u_int sendsize; -- max send size - * const u_int recvsize; -- max recv size - */ - - -/* - * the routine takes any *open* connection - * descriptor as its first input and is used for open connections. - */ -extern SVCXPRT *svc_fd_create(const int, const u_int, const u_int); -/* - * const int fd; -- open connection end point - * const u_int sendsize; -- max send size - * const u_int recvsize; -- max recv size - */ - -/* - * Added for compatibility to old rpc 4.0. Obsoleted by svc_fd_create(). 
- */ -extern SVCXPRT *svcunixfd_create(int, u_int, u_int); - -/* - * Memory based rpc (for speed check and testing) - */ -extern SVCXPRT *svc_raw_create(void); - -/* - * svc_dg_enable_cache() enables the cache on dg transports. - */ -int svc_dg_enablecache(SVCXPRT *, const u_int); - -int __rpc_get_local_uid(SVCXPRT *_transp, uid_t *_uid); - -#endif /* !_KERNEL */ - __END_DECLS -#ifndef _KERNEL -/* for backward compatibility */ -#include -#endif - #endif /* !_RPC_SVC_H */ From ebb36fcaae4dfd3720e43f8e6a9613e5d84227e0 Mon Sep 17 00:00:00 2001 From: Gleb Smirnoff Date: Wed, 8 Jan 2025 20:00:12 -0800 Subject: [PATCH 075/143] rpc: remove svc_create(), it is not used --- sys/rpc/svc.h | 13 -------- sys/rpc/svc_generic.c | 70 ------------------------------------------- 2 files changed, 83 deletions(-) diff --git a/sys/rpc/svc.h b/sys/rpc/svc.h index d8a8d0139cc42d..92755a1984883f 100644 --- a/sys/rpc/svc.h +++ b/sys/rpc/svc.h @@ -578,19 +578,6 @@ extern void svcpool_destroy(SVCPOOL *pool); */ extern void svcpool_close(SVCPOOL *pool); -/* - * Transport independent svc_create routine. - */ -extern int svc_create(SVCPOOL *, void (*)(struct svc_req *, SVCXPRT *), - const rpcprog_t, const rpcvers_t, const char *); -/* - * void (*dispatch)(); -- dispatch routine - * const rpcprog_t prognum; -- program number - * const rpcvers_t versnum; -- version number - * const char *nettype; -- network type - */ - - /* * Generic server creation routine. It takes a netconfig structure * instead of a nettype. diff --git a/sys/rpc/svc_generic.c b/sys/rpc/svc_generic.c index 6fb43dc5c9406f..12c96eca27c871 100644 --- a/sys/rpc/svc_generic.c +++ b/sys/rpc/svc_generic.c @@ -64,76 +64,6 @@ extern int __svc_vc_setflag(SVCXPRT *, int); -/* - * The highest level interface for server creation. - * It tries for all the nettokens in that particular class of token - * and returns the number of handles it can create and/or find. - * - * It creates a link list of all the handles it could create. 
- * If svc_create() is called multiple times, it uses the handle - * created earlier instead of creating a new handle every time. - */ -int -svc_create( - SVCPOOL *pool, - void (*dispatch)(struct svc_req *, SVCXPRT *), - rpcprog_t prognum, /* Program number */ - rpcvers_t versnum, /* Version number */ - const char *nettype) /* Networktype token */ -{ - int g, num = 0; - SVCGROUP *grp; - SVCXPRT *xprt; - struct netconfig *nconf; - void *handle; - - if ((handle = __rpc_setconf(nettype)) == NULL) { - printf("svc_create: unknown protocol"); - return (0); - } - while ((nconf = __rpc_getconf(handle)) != NULL) { - for (g = 0; g < SVC_MAXGROUPS; g++) { - grp = &pool->sp_groups[g]; - mtx_lock(&grp->sg_lock); - TAILQ_FOREACH(xprt, &grp->sg_xlist, xp_link) { - if (strcmp(xprt->xp_netid, nconf->nc_netid)) - continue; - /* Found an old one, use it */ - mtx_unlock(&grp->sg_lock); - (void) rpcb_unset(prognum, versnum, nconf); - if (svc_reg(xprt, prognum, versnum, - dispatch, nconf) == FALSE) { - printf( - "svc_create: could not register prog %u vers %u on %s\n", - (unsigned)prognum, (unsigned)versnum, - nconf->nc_netid); - mtx_lock(&grp->sg_lock); - } else { - num++; - mtx_lock(&grp->sg_lock); - break; - } - } - mtx_unlock(&grp->sg_lock); - } - if (xprt == NULL) { - /* It was not found. Now create a new one */ - xprt = svc_tp_create(pool, dispatch, prognum, versnum, - NULL, nconf); - if (xprt) { - num++; - SVC_RELEASE(xprt); - } - } - } - __rpc_endconf(handle); - /* - * In case of num == 0; the error messages are generated by the - * underlying layers; and hence not needed here. - */ - return (num); -} - /* * The high level interface to svc_tli_create(). 
* It tries to create a server for "nconf" and registers the service From 8e8f8d86e305fe1e90fcfc64c1958e61b359f4f4 Mon Sep 17 00:00:00 2001 From: Gleb Smirnoff Date: Wed, 8 Jan 2025 20:00:12 -0800 Subject: [PATCH 076/143] rpcbind: remove extraneous check for nconf not being unix(4) We are already inside an if block with exactly same predicate. --- usr.sbin/rpcbind/rpcbind.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/usr.sbin/rpcbind/rpcbind.c b/usr.sbin/rpcbind/rpcbind.c index 44adae366bede3..a836afd24009f9 100644 --- a/usr.sbin/rpcbind/rpcbind.c +++ b/usr.sbin/rpcbind/rpcbind.c @@ -416,18 +416,14 @@ init_transport(struct netconfig *nconf) */ if (strcmp("*", hosts[nhostsbak]) == 0) hosts[nhostsbak] = NULL; - if ((strcmp(nconf->nc_netid, "local") != 0) && - (strcmp(nconf->nc_netid, "unix") != 0)) { - if ((aicode = getaddrinfo(hosts[nhostsbak], - servname, &hints, &res)) != 0) { - syslog(LOG_ERR, - "cannot get local address for %s: %s", + if ((aicode = getaddrinfo(hosts[nhostsbak], servname, &hints, + &res)) != 0) { + syslog(LOG_ERR, "cannot get local address for %s: %s", nconf->nc_netid, gai_strerror(aicode)); continue; - } - addrlen = res->ai_addrlen; - sa = (struct sockaddr *)res->ai_addr; } + addrlen = res->ai_addrlen; + sa = (struct sockaddr *)res->ai_addr; oldmask = umask(S_IXUSR|S_IXGRP|S_IXOTH); if (bind(fd, sa, addrlen) != 0) { syslog(LOG_ERR, "cannot bind %s on %s: %m", From e7fbf52a3e38c4bc4249e6541fe7e42ecc119656 Mon Sep 17 00:00:00 2001 From: Michael Tuexen Date: Thu, 9 Jan 2025 06:27:05 +0100 Subject: [PATCH 077/143] TCP BBR: remove dead code No functional change intended. Reviewed by: Peter Lei, rrs (earlier version) CID: 1523802 MFC after: 1 week Sponsored by: Netflix, Inc. 
Differential Revision: https://reviews.freebsd.org/D48341 --- sys/netinet/tcp_stacks/bbr.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/sys/netinet/tcp_stacks/bbr.c b/sys/netinet/tcp_stacks/bbr.c index 4600088bd1a119..17a0744961ce83 100644 --- a/sys/netinet/tcp_stacks/bbr.c +++ b/sys/netinet/tcp_stacks/bbr.c @@ -6781,8 +6781,6 @@ bbr_update_rtt(struct tcpcb *tp, struct tcp_bbr *bbr, t = cts - rsm->r_tim_lastsent[0]; else t = 1; - if ((int)t <= 0) - t = 1; bbr->r_ctl.rc_last_rtt = t; bbr_update_bbr_info(bbr, rsm, t, cts, to->to_tsecr, 0, BBR_RTT_BY_EXACTMATCH, rsm->r_tim_lastsent[0], ack_type, to); @@ -6823,8 +6821,6 @@ bbr_update_rtt(struct tcpcb *tp, struct tcp_bbr *bbr, t = cts - rsm->r_tim_lastsent[i]; else t = 1; - if ((int)t <= 0) - t = 1; bbr->r_ctl.rc_last_rtt = t; bbr_update_bbr_info(bbr, rsm, t, cts, to->to_tsecr, uts, BBR_RTT_BY_TSMATCHING, rsm->r_tim_lastsent[i], ack_type, to); From 1b4e1171315398decb1ad3fceffcacf29cff218b Mon Sep 17 00:00:00 2001 From: Emmanuel Vadot Date: Tue, 7 Jan 2025 09:57:57 +0100 Subject: [PATCH 078/143] loader: Fix orb position Fix the orb position to be aligned with the menu Differential Revision: https://reviews.freebsd.org/D48353 Reviewed by: imp, tsoome Sponsored by: Beckhoff Automation GmbH & Co. 
KG --- stand/lua/drawer.lua | 4 ++-- stand/lua/gfx-orb.lua | 2 +- stand/lua/gfx-orbbw.lua | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/stand/lua/drawer.lua b/stand/lua/drawer.lua index e55702ffee6c52..2d04e29ac46234 100644 --- a/stand/lua/drawer.lua +++ b/stand/lua/drawer.lua @@ -470,9 +470,9 @@ logodefs = { } brand_position = {x = 2, y = 1} -logo_position = {x = 46, y = 4} +logo_position = {x = 40, y = 10} menu_position = {x = 5, y = 10} -frame_size = {w = 42, h = 14} +frame_size = {w = 39, h = 14} default_shift = {x = 0, y = 0} shift = default_shift diff --git a/stand/lua/gfx-orb.lua b/stand/lua/gfx-orb.lua index 00f4aeb3bcebd1..cd834a2d6b8eca 100644 --- a/stand/lua/gfx-orb.lua +++ b/stand/lua/gfx-orb.lua @@ -45,7 +45,7 @@ return { " .---.....----.\027[m", }, requires_color = true, - shift = {x = 2, y = 3}, + shift = {x = 2, y = -1}, image = "/boot/images/freebsd-logo-rev.png", image_rl = 15 } diff --git a/stand/lua/gfx-orbbw.lua b/stand/lua/gfx-orbbw.lua index 93ffd2366196a9..a97174a6a5a4b7 100644 --- a/stand/lua/gfx-orbbw.lua +++ b/stand/lua/gfx-orbbw.lua @@ -44,6 +44,6 @@ return { " .-- `--.", " .---.....----.", }, - shift = {x = 2, y = 4}, + shift = {x = 2, y = -1}, } } From ee233742a5697f64d0f1d722b5e73ff2c5998c62 Mon Sep 17 00:00:00 2001 From: Emmanuel Vadot Date: Tue, 7 Jan 2025 10:34:35 +0100 Subject: [PATCH 079/143] loader: Rework kernel menu section With pkgbase we can have long kernel name, so create a new section for the kernel name. Do not show the "default" text, we already show the "1 of X" part at the end of the line and the default kernel is always number 1 so it's a bit redundant. Differential Revision: https://reviews.freebsd.org/D48354 Reviewed by: imp, tsoome Sponsored by: Beckhoff Automation GmbH & Co. 
KG --- stand/lua/menu.lua | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/stand/lua/menu.lua b/stand/lua/menu.lua index 2d92be3b7c6e27..7c36b6c8d3c8ec 100644 --- a/stand/lua/menu.lua +++ b/stand/lua/menu.lua @@ -255,9 +255,16 @@ menu.welcome = { }, { entry_type = core.MENU_SEPARATOR, - name = "Options:", + name = "Kernel:", }, menu_entries.kernel_options, + { + entry_type = core.MENU_SEPARATOR, + }, + { + entry_type = core.MENU_SEPARATOR, + name = "Options:", + }, menu_entries.boot_options, menu_entries.zpool_checkpoints, menu_entries.boot_envs, @@ -332,22 +339,19 @@ menu.welcome = { items = core.kernelList, name = function(idx, choice, all_choices) if #all_choices == 0 then - return "Kernel: " + return "" end - local is_default = (idx == 1) - local kernel_name = "" + local kernel_name local name_color - if is_default then + if idx == 1 then name_color = color.escapefg(color.GREEN) - kernel_name = "default/" else name_color = color.escapefg(color.CYAN) end - kernel_name = kernel_name .. name_color .. - choice .. color.resetfg() - return color.highlight("K") .. "ernel: " .. - kernel_name .. " (" .. idx .. " of " .. + kernel_name = name_color .. choice .. + color.resetfg() + return kernel_name .. " (" .. idx .. " of " .. #all_choices .. 
")" end, func = function(_, choice, _) From a8d9bd3fa5855fe7583ed05946296ab6b9937d69 Mon Sep 17 00:00:00 2001 From: Baptiste Daroussin Date: Wed, 8 Jan 2025 12:13:54 +0100 Subject: [PATCH 080/143] bintrans(1): qp switch to getopt_long In preparation for more arguments, switch bintrans qp argument parsing to getopt_long, while here make the decoding argument being -d|--decode for compatibility with base64 encoding/decoding MFC After: 1 week Reviewed by: pstef Differential Revision: https://reviews.freebsd.org/D48380 --- usr.bin/bintrans/bintrans.1 | 4 +-- usr.bin/bintrans/qp.c | 61 +++++++++++++++++++++---------------- 2 files changed, 36 insertions(+), 29 deletions(-) diff --git a/usr.bin/bintrans/bintrans.1 b/usr.bin/bintrans/bintrans.1 index 3376ecd332edcc..4177a5c6b9ebec 100644 --- a/usr.bin/bintrans/bintrans.1 +++ b/usr.bin/bintrans/bintrans.1 @@ -25,7 +25,7 @@ .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" -.Dd January 23, 2024 +.Dd January 8, 2025 .Dt BINTRANS 1 .Os .Sh NAME @@ -230,7 +230,7 @@ through a dedicated program: is a quoted-printable converter and accepts the following options: .Bl -tag -width indent -.It Fl u +.It Fl d Decode. 
.It Fl o Ar output_file Output to diff --git a/usr.bin/bintrans/qp.c b/usr.bin/bintrans/qp.c index c2c9dfa7a224b7..3bff47945acf9e 100644 --- a/usr.bin/bintrans/qp.c +++ b/usr.bin/bintrans/qp.c @@ -26,6 +26,7 @@ */ #include +#include #include #include #include @@ -151,44 +152,50 @@ static void usage(void) { fprintf(stderr, - "usage: bintrans qp [-u] [-o outputfile] [file name]\n"); + "usage: bintrans qp [-d] [-o outputfile] [file name]\n"); } int main_quotedprintable(int argc, char *argv[]) { - int i; + int ch; bool encode = true; FILE *fp = stdin; FILE *fpo = stdout; - for (i = 1; i < argc; ++i) { - if (argv[i][0] == '-') { - switch (argv[i][1]) { - case 'o': - if (++i >= argc) { - fprintf(stderr, "qp: -o requires a file name.\n"); - exit(EXIT_FAILURE); - } - fpo = fopen(argv[i], "w"); - if (fpo == NULL) { - perror(argv[i]); - exit(EXIT_FAILURE); - } - break; - case 'u': - encode = false; - break; - default: - usage(); - exit(EXIT_FAILURE); - } - } else { - fp = fopen(argv[i], "r"); - if (fp == NULL) { - perror(argv[i]); + static const struct option opts[] = + { + { "decode", no_argument, NULL, 'd'}, + { "output", required_argument, NULL, 'o'}, + {NULL, no_argument, NULL, 0} + }; + + while ((ch = getopt_long(argc, argv, "do:u", opts, NULL)) != -1) { + switch(ch) { + case 'o': + fpo = fopen(optarg, "w"); + if (fpo == NULL) { + perror(optarg); exit(EXIT_FAILURE); } + break; + case 'u': + /* FALLTHROUGH for backward compatibility */ + case 'd': + encode = false; + break; + default: + usage(); + exit(EXIT_FAILURE); + } + }; + argc -= optind; + argv += optind; + if (argc > 0) { + fp = fopen(argv[0], "r"); + if (fp == NULL) { + perror(argv[0]); + exit(EXIT_FAILURE); } } qp(fp, fpo, encode); From beab8b1ddf86a88e0605562f2cc1d6a26c68604f Mon Sep 17 00:00:00 2001 From: Baptiste Daroussin Date: Wed, 8 Jan 2025 13:39:30 +0100 Subject: [PATCH 081/143] bintrans(1): RFC2047 variant of quoted print MFC After: 1 week Reviewed by: pstref Differential Revision: 
https://reviews.freebsd.org/D48381 --- usr.bin/bintrans/bintrans.1 | 2 ++ usr.bin/bintrans/qp.c | 32 +++++++++++++++++++++++--------- 2 files changed, 25 insertions(+), 9 deletions(-) diff --git a/usr.bin/bintrans/bintrans.1 b/usr.bin/bintrans/bintrans.1 index 4177a5c6b9ebec..02571067c8f0c0 100644 --- a/usr.bin/bintrans/bintrans.1 +++ b/usr.bin/bintrans/bintrans.1 @@ -236,6 +236,8 @@ Decode. Output to .Ar output_file instead of standard output. +.It Fl r +Encode/Decode in RFC2047 specific variant. .El .Sh EXAMPLES The following example packages up a source tree, compresses it, diff --git a/usr.bin/bintrans/qp.c b/usr.bin/bintrans/qp.c index 3bff47945acf9e..862db437f4e066 100644 --- a/usr.bin/bintrans/qp.c +++ b/usr.bin/bintrans/qp.c @@ -51,7 +51,7 @@ decode_char(const char *s) static void -decode_quoted_printable(const char *body, FILE *fpo) +decode_quoted_printable(const char *body, FILE *fpo, bool rfc2047) { while (*body != '\0') { switch (*body) { @@ -80,6 +80,12 @@ decode_quoted_printable(const char *body, FILE *fpo) fputc(decode_char(body), fpo); body += 2; break; + case '_': + if (rfc2047) { + fputc(0x20, fpo); + break; + } + /* FALLTHROUGH */ default: fputc(*body, fpo); break; @@ -89,7 +95,7 @@ decode_quoted_printable(const char *body, FILE *fpo) } static void -encode_quoted_printable(const char *body, FILE *fpo) +encode_quoted_printable(const char *body, FILE *fpo, bool rfc2047) { const char *end = body + strlen(body); size_t linelen = 0; @@ -111,7 +117,10 @@ encode_quoted_printable(const char *body, FILE *fpo) if ((*body == ' ' || *body == '\t') && body + 1 < end && (body[1] != '\n' && body[1] != '\r')) { - fputc(*body, fpo); + if (*body == 0x20 && rfc2047) + fputc('_', fpo); + else + fputc(*body, fpo); prev = *body; } else { fprintf(fpo, "=%02X", (unsigned char)*body); @@ -135,16 +144,16 @@ encode_quoted_printable(const char *body, FILE *fpo) } static void -qp(FILE *fp, FILE *fpo, bool encode) +qp(FILE *fp, FILE *fpo, bool encode, bool rfc2047) { char 
*line = NULL; size_t linecap = 0; - void (*codec)(const char *line, FILE *f); + void (*codec)(const char *line, FILE *f, bool rfc2047); codec = encode ? encode_quoted_printable : decode_quoted_printable ; while (getline(&line, &linecap, fp) > 0) - codec(line, fpo); + codec(line, fpo, rfc2047); free(line); } @@ -152,7 +161,7 @@ static void usage(void) { fprintf(stderr, - "usage: bintrans qp [-d] [-o outputfile] [file name]\n"); + "usage: bintrans qp [-d] [-r] [-o outputfile] [file name]\n"); } int @@ -160,6 +169,7 @@ main_quotedprintable(int argc, char *argv[]) { int ch; bool encode = true; + bool rfc2047 = false; FILE *fp = stdin; FILE *fpo = stdout; @@ -167,10 +177,11 @@ main_quotedprintable(int argc, char *argv[]) { { "decode", no_argument, NULL, 'd'}, { "output", required_argument, NULL, 'o'}, + { "rfc2047", no_argument, NULL, 'r'}, {NULL, no_argument, NULL, 0} }; - while ((ch = getopt_long(argc, argv, "do:u", opts, NULL)) != -1) { + while ((ch = getopt_long(argc, argv, "do:ru", opts, NULL)) != -1) { switch(ch) { case 'o': fpo = fopen(optarg, "w"); @@ -184,6 +195,9 @@ main_quotedprintable(int argc, char *argv[]) case 'd': encode = false; break; + case 'r': + rfc2047 = true; + break; default: usage(); exit(EXIT_FAILURE); @@ -198,7 +212,7 @@ main_quotedprintable(int argc, char *argv[]) exit(EXIT_FAILURE); } } - qp(fp, fpo, encode); + qp(fp, fpo, encode, rfc2047); return (EXIT_SUCCESS); } From 4413d9f3775118c4d8c082a404dd7eb04fe9636a Mon Sep 17 00:00:00 2001 From: Baptiste Daroussin Date: Thu, 9 Jan 2025 10:47:22 +0100 Subject: [PATCH 082/143] usb_vendors: update to 2024.12.04 --- share/misc/usb_vendors | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/share/misc/usb_vendors b/share/misc/usb_vendors index 41b367d1a0d6bb..3304de282c1853 100644 --- a/share/misc/usb_vendors +++ b/share/misc/usb_vendors @@ -9,8 +9,8 @@ # The latest version can be obtained from # http://www.linux-usb.org/usb.ids # -# Version: 2024.07.04 -# Date: 2024-07-04 20:34:02 
+# Version: 2024.12.04 +# Date: 2024-12-04 20:34:02 # # Vendors, devices and interfaces. Please keep sorted. @@ -16876,7 +16876,7 @@ 0256 Schwalm & Tate LLC pISO Raspberry Pi Hat 053a Hackerspace San Salvador HSSV SAMR21-Mote 0cbd Andrzej Szombierski kuku.eu.org keyboard - 0d32 ODrive Robotics ODrive v3 + 0d32 ODrive Robotics ODrive 1001 InterBiometrics Hub 1002 InterBiometrics Relais 1003 InterBiometrics IBSecureCam-P From 2f82bf3521f955c0ef9cc0019b7f86c13020660c Mon Sep 17 00:00:00 2001 From: Baptiste Daroussin Date: Thu, 9 Jan 2025 10:47:59 +0100 Subject: [PATCH 083/143] pci_vendors: update to 2024.11.25 --- share/misc/pci_vendors | 602 ++++++++++++++++++++++++++++++++++------- 1 file changed, 509 insertions(+), 93 deletions(-) diff --git a/share/misc/pci_vendors b/share/misc/pci_vendors index 968338dd109999..0eebacf92d410d 100644 --- a/share/misc/pci_vendors +++ b/share/misc/pci_vendors @@ -1,8 +1,8 @@ # # List of PCI ID's # -# Version: 2024.09.20 -# Date: 2024-09-20 03:15:02 +# Version: 2024.11.25 +# Date: 2024-11-25 03:15:02 # # Maintained by Albert Pool, Martin Mares, and other volunteers from # the PCI ID Project at https://pci-ids.ucw.cz/. 
@@ -46,13 +46,20 @@ 7a10 Hyper Transport Bridge Controller 7a14 EHCI USB Controller 7a15 Vivante GPU (Graphics Processing Unit) + 7a18 SATA 3 AHCI Controller 7a19 PCI-to-PCI Bridge + 7a1b SPI Controller 7a24 OHCI USB Controller # Found on 7A2000 PCH 7a25 LG100 GPU 7a29 PCI-to-PCI Bridge + 7a34 xHCI USB Controller # Found on 7A2000 PCH 7a36 Display Controller + 7a39 PCIe x1 Root Port + 7a49 PCIe x4 Root Port + 7a59 PCIe x8 Root Port + 7a69 PCIe x16 Root Port 0018 Fn-Link Technology Limited 6252 6252CPUB 802.11ax PCIe Wireless Network Adapter 001c PEAK-System Technik GmbH @@ -3992,6 +3999,7 @@ # Reference 1002 0e3a Radeon RX 6950 XT 1849 5230 Navi 21 [ASRock OC Forumla Radeon RX 6950XT] + 1849 5238 Navi 21 [ASRock Radeon RX 6950 XT Phantom Gaming OC] 1da2 441d Navi 21 [Sapphire Nitro+ Radeon RX 6950 XT] 1eae 6950 Navi 21 [XFX Speedster MERC319 Radeon RX 6950 XT] 73ab Navi 21 Pro-XLA [Radeon Pro W6800X/Radeon Pro W6800X Duo] @@ -4055,11 +4063,13 @@ 744c Navi 31 [Radeon RX 7900 XT/7900 XTX/7900 GRE/7900M] 1002 0e3b RX 7900 XTX / RX 7900 GRE [XFX] 1043 0506 TUF Gaming Radeon RX 7900 XTX OC + 148c 2425 HELLHOUND RX 7900 GRE 1849 5304 Radeon RX 7900 XTX 1da2 471e PULSE RX 7900 XTX 1da2 475e PULSE RX 7900 GRE 1da2 e471 NITRO+ RX 7900 XTX Vapor-X 1eae 7901 RX-79XMERCB9 [SPEEDSTER MERC 310 RX 7900 XTX] + 1eae 790a RX-79GMERCBR [XFX RX 7900 GRE] 745e Navi 31 [Radeon Pro W7800] 7460 Navi32 GL-XL [AMD Radeon PRO V710] 7461 Navi 32 [AMD Radeon PRO V710] @@ -4075,7 +4085,11 @@ 7499 Navi 33 [Radeon RX 7400/7300/Pro W7400] 74a0 Aqua Vanjaram [Instinct MI300A] 74a1 Aqua Vanjaram [Instinct MI300X] + 74a2 Aqua Vanjaram [Instinct MI308X] + 74a5 Aqua Vanjaram [Instinct MI325X] + 74a9 Aqua Vanjaram [Instinct MI300X HF] 74b5 Aqua Vanjaram [Instinct MI300X VF] + 74bd Aqua Vanjaram [Instinct MI300X HF] 7833 RS350 Host Bridge 7834 RS350 [Radeon 9100 PRO/XT IGP] 7835 RS350M [Mobility Radeon 9000 IGP] @@ -4434,7 +4448,6 @@ 99a2 Trinity 2 [Radeon HD 7420G] 99a4 Trinity 2 [Radeon HD 7400G] aa00 
R600 HDMI Audio [Radeon HD 2900 GT/PRO/XT] - aa01 RV635 HDMI Audio [Radeon HD 3650/3730/3750] aa08 RV630 HDMI Audio [Radeon HD 2600 PRO/XT / HD 3610] aa10 RV610 HDMI Audio [Radeon HD 2350 PRO / 2400 PRO/XT / HD 3410] 174b aa10 Radeon HD 2400 PRO @@ -5007,6 +5020,14 @@ 1202 Family 10h Processor DRAM Controller 1203 Family 10h Processor Miscellaneous Control 1204 Family 10h Processor Link Control + 12c0 Turin Data Fabric; Function 0 + 12c1 Turin Data Fabric; Function 1 + 12c2 Turin Data Fabric; Function 2 + 12c3 Turin Data Fabric; Function 3 + 12c4 Turin Data Fabric; Function 4 + 12c5 Turin Data Fabric; Function 5 + 12c6 Turin Data Fabric; Function 6 + 12c7 Turin Data Fabric; Function 7 1300 Family 11h Processor HyperTransport Configuration 1301 Family 11h Processor Address Map 1302 Family 11h Processor DRAM Controller @@ -5130,6 +5151,7 @@ 1480 Starship/Matisse Root Complex 1462 7c37 X570-A PRO motherboard 15d9 1b95 H12SSL-i + 1849 1480 ROME2D32LM3 1481 Starship/Matisse IOMMU 1482 Starship/Matisse PCIe Dummy Host Bridge 1483 Starship/Matisse GPP Bridge @@ -5163,18 +5185,72 @@ 149c Matisse USB 3.0 Host Controller 1462 7c37 X570-A PRO motherboard 149d Vangogh CVIP + 149e Genoa/Bergamo IOMMU + 149f Genoa/Bergamo Dummy Host Bridge + 14a4 Genoa/Bergamo Root Complex + 14a5 Genoa/Bergamo GPP Bridge + 14a6 Genoa/Bergamo RCEC + 14a7 Genoa/Bergamo Internal PCIe GPP Bridge to Bus [D:B] + 14aa Genoa/Bergamo GPP Bridge + 14ab Genoa/Bergamo GPP Bridge + 14ac Genoa/Bergamo Dummy Function + 14ad Genoa/Bergamo Data Fabric; Function 0 + 14ae Genoa/Bergamo Data Fabric; Function 1 + 14af Genoa/Bergamo Data Fabric; Function 2 + 14b0 Genoa/Bergamo Data Fabric; Function 3 + 14b1 Genoa/Bergamo Data Fabric; Function 4 + 14b2 Genoa/Bergamo Data Fabric; Function 5 + 14b3 Genoa/Bergamo Data Fabric; Function 6 + 14b4 Genoa/Bergamo Data Fabric; Function 7 14b5 Family 17h-19h PCIe Root Complex 14b6 Family 17h-19h IOMMU 14b7 Family 17h-19h PCIe Dummy Host Bridge 14b8 Family 17h-19h PCIe GPP Bridge 
14b9 Family 17h-19h Internal PCIe GPP Bridge 14ba Family 17h-19h PCIe GPP Bridge + 14c1 Secondary vNTB # Server device 14ca Genoa CCP/PSP 4.0 Device 14cd Family 19h USB4/Thunderbolt PCIe tunnel - 14de Phoenix PCIe Dummy Function + 14d8 Raphael/Granite Ridge Root Complex + 14d9 Raphael/Granite Ridge IOMMU + 14da Raphael/Granite Ridge Dummy Host Bridge + 14db Raphael/Granite Ridge GPP Bridge + 14dc SDXI + 14dd Raphael/Granite Ridge Internal GPP Bridge to Bus [C:A] + 14de Raphael/Granite Ridge PCIe Dummy Function + 14e0 Raphael/Granite Ridge Data Fabric; Function 0 + 14e1 Raphael/Granite Ridge Data Fabric; Function 1 + 14e2 Raphael/Granite Ridge Data Fabric; Function 2 + 14e3 Raphael/Granite Ridge Data Fabric; Function 3 + 14e4 Raphael/Granite Ridge Data Fabric; Function 4 + 14e5 Raphael/Granite Ridge Data Fabric; Function 5 + 14e6 Raphael/Granite Ridge Data Fabric; Function 6 + 14e7 Raphael/Granite Ridge Data Fabric; Function 7 + 14e8 Phoenix Root Complex + 14e9 Phoenix IOMMU + 14ea Phoenix Dummy Host Bridge + 14eb Phoenix Internal GPP Bridge to Bus [C:A] + 14ec Phoenix Dummy Function + 14ed Phoenix GPP Bridge + 14ee Phoenix GPP Bridge 14ef Family 19h USB4/Thunderbolt PCIe tunnel + 14f0 Phoenix Data Fabric; Function 0 + 14f1 Phoenix Data Fabric; Function 1 + 14f2 Phoenix Data Fabric; Function 2 + 14f3 Phoenix Data Fabric; Function 3 + 14f4 Phoenix Data Fabric; Function 4 + 14f5 Phoenix Data Fabric; Function 5 + 14f6 Phoenix Data Fabric; Function 6 + 14f7 Phoenix Data Fabric; Function 7 1502 AMD IPU Device + 1507 Strix Root Complex + 1508 Strix IOMMU + 1509 Strix Dummy Host Bridge + 150a Strix PCIe USB4 Bridge + 150b Strix GPP Bridge + 150c Strix Internal GPP Bridge to Bus [C:A] + 150d Strix PCIe Dummy function 1510 Family 14h Processor Root Complex 174b 1001 PURE Fusion Mini 1512 Family 14h Processor Root Port @@ -5194,12 +5270,11 @@ 1537 Kabini/Mullins PSP-Platform Security Processor 1538 Family 16h Processor Function 0 1539 Kabini P2P Bridge for PCIe Ports[4:0] -# 
AMD EPYC Turin CPU - 153a Family 1Ah (Models 00h-0Fh) Root Complex -# AMD EPYC Turin CPU - 153b Family 1Ah (Models 00h-0Fh) IOMMU -# AMD EPYC Turin CPU - 153d Family 1Ah (Models 00h-0Fh) PCIe Dummy Host Bridge + 153a Turin Root Complex + 153b Turin IOMMU + 153c Turin RCEC + 153d Turin PCIe Dummy Host Bridge + 153e Turin GPP Bridge 1540 Kryptos/Cato/Garfield/Garfield+/Arlene/Pooky HT Configuration 1541 Kryptos/Cato/Garfield/Garfield+/Arlene/Pooky Address Maps 1542 Kryptos/Cato/Garfield/Garfield+/Arlene/Pooky DRAM Configuration @@ -5216,10 +5291,10 @@ 154f Anubis Audio Processor 1550 Garfield+/Arlene/Pooky/Anubis SPLL Configuration 1553 Arlene/Pooky P2P Bridge for PCIE (3:0) -# AMD EPYC Turin CPU - 1555 Family 1Ah (Models 00h-0Fh) Internal PCIe GPP Bridge -# AMD EPYC Turin CPU - 1556 Family 1Ah (Models 00h-0Fh) PCIe Dummy Function + 1554 Turin GPP Bridge + 1555 Turin Internal PCIe GPP Bridge to Bus [D:C] + 1556 Turin PCIe Dummy Function + 1557 Turin USB 3.1 xHCI 155b Anubis Root Complex 155c Anubis IOMMU 155d Anubis UMI PCIe Dummy Bridge @@ -5228,6 +5303,7 @@ 1566 Family 16h (Models 30h-3fh) Processor Root Complex 1567 Mullins IOMMU 156b Family 16h (Models 30h-3fh) Host Bridge + 156e Turin CCP/ASP 1570 Family 15h (Models 60h-6fh) Processor Function 0 1571 Family 15h (Models 60h-6fh) Processor Function 1 1572 Family 15h (Models 60h-6fh) Processor Function 2 @@ -5266,11 +5342,13 @@ 15b3 Stoney Miscellaneous Configuration 15b4 Stoney PM Configuration 15b5 Stoney NB Performance Monitor + 15b6 Raphael/Granite Ridge USB 3.1 xHCI + 15b7 Raphael/Granite Ridge USB 3.1 xHCI 15bc Stoney PCIe [GFX,GPP] Bridge [4:0] 15be Stoney Audio Processor 15c4 Phoenix USB4/Thunderbolt NHI controller #1 15c5 Phoenix USB4/Thunderbolt NHI controller #2 - 15c7 Family 19h (Model 74h) CCP/PSP 3.0 Device + 15c7 Phoenix CCP/PSP 3.0 Device 15d0 Raven/Raven2 Root Complex 103c 8615 Pavilion Laptop 15-cw1xxx 1043 876b PRIME B450M-A Motherboard @@ -5309,7 +5387,7 @@ 15e2 ACP/ACP3X/ACP6x Audio Coprocessor 
17aa 5124 ThinkPad E595 ea50 ce19 mCOM10-L1900 - 15e3 Family 17h/19h HD Audio Controller + 15e3 Family 17h/19h/1ah HD Audio Controller 103c 8615 Pavilion Laptop 15-cw1xxx 103c 8b17 ProBook 445 G9/455 G9 1043 86c7 PRIME B450M-A Motherboard @@ -5402,6 +5480,8 @@ 1647 VanGogh PCIe GPP Bridge 1648 VanGogh Internal PCIe GPP Bridge to Bus 1649 Family 19h PSP/CCP + 164a Sensor Fusion Hub + 164b Non-Sensor Fusion Hub 164f Milan IOMMU 1650 Milan Data Fabric; Function 0 1651 Milan Data Fabric; Function 1 @@ -5437,6 +5517,14 @@ 167e Rembrandt Data Fabric: Device 18h; Function 5 167f Rembrandt Data Fabric: Device 18h; Function 6 1680 Rembrandt Data Fabric: Device 18h; Function 7 + 16f8 Strix Data Fabric; Function 0 + 16f9 Strix Data Fabric; Function 1 + 16fa Strix Data Fabric; Function 2 + 16fb Strix Data Fabric; Function 3 + 16fc Strix Data Fabric; Function 4 + 16fd Strix Data Fabric; Function 5 + 16fe Strix Data Fabric; Function 6 + 16ff Strix Data Fabric; Function 7 1700 Family 12h/14h Processor Function 0 1701 Family 12h/14h Processor Function 1 1702 Family 12h/14h Processor Function 2 @@ -5454,6 +5542,8 @@ 1716 Family 12h/14h Processor Function 5 1718 Family 12h/14h Processor Function 6 1719 Family 12h/14h Processor Function 7 + 17e0 Strix CCP/ASP + 17f0 Strix Neural Processing Unit 2000 79C97x [PCnet32 LANCE] 1014 2000 NetFinity 10/100 Fast Ethernet 1022 2000 PCnet - Fast 79C971 @@ -5808,7 +5898,8 @@ 5225 M5225 5229 M5229 5235 M5235 - 5237 M5237 PCI USB Host Controller + 5237 OHCI USB Controller + 5239 EHCI USB Controller 5240 EIDE Controller 5241 PCMCIA Bridge 5242 General Purpose Controller @@ -6731,28 +6822,28 @@ c066 3010S Ultra3 Dual Channel 1045 OPTi Inc. 
a0f8 82C750 [Vendetta] USB Controller - c101 92C264 + c101 82C264 GUI Accelerator c178 92C178 c556 82X556 [Viper] c557 82C557 [Viper-M] c558 82C558 [Viper-M ISA+IDE] - c567 82C750 [Vendetta], device 0 - c568 82C750 [Vendetta], device 1 + c567 82C750 [Vendetta] Host Bridge + c568 82C750 [Vendetta] ISA Bridge c569 82C579 [Viper XPress+ Chipset] - c621 82C621 [Viper-M/N+] + c621 82C621A PCI IDE Contoller c700 82C700 [FireStar] - c701 82C701 [FireStar Plus] - c814 82C814 [Firebridge 1] + c701 82C700 [FireStar] Host Bridge + c814 82C814 [FireBridge II] Docking Stration Controller c822 82C822 - c824 82C824 - c825 82C825 [Firebridge 2] + c824 82C824 [FireFox] 32-Bit PC Card Controller + c825 82C825 [FireBridge II] Docking Stration Controller c832 82C832 - c861 82C861 OHCI USB Host + c861 82C861/2/3 [FireLink] PCI-USB Host Bridge c881 82C881 [FireLink] 1394 OHCI Link Controller c895 82C895 - c935 EV1935 ECTIVA MachOne PCIAudio - d568 82C825 [Firebridge 2] - d721 IDE [FireStar] + c935 82С935 [MachOne] Integrated PCI Audio Processor + d568 82C700 [FireStar] PCI IDE Controller + d721 82C700 [FireStar] PCI IDE Controller 1046 IPC Corporation, Ltd. 1047 Genoa Systems Corp 1048 Elsa AG @@ -7151,12 +7242,13 @@ 0001 W83769F 0033 W89C33D 802.11 a/b/g BB/MAC 0105 W82C105 + 0628 W83628F/W83629D PCI to ISA Bridge Set 0840 W89C840 1050 0001 W89C840 Ethernet Adapter 1050 0840 W89C840 Ethernet Adapter 0940 W89C940 - 5a5a W89C940F - 6692 W6692 + 5a5a W89C940 Twisted-pair Ether-LAN Controller With PCI Interface [ELANC-PCI] + 6692 W6692 PCI ISDN S/T-Controller 1043 1702 ISDN Adapter (PCI Bus, D, W) 1043 1703 ISDN Adapter (PCI Bus, DV, W) 1043 1707 ISDN Adapter (PCI Bus, DV, W) @@ -7165,6 +7257,7 @@ 144f 1707 ISDN Adapter (PCI Bus, DV, W) 9921 W99200F MPEG-1 Video Encoder 9922 W99200F/W9922PF MPEG-1/2 Video Encoder + 9960 W9960CF Video Codec 9970 W9970CF 1051 Anigma, Inc. 
1052 ?Young Micro Systems @@ -9163,13 +9256,13 @@ 5842 2051 ISA bridge 10ab Digicom 10ac Honeywell IAC -10ad Symphony Labs +10ad Winbond Electronics Corp / Symphony Labs 0001 W83769F 0003 SL82C103 0005 SL82C105 0103 SL82c103 - 0105 SL82c105 - 0565 W83C553F/W83C554F + 0105 SL82C105/W83C55xF Bus Master IDE + 0565 W83C553F/554F ISA bridge 10ae Cornerstone Technology 10af Micro Computer Systems Inc 10b0 CardExpert Technology @@ -13009,7 +13102,7 @@ 2182 TU116 [GeForce GTX 1660 Ti] 2183 TU116 2184 TU116 [GeForce GTX 1660] - 2187 TU116 [GeForce GTX 1660 SUPER] + 2187 TU116 [GeForce GTX 1650 SUPER] 2188 TU116 [GeForce GTX 1650] 2189 TU116 [CMP 30HX] 2191 TU116M [GeForce GTX 1660 Ti Mobile] @@ -13047,6 +13140,7 @@ 223f GA102GL 228b GA104 High Definition Audio Controller 228e GA106 High Definition Audio Controller + 2291 GA107 High Definition Audio Controller 2296 Tegra PCIe Endpoint Virtual Network 22a3 GH100 [H100 NVSwitch] 22ba AD102 High Definition Audio Controller @@ -13059,6 +13153,7 @@ 2322 GH100 [H800 PCIe] 2324 GH100 [H800] 2329 GH100 [H20] + 232c GH100 [H20 HBM3e] 2330 GH100 [H100 SXM5 80GB] 2331 GH100 [H100 PCIe] 2335 GH100 [H200 SXM 141GB] @@ -13072,6 +13167,7 @@ 2342 GH100 [GH200 120GB / 480GB] 2343 GH100 2345 GH100 [GH100-88K-A1] + 2348 GH100 [GH200 144G HBM3e] 237f GH100 [Skinny Joe] 23b0 GH100 23f0 GH100 @@ -13150,7 +13246,7 @@ 25a9 GA107M [GeForce RTX 2050] 25aa GA107M [GeForce MX570 A] 25ab GA107M [GeForce RTX 3050 4GB Laptop GPU] - 25ac GN20-P0-R-K2 [GeForce RTX 3050 6GB Laptop GPU] + 25ac GA107BM / GN20-P0-R-K2 [GeForce RTX 3050 6GB Laptop GPU] 25ad GA107 [GeForce RTX 2050] 25af GA107 [GeForce RTX 3050 Engineering Sample] 25b0 GA107GL [RTX A1000] @@ -13167,7 +13263,7 @@ 25e0 GA107BM [GeForce RTX 3050 Ti Mobile] 25e2 GA107BM [GeForce RTX 3050 Mobile] 25e5 GA107BM [GeForce RTX 3050 Mobile] - 25ec GN20-P0-R-K2 [GeForce RTX 3050 6GB Laptop GPU] + 25ec GA107BM / GN20-P0-R-K2 [GeForce RTX 3050 6GB Laptop GPU] 25ed GA107 [GeForce RTX 2050] 25f9 GA107 [RTX A1000 
Embedded GPU ] 25fa GA107 [RTX A2000 Embedded GPU] @@ -13190,9 +13286,9 @@ 2704 AD103 [GeForce RTX 4080] 2705 AD103 [GeForce RTX 4070 Ti SUPER] 2709 AD103 [GeForce RTX 4070] - 2717 GN21-X11 [GeForce RTX 4090 Laptop GPU] + 2717 AD103M / GN21-X11 [GeForce RTX 4090 Laptop GPU] 2730 AD103GLM [RTX 5000 Ada Generation Laptop GPU] - 2757 GN21-X11 [GeForce RTX 4090 Laptop GPU] + 2757 AD103M / GN21-X11 [GeForce RTX 4090 Laptop GPU] 2770 AD103GLM [RTX 5000 Ada Generation Embedded GPU] 2782 AD104 [GeForce RTX 4070 Ti] 2783 AD104 [GeForce RTX 4070 SUPER] @@ -13230,8 +13326,24 @@ 28e0 AD107M [GeForce RTX 4060 Max-Q / Mobile] 28e1 AD107M [GeForce RTX 4050 Max-Q / Mobile] 28f8 AD107GLM [RTX 2000 Ada Generation Embedded GPU] - 2900 GB100 - 2940 GB100 + 2900 GB100 [Reserved Dev ID A] + 2940 GB100 [Reserved Dev ID B] + 2941 GB100 [GB200 SKU] + 2980 GB102 + 29c0 GB102 + 2c18 GB203M / GN22 [GeForce RTX 5090 Max-Q / Mobile] + 2c19 GB203M / GN22 [GeForce RTX 5080 Max-Q / Mobile] + 2c2c GB6-256(N22W-ES-A1) + 2c58 GB203M / GN22-X11 [GeForce RTX 5090 Max-Q / Mobile] + 2c59 GB203M / GN22-X9 [GeForce RTX 5080 Max-Q / Mobile] + 2d18 AD108M [GeForce RTX 5070 Max-Q / Mobile] + 2d19 AD108M [GeForce RTX 5060 Max-Q / Mobile] + 2d58 AD108M [GeForce RTX 5070 Max-Q / Mobile] + 2d59 AD108M [GeForce RTX 5060 Max-Q / Mobile] + 2d98 AD108M [GeForce RTX 5050 Max-Q / Mobile] + 2dd8 AD108M [GeForce RTX 5050 Max-Q / Mobile] + 2f18 AD108M [GeForce RTX 5070 Ti Max-Q / Mobile] + 2f58 AD108M [GeForce RTX 5070 Ti Max-Q / Mobile] 10df Emulex Corporation 0720 OneConnect NIC (Skyhawk) 103c 1934 FlexFabric 20Gb 2-port 650M Adapter @@ -13333,6 +13445,7 @@ f500 LPe37000/LPe38000 Series 32Gb/64Gb Fibre Channel Adapter 1014 06c1 PCIe4 4-Port 32Gb Fibre Channel Adapter for POWER (FC EN1L/EN1M; CCIN 2CFC) 1014 06c2 PCIe4 2-Port 64Gb Fibre Channel Adapter for POWER (FC EN1N/EN1P; CCIN 2CFD) + f600 LPe37100S/LPe38100S Series 32Gb/64Gb Fibre Channel Adapter f700 LP7000 Fibre Channel Host Adapter f701 LP7000 Fibre Channel Host 
Adapter Alternate ID (JX1:2-3, JX2:1-2) f800 LP8000 Fibre Channel Host Adapter @@ -13671,6 +13784,7 @@ 8813 RTL8813AE 802.11ac PCIe Wireless Network Adapter 8821 RTL8821AE 802.11ac PCIe Wireless Network Adapter 8852 RTL8852AE 802.11ax PCIe Wireless Network Adapter + 8922 RTL8922AE 802.11be PCIe Wireless Network Adapter a85a RTL8852AE WiFi 6 802.11ax PCIe Adapter b520 RTL8852BE-VT PCIe 802.11ax Wireless Network Controller b723 RTL8723BE PCIe Wireless Network Adapter @@ -14046,6 +14160,7 @@ 0410 VX900 Series Host Bridge: Host Control 0415 VT6415 PATA IDE Host Controller 1043 838f Motherboard + 0419 VN1000 Host Bridge 0501 VT8501 [Apollo MVP4] 0505 VT82C505 # Shares chip with :0576. The VT82C576M has :1571 instead of :0561. @@ -14137,6 +14252,7 @@ 1364 CN896/VN896/P4M900 Host Bridge 1409 VX855/VX875 Error Reporting 1410 VX900 Series Error Reporting + 1419 VN1000 Host Bridge 1571 VT82C576M/VT82C586 1595 VT82C595/97 [Apollo VP2/97] 1732 VT1732 [Envy24 II] PCI Multi-Channel Audio Controller @@ -14162,11 +14278,12 @@ 2364 CN896/VN896/P4M900 Host Bridge 2409 VX855/VX875 Host Bus Control 2410 VX900 Series CPU Bus Controller + 2419 VN1000 Host Bridge 287a VT8251 PCI to PCI Bridge 287b VT8251 Host Bridge 287c VT8251 PCIE Root Port 287d VT8251 PCIE Root Port - 287e VT8237/8251 Ultra VLINK Controller + 287e VT8237/8251/8261 Ultra VLINK Controller 3022 CLE266 3038 VT82xx/62xx/VX700/8x0/900 UHCI USB 1.1 Controller 0925 1234 onboard UHCI USB 1.1 Controller @@ -14331,7 +14448,7 @@ 3116 VT8375 [KM266/KL266] Host Bridge 1297 f641 FX41 motherboard 3118 CN400/PM800/PM880/PN800/PN880 [S3 UniChrome Pro] - 3119 VT6120/VT6121/VT6122 Gigabit Ethernet Adapter + 3119 VT6120/VT6121/VT6122/VT6130 Gigabit Ethernet Adapter 3122 VT8623 [Apollo CLE266] integrated CastleRock graphics 3123 VT8623 [Apollo CLE266] 3128 VT8753 [P4X266 AGP] @@ -14432,12 +14549,14 @@ 3372 VT8237S PCI to ISA Bridge 337a VT8237A PCI to PCI Bridge 337b VT8237A Host Bridge + 3402 VT8261 PCI to ISA Bridge 3403 VT6315 Series 
Firewire Controller 1043 8374 M5A88-V EVO 1043 8384 P8P67 Deluxe Motherboard 3409 VX855/VX875 DRAM Bus Control 3410 VX900 Series DRAM Bus Control 19da a179 ZBOX nano VD01 + 3419 VN1000 Host Bridge 3432 VL800/801 xHCI USB 3.0 Controller 3456 VX11 Standard Host Bridge 345b VX11 Miscellaneous Bus @@ -14466,6 +14585,7 @@ 4409 VX855/VX875 Power Management Control 4410 VX900 Series Power Management and Chip Testing Control 19da a179 ZBOX nano VD01 + 4419 VN1000 Host Bridge 5030 VT82C596 ACPI [Apollo PRO] 5122 VX855/VX875 Chrome 9 HCM Integrated Graphics 5208 PT890 I/O APIC Interrupt Controller @@ -14483,6 +14603,7 @@ 5372 VT8237/8251 Serial ATA Controller 5409 VX855/VX875 APIC and Central Traffic Control 5410 VX900 Series APIC and Central Traffic Control + 5419 VN1000 I/O APIC Interrupt Controller 6100 VT85C100A [Rhine II] 6122 VN1000 Graphics [Chrome 520 IGP] 6287 SATA RAID Controller @@ -14493,6 +14614,7 @@ 6409 VX855/VX875 Scratch Registers 6410 VX900 Series Scratch Registers 19da a179 ZBOX nano VD01 + 6419 VN1000 Host Bridge 7122 VX900 Graphics [Chrome9 HD] 7204 K8M800 Host Bridge 7205 KM400/KN400/P4M800 [S3 UniChrome] @@ -14519,6 +14641,7 @@ 7409 VX855/VX875 North-South Module Interface Control 7410 VX900 Series North-South Module Interface Control 19da a179 ZBOX nano VD01 + 7419 VN1000 Host Bridge 8231 VT8231 [PCI-to-ISA Bridge] 8235 VT8235 ACPI 8305 VT8363/8365 [KT133/KM133 AGP] @@ -14543,12 +14666,14 @@ 8a26 KL133/KL133A/KM133/KM133A [S3 ProSavage] 8d01 PN133/PN133T [S3 Twister] 8d04 KM266/P4M266/P4M266A/P4N266 [S3 ProSavageDDR] + 9000 VT8261 IDE Controller [StorX IDE Controller - 9000] 9001 VX900 Series Serial-ATA Controller + 9040 VT8261 SATA Controller [StorX RAID Controller - 9040] 9082 Standard AHCI 1.0 SATA Controller 9140 HDMI Audio Device 9201 USB3.0 Controller 9380 Ncore Coprocessor for Centaur CNS - 9530 VX800/820/900 Series Secure Digital Memory Card Controller + 9530 VX800/820/900/VT8261 Series Secure Digital Memory Card Controller 95d0 VX800/820/900 
Series SDIO Host Controller a208 PT890 PCI to PCI Bridge Controller a238 K8T890 PCI to PCI Bridge Controller @@ -14557,6 +14682,7 @@ a364 CN896/VN896/P4M900 PCI to PCI Bridge Controller a409 VX855/VX875/VX900 Series USB Device Controller a410 VX900 Series PCI Express Root Port 0 + a419 VN1000 PCI to PCI Bridge b091 VT8633 [Apollo Pro266 AGP] b099 VT8366/A/7 [Apollo KT266/A/333 AGP] b101 VT8653 AGP Bridge @@ -14572,6 +14698,7 @@ b213 VPX/VPX2 I/O APIC Interrupt Controller b353 VX855/VX875/VX900 PCI to PCI Bridge b410 VX900 Series PCI Express Root Port 1 + b419 VN1000 Host Bridge b999 [K8T890 North / VT8237 South] PCI Bridge c208 PT890 PCI to PCI Bridge Controller c238 K8T890 PCI to PCI Bridge Controller @@ -14581,22 +14708,26 @@ c364 CN896/VN896/P4M900 PCI to PCI Bridge Controller c409 VX855/VX875 EIDE Controller c410 VX900 Series PCI Express Root Port 2 + c419 VN1000 PCI to PCI Bridge d104 VT8237R USB UDCI Controller d208 PT890 PCI to PCI Bridge Controller d213 VPX/VPX2 PCI to PCI Bridge Controller d238 K8T890 PCI to PCI Bridge Controller d340 PT900 PCI to PCI Bridge Controller d410 VX900 Series PCI Express Root Port 3 + d419 VN1000 PCI to PCI Bridge e208 PT890 PCI to PCI Bridge Controller e238 K8T890 PCI to PCI Bridge Controller e340 PT900 PCI to PCI Bridge Controller e353 VX800/820-Series PCI-Express Root Port 0 e410 VX900 Series PCI Express Physical Layer Electrical Sub-block + e419 VN1000 PCI to PCI Bridge f208 PT890 PCI to PCI Bridge Controller f238 K8T890 PCI to PCI Bridge Controller f340 PT900 PCI to PCI Bridge Controller f353 VX800/820-Series PCI-Express Root Port 1 f410 VX900 Series PCI UART Port 0-3 + f419 VN1000 PCI to PCI Bridge 1107 Stratus Computers 0576 VIA VT82C570MV [Apollo] (Wrong vendor ID!) 1108 Proteon, Inc. @@ -17016,9 +17147,7 @@ 123b Seeq Technology, Inc. 123c Century Systems, Inc. 123d Engineering Design Team, Inc. 
- 0000 EasyConnect 8/32 - 0002 EasyConnect 8/64 - 0003 EasyIO + 0000 PCI 11W 0047 PCIe4 CDa 004b PCIe4 CDa 16 009d VisionLink F1 @@ -17132,7 +17261,9 @@ 1028 0085 ES1968 Maestro-2 PCI 1033 8051 ES1968 Maestro-2 Audiodrive 1969 ES1938/ES1946/ES1969 Solo-1 Audiodrive + 1014 0162 16 Bit PCI Audio Adapter (37L4457) 1014 0166 ES1969 SOLO-1 AudioDrive on IBM Aptiva Mainboard + 121f 8800 eDio Hi-Live SC1938 125d 8888 Solo-1 Audio Adapter 125d 8898 ES1938S TTSOLO1-SL [TerraTec 128i PCI] 153b 111b Terratec 128i PCI @@ -18379,6 +18510,8 @@ 0207 GLN180PEX GPS/GLONASS receiver (PCI Express) 0208 GPS180AMC GPS Receiver (PCI Express / MicroTCA / AdvancedMC) 0209 GNS181PEX GPS/Galileo/GLONASS/BEIDOU receiver (PCI Express) + 020a GPS183PEX GPS Receiver (PCI Express) + 020b GNS183PEX GPS/Galileo/GLONASS/BEIDOU receiver (PCI Express) 0301 TCR510PCI IRIG Timecode Reader 0302 TCR167PCI IRIG Timecode Reader 0303 TCR511PCI IRIG Timecode Reader @@ -20018,6 +20151,7 @@ 1028 215a DC NVMe PM9A3 RI U.2 960GB 1028 215b DC NVMe PM9A3 RI U.2 1.92TB 1028 215c DC NVMe PM9A3 RI U.2 3.84TB + 1028 215d Dell DC NVMe PM9A3 RI U.2 7.68TB 1028 2166 DC NVMe PM9A3 RI 110M.2 960GB 1028 2167 DC NVMe PM9A3 RI 110M.2 1.92TB 1028 2168 DC NVMe PM9A3 RI 80M.2 480GB @@ -20154,6 +20288,45 @@ 1028 225e NVMe FIPS PM1745 MU U.2 12.8TB 1028 225f NVMe PM1745 MU U.2 12.8TB a900 NVMe SSD Controller PM9DXa + 1028 230f DC NVMe PM9D3a RI 80M.2 480GB ISE + 1028 2310 DC NVMe PM9D3a RI 80M.2 960GB ISE + 1028 2311 DC NVMe PM9D3a RI 80M.2 1.92TB ISE + 1028 2341 DC NVMe PM9D3a RI U.2 960GB  + 1028 2342 DC NVMe PM9D3a RI U.2 1.92TB + 1028 2343 DC NVMe PM9D3a RI U.2 3.84TB + 1028 2344 DC NVMe PM9D3a RI U.2 7.68GTB + 1028 2345 DC NVMe PM9D3a RI U.2 15.36TB + 1028 2346 DC NVMe FIPS PM9D3a RI U.2 960GB + 1028 2347 DC NVMe FIPS PM9D3a RI U.2 1.92TB + 1028 2348 DC NVMe FIPS PM9D3a RI U.2 3.84TB + 1028 2349 DC NVMe FIPS PM9D3a RI U.2 7.68TB + 1028 234a DC NVMe FIPS PM9D3a RI U.2 15.36TB  + 1028 234d DC NVMe PM9D3a RI E3s 1.92TB + 1028 
234e DC NVMe PM9D3a RI E3s 3.84TB  + 1028 234f DC NVMe PM9D3a RI E3s 7.68GTB + 1028 2350 DC NVMe PM9D3a RI E3s 15.36TB + 1028 2351 DC NVMe FIPS PM9D3a RI E3s 1.92TB + 1028 2352 DC NVMe FIPS PM9D3a RI E3s 3.84TB + 1028 2353 DC NVMe FIPS PM9D3a RI E3s 7.68TB + 1028 2354 DC NVMe FIPS PM9D3a RI E3s 15.36TB + 1028 2355 DC NVMe PM9D5a MU U.2 800GB + 1028 2356 DC NVMe PM9D5a MU U.2 1.6TB + 1028 2357 DC NVMe PM9D5a MU U.2 3.2TB + 1028 2358 DC NVMe PM9D5a MU U.2 6.4TB + 1028 2359 DC NVMe PM9D5a MU E3.s 1.6TB + 1028 235a DC NVMe PM9D5a MU E3.s 3.2TB + 1028 235b DC NVMe PM9D5a MU E3.s 6.4TB + aa00 NVMe SSD Controller BM1743 + 1028 2312 NVMe FIPS BM1743 QLC U.2 15.36TB + 1028 2313 NVMe FIPS BM1743 QLC U.2 30.72TB + 1028 2314 NVMe FIPS BM1743 QLC U.2 61.44TB + 1028 2315 NVMe BM1743 QLC U.2 15.36TB + 1028 2316 NVMe BM1743 QLC U.2 30.72TB + 1028 2317 NVMe BM1743 QLC U.2 61.44TB + 1028 2364 NVMe BM1743 QLC U.2 122.88TB + 1028 2366 MZ3MO15THCLCAD3 + 1028 2367 MZ3MO30THCLFAD3 + ac00 NVMe SSD Controller PM175x ecec Exynos 8895 PCIe Root Complex 144e OLITEC 144f Askey Computer Corp. @@ -20353,6 +20526,7 @@ 14c3 MEDIATEK Corp. 0608 MT7921K (RZ608) Wi-Fi 6E 80MHz 0616 MT7922 802.11ax PCI Express Wireless Network Adapter + 4d75 T700 5G Modem [5G Solution 5000] 7603 MT7603E 802.11bgn PCI Express Wireless Network Adapter 7612 MT7612E 802.11acbgn PCI Express Wireless Network Adapter 7615 MT7615E 802.11ac PCI Express Wireless Network Adapter @@ -20360,12 +20534,16 @@ 7650 MT7650 802.11ac # MT7612E too? 
7662 MT7662E 802.11ac PCI Express Wireless Network Adapter + 7663 MT7663 802.11ac PCI Express Wireless Network Adapter 7915 MT7915E 802.11ax PCI Express Wireless Network Adapter 7916 MT7905D/MT7975 # WiFi 6E capable 7922 MT7922 802.11ax PCI Express Wireless Network Adapter 1a3b 5300 ASUS PCE-AXE59BT 7961 MT7921 802.11ax PCI Express Wireless Network Adapter + 7988 MT7988 PCIe Host Bridge [Filogic 880] + 7990 MT7996 802.11be PCI Express Wireless Network Adapter (Port 0) + 7991 MT7996 802.11be PCI Express Wireless Network Adapter (Port 1) 8650 MT7650 Bluetooth 14c4 IWASAKI Information Systems Co Ltd 14c5 Automation Products AB @@ -20992,7 +21170,20 @@ 14e4 d142 NetXtreme-E P425D BCM57504 4x25G SFP28 PCIE 1590 0420 HPE Ethernet 25/50Gb 2-port 6310C Adapter 1752 BCM57502 NetXtreme-E 10Gb/25Gb/40Gb/50Gb Ethernet - 1760 BCM57608 10Gb/25Gb/50Gb/100Gb/200Gb/400Gb Ethernet + 1760 BCM57608 25Gb/50Gb/100Gb/200Gb/400Gb Ethernet + 14e4 9110 BCM57608 1x400G PCIe Ethernet NIC + 14e4 9120 BCM57608 2x200G PCIe Ethernet NIC + 14e4 9121 BCM57608 2x100G PCIe Ethernet NIC + 14e4 9125 BCM57608 2x200G PCIe Ethernet NIC + 14e4 9126 BCM57608 2x100G PCIe Ethernet NIC + 14e4 9140 BCM57608 1x400G QSFP-DD PCIe Ethernet NIC + 14e4 9310 BCM57608 1x400G QSFP-DD OCP Ethernet NIC + 14e4 9311 BCM57608 1x400G OCP Ethernet NIC + 14e4 9312 BCM57608 1x200G OCP Ethernet NIC + 14e4 9320 BCM57608 2x200G OCP Ethernet NIC + 14e4 9325 BCM57608 2x200G OCP Ethernet NIC + 14e4 9326 BCM57608 2x100G OCP Ethernet NIC + 14e4 9340 BCM57608 4x100G OCP Ethernet NIC 14e4 d125 BCM57608 2x200G PCIe Ethernet NIC 1800 BCM57502 NetXtreme-E Ethernet Partition 1801 BCM57504 NetXtreme-E Ethernet Partition @@ -21010,6 +21201,7 @@ 14e4 df24 BCM57508 NetXtreme-E NGM2100D 2x100G KR Mezz Ethernet Virtual Function 1809 BCM5750X NetXtreme-E RDMA Virtual Function 14e4 df24 BCM57508 NetXtreme-E NGM2100D 2x100G KR Mezz RDMA Virtual Function + 1819 BCM5760X Ethernet Virtual Function 2711 BCM2711 PCIe Bridge 2712 BCM2712 PCIe Bridge 3352 
BCM3352 @@ -21983,6 +22175,7 @@ 6893 3U OpenVPX Multi-function I/O Board [Model 68C3] 15ad VMware 0405 SVGA II Adapter + 0406 SVGA II Adapter (Fusion) 0710 SVGA Adapter 0720 VMXNET Ethernet Controller 0740 Virtual Machine Communication Interface @@ -22061,7 +22254,7 @@ 0271 Spectrum-5 RMA 0274 Spectrum-6 in Flash Recovery Mode 0275 Spectrum-6 RMA - 0277 Spectrum-4TOR RMA + 0277 Spectrum-6 Tile 0278 Quantum-4 in Flash Recovery Mode 0279 Quantum-4 RMA 027a Eros Chiplet @@ -22075,6 +22268,7 @@ # Flash recovery 0288 Arcus2 0289 Arcus2 RMA + 0290 SagittaZ 1002 MT25400 Family [ConnectX-2 Virtual Function] 1003 MT27500 Family [ConnectX-3] 1014 04b5 PCIe3 40GbE RoCE Converged Host Bus Adapter for Power @@ -22151,6 +22345,7 @@ 193d 1083 NIC-ETH640F-3S-2P # NIC-ETH540F-3S-2P OCP3.0 2x10G Card 193d 1084 NIC-ETH540F-3S-2P + 1e81 0c10 25GbE dual-port SFP28, PCIe3.0 x8 [3SC10] 1016 MT27710 Family [ConnectX-4 Lx Virtual Function] 1017 MT27800 Family [ConnectX-5] 15b3 0006 ConnectX-5 EN network interface card, 100GbE single-port QSFP28, PCIe3.0 x16, tall bracket; MCX515A-CCAT @@ -22169,9 +22364,14 @@ 101b MT28908 Family [ConnectX-6] 101c MT28908 Family [ConnectX-6 Virtual Function] 101d MT2892 Family [ConnectX-6 Dx] + 193d 1055 NIC-ETH1040F-LP-2P QSFP56 2x100GbE PCIe Network Adapter 101e ConnectX Family mlx5Gen Virtual Function 101f MT2894 Family [ConnectX-6 Lx] 193d 1035 NIC-ETH641F-LP-2P SFP28 2x25GbE PCIe Network Adapter + 1bd4 00ac O252MCX6Lx + 1bd4 00ae S252MCX6Lx + 1ff9 00ad ENFM6251-SP2 + 1ff9 00af ENPM6251-SP2 1020 MT28860 1021 MT2910 Family [ConnectX-7] 1023 CX8 Family [ConnectX-8] @@ -22316,7 +22516,7 @@ 5006 SanDisk Extreme Pro / WD Black SN750 / PC SN730 / Red SN700 NVMe SSD 5007 IX SN530 NVMe SSD (DRAM-less) 5008 PC SN530 NVMe SSD (DRAM-less) - 5009 SanDisk Ultra 3D / WD Blue SN550 NVMe SSD + 5009 SanDisk Ultra 3D / WD PC SN530, IX SN530, Blue SN550 NVMe SSD (DRAM-less) 15b7 5009 WD Blue SN550 NVMe SSD 500b PC SN530 NVMe SSD 1414 500b Xbox Series X @@ -22339,6 
+22539,7 @@ 5036 WD PC SN5000S M.2 2280 NVMe SSD (DRAM-less) 5041 WD Blue SN580 NVMe SSD (DRAM-less) 5042 WD Black SN770M NVMe SSD (DRAM-less) + 5046 SanDisk Extreme NVMe SSD (DRAM-less) 15b8 ADDI-DATA GmbH 1001 APCI1516 SP controller (16 digi outputs) 1003 APCI1032 SP controller (32 digi inputs w/ opto coupler) @@ -23436,6 +23637,7 @@ 8084 GL880 USB 2.0 EHCI controller 9750 GL9750 SD Host Controller 9755 GL9755 SD Host Controller + 9767 GL9767 SD Host Controller e763 GL9763E eMMC Controller 17aa Lenovo 0003 LENSE20256GMSP34MEAT2TA @@ -23930,6 +24132,8 @@ # Sitecom HFC-S based ISDN controller card DC-105v2 3069 DC-105v2 ISDN controller 18d4 Celestica +# OCP-TAP + 1007 Time Card 18d8 Dialogue Technology Corp. 18dd Artimi Inc 4c6f Artimi RTMI-100 UWB adapter @@ -24676,6 +24880,7 @@ 1a03 ASPEED Technology, Inc. 1150 AST1150 PCI-to-PCI Bridge 2000 ASPEED Graphics Family + 15d9 0821 X10DRW-i 15d9 0832 X10SRL-F 15d9 1b95 H12SSL-i 1a05 deltaww @@ -24826,6 +25031,7 @@ 1ad7 Spectracom Corporation 8000 TSync-PCIe Time Code Processor 9100 TPRO-PCI-66U Timecode Reader/Generator + a000 OCP-TAP [ARTCard] 1ade Spin Master Ltd. 1501 Swipetech barcode scanner 3038 PCIe Video Bridge @@ -25457,6 +25663,7 @@ 0022 FD788 0023 FD722-M2 0024 FD722 with bypass + 0025 FD922 1c28 Lite-On IT Corp. 
/ Plextor 0122 M6e PCI Express SSD [Marvell 88SS9183] # previously Fiberblaze @@ -25656,6 +25863,12 @@ 1c5f 5437 NVMe SSD PBlaze6 6647 3200G 2.5" U.2(dual port) 1c5f 5441 NVMe SSD PBlaze6 6547 6400G 2.5" U.2 1c5f 5447 NVMe SSD PBlaze6 6647 6400G 2.5" U.2(dual port) + 0027 PBlaze7 7A40/7A46 NVMe SSD + 1c5f 1421 NVMe SSD PBlaze7 7A40 1920G 2.5" U.2 + 1c5f 1431 NVMe SSD PBlaze7 7A40 3840G 2.5" U.2 + 1c5f 1441 NVMe SSD PBlaze7 7A40 7680G 2.5" U.2 + 1c5f 5431 NVMe SSD PBlaze7 7A46 3200G 2.5" U.2 + 1c5f 5441 NVMe SSD PBlaze7 7A46 6400G 2.5" U.2 003d PBlaze5 920/926 1c5f 0a30 NVMe SSD PBlaze5 920 3840G AIC 1c5f 0a31 NVMe SSD PBlaze5 920 3840G 2.5" U.2 @@ -25679,9 +25892,12 @@ 1c5f 4b61 NVMe SSD PBlaze6 6936 25600GB 2.5" U.3 003f PBlaze7 7940/7946 NVMe SSD 1c5f 0431 NVMe SSD PBlaze7 7940 3840G 2.5" U.2 + 1c5f 0441 NVMe SSD PBlaze7 7940 7680G 2.5" U.2 + 1c5f 0451 NVMe SSD PBlaze7 7940 15360G 2.5" U.2 1c5f 0c31 NVMe SSD PBlaze7 7940 3840G 2.5" U.2 1c5f 0c41 NVMe SSD PBlaze7 7940 7680G 2.5" U.2 1c5f 0c51 NVMe SSD PBlaze7 7940 15360G 2.5" U.2 + 1c5f 0c61 NVMe SSD PBlaze7 7940 30720G 2.5" U.2 1c5f 1430 NVMe SSD PBlaze7 7940 3840G AIC 1c5f 1431 NVMe SSD PBlaze7 7940 3840G 2.5" U.2 1c5f 1435 NVMe SSD PBlaze7 7940 3840G E1.S @@ -25690,6 +25906,9 @@ 1c5f 1445 NVMe SSD PBlaze7 7940 7680G E1.S 1c5f 1450 NVMe SSD PBlaze7 7940 15360G AIC 1c5f 1451 NVMe SSD PBlaze7 7940 15360G 2.5" U.2 + 1c5f 4431 NVMe SSD PBlaze7 7946 3200G 2.5" U.2 + 1c5f 4441 NVMe SSD PBlaze7 7946 6400G 2.5" U.2 + 1c5f 4451 NVMe SSD PBlaze7 7946 12800G 2.5" U.2 1c5f 4c31 NVMe SSD PBlaze7 7946 3200G 2.5" U.2 1c5f 4c41 NVMe SSD PBlaze7 7946 6400G 2.5" U.2 1c5f 4c51 NVMe SSD PBlaze7 7946 12800G 2.5" U.2 @@ -25775,6 +25994,7 @@ 627a LEGEND 800 NVMe SSD (DRAM-less) # 500GB 628a LEGEND 800 NVMe SSD (DRAM-less) + 642a XPG GAMMIX S50 CORE NVMe SSD (DRAM-less) 8201 XPG SX8200 Pro PCIe Gen3x4 M.2 2280 Solid State Drive 1cc4 Shenzhen Unionmemory Information System Ltd. 
1203 NVMe SSD Controller UHXXXa series @@ -25808,6 +26028,8 @@ 6a03 RPETJ512MKP1QDQ PCIe 4.0 NVMe SSD 512GB (DRAM-less) 6a13 RPJYJ512MKN1QWQ PCIe 4.0 NVMe SSD 512GB (DRAM-less) 6a14 RPEYJ1T24MKN2QWY PCIe 4.0 NVMe SSD 1024GB (DRAM-less) + 6b13 RPJYJ512MLR1QWY PCIe 4.0 NVMe SSD 512GB (DRAM-less) + 6b14 RPJYJ1T24MLR1HWY PCIe 4.0 NVMe SSD 1024GB (DRAM-less) 8030 NVMe SSD Controller UH8X2X/UH7X2X series 1cc4 1122 NVMe SSD UH812a U.2 1.92TB 1cc4 1123 NVMe SSD UH812a U.2 3.84TB @@ -25894,6 +26116,7 @@ efa0 Elastic Fabric Adapter (EFA) efa1 Elastic Fabric Adapter (EFA) efa2 Elastic Fabric Adapter (EFA) + efa3 Elastic Fabric Adapter (EFA) 1d17 Zhaoxin 070f ZX-100 PCI Express Root Port 0710 ZX-100/ZX-200 PCI Express Root Port @@ -26106,6 +26329,8 @@ 1028 AR-P2P-ATR [P2P Actor Function] 1029 AR-P2P-UTL [P2P Utility Function] 102a AR-TK242-FX2 [4x100GbE Gen5 Packet Capture-Replay Device] + 102b AR-ARKV-FX1 [Arkville 128B DPDK Data Mover for Versal/CPM5] + 102c AR-TK242-V80 [Gen5 PCAP Processor] 4200 A5PL-E1-10GETI [10 GbE Ethernet Traffic Instrument] 1d72 Xiaomi 1d78 DERA Storage @@ -26255,6 +26480,7 @@ 0010 Networking DOM Engine 0011 IO Bridge 0013 Host Network Interface + 0400 Time Card 1da1 Teko Telecom S.r.l. 1da2 Sapphire Technology Limited 475d Radeon RX 7800 XT [PULSE] @@ -26270,6 +26496,7 @@ 1010 HL-2000 AI Training Accelerator [Gaudi secured] # PCIe accelerator card for Deep Learning training tasks 1020 Gaudi2 AI Training Accelerator + 1060 Gaudi3 AI Training Accelerator 1da8 Corigine, Inc. 
3800 Network Flow Processor 3800 3803 Network Flow Processor 3800 Virtual Function @@ -26333,6 +26560,7 @@ 1dbe 5007 Dongting-N3 DC SSD U.2 12800GB 1dbe 5008 Dongting-N3 DC SSD U.2 15360GB 1dbe 5009 Dongting-N3 DC SSD U.2 25600GB + 1dbe 5010 Dongting-N3 DC SSD U.2 30720GB 5669 NVMe SSD Controller IG5669 [Tacoma] 1dbf Guizhou Huaxintong Semiconductor Technology Co., Ltd 0401 StarDragon4800 PCI Express Root Port @@ -26793,6 +27021,7 @@ 1028 223d Ent NVMe CM7 U.2 MU 3.2TB 1028 223e Ent NVMe CM7 U.2 MU 1.6TB 002a Exceria Plus G3 NVMe SSD (DRAM-less) + 002b NVMe SSD Controller CD8P 002c NVMe SSD Controller CD8P EDSFF 1028 22bf DC NVMe CD8P E3.S 15.36TB 1028 22c0 DC NVMe CD8P E3.S 7.68TB @@ -26848,8 +27077,14 @@ 1e3b DapuStor Corporation 0600 NVMe SSD Controller DP600 1e3b 0006 Enterprise NVMe SSD U.2 7.68TB (J5000) + 1e3b 000c Enterprise NVMe SSD U.2 30.72TB (J5060) + 1e3b 000d Enterprise NVMe SSD U.2 61.44TB (J5060) + 1e3b 000e Enterprise NVMe SSD U.2 30.72TB (J5060D) + 1e3b 000f Enterprise NVMe SSD U.2 61.44TB (J5060D) 1e3b 0010 Enterprise NVMe SSD U.2 3.84TB (R5102) 1e3b 0013 Enterprise NVMe SSD U.2 3.20TB (R5302) + 1e3b 0027 Enterprise NVMe SSD U.2 61.44TB (J5060) + 1e3b 0028 Enterprise NVMe SSD U.2 61.44TB (J5060D) 1e3b 0030 Enterprise NVMe SSD U.2 3.84TB (J5100) 1e3b 0031 Enterprise NVMe SSD U.2 7.68TB (J5100) 1e3b 0032 Enterprise NVMe SSD U.2 15.36TB (J5100) @@ -26905,7 +27140,7 @@ 1e3b 00ea Enterprise NVMe SSD U.2 3.20TB (J5301D) 1e3b 00eb Enterprise NVMe SSD U.2 6.40TB (J5301D) 1e3b 00ec Enterprise NVMe SSD U.2 30.72TB (J5101) - 1e3b 00ed NVMe SSD U.2 30.72TB (R5101) + 1e3b 00ed Enterprise NVMe SSD U.2 30.72TB (R5101) 1e3b 00ee Enterprise NVMe SSD U.2 15.36B (J5101) 1e3b 00ef Enterprise NVMe SSD U.2 12.80TB (J5301) 1e3b 00f0 Enterprise NVMe SSD U.2 0.40TB (X2900) @@ -26914,7 +27149,7 @@ 1e3b 00f3 Enterprise NVMe SSD U.2 3.20TB (X2900) 1e3b 00f5 Enterprise NVMe SSD U.2 0.40TB (X2900P) 1e3b 00f6 Enterprise NVMe SSD U.2 0.80TB (X2900P) - 0800 DP800 + 0800 NVMe SSD 
Controller DP800 1e3b 0001 Enterprise NVMe SSD U.2 3.84TB(R6100) 1e3b 0007 Enterprise NVMe SSD U.2 15.36TB (R6100) 1e3b 000a Enterprise NVMe SSD U.2 3.20TB (R6300) @@ -26962,6 +27197,12 @@ 1e3b 0082 Enterprise NVMe SSD U.2 7.68TB (H5100) 1e3b 0084 Enterprise NVMe SSD U.2 3.2TB (H5300) 1e3b 0085 Enterprise NVMe SSD U.2 6.4TB (H5300) + 3001 Ethernet Controller DN200 for 10GbE SFP+ + 1e3b 3001 Ethernet Network Adapter DN200-X1V for 10GbE SFP+ 2-port + 3002 Ethernet Controller DN200 Series Virtual Function + 300c Ethernet RAID Combo Controller DN200C for 1GbE + 1e3b 300c Ethernet RAID Combo Adapter DN200C-G2V for 1GbE 4-port + 300d Ethernet RAID Combo Controller DN200C Series Virtual Function 1e3d Burlywood, Inc 1e43 MaxLinear Inc 8904 MxL8904 @@ -27078,6 +27319,7 @@ 1eac Quectel Wireless Solutions Co., Ltd. 1001 EM120R-GL LTE Modem 1002 EM160R-GL LTE Modem + 2001 EM120R-GL 1eae XFX Limited 1eb0 Shenzhen Electrical Appliances CO. 1901 NVMe SSD Controller (DRAM-less) @@ -27177,7 +27419,7 @@ 1ee1 000b Airglow A430 NVMe SSD U.2 4.8TB 1ee1 0012 Airglow Z400 NVMe ZNS SSD U.2 5.76TB 1ee4 PETAIO INC - 1180 P8118 NVMe SSD Series + 1180 PETA8118 NVMe SSD Series 1ee4 0015 NVMe SSD U.2 1.92TB (P8118E) 1ee4 0016 NVMe SSD U.2 3.84TB (P8118E) 1ee4 0017 NVMe SSD U.2 7.68TB (P8118E) @@ -27196,6 +27438,72 @@ 1ee4 0225 NVMe SSD U.2 1.6TB (P8118X) 1ee4 0226 NVMe SSD U.2 3.2TB (P8118X) 1ee4 0227 NVMe SSD U.2 6.4TB (P8118X) + 1ee4 1013 NVMe SSD M.2 480GB (P8118E) + 1ee4 1014 NVMe SSD M.2 960GB (P8118E) + 1ee4 1015 NVMe SSD M.2 1.92TB (P8118E) + 1ee4 1016 NVMe SSD M.2 3.84TB (P8118E) + 1ee4 1023 NVMe SSD M.2 400GB (P8118E) + 1ee4 1024 NVMe SSD M.2 800GB (P8118E) + 1ee4 1025 NVMe SSD M.2 1.6TB (P8118E) + 1ee4 1026 NVMe SSD M.2 3.2TB (P8118E) + 1ee4 1113 NVMe SSD M.2 480GB (P8118Z) + 1ee4 1114 NVMe SSD M.2 960GB (P8118Z) + 1ee4 1115 NVMe SSD M.2 1.92TB (P8118Z) + 1ee4 1116 NVMe SSD M.2 3.84TB (P8118Z) + 1ee4 1123 NVMe SSD M.2 400GB (P8118Z) + 1ee4 1124 NVMe SSD M.2 800GB (P8118Z) + 1ee4 1125 
NVMe SSD M.2 1.6TB (P8118Z) + 1ee4 1126 NVMe SSD M.2 3.2TB (P8118Z) + 1ee4 1213 NVMe SSD M.2 480GB (P8118X) + 1ee4 1214 NVMe SSD M.2 960GB (P8118X) + 1ee4 1215 NVMe SSD M.2 1.92TB (P8118X) + 1ee4 1216 NVMe SSD M.2 3.84TB (P8118X) + 1ee4 1223 NVMe SSD M.2 400GB (P8118X) + 1ee4 1224 NVMe SSD M.2 800GB (P8118X) + 1ee4 1225 NVMe SSD M.2 1.6TB (P8118X) + 1ee4 1226 NVMe SSD M.2 3.2TB (P8118X) + 1ee4 2015 NVMe SSD E1.S 1.92TB (P8118E) + 1ee4 2016 NVMe SSD E1.S 3.84TB (P8118E) + 1ee4 2017 NVMe SSD E1.S 7.68TB (P8118E) + 1ee4 2025 NVMe SSD E1.S 1.6TB (P8118E) + 1ee4 2026 NVMe SSD E1.S 3.2TB (P8118E) + 1ee4 2027 NVMe SSD E1.S 6.4TB (P8118E) + 1ee4 2115 NVMe SSD E1.S 1.92TB (P8118Z) + 1ee4 2116 NVMe SSD E1.S 3.84TB (P8118Z) + 1ee4 2117 NVMe SSD E1.S 7.68TB (P8118Z) + 1ee4 2125 NVMe SSD E1.S 1.6TB (P8118Z) + 1ee4 2126 NVMe SSD E1.S 3.2TB (P8118Z) + 1ee4 2127 NVMe SSD E1.S 6.4TB (P8118Z) + 1ee4 2215 NVMe SSD E1.S 1.92TB (P8118X) + 1ee4 2216 NVMe SSD E1.S 3.84TB (P8118X) + 1ee4 2217 NVMe SSD E1.S 7.68TB (P8118X) + 1ee4 2225 NVMe SSD E1.S 1.6TB (P8118X) + 1ee4 2226 NVMe SSD E1.S 3.2TB (P8118X) + 1ee4 2227 NVMe SSD E1.S 6.4TB (P8118X) + 1ee4 3013 NVMe SSD AIC 480GB (P8118E) + 1ee4 3014 NVMe SSD AIC 960GB (P8118E) + 1ee4 3015 NVMe SSD AIC 1.92TB (P8118E) + 1ee4 3016 NVMe SSD AIC 3.84TB (P8118E) + 1ee4 3017 NVMe SSD AIC 7.68TB (P8118E) + 1ee4 3025 NVMe SSD AIC 1.6TB (P8118E) + 1ee4 3026 NVMe SSD AIC 3.2TB (P8118E) + 1ee4 3027 NVMe SSD AIC 6.4TB (P8118E) + 1ee4 3113 NVMe SSD AIC 480GB (P8118Z) + 1ee4 3114 NVMe SSD AIC 960GB (P8118Z) + 1ee4 3115 NVMe SSD AIC 1.92TB (P8118Z) + 1ee4 3116 NVMe SSD AIC 3.84TB (P8118Z) + 1ee4 3117 NVMe SSD AIC 7.68TB (P8118Z) + 1ee4 3125 NVMe SSD AIC 1.6TB (P8118Z) + 1ee4 3126 NVMe SSD AIC 3.2TB (P8118Z) + 1ee4 3127 NVMe SSD AIC 6.4TB (P8118Z) + 1ee4 3213 NVMe SSD AIC 480GB (P8118X) + 1ee4 3214 NVMe SSD AIC 960GB (P8118X) + 1ee4 3215 NVMe SSD AIC 1.92TB (P8118X) + 1ee4 3216 NVMe SSD AIC 3.84TB (P8118X) + 1ee4 3217 NVMe SSD AIC 7.68TB (P8118X) + 1ee4 3225 
NVMe SSD AIC 1.6TB (P8118X) + 1ee4 3226 NVMe SSD AIC 3.2TB (P8118X) + 1ee4 3227 NVMe SSD AIC 6.4TB (P8118X) 1ee4 abcd NVMe SSD U.2 1ee9 SUSE LLC 1eec Viscore Technologies Ltd @@ -27357,8 +27665,19 @@ 5236 PCIe 4 INNOGRIT based NVMe SSD 5765 PCIe 3 NVMe SSD (DRAM-less) 1f44 VVDN Technologies Private Limited -# YUSUR Technology Co., Ltd. -1f47 YUSUR Tech +1f47 YUSUR Technology Co., Ltd. + 1001 FLEXFLOW-2200T Ethernet Controller + 1f47 0001 FLEXFLOW-2200T Ethernet 10G 2P + 1f47 0002 FLEXFLOW-2200T Ethernet 25G 2P- + 1f47 0003 FLEXFLOW-2200T Ethernet 40G 2P + 1f47 0004 FLEXFLOW-2200T Ethernet 100G 1P + 1f47 0005 FLEXFLOW-2200T Ethernet 100G 2P + 1f47 0006 FLEXFLOW-2200T Ethernet 10G 2P + 1f47 0007 FLEXFLOW-2200T Ethernet 25G 2P + 1f47 0008 FLEXFLOW-2200T Ethernet 40G 2P + 1f47 0009 FLEXFLOW-2200T Ethernet 100G 1P + 1f47 000a FLEXFLOW-2200T Ethernet 100G 2P + 1003 FLEXFLOW-2200T Ethernet Controller MGMT Function # Network Accelerating Card 2018 DPU Card # Network Accelerating Card @@ -27490,6 +27809,9 @@ 1fd4 SUNIX Co., Ltd. 0001 Matrix multiport serial adapter 1999 Multiport serial controller +1fdd Wuqi Microelectronics Co., Ltd. + 0001 WQ9201 802.11ax PCIe Wireless Network Adapter + 1001 WQ9301 802.11ax PCIe Wireless Access Points 1fde Kratos Defense & Security Solutions, Inc. 1125 OpenEdge 1125P 2500 OpenEdge 2500P @@ -27497,7 +27819,8 @@ 1010 AWM 1 2000 AWM 2 2010 AWM 2-M -1fe1 Beijing Eswin Computing Technology Co., Ltd. +1fe1 Beijing ESWIN Computing Technology Co., Ltd. + 2030 EIC7700 Root Complex 1fe4 HippStor Technology 1600 HP600 Series NVMe SSD 1fe4 0075 Enterprise NVMe SSD U.2 3.84TB(HP610) @@ -27505,6 +27828,7 @@ 1fe4 0077 Enterprise NVMe SSD U.2 6.40TB(HP630) 1fe4 0078 Enterprise NVMe SSD U.2 3.20TB(HP630) 1fe9 MemryX + 0100 MX3 # LinkData Technology (Tianjin) Co., LTD 1ff2 Linkdata 10a1 NIC1160 Ethernet Controller Family @@ -27573,32 +27897,32 @@ 006d HS610 2646 Kingston Technology Company, Inc. 
0010 HyperX Predator PCIe AHCI SSD - 2262 KC2000/KC2500 NVMe SSD SM2262EN - 2263 A2000 NVMe SSD SM2263EN + 2262 KC2000/KC2500 NVMe SSD [SM2262EN] + 2263 A2000 NVMe SSD [SM2263EN] 5008 A1000/U-SNS8154P3 x2 NVMe SSD - 500a DC1000B NVMe SSD E12DC - 500b DC1000M NVMe SSD SM2270 + 500a DC1000B NVMe SSD [E12DC] + 500b DC1000M NVMe SSD [SM2270] 500c OM8PCP Design-In PCIe 3 NVMe SSD (DRAM-less) 500d OM3PDP3 NVMe SSD - 500e NV1 NVMe SSD E13T (DRAM-less) - 500f NV1 NVMe SSD SM2263XT (DRAM-less) + 500e NV1 NVMe SSD [E13T] (DRAM-less) + 500f NV1 NVMe SSD [SM2263XT] (DRAM-less) 5010 OM8SBP NVMe PCIe SSD (DRAM-less) - 5012 DC1500M NVMe SSD SM2270 - 5013 KC3000/FURY Renegade NVMe SSD E18 + 5012 DC1500M NVMe SSD [SM2270] + 5013 KC3000/FURY Renegade NVMe SSD [E18] 5014 OM8SEP4 Design-In PCIe 4 NVMe SSD (TLC) (DRAM-less) 5016 OM3PGP4 NVMe SSD - 5017 NV2 NVMe SSD SM2267XT (DRAM-less) - 5019 NV2 NVMe SSD E21T (DRAM-less) + 5017 NV2 NVMe SSD [SM2267XT] (DRAM-less) + 5019 NV2 NVMe SSD [E21T] (DRAM-less) # 128GB 501a OM8PGP4 Design-In PCIe 4 NVMe SSD (TLC) (DRAM-less) 501b OM8PGP4 NVMe PCIe SSD (DRAM-less) - 501c NV2 NVMe SSD E19T (DRAM-less) - 501d NV2 NVMe SSD TC2200 (DRAM-less) - 501f FURY Renegade NVMe SSD with heatsink + 501c NV2 NVMe SSD [E19T] (DRAM-less) + 501d NV2 NVMe SSD [TC2200] (DRAM-less) + 501f FURY Renegade NVMe SSD + Heatsink [E18] 5021 OM8SEP4 Design-In PCIe 4 NVMe SSD (QLC) (DRAM-less) 5022 OM8PGP4 Design-In PCIe 4 NVMe SSD (QLC) (DRAM-less) - 5023 NV2 NVMe SSD SM2269XT (DRAM-less) - 5024 DC2000B NVMe SSD E18DC + 5023 NV2 NVMe SSD [SM2269XT] (DRAM-less) + 5024 DC2000B NVMe SSD [E18DC] 5025 NV3 NVMe SSD TC2201 (DRAM-less) 5026 NV3 NVMe SSD E21T (DRAM-less) 5027 NV3 NVMe SSD E27T (DRAM-less) @@ -28344,6 +28668,7 @@ # Wrong ID used in subsystem ID of AsusTek PCI-USB2 PCI card. 807d Asustek Computer, Inc. 8080 Chengdu Storeswift Technology Co., Ltd. 
+ 4016 CX4016A NVMe SSD Controller 8086 Intel Corporation 0007 82379AB 0008 Extended Express System Support Controller @@ -28644,6 +28969,7 @@ 0685 Z490 Chipset LPC/eSPI Controller 0687 Q470 Chipset LPC/eSPI Controller 068d Comet Lake LPC Controller + 068e WM490 Chipset LPC/eSPI Controller 06a3 Comet Lake PCH SMBus Controller 06a4 Comet Lake PCH SPI Controller 06a8 Comet Lake PCH Serial IO UART Host Controller #0 @@ -28652,9 +28978,11 @@ 06ab Comet Lake PCH Serial IO SPI Controller #1 06ac Comet Lake PCI Express Root Port #21 06b0 Comet Lake PCI Express Root Port #9 + 06b8 Comet Lake PCIe Root Port #1 06ba Comet Lake PCI Express Root Port #1 06bb Comet Lake PCI Express Root Port #4 06bd Comet Lake PCIe Port #6 + 06be Comet Lake PCIe Root Port #7 06bf Comet Lake PCIe Port #8 06c0 Comet Lake PCI Express Root Port #17 06c8 Comet Lake PCH cAVS @@ -28678,6 +29006,7 @@ 8086 42a4 Dual Band Wi-Fi 5(802.11ac) Wireless-AC 9462 80MHz 1x1 [Jefferson Peak] 06f9 Comet Lake PCH Thermal Controller 06fb Comet Lake PCH Serial IO SPI Controller #2 + 06fc Comet Lake PCH Integrated Sensor Solution 0700 CE Media Processor A/V Bridge 0701 CE Media Processor NAND Flash Controller 0703 CE Media Processor Media Control Unit 1 @@ -29716,6 +30045,7 @@ 8086 10a6 PRO/1000 PF Quad Port Server Adapter 10a6 82599EB 10-Gigabit Dummy Function 10a7 82575EB Gigabit Network Connection + 15d9 10a7 X10DRW-i 8086 10a8 82575EB Gigabit Riser Card 10a9 82575EB Gigabit Backplane Connection 10b0 82573L PRO/1000 PL Network Connection @@ -29878,6 +30208,7 @@ 1bd4 002f 10G SFP+ DP EP102Fi4A Adapter 1bd4 0032 10G SFP+ DP EP102Fi4 Adapter 1bd4 0067 F102I82599 + 1f3f 0a00 Dual-port 10-Gigabit SFI/SFP+ Network Connection 4c52 1024 LR-LINK LRES9804BF Quad-port 10Gb Ethernet Server Adapter 4c52 3002 LRES3002PF Dual-port 10Gb Ethernet Server Adapter for OCP 4c52 3012 LRES3012PF Dual-port 10Gb Ethernet Server Adapter for OCP @@ -30233,6 +30564,7 @@ 1137 023e 1GigE I350 LOM 15d9 0000 AOC-SGP-i4 15d9 0652 Dual Port i350 
GbE MicroLP [AOC-CGP-i2] + 15d9 1521 X10DRW-i 17aa 1074 ThinkServer I350-T4 AnyFabric 17aa 4005 I350 Gigabit Network Connection 18d4 0c07 I350 1Gb 2-port RJ45 OCP Mezz Card MOP41-I-1GT2 @@ -30754,8 +31086,8 @@ 1028 09be Latitude 7410 15ec JHL7540 Thunderbolt 3 USB Controller [Titan Ridge 4C 2018] 1028 09be Latitude 7410 - 15ef JHL7540 Thunderbolt 3 Bridge [Titan Ridge DD 2018] - 15f0 JHL7540 Thunderbolt 3 USB Controller [Titan Ridge DD 2018] + 15ef JHL7440 Thunderbolt 3 Bridge [Titan Ridge DD 2018] + 15f0 JHL7440 Thunderbolt 3 USB Controller [Titan Ridge DD 2018] 15f2 Ethernet Controller I225-LM 4c52 2031 LRES2031PT Single-port 2.5Gb Ethernet Network Adapter 8086 0001 Ethernet Network Adapter I225-T1 @@ -30772,6 +31104,7 @@ 15fc Ethernet Connection (13) I219-V 15ff Ethernet Controller X710 for 10GBASE-T 1014 0000 PCIe3 4-port 10GbE Base-T Adapter + 108e 0000 Quad Port 10GBase-T Adapter - CP 108e 7b1f Quad Port 10GBase-T Adapter - CP 1137 0000 X710TLG GbE RJ45 PCIe NIC 1137 02c1 X710T2LG 2x10 GbE RJ45 PCIe NIC @@ -34175,6 +34508,7 @@ 2e95 4 Series Chipset HECI Controller 2e96 4 Series Chipset PT IDER Controller 2f00 Xeon E7 v3/Xeon E5 v3/Core i7 DMI2 + 15d9 0821 X10DRW-i 2f01 Xeon E7 v3/Xeon E5 v3/Core i7 PCI Express Root Port 0 2f02 Xeon E7 v3/Xeon E5 v3/Core i7 PCI Express Root Port 1 2f03 Xeon E7 v3/Xeon E5 v3/Core i7 PCI Express Root Port 1 @@ -34203,28 +34537,48 @@ 2f1b Xeon E7 v3/Xeon E5 v3/Core i7 IIO Debug 2f1c Xeon E7 v3/Xeon E5 v3/Core i7 IIO Debug 2f1d Xeon E7 v3/Xeon E5 v3/Core i7 PCIe Ring Interface + 15d9 0821 X10DRW-i 2f1e Xeon E7 v3/Xeon E5 v3/Core i7 Scratchpad & Semaphore Registers + 15d9 0821 X10DRW-i 2f1f Xeon E7 v3/Xeon E5 v3/Core i7 Scratchpad & Semaphore Registers + 15d9 0821 X10DRW-i 2f20 Xeon E7 v3/Xeon E5 v3/Core i7 DMA Channel 0 + 15d9 0821 X10DRW-i 2f21 Xeon E7 v3/Xeon E5 v3/Core i7 DMA Channel 1 + 15d9 0821 X10DRW-i 2f22 Xeon E7 v3/Xeon E5 v3/Core i7 DMA Channel 2 + 15d9 0821 X10DRW-i 2f23 Xeon E7 v3/Xeon E5 v3/Core i7 DMA Channel 3 + 
15d9 0821 X10DRW-i 2f24 Xeon E7 v3/Xeon E5 v3/Core i7 DMA Channel 4 + 15d9 0821 X10DRW-i 2f25 Xeon E7 v3/Xeon E5 v3/Core i7 DMA Channel 5 + 15d9 0821 X10DRW-i 2f26 Xeon E7 v3/Xeon E5 v3/Core i7 DMA Channel 6 + 15d9 0821 X10DRW-i 2f27 Xeon E7 v3/Xeon E5 v3/Core i7 DMA Channel 7 + 15d9 0821 X10DRW-i 2f28 Xeon E7 v3/Xeon E5 v3/Core i7 Address Map, VTd_Misc, System Management + 15d9 0821 X10DRW-i 2f29 Xeon E7 v3/Xeon E5 v3/Core i7 Hot Plug + 15d9 0821 X10DRW-i 2f2a Xeon E7 v3/Xeon E5 v3/Core i7 RAS, Control Status and Global Errors + 15d9 0821 X10DRW-i 2f2c Xeon E7 v3/Xeon E5 v3/Core i7 I/O APIC + 15d9 0821 X10DRW-i 2f2e Xeon E7 v3/Xeon E5 v3/Core i7 RAID 5/6 2f2f Xeon E7 v3/Xeon E5 v3/Core i7 RAID 5/6 2f30 Xeon E7 v3/Xeon E5 v3/Core i7 Home Agent 0 + 15d9 0821 X10DRW-i 2f32 Xeon E7 v3/Xeon E5 v3/Core i7 QPI Link 0 + 15d9 0821 X10DRW-i 2f33 Xeon E7 v3/Xeon E5 v3/Core i7 QPI Link 1 2f34 Xeon E7 v3/Xeon E5 v3/Core i7 PCIe Ring Interface + 15d9 0821 X10DRW-i 2f36 Xeon E7 v3/Xeon E5 v3/Core i7 R3 QPI Link 0 & 1 Monitoring + 15d9 0821 X10DRW-i 2f37 Xeon E7 v3/Xeon E5 v3/Core i7 R3 QPI Link 0 & 1 Monitoring + 15d9 0821 X10DRW-i 2f38 Xeon E7 v3/Xeon E5 v3/Core i7 Home Agent 1 2f39 Xeon E7 v3/Xeon E5 v3/Core i7 I/O Performance Monitoring 2f3a Xeon E7 v3/Xeon E5 v3/Core i7 QPI Link 2 @@ -34250,10 +34604,14 @@ 2f78 Xeon E7 v3/Xeon E5 v3/Core i7 Home Agent 1 Debug 2f79 Xeon E7 v3/Xeon E5 v3/Core i7 Integrated Memory Controller 1 Target Address, Thermal & RAS Registers 2f7d Xeon E7 v3/Xeon E5 v3/Core i7 Scratchpad & Semaphore Registers + 15d9 0821 X10DRW-i 2f7e Xeon E7 v3/Xeon E5 v3/Core i7 E3 QPI Link Debug 2f80 Xeon E7 v3/Xeon E5 v3/Core i7 QPI Link 0 + 15d9 0821 X10DRW-i 2f81 Xeon E7 v3/Xeon E5 v3/Core i7 R3 QPI Link 0 & 1 Monitoring + 15d9 0821 X10DRW-i 2f83 Xeon E7 v3/Xeon E5 v3/Core i7 QPI Link 0 + 15d9 0821 X10DRW-i 2f85 Xeon E7 v3/Xeon E5 v3/Core i7 QPI Link 0 Debug 2f86 Xeon E7 v3/Xeon E5 v3/Core i7 QPI Link 0 Debug 2f87 Xeon E7 v3/Xeon E5 v3/Core i7 QPI Link 0 Debug @@ 
-34268,11 +34626,16 @@ 2f9a Xeon E7 v3/Xeon E5 v3/Core i7 Power Control Unit 2f9c Xeon E7 v3/Xeon E5 v3/Core i7 Power Control Unit 2fa0 Xeon E7 v3/Xeon E5 v3/Core i7 Home Agent 0 + 15d9 0821 X10DRW-i 2fa8 Xeon E7 v3/Xeon E5 v3/Core i7 Integrated Memory Controller 0 Target Address, Thermal & RAS Registers 2faa Xeon E7 v3/Xeon E5 v3/Core i7 Integrated Memory Controller 0 Channel Target Address Decoder + 15d9 0821 X10DRW-i 2fab Xeon E7 v3/Xeon E5 v3/Core i7 Integrated Memory Controller 0 Channel Target Address Decoder + 15d9 0821 X10DRW-i 2fac Xeon E7 v3/Xeon E5 v3/Core i7 Integrated Memory Controller 0 Channel Target Address Decoder + 15d9 0821 X10DRW-i 2fad Xeon E7 v3/Xeon E5 v3/Core i7 Integrated Memory Controller 0 Channel Target Address Decoder + 15d9 0821 X10DRW-i 2fae Xeon E7 v3/Xeon E5 v3/Core i7 DDRIO Channel 0/1 Broadcast 2faf Xeon E7 v3/Xeon E5 v3/Core i7 DDRIO Global Broadcast 2fb0 Xeon E7 v3/Xeon E5 v3/Core i7 Integrated Memory Controller 0 Channel 0 Thermal Control @@ -34306,13 +34669,21 @@ 2fd6 Xeon E7 v3/Xeon E5 v3/Core i7 Integrated Memory Controller 1 Channel 2 ERROR Registers 2fd7 Xeon E7 v3/Xeon E5 v3/Core i7 Integrated Memory Controller 1 Channel 3 ERROR Registers 2fe0 Xeon E7 v3/Xeon E5 v3/Core i7 Unicast Registers + 15d9 0821 X10DRW-i 2fe1 Xeon E7 v3/Xeon E5 v3/Core i7 Unicast Registers + 15d9 0821 X10DRW-i 2fe2 Xeon E7 v3/Xeon E5 v3/Core i7 Unicast Registers + 15d9 0821 X10DRW-i 2fe3 Xeon E7 v3/Xeon E5 v3/Core i7 Unicast Registers + 15d9 0821 X10DRW-i 2fe4 Xeon E7 v3/Xeon E5 v3/Core i7 Unicast Registers + 15d9 0821 X10DRW-i 2fe5 Xeon E7 v3/Xeon E5 v3/Core i7 Unicast Registers + 15d9 0821 X10DRW-i 2fe6 Xeon E7 v3/Xeon E5 v3/Core i7 Unicast Registers + 15d9 0821 X10DRW-i 2fe7 Xeon E7 v3/Xeon E5 v3/Core i7 Unicast Registers + 15d9 0821 X10DRW-i 2fe8 Xeon E7 v3/Xeon E5 v3/Core i7 Unicast Registers 2fe9 Xeon E7 v3/Xeon E5 v3/Core i7 Unicast Registers 2fea Xeon E7 v3/Xeon E5 v3/Core i7 Unicast Registers @@ -34334,8 +34705,11 @@ 2ffa Xeon E7 v3/Xeon 
E5 v3/Core i7 Buffered Ring Agent 2ffb Xeon E7 v3/Xeon E5 v3/Core i7 Buffered Ring Agent 2ffc Xeon E7 v3/Xeon E5 v3/Core i7 System Address Decoder & Broadcast Registers + 15d9 0821 X10DRW-i 2ffd Xeon E7 v3/Xeon E5 v3/Core i7 System Address Decoder & Broadcast Registers + 15d9 0821 X10DRW-i 2ffe Xeon E7 v3/Xeon E5 v3/Core i7 System Address Decoder & Broadcast Registers + 15d9 0821 X10DRW-i 3101 Killer E3100X 2.5 Gigabit Ethernet Controller 3140 Easel/Monette Hill Image Processor [Pixel Visual Core] 3165 Wireless 3165 @@ -35295,11 +35669,13 @@ 43ba Tiger Lake-H PCIe Root Port #3 43bb Tiger Lake-H PCIe Root Port #4 43bc Tiger Lake-H PCI Express Root Port #5 + 43be 11th Gen Core Processor PCIe Root Port #7 43c0 Tiger Lake-H PCIe Root Port #17 43c7 Tiger Lake-H PCIe Root Port #24 43c8 Tiger Lake-H HD Audio Controller 43d3 Tiger Lake SATA AHCI Controller 43e0 Tiger Lake-H Management Engine Interface + 43e3 Tiger Lake AMT SOL Redirection 43e8 Tiger Lake-H Serial IO I2C Controller #0 43e9 Tiger Lake-H Serial IO I2C Controller #1 43ed Tiger Lake-H USB 3.2 Gen 2x1 xHCI Host Controller @@ -35591,6 +35967,10 @@ 54b1 Alder Lake-N PCI Express Root Port #10 54b2 Alder Lake-N PCI Express Root Port #11 54b3 Alder Lake-N PCI Express Root Port #12 + 54b8 Alder Lake-N PCI Express Root Port #1 + 54b9 Alder Lake-N PCI Express Root Port #2 + 54ba Alder Lake-N PCI Express Root Port #3 + 54be Alder Lake-N PCI Express Root Port #7 54c8 Alder Lake-N PCH High Definition Audio Controller 54d3 Alder Lake-N SATA AHCI Controller 54e0 Alder Lake-N PCH HECI Controller @@ -35623,12 +36003,12 @@ 56b1 DG2 [Arc Pro A40/A50] 56b2 DG2 [Arc Pro A60M] 56b3 DG2 [Arc Pro A60] - 56ba DG2 [Intel Graphics] - 56bb DG2 [Intel Graphics] - 56bc DG2 [Intel Graphics] - 56bd DG2 [Intel Graphics] - 56be DG2 [Arc Graphics A750E] - 56bf DG2 [Arc Graphics A580E] + 56ba DG2 [Arc A380E] + 56bb DG2 [Arc A310E] + 56bc DG2 [Arc A370E] + 56bd DG2 [Arc A350E] + 56be DG2 [Arc A750E] + 56bf DG2 [Arc A580E] 56c0 ATS-M [Data Center 
GPU Flex 170] 56c1 ATS-M [Data Center GPU Flex 140] 56c2 ATS-M [Data Center GPU Flex 170V] @@ -35644,6 +36024,7 @@ 579c Ethernet Connection E825-C for backplane 579d Ethernet Connection E825-C for QSFP 579e Ethernet Connection E825-C for SFP + 579f Ethernet Connection E825-C 10GbE 57a4 Thunderbolt Bridge [Barlow Ridge Hub 40G 2023] 57a5 Thunderbolt USB Controller [Barlow Ridge Hub 40G 2023] 57ad E610 Virtual Function @@ -35688,6 +36069,7 @@ 17aa 2248 ThinkPad T570 17aa 224f ThinkPad X1 Carbon 5th Gen 5917 UHD Graphics 620 + 17aa 225d ThinkPad T480 (20L5) 17aa 225e ThinkPad T480 5918 Xeon E3-1200 v6/7th Gen Core Processor Host Bridge/DRAM Registers 591b HD Graphics 630 @@ -36020,6 +36402,18 @@ 7601 82372FB PIIX5 IDE 7602 82372FB PIIX5 USB 7603 82372FB PIIX5 SMBus + 7725 Arrow Lake-H [PCH Serial IO UART Host Controller] + 7726 Arrow Lake-H PCH Serial IO UART Host Controller] + 7727 Arrow Lake-H [LPC/eSPI Controller] + 7730 Arrow Lake-H [LPC/eSPI Controller] + 7746 Arrow Lake-H [LPC/eSPI Controller] + 7750 Arrow Lake-H [Serial IO I2C Host Controller] + 7751 Arrow Lake-H [Serial IO I2C Host Controller] + 7752 Arrow Lake-H [PCH Serial IO UART Host Controller] + 7778 Arrow Lake-H [Serial IO I2C Host Controller] + 7779 Arrow Lake-H [Serial IO I2C Host Controller] + 777a Arrow Lake-H [Serial IO I2C Host Controller] + 777b Arrow Lake-H [Serial IO I2C Host Controller] 7800 82740 (i740) AGP Graphics Accelerator 003d 0008 Starfighter AGP 003d 000b Starfighter AGP @@ -36073,12 +36467,15 @@ 7acf Alder Lake-S PCH Serial IO I2C Controller #3 7ad0 Alder Lake-S HD Audio Controller 7ae0 Alder Lake-S PCH USB 3.2 Gen 2x2 XHCI Controller + 7ae1 Alder Lake-S PCH USB 3.2 Gen 1x1 xDCI Controller 7ae2 Alder Lake-S PCH SATA Controller [AHCI Mode] 7ae8 Alder Lake-S PCH HECI Controller #1 + 7aeb Alder Lake-S Keyboard and Text (KT) Redirection 7af0 Alder Lake-S PCH CNVi WiFi 8086 0034 Wireless-AC 9560 8086 0070 Wi-Fi 6 AX201 160MHz 8086 0094 Wi-Fi 6 AX201 160MHz + 7af8 Alder Lake-S Integrated 
Sensor Hub 7afc Alder Lake-S PCH Serial IO I2C Controller #4 7afd Alder Lake-S PCH Serial IO I2C Controller #5 7d03 Meteor Lake-P Dynamic Tuning Technology @@ -36092,7 +36489,7 @@ 7d51 Arrow Lake-P [Intel Graphics] 7d55 Meteor Lake-P [Intel Arc Graphics] 7d60 Meteor Lake-M [Intel Graphics] - 7d67 Arrow Lake-U [Intel Graphics] + 7d67 Arrow Lake-S [Intel Graphics] 7dd1 Arrow Lake-P [Intel Graphics] 7dd5 Meteor Lake-P [Intel Graphics] 7e01 Meteor Lake-P LPC/eSPI Controller @@ -36387,6 +36784,7 @@ 8cc6 H97 Chipset LPC Controller 8d00 C610/X99 series chipset 4-port SATA Controller [IDE mode] 8d02 C610/X99 series chipset 6-Port SATA Controller [AHCI mode] + 15d9 0821 X10DRW-i 8d04 C610/X99 series chipset SATA Controller [RAID mode] 8d06 C610/X99 series chipset SATA Controller [RAID mode] 17aa 1031 ThinkServer RAID 110i @@ -36411,19 +36809,26 @@ 8d20 C610/X99 series chipset HD Audio Controller 8d21 C610/X99 series chipset HD Audio Controller 8d22 C610/X99 series chipset SMBus Controller + 15d9 0821 X10DRW-i 15d9 0832 X10SRL-F 8d24 C610/X99 series chipset Thermal Subsystem + 15d9 0821 X10DRW-i 8d26 C610/X99 series chipset USB Enhanced Host Controller #1 + 15d9 0821 X10DRW-i 15d9 0832 X10SRL-F 8d2d C610/X99 series chipset USB Enhanced Host Controller #2 + 15d9 0821 X10DRW-i 15d9 0832 X10SRL-F 8d31 C610/X99 series chipset USB xHCI Host Controller + 15d9 0821 X10DRW-i 15d9 0832 X10SRL-F 8d33 C610/X99 series chipset LAN Controller 8d34 C610/X99 series chipset NAND Controller 8d3a C610/X99 series chipset MEI Controller #1 + 15d9 0821 X10DRW-i 15d9 0832 X10SRL-F 8d3b C610/X99 series chipset MEI Controller #2 + 15d9 0821 X10DRW-i 15d9 0832 X10SRL-F 8d3c C610/X99 series chipset IDE-r Controller 8d3d C610/X99 series chipset KT Controller @@ -36432,6 +36837,7 @@ 8d42 C610/X99 series chipset LPC Controller 8d43 C610/X99 series chipset LPC Controller 8d44 C610/X99 series chipset LPC Controller + 15d9 0821 X10DRW-i 15d9 0832 X10SRL-F 8d45 C610/X99 series chipset LPC Controller 8d46 
C610/X99 series chipset LPC Controller @@ -36446,11 +36852,13 @@ 8d4f C610/X99 series chipset LPC Controller 8d60 C610/X99 series chipset sSATA Controller [IDE mode] 8d62 C610/X99 series chipset sSATA Controller [AHCI mode] + 15d9 0821 X10DRW-i 8d64 C610/X99 series chipset sSATA Controller [RAID mode] 8d66 C610/X99 series chipset sSATA Controller [RAID mode] 8d68 C610/X99 series chipset sSATA Controller [IDE mode] 8d6e C610/X99 series chipset sSATA Controller [RAID mode] 8d7c C610/X99 series chipset SPSR + 15d9 0821 X10DRW-i 15d9 0832 X10SRL-F 8d7d C610/X99 series chipset MS SMBus 0 8d7e C610/X99 series chipset MS SMBus 1 @@ -36467,6 +36875,7 @@ 9841 Lakefield GT1.5 [UHD Graphics] 9a01 11th Gen Core Processor PCIe Controller #1 9a03 TigerLake-LP Dynamic Tuning Processor Participant + 9a07 11th Gen Core Processor PCIe Controller #2 9a09 11th Gen Core Processor PCIe Controller 9a0b Volume Management Device NVMe RAID Controller 9a0d Tigerlake Telemetry Aggregator Driver @@ -37206,6 +37615,7 @@ a74d Raptor Lake PCIe 4.0 Graphics Port a74f GNA Scoring Accelerator module 1028 0c06 Precision 3580 + a75d Raptor Lake IPU a76d Raptor Lake-P Thunderbolt 4 NHI #1 a76e Raptor Lake-P Thunderbolt 4 PCI Express Root Port #0 a77d Raptor Lake Crashlog and Telemetry @@ -37260,6 +37670,14 @@ ad0b Volume Management Device NVMe RAID Controller Intel Corporation ad1d Arrow Lake NPU b03e Panther Lake NPU + b080 Panther Lake [Intel Graphics] + b081 Panther Lake [Intel Graphics] + b082 Panther Lake [Intel Graphics] + b083 Panther Lake [Intel Graphics] + b08f Panther Lake [Intel Graphics] + b090 Panther Lake [Intel Graphics] + b0a0 Panther Lake [Intel Graphics] + b0b0 Panther Lake [Intel Graphics] b152 21152 PCI-to-PCI Bridge 8086 b152 21152 PCI-to-PCI Bridge # observed, and documented in Intel revision note; new mask of 1011:0026 @@ -37271,6 +37689,7 @@ 4c53 1050 CT7 mainboard 4c53 1051 CE7 mainboard e4bf 1000 CC8-1-BLUES + b640 Arrow Lake-H [Intel Graphics] d130 Core Processor DMI 15d9 
0605 X8SIL d131 Core Processor DMI @@ -37386,6 +37805,10 @@ 8088 2000 Ethernet Network Adaptor RP2000 for 10GbE SFP+ 8088 2300 Ethernet Network Adaptor RP2000-A03 for 10GbE SFP+ 8088 2400 Ethernet Network Adaptor RP2000-A04 for 10GbE SFP+ + 5025 Ethernet Controller WX5025 for 25GbE SFP28 + 8088 1000 Dual-Port Ethernet Network Adapter FF5025-DDATACXX + 5125 Ethernet Controller WX5025AL for 25GbE SFP28 + 8088 3000 Dual-Port Ethernet Network Adapter FF5025-DDATAIXX 80ee InnoTek Systemberatung GmbH beef VirtualBox Graphics Adapter cafe VirtualBox Guest Service @@ -37402,6 +37825,7 @@ 8510 0007 GB2062-PCIe-C40 8510 0008 CQ2040-MXM-M60 8510 0009 GB2062-PCIe-C20 + 8510 000b GB2062-PCIe-HIEILP42 8510 000c CQ2040-PUB 8510 0201 GB2062-PUB-DDR # nee ScaleMP @@ -37837,6 +38261,7 @@ 1d49 0621 ThinkSystem RAID 9350-8i 2GB Flash PCIe 12Gb Internal Adapter 1d49 0622 ThinkSystem RAID 9350-16i 4GB Flash PCIe 12Gb Adapter 1d49 0623 ThinkSystem RAID 9350-16i 4GB Flash PCIe 12Gb Internal Adapter + 1f3f 0610 3S610-8i, SAS/SATA 12Gb HBA 9005 0608 SmartRAID 3162-8i /e 9005 0800 SmartRAID 3154-8i 9005 0801 SmartRAID 3152-8i @@ -38022,6 +38447,8 @@ 1501 STAR1500C NVMe SSD 1502 STAR1500E NVMe SSD 1504 STAR1500L NVMe SSD +# NVMe Gen5 Controller 16ch + 1516 STAR1516 PCIe NVMe SSD Controller 2000 STAR2000 NVMe Controller 2001 STAR2000E NVMe SSD 2002 STAR2000C NVMe SSD @@ -38126,7 +38553,9 @@ c0a9 Micron/Crucial Technology 5412 P5 NVMe PCIe SSD[SlashP5] 5415 T500 NVMe PCIe SSD 5419 T700 NVMe PCIe SSD + 5420 P3 NVMe PCIe SSD (DRAM-less) 5421 P3 Plus NVMe PCIe SSD (DRAM-less) + 5426 P310 NVMe PCIe SSD (DRAM-less) 542b T705 NVMe PCIe SSD c0de Motorola c0fe Motion Engineering, Inc. @@ -38203,31 +38632,18 @@ d209 Ultimarc 15a2 SpinTrak 1601 AimTrak d20c Chengdu BeiZhongWangXin Technology Co., Ltd. 
- 5010 NE5000 Ethernet Controller 5011 NE5000 Ethernet Controller d20c e120 N5 Series 2-port 10GbE Network Adapter d20c e140 N5 Series 4-port 10GbE Network Adapter d20c e220 N5 Series 2-port 25GbE Network Adapter d20c e221 N5S Series 2-port 25GbE Network Adapter d20c e22c N5 Series 2-port 25GbE Network Adapter for OCP - d20c e22d N5S Series 2-port 25GbE Network Adapter for OCP - 6010 NE6000 Ethernet Controller 6011 NE6000 Ethernet Controller d20c a001 N6S Series Network Adapter - d20c a141 N6S Series 4-port 10GbE Network Adapter - d20c a221 N6S Series 2-port 25GbE Network Adapter - d20c a241 N6S Series 4-port 25GbE Network Adapter - d20c a421 N6S Series 2-port 40GbE Network Adapter - d20c aa21 N6S Series 2-port 100GbE Network Adapter - d20c d221 N6S Series 2-port 25GbE Network Adapter with DPI - d20c da21 N6S Series 2-port 100GbE Network Adapter with DPI d20c e221 N6S Series 2-port 25GbE Network Adapter d20c e281 N6S Series 8-port 25GbE Network Adapter d20c e421 N6S Series 2-port 40GbE Network Adapter - d20c ea20 N6 Series 2-port 100GbE Network Adapter d20c ea21 N6S Series 2-port 100GbE Network Adapter - d20c ea2c N6 Series 2-port 100GbE Network Adapter for OCP - d20c ea2d N6S Series 2-port 100GbE Network Adapter for OCP d4d4 Dy4 Systems Inc 0601 PCI Mezzanine Card d531 I+ME ACTIA GmbH From 19a6bc9f51e5c5705a2b396b0da61e6536acb4cb Mon Sep 17 00:00:00 2001 From: Renato Botelho Date: Thu, 9 Jan 2025 09:16:10 -0300 Subject: [PATCH 084/143] fwget: Silence log() when -q is used Summary: Silence log function when -q parameter is used to prevent undesired output PR: 283939 Reviewed By: manu Differential Revision: https://reviews.freebsd.org/D48391 Sponsored by: Rubicon Communications, LLC ("Netgate") --- usr.sbin/fwget/fwget.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/usr.sbin/fwget/fwget.sh b/usr.sbin/fwget/fwget.sh index 3e2181e53b7361..d87cd03aa139a4 100755 --- a/usr.sbin/fwget/fwget.sh +++ b/usr.sbin/fwget/fwget.sh @@ -47,7 +47,9 @@ EOF 
log() { - echo "$@" + if [ "${QUIET}" != "y" ]; then + echo "$@" + fi } log_verbose() From 7c94d515db900401a339cd26861856c8fefb3086 Mon Sep 17 00:00:00 2001 From: David Bright Date: Sun, 5 Jan 2025 11:24:13 -0600 Subject: [PATCH 085/143] aio_kqueue_test: Fix CID 1558429 Fix a Coverity error in the aio_kqueue_test that could theoretically (but probably not realistically) cause overindexing an array. Differential Revision: https://reviews.freebsd.org/D48328 Reviewed by: asomers, vangyzen Sponsored by: Dell Technologies --- tests/sys/aio/aio_kqueue_test.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/sys/aio/aio_kqueue_test.c b/tests/sys/aio/aio_kqueue_test.c index c2478a9d05b363..5e5cb40d07525c 100644 --- a/tests/sys/aio/aio_kqueue_test.c +++ b/tests/sys/aio/aio_kqueue_test.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -192,6 +193,7 @@ main (int argc, char *argv[]) for (j = 0; j < max_queue_per_proc && iocb[j] != kq_iocb; j++) ; + assert(j < max_queue_per_proc); #ifdef DEBUG printf("kq_iocb %p\n", kq_iocb); From 4a46ece6c6a90f18effbfae7ddef79b41ef43eec Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Thu, 9 Jan 2025 14:49:34 +0000 Subject: [PATCH 086/143] vmm: Fix error handling in vmm_handler() In commit a97f683fe3c4 I didn't add code to remove the vmmctl device when vmm.ko is unloaded, so it would persist and prevent vmm.ko from being re-loaded. Extend vmmdev_cleanup() to destroy the vmmctl cdev. Also call vmmdev_cleanup() if vmm_init() fails. 
Reviewed by: corvink, andrew Fixes: a97f683fe3c4 ("vmm: Add a device file interface for creating and destroying VMs") Differential Revision: https://reviews.freebsd.org/D48269 --- sys/amd64/vmm/vmm.c | 2 ++ sys/arm64/vmm/vmm.c | 11 ++++++++--- sys/dev/vmm/vmm_dev.c | 34 +++++++++++++++++++--------------- sys/riscv/vmm/vmm.c | 11 ++++++++--- 4 files changed, 37 insertions(+), 21 deletions(-) diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index d05d979a531a2b..aa13d506ac6af2 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -467,6 +467,8 @@ vmm_handler(module_t mod, int what, void *arg) error = vmm_init(); if (error == 0) vmm_initialized = 1; + else + (void)vmmdev_cleanup(); } else { error = ENXIO; } diff --git a/sys/arm64/vmm/vmm.c b/sys/arm64/vmm/vmm.c index 808df5e599ace3..77c565e3726441 100644 --- a/sys/arm64/vmm/vmm.c +++ b/sys/arm64/vmm/vmm.c @@ -361,21 +361,26 @@ vmm_handler(module_t mod, int what, void *arg) switch (what) { case MOD_LOAD: - /* TODO: if (vmm_is_hw_supported()) { */ error = vmmdev_init(); if (error != 0) break; error = vmm_init(); if (error == 0) vmm_initialized = true; + else + (void)vmmdev_cleanup(); break; case MOD_UNLOAD: - /* TODO: if (vmm_is_hw_supported()) { */ error = vmmdev_cleanup(); if (error == 0 && vmm_initialized) { error = vmmops_modcleanup(); - if (error) + if (error) { + /* + * Something bad happened - prevent new + * VMs from being created + */ vmm_initialized = false; + } } break; default: diff --git a/sys/dev/vmm/vmm_dev.c b/sys/dev/vmm/vmm_dev.c index 4ab99f92f72a3c..27c960c8ef2eba 100644 --- a/sys/dev/vmm/vmm_dev.c +++ b/sys/dev/vmm/vmm_dev.c @@ -979,6 +979,7 @@ vmmctl_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, return (error); } +static struct cdev *vmmctl_cdev; static struct cdevsw vmmctlsw = { .d_name = "vmmctl", .d_version = D_VERSION, @@ -989,31 +990,34 @@ static struct cdevsw vmmctlsw = { int vmmdev_init(void) { - struct cdev *cdev; int error; - error = 
make_dev_p(MAKEDEV_CHECKNAME, &cdev, &vmmctlsw, NULL, + sx_xlock(&vmmdev_mtx); + error = make_dev_p(MAKEDEV_CHECKNAME, &vmmctl_cdev, &vmmctlsw, NULL, UID_ROOT, GID_WHEEL, 0600, "vmmctl"); - if (error) - return (error); - - pr_allow_flag = prison_add_allow(NULL, "vmm", NULL, - "Allow use of vmm in a jail."); + if (error == 0) + pr_allow_flag = prison_add_allow(NULL, "vmm", NULL, + "Allow use of vmm in a jail."); + sx_xunlock(&vmmdev_mtx); - return (0); + return (error); } int vmmdev_cleanup(void) { - int error; - - if (SLIST_EMPTY(&head)) - error = 0; - else - error = EBUSY; + sx_xlock(&vmmdev_mtx); + if (!SLIST_EMPTY(&head)) { + sx_xunlock(&vmmdev_mtx); + return (EBUSY); + } + if (vmmctl_cdev != NULL) { + destroy_dev(vmmctl_cdev); + vmmctl_cdev = NULL; + } + sx_xunlock(&vmmdev_mtx); - return (error); + return (0); } static int diff --git a/sys/riscv/vmm/vmm.c b/sys/riscv/vmm/vmm.c index f7cbfc1dfea580..96871fc88453c7 100644 --- a/sys/riscv/vmm/vmm.c +++ b/sys/riscv/vmm/vmm.c @@ -259,21 +259,26 @@ vmm_handler(module_t mod, int what, void *arg) switch (what) { case MOD_LOAD: - /* TODO: check if has_hyp here? */ error = vmmdev_init(); if (error != 0) break; error = vmm_init(); if (error == 0) vmm_initialized = true; + else + (void)vmmdev_cleanup(); break; case MOD_UNLOAD: - /* TODO: check if has_hyp here? */ error = vmmdev_cleanup(); if (error == 0 && vmm_initialized) { error = vmmops_modcleanup(); - if (error) + if (error) { + /* + * Something bad happened - prevent new + * VMs from being created + */ vmm_initialized = false; + } } break; default: From fe1165df4b776b14b21a04d2ef3fc4c46740c2f5 Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Thu, 9 Jan 2025 14:53:37 +0000 Subject: [PATCH 087/143] vm_pageout: Make vmd_oom a bool No functional change intended. Reviewed by: dougm, kib MFC after: 1 week Sponsored by: Klara, Inc. 
Sponsored by: Modirum MDPay Differential Revision: https://reviews.freebsd.org/D48376 --- sys/vm/vm_page.c | 2 +- sys/vm/vm_pageout.c | 6 +++--- sys/vm/vm_pagequeue.h | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c index f351295c1af56a..f042d4767b36b9 100644 --- a/sys/vm/vm_page.c +++ b/sys/vm/vm_page.c @@ -460,7 +460,7 @@ vm_page_domain_init(int domain) vmd->vmd_page_count = 0; vmd->vmd_free_count = 0; vmd->vmd_segs = 0; - vmd->vmd_oom = FALSE; + vmd->vmd_oom = false; for (i = 0; i < PQ_COUNT; i++) { pq = &vmd->vmd_pagequeues[i]; TAILQ_INIT(&pq->pq_pl); diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c index 28a54a83fd4971..d26e04f60c0090 100644 --- a/sys/vm/vm_pageout.c +++ b/sys/vm/vm_pageout.c @@ -1773,7 +1773,7 @@ vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage, vmd->vmd_oom_seq++; if (vmd->vmd_oom_seq < vm_pageout_oom_seq) { if (vmd->vmd_oom) { - vmd->vmd_oom = FALSE; + vmd->vmd_oom = false; atomic_subtract_int(&vm_pageout_oom_vote, 1); } return; @@ -1788,7 +1788,7 @@ vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage, if (vmd->vmd_oom) return; - vmd->vmd_oom = TRUE; + vmd->vmd_oom = true; old_vote = atomic_fetchadd_int(&vm_pageout_oom_vote, 1); if (old_vote != vm_ndomains - 1) return; @@ -1806,7 +1806,7 @@ vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage, * memory condition is still there, due to vmd_oom being * false. */ - vmd->vmd_oom = FALSE; + vmd->vmd_oom = false; atomic_subtract_int(&vm_pageout_oom_vote, 1); } diff --git a/sys/vm/vm_pagequeue.h b/sys/vm/vm_pagequeue.h index af1183e63e53c8..23a3ea96d80c70 100644 --- a/sys/vm/vm_pagequeue.h +++ b/sys/vm/vm_pagequeue.h @@ -257,7 +257,7 @@ struct vm_domain { /* Paging control variables, used within single threaded page daemon. */ struct pidctrl vmd_pid; /* Pageout controller. */ - boolean_t vmd_oom; + bool vmd_oom; u_int vmd_inactive_threads; u_int vmd_inactive_shortage; /* Per-thread shortage. 
*/ blockcount_t vmd_inactive_running; /* Number of inactive threads. */ From 55b343f4f9bc586eba5e26a2524a35f04dd60c65 Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Thu, 9 Jan 2025 14:54:10 +0000 Subject: [PATCH 088/143] vm_pageout: Add a chicken switch for multithreaded PQ_INACTIVE scanning Right now we have the vm.pageout_cpus_per_thread tunable which controls the number of threads to start up per CPU per NUMA domain, but after booting, it's not possible to disable multi-threaded scanning. There is at least one workload where this mechanism doesn't work well; let's make it possible to disable it without a reboot, to simplify troubleshooting. Reviewed by: dougm, kib MFC after: 2 weeks Sponsored by: Klara, Inc. Sponsored by: Modirum MDPay Differential Revision: https://reviews.freebsd.org/D48377 --- sys/vm/vm_page.c | 1 + sys/vm/vm_pageout.c | 9 +++++++-- sys/vm/vm_pagequeue.h | 5 +++-- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c index f042d4767b36b9..ba22c7f97f2f92 100644 --- a/sys/vm/vm_page.c +++ b/sys/vm/vm_page.c @@ -461,6 +461,7 @@ vm_page_domain_init(int domain) vmd->vmd_free_count = 0; vmd->vmd_segs = 0; vmd->vmd_oom = false; + vmd->vmd_helper_threads_enabled = true; for (i = 0; i < PQ_COUNT; i++) { pq = &vmd->vmd_pagequeues[i]; TAILQ_INIT(&pq->pq_pl); diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c index d26e04f60c0090..e2efa11842b5a8 100644 --- a/sys/vm/vm_pageout.c +++ b/sys/vm/vm_pageout.c @@ -1644,8 +1644,9 @@ vm_pageout_inactive_dispatch(struct vm_domain *vmd, int shortage) * If we have more work than we can do in a quarter of our interval, we * fire off multiple threads to process it. 
*/ - threads = vmd->vmd_inactive_threads; - if (threads > 1 && vmd->vmd_inactive_pps != 0 && + if ((threads = vmd->vmd_inactive_threads) > 1 && + vmd->vmd_helper_threads_enabled && + vmd->vmd_inactive_pps != 0 && shortage > vmd->vmd_inactive_pps / VM_INACT_SCAN_RATE / 4) { vmd->vmd_inactive_shortage /= threads; slop = shortage % threads; @@ -2269,6 +2270,10 @@ vm_pageout_init_domain(int domain) pidctrl_init_sysctl(&vmd->vmd_pid, SYSCTL_CHILDREN(oid)); vmd->vmd_inactive_threads = get_pageout_threads_per_domain(vmd); + SYSCTL_ADD_BOOL(NULL, SYSCTL_CHILDREN(vmd->vmd_oid), OID_AUTO, + "pageout_helper_threads_enabled", CTLFLAG_RWTUN, + &vmd->vmd_helper_threads_enabled, 0, + "Enable multi-threaded inactive queue scanning"); } static void diff --git a/sys/vm/vm_pagequeue.h b/sys/vm/vm_pagequeue.h index 23a3ea96d80c70..72fd1bb473185e 100644 --- a/sys/vm/vm_pagequeue.h +++ b/sys/vm/vm_pagequeue.h @@ -257,8 +257,9 @@ struct vm_domain { /* Paging control variables, used within single threaded page daemon. */ struct pidctrl vmd_pid; /* Pageout controller. */ - bool vmd_oom; - u_int vmd_inactive_threads; + bool vmd_oom; /* An OOM kill was requested. */ + bool vmd_helper_threads_enabled;/* Use multiple threads to scan. */ + u_int vmd_inactive_threads; /* Number of extra helper threads. */ u_int vmd_inactive_shortage; /* Per-thread shortage. */ blockcount_t vmd_inactive_running; /* Number of inactive threads. */ blockcount_t vmd_inactive_starting; /* Number of threads started. */ From fb98fc4755def2cb8ca145751b0e54485d5e2f4a Mon Sep 17 00:00:00 2001 From: Renato Botelho Date: Thu, 9 Jan 2025 11:24:18 -0300 Subject: [PATCH 089/143] fwget: Simplify logic Summary: Use log() to print messages that should be suppressed when -q is in use. No functional changes intended.
Differential Revision: https://reviews.freebsd.org/D48393 Reviewed By: manu Sponsored by: Rubicon Communications, LLC ("Netgate") --- usr.sbin/fwget/fwget.sh | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/usr.sbin/fwget/fwget.sh b/usr.sbin/fwget/fwget.sh index d87cd03aa139a4..138a2a26bfb127 100755 --- a/usr.sbin/fwget/fwget.sh +++ b/usr.sbin/fwget/fwget.sh @@ -120,16 +120,12 @@ done case "${packages}" in ""|^[[:space:]]*$) - if [ "${QUIET}" != "y" ]; then - echo "No firmware packages to install." - fi + log "No firmware packages to install." exit 0 ;; esac -if [ "${QUIET}" != "y" ]; then - echo "Needed firmware packages: '${packages}'" -fi +log "Needed firmware packages: '${packages}'" if [ "${DRY_RUN}" = "y" ]; then if [ "${QUIET}" = "y" ]; then for pkg in ${packages}; do From 0acab8b3d1336d4db73a9946ef76b4bcd0b0aabe Mon Sep 17 00:00:00 2001 From: Doug Ambrisko Date: Thu, 9 Jan 2025 08:28:37 -0800 Subject: [PATCH 090/143] enic(4): fix down/up, MTU changes and more ifconfig down/up cycles were not working. Fix that, which is required to support MTU changes. Now doing ifconfig enic0 mtu 3000, for example, works. If the MTU is changed in the VIC HW configuration, that change is not reflected and the OS reports the default 1500. I need to look at that, but changing it via ifconfig works. So this is different than what Linux does. Change TX interrupt allocation to be in this driver. Change the admin interrupt count to 2. This makes multiple queues work, but they need to be done as pairs, so if the VIC has more TX or RX queues set up in the VIC configuration it will use the lesser value. While updating the TX interrupt also add support for devcmd2. Enable checksum offloading.
PR: 282095 --- sys/dev/enic/cq_desc.h | 15 --- sys/dev/enic/enic.h | 76 +++++------ sys/dev/enic/enic_res.c | 4 +- sys/dev/enic/enic_res.h | 2 - sys/dev/enic/enic_txrx.c | 39 ++++-- sys/dev/enic/if_enic.c | 173 +++++++++++++++++++++++--- sys/dev/enic/vnic_cq.h | 5 +- sys/dev/enic/vnic_dev.c | 235 ++++++++++++++++++++++++++++++----- sys/dev/enic/vnic_dev.h | 8 +- sys/dev/enic/vnic_intr.c | 2 +- sys/dev/enic/vnic_intr.h | 2 +- sys/dev/enic/vnic_resource.h | 1 + sys/dev/enic/vnic_rq.c | 5 +- sys/dev/enic/vnic_rq.h | 1 - sys/dev/enic/vnic_rss.h | 5 - sys/dev/enic/vnic_wq.c | 104 +++++++++++++++- sys/dev/enic/vnic_wq.h | 18 ++- 17 files changed, 559 insertions(+), 136 deletions(-) diff --git a/sys/dev/enic/cq_desc.h b/sys/dev/enic/cq_desc.h index ae8847c6d9a1b2..4fb8cce7212ee4 100644 --- a/sys/dev/enic/cq_desc.h +++ b/sys/dev/enic/cq_desc.h @@ -44,14 +44,6 @@ struct cq_desc { #define CQ_DESC_COMP_NDX_BITS 12 #define CQ_DESC_COMP_NDX_MASK ((1 << CQ_DESC_COMP_NDX_BITS) - 1) -static inline void cq_color_enc(struct cq_desc *desc, const u8 color) -{ - if (color) - desc->type_color |= (1 << CQ_DESC_COLOR_SHIFT); - else - desc->type_color &= ~(1 << CQ_DESC_COLOR_SHIFT); -} - static inline void cq_desc_enc(struct cq_desc *desc, const u8 type, const u8 color, const u16 q_number, const u16 completed_index) @@ -87,11 +79,4 @@ static inline void cq_desc_dec(const struct cq_desc *desc_arg, CQ_DESC_COMP_NDX_MASK; } -static inline void cq_color_dec(const struct cq_desc *desc_arg, u8 *color) -{ - volatile const struct cq_desc *desc = desc_arg; - - *color = (desc->type_color >> CQ_DESC_COLOR_SHIFT) & CQ_DESC_COLOR_MASK; -} - #endif /* _CQ_DESC_H_ */ diff --git a/sys/dev/enic/enic.h b/sys/dev/enic/enic.h index 8c221272654839..eec6de823c9de5 100644 --- a/sys/dev/enic/enic.h +++ b/sys/dev/enic/enic.h @@ -108,13 +108,13 @@ struct vnic_res { #define ENIC_DEFAULT_VXLAN_PORT 4789 /* - * Interrupt 0: LSC and errors * Interrupt 1: rx queue 0 * Interrupt 2: rx queue 1 * ... 
+ * Interrupt x: LSC and errors */ #define ENICPMD_LSC_INTR_OFFSET 0 -#define ENICPMD_RXQ_INTR_OFFSET 1 +#define ENICPMD_RXQ_INTR_OFFSET 0 #include "vnic_devcmd.h" @@ -152,6 +152,9 @@ struct vnic_dev { u64 args[VNIC_DEVCMD_NARGS]; int in_reset; struct vnic_intr_coal_timer_info intr_coal_timer_info; + struct devcmd2_controller *devcmd2; + int (*devcmd_rtn)(struct vnic_dev *vdev, enum vnic_devcmd_cmd cmd, + int wait); void *(*alloc_consistent)(void *priv, size_t size, bus_addr_t *dma_handle, struct iflib_dma_info *res, u8 *name); void (*free_consistent)(void *priv, size_t size, void *vaddr, @@ -175,6 +178,28 @@ struct intr_queue { struct enic_softc *softc; }; +#define ENIC_MAX_LINK_SPEEDS 3 +#define ENIC_LINK_SPEED_10G 10000 +#define ENIC_LINK_SPEED_4G 4000 +#define ENIC_LINK_40G_INDEX 2 +#define ENIC_LINK_10G_INDEX 1 +#define ENIC_LINK_4G_INDEX 0 +#define ENIC_RX_COALESCE_RANGE_END 125 +#define ENIC_AIC_TS_BREAK 100 + +struct enic_rx_coal { + u32 small_pkt_range_start; + u32 large_pkt_range_start; + u32 range_end; + u32 use_adaptive_rx_coalesce; +}; + +/* Store only the lower range. Higher range is given by fw. 
*/ +struct enic_intr_mod_range { + u32 small_pkt_range_start; + u32 large_pkt_range_start; +}; + struct enic { struct enic *next; struct rte_pci_device *pdev; @@ -267,6 +292,9 @@ struct enic { uint64_t tx_offload_mask; /* PKT_TX flags accepted */ struct enic_softc *softc; int port_mtu; + struct enic_rx_coal rx_coalesce_setting; + u32 rx_coalesce_usecs; + u32 tx_coalesce_usecs; }; struct enic_softc { @@ -307,11 +335,6 @@ struct enic_softc { /* Per-instance private data structure */ -static inline unsigned int enic_vnic_rq_count(struct enic *enic) -{ - return enic->rq_count; -} - static inline unsigned int enic_cq_rq(struct enic *enic, unsigned int rq) { return rq; @@ -322,21 +345,6 @@ static inline unsigned int enic_cq_wq(struct enic *enic, unsigned int wq) return enic->rq_count + wq; } -static inline uint32_t -enic_ring_add(uint32_t n_descriptors, uint32_t i0, uint32_t i1) -{ - uint32_t d = i0 + i1; - d -= (d >= n_descriptors) ? n_descriptors : 0; - return d; -} - -static inline uint32_t -enic_ring_sub(uint32_t n_descriptors, uint32_t i0, uint32_t i1) -{ - int32_t d = i1 - i0; - return (uint32_t)((d < 0) ? 
((int32_t)n_descriptors + d) : d); -} - static inline uint32_t enic_ring_incr(uint32_t n_descriptors, uint32_t idx) { @@ -346,34 +354,14 @@ enic_ring_incr(uint32_t n_descriptors, uint32_t idx) return idx; } -void enic_free_wq(void *txq); -int enic_alloc_intr_resources(struct enic *enic); int enic_setup_finish(struct enic *enic); -int enic_alloc_wq(struct enic *enic, uint16_t queue_idx, - unsigned int socket_id, uint16_t nb_desc); void enic_start_wq(struct enic *enic, uint16_t queue_idx); int enic_stop_wq(struct enic *enic, uint16_t queue_idx); void enic_start_rq(struct enic *enic, uint16_t queue_idx); -void enic_free_rq(void *rxq); -int enic_set_vnic_res(struct enic *enic); -int enic_init_rss_nic_cfg(struct enic *enic); -int enic_set_rss_reta(struct enic *enic, union vnic_rss_cpu *rss_cpu); -int enic_set_vlan_strip(struct enic *enic); +int enic_stop_rq(struct enic *enic, uint16_t queue_idx); +void enic_dev_disable(struct enic *enic); int enic_enable(struct enic *enic); int enic_disable(struct enic *enic); -void enic_remove(struct enic *enic); -int enic_get_link_status(struct enic *enic); -void enic_dev_stats_clear(struct enic *enic); -void enic_add_packet_filter(struct enic *enic); -int enic_set_mac_address(struct enic *enic, uint8_t *mac_addr); -int enic_del_mac_address(struct enic *enic, int mac_index); -unsigned int enic_cleanup_wq(struct enic *enic, struct vnic_wq *wq); - -void enic_post_wq_index(struct vnic_wq *wq); -int enic_probe(struct enic *enic); -int enic_clsf_init(struct enic *enic); -void enic_clsf_destroy(struct enic *enic); -int enic_set_mtu(struct enic *enic, uint16_t new_mtu); int enic_link_update(struct enic *enic); bool enic_use_vector_rx_handler(struct enic *enic); void enic_fdir_info(struct enic *enic); diff --git a/sys/dev/enic/enic_res.c b/sys/dev/enic/enic_res.c index d264874557a023..413873ad0fb446 100644 --- a/sys/dev/enic/enic_res.c +++ b/sys/dev/enic/enic_res.c @@ -95,11 +95,11 @@ int enic_get_vnic_config(struct enic *enic) 
dev_info(enic_get_dev(enic), "vNIC MAC addr %02x:%02x:%02x:%02x:%02x:%02x " - "wq/rq %d/%d mtu d, max mtu:%d\n", + "wq/rq %d/%d mtu %d, max mtu:%d\n", enic->mac_addr[0], enic->mac_addr[1], enic->mac_addr[2], enic->mac_addr[3], enic->mac_addr[4], enic->mac_addr[5], c->wq_desc_count, c->rq_desc_count, - /* enic->rte_dev->data->mtu, */ enic->max_mtu); + c->mtu, enic->max_mtu); dev_info(enic_get_dev(enic), "vNIC csum tx/rx %s/%s " "rss %s intr mode %s type %s timer %d usec " "loopback tag 0x%04x\n", diff --git a/sys/dev/enic/enic_res.h b/sys/dev/enic/enic_res.h index 1a6f3a3ca98f57..82963e61a44f6c 100644 --- a/sys/dev/enic/enic_res.h +++ b/sys/dev/enic/enic_res.h @@ -67,7 +67,5 @@ int enic_set_nic_cfg(struct enic *enic, u8 rss_default_cpu, u8 rss_hash_type, u8 ig_vlan_strip_en); void enic_get_res_counts(struct enic *enic); void enic_init_vnic_resources(struct enic *enic); -int enic_alloc_vnic_resources(struct enic *); -void enic_free_vnic_resources(struct enic *); #endif /* _ENIC_RES_H_ */ diff --git a/sys/dev/enic/enic_txrx.c b/sys/dev/enic/enic_txrx.c index 5a557fc7f94aa9..169041587d0625 100644 --- a/sys/dev/enic/enic_txrx.c +++ b/sys/dev/enic/enic_txrx.c @@ -103,6 +103,7 @@ enic_isc_txd_encap(void *vsc, if_pkt_info_t pi) softc = vsc; enic = &softc->enic; + if_softc_ctx_t scctx = softc->scctx; wq = &enic->wq[pi->ipi_qsidx]; nsegs = pi->ipi_nsegs; @@ -112,6 +113,9 @@ enic_isc_txd_encap(void *vsc, if_pkt_info_t pi) head_idx = wq->head_idx; desc_count = wq->ring.desc_count; + if ((scctx->isc_capenable & IFCAP_RXCSUM) != 0) + offload_mode |= WQ_ENET_OFFLOAD_MODE_CSUM; + for (i = 0; i < nsegs; i++) { eop = 0; cq = 0; @@ -320,7 +324,7 @@ enic_isc_rxd_flush(void *vsc, uint16_t rxqid, uint8_t flid, qidx_t pidx) static int enic_legacy_intr(void *xsc) { - return -1; + return (1); } static inline void @@ -375,7 +379,7 @@ enic_wq_service(struct vnic_dev *vdev, struct cq_desc *cq_desc, u8 type, vnic_wq_service(&enic->wq[q_number], cq_desc, completed_index, NULL, opaque); - return 
0; + return (0); } static void @@ -384,7 +388,7 @@ vnic_rq_service(struct vnic_rq *rq, struct cq_desc *cq_desc, void(*buf_service)(struct vnic_rq *rq, struct cq_desc *cq_desc, /* struct vnic_rq_buf * *buf, */ int skipped, void *opaque), void *opaque) { - + if_softc_ctx_t scctx; if_rxd_info_t ri = (if_rxd_info_t) opaque; u8 type, color, eop, sop, ingress_port, vlan_stripped; u8 fcoe, fcoe_sof, fcoe_fc_crc_ok, fcoe_enc_error, fcoe_eof; @@ -396,6 +400,8 @@ vnic_rq_service(struct vnic_rq *rq, struct cq_desc *cq_desc, int cqidx; if_rxd_frag_t frag; + scctx = rq->vdev->softc->scctx; + cq_enet_rq_desc_dec((struct cq_enet_rq_desc *)cq_desc, &type, &color, &q_number, &completed_index, &ingress_port, &fcoe, &eop, &sop, &rss_type, @@ -419,6 +425,11 @@ vnic_rq_service(struct vnic_rq *rq, struct cq_desc *cq_desc, ri->iri_cidx = cqidx; ri->iri_nfrags = 1; ri->iri_len = bytes_written; + + if ((scctx->isc_capenable & IFCAP_RXCSUM) != 0) + if (!csum_not_calc && (tcp_udp_csum_ok || ipv4_csum_ok)) { + ri->iri_csum_flags = (CSUM_IP_CHECKED | CSUM_IP_VALID); + } } static int @@ -431,7 +442,7 @@ enic_rq_service(struct vnic_dev *vdev, struct cq_desc *cq_desc, vnic_rq_service(&enic->rq[ri->iri_qsidx], cq_desc, completed_index, VNIC_RQ_RETURN_DESC, NULL, /* enic_rq_indicate_buf, */ opaque); - return 0; + return (0); } void @@ -468,10 +479,8 @@ enic_stop_wq(struct enic *enic, uint16_t queue_idx) int ret; ret = vnic_wq_disable(&enic->wq[queue_idx]); - if (ret) - return ret; - return 0; + return (ret); } void @@ -483,3 +492,19 @@ enic_start_rq(struct enic *enic, uint16_t queue_idx) vnic_rq_enable(rq); enic_initial_post_rx(enic, rq); } + +int +enic_stop_rq(struct enic *enic, uint16_t queue_idx) +{ + int ret; + + ret = vnic_rq_disable(&enic->rq[queue_idx]); + + return (ret); +} + + +void +enic_dev_disable(struct enic *enic) { + vnic_dev_disable(enic->vdev); +} diff --git a/sys/dev/enic/if_enic.c b/sys/dev/enic/if_enic.c index dc0c0d028e2043..26776244778e2f 100644 --- a/sys/dev/enic/if_enic.c 
+++ b/sys/dev/enic/if_enic.c @@ -201,11 +201,11 @@ static struct if_shared_ctx enic_sctx_init = { * descriptor */ .isc_rx_nsegments = 1, /* One mapping per descriptor */ .isc_rx_maxsegsize = ENIC_DEFAULT_RX_MAX_PKT_SIZE, - .isc_admin_intrcnt = 3, + .isc_admin_intrcnt = 2, .isc_vendor_info = enic_vendor_info_array, .isc_driver_version = "1", .isc_driver = &enic_iflib_driver, - .isc_flags = IFLIB_HAS_RXCQ | IFLIB_HAS_TXCQ, + .isc_flags = IFLIB_HAS_RXCQ | IFLIB_HAS_TXCQ | IFLIB_SKIP_MSIX, /* * Number of receive queues per receive queue set, with associated @@ -235,6 +235,99 @@ enic_register(device_t dev) return (&enic_sctx_init); } +static int +enic_allocate_msix(struct enic_softc *softc) { + if_ctx_t ctx; + if_softc_ctx_t scctx; + if_shared_ctx_t sctx; + device_t dev; + cpuset_t cpus; + int queues, vectors, requested; + int err = 0; + + dev = softc->dev; + ctx = softc->ctx; + scctx = softc->scctx; + sctx = iflib_get_sctx(ctx); + + if (bus_get_cpus(dev, INTR_CPUS, sizeof(cpus), &cpus) != 0) { + device_printf(dev, "Unable to fetch CPU list\n"); + CPU_COPY(&all_cpus, &cpus); + } + + + queues = CPU_COUNT(&cpus); + queues = imin(queues, scctx->isc_nrxqsets); + queues = imin(queues, scctx->isc_ntxqsets); + requested = queues * 2 + sctx->isc_admin_intrcnt; + scctx->isc_nrxqsets = queues; + scctx->isc_ntxqsets = queues; + + vectors = requested; + if ((err = pci_alloc_msix(dev, &vectors)) != 0) { + device_printf(dev, + "failed to allocate %d MSI-X vectors, err: %d\n", requested, + err); + err = 1; + goto enic_allocate_msix_out; + } else { + if (vectors != requested) { + device_printf(dev, + "Unable to allocate sufficient MSI-X vectors " + "(got %d, need %d)\n", requested, vectors); + pci_release_msi(dev); + err = 1; + goto enic_allocate_msix_out; + } + } + + device_printf(dev, "Using MSI-X interrupts with %d vectors\n", + vectors); + + scctx->isc_intr = IFLIB_INTR_MSIX; + scctx->isc_vectors = vectors; + +enic_allocate_msix_out: + return (err); + +} + +static struct 
enic_intr_mod_range mod_range[ENIC_MAX_LINK_SPEEDS] = { + {0, 0}, /* 0 - 4 Gbps */ + {0, 3}, /* 4 - 10 Gbps */ + {3, 6}, /* 10 - 40 Gbps */ +}; + +static void enic_set_rx_coal_setting(struct enic *enic) +{ + unsigned int speed; + int index = -1; + struct enic_rx_coal *rx_coal = &enic->rx_coalesce_setting; + + /* 1. Read the link speed from fw + * 2. Pick the default range for the speed + * 3. Update it in enic->rx_coalesce_setting + */ + speed = vnic_dev_port_speed(enic->vdev); + if (ENIC_LINK_SPEED_10G < speed) + index = ENIC_LINK_40G_INDEX; + else if (ENIC_LINK_SPEED_4G < speed) + index = ENIC_LINK_10G_INDEX; + else + index = ENIC_LINK_4G_INDEX; + + rx_coal->small_pkt_range_start = mod_range[index].small_pkt_range_start; + rx_coal->large_pkt_range_start = mod_range[index].large_pkt_range_start; + rx_coal->range_end = ENIC_RX_COALESCE_RANGE_END; + + /* Start with the value provided by UCSM */ + for (index = 0; index < enic->rq_count; index++) + enic->cq[index].cur_rx_coal_timeval = + enic->config.intr_timer_usec; + + rx_coal->use_adaptive_rx_coalesce = 1; +} + static int enic_attach_pre(if_ctx_t ctx) { @@ -283,6 +376,8 @@ enic_attach_pre(if_ctx_t ctx) ENIC_LOCK(softc); vnic_dev_register(vdev, &softc->mem, 1); enic->vdev = vdev; + vnic_dev_cmd_init(enic->vdev); + vdev->devcmd = vnic_dev_get_res(vdev, RES_TYPE_DEVCMD, 0); vnic_dev_cmd(vdev, CMD_INIT_v1, &a0, &a1, wait); @@ -326,6 +421,7 @@ enic_attach_pre(if_ctx_t ctx) /* Set ingress vlan rewrite mode before vnic initialization */ enic->ig_vlan_rewrite_mode = IG_VLAN_REWRITE_MODE_UNTAG_DEFAULT_VLAN; + enic->ig_vlan_rewrite_mode = IG_VLAN_REWRITE_MODE_PRIORITY_TAG_DEFAULT_VLAN; err = vnic_dev_set_ig_vlan_rewrite_mode(enic->vdev, enic->ig_vlan_rewrite_mode); if (err) { @@ -360,8 +456,10 @@ enic_attach_pre(if_ctx_t ctx) softc->scctx = iflib_get_softc_ctx(ctx); scctx = softc->scctx; scctx->isc_txrx = &enic_txrx; - scctx->isc_capabilities = scctx->isc_capenable = 0; + scctx->isc_capabilities = scctx->isc_capenable = \ + 
IFCAP_HWCSUM; scctx->isc_tx_csum_flags = 0; + if_setmtu(softc->ifp, enic->config.mtu); scctx->isc_max_frame_size = enic->config.mtu + ETHER_HDR_LEN + \ ETHER_CRC_LEN; scctx->isc_nrxqsets_max = enic->conf_rq_count; @@ -389,7 +487,6 @@ enic_attach_pre(if_ctx_t ctx) } scctx->isc_tx_nsegments = 31; - scctx->isc_vectors = enic->conf_cq_count; scctx->isc_msix_bar = -1; ifmedia_add(softc->media, IFM_ETHER | IFM_AUTO, 0, NULL); @@ -416,12 +513,20 @@ enic_attach_pre(if_ctx_t ctx) err = vnic_dev_alloc_stats_mem(enic->vdev); if (err) { dev_err(enic, "Failed to allocate cmd memory, aborting\n"); + goto err_out_dev_close; + } + + err = enic_allocate_msix(softc); + if (err) { + dev_err(enic, "Failed to allocate MSIX, aborting\n"); + goto err_out_dev_close; } return (rc); err_out_dev_close: vnic_dev_close(enic->vdev); + vnic_dev_deinit_devcmd2(enic->vdev); err_out_unregister: free(softc->vdev.devcmd, M_DEVBUF); free(softc->enic.intr_queues, M_DEVBUF); @@ -482,9 +587,10 @@ enic_msix_intr_assign(if_ctx_t ctx, int msix) snprintf(irq_name, sizeof(irq_name), "etxq%d:%d", i - scctx->isc_nrxqsets, device_get_unit(softc->dev)); - - iflib_softirq_alloc_generic(ctx, &enic->intr_queues[i].intr_irq, IFLIB_INTR_TX, &enic->wq[i - scctx->isc_nrxqsets], i - scctx->isc_nrxqsets, irq_name); - + iflib_softirq_alloc_generic(ctx, + &enic->intr_queues[i].intr_irq, IFLIB_INTR_TX, + &enic->wq[i - scctx->isc_nrxqsets], i - scctx->isc_nrxqsets, + irq_name); enic->intr[i].index = i; enic->intr[i].vdev = enic->vdev; @@ -567,6 +673,7 @@ enic_attach_post(if_ctx_t ctx) enic_setup_sysctl(softc); enic_init_vnic_resources(enic); + enic_set_rx_coal_setting(enic); enic_setup_finish(enic); ifmedia_add(softc->media, IFM_ETHER | IFM_AUTO, 0, NULL); @@ -589,7 +696,9 @@ enic_detach(if_ctx_t ctx) enic_free_irqs(softc); ENIC_LOCK(softc); + vnic_dev_deinit(enic->vdev); vnic_dev_close(enic->vdev); + vnic_dev_deinit_devcmd2(enic->vdev); free(softc->vdev.devcmd, M_DEVBUF); pci_disable_busmaster(softc->dev); 
enic_pci_mapping_free(softc); @@ -807,6 +916,11 @@ enic_stop(if_ctx_t ctx) struct enic *enic; if_softc_ctx_t scctx; unsigned int index; + struct vnic_wq *wq; + struct vnic_rq *rq; + struct vnic_cq *cq; + unsigned int cq_wq, cq_rq; + softc = iflib_get_softc(ctx); scctx = softc->scctx; @@ -817,15 +931,36 @@ enic_stop(if_ctx_t ctx) softc->link_active = 0; softc->stopped = 1; + enic_dev_disable(enic); + for (index = 0; index < scctx->isc_ntxqsets; index++) { enic_stop_wq(enic, index); vnic_wq_clean(&enic->wq[index]); vnic_cq_clean(&enic->cq[enic_cq_rq(enic, index)]); + + wq = &softc->enic.wq[index]; + wq->ring.desc_avail = wq->ring.desc_count - 1; + wq->ring.last_count = wq->ring.desc_count; + wq->head_idx = 0; + wq->tail_idx = 0; + + cq_wq = enic_cq_wq(&softc->enic, index); + cq = &softc->enic.cq[cq_wq]; + cq->ring.desc_avail = cq->ring.desc_count - 1; } for (index = 0; index < scctx->isc_nrxqsets; index++) { + enic_stop_rq(enic, index); vnic_rq_clean(&enic->rq[index]); vnic_cq_clean(&enic->cq[enic_cq_wq(enic, index)]); + + rq = &softc->enic.rq[index]; + cq_rq = enic_cq_rq(&softc->enic, index); + cq = &softc->enic.cq[cq_rq]; + + cq->ring.desc_avail = cq->ring.desc_count - 1; + rq->ring.desc_avail = rq->ring.desc_count - 1; + rq->need_initial_post = true; } for (index = 0; index < scctx->isc_vectors; index++) { @@ -845,6 +980,9 @@ enic_init(if_ctx_t ctx) scctx = softc->scctx; enic = &softc->enic; + + enic_init_vnic_resources(enic); + for (index = 0; index < scctx->isc_ntxqsets; index++) enic_prep_wq_for_simple_tx(&softc->enic, index); @@ -862,6 +1000,8 @@ enic_init(if_ctx_t ctx) vnic_dev_enable_wait(enic->vdev); ENIC_UNLOCK(softc); + softc->stopped = 0; + enic_link_status(softc); } @@ -942,12 +1082,14 @@ enic_mtu_set(if_ctx_t ctx, uint32_t mtu) softc = iflib_get_softc(ctx); enic = &softc->enic; + enic_stop(softc->ctx); if (mtu > enic->port_mtu){ return (EINVAL); } enic->config.mtu = mtu; scctx->isc_max_frame_size = mtu + ETHER_HDR_LEN + ETHER_CRC_LEN; + 
enic_init(softc->ctx); return (0); } @@ -1026,7 +1168,6 @@ static void enic_update_admin_status(if_ctx_t ctx) { struct enic_softc *softc; - softc = iflib_get_softc(ctx); enic_link_status(softc); @@ -1357,7 +1498,7 @@ enic_dev_init(struct enic *enic) if (vnic_dev_overlay_offload_cfg(enic->vdev, OVERLAY_CFG_VXLAN_PORT_UPDATE, ENIC_DEFAULT_VXLAN_PORT)) { dev_err(enic, "failed to update vxlan port\n"); - return -EINVAL; + return (EINVAL); } } return 0; @@ -1441,7 +1582,7 @@ enic_dev_wait(struct vnic_dev *vdev, int (*start) (struct vnic_dev *, int), return 0; usleep(1000); } - return -ETIMEDOUT; + return (ETIMEDOUT); } static int @@ -1452,7 +1593,7 @@ enic_map_bar(struct enic_softc *softc, struct enic_bar_info *bar, int bar_num, if (bar->res != NULL) { device_printf(softc->dev, "Bar %d already mapped\n", bar_num); - return EDOOFUS; + return (EDOOFUS); } bar->rid = PCIR_BAR(bar_num); @@ -1481,20 +1622,18 @@ enic_init_vnic_resources(struct enic *enic) unsigned int rxq_interrupt_enable = 0; unsigned int rxq_interrupt_offset = ENICPMD_RXQ_INTR_OFFSET; unsigned int txq_interrupt_enable = 0; - unsigned int txq_interrupt_offset = ENICPMD_RXQ_INTR_OFFSET; + unsigned int txq_interrupt_offset; unsigned int index = 0; unsigned int cq_idx; if_softc_ctx_t scctx; scctx = enic->softc->scctx; - rxq_interrupt_enable = 1; - txq_interrupt_enable = 1; + txq_interrupt_enable = 0; rxq_interrupt_offset = 0; - txq_interrupt_offset = enic->intr_count - 2; - txq_interrupt_offset = 1; + txq_interrupt_offset = scctx->isc_nrxqsets; for (index = 0; index < enic->intr_count; index++) { vnic_intr_alloc(enic->vdev, &enic->intr[index], index); @@ -1568,7 +1707,7 @@ enic_update_packet_filter(struct enic *enic) } static bool -enic_if_needs_restart(if_ctx_t ctx __unused, enum iflib_restart_event event) +enic_if_needs_restart(if_ctx_t ctx, enum iflib_restart_event event) { switch (event) { case IFLIB_RESTART_VLAN_CONFIG: diff --git a/sys/dev/enic/vnic_cq.h b/sys/dev/enic/vnic_cq.h index 
26f9009612c5dd..b4549ee58c64b2 100644 --- a/sys/dev/enic/vnic_cq.h +++ b/sys/dev/enic/vnic_cq.h @@ -63,6 +63,8 @@ struct vnic_cq { unsigned int to_clean; unsigned int last_color; unsigned int interrupt_offset; + unsigned int cur_rx_coal_timeval; + unsigned int tobe_rx_coal_timeval; #ifdef ENIC_AIC struct vnic_rx_bytes_counter pkt_size_counter; unsigned int cur_rx_coal_timeval; @@ -75,15 +77,12 @@ struct vnic_cq { int nrxqsets_start; }; -void vnic_cq_free(struct vnic_cq *cq); void vnic_cq_init(struct vnic_cq *cq, unsigned int flow_control_enable, unsigned int color_enable, unsigned int cq_head, unsigned int cq_tail, unsigned int cq_tail_color, unsigned int interrupt_enable, unsigned int cq_entry_enable, unsigned int message_enable, unsigned int interrupt_offset, u64 message_addr); void vnic_cq_clean(struct vnic_cq *cq); -int vnic_cq_mem_size(struct vnic_cq *cq, unsigned int desc_count, - unsigned int desc_size); static inline unsigned int vnic_cq_service(struct vnic_cq *cq, unsigned int work_to_do, diff --git a/sys/dev/enic/vnic_dev.c b/sys/dev/enic/vnic_dev.c index 3425d7372e5683..2d555cb2b34dc3 100644 --- a/sys/dev/enic/vnic_dev.c +++ b/sys/dev/enic/vnic_dev.c @@ -44,7 +44,7 @@ static int vnic_dev_discover_res(struct vnic_dev *vdev, u8 type; if (num_bars == 0) - return -EINVAL; + return (EINVAL); rh = malloc(sizeof(*rh), M_DEVBUF, M_NOWAIT | M_ZERO); mrh = malloc(sizeof(*mrh), M_DEVBUF, M_NOWAIT | M_ZERO); @@ -52,7 +52,7 @@ static int vnic_dev_discover_res(struct vnic_dev *vdev, pr_err("vNIC BAR0 res hdr not mem-mapped\n"); free(rh, M_DEVBUF); free(mrh, M_DEVBUF); - return -EINVAL; + return (EINVAL); } /* Check for mgmt vnic in addition to normal vnic */ @@ -69,7 +69,7 @@ static int vnic_dev_discover_res(struct vnic_dev *vdev, rh->magic, rh->version); free(rh, M_DEVBUF); free(mrh, M_DEVBUF); - return -EINVAL; + return (EINVAL); } } @@ -97,6 +97,7 @@ static int vnic_dev_discover_res(struct vnic_dev *vdev, case RES_TYPE_INTR_CTRL: case RES_TYPE_INTR_PBA_LEGACY: case 
RES_TYPE_DEVCMD: + case RES_TYPE_DEVCMD2: break; default: ENIC_BUS_READ_REGION_4(softc, mem, r_offset, (void *)r, sizeof(*r) / 4); @@ -189,12 +190,12 @@ static int _vnic_dev_cmd(struct vnic_dev *vdev, enum vnic_devcmd_cmd cmd, status = ENIC_BUS_READ_4(devcmd, DEVCMD_STATUS); if (status == 0xFFFFFFFF) { /* PCI-e target device is gone */ - return -ENODEV; + return (ENODEV); } if (status & STAT_BUSY) { pr_err("Busy devcmd %d\n", _CMD_N(cmd)); - return -EBUSY; + return (EBUSY); } if (_CMD_DIR(cmd) & _CMD_DIR_WRITE) { @@ -214,7 +215,7 @@ static int _vnic_dev_cmd(struct vnic_dev *vdev, enum vnic_devcmd_cmd cmd, status = ENIC_BUS_READ_4(devcmd, DEVCMD_STATUS); if (status == 0xFFFFFFFF) { /* PCI-e target device is gone */ - return -ENODEV; + return (ENODEV); } if (!(status & STAT_BUSY)) { @@ -225,7 +226,7 @@ static int _vnic_dev_cmd(struct vnic_dev *vdev, enum vnic_devcmd_cmd cmd, pr_err("Devcmd %d failed " \ "with error code %d\n", _CMD_N(cmd), err); - return err; + return (err); } if (_CMD_DIR(cmd) & _CMD_DIR_READ) { @@ -237,7 +238,82 @@ static int _vnic_dev_cmd(struct vnic_dev *vdev, enum vnic_devcmd_cmd cmd, } pr_err("Timedout devcmd %d\n", _CMD_N(cmd)); - return -ETIMEDOUT; + return (ETIMEDOUT); +} + +static int _vnic_dev_cmd2(struct vnic_dev *vdev, enum vnic_devcmd_cmd cmd, + int wait) +{ + struct devcmd2_controller *dc2c = vdev->devcmd2; + struct devcmd2_result *result; + u8 color; + unsigned int i; + u32 fetch_index, new_posted; + int delay, err; + u32 posted = dc2c->posted; + + fetch_index = ENIC_BUS_READ_4(dc2c->wq_ctrl, TX_FETCH_INDEX); + if (fetch_index == 0xFFFFFFFF) + return (ENODEV); + + new_posted = (posted + 1) % DEVCMD2_RING_SIZE; + + if (new_posted == fetch_index) { + device_printf(dev_from_vnic_dev(vdev), + "devcmd2 %d: wq is full. 
fetch index: %u, posted index: %u\n", + _CMD_N(cmd), fetch_index, posted); + return (EBUSY); + } + + dc2c->cmd_ring[posted].cmd = cmd; + dc2c->cmd_ring[posted].flags = 0; + + if ((_CMD_FLAGS(cmd) & _CMD_FLAGS_NOWAIT)) + dc2c->cmd_ring[posted].flags |= DEVCMD2_FNORESULT; + if (_CMD_DIR(cmd) & _CMD_DIR_WRITE) + for (i = 0; i < VNIC_DEVCMD_NARGS; i++) + dc2c->cmd_ring[posted].args[i] = vdev->args[i]; + + ENIC_BUS_WRITE_4(dc2c->wq_ctrl, TX_POSTED_INDEX, new_posted); + dc2c->posted = new_posted; + + if (dc2c->cmd_ring[posted].flags & DEVCMD2_FNORESULT) + return (0); + + result = dc2c->result + dc2c->next_result; + color = dc2c->color; + + dc2c->next_result++; + if (dc2c->next_result == dc2c->result_size) { + dc2c->next_result = 0; + dc2c->color = dc2c->color ? 0 : 1; + } + + for (delay = 0; delay < wait; delay++) { + if (result->color == color) { + if (result->error) { + err = result->error; + if (err != ERR_ECMDUNKNOWN || + cmd != CMD_CAPABILITY) + device_printf(dev_from_vnic_dev(vdev), + "Error %d devcmd %d\n", err, + _CMD_N(cmd)); + return (err); + } + if (_CMD_DIR(cmd) & _CMD_DIR_READ) + for (i = 0; i < VNIC_DEVCMD2_NARGS; i++) + vdev->args[i] = result->results[i]; + + return 0; + } + udelay(100); + } + + device_printf(dev_from_vnic_dev(vdev), + "devcmd %d timed out\n", _CMD_N(cmd)); + + + return (ETIMEDOUT); } static int vnic_dev_cmd_proxy(struct vnic_dev *vdev, @@ -253,7 +329,7 @@ static int vnic_dev_cmd_proxy(struct vnic_dev *vdev, */ if (nargs > VNIC_DEVCMD_NARGS - 2) { pr_err("number of args %d exceeds the maximum\n", nargs); - return -EINVAL; + return (EINVAL); } memset(vdev->args, 0, sizeof(vdev->args)); @@ -261,9 +337,9 @@ static int vnic_dev_cmd_proxy(struct vnic_dev *vdev, vdev->args[1] = cmd; memcpy(&vdev->args[2], args, nargs * sizeof(args[0])); - err = _vnic_dev_cmd(vdev, proxy_cmd, wait); + err = vdev->devcmd_rtn(vdev, proxy_cmd, wait); if (err) - return err; + return (err); status = (u32)vdev->args[0]; if (status & STAT_ERROR) { @@ -271,7 +347,7 @@ 
static int vnic_dev_cmd_proxy(struct vnic_dev *vdev, if (err != ERR_ECMDUNKNOWN || cmd != CMD_CAPABILITY) pr_err("Error %d proxy devcmd %d\n", err, _CMD_N(cmd)); - return err; + return (err); } memcpy(args, &vdev->args[1], nargs * sizeof(args[0])); @@ -286,16 +362,16 @@ static int vnic_dev_cmd_no_proxy(struct vnic_dev *vdev, if (nargs > VNIC_DEVCMD_NARGS) { pr_err("number of args %d exceeds the maximum\n", nargs); - return -EINVAL; + return (EINVAL); } memset(vdev->args, 0, sizeof(vdev->args)); memcpy(vdev->args, args, nargs * sizeof(args[0])); - err = _vnic_dev_cmd(vdev, cmd, wait); + err = vdev->devcmd_rtn(vdev, cmd, wait); memcpy(args, vdev->args, nargs * sizeof(args[0])); - return err; + return (err); } int vnic_dev_cmd(struct vnic_dev *vdev, enum vnic_devcmd_cmd cmd, @@ -328,7 +404,7 @@ int vnic_dev_cmd(struct vnic_dev *vdev, enum vnic_devcmd_cmd cmd, *a1 = args[1]; } - return err; + return (err); } int vnic_dev_cmd_args(struct vnic_dev *vdev, enum vnic_devcmd_cmd cmd, @@ -400,7 +476,7 @@ int vnic_dev_capable_filter_mode(struct vnic_dev *vdev, u32 *mode, args[1] = 0; err = vnic_dev_cmd_args(vdev, CMD_CAPABILITY, args, 2, 1000); if (err) - return err; + return (err); max_level = args[1]; goto parse_max_level; } else if (args[2] == FILTER_CAP_MODE_V1) { @@ -479,7 +555,7 @@ int vnic_dev_spec(struct vnic_dev *vdev, unsigned int offset, size_t size, break; } - return err; + return (err); } int vnic_dev_stats_clear(struct vnic_dev *vdev) @@ -497,7 +573,7 @@ int vnic_dev_stats_dump(struct vnic_dev *vdev, struct vnic_stats **stats) int rc; if (!vdev->stats) - return -ENOMEM; + return (ENOMEM); *stats = vdev->stats; a0 = vdev->stats_res.idi_paddr; @@ -524,10 +600,10 @@ int vnic_dev_counter_dma_cfg(struct vnic_dev *vdev, u32 period, int err; if (num_counters > VNIC_MAX_FLOW_COUNTERS) - return -ENOMEM; + return (ENOMEM); if (period > 0 && (period < VNIC_COUNTER_DMA_MIN_PERIOD || num_counters == 0)) - return -EINVAL; + return (EINVAL); args[0] = num_counters; args[1] = 
vdev->flow_counters_res.idi_paddr; @@ -545,7 +621,7 @@ int vnic_dev_counter_dma_cfg(struct vnic_dev *vdev, u32 period, vdev->flow_counters_dma_active = (num_counters != 0 && period != 0); - return err; + return (err); } int vnic_dev_close(struct vnic_dev *vdev) @@ -593,7 +669,7 @@ int vnic_dev_open_done(struct vnic_dev *vdev, int *done) err = vnic_dev_cmd(vdev, CMD_OPEN_STATUS, &a0, &a1, wait); if (err) - return err; + return (err); *done = (a0 == 0); @@ -611,7 +687,7 @@ int vnic_dev_get_mac_addr(struct vnic_dev *vdev, u8 *mac_addr) err = vnic_dev_cmd(vdev, CMD_GET_MAC_ADDR, &a0, &a1, wait); if (err) - return err; + return (err); for (i = 0; i < ETH_ALEN; i++) mac_addr[i] = ((u8 *)&a0)[i]; @@ -636,7 +712,7 @@ int vnic_dev_packet_filter(struct vnic_dev *vdev, int directed, int multicast, if (err) pr_err("Can't set packet filter\n"); - return err; + return (err); } int vnic_dev_add_addr(struct vnic_dev *vdev, u8 *addr) @@ -655,7 +731,7 @@ int vnic_dev_add_addr(struct vnic_dev *vdev, u8 *addr) addr[0], addr[1], addr[2], addr[3], addr[4], addr[5], err); - return err; + return (err); } int vnic_dev_del_addr(struct vnic_dev *vdev, u8 *addr) @@ -674,7 +750,7 @@ int vnic_dev_del_addr(struct vnic_dev *vdev, u8 *addr) addr[0], addr[1], addr[2], addr[3], addr[4], addr[5], err); - return err; + return (err); } int vnic_dev_set_ig_vlan_rewrite_mode(struct vnic_dev *vdev, @@ -771,7 +847,7 @@ int vnic_dev_notify_unsetcmd(struct vnic_dev *vdev) vdev->notify_sz = 0; } - return err; + return (err); } int vnic_dev_notify_unset(struct vnic_dev *vdev) @@ -807,7 +883,8 @@ static int vnic_dev_notify_ready(struct vnic_dev *vdev) csum += words[i]; } while (csum != words[0]); - return 1; + + return (1); } int vnic_dev_init(struct vnic_dev *vdev, int arg) @@ -923,7 +1000,7 @@ int vnic_dev_alloc_counter_mem(struct vnic_dev *vdev) iflib_dma_alloc(softc->ctx, sizeof(struct vnic_counter_counts) * VNIC_MAX_FLOW_COUNTERS, &vdev->flow_counters_res, 0); vdev->flow_counters = (struct 
vnic_counter_counts *)vdev->flow_counters_res.idi_vaddr; vdev->flow_counters_dma_active = 0; - return vdev->flow_counters == NULL ? -ENOMEM : 0; + return (vdev->flow_counters == NULL ? ENOMEM : 0); } struct vnic_dev *vnic_dev_register(struct vnic_dev *vdev, @@ -942,6 +1019,85 @@ struct vnic_dev *vnic_dev_register(struct vnic_dev *vdev, return NULL; } +static int vnic_dev_init_devcmd1(struct vnic_dev *vdev) +{ + vdev->devcmd = vnic_dev_get_res(vdev, RES_TYPE_DEVCMD, 0); + if (!vdev->devcmd) + return (ENODEV); + vdev->devcmd_rtn = _vnic_dev_cmd; + + return 0; +} + +static int vnic_dev_init_devcmd2(struct vnic_dev *vdev) +{ + int err; + unsigned int fetch_index; + + + err = 0; + + if (vdev->devcmd2) + return (0); + + vdev->devcmd2 = malloc(sizeof(*vdev->devcmd2), M_DEVBUF, + M_NOWAIT | M_ZERO); + + if (!vdev->devcmd2) { + return (ENOMEM); + } + + vdev->devcmd2->color = 1; + vdev->devcmd2->result_size = DEVCMD2_RING_SIZE; + + err = enic_wq_devcmd2_alloc(vdev, &vdev->devcmd2->wq, DEVCMD2_RING_SIZE, + DEVCMD2_DESC_SIZE); + + if (err) { + goto err_free_devcmd2; + } + vdev->devcmd2->wq_ctrl = vdev->devcmd2->wq.ctrl; + vdev->devcmd2->cmd_ring = vdev->devcmd2->wq.ring.descs; + + fetch_index = ENIC_BUS_READ_4(vdev->devcmd2->wq.ctrl, TX_FETCH_INDEX); + if (fetch_index == 0xFFFFFFFF) + return (ENODEV); + + enic_wq_init_start(&vdev->devcmd2->wq, 0, fetch_index, fetch_index, 0, + 0); + vdev->devcmd2->posted = fetch_index; + vnic_wq_enable(&vdev->devcmd2->wq); + + err = vnic_dev_alloc_desc_ring(vdev, &vdev->devcmd2->results_ring, + DEVCMD2_RING_SIZE, DEVCMD2_DESC_SIZE); + if (err) + goto err_free_devcmd2; + + vdev->devcmd2->result = vdev->devcmd2->results_ring.descs; + vdev->args[0] = (u64)vdev->devcmd2->results_ring.base_addr | + VNIC_PADDR_TARGET; + vdev->args[1] = DEVCMD2_RING_SIZE; + + err = _vnic_dev_cmd2(vdev, CMD_INITIALIZE_DEVCMD2, 1000); + if (err) + goto err_free_devcmd2; + + vdev->devcmd_rtn = _vnic_dev_cmd2; + + return (err); + +err_free_devcmd2: + err = ENOMEM; + if 
(vdev->devcmd2->wq_ctrl) + vnic_wq_free(&vdev->devcmd2->wq); + if (vdev->devcmd2->result) + vnic_dev_free_desc_ring(vdev, &vdev->devcmd2->results_ring); + free(vdev->devcmd2, M_DEVBUF); + vdev->devcmd2 = NULL; + + return (err); +} + /* * vnic_dev_classifier: Add/Delete classifier entries * @vdev: vdev of the device @@ -1037,3 +1193,22 @@ bool vnic_dev_counter_query(struct vnic_dev *vdev, uint32_t idx, device_t dev_from_vnic_dev(struct vnic_dev *vdev) { return (vdev->softc->dev); } + +int vnic_dev_cmd_init(struct vnic_dev *vdev) { + int err; + void __iomem *res; + + res = vnic_dev_get_res(vdev, RES_TYPE_DEVCMD2, 0); + if (res) { + err = vnic_dev_init_devcmd2(vdev); + if (err) + device_printf(dev_from_vnic_dev(vdev), + "DEVCMD2 init failed, Using DEVCMD1\n"); + else + return 0; + } + + err = vnic_dev_init_devcmd1(vdev); + + return (err); +} diff --git a/sys/dev/enic/vnic_dev.h b/sys/dev/enic/vnic_dev.h index f8ca29f4e175b2..5e2d01d985f3f5 100644 --- a/sys/dev/enic/vnic_dev.h +++ b/sys/dev/enic/vnic_dev.h @@ -38,6 +38,7 @@ struct vnic_dev_ring { unsigned int desc_count; unsigned int desc_avail; unsigned int last_count; + iflib_dma_info_t ifdip; }; struct vnic_dev_iomap_info { @@ -69,6 +70,10 @@ unsigned long vnic_dev_get_res_type_len(struct vnic_dev *vdev, unsigned int vnic_dev_desc_ring_size(struct vnic_dev_ring *ring, unsigned int desc_count, unsigned int desc_size); void vnic_dev_clear_desc_ring(struct vnic_dev_ring *ring); +int vnic_dev_alloc_desc_ring(struct vnic_dev *vdev, struct vnic_dev_ring *ring, + unsigned int desc_count, unsigned int desc_size); +void vnic_dev_free_desc_ring(struct vnic_dev *vdev, + struct vnic_dev_ring *ring); int vnic_dev_cmd(struct vnic_dev *vdev, enum vnic_devcmd_cmd cmd, u64 *a0, u64 *a1, int wait); int vnic_dev_cmd_args(struct vnic_dev *vdev, enum vnic_devcmd_cmd cmd, @@ -143,7 +148,7 @@ struct vnic_dev *vnic_dev_register(struct vnic_dev *vdev, struct rte_pci_device *vnic_dev_get_pdev(struct vnic_dev *vdev); int 
vnic_dev_alloc_stats_mem(struct vnic_dev *vdev); int vnic_dev_alloc_counter_mem(struct vnic_dev *vdev); -int vnic_dev_cmd_init(struct vnic_dev *vdev, int fallback); +int vnic_dev_cmd_init(struct vnic_dev *vdev); int vnic_dev_get_size(void); int vnic_dev_int13(struct vnic_dev *vdev, u64 arg, u32 op); int vnic_dev_perbi(struct vnic_dev *vdev, u64 arg, u32 op); @@ -164,6 +169,7 @@ bool vnic_dev_counter_alloc(struct vnic_dev *vdev, uint32_t *idx); bool vnic_dev_counter_free(struct vnic_dev *vdev, uint32_t idx); bool vnic_dev_counter_query(struct vnic_dev *vdev, uint32_t idx, bool reset, uint64_t *packets, uint64_t *bytes); +void vnic_dev_deinit_devcmd2(struct vnic_dev *vdev); device_t dev_from_vnic_dev(struct vnic_dev *vdev); diff --git a/sys/dev/enic/vnic_intr.c b/sys/dev/enic/vnic_intr.c index 38e2ea6e066bbd..8a6494efd5f311 100644 --- a/sys/dev/enic/vnic_intr.c +++ b/sys/dev/enic/vnic_intr.c @@ -21,7 +21,7 @@ int vnic_intr_alloc(struct vnic_dev *vdev, struct vnic_intr *intr, intr->ctrl = vnic_dev_get_res(vdev, RES_TYPE_INTR_CTRL, index); if (!intr->ctrl) { pr_err("Failed to hook INTR[%d].ctrl resource\n", index); - return -EINVAL; + return (EINVAL); } return 0; diff --git a/sys/dev/enic/vnic_intr.h b/sys/dev/enic/vnic_intr.h index 22db66096aaefb..6d1e8e1cf050c5 100644 --- a/sys/dev/enic/vnic_intr.h +++ b/sys/dev/enic/vnic_intr.h @@ -76,7 +76,7 @@ static inline void vnic_intr_return_credits(struct vnic_intr *intr, static inline unsigned int vnic_intr_credits(struct vnic_intr *intr) { - return ENIC_BUS_READ_4(intr->ctrl, INTR_CREDITS); + return (ENIC_BUS_READ_4(intr->ctrl, INTR_CREDITS)); } static inline void vnic_intr_return_all_credits(struct vnic_intr *intr) diff --git a/sys/dev/enic/vnic_resource.h b/sys/dev/enic/vnic_resource.h index 184bfa7401df87..d365b8d914bac8 100644 --- a/sys/dev/enic/vnic_resource.h +++ b/sys/dev/enic/vnic_resource.h @@ -39,6 +39,7 @@ enum vnic_res_type { RES_TYPE_MQ_RQ, /* MQ Receive queues */ RES_TYPE_MQ_CQ, /* MQ Completion queues */ 
RES_TYPE_DEPRECATED1, /* Old version of devcmd 2 */ + RES_TYPE_DEPRECATED2, /* Old version of devcmd 2 */ RES_TYPE_DEVCMD2, /* Device control region */ RES_TYPE_MAX, /* Count of resource types */ }; diff --git a/sys/dev/enic/vnic_rq.c b/sys/dev/enic/vnic_rq.c index 3720da5f9aa69b..ef30563fa2f347 100644 --- a/sys/dev/enic/vnic_rq.c +++ b/sys/dev/enic/vnic_rq.c @@ -40,6 +40,7 @@ void vnic_rq_init(struct vnic_rq *rq, unsigned int cq_index, fetch_index = 0; } + fetch_index = 0; vnic_rq_init_start(rq, cq_index, fetch_index, fetch_index, error_interrupt_enable, @@ -50,7 +51,7 @@ void vnic_rq_init(struct vnic_rq *rq, unsigned int cq_index, unsigned int vnic_rq_error_status(struct vnic_rq *rq) { - return ENIC_BUS_READ_4(rq->ctrl, RX_ERROR_STATUS); + return (ENIC_BUS_READ_4(rq->ctrl, RX_ERROR_STATUS)); } void vnic_rq_enable(struct vnic_rq *rq) @@ -73,7 +74,7 @@ int vnic_rq_disable(struct vnic_rq *rq) pr_err("Failed to disable RQ[%d]\n", rq->index); - return -ETIMEDOUT; + return (ETIMEDOUT); } void vnic_rq_clean(struct vnic_rq *rq) diff --git a/sys/dev/enic/vnic_rq.h b/sys/dev/enic/vnic_rq.h index ae8c1fdc39bdaf..9e3d239809c424 100644 --- a/sys/dev/enic/vnic_rq.h +++ b/sys/dev/enic/vnic_rq.h @@ -133,7 +133,6 @@ void vnic_rq_init_start(struct vnic_rq *rq, unsigned int cq_index, void vnic_rq_init(struct vnic_rq *rq, unsigned int cq_index, unsigned int error_interrupt_enable, unsigned int error_interrupt_offset); -void vnic_rq_error_out(struct vnic_rq *rq, unsigned int error); unsigned int vnic_rq_error_status(struct vnic_rq *rq); void vnic_rq_enable(struct vnic_rq *rq); int vnic_rq_disable(struct vnic_rq *rq); diff --git a/sys/dev/enic/vnic_rss.h b/sys/dev/enic/vnic_rss.h index abd7b9f131aa9b..039041ece5b2c4 100644 --- a/sys/dev/enic/vnic_rss.h +++ b/sys/dev/enic/vnic_rss.h @@ -24,9 +24,4 @@ union vnic_rss_cpu { u64 raw[32]; }; -void vnic_set_rss_key(union vnic_rss_key *rss_key, u8 *key); -void vnic_set_rss_cpu(union vnic_rss_cpu *rss_cpu, u8 *cpu); -void 
vnic_get_rss_key(union vnic_rss_key *rss_key, u8 *key); -void vnic_get_rss_cpu(union vnic_rss_cpu *rss_cpu, u8 *cpu); - #endif /* _VNIC_RSS_H_ */ diff --git a/sys/dev/enic/vnic_wq.c b/sys/dev/enic/vnic_wq.c index b032df3392b2ed..995af3270a21df 100644 --- a/sys/dev/enic/vnic_wq.c +++ b/sys/dev/enic/vnic_wq.c @@ -7,7 +7,103 @@ #include "vnic_dev.h" #include "vnic_wq.h" -void vnic_wq_init_start(struct vnic_wq *wq, unsigned int cq_index, +int vnic_dev_alloc_desc_ring(struct vnic_dev *vdev, + struct vnic_dev_ring *ring, unsigned int desc_count, unsigned int desc_size) +{ + iflib_dma_info_t ifdip; + int err; + + if ((ifdip = malloc(sizeof(struct iflib_dma_info), + M_DEVBUF, M_NOWAIT | M_ZERO)) == NULL) { + device_printf(dev_from_vnic_dev(vdev), + "Unable to allocate DMA info memory\n"); + return (ENOMEM); + } + + err = iflib_dma_alloc(vdev->softc->ctx, desc_count * desc_size, + ifdip, 0); + if (err) { + device_printf(dev_from_vnic_dev(vdev), + "Unable to allocate DEVCMD2 descriptors\n"); + err = ENOMEM; + goto err_out_alloc; + } + + ring->base_addr = ifdip->idi_paddr; + ring->descs = ifdip->idi_vaddr; + ring->ifdip = ifdip; + ring->desc_size = desc_size; + ring->desc_count = desc_count; + ring->last_count = 0; + ring->desc_avail = ring->desc_count - 1; + + ring->size = ring->desc_count * ring->desc_size; + ring->base_align = 512; + ring->size_unaligned = ring->size + ring->base_align; + + return (err); + + iflib_dma_free(ifdip); + +err_out_alloc: + free(ifdip, M_DEVBUF); + return (err); +} + +void vnic_dev_free_desc_ring(struct vnic_dev *vdev, struct vnic_dev_ring *ring) +{ + if (ring && ring->descs) { + iflib_dma_free(ring->ifdip); + free(ring->ifdip, M_DEVBUF); + ring->descs = NULL; + } +} + +void vnic_wq_free(struct vnic_wq *wq) { + vnic_dev_free_desc_ring(wq->vdev, &wq->ring); + wq->ctrl = NULL; +} + +int enic_wq_devcmd2_alloc(struct vnic_dev *vdev, struct vnic_wq *wq, + unsigned int desc_count, unsigned int desc_size) +{ + int err; + + wq->index = 0; + wq->vdev = 
vdev; + + + wq->ctrl = vnic_dev_get_res(vdev, RES_TYPE_DEVCMD2, 0); + if (!wq->ctrl) + return (EINVAL); + vnic_wq_disable(wq); + err = vnic_dev_alloc_desc_ring(vdev, &wq->ring, desc_count, desc_size); + + return (err); +} + +void vnic_dev_deinit_devcmd2(struct vnic_dev *vdev) +{ + if (vdev->devcmd2) { + vnic_wq_disable(&vdev->devcmd2->wq); + if (vdev->devcmd2->wq_ctrl) + vnic_wq_free(&vdev->devcmd2->wq); + if (vdev->devcmd2->result) + vnic_dev_free_desc_ring(vdev, &vdev->devcmd2->results_ring); + free(vdev->devcmd2, M_DEVBUF); + vdev->devcmd2 = NULL; + } +} + +int vnic_dev_deinit(struct vnic_dev *vdev) { + u64 a0 = 0, a1 = 0; + int wait = 1000; + + return (vnic_dev_cmd(vdev, CMD_DEINIT, &a0, &a1, wait)); + return (0); +} + +void enic_wq_init_start(struct vnic_wq *wq, unsigned int cq_index, unsigned int fetch_index, unsigned int posted_index, unsigned int error_interrupt_enable, unsigned int error_interrupt_offset) @@ -33,7 +129,7 @@ void vnic_wq_init(struct vnic_wq *wq, unsigned int cq_index, unsigned int error_interrupt_enable, unsigned int error_interrupt_offset) { - vnic_wq_init_start(wq, cq_index, 0, 0, + enic_wq_init_start(wq, cq_index, 0, 0, error_interrupt_enable, error_interrupt_offset); wq->cq_pend = 0; @@ -42,7 +138,7 @@ void vnic_wq_init(struct vnic_wq *wq, unsigned int cq_index, unsigned int vnic_wq_error_status(struct vnic_wq *wq) { - return ENIC_BUS_READ_4(wq->ctrl, TX_ERROR_STATUS); + return (ENIC_BUS_READ_4(wq->ctrl, TX_ERROR_STATUS)); } void vnic_wq_enable(struct vnic_wq *wq) @@ -65,7 +161,7 @@ int vnic_wq_disable(struct vnic_wq *wq) pr_err("Failed to disable WQ[%d]\n", wq->index); - return -ETIMEDOUT; + return (ETIMEDOUT); } void vnic_wq_clean(struct vnic_wq *wq) diff --git a/sys/dev/enic/vnic_wq.h b/sys/dev/enic/vnic_wq.h index c4f551de844117..9ef492adba24e8 100644 --- a/sys/dev/enic/vnic_wq.h +++ b/sys/dev/enic/vnic_wq.h @@ -61,6 +61,20 @@ struct vnic_wq { uint64_t offloads; }; +struct devcmd2_controller { + struct vnic_res *wq_ctrl; + struct 
vnic_devcmd2 *cmd_ring; + struct devcmd2_result *result; + u16 next_result; + u16 result_size; + int color; + struct vnic_dev_ring results_ring; + struct vnic_res *results_ctrl; + struct vnic_wq wq; + u32 posted; +}; + + static inline unsigned int vnic_wq_desc_avail(struct vnic_wq *wq) { /* how many does SW own? */ @@ -92,7 +106,7 @@ buf_idx_incr(uint32_t n_descriptors, uint32_t idx) } void vnic_wq_free(struct vnic_wq *wq); -void vnic_wq_init_start(struct vnic_wq *wq, unsigned int cq_index, +void enic_wq_init_start(struct vnic_wq *wq, unsigned int cq_index, unsigned int fetch_index, unsigned int posted_index, unsigned int error_interrupt_enable, unsigned int error_interrupt_offset); @@ -104,5 +118,7 @@ unsigned int vnic_wq_error_status(struct vnic_wq *wq); void vnic_wq_enable(struct vnic_wq *wq); int vnic_wq_disable(struct vnic_wq *wq); void vnic_wq_clean(struct vnic_wq *wq); +int enic_wq_devcmd2_alloc(struct vnic_dev *vdev, struct vnic_wq *wq, + unsigned int desc_count, unsigned int desc_size); #endif /* _VNIC_WQ_H_ */ From 2cadbe468a8e8aef193429565b729d34ec48b266 Mon Sep 17 00:00:00 2001 From: Minsoo Choo Date: Fri, 5 Apr 2024 20:30:50 -0400 Subject: [PATCH 091/143] tcp_wrappers: Use default C standard version Reviewed by: emaste, arichardson, jhb Differential Revision: https://reviews.freebsd.org/D43236 --- contrib/tcp_wrappers/tcpd.c | 1 + libexec/tcpd/Makefile | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/tcp_wrappers/tcpd.c b/contrib/tcp_wrappers/tcpd.c index a0ba42f71f8650..164132570a3668 100644 --- a/contrib/tcp_wrappers/tcpd.c +++ b/contrib/tcp_wrappers/tcpd.c @@ -44,6 +44,7 @@ static char sccsid[] = "@(#) tcpd.c 1.10 96/02/11 17:01:32"; int allow_severity = SEVERITY; /* run-time adjustable */ int deny_severity = LOG_WARNING; /* ditto */ +int main(int argc, char **argv) { struct request_info request; diff --git a/libexec/tcpd/Makefile b/libexec/tcpd/Makefile index bb8f09ca13acce..4845013f748918 100644 --- a/libexec/tcpd/Makefile 
+++ b/libexec/tcpd/Makefile @@ -6,7 +6,6 @@ PACKAGE= tcpd PROG= tcpd MAN= tcpd.8 -CSTD?= c89 CFLAGS+=-DREAL_DAEMON_DIR=\"${LIBEXECDIR}\" \ -DSEVERITY=LOG_INFO -DRFC931_TIMEOUT=10 \ -DHOSTS_DENY=\"/etc/hosts.deny\" -DHOSTS_ALLOW=\"/etc/hosts.allow\" \ From d0d7fcbae4207402b35f37cabe2bb5f30bec7c5d Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Thu, 9 Jan 2025 12:52:50 -0500 Subject: [PATCH 092/143] dumpon: Move the _Noreturn keyword before the return type This fixes a warning from GCC 14 when compiling with the native C11 _Noreturn rather than the older GNU C function attribute: sbin/dumpon/dumpon.c:73:1: error: '_Noreturn' is not at beginning of declaration [-Werror=old-style-declaration] 73 | static void _Noreturn | ^~~~~~ --- sbin/dumpon/dumpon.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sbin/dumpon/dumpon.c b/sbin/dumpon/dumpon.c index 46652d8471ebde..e6c1634ff6fe38 100644 --- a/sbin/dumpon/dumpon.c +++ b/sbin/dumpon/dumpon.c @@ -70,7 +70,7 @@ static int verbose; -static void _Noreturn +static _Noreturn void usage(void) { fprintf(stderr, From c6eb7f3fbffd9065ab75a2ed266f1b069fd97e6e Mon Sep 17 00:00:00 2001 From: Minsoo Choo Date: Thu, 9 Jan 2025 13:28:12 -0500 Subject: [PATCH 093/143] zstd: Add a stub for the kernel The stub header includes . zstd's xx_hash.h #includes for the definition of static_assert() when building with C11 or newer. 
Reviewed by: jhb Differential Revision: https://reviews.freebsd.org/D43239 --- sys/contrib/zstd/lib/freebsd/assert.h | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 sys/contrib/zstd/lib/freebsd/assert.h diff --git a/sys/contrib/zstd/lib/freebsd/assert.h b/sys/contrib/zstd/lib/freebsd/assert.h new file mode 100644 index 00000000000000..eb2efe9be6c0b4 --- /dev/null +++ b/sys/contrib/zstd/lib/freebsd/assert.h @@ -0,0 +1,2 @@ +/* This file is in the public domain */ +#include From 6af088c736c2fd9e64f2ad9449b7df1a109e6241 Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Thu, 9 Jan 2025 15:09:21 -0500 Subject: [PATCH 094/143] BUS_CHILD_DETACHED.9: This is also called if DEVICE_ATTACH fails Reviewed by: imp Differential Revision: https://reviews.freebsd.org/D48363 --- share/man/man9/BUS_CHILD_DETACHED.9 | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/share/man/man9/BUS_CHILD_DETACHED.9 b/share/man/man9/BUS_CHILD_DETACHED.9 index 4cc00a49465b7a..8b59d1362d3d14 100644 --- a/share/man/man9/BUS_CHILD_DETACHED.9 +++ b/share/man/man9/BUS_CHILD_DETACHED.9 @@ -25,7 +25,7 @@ .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" -.Dd August 21, 2012 +.Dd January 9, 2025 .Dt BUS_CHILD_DETACHED 9 .Os .Sh NAME @@ -39,7 +39,10 @@ .Sh DESCRIPTION The .Fn BUS_CHILD_DETACHED -method is invoked by the new-bus framework after a device is detached. +method is invoked by the new-bus framework after a device is detached +or if a driver's attach routine +.Pq see Xr DEVICE_ATTACH 9 +fails. 
A bus driver can provide an implementation of this method to reclaim any resources allocated on behalf of the child or to cleanup state not properly released by a From ccabc7c2e556ac0b14da9b682b706ccaf251c0fe Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Thu, 9 Jan 2025 15:20:16 -0500 Subject: [PATCH 095/143] DEVICE_IDENTIFY.9: Modernize description and use cases Mention adding devices based on firmware tables and software-only pseudo-devices as use cases for identify methods as those are more common than reading random I/O ports to identify a legacy ISA device. Describe how device_find_child can be used to avoid duplicates. While here, explicitly note that devices added in identify methods typically use a fixed device name. Trim the cross-references a bit. Reviewed by: ziaee, imp Differential Revision: https://reviews.freebsd.org/D48367 --- share/man/man9/DEVICE_IDENTIFY.9 | 52 +++++++++++++++----------------- 1 file changed, 25 insertions(+), 27 deletions(-) diff --git a/share/man/man9/DEVICE_IDENTIFY.9 b/share/man/man9/DEVICE_IDENTIFY.9 index d75c1a91ce4aed..b10d9414305034 100644 --- a/share/man/man9/DEVICE_IDENTIFY.9 +++ b/share/man/man9/DEVICE_IDENTIFY.9 @@ -26,44 +26,46 @@ .\" (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF .\" THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .\" -.Dd January 15, 2017 +.Dd January 9, 2025 .Dt DEVICE_IDENTIFY 9 .Os .Sh NAME .Nm DEVICE_IDENTIFY -.Nd identify a device, register it +.Nd identify new child devices and register them .Sh SYNOPSIS .In sys/param.h .In sys/bus.h .Ft void .Fn DEVICE_IDENTIFY "driver_t *driver" "device_t parent" .Sh DESCRIPTION -The identify function for a device is only needed for devices on buses -that cannot identify their children independently, e.g.\& the ISA bus. -It is used to recognize the device (usually done by accessing non-ambiguous -registers in the hardware) and to tell the kernel about it and thus -creating a new device instance. 
+The identify method of a device driver is used to add devices that cannot be +enumerated by the standard method on a bus device. +Devices can be enumerated in various ways including accessing non-ambiguous +device registers and parsing firmware tables. +Software-only pseudo devices are also often enumerated via identify methods. .Pp +For each newly identified device, +a new device instance should be created by invoking the .Xr BUS_ADD_CHILD 9 -is used to register the device as a child of the bus. -The device's resources (such as IRQ and I/O ports) are registered -with the kernel by calling -.Fn bus_set_resource -for each resource (refer to +method. +If the identify method is able to discover other properties about the new +device, those should also be set. +For example, device resources should be added to the device by calling .Xr bus_set_resource 9 -for more information). +for each resource. .Pp -Since the device tree and the device driver tree are disjoint, the -.Fn DEVICE_IDENTIFY -routine needs to take this into account. -If you load and unload your device driver that has the identify -routine, the child node has the potential for adding the same node -multiple times unless specific measure are taken to preclude that -possibility. +An identify method might be invoked multiple times. +If a device driver is unloaded and loaded, +the identify method will be called a second time after being reloaded. +As a result, the identify method should avoid duplicate devices. +Devices added by identify methods typically use a fixed device name +in which case +.Xr device_find_child 9 +can be used to detect existing devices. .Sh EXAMPLES The following pseudo-code shows an example of a function that probes for a piece of hardware and registers it and its resource -(an I/O port) with the kernel. +(an I/O port) with the parent bus device. 
.Bd -literal void foo_identify(driver_t *driver, device_t parent) @@ -72,7 +74,7 @@ foo_identify(driver_t *driver, device_t parent) retrieve_device_information; if (devices matches one of your supported devices && - not already in device tree) { + device_find_child(parent, "foo", DEVICE_UNIT_ANY) == NULL) { child = BUS_ADD_CHILD(parent, 0, "foo", DEVICE_UNIT_ANY); bus_set_resource(child, SYS_RES_IOPORT, 0, FOO_IOADDR, 1); } @@ -82,11 +84,7 @@ foo_identify(driver_t *driver, device_t parent) .Xr BUS_ADD_CHILD 9 , .Xr bus_set_resource 9 , .Xr device 9 , -.Xr device_add_child 9 , -.Xr DEVICE_ATTACH 9 , -.Xr DEVICE_DETACH 9 , -.Xr DEVICE_PROBE 9 , -.Xr DEVICE_SHUTDOWN 9 +.Xr device_find_child 9 .Sh AUTHORS This manual page was written by .An Alexander Langer Aq Mt alex@FreeBSD.org . From ed49d3b31d425a0add04aff6eb721a474937b7da Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Thu, 9 Jan 2025 21:09:52 -0500 Subject: [PATCH 096/143] twe.4: Remove manpage for previously-removed driver Reviewed by: ziaee, imp Fixes: 062a7b918fac twe: Remove driver Differential Revision: https://reviews.freebsd.org/D48403 --- ObsoleteFiles.inc | 3 + share/man/man4/Makefile | 1 - share/man/man4/twe.4 | 278 ---------------------------------------- 3 files changed, 3 insertions(+), 279 deletions(-) delete mode 100644 share/man/man4/twe.4 diff --git a/ObsoleteFiles.inc b/ObsoleteFiles.inc index 473daec4c737c2..c05a0d37c72796 100644 --- a/ObsoleteFiles.inc +++ b/ObsoleteFiles.inc @@ -2236,6 +2236,9 @@ OLD_FILES+=usr/share/certs/trusted/TrustCor_ECA-1.pem OLD_FILES+=usr/share/certs/trusted/TrustCor_RootCert_CA-1.pem OLD_FILES+=usr/share/certs/trusted/TrustCor_RootCert_CA-2.pem +# 20230510: twe(4) driver removed +OLD_FILES+=usr/share/man/man4/twe.4.gz + # 20230505: md5 tests are now self-contained OLD_FILES+=usr/tests/sbin/md5/1.inp OLD_FILES+=usr/tests/sbin/md5/1.sha512-p.chk diff --git a/share/man/man4/Makefile b/share/man/man4/Makefile index a7dbf6c615d62c..c03ba63c349ff9 100644 --- 
a/share/man/man4/Makefile +++ b/share/man/man4/Makefile @@ -584,7 +584,6 @@ MAN= aac.4 \ tslog.4 \ tty.4 \ tun.4 \ - twe.4 \ tws.4 \ udp.4 \ udplite.4 \ diff --git a/share/man/man4/twe.4 b/share/man/man4/twe.4 deleted file mode 100644 index 03a51b7b6a5096..00000000000000 --- a/share/man/man4/twe.4 +++ /dev/null @@ -1,278 +0,0 @@ -.\" -.\" Copyright (c) 2000 Michael Smith -.\" Copyright (c) 2000 BSDi -.\" All rights reserved. -.\" -.\" Redistribution and use in source and binary forms, with or without -.\" modification, are permitted provided that the following conditions -.\" are met: -.\" 1. Redistributions of source code must retain the above copyright -.\" notice, this list of conditions and the following disclaimer. -.\" 2. The name of the author may not be used to endorse or promote products -.\" derived from this software without specific prior written permission -.\" -.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR -.\" IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES -.\" OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. -.\" IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, -.\" INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT -.\" NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -.\" DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -.\" THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -.\" (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF -.\" THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-.\" -.Dd May 7, 2023 -.Dt TWE 4 -.Os -.Sh NAME -.Nm twe -.Nd 3ware 5000/6000/7000/8000 series PATA/SATA RAID adapter driver -.Sh SYNOPSIS -To compile this driver into the kernel, -place the following lines in your -kernel configuration file: -.Bd -ragged -offset indent -.Cd "device pci" -.Cd "device twe" -.Ed -.Pp -Alternatively, to load the driver as a -module at boot time, place the following line in -.Xr loader.conf 5 : -.Bd -literal -offset indent -twe_load="YES" -.Ed -.Sh DEPRECATION NOTICE -The -.Nm -driver is not present in -.Fx 14.0 . -.Sh DESCRIPTION -The -.Nm -driver provides support for AMCC's 3ware 5000/6000/7000/8000 series -PATA/SATA RAID adapters. -These adapters were formerly known as -.Dq 3ware Escalade . -.Pp -These devices support 2, 4, 8, or 12 ATA disk drives -and provide RAID0 (striping) and RAID1 (mirroring) functionality. -.Sh HARDWARE -The -.Nm -driver supports the following PATA/SATA RAID -controllers: -.Pp -.Bl -bullet -compact -.It -AMCC's 3ware 5000 series -.It -AMCC's 3ware 6000 series -.It -AMCC's 3ware 7000-2 -.It -AMCC's 3ware 7006-2 -.It -AMCC's 3ware 7500-4LP -.It -AMCC's 3ware 7500-8 -.It -AMCC's 3ware 7500-12 -.It -AMCC's 3ware 7506-4LP -.It -AMCC's 3ware 7506-8 -.It -AMCC's 3ware 7506-12 -.It -AMCC's 3ware 8006-2LP -.It -AMCC's 3ware 8500-4LP -.It -AMCC's 3ware 8500-8 -.It -AMCC's 3ware 8500-12 -.It -AMCC's 3ware 8506-4LP -.It -AMCC's 3ware 8506-8 -.It -AMCC's 3ware 8506-8MI -.It -AMCC's 3ware 8506-12 -.It -AMCC's 3ware 8506-12MI -.El -.Sh DIAGNOSTICS -.Ss Controller initialisation phase -.Bl -diag -.It twe%d: microcontroller not ready -.Pp -The controller's onboard CPU is not reporting that it is ready; -this may be due to either a board or system failure. -Initialisation has failed. 
-.It twe%d: no attention interrupt -.It twe%d: can't drain AEN queue -.It twe%d: reset not reported -.It twe%d: controller errors detected -.It twe%d: can't drain response queue -.It twe%d: reset %d failed, trying again -.Pp -The controller is not responding correctly to -the driver's attempts to reset and initialise it. -This process is retried several times. -.It twe%d: can't initialise controller, giving up -.Pp -Several attempts to reset and initialise the controller have failed; -initialisation has failed -and the driver will not attach to this controller. -.El -.Ss Driver initialisation/shutdown phase -.Bl -diag -.It twe%d: register window not available -.It twe%d: can't allocate register window -.It twe%d: can't allocate parent DMA tag -.It twe%d: can't allocate interrupt -.It twe%d: can't set up interrupt -.It twe%d: can't establish configuration hook -.Pp -A resource allocation error occurred while initialising the driver; -initialisation has failed -and the driver will not attach to this controller. -.It twe%d: can't detect attached units -.Pp -Fetching the list of attached units failed; initialisation has failed. -.It twe%d: error fetching capacity for unit %d -.It twe%d: error fetching state for unit %d -.It twe%d: error fetching descriptor size for unit %d -.It twe%d: error fetching descriptor for unit %d -.It twe%d: device_add_child failed -.It twe%d: bus_generic_attach returned %d -.Pp -Creation of the disk devices failed, either due to communication -problems with the adapter or due to resource shortage; -attachment of one or more units may have been aborted. -.El -.Ss Operational phase -.Bl -diag -.It twe%d: command completed - %s -.El -.Pp -A command was reported completed with a warning by the controller. 
-The warning may be one of: -.Bl -diag -.It redundant/inconsequential request ignored -.It failed to write zeroes to LBA 0 -.It failed to profile TwinStor zones -.El -.Bl -diag -.It twe%d: command failed - %s -.El -.Pp -A command was reported as failed by the controller. -The failure message may be one of: -.Bl -diag -.It aborted due to system command or reconfiguration -.It aborted -.It access error -.It access violation -.It device failure -.It controller error -.It timed out -.It invalid unit number -.It unit not available -.It undefined opcode -.It request incompatible with unit -.It invalid request -.It firmware error, reset requested -.Pp -The command will be returned to the operating system after a -fatal error. -.El -.Bl -diag -.It twe%d: command failed submission - controller wedged -.Pp -A command could not be delivered to the controller because -the controller is unresponsive. -.It twe%d: AEN: <%s> -.El -.Pp -The controller has reported a change in status using an AEN -(Asynchronous Event Notification). -The following AENs may be reported: -.Bl -diag -.It queue empty -.It soft reset -.It degraded mirror -.It controller error -.It rebuild fail -.It rebuild done -.It incomplete unit -.It initialisation done -.It unclean shutdown detected -.It drive timeout -.It drive error -.It rebuild started -.It aen queue full -.Pp -AENs are also queued internally for use by management tools. -.El -.Bl -diag -.It twe%d: error polling for signalled AENs -.Pp -The controller has reported -that one or more status messages are ready for the driver, -but attempting to fetch one of these has returned an error. -.It twe%d: AEN queue overflow, lost AEN <%s> -.Pp -A status message was retrieved from the controller, -but there is no more room to queue it in the driver. -The message is lost (but will be printed to the console). 
-.It twe%d: missing expected status bits %s -.It twe%d: unexpected status bits %s -.Pp -A check of the controller's status bits -indicates an unexpected condition. -.It twe%d: host interrupt -.Pp -The controller has signalled a host interrupt. -This serves an unknown purpose and is ignored. -.It twe%d: command interrupt -.Pp -The controller has signalled a command interrupt. -This is not used, and will be disabled. -.It twe%d: controller reset in progress... -.Pp -The controller is being reset by the driver. -Typically this is done when the driver has determined that the -controller is in an unrecoverable state. -.It twe%d: can't reset controller, giving up -.Pp -The driver has given up on resetting the controller. -No further I/O will be handled. -.It controller reset done, %d commands restarted -.Pp -The controller was successfully reset, -and outstanding commands were restarted. -.El -.Sh AUTHORS -.An -nosplit -The -.Nm -driver and manual page were written by -.An Michael Smith Aq Mt msmith@FreeBSD.org . -.Pp -Extensive work done on the driver by -.An Vinod Kashyap Aq Mt vkashyap@FreeBSD.org -and -.An Paul Saab Aq Mt ps@FreeBSD.org . -.Sh BUGS -The controller cannot handle I/O transfers -that are not aligned to a 512-byte boundary. -In order to support raw device access from user-space, -the driver will perform alignment fixup on non-aligned data. -This process is inefficient, -and thus in order to obtain best performance -user-space applications accessing the device -should do so with aligned buffers. From f9f0a1d61c7b97c705246c747baec385e0592966 Mon Sep 17 00:00:00 2001 From: Rick Macklem Date: Thu, 9 Jan 2025 19:54:41 -0800 Subject: [PATCH 097/143] nfscl: Fix a crash when a readdir entry has nul in it Commit 026cdaa3b3a9 added a check for a nul or "/" in a file name in a readdir reply. Unfortunately, the minimal testing done on it did not detect a bug that can cause the client to crash. This patch fixes the code so that it does not crash. 
Note that a NFS server will not normally return a file name in a readdir reply that has a nul or "/" in it, so the crash is unlikely. PR: 283965 Reported by: asomers Tested by: asomers MFC after: 2 weeks --- sys/fs/nfsclient/nfs_clrpcops.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/sys/fs/nfsclient/nfs_clrpcops.c b/sys/fs/nfsclient/nfs_clrpcops.c index e1c02a71939b3a..c35d0c6295b934 100644 --- a/sys/fs/nfsclient/nfs_clrpcops.c +++ b/sys/fs/nfsclient/nfs_clrpcops.c @@ -3397,6 +3397,7 @@ nfsrpc_readdir(vnode_t vp, struct uio *uiop, nfsuint64 *cookiep, nfsattrbit_t attrbits, dattrbits; u_int32_t rderr, *tl2 = NULL; size_t tresid; + bool validentry; KASSERT(uiop->uio_iovcnt == 1 && (uiop->uio_resid & (DIRBLKSIZ - 1)) == 0, @@ -3622,6 +3623,7 @@ nfsrpc_readdir(vnode_t vp, struct uio *uiop, nfsuint64 *cookiep, /* loop through the dir entries, doctoring them to 4bsd form */ while (more_dirs && bigenough) { + validentry = true; if (nd->nd_flag & ND_NFSV4) { NFSM_DISSECT(tl, u_int32_t *, 3*NFSX_UNSIGNED); ncookie.lval[0] = *tl++; @@ -3701,6 +3703,7 @@ nfsrpc_readdir(vnode_t vp, struct uio *uiop, nfsuint64 *cookiep, uiop->uio_offset = savoff; uiop->uio_resid = savresid; blksiz = savblksiz; + validentry = false; } else { cp = uiop->uio_iov->iov_base; tlen -= len; @@ -3738,7 +3741,7 @@ nfsrpc_readdir(vnode_t vp, struct uio *uiop, nfsuint64 *cookiep, ncookie.lval[0] = 0; ncookie.lval[1] = *tl++; } - if (bigenough) { + if (bigenough && validentry) { if (nd->nd_flag & ND_NFSV4) { if (rderr) { dp->d_fileno = 0; @@ -3875,7 +3878,7 @@ nfsrpc_readdirplus(vnode_t vp, struct uio *uiop, nfsuint64 *cookiep, size_t tresid; u_int32_t *tl2 = NULL, rderr; struct timespec dctime, ts; - bool attr_ok; + bool attr_ok, validentry; KASSERT(uiop->uio_iovcnt == 1 && (uiop->uio_resid & (DIRBLKSIZ - 1)) == 0, @@ -4086,6 +4089,7 @@ nfsrpc_readdirplus(vnode_t vp, struct uio *uiop, nfsuint64 *cookiep, /* loop through the dir entries, doctoring them to 4bsd form */ while 
(more_dirs && bigenough) { + validentry = true; NFSM_DISSECT(tl, u_int32_t *, 3 * NFSX_UNSIGNED); if (nd->nd_flag & ND_NFSV4) { ncookie.lval[0] = *tl++; @@ -4161,6 +4165,7 @@ nfsrpc_readdirplus(vnode_t vp, struct uio *uiop, nfsuint64 *cookiep, uiop->uio_offset = savoff; uiop->uio_resid = savresid; blksiz = savblksiz; + validentry = false; } else { cp = uiop->uio_iov->iov_base; tlen -= len; @@ -4217,7 +4222,7 @@ nfsrpc_readdirplus(vnode_t vp, struct uio *uiop, nfsuint64 *cookiep, goto nfsmout; } - if (bigenough) { + if (bigenough && validentry) { if (nd->nd_flag & ND_NFSV4) { if (rderr) { dp->d_fileno = 0; From 16f0d01f9ca1e28bede9a493329c5d66e317d88f Mon Sep 17 00:00:00 2001 From: Kyle Evans Date: Thu, 9 Jan 2025 22:27:50 -0600 Subject: [PATCH 098/143] arm64: apple: fix aic for !SMP configurations Allocate sc_cpuids anyways, even if it's just a single entry, to minimize functional diff between SMP and !SMP. Reviewed by: jhb Differential Revision: https://reviews.freebsd.org/D48289 --- sys/arm64/apple/apple_aic.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sys/arm64/apple/apple_aic.c b/sys/arm64/apple/apple_aic.c index b500099a5430fd..c9ce3b4d21659a 100644 --- a/sys/arm64/apple/apple_aic.c +++ b/sys/arm64/apple/apple_aic.c @@ -137,9 +137,9 @@ struct apple_aic_softc { u_int sc_ndie; #ifdef SMP struct apple_aic_irqsrc sc_ipi_srcs[AIC_NIPIS]; - u_int *sc_cpuids; /* cpu index to AIC CPU ID */ uint32_t *sc_ipimasks; #endif + u_int *sc_cpuids; /* cpu index to AIC CPU ID */ }; static u_int aic_next_cpu; @@ -215,6 +215,7 @@ apple_aic_attach(device_t dev) #ifdef SMP sc->sc_ipimasks = malloc(sizeof(*sc->sc_ipimasks) * mp_maxid + 1, M_DEVBUF, M_WAITOK | M_ZERO); +#endif sc->sc_cpuids = malloc(sizeof(*sc->sc_cpuids) * mp_maxid + 1, M_DEVBUF, M_WAITOK | M_ZERO); @@ -223,8 +224,6 @@ apple_aic_attach(device_t dev) if (bootverbose) device_printf(dev, "BSP CPU %d: whoami %x\n", cpu, sc->sc_cpuids[cpu]); -#endif - name = device_get_nameunit(dev); for (i = 
0; i < sc->sc_ndie; i++) { From 9d1de25930735261c16ed874a933b4c1f1d9041e Mon Sep 17 00:00:00 2001 From: Andrew Turner Date: Fri, 10 Jan 2025 10:34:52 +0000 Subject: [PATCH 099/143] Update the Arm Optimized Routines Import the v25.01 release of the Arm Optimized Routines [1]. [1] https://github.com/ARM-software/optimized-routines/tree/v25.01 Sponsored by: Arm Ltd --- MAINTAINERS | 9 +- Makefile | 16 +- README | 33 +- config.mk.dist | 99 +- math/Dir.mk | 253 ++- math/README.contributors | 5 +- .../aarch64/advsimd/acos.c | 30 +- .../aarch64/advsimd/acosf.c | 32 +- .../aarch64/advsimd/acosh.c | 27 +- .../aarch64/advsimd/acoshf.c | 62 +- .../aarch64/advsimd/asin.c | 75 +- .../aarch64/advsimd/asinf.c | 30 +- math/aarch64/advsimd/asinh.c | 242 +++ math/aarch64/advsimd/asinhf.c | 89 + .../aarch64/advsimd/atan.c | 85 +- math/aarch64/advsimd/atan2.c | 171 ++ .../aarch64/advsimd/atan2f.c | 84 +- .../aarch64/advsimd/atanf.c | 26 +- .../aarch64/advsimd/atanh.c | 45 +- .../aarch64/advsimd/atanhf.c | 49 +- .../aarch64/advsimd/cbrt.c | 43 +- .../aarch64/advsimd/cbrtf.c | 19 +- .../aarch64/advsimd/cexpi.c | 14 +- .../aarch64/advsimd/cexpif.c | 14 +- math/aarch64/{v_cos.c => advsimd/cos.c} | 21 +- math/aarch64/{v_cosf.c => advsimd/cosf.c} | 23 +- .../aarch64/advsimd/cosh.c | 25 +- .../aarch64/advsimd/coshf.c | 52 +- .../aarch64/advsimd/cospi.c | 25 +- .../aarch64/advsimd/cospif.c | 29 +- .../v_erf_2u5.c => math/aarch64/advsimd/erf.c | 48 +- .../aarch64/advsimd/erfc.c | 65 +- .../aarch64/advsimd/erfcf.c | 54 +- .../aarch64/advsimd/erff.c | 34 +- math/aarch64/{v_exp.c => advsimd/exp.c} | 11 +- .../aarch64/advsimd/exp10.c | 23 +- .../aarch64/advsimd/exp10f.c | 87 +- .../aarch64/advsimd/exp2.c | 28 +- math/aarch64/{v_exp2f.c => advsimd/exp2f.c} | 69 +- math/aarch64/advsimd/exp2f_1u.c | 73 + math/aarch64/{v_expf.c => advsimd/expf.c} | 72 +- math/aarch64/advsimd/expf_1u.c | 79 + math/aarch64/advsimd/expm1.c | 77 + math/aarch64/advsimd/expm1f.c | 82 + .../aarch64/advsimd}/finite_pow.h | 22 
+- .../aarch64/advsimd/hypot.c | 30 +- .../aarch64/advsimd/hypotf.c | 36 +- math/aarch64/advsimd/log.c | 118 ++ math/aarch64/advsimd/log10.c | 132 ++ math/aarch64/advsimd/log10f.c | 106 ++ math/aarch64/advsimd/log1p.c | 61 + math/aarch64/advsimd/log1pf.c | 92 + math/aarch64/advsimd/log2.c | 123 ++ math/aarch64/advsimd/log2f.c | 102 ++ math/aarch64/advsimd/logf.c | 88 + math/aarch64/advsimd/modf.c | 33 + math/aarch64/advsimd/modff.c | 34 + .../v_pow_1u5.c => math/aarch64/advsimd/pow.c | 195 ++- math/aarch64/advsimd/powf.c | 209 +++ math/aarch64/{v_sin.c => advsimd/sin.c} | 26 +- .../aarch64/advsimd/sincos.c | 30 +- .../aarch64/advsimd/sincosf.c | 30 +- math/aarch64/advsimd/sincospi.c | 44 + math/aarch64/advsimd/sincospif.c | 43 + math/aarch64/{v_sinf.c => advsimd/sinf.c} | 36 +- math/aarch64/advsimd/sinh.c | 80 + .../aarch64/advsimd/sinhf.c | 46 +- .../aarch64/advsimd/sinpi.c | 25 +- .../aarch64/advsimd/sinpif.c | 29 +- .../v_tan_3u5.c => math/aarch64/advsimd/tan.c | 28 +- .../aarch64/advsimd/tanf.c | 35 +- math/aarch64/advsimd/tanh.c | 67 + .../aarch64/advsimd/tanhf.c | 44 +- math/aarch64/advsimd/tanpi.c | 88 + math/aarch64/advsimd/tanpif.c | 70 + math/aarch64/advsimd/v_expf_inline.h | 58 + math/aarch64/advsimd/v_expm1_inline.h | 86 + math/aarch64/advsimd/v_expm1f_inline.h | 62 + math/aarch64/advsimd/v_log1p_inline.h | 119 ++ math/aarch64/advsimd/v_log1pf_inline.h | 94 + .../aarch64/advsimd}/v_log_inline.h | 6 +- {pl/math => math/aarch64/advsimd}/v_math.h | 91 +- .../aarch64/advsimd/v_poly_f32.h | 6 +- .../aarch64/advsimd/v_poly_f64.h | 6 +- .../aarch64/advsimd}/v_sincos_common.h | 4 +- .../aarch64/advsimd}/v_sincosf_common.h | 2 +- math/aarch64/advsimd/v_sincospi_common.h | 64 + math/aarch64/advsimd/v_sincospif_common.h | 57 + .../cospi_3u1.c => math/aarch64/cospi_3u5.c | 31 +- {pl/math => math/aarch64}/cospif_2u6.c | 31 +- .../aarch64/experimental}/README.contributors | 7 - .../aarch64/experimental}/acos_2u.c | 44 +- .../aarch64/experimental}/acosf_1u4.c | 40 +- 
.../aarch64/experimental}/acosh_3u.c | 35 +- .../aarch64/experimental}/acoshf_2u8.c | 32 +- .../aarch64/experimental/advsimd/erfinv_25u.c | 35 +- .../aarch64/experimental/advsimd/erfinvf_5u.c | 49 +- .../experimental/advsimd}/v_logf_inline.h | 2 +- .../aarch64/experimental}/asin_3u.c | 40 +- .../aarch64/experimental}/asin_data.c | 2 +- .../aarch64/experimental}/asinf_2u5.c | 36 +- .../aarch64/experimental}/asinf_data.c | 2 +- .../aarch64/experimental}/asinh_2u5.c | 33 +- .../aarch64/experimental}/asinh_data.c | 17 +- .../aarch64/experimental}/asinhf_3u5.c | 25 +- math/aarch64/experimental/asinhf_data.c | 15 + .../aarch64/experimental}/atan2_2u5.c | 24 +- .../aarch64/experimental}/atan2f_3u.c | 24 +- .../aarch64/experimental}/atan_2u5.c | 22 +- .../aarch64/experimental}/atan_common.h | 2 +- math/aarch64/experimental/atan_data.c | 23 + .../aarch64/experimental}/atanf_2u9.c | 18 +- .../aarch64/experimental}/atanf_common.h | 2 +- math/aarch64/experimental/atanf_data.c | 17 + .../aarch64/experimental}/atanh_3u.c | 18 +- .../aarch64/experimental}/atanhf_3u1.c | 16 +- .../aarch64/experimental}/cbrt_2u.c | 16 +- .../aarch64/experimental}/cbrt_data.c | 2 +- .../aarch64/experimental}/cbrtf_1u5.c | 16 +- .../aarch64/experimental}/cbrtf_data.c | 2 +- .../aarch64/experimental}/cosh_2u.c | 34 +- .../aarch64/experimental}/coshf_1u9.c | 29 +- .../aarch64/experimental}/erf_2u5.c | 21 +- .../aarch64/experimental}/erfc_1u8.c | 26 +- .../aarch64/experimental}/erfcf_1u7.c | 24 +- .../aarch64/experimental}/erff_2u.c | 21 +- .../aarch64/experimental}/erfinv_24u5.c | 20 +- .../aarch64/experimental}/erfinvf_4u7.c | 16 +- .../aarch64/experimental}/erfinvl.c | 2 +- .../aarch64/experimental/exp_inline.h | 22 +- .../aarch64/experimental}/expf_data.c | 4 +- .../aarch64/experimental}/expm1_2u5.c | 20 +- math/aarch64/experimental/expm1_data.c | 21 + .../aarch64/experimental}/expm1f_1u6.c | 24 +- .../aarch64/experimental}/expm1f_data.c | 6 +- .../aarch64/experimental}/log10_2u.c | 33 +- 
.../aarch64/experimental}/log10_data.c | 4 +- .../aarch64/experimental}/log1p_2u.c | 20 +- math/aarch64/experimental/log1p_data.c | 20 + .../aarch64/experimental}/log1pf_2u1.c | 18 +- .../aarch64/experimental}/log1pf_data.c | 8 +- .../aarch64/experimental}/sinh_3u.c | 27 +- .../aarch64/experimental}/sinhf_2u3.c | 32 +- math/aarch64/experimental/sve/erfinv_25u.c | 156 ++ math/aarch64/experimental/sve/erfinvf_5u.c | 156 ++ .../aarch64/experimental/sve/powi.c | 3 +- .../aarch64/experimental/sve/powif.c | 3 +- .../aarch64/experimental/sve/sv_logf_inline.h | 51 + .../aarch64/experimental}/tanf_3u3.c | 42 +- .../aarch64/experimental}/tanf_data.c | 2 +- .../aarch64/experimental}/tanh_3u.c | 22 +- .../aarch64/experimental}/tanhf_2u6.c | 25 +- math/aarch64/sincospi_4u.c | 158 ++ math/aarch64/sincospif_3u2.c | 145 ++ .../sinpi_3u.c => math/aarch64/sinpi_3u5.c | 39 +- {pl/math => math/aarch64}/sinpif_2u5.c | 35 +- .../sv_acos_2u.c => math/aarch64/sve/acos.c | 24 +- .../aarch64/sve/acosf.c | 24 +- math/aarch64/sve/acosh.c | 51 + math/aarch64/sve/acoshf.c | 51 + .../sv_asin_3u.c => math/aarch64/sve/asin.c | 28 +- .../aarch64/sve/asinf.c | 24 +- math/aarch64/sve/asinh.c | 197 +++ .../aarch64/sve/asinhf.c | 38 +- .../sv_atan_2u5.c => math/aarch64/sve/atan.c | 22 +- .../aarch64/sve/atan2.c | 54 +- .../aarch64/sve/atan2f.c | 55 +- .../aarch64/sve/atanf.c | 22 +- .../aarch64/sve/atanh.c | 24 +- .../aarch64/sve/atanhf.c | 33 +- .../sv_cbrt_2u.c => math/aarch64/sve/cbrt.c | 35 +- .../aarch64/sve/cbrtf.c | 16 +- .../aarch64/sve/cexpi.c | 17 +- .../aarch64/sve/cexpif.c | 17 +- .../sv_cos_2u5.c => math/aarch64/sve/cos.c | 16 +- .../sv_cosf_2u1.c => math/aarch64/sve/cosf.c | 16 +- .../sv_cosh_2u.c => math/aarch64/sve/cosh.c | 34 +- math/aarch64/sve/coshf.c | 62 + .../aarch64/sve/cospi.c | 25 +- .../aarch64/sve/cospif.c | 25 +- .../sv_erf_2u5.c => math/aarch64/sve/erf.c | 28 +- .../sv_erfc_1u8.c => math/aarch64/sve/erfc.c | 24 +- .../aarch64/sve/erfcf.c | 36 +- .../sv_erff_2u.c => 
math/aarch64/sve/erff.c | 33 +- .../sv_exp_1u5.c => math/aarch64/sve/exp.c | 56 +- .../aarch64/sve/exp10.c | 43 +- math/aarch64/sve/exp10f.c | 101 ++ .../sv_exp2_2u.c => math/aarch64/sve/exp2.c | 44 +- math/aarch64/sve/exp2f.c | 83 + math/aarch64/sve/expf.c | 50 + .../aarch64/sve/expm1.c | 20 +- .../aarch64/sve/expm1f.c | 46 +- .../aarch64/sve/hypot.c | 20 +- .../aarch64/sve/hypotf.c | 20 +- math/aarch64/sve/log.c | 97 + math/aarch64/sve/log10.c | 101 ++ .../aarch64/sve/log10f.c | 65 +- .../aarch64/sve/log1p.c | 24 +- math/aarch64/sve/log1pf.c | 43 + math/aarch64/sve/log2.c | 96 + .../aarch64/sve/log2f.c | 62 +- .../sv_logf_3u4.c => math/aarch64/sve/logf.c | 64 +- math/aarch64/sve/modf.c | 36 + math/aarch64/sve/modff.c | 36 + .../sv_pow_1u5.c => math/aarch64/sve/pow.c | 295 ++-- .../sv_powf_2u6.c => math/aarch64/sve/powf.c | 157 +- .../sv_sin_3u5.c => math/aarch64/sve/sin.c | 16 +- .../aarch64/sve/sincos.c | 36 +- .../aarch64/sve/sincosf.c | 36 +- math/aarch64/sve/sincospi.c | 47 + math/aarch64/sve/sincospif.c | 46 + .../sv_sinf_1u9.c => math/aarch64/sve/sinf.c | 16 +- .../sv_sinh_3u.c => math/aarch64/sve/sinh.c | 20 +- .../aarch64/sve/sinhf.c | 21 +- .../aarch64/sve/sinpi.c | 33 +- .../aarch64/sve/sinpif.c | 33 +- math/aarch64/sve/sv_expf_inline.h | 66 + .../aarch64/sve}/sv_expm1f_inline.h | 36 +- .../aarch64/sve}/sv_log1p_inline.h | 14 +- math/aarch64/sve/sv_log1pf_inline.h | 83 + math/aarch64/sve/sv_log_inline.h | 83 + {pl/math => math/aarch64/sve}/sv_math.h | 32 +- .../aarch64/sve/sv_poly_f32.h | 8 +- .../aarch64/sve/sv_poly_f64.h | 8 +- .../aarch64/sve/sv_poly_generic.h | 32 +- .../aarch64/sve}/sv_sincos_common.h | 4 +- .../aarch64/sve}/sv_sincosf_common.h | 2 +- math/aarch64/sve/sv_sincospi_common.h | 76 + math/aarch64/sve/sv_sincospif_common.h | 82 + math/aarch64/sve/tan.c | 131 ++ .../sv_tanf_3u5.c => math/aarch64/sve/tanf.c | 46 +- .../sv_tanh_3u.c => math/aarch64/sve/tanh.c | 20 +- math/aarch64/sve/tanhf.c | 68 + math/aarch64/sve/tanpi.c | 89 + 
math/aarch64/sve/tanpif.c | 68 + math/aarch64/tanpi_2u5.c | 158 ++ math/aarch64/tanpif_3u1.c | 145 ++ .../erf_data.c => math/aarch64/v_erf_data.c | 10 +- .../erfc_data.c => math/aarch64/v_erfc_data.c | 10 +- .../aarch64/v_erfcf_data.c | 10 +- .../erff_data.c => math/aarch64/v_erff_data.c | 10 +- math/aarch64/v_exp2f_1u.c | 72 - math/aarch64/v_exp_data.c | 99 +- {pl/math => math/aarch64}/v_exp_tail_data.c | 4 +- math/aarch64/v_expf_1u.c | 77 - math/aarch64/v_log.c | 100 -- {pl/math => math/aarch64}/v_log10_data.c | 2 +- {pl/math => math/aarch64}/v_log2_data.c | 2 +- math/aarch64/v_log_data.c | 25 +- math/aarch64/v_logf.c | 74 - math/aarch64/v_math.h | 135 -- math/aarch64/v_pow.c | 22 - {pl/math => math/aarch64}/v_pow_exp_data.c | 2 +- {pl/math => math/aarch64}/v_pow_log_data.c | 2 +- math/aarch64/v_powf.c | 148 -- {pl/math => math/aarch64}/v_powf_data.c | 2 +- math/cosf.c | 10 +- math/erf.c | 12 +- math/erff.c | 12 +- math/exp.c | 25 +- math/exp10.c | 22 +- math/exp2.c | 11 +- math/exp2f.c | 10 +- math/expf.c | 10 +- math/include/mathlib.h | 294 +++- math/include/test_defs.h | 21 + math/include/test_sig.h | 47 + math/log.c | 11 +- {pl/math => math}/log10f.c | 24 +- math/log2.c | 11 +- math/log2f.c | 11 +- math/logf.c | 11 +- math/logf_data.c | 3 +- math/math_config.h | 261 ++- {pl/math => math}/poly_generic.h | 2 +- {pl/math => math}/poly_scalar_f32.h | 6 +- {pl/math => math}/poly_scalar_f64.h | 6 +- math/pow.c | 22 +- math/powf.c | 12 +- math/sincosf.c | 12 +- math/sincosf.h | 5 +- math/sinf.c | 10 +- math/test/mathbench.c | 229 +-- math/test/mathbench_funcs.h | 141 +- math/test/mathbench_wrappers.h | 302 +++- math/test/mathtest.c | 12 +- math/test/rtest/dotest.c | 45 +- math/test/runulp.sh | 311 +--- math/test/test_defs.h | 31 + .../test/testcases/directed/acos.tst | 2 +- .../test/testcases/directed/acosf.tst | 2 +- .../test/testcases/directed/acosh.tst | 2 +- .../test/testcases/directed/acoshf.tst | 2 +- .../test/testcases/directed/asin.tst | 2 +- 
.../test/testcases/directed/asinf.tst | 2 +- .../test/testcases/directed/asinh.tst | 2 +- .../test/testcases/directed/asinhf.tst | 2 +- .../test/testcases/directed/atan.tst | 2 +- .../test/testcases/directed/atan2.tst | 2 +- .../test/testcases/directed/atan2f.tst | 2 +- .../test/testcases/directed/atanf.tst | 2 +- .../test/testcases/directed/atanh.tst | 2 +- .../test/testcases/directed/atanhf.tst | 2 +- .../test/testcases/directed/cbrtf.tst | 2 +- .../test/testcases/directed/cosh.tst | 2 +- .../test/testcases/directed/coshf.tst | 2 +- .../test/testcases/directed/erfc.tst | 2 +- .../test/testcases/directed/erfcf.tst | 2 +- .../test/testcases/directed/expm1.tst | 2 +- .../test/testcases/directed/expm1f.tst | 2 +- .../test/testcases/directed/log10.tst | 2 +- .../test/testcases/directed/log10f.tst | 2 +- .../test/testcases/directed/log1p.tst | 2 +- .../test/testcases/directed/log1pf.tst | 2 +- .../test/testcases/directed/sinh.tst | 2 +- .../test/testcases/directed/sinhf.tst | 2 +- .../test/testcases/directed/tanf.tst | 2 +- .../test/testcases/directed/tanh.tst | 2 +- .../test/testcases/directed/tanhf.tst | 2 +- math/test/trigpi_references.h | 106 ++ math/test/ulp.c | 328 ++-- math/test/ulp.h | 41 +- math/test/ulp_funcs.h | 119 +- math/test/ulp_wrappers.h | 418 ++++- math/tgamma128.c | 2 + {pl/math => math}/tools/asin.sollya | 2 +- {pl/math => math}/tools/asinf.sollya | 2 +- {pl/math => math}/tools/asinh.sollya | 2 +- {pl/math => math}/tools/asinhf.sollya | 2 +- {pl/math => math}/tools/atan.sollya | 2 +- {pl/math => math}/tools/atanf.sollya | 2 +- {pl/math => math}/tools/cbrt.sollya | 2 +- {pl/math => math}/tools/cbrtf.sollya | 2 +- {pl/math => math}/tools/erf.sollya | 2 +- {pl/math => math}/tools/erfc.sollya | 2 +- {pl/math => math}/tools/erfcf.sollya | 2 +- {pl/math => math}/tools/erff.sollya | 2 +- {pl/math => math}/tools/exp10.sollya | 2 +- {pl/math => math}/tools/expm1.sollya | 2 +- {pl/math => math}/tools/expm1f.sollya | 2 +- {pl/math => math}/tools/log10.sollya | 
2 +- {pl/math => math}/tools/log10f.sollya | 2 +- {pl/math => math}/tools/log1p.sollya | 2 +- {pl/math => math}/tools/log1pf.sollya | 2 +- {pl/math => math}/tools/sincos.sollya | 4 +- {pl/math => math}/tools/sincosf.sollya | 2 +- {pl/math => math}/tools/sinpi.sollya | 2 +- {pl/math => math}/tools/tan.sollya | 2 +- {pl/math => math}/tools/tanf.sollya | 2 +- math/tools/tanpi.sollya | 48 + {pl/math => math}/tools/v_erf.sollya | 2 +- {pl/math => math}/tools/v_erfc.sollya | 2 +- {pl/math => math}/tools/v_log10.sollya | 2 +- {pl/math => math}/tools/v_log10f.sollya | 2 +- {pl/math => math}/tools/v_log2f.sollya | 2 +- networking/Dir.mk | 6 +- pl/Dir.mk | 21 - pl/math/Dir.mk | 216 --- pl/math/asinhf_data.c | 15 - pl/math/atan_data.c | 20 - pl/math/atanf_data.c | 15 - pl/math/exp_data.c | 1120 ------------ pl/math/expf.c | 76 - pl/math/expm1_data.c | 21 - pl/math/include/mathlib.h | 206 --- pl/math/include/pl_test.h | 24 - pl/math/log.c | 161 -- pl/math/log1p_data.c | 19 - pl/math/log_data.c | 511 ------ pl/math/logf.c | 75 - pl/math/logf_data.c | 36 - pl/math/math_config.h | 624 ------- pl/math/math_err.c | 78 - pl/math/math_errf.c | 78 - pl/math/pl_sig.h | 59 - pl/math/sv_acosh_3u5.c | 50 - pl/math/sv_acoshf_2u8.c | 47 - pl/math/sv_asinh_3u0.c | 129 -- pl/math/sv_coshf_2u.c | 56 - pl/math/sv_erf_data.c | 1558 ----------------- pl/math/sv_erff_data.c | 1046 ----------- pl/math/sv_exp10f_1u5.c | 87 - pl/math/sv_exp2f_1u6.c | 80 - pl/math/sv_expf_2u.c | 86 - pl/math/sv_expf_inline.h | 66 - pl/math/sv_log10_2u5.c | 75 - pl/math/sv_log1pf_1u3.c | 97 - pl/math/sv_log1pf_inline.h | 65 - pl/math/sv_log2_3u.c | 73 - pl/math/sv_log_2u5.c | 76 - pl/math/sv_tan_3u5.c | 99 -- pl/math/sv_tanhf_2u6.c | 59 - pl/math/test/mathbench_funcs.h | 87 - pl/math/test/mathbench_wrappers.h | 206 --- pl/math/test/pl_test.h | 39 - pl/math/test/runulp.sh | 78 - pl/math/test/testcases/directed/erff.tst | 17 - pl/math/test/testcases/directed/log2.tst | 21 - pl/math/test/testcases/directed/log2f.tst | 27 
- pl/math/test/testcases/random/double.tst | 6 - pl/math/test/testcases/random/float.tst | 8 - pl/math/test/ulp_funcs.h | 70 - pl/math/test/ulp_wrappers.h | 140 -- pl/math/trigpi_references.c | 57 - pl/math/v_asinh_3u5.c | 175 -- pl/math/v_asinhf_2u7.c | 80 - pl/math/v_atan2_3u.c | 121 -- pl/math/v_exp_data.c | 55 - pl/math/v_exp_tail.h | 21 - pl/math/v_exp_tail_inline.h | 102 -- pl/math/v_expf_inline.h | 60 - pl/math/v_expm1_2u5.c | 118 -- pl/math/v_expm1f_1u6.c | 117 -- pl/math/v_expm1f_inline.h | 63 - pl/math/v_log10_2u5.c | 120 -- pl/math/v_log10f_3u5.c | 82 - pl/math/v_log1p_2u5.c | 128 -- pl/math/v_log1p_inline.h | 91 - pl/math/v_log1pf_2u1.c | 126 -- pl/math/v_log1pf_inline.h | 67 - pl/math/v_log2_3u.c | 109 -- pl/math/v_log2f_2u5.c | 77 - pl/math/v_log_data.c | 161 -- pl/math/v_sinh_3u.c | 118 -- pl/math/v_tanh_3u.c | 106 -- string/Dir.mk | 9 +- string/aarch64/__mtag_tag_region.S | 3 - string/aarch64/__mtag_tag_zero_region.S | 3 - string/aarch64/asmdefs.h | 37 - .../aarch64/{ => experimental}/memchr-sve.S | 8 +- .../aarch64/{ => experimental}/memcmp-sve.S | 9 +- .../aarch64/{ => experimental}/stpcpy-sve.S | 0 .../aarch64/{ => experimental}/strchr-sve.S | 7 +- .../{ => experimental}/strchrnul-sve.S | 0 .../aarch64/{ => experimental}/strcmp-sve.S | 8 +- .../aarch64/{ => experimental}/strcpy-sve.S | 8 +- .../aarch64/{ => experimental}/strlen-sve.S | 7 +- .../aarch64/{ => experimental}/strncmp-sve.S | 9 +- .../aarch64/{ => experimental}/strnlen-sve.S | 8 +- .../aarch64/{ => experimental}/strrchr-sve.S | 7 +- string/aarch64/memchr-mte.S | 2 - string/aarch64/memchr.S | 2 - string/aarch64/memcmp.S | 4 - string/aarch64/memcpy-advsimd.S | 3 - string/aarch64/memcpy-mops.S | 4 - string/aarch64/memcpy-sve.S | 8 - string/aarch64/memcpy.S | 3 - string/aarch64/memmove-mops.S | 4 - string/aarch64/memrchr.S | 1 - string/aarch64/memset-mops.S | 3 - string/aarch64/memset-sve.S | 114 ++ string/aarch64/memset.S | 104 +- string/aarch64/strchr-mte.S | 1 - string/aarch64/strchr.S 
| 1 - string/aarch64/strchrnul-mte.S | 1 - string/aarch64/strchrnul.S | 1 - string/aarch64/strcmp.S | 2 - string/aarch64/strcpy.S | 2 - string/aarch64/strlen-mte.S | 38 +- string/aarch64/strlen.S | 1 - string/aarch64/strncmp.S | 3 - string/aarch64/strnlen.S | 2 - string/aarch64/strrchr-mte.S | 1 - string/aarch64/strrchr.S | 1 - string/bench/memcpy.c | 239 +-- string/bench/memset.c | 141 +- string/bench/strlen.c | 206 +-- string/include/benchlib.h | 31 + string/include/stringlib.h | 3 +- string/test/memcpy.c | 2 - string/test/memmove.c | 2 - string/test/memset.c | 3 + 472 files changed, 11852 insertions(+), 14525 deletions(-) rename pl/math/v_acos_2u.c => math/aarch64/advsimd/acos.c (85%) rename pl/math/v_acosf_1u4.c => math/aarch64/advsimd/acosf.c (82%) rename pl/math/v_acosh_3u5.c => math/aarch64/advsimd/acosh.c (72%) rename pl/math/v_acoshf_3u1.c => math/aarch64/advsimd/acoshf.c (50%) rename pl/math/v_asin_3u.c => math/aarch64/advsimd/asin.c (56%) rename pl/math/v_asinf_2u5.c => math/aarch64/advsimd/asinf.c (82%) create mode 100644 math/aarch64/advsimd/asinh.c create mode 100644 math/aarch64/advsimd/asinhf.c rename pl/math/v_atan_2u5.c => math/aarch64/advsimd/atan.c (51%) create mode 100644 math/aarch64/advsimd/atan2.c rename pl/math/v_atan2f_3u.c => math/aarch64/advsimd/atan2f.c (54%) rename pl/math/v_atanf_3u.c => math/aarch64/advsimd/atanf.c (85%) rename pl/math/v_atanh_3u5.c => math/aarch64/advsimd/atanh.c (55%) rename pl/math/v_atanhf_3u1.c => math/aarch64/advsimd/atanhf.c (54%) rename pl/math/v_cbrt_2u.c => math/aarch64/advsimd/cbrt.c (76%) rename pl/math/v_cbrtf_1u7.c => math/aarch64/advsimd/cbrtf.c (91%) rename pl/math/v_cexpi_3u5.c => math/aarch64/advsimd/cexpi.c (79%) rename pl/math/v_cexpif_1u8.c => math/aarch64/advsimd/cexpif.c (80%) rename math/aarch64/{v_cos.c => advsimd/cos.c} (80%) rename math/aarch64/{v_cosf.c => advsimd/cosf.c} (76%) rename pl/math/v_cosh_2u.c => math/aarch64/advsimd/cosh.c (84%) rename pl/math/v_coshf_2u4.c => 
math/aarch64/advsimd/coshf.c (64%) rename pl/math/v_cospi_3u1.c => math/aarch64/advsimd/cospi.c (81%) rename pl/math/v_cospif_3u2.c => math/aarch64/advsimd/cospif.c (76%) rename pl/math/v_erf_2u5.c => math/aarch64/advsimd/erf.c (77%) rename pl/math/v_erfc_1u8.c => math/aarch64/advsimd/erfc.c (77%) rename pl/math/v_erfcf_1u7.c => math/aarch64/advsimd/erfcf.c (76%) rename pl/math/v_erff_2u.c => math/aarch64/advsimd/erff.c (76%) rename math/aarch64/{v_exp.c => advsimd/exp.c} (90%) rename pl/math/v_exp10_2u.c => math/aarch64/advsimd/exp10.c (89%) rename pl/math/v_exp10f_2u4.c => math/aarch64/advsimd/exp10f.c (58%) rename pl/math/v_exp2_2u.c => math/aarch64/advsimd/exp2.c (82%) rename math/aarch64/{v_exp2f.c => advsimd/exp2f.c} (58%) create mode 100644 math/aarch64/advsimd/exp2f_1u.c rename math/aarch64/{v_expf.c => advsimd/expf.c} (61%) create mode 100644 math/aarch64/advsimd/expf_1u.c create mode 100644 math/aarch64/advsimd/expm1.c create mode 100644 math/aarch64/advsimd/expm1f.c rename {pl/math => math/aarch64/advsimd}/finite_pow.h (94%) rename pl/math/v_hypot_1u5.c => math/aarch64/advsimd/hypot.c (74%) rename pl/math/v_hypotf_1u5.c => math/aarch64/advsimd/hypotf.c (68%) create mode 100644 math/aarch64/advsimd/log.c create mode 100644 math/aarch64/advsimd/log10.c create mode 100644 math/aarch64/advsimd/log10f.c create mode 100644 math/aarch64/advsimd/log1p.c create mode 100644 math/aarch64/advsimd/log1pf.c create mode 100644 math/aarch64/advsimd/log2.c create mode 100644 math/aarch64/advsimd/log2f.c create mode 100644 math/aarch64/advsimd/logf.c create mode 100644 math/aarch64/advsimd/modf.c create mode 100644 math/aarch64/advsimd/modff.c rename pl/math/v_pow_1u5.c => math/aarch64/advsimd/pow.c (60%) create mode 100644 math/aarch64/advsimd/powf.c rename math/aarch64/{v_sin.c => advsimd/sin.c} (77%) rename pl/math/v_sincos_3u5.c => math/aarch64/advsimd/sincos.c (70%) rename pl/math/v_sincosf_1u8.c => math/aarch64/advsimd/sincosf.c (70%) create mode 100644 
math/aarch64/advsimd/sincospi.c create mode 100644 math/aarch64/advsimd/sincospif.c rename math/aarch64/{v_sinf.c => advsimd/sinf.c} (65%) create mode 100644 math/aarch64/advsimd/sinh.c rename pl/math/v_sinhf_2u3.c => math/aarch64/advsimd/sinhf.c (59%) rename pl/math/v_sinpi_3u1.c => math/aarch64/advsimd/sinpi.c (81%) rename pl/math/v_sinpif_3u.c => math/aarch64/advsimd/sinpif.c (76%) rename pl/math/v_tan_3u5.c => math/aarch64/advsimd/tan.c (86%) rename pl/math/v_tanf_3u5.c => math/aarch64/advsimd/tanf.c (83%) create mode 100644 math/aarch64/advsimd/tanh.c rename pl/math/v_tanhf_2u6.c => math/aarch64/advsimd/tanhf.c (62%) create mode 100644 math/aarch64/advsimd/tanpi.c create mode 100644 math/aarch64/advsimd/tanpif.c create mode 100644 math/aarch64/advsimd/v_expf_inline.h create mode 100644 math/aarch64/advsimd/v_expm1_inline.h create mode 100644 math/aarch64/advsimd/v_expm1f_inline.h create mode 100644 math/aarch64/advsimd/v_log1p_inline.h create mode 100644 math/aarch64/advsimd/v_log1pf_inline.h rename {pl/math => math/aarch64/advsimd}/v_log_inline.h (94%) rename {pl/math => math/aarch64/advsimd}/v_math.h (58%) rename pl/math/poly_advsimd_f32.h => math/aarch64/advsimd/v_poly_f32.h (81%) rename pl/math/poly_advsimd_f64.h => math/aarch64/advsimd/v_poly_f64.h (81%) rename {pl/math => math/aarch64/advsimd}/v_sincos_common.h (97%) rename {pl/math => math/aarch64/advsimd}/v_sincosf_common.h (98%) create mode 100644 math/aarch64/advsimd/v_sincospi_common.h create mode 100644 math/aarch64/advsimd/v_sincospif_common.h rename pl/math/cospi_3u1.c => math/aarch64/cospi_3u5.c (82%) rename {pl/math => math/aarch64}/cospif_2u6.c (79%) rename {pl => math/aarch64/experimental}/README.contributors (71%) rename {pl/math => math/aarch64/experimental}/acos_2u.c (76%) rename {pl/math => math/aarch64/experimental}/acosf_1u4.c (79%) rename {pl/math => math/aarch64/experimental}/acosh_3u.c (69%) rename {pl/math => math/aarch64/experimental}/acoshf_2u8.c (68%) rename 
pl/math/v_erfinv_25u.c => math/aarch64/experimental/advsimd/erfinv_25u.c (88%) rename pl/math/v_erfinvf_5u.c => math/aarch64/experimental/advsimd/erfinvf_5u.c (83%) rename {pl/math => math/aarch64/experimental/advsimd}/v_logf_inline.h (97%) rename {pl/math => math/aarch64/experimental}/asin_3u.c (78%) rename {pl/math => math/aarch64/experimental}/asin_data.c (94%) rename {pl/math => math/aarch64/experimental}/asinf_2u5.c (80%) rename {pl/math => math/aarch64/experimental}/asinf_data.c (92%) rename {pl/math => math/aarch64/experimental}/asinh_2u5.c (75%) rename {pl/math => math/aarch64/experimental}/asinh_data.c (51%) rename {pl/math => math/aarch64/experimental}/asinhf_3u5.c (77%) create mode 100644 math/aarch64/experimental/asinhf_data.c rename {pl/math => math/aarch64/experimental}/atan2_2u5.c (91%) rename {pl/math => math/aarch64/experimental}/atan2f_3u.c (90%) rename {pl/math => math/aarch64/experimental}/atan_2u5.c (79%) rename {pl/math => math/aarch64/experimental}/atan_common.h (95%) create mode 100644 math/aarch64/experimental/atan_data.c rename {pl/math => math/aarch64/experimental}/atanf_2u9.c (82%) rename {pl/math => math/aarch64/experimental}/atanf_common.h (96%) create mode 100644 math/aarch64/experimental/atanf_data.c rename {pl/math => math/aarch64/experimental}/atanh_3u.c (88%) rename {pl/math => math/aarch64/experimental}/atanhf_3u1.c (87%) rename {pl/math => math/aarch64/experimental}/cbrt_2u.c (89%) rename {pl/math => math/aarch64/experimental}/cbrt_data.c (93%) rename {pl/math => math/aarch64/experimental}/cbrtf_1u5.c (88%) rename {pl/math => math/aarch64/experimental}/cbrtf_data.c (93%) rename {pl/math => math/aarch64/experimental}/cosh_2u.c (70%) rename {pl/math => math/aarch64/experimental}/coshf_1u9.c (71%) rename {pl/math => math/aarch64/experimental}/erf_2u5.c (87%) rename {pl/math => math/aarch64/experimental}/erfc_1u8.c (90%) rename {pl/math => math/aarch64/experimental}/erfcf_1u7.c (86%) rename {pl/math => 
math/aarch64/experimental}/erff_2u.c (83%) rename {pl/math => math/aarch64/experimental}/erfinv_24u5.c (88%) rename {pl/math => math/aarch64/experimental}/erfinvf_4u7.c (88%) rename {pl/math => math/aarch64/experimental}/erfinvl.c (98%) rename pl/math/exp.c => math/aarch64/experimental/exp_inline.h (93%) rename {pl/math => math/aarch64/experimental}/expf_data.c (93%) rename {pl/math => math/aarch64/experimental}/expm1_2u5.c (83%) create mode 100644 math/aarch64/experimental/expm1_data.c rename {pl/math => math/aarch64/experimental}/expm1f_1u6.c (82%) rename {pl/math => math/aarch64/experimental}/expm1f_data.c (59%) rename {pl/math => math/aarch64/experimental}/log10_2u.c (84%) rename {pl/math => math/aarch64/experimental}/log10_data.c (99%) rename {pl/math => math/aarch64/experimental}/log1p_2u.c (91%) create mode 100644 math/aarch64/experimental/log1p_data.c rename {pl/math => math/aarch64/experimental}/log1pf_2u1.c (93%) rename {pl/math => math/aarch64/experimental}/log1pf_data.c (59%) rename {pl/math => math/aarch64/experimental}/sinh_3u.c (72%) rename {pl/math => math/aarch64/experimental}/sinhf_2u3.c (69%) create mode 100644 math/aarch64/experimental/sve/erfinv_25u.c create mode 100644 math/aarch64/experimental/sve/erfinvf_5u.c rename pl/math/sv_powi.c => math/aarch64/experimental/sve/powi.c (96%) rename pl/math/sv_powif.c => math/aarch64/experimental/sve/powif.c (96%) create mode 100644 math/aarch64/experimental/sve/sv_logf_inline.h rename {pl/math => math/aarch64/experimental}/tanf_3u3.c (80%) rename {pl/math => math/aarch64/experimental}/tanf_data.c (96%) rename {pl/math => math/aarch64/experimental}/tanh_3u.c (80%) rename {pl/math => math/aarch64/experimental}/tanhf_2u6.c (79%) create mode 100644 math/aarch64/sincospi_4u.c create mode 100644 math/aarch64/sincospif_3u2.c rename pl/math/sinpi_3u.c => math/aarch64/sinpi_3u5.c (76%) rename {pl/math => math/aarch64}/sinpif_2u5.c (75%) rename pl/math/sv_acos_2u.c => math/aarch64/sve/acos.c (85%) rename 
pl/math/sv_acosf_1u4.c => math/aarch64/sve/acosf.c (83%) create mode 100644 math/aarch64/sve/acosh.c create mode 100644 math/aarch64/sve/acoshf.c rename pl/math/sv_asin_3u.c => math/aarch64/sve/asin.c (80%) rename pl/math/sv_asinf_2u5.c => math/aarch64/sve/asinf.c (81%) create mode 100644 math/aarch64/sve/asinh.c rename pl/math/sv_asinhf_2u5.c => math/aarch64/sve/asinhf.c (53%) rename pl/math/sv_atan_2u5.c => math/aarch64/sve/atan.c (86%) rename pl/math/sv_atan2_2u5.c => math/aarch64/sve/atan2.c (74%) rename pl/math/sv_atan2f_3u.c => math/aarch64/sve/atan2f.c (68%) rename pl/math/sv_atanf_2u9.c => math/aarch64/sve/atanf.c (83%) rename pl/math/sv_atanh_3u3.c => math/aarch64/sve/atanh.c (72%) rename pl/math/sv_atanhf_2u8.c => math/aarch64/sve/atanhf.c (61%) rename pl/math/sv_cbrt_2u.c => math/aarch64/sve/cbrt.c (77%) rename pl/math/sv_cbrtf_1u7.c => math/aarch64/sve/cbrtf.c (92%) rename pl/math/sv_cexpi_3u5.c => math/aarch64/sve/cexpi.c (79%) rename pl/math/sv_cexpif_1u8.c => math/aarch64/sve/cexpif.c (80%) rename pl/math/sv_cos_2u5.c => math/aarch64/sve/cos.c (88%) rename pl/math/sv_cosf_2u1.c => math/aarch64/sve/cosf.c (87%) rename pl/math/sv_cosh_2u.c => math/aarch64/sve/cosh.c (77%) create mode 100644 math/aarch64/sve/coshf.c rename pl/math/sv_cospi_3u2.c => math/aarch64/sve/cospi.c (78%) rename pl/math/sv_cospif_2u6.c => math/aarch64/sve/cospif.c (75%) rename pl/math/sv_erf_2u5.c => math/aarch64/sve/erf.c (83%) rename pl/math/sv_erfc_1u8.c => math/aarch64/sve/erfc.c (91%) rename pl/math/sv_erfcf_1u7.c => math/aarch64/sve/erfcf.c (77%) rename pl/math/sv_erff_2u.c => math/aarch64/sve/erff.c (77%) rename pl/math/sv_exp_1u5.c => math/aarch64/sve/exp.c (79%) rename pl/math/sv_exp10_1u5.c => math/aarch64/sve/exp10.c (79%) create mode 100644 math/aarch64/sve/exp10f.c rename pl/math/sv_exp2_2u.c => math/aarch64/sve/exp2.c (72%) create mode 100644 math/aarch64/sve/exp2f.c create mode 100644 math/aarch64/sve/expf.c rename pl/math/sv_expm1_2u5.c => math/aarch64/sve/expm1.c 
(86%) rename pl/math/sv_expm1f_1u6.c => math/aarch64/sve/expm1f.c (67%) rename pl/math/sv_hypot_1u5.c => math/aarch64/sve/hypot.c (72%) rename pl/math/sv_hypotf_1u5.c => math/aarch64/sve/hypotf.c (69%) create mode 100644 math/aarch64/sve/log.c create mode 100644 math/aarch64/sve/log10.c rename pl/math/sv_log10f_3u5.c => math/aarch64/sve/log10f.c (56%) rename pl/math/sv_log1p_2u5.c => math/aarch64/sve/log1p.c (88%) create mode 100644 math/aarch64/sve/log1pf.c create mode 100644 math/aarch64/sve/log2.c rename pl/math/sv_log2f_2u5.c => math/aarch64/sve/log2f.c (53%) rename pl/math/sv_logf_3u4.c => math/aarch64/sve/logf.c (52%) create mode 100644 math/aarch64/sve/modf.c create mode 100644 math/aarch64/sve/modff.c rename pl/math/sv_pow_1u5.c => math/aarch64/sve/pow.c (64%) rename pl/math/sv_powf_2u6.c => math/aarch64/sve/powf.c (69%) rename pl/math/sv_sin_3u5.c => math/aarch64/sve/sin.c (89%) rename pl/math/sv_sincos_3u5.c => math/aarch64/sve/sincos.c (72%) rename pl/math/sv_sincosf_1u8.c => math/aarch64/sve/sincosf.c (72%) create mode 100644 math/aarch64/sve/sincospi.c create mode 100644 math/aarch64/sve/sincospif.c rename pl/math/sv_sinf_1u9.c => math/aarch64/sve/sinf.c (89%) rename pl/math/sv_sinh_3u.c => math/aarch64/sve/sinh.c (88%) rename pl/math/sv_sinhf_2u3.c => math/aarch64/sve/sinhf.c (78%) rename pl/math/sv_sinpi_3u1.c => math/aarch64/sve/sinpi.c (66%) rename pl/math/sv_sinpif_2u5.c => math/aarch64/sve/sinpif.c (61%) create mode 100644 math/aarch64/sve/sv_expf_inline.h rename {pl/math => math/aarch64/sve}/sv_expm1f_inline.h (65%) rename {pl/math => math/aarch64/sve}/sv_log1p_inline.h (90%) create mode 100644 math/aarch64/sve/sv_log1pf_inline.h create mode 100644 math/aarch64/sve/sv_log_inline.h rename {pl/math => math/aarch64/sve}/sv_math.h (72%) rename pl/math/poly_sve_f32.h => math/aarch64/sve/sv_poly_f32.h (78%) rename pl/math/poly_sve_f64.h => math/aarch64/sve/sv_poly_f64.h (78%) rename pl/math/poly_sve_generic.h => math/aarch64/sve/sv_poly_generic.h 
(91%) rename {pl/math => math/aarch64/sve}/sv_sincos_common.h (97%) rename {pl/math => math/aarch64/sve}/sv_sincosf_common.h (98%) create mode 100644 math/aarch64/sve/sv_sincospi_common.h create mode 100644 math/aarch64/sve/sv_sincospif_common.h create mode 100644 math/aarch64/sve/tan.c rename pl/math/sv_tanf_3u5.c => math/aarch64/sve/tanf.c (79%) rename pl/math/sv_tanh_3u.c => math/aarch64/sve/tanh.c (86%) create mode 100644 math/aarch64/sve/tanhf.c create mode 100644 math/aarch64/sve/tanpi.c create mode 100644 math/aarch64/sve/tanpif.c create mode 100644 math/aarch64/tanpi_2u5.c create mode 100644 math/aarch64/tanpif_3u1.c rename pl/math/erf_data.c => math/aarch64/v_erf_data.c (99%) rename pl/math/erfc_data.c => math/aarch64/v_erfc_data.c (99%) rename pl/math/erfcf_data.c => math/aarch64/v_erfcf_data.c (98%) rename pl/math/erff_data.c => math/aarch64/v_erff_data.c (98%) delete mode 100644 math/aarch64/v_exp2f_1u.c rename {pl/math => math/aarch64}/v_exp_tail_data.c (98%) delete mode 100644 math/aarch64/v_expf_1u.c delete mode 100644 math/aarch64/v_log.c rename {pl/math => math/aarch64}/v_log10_data.c (99%) rename {pl/math => math/aarch64}/v_log2_data.c (99%) delete mode 100644 math/aarch64/v_logf.c delete mode 100644 math/aarch64/v_math.h delete mode 100644 math/aarch64/v_pow.c rename {pl/math => math/aarch64}/v_pow_exp_data.c (99%) rename {pl/math => math/aarch64}/v_pow_log_data.c (99%) delete mode 100644 math/aarch64/v_powf.c rename {pl/math => math/aarch64}/v_powf_data.c (98%) create mode 100644 math/include/test_defs.h create mode 100644 math/include/test_sig.h rename {pl/math => math}/log10f.c (84%) rename {pl/math => math}/poly_generic.h (99%) rename {pl/math => math}/poly_scalar_f32.h (80%) rename {pl/math => math}/poly_scalar_f64.h (80%) create mode 100644 math/test/test_defs.h rename {pl/math => math}/test/testcases/directed/acos.tst (95%) rename {pl/math => math}/test/testcases/directed/acosf.tst (95%) rename {pl/math => 
math}/test/testcases/directed/acosh.tst (96%) rename {pl/math => math}/test/testcases/directed/acoshf.tst (95%) rename {pl/math => math}/test/testcases/directed/asin.tst (97%) rename {pl/math => math}/test/testcases/directed/asinf.tst (96%) rename {pl/math => math}/test/testcases/directed/asinh.tst (95%) rename {pl/math => math}/test/testcases/directed/asinhf.tst (95%) rename {pl/math => math}/test/testcases/directed/atan.tst (96%) rename {pl/math => math}/test/testcases/directed/atan2.tst (99%) rename {pl/math => math}/test/testcases/directed/atan2f.tst (99%) rename {pl/math => math}/test/testcases/directed/atanf.tst (95%) rename {pl/math => math}/test/testcases/directed/atanh.tst (97%) rename {pl/math => math}/test/testcases/directed/atanhf.tst (96%) rename {pl/math => math}/test/testcases/directed/cbrtf.tst (97%) rename {pl/math => math}/test/testcases/directed/cosh.tst (95%) rename {pl/math => math}/test/testcases/directed/coshf.tst (93%) rename {pl/math => math}/test/testcases/directed/erfc.tst (96%) rename {pl/math => math}/test/testcases/directed/erfcf.tst (93%) rename {pl/math => math}/test/testcases/directed/expm1.tst (96%) rename {pl/math => math}/test/testcases/directed/expm1f.tst (98%) rename {pl/math => math}/test/testcases/directed/log10.tst (95%) rename {pl/math => math}/test/testcases/directed/log10f.tst (98%) rename {pl/math => math}/test/testcases/directed/log1p.tst (96%) rename {pl/math => math}/test/testcases/directed/log1pf.tst (99%) rename {pl/math => math}/test/testcases/directed/sinh.tst (96%) rename {pl/math => math}/test/testcases/directed/sinhf.tst (95%) rename {pl/math => math}/test/testcases/directed/tanf.tst (96%) rename {pl/math => math}/test/testcases/directed/tanh.tst (95%) rename {pl/math => math}/test/testcases/directed/tanhf.tst (95%) create mode 100644 math/test/trigpi_references.h rename {pl/math => math}/tools/asin.sollya (93%) rename {pl/math => math}/tools/asinf.sollya (94%) rename {pl/math => math}/tools/asinh.sollya (94%) 
rename {pl/math => math}/tools/asinhf.sollya (93%) rename {pl/math => math}/tools/atan.sollya (93%) rename {pl/math => math}/tools/atanf.sollya (92%) rename {pl/math => math}/tools/cbrt.sollya (90%) rename {pl/math => math}/tools/cbrtf.sollya (90%) rename {pl/math => math}/tools/erf.sollya (92%) rename {pl/math => math}/tools/erfc.sollya (95%) rename {pl/math => math}/tools/erfcf.sollya (91%) rename {pl/math => math}/tools/erff.sollya (91%) rename {pl/math => math}/tools/exp10.sollya (97%) rename {pl/math => math}/tools/expm1.sollya (91%) rename {pl/math => math}/tools/expm1f.sollya (91%) rename {pl/math => math}/tools/log10.sollya (96%) rename {pl/math => math}/tools/log10f.sollya (96%) rename {pl/math => math}/tools/log1p.sollya (93%) rename {pl/math => math}/tools/log1pf.sollya (91%) rename {pl/math => math}/tools/sincos.sollya (92%) rename {pl/math => math}/tools/sincosf.sollya (95%) rename {pl/math => math}/tools/sinpi.sollya (95%) rename {pl/math => math}/tools/tan.sollya (91%) rename {pl/math => math}/tools/tanf.sollya (98%) create mode 100644 math/tools/tanpi.sollya rename {pl/math => math}/tools/v_erf.sollya (91%) rename {pl/math => math}/tools/v_erfc.sollya (96%) rename {pl/math => math}/tools/v_log10.sollya (96%) rename {pl/math => math}/tools/v_log10f.sollya (96%) rename {pl/math => math}/tools/v_log2f.sollya (96%) delete mode 100644 pl/Dir.mk delete mode 100644 pl/math/Dir.mk delete mode 100644 pl/math/asinhf_data.c delete mode 100644 pl/math/atan_data.c delete mode 100644 pl/math/atanf_data.c delete mode 100644 pl/math/exp_data.c delete mode 100644 pl/math/expf.c delete mode 100644 pl/math/expm1_data.c delete mode 100644 pl/math/include/mathlib.h delete mode 100644 pl/math/include/pl_test.h delete mode 100644 pl/math/log.c delete mode 100644 pl/math/log1p_data.c delete mode 100644 pl/math/log_data.c delete mode 100644 pl/math/logf.c delete mode 100644 pl/math/logf_data.c delete mode 100644 pl/math/math_config.h delete mode 100644 pl/math/math_err.c 
delete mode 100644 pl/math/math_errf.c delete mode 100644 pl/math/pl_sig.h delete mode 100644 pl/math/sv_acosh_3u5.c delete mode 100644 pl/math/sv_acoshf_2u8.c delete mode 100644 pl/math/sv_asinh_3u0.c delete mode 100644 pl/math/sv_coshf_2u.c delete mode 100644 pl/math/sv_erf_data.c delete mode 100644 pl/math/sv_erff_data.c delete mode 100644 pl/math/sv_exp10f_1u5.c delete mode 100644 pl/math/sv_exp2f_1u6.c delete mode 100644 pl/math/sv_expf_2u.c delete mode 100644 pl/math/sv_expf_inline.h delete mode 100644 pl/math/sv_log10_2u5.c delete mode 100644 pl/math/sv_log1pf_1u3.c delete mode 100644 pl/math/sv_log1pf_inline.h delete mode 100644 pl/math/sv_log2_3u.c delete mode 100644 pl/math/sv_log_2u5.c delete mode 100644 pl/math/sv_tan_3u5.c delete mode 100644 pl/math/sv_tanhf_2u6.c delete mode 100644 pl/math/test/mathbench_funcs.h delete mode 100644 pl/math/test/mathbench_wrappers.h delete mode 100644 pl/math/test/pl_test.h delete mode 100755 pl/math/test/runulp.sh delete mode 100644 pl/math/test/testcases/directed/erff.tst delete mode 100644 pl/math/test/testcases/directed/log2.tst delete mode 100644 pl/math/test/testcases/directed/log2f.tst delete mode 100644 pl/math/test/testcases/random/double.tst delete mode 100644 pl/math/test/testcases/random/float.tst delete mode 100644 pl/math/test/ulp_funcs.h delete mode 100644 pl/math/test/ulp_wrappers.h delete mode 100644 pl/math/trigpi_references.c delete mode 100644 pl/math/v_asinh_3u5.c delete mode 100644 pl/math/v_asinhf_2u7.c delete mode 100644 pl/math/v_atan2_3u.c delete mode 100644 pl/math/v_exp_data.c delete mode 100644 pl/math/v_exp_tail.h delete mode 100644 pl/math/v_exp_tail_inline.h delete mode 100644 pl/math/v_expf_inline.h delete mode 100644 pl/math/v_expm1_2u5.c delete mode 100644 pl/math/v_expm1f_1u6.c delete mode 100644 pl/math/v_expm1f_inline.h delete mode 100644 pl/math/v_log10_2u5.c delete mode 100644 pl/math/v_log10f_3u5.c delete mode 100644 pl/math/v_log1p_2u5.c delete mode 100644 
pl/math/v_log1p_inline.h delete mode 100644 pl/math/v_log1pf_2u1.c delete mode 100644 pl/math/v_log1pf_inline.h delete mode 100644 pl/math/v_log2_3u.c delete mode 100644 pl/math/v_log2f_2u5.c delete mode 100644 pl/math/v_log_data.c delete mode 100644 pl/math/v_sinh_3u.c delete mode 100644 pl/math/v_tanh_3u.c rename string/aarch64/{ => experimental}/memchr-sve.S (96%) rename string/aarch64/{ => experimental}/memcmp-sve.S (93%) rename string/aarch64/{ => experimental}/stpcpy-sve.S (100%) rename string/aarch64/{ => experimental}/strchr-sve.S (97%) rename string/aarch64/{ => experimental}/strchrnul-sve.S (100%) rename string/aarch64/{ => experimental}/strcmp-sve.S (96%) rename string/aarch64/{ => experimental}/strcpy-sve.S (96%) rename string/aarch64/{ => experimental}/strlen-sve.S (96%) rename string/aarch64/{ => experimental}/strncmp-sve.S (95%) rename string/aarch64/{ => experimental}/strnlen-sve.S (96%) rename string/aarch64/{ => experimental}/strrchr-sve.S (98%) create mode 100644 string/aarch64/memset-sve.S diff --git a/MAINTAINERS b/MAINTAINERS index 6c5823a8dbce5a..06cceb8f2501ad 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1,12 +1,9 @@ / - Szabolcs Nagy + Tamar Christina math/ - Szabolcs Nagy -networking/ - Szabolcs Nagy -pl/ Pierre Blanchard Joe Ramsay +networking/ + Ola Liljedahl string/ - Szabolcs Nagy Wilco Dijkstra diff --git a/Makefile b/Makefile index c487896728c2cd..e7503dbd2f6075 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ # Makefile - requires GNU make # -# Copyright (c) 2018-2022, Arm Limited. +# Copyright (c) 2018-2024, Arm Limited. # SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception srcdir = . @@ -11,7 +11,6 @@ includedir = $(prefix)/include # Configure these in config.mk, do not make changes in this file. 
SUBS = math string networking -PLSUBS = math HOST_CC = cc HOST_CFLAGS = -std=c99 -O2 HOST_LDFLAGS = @@ -21,12 +20,22 @@ CPPFLAGS = CFLAGS = -std=c99 -O2 CFLAGS_SHARED = -fPIC CFLAGS_ALL = -Ibuild/include $(CPPFLAGS) $(CFLAGS) -CFLAGS_PL = -Ibuild/pl/include $(CPPFLAGS) $(CFLAGS) -DPL LDFLAGS = LDLIBS = AR = $(CROSS_COMPILE)ar RANLIB = $(CROSS_COMPILE)ranlib INSTALL = install +# Detect OS. +# Assume Unix environment: Linux, Darwin, or Msys. +OS := $(shell uname -s) +OS := $(patsubst MSYS%,Msys,$(OS)) +# Following math dependencies can be adjusted in config file +# if necessary, e.g. for Msys. +libm-libs = -lm +libc-libs = -lc +mpfr-libs = -lmpfr +gmp-libs = -lgmp +mpc-libs = -lmpc all: @@ -53,7 +62,6 @@ $(DIRS): mkdir -p $@ $(filter %.os,$(ALL_FILES)): CFLAGS_ALL += $(CFLAGS_SHARED) -$(filter %.os,$(ALL_FILES)): CFLAGS_PL += $(CFLAGS_SHARED) build/%.o: $(srcdir)/%.S $(CC) $(CFLAGS_ALL) -c -o $@ $< diff --git a/README b/README index 651ebdc84bc865..4bbed76d75c824 100644 --- a/README +++ b/README @@ -12,12 +12,25 @@ contribution requirements are documented in README.contributors of the appropriate subdirectory. Regular quarterly releases are tagged as vYY.MM, the latest -release is v24.01. +release is v25.01. Source code layout: build/ - build directory (created by make). -math/ - math subproject sources. +math/ - math subproject sources for generic scalar + subroutines and sources shared with + subdirectories of math/. + All math routines should meet the quality + requirements stated in math/README.contributors, + routines that fail to do so are located in an + experimental/ directory. +math/aarch64/ - math subproject AArch64-specific sources + and sources shared with subdirectories. +math/aarch64/advsimd - AdvSIMD-specific math sources. +math/aarch64/experimental - Experimental math sources do not + meet quality requirements stated in + math/README.contributors. +math/aarch64/sve - SVE-specific math sources. math/include/ - math library public headers. 
math/test/ - math test and benchmark related sources. math/tools/ - tools used for designing the algorithms. @@ -25,9 +38,16 @@ networking/ - networking subproject sources. networking/include/ - networking library public headers. networking/test/ - networking test and benchmark related sources. string/ - string routines subproject sources. + All string routines should meet the quality + requirements stated in string/README.contributors, + routines that fail to do so are located in an + experimental/ directory. +string/<arch> - <arch>-specific string routines sources for + <arch>=aarch64, and arm. +string/aarch64/experimental - Experimental string routines which + may not be fully optimized yet. string/include/ - string library public headers. string/test/ - string test and benchmark related sources. -pl/... - separately maintained performance library code. The steps to build the target libraries and run the tests: @@ -50,6 +70,13 @@ Or building and testing the math subproject only: make all-math make check-math +Note on compiler compatibility/requirement: + +SVE routines are always built by default - this means that on AArch64 +GCC >= 10 or LLVM >= 5 are always required for SVE ACLE compatibility. +There is no explicit check for compatible compiler, therefore the SVE +routines will fail to build if CC is too old. + The test system requires libmpfr and libmpc. For example on debian linux they can be installed as: diff --git a/config.mk.dist b/config.mk.dist index 03fb54db52fabe..ae4574e7cdba80 100644 --- a/config.mk.dist +++ b/config.mk.dist @@ -1,14 +1,11 @@ # Example config.mk # -# Copyright (c) 2018-2023, Arm Limited. +# Copyright (c) 2018-2024, Arm Limited.
# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception # Subprojects to build SUBS = math string networking -# Subsubprojects to build if subproject pl is built -PLSUBS = math - # Target architecture: aarch64, arm or x86_64 ARCH = aarch64 @@ -30,6 +27,27 @@ HOST_CFLAGS += -Wall -Wno-unused-function HOST_CFLAGS += -g CFLAGS += -g +ifeq ($(OS),Msys) + # llvm is the only available/valid native compiler + CC = clang + AR = llvm-ar + RANLIB = llvm-ranlib + HOST_CC = clang + SYSROOT = /c/wenv/msys2/msys64/clangarm64 + # Common windows flags + COMMON_WIN_CFLAGS = -D_CRT_SECURE_NO_WARNINGS -D_CRT_NONSTDC_NO_DEPRECATE + COMMON_WIN_CFLAGS += -Wno-deprecated-declarations -Wno-unused-variable + # For mathtest + HOST_CFLAGS += -I$(SYSROOT)/include + HOST_CFLAGS += $(COMMON_WIN_CFLAGS) -Wno-ignored-attributes + # Clear the default flag -fPIC, as not supported on Windows + CFLAGS_SHARED = + # For ulp.h with MPFR + CFLAGS += -I$(SYSROOT)/include + # For clang on Windows + CFLAGS += $(COMMON_WIN_CFLAGS) +endif + # Optimize the shared libraries on aarch64 assuming they fit in 1M. #CFLAGS_SHARED = -fPIC -mcmodel=tiny @@ -45,12 +63,33 @@ math-cflags = math-ldlibs = math-ulpflags = math-testflags = -string-cflags = +string-cflags = -falign-functions=64 networking-cflags = -# Use if mpfr is available on the target for ulp error checking. -#math-ldlibs += -lmpfr -lgmp -#math-cflags += -DUSE_MPFR +ifeq ($(OS),Msys) + # Libraries can be installed with pacman + libm-libs = -lmsvcrt -lvcruntime -lucrt + libc-libs = + # Linker will look for .lib but some systems only have .dll.a, + # therefore we have to give absolute path to libraries. + # This is system dependent and might need adjusting. + mpfr-libs = $(SYSROOT)/lib/libmpfr.dll.a + gmp-libs = $(SYSROOT)/lib/libgmp.dll.a + mpc-libs = $(SYSROOT)/lib/libmpc.dll.a +endif + +# Use if mpfr is available on the target for ulp error checking. 
If +# enabling this, it is advised to disable fenv checks by uncommenting +# the two lines at the bottom of this block. +USE_MPFR=0 +math-cflags += -DUSE_MPFR=$(USE_MPFR) +ifeq ($(USE_MPFR), 1) + math-ldlibs += $(mpfr-libs) $(gmp-libs) + math-ulpflags += -m +endif +# Disable fenv checks +#math-ulpflags = -q -f +#math-testflags = -nostatus # Use with gcc. math-cflags += -frounding-math -fexcess-precision=standard -fno-stack-protector @@ -59,30 +98,36 @@ math-cflags += -ffp-contract=fast -fno-math-errno # Use with clang. #math-cflags += -ffp-contract=fast -# Disable/enable SVE vector math code and tests. -# If WANT_SVE_MATH is enabled, math-sve-cflags is added for SVE -# routines only so that SVE code does not leak into scalar -# routines. It is also necessary to add it for tools (e.g. ulp, -# mathbench) -WANT_SVE_MATH = 0 -ifeq ($(WANT_SVE_MATH), 1) - math-sve-cflags = -march=armv8-a+sve -endif -math-cflags += -DWANT_SVE_MATH=$(WANT_SVE_MATH) - # If defined to 1, set errno in math functions according to ISO C. Many math # libraries do not set errno, so this is 0 by default. It may need to be # set to 1 if math.h has (math_errhandling & MATH_ERRNO) != 0. WANT_ERRNO = 0 math-cflags += -DWANT_ERRNO=$(WANT_ERRNO) +# Disable/enable SVE vector math tests/tools. +ifeq ($(ARCH),aarch64) + WANT_SVE_TESTS = 1 +else + WANT_SVE_TESTS = 0 +endif +math-cflags += -DWANT_SVE_TESTS=$(WANT_SVE_TESTS) + # If set to 1, set fenv in vector math routines. WANT_SIMD_EXCEPT = 0 math-cflags += -DWANT_SIMD_EXCEPT=$(WANT_SIMD_EXCEPT) -# Disable fenv checks -#math-ulpflags = -q -f -#math-testflags = -nostatus +# If set to 1, enable tests for exp10. +WANT_EXP10_TESTS = 1 +math-cflags += -DWANT_EXP10_TESTS=$(WANT_EXP10_TESTS) + +# If set to 1, enable tests for sinpi and cospi. 
These functions are +# only supported on aarch64 +ifeq ($(ARCH),aarch64) + WANT_TRIGPI_TESTS = 1 +else + WANT_TRIGPI_TESTS = 0 +endif +math-cflags += -DWANT_TRIGPI_TESTS=$(WANT_TRIGPI_TESTS) # Remove GNU Property Notes from asm files. #string-cflags += -DWANT_GNU_PROPERTY=0 @@ -92,3 +137,13 @@ math-cflags += -DWANT_SIMD_EXCEPT=$(WANT_SIMD_EXCEPT) # Avoid auto-vectorization of scalar code and unroll loops networking-cflags += -O2 -fno-tree-vectorize -funroll-loops + +# Provide *_finite symbols and some of the glibc hidden symbols +# so libmathlib can be used with binaries compiled against glibc +# to interpose math functions with both static and dynamic linking +USE_GLIBC_ABI = 1 +math-cflags += -DUSE_GLIBC_ABI=$(USE_GLIBC_ABI) + +# Enable experimental math routines - non-C23 vector math and low-accuracy scalar +WANT_EXPERIMENTAL_MATH = 0 +math-cflags += -DWANT_EXPERIMENTAL_MATH=$(WANT_EXPERIMENTAL_MATH) diff --git a/math/Dir.mk b/math/Dir.mk index 5e9494a7bd3cbc..6277241ac4de9a 100644 --- a/math/Dir.mk +++ b/math/Dir.mk @@ -1,23 +1,61 @@ # Makefile fragment - requires GNU make # -# Copyright (c) 2019-2023, Arm Limited. +# Copyright (c) 2019-2024, Arm Limited. 
# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception -S := $(srcdir)/math -B := build/math - -math-lib-srcs := $(wildcard $(S)/*.[cS]) -math-lib-srcs += $(wildcard $(S)/$(ARCH)/*.[cS]) +.SECONDEXPANSION: + +ifneq ($(OS),Linux) + ifeq ($(WANT_SIMD_EXCEPT),1) + $(error WANT_SIMD_EXCEPT is not supported outside Linux) + endif + ifneq ($(USE_MPFR),1) + $(warning WARNING: Double-precision ULP tests will not be usable without MPFR) + endif + ifeq ($(USE_GLIBC_ABI),1) + $(error Can only generate special GLIBC symbols on Linux - please disable USE_GLIBC_ABI) + endif +endif + +ifneq ($(ARCH),aarch64) + ifeq ($(WANT_TRIGPI_TESTS),1) + $(error trigpi functions only supported on aarch64) + endif + ifeq ($(WANT_EXPERIMENTAL_MATH),1) + $(error Experimental math only supported on aarch64) + endif +endif + +math-src-dir := $(srcdir)/math +math-build-dir := build/math + +math-lib-srcs := $(wildcard $(math-src-dir)/*.[cS]) +math-lib-srcs += $(wildcard $(math-src-dir)/$(ARCH)/*.[cS]) +ifeq ($(OS),Linux) +# Vector symbols only supported on Linux +math-lib-srcs += $(wildcard $(math-src-dir)/$(ARCH)/*/*.[cS]) +endif + +ifeq ($(WANT_EXPERIMENTAL_MATH), 1) +ifeq ($(OS),Linux) +# Vector symbols only supported on Linux +math-lib-srcs += $(wildcard $(math-src-dir)/$(ARCH)/experimental/*/*.[cS]) +else +math-lib-srcs += $(wildcard $(math-src-dir)/$(ARCH)/experimental/*.[cS]) +endif +else +# Scalar experimental symbols will have been added by wildcard, so remove them +math-lib-srcs := $(filter-out $(math-src-dir)/aarch64/experimental/%, $(math-lib-srcs)) +endif math-test-srcs := \ - $(S)/test/mathtest.c \ - $(S)/test/mathbench.c \ - $(S)/test/ulp.c \ + $(math-src-dir)/test/mathtest.c \ + $(math-src-dir)/test/mathbench.c \ + $(math-src-dir)/test/ulp.c \ -math-test-host-srcs := $(wildcard $(S)/test/rtest/*.[cS]) +math-test-host-srcs := $(wildcard $(math-src-dir)/test/rtest/*.[cS]) -math-includes := $(patsubst $(S)/%,build/%,$(wildcard $(S)/include/*.h)) -math-test-includes := 
$(patsubst $(S)/%,build/include/%,$(wildcard $(S)/test/*.h)) +math-includes := $(patsubst $(math-src-dir)/%,build/%,$(wildcard $(math-src-dir)/include/*.h)) math-libs := \ build/lib/libmathlib.so \ @@ -33,9 +71,9 @@ math-tools := \ math-host-tools := \ build/bin/rtest \ -math-lib-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(math-lib-srcs))) -math-test-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(math-test-srcs))) -math-host-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(math-test-host-srcs))) +math-lib-objs := $(patsubst $(math-src-dir)/%,$(math-build-dir)/%.o,$(basename $(math-lib-srcs))) +math-test-objs := $(patsubst $(math-src-dir)/%,$(math-build-dir)/%.o,$(basename $(math-test-srcs))) +math-host-objs := $(patsubst $(math-src-dir)/%,$(math-build-dir)/%.o,$(basename $(math-test-host-srcs))) math-target-objs := $(math-lib-objs) $(math-test-objs) math-objs := $(math-target-objs) $(math-target-objs:%.o=%.os) $(math-host-objs) @@ -44,18 +82,69 @@ math-files := \ $(math-libs) \ $(math-tools) \ $(math-host-tools) \ - $(math-includes) \ - $(math-test-includes) \ + $(math-includes) -all-math: $(math-libs) $(math-tools) $(math-includes) $(math-test-includes) +all-math: $(math-libs) $(math-tools) $(math-includes) -$(math-objs): $(math-includes) $(math-test-includes) +$(math-objs): $(math-includes) $(math-objs): CFLAGS_ALL += $(math-cflags) -$(B)/test/mathtest.o: CFLAGS_ALL += -fmath-errno +$(math-build-dir)/test/mathtest.o: CFLAGS_ALL += -fmath-errno $(math-host-objs): CC = $(HOST_CC) $(math-host-objs): CFLAGS_ALL = $(HOST_CFLAGS) -$(B)/test/ulp.o: $(S)/test/ulp.h +# Add include path for experimental routines so they can share helpers with non-experimental +$(math-build-dir)/aarch64/experimental/advsimd/%: CFLAGS_ALL += -I$(math-src-dir)/aarch64/advsimd +$(math-build-dir)/aarch64/experimental/sve/%: CFLAGS_ALL += -I$(math-src-dir)/aarch64/sve + +$(math-objs): CFLAGS_ALL += -I$(math-src-dir) + +ulp-funcs-dir = build/test/ulp-funcs/ +ulp-wrappers-dir = 
build/test/ulp-wrappers/ +mathbench-funcs-dir = build/test/mathbench-funcs/ +test-sig-dirs = $(ulp-funcs-dir) $(ulp-wrappers-dir) $(mathbench-funcs-dir) +build/include/test $(test-sig-dirs) $(addsuffix /$(ARCH),$(test-sig-dirs)) $(addsuffix /aarch64/experimental,$(test-sig-dirs)) \ +$(addsuffix /aarch64/experimental/advsimd,$(test-sig-dirs)) $(addsuffix /aarch64/experimental/sve,$(test-sig-dirs)) \ +$(addsuffix /aarch64/advsimd,$(test-sig-dirs)) $(addsuffix /aarch64/sve,$(test-sig-dirs)): + mkdir -p $@ + +ulp-funcs = $(patsubst $(math-src-dir)/%,$(ulp-funcs-dir)/%,$(basename $(math-lib-srcs))) +ulp-wrappers = $(patsubst $(math-src-dir)/%,$(ulp-wrappers-dir)/%,$(basename $(math-lib-srcs))) +mathbench-funcs = $(patsubst $(math-src-dir)/%,$(mathbench-funcs-dir)/%,$(basename $(math-lib-srcs))) + +ifeq ($(WANT_SVE_TESTS), 0) + # Filter out anything with sve in the path + ulp-funcs := $(foreach a,$(ulp-funcs),$(if $(findstring sve,$a),,$a)) + ulp-wrappers := $(foreach a,$(ulp-wrappers),$(if $(findstring sve,$a),,$a)) + mathbench-funcs := $(foreach a,$(mathbench-funcs),$(if $(findstring sve,$a),,$a)) +endif + +define emit_sig +$1/aarch64/experimental/sve/%.i: EXTRA_INC = -I$(math-src-dir)/aarch64/sve +$1/aarch64/experimental/advsimd/%.i: EXTRA_INC = -I$(math-src-dir)/aarch64/advsimd +$1/%.i: $(math-src-dir)/%.c | $$$$(@D) + $(CC) $$< $(math-cflags) -I$(math-src-dir)/include -I$(math-src-dir) $$(EXTRA_INC) -D$2 -E -o $$@ +$1/%: $1/%.i + { grep TEST_SIG $$< || true; } | cut -f 2- -d ' ' > $$@ +endef + +$(eval $(call emit_sig,$(ulp-funcs-dir),EMIT_ULP_FUNCS)) +$(eval $(call emit_sig,$(ulp-wrappers-dir),EMIT_ULP_WRAPPERS)) +$(eval $(call emit_sig,$(mathbench-funcs-dir),EMIT_MATHBENCH_FUNCS)) + +ulp-funcs-gen = build/include/test/ulp_funcs_gen.h +ulp-wrappers-gen = build/include/test/ulp_wrappers_gen.h +mathbench-funcs-gen = build/include/test/mathbench_funcs_gen.h +math-tools-autogen-headers = $(ulp-funcs-gen) $(ulp-wrappers-gen) $(mathbench-funcs-gen) + +$(ulp-funcs-gen): 
$(ulp-funcs) | $$(@D) +$(ulp-wrappers-gen): $(ulp-wrappers) | $$(@D) +$(mathbench-funcs-gen): $(mathbench-funcs) | $$(@D) + +$(math-tools-autogen-headers): | $$(@D) + cat $^ | sort -u > $@ + +$(math-build-dir)/test/mathbench.o: $(mathbench-funcs-gen) +$(math-build-dir)/test/ulp.o: $(math-src-dir)/test/ulp.h $(ulp-funcs-gen) $(ulp-wrappers-gen) build/lib/libmathlib.so: $(math-lib-objs:%.o=%.os) $(CC) $(CFLAGS_ALL) $(LDFLAGS) -shared -o $@ $^ @@ -65,38 +154,40 @@ build/lib/libmathlib.a: $(math-lib-objs) $(AR) rc $@ $^ $(RANLIB) $@ -$(math-host-tools): HOST_LDLIBS += -lm -lmpfr -lmpc -$(math-tools): LDLIBS += $(math-ldlibs) -lm -# math-sve-cflags should be empty if WANT_SVE_MATH is not enabled -$(math-tools): CFLAGS_ALL += $(math-sve-cflags) +$(math-host-tools): HOST_LDLIBS += $(libm-libs) $(mpfr-libs) $(mpc-libs) +$(math-tools): LDLIBS += $(math-ldlibs) $(libm-libs) + +ifneq ($(OS),Darwin) + $(math-tools): LDFLAGS += -static +endif build/bin/rtest: $(math-host-objs) $(HOST_CC) $(HOST_CFLAGS) $(HOST_LDFLAGS) -o $@ $^ $(HOST_LDLIBS) -build/bin/mathtest: $(B)/test/mathtest.o build/lib/libmathlib.a - $(CC) $(CFLAGS_ALL) $(LDFLAGS) -static -o $@ $^ $(LDLIBS) +build/bin/mathtest: $(math-build-dir)/test/mathtest.o build/lib/libmathlib.a + $(CC) $(CFLAGS_ALL) $(LDFLAGS) -o $@ $^ $(libm-libs) -build/bin/mathbench: $(B)/test/mathbench.o build/lib/libmathlib.a - $(CC) $(CFLAGS_ALL) $(LDFLAGS) -static -o $@ $^ $(LDLIBS) +build/bin/mathbench: $(math-build-dir)/test/mathbench.o build/lib/libmathlib.a + $(CC) $(CFLAGS_ALL) $(LDFLAGS) -o $@ $^ $(libm-libs) # This is not ideal, but allows custom symbols in mathbench to get resolved. 
-build/bin/mathbench_libc: $(B)/test/mathbench.o build/lib/libmathlib.a - $(CC) $(CFLAGS_ALL) $(LDFLAGS) -static -o $@ $< $(LDLIBS) -lc build/lib/libmathlib.a -lm - -build/bin/ulp: $(B)/test/ulp.o build/lib/libmathlib.a - $(CC) $(CFLAGS_ALL) $(LDFLAGS) -static -o $@ $^ $(LDLIBS) +build/bin/mathbench_libc: $(math-build-dir)/test/mathbench.o build/lib/libmathlib.a + $(CC) $(CFLAGS_ALL) $(LDFLAGS) -o $@ $< $(libm-libs) $(libc-libs) build/lib/libmathlib.a $(libm-libs) -build/include/%.h: $(S)/include/%.h - cp $< $@ +build/bin/ulp: $(math-build-dir)/test/ulp.o build/lib/libmathlib.a + $(CC) $(CFLAGS_ALL) $(LDFLAGS) -o $@ $^ $(LDLIBS) -build/include/test/%.h: $(S)/test/%.h +build/include/%.h: $(math-src-dir)/include/%.h cp $< $@ -build/bin/%.sh: $(S)/test/%.sh +build/bin/%.sh: $(math-src-dir)/test/%.sh cp $< $@ -math-tests := $(wildcard $(S)/test/testcases/directed/*.tst) -math-rtests := $(wildcard $(S)/test/testcases/random/*.tst) +math-tests := $(wildcard $(math-src-dir)/test/testcases/directed/*.tst) +ifneq ($(WANT_EXP10_TESTS),1) +math-tests := $(filter-out %exp10.tst, $(math-tests)) +endif +math-rtests := $(wildcard $(math-src-dir)/test/testcases/random/*.tst) check-math-test: $(math-tools) cat $(math-tests) | $(EMULATOR) build/bin/mathtest $(math-testflags) @@ -104,8 +195,88 @@ check-math-test: $(math-tools) check-math-rtest: $(math-host-tools) $(math-tools) cat $(math-rtests) | build/bin/rtest | $(EMULATOR) build/bin/mathtest $(math-testflags) +ulp-input-dir = $(math-build-dir)/test/inputs +$(ulp-input-dir) $(ulp-input-dir)/$(ARCH) $(ulp-input-dir)/aarch64/sve $(ulp-input-dir)/aarch64/advsimd \ +$(ulp-input-dir)/aarch64/experimental $(ulp-input-dir)/aarch64/experimental/advsimd $(ulp-input-dir)/aarch64/experimental/sve: + mkdir -p $@ + +math-lib-lims = $(patsubst $(math-src-dir)/%.c,$(ulp-input-dir)/%.ulp,$(math-lib-srcs)) +math-lib-lims-nn = $(patsubst $(math-src-dir)/%.c,$(ulp-input-dir)/%.ulp_nn,$(math-lib-srcs)) +math-lib-fenvs = $(patsubst 
$(math-src-dir)/%.c,$(ulp-input-dir)/%.fenv,$(math-lib-srcs)) +math-lib-itvs = $(patsubst $(math-src-dir)/%.c,$(ulp-input-dir)/%.itv,$(math-lib-srcs)) +math-lib-cvals = $(patsubst $(math-src-dir)/%.c,$(ulp-input-dir)/%.cval,$(math-lib-srcs)) + +ulp-inputs = $(math-lib-lims) $(math-lib-lims-nn) $(math-lib-fenvs) $(math-lib-itvs) $(math-lib-cvals) +$(ulp-inputs): CFLAGS = -I$(math-src-dir)/test -I$(math-src-dir)/include -I$(math-src-dir) $(math-cflags)\ + -I$(math-src-dir)/aarch64/advsimd -I$(math-src-dir)/aarch64/sve + +$(ulp-input-dir)/%.ulp.i: $(math-src-dir)/%.c | $$(@D) + $(CC) $(CFLAGS) $< -E -o $@ + +$(ulp-input-dir)/%.ulp: $(ulp-input-dir)/%.ulp.i + { grep "TEST_ULP " $< || true; } > $@ + +$(ulp-input-dir)/%.ulp_nn.i: $(math-src-dir)/%.c | $$(@D) + $(CC) $(CFLAGS) $< -E -o $@ + +$(ulp-input-dir)/%.ulp_nn: $(ulp-input-dir)/%.ulp_nn.i + { grep "TEST_ULP_NONNEAREST " $< || true; } > $@ + +$(ulp-input-dir)/%.fenv.i: $(math-src-dir)/%.c | $$(@D) + $(CC) $(CFLAGS) $< -E -o $@ + +$(ulp-input-dir)/%.fenv: $(ulp-input-dir)/%.fenv.i + { grep "TEST_DISABLE_FENV " $< || true; } > $@ + +$(ulp-input-dir)/%.itv.i: $(math-src-dir)/%.c | $$(@D) + $(CC) $(CFLAGS) $< -E -o $@ + +$(ulp-input-dir)/%.itv: $(ulp-input-dir)/%.itv.i + { grep "TEST_INTERVAL " $< || true; } | sed "s/ TEST_INTERVAL/\nTEST_INTERVAL/g" > $@ + +$(ulp-input-dir)/%.cval.i: $(math-src-dir)/%.c | $$(@D) + $(CC) $(CFLAGS) $< -E -o $@ + +$(ulp-input-dir)/%.cval: $(ulp-input-dir)/%.cval.i + { grep "TEST_CONTROL_VALUE " $< || true; } > $@ + +ulp-lims = $(ulp-input-dir)/limits +$(ulp-lims): $(math-lib-lims) + +ulp-lims-nn = $(ulp-input-dir)/limits_nn +$(ulp-lims-nn): $(math-lib-lims-nn) + +fenv-exps := $(ulp-input-dir)/fenv +$(fenv-exps): $(math-lib-fenvs) + +generic-itvs = $(ulp-input-dir)/itvs +$(generic-itvs): $(filter-out $(ulp-input-dir)/$(ARCH)/%,$(math-lib-itvs)) + +arch-itvs = $(ulp-input-dir)/$(ARCH)/itvs +$(arch-itvs): $(filter $(ulp-input-dir)/$(ARCH)/%,$(math-lib-itvs)) + +ulp-cvals := 
$(ulp-input-dir)/cvals +$(ulp-cvals): $(math-lib-cvals) + +# Remove first word, which will be TEST directive +$(ulp-lims) $(ulp-lims-nn) $(fenv-exps) $(arch-itvs) $(generic-itvs) $(ulp-cvals): | $$(@D) + sed "s/TEST_[^ ]* //g" $^ | sort -u > $@ + +check-math-ulp: $(ulp-lims) $(ulp-lims-nn) +check-math-ulp: $(fenv-exps) $(ulp-cvals) +check-math-ulp: $(generic-itvs) $(arch-itvs) check-math-ulp: $(math-tools) - ULPFLAGS="$(math-ulpflags)" WANT_SIMD_EXCEPT="$(WANT_SIMD_EXCEPT)" build/bin/runulp.sh $(EMULATOR) + ULPFLAGS="$(math-ulpflags)" \ + LIMITS=../../$(ulp-lims) \ + ARCH_ITVS=../../$(arch-itvs) \ + GEN_ITVS=../../$(generic-itvs) \ + DISABLE_FENV=../../$(fenv-exps) \ + CVALS=../../$(ulp-cvals) \ + FUNC=$(func) \ + WANT_EXPERIMENTAL_MATH=$(WANT_EXPERIMENTAL_MATH) \ + WANT_SVE_TESTS=$(WANT_SVE_TESTS) \ + USE_MPFR=$(USE_MPFR) \ + build/bin/runulp.sh $(EMULATOR) check-math: check-math-test check-math-rtest check-math-ulp diff --git a/math/README.contributors b/math/README.contributors index 33e7ba376e4193..58a04fa4759d15 100644 --- a/math/README.contributors +++ b/math/README.contributors @@ -1,8 +1,9 @@ STYLE REQUIREMENTS ================== -1. Most code in this sub-directory is expected to be upstreamed into glibc so - the GNU Coding Standard and glibc specific conventions should be followed +1. With the exception of math/aarch64/experimental/, most code in this + sub-directory is expected to be upstreamed into glibc so the GNU + Coding Standard and glibc specific conventions should be followed to ease upstreaming. 2. ABI and symbols: the code should be written so it is suitable for inclusion diff --git a/pl/math/v_acos_2u.c b/math/aarch64/advsimd/acos.c similarity index 85% rename from pl/math/v_acos_2u.c rename to math/aarch64/advsimd/acos.c index 581f8506c0d6f5..7873a07e6f56eb 100644 --- a/pl/math/v_acos_2u.c +++ b/math/aarch64/advsimd/acos.c @@ -1,14 +1,14 @@ /* * Double-precision vector acos(x) function. * - * Copyright (c) 2023, Arm Limited. 
+ * Copyright (c) 2023-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_math.h" -#include "poly_advsimd_f64.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "v_poly_f64.h" +#include "test_sig.h" +#include "test_defs.h" static const struct data { @@ -30,8 +30,8 @@ static const struct data }; #define AllMask v_u64 (0xffffffffffffffff) -#define Oneu (0x3ff0000000000000) -#define Small (0x3e50000000000000) /* 2^-53. */ +#define Oneu 0x3ff0000000000000 +#define Small 0x3e50000000000000 /* 2^-26. */ #if WANT_SIMD_EXCEPT static float64x2_t VPCS_ATTR NOINLINE @@ -111,12 +111,12 @@ float64x2_t VPCS_ATTR V_NAME_D1 (acos) (float64x2_t x) return vfmaq_f64 (add, mul, y); } -PL_SIG (V, D, 1, acos, -1.0, 1.0) -PL_TEST_ULP (V_NAME_D1 (acos), 1.02) -PL_TEST_EXPECT_FENV (V_NAME_D1 (acos), WANT_SIMD_EXCEPT) -PL_TEST_INTERVAL (V_NAME_D1 (acos), 0, Small, 5000) -PL_TEST_INTERVAL (V_NAME_D1 (acos), Small, 0.5, 50000) -PL_TEST_INTERVAL (V_NAME_D1 (acos), 0.5, 1.0, 50000) -PL_TEST_INTERVAL (V_NAME_D1 (acos), 1.0, 0x1p11, 50000) -PL_TEST_INTERVAL (V_NAME_D1 (acos), 0x1p11, inf, 20000) -PL_TEST_INTERVAL (V_NAME_D1 (acos), -0, -inf, 20000) +TEST_SIG (V, D, 1, acos, -1.0, 1.0) +TEST_ULP (V_NAME_D1 (acos), 1.02) +TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (acos), WANT_SIMD_EXCEPT) +TEST_INTERVAL (V_NAME_D1 (acos), 0, Small, 5000) +TEST_INTERVAL (V_NAME_D1 (acos), Small, 0.5, 50000) +TEST_INTERVAL (V_NAME_D1 (acos), 0.5, 1.0, 50000) +TEST_INTERVAL (V_NAME_D1 (acos), 1.0, 0x1p11, 50000) +TEST_INTERVAL (V_NAME_D1 (acos), 0x1p11, inf, 20000) +TEST_INTERVAL (V_NAME_D1 (acos), -0, -inf, 20000) diff --git a/pl/math/v_acosf_1u4.c b/math/aarch64/advsimd/acosf.c similarity index 82% rename from pl/math/v_acosf_1u4.c rename to math/aarch64/advsimd/acosf.c index bb17b1df18f355..e200f792c76436 100644 --- a/pl/math/v_acosf_1u4.c +++ b/math/aarch64/advsimd/acosf.c @@ -1,14 +1,14 @@ /* * Single-precision vector acos(x) function. 
* - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_math.h" -#include "poly_advsimd_f32.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "v_poly_f32.h" +#include "test_sig.h" +#include "test_defs.h" static const struct data { @@ -57,8 +57,8 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t special) The largest observed error in this region is 1.32 ulps, _ZGVnN4v_acosf (0x1.15ba56p-1) got 0x1.feb33p-1 - want 0x1.feb32ep-1. */ -float32x4_t VPCS_ATTR V_NAME_F1 (acos) (float32x4_t x) + want 0x1.feb32ep-1. */ +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (acos) (float32x4_t x) { const struct data *d = ptr_barrier (&data); @@ -102,12 +102,14 @@ float32x4_t VPCS_ATTR V_NAME_F1 (acos) (float32x4_t x) return vfmaq_f32 (add, mul, y); } -PL_SIG (V, F, 1, acos, -1.0, 1.0) -PL_TEST_ULP (V_NAME_F1 (acos), 0.82) -PL_TEST_EXPECT_FENV (V_NAME_F1 (acos), WANT_SIMD_EXCEPT) -PL_TEST_INTERVAL (V_NAME_F1 (acos), 0, 0x1p-26, 5000) -PL_TEST_INTERVAL (V_NAME_F1 (acos), 0x1p-26, 0.5, 50000) -PL_TEST_INTERVAL (V_NAME_F1 (acos), 0.5, 1.0, 50000) -PL_TEST_INTERVAL (V_NAME_F1 (acos), 1.0, 0x1p11, 50000) -PL_TEST_INTERVAL (V_NAME_F1 (acos), 0x1p11, inf, 20000) -PL_TEST_INTERVAL (V_NAME_F1 (acos), -0, -inf, 20000) +HALF_WIDTH_ALIAS_F1 (acos) + +TEST_SIG (V, F, 1, acos, -1.0, 1.0) +TEST_ULP (V_NAME_F1 (acos), 0.82) +TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (acos), WANT_SIMD_EXCEPT) +TEST_INTERVAL (V_NAME_F1 (acos), 0, 0x1p-26, 5000) +TEST_INTERVAL (V_NAME_F1 (acos), 0x1p-26, 0.5, 50000) +TEST_INTERVAL (V_NAME_F1 (acos), 0.5, 1.0, 50000) +TEST_INTERVAL (V_NAME_F1 (acos), 1.0, 0x1p11, 50000) +TEST_INTERVAL (V_NAME_F1 (acos), 0x1p11, inf, 20000) +TEST_INTERVAL (V_NAME_F1 (acos), -0, -inf, 20000) diff --git a/pl/math/v_acosh_3u5.c b/math/aarch64/advsimd/acosh.c similarity index 72% rename from pl/math/v_acosh_3u5.c rename to math/aarch64/advsimd/acosh.c index 
42fa2616d562bb..55d8ed5a421ecd 100644 --- a/pl/math/v_acosh_3u5.c +++ b/math/aarch64/advsimd/acosh.c @@ -1,12 +1,12 @@ /* - * Single-precision vector acosh(x) function. - * Copyright (c) 2023, Arm Limited. + * Double-precision vector acosh(x) function. + * Copyright (c) 2023-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_math.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" #define WANT_V_LOG1P_K0_SHORTCUT 1 #include "v_log1p_inline.h" @@ -45,9 +45,8 @@ VPCS_ATTR float64x2_t V_NAME_D1 (acosh) (float64x2_t x) x = vbslq_f64 (special, vreinterpretq_f64_u64 (d->one), x); #endif - float64x2_t xm1 = vsubq_f64 (x, v_f64 (1)); - float64x2_t y; - y = vaddq_f64 (x, v_f64 (1)); + float64x2_t xm1 = vsubq_f64 (x, v_f64 (1.0)); + float64x2_t y = vaddq_f64 (x, v_f64 (1.0)); y = vmulq_f64 (y, xm1); y = vsqrtq_f64 (y); y = vaddq_f64 (xm1, y); @@ -57,10 +56,10 @@ VPCS_ATTR float64x2_t V_NAME_D1 (acosh) (float64x2_t x) return log1p_inline (y, &d->log1p_consts); } -PL_SIG (V, D, 1, acosh, 1.0, 10.0) -PL_TEST_ULP (V_NAME_D1 (acosh), 2.53) -PL_TEST_EXPECT_FENV (V_NAME_D1 (acosh), WANT_SIMD_EXCEPT) -PL_TEST_INTERVAL (V_NAME_D1 (acosh), 1, 0x1p511, 90000) -PL_TEST_INTERVAL (V_NAME_D1 (acosh), 0x1p511, inf, 10000) -PL_TEST_INTERVAL (V_NAME_D1 (acosh), 0, 1, 1000) -PL_TEST_INTERVAL (V_NAME_D1 (acosh), -0, -inf, 10000) +TEST_SIG (V, D, 1, acosh, 1.0, 10.0) +TEST_ULP (V_NAME_D1 (acosh), 2.53) +TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (acosh), WANT_SIMD_EXCEPT) +TEST_INTERVAL (V_NAME_D1 (acosh), 1, 0x1p511, 90000) +TEST_INTERVAL (V_NAME_D1 (acosh), 0x1p511, inf, 10000) +TEST_INTERVAL (V_NAME_D1 (acosh), 0, 1, 1000) +TEST_INTERVAL (V_NAME_D1 (acosh), -0, -inf, 10000) diff --git a/pl/math/v_acoshf_3u1.c b/math/aarch64/advsimd/acoshf.c similarity index 50% rename from pl/math/v_acoshf_3u1.c rename to math/aarch64/advsimd/acoshf.c index a2ff0f02635b37..029d457cfa8aed 100644 --- a/pl/math/v_acoshf_3u1.c +++ 
b/math/aarch64/advsimd/acoshf.c @@ -1,49 +1,46 @@ /* * Single-precision vector acosh(x) function. - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_math.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" #include "v_log1pf_inline.h" +#define SquareLim 0x1p64 + const static struct data { struct v_log1pf_data log1pf_consts; uint32x4_t one; - uint16x4_t thresh; -} data = { - .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE, - .one = V4 (0x3f800000), - .thresh = V4 (0x2000) /* asuint(0x1p64) - asuint(1). */ -}; +} data = { .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE, .one = V4 (0x3f800000) }; -#define SignMask 0x80000000 +#define Thresh vdup_n_u16 (0x2000) /* top(asuint(SquareLim) - asuint(1)). */ static float32x4_t NOINLINE VPCS_ATTR special_case (float32x4_t x, float32x4_t y, uint16x4_t special, - const struct v_log1pf_data d) + const struct v_log1pf_data *d) { return v_call_f32 (acoshf, x, log1pf_inline (y, d), vmovl_u16 (special)); } /* Vector approximation for single-precision acosh, based on log1p. Maximum error depends on WANT_SIMD_EXCEPT. With SIMD fp exceptions enabled, it - is 2.78 ULP: - __v_acoshf(0x1.07887p+0) got 0x1.ef9e9cp-3 - want 0x1.ef9ea2p-3. + is 3.00 ULP: + _ZGVnN4v_acoshf(0x1.01df3ap+0) got 0x1.ef0a82p-4 + want 0x1.ef0a7cp-4. With exceptions disabled, we can compute u with a shorter dependency chain, - which gives maximum error of 3.07 ULP: - __v_acoshf(0x1.01f83ep+0) got 0x1.fbc7fap-4 - want 0x1.fbc7f4p-4. */ + which gives maximum error of 3.22 ULP: + _ZGVnN4v_acoshf(0x1.007ef2p+0) got 0x1.fdcdccp-5 + want 0x1.fdcdd2p-5. 
*/ -VPCS_ATTR float32x4_t V_NAME_F1 (acosh) (float32x4_t x) +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (acosh) (float32x4_t x) { const struct data *d = ptr_barrier (&data); uint32x4_t ix = vreinterpretq_u32_f32 (x); - uint16x4_t special = vcge_u16 (vsubhn_u32 (ix, d->one), d->thresh); + uint16x4_t special = vcge_u16 (vsubhn_u32 (ix, d->one), Thresh); #if WANT_SIMD_EXCEPT /* Mask special lanes with 1 to side-step spurious invalid or overflow. Use @@ -54,25 +51,28 @@ VPCS_ATTR float32x4_t V_NAME_F1 (acosh) (float32x4_t x) float32x4_t xm1 = v_zerofy_f32 (vsubq_f32 (x, v_f32 (1)), p); float32x4_t u = vfmaq_f32 (vaddq_f32 (xm1, xm1), xm1, xm1); #else - float32x4_t xm1 = vsubq_f32 (x, v_f32 (1)); - float32x4_t u = vmulq_f32 (xm1, vaddq_f32 (x, v_f32 (1.0f))); + float32x4_t xm1 = vsubq_f32 (x, vreinterpretq_f32_u32 (d->one)); + float32x4_t u + = vmulq_f32 (xm1, vaddq_f32 (x, vreinterpretq_f32_u32 (d->one))); #endif float32x4_t y = vaddq_f32 (xm1, vsqrtq_f32 (u)); if (unlikely (v_any_u16h (special))) - return special_case (x, y, special, d->log1pf_consts); - return log1pf_inline (y, d->log1pf_consts); + return special_case (x, y, special, &d->log1pf_consts); + return log1pf_inline (y, &d->log1pf_consts); } -PL_SIG (V, F, 1, acosh, 1.0, 10.0) +HALF_WIDTH_ALIAS_F1 (acosh) + +TEST_SIG (V, F, 1, acosh, 1.0, 10.0) #if WANT_SIMD_EXCEPT -PL_TEST_ULP (V_NAME_F1 (acosh), 2.29) +TEST_ULP (V_NAME_F1 (acosh), 2.50) #else -PL_TEST_ULP (V_NAME_F1 (acosh), 2.58) +TEST_ULP (V_NAME_F1 (acosh), 2.78) #endif -PL_TEST_EXPECT_FENV (V_NAME_F1 (acosh), WANT_SIMD_EXCEPT) -PL_TEST_INTERVAL (V_NAME_F1 (acosh), 0, 1, 500) -PL_TEST_INTERVAL (V_NAME_F1 (acosh), 1, SquareLim, 100000) -PL_TEST_INTERVAL (V_NAME_F1 (acosh), SquareLim, inf, 1000) -PL_TEST_INTERVAL (V_NAME_F1 (acosh), -0, -inf, 1000) +TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (acosh), WANT_SIMD_EXCEPT) +TEST_INTERVAL (V_NAME_F1 (acosh), 0, 1, 500) +TEST_INTERVAL (V_NAME_F1 (acosh), 1, SquareLim, 100000) +TEST_INTERVAL (V_NAME_F1 (acosh), 
SquareLim, inf, 1000) +TEST_INTERVAL (V_NAME_F1 (acosh), -0, -inf, 1000) diff --git a/pl/math/v_asin_3u.c b/math/aarch64/advsimd/asin.c similarity index 56% rename from pl/math/v_asin_3u.c rename to math/aarch64/advsimd/asin.c index 756443c6b320ba..c751d9264a1285 100644 --- a/pl/math/v_asin_3u.c +++ b/math/aarch64/advsimd/asin.c @@ -1,36 +1,35 @@ /* * Double-precision vector asin(x) function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_math.h" -#include "poly_advsimd_f64.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" static const struct data { - float64x2_t poly[12]; + float64x2_t c0, c2, c4, c6, c8, c10; float64x2_t pi_over_2; uint64x2_t abs_mask; + double c1, c3, c5, c7, c9, c11; } data = { /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57. */ - .poly = { V2 (0x1.555555555554ep-3), V2 (0x1.3333333337233p-4), - V2 (0x1.6db6db67f6d9fp-5), V2 (0x1.f1c71fbd29fbbp-6), - V2 (0x1.6e8b264d467d6p-6), V2 (0x1.1c5997c357e9dp-6), - V2 (0x1.c86a22cd9389dp-7), V2 (0x1.856073c22ebbep-7), - V2 (0x1.fd1151acb6bedp-8), V2 (0x1.087182f799c1dp-6), - V2 (-0x1.6602748120927p-7), V2 (0x1.cfa0dd1f9478p-6), }, - .pi_over_2 = V2 (0x1.921fb54442d18p+0), - .abs_mask = V2 (0x7fffffffffffffff), + .c0 = V2 (0x1.555555555554ep-3), .c1 = 0x1.3333333337233p-4, + .c2 = V2 (0x1.6db6db67f6d9fp-5), .c3 = 0x1.f1c71fbd29fbbp-6, + .c4 = V2 (0x1.6e8b264d467d6p-6), .c5 = 0x1.1c5997c357e9dp-6, + .c6 = V2 (0x1.c86a22cd9389dp-7), .c7 = 0x1.856073c22ebbep-7, + .c8 = V2 (0x1.fd1151acb6bedp-8), .c9 = 0x1.087182f799c1dp-6, + .c10 = V2 (-0x1.6602748120927p-7), .c11 = 0x1.cfa0dd1f9478p-6, + .pi_over_2 = V2 (0x1.921fb54442d18p+0), .abs_mask = V2 (0x7fffffffffffffff), }; #define AllMask v_u64 (0xffffffffffffffff) -#define One (0x3ff0000000000000) -#define Small 
(0x3e50000000000000) /* 2^-12. */ +#define One 0x3ff0000000000000 +#define Small 0x3e50000000000000 /* 2^-26. */ #if WANT_SIMD_EXCEPT static float64x2_t VPCS_ATTR NOINLINE @@ -58,12 +57,11 @@ special_case (float64x2_t x, float64x2_t y, uint64x2_t special) asin(x) = pi/2 - (y + y * z * P(z)), with z = (1-x)/2 and y = sqrt(z). The largest observed error in this region is 2.69 ulps, - _ZGVnN2v_asin (0x1.044ac9819f573p-1) got 0x1.110d7e85fdd5p-1 - want 0x1.110d7e85fdd53p-1. */ + _ZGVnN2v_asin (0x1.044e8cefee301p-1) got 0x1.1111dd54ddf96p-1 + want 0x1.1111dd54ddf99p-1. */ float64x2_t VPCS_ATTR V_NAME_D1 (asin) (float64x2_t x) { const struct data *d = ptr_barrier (&data); - float64x2_t ax = vabsq_f64 (x); #if WANT_SIMD_EXCEPT @@ -76,7 +74,7 @@ float64x2_t VPCS_ATTR V_NAME_D1 (asin) (float64x2_t x) return special_case (x, x, AllMask); #endif - uint64x2_t a_lt_half = vcltq_f64 (ax, v_f64 (0.5)); + uint64x2_t a_lt_half = vcaltq_f64 (x, v_f64 (0.5)); /* Evaluate polynomial Q(x) = y + y * z * P(z) with z = x ^ 2 and y = |x| , if |x| < 0.5 @@ -89,7 +87,26 @@ float64x2_t VPCS_ATTR V_NAME_D1 (asin) (float64x2_t x) float64x2_t z4 = vmulq_f64 (z2, z2); float64x2_t z8 = vmulq_f64 (z4, z4); float64x2_t z16 = vmulq_f64 (z8, z8); - float64x2_t p = v_estrin_11_f64 (z2, z4, z8, z16, d->poly); + + /* order-11 estrin. 
*/ + float64x2_t c13 = vld1q_f64 (&d->c1); + float64x2_t c57 = vld1q_f64 (&d->c5); + float64x2_t c911 = vld1q_f64 (&d->c9); + + float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0); + float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1); + float64x2_t p03 = vfmaq_f64 (p01, z4, p23); + + float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0); + float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1); + float64x2_t p47 = vfmaq_f64 (p45, z4, p67); + + float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0); + float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1); + float64x2_t p811 = vfmaq_f64 (p89, z4, p1011); + + float64x2_t p07 = vfmaq_f64 (p03, z8, p47); + float64x2_t p = vfmaq_f64 (p07, z16, p811); /* Finalize polynomial: z + z * z2 * P(z2). */ p = vfmaq_f64 (z, vmulq_f64 (z, z2), p); @@ -102,12 +119,12 @@ float64x2_t VPCS_ATTR V_NAME_D1 (asin) (float64x2_t x) return vbslq_f64 (d->abs_mask, y, x); } -PL_SIG (V, D, 1, asin, -1.0, 1.0) -PL_TEST_ULP (V_NAME_D1 (asin), 2.19) -PL_TEST_EXPECT_FENV (V_NAME_D1 (asin), WANT_SIMD_EXCEPT) -PL_TEST_INTERVAL (V_NAME_D1 (asin), 0, Small, 5000) -PL_TEST_INTERVAL (V_NAME_D1 (asin), Small, 0.5, 50000) -PL_TEST_INTERVAL (V_NAME_D1 (asin), 0.5, 1.0, 50000) -PL_TEST_INTERVAL (V_NAME_D1 (asin), 1.0, 0x1p11, 50000) -PL_TEST_INTERVAL (V_NAME_D1 (asin), 0x1p11, inf, 20000) -PL_TEST_INTERVAL (V_NAME_D1 (asin), -0, -inf, 20000) +TEST_SIG (V, D, 1, asin, -1.0, 1.0) +TEST_ULP (V_NAME_D1 (asin), 2.20) +TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (asin), WANT_SIMD_EXCEPT) +TEST_INTERVAL (V_NAME_D1 (asin), 0, Small, 5000) +TEST_INTERVAL (V_NAME_D1 (asin), Small, 0.5, 50000) +TEST_INTERVAL (V_NAME_D1 (asin), 0.5, 1.0, 50000) +TEST_INTERVAL (V_NAME_D1 (asin), 1.0, 0x1p11, 50000) +TEST_INTERVAL (V_NAME_D1 (asin), 0x1p11, inf, 20000) +TEST_INTERVAL (V_NAME_D1 (asin), -0, -inf, 20000) diff --git a/pl/math/v_asinf_2u5.c b/math/aarch64/advsimd/asinf.c similarity index 82% rename from pl/math/v_asinf_2u5.c rename to math/aarch64/advsimd/asinf.c index 
eb978cd956ab82..970feb37e1d592 100644 --- a/pl/math/v_asinf_2u5.c +++ b/math/aarch64/advsimd/asinf.c @@ -1,14 +1,14 @@ /* * Single-precision vector asin(x) function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_math.h" -#include "poly_advsimd_f32.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "v_poly_f32.h" +#include "test_sig.h" +#include "test_defs.h" static const struct data { @@ -53,7 +53,7 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t special) The largest observed error in this region is 2.41 ulps, _ZGVnN4v_asinf (0x1.00203ep-1) got 0x1.0c3a64p-1 want 0x1.0c3a6p-1. */ -float32x4_t VPCS_ATTR V_NAME_F1 (asin) (float32x4_t x) +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (asin) (float32x4_t x) { const struct data *d = ptr_barrier (&data); @@ -93,12 +93,14 @@ float32x4_t VPCS_ATTR V_NAME_F1 (asin) (float32x4_t x) return vbslq_f32 (v_u32 (AbsMask), y, x); } -PL_SIG (V, F, 1, asin, -1.0, 1.0) -PL_TEST_ULP (V_NAME_F1 (asin), 1.91) -PL_TEST_EXPECT_FENV (V_NAME_F1 (asin), WANT_SIMD_EXCEPT) -PL_TEST_INTERVAL (V_NAME_F1 (asin), 0, 0x1p-12, 5000) -PL_TEST_INTERVAL (V_NAME_F1 (asin), 0x1p-12, 0.5, 50000) -PL_TEST_INTERVAL (V_NAME_F1 (asin), 0.5, 1.0, 50000) -PL_TEST_INTERVAL (V_NAME_F1 (asin), 1.0, 0x1p11, 50000) -PL_TEST_INTERVAL (V_NAME_F1 (asin), 0x1p11, inf, 20000) -PL_TEST_INTERVAL (V_NAME_F1 (asin), -0, -inf, 20000) +HALF_WIDTH_ALIAS_F1 (asin) + +TEST_SIG (V, F, 1, asin, -1.0, 1.0) +TEST_ULP (V_NAME_F1 (asin), 1.91) +TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (asin), WANT_SIMD_EXCEPT) +TEST_INTERVAL (V_NAME_F1 (asin), 0, 0x1p-12, 5000) +TEST_INTERVAL (V_NAME_F1 (asin), 0x1p-12, 0.5, 50000) +TEST_INTERVAL (V_NAME_F1 (asin), 0.5, 1.0, 50000) +TEST_INTERVAL (V_NAME_F1 (asin), 1.0, 0x1p11, 50000) +TEST_INTERVAL (V_NAME_F1 (asin), 0x1p11, inf, 20000) +TEST_INTERVAL (V_NAME_F1 (asin), -0, -inf, 20000) diff --git a/math/aarch64/advsimd/asinh.c 
b/math/aarch64/advsimd/asinh.c new file mode 100644 index 00000000000000..550302826bd92e --- /dev/null +++ b/math/aarch64/advsimd/asinh.c @@ -0,0 +1,242 @@ +/* + * Double-precision vector asinh(x) function. + * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "test_defs.h" +#include "test_sig.h" +#include "v_math.h" + +const static struct data +{ + uint64x2_t huge_bound, abs_mask, off, mask; +#if WANT_SIMD_EXCEPT + float64x2_t tiny_bound; +#endif + float64x2_t lc0, lc2; + double lc1, lc3, ln2, lc4; + + float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c17; + double c1, c3, c5, c7, c9, c11, c13, c15; + +} data = { + +#if WANT_SIMD_EXCEPT + .tiny_bound = V2 (0x1p-26), +#endif + /* Even terms of polynomial s.t. asinh(x) is approximated by + asinh(x) ~= x + x^3 * (C0 + C1 * x + C2 * x^2 + C3 * x^3 + ...). + Generated using Remez, f = (asinh(sqrt(x)) - sqrt(x))/x^(3/2). */ + + .c0 = V2 (-0x1.55555555554a7p-3), + .c1 = 0x1.3333333326c7p-4, + .c2 = V2 (-0x1.6db6db68332e6p-5), + .c3 = 0x1.f1c71b26fb40dp-6, + .c4 = V2 (-0x1.6e8b8b654a621p-6), + .c5 = 0x1.1c4daa9e67871p-6, + .c6 = V2 (-0x1.c9871d10885afp-7), + .c7 = 0x1.7a16e8d9d2ecfp-7, + .c8 = V2 (-0x1.3ddca533e9f54p-7), + .c9 = 0x1.0becef748dafcp-7, + .c10 = V2 (-0x1.b90c7099dd397p-8), + .c11 = 0x1.541f2bb1ffe51p-8, + .c12 = V2 (-0x1.d217026a669ecp-9), + .c13 = 0x1.0b5c7977aaf7p-9, + .c14 = V2 (-0x1.e0f37daef9127p-11), + .c15 = 0x1.388b5fe542a6p-12, + .c16 = V2 (-0x1.021a48685e287p-14), + .c17 = V2 (0x1.93d4ba83d34dap-18), + + .lc0 = V2 (-0x1.ffffffffffff7p-2), + .lc1 = 0x1.55555555170d4p-2, + .lc2 = V2 (-0x1.0000000399c27p-2), + .lc3 = 0x1.999b2e90e94cap-3, + .lc4 = -0x1.554e550bd501ep-3, + .ln2 = 0x1.62e42fefa39efp-1, + + .off = V2 (0x3fe6900900000000), + .huge_bound = V2 (0x5fe0000000000000), + .abs_mask = V2 (0x7fffffffffffffff), + .mask = V2 (0xfffULL << 52), +}; + +static float64x2_t NOINLINE VPCS_ATTR +special_case (float64x2_t x, 
float64x2_t y, uint64x2_t abs_mask, + uint64x2_t special) +{ + /* Copy sign. */ + y = vbslq_f64 (abs_mask, y, x); + return v_call_f64 (asinh, x, y, special); +} + +#define N (1 << V_LOG_TABLE_BITS) +#define IndexMask (N - 1) + +struct entry +{ + float64x2_t invc; + float64x2_t logc; +}; + +static inline struct entry +lookup (uint64x2_t i) +{ + /* Since N is a power of 2, n % N = n & (N - 1). */ + struct entry e; + uint64_t i0 = (vgetq_lane_u64 (i, 0) >> (52 - V_LOG_TABLE_BITS)) & IndexMask; + uint64_t i1 = (vgetq_lane_u64 (i, 1) >> (52 - V_LOG_TABLE_BITS)) & IndexMask; + float64x2_t e0 = vld1q_f64 (&__v_log_data.table[i0].invc); + float64x2_t e1 = vld1q_f64 (&__v_log_data.table[i1].invc); + e.invc = vuzp1q_f64 (e0, e1); + e.logc = vuzp2q_f64 (e0, e1); + return e; +} + +static inline float64x2_t +log_inline (float64x2_t xm, const struct data *d) +{ + + uint64x2_t u = vreinterpretq_u64_f64 (xm); + uint64x2_t u_off = vsubq_u64 (u, d->off); + + int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52); + uint64x2_t iz = vsubq_u64 (u, vandq_u64 (u_off, d->mask)); + float64x2_t z = vreinterpretq_f64_u64 (iz); + + struct entry e = lookup (u_off); + + /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */ + float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc); + float64x2_t kd = vcvtq_f64_s64 (k); + + /* hi = r + log(c) + k*Ln2. */ + float64x2_t ln2_and_lc4 = vld1q_f64 (&d->ln2); + float64x2_t hi = vfmaq_laneq_f64 (vaddq_f64 (e.logc, r), kd, ln2_and_lc4, 0); + + /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */ + float64x2_t odd_coeffs = vld1q_f64 (&d->lc1); + float64x2_t r2 = vmulq_f64 (r, r); + float64x2_t y = vfmaq_laneq_f64 (d->lc2, r, odd_coeffs, 1); + float64x2_t p = vfmaq_laneq_f64 (d->lc0, r, odd_coeffs, 0); + y = vfmaq_laneq_f64 (y, r2, ln2_and_lc4, 1); + y = vfmaq_f64 (p, r2, y); + return vfmaq_f64 (hi, y, r2); +} + +/* Double-precision implementation of vector asinh(x). 
+ asinh is very sensitive around 1, so it is impractical to devise a single + low-cost algorithm which is sufficiently accurate on a wide range of input. + Instead we use two different algorithms: + asinh(x) = sign(x) * log(|x| + sqrt(x^2 + 1)) if |x| >= 1 + = sign(x) * (|x| + |x|^3 * P(x^2)) otherwise + where log(x) is an optimized log approximation, and P(x) is a polynomial + shared with the scalar routine. The greatest observed error is 2.79 ULP, in + |x| >= 1: + _ZGVnN2v_asinh(0x1.2cd9d73ea76a6p+0) got 0x1.ffffd003219dap-1 + want 0x1.ffffd003219ddp-1. */ +VPCS_ATTR float64x2_t V_NAME_D1 (asinh) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + float64x2_t ax = vabsq_f64 (x); + + uint64x2_t gt1 = vcgeq_f64 (ax, v_f64 (1)); + +#if WANT_SIMD_EXCEPT + uint64x2_t iax = vreinterpretq_u64_f64 (ax); + uint64x2_t special = vcgeq_u64 (iax, (d->huge_bound)); + uint64x2_t tiny = vcltq_f64 (ax, d->tiny_bound); + special = vorrq_u64 (special, tiny); +#else + uint64x2_t special = vcgeq_f64 (ax, vreinterpretq_f64_u64 (d->huge_bound)); +#endif + + /* Option 1: |x| >= 1. + Compute asinh(x) according to asinh(x) = log(x + sqrt(x^2 + 1)). + If WANT_SIMD_EXCEPT is enabled, sidestep special values, which will + overflow, by setting special lanes to 1. These will be fixed later. */ + float64x2_t option_1 = v_f64 (0); + if (likely (v_any_u64 (gt1))) + { +#if WANT_SIMD_EXCEPT + float64x2_t xm = v_zerofy_f64 (ax, special); +#else + float64x2_t xm = ax; +#endif + option_1 = log_inline ( + vaddq_f64 (xm, vsqrtq_f64 (vfmaq_f64 (v_f64 (1), xm, xm))), d); + } + + /* Option 2: |x| < 1. + Compute asinh(x) using a polynomial. + If WANT_SIMD_EXCEPT is enabled, sidestep special lanes, which will + overflow, and tiny lanes, which will underflow, by setting them to 0. They + will be fixed later, either by selecting x or falling back to the scalar + special-case.
The largest observed error in this region is 1.47 ULPs: + _ZGVnN2v_asinh(0x1.fdfcd00cc1e6ap-1) got 0x1.c1d6bf874019bp-1 + want 0x1.c1d6bf874019cp-1. */ + float64x2_t option_2 = v_f64 (0); + + if (likely (v_any_u64 (vceqzq_u64 (gt1)))) + { + +#if WANT_SIMD_EXCEPT + ax = v_zerofy_f64 (ax, vorrq_u64 (tiny, gt1)); +#endif + float64x2_t x2 = vmulq_f64 (ax, ax), z2 = vmulq_f64 (x2, x2); + /* Order-17 Pairwise Horner scheme. */ + float64x2_t c13 = vld1q_f64 (&d->c1); + float64x2_t c57 = vld1q_f64 (&d->c5); + float64x2_t c911 = vld1q_f64 (&d->c9); + float64x2_t c1315 = vld1q_f64 (&d->c13); + + float64x2_t p01 = vfmaq_laneq_f64 (d->c0, x2, c13, 0); + float64x2_t p23 = vfmaq_laneq_f64 (d->c2, x2, c13, 1); + float64x2_t p45 = vfmaq_laneq_f64 (d->c4, x2, c57, 0); + float64x2_t p67 = vfmaq_laneq_f64 (d->c6, x2, c57, 1); + float64x2_t p89 = vfmaq_laneq_f64 (d->c8, x2, c911, 0); + float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, x2, c911, 1); + float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, x2, c1315, 0); + float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, x2, c1315, 1); + float64x2_t p1617 = vfmaq_f64 (d->c16, x2, d->c17); + + float64x2_t p = vfmaq_f64 (p1415, z2, p1617); + p = vfmaq_f64 (p1213, z2, p); + p = vfmaq_f64 (p1011, z2, p); + p = vfmaq_f64 (p89, z2, p); + + p = vfmaq_f64 (p67, z2, p); + p = vfmaq_f64 (p45, z2, p); + + p = vfmaq_f64 (p23, z2, p); + + p = vfmaq_f64 (p01, z2, p); + option_2 = vfmaq_f64 (ax, p, vmulq_f64 (ax, x2)); +#if WANT_SIMD_EXCEPT + option_2 = vbslq_f64 (tiny, x, option_2); +#endif + } + + /* Choose the right option for each lane. */ + float64x2_t y = vbslq_f64 (gt1, option_1, option_2); + if (unlikely (v_any_u64 (special))) + { + return special_case (x, y, d->abs_mask, special); + } + /* Copy sign. 
*/ + return vbslq_f64 (d->abs_mask, y, x); +} + +TEST_SIG (V, D, 1, asinh, -10.0, 10.0) +TEST_ULP (V_NAME_D1 (asinh), 2.29) +TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (asinh), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_D1 (asinh), 0, 0x1p-26, 50000) +TEST_SYM_INTERVAL (V_NAME_D1 (asinh), 0x1p-26, 1, 50000) +TEST_SYM_INTERVAL (V_NAME_D1 (asinh), 1, 0x1p511, 50000) +TEST_SYM_INTERVAL (V_NAME_D1 (asinh), 0x1p511, inf, 40000) +/* Test vector asinh 3 times, with control lane < 1, > 1 and special. + Ensures the v_sel is choosing the right option in all cases. */ +TEST_CONTROL_VALUE (V_NAME_D1 (asinh), 0.5) +TEST_CONTROL_VALUE (V_NAME_D1 (asinh), 2) +TEST_CONTROL_VALUE (V_NAME_D1 (asinh), 0x1p600) diff --git a/math/aarch64/advsimd/asinhf.c b/math/aarch64/advsimd/asinhf.c new file mode 100644 index 00000000000000..6a96f6ee9f4b9b --- /dev/null +++ b/math/aarch64/advsimd/asinhf.c @@ -0,0 +1,89 @@ +/* + * Single-precision vector asinh(x) function. + * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" +#include "v_log1pf_inline.h" + +const static struct data +{ + struct v_log1pf_data log1pf_consts; + float32x4_t one; + uint32x4_t big_bound; +#if WANT_SIMD_EXCEPT + uint32x4_t tiny_bound; +#endif +} data = { + .one = V4 (1), + .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE, + .big_bound = V4 (0x5f800000), /* asuint(0x1p64). */ +#if WANT_SIMD_EXCEPT + .tiny_bound = V4 (0x30800000) /* asuint(0x1p-30). */ +#endif +}; + +static float32x4_t NOINLINE VPCS_ATTR +special_case (float32x4_t x, uint32x4_t sign, float32x4_t y, + uint32x4_t special, const struct data *d) +{ + return v_call_f32 ( + asinhf, x, + vreinterpretq_f32_u32 (veorq_u32 ( + sign, vreinterpretq_u32_f32 (log1pf_inline (y, &d->log1pf_consts)))), + special); +} + +/* Single-precision implementation of vector asinh(x), using vector log1p. 
+ Worst-case error is 2.59 ULP: + _ZGVnN4v_asinhf(0x1.d86124p-3) got 0x1.d449bep-3 + want 0x1.d449c4p-3. */ +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (asinh) (float32x4_t x) +{ + const struct data *dat = ptr_barrier (&data); + float32x4_t ax = vabsq_f32 (x); + uint32x4_t iax = vreinterpretq_u32_f32 (ax); + uint32x4_t special = vcgeq_u32 (iax, dat->big_bound); + uint32x4_t sign = veorq_u32 (vreinterpretq_u32_f32 (x), iax); + float32x4_t special_arg = x; + +#if WANT_SIMD_EXCEPT + /* Sidestep tiny and large values to avoid inadvertently triggering + under/overflow. */ + special = vorrq_u32 (special, vcltq_u32 (iax, dat->tiny_bound)); + if (unlikely (v_any_u32 (special))) + { + ax = v_zerofy_f32 (ax, special); + x = v_zerofy_f32 (x, special); + } +#endif + + /* asinh(x) = log(x + sqrt(x * x + 1)). + For positive x, asinh(x) = log1p(x + x * x / (1 + sqrt(x * x + 1))). */ + float32x4_t d + = vaddq_f32 (v_f32 (1), vsqrtq_f32 (vfmaq_f32 (dat->one, ax, ax))); + float32x4_t y = vaddq_f32 (ax, vdivq_f32 (vmulq_f32 (ax, ax), d)); + + if (unlikely (v_any_u32 (special))) + return special_case (special_arg, sign, y, special, dat); + return vreinterpretq_f32_u32 (veorq_u32 ( + sign, vreinterpretq_u32_f32 (log1pf_inline (y, &dat->log1pf_consts)))); +} + +HALF_WIDTH_ALIAS_F1 (asinh) + +TEST_SIG (V, F, 1, asinh, -10.0, 10.0) +TEST_ULP (V_NAME_F1 (asinh), 2.10) +TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (asinh), WANT_SIMD_EXCEPT) +TEST_INTERVAL (V_NAME_F1 (asinh), 0, 0x1p-12, 40000) +TEST_INTERVAL (V_NAME_F1 (asinh), 0x1p-12, 1.0, 40000) +TEST_INTERVAL (V_NAME_F1 (asinh), 1.0, 0x1p11, 40000) +TEST_INTERVAL (V_NAME_F1 (asinh), 0x1p11, inf, 40000) +TEST_INTERVAL (V_NAME_F1 (asinh), -0, -0x1p-12, 20000) +TEST_INTERVAL (V_NAME_F1 (asinh), -0x1p-12, -1.0, 20000) +TEST_INTERVAL (V_NAME_F1 (asinh), -1.0, -0x1p11, 20000) +TEST_INTERVAL (V_NAME_F1 (asinh), -0x1p11, -inf, 20000) diff --git a/pl/math/v_atan_2u5.c b/math/aarch64/advsimd/atan.c similarity index 51% rename from pl/math/v_atan_2u5.c 
rename to math/aarch64/advsimd/atan.c index ba68cc3cc720bf..26d2643210685a 100644 --- a/pl/math/v_atan_2u5.c +++ b/math/aarch64/advsimd/atan.c @@ -1,32 +1,32 @@ /* * Double-precision vector atan(x) function. * - * Copyright (c) 2021-2023, Arm Limited. + * Copyright (c) 2021-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_math.h" -#include "pl_sig.h" -#include "pl_test.h" -#include "poly_advsimd_f64.h" +#include "test_sig.h" +#include "test_defs.h" static const struct data { + float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c18; float64x2_t pi_over_2; - float64x2_t poly[20]; + double c1, c3, c5, c7, c9, c11, c13, c15, c17, c19; } data = { /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on [2**-1022, 1.0]. */ - .poly = { V2 (-0x1.5555555555555p-2), V2 (0x1.99999999996c1p-3), - V2 (-0x1.2492492478f88p-3), V2 (0x1.c71c71bc3951cp-4), - V2 (-0x1.745d160a7e368p-4), V2 (0x1.3b139b6a88ba1p-4), - V2 (-0x1.11100ee084227p-4), V2 (0x1.e1d0f9696f63bp-5), - V2 (-0x1.aebfe7b418581p-5), V2 (0x1.842dbe9b0d916p-5), - V2 (-0x1.5d30140ae5e99p-5), V2 (0x1.338e31eb2fbbcp-5), - V2 (-0x1.00e6eece7de8p-5), V2 (0x1.860897b29e5efp-6), - V2 (-0x1.0051381722a59p-6), V2 (0x1.14e9dc19a4a4ep-7), - V2 (-0x1.d0062b42fe3bfp-9), V2 (0x1.17739e210171ap-10), - V2 (-0x1.ab24da7be7402p-13), V2 (0x1.358851160a528p-16), }, + .c0 = V2 (-0x1.5555555555555p-2), .c1 = 0x1.99999999996c1p-3, + .c2 = V2 (-0x1.2492492478f88p-3), .c3 = 0x1.c71c71bc3951cp-4, + .c4 = V2 (-0x1.745d160a7e368p-4), .c5 = 0x1.3b139b6a88ba1p-4, + .c6 = V2 (-0x1.11100ee084227p-4), .c7 = 0x1.e1d0f9696f63bp-5, + .c8 = V2 (-0x1.aebfe7b418581p-5), .c9 = 0x1.842dbe9b0d916p-5, + .c10 = V2 (-0x1.5d30140ae5e99p-5), .c11 = 0x1.338e31eb2fbbcp-5, + .c12 = V2 (-0x1.00e6eece7de8p-5), .c13 = 0x1.860897b29e5efp-6, + .c14 = V2 (-0x1.0051381722a59p-6), .c15 = 0x1.14e9dc19a4a4ep-7, + .c16 = V2 (-0x1.d0062b42fe3bfp-9), .c17 = 0x1.17739e210171ap-10, + .c18 = V2 (-0x1.ab24da7be7402p-13), .c19 = 
0x1.358851160a528p-16, .pi_over_2 = V2 (0x1.921fb54442d18p+0), }; @@ -42,6 +42,11 @@ static const struct data float64x2_t VPCS_ATTR V_NAME_D1 (atan) (float64x2_t x) { const struct data *d = ptr_barrier (&data); + float64x2_t c13 = vld1q_f64 (&d->c1); + float64x2_t c57 = vld1q_f64 (&d->c5); + float64x2_t c911 = vld1q_f64 (&d->c9); + float64x2_t c1315 = vld1q_f64 (&d->c13); + float64x2_t c1719 = vld1q_f64 (&d->c17); /* Small cases, infs and nans are supported by our approximation technique, but do not set fenv flags correctly. Only trigger special case if we need @@ -80,9 +85,35 @@ float64x2_t VPCS_ATTR V_NAME_D1 (atan) (float64x2_t x) float64x2_t x2 = vmulq_f64 (z2, z2); float64x2_t x4 = vmulq_f64 (x2, x2); float64x2_t x8 = vmulq_f64 (x4, x4); - float64x2_t y - = vfmaq_f64 (v_estrin_7_f64 (z2, x2, x4, d->poly), - v_estrin_11_f64 (z2, x2, x4, x8, d->poly + 8), x8); + + /* estrin_7. */ + float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0); + float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1); + float64x2_t p03 = vfmaq_f64 (p01, x2, p23); + + float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0); + float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1); + float64x2_t p47 = vfmaq_f64 (p45, x2, p67); + + float64x2_t p07 = vfmaq_f64 (p03, x4, p47); + + /* estrin_11. */ + float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0); + float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1); + float64x2_t p811 = vfmaq_f64 (p89, x2, p1011); + + float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, z2, c1315, 0); + float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, z2, c1315, 1); + float64x2_t p1215 = vfmaq_f64 (p1213, x2, p1415); + + float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, z2, c1719, 0); + float64x2_t p1819 = vfmaq_laneq_f64 (d->c18, z2, c1719, 1); + float64x2_t p1619 = vfmaq_f64 (p1617, x2, p1819); + + float64x2_t p815 = vfmaq_f64 (p811, x4, p1215); + float64x2_t p819 = vfmaq_f64 (p815, x8, p1619); + + float64x2_t y = vfmaq_f64 (p07, p819, x8); /* Finalize. 
y = shift + z + z^3 * P(z^2). */ y = vfmaq_f64 (az, y, vmulq_f64 (z2, az)); @@ -93,12 +124,12 @@ float64x2_t VPCS_ATTR V_NAME_D1 (atan) (float64x2_t x) return y; } -PL_SIG (V, D, 1, atan, -10.0, 10.0) -PL_TEST_ULP (V_NAME_D1 (atan), 1.78) -PL_TEST_EXPECT_FENV (V_NAME_D1 (atan), WANT_SIMD_EXCEPT) -PL_TEST_INTERVAL (V_NAME_D1 (atan), 0, 0x1p-30, 10000) -PL_TEST_INTERVAL (V_NAME_D1 (atan), -0, -0x1p-30, 1000) -PL_TEST_INTERVAL (V_NAME_D1 (atan), 0x1p-30, 0x1p53, 900000) -PL_TEST_INTERVAL (V_NAME_D1 (atan), -0x1p-30, -0x1p53, 90000) -PL_TEST_INTERVAL (V_NAME_D1 (atan), 0x1p53, inf, 10000) -PL_TEST_INTERVAL (V_NAME_D1 (atan), -0x1p53, -inf, 1000) +TEST_SIG (V, D, 1, atan, -10.0, 10.0) +TEST_ULP (V_NAME_D1 (atan), 1.78) +TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (atan), WANT_SIMD_EXCEPT) +TEST_INTERVAL (V_NAME_D1 (atan), 0, 0x1p-30, 10000) +TEST_INTERVAL (V_NAME_D1 (atan), -0, -0x1p-30, 1000) +TEST_INTERVAL (V_NAME_D1 (atan), 0x1p-30, 0x1p53, 900000) +TEST_INTERVAL (V_NAME_D1 (atan), -0x1p-30, -0x1p53, 90000) +TEST_INTERVAL (V_NAME_D1 (atan), 0x1p53, inf, 10000) +TEST_INTERVAL (V_NAME_D1 (atan), -0x1p53, -inf, 1000) diff --git a/math/aarch64/advsimd/atan2.c b/math/aarch64/advsimd/atan2.c new file mode 100644 index 00000000000000..18c4b70b92f6a6 --- /dev/null +++ b/math/aarch64/advsimd/atan2.c @@ -0,0 +1,171 @@ +/* + * Double-precision vector atan2(x) function. + * + * Copyright (c) 2021-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" + +static const struct data +{ + float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c18; + float64x2_t pi_over_2; + double c1, c3, c5, c7, c9, c11, c13, c15, c17, c19; + uint64x2_t zeroinfnan, minustwo; +} data = { + /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on + [2**-1022, 1.0]. 
*/ + .c0 = V2 (-0x1.5555555555555p-2), + .c1 = 0x1.99999999996c1p-3, + .c2 = V2 (-0x1.2492492478f88p-3), + .c3 = 0x1.c71c71bc3951cp-4, + .c4 = V2 (-0x1.745d160a7e368p-4), + .c5 = 0x1.3b139b6a88ba1p-4, + .c6 = V2 (-0x1.11100ee084227p-4), + .c7 = 0x1.e1d0f9696f63bp-5, + .c8 = V2 (-0x1.aebfe7b418581p-5), + .c9 = 0x1.842dbe9b0d916p-5, + .c10 = V2 (-0x1.5d30140ae5e99p-5), + .c11 = 0x1.338e31eb2fbbcp-5, + .c12 = V2 (-0x1.00e6eece7de8p-5), + .c13 = 0x1.860897b29e5efp-6, + .c14 = V2 (-0x1.0051381722a59p-6), + .c15 = 0x1.14e9dc19a4a4ep-7, + .c16 = V2 (-0x1.d0062b42fe3bfp-9), + .c17 = 0x1.17739e210171ap-10, + .c18 = V2 (-0x1.ab24da7be7402p-13), + .c19 = 0x1.358851160a528p-16, + .pi_over_2 = V2 (0x1.921fb54442d18p+0), + .zeroinfnan = V2 (2 * 0x7ff0000000000000ul - 1), + .minustwo = V2 (0xc000000000000000), +}; + +#define SignMask v_u64 (0x8000000000000000) + +/* Special cases i.e. 0, infinity, NaN (fall back to scalar calls). */ +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t y, float64x2_t x, float64x2_t ret, + uint64x2_t sign_xy, uint64x2_t cmp) +{ + /* Account for the sign of x and y. */ + ret = vreinterpretq_f64_u64 ( + veorq_u64 (vreinterpretq_u64_f64 (ret), sign_xy)); + return v_call2_f64 (atan2, y, x, ret, cmp); +} + +/* Returns 1 if input is the bit representation of 0, infinity or nan. */ +static inline uint64x2_t +zeroinfnan (uint64x2_t i, const struct data *d) +{ + /* (2 * i - 1) >= (2 * asuint64 (INFINITY) - 1). */ + return vcgeq_u64 (vsubq_u64 (vaddq_u64 (i, i), v_u64 (1)), d->zeroinfnan); +} + +/* Fast implementation of vector atan2. + Maximum observed error is 2.8 ulps: + _ZGVnN2vv_atan2 (0x1.9651a429a859ap+5, 0x1.953075f4ee26p+5) + got 0x1.92d628ab678ccp-1 + want 0x1.92d628ab678cfp-1. 
*/ +float64x2_t VPCS_ATTR V_NAME_D2 (atan2) (float64x2_t y, float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + + uint64x2_t ix = vreinterpretq_u64_f64 (x); + uint64x2_t iy = vreinterpretq_u64_f64 (y); + + uint64x2_t special_cases + = vorrq_u64 (zeroinfnan (ix, d), zeroinfnan (iy, d)); + + uint64x2_t sign_x = vandq_u64 (ix, SignMask); + uint64x2_t sign_y = vandq_u64 (iy, SignMask); + uint64x2_t sign_xy = veorq_u64 (sign_x, sign_y); + + float64x2_t ax = vabsq_f64 (x); + float64x2_t ay = vabsq_f64 (y); + + uint64x2_t pred_xlt0 = vcltzq_f64 (x); + uint64x2_t pred_aygtax = vcagtq_f64 (y, x); + + /* Set up z for call to atan. */ + float64x2_t n = vbslq_f64 (pred_aygtax, vnegq_f64 (ax), ay); + float64x2_t q = vbslq_f64 (pred_aygtax, ay, ax); + float64x2_t z = vdivq_f64 (n, q); + + /* Work out the correct shift. */ + float64x2_t shift + = vreinterpretq_f64_u64 (vandq_u64 (pred_xlt0, d->minustwo)); + shift = vbslq_f64 (pred_aygtax, vaddq_f64 (shift, v_f64 (1.0)), shift); + shift = vmulq_f64 (shift, d->pi_over_2); + + /* Calculate the polynomial approximation. + Use split Estrin scheme for P(z^2) with deg(P)=19. Use split instead of + full scheme to avoid underflow in x^16. + The order 19 polynomial P approximates + (atan(sqrt(x))-sqrt(x))/x^(3/2). */ + float64x2_t z2 = vmulq_f64 (z, z); + float64x2_t x2 = vmulq_f64 (z2, z2); + float64x2_t x4 = vmulq_f64 (x2, x2); + float64x2_t x8 = vmulq_f64 (x4, x4); + + float64x2_t c13 = vld1q_f64 (&d->c1); + float64x2_t c57 = vld1q_f64 (&d->c5); + float64x2_t c911 = vld1q_f64 (&d->c9); + float64x2_t c1315 = vld1q_f64 (&d->c13); + float64x2_t c1719 = vld1q_f64 (&d->c17); + + /* estrin_7. 
*/ + float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0); + float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1); + float64x2_t p03 = vfmaq_f64 (p01, x2, p23); + + float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0); + float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1); + float64x2_t p47 = vfmaq_f64 (p45, x2, p67); + + float64x2_t p07 = vfmaq_f64 (p03, x4, p47); + + /* estrin_11. */ + float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0); + float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1); + float64x2_t p811 = vfmaq_f64 (p89, x2, p1011); + + float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, z2, c1315, 0); + float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, z2, c1315, 1); + float64x2_t p1215 = vfmaq_f64 (p1213, x2, p1415); + + float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, z2, c1719, 0); + float64x2_t p1819 = vfmaq_laneq_f64 (d->c18, z2, c1719, 1); + float64x2_t p1619 = vfmaq_f64 (p1617, x2, p1819); + + float64x2_t p815 = vfmaq_f64 (p811, x4, p1215); + float64x2_t p819 = vfmaq_f64 (p815, x8, p1619); + + float64x2_t ret = vfmaq_f64 (p07, p819, x8); + + /* Finalize. y = shift + z + z^3 * P(z^2). */ + ret = vfmaq_f64 (z, ret, vmulq_f64 (z2, z)); + ret = vaddq_f64 (ret, shift); + + if (unlikely (v_any_u64 (special_cases))) + return special_case (y, x, ret, sign_xy, special_cases); + + /* Account for the sign of x and y. */ + ret = vreinterpretq_f64_u64 ( + veorq_u64 (vreinterpretq_u64_f64 (ret), sign_xy)); + + return ret; +} + +/* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. 
*/ +TEST_SIG (V, D, 2, atan2) +// TODO tighten this once __v_atan2 is fixed +TEST_ULP (V_NAME_D2 (atan2), 2.9) +TEST_DISABLE_FENV (V_NAME_D2 (atan2)) +TEST_INTERVAL (V_NAME_D2 (atan2), -10.0, 10.0, 50000) +TEST_INTERVAL (V_NAME_D2 (atan2), -1.0, 1.0, 40000) +TEST_INTERVAL (V_NAME_D2 (atan2), 0.0, 1.0, 40000) +TEST_INTERVAL (V_NAME_D2 (atan2), 1.0, 100.0, 40000) +TEST_INTERVAL (V_NAME_D2 (atan2), 1e6, 1e32, 40000) diff --git a/pl/math/v_atan2f_3u.c b/math/aarch64/advsimd/atan2f.c similarity index 54% rename from pl/math/v_atan2f_3u.c rename to math/aarch64/advsimd/atan2f.c index bbfc3cb552f69d..632014249ab031 100644 --- a/pl/math/v_atan2f_3u.c +++ b/math/aarch64/advsimd/atan2f.c @@ -1,59 +1,64 @@ /* * Single-precision vector atan2(x) function. * - * Copyright (c) 2021-2023, Arm Limited. + * Copyright (c) 2021-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_math.h" -#include "pl_sig.h" -#include "pl_test.h" -#include "poly_advsimd_f32.h" +#include "test_sig.h" +#include "test_defs.h" static const struct data { - float32x4_t poly[8]; - float32x4_t pi_over_2; + float32x4_t c0, pi_over_2, c4, c6, c2; + float c1, c3, c5, c7; + uint32x4_t comp_const; } data = { /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on [2**-128, 1.0]. Generated using fpminimax between FLT_MIN and 1. */ - .poly = { V4 (-0x1.55555p-2f), V4 (0x1.99935ep-3f), V4 (-0x1.24051ep-3f), - V4 (0x1.bd7368p-4f), V4 (-0x1.491f0ep-4f), V4 (0x1.93a2c0p-5f), - V4 (-0x1.4c3c60p-6f), V4 (0x1.01fd88p-8f) }, - .pi_over_2 = V4 (0x1.921fb6p+0f), + .c0 = V4 (-0x1.55555p-2f), .c1 = 0x1.99935ep-3f, + .c2 = V4 (-0x1.24051ep-3f), .c3 = 0x1.bd7368p-4f, + .c4 = V4 (-0x1.491f0ep-4f), .c5 = 0x1.93a2c0p-5f, + .c6 = V4 (-0x1.4c3c60p-6f), .c7 = 0x1.01fd88p-8f, + .pi_over_2 = V4 (0x1.921fb6p+0f), .comp_const = V4 (2 * 0x7f800000lu - 1), }; #define SignMask v_u32 (0x80000000) /* Special cases i.e. 0, infinity and nan (fall back to scalar calls). 
*/ static float32x4_t VPCS_ATTR NOINLINE -special_case (float32x4_t y, float32x4_t x, float32x4_t ret, uint32x4_t cmp) +special_case (float32x4_t y, float32x4_t x, float32x4_t ret, + uint32x4_t sign_xy, uint32x4_t cmp) { + /* Account for the sign of y. */ + ret = vreinterpretq_f32_u32 ( + veorq_u32 (vreinterpretq_u32_f32 (ret), sign_xy)); return v_call2_f32 (atan2f, y, x, ret, cmp); } /* Returns 1 if input is the bit representation of 0, infinity or nan. */ static inline uint32x4_t -zeroinfnan (uint32x4_t i) +zeroinfnan (uint32x4_t i, const struct data *d) { /* 2 * i - 1 >= 2 * 0x7f800000lu - 1. */ - return vcgeq_u32 (vsubq_u32 (vmulq_n_u32 (i, 2), v_u32 (1)), - v_u32 (2 * 0x7f800000lu - 1)); + return vcgeq_u32 (vsubq_u32 (vmulq_n_u32 (i, 2), v_u32 (1)), d->comp_const); } /* Fast implementation of vector atan2f. Maximum observed error is 2.95 ULP in [0x1.9300d6p+6 0x1.93c0c6p+6] x [0x1.8c2dbp+6 0x1.8cea6p+6]: _ZGVnN4vv_atan2f (0x1.93836cp+6, 0x1.8cae1p+6) got 0x1.967f06p-1 want 0x1.967f00p-1. */ -float32x4_t VPCS_ATTR V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x) +float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x) { - const struct data *data_ptr = ptr_barrier (&data); + const struct data *d = ptr_barrier (&data); uint32x4_t ix = vreinterpretq_u32_f32 (x); uint32x4_t iy = vreinterpretq_u32_f32 (y); - uint32x4_t special_cases = vorrq_u32 (zeroinfnan (ix), zeroinfnan (iy)); + uint32x4_t special_cases + = vorrq_u32 (zeroinfnan (ix, d), zeroinfnan (iy, d)); uint32x4_t sign_x = vandq_u32 (ix, SignMask); uint32x4_t sign_y = vandq_u32 (iy, SignMask); @@ -67,14 +72,14 @@ float32x4_t VPCS_ATTR V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x) /* Set up z for call to atanf. */ float32x4_t n = vbslq_f32 (pred_aygtax, vnegq_f32 (ax), ay); - float32x4_t d = vbslq_f32 (pred_aygtax, ay, ax); - float32x4_t z = vdivq_f32 (n, d); + float32x4_t q = vbslq_f32 (pred_aygtax, ay, ax); + float32x4_t z = vdivq_f32 (n, q); /* Work out the correct shift. 
*/ float32x4_t shift = vreinterpretq_f32_u32 ( vandq_u32 (pred_xlt0, vreinterpretq_u32_f32 (v_f32 (-2.0f)))); shift = vbslq_f32 (pred_aygtax, vaddq_f32 (shift, v_f32 (1.0f)), shift); - shift = vmulq_f32 (shift, data_ptr->pi_over_2); + shift = vmulq_f32 (shift, d->pi_over_2); /* Calculate the polynomial approximation. Use 2-level Estrin scheme for P(z^2) with deg(P)=7. However, @@ -86,30 +91,37 @@ float32x4_t VPCS_ATTR V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x) float32x4_t z2 = vmulq_f32 (z, z); float32x4_t z4 = vmulq_f32 (z2, z2); - float32x4_t ret = vfmaq_f32 ( - v_pairwise_poly_3_f32 (z2, z4, data_ptr->poly), z4, - vmulq_f32 (z4, v_pairwise_poly_3_f32 (z2, z4, data_ptr->poly + 4))); + float32x4_t c1357 = vld1q_f32 (&d->c1); + float32x4_t p01 = vfmaq_laneq_f32 (d->c0, z2, c1357, 0); + float32x4_t p23 = vfmaq_laneq_f32 (d->c2, z2, c1357, 1); + float32x4_t p45 = vfmaq_laneq_f32 (d->c4, z2, c1357, 2); + float32x4_t p67 = vfmaq_laneq_f32 (d->c6, z2, c1357, 3); + float32x4_t p03 = vfmaq_f32 (p01, z4, p23); + float32x4_t p47 = vfmaq_f32 (p45, z4, p67); + + float32x4_t ret = vfmaq_f32 (p03, z4, vmulq_f32 (z4, p47)); /* y = shift + z * P(z^2). */ ret = vaddq_f32 (vfmaq_f32 (z, ret, vmulq_f32 (z2, z)), shift); - /* Account for the sign of y. */ - ret = vreinterpretq_f32_u32 ( - veorq_u32 (vreinterpretq_u32_f32 (ret), sign_xy)); - if (unlikely (v_any_u32 (special_cases))) { - return special_case (y, x, ret, special_cases); + return special_case (y, x, ret, sign_xy, special_cases); } - return ret; + /* Account for the sign of y. */ + return vreinterpretq_f32_u32 ( + veorq_u32 (vreinterpretq_u32_f32 (ret), sign_xy)); } +HALF_WIDTH_ALIAS_F2 (atan2) + /* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. 
*/ -PL_SIG (V, F, 2, atan2) -PL_TEST_ULP (V_NAME_F2 (atan2), 2.46) -PL_TEST_INTERVAL (V_NAME_F2 (atan2), -10.0, 10.0, 50000) -PL_TEST_INTERVAL (V_NAME_F2 (atan2), -1.0, 1.0, 40000) -PL_TEST_INTERVAL (V_NAME_F2 (atan2), 0.0, 1.0, 40000) -PL_TEST_INTERVAL (V_NAME_F2 (atan2), 1.0, 100.0, 40000) -PL_TEST_INTERVAL (V_NAME_F2 (atan2), 1e6, 1e32, 40000) +TEST_SIG (V, F, 2, atan2) +TEST_DISABLE_FENV (V_NAME_F2 (atan2)) +TEST_ULP (V_NAME_F2 (atan2), 2.46) +TEST_INTERVAL (V_NAME_F2 (atan2), -10.0, 10.0, 50000) +TEST_INTERVAL (V_NAME_F2 (atan2), -1.0, 1.0, 40000) +TEST_INTERVAL (V_NAME_F2 (atan2), 0.0, 1.0, 40000) +TEST_INTERVAL (V_NAME_F2 (atan2), 1.0, 100.0, 40000) +TEST_INTERVAL (V_NAME_F2 (atan2), 1e6, 1e32, 40000) diff --git a/pl/math/v_atanf_3u.c b/math/aarch64/advsimd/atanf.c similarity index 85% rename from pl/math/v_atanf_3u.c rename to math/aarch64/advsimd/atanf.c index f522d957c1cc30..61927c9b261a45 100644 --- a/pl/math/v_atanf_3u.c +++ b/math/aarch64/advsimd/atanf.c @@ -1,14 +1,14 @@ /* * Single-precision vector atan(x) function. * - * Copyright (c) 2021-2023, Arm Limited. + * Copyright (c) 2021-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_math.h" -#include "pl_sig.h" -#include "pl_test.h" -#include "poly_advsimd_f32.h" +#include "test_sig.h" +#include "test_defs.h" +#include "v_poly_f32.h" static const struct data { @@ -43,7 +43,7 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t special) atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using z=-1/x and shift = pi/2. Maximum observed error is 2.9ulps: _ZGVnN4v_atanf (0x1.0468f6p+0) got 0x1.967f06p-1 want 0x1.967fp-1. 
*/ -float32x4_t VPCS_ATTR V_NAME_F1 (atan) (float32x4_t x) +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (atan) (float32x4_t x) { const struct data *d = ptr_barrier (&data); @@ -98,10 +98,12 @@ float32x4_t VPCS_ATTR V_NAME_F1 (atan) (float32x4_t x) return y; } -PL_SIG (V, F, 1, atan, -10.0, 10.0) -PL_TEST_ULP (V_NAME_F1 (atan), 2.5) -PL_TEST_EXPECT_FENV (V_NAME_F1 (atan), WANT_SIMD_EXCEPT) -PL_TEST_SYM_INTERVAL (V_NAME_F1 (atan), 0, 0x1p-30, 5000) -PL_TEST_SYM_INTERVAL (V_NAME_F1 (atan), 0x1p-30, 1, 40000) -PL_TEST_SYM_INTERVAL (V_NAME_F1 (atan), 1, 0x1p30, 40000) -PL_TEST_SYM_INTERVAL (V_NAME_F1 (atan), 0x1p30, inf, 1000) +HALF_WIDTH_ALIAS_F1 (atan) + +TEST_SIG (V, F, 1, atan, -10.0, 10.0) +TEST_ULP (V_NAME_F1 (atan), 2.5) +TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (atan), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_F1 (atan), 0, 0x1p-30, 5000) +TEST_SYM_INTERVAL (V_NAME_F1 (atan), 0x1p-30, 1, 40000) +TEST_SYM_INTERVAL (V_NAME_F1 (atan), 1, 0x1p30, 40000) +TEST_SYM_INTERVAL (V_NAME_F1 (atan), 0x1p30, inf, 1000) diff --git a/pl/math/v_atanh_3u5.c b/math/aarch64/advsimd/atanh.c similarity index 55% rename from pl/math/v_atanh_3u5.c rename to math/aarch64/advsimd/atanh.c index f282826a3f3214..c2f9585dd29b18 100644 --- a/pl/math/v_atanh_3u5.c +++ b/math/aarch64/advsimd/atanh.c @@ -1,13 +1,13 @@ /* * Double-precision vector atanh(x) function. * - * Copyright (c) 2022-2023, Arm Limited. + * Copyright (c) 2022-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_math.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" #define WANT_V_LOG1P_K0_SHORTCUT 0 #include "v_log1p_inline.h" @@ -15,15 +15,19 @@ const static struct data { struct v_log1p_data log1p_consts; - uint64x2_t one, half; + uint64x2_t one; + uint64x2_t sign_mask; } data = { .log1p_consts = V_LOG1P_CONSTANTS_TABLE, .one = V2 (0x3ff0000000000000), - .half = V2 (0x3fe0000000000000) }; + .sign_mask = V2 (0x8000000000000000) }; static float64x2_t VPCS_ATTR NOINLINE -special_case (float64x2_t x, float64x2_t y, uint64x2_t special) +special_case (float64x2_t x, float64x2_t halfsign, float64x2_t y, + uint64x2_t special, const struct data *d) { - return v_call_f64 (atanh, x, y, special); + y = log1p_inline (y, &d->log1p_consts); + return v_call_f64 (atanh, vbslq_f64 (d->sign_mask, halfsign, x), + vmulq_f64 (halfsign, y), special); } /* Approximation for vector double-precision atanh(x) using modified log1p. 
@@ -35,11 +39,10 @@ float64x2_t V_NAME_D1 (atanh) (float64x2_t x) { const struct data *d = ptr_barrier (&data); + float64x2_t halfsign = vbslq_f64 (d->sign_mask, x, v_f64 (0.5)); float64x2_t ax = vabsq_f64 (x); uint64x2_t ia = vreinterpretq_u64_f64 (ax); - uint64x2_t sign = veorq_u64 (vreinterpretq_u64_f64 (x), ia); uint64x2_t special = vcgeq_u64 (ia, d->one); - float64x2_t halfsign = vreinterpretq_f64_u64 (vorrq_u64 (sign, d->half)); #if WANT_SIMD_EXCEPT ax = v_zerofy_f64 (ax, special); @@ -47,20 +50,26 @@ float64x2_t V_NAME_D1 (atanh) (float64x2_t x) float64x2_t y; y = vaddq_f64 (ax, ax); - y = vdivq_f64 (y, vsubq_f64 (v_f64 (1), ax)); - y = log1p_inline (y, &d->log1p_consts); + y = vdivq_f64 (y, vsubq_f64 (vreinterpretq_f64_u64 (d->one), ax)); if (unlikely (v_any_u64 (special))) - return special_case (x, vmulq_f64 (y, halfsign), special); +#if WANT_SIMD_EXCEPT + return special_case (x, halfsign, y, special, d); +#else + return special_case (ax, halfsign, y, special, d); +#endif + + y = log1p_inline (y, &d->log1p_consts); return vmulq_f64 (y, halfsign); } -PL_SIG (V, D, 1, atanh, -1.0, 1.0) -PL_TEST_EXPECT_FENV (V_NAME_D1 (atanh), WANT_SIMD_EXCEPT) -PL_TEST_ULP (V_NAME_D1 (atanh), 3.32) +TEST_SIG (V, D, 1, atanh, -1.0, 1.0) +TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (atanh), WANT_SIMD_EXCEPT) +TEST_ULP (V_NAME_D1 (atanh), 3.32) +TEST_SYM_INTERVAL (V_NAME_D1 (atanh), 0, 0x1p-23, 10000) +TEST_SYM_INTERVAL (V_NAME_D1 (atanh), 0x1p-23, 1, 90000) +TEST_SYM_INTERVAL (V_NAME_D1 (atanh), 1, inf, 100) /* atanh is asymptotic at 1, which is the default control value - have to set -c 0 specially to ensure fp exceptions are triggered correctly (choice of control lane is irrelevant if fp exceptions are disabled). 
*/ -PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (atanh), 0, 0x1p-23, 10000, 0) -PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (atanh), 0x1p-23, 1, 90000, 0) -PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (atanh), 1, inf, 100, 0) +TEST_CONTROL_VALUE (V_NAME_D1 (atanh), 0) diff --git a/pl/math/v_atanhf_3u1.c b/math/aarch64/advsimd/atanhf.c similarity index 54% rename from pl/math/v_atanhf_3u1.c rename to math/aarch64/advsimd/atanhf.c index f6a5f25eca9a8c..313d15ca63910d 100644 --- a/pl/math/v_atanhf_3u1.c +++ b/math/aarch64/advsimd/atanhf.c @@ -1,13 +1,13 @@ /* * Single-precision vector atanh(x) function. * - * Copyright (c) 2022-2023, Arm Limited. + * Copyright (c) 2022-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_math.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" #include "v_log1pf_inline.h" const static struct data @@ -30,16 +30,18 @@ const static struct data #define Half v_u32 (0x3f000000) static float32x4_t NOINLINE VPCS_ATTR -special_case (float32x4_t x, float32x4_t y, uint32x4_t special) +special_case (float32x4_t x, float32x4_t halfsign, float32x4_t y, + uint32x4_t special) { - return v_call_f32 (atanhf, x, y, special); + return v_call_f32 (atanhf, vbslq_f32 (AbsMask, x, halfsign), + vmulq_f32 (halfsign, y), special); } /* Approximation for vector single-precision atanh(x) using modified log1p. - The maximum error is 3.08 ULP: - __v_atanhf(0x1.ff215p-5) got 0x1.ffcb7cp-5 - want 0x1.ffcb82p-5. */ -VPCS_ATTR float32x4_t V_NAME_F1 (atanh) (float32x4_t x) + The maximum error is 2.93 ULP: + _ZGVnN4v_atanhf(0x1.f43d7p-5) got 0x1.f4dcfep-5 + want 0x1.f4dcf8p-5. 
*/ +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (atanh) (float32x4_t x) { const struct data *d = ptr_barrier (&data); @@ -58,20 +60,31 @@ VPCS_ATTR float32x4_t V_NAME_F1 (atanh) (float32x4_t x) uint32x4_t special = vcgeq_u32 (iax, d->one); #endif - float32x4_t y = vdivq_f32 (vaddq_f32 (ax, ax), vsubq_f32 (v_f32 (1), ax)); - y = log1pf_inline (y, d->log1pf_consts); + float32x4_t y = vdivq_f32 (vaddq_f32 (ax, ax), + vsubq_f32 (vreinterpretq_f32_u32 (d->one), ax)); + y = log1pf_inline (y, &d->log1pf_consts); + /* If exceptions not required, pass ax to special-case for shorter dependency + chain. If exceptions are required ax will have been zerofied, so have to + pass x. */ if (unlikely (v_any_u32 (special))) - return special_case (x, vmulq_f32 (halfsign, y), special); +#if WANT_SIMD_EXCEPT + return special_case (x, halfsign, y, special); +#else + return special_case (ax, halfsign, y, special); +#endif return vmulq_f32 (halfsign, y); } -PL_SIG (V, F, 1, atanh, -1.0, 1.0) -PL_TEST_ULP (V_NAME_F1 (atanh), 2.59) -PL_TEST_EXPECT_FENV (V_NAME_F1 (atanh), WANT_SIMD_EXCEPT) +HALF_WIDTH_ALIAS_F1 (atanh) + +TEST_SIG (V, F, 1, atanh, -1.0, 1.0) +TEST_ULP (V_NAME_F1 (atanh), 2.44) +TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (atanh), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_F1 (atanh), 0, 0x1p-12, 500) +TEST_SYM_INTERVAL (V_NAME_F1 (atanh), 0x1p-12, 1, 200000) +TEST_SYM_INTERVAL (V_NAME_F1 (atanh), 1, inf, 1000) /* atanh is asymptotic at 1, which is the default control value - have to set -c 0 specially to ensure fp exceptions are triggered correctly (choice of control lane is irrelevant if fp exceptions are disabled). 
*/ -PL_TEST_SYM_INTERVAL_C (V_NAME_F1 (atanh), 0, 0x1p-12, 500, 0) -PL_TEST_SYM_INTERVAL_C (V_NAME_F1 (atanh), 0x1p-12, 1, 200000, 0) -PL_TEST_SYM_INTERVAL_C (V_NAME_F1 (atanh), 1, inf, 1000, 0) +TEST_CONTROL_VALUE (V_NAME_F1 (atanh), 0) diff --git a/pl/math/v_cbrt_2u.c b/math/aarch64/advsimd/cbrt.c similarity index 76% rename from pl/math/v_cbrt_2u.c rename to math/aarch64/advsimd/cbrt.c index cc7cff15dc0fa5..8e72e5b566fc88 100644 --- a/pl/math/v_cbrt_2u.c +++ b/math/aarch64/advsimd/cbrt.c @@ -1,14 +1,14 @@ /* * Double-precision vector cbrt(x) function. * - * Copyright (c) 2022-2023, Arm Limited. + * Copyright (c) 2022-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_math.h" -#include "pl_sig.h" -#include "pl_test.h" -#include "poly_advsimd_f64.h" +#include "test_sig.h" +#include "test_defs.h" +#include "v_poly_f64.h" const static struct data { @@ -40,13 +40,20 @@ special_case (float64x2_t x, float64x2_t y, uint32x2_t special) return v_call_f64 (cbrt, x, y, vmovl_u32 (special)); } -/* Approximation for double-precision vector cbrt(x), using low-order polynomial - and two Newton iterations. Greatest observed error is 1.79 ULP. Errors repeat +/* Approximation for double-precision vector cbrt(x), using low-order + polynomial and two Newton iterations. + + The vector version of frexp does not handle subnormals + correctly. As a result these need to be handled by the scalar + fallback, where accuracy may be worse than that of the vector code + path. + + Greatest observed error in the normal range is 1.79 ULP. Errors repeat according to the exponent, for instance an error observed for double value m * 2^e will be observed for any input m * 2^(e + 3*i), where i is an integer. - __v_cbrt(0x1.fffff403f0bc6p+1) got 0x1.965fe72821e9bp+0 - want 0x1.965fe72821e99p+0. */ + _ZGVnN2v_cbrt (0x1.fffff403f0bc6p+1) got 0x1.965fe72821e9bp+0 + want 0x1.965fe72821e99p+0. 
*/ VPCS_ATTR float64x2_t V_NAME_D1 (cbrt) (float64x2_t x) { const struct data *d = ptr_barrier (&data); @@ -64,8 +71,8 @@ VPCS_ATTR float64x2_t V_NAME_D1 (cbrt) (float64x2_t x) uint64x2_t ia12 = vshrq_n_u64 (iax, 52); int64x2_t e = vsubq_s64 (vreinterpretq_s64_u64 (ia12), exp_bias); - /* Calculate rough approximation for cbrt(m) in [0.5, 1.0], starting point for - Newton iterations. */ + /* Calculate rough approximation for cbrt(m) in [0.5, 1.0], starting point + for Newton iterations. */ float64x2_t p = v_pairwise_poly_3_f64 (m, vmulq_f64 (m, m), d->poly); float64x2_t one_third = d->one_third; /* Two iterations of Newton's method for iteratively approximating cbrt. */ @@ -84,8 +91,8 @@ VPCS_ATTR float64x2_t V_NAME_D1 (cbrt) (float64x2_t x) Let q = 2 ^ round(e / 3), then t = 2 ^ (e / 3) / q. - Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3, which is - an integer in [-2, 2], and can be looked up in the table T. Hence the + Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3, which + is an integer in [-2, 2], and can be looked up in the table T. Hence the result is assembled as: cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign. */ @@ -110,7 +117,11 @@ VPCS_ATTR float64x2_t V_NAME_D1 (cbrt) (float64x2_t x) return vbslq_f64 (d->abs_mask, y, x); } -PL_TEST_ULP (V_NAME_D1 (cbrt), 1.30) -PL_SIG (V, D, 1, cbrt, -10.0, 10.0) -PL_TEST_EXPECT_FENV_ALWAYS (V_NAME_D1 (cbrt)) -PL_TEST_SYM_INTERVAL (V_NAME_D1 (cbrt), 0, inf, 1000000) +/* Worst-case ULP error assumes that scalar fallback is GLIBC 2.40 cbrt, which + has ULP error of 3.67 at 0x1.7a337e1ba1ec2p-257 [1]. Largest observed error + in the vector path is 1.79 ULP. + [1] Innocente, V., & Zimmermann, P. (2024). Accuracy of Mathematical + Functions in Single, Double, Double Extended, and Quadruple Precision.
*/ +TEST_ULP (V_NAME_D1 (cbrt), 3.17) +TEST_SIG (V, D, 1, cbrt, -10.0, 10.0) +TEST_SYM_INTERVAL (V_NAME_D1 (cbrt), 0, inf, 1000000) diff --git a/pl/math/v_cbrtf_1u7.c b/math/aarch64/advsimd/cbrtf.c similarity index 91% rename from pl/math/v_cbrtf_1u7.c rename to math/aarch64/advsimd/cbrtf.c index 74918765209f9a..4e76feb2dd8b62 100644 --- a/pl/math/v_cbrtf_1u7.c +++ b/math/aarch64/advsimd/cbrtf.c @@ -1,14 +1,14 @@ /* * Single-precision vector cbrt(x) function. * - * Copyright (c) 2022-2023, Arm Limited. + * Copyright (c) 2022-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_math.h" -#include "pl_sig.h" -#include "pl_test.h" -#include "poly_advsimd_f32.h" +#include "test_sig.h" +#include "test_defs.h" +#include "v_poly_f32.h" const static struct data { @@ -49,7 +49,7 @@ shifted_lookup (const float *table, int32x4_t i) 0x1.85a2aa and the exponent is a multiple of 3, for example: _ZGVnN4v_cbrtf(0x1.85a2aap+3) got 0x1.267936p+1 want 0x1.267932p+1. 
*/ -VPCS_ATTR float32x4_t V_NAME_F1 (cbrt) (float32x4_t x) +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (cbrt) (float32x4_t x) { const struct data *d = ptr_barrier (&data); uint32x4_t iax = vreinterpretq_u32_f32 (vabsq_f32 (x)); @@ -110,7 +110,8 @@ VPCS_ATTR float32x4_t V_NAME_F1 (cbrt) (float32x4_t x) return vbslq_f32 (SignMask, x, y); } -PL_SIG (V, F, 1, cbrt, -10.0, 10.0) -PL_TEST_ULP (V_NAME_F1 (cbrt), 1.15) -PL_TEST_EXPECT_FENV_ALWAYS (V_NAME_F1 (cbrt)) -PL_TEST_SYM_INTERVAL (V_NAME_F1 (cbrt), 0, inf, 1000000) +HALF_WIDTH_ALIAS_F1 (cbrt) + +TEST_SIG (V, F, 1, cbrt, -10.0, 10.0) +TEST_ULP (V_NAME_F1 (cbrt), 1.15) +TEST_SYM_INTERVAL (V_NAME_F1 (cbrt), 0, inf, 1000000) diff --git a/pl/math/v_cexpi_3u5.c b/math/aarch64/advsimd/cexpi.c similarity index 79% rename from pl/math/v_cexpi_3u5.c rename to math/aarch64/advsimd/cexpi.c index 5163b15926b899..40ba5ff31f20ea 100644 --- a/pl/math/v_cexpi_3u5.c +++ b/math/aarch64/advsimd/cexpi.c @@ -1,13 +1,13 @@ /* * Double-precision vector sincos function - return-by-value interface. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_sincos_common.h" #include "v_math.h" -#include "pl_test.h" +#include "test_defs.h" static float64x2x2_t VPCS_ATTR NOINLINE special_case (float64x2_t x, uint64x2_t special, float64x2x2_t y) @@ -34,11 +34,13 @@ _ZGVnN2v_cexpi (float64x2_t x) return sc; } -PL_TEST_ULP (_ZGVnN2v_cexpi_sin, 2.73) -PL_TEST_ULP (_ZGVnN2v_cexpi_cos, 2.73) +TEST_DISABLE_FENV (_ZGVnN2v_cexpi_cos) +TEST_DISABLE_FENV (_ZGVnN2v_cexpi_sin) +TEST_ULP (_ZGVnN2v_cexpi_sin, 2.73) +TEST_ULP (_ZGVnN2v_cexpi_cos, 2.73) #define V_CEXPI_INTERVAL(lo, hi, n) \ - PL_TEST_INTERVAL (_ZGVnN2v_cexpi_sin, lo, hi, n) \ - PL_TEST_INTERVAL (_ZGVnN2v_cexpi_cos, lo, hi, n) + TEST_INTERVAL (_ZGVnN2v_cexpi_sin, lo, hi, n) \ + TEST_INTERVAL (_ZGVnN2v_cexpi_cos, lo, hi, n) V_CEXPI_INTERVAL (0, 0x1p23, 500000) V_CEXPI_INTERVAL (-0, -0x1p23, 500000) V_CEXPI_INTERVAL (0x1p23, inf, 10000) diff --git a/pl/math/v_cexpif_1u8.c b/math/aarch64/advsimd/cexpif.c similarity index 80% rename from pl/math/v_cexpif_1u8.c rename to math/aarch64/advsimd/cexpif.c index 4897018d30908b..e55d99653a668c 100644 --- a/pl/math/v_cexpif_1u8.c +++ b/math/aarch64/advsimd/cexpif.c @@ -1,13 +1,13 @@ /* * Single-precision vector cexpi function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_sincosf_common.h" #include "v_math.h" -#include "pl_test.h" +#include "test_defs.h" static float32x4x2_t VPCS_ATTR NOINLINE special_case (float32x4_t x, uint32x4_t special, float32x4x2_t y) @@ -36,11 +36,13 @@ _ZGVnN4v_cexpif (float32x4_t x) return sc; } -PL_TEST_ULP (_ZGVnN4v_cexpif_sin, 1.17) -PL_TEST_ULP (_ZGVnN4v_cexpif_cos, 1.31) +TEST_DISABLE_FENV (_ZGVnN4v_cexpif_sin) +TEST_DISABLE_FENV (_ZGVnN4v_cexpif_cos) +TEST_ULP (_ZGVnN4v_cexpif_sin, 1.17) +TEST_ULP (_ZGVnN4v_cexpif_cos, 1.31) #define V_CEXPIF_INTERVAL(lo, hi, n) \ - PL_TEST_INTERVAL (_ZGVnN4v_cexpif_sin, lo, hi, n) \ - PL_TEST_INTERVAL (_ZGVnN4v_cexpif_cos, lo, hi, n) + TEST_INTERVAL (_ZGVnN4v_cexpif_sin, lo, hi, n) \ + TEST_INTERVAL (_ZGVnN4v_cexpif_cos, lo, hi, n) V_CEXPIF_INTERVAL (0, 0x1p20, 500000) V_CEXPIF_INTERVAL (-0, -0x1p20, 500000) V_CEXPIF_INTERVAL (0x1p20, inf, 10000) diff --git a/math/aarch64/v_cos.c b/math/aarch64/advsimd/cos.c similarity index 80% rename from math/aarch64/v_cos.c rename to math/aarch64/advsimd/cos.c index 9a73575bce896a..9f3de4dd5c3690 100644 --- a/math/aarch64/v_cos.c +++ b/math/aarch64/advsimd/cos.c @@ -1,17 +1,19 @@ /* * Double-precision vector cos function. * - * Copyright (c) 2019-2023, Arm Limited. + * Copyright (c) 2019-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" #include "v_math.h" +#include "test_defs.h" +#include "test_sig.h" static const struct data { float64x2_t poly[7]; - float64x2_t range_val, shift, inv_pi, half_pi, pi_1, pi_2, pi_3; + float64x2_t range_val, inv_pi, pi_1, pi_2, pi_3; } data = { /* Worst-case error is 3.3 ulp in [-pi/2, pi/2]. 
*/ .poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7), @@ -19,11 +21,9 @@ static const struct data V2 (-0x1.ae633919987c6p-26), V2 (0x1.60e277ae07cecp-33), V2 (-0x1.9e9540300a1p-41) }, .inv_pi = V2 (0x1.45f306dc9c883p-2), - .half_pi = V2 (0x1.921fb54442d18p+0), .pi_1 = V2 (0x1.921fb54442d18p+1), .pi_2 = V2 (0x1.1a62633145c06p-53), .pi_3 = V2 (0x1.c1cd129024e09p-106), - .shift = V2 (0x1.8p52), .range_val = V2 (0x1p23) }; @@ -57,10 +57,9 @@ float64x2_t VPCS_ATTR V_NAME_D1 (cos) (float64x2_t x) #endif /* n = rint((|x|+pi/2)/pi) - 0.5. */ - n = vfmaq_f64 (d->shift, d->inv_pi, vaddq_f64 (r, d->half_pi)); - odd = vshlq_n_u64 (vreinterpretq_u64_f64 (n), 63); - n = vsubq_f64 (n, d->shift); - n = vsubq_f64 (n, v_f64 (0.5)); + n = vrndaq_f64 (vfmaq_f64 (v_f64 (0.5), r, d->inv_pi)); + odd = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtq_s64_f64 (n)), 63); + n = vsubq_f64 (n, v_f64 (0.5f)); /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ r = vfmsq_f64 (r, d->pi_1, n); @@ -85,3 +84,9 @@ float64x2_t VPCS_ATTR V_NAME_D1 (cos) (float64x2_t x) return special_case (x, y, odd, cmp); return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd)); } + +TEST_SIG (V, D, 1, cos, -3.1, 3.1) +TEST_ULP (V_NAME_D1 (cos), 3.0) +TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (cos), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_D1 (cos), 0, 0x1p23, 500000) +TEST_SYM_INTERVAL (V_NAME_D1 (cos), 0x1p23, inf, 10000) diff --git a/math/aarch64/v_cosf.c b/math/aarch64/advsimd/cosf.c similarity index 76% rename from math/aarch64/v_cosf.c rename to math/aarch64/advsimd/cosf.c index b9890b2998ad3c..d2844e44e19662 100644 --- a/math/aarch64/v_cosf.c +++ b/math/aarch64/advsimd/cosf.c @@ -1,17 +1,19 @@ /* * Single-precision vector cos function. * - * Copyright (c) 2019-2023, Arm Limited. + * Copyright (c) 2019-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" #include "v_math.h" +#include "test_defs.h" +#include "test_sig.h" static const struct data { float32x4_t poly[4]; - float32x4_t range_val, inv_pi, half_pi, shift, pi_1, pi_2, pi_3; + float32x4_t range_val, inv_pi, pi_1, pi_2, pi_3; } data = { /* 1.886 ulp error. */ .poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f), @@ -22,8 +24,6 @@ static const struct data .pi_3 = V4 (-0x1.ee59dap-49f), .inv_pi = V4 (0x1.45f306p-2f), - .shift = V4 (0x1.8p+23f), - .half_pi = V4 (0x1.921fb6p0f), .range_val = V4 (0x1p20f) }; @@ -37,7 +37,7 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp) return v_call_f32 (cosf, x, y, cmp); } -float32x4_t VPCS_ATTR V_NAME_F1 (cos) (float32x4_t x) +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (cos) (float32x4_t x) { const struct data *d = ptr_barrier (&data); float32x4_t n, r, r2, r3, y; @@ -58,9 +58,8 @@ float32x4_t VPCS_ATTR V_NAME_F1 (cos) (float32x4_t x) #endif /* n = rint((|x|+pi/2)/pi) - 0.5. */ - n = vfmaq_f32 (d->shift, d->inv_pi, vaddq_f32 (r, d->half_pi)); - odd = vshlq_n_u32 (vreinterpretq_u32_f32 (n), 31); - n = vsubq_f32 (n, d->shift); + n = vrndaq_f32 (vfmaq_f32 (v_f32 (0.5), r, d->inv_pi)); + odd = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 31); n = vsubq_f32 (n, v_f32 (0.5f)); /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). 
*/ @@ -80,3 +79,11 @@ float32x4_t VPCS_ATTR V_NAME_F1 (cos) (float32x4_t x) return special_case (x, y, odd, cmp); return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd)); } + +HALF_WIDTH_ALIAS_F1 (cos) + +TEST_SIG (V, F, 1, cos, -3.1, 3.1) +TEST_ULP (V_NAME_F1 (cos), 1.4) +TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (cos), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_F1 (cos), 0, 0x1p20, 500000) +TEST_SYM_INTERVAL (V_NAME_F1 (cos), 0x1p20, inf, 10000) diff --git a/pl/math/v_cosh_2u.c b/math/aarch64/advsimd/cosh.c similarity index 84% rename from pl/math/v_cosh_2u.c rename to math/aarch64/advsimd/cosh.c index 649c390f4622d6..54407b23aa9dce 100644 --- a/pl/math/v_cosh_2u.c +++ b/math/aarch64/advsimd/cosh.c @@ -1,18 +1,20 @@ /* * Double-precision vector cosh(x) function. * - * Copyright (c) 2022-2023, Arm Limited. + * Copyright (c) 2022-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_math.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" static const struct data { float64x2_t poly[3]; - float64x2_t inv_ln2, ln2, shift, thres; + float64x2_t inv_ln2; + double ln2[2]; + float64x2_t shift, thres; uint64x2_t index_mask, special_bound; } data = { .poly = { V2 (0x1.fffffffffffd4p-2), V2 (0x1.5555571d6b68cp-3), @@ -48,8 +50,9 @@ exp_inline (float64x2_t x) float64x2_t n = vsubq_f64 (z, d->shift); /* r = x - n*ln2/N. 
*/ - float64x2_t r = vfmaq_laneq_f64 (x, n, d->ln2, 0); - r = vfmaq_laneq_f64 (r, n, d->ln2, 1); + float64x2_t ln2 = vld1q_f64 (d->ln2); + float64x2_t r = vfmaq_laneq_f64 (x, n, ln2, 0); + r = vfmaq_laneq_f64 (r, n, ln2, 1); uint64x2_t e = vshlq_n_u64 (u, 52 - V_EXP_TAIL_TABLE_BITS); uint64x2_t i = vandq_u64 (u, d->index_mask); @@ -97,8 +100,8 @@ float64x2_t VPCS_ATTR V_NAME_D1 (cosh) (float64x2_t x) return vaddq_f64 (half_t, half_over_t); } -PL_SIG (V, D, 1, cosh, -10.0, 10.0) -PL_TEST_ULP (V_NAME_D1 (cosh), 1.43) -PL_TEST_EXPECT_FENV_ALWAYS (V_NAME_D1 (cosh)) -PL_TEST_SYM_INTERVAL (V_NAME_D1 (cosh), 0, 0x1.6p9, 100000) -PL_TEST_SYM_INTERVAL (V_NAME_D1 (cosh), 0x1.6p9, inf, 1000) +TEST_SIG (V, D, 1, cosh, -10.0, 10.0) +TEST_ULP (V_NAME_D1 (cosh), 1.43) +TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (cosh), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_D1 (cosh), 0, 0x1.6p9, 100000) +TEST_SYM_INTERVAL (V_NAME_D1 (cosh), 0x1.6p9, inf, 1000) diff --git a/pl/math/v_coshf_2u4.c b/math/aarch64/advsimd/coshf.c similarity index 64% rename from pl/math/v_coshf_2u4.c rename to math/aarch64/advsimd/coshf.c index c622b0b183f1dc..f1ed3e5161fdc8 100644 --- a/pl/math/v_coshf_2u4.c +++ b/math/aarch64/advsimd/coshf.c @@ -1,32 +1,39 @@ /* * Single-precision vector cosh(x) function. * - * Copyright (c) 2022-2023, Arm Limited. + * Copyright (c) 2022-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_expf_inline.h" #include "v_math.h" -#include "mathlib.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" static const struct data { struct v_expf_data expf_consts; - uint32x4_t tiny_bound, special_bound; + uint32x4_t tiny_bound; + float32x4_t bound; +#if WANT_SIMD_EXCEPT + uint32x4_t special_bound; +#endif } data = { .expf_consts = V_EXPF_DATA, .tiny_bound = V4 (0x20000000), /* 0x1p-63: Round to 1 below this. */ /* 0x1.5a92d8p+6: expf overflows above this, so have to use special case. 
*/ + .bound = V4 (0x1.5a92d8p+6), +#if WANT_SIMD_EXCEPT .special_bound = V4 (0x42ad496c), +#endif }; #if !WANT_SIMD_EXCEPT static float32x4_t NOINLINE VPCS_ATTR -special_case (float32x4_t x, float32x4_t y, uint32x4_t special) +special_case (float32x4_t x, float32x4_t half_t, float32x4_t half_over_t, + uint32x4_t special) { - return v_call_f32 (coshf, x, y, special); + return v_call_f32 (coshf, x, vaddq_f32 (half_t, half_over_t), special); } #endif @@ -34,18 +41,17 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t special) Maximum error is 2.38 ULP: _ZGVnN4v_coshf (0x1.e8001ep+1) got 0x1.6a491ep+4 want 0x1.6a4922p+4. */ -float32x4_t VPCS_ATTR V_NAME_F1 (cosh) (float32x4_t x) +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (cosh) (float32x4_t x) { const struct data *d = ptr_barrier (&data); - float32x4_t ax = vabsq_f32 (x); - uint32x4_t iax = vreinterpretq_u32_f32 (ax); - uint32x4_t special = vcgeq_u32 (iax, d->special_bound); - #if WANT_SIMD_EXCEPT /* If fp exceptions are to be triggered correctly, fall back to the scalar variant for all inputs if any input is a special value or above the bound at which expf overflows. */ + float32x4_t ax = vabsq_f32 (x); + uint32x4_t iax = vreinterpretq_u32_f32 (ax); + uint32x4_t special = vcgeq_u32 (iax, d->special_bound); if (unlikely (v_any_u32 (special))) return v_call_f32 (coshf, x, x, v_u32 (-1)); @@ -54,10 +60,13 @@ float32x4_t VPCS_ATTR V_NAME_F1 (cosh) (float32x4_t x) input to 0, which will generate no exceptions. */ if (unlikely (v_any_u32 (tiny))) ax = v_zerofy_f32 (ax, tiny); + float32x4_t t = v_expf_inline (ax, &d->expf_consts); +#else + uint32x4_t special = vcageq_f32 (x, d->bound); + float32x4_t t = v_expf_inline (x, &d->expf_consts); #endif /* Calculate cosh by exp(x) / 2 + exp(-x) / 2. 
*/ - float32x4_t t = v_expf_inline (ax, &d->expf_consts); float32x4_t half_t = vmulq_n_f32 (t, 0.5); float32x4_t half_over_t = vdivq_f32 (v_f32 (0.5), t); @@ -66,15 +75,18 @@ float32x4_t VPCS_ATTR V_NAME_F1 (cosh) (float32x4_t x) return vbslq_f32 (tiny, v_f32 (1), vaddq_f32 (half_t, half_over_t)); #else if (unlikely (v_any_u32 (special))) - return special_case (x, vaddq_f32 (half_t, half_over_t), special); + return special_case (x, half_t, half_over_t, special); #endif return vaddq_f32 (half_t, half_over_t); } -PL_SIG (V, F, 1, cosh, -10.0, 10.0) -PL_TEST_ULP (V_NAME_F1 (cosh), 1.89) -PL_TEST_EXPECT_FENV (V_NAME_F1 (cosh), WANT_SIMD_EXCEPT) -PL_TEST_SYM_INTERVAL (V_NAME_F1 (cosh), 0, 0x1p-63, 100) -PL_TEST_SYM_INTERVAL (V_NAME_F1 (cosh), 0, 0x1.5a92d8p+6, 80000) -PL_TEST_SYM_INTERVAL (V_NAME_F1 (cosh), 0x1.5a92d8p+6, inf, 2000) +HALF_WIDTH_ALIAS_F1 (cosh) + +TEST_SIG (V, F, 1, cosh, -10.0, 10.0) +TEST_ULP (V_NAME_F1 (cosh), 1.89) +TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (cosh), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_F1 (cosh), 0, 0x1p-63, 100) +TEST_SYM_INTERVAL (V_NAME_F1 (cosh), 0x1p-63, 1, 1000) +TEST_SYM_INTERVAL (V_NAME_F1 (cosh), 1, 0x1.5a92d8p+6, 80000) +TEST_SYM_INTERVAL (V_NAME_F1 (cosh), 0x1.5a92d8p+6, inf, 2000) diff --git a/pl/math/v_cospi_3u1.c b/math/aarch64/advsimd/cospi.c similarity index 81% rename from pl/math/v_cospi_3u1.c rename to math/aarch64/advsimd/cospi.c index 3c2ee0b74c8ead..e63201a5578611 100644 --- a/pl/math/v_cospi_3u1.c +++ b/math/aarch64/advsimd/cospi.c @@ -1,15 +1,15 @@ /* * Double-precision vector cospi function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" #include "v_math.h" -#include "poly_advsimd_f64.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "v_poly_f64.h" +#include "test_sig.h" +#include "test_defs.h" static const struct data { @@ -31,7 +31,7 @@ special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp) { /* Fall back to scalar code. */ y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd)); - return v_call_f64 (cospi, x, y, cmp); + return v_call_f64 (arm_math_cospi, x, y, cmp); } /* Approximation for vector double-precision cospi(x). @@ -77,10 +77,11 @@ float64x2_t VPCS_ATTR V_NAME_D1 (cospi) (float64x2_t x) return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd)); } -PL_SIG (V, D, 1, cospi, -0.9, 0.9) -PL_TEST_ULP (V_NAME_D1 (cospi), 2.56) -PL_TEST_EXPECT_FENV (V_NAME_D1 (cospi), WANT_SIMD_EXCEPT) -PL_TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0, 0x1p-63, 5000) -PL_TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0x1p-63, 0.5, 10000) -PL_TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0.5, 0x1p51, 10000) -PL_TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0x1p51, inf, 10000) +#if WANT_TRIGPI_TESTS +TEST_ULP (V_NAME_D1 (cospi), 2.56) +TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (cospi), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0, 0x1p-63, 5000) +TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0x1p-63, 0.5, 10000) +TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0.5, 0x1p51, 10000) +TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0x1p51, inf, 10000) +#endif diff --git a/pl/math/v_cospif_3u2.c b/math/aarch64/advsimd/cospif.c similarity index 76% rename from pl/math/v_cospif_3u2.c rename to math/aarch64/advsimd/cospif.c index d88aa828439d15..62f4b8122b2cff 100644 --- a/pl/math/v_cospif_3u2.c +++ b/math/aarch64/advsimd/cospif.c @@ -1,15 +1,15 @@ /* * Single-precision vector cospi function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" #include "v_math.h" -#include "poly_advsimd_f32.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "v_poly_f32.h" +#include "test_sig.h" +#include "test_defs.h" static const struct data { @@ -26,14 +26,14 @@ static float32x4_t VPCS_ATTR NOINLINE special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp) { y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd)); - return v_call_f32 (cospif, x, y, cmp); + return v_call_f32 (arm_math_cospif, x, y, cmp); } /* Approximation for vector single-precision cospi(x) Maximum Error: 3.17 ULP: _ZGVnN4v_cospif(0x1.d341a8p-5) got 0x1.f7cd56p-1 want 0x1.f7cd5p-1. */ -float32x4_t VPCS_ATTR V_NAME_F1 (cospi) (float32x4_t x) +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (cospi) (float32x4_t x) { const struct data *d = ptr_barrier (&data); @@ -74,10 +74,13 @@ float32x4_t VPCS_ATTR V_NAME_F1 (cospi) (float32x4_t x) return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd)); } -PL_SIG (V, F, 1, cospi, -0.9, 0.9) -PL_TEST_ULP (V_NAME_F1 (cospi), 2.67) -PL_TEST_EXPECT_FENV (V_NAME_F1 (cospi), WANT_SIMD_EXCEPT) -PL_TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0, 0x1p-31, 5000) -PL_TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0x1p-31, 0.5, 10000) -PL_TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0.5, 0x1p32f, 10000) -PL_TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0x1p32f, inf, 10000) +HALF_WIDTH_ALIAS_F1 (cospi) + +#if WANT_TRIGPI_TESTS +TEST_ULP (V_NAME_F1 (cospi), 2.67) +TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (cospi), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0, 0x1p-31, 5000) +TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0x1p-31, 0.5, 10000) +TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0.5, 0x1p32f, 10000) +TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0x1p32f, inf, 10000) +#endif diff --git a/pl/math/v_erf_2u5.c b/math/aarch64/advsimd/erf.c similarity index 77% rename from pl/math/v_erf_2u5.c rename to math/aarch64/advsimd/erf.c index 
e581ec5bb8a73c..40717a660ce2fb 100644 --- a/pl/math/v_erf_2u5.c +++ b/math/aarch64/advsimd/erf.c @@ -1,30 +1,32 @@ /* * Double-precision vector erf(x) function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_math.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" static const struct data { float64x2_t third; - float64x2_t tenth, two_over_five, two_over_fifteen; - float64x2_t two_over_nine, two_over_fortyfive; + float64x2_t tenth, two_over_five, two_over_nine; + double two_over_fifteen, two_over_fortyfive; float64x2_t max, shift; + uint64x2_t max_idx; #if WANT_SIMD_EXCEPT float64x2_t tiny_bound, huge_bound, scale_minus_one; #endif } data = { + .max_idx = V2 (768), .third = V2 (0x1.5555555555556p-2), /* used to compute 2/3 and 1/6 too. */ - .two_over_fifteen = V2 (0x1.1111111111111p-3), + .two_over_fifteen = 0x1.1111111111111p-3, .tenth = V2 (-0x1.999999999999ap-4), .two_over_five = V2 (-0x1.999999999999ap-2), .two_over_nine = V2 (-0x1.c71c71c71c71cp-3), - .two_over_fortyfive = V2 (0x1.6c16c16c16c17p-5), + .two_over_fortyfive = 0x1.6c16c16c16c17p-5, .max = V2 (5.9921875), /* 6 - 1/128. */ .shift = V2 (0x1p45), #if WANT_SIMD_EXCEPT @@ -46,8 +48,8 @@ static inline struct entry lookup (uint64x2_t i) { struct entry e; - float64x2_t e1 = vld1q_f64 ((float64_t *) (__erf_data.tab + i[0])), - e2 = vld1q_f64 ((float64_t *) (__erf_data.tab + i[1])); + float64x2_t e1 = vld1q_f64 (&__v_erf_data.tab[vgetq_lane_u64 (i, 0)].erf), + e2 = vld1q_f64 (&__v_erf_data.tab[vgetq_lane_u64 (i, 1)].erf); e.erf = vuzp1q_f64 (e1, e2); e.scale = vuzp2q_f64 (e1, e2); return e; @@ -77,8 +79,8 @@ float64x2_t VPCS_ATTR V_NAME_D1 (erf) (float64x2_t x) float64x2_t a = vabsq_f64 (x); /* Reciprocal conditions that do not catch NaNs so they can be used in BSLs to return expected results. 
*/ - uint64x2_t a_le_max = vcleq_f64 (a, dat->max); - uint64x2_t a_gt_max = vcgtq_f64 (a, dat->max); + uint64x2_t a_le_max = vcaleq_f64 (x, dat->max); + uint64x2_t a_gt_max = vcagtq_f64 (x, dat->max); #if WANT_SIMD_EXCEPT /* |x| huge or tiny. */ @@ -105,7 +107,7 @@ float64x2_t VPCS_ATTR V_NAME_D1 (erf) (float64x2_t x) segfault. */ uint64x2_t i = vsubq_u64 (vreinterpretq_u64_f64 (z), vreinterpretq_u64_f64 (shift)); - i = vbslq_u64 (a_le_max, i, v_u64 (768)); + i = vbslq_u64 (a_le_max, i, dat->max_idx); struct entry e = lookup (i); float64x2_t r = vsubq_f64 (z, shift); @@ -115,14 +117,19 @@ float64x2_t VPCS_ATTR V_NAME_D1 (erf) (float64x2_t x) float64x2_t d2 = vmulq_f64 (d, d); float64x2_t r2 = vmulq_f64 (r, r); + float64x2_t two_over_fifteen_and_fortyfive + = vld1q_f64 (&dat->two_over_fifteen); + /* poly (d, r) = 1 + p1(r) * d + p2(r) * d^2 + ... + p5(r) * d^5. */ float64x2_t p1 = r; float64x2_t p2 = vfmsq_f64 (dat->third, r2, vaddq_f64 (dat->third, dat->third)); float64x2_t p3 = vmulq_f64 (r, vfmaq_f64 (v_f64 (-0.5), r2, dat->third)); - float64x2_t p4 = vfmaq_f64 (dat->two_over_five, r2, dat->two_over_fifteen); + float64x2_t p4 = vfmaq_laneq_f64 (dat->two_over_five, r2, + two_over_fifteen_and_fortyfive, 0); p4 = vfmsq_f64 (dat->tenth, r2, p4); - float64x2_t p5 = vfmaq_f64 (dat->two_over_nine, r2, dat->two_over_fortyfive); + float64x2_t p5 = vfmaq_laneq_f64 (dat->two_over_nine, r2, + two_over_fifteen_and_fortyfive, 1); p5 = vmulq_f64 (r, vfmaq_f64 (vmulq_f64 (v_f64 (0.5), dat->third), r2, p5)); float64x2_t p34 = vfmaq_f64 (p3, d, p4); @@ -150,9 +157,10 @@ float64x2_t VPCS_ATTR V_NAME_D1 (erf) (float64x2_t x) return y; } -PL_SIG (V, D, 1, erf, -6.0, 6.0) -PL_TEST_ULP (V_NAME_D1 (erf), 1.79) -PL_TEST_EXPECT_FENV (V_NAME_D1 (erf), WANT_SIMD_EXCEPT) -PL_TEST_SYM_INTERVAL (V_NAME_D1 (erf), 0, 5.9921875, 40000) -PL_TEST_SYM_INTERVAL (V_NAME_D1 (erf), 5.9921875, inf, 40000) -PL_TEST_SYM_INTERVAL (V_NAME_D1 (erf), 0, inf, 40000) +TEST_SIG (V, D, 1, erf, -6.0, 6.0) +TEST_ULP 
(V_NAME_D1 (erf), 1.79) +/* WANT_SIMD_EXCEPT blocks miss some cases. */ +TEST_DISABLE_FENV (V_NAME_D1 (erf)) +TEST_SYM_INTERVAL (V_NAME_D1 (erf), 0, 5.9921875, 40000) +TEST_SYM_INTERVAL (V_NAME_D1 (erf), 5.9921875, inf, 40000) +TEST_SYM_INTERVAL (V_NAME_D1 (erf), 0, inf, 40000) diff --git a/pl/math/v_erfc_1u8.c b/math/aarch64/advsimd/erfc.c similarity index 77% rename from pl/math/v_erfc_1u8.c rename to math/aarch64/advsimd/erfc.c index 10ef7e6a3c34e3..97ef09ecc113c7 100644 --- a/pl/math/v_erfc_1u8.c +++ b/math/aarch64/advsimd/erfc.c @@ -1,21 +1,21 @@ /* * Double-precision vector erfc(x) function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_math.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" static const struct data { uint64x2_t offset, table_scale; float64x2_t max, shift; - float64x2_t p20, p40, p41, p42; - float64x2_t p51, p52; - float64x2_t qr5, qr6, qr7, qr8, qr9; + float64x2_t p20, p40, p41, p51; + double p42, p52; + double qr5[2], qr6[2], qr7[2], qr8[2], qr9[2]; #if WANT_SIMD_EXCEPT float64x2_t uflow_bound; #endif @@ -30,9 +30,9 @@ static const struct data .p20 = V2 (0x1.5555555555555p-2), /* 1/3, used to compute 2/3 and 1/6. */ .p40 = V2 (-0x1.999999999999ap-4), /* 1/10. */ .p41 = V2 (-0x1.999999999999ap-2), /* 2/5. */ - .p42 = V2 (0x1.1111111111111p-3), /* 2/15. */ + .p42 = 0x1.1111111111111p-3, /* 2/15. */ .p51 = V2 (-0x1.c71c71c71c71cp-3), /* 2/9. */ - .p52 = V2 (0x1.6c16c16c16c17p-5), /* 2/45. */ + .p52 = 0x1.6c16c16c16c17p-5, /* 2/45. */ /* Qi = (i+1) / i, Ri = -2 * i / ((i+1)*(i+2)), for i = 5, ..., 9. 
*/ .qr5 = { 0x1.3333333333333p0, -0x1.e79e79e79e79ep-3 }, .qr6 = { 0x1.2aaaaaaaaaaabp0, -0x1.b6db6db6db6dbp-3 }, @@ -57,8 +57,10 @@ static inline struct entry lookup (uint64x2_t i) { struct entry e; - float64x2_t e1 = vld1q_f64 ((float64_t *) (__erfc_data.tab - Off + i[0])), - e2 = vld1q_f64 ((float64_t *) (__erfc_data.tab - Off + i[1])); + float64x2_t e1 + = vld1q_f64 (&__v_erfc_data.tab[vgetq_lane_u64 (i, 0) - Off].erfc); + float64x2_t e2 + = vld1q_f64 (&__v_erfc_data.tab[vgetq_lane_u64 (i, 1) - Off].erfc); e.erfc = vuzp1q_f64 (e1, e2); e.scale = vuzp2q_f64 (e1, e2); return e; @@ -144,22 +146,26 @@ float64x2_t V_NAME_D1 (erfc) (float64x2_t x) float64x2_t p1 = r; float64x2_t p2 = vfmsq_f64 (dat->p20, r2, vaddq_f64 (dat->p20, dat->p20)); float64x2_t p3 = vmulq_f64 (r, vfmaq_f64 (v_f64 (-0.5), r2, dat->p20)); - float64x2_t p4 = vfmaq_f64 (dat->p41, r2, dat->p42); + float64x2_t p42_p52 = vld1q_f64 (&dat->p42); + float64x2_t p4 = vfmaq_laneq_f64 (dat->p41, r2, p42_p52, 0); p4 = vfmsq_f64 (dat->p40, r2, p4); - float64x2_t p5 = vfmaq_f64 (dat->p51, r2, dat->p52); + float64x2_t p5 = vfmaq_laneq_f64 (dat->p51, r2, p42_p52, 1); p5 = vmulq_f64 (r, vfmaq_f64 (vmulq_f64 (v_f64 (0.5), dat->p20), r2, p5)); /* Compute p_i using recurrence relation: p_{i+2} = (p_i + r * Q_{i+1} * p_{i+1}) * R_{i+1}. 
*/ - float64x2_t p6 = vfmaq_f64 (p4, p5, vmulq_laneq_f64 (r, dat->qr5, 0)); - p6 = vmulq_laneq_f64 (p6, dat->qr5, 1); - float64x2_t p7 = vfmaq_f64 (p5, p6, vmulq_laneq_f64 (r, dat->qr6, 0)); - p7 = vmulq_laneq_f64 (p7, dat->qr6, 1); - float64x2_t p8 = vfmaq_f64 (p6, p7, vmulq_laneq_f64 (r, dat->qr7, 0)); - p8 = vmulq_laneq_f64 (p8, dat->qr7, 1); - float64x2_t p9 = vfmaq_f64 (p7, p8, vmulq_laneq_f64 (r, dat->qr8, 0)); - p9 = vmulq_laneq_f64 (p9, dat->qr8, 1); - float64x2_t p10 = vfmaq_f64 (p8, p9, vmulq_laneq_f64 (r, dat->qr9, 0)); - p10 = vmulq_laneq_f64 (p10, dat->qr9, 1); + float64x2_t qr5 = vld1q_f64 (dat->qr5), qr6 = vld1q_f64 (dat->qr6), + qr7 = vld1q_f64 (dat->qr7), qr8 = vld1q_f64 (dat->qr8), + qr9 = vld1q_f64 (dat->qr9); + float64x2_t p6 = vfmaq_f64 (p4, p5, vmulq_laneq_f64 (r, qr5, 0)); + p6 = vmulq_laneq_f64 (p6, qr5, 1); + float64x2_t p7 = vfmaq_f64 (p5, p6, vmulq_laneq_f64 (r, qr6, 0)); + p7 = vmulq_laneq_f64 (p7, qr6, 1); + float64x2_t p8 = vfmaq_f64 (p6, p7, vmulq_laneq_f64 (r, qr7, 0)); + p8 = vmulq_laneq_f64 (p8, qr7, 1); + float64x2_t p9 = vfmaq_f64 (p7, p8, vmulq_laneq_f64 (r, qr8, 0)); + p9 = vmulq_laneq_f64 (p9, qr8, 1); + float64x2_t p10 = vfmaq_f64 (p8, p9, vmulq_laneq_f64 (r, qr9, 0)); + p10 = vmulq_laneq_f64 (p10, qr9, 1); /* Compute polynomial in d using pairwise Horner scheme. 
*/ float64x2_t p90 = vfmaq_f64 (p9, d, p10); float64x2_t p78 = vfmaq_f64 (p7, d, p8); @@ -189,10 +195,11 @@ float64x2_t V_NAME_D1 (erfc) (float64x2_t x) return vfmaq_f64 (off, fac, y); } -PL_SIG (V, D, 1, erfc, -6.0, 28.0) -PL_TEST_ULP (V_NAME_D1 (erfc), 1.21) -PL_TEST_SYM_INTERVAL (V_NAME_D1 (erfc), 0, 0x1p-26, 40000) -PL_TEST_INTERVAL (V_NAME_D1 (erfc), 0x1p-26, 28.0, 40000) -PL_TEST_INTERVAL (V_NAME_D1 (erfc), -0x1p-26, -6.0, 40000) -PL_TEST_INTERVAL (V_NAME_D1 (erfc), 28.0, inf, 40000) -PL_TEST_INTERVAL (V_NAME_D1 (erfc), -6.0, -inf, 40000) +TEST_SIG (V, D, 1, erfc, -6.0, 28.0) +TEST_ULP (V_NAME_D1 (erfc), 1.21) +TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (erfc), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_D1 (erfc), 0, 0x1p-26, 40000) +TEST_INTERVAL (V_NAME_D1 (erfc), 0x1p-26, 28.0, 40000) +TEST_INTERVAL (V_NAME_D1 (erfc), -0x1p-26, -6.0, 40000) +TEST_INTERVAL (V_NAME_D1 (erfc), 28.0, inf, 40000) +TEST_INTERVAL (V_NAME_D1 (erfc), -6.0, -inf, 40000) diff --git a/pl/math/v_erfcf_1u7.c b/math/aarch64/advsimd/erfcf.c similarity index 76% rename from pl/math/v_erfcf_1u7.c rename to math/aarch64/advsimd/erfcf.c index c361d070443827..f420439ef8a3b6 100644 --- a/pl/math/v_erfcf_1u7.c +++ b/math/aarch64/advsimd/erfcf.c @@ -1,19 +1,20 @@ /* * Single-precision vector erfc(x) function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_math.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" static const struct data { uint32x4_t offset, table_scale; float32x4_t max, shift; - float32x4_t coeffs, third, two_over_five, tenth; + float coeffs[4]; + float32x4_t third, two_over_five, tenth; #if WANT_SIMD_EXCEPT float32x4_t uflow_bound; #endif @@ -27,7 +28,7 @@ static const struct data .shift = V4 (0x1p17f), /* Store 1/3, 2/3 and 2/15 in a single register for use with indexed muls and fmas. 
*/ - .coeffs = (float32x4_t){ 0x1.555556p-2f, 0x1.555556p-1f, 0x1.111112p-3f, 0 }, + .coeffs = { 0x1.555556p-2f, 0x1.555556p-1f, 0x1.111112p-3f, 0 }, .third = V4 (0x1.555556p-2f), .two_over_five = V4 (-0x1.99999ap-2f), .tenth = V4 (-0x1.99999ap-4f), @@ -50,12 +51,16 @@ static inline struct entry lookup (uint32x4_t i) { struct entry e; - float64_t t0 = *((float64_t *) (__erfcf_data.tab - Off + i[0])); - float64_t t1 = *((float64_t *) (__erfcf_data.tab - Off + i[1])); - float64_t t2 = *((float64_t *) (__erfcf_data.tab - Off + i[2])); - float64_t t3 = *((float64_t *) (__erfcf_data.tab - Off + i[3])); - float32x4_t e1 = vreinterpretq_f32_f64 ((float64x2_t){ t0, t1 }); - float32x4_t e2 = vreinterpretq_f32_f64 ((float64x2_t){ t2, t3 }); + float32x2_t t0 + = vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 0) - Off].erfc); + float32x2_t t1 + = vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 1) - Off].erfc); + float32x2_t t2 + = vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 2) - Off].erfc); + float32x2_t t3 + = vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 3) - Off].erfc); + float32x4_t e1 = vcombine_f32 (t0, t1); + float32x4_t e2 = vcombine_f32 (t2, t3); e.erfc = vuzp1q_f32 (e1, e2); e.scale = vuzp2q_f32 (e1, e2); return e; @@ -86,8 +91,7 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp) Maximum error: 1.63 ULP (~1.0 ULP for x < 0.0). _ZGVnN4v_erfcf(0x1.1dbf7ap+3) got 0x1.f51212p-120 want 0x1.f51216p-120. 
*/ -VPCS_ATTR -float32x4_t V_NAME_F1 (erfc) (float32x4_t x) +NOINLINE VPCS_ATTR float32x4_t V_NAME_F1 (erfc) (float32x4_t x) { const struct data *dat = ptr_barrier (&data); @@ -130,10 +134,11 @@ float32x4_t V_NAME_F1 (erfc) (float32x4_t x) float32x4_t r2 = vmulq_f32 (r, r); float32x4_t p1 = r; - float32x4_t p2 = vfmsq_laneq_f32 (dat->third, r2, dat->coeffs, 1); + float32x4_t coeffs = vld1q_f32 (dat->coeffs); + float32x4_t p2 = vfmsq_laneq_f32 (dat->third, r2, coeffs, 1); float32x4_t p3 - = vmulq_f32 (r, vfmaq_laneq_f32 (v_f32 (-0.5), r2, dat->coeffs, 0)); - float32x4_t p4 = vfmaq_laneq_f32 (dat->two_over_five, r2, dat->coeffs, 2); + = vmulq_f32 (r, vfmaq_laneq_f32 (v_f32 (-0.5), r2, coeffs, 0)); + float32x4_t p4 = vfmaq_laneq_f32 (dat->two_over_five, r2, coeffs, 2); p4 = vfmsq_f32 (dat->tenth, r2, p4); float32x4_t y = vfmaq_f32 (p3, d, p4); @@ -157,10 +162,13 @@ float32x4_t V_NAME_F1 (erfc) (float32x4_t x) return vfmaq_f32 (off, fac, y); } -PL_SIG (V, F, 1, erfc, -4.0, 10.0) -PL_TEST_ULP (V_NAME_F1 (erfc), 1.14) -PL_TEST_SYM_INTERVAL (V_NAME_F1 (erfc), 0, 0x1p-26, 40000) -PL_TEST_INTERVAL (V_NAME_F1 (erfc), 0x1p-26, 10.0625, 40000) -PL_TEST_INTERVAL (V_NAME_F1 (erfc), -0x1p-26, -4.0, 40000) -PL_TEST_INTERVAL (V_NAME_F1 (erfc), 10.0625, inf, 40000) -PL_TEST_INTERVAL (V_NAME_F1 (erfc), -4.0, -inf, 40000) +HALF_WIDTH_ALIAS_F1 (erfc) + +TEST_SIG (V, F, 1, erfc, -4.0, 10.0) +TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (erfc), WANT_SIMD_EXCEPT) +TEST_ULP (V_NAME_F1 (erfc), 1.14) +TEST_SYM_INTERVAL (V_NAME_F1 (erfc), 0, 0x1p-26, 40000) +TEST_INTERVAL (V_NAME_F1 (erfc), 0x1p-26, 10.0625, 40000) +TEST_INTERVAL (V_NAME_F1 (erfc), -0x1p-26, -4.0, 40000) +TEST_INTERVAL (V_NAME_F1 (erfc), 10.0625, inf, 40000) +TEST_INTERVAL (V_NAME_F1 (erfc), -4.0, -inf, 40000) diff --git a/pl/math/v_erff_2u.c b/math/aarch64/advsimd/erff.c similarity index 76% rename from pl/math/v_erff_2u.c rename to math/aarch64/advsimd/erff.c index 502526407df229..508bc4c2f5e226 100644 --- a/pl/math/v_erff_2u.c +++ 
b/math/aarch64/advsimd/erff.c @@ -1,13 +1,13 @@ /* * Single-precision vector erf(x) function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_math.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" static const struct data { @@ -37,12 +37,12 @@ static inline struct entry lookup (uint32x4_t i) { struct entry e; - float64_t t0 = *((float64_t *) (__erff_data.tab + i[0])); - float64_t t1 = *((float64_t *) (__erff_data.tab + i[1])); - float64_t t2 = *((float64_t *) (__erff_data.tab + i[2])); - float64_t t3 = *((float64_t *) (__erff_data.tab + i[3])); - float32x4_t e1 = vreinterpretq_f32_f64 ((float64x2_t){ t0, t1 }); - float32x4_t e2 = vreinterpretq_f32_f64 ((float64x2_t){ t2, t3 }); + float32x2_t t0 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 0)].erf); + float32x2_t t1 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 1)].erf); + float32x2_t t2 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 2)].erf); + float32x2_t t3 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 3)].erf); + float32x4_t e1 = vcombine_f32 (t0, t1); + float32x4_t e2 = vcombine_f32 (t2, t3); e.erf = vuzp1q_f32 (e1, e2); e.scale = vuzp2q_f32 (e1, e2); return e; @@ -61,7 +61,7 @@ lookup (uint32x4_t i) Maximum error: 1.93 ULP _ZGVnN4v_erff(0x1.c373e6p-9) got 0x1.fd686cp-9 want 0x1.fd6868p-9. 
*/ -float32x4_t VPCS_ATTR V_NAME_F1 (erf) (float32x4_t x) +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (erf) (float32x4_t x) { const struct data *dat = ptr_barrier (&data); @@ -110,9 +110,11 @@ float32x4_t VPCS_ATTR V_NAME_F1 (erf) (float32x4_t x) return y; } -PL_SIG (V, F, 1, erf, -4.0, 4.0) -PL_TEST_ULP (V_NAME_F1 (erf), 1.43) -PL_TEST_EXPECT_FENV (V_NAME_F1 (erf), WANT_SIMD_EXCEPT) -PL_TEST_SYM_INTERVAL (V_NAME_F1 (erf), 0, 3.9375, 40000) -PL_TEST_SYM_INTERVAL (V_NAME_F1 (erf), 3.9375, inf, 40000) -PL_TEST_SYM_INTERVAL (V_NAME_F1 (erf), 0, inf, 40000) +HALF_WIDTH_ALIAS_F1 (erf) + +TEST_SIG (V, F, 1, erf, -4.0, 4.0) +TEST_ULP (V_NAME_F1 (erf), 1.43) +TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (erf), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_F1 (erf), 0, 3.9375, 40000) +TEST_SYM_INTERVAL (V_NAME_F1 (erf), 3.9375, inf, 40000) +TEST_SYM_INTERVAL (V_NAME_F1 (erf), 0, inf, 40000) diff --git a/math/aarch64/v_exp.c b/math/aarch64/advsimd/exp.c similarity index 90% rename from math/aarch64/v_exp.c rename to math/aarch64/advsimd/exp.c index bc5609faf4fc35..a928c35c9418b7 100644 --- a/math/aarch64/v_exp.c +++ b/math/aarch64/advsimd/exp.c @@ -1,12 +1,14 @@ /* * Double-precision vector e^x function. * - * Copyright (c) 2019-2023, Arm Limited. + * Copyright (c) 2019-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" #include "v_math.h" +#include "test_defs.h" +#include "test_sig.h" #define N (1 << V_EXP_TABLE_BITS) #define IndexMask (N - 1) @@ -123,3 +125,10 @@ float64x2_t VPCS_ATTR V_NAME_D1 (exp) (float64x2_t x) return vfmaq_f64 (s, y, s); } + +TEST_SIG (V, D, 1, exp, -9.9, 9.9) +TEST_ULP (V_NAME_D1 (exp), 1.9) +TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (exp), WANT_SIMD_EXCEPT) +TEST_INTERVAL (V_NAME_D1 (exp), 0, 0xffff000000000000, 10000) +TEST_SYM_INTERVAL (V_NAME_D1 (exp), 0x1p-6, 0x1p6, 400000) +TEST_SYM_INTERVAL (V_NAME_D1 (exp), 633.3, 733.3, 10000) diff --git a/pl/math/v_exp10_2u.c b/math/aarch64/advsimd/exp10.c similarity index 89% rename from pl/math/v_exp10_2u.c rename to math/aarch64/advsimd/exp10.c index 29072a60fb3aac..24fdd1c7d257a1 100644 --- a/pl/math/v_exp10_2u.c +++ b/math/aarch64/advsimd/exp10.c @@ -1,14 +1,15 @@ /* * Double-precision vector 10^x function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ +#define _GNU_SOURCE #include "mathlib.h" #include "v_math.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" /* Value of |x| above which scale overflows without special treatment. */ #define SpecialBound 306.0 /* floor (log10 (2^1023)) - 1. 
*/ @@ -135,10 +136,12 @@ float64x2_t VPCS_ATTR V_NAME_D1 (exp10) (float64x2_t x) return vfmaq_f64 (s, y, s); } -PL_SIG (S, D, 1, exp10, -9.9, 9.9) -PL_SIG (V, D, 1, exp10, -9.9, 9.9) -PL_TEST_ULP (V_NAME_D1 (exp10), 1.15) -PL_TEST_EXPECT_FENV (V_NAME_D1 (exp10), WANT_SIMD_EXCEPT) -PL_TEST_SYM_INTERVAL (V_NAME_D1 (exp10), 0, SpecialBound, 5000) -PL_TEST_SYM_INTERVAL (V_NAME_D1 (exp10), SpecialBound, ScaleBound, 5000) -PL_TEST_SYM_INTERVAL (V_NAME_D1 (exp10), ScaleBound, inf, 10000) +#if WANT_EXP10_TESTS +TEST_SIG (S, D, 1, exp10, -9.9, 9.9) +TEST_SIG (V, D, 1, exp10, -9.9, 9.9) +TEST_ULP (V_NAME_D1 (exp10), 1.15) +TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (exp10), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_D1 (exp10), 0, SpecialBound, 5000) +TEST_SYM_INTERVAL (V_NAME_D1 (exp10), SpecialBound, ScaleBound, 5000) +TEST_SYM_INTERVAL (V_NAME_D1 (exp10), ScaleBound, inf, 10000) +#endif diff --git a/pl/math/v_exp10f_2u4.c b/math/aarch64/advsimd/exp10f.c similarity index 58% rename from pl/math/v_exp10f_2u4.c rename to math/aarch64/advsimd/exp10f.c index 0e91becfa61291..eb0d5dd0d57cb4 100644 --- a/pl/math/v_exp10f_2u4.c +++ b/math/aarch64/advsimd/exp10f.c @@ -1,23 +1,24 @@ /* * Single-precision vector 10^x function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "mathlib.h" +#define _GNU_SOURCE #include "v_math.h" -#include "pl_sig.h" -#include "pl_test.h" -#include "poly_advsimd_f32.h" +#include "test_sig.h" +#include "test_defs.h" +#include "v_poly_f32.h" #define ScaleBound 192.0f static const struct data { - float32x4_t poly[5]; - float32x4_t log10_2_and_inv, shift; - + float32x4_t c0, c1, c3; + float log10_2_high, log10_2_low, c2, c4; + float32x4_t inv_log10_2, special_bound; + uint32x4_t exponent_bias, special_offset, special_bias; #if !WANT_SIMD_EXCEPT float32x4_t scale_thresh; #endif @@ -27,19 +28,24 @@ static const struct data rel error: 0x1.89dafa3p-24 abs error: 0x1.167d55p-23 in [-log10(2)/2, log10(2)/2] maxerr: 1.85943 +0.5 ulp. */ - .poly = { V4 (0x1.26bb16p+1f), V4 (0x1.5350d2p+1f), V4 (0x1.04744ap+1f), - V4 (0x1.2d8176p+0f), V4 (0x1.12b41ap-1f) }, - .shift = V4 (0x1.8p23f), - - /* Stores constants 1/log10(2), log10(2)_high, log10(2)_low, 0. */ - .log10_2_and_inv = { 0x1.a934fp+1, 0x1.344136p-2, -0x1.ec10cp-27, 0 }, + .c0 = V4 (0x1.26bb16p+1f), + .c1 = V4 (0x1.5350d2p+1f), + .c2 = 0x1.04744ap+1f, + .c3 = V4 (0x1.2d8176p+0f), + .c4 = 0x1.12b41ap-1f, + .inv_log10_2 = V4 (0x1.a934fp+1), + .log10_2_high = 0x1.344136p-2, + .log10_2_low = 0x1.ec10cp-27, + /* rint (log2 (2^127 / (1 + sqrt (2)))). */ + .special_bound = V4 (126.0f), + .exponent_bias = V4 (0x3f800000), + .special_offset = V4 (0x82000000), + .special_bias = V4 (0x7f000000), #if !WANT_SIMD_EXCEPT .scale_thresh = V4 (ScaleBound) #endif }; -#define ExponentBias v_u32 (0x3f800000) - #if WANT_SIMD_EXCEPT # define SpecialBound 38.0f /* rint(log10(2^127)). */ @@ -57,17 +63,15 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp) #else -# define SpecialBound 126.0f /* rint (log2 (2^127 / (1 + sqrt (2)))). 
*/ -# define SpecialOffset v_u32 (0x82000000) -# define SpecialBias v_u32 (0x7f000000) +# define SpecialBound 126.0f static float32x4_t VPCS_ATTR NOINLINE special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, float32x4_t scale, const struct data *d) { /* 2^n may overflow, break it up into s1*s2. */ - uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset); - float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias)); + uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset); + float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias)); float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b)); uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh); float32x4_t r2 = vmulq_f32 (s1, s1); @@ -84,7 +88,7 @@ special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, Algorithm is accurate to 2.36 ULP. _ZGVnN4v_exp10f(0x1.be2b36p+1) got 0x1.7e79c4p+11 want 0x1.7e79cp+11. */ -float32x4_t VPCS_ATTR V_NAME_F1 (exp10) (float32x4_t x) +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp10) (float32x4_t x) { const struct data *d = ptr_barrier (&data); #if WANT_SIMD_EXCEPT @@ -102,22 +106,23 @@ float32x4_t VPCS_ATTR V_NAME_F1 (exp10) (float32x4_t x) /* exp10(x) = 2^n * 10^r = 2^n * (1 + poly (r)), with poly(r) in [1/sqrt(2), sqrt(2)] and x = r + n * log10 (2), with r in [-log10(2)/2, log10(2)/2]. 
*/ - float32x4_t z = vfmaq_laneq_f32 (d->shift, x, d->log10_2_and_inv, 0); - float32x4_t n = vsubq_f32 (z, d->shift); - float32x4_t r = vfmsq_laneq_f32 (x, n, d->log10_2_and_inv, 1); - r = vfmsq_laneq_f32 (r, n, d->log10_2_and_inv, 2); - uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23); + float32x4_t log10_2_c24 = vld1q_f32 (&d->log10_2_high); + float32x4_t n = vrndaq_f32 (vmulq_f32 (x, d->inv_log10_2)); + float32x4_t r = vfmsq_laneq_f32 (x, n, log10_2_c24, 0); + r = vfmaq_laneq_f32 (r, n, log10_2_c24, 1); + uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (n)), 23); - float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, ExponentBias)); + float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); #if !WANT_SIMD_EXCEPT - uint32x4_t cmp = vcagtq_f32 (n, v_f32 (SpecialBound)); + uint32x4_t cmp = vcagtq_f32 (n, d->special_bound); #endif float32x4_t r2 = vmulq_f32 (r, r); - float32x4_t poly - = vfmaq_f32 (vmulq_f32 (r, d->poly[0]), - v_pairwise_poly_3_f32 (r, r2, d->poly + 1), r2); + float32x4_t p12 = vfmaq_laneq_f32 (d->c1, r, log10_2_c24, 2); + float32x4_t p34 = vfmaq_laneq_f32 (d->c3, r, log10_2_c24, 3); + float32x4_t p14 = vfmaq_f32 (p12, r2, p34); + float32x4_t poly = vfmaq_f32 (vmulq_f32 (r, d->c0), p14, r2); if (unlikely (v_any_u32 (cmp))) #if WANT_SIMD_EXCEPT @@ -129,10 +134,14 @@ float32x4_t VPCS_ATTR V_NAME_F1 (exp10) (float32x4_t x) return vfmaq_f32 (scale, poly, scale); } -PL_SIG (S, F, 1, exp10, -9.9, 9.9) -PL_SIG (V, F, 1, exp10, -9.9, 9.9) -PL_TEST_ULP (V_NAME_F1 (exp10), 1.86) -PL_TEST_EXPECT_FENV (V_NAME_F1 (exp10), WANT_SIMD_EXCEPT) -PL_TEST_SYM_INTERVAL (V_NAME_F1 (exp10), 0, SpecialBound, 5000) -PL_TEST_SYM_INTERVAL (V_NAME_F1 (exp10), SpecialBound, ScaleBound, 5000) -PL_TEST_SYM_INTERVAL (V_NAME_F1 (exp10), ScaleBound, inf, 10000) +HALF_WIDTH_ALIAS_F1 (exp10) + +#if WANT_EXP10_TESTS +TEST_SIG (S, F, 1, exp10, -9.9, 9.9) +TEST_SIG (V, F, 1, exp10, -9.9, 9.9) +TEST_ULP (V_NAME_F1 (exp10), 1.86) 
+TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (exp10), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_F1 (exp10), 0, SpecialBound, 5000) +TEST_SYM_INTERVAL (V_NAME_F1 (exp10), SpecialBound, ScaleBound, 5000) +TEST_SYM_INTERVAL (V_NAME_F1 (exp10), ScaleBound, inf, 10000) +#endif diff --git a/pl/math/v_exp2_2u.c b/math/aarch64/advsimd/exp2.c similarity index 82% rename from pl/math/v_exp2_2u.c rename to math/aarch64/advsimd/exp2.c index de59779689f59f..63448d806b8221 100644 --- a/pl/math/v_exp2_2u.c +++ b/math/aarch64/advsimd/exp2.c @@ -1,19 +1,20 @@ /* * Double-precision vector 2^x function. * - * Copyright (c) 2019-2023, Arm Limited. + * Copyright (c) 2019-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_math.h" -#include "poly_advsimd_f64.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "v_poly_f64.h" +#include "test_sig.h" +#include "test_defs.h" #define N (1 << V_EXP_TABLE_BITS) #define IndexMask (N - 1) #define BigBound 1022.0 #define UOFlowBound 1280.0 +#define TinyBound 0x2000000000000000 /* asuint64(0x1p-511). */ static const struct data { @@ -38,7 +39,6 @@ lookup_sbits (uint64x2_t i) #if WANT_SIMD_EXCEPT -# define TinyBound 0x2000000000000000 /* asuint64(0x1p-511). */ # define Thres 0x2080000000000000 /* asuint64(512.0) - TinyBound. */ /* Call scalar exp2 as a fallback. */ @@ -62,8 +62,8 @@ special_case (float64x2_t s, float64x2_t y, float64x2_t n, /* 2^(n/N) may overflow, break it up into s1*s2. 
*/ uint64x2_t b = vandq_u64 (vclezq_f64 (n), v_u64 (SpecialOffset)); float64x2_t s1 = vreinterpretq_f64_u64 (vsubq_u64 (v_u64 (SpecialBias1), b)); - float64x2_t s2 = vreinterpretq_f64_u64 ( - vaddq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (s), v_u64 (SpecialBias2)), b)); + float64x2_t s2 = vreinterpretq_f64_u64 (vaddq_u64 ( + vsubq_u64 (vreinterpretq_u64_f64 (s), v_u64 (SpecialBias2)), b)); uint64x2_t cmp = vcagtq_f64 (n, d->scale_uoflow_bound); float64x2_t r1 = vmulq_f64 (s1, s1); float64x2_t r0 = vmulq_f64 (vfmaq_f64 (s2, s2, y), s1); @@ -119,10 +119,10 @@ float64x2_t V_NAME_D1 (exp2) (float64x2_t x) return vfmaq_f64 (s, s, y); } -PL_SIG (V, D, 1, exp2, -9.9, 9.9) -PL_TEST_ULP (V_NAME_D1 (exp2), 1.15) -PL_TEST_EXPECT_FENV (V_NAME_D1 (exp2), WANT_SIMD_EXCEPT) -PL_TEST_SYM_INTERVAL (V_NAME_D1 (exp2), 0, TinyBound, 5000) -PL_TEST_SYM_INTERVAL (V_NAME_D1 (exp2), TinyBound, BigBound, 10000) -PL_TEST_SYM_INTERVAL (V_NAME_D1 (exp2), BigBound, UOFlowBound, 5000) -PL_TEST_SYM_INTERVAL (V_NAME_D1 (exp2), UOFlowBound, inf, 10000) +TEST_SIG (V, D, 1, exp2, -9.9, 9.9) +TEST_ULP (V_NAME_D1 (exp2), 1.15) +TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (exp2), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_D1 (exp2), 0, TinyBound, 5000) +TEST_SYM_INTERVAL (V_NAME_D1 (exp2), TinyBound, BigBound, 10000) +TEST_SYM_INTERVAL (V_NAME_D1 (exp2), BigBound, UOFlowBound, 5000) +TEST_SYM_INTERVAL (V_NAME_D1 (exp2), UOFlowBound, inf, 10000) diff --git a/math/aarch64/v_exp2f.c b/math/aarch64/advsimd/exp2f.c similarity index 58% rename from math/aarch64/v_exp2f.c rename to math/aarch64/advsimd/exp2f.c index e402205e98e6be..40f6170d3702a7 100644 --- a/math/aarch64/v_exp2f.c +++ b/math/aarch64/advsimd/exp2f.c @@ -1,33 +1,38 @@ /* * Single-precision vector 2^x function. * - * Copyright (c) 2019-2023, Arm Limited. + * Copyright (c) 2019-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "mathlib.h" #include "v_math.h" +#include "test_defs.h" +#include "test_sig.h" static const struct data { - float32x4_t poly[5]; - uint32x4_t exponent_bias; + float32x4_t c1, c3; + uint32x4_t exponent_bias, special_offset, special_bias; #if !WANT_SIMD_EXCEPT - float32x4_t special_bound, scale_thresh; + float32x4_t scale_thresh, special_bound; #endif + float c0, c2, c4, zero; } data = { /* maxerr: 1.962 ulp. */ - .poly = { V4 (0x1.59977ap-10f), V4 (0x1.3ce9e4p-7f), V4 (0x1.c6bd32p-5f), - V4 (0x1.ebf9bcp-3f), V4 (0x1.62e422p-1f) }, + .c0 = 0x1.59977ap-10f, + .c1 = V4 (0x1.3ce9e4p-7f), + .c2 = 0x1.c6bd32p-5f, + .c3 = V4 (0x1.ebf9bcp-3f), + .c4 = 0x1.62e422p-1f, .exponent_bias = V4 (0x3f800000), + .special_offset = V4 (0x82000000), + .special_bias = V4 (0x7f000000), #if !WANT_SIMD_EXCEPT .special_bound = V4 (126.0f), .scale_thresh = V4 (192.0f), #endif }; -#define C(i) d->poly[i] - #if WANT_SIMD_EXCEPT # define TinyBound v_u32 (0x20000000) /* asuint (0x1p-63). */ @@ -44,16 +49,13 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp) #else -# define SpecialOffset v_u32 (0x82000000) -# define SpecialBias v_u32 (0x7f000000) - static float32x4_t VPCS_ATTR NOINLINE special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, float32x4_t scale, const struct data *d) { /* 2^n may overflow, break it up into s1*s2. 
*/ - uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset); - float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias)); + uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset); + float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias)); float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b)); uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh); float32x4_t r2 = vmulq_f32 (s1, s1); @@ -66,16 +68,14 @@ special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, #endif -float32x4_t VPCS_ATTR V_NAME_F1 (exp2) (float32x4_t x) +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp2) (float32x4_t x) { const struct data *d = ptr_barrier (&data); - float32x4_t n, r, r2, scale, p, q, poly; - uint32x4_t cmp, e; #if WANT_SIMD_EXCEPT /* asuint(|x|) - TinyBound >= BigBound - TinyBound. */ uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x)); - cmp = vcgeq_u32 (vsubq_u32 (ia, TinyBound), SpecialBound); + uint32x4_t cmp = vcgeq_u32 (vsubq_u32 (ia, TinyBound), SpecialBound); float32x4_t xm = x; /* If any lanes are special, mask them with 1 and retain a copy of x to allow special_case to fix special lanes later. This is only necessary if fenv @@ -84,23 +84,24 @@ float32x4_t VPCS_ATTR V_NAME_F1 (exp2) (float32x4_t x) x = vbslq_f32 (cmp, v_f32 (1), x); #endif - /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] - x = n + r, with r in [-1/2, 1/2]. */ - n = vrndaq_f32 (x); - r = vsubq_f32 (x, n); - e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), 23); - scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); + /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] + x = n + r, with r in [-1/2, 1/2]. 
*/ + float32x4_t n = vrndaq_f32 (x); + float32x4_t r = vsubq_f32 (x, n); + uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), 23); + float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); #if !WANT_SIMD_EXCEPT - cmp = vcagtq_f32 (n, d->special_bound); + uint32x4_t cmp = vcagtq_f32 (n, d->special_bound); #endif - r2 = vmulq_f32 (r, r); - p = vfmaq_f32 (C (1), C (0), r); - q = vfmaq_f32 (C (3), C (2), r); + float32x4_t c024 = vld1q_f32 (&d->c0); + float32x4_t r2 = vmulq_f32 (r, r); + float32x4_t p = vfmaq_laneq_f32 (d->c1, r, c024, 0); + float32x4_t q = vfmaq_laneq_f32 (d->c3, r, c024, 1); q = vfmaq_f32 (q, p, r2); - p = vmulq_f32 (C (4), r); - poly = vfmaq_f32 (p, q, r2); + p = vmulq_laneq_f32 (r, c024, 2); + float32x4_t poly = vfmaq_f32 (p, q, r2); if (unlikely (v_any_u32 (cmp))) #if WANT_SIMD_EXCEPT @@ -111,3 +112,11 @@ float32x4_t VPCS_ATTR V_NAME_F1 (exp2) (float32x4_t x) return vfmaq_f32 (scale, poly, scale); } + +HALF_WIDTH_ALIAS_F1 (exp2) + +TEST_SIG (V, F, 1, exp2, -9.9, 9.9) +TEST_ULP (V_NAME_F1 (exp2), 1.49) +TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (exp2), WANT_SIMD_EXCEPT) +TEST_INTERVAL (V_NAME_F1 (exp2), 0, 0xffff0000, 10000) +TEST_SYM_INTERVAL (V_NAME_F1 (exp2), 0x1p-14, 0x1p8, 500000) diff --git a/math/aarch64/advsimd/exp2f_1u.c b/math/aarch64/advsimd/exp2f_1u.c new file mode 100644 index 00000000000000..1f8e89ab658fa1 --- /dev/null +++ b/math/aarch64/advsimd/exp2f_1u.c @@ -0,0 +1,73 @@ +/* + * Single-precision vector 2^x function. + * + * Copyright (c) 2019-2024, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_defs.h" + +static const struct data +{ + float32x4_t c0, c1, c2, c3, c4, c5, shift; + uint32x4_t exponent_bias; + float32x4_t special_bound, scale_thresh; + uint32x4_t special_offset, special_bias; +} data = { + .shift = V4 (0x1.8p23f), + .exponent_bias = V4 (0x3f800000), + .special_bound = V4 (126.0f), + .scale_thresh = V4 (192.0f), + .special_offset = V4 (0x82000000), + .special_bias = V4 (0x7f000000), + /* maxerr: 0.878 ulp. */ + .c0 = V4 (0x1.416b5ep-13f), + .c1 = V4 (0x1.5f082ep-10f), + .c2 = V4 (0x1.3b2dep-7f), + .c3 = V4 (0x1.c6af7cp-5f), + .c4 = V4 (0x1.ebfbdcp-3f), + .c5 = V4 (0x1.62e43p-1f), +}; + +static float32x4_t VPCS_ATTR NOINLINE +specialcase (float32x4_t p, float32x4_t n, uint32x4_t e, const struct data *d) +{ + /* 2^n may overflow, break it up into s1*s2. */ + uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset); + float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias)); + float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b)); + uint32x4_t cmp = vcagtq_f32 (n, d->scale_thresh); + float32x4_t r1 = vmulq_f32 (s1, s1); + float32x4_t r0 = vmulq_f32 (vmulq_f32 (p, s1), s2); + return vreinterpretq_f32_u32 ((cmp & vreinterpretq_u32_f32 (r1)) + | (~cmp & vreinterpretq_u32_f32 (r0))); +} + +float32x4_t VPCS_ATTR +_ZGVnN4v_exp2f_1u (float32x4_t x) +{ + /* exp2(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)] + x = n + r, with r in [-1/2, 1/2]. 
*/ + const struct data *d = ptr_barrier (&data); + float32x4_t n = vrndaq_f32 (x); + float32x4_t r = x - n; + uint32x4_t e = vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)) << 23; + float32x4_t scale = vreinterpretq_f32_u32 (e + d->exponent_bias); + uint32x4_t cmp = vcagtq_f32 (n, d->special_bound); + + float32x4_t p = vfmaq_f32 (d->c1, d->c0, r); + p = vfmaq_f32 (d->c2, p, r); + p = vfmaq_f32 (d->c3, p, r); + p = vfmaq_f32 (d->c4, p, r); + p = vfmaq_f32 (d->c5, p, r); + p = vfmaq_f32 (v_f32 (1.0f), p, r); + if (unlikely (v_any_u32 (cmp))) + return specialcase (p, n, e, d); + return scale * p; +} + +TEST_ULP (_ZGVnN4v_exp2f_1u, 0.4) +TEST_DISABLE_FENV (_ZGVnN4v_exp2f_1u) +TEST_INTERVAL (_ZGVnN4v_exp2f_1u, 0, 0xffff0000, 10000) +TEST_SYM_INTERVAL (_ZGVnN4v_exp2f_1u, 0x1p-14, 0x1p8, 500000) diff --git a/math/aarch64/v_expf.c b/math/aarch64/advsimd/expf.c similarity index 61% rename from math/aarch64/v_expf.c rename to math/aarch64/advsimd/expf.c index 34e8b6081bcd94..e5b1f020d1a02a 100644 --- a/math/aarch64/v_expf.c +++ b/math/aarch64/advsimd/expf.c @@ -1,30 +1,34 @@ /* * Single-precision vector e^x function. * - * Copyright (c) 2019-2023, Arm Limited. + * Copyright (c) 2019-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ - -#include "mathlib.h" #include "v_math.h" +#include "test_defs.h" +#include "test_sig.h" static const struct data { - float32x4_t poly[5]; - float32x4_t shift, inv_ln2, ln2_hi, ln2_lo; - uint32x4_t exponent_bias; + float32x4_t c1, c3, c4, inv_ln2; + float ln2_hi, ln2_lo, c0, c2; + uint32x4_t exponent_bias, special_offset, special_bias; #if !WANT_SIMD_EXCEPT float32x4_t special_bound, scale_thresh; #endif } data = { /* maxerr: 1.45358 +0.5 ulp. 
*/ - .poly = { V4 (0x1.0e4020p-7f), V4 (0x1.573e2ep-5f), V4 (0x1.555e66p-3f), - V4 (0x1.fffdb6p-2f), V4 (0x1.ffffecp-1f) }, - .shift = V4 (0x1.8p23f), + .c0 = 0x1.0e4020p-7f, + .c1 = V4 (0x1.573e2ep-5f), + .c2 = 0x1.555e66p-3f, + .c3 = V4 (0x1.fffdb6p-2f), + .c4 = V4 (0x1.ffffecp-1f), .inv_ln2 = V4 (0x1.715476p+0f), - .ln2_hi = V4 (0x1.62e4p-1f), - .ln2_lo = V4 (0x1.7f7d1cp-20f), + .ln2_hi = 0x1.62e4p-1f, + .ln2_lo = 0x1.7f7d1cp-20f, .exponent_bias = V4 (0x3f800000), + .special_offset = V4 (0x82000000), + .special_bias = V4 (0x7f000000), #if !WANT_SIMD_EXCEPT .special_bound = V4 (126.0f), .scale_thresh = V4 (192.0f), @@ -49,19 +53,17 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp) #else -# define SpecialOffset v_u32 (0x82000000) -# define SpecialBias v_u32 (0x7f000000) - static float32x4_t VPCS_ATTR NOINLINE special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, float32x4_t scale, const struct data *d) { /* 2^n may overflow, break it up into s1*s2. */ - uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset); - float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias)); + uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset); + float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias)); float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b)); uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh); float32x4_t r2 = vmulq_f32 (s1, s1); + // (s2 + p*s2)*s1 = s2(p+1)s1 float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1); /* Similar to r1 but avoids double rounding in the subnormal range. 
*/ float32x4_t r0 = vfmaq_f32 (scale, poly, scale); @@ -71,15 +73,14 @@ special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, #endif -float32x4_t VPCS_ATTR V_NAME_F1 (exp) (float32x4_t x) +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp) (float32x4_t x) { const struct data *d = ptr_barrier (&data); - float32x4_t n, r, r2, scale, p, q, poly, z; - uint32x4_t cmp, e; + float32x4_t ln2_c02 = vld1q_f32 (&d->ln2_hi); #if WANT_SIMD_EXCEPT /* asuint(x) - TinyBound >= BigBound - TinyBound. */ - cmp = vcgeq_u32 ( + uint32x4_t cmp = vcgeq_u32 ( vsubq_u32 (vandq_u32 (vreinterpretq_u32_f32 (x), v_u32 (0x7fffffff)), TinyBound), SpecialBound); @@ -93,23 +94,22 @@ float32x4_t VPCS_ATTR V_NAME_F1 (exp) (float32x4_t x) /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ - z = vfmaq_f32 (d->shift, x, d->inv_ln2); - n = vsubq_f32 (z, d->shift); - r = vfmsq_f32 (x, n, d->ln2_hi); - r = vfmsq_f32 (r, n, d->ln2_lo); - e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23); - scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); + float32x4_t n = vrndaq_f32 (vmulq_f32 (x, d->inv_ln2)); + float32x4_t r = vfmsq_laneq_f32 (x, n, ln2_c02, 0); + r = vfmsq_laneq_f32 (r, n, ln2_c02, 1); + uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 23); + float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); #if !WANT_SIMD_EXCEPT - cmp = vcagtq_f32 (n, d->special_bound); + uint32x4_t cmp = vcagtq_f32 (n, d->special_bound); #endif - r2 = vmulq_f32 (r, r); - p = vfmaq_f32 (C (1), C (0), r); - q = vfmaq_f32 (C (3), C (2), r); + float32x4_t r2 = vmulq_f32 (r, r); + float32x4_t p = vfmaq_laneq_f32 (d->c1, r, ln2_c02, 2); + float32x4_t q = vfmaq_laneq_f32 (d->c3, r, ln2_c02, 3); q = vfmaq_f32 (q, p, r2); - p = vmulq_f32 (C (4), r); - poly = vfmaq_f32 (p, q, r2); + p = vmulq_f32 (d->c4, r); + float32x4_t poly = vfmaq_f32 (p, q, r2); if (unlikely (v_any_u32 (cmp))) #if 
WANT_SIMD_EXCEPT @@ -120,3 +120,11 @@ float32x4_t VPCS_ATTR V_NAME_F1 (exp) (float32x4_t x) return vfmaq_f32 (scale, poly, scale); } + +HALF_WIDTH_ALIAS_F1 (exp) + +TEST_SIG (V, F, 1, exp, -9.9, 9.9) +TEST_ULP (V_NAME_F1 (exp), 1.49) +TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (exp), WANT_SIMD_EXCEPT) +TEST_INTERVAL (V_NAME_F1 (exp), 0, 0xffff0000, 10000) +TEST_SYM_INTERVAL (V_NAME_F1 (exp), 0x1p-14, 0x1p8, 500000) diff --git a/math/aarch64/advsimd/expf_1u.c b/math/aarch64/advsimd/expf_1u.c new file mode 100644 index 00000000000000..4e114d810e08b3 --- /dev/null +++ b/math/aarch64/advsimd/expf_1u.c @@ -0,0 +1,79 @@ +/* + * Single-precision vector e^x function. + * + * Copyright (c) 2019-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "v_math.h" +#include "test_defs.h" + +static const struct data +{ + float32x4_t shift, inv_ln2; + uint32x4_t exponent_bias; + float32x4_t c1, c2, c3, c4; + float32x4_t special_bound, scale_thresh; + uint32x4_t special_offset, special_bias; + float ln2_hi, ln2_lo, c0, nothing; +} data = { + .ln2_hi = 0x1.62e4p-1f, + .ln2_lo = 0x1.7f7d1cp-20f, + .shift = V4 (0x1.8p23f), + .inv_ln2 = V4 (0x1.715476p+0f), + .exponent_bias = V4 (0x3f800000), + .special_bound = V4 (126.0f), + .scale_thresh = V4 (192.0f), + .special_offset = V4 (0x83000000), + .special_bias = V4 (0x7f000000), + /* maxerr: 0.36565 +0.5 ulp. */ + .c0 = 0x1.6a6000p-10f, + .c1 = V4 (0x1.12718ep-7f), + .c2 = V4 (0x1.555af0p-5f), + .c3 = V4 (0x1.555430p-3f), + .c4 = V4 (0x1.fffff4p-2f), +}; + +static float32x4_t VPCS_ATTR NOINLINE +specialcase (float32x4_t p, float32x4_t n, uint32x4_t e, const struct data *d) +{ + /* 2^n may overflow, break it up into s1*s2. 
*/ + uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset); + float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias)); + float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b)); + uint32x4_t cmp = vcagtq_f32 (n, d->scale_thresh); + float32x4_t r1 = vmulq_f32 (s1, s1); + float32x4_t r0 = vmulq_f32 (vmulq_f32 (p, s1), s2); + return vreinterpretq_f32_u32 ((cmp & vreinterpretq_u32_f32 (r1)) + | (~cmp & vreinterpretq_u32_f32 (r0))); +} + +float32x4_t VPCS_ATTR +_ZGVnN4v_expf_1u (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + float32x4_t ln2_c0 = vld1q_f32 (&d->ln2_hi); + + /* exp(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)] + x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ + float32x4_t z = vmulq_f32 (x, d->inv_ln2); + float32x4_t n = vrndaq_f32 (z); + float32x4_t r = vfmsq_laneq_f32 (x, n, ln2_c0, 0); + r = vfmsq_laneq_f32 (r, n, ln2_c0, 1); + uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (z)), 23); + float32x4_t scale = vreinterpretq_f32_u32 (e + d->exponent_bias); + uint32x4_t cmp = vcagtq_f32 (n, d->special_bound); + float32x4_t p = vfmaq_laneq_f32 (d->c1, r, ln2_c0, 2); + p = vfmaq_f32 (d->c2, p, r); + p = vfmaq_f32 (d->c3, p, r); + p = vfmaq_f32 (d->c4, p, r); + p = vfmaq_f32 (v_f32 (1.0f), p, r); + p = vfmaq_f32 (v_f32 (1.0f), p, r); + if (unlikely (v_any_u32 (cmp))) + return specialcase (p, n, e, d); + return scale * p; +} + +TEST_ULP (_ZGVnN4v_expf_1u, 0.4) +TEST_DISABLE_FENV (_ZGVnN4v_expf_1u) +TEST_INTERVAL (_ZGVnN4v_expf_1u, 0, 0xffff0000, 10000) +TEST_SYM_INTERVAL (_ZGVnN4v_expf_1u, 0x1p-14, 0x1p8, 500000) diff --git a/math/aarch64/advsimd/expm1.c b/math/aarch64/advsimd/expm1.c new file mode 100644 index 00000000000000..7535a18304277e --- /dev/null +++ b/math/aarch64/advsimd/expm1.c @@ -0,0 +1,77 @@ +/* + * Double-precision vector exp(x) - 1 function. + * + * Copyright (c) 2022-2024, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" +#include "v_expm1_inline.h" + +static const struct data +{ + struct v_expm1_data d; +#if WANT_SIMD_EXCEPT + uint64x2_t thresh, tiny_bound; +#else + float64x2_t oflow_bound; +#endif +} data = { + .d = V_EXPM1_DATA, +#if WANT_SIMD_EXCEPT + /* asuint64(oflow_bound) - asuint64(0x1p-51), shifted left by 1 for abs + compare. */ + .thresh = V2 (0x78c56fa6d34b552), + /* asuint64(0x1p-51) << 1. */ + .tiny_bound = V2 (0x3cc0000000000000 << 1), +#else + /* Value above which expm1(x) should overflow. Absolute value of the + underflow bound is greater than this, so it catches both cases - there is + a small window where fallbacks are triggered unnecessarily. */ + .oflow_bound = V2 (0x1.62b7d369a5aa9p+9), +#endif +}; + +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, uint64x2_t special, const struct data *d) +{ + return v_call_f64 (expm1, x, expm1_inline (v_zerofy_f64 (x, special), &d->d), + special); +} + +/* Double-precision vector exp(x) - 1 function. + The maximum error observed error is 2.05 ULP: + _ZGVnN2v_expm1(0x1.6329669eb8c87p-2) got 0x1.a8897eef87b34p-2 + want 0x1.a8897eef87b32p-2. */ +float64x2_t VPCS_ATTR V_NAME_D1 (expm1) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + +#if WANT_SIMD_EXCEPT + uint64x2_t ix = vreinterpretq_u64_f64 (x); + /* If fp exceptions are to be triggered correctly, fall back to scalar for + |x| < 2^-51, |x| > oflow_bound, Inf & NaN. Add ix to itself for + shift-left by 1, and compare with thresh which was left-shifted offline - + this is effectively an absolute compare. */ + uint64x2_t special + = vcgeq_u64 (vsubq_u64 (vaddq_u64 (ix, ix), d->tiny_bound), d->thresh); +#else + /* Large input, NaNs and Infs. 
*/ + uint64x2_t special = vcageq_f64 (x, d->oflow_bound); +#endif + + if (unlikely (v_any_u64 (special))) + return special_case (x, special, d); + + /* expm1(x) ~= p * t + (t - 1). */ + return expm1_inline (x, &d->d); +} + +TEST_SIG (V, D, 1, expm1, -9.9, 9.9) +TEST_ULP (V_NAME_D1 (expm1), 1.56) +TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (expm1), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_D1 (expm1), 0, 0x1p-51, 1000) +TEST_SYM_INTERVAL (V_NAME_D1 (expm1), 0x1p-51, 0x1.62b7d369a5aa9p+9, 100000) +TEST_SYM_INTERVAL (V_NAME_D1 (expm1), 0x1.62b7d369a5aa9p+9, inf, 100) diff --git a/math/aarch64/advsimd/expm1f.c b/math/aarch64/advsimd/expm1f.c new file mode 100644 index 00000000000000..6d4431dcd8a5e3 --- /dev/null +++ b/math/aarch64/advsimd/expm1f.c @@ -0,0 +1,82 @@ +/* + * Single-precision vector exp(x) - 1 function. + * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" +#include "v_expm1f_inline.h" + +static const struct data +{ + struct v_expm1f_data d; +#if WANT_SIMD_EXCEPT + uint32x4_t thresh; +#else + float32x4_t oflow_bound; +#endif +} data = { + .d = V_EXPM1F_DATA, +#if !WANT_SIMD_EXCEPT + /* Value above which expm1f(x) should overflow. Absolute value of the + underflow bound is greater than this, so it catches both cases - there is + a small window where fallbacks are triggered unnecessarily. */ + .oflow_bound = V4 (0x1.5ebc4p+6), +#else + /* asuint(oflow_bound) - asuint(0x1p-23), shifted left by 1 for absolute + compare. */ + .thresh = V4 (0x1d5ebc40), +#endif +}; + +/* asuint(0x1p-23), shifted by 1 for abs compare. */ +#define TinyBound v_u32 (0x34000000 << 1) + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, uint32x4_t special, const struct data *d) +{ + return v_call_f32 ( + expm1f, x, expm1f_inline (v_zerofy_f32 (x, special), &d->d), special); +} + +/* Single-precision vector exp(x) - 1 function. 
+ The maximum error is 1.62 ULP: + _ZGVnN4v_expm1f(0x1.85f83p-2) got 0x1.da9f4p-2 + want 0x1.da9f44p-2. */ +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (expm1) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + +#if WANT_SIMD_EXCEPT + uint32x4_t ix = vreinterpretq_u32_f32 (x); + /* If fp exceptions are to be triggered correctly, fall back to scalar for + |x| < 2^-23, |x| > oflow_bound, Inf & NaN. Add ix to itself for + shift-left by 1, and compare with thresh which was left-shifted offline - + this is effectively an absolute compare. */ + uint32x4_t special + = vcgeq_u32 (vsubq_u32 (vaddq_u32 (ix, ix), TinyBound), d->thresh); +#else + /* Handles very large values (+ve and -ve), +/-NaN, +/-Inf. */ + uint32x4_t special = vcagtq_f32 (x, d->oflow_bound); +#endif + + if (unlikely (v_any_u32 (special))) + return special_case (x, special, d); + + /* expm1(x) ~= p * t + (t - 1). */ + return expm1f_inline (x, &d->d); +} + +HALF_WIDTH_ALIAS_F1 (expm1) + +TEST_SIG (V, F, 1, expm1, -9.9, 9.9) +TEST_ULP (V_NAME_F1 (expm1), 1.13) +TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (expm1), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_F1 (expm1), 0, 0x1p-23, 1000) +TEST_INTERVAL (V_NAME_F1 (expm1), -0x1p-23, 0x1.5ebc4p+6, 1000000) +TEST_INTERVAL (V_NAME_F1 (expm1), -0x1p-23, -0x1.9bbabcp+6, 1000000) +TEST_INTERVAL (V_NAME_F1 (expm1), 0x1.5ebc4p+6, inf, 1000) +TEST_INTERVAL (V_NAME_F1 (expm1), -0x1.9bbabcp+6, -inf, 1000) diff --git a/pl/math/finite_pow.h b/math/aarch64/advsimd/finite_pow.h similarity index 94% rename from pl/math/finite_pow.h rename to math/aarch64/advsimd/finite_pow.h index 8944d4fae62589..0c8350a1a77bb3 100644 --- a/pl/math/finite_pow.h +++ b/math/aarch64/advsimd/finite_pow.h @@ -1,7 +1,7 @@ /* * Double-precision x^y function. * - * Copyright (c) 2018-2023, Arm Limited. + * Copyright (c) 2018-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ @@ -108,7 +108,7 @@ special_case (double tmp, uint64_t sbits, uint64_t ki) sbits -= 1009ull << 52; scale = asdouble (sbits); y = 0x1p1009 * (scale + scale * tmp); - return check_oflow (eval_as_double (y)); + return y; } /* k < 0, need special care in the subnormal range. */ sbits += 1022ull << 52; @@ -128,7 +128,7 @@ special_case (double tmp, uint64_t sbits, uint64_t ki) lo = scale - y + scale * tmp; hi = one + y; lo = one - hi + y + lo; - y = eval_as_double (hi + lo) - one; + y = (hi + lo) - one; /* Fix the sign of 0. */ if (y == 0.0) y = asdouble (sbits & 0x8000000000000000); @@ -137,7 +137,7 @@ special_case (double tmp, uint64_t sbits, uint64_t ki) } #endif y = 0x1p-1022 * y; - return check_uflow (eval_as_double (y)); + return y; } /* Computes sign*exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|. @@ -192,7 +192,7 @@ exp_inline (double x, double xtail, uint32_t sign_bias) double scale = asdouble (sbits); /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there is no spurious underflow here even without fma. */ - return eval_as_double (scale + scale * tmp); + return scale + scale * tmp; } /* Computes exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|. @@ -239,7 +239,7 @@ exp_nosignbias (double x, double xtail) double scale = asdouble (sbits); /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there is no spurious underflow here even without fma. */ - return eval_as_double (scale + scale * tmp); + return scale + scale * tmp; } /* Returns 0 if not int, 1 if odd int, 2 if even int. 
The argument is @@ -267,7 +267,7 @@ zeroinfnan (uint64_t i) } static double NOINLINE -__pl_finite_pow (double x, double y) +pow_scalar_special_case (double x, double y) { uint32_t sign_bias = 0; uint64_t ix, iy; @@ -311,9 +311,7 @@ __pl_finite_pow (double x, double y) if (2 * ix == 0 && iy >> 63) return __math_divzero (sign_bias); #endif - /* Without the barrier some versions of clang hoist the 1/x2 and - thus division by zero exception can be signaled spuriously. */ - return iy >> 63 ? opt_barrier_double (1 / x2) : x2; + return iy >> 63 ? 1 / x2 : x2; } /* Here x and y are non-zero finite. */ if (ix >> 63) @@ -349,9 +347,7 @@ __pl_finite_pow (double x, double y) if (topx == 0) { /* Normalize subnormal x so exponent becomes negative. */ - /* Without the barrier some versions of clang evalutate the mul - unconditionally causing spurious overflow exceptions. */ - ix = asuint64 (opt_barrier_double (x) * 0x1p52); + ix = asuint64 (x * 0x1p52); ix &= 0x7fffffffffffffff; ix -= 52ULL << 52; } diff --git a/pl/math/v_hypot_1u5.c b/math/aarch64/advsimd/hypot.c similarity index 74% rename from pl/math/v_hypot_1u5.c rename to math/aarch64/advsimd/hypot.c index d4ff7be89a8fa7..dc01ed5bac931e 100644 --- a/pl/math/v_hypot_1u5.c +++ b/math/aarch64/advsimd/hypot.c @@ -1,13 +1,13 @@ /* * Double-precision vector hypot(x) function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_math.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" #if WANT_SIMD_EXCEPT static const struct data @@ -15,7 +15,7 @@ static const struct data uint64x2_t tiny_bound, thres; } data = { .tiny_bound = V2 (0x2000000000000000), /* asuint (0x1p-511). */ - .thres = V2 (0x3fe0000000000000), /* asuint (0x1p511) - tiny_bound. */ + .thres = V2 (0x3fe0000000000000), /* asuint (0x1p511) - tiny_bound. 
*/ }; #else static const struct data @@ -24,7 +24,7 @@ static const struct data uint32x4_t thres; } data = { .tiny_bound = V2 (0x0360000000000000), /* asuint (0x1p-969). */ - .thres = V4 (0x7c900000), /* asuint (inf) - tiny_bound. */ + .thres = V4 (0x7c900000), /* asuint (inf) - tiny_bound. */ }; #endif @@ -75,9 +75,9 @@ float64x2_t VPCS_ATTR V_NAME_D2 (hypot) (float64x2_t x, float64x2_t y) float64x2_t sqsum = vfmaq_f64 (vmulq_f64 (x, x), y, y); - uint32x2_t special = vcge_u32 ( - vsubhn_u64 (vreinterpretq_u64_f64 (sqsum), d->tiny_bound), - vget_low_u32 (d->thres)); + uint32x2_t special + = vcge_u32 (vsubhn_u64 (vreinterpretq_u64_f64 (sqsum), d->tiny_bound), + vget_low_u32 (d->thres)); if (unlikely (v_any_u32h (special))) return special_case (x, y, sqsum, special); @@ -86,10 +86,10 @@ float64x2_t VPCS_ATTR V_NAME_D2 (hypot) (float64x2_t x, float64x2_t y) } #endif -PL_SIG (V, D, 2, hypot, -10.0, 10.0) -PL_TEST_ULP (V_NAME_D2 (hypot), 1.21) -PL_TEST_EXPECT_FENV (V_NAME_D2 (hypot), WANT_SIMD_EXCEPT) -PL_TEST_INTERVAL2 (V_NAME_D2 (hypot), 0, inf, 0, inf, 10000) -PL_TEST_INTERVAL2 (V_NAME_D2 (hypot), 0, inf, -0, -inf, 10000) -PL_TEST_INTERVAL2 (V_NAME_D2 (hypot), -0, -inf, 0, inf, 10000) -PL_TEST_INTERVAL2 (V_NAME_D2 (hypot), -0, -inf, -0, -inf, 10000) +TEST_SIG (V, D, 2, hypot, -10.0, 10.0) +TEST_ULP (V_NAME_D2 (hypot), 1.21) +TEST_DISABLE_FENV_IF_NOT (V_NAME_D2 (hypot), WANT_SIMD_EXCEPT) +TEST_INTERVAL2 (V_NAME_D2 (hypot), 0, inf, 0, inf, 10000) +TEST_INTERVAL2 (V_NAME_D2 (hypot), 0, inf, -0, -inf, 10000) +TEST_INTERVAL2 (V_NAME_D2 (hypot), -0, -inf, 0, inf, 10000) +TEST_INTERVAL2 (V_NAME_D2 (hypot), -0, -inf, -0, -inf, 10000) diff --git a/pl/math/v_hypotf_1u5.c b/math/aarch64/advsimd/hypotf.c similarity index 68% rename from pl/math/v_hypotf_1u5.c rename to math/aarch64/advsimd/hypotf.c index 3227b0a3fd8bac..69634875be5a35 100644 --- a/pl/math/v_hypotf_1u5.c +++ b/math/aarch64/advsimd/hypotf.c @@ -1,13 +1,13 @@ /* * Single-precision vector hypot(x) function. 
* - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_math.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" #if WANT_SIMD_EXCEPT static const struct data @@ -15,7 +15,7 @@ static const struct data uint32x4_t tiny_bound, thres; } data = { .tiny_bound = V4 (0x20000000), /* asuint (0x1p-63). */ - .thres = V4 (0x3f000000), /* asuint (0x1p63) - tiny_bound. */ + .thres = V4 (0x3f000000), /* asuint (0x1p63) - tiny_bound. */ }; #else static const struct data @@ -24,7 +24,7 @@ static const struct data uint16x8_t thres; } data = { .tiny_bound = V4 (0x0C800000), /* asuint (0x1p-102). */ - .thres = V8 (0x7300), /* asuint (inf) - tiny_bound. */ + .thres = V8 (0x7300), /* asuint (inf) - tiny_bound. */ }; #endif @@ -41,7 +41,7 @@ special_case (float32x4_t x, float32x4_t y, float32x4_t sqsum, want 0x1.6a41dp-13. */ #if WANT_SIMD_EXCEPT -float32x4_t VPCS_ATTR V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y) +float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y) { const struct data *d = ptr_barrier (&data); @@ -68,15 +68,15 @@ float32x4_t VPCS_ATTR V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y) } #else -float32x4_t VPCS_ATTR V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y) +float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y) { const struct data *d = ptr_barrier (&data); float32x4_t sqsum = vfmaq_f32 (vmulq_f32 (x, x), y, y); - uint16x4_t special = vcge_u16 ( - vsubhn_u32 (vreinterpretq_u32_f32 (sqsum), d->tiny_bound), - vget_low_u16 (d->thres)); + uint16x4_t special + = vcge_u16 (vsubhn_u32 (vreinterpretq_u32_f32 (sqsum), d->tiny_bound), + vget_low_u16 (d->thres)); if (unlikely (v_any_u16h (special))) return special_case (x, y, sqsum, special); @@ -85,10 +85,12 @@ float32x4_t VPCS_ATTR V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y) } #endif -PL_SIG (V, F, 2, hypot, -10.0, 10.0) 
-PL_TEST_ULP (V_NAME_F2 (hypot), 1.21) -PL_TEST_EXPECT_FENV (V_NAME_F2 (hypot), WANT_SIMD_EXCEPT) -PL_TEST_INTERVAL2 (V_NAME_F2 (hypot), 0, inf, 0, inf, 10000) -PL_TEST_INTERVAL2 (V_NAME_F2 (hypot), 0, inf, -0, -inf, 10000) -PL_TEST_INTERVAL2 (V_NAME_F2 (hypot), -0, -inf, 0, inf, 10000) -PL_TEST_INTERVAL2 (V_NAME_F2 (hypot), -0, -inf, -0, -inf, 10000) +HALF_WIDTH_ALIAS_F2 (hypot) + +TEST_SIG (V, F, 2, hypot, -10.0, 10.0) +TEST_ULP (V_NAME_F2 (hypot), 1.21) +TEST_DISABLE_FENV_IF_NOT (V_NAME_F2 (hypot), WANT_SIMD_EXCEPT) +TEST_INTERVAL2 (V_NAME_F2 (hypot), 0, inf, 0, inf, 10000) +TEST_INTERVAL2 (V_NAME_F2 (hypot), 0, inf, -0, -inf, 10000) +TEST_INTERVAL2 (V_NAME_F2 (hypot), -0, -inf, 0, inf, 10000) +TEST_INTERVAL2 (V_NAME_F2 (hypot), -0, -inf, -0, -inf, 10000) diff --git a/math/aarch64/advsimd/log.c b/math/aarch64/advsimd/log.c new file mode 100644 index 00000000000000..94e3f448207987 --- /dev/null +++ b/math/aarch64/advsimd/log.c @@ -0,0 +1,118 @@ +/* + * Double-precision vector log(x) function. + * + * Copyright (c) 2019-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_defs.h" +#include "test_sig.h" + +static const struct data +{ + uint64x2_t off, sign_exp_mask, offset_lower_bound; + uint32x4_t special_bound; + float64x2_t c0, c2; + double c1, c3, ln2, c4; +} data = { + /* Rel error: 0x1.6272e588p-56 in [ -0x1.fc1p-9 0x1.009p-8 ]. */ + .c0 = V2 (-0x1.ffffffffffff7p-2), + .c1 = 0x1.55555555170d4p-2, + .c2 = V2 (-0x1.0000000399c27p-2), + .c3 = 0x1.999b2e90e94cap-3, + .c4 = -0x1.554e550bd501ep-3, + .ln2 = 0x1.62e42fefa39efp-1, + .sign_exp_mask = V2 (0xfff0000000000000), + .off = V2 (0x3fe6900900000000), + /* Lower bound is 0x0010000000000000. For + optimised register use subnormals are detected after offset has been + subtracted, so lower bound - offset (which wraps around). 
*/ + .offset_lower_bound = V2 (0x0010000000000000 - 0x3fe6900900000000), + .special_bound = V4 (0x7fe00000), /* asuint64(inf) - asuint64(0x1p-1022). */ +}; + +#define N (1 << V_LOG_TABLE_BITS) +#define IndexMask (N - 1) + +struct entry +{ + float64x2_t invc; + float64x2_t logc; +}; + +static inline struct entry +lookup (uint64x2_t i) +{ + /* Since N is a power of 2, n % N = n & (N - 1). */ + struct entry e; + uint64_t i0 = (vgetq_lane_u64 (i, 0) >> (52 - V_LOG_TABLE_BITS)) & IndexMask; + uint64_t i1 = (vgetq_lane_u64 (i, 1) >> (52 - V_LOG_TABLE_BITS)) & IndexMask; + float64x2_t e0 = vld1q_f64 (&__v_log_data.table[i0].invc); + float64x2_t e1 = vld1q_f64 (&__v_log_data.table[i1].invc); + e.invc = vuzp1q_f64 (e0, e1); + e.logc = vuzp2q_f64 (e0, e1); + return e; +} + +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t hi, uint64x2_t u_off, float64x2_t y, float64x2_t r2, + uint32x2_t special, const struct data *d) +{ + float64x2_t x = vreinterpretq_f64_u64 (vaddq_u64 (u_off, d->off)); + return v_call_f64 (log, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (special)); +} + +/* Double-precision vector log routine. + The maximum observed error is 2.17 ULP: + _ZGVnN2v_log(0x1.a6129884398a3p+0) got 0x1.ffffff1cca043p-2 + want 0x1.ffffff1cca045p-2. */ +float64x2_t VPCS_ATTR V_NAME_D1 (log) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + + /* To avoid having to mov x out of the way, keep u after offset has been + applied, and recover x by adding the offset back in the special-case + handler. */ + uint64x2_t u = vreinterpretq_u64_f64 (x); + uint64x2_t u_off = vsubq_u64 (u, d->off); + + /* x = 2^k z; where z is in range [Off,2*Off) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center.
*/ + int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52); + uint64x2_t iz = vsubq_u64 (u, vandq_u64 (u_off, d->sign_exp_mask)); + float64x2_t z = vreinterpretq_f64_u64 (iz); + + struct entry e = lookup (u_off); + + uint32x2_t special = vcge_u32 (vsubhn_u64 (u_off, d->offset_lower_bound), + vget_low_u32 (d->special_bound)); + + /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */ + float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc); + float64x2_t kd = vcvtq_f64_s64 (k); + + /* hi = r + log(c) + k*Ln2. */ + float64x2_t ln2_and_c4 = vld1q_f64 (&d->ln2); + float64x2_t hi = vfmaq_laneq_f64 (vaddq_f64 (e.logc, r), kd, ln2_and_c4, 0); + + /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */ + float64x2_t odd_coeffs = vld1q_f64 (&d->c1); + float64x2_t r2 = vmulq_f64 (r, r); + float64x2_t y = vfmaq_laneq_f64 (d->c2, r, odd_coeffs, 1); + float64x2_t p = vfmaq_laneq_f64 (d->c0, r, odd_coeffs, 0); + y = vfmaq_laneq_f64 (y, r2, ln2_and_c4, 1); + y = vfmaq_f64 (p, r2, y); + + if (unlikely (v_any_u32h (special))) + return special_case (hi, u_off, y, r2, special, d); + return vfmaq_f64 (hi, y, r2); +} + +TEST_SIG (V, D, 1, log, 0.01, 11.1) +TEST_ULP (V_NAME_D1 (log), 1.67) +TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (log), WANT_SIMD_EXCEPT) +TEST_INTERVAL (V_NAME_D1 (log), 0, 0xffff000000000000, 10000) +TEST_INTERVAL (V_NAME_D1 (log), 0x1p-4, 0x1p4, 400000) +TEST_INTERVAL (V_NAME_D1 (log), 0, inf, 400000) diff --git a/math/aarch64/advsimd/log10.c b/math/aarch64/advsimd/log10.c new file mode 100644 index 00000000000000..c2b8f1c54f0e91 --- /dev/null +++ b/math/aarch64/advsimd/log10.c @@ -0,0 +1,132 @@ +/* + * Double-precision vector log10(x) function. + * + * Copyright (c) 2022-2024, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" + +static const struct data +{ + uint64x2_t off, sign_exp_mask, offset_lower_bound; + uint32x4_t special_bound; + double invln10, log10_2; + double c1, c3; + float64x2_t c0, c2, c4; +} data = { + /* Computed from log coefficients divided by log(10) then rounded to double + precision. */ + .c0 = V2 (-0x1.bcb7b1526e506p-3), + .c1 = 0x1.287a7636be1d1p-3, + .c2 = V2 (-0x1.bcb7b158af938p-4), + .c3 = 0x1.63c78734e6d07p-4, + .c4 = V2 (-0x1.287461742fee4p-4), + .invln10 = 0x1.bcb7b1526e50ep-2, + .log10_2 = 0x1.34413509f79ffp-2, + .off = V2 (0x3fe6900900000000), + .sign_exp_mask = V2 (0xfff0000000000000), + /* Lower bound is 0x0010000000000000. For + optimised register use subnormals are detected after offset has been + subtracted, so lower bound - offset (which wraps around). */ + .offset_lower_bound = V2 (0x0010000000000000 - 0x3fe6900900000000), + .special_bound = V4 (0x7fe00000), /* asuint64(inf) - 0x0010000000000000. 
*/ +}; + +#define N (1 << V_LOG10_TABLE_BITS) +#define IndexMask (N - 1) + +struct entry +{ + float64x2_t invc; + float64x2_t log10c; +}; + +static inline struct entry +lookup (uint64x2_t i) +{ + struct entry e; + uint64_t i0 + = (vgetq_lane_u64 (i, 0) >> (52 - V_LOG10_TABLE_BITS)) & IndexMask; + uint64_t i1 + = (vgetq_lane_u64 (i, 1) >> (52 - V_LOG10_TABLE_BITS)) & IndexMask; + float64x2_t e0 = vld1q_f64 (&__v_log10_data.table[i0].invc); + float64x2_t e1 = vld1q_f64 (&__v_log10_data.table[i1].invc); + e.invc = vuzp1q_f64 (e0, e1); + e.log10c = vuzp2q_f64 (e0, e1); + return e; +} + +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t hi, uint64x2_t u_off, float64x2_t y, float64x2_t r2, + uint32x2_t special, const struct data *d) +{ + float64x2_t x = vreinterpretq_f64_u64 (vaddq_u64 (u_off, d->off)); + return v_call_f64 (log10, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (special)); +} + +/* Fast implementation of double-precision vector log10 + is a slight modification of double-precision vector log. + Max ULP error: < 2.5 ulp (nearest rounding.) + Maximum measured at 2.46 ulp for x in [0.96, 0.97] + _ZGVnN2v_log10(0x1.13192407fcb46p+0) got 0x1.fff6be3cae4bbp-6 + want 0x1.fff6be3cae4b9p-6. */ +float64x2_t VPCS_ATTR V_NAME_D1 (log10) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + + /* To avoid having to mov x out of the way, keep u after offset has been + applied, and recover x by adding the offset back in the special-case + handler. */ + uint64x2_t u = vreinterpretq_u64_f64 (x); + uint64x2_t u_off = vsubq_u64 (u, d->off); + + /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. 
*/ + int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52); + uint64x2_t iz = vsubq_u64 (u, vandq_u64 (u_off, d->sign_exp_mask)); + float64x2_t z = vreinterpretq_f64_u64 (iz); + + struct entry e = lookup (u_off); + + uint32x2_t special = vcge_u32 (vsubhn_u64 (u_off, d->offset_lower_bound), + vget_low_u32 (d->special_bound)); + + /* log10(x) = log1p(z/c-1)/log(10) + log10(c) + k*log10(2). */ + float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc); + float64x2_t kd = vcvtq_f64_s64 (k); + + /* hi = r / log(10) + log10(c) + k*log10(2). + Constants in v_log10_data.c are computed (in extended precision) as + e.log10c := e.logc * invln10. */ + float64x2_t cte = vld1q_f64 (&d->invln10); + float64x2_t hi = vfmaq_laneq_f64 (e.log10c, r, cte, 0); + + /* y = log10(1+r) + n * log10(2). */ + hi = vfmaq_laneq_f64 (hi, kd, cte, 1); + + /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */ + float64x2_t r2 = vmulq_f64 (r, r); + float64x2_t odd_coeffs = vld1q_f64 (&d->c1); + float64x2_t y = vfmaq_laneq_f64 (d->c2, r, odd_coeffs, 1); + float64x2_t p = vfmaq_laneq_f64 (d->c0, r, odd_coeffs, 0); + y = vfmaq_f64 (y, d->c4, r2); + y = vfmaq_f64 (p, y, r2); + + if (unlikely (v_any_u32h (special))) + return special_case (hi, u_off, y, r2, special, d); + return vfmaq_f64 (hi, y, r2); +} + +TEST_SIG (V, D, 1, log10, 0.01, 11.1) +TEST_ULP (V_NAME_D1 (log10), 1.97) +TEST_INTERVAL (V_NAME_D1 (log10), -0.0, -inf, 1000) +TEST_INTERVAL (V_NAME_D1 (log10), 0, 0x1p-149, 1000) +TEST_INTERVAL (V_NAME_D1 (log10), 0x1p-149, 0x1p-126, 4000) +TEST_INTERVAL (V_NAME_D1 (log10), 0x1p-126, 0x1p-23, 50000) +TEST_INTERVAL (V_NAME_D1 (log10), 0x1p-23, 1.0, 50000) +TEST_INTERVAL (V_NAME_D1 (log10), 1.0, 100, 50000) +TEST_INTERVAL (V_NAME_D1 (log10), 100, inf, 50000) diff --git a/math/aarch64/advsimd/log10f.c b/math/aarch64/advsimd/log10f.c new file mode 100644 index 00000000000000..907c1051e0864c --- /dev/null +++ b/math/aarch64/advsimd/log10f.c @@ -0,0 +1,106 @@ +/* + * Single-precision vector log10 
function. + * + * Copyright (c) 2020-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" + +static const struct data +{ + float32x4_t c0, c2, c4, c6, inv_ln10, ln2; + uint32x4_t off, offset_lower_bound; + uint16x8_t special_bound; + uint32x4_t mantissa_mask; + float c1, c3, c5, c7; +} data = { + /* Use order 9 for log10(1+x), i.e. order 8 for log10(1+x)/x, with x in + [-1/3, 1/3] (offset=2/3). Max. relative error: 0x1.068ee468p-25. */ + .c0 = V4 (-0x1.bcb79cp-3f), + .c1 = 0x1.2879c8p-3f, + .c2 = V4 (-0x1.bcd472p-4f), + .c3 = 0x1.6408f8p-4f, + .c4 = V4 (-0x1.246f8p-4f), + .c5 = 0x1.f0e514p-5f, + .c6 = V4 (-0x1.0fc92cp-4f), + .c7 = 0x1.f5f76ap-5f, + .ln2 = V4 (0x1.62e43p-1f), + .inv_ln10 = V4 (0x1.bcb7b2p-2f), + /* Lower bound is the smallest positive normal float 0x00800000. For + optimised register use subnormals are detected after offset has been + subtracted, so lower bound is 0x00800000 - offset (which wraps around). */ + .offset_lower_bound = V4 (0x00800000 - 0x3f2aaaab), + .special_bound = V8 (0x7f00), /* top16(asuint32(inf) - 0x00800000). */ + .off = V4 (0x3f2aaaab), /* 0.666667. */ + .mantissa_mask = V4 (0x007fffff), +}; + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t y, uint32x4_t u_off, float32x4_t p, float32x4_t r2, + uint16x4_t cmp, const struct data *d) +{ + /* Fall back to scalar code. */ + return v_call_f32 (log10f, vreinterpretq_f32_u32 (vaddq_u32 (u_off, d->off)), + vfmaq_f32 (y, p, r2), vmovl_u16 (cmp)); +} + +/* Fast implementation of AdvSIMD log10f, + uses a similar approach as AdvSIMD logf with the same offset (i.e., 2/3) and + an order 9 polynomial. + Maximum error: 3.305ulps (nearest rounding.) + _ZGVnN4v_log10f(0x1.555c16p+0) got 0x1.ffe2fap-4 + want 0x1.ffe2f4p-4.
*/ +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log10) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + float32x4_t c1357 = vld1q_f32 (&d->c1); + /* To avoid having to mov x out of the way, keep u after offset has been + applied, and recover x by adding the offset back in the special-case + handler. */ + uint32x4_t u_off = vreinterpretq_u32_f32 (x); + + /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ + u_off = vsubq_u32 (u_off, d->off); + float32x4_t n = vcvtq_f32_s32 ( + vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend. */ + + uint16x4_t special = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound), + vget_low_u16 (d->special_bound)); + + uint32x4_t u = vaddq_u32 (vandq_u32 (u_off, d->mantissa_mask), d->off); + float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f)); + + /* y = log10(1+r) + n * log10(2). */ + float32x4_t r2 = vmulq_f32 (r, r); + + float32x4_t c01 = vfmaq_laneq_f32 (d->c0, r, c1357, 0); + float32x4_t c23 = vfmaq_laneq_f32 (d->c2, r, c1357, 1); + float32x4_t c45 = vfmaq_laneq_f32 (d->c4, r, c1357, 2); + float32x4_t c67 = vfmaq_laneq_f32 (d->c6, r, c1357, 3); + + float32x4_t p47 = vfmaq_f32 (c45, r2, c67); + float32x4_t p27 = vfmaq_f32 (c23, r2, p47); + float32x4_t poly = vfmaq_f32 (c01, r2, p27); + + /* y = Log10(2) * n + poly * InvLn(10). 
*/ + float32x4_t y = vfmaq_f32 (r, d->ln2, n); + y = vmulq_f32 (y, d->inv_ln10); + + if (unlikely (v_any_u16h (special))) + return special_case (y, u_off, poly, r2, special, d); + return vfmaq_f32 (y, poly, r2); +} + +HALF_WIDTH_ALIAS_F1 (log10) + +TEST_SIG (V, F, 1, log10, 0.01, 11.1) +TEST_ULP (V_NAME_F1 (log10), 2.81) +TEST_INTERVAL (V_NAME_F1 (log10), -0.0, -inf, 100) +TEST_INTERVAL (V_NAME_F1 (log10), 0, 0x1p-126, 100) +TEST_INTERVAL (V_NAME_F1 (log10), 0x1p-126, 0x1p-23, 50000) +TEST_INTERVAL (V_NAME_F1 (log10), 0x1p-23, 1.0, 50000) +TEST_INTERVAL (V_NAME_F1 (log10), 1.0, 100, 50000) +TEST_INTERVAL (V_NAME_F1 (log10), 100, inf, 50000) diff --git a/math/aarch64/advsimd/log1p.c b/math/aarch64/advsimd/log1p.c new file mode 100644 index 00000000000000..42a0c579392052 --- /dev/null +++ b/math/aarch64/advsimd/log1p.c @@ -0,0 +1,61 @@ +/* + * Double-precision vector log(1+x) function. + * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" + +#define WANT_V_LOG1P_K0_SHORTCUT 0 +#include "v_log1p_inline.h" + +const static struct data +{ + struct v_log1p_data d; + uint64x2_t inf, minus_one; +} data = { .d = V_LOG1P_CONSTANTS_TABLE, + .inf = V2 (0x7ff0000000000000), + .minus_one = V2 (0xbff0000000000000) }; + +#define BottomMask v_u64 (0xffffffff) + +static float64x2_t NOINLINE VPCS_ATTR +special_case (float64x2_t x, uint64x2_t cmp, const struct data *d) +{ + /* Side-step special lanes so fenv exceptions are not triggered + inadvertently. */ + float64x2_t x_nospecial = v_zerofy_f64 (x, cmp); + return v_call_f64 (log1p, x, log1p_inline (x_nospecial, &d->d), cmp); +} + +/* Vector log1p approximation using polynomial on reduced interval. Routine is + a modification of the algorithm used in scalar log1p, with no shortcut for + k=0 and no narrowing for f and k. 
Maximum observed error is 2.45 ULP: + _ZGVnN2v_log1p(0x1.658f7035c4014p+11) got 0x1.fd61d0727429dp+2 + want 0x1.fd61d0727429fp+2 . */ +VPCS_ATTR float64x2_t V_NAME_D1 (log1p) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + uint64x2_t ix = vreinterpretq_u64_f64 (x); + uint64x2_t ia = vreinterpretq_u64_f64 (vabsq_f64 (x)); + + uint64x2_t special_cases + = vorrq_u64 (vcgeq_u64 (ia, d->inf), vcgeq_u64 (ix, d->minus_one)); + + if (unlikely (v_any_u64 (special_cases))) + return special_case (x, special_cases, d); + + return log1p_inline (x, &d->d); +} + +TEST_SIG (V, D, 1, log1p, -0.9, 10.0) +TEST_ULP (V_NAME_D1 (log1p), 1.95) +TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (log1p), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_D1 (log1p), 0.0, 0x1p-23, 50000) +TEST_SYM_INTERVAL (V_NAME_D1 (log1p), 0x1p-23, 0.001, 50000) +TEST_SYM_INTERVAL (V_NAME_D1 (log1p), 0.001, 1.0, 50000) +TEST_INTERVAL (V_NAME_D1 (log1p), 1, inf, 40000) +TEST_INTERVAL (V_NAME_D1 (log1p), -1.0, -inf, 500) diff --git a/math/aarch64/advsimd/log1pf.c b/math/aarch64/advsimd/log1pf.c new file mode 100644 index 00000000000000..94b90249128fa4 --- /dev/null +++ b/math/aarch64/advsimd/log1pf.c @@ -0,0 +1,92 @@ +/* + * Single-precision vector log(1+x) function. + * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" +#include "v_log1pf_inline.h" + +#if WANT_SIMD_EXCEPT + +const static struct data +{ + uint32x4_t minus_one, thresh; + struct v_log1pf_data d; +} data = { + .d = V_LOG1PF_CONSTANTS_TABLE, + .thresh = V4 (0x4b800000), /* asuint32(INFINITY) - TinyBound. */ + .minus_one = V4 (0xbf800000), +}; + +/* asuint32(0x1p-23). ulp=0.5 at 0x1p-23. 
*/ +# define TinyBound v_u32 (0x34000000) + +static float32x4_t NOINLINE VPCS_ATTR +special_case (float32x4_t x, uint32x4_t cmp, const struct data *d) +{ + /* Side-step special lanes so fenv exceptions are not triggered + inadvertently. */ + float32x4_t x_nospecial = v_zerofy_f32 (x, cmp); + return v_call_f32 (log1pf, x, log1pf_inline (x_nospecial, &d->d), cmp); +} + +/* Vector log1pf approximation using polynomial on reduced interval. Worst-case + error is 1.69 ULP: + _ZGVnN4v_log1pf(0x1.04418ap-2) got 0x1.cfcbd8p-3 + want 0x1.cfcbdcp-3. */ +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log1p) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + uint32x4_t ix = vreinterpretq_u32_f32 (x); + uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x)); + + uint32x4_t special_cases + = vorrq_u32 (vcgeq_u32 (vsubq_u32 (ia, TinyBound), d->thresh), + vcgeq_u32 (ix, d->minus_one)); + + if (unlikely (v_any_u32 (special_cases))) + return special_case (x, special_cases, d); + + return log1pf_inline (x, &d->d); +} + +#else + +const static struct v_log1pf_data data = V_LOG1PF_CONSTANTS_TABLE; + +static float32x4_t NOINLINE VPCS_ATTR +special_case (float32x4_t x, uint32x4_t cmp) +{ + return v_call_f32 (log1pf, x, log1pf_inline (x, ptr_barrier (&data)), cmp); +} + +/* Vector log1pf approximation using polynomial on reduced interval. Worst-case + error is 1.63 ULP: + _ZGVnN4v_log1pf(0x1.216d12p-2) got 0x1.fdcb12p-3 + want 0x1.fdcb16p-3. 
*/ +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log1p) (float32x4_t x) +{ + uint32x4_t special_cases = vornq_u32 (vcleq_f32 (x, v_f32 (-1)), + vcaleq_f32 (x, v_f32 (0x1p127f))); + + if (unlikely (v_any_u32 (special_cases))) + return special_case (x, special_cases); + + return log1pf_inline (x, ptr_barrier (&data)); +} + +#endif + +HALF_WIDTH_ALIAS_F1 (log1p) + +TEST_SIG (V, F, 1, log1p, -0.9, 10.0) +TEST_ULP (V_NAME_F1 (log1p), 1.20) +TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (log1p), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_F1 (log1p), 0.0, 0x1p-23, 30000) +TEST_SYM_INTERVAL (V_NAME_F1 (log1p), 0x1p-23, 1, 50000) +TEST_INTERVAL (V_NAME_F1 (log1p), 1, inf, 50000) +TEST_INTERVAL (V_NAME_F1 (log1p), -1.0, -inf, 1000) diff --git a/math/aarch64/advsimd/log2.c b/math/aarch64/advsimd/log2.c new file mode 100644 index 00000000000000..7d2e44dad2c9ef --- /dev/null +++ b/math/aarch64/advsimd/log2.c @@ -0,0 +1,123 @@ +/* + * Double-precision vector log2 function. + * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" + +static const struct data +{ + uint64x2_t off, sign_exp_mask, offset_lower_bound; + uint32x4_t special_bound; + float64x2_t c0, c2; + double c1, c3, invln2, c4; +} data = { + /* Each coefficient was generated to approximate log(r) for |r| < 0x1.fp-9 + and N = 128, then scaled by log2(e) in extended precision and rounded back + to double precision. */ + .c0 = V2 (-0x1.71547652b8300p-1), + .c1 = 0x1.ec709dc340953p-2, + .c2 = V2 (-0x1.71547651c8f35p-2), + .c3 = 0x1.2777ebe12dda5p-2, + .c4 = -0x1.ec738d616fe26p-3, + .invln2 = 0x1.71547652b82fep0, + .off = V2 (0x3fe6900900000000), + .sign_exp_mask = V2 (0xfff0000000000000), + /* Lower bound is 0x0010000000000000. For + optimised register use subnormals are detected after offset has been + subtracted, so lower bound - offset (which wraps around). 
*/ + .offset_lower_bound = V2 (0x0010000000000000 - 0x3fe6900900000000), + .special_bound = V4 (0x7fe00000), /* asuint64(inf) - asuint64(0x1p-1022). */ +}; + +#define N (1 << V_LOG2_TABLE_BITS) +#define IndexMask (N - 1) + +struct entry +{ + float64x2_t invc; + float64x2_t log2c; +}; + +static inline struct entry +lookup (uint64x2_t i) +{ + struct entry e; + uint64_t i0 + = (vgetq_lane_u64 (i, 0) >> (52 - V_LOG2_TABLE_BITS)) & IndexMask; + uint64_t i1 + = (vgetq_lane_u64 (i, 1) >> (52 - V_LOG2_TABLE_BITS)) & IndexMask; + float64x2_t e0 = vld1q_f64 (&__v_log2_data.table[i0].invc); + float64x2_t e1 = vld1q_f64 (&__v_log2_data.table[i1].invc); + e.invc = vuzp1q_f64 (e0, e1); + e.log2c = vuzp2q_f64 (e0, e1); + return e; +} + +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t hi, uint64x2_t u_off, float64x2_t y, float64x2_t r2, + uint32x2_t special, const struct data *d) +{ + float64x2_t x = vreinterpretq_f64_u64 (vaddq_u64 (u_off, d->off)); + return v_call_f64 (log2, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (special)); +} + +/* Double-precision vector log2 routine. Implements the same algorithm as + vector log10, with coefficients and table entries scaled in extended + precision. The maximum observed error is 2.58 ULP: + _ZGVnN2v_log2(0x1.0b556b093869bp+0) got 0x1.fffb34198d9dap-5 + want 0x1.fffb34198d9ddp-5. */ +float64x2_t VPCS_ATTR V_NAME_D1 (log2) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + + /* To avoid having to mov x out of the way, keep u after offset has been + applied, and recover x by adding the offset back in the special-case + handler. */ + uint64x2_t u = vreinterpretq_u64_f64 (x); + uint64x2_t u_off = vsubq_u64 (u, d->off); + + /* x = 2^k z; where z is in range [Off,2*Off) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. 
*/ + int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52); + uint64x2_t iz = vsubq_u64 (u, vandq_u64 (u_off, d->sign_exp_mask)); + float64x2_t z = vreinterpretq_f64_u64 (iz); + + struct entry e = lookup (u_off); + + uint32x2_t special = vcge_u32 (vsubhn_u64 (u_off, d->offset_lower_bound), + vget_low_u32 (d->special_bound)); + + /* log2(x) = log1p(z/c-1)/log(2) + log2(c) + k. */ + float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc); + float64x2_t kd = vcvtq_f64_s64 (k); + + float64x2_t invln2_and_c4 = vld1q_f64 (&d->invln2); + float64x2_t hi + = vfmaq_laneq_f64 (vaddq_f64 (e.log2c, kd), r, invln2_and_c4, 0); + + float64x2_t r2 = vmulq_f64 (r, r); + float64x2_t odd_coeffs = vld1q_f64 (&d->c1); + float64x2_t y = vfmaq_laneq_f64 (d->c2, r, odd_coeffs, 1); + float64x2_t p = vfmaq_laneq_f64 (d->c0, r, odd_coeffs, 0); + y = vfmaq_laneq_f64 (y, r2, invln2_and_c4, 1); + y = vfmaq_f64 (p, r2, y); + + if (unlikely (v_any_u32h (special))) + return special_case (hi, u_off, y, r2, special, d); + return vfmaq_f64 (hi, y, r2); +} + +TEST_SIG (V, D, 1, log2, 0.01, 11.1) +TEST_ULP (V_NAME_D1 (log2), 2.09) +TEST_INTERVAL (V_NAME_D1 (log2), -0.0, -0x1p126, 100) +TEST_INTERVAL (V_NAME_D1 (log2), 0x1p-149, 0x1p-126, 4000) +TEST_INTERVAL (V_NAME_D1 (log2), 0x1p-126, 0x1p-23, 50000) +TEST_INTERVAL (V_NAME_D1 (log2), 0x1p-23, 1.0, 50000) +TEST_INTERVAL (V_NAME_D1 (log2), 1.0, 100, 50000) +TEST_INTERVAL (V_NAME_D1 (log2), 100, inf, 50000) diff --git a/math/aarch64/advsimd/log2f.c b/math/aarch64/advsimd/log2f.c new file mode 100644 index 00000000000000..3053c64bc552c4 --- /dev/null +++ b/math/aarch64/advsimd/log2f.c @@ -0,0 +1,102 @@ +/* + * Single-precision vector log2 function. + * + * Copyright (c) 2022-2024, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" + +static const struct data +{ + float32x4_t c0, c2, c4, c6, c8; + uint32x4_t off, offset_lower_bound; + uint16x8_t special_bound; + uint32x4_t mantissa_mask; + float c1, c3, c5, c7; +} data = { + /* Coefficients generated using Remez algorithm approximate + log2(1+r)/r for r in [ -1/3, 1/3 ]. + rel error: 0x1.c4c4b0cp-26. */ + .c0 = V4 (0x1.715476p0f), /* (float)(1 / ln(2)). */ + .c1 = -0x1.715458p-1f, + .c2 = V4 (0x1.ec701cp-2f), + .c3 = -0x1.7171a4p-2f, + .c4 = V4 (0x1.27a0b8p-2f), + .c5 = -0x1.e5143ep-3f, + .c6 = V4 (0x1.9d8ecap-3f), + .c7 = -0x1.c675bp-3f, + .c8 = V4 (0x1.9e495p-3f), + /* Lower bound is the smallest positive normal float 0x00800000. For + optimised register use subnormals are detected after offset has been + subtracted, so lower bound is 0x00800000 - offset (which wraps around). */ + .offset_lower_bound = V4 (0x00800000 - 0x3f2aaaab), + .special_bound = V8 (0x7f00), /* top16(asuint32(inf) - 0x00800000). */ + .off = V4 (0x3f2aaaab), /* 0.666667. */ + .mantissa_mask = V4 (0x007fffff), +}; + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t n, uint32x4_t u_off, float32x4_t p, float32x4_t r, + uint16x4_t cmp, const struct data *d) +{ + /* Fall back to scalar code. */ + return v_call_f32 (log2f, vreinterpretq_f32_u32 (vaddq_u32 (u_off, d->off)), + vfmaq_f32 (n, p, r), vmovl_u16 (cmp)); +} + +/* Fast implementation for single precision AdvSIMD log2, + relies on same argument reduction as AdvSIMD logf. + Maximum error: 2.48 ULPs + _ZGVnN4v_log2f(0x1.558174p+0) got 0x1.a9be84p-2 + want 0x1.a9be8p-2. */ +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log2) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + + /* To avoid having to mov x out of the way, keep u after offset has been + applied, and recover x by adding the offset back in the special-case + handler.
*/ + uint32x4_t u_off = vreinterpretq_u32_f32 (x); + + /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ + u_off = vsubq_u32 (u_off, d->off); + float32x4_t n = vcvtq_f32_s32 ( + vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend. */ + + uint16x4_t special = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound), + vget_low_u16 (d->special_bound)); + + uint32x4_t u = vaddq_u32 (vandq_u32 (u_off, d->mantissa_mask), d->off); + float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f)); + + /* y = log2(1+r) + n. */ + float32x4_t r2 = vmulq_f32 (r, r); + + float32x4_t c1357 = vld1q_f32 (&d->c1); + float32x4_t c01 = vfmaq_laneq_f32 (d->c0, r, c1357, 0); + float32x4_t c23 = vfmaq_laneq_f32 (d->c2, r, c1357, 1); + float32x4_t c45 = vfmaq_laneq_f32 (d->c4, r, c1357, 2); + float32x4_t c67 = vfmaq_laneq_f32 (d->c6, r, c1357, 3); + float32x4_t p68 = vfmaq_f32 (c67, r2, d->c8); + float32x4_t p48 = vfmaq_f32 (c45, r2, p68); + float32x4_t p28 = vfmaq_f32 (c23, r2, p48); + float32x4_t p = vfmaq_f32 (c01, r2, p28); + + if (unlikely (v_any_u16h (special))) + return special_case (n, u_off, p, r, special, d); + return vfmaq_f32 (n, p, r); +} + +HALF_WIDTH_ALIAS_F1 (log2) + +TEST_SIG (V, F, 1, log2, 0.01, 11.1) +TEST_ULP (V_NAME_F1 (log2), 1.99) +TEST_INTERVAL (V_NAME_F1 (log2), -0.0, -0x1p126, 100) +TEST_INTERVAL (V_NAME_F1 (log2), 0x1p-149, 0x1p-126, 4000) +TEST_INTERVAL (V_NAME_F1 (log2), 0x1p-126, 0x1p-23, 50000) +TEST_INTERVAL (V_NAME_F1 (log2), 0x1p-23, 1.0, 50000) +TEST_INTERVAL (V_NAME_F1 (log2), 1.0, 100, 50000) +TEST_INTERVAL (V_NAME_F1 (log2), 100, inf, 50000) diff --git a/math/aarch64/advsimd/logf.c b/math/aarch64/advsimd/logf.c new file mode 100644 index 00000000000000..84705fad05eee7 --- /dev/null +++ b/math/aarch64/advsimd/logf.c @@ -0,0 +1,88 @@ +/* + * Single-precision vector log function. + * + * Copyright (c) 2019-2024, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "v_math.h" +#include "test_defs.h" +#include "test_sig.h" + +static const struct data +{ + float32x4_t c2, c4, c6, ln2; + uint32x4_t off, offset_lower_bound, mantissa_mask; + uint16x8_t special_bound; + float c1, c3, c5, c0; +} data = { + /* 3.34 ulp error. */ + .c0 = -0x1.3e737cp-3f, + .c1 = 0x1.5a9aa2p-3f, + .c2 = V4 (-0x1.4f9934p-3f), + .c3 = 0x1.961348p-3f, + .c4 = V4 (-0x1.00187cp-2f), + .c5 = 0x1.555d7cp-2f, + .c6 = V4 (-0x1.ffffc8p-2f), + .ln2 = V4 (0x1.62e43p-1f), + /* Lower bound is the smallest positive normal float 0x00800000. For + optimised register use subnormals are detected after offset has been + subtracted, so lower bound is 0x00800000 - offset (which wraps around). */ + .offset_lower_bound = V4 (0x00800000 - 0x3f2aaaab), + .special_bound = V8 (0x7f00), /* top16(asuint32(inf) - 0x00800000). */ + .off = V4 (0x3f2aaaab), /* 0.666667. */ + .mantissa_mask = V4 (0x007fffff) +}; + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t p, uint32x4_t u_off, float32x4_t y, float32x4_t r2, + uint16x4_t cmp, const struct data *d) +{ + /* Fall back to scalar code. */ + return v_call_f32 (logf, vreinterpretq_f32_u32 (vaddq_u32 (u_off, d->off)), + vfmaq_f32 (p, y, r2), vmovl_u16 (cmp)); +} + +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + float32x4_t c1350 = vld1q_f32 (&d->c1); + + /* To avoid having to mov x out of the way, keep u after offset has been + applied, and recover x by adding the offset back in the special-case + handler. */ + uint32x4_t u_off = vsubq_u32 (vreinterpretq_u32_f32 (x), d->off); + + /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ + float32x4_t n = vcvtq_f32_s32 ( + vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend.
*/ + uint16x4_t cmp = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound), + vget_low_u16 (d->special_bound)); + + uint32x4_t u = vaddq_u32 (vandq_u32 (u_off, d->mantissa_mask), d->off); + float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f)); + + /* y = log(1+r) + n*ln2. */ + float32x4_t r2 = vmulq_f32 (r, r); + /* n*ln2 + r + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + r2*P7))). */ + float32x4_t p = vfmaq_laneq_f32 (d->c2, r, c1350, 0); + float32x4_t q = vfmaq_laneq_f32 (d->c4, r, c1350, 1); + float32x4_t y = vfmaq_laneq_f32 (d->c6, r, c1350, 2); + p = vfmaq_laneq_f32 (p, r2, c1350, 3); + + q = vfmaq_f32 (q, p, r2); + y = vfmaq_f32 (y, q, r2); + p = vfmaq_f32 (r, d->ln2, n); + + if (unlikely (v_any_u16h (cmp))) + return special_case (p, u_off, y, r2, cmp, d); + return vfmaq_f32 (p, y, r2); +} + +HALF_WIDTH_ALIAS_F1 (log) + +TEST_SIG (V, F, 1, log, 0.01, 11.1) +TEST_ULP (V_NAME_F1 (log), 2.9) +TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (log), WANT_SIMD_EXCEPT) +TEST_INTERVAL (V_NAME_F1 (log), 0, 0xffff0000, 10000) +TEST_INTERVAL (V_NAME_F1 (log), 0x1p-4, 0x1p4, 500000) +TEST_INTERVAL (V_NAME_F1 (log), 0, inf, 50000) diff --git a/math/aarch64/advsimd/modf.c b/math/aarch64/advsimd/modf.c new file mode 100644 index 00000000000000..da2fcbff851497 --- /dev/null +++ b/math/aarch64/advsimd/modf.c @@ -0,0 +1,33 @@ +/* + * Double-precision vector modf(x, *y) function. + * + * Copyright (c) 2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" + +/* Modf algorithm. Produces exact values in all rounding modes. */ +float64x2_t VPCS_ATTR V_NAME_D1_L1 (modf) (float64x2_t x, double *out_int) +{ + /* Get integer component of x. */ + float64x2_t rounded = vrndq_f64 (x); + vst1q_f64 (out_int, rounded); + + /* Subtract integer component from input. */ + uint64x2_t remaining = vreinterpretq_u64_f64 (vsubq_f64 (x, rounded)); + + /* Return +0 for integer x. 
*/ + uint64x2_t is_integer = vceqq_f64 (x, rounded); + return vreinterpretq_f64_u64 (vbicq_u64 (remaining, is_integer)); +} + +TEST_ULP (_ZGVnN2vl8_modf_frac, 0.0) +TEST_SYM_INTERVAL (_ZGVnN2vl8_modf_frac, 0, 1, 20000) +TEST_SYM_INTERVAL (_ZGVnN2vl8_modf_frac, 1, inf, 20000) + +TEST_ULP (_ZGVnN2vl8_modf_int, 0.0) +TEST_SYM_INTERVAL (_ZGVnN2vl8_modf_int, 0, 1, 20000) +TEST_SYM_INTERVAL (_ZGVnN2vl8_modf_int, 1, inf, 20000) diff --git a/math/aarch64/advsimd/modff.c b/math/aarch64/advsimd/modff.c new file mode 100644 index 00000000000000..0a646b24cb1ae1 --- /dev/null +++ b/math/aarch64/advsimd/modff.c @@ -0,0 +1,34 @@ +/* + * Single-precision vector modf(x, *y) function. + * + * Copyright (c) 2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" + +/* Modff algorithm. Produces exact values in all rounding modes. */ +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1_L1 (modf) (float32x4_t x, + float *out_int) +{ + /* Get integer component of x. */ + float32x4_t rounded = vrndq_f32 (x); + vst1q_f32 (out_int, rounded); + + /* Subtract integer component from input. */ + uint32x4_t remaining = vreinterpretq_u32_f32 (vsubq_f32 (x, rounded)); + + /* Return +0 for integer x. 
*/ + uint32x4_t is_integer = vceqq_f32 (x, rounded); + return vreinterpretq_f32_u32 (vbicq_u32 (remaining, is_integer)); +} + +TEST_ULP (_ZGVnN4vl4_modff_frac, 0.0) +TEST_SYM_INTERVAL (_ZGVnN4vl4_modff_frac, 0, 1, 20000) +TEST_SYM_INTERVAL (_ZGVnN4vl4_modff_frac, 1, inf, 20000) + +TEST_ULP (_ZGVnN4vl4_modff_int, 0.0) +TEST_SYM_INTERVAL (_ZGVnN4vl4_modff_int, 0, 1, 20000) +TEST_SYM_INTERVAL (_ZGVnN4vl4_modff_int, 1, inf, 20000) diff --git a/pl/math/v_pow_1u5.c b/math/aarch64/advsimd/pow.c similarity index 60% rename from pl/math/v_pow_1u5.c rename to math/aarch64/advsimd/pow.c index 9053347d4e3524..db9d6e9ba14bb9 100644 --- a/pl/math/v_pow_1u5.c +++ b/math/aarch64/advsimd/pow.c @@ -1,20 +1,17 @@ /* * Double-precision vector pow function. * - * Copyright (c) 2020-2023, Arm Limited. + * Copyright (c) 2020-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_math.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" /* Defines parameters of the approximation and scalar fallback. */ #include "finite_pow.h" -#define VecSmallExp v_u64 (SmallExp) -#define VecThresExp v_u64 (ThresExp) - #define VecSmallPowX v_u64 (SmallPowX) #define VecThresPowX v_u64 (ThresPowX) #define VecSmallPowY v_u64 (SmallPowY) @@ -22,34 +19,49 @@ static const struct data { - float64x2_t log_poly[7]; - float64x2_t exp_poly[3]; - float64x2_t ln2_hi, ln2_lo; - float64x2_t shift, inv_ln2_n, ln2_hi_n, ln2_lo_n; + uint64x2_t inf; + float64x2_t small_powx; + uint64x2_t offset, mask; + uint64x2_t mask_sub_0, mask_sub_1; + float64x2_t log_c0, log_c2, log_c4, log_c5; + double log_c1, log_c3; + double ln2_lo, ln2_hi; + uint64x2_t small_exp, thres_exp; + double ln2_lo_n, ln2_hi_n; + double inv_ln2_n, exp_c2; + float64x2_t exp_c0, exp_c1; } data = { + /* Power threshold. 
*/ + .inf = V2 (0x7ff0000000000000), + .small_powx = V2 (0x1p-126), + .offset = V2 (Off), + .mask = V2 (0xfffULL << 52), + .mask_sub_0 = V2 (1ULL << 52), + .mask_sub_1 = V2 (52ULL << 52), /* Coefficients copied from v_pow_log_data.c relative error: 0x1.11922ap-70 in [-0x1.6bp-8, 0x1.6bp-8] Coefficients are scaled to match the scaling during evaluation. */ - .log_poly = { V2 (-0x1p-1), V2 (0x1.555555555556p-2 * -2), - V2 (-0x1.0000000000006p-2 * -2), V2 (0x1.999999959554ep-3 * 4), - V2 (-0x1.555555529a47ap-3 * 4), V2 (0x1.2495b9b4845e9p-3 * -8), - V2 (-0x1.0002b8b263fc3p-3 * -8) }, - .ln2_hi = V2 (0x1.62e42fefa3800p-1), - .ln2_lo = V2 (0x1.ef35793c76730p-45), + .log_c0 = V2 (0x1.555555555556p-2 * -2), + .log_c1 = -0x1.0000000000006p-2 * -2, + .log_c2 = V2 (0x1.999999959554ep-3 * 4), + .log_c3 = -0x1.555555529a47ap-3 * 4, + .log_c4 = V2 (0x1.2495b9b4845e9p-3 * -8), + .log_c5 = V2 (-0x1.0002b8b263fc3p-3 * -8), + .ln2_hi = 0x1.62e42fefa3800p-1, + .ln2_lo = 0x1.ef35793c76730p-45, /* Polynomial coefficients: abs error: 1.43*2^-58, ulp error: 0.549 (0.550 without fma) if |x| < ln2/512. */ - .exp_poly = { V2 (0x1.fffffffffffd4p-2), V2 (0x1.5555571d6ef9p-3), - V2 (0x1.5555576a5adcep-5) }, - .shift = V2 (0x1.8p52), /* round to nearest int. without intrinsics. */ - .inv_ln2_n = V2 (0x1.71547652b82fep8), /* N/ln2. */ - .ln2_hi_n = V2 (0x1.62e42fefc0000p-9), /* ln2/N. */ - .ln2_lo_n = V2 (-0x1.c610ca86c3899p-45), + .exp_c0 = V2 (0x1.fffffffffffd4p-2), + .exp_c1 = V2 (0x1.5555571d6ef9p-3), + .exp_c2 = 0x1.5555576a5adcep-5, + .small_exp = V2 (0x3c90000000000000), + .thres_exp = V2 (0x03f0000000000000), + .inv_ln2_n = 0x1.71547652b82fep8, /* N/ln2. */ + .ln2_hi_n = 0x1.62e42fefc0000p-9, /* ln2/N. 
*/ + .ln2_lo_n = -0x1.c610ca86c3899p-45, }; -#define A(i) data.log_poly[i] -#define C(i) data.exp_poly[i] - -/* This version implements an algorithm close to AOR scalar pow but +/* This version implements an algorithm close to scalar pow but - does not implement the trick in the exp's specialcase subroutine to avoid double-rounding, - does not use a tail in the exponential core computation, @@ -78,10 +90,9 @@ v_log_inline (uint64x2_t ix, float64x2_t *tail, const struct data *d) /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. The range is split into N subintervals. The ith subinterval contains z and c is near its center. */ - uint64x2_t tmp = vsubq_u64 (ix, v_u64 (Off)); - int64x2_t k - = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); /* arithmetic shift. */ - uint64x2_t iz = vsubq_u64 (ix, vandq_u64 (tmp, v_u64 (0xfffULL << 52))); + uint64x2_t tmp = vsubq_u64 (ix, d->offset); + int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); + uint64x2_t iz = vsubq_u64 (ix, vandq_u64 (tmp, d->mask)); float64x2_t z = vreinterpretq_f64_u64 (iz); float64x2_t kd = vcvtq_f64_s64 (k); /* log(x) = k*Ln2 + log(c) + log1p(z/c-1). */ @@ -92,12 +103,13 @@ v_log_inline (uint64x2_t ix, float64x2_t *tail, const struct data *d) |z/c - 1| < 1/N, so r = z/c - 1 is exactly representible. */ float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, invc); /* k*Ln2 + log(c) + r. */ - float64x2_t t1 = vfmaq_f64 (logc, kd, d->ln2_hi); + float64x2_t ln2 = vld1q_f64 (&d->ln2_lo); + float64x2_t t1 = vfmaq_laneq_f64 (logc, kd, ln2, 1); float64x2_t t2 = vaddq_f64 (t1, r); - float64x2_t lo1 = vfmaq_f64 (logctail, kd, d->ln2_lo); + float64x2_t lo1 = vfmaq_laneq_f64 (logctail, kd, ln2, 0); float64x2_t lo2 = vaddq_f64 (vsubq_f64 (t1, t2), r); /* Evaluation is optimized assuming superscalar pipelined execution. 
*/ - float64x2_t ar = vmulq_f64 (A (0), r); + float64x2_t ar = vmulq_f64 (v_f64 (-0.5), r); float64x2_t ar2 = vmulq_f64 (r, ar); float64x2_t ar3 = vmulq_f64 (r, ar2); /* k*Ln2 + log(c) + r + A[0]*r*r. */ @@ -105,9 +117,10 @@ v_log_inline (uint64x2_t ix, float64x2_t *tail, const struct data *d) float64x2_t lo3 = vfmaq_f64 (vnegq_f64 (ar2), ar, r); float64x2_t lo4 = vaddq_f64 (vsubq_f64 (t2, hi), ar2); /* p = log1p(r) - r - A[0]*r*r. */ - float64x2_t a56 = vfmaq_f64 (A (5), r, A (6)); - float64x2_t a34 = vfmaq_f64 (A (3), r, A (4)); - float64x2_t a12 = vfmaq_f64 (A (1), r, A (2)); + float64x2_t odd_coeffs = vld1q_f64 (&d->log_c1); + float64x2_t a56 = vfmaq_f64 (d->log_c4, r, d->log_c5); + float64x2_t a34 = vfmaq_laneq_f64 (d->log_c2, r, odd_coeffs, 1); + float64x2_t a12 = vfmaq_laneq_f64 (d->log_c0, r, odd_coeffs, 0); float64x2_t p = vfmaq_f64 (a34, ar2, a56); p = vfmaq_f64 (a12, ar2, p); p = vmulq_f64 (ar3, p); @@ -118,29 +131,37 @@ v_log_inline (uint64x2_t ix, float64x2_t *tail, const struct data *d) return y; } +static float64x2_t VPCS_ATTR NOINLINE +exp_special_case (float64x2_t x, float64x2_t xtail) +{ + return (float64x2_t){ exp_nosignbias (x[0], xtail[0]), + exp_nosignbias (x[1], xtail[1]) }; +} + /* Computes sign*exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|. */ static inline float64x2_t -v_exp_inline (float64x2_t x, float64x2_t xtail, const struct data *d) +v_exp_inline (float64x2_t x, float64x2_t neg_xtail, const struct data *d) { /* Fallback to scalar exp_inline for all lanes if any lane contains value of x s.t. |x| <= 2^-54 or >= 512. 
*/ - uint64x2_t abstop - = vandq_u64 (vshrq_n_u64 (vreinterpretq_u64_f64 (x), 52), v_u64 (0x7ff)); - uint64x2_t uoflowx - = vcgeq_u64 (vsubq_u64 (abstop, VecSmallExp), VecThresExp); + uint64x2_t uoflowx = vcgeq_u64 ( + vsubq_u64 (vreinterpretq_u64_f64 (vabsq_f64 (x)), d->small_exp), + d->thres_exp); if (unlikely (v_any_u64 (uoflowx))) - return v_call2_f64 (exp_nosignbias, x, xtail, x, v_u64 (-1)); + return exp_special_case (x, vnegq_f64 (neg_xtail)); + /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */ /* x = ln2/N*k + r, with k integer and r in [-ln2/2N, ln2/2N]. */ - float64x2_t z = vmulq_f64 (d->inv_ln2_n, x); /* z - kd is in [-1, 1] in non-nearest rounding modes. */ - float64x2_t kd = vaddq_f64 (z, d->shift); - uint64x2_t ki = vreinterpretq_u64_f64 (kd); - kd = vsubq_f64 (kd, d->shift); - float64x2_t r = vfmsq_f64 (x, kd, d->ln2_hi_n); - r = vfmsq_f64 (r, kd, d->ln2_lo_n); + float64x2_t exp_consts = vld1q_f64 (&d->inv_ln2_n); + float64x2_t z = vmulq_laneq_f64 (x, exp_consts, 0); + float64x2_t kd = vrndnq_f64 (z); + uint64x2_t ki = vreinterpretq_u64_s64 (vcvtaq_s64_f64 (z)); + float64x2_t ln2_n = vld1q_f64 (&d->ln2_lo_n); + float64x2_t r = vfmsq_laneq_f64 (x, kd, ln2_n, 1); + r = vfmsq_laneq_f64 (r, kd, ln2_n, 0); /* The code assumes 2^-200 < |xtail| < 2^-8/N. */ - r = vaddq_f64 (r, xtail); + r = vsubq_f64 (r, neg_xtail); /* 2^(k/N) ~= scale. */ uint64x2_t idx = vandq_u64 (ki, v_u64 (N_EXP - 1)); uint64x2_t top = vshlq_n_u64 (ki, 52 - V_POW_EXP_TABLE_BITS); @@ -149,8 +170,8 @@ v_exp_inline (float64x2_t x, float64x2_t xtail, const struct data *d) sbits = vaddq_u64 (sbits, top); /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (exp(r) - 1). 
*/ float64x2_t r2 = vmulq_f64 (r, r); - float64x2_t tmp = vfmaq_f64 (C (1), r, C (2)); - tmp = vfmaq_f64 (C (0), r, tmp); + float64x2_t tmp = vfmaq_laneq_f64 (d->exp_c1, r, exp_consts, 1); + tmp = vfmaq_f64 (d->exp_c0, r, tmp); tmp = vfmaq_f64 (r, r2, tmp); float64x2_t scale = vreinterpretq_f64_u64 (sbits); /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there @@ -158,54 +179,59 @@ v_exp_inline (float64x2_t x, float64x2_t xtail, const struct data *d) return vfmaq_f64 (scale, scale, tmp); } +static float64x2_t NOINLINE VPCS_ATTR +scalar_fallback (float64x2_t x, float64x2_t y) +{ + return (float64x2_t){ pow_scalar_special_case (x[0], y[0]), + pow_scalar_special_case (x[1], y[1]) }; +} + float64x2_t VPCS_ATTR V_NAME_D2 (pow) (float64x2_t x, float64x2_t y) { const struct data *d = ptr_barrier (&data); /* Case of x <= 0 is too complicated to be vectorised efficiently here, fallback to scalar pow for all lanes if any x < 0 detected. */ if (v_any_u64 (vclezq_s64 (vreinterpretq_s64_f64 (x)))) - return v_call2_f64 (__pl_finite_pow, x, y, x, v_u64 (-1)); + return scalar_fallback (x, y); uint64x2_t vix = vreinterpretq_u64_f64 (x); uint64x2_t viy = vreinterpretq_u64_f64 (y); - uint64x2_t vtopx = vshrq_n_u64 (vix, 52); - uint64x2_t vtopy = vshrq_n_u64 (viy, 52); - uint64x2_t vabstopx = vandq_u64 (vtopx, v_u64 (0x7ff)); - uint64x2_t vabstopy = vandq_u64 (vtopy, v_u64 (0x7ff)); + uint64x2_t iay = vandq_u64 (viy, d->inf); /* Special cases of x or y. */ #if WANT_SIMD_EXCEPT /* Small or large. */ + uint64x2_t vtopx = vshrq_n_u64 (vix, 52); + uint64x2_t vabstopy = vshrq_n_u64 (iay, 52); uint64x2_t specialx = vcgeq_u64 (vsubq_u64 (vtopx, VecSmallPowX), VecThresPowX); uint64x2_t specialy = vcgeq_u64 (vsubq_u64 (vabstopy, VecSmallPowY), VecThresPowY); #else - /* Inf or nan. 
*/ - uint64x2_t specialx = vcgeq_u64 (vabstopx, v_u64 (0x7ff)); - uint64x2_t specialy = vcgeq_u64 (vabstopy, v_u64 (0x7ff)); /* The case y==0 does not trigger a special case, since in this case it is necessary to fix the result only if x is a signalling nan, which already triggers a special case. We test y==0 directly in the scalar fallback. */ + uint64x2_t iax = vandq_u64 (vix, d->inf); + uint64x2_t specialx = vcgeq_u64 (iax, d->inf); + uint64x2_t specialy = vcgeq_u64 (iay, d->inf); #endif uint64x2_t special = vorrq_u64 (specialx, specialy); /* Fallback to scalar on all lanes if any lane is inf or nan. */ if (unlikely (v_any_u64 (special))) - return v_call2_f64 (__pl_finite_pow, x, y, x, v_u64 (-1)); + return scalar_fallback (x, y); /* Small cases of x: |x| < 0x1p-126. */ - uint64x2_t smallx = vcltq_u64 (vabstopx, VecSmallPowX); + uint64x2_t smallx = vcaltq_f64 (x, d->small_powx); if (unlikely (v_any_u64 (smallx))) { /* Update ix if top 12 bits of x are 0. */ - uint64x2_t sub_x = vceqzq_u64 (vtopx); + uint64x2_t sub_x = vceqzq_u64 (vshrq_n_u64 (vix, 52)); if (unlikely (v_any_u64 (sub_x))) { /* Normalize subnormal x so exponent becomes negative. */ - uint64x2_t vix_norm - = vreinterpretq_u64_f64 (vmulq_f64 (x, v_f64 (0x1p52))); - vix_norm = vandq_u64 (vix_norm, v_u64 (0x7fffffffffffffff)); - vix_norm = vsubq_u64 (vix_norm, v_u64 (52ULL << 52)); + uint64x2_t vix_norm = vreinterpretq_u64_f64 ( + vabsq_f64 (vmulq_f64 (x, vcvtq_f64_u64 (d->mask_sub_0)))); + vix_norm = vsubq_u64 (vix_norm, d->mask_sub_1); vix = vbslq_u64 (sub_x, vix_norm, vix); } } @@ -216,21 +242,20 @@ float64x2_t VPCS_ATTR V_NAME_D2 (pow) (float64x2_t x, float64x2_t y) /* Vector Exp(y_loghi, y_loglo). 
*/ float64x2_t vehi = vmulq_f64 (y, vhi); - float64x2_t velo = vmulq_f64 (y, vlo); float64x2_t vemi = vfmsq_f64 (vehi, y, vhi); - velo = vsubq_f64 (velo, vemi); - return v_exp_inline (vehi, velo, d); + float64x2_t neg_velo = vfmsq_f64 (vemi, y, vlo); + return v_exp_inline (vehi, neg_velo, d); } -PL_SIG (V, D, 2, pow) -PL_TEST_ULP (V_NAME_D2 (pow), 0.55) -PL_TEST_EXPECT_FENV (V_NAME_D2 (pow), WANT_SIMD_EXCEPT) +TEST_SIG (V, D, 2, pow) +TEST_ULP (V_NAME_D2 (pow), 0.55) +TEST_DISABLE_FENV_IF_NOT (V_NAME_D2 (pow), WANT_SIMD_EXCEPT) /* Wide intervals spanning the whole domain but shared between x and y. */ -#define V_POW_INTERVAL2(xlo, xhi, ylo, yhi, n) \ - PL_TEST_INTERVAL2 (V_NAME_D2 (pow), xlo, xhi, ylo, yhi, n) \ - PL_TEST_INTERVAL2 (V_NAME_D2 (pow), xlo, xhi, -ylo, -yhi, n) \ - PL_TEST_INTERVAL2 (V_NAME_D2 (pow), -xlo, -xhi, ylo, yhi, n) \ - PL_TEST_INTERVAL2 (V_NAME_D2 (pow), -xlo, -xhi, -ylo, -yhi, n) +#define V_POW_INTERVAL2(xlo, xhi, ylo, yhi, n) \ + TEST_INTERVAL2 (V_NAME_D2 (pow), xlo, xhi, ylo, yhi, n) \ + TEST_INTERVAL2 (V_NAME_D2 (pow), xlo, xhi, -ylo, -yhi, n) \ + TEST_INTERVAL2 (V_NAME_D2 (pow), -xlo, -xhi, ylo, yhi, n) \ + TEST_INTERVAL2 (V_NAME_D2 (pow), -xlo, -xhi, -ylo, -yhi, n) #define EXPAND(str) str##000000000 #define SHL52(str) EXPAND (str) V_POW_INTERVAL2 (0, SHL52 (SmallPowX), 0, inf, 40000) @@ -248,12 +273,12 @@ V_POW_INTERVAL2 (0x1.ep-1, 0x1.1p0, 0x1p8, 0x1p16, 10000) V_POW_INTERVAL2 (0x1p-300, 0x1p-200, 0x1p-20, 0x1p-10, 10000) V_POW_INTERVAL2 (0x1p50, 0x1p100, 0x1p-20, 0x1p-10, 10000) /* x is negative, y is odd or even integer, or y is real not integer. 
*/ -PL_TEST_INTERVAL2 (V_NAME_D2 (pow), -0.0, -10.0, 3.0, 3.0, 10000) -PL_TEST_INTERVAL2 (V_NAME_D2 (pow), -0.0, -10.0, 4.0, 4.0, 10000) -PL_TEST_INTERVAL2 (V_NAME_D2 (pow), -0.0, -10.0, 0.0, 10.0, 10000) -PL_TEST_INTERVAL2 (V_NAME_D2 (pow), 0.0, 10.0, -0.0, -10.0, 10000) +TEST_INTERVAL2 (V_NAME_D2 (pow), -0.0, -10.0, 3.0, 3.0, 10000) +TEST_INTERVAL2 (V_NAME_D2 (pow), -0.0, -10.0, 4.0, 4.0, 10000) +TEST_INTERVAL2 (V_NAME_D2 (pow), -0.0, -10.0, 0.0, 10.0, 10000) +TEST_INTERVAL2 (V_NAME_D2 (pow), 0.0, 10.0, -0.0, -10.0, 10000) /* 1.0^y. */ -PL_TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, 0.0, 0x1p-50, 1000) -PL_TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, 0x1p-50, 1.0, 1000) -PL_TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, 1.0, 0x1p100, 1000) -PL_TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, -1.0, -0x1p120, 1000) +TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, 0.0, 0x1p-50, 1000) +TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, 0x1p-50, 1.0, 1000) +TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, 1.0, 0x1p100, 1000) +TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, -1.0, -0x1p120, 1000) diff --git a/math/aarch64/advsimd/powf.c b/math/aarch64/advsimd/powf.c new file mode 100644 index 00000000000000..47f74cf38ab09d --- /dev/null +++ b/math/aarch64/advsimd/powf.c @@ -0,0 +1,209 @@ +/* + * Single-precision vector powf function. + * + * Copyright (c) 2019-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_defs.h" +#include "test_sig.h" + +#define Min v_u32 (0x00800000) +#define Max v_u32 (0x7f800000) +#define Thresh v_u32 (0x7f000000) /* Max - Min. */ +#define MantissaMask v_u32 (0x007fffff) + +#define A d->log2_poly +#define C d->exp2f_poly + +/* 2.6 ulp ~ 0.5 + 2^24 (128*Ln2*relerr_log2 + relerr_exp2). 
*/ +#define Off v_u32 (0x3f35d000) + +#define V_POWF_LOG2_TABLE_BITS 5 +#define V_EXP2F_TABLE_BITS 5 +#define Log2IdxMask ((1 << V_POWF_LOG2_TABLE_BITS) - 1) +#define Scale ((double) (1 << V_EXP2F_TABLE_BITS)) + +static const struct data +{ + struct + { + double invc, logc; + } log2_tab[1 << V_POWF_LOG2_TABLE_BITS]; + float64x2_t log2_poly[4]; + uint64_t exp2f_tab[1 << V_EXP2F_TABLE_BITS]; + float64x2_t exp2f_poly[3]; +} data = { + .log2_tab = {{0x1.6489890582816p+0, -0x1.e960f97b22702p-2 * Scale}, + {0x1.5cf19b35e3472p+0, -0x1.c993406cd4db6p-2 * Scale}, + {0x1.55aac0e956d65p+0, -0x1.aa711d9a7d0f3p-2 * Scale}, + {0x1.4eb0022977e01p+0, -0x1.8bf37bacdce9bp-2 * Scale}, + {0x1.47fcccda1dd1fp+0, -0x1.6e13b3519946ep-2 * Scale}, + {0x1.418ceabab68c1p+0, -0x1.50cb8281e4089p-2 * Scale}, + {0x1.3b5c788f1edb3p+0, -0x1.341504a237e2bp-2 * Scale}, + {0x1.3567de48e9c9ap+0, -0x1.17eaab624ffbbp-2 * Scale}, + {0x1.2fabc80fd19bap+0, -0x1.f88e708f8c853p-3 * Scale}, + {0x1.2a25200ce536bp+0, -0x1.c24b6da113914p-3 * Scale}, + {0x1.24d108e0152e3p+0, -0x1.8d02ee397cb1dp-3 * Scale}, + {0x1.1facd8ab2fbe1p+0, -0x1.58ac1223408b3p-3 * Scale}, + {0x1.1ab614a03efdfp+0, -0x1.253e6fd190e89p-3 * Scale}, + {0x1.15ea6d03af9ffp+0, -0x1.e5641882c12ffp-4 * Scale}, + {0x1.1147b994bb776p+0, -0x1.81fea712926f7p-4 * Scale}, + {0x1.0ccbf650593aap+0, -0x1.203e240de64a3p-4 * Scale}, + {0x1.0875408477302p+0, -0x1.8029b86a78281p-5 * Scale}, + {0x1.0441d42a93328p+0, -0x1.85d713190fb9p-6 * Scale}, + {0x1p+0, 0x0p+0 * Scale}, + {0x1.f1d006c855e86p-1, 0x1.4c1cc07312997p-5 * Scale}, + {0x1.e28c3341aa301p-1, 0x1.5e1848ccec948p-4 * Scale}, + {0x1.d4bdf9aa64747p-1, 0x1.04cfcb7f1196fp-3 * Scale}, + {0x1.c7b45a24e5803p-1, 0x1.582813d463c21p-3 * Scale}, + {0x1.bb5f5eb2ed60ap-1, 0x1.a936fa68760ccp-3 * Scale}, + {0x1.afb0bff8fe6b4p-1, 0x1.f81bc31d6cc4ep-3 * Scale}, + {0x1.a49badf7ab1f5p-1, 0x1.2279a09fae6b1p-2 * Scale}, + {0x1.9a14a111fc4c9p-1, 0x1.47ec0b6df5526p-2 * Scale}, + {0x1.901131f5b2fdcp-1, 0x1.6c71762280f1p-2 * 
Scale}, + {0x1.8687f73f6d865p-1, 0x1.90155070798dap-2 * Scale}, + {0x1.7d7067eb77986p-1, 0x1.b2e23b1d3068cp-2 * Scale}, + {0x1.74c2c1cf97b65p-1, 0x1.d4e21b0daa86ap-2 * Scale}, + {0x1.6c77f37cff2a1p-1, 0x1.f61e2a2f67f3fp-2 * Scale},}, + .log2_poly = { /* rel err: 1.5 * 2^-30. */ + V2 (-0x1.6ff5daa3b3d7cp-2 * Scale), + V2 (0x1.ec81d03c01aebp-2 * Scale), + V2 (-0x1.71547bb43f101p-1 * Scale), + V2 (0x1.7154764a815cbp0 * Scale)}, + .exp2f_tab = {0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f, + 0x3fef9301d0125b51, 0x3fef72b83c7d517b, 0x3fef54873168b9aa, + 0x3fef387a6e756238, 0x3fef1e9df51fdee1, 0x3fef06fe0a31b715, + 0x3feef1a7373aa9cb, 0x3feedea64c123422, 0x3feece086061892d, + 0x3feebfdad5362a27, 0x3feeb42b569d4f82, 0x3feeab07dd485429, + 0x3feea47eb03a5585, 0x3feea09e667f3bcd, 0x3fee9f75e8ec5f74, + 0x3feea11473eb0187, 0x3feea589994cce13, 0x3feeace5422aa0db, + 0x3feeb737b0cdc5e5, 0x3feec49182a3f090, 0x3feed503b23e255d, + 0x3feee89f995ad3ad, 0x3feeff76f2fb5e47, 0x3fef199bdd85529c, + 0x3fef3720dcef9069, 0x3fef5818dcfba487, 0x3fef7c97337b9b5f, + 0x3fefa4afa2a490da, 0x3fefd0765b6e4540,}, + .exp2f_poly = { /* rel err: 1.69 * 2^-34. */ + V2 (0x1.c6af84b912394p-5 / Scale / Scale / Scale), + V2 (0x1.ebfce50fac4f3p-3 / Scale / Scale), + V2 (0x1.62e42ff0c52d6p-1 / Scale)}}; + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, float32x4_t ret, uint32x4_t cmp) +{ + return v_call2_f32 (powf, x, y, ret, cmp); +} + +static inline float64x2_t +ylogx_core (const struct data *d, float64x2_t iz, float64x2_t k, + float64x2_t invc, float64x2_t logc, float64x2_t y) +{ + + /* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k. */ + float64x2_t r = vfmaq_f64 (v_f64 (-1.0), iz, invc); + float64x2_t y0 = vaddq_f64 (logc, k); + + /* Polynomial to approximate log1p(r)/ln2. 
*/ + float64x2_t logx = vfmaq_f64 (A[1], r, A[0]); + logx = vfmaq_f64 (A[2], logx, r); + logx = vfmaq_f64 (A[3], logx, r); + logx = vfmaq_f64 (y0, logx, r); + + return vmulq_f64 (logx, y); +} + +static inline float64x2_t +log2_lookup (const struct data *d, uint32_t i) +{ + return vld1q_f64 ( + &d->log2_tab[(i >> (23 - V_POWF_LOG2_TABLE_BITS)) & Log2IdxMask].invc); +} + +static inline uint64x1_t +exp2f_lookup (const struct data *d, uint64_t i) +{ + return vld1_u64 (&d->exp2f_tab[i % (1 << V_EXP2F_TABLE_BITS)]); +} + +static inline float32x2_t +powf_core (const struct data *d, float64x2_t ylogx) +{ + /* N*x = k + r with r in [-1/2, 1/2]. */ + float64x2_t kd = vrndnq_f64 (ylogx); + int64x2_t ki = vcvtaq_s64_f64 (ylogx); + float64x2_t r = vsubq_f64 (ylogx, kd); + + /* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1). */ + uint64x2_t t = vcombine_u64 (exp2f_lookup (d, vgetq_lane_s64 (ki, 0)), + exp2f_lookup (d, vgetq_lane_s64 (ki, 1))); + t = vaddq_u64 ( + t, vreinterpretq_u64_s64 (vshlq_n_s64 (ki, 52 - V_EXP2F_TABLE_BITS))); + float64x2_t s = vreinterpretq_f64_u64 (t); + float64x2_t p = vfmaq_f64 (C[1], r, C[0]); + p = vfmaq_f64 (C[2], r, p); + p = vfmaq_f64 (s, p, vmulq_f64 (s, r)); + return vcvt_f32_f64 (p); +} + +float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (pow) (float32x4_t x, float32x4_t y) +{ + const struct data *d = ptr_barrier (&data); + uint32x4_t u = vreinterpretq_u32_f32 (x); + uint32x4_t cmp = vcgeq_u32 (vsubq_u32 (u, Min), Thresh); + uint32x4_t tmp = vsubq_u32 (u, Off); + uint32x4_t top = vbicq_u32 (tmp, MantissaMask); + float32x4_t iz = vreinterpretq_f32_u32 (vsubq_u32 (u, top)); + int32x4_t k = vshrq_n_s32 (vreinterpretq_s32_u32 (top), + 23 - V_EXP2F_TABLE_BITS); /* arithmetic shift. */ + + /* Use double precision for each lane: split input vectors into lo and hi + halves and promote. 
*/ + float64x2_t tab0 = log2_lookup (d, vgetq_lane_u32 (tmp, 0)), + tab1 = log2_lookup (d, vgetq_lane_u32 (tmp, 1)), + tab2 = log2_lookup (d, vgetq_lane_u32 (tmp, 2)), + tab3 = log2_lookup (d, vgetq_lane_u32 (tmp, 3)); + + float64x2_t iz_lo = vcvt_f64_f32 (vget_low_f32 (iz)), + iz_hi = vcvt_high_f64_f32 (iz); + + float64x2_t k_lo = vcvtq_f64_s64 (vmovl_s32 (vget_low_s32 (k))), + k_hi = vcvtq_f64_s64 (vmovl_high_s32 (k)); + + float64x2_t invc_lo = vzip1q_f64 (tab0, tab1), + invc_hi = vzip1q_f64 (tab2, tab3), + logc_lo = vzip2q_f64 (tab0, tab1), + logc_hi = vzip2q_f64 (tab2, tab3); + + float64x2_t y_lo = vcvt_f64_f32 (vget_low_f32 (y)), + y_hi = vcvt_high_f64_f32 (y); + + float64x2_t ylogx_lo = ylogx_core (d, iz_lo, k_lo, invc_lo, logc_lo, y_lo); + float64x2_t ylogx_hi = ylogx_core (d, iz_hi, k_hi, invc_hi, logc_hi, y_hi); + + uint32x4_t ylogx_top = vuzp2q_u32 (vreinterpretq_u32_f64 (ylogx_lo), + vreinterpretq_u32_f64 (ylogx_hi)); + + cmp = vorrq_u32 ( + cmp, vcgeq_u32 (vandq_u32 (vshrq_n_u32 (ylogx_top, 15), v_u32 (0xffff)), + vdupq_n_u32 (asuint64 (126.0 * (1 << V_EXP2F_TABLE_BITS)) + >> 47))); + + float32x2_t p_lo = powf_core (d, ylogx_lo); + float32x2_t p_hi = powf_core (d, ylogx_hi); + + if (unlikely (v_any_u32 (cmp))) + return special_case (x, y, vcombine_f32 (p_lo, p_hi), cmp); + return vcombine_f32 (p_lo, p_hi); +} + +HALF_WIDTH_ALIAS_F2 (pow) + +TEST_SIG (V, F, 2, pow) +TEST_ULP (V_NAME_F2 (pow), 2.1) +TEST_DISABLE_FENV (V_NAME_F2 (pow)) +TEST_INTERVAL2 (V_NAME_F2 (pow), 0x1p-1, 0x1p1, 0x1p-7, 0x1p7, 50000) +TEST_INTERVAL2 (V_NAME_F2 (pow), 0x1p-1, 0x1p1, -0x1p-7, -0x1p7, 50000) +TEST_INTERVAL2 (V_NAME_F2 (pow), 0x1p-70, 0x1p70, 0x1p-1, 0x1p1, 50000) +TEST_INTERVAL2 (V_NAME_F2 (pow), 0x1p-70, 0x1p70, -0x1p-1, -0x1p1, 50000) +TEST_INTERVAL2 (V_NAME_F2 (pow), 0x1.ep-1, 0x1.1p0, 0x1p8, 0x1p14, 50000) +TEST_INTERVAL2 (V_NAME_F2 (pow), 0x1.ep-1, 0x1.1p0, -0x1p8, -0x1p14, 50000) diff --git a/math/aarch64/v_sin.c b/math/aarch64/advsimd/sin.c similarity index 77% 
rename from math/aarch64/v_sin.c rename to math/aarch64/advsimd/sin.c index 04129c31133d62..0461bbb994059a 100644 --- a/math/aarch64/v_sin.c +++ b/math/aarch64/advsimd/sin.c @@ -1,17 +1,19 @@ /* * Double-precision vector sin function. * - * Copyright (c) 2019-2023, Arm Limited. + * Copyright (c) 2019-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ +#include "test_defs.h" +#include "test_sig.h" #include "mathlib.h" #include "v_math.h" static const struct data { float64x2_t poly[7]; - float64x2_t range_val, inv_pi, shift, pi_1, pi_2, pi_3; + float64x2_t range_val, inv_pi, pi_1, pi_2, pi_3; } data = { .poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7), V2 (-0x1.a01a019936f27p-13), V2 (0x1.71de37a97d93ep-19), @@ -23,12 +25,13 @@ static const struct data .pi_1 = V2 (0x1.921fb54442d18p+1), .pi_2 = V2 (0x1.1a62633145c06p-53), .pi_3 = V2 (0x1.c1cd129024e09p-106), - .shift = V2 (0x1.8p52), }; #if WANT_SIMD_EXCEPT -# define TinyBound v_u64 (0x3000000000000000) /* asuint64 (0x1p-255). */ -# define Thresh v_u64 (0x1160000000000000) /* RangeVal - TinyBound. */ +/* asuint64(0x1p-253), below which multiply by inv_pi underflows. */ +# define TinyBound v_u64 (0x3020000000000000) +/* RangeVal - TinyBound. */ +# define Thresh v_u64 (0x1160000000000000) #endif #define C(i) d->poly[i] @@ -61,16 +64,15 @@ float64x2_t VPCS_ATTR V_NAME_D1 (sin) (float64x2_t x) fenv). These lanes will be fixed by special-case handler later. */ uint64x2_t ir = vreinterpretq_u64_f64 (vabsq_f64 (x)); cmp = vcgeq_u64 (vsubq_u64 (ir, TinyBound), Thresh); - r = vbslq_f64 (cmp, vreinterpretq_f64_u64 (cmp), x); + r = vreinterpretq_f64_u64 (vbicq_u64 (vreinterpretq_u64_f64 (x), cmp)); #else r = x; cmp = vcageq_f64 (x, d->range_val); #endif /* n = rint(|x|/pi). 
*/ - n = vfmaq_f64 (d->shift, d->inv_pi, r); - odd = vshlq_n_u64 (vreinterpretq_u64_f64 (n), 63); - n = vsubq_f64 (n, d->shift); + n = vrndaq_f64 (vmulq_f64 (r, d->inv_pi)); + odd = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtq_s64_f64 (n)), 63); /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ r = vfmsq_f64 (r, d->pi_1, n); @@ -95,3 +97,9 @@ float64x2_t VPCS_ATTR V_NAME_D1 (sin) (float64x2_t x) return special_case (x, y, odd, cmp); return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd)); } + +TEST_SIG (V, D, 1, sin, -3.1, 3.1) +TEST_ULP (V_NAME_D1 (sin), 3.0) +TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (sin), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_D1 (sin), 0, 0x1p23, 500000) +TEST_SYM_INTERVAL (V_NAME_D1 (sin), 0x1p23, inf, 10000) diff --git a/pl/math/v_sincos_3u5.c b/math/aarch64/advsimd/sincos.c similarity index 70% rename from pl/math/v_sincos_3u5.c rename to math/aarch64/advsimd/sincos.c index 6fc014c120b866..83bfa45efa982c 100644 --- a/pl/math/v_sincos_3u5.c +++ b/math/aarch64/advsimd/sincos.c @@ -1,7 +1,7 @@ /* * Double-precision vector sincos function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ @@ -10,12 +10,21 @@ be linked against the scalar sincosf from math/. */ #define _GNU_SOURCE #include -#undef _GNU_SOURCE #include "v_math.h" -#include "pl_test.h" +#include "test_defs.h" #include "v_sincos_common.h" +/* sincos not available for all scalar libm implementations. 
*/ +#if defined(_MSC_VER) || !defined(__GLIBC__) +static void +sincos (double x, double *out_sin, double *out_cos) +{ + *out_sin = sin (x); + *out_cos = cos (x); +} +#endif + static void VPCS_ATTR NOINLINE special_case (float64x2_t x, uint64x2_t special, double *out_sin, double *out_cos) @@ -46,12 +55,13 @@ _ZGVnN2vl8l8_sincos (float64x2_t x, double *out_sin, double *out_cos) special_case (x, special, out_sin, out_cos); } -PL_TEST_ULP (_ZGVnN2v_sincos_sin, 2.73) -PL_TEST_ULP (_ZGVnN2v_sincos_cos, 2.73) +TEST_DISABLE_FENV (_ZGVnN2v_sincos_cos) +TEST_DISABLE_FENV (_ZGVnN2v_sincos_sin) +TEST_ULP (_ZGVnN2v_sincos_sin, 2.73) +TEST_ULP (_ZGVnN2v_sincos_cos, 2.73) #define V_SINCOS_INTERVAL(lo, hi, n) \ - PL_TEST_INTERVAL (_ZGVnN2v_sincos_sin, lo, hi, n) \ - PL_TEST_INTERVAL (_ZGVnN2v_sincos_cos, lo, hi, n) -V_SINCOS_INTERVAL (0, 0x1p23, 500000) -V_SINCOS_INTERVAL (-0, -0x1p23, 500000) + TEST_INTERVAL (_ZGVnN2v_sincos_sin, lo, hi, n) \ + TEST_INTERVAL (_ZGVnN2v_sincos_cos, lo, hi, n) +V_SINCOS_INTERVAL (0, 0x1p-31, 50000) +V_SINCOS_INTERVAL (0x1p-31, 0x1p23, 500000) V_SINCOS_INTERVAL (0x1p23, inf, 10000) -V_SINCOS_INTERVAL (-0x1p23, -inf, 10000) diff --git a/pl/math/v_sincosf_1u8.c b/math/aarch64/advsimd/sincosf.c similarity index 70% rename from pl/math/v_sincosf_1u8.c rename to math/aarch64/advsimd/sincosf.c index bf77afaa14db02..cd482f38d5f645 100644 --- a/pl/math/v_sincosf_1u8.c +++ b/math/aarch64/advsimd/sincosf.c @@ -1,7 +1,7 @@ /* * Single-precision vector sincos function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ @@ -10,11 +10,20 @@ be linked against the scalar sincosf from math/. */ #define _GNU_SOURCE #include -#undef _GNU_SOURCE #include "v_sincosf_common.h" #include "v_math.h" -#include "pl_test.h" +#include "test_defs.h" + +/* sincos not available for all scalar libm implementations. 
*/ +#if defined(_MSC_VER) || !defined(__GLIBC__) +static void +sincosf (float x, float *out_sin, float *out_cos) +{ + *out_sin = sinf (x); + *out_cos = cosf (x); +} +#endif static void VPCS_ATTR NOINLINE special_case (float32x4_t x, uint32x4_t special, float *out_sin, @@ -47,12 +56,13 @@ _ZGVnN4vl4l4_sincosf (float32x4_t x, float *out_sin, float *out_cos) special_case (x, special, out_sin, out_cos); } -PL_TEST_ULP (_ZGVnN4v_sincosf_sin, 1.17) -PL_TEST_ULP (_ZGVnN4v_sincosf_cos, 1.31) +TEST_DISABLE_FENV (_ZGVnN4v_sincosf_sin) +TEST_DISABLE_FENV (_ZGVnN4v_sincosf_cos) +TEST_ULP (_ZGVnN4v_sincosf_sin, 1.17) +TEST_ULP (_ZGVnN4v_sincosf_cos, 1.31) #define V_SINCOSF_INTERVAL(lo, hi, n) \ - PL_TEST_INTERVAL (_ZGVnN4v_sincosf_sin, lo, hi, n) \ - PL_TEST_INTERVAL (_ZGVnN4v_sincosf_cos, lo, hi, n) -V_SINCOSF_INTERVAL (0, 0x1p20, 500000) -V_SINCOSF_INTERVAL (-0, -0x1p20, 500000) + TEST_INTERVAL (_ZGVnN4v_sincosf_sin, lo, hi, n) \ + TEST_INTERVAL (_ZGVnN4v_sincosf_cos, lo, hi, n) +V_SINCOSF_INTERVAL (0, 0x1p-31, 50000) +V_SINCOSF_INTERVAL (0x1p-31, 0x1p20, 500000) V_SINCOSF_INTERVAL (0x1p20, inf, 10000) -V_SINCOSF_INTERVAL (-0x1p20, -inf, 10000) diff --git a/math/aarch64/advsimd/sincospi.c b/math/aarch64/advsimd/sincospi.c new file mode 100644 index 00000000000000..fd425202ce6706 --- /dev/null +++ b/math/aarch64/advsimd/sincospi.c @@ -0,0 +1,44 @@ +/* + * Double-precision vector sincospi function. + * + * Copyright (c) 2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "v_sincospi_common.h" +#include "v_math.h" +#include "test_defs.h" + +/* Double-precision vector function allowing calculation of both sin and cos in + one function call, using separate argument reduction and shared low-order + polynomials. + Approximation for vector double-precision sincospi(x). 
+ Maximum Error 3.09 ULP: + _ZGVnN2v_sincospi_sin(0x1.7a41deb4b21e1p+14) got 0x1.fd54d0b327cf1p-1 + want 0x1.fd54d0b327cf4p-1 + Maximum Error 3.16 ULP: + _ZGVnN2v_sincospi_cos(-0x1.11e3c7e284adep-5) got 0x1.fd2da484ff3ffp-1 + want 0x1.fd2da484ff402p-1. */ +VPCS_ATTR void +_ZGVnN2vl8l8_sincospi (float64x2_t x, double *out_sin, double *out_cos) +{ + const struct v_sincospi_data *d = ptr_barrier (&v_sincospi_data); + + float64x2x2_t sc = v_sincospi_inline (x, d); + + vst1q_f64 (out_sin, sc.val[0]); + vst1q_f64 (out_cos, sc.val[1]); +} + +#if WANT_TRIGPI_TESTS +TEST_DISABLE_FENV (_ZGVnN2v_sincospi_cos) +TEST_DISABLE_FENV (_ZGVnN2v_sincospi_sin) +TEST_ULP (_ZGVnN2v_sincospi_sin, 2.59) +TEST_ULP (_ZGVnN2v_sincospi_cos, 2.66) +# define V_SINCOSPI_INTERVAL(lo, hi, n) \ + TEST_SYM_INTERVAL (_ZGVnN2v_sincospi_sin, lo, hi, n) \ + TEST_SYM_INTERVAL (_ZGVnN2v_sincospi_cos, lo, hi, n) +V_SINCOSPI_INTERVAL (0, 0x1p-63, 10000) +V_SINCOSPI_INTERVAL (0x1p-63, 0.5, 50000) +V_SINCOSPI_INTERVAL (0.5, 0x1p63, 50000) +V_SINCOSPI_INTERVAL (0x1p63, inf, 10000) +#endif diff --git a/math/aarch64/advsimd/sincospif.c b/math/aarch64/advsimd/sincospif.c new file mode 100644 index 00000000000000..760ea3d4f5e181 --- /dev/null +++ b/math/aarch64/advsimd/sincospif.c @@ -0,0 +1,43 @@ +/* + * Single-precision vector sincospi function. + * + * Copyright (c) 2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_sincospif_common.h" +#include "v_math.h" +#include "test_defs.h" +#include "mathlib.h" + +/* Single-precision vector function allowing calculation of both sinpi and + cospi in one function call, using shared argument reduction and polynomials. + Worst-case error for sin is 3.04 ULP: + _ZGVnN4v_sincospif_sin(0x1.1d341ap-1) got 0x1.f7cd56p-1 want 0x1.f7cd5p-1. + Worst-case error for cos is 3.18 ULP: + _ZGVnN4v_sincospif_cos(0x1.d341a8p-5) got 0x1.f7cd56p-1 want 0x1.f7cd5p-1. 
+ */ +VPCS_ATTR void +_ZGVnN4vl4l4_sincospif (float32x4_t x, float *out_sin, float *out_cos) +{ + const struct v_sincospif_data *d = ptr_barrier (&v_sincospif_data); + + float32x4x2_t sc = v_sincospif_inline (x, d); + + vst1q_f32 (out_sin, sc.val[0]); + vst1q_f32 (out_cos, sc.val[1]); +} + +#if WANT_TRIGPI_TESTS +TEST_DISABLE_FENV (_ZGVnN4v_sincospif_sin) +TEST_DISABLE_FENV (_ZGVnN4v_sincospif_cos) +TEST_ULP (_ZGVnN4v_sincospif_sin, 2.54) +TEST_ULP (_ZGVnN4v_sincospif_cos, 2.68) +# define V_SINCOSPIF_INTERVAL(lo, hi, n) \ + TEST_SYM_INTERVAL (_ZGVnN4v_sincospif_sin, lo, hi, n) \ + TEST_SYM_INTERVAL (_ZGVnN4v_sincospif_cos, lo, hi, n) +V_SINCOSPIF_INTERVAL (0, 0x1p-63, 10000) +V_SINCOSPIF_INTERVAL (0x1p-63, 0.5, 50000) +V_SINCOSPIF_INTERVAL (0.5, 0x1p31, 50000) +V_SINCOSPIF_INTERVAL (0x1p31, inf, 10000) +#endif diff --git a/math/aarch64/v_sinf.c b/math/aarch64/advsimd/sinf.c similarity index 65% rename from math/aarch64/v_sinf.c rename to math/aarch64/advsimd/sinf.c index 336879844459f7..0764434039a073 100644 --- a/math/aarch64/v_sinf.c +++ b/math/aarch64/advsimd/sinf.c @@ -1,17 +1,19 @@ /* * Single-precision vector sin function. * - * Copyright (c) 2019-2023, Arm Limited. + * Copyright (c) 2019-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" #include "v_math.h" +#include "test_defs.h" +#include "test_sig.h" static const struct data { float32x4_t poly[4]; - float32x4_t range_val, inv_pi, shift, pi_1, pi_2, pi_3; + float32x4_t range_val, inv_pi, pi_1, pi_2, pi_3; } data = { /* 1.886 ulp error. */ .poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f), @@ -22,13 +24,14 @@ static const struct data .pi_3 = V4 (-0x1.ee59dap-49f), .inv_pi = V4 (0x1.45f306p-2f), - .shift = V4 (0x1.8p+23f), .range_val = V4 (0x1p20f) }; #if WANT_SIMD_EXCEPT -# define TinyBound v_u32 (0x21000000) /* asuint32(0x1p-61f). */ -# define Thresh v_u32 (0x28800000) /* RangeVal - TinyBound. 
*/ +/* asuint32(0x1p-59f), below which multiply by inv_pi underflows. */ +# define TinyBound v_u32 (0x22000000) +/* RangeVal - TinyBound. */ +# define Thresh v_u32 (0x27800000) #endif #define C(i) d->poly[i] @@ -41,7 +44,7 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp) return v_call_f32 (sinf, x, y, cmp); } -float32x4_t VPCS_ATTR V_NAME_F1 (sin) (float32x4_t x) +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sin) (float32x4_t x) { const struct data *d = ptr_barrier (&data); float32x4_t n, r, r2, y; @@ -53,23 +56,22 @@ float32x4_t VPCS_ATTR V_NAME_F1 (sin) (float32x4_t x) /* If fenv exceptions are to be triggered correctly, set any special lanes to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by special-case handler later. */ - r = vbslq_f32 (cmp, vreinterpretq_f32_u32 (cmp), x); + r = vreinterpretq_f32_u32 (vbicq_u32 (vreinterpretq_u32_f32 (x), cmp)); #else r = x; cmp = vcageq_f32 (x, d->range_val); #endif - /* n = rint(|x|/pi) */ - n = vfmaq_f32 (d->shift, d->inv_pi, r); - odd = vshlq_n_u32 (vreinterpretq_u32_f32 (n), 31); - n = vsubq_f32 (n, d->shift); + /* n = rint(|x|/pi). */ + n = vrndaq_f32 (vmulq_f32 (r, d->inv_pi)); + odd = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 31); - /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2) */ + /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ r = vfmsq_f32 (r, d->pi_1, n); r = vfmsq_f32 (r, d->pi_2, n); r = vfmsq_f32 (r, d->pi_3, n); - /* y = sin(r) */ + /* y = sin(r). 
*/ r2 = vmulq_f32 (r, r); y = vfmaq_f32 (C (2), C (3), r2); y = vfmaq_f32 (C (1), y, r2); @@ -80,3 +82,11 @@ float32x4_t VPCS_ATTR V_NAME_F1 (sin) (float32x4_t x) return special_case (x, y, odd, cmp); return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd)); } + +HALF_WIDTH_ALIAS_F1 (sin) + +TEST_SIG (V, F, 1, sin, -3.1, 3.1) +TEST_ULP (V_NAME_F1 (sin), 1.4) +TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (sin), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_F1 (sin), 0, 0x1p20, 500000) +TEST_SYM_INTERVAL (V_NAME_F1 (sin), 0x1p20, inf, 10000) diff --git a/math/aarch64/advsimd/sinh.c b/math/aarch64/advsimd/sinh.c new file mode 100644 index 00000000000000..f65ccd0c627005 --- /dev/null +++ b/math/aarch64/advsimd/sinh.c @@ -0,0 +1,80 @@ +/* + * Double-precision vector sinh(x) function. + * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" +#include "v_expm1_inline.h" + +static const struct data +{ + struct v_expm1_data d; + uint64x2_t halff; +#if WANT_SIMD_EXCEPT + uint64x2_t tiny_bound, thresh; +#else + float64x2_t large_bound; +#endif +} data = { + .d = V_EXPM1_DATA, + .halff = V2 (0x3fe0000000000000), +#if WANT_SIMD_EXCEPT + /* 2^-26, below which sinh(x) rounds to x. */ + .tiny_bound = V2 (0x3e50000000000000), + /* asuint(large_bound) - asuint(tiny_bound). */ + .thresh = V2 (0x0230000000000000), +#else + /* 2^9. expm1 helper overflows for large input. */ + .large_bound = V2 (0x1p+9), +#endif +}; + +static float64x2_t NOINLINE VPCS_ATTR +special_case (float64x2_t x) +{ + return v_call_f64 (sinh, x, x, v_u64 (-1)); +} + +/* Approximation for vector double-precision sinh(x) using expm1. + sinh(x) = (exp(x) - exp(-x)) / 2. + The greatest observed error is 2.52 ULP: + _ZGVnN2v_sinh(-0x1.a098a2177a2b9p-2) got -0x1.ac2f05bb66fccp-2 + want -0x1.ac2f05bb66fc9p-2. 
*/ +float64x2_t VPCS_ATTR V_NAME_D1 (sinh) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + + float64x2_t ax = vabsq_f64 (x); + uint64x2_t ix = vreinterpretq_u64_f64 (x); + float64x2_t halfsign = vreinterpretq_f64_u64 ( + vbslq_u64 (v_u64 (0x8000000000000000), ix, d->halff)); + +#if WANT_SIMD_EXCEPT + uint64x2_t special = vcgeq_u64 ( + vsubq_u64 (vreinterpretq_u64_f64 (ax), d->tiny_bound), d->thresh); +#else + uint64x2_t special = vcageq_f64 (x, d->large_bound); +#endif + + /* Fall back to scalar variant for all lanes if any of them are special. */ + if (unlikely (v_any_u64 (special))) + return special_case (x); + + /* Up to the point that expm1 overflows, we can use it to calculate sinh + using a slight rearrangement of the definition of sinh. This allows us to + retain acceptable accuracy for very small inputs. */ + float64x2_t t = expm1_inline (ax, &d->d); + t = vaddq_f64 (t, vdivq_f64 (t, vaddq_f64 (t, v_f64 (1.0)))); + return vmulq_f64 (t, halfsign); +} + +TEST_SIG (V, D, 1, sinh, -10.0, 10.0) +TEST_ULP (V_NAME_D1 (sinh), 2.02) +TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (sinh), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_D1 (sinh), 0, 0x1p-26, 1000) +TEST_SYM_INTERVAL (V_NAME_D1 (sinh), 0x1p-26, 0x1p9, 500000) +TEST_SYM_INTERVAL (V_NAME_D1 (sinh), 0x1p9, inf, 1000) diff --git a/pl/math/v_sinhf_2u3.c b/math/aarch64/advsimd/sinhf.c similarity index 59% rename from pl/math/v_sinhf_2u3.c rename to math/aarch64/advsimd/sinhf.c index cd8c0f08f78444..12dbe26b425b73 100644 --- a/pl/math/v_sinhf_2u3.c +++ b/math/aarch64/advsimd/sinhf.c @@ -1,28 +1,25 @@ /* * Single-precision vector sinh(x) function. * - * Copyright (c) 2022-2023, Arm Limited. + * Copyright (c) 2022-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_math.h" -#include "pl_sig.h" -#include "pl_test.h" - +#include "test_sig.h" +#include "test_defs.h" #include "v_expm1f_inline.h" static const struct data { struct v_expm1f_data expm1f_consts; - uint32x4_t halff; #if WANT_SIMD_EXCEPT uint32x4_t tiny_bound, thresh; #else - uint32x4_t oflow_bound; + float32x4_t oflow_bound; #endif } data = { .expm1f_consts = V_EXPM1F_DATA, - .halff = V4 (0x3f000000), #if WANT_SIMD_EXCEPT /* 0x1.6a09e8p-32, below which expm1f underflows. */ .tiny_bound = V4 (0x2fb504f4), @@ -30,14 +27,15 @@ static const struct data .thresh = V4 (0x12fbbbb3), #else /* 0x1.61814ep+6, above which expm1f helper overflows. */ - .oflow_bound = V4 (0x42b0c0a7), + .oflow_bound = V4 (0x1.61814ep+6), #endif }; static float32x4_t NOINLINE VPCS_ATTR -special_case (float32x4_t x, float32x4_t y, uint32x4_t special) +special_case (float32x4_t x, float32x4_t t, float32x4_t halfsign, + uint32x4_t special) { - return v_call_f32 (sinhf, x, y, special); + return v_call_f32 (sinhf, x, vmulq_f32 (t, halfsign), special); } /* Approximation for vector single-precision sinh(x) using expm1. @@ -45,21 +43,21 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t special) The maximum error is 2.26 ULP: _ZGVnN4v_sinhf (0x1.e34a9ep-4) got 0x1.e469ep-4 want 0x1.e469e4p-4. 
*/ -float32x4_t VPCS_ATTR V_NAME_F1 (sinh) (float32x4_t x) +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sinh) (float32x4_t x) { const struct data *d = ptr_barrier (&data); uint32x4_t ix = vreinterpretq_u32_f32 (x); float32x4_t ax = vabsq_f32 (x); - uint32x4_t iax = vreinterpretq_u32_f32 (ax); - uint32x4_t sign = veorq_u32 (ix, iax); - float32x4_t halfsign = vreinterpretq_f32_u32 (vorrq_u32 (sign, d->halff)); + float32x4_t halfsign = vreinterpretq_f32_u32 ( + vbslq_u32 (v_u32 (0x80000000), ix, vreinterpretq_u32_f32 (v_f32 (0.5)))); #if WANT_SIMD_EXCEPT - uint32x4_t special = vcgeq_u32 (vsubq_u32 (iax, d->tiny_bound), d->thresh); + uint32x4_t special = vcgeq_u32 ( + vsubq_u32 (vreinterpretq_u32_f32 (ax), d->tiny_bound), d->thresh); ax = v_zerofy_f32 (ax, special); #else - uint32x4_t special = vcgeq_u32 (iax, d->oflow_bound); + uint32x4_t special = vcageq_f32 (x, d->oflow_bound); #endif /* Up to the point that expm1f overflows, we can use it to calculate sinhf @@ -71,14 +69,16 @@ float32x4_t VPCS_ATTR V_NAME_F1 (sinh) (float32x4_t x) /* Fall back to the scalar variant for any lanes that should trigger an exception. 
*/ if (unlikely (v_any_u32 (special))) - return special_case (x, vmulq_f32 (t, halfsign), special); + return special_case (x, t, halfsign, special); return vmulq_f32 (t, halfsign); } -PL_SIG (V, F, 1, sinh, -10.0, 10.0) -PL_TEST_ULP (V_NAME_F1 (sinh), 1.76) -PL_TEST_EXPECT_FENV (V_NAME_F1 (sinh), WANT_SIMD_EXCEPT) -PL_TEST_SYM_INTERVAL (V_NAME_F1 (sinh), 0, 0x2fb504f4, 1000) -PL_TEST_SYM_INTERVAL (V_NAME_F1 (sinh), 0x2fb504f4, 0x42b0c0a7, 100000) -PL_TEST_SYM_INTERVAL (V_NAME_F1 (sinh), 0x42b0c0a7, inf, 1000) +HALF_WIDTH_ALIAS_F1 (sinh) + +TEST_SIG (V, F, 1, sinh, -10.0, 10.0) +TEST_ULP (V_NAME_F1 (sinh), 1.76) +TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (sinh), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_F1 (sinh), 0, 0x2fb504f4, 1000) +TEST_SYM_INTERVAL (V_NAME_F1 (sinh), 0x2fb504f4, 0x42b0c0a7, 100000) +TEST_SYM_INTERVAL (V_NAME_F1 (sinh), 0x42b0c0a7, inf, 1000) diff --git a/pl/math/v_sinpi_3u1.c b/math/aarch64/advsimd/sinpi.c similarity index 81% rename from pl/math/v_sinpi_3u1.c rename to math/aarch64/advsimd/sinpi.c index 8d2917ff8ecd70..f86d167a2ac3d3 100644 --- a/pl/math/v_sinpi_3u1.c +++ b/math/aarch64/advsimd/sinpi.c @@ -1,15 +1,15 @@ /* * Double-precision vector sinpi function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" #include "v_math.h" -#include "poly_advsimd_f64.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "v_poly_f64.h" +#include "test_sig.h" +#include "test_defs.h" static const struct data { @@ -34,7 +34,7 @@ special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp) { /* Fall back to scalar code. 
*/ y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd)); - return v_call_f64 (sinpi, x, y, cmp); + return v_call_f64 (arm_math_sinpi, x, y, cmp); } #endif @@ -77,10 +77,11 @@ float64x2_t VPCS_ATTR V_NAME_D1 (sinpi) (float64x2_t x) return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd)); } -PL_SIG (V, D, 1, sinpi, -0.9, 0.9) -PL_TEST_ULP (V_NAME_D1 (sinpi), 3.06) -PL_TEST_EXPECT_FENV (V_NAME_D1 (sinpi), WANT_SIMD_EXCEPT) -PL_TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0, 0x1p-63, 5000) -PL_TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0x1p-63, 0.5, 10000) -PL_TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0.5, 0x1p51, 10000) -PL_TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0x1p51, inf, 10000) +#if WANT_TRIGPI_TESTS +TEST_ULP (V_NAME_D1 (sinpi), 2.56) +TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (sinpi), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0, 0x1p-63, 5000) +TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0x1p-63, 0.5, 10000) +TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0.5, 0x1p51, 10000) +TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0x1p51, inf, 10000) +#endif diff --git a/pl/math/v_sinpif_3u.c b/math/aarch64/advsimd/sinpif.c similarity index 76% rename from pl/math/v_sinpif_3u.c rename to math/aarch64/advsimd/sinpif.c index 3d6eeff333f7e8..98ba9d84d2fb07 100644 --- a/pl/math/v_sinpif_3u.c +++ b/math/aarch64/advsimd/sinpif.c @@ -1,15 +1,15 @@ /* * Single-precision vector sinpi function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" #include "v_math.h" -#include "poly_advsimd_f32.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "v_poly_f32.h" +#include "test_sig.h" +#include "test_defs.h" static const struct data { @@ -29,7 +29,7 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp) { /* Fall back to scalar code. 
*/ y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd)); - return v_call_f32 (sinpif, x, y, cmp); + return v_call_f32 (arm_math_sinpif, x, y, cmp); } #endif @@ -37,7 +37,7 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp) Maximum Error 3.03 ULP: _ZGVnN4v_sinpif(0x1.c597ccp-2) got 0x1.f7cd56p-1 want 0x1.f7cd5p-1. */ -float32x4_t VPCS_ATTR V_NAME_F1 (sinpi) (float32x4_t x) +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sinpi) (float32x4_t x) { const struct data *d = ptr_barrier (&data); @@ -72,10 +72,13 @@ float32x4_t VPCS_ATTR V_NAME_F1 (sinpi) (float32x4_t x) return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd)); } -PL_SIG (V, F, 1, sinpi, -0.9, 0.9) -PL_TEST_ULP (V_NAME_F1 (sinpi), 2.54) -PL_TEST_EXPECT_FENV (V_NAME_F1 (sinpi), WANT_SIMD_EXCEPT) -PL_TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0, 0x1p-31, 5000) -PL_TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0x1p-31, 0.5, 10000) -PL_TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0.5, 0x1p31f, 10000) -PL_TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0x1p31f, inf, 10000) +HALF_WIDTH_ALIAS_F1 (sinpi) + +#if WANT_TRIGPI_TESTS +TEST_ULP (V_NAME_F1 (sinpi), 2.54) +TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (sinpi), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0, 0x1p-31, 5000) +TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0x1p-31, 0.5, 10000) +TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0.5, 0x1p31f, 10000) +TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0x1p31f, inf, 10000) +#endif diff --git a/pl/math/v_tan_3u5.c b/math/aarch64/advsimd/tan.c similarity index 86% rename from pl/math/v_tan_3u5.c rename to math/aarch64/advsimd/tan.c index c431c8c4889ef8..957f9aba3a1e63 100644 --- a/pl/math/v_tan_3u5.c +++ b/math/aarch64/advsimd/tan.c @@ -1,19 +1,20 @@ /* * Double-precision vector tan(x) function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_math.h" -#include "poly_advsimd_f64.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "v_poly_f64.h" +#include "test_sig.h" +#include "test_defs.h" static const struct data { float64x2_t poly[9]; - float64x2_t half_pi, two_over_pi, shift; + double half_pi[2]; + float64x2_t two_over_pi, shift; #if !WANT_SIMD_EXCEPT float64x2_t range_val; #endif @@ -71,8 +72,9 @@ float64x2_t VPCS_ATTR V_NAME_D1 (tan) (float64x2_t x) /* Use q to reduce x to r in [-pi/4, pi/4], by: r = x - q * pi/2, in extended precision. */ float64x2_t r = x; - r = vfmsq_laneq_f64 (r, q, dat->half_pi, 0); - r = vfmsq_laneq_f64 (r, q, dat->half_pi, 1); + float64x2_t half_pi = vld1q_f64 (dat->half_pi); + r = vfmsq_laneq_f64 (r, q, half_pi, 0); + r = vfmsq_laneq_f64 (r, q, half_pi, 1); /* Further reduce r to [-pi/8, pi/8], to be reconstructed using double angle formula. */ r = vmulq_n_f64 (r, 0.5); @@ -112,9 +114,9 @@ float64x2_t VPCS_ATTR V_NAME_D1 (tan) (float64x2_t x) vbslq_f64 (no_recip, d, n)); } -PL_SIG (V, D, 1, tan, -3.1, 3.1) -PL_TEST_ULP (V_NAME_D1 (tan), 2.99) -PL_TEST_EXPECT_FENV (V_NAME_D1 (tan), WANT_SIMD_EXCEPT) -PL_TEST_SYM_INTERVAL (V_NAME_D1 (tan), 0, TinyBound, 5000) -PL_TEST_SYM_INTERVAL (V_NAME_D1 (tan), TinyBound, RangeVal, 100000) -PL_TEST_SYM_INTERVAL (V_NAME_D1 (tan), RangeVal, inf, 5000) +TEST_SIG (V, D, 1, tan, -3.1, 3.1) +TEST_ULP (V_NAME_D1 (tan), 2.99) +TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (tan), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_D1 (tan), 0, TinyBound, 5000) +TEST_SYM_INTERVAL (V_NAME_D1 (tan), TinyBound, RangeVal, 100000) +TEST_SYM_INTERVAL (V_NAME_D1 (tan), RangeVal, inf, 5000) diff --git a/pl/math/v_tanf_3u5.c b/math/aarch64/advsimd/tanf.c similarity index 83% rename from pl/math/v_tanf_3u5.c rename to math/aarch64/advsimd/tanf.c index 98948b0a9ecfb8..ed5448649f6cd7 100644 --- a/pl/math/v_tanf_3u5.c +++ b/math/aarch64/advsimd/tanf.c @@ -1,19 +1,19 @@ /* * Single-precision 
vector tan(x) function. * - * Copyright (c) 2021-2023, Arm Limited. + * Copyright (c) 2021-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_math.h" -#include "poly_advsimd_f32.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "v_poly_f32.h" +#include "test_sig.h" +#include "test_defs.h" static const struct data { float32x4_t poly[6]; - float32x4_t pi_consts; + float pi_consts[4]; float32x4_t shift; #if !WANT_SIMD_EXCEPT float32x4_t range_val; @@ -64,7 +64,7 @@ eval_poly (float32x4_t z, const struct data *d) Maximum error is 3.45 ULP: __v_tanf(-0x1.e5f0cap+13) got 0x1.ff9856p-1 want 0x1.ff9850p-1. */ -float32x4_t VPCS_ATTR V_NAME_F1 (tan) (float32x4_t x) +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tan) (float32x4_t x) { const struct data *d = ptr_barrier (&data); float32x4_t special_arg = x; @@ -85,16 +85,17 @@ float32x4_t VPCS_ATTR V_NAME_F1 (tan) (float32x4_t x) #endif /* n = rint(x/(pi/2)). */ - float32x4_t q = vfmaq_laneq_f32 (d->shift, x, d->pi_consts, 3); + float32x4_t pi_consts = vld1q_f32 (d->pi_consts); + float32x4_t q = vfmaq_laneq_f32 (d->shift, x, pi_consts, 3); float32x4_t n = vsubq_f32 (q, d->shift); /* Determine if x lives in an interval, where |tan(x)| grows to infinity. */ uint32x4_t pred_alt = vtstq_u32 (vreinterpretq_u32_f32 (q), v_u32 (1)); /* r = x - n * (pi/2) (range reduction into -pi./4 .. pi/4). 
*/ float32x4_t r; - r = vfmaq_laneq_f32 (x, n, d->pi_consts, 0); - r = vfmaq_laneq_f32 (r, n, d->pi_consts, 1); - r = vfmaq_laneq_f32 (r, n, d->pi_consts, 2); + r = vfmaq_laneq_f32 (x, n, pi_consts, 0); + r = vfmaq_laneq_f32 (r, n, pi_consts, 1); + r = vfmaq_laneq_f32 (r, n, pi_consts, 2); /* If x lives in an interval, where |tan(x)| - is finite, then use a polynomial approximation of the form @@ -119,9 +120,11 @@ float32x4_t VPCS_ATTR V_NAME_F1 (tan) (float32x4_t x) return vbslq_f32 (pred_alt, inv_y, y); } -PL_SIG (V, F, 1, tan, -3.1, 3.1) -PL_TEST_ULP (V_NAME_F1 (tan), 2.96) -PL_TEST_EXPECT_FENV (V_NAME_F1 (tan), WANT_SIMD_EXCEPT) -PL_TEST_SYM_INTERVAL (V_NAME_F1 (tan), 0, 0x1p-31, 5000) -PL_TEST_SYM_INTERVAL (V_NAME_F1 (tan), 0x1p-31, 0x1p15, 500000) -PL_TEST_SYM_INTERVAL (V_NAME_F1 (tan), 0x1p15, inf, 5000) +HALF_WIDTH_ALIAS_F1 (tan) + +TEST_SIG (V, F, 1, tan, -3.1, 3.1) +TEST_ULP (V_NAME_F1 (tan), 2.96) +TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (tan), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_F1 (tan), 0, 0x1p-31, 5000) +TEST_SYM_INTERVAL (V_NAME_F1 (tan), 0x1p-31, 0x1p15, 500000) +TEST_SYM_INTERVAL (V_NAME_F1 (tan), 0x1p15, inf, 5000) diff --git a/math/aarch64/advsimd/tanh.c b/math/aarch64/advsimd/tanh.c new file mode 100644 index 00000000000000..3dc6e5527ffce2 --- /dev/null +++ b/math/aarch64/advsimd/tanh.c @@ -0,0 +1,67 @@ +/* + * Double-precision vector tanh(x) function. + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" +#include "v_expm1_inline.h" + +static const struct data +{ + struct v_expm1_data d; + uint64x2_t thresh, tiny_bound; +} data = { + .d = V_EXPM1_DATA, + .tiny_bound = V2 (0x3e40000000000000), /* asuint64 (0x1p-27). */ + /* asuint64(0x1.241bf835f9d5fp+4) - asuint64(tiny_bound). 
*/ + .thresh = V2 (0x01f241bf835f9d5f), +}; + +static float64x2_t NOINLINE VPCS_ATTR +special_case (float64x2_t x, float64x2_t q, float64x2_t qp2, + uint64x2_t special) +{ + return v_call_f64 (tanh, x, vdivq_f64 (q, qp2), special); +} + +/* Vector approximation for double-precision tanh(x), using a simplified + version of expm1. The greatest observed error is 2.70 ULP: + _ZGVnN2v_tanh(-0x1.c59aa220cb177p-3) got -0x1.be5452a6459fep-3 + want -0x1.be5452a6459fbp-3. */ +float64x2_t VPCS_ATTR V_NAME_D1 (tanh) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + + uint64x2_t ia = vreinterpretq_u64_f64 (vabsq_f64 (x)); + + float64x2_t u = x; + + /* Trigger special-cases for tiny, boring and infinity/NaN. */ + uint64x2_t special = vcgtq_u64 (vsubq_u64 (ia, d->tiny_bound), d->thresh); +#if WANT_SIMD_EXCEPT + /* To trigger fp exceptions correctly, set special lanes to a neutral value. + They will be fixed up later by the special-case handler. */ + if (unlikely (v_any_u64 (special))) + u = v_zerofy_f64 (u, special); +#endif + + u = vaddq_f64 (u, u); + + /* tanh(x) = (e^2x - 1) / (e^2x + 1). */ + float64x2_t q = expm1_inline (u, &d->d); + float64x2_t qp2 = vaddq_f64 (q, v_f64 (2.0)); + + if (unlikely (v_any_u64 (special))) + return special_case (x, q, qp2, special); + return vdivq_f64 (q, qp2); +} + +TEST_SIG (V, D, 1, tanh, -10.0, 10.0) +TEST_ULP (V_NAME_D1 (tanh), 2.21) +TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (tanh), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_D1 (tanh), 0, 0x1p-27, 5000) +TEST_SYM_INTERVAL (V_NAME_D1 (tanh), 0x1p-27, 0x1.241bf835f9d5fp+4, 50000) +TEST_SYM_INTERVAL (V_NAME_D1 (tanh), 0x1.241bf835f9d5fp+4, inf, 1000) diff --git a/pl/math/v_tanhf_2u6.c b/math/aarch64/advsimd/tanhf.c similarity index 62% rename from pl/math/v_tanhf_2u6.c rename to math/aarch64/advsimd/tanhf.c index d1cb9fb6eeb3af..18fe93c7e7ba74 100644 --- a/pl/math/v_tanhf_2u6.c +++ b/math/aarch64/advsimd/tanhf.c @@ -1,14 +1,13 @@ /* * Single-precision vector tanh(x) function. 
* - * Copyright (c) 2022-2023, Arm Limited. + * Copyright (c) 2022-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_math.h" -#include "pl_sig.h" -#include "pl_test.h" - +#include "test_sig.h" +#include "test_defs.h" #include "v_expm1f_inline.h" static const struct data @@ -20,20 +19,23 @@ static const struct data /* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for negative). */ .boring_bound = V4 (0x41102cb3), .large_bound = V4 (0x7f800000), - .onef = V4 (0x3f800000), }; static float32x4_t NOINLINE VPCS_ATTR -special_case (float32x4_t x, float32x4_t y, uint32x4_t special) +special_case (float32x4_t x, uint32x4_t is_boring, float32x4_t boring, + float32x4_t q, uint32x4_t special) { - return v_call_f32 (tanhf, x, y, special); + return v_call_f32 ( + tanhf, x, + vbslq_f32 (is_boring, boring, vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0)))), + special); } /* Approximation for single-precision vector tanh(x), using a simplified version of expm1f. The maximum error is 2.58 ULP: _ZGVnN4v_tanhf (0x1.fa5eep-5) got 0x1.f9ba02p-5 want 0x1.f9ba08p-5. */ -float32x4_t VPCS_ATTR V_NAME_F1 (tanh) (float32x4_t x) +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tanh) (float32x4_t x) { const struct data *d = ptr_barrier (&data); @@ -42,7 +44,9 @@ float32x4_t VPCS_ATTR V_NAME_F1 (tanh) (float32x4_t x) uint32x4_t iax = vreinterpretq_u32_f32 (ax); uint32x4_t sign = veorq_u32 (ix, iax); uint32x4_t is_boring = vcgtq_u32 (iax, d->boring_bound); - float32x4_t boring = vreinterpretq_f32_u32 (vorrq_u32 (sign, d->onef)); + /* expm1 exponent bias is 1.0f reinterpreted to int. */ + float32x4_t boring = vreinterpretq_f32_u32 (vorrq_u32 ( + sign, vreinterpretq_u32_s32 (d->expm1f_consts.exponent_bias))); #if WANT_SIMD_EXCEPT /* If fp exceptions are to be triggered properly, set all special and boring @@ -58,16 +62,20 @@ float32x4_t VPCS_ATTR V_NAME_F1 (tanh) (float32x4_t x) /* tanh(x) = (e^2x - 1) / (e^2x + 1). 
*/ float32x4_t q = expm1f_inline (vmulq_n_f32 (x, 2), &d->expm1f_consts); - float32x4_t y = vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0))); + if (unlikely (v_any_u32 (special))) - return special_case (vreinterpretq_f32_u32 (ix), - vbslq_f32 (is_boring, boring, y), special); + return special_case (vreinterpretq_f32_u32 (ix), is_boring, boring, q, + special); + + float32x4_t y = vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0))); return vbslq_f32 (is_boring, boring, y); } -PL_SIG (V, F, 1, tanh, -10.0, 10.0) -PL_TEST_ULP (V_NAME_F1 (tanh), 2.09) -PL_TEST_EXPECT_FENV (V_NAME_F1 (tanh), WANT_SIMD_EXCEPT) -PL_TEST_SYM_INTERVAL (V_NAME_F1 (tanh), 0, 0x1p-23, 1000) -PL_TEST_SYM_INTERVAL (V_NAME_F1 (tanh), 0x1p-23, 0x1.205966p+3, 100000) -PL_TEST_SYM_INTERVAL (V_NAME_F1 (tanh), 0x1.205966p+3, inf, 100) +HALF_WIDTH_ALIAS_F1 (tanh) + +TEST_SIG (V, F, 1, tanh, -10.0, 10.0) +TEST_ULP (V_NAME_F1 (tanh), 2.09) +TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (tanh), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_F1 (tanh), 0, 0x1p-23, 1000) +TEST_SYM_INTERVAL (V_NAME_F1 (tanh), 0x1p-23, 0x1.205966p+3, 100000) +TEST_SYM_INTERVAL (V_NAME_F1 (tanh), 0x1.205966p+3, inf, 100) diff --git a/math/aarch64/advsimd/tanpi.c b/math/aarch64/advsimd/tanpi.c new file mode 100644 index 00000000000000..16de00ad555666 --- /dev/null +++ b/math/aarch64/advsimd/tanpi.c @@ -0,0 +1,88 @@ +/* + * Double-precision vector tanpi(x) function. + * + * Copyright (c) 2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" + +const static struct v_tanpi_data +{ + float64x2_t c0, c2, c4, c6, c8, c10, c12; + double c1, c3, c5, c7, c9, c11, c13, c14; +} tanpi_data = { + /* Coefficents for tan(pi * x) computed with fpminimax + on [ 0x1p-1022 0x1p-2 ] + approx rel error: 0x1.7eap-55 + approx abs error: 0x1.7eap-55. */ + .c0 = V2 (0x1.921fb54442d18p1), /* pi. 
*/ + .c1 = 0x1.4abbce625be52p3, .c2 = V2 (0x1.466bc6775b0f9p5), + .c3 = 0x1.45fff9b426f5ep7, .c4 = V2 (0x1.45f4730dbca5cp9), + .c5 = 0x1.45f3265994f85p11, .c6 = V2 (0x1.45f4234b330cap13), + .c7 = 0x1.45dca11be79ebp15, .c8 = V2 (0x1.47283fc5eea69p17), + .c9 = 0x1.3a6d958cdefaep19, .c10 = V2 (0x1.927896baee627p21), + .c11 = -0x1.89333f6acd922p19, .c12 = V2 (0x1.5d4e912bb8456p27), + .c13 = -0x1.a854d53ab6874p29, .c14 = 0x1.1b76de7681424p32, +}; + +/* Approximation for double-precision vector tanpi(x) + The maximum error is 3.06 ULP: + _ZGVnN2v_tanpi(0x1.0a4a07dfcca3ep-1) got -0x1.fa30112702c98p+3 + want -0x1.fa30112702c95p+3. */ +float64x2_t VPCS_ATTR V_NAME_D1 (tanpi) (float64x2_t x) +{ + const struct v_tanpi_data *d = ptr_barrier (&tanpi_data); + + float64x2_t n = vrndnq_f64 (x); + + /* inf produces nan that propagates. */ + float64x2_t xr = vsubq_f64 (x, n); + float64x2_t ar = vabdq_f64 (x, n); + uint64x2_t flip = vcgtq_f64 (ar, v_f64 (0.25)); + float64x2_t r = vbslq_f64 (flip, vsubq_f64 (v_f64 (0.5), ar), ar); + + /* Order-14 pairwise Horner. 
*/ + float64x2_t r2 = vmulq_f64 (r, r); + float64x2_t r4 = vmulq_f64 (r2, r2); + + float64x2_t c_1_3 = vld1q_f64 (&d->c1); + float64x2_t c_5_7 = vld1q_f64 (&d->c5); + float64x2_t c_9_11 = vld1q_f64 (&d->c9); + float64x2_t c_13_14 = vld1q_f64 (&d->c13); + float64x2_t p01 = vfmaq_laneq_f64 (d->c0, r2, c_1_3, 0); + float64x2_t p23 = vfmaq_laneq_f64 (d->c2, r2, c_1_3, 1); + float64x2_t p45 = vfmaq_laneq_f64 (d->c4, r2, c_5_7, 0); + float64x2_t p67 = vfmaq_laneq_f64 (d->c6, r2, c_5_7, 1); + float64x2_t p89 = vfmaq_laneq_f64 (d->c8, r2, c_9_11, 0); + float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, r2, c_9_11, 1); + float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, r2, c_13_14, 0); + + float64x2_t p = vfmaq_laneq_f64 (p1213, r4, c_13_14, 1); + p = vfmaq_f64 (p1011, r4, p); + p = vfmaq_f64 (p89, r4, p); + p = vfmaq_f64 (p67, r4, p); + p = vfmaq_f64 (p45, r4, p); + p = vfmaq_f64 (p23, r4, p); + p = vfmaq_f64 (p01, r4, p); + p = vmulq_f64 (r, p); + + float64x2_t p_recip = vdivq_f64 (v_f64 (1.0), p); + float64x2_t y = vbslq_f64 (flip, p_recip, p); + + uint64x2_t sign + = veorq_u64 (vreinterpretq_u64_f64 (xr), vreinterpretq_u64_f64 (ar)); + return vreinterpretq_f64_u64 (vorrq_u64 (vreinterpretq_u64_f64 (y), sign)); +} + +#if WANT_TRIGPI_TESTS +TEST_DISABLE_FENV (V_NAME_D1 (tanpi)) +TEST_ULP (V_NAME_D1 (tanpi), 2.57) +TEST_SYM_INTERVAL (V_NAME_D1 (tanpi), 0, 0x1p-31, 50000) +TEST_SYM_INTERVAL (V_NAME_D1 (tanpi), 0x1p-31, 0.5, 50000) +TEST_SYM_INTERVAL (V_NAME_D1 (tanpi), 0.5, 1.0, 200000) +TEST_SYM_INTERVAL (V_NAME_D1 (tanpi), 1.0, 0x1p23, 50000) +TEST_SYM_INTERVAL (V_NAME_D1 (tanpi), 0x1p23, inf, 50000) +#endif diff --git a/math/aarch64/advsimd/tanpif.c b/math/aarch64/advsimd/tanpif.c new file mode 100644 index 00000000000000..7bd6d206819f82 --- /dev/null +++ b/math/aarch64/advsimd/tanpif.c @@ -0,0 +1,70 @@ +/* + * Single-precision vector tanpi(x) function. + * + * Copyright (c) 2024, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" + +const static struct v_tanpif_data +{ + float32x4_t c0, c2, c4, c6; + float c1, c3, c5, c7; +} tanpif_data = { + /* Coefficients for tan(pi * x). */ + .c0 = V4 (0x1.921fb4p1f), .c1 = 0x1.4abbcep3f, .c2 = V4 (0x1.466b8p5f), + .c3 = 0x1.461c72p7f, .c4 = V4 (0x1.42e9d4p9f), .c5 = 0x1.69e2c4p11f, + .c6 = V4 (0x1.e85558p11f), .c7 = 0x1.a52e08p16f, +}; + +/* Approximation for single-precision vector tanpi(x) + The maximum error is 3.34 ULP: + _ZGVnN4v_tanpif(0x1.d6c09ap-2) got 0x1.f70aacp+2 + want 0x1.f70aa6p+2. */ +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tanpi) (float32x4_t x) +{ + const struct v_tanpif_data *d = ptr_barrier (&tanpif_data); + + float32x4_t n = vrndnq_f32 (x); + + /* inf produces nan that propagates. */ + float32x4_t xr = vsubq_f32 (x, n); + float32x4_t ar = vabdq_f32 (x, n); + uint32x4_t flip = vcgtq_f32 (ar, v_f32 (0.25f)); + float32x4_t r = vbslq_f32 (flip, vsubq_f32 (v_f32 (0.5f), ar), ar); + + /* Order-7 pairwise Horner polynomial evaluation scheme.
*/ + float32x4_t r2 = vmulq_f32 (r, r); + float32x4_t r4 = vmulq_f32 (r2, r2); + + float32x4_t odd_coeffs = vld1q_f32 (&d->c1); + float32x4_t p01 = vfmaq_laneq_f32 (d->c0, r2, odd_coeffs, 0); + float32x4_t p23 = vfmaq_laneq_f32 (d->c2, r2, odd_coeffs, 1); + float32x4_t p45 = vfmaq_laneq_f32 (d->c4, r2, odd_coeffs, 2); + float32x4_t p67 = vfmaq_laneq_f32 (d->c6, r2, odd_coeffs, 3); + float32x4_t p = vfmaq_f32 (p45, r4, p67); + p = vfmaq_f32 (p23, r4, p); + p = vfmaq_f32 (p01, r4, p); + + p = vmulq_f32 (r, p); + float32x4_t p_recip = vdivq_f32 (v_f32 (1.0f), p); + float32x4_t y = vbslq_f32 (flip, p_recip, p); + + uint32x4_t sign + = veorq_u32 (vreinterpretq_u32_f32 (xr), vreinterpretq_u32_f32 (ar)); + return vreinterpretq_f32_u32 (vorrq_u32 (vreinterpretq_u32_f32 (y), sign)); +} + +HALF_WIDTH_ALIAS_F1 (tanpi) + +#if WANT_TRIGPI_TESTS +TEST_DISABLE_FENV (V_NAME_F1 (tanpi)) +TEST_ULP (V_NAME_F1 (tanpi), 2.84) +TEST_SYM_INTERVAL (V_NAME_F1 (tanpi), 0, 0x1p-31, 50000) +TEST_SYM_INTERVAL (V_NAME_F1 (tanpi), 0x1p-31, 0.5, 100000) +TEST_SYM_INTERVAL (V_NAME_F1 (tanpi), 0.5, 0x1p23f, 100000) +TEST_SYM_INTERVAL (V_NAME_F1 (tanpi), 0x1p23f, inf, 100000) +#endif diff --git a/math/aarch64/advsimd/v_expf_inline.h b/math/aarch64/advsimd/v_expf_inline.h new file mode 100644 index 00000000000000..797d217820c3bb --- /dev/null +++ b/math/aarch64/advsimd/v_expf_inline.h @@ -0,0 +1,58 @@ +/* + * Helper for single-precision routines which calculate exp(ax) and do not + * need special-case handling + * + * Copyright (c) 2019-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef MATH_V_EXPF_INLINE_H +#define MATH_V_EXPF_INLINE_H + +#include "v_math.h" + +struct v_expf_data +{ + float ln2_hi, ln2_lo, c0, c2; + float32x4_t inv_ln2, c1, c3, c4; + /* asuint(1.0f). */ + uint32x4_t exponent_bias; +}; + +/* maxerr: 1.45358 +0.5 ulp. 
*/ +#define V_EXPF_DATA \ + { \ + .c0 = 0x1.0e4020p-7f, .c1 = V4 (0x1.573e2ep-5f), .c2 = 0x1.555e66p-3f, \ + .c3 = V4 (0x1.fffdb6p-2f), .c4 = V4 (0x1.ffffecp-1f), \ + .ln2_hi = 0x1.62e4p-1f, .ln2_lo = 0x1.7f7d1cp-20f, \ + .inv_ln2 = V4 (0x1.715476p+0f), .exponent_bias = V4 (0x3f800000), \ + } + +static inline float32x4_t +v_expf_inline (float32x4_t x, const struct v_expf_data *d) +{ + /* Helper routine for calculating exp(ax). + Copied from v_expf.c, with all special-case handling removed - the + calling routine should handle special values if required. */ + + /* exp(ax) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] + ax = ln2*n + r, with r in [-ln2/2, ln2/2]. */ + float32x4_t ax = vabsq_f32 (x); + float32x4_t ln2_c02 = vld1q_f32 (&d->ln2_hi); + float32x4_t n = vrndaq_f32 (vmulq_f32 (ax, d->inv_ln2)); + float32x4_t r = vfmsq_laneq_f32 (ax, n, ln2_c02, 0); + r = vfmsq_laneq_f32 (r, n, ln2_c02, 1); + uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 23); + float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); + + /* Custom order-4 Estrin avoids building high order monomial. */ + float32x4_t r2 = vmulq_f32 (r, r); + float32x4_t p = vfmaq_laneq_f32 (d->c1, r, ln2_c02, 2); + float32x4_t q = vfmaq_laneq_f32 (d->c3, r, ln2_c02, 3); + q = vfmaq_f32 (q, p, r2); + p = vmulq_f32 (d->c4, r); + float32x4_t poly = vfmaq_f32 (p, q, r2); + return vfmaq_f32 (scale, poly, scale); +} + +#endif // MATH_V_EXPF_INLINE_H diff --git a/math/aarch64/advsimd/v_expm1_inline.h b/math/aarch64/advsimd/v_expm1_inline.h new file mode 100644 index 00000000000000..82d2e9415d93b8 --- /dev/null +++ b/math/aarch64/advsimd/v_expm1_inline.h @@ -0,0 +1,86 @@ +/* + * Helper for double-precision routines which calculate exp(x) - 1 and do not + * need special-case handling + * + * Copyright (c) 2022-2024, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef MATH_V_EXPM1_INLINE_H +#define MATH_V_EXPM1_INLINE_H + +#include "v_math.h" + +struct v_expm1_data +{ + float64x2_t c2, c4, c6, c8; + float64x2_t invln2; + int64x2_t exponent_bias; + double c1, c3, c5, c7, c9, c10; + double ln2[2]; +}; + +/* Generated using fpminimax, with degree=12 in [-log(2)/2, log(2)/2]. */ +#define V_EXPM1_DATA \ + { \ + .c1 = 0x1.5555555555559p-3, .c2 = V2 (0x1.555555555554bp-5), \ + .c3 = 0x1.111111110f663p-7, .c4 = V2 (0x1.6c16c16c1b5f3p-10), \ + .c5 = 0x1.a01a01affa35dp-13, .c6 = V2 (0x1.a01a018b4ecbbp-16), \ + .c7 = 0x1.71ddf82db5bb4p-19, .c8 = V2 (0x1.27e517fc0d54bp-22), \ + .c9 = 0x1.af5eedae67435p-26, .c10 = 0x1.1f143d060a28ap-29, \ + .ln2 = { 0x1.62e42fefa39efp-1, 0x1.abc9e3b39803fp-56 }, \ + .invln2 = V2 (0x1.71547652b82fep0), \ + .exponent_bias = V2 (0x3ff0000000000000), \ + } + +static inline float64x2_t +expm1_inline (float64x2_t x, const struct v_expm1_data *d) +{ + /* Helper routine for calculating exp(x) - 1. */ + + float64x2_t ln2 = vld1q_f64 (&d->ln2[0]); + + /* Reduce argument to smaller range: + Let i = round(x / ln2) + and f = x - i * ln2, then f is in [-ln2/2, ln2/2]. + exp(x) - 1 = 2^i * (expm1(f) + 1) - 1 + where 2^i is exact because i is an integer. */ + float64x2_t n = vrndaq_f64 (vmulq_f64 (x, d->invln2)); + int64x2_t i = vcvtq_s64_f64 (n); + float64x2_t f = vfmsq_laneq_f64 (x, n, ln2, 0); + f = vfmsq_laneq_f64 (f, n, ln2, 1); + + /* Approximate expm1(f) using polynomial. + Taylor expansion for expm1(x) has the form: + x + ax^2 + bx^3 + cx^4 .... + So we calculate the polynomial P(f) = a + bf + cf^2 + ... + and assemble the approximation expm1(f) ~= f + f^2 * P(f).
*/ + float64x2_t f2 = vmulq_f64 (f, f); + float64x2_t f4 = vmulq_f64 (f2, f2); + float64x2_t lane_consts_13 = vld1q_f64 (&d->c1); + float64x2_t lane_consts_57 = vld1q_f64 (&d->c5); + float64x2_t lane_consts_910 = vld1q_f64 (&d->c9); + float64x2_t p01 = vfmaq_laneq_f64 (v_f64 (0.5), f, lane_consts_13, 0); + float64x2_t p23 = vfmaq_laneq_f64 (d->c2, f, lane_consts_13, 1); + float64x2_t p45 = vfmaq_laneq_f64 (d->c4, f, lane_consts_57, 0); + float64x2_t p67 = vfmaq_laneq_f64 (d->c6, f, lane_consts_57, 1); + float64x2_t p03 = vfmaq_f64 (p01, f2, p23); + float64x2_t p47 = vfmaq_f64 (p45, f2, p67); + float64x2_t p89 = vfmaq_laneq_f64 (d->c8, f, lane_consts_910, 0); + float64x2_t p = vfmaq_laneq_f64 (p89, f2, lane_consts_910, 1); + p = vfmaq_f64 (p47, f4, p); + p = vfmaq_f64 (p03, f4, p); + + p = vfmaq_f64 (f, f2, p); + + /* Assemble the result. + expm1(x) ~= 2^i * (p + 1) - 1 + Let t = 2^i. */ + int64x2_t u = vaddq_s64 (vshlq_n_s64 (i, 52), d->exponent_bias); + float64x2_t t = vreinterpretq_f64_s64 (u); + + /* expm1(x) ~= p * t + (t - 1). */ + return vfmaq_f64 (vsubq_f64 (t, v_f64 (1.0)), p, t); +} + +#endif // MATH_V_EXPM1_INLINE_H diff --git a/math/aarch64/advsimd/v_expm1f_inline.h b/math/aarch64/advsimd/v_expm1f_inline.h new file mode 100644 index 00000000000000..463b07aa7705d6 --- /dev/null +++ b/math/aarch64/advsimd/v_expm1f_inline.h @@ -0,0 +1,62 @@ +/* + * Helper for single-precision routines which calculate exp(x) - 1 and do not + * need special-case handling + * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef MATH_V_EXPM1F_INLINE_H +#define MATH_V_EXPM1F_INLINE_H + +#include "v_math.h" + +struct v_expm1f_data +{ + float32x4_t c0, c2; + int32x4_t exponent_bias; + float c1, c3, inv_ln2, c4; + float ln2_hi, ln2_lo; +}; + +/* Coefficients generated using fpminimax with degree=5 in [-log(2)/2, + log(2)/2]. Exponent bias is asuint(1.0f). 
*/ +#define V_EXPM1F_DATA \ + { \ + .c0 = V4 (0x1.fffffep-2), .c1 = 0x1.5554aep-3, .c2 = V4 (0x1.555736p-5), \ + .c3 = 0x1.12287cp-7, .c4 = 0x1.6b55a2p-10, \ + .exponent_bias = V4 (0x3f800000), .inv_ln2 = 0x1.715476p+0f, \ + .ln2_hi = 0x1.62e4p-1f, .ln2_lo = 0x1.7f7d1cp-20f, \ + } + +static inline float32x4_t +expm1f_inline (float32x4_t x, const struct v_expm1f_data *d) +{ + /* Helper routine for calculating exp(x) - 1. */ + + float32x2_t ln2 = vld1_f32 (&d->ln2_hi); + float32x4_t lane_consts = vld1q_f32 (&d->c1); + + /* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */ + float32x4_t j = vrndaq_f32 (vmulq_laneq_f32 (x, lane_consts, 2)); + int32x4_t i = vcvtq_s32_f32 (j); + float32x4_t f = vfmsq_lane_f32 (x, j, ln2, 0); + f = vfmsq_lane_f32 (f, j, ln2, 1); + + /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f). */ + float32x4_t f2 = vmulq_f32 (f, f); + float32x4_t f4 = vmulq_f32 (f2, f2); + float32x4_t p01 = vfmaq_laneq_f32 (d->c0, f, lane_consts, 0); + float32x4_t p23 = vfmaq_laneq_f32 (d->c2, f, lane_consts, 1); + float32x4_t p = vfmaq_f32 (p01, f2, p23); + p = vfmaq_laneq_f32 (p, f4, lane_consts, 3); + p = vfmaq_f32 (f, f2, p); + + /* t = 2^i. */ + int32x4_t u = vaddq_s32 (vshlq_n_s32 (i, 23), d->exponent_bias); + float32x4_t t = vreinterpretq_f32_s32 (u); + /* expm1(x) ~= p * t + (t - 1). */ + return vfmaq_f32 (vsubq_f32 (t, v_f32 (1.0f)), p, t); +} + +#endif // MATH_V_EXPM1F_INLINE_H diff --git a/math/aarch64/advsimd/v_log1p_inline.h b/math/aarch64/advsimd/v_log1p_inline.h new file mode 100644 index 00000000000000..ef906ae4b6033c --- /dev/null +++ b/math/aarch64/advsimd/v_log1p_inline.h @@ -0,0 +1,119 @@ +/* + * Helper for vector double-precision routines which calculate log(1 + x) and + * do not need special-case handling + * + * Copyright (c) 2022-2024, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#ifndef MATH_V_LOG1P_INLINE_H +#define MATH_V_LOG1P_INLINE_H + +#include "v_math.h" + +struct v_log1p_data +{ + float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16; + uint64x2_t hf_rt2_top, one_m_hf_rt2_top, umask; + int64x2_t one_top; + double c1, c3, c5, c7, c9, c11, c13, c15, c17, c18; + double ln2[2]; +}; + +/* Coefficients generated using Remez, deg=20, in [sqrt(2)/2-1, sqrt(2)-1]. */ +#define V_LOG1P_CONSTANTS_TABLE \ + { \ + .c0 = V2 (-0x1.ffffffffffffbp-2), .c1 = 0x1.55555555551a9p-2, \ + .c2 = V2 (-0x1.00000000008e3p-2), .c3 = 0x1.9999999a32797p-3, \ + .c4 = V2 (-0x1.555555552fecfp-3), .c5 = 0x1.249248e071e5ap-3, \ + .c6 = V2 (-0x1.ffffff8bf8482p-4), .c7 = 0x1.c71c8f07da57ap-4, \ + .c8 = V2 (-0x1.9999ca4ccb617p-4), .c9 = 0x1.7459ad2e1dfa3p-4, \ + .c10 = V2 (-0x1.554d2680a3ff2p-4), .c11 = 0x1.3b4c54d487455p-4, \ + .c12 = V2 (-0x1.2548a9ffe80e6p-4), .c13 = 0x1.0f389a24b2e07p-4, \ + .c14 = V2 (-0x1.eee4db15db335p-5), .c15 = 0x1.e95b494d4a5ddp-5, \ + .c16 = V2 (-0x1.15fdf07cb7c73p-4), .c17 = 0x1.0310b70800fcfp-4, \ + .c18 = -0x1.cfa7385bdb37ep-6, \ + .ln2 = { 0x1.62e42fefa3800p-1, 0x1.ef35793c76730p-45 }, \ + .hf_rt2_top = V2 (0x3fe6a09e00000000), \ + .one_m_hf_rt2_top = V2 (0x00095f6200000000), \ + .umask = V2 (0x000fffff00000000), .one_top = V2 (0x3ff) \ + } + +#define BottomMask v_u64 (0xffffffff) + +static inline float64x2_t +eval_poly (float64x2_t m, float64x2_t m2, const struct v_log1p_data *d) +{ + /* Approximate log(1+m) on [-0.25, 0.5] using pairwise Horner. 
*/ + float64x2_t c13 = vld1q_f64 (&d->c1); + float64x2_t c57 = vld1q_f64 (&d->c5); + float64x2_t c911 = vld1q_f64 (&d->c9); + float64x2_t c1315 = vld1q_f64 (&d->c13); + float64x2_t c1718 = vld1q_f64 (&d->c17); + float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, m, c1718, 0); + float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, m, c1315, 1); + float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, m, c1315, 0); + float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, m, c911, 1); + float64x2_t p89 = vfmaq_laneq_f64 (d->c8, m, c911, 0); + float64x2_t p67 = vfmaq_laneq_f64 (d->c6, m, c57, 1); + float64x2_t p45 = vfmaq_laneq_f64 (d->c4, m, c57, 0); + float64x2_t p23 = vfmaq_laneq_f64 (d->c2, m, c13, 1); + float64x2_t p01 = vfmaq_laneq_f64 (d->c0, m, c13, 0); + float64x2_t p = vfmaq_laneq_f64 (p1617, m2, c1718, 1); + p = vfmaq_f64 (p1415, m2, p); + p = vfmaq_f64 (p1213, m2, p); + p = vfmaq_f64 (p1011, m2, p); + p = vfmaq_f64 (p89, m2, p); + p = vfmaq_f64 (p67, m2, p); + p = vfmaq_f64 (p45, m2, p); + p = vfmaq_f64 (p23, m2, p); + return vfmaq_f64 (p01, m2, p); +} + +static inline float64x2_t +log1p_inline (float64x2_t x, const struct v_log1p_data *d) +{ + /* Helper for calculating log(x + 1): + - No special-case handling - this should be dealt with by the caller. + - Optionally simulate the shortcut for k=0, used in the scalar routine, + using v_sel, for improved accuracy when the argument to log1p is close + to 0. This feature is enabled by defining WANT_V_LOG1P_K0_SHORTCUT as 1 + in the source of the caller before including this file. */ + float64x2_t m = vaddq_f64 (x, v_f64 (1.0)); + uint64x2_t mi = vreinterpretq_u64_f64 (m); + uint64x2_t u = vaddq_u64 (mi, d->one_m_hf_rt2_top); + + int64x2_t ki + = vsubq_s64 (vreinterpretq_s64_u64 (vshrq_n_u64 (u, 52)), d->one_top); + float64x2_t k = vcvtq_f64_s64 (ki); + + /* Reduce x to f in [sqrt(2)/2, sqrt(2)]. 
*/ + uint64x2_t utop = vaddq_u64 (vandq_u64 (u, d->umask), d->hf_rt2_top); + uint64x2_t u_red = vorrq_u64 (utop, vandq_u64 (mi, BottomMask)); + float64x2_t f = vsubq_f64 (vreinterpretq_f64_u64 (u_red), v_f64 (1.0)); + + /* Correction term c/m. */ + float64x2_t cm = vdivq_f64 (vsubq_f64 (x, vsubq_f64 (m, v_f64 (1.0))), m); + +#ifndef WANT_V_LOG1P_K0_SHORTCUT +# error \ + "Cannot use v_log1p_inline.h without specifying whether you need the k0 shortcut for greater accuracy close to 0" +#elif WANT_V_LOG1P_K0_SHORTCUT + /* Shortcut if k is 0 - set correction term to 0 and f to x. The result is + that the approximation is solely the polynomial. */ + uint64x2_t k0 = vceqzq_f64 (k); + cm = v_zerofy_f64 (cm, k0); + f = vbslq_f64 (k0, x, f); +#endif + + /* Approximate log1p(f) on the reduced input using a polynomial. */ + float64x2_t f2 = vmulq_f64 (f, f); + float64x2_t p = eval_poly (f, f2, d); + + /* Assemble log1p(x) = k * log2 + log1p(f) + c/m. */ + float64x2_t ln2 = vld1q_f64 (&d->ln2[0]); + float64x2_t ylo = vfmaq_laneq_f64 (cm, k, ln2, 1); + float64x2_t yhi = vfmaq_laneq_f64 (f, k, ln2, 0); + return vfmaq_f64 (vaddq_f64 (ylo, yhi), f2, p); +} + +#endif // MATH_V_LOG1P_INLINE_H diff --git a/math/aarch64/advsimd/v_log1pf_inline.h b/math/aarch64/advsimd/v_log1pf_inline.h new file mode 100644 index 00000000000000..e81fa24486aeda --- /dev/null +++ b/math/aarch64/advsimd/v_log1pf_inline.h @@ -0,0 +1,94 @@ +/* + * Helper for single-precision routines which calculate log(1 + x) and do not + * need special-case handling + * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef MATH_V_LOG1PF_INLINE_H +#define MATH_V_LOG1PF_INLINE_H + +#include "v_math.h" +#include "v_poly_f32.h" + +struct v_log1pf_data +{ + uint32x4_t four; + int32x4_t three_quarters; + float c0, c3, c5, c7; + float32x4_t c4, c6, c1, c2, ln2; +}; + +/* Polynomial generated using FPMinimax in [-0.25, 0.5]. 
First two coefficients + (1, -0.5) are not stored as they can be generated more efficiently. */ +#define V_LOG1PF_CONSTANTS_TABLE \ + { \ + .c0 = 0x1.5555aap-2f, .c1 = V4 (-0x1.000038p-2f), \ + .c2 = V4 (0x1.99675cp-3f), .c3 = -0x1.54ef78p-3f, \ + .c4 = V4 (0x1.28a1f4p-3f), .c5 = -0x1.0da91p-3f, \ + .c6 = V4 (0x1.abcb6p-4f), .c7 = -0x1.6f0d5ep-5f, \ + .ln2 = V4 (0x1.62e43p-1f), .four = V4 (0x40800000), \ + .three_quarters = V4 (0x3f400000) \ + } + +static inline float32x4_t +eval_poly (float32x4_t m, const struct v_log1pf_data *d) +{ + /* Approximate log(1+m) on [-0.25, 0.5] using pairwise Horner. */ + float32x4_t c0357 = vld1q_f32 (&d->c0); + float32x4_t q = vfmaq_laneq_f32 (v_f32 (-0.5), m, c0357, 0); + float32x4_t m2 = vmulq_f32 (m, m); + float32x4_t p67 = vfmaq_laneq_f32 (d->c6, m, c0357, 3); + float32x4_t p45 = vfmaq_laneq_f32 (d->c4, m, c0357, 2); + float32x4_t p23 = vfmaq_laneq_f32 (d->c2, m, c0357, 1); + float32x4_t p = vfmaq_f32 (p45, m2, p67); + p = vfmaq_f32 (p23, m2, p); + p = vfmaq_f32 (d->c1, m, p); + p = vmulq_f32 (m2, p); + p = vfmaq_f32 (m, m2, p); + return vfmaq_f32 (p, m2, q); +} + +static inline float32x4_t +log1pf_inline (float32x4_t x, const struct v_log1pf_data *d) +{ + /* Helper for calculating log(x + 1). */ + + /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m + is in [-0.25, 0.5]): + log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2). + + We approximate log1p(m) with a polynomial, then scale by + k*log(2). Instead of doing this directly, we use an intermediate + scale factor s = 4*k*log(2) to ensure the scale is representable + as a normalised fp32 number. */ + float32x4_t m = vaddq_f32 (x, v_f32 (1.0f)); + + /* Choose k to scale x to the range [-1/4, 1/2]. 
*/ + int32x4_t k + = vandq_s32 (vsubq_s32 (vreinterpretq_s32_f32 (m), d->three_quarters), + v_s32 (0xff800000)); + uint32x4_t ku = vreinterpretq_u32_s32 (k); + + /* Scale up to ensure that the scale factor is representable as normalised + fp32 number, and scale m down accordingly. */ + float32x4_t s = vreinterpretq_f32_u32 (vsubq_u32 (d->four, ku)); + + /* Scale x by exponent manipulation. */ + float32x4_t m_scale + = vreinterpretq_f32_u32 (vsubq_u32 (vreinterpretq_u32_f32 (x), ku)); + m_scale = vaddq_f32 (m_scale, vfmaq_f32 (v_f32 (-1.0f), v_f32 (0.25f), s)); + + /* Evaluate polynomial on the reduced interval. */ + float32x4_t p = eval_poly (m_scale, d); + + /* The scale factor to be applied back at the end - by multiplying float(k) + by 2^-23 we get the unbiased exponent of k. */ + float32x4_t scale_back = vmulq_f32 (vcvtq_f32_s32 (k), v_f32 (0x1.0p-23f)); + + /* Apply the scaling back. */ + return vfmaq_f32 (p, scale_back, d->ln2); +} + +#endif // MATH_V_LOG1PF_INLINE_H diff --git a/pl/math/v_log_inline.h b/math/aarch64/advsimd/v_log_inline.h similarity index 94% rename from pl/math/v_log_inline.h rename to math/aarch64/advsimd/v_log_inline.h index 2df00cf4ddf4c2..770f9e81c19532 100644 --- a/pl/math/v_log_inline.h +++ b/math/aarch64/advsimd/v_log_inline.h @@ -1,7 +1,7 @@ /* * Double-precision vector log(x) function - inline version * - * Copyright (c) 2019-2023, Arm Limited. + * Copyright (c) 2019-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ @@ -57,8 +57,8 @@ log_lookup (uint64x2_t i) { /* Since N is a power of 2, n % N = n & (N - 1). 
*/ struct entry e; - uint64_t i0 = (i[0] >> (52 - V_LOG_TABLE_BITS)) & IndexMask; - uint64_t i1 = (i[1] >> (52 - V_LOG_TABLE_BITS)) & IndexMask; + uint64_t i0 = (vgetq_lane_u64 (i, 0) >> (52 - V_LOG_TABLE_BITS)) & IndexMask; + uint64_t i1 = (vgetq_lane_u64 (i, 1) >> (52 - V_LOG_TABLE_BITS)) & IndexMask; float64x2_t e0 = vld1q_f64 (&__v_log_data.table[i0].invc); float64x2_t e1 = vld1q_f64 (&__v_log_data.table[i1].invc); e.invc = vuzp1q_f64 (e0, e1); diff --git a/pl/math/v_math.h b/math/aarch64/advsimd/v_math.h similarity index 58% rename from pl/math/v_math.h rename to math/aarch64/advsimd/v_math.h index 1b10929facccfb..75cd71cc87a77c 100644 --- a/pl/math/v_math.h +++ b/math/aarch64/advsimd/v_math.h @@ -1,36 +1,63 @@ /* * Vector math abstractions. * - * Copyright (c) 2019-2023, Arm Limited. + * Copyright (c) 2019-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef _V_MATH_H #define _V_MATH_H -#ifndef WANT_VMATH -/* Enable the build of vector math code. 
*/ -# define WANT_VMATH 1 +#if !__aarch64__ +# error "Cannot build without AArch64" #endif -#if WANT_VMATH - -# if __aarch64__ -# define VPCS_ATTR __attribute__ ((aarch64_vector_pcs)) -# else -# error "Cannot build without AArch64" -# endif - -# include -# include "math_config.h" -# if __aarch64__ +#define VPCS_ATTR __attribute__ ((aarch64_vector_pcs)) + +#define V_NAME_F1(fun) _ZGVnN4v_##fun##f +#define V_NAME_D1(fun) _ZGVnN2v_##fun +#define V_NAME_F2(fun) _ZGVnN4vv_##fun##f +#define V_NAME_D2(fun) _ZGVnN2vv_##fun +#define V_NAME_F1_L1(fun) _ZGVnN4vl4_##fun##f +#define V_NAME_D1_L1(fun) _ZGVnN2vl8_##fun + +#if USE_GLIBC_ABI + +# define HALF_WIDTH_ALIAS_F1(fun) \ + float32x2_t VPCS_ATTR _ZGVnN2v_##fun##f (float32x2_t x) \ + { \ + return vget_low_f32 (_ZGVnN4v_##fun##f (vcombine_f32 (x, x))); \ + } + +# define HALF_WIDTH_ALIAS_F2(fun) \ + float32x2_t VPCS_ATTR _ZGVnN2vv_##fun##f (float32x2_t x, float32x2_t y) \ + { \ + return vget_low_f32 ( \ + _ZGVnN4vv_##fun##f (vcombine_f32 (x, x), vcombine_f32 (y, y))); \ + } + +#else +# define HALF_WIDTH_ALIAS_F1(fun) +# define HALF_WIDTH_ALIAS_F2(fun) +#endif -# include +#include +#include "math_config.h" +#include /* Shorthand helpers for declaring constants. */ -# define V2(X) { X, X } -# define V4(X) { X, X, X, X } -# define V8(X) { X, X, X, X, X, X, X, X } +#define V2(X) \ + { \ + X, X \ + } +#define V4(X) \ + { \ + X, X, X, X \ + } +#define V8(X) \ + { \ + X, X, X, X, X, X, X, X \ + } static inline int v_any_u16h (uint16x4_t x) @@ -38,6 +65,12 @@ v_any_u16h (uint16x4_t x) return vget_lane_u64 (vreinterpret_u64_u16 (x), 0) != 0; } +static inline int +v_lanes32 (void) +{ + return 4; +} + static inline float32x4_t v_f32 (float x) { @@ -54,7 +87,7 @@ v_s32 (int32_t x) return (int32x4_t) V4 (x); } -/* true if any elements of a vector compare result is non-zero. */ +/* true if any elements of a v_cond result is non-zero. 
*/ static inline int v_any_u32 (uint32x4_t x) { @@ -97,6 +130,11 @@ v_zerofy_f32 (float32x4_t x, uint32x4_t mask) return vreinterpretq_f32_u32 (vbicq_u32 (vreinterpretq_u32_f32 (x), mask)); } +static inline int +v_lanes64 (void) +{ + return 2; +} static inline float64x2_t v_f64 (double x) { @@ -113,20 +151,13 @@ v_s64 (int64_t x) return (int64x2_t) V2 (x); } -/* true if any elements of a vector compare result is non-zero. */ +/* true if any elements of a v_cond result is non-zero. */ static inline int v_any_u64 (uint64x2_t x) { /* assume elements in x are either 0 or -1u. */ return vpaddd_u64 (x) != 0; } -/* true if all elements of a vector compare result is 1. */ -static inline int -v_all_u64 (uint64x2_t x) -{ - /* assume elements in x are either 0 or -1u. */ - return vpaddd_s64 (vreinterpretq_s64_u64 (x)) == -2; -} static inline float64x2_t v_lookup_f64 (const double *tab, uint64x2_t idx) { @@ -137,7 +168,6 @@ v_lookup_u64 (const uint64_t *tab, uint64x2_t idx) { return (uint64x2_t){ tab[idx[0]], tab[idx[1]] }; } - static inline float64x2_t v_call_f64 (double (*f) (double), float64x2_t x, float64x2_t y, uint64x2_t p) { @@ -169,7 +199,4 @@ v_zerofy_f64 (float64x2_t x, uint64x2_t mask) return vreinterpretq_f64_u64 (vbicq_u64 (vreinterpretq_u64_f64 (x), mask)); } -# endif -#endif - #endif diff --git a/pl/math/poly_advsimd_f32.h b/math/aarch64/advsimd/v_poly_f32.h similarity index 81% rename from pl/math/poly_advsimd_f32.h rename to math/aarch64/advsimd/v_poly_f32.h index 438e153dff90c6..9a9c5c1ac15b34 100644 --- a/pl/math/poly_advsimd_f32.h +++ b/math/aarch64/advsimd/v_poly_f32.h @@ -2,12 +2,12 @@ * Helpers for evaluating polynomials on single-precision AdvSIMD input, using * various schemes. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#ifndef PL_MATH_POLY_ADVSIMD_F32_H -#define PL_MATH_POLY_ADVSIMD_F32_H +#ifndef MATH_POLY_ADVSIMD_F32_H +#define MATH_POLY_ADVSIMD_F32_H #include diff --git a/pl/math/poly_advsimd_f64.h b/math/aarch64/advsimd/v_poly_f64.h similarity index 81% rename from pl/math/poly_advsimd_f64.h rename to math/aarch64/advsimd/v_poly_f64.h index 7ea249a9122554..4331bfbd03b0c1 100644 --- a/pl/math/poly_advsimd_f64.h +++ b/math/aarch64/advsimd/v_poly_f64.h @@ -2,12 +2,12 @@ * Helpers for evaluating polynomials on double-precision AdvSIMD input, using * various schemes. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#ifndef PL_MATH_POLY_ADVSIMD_F64_H -#define PL_MATH_POLY_ADVSIMD_F64_H +#ifndef MATH_POLY_ADVSIMD_F64_H +#define MATH_POLY_ADVSIMD_F64_H #include diff --git a/pl/math/v_sincos_common.h b/math/aarch64/advsimd/v_sincos_common.h similarity index 97% rename from pl/math/v_sincos_common.h rename to math/aarch64/advsimd/v_sincos_common.h index ee7937e0785ae6..14227d9339a881 100644 --- a/pl/math/v_sincos_common.h +++ b/math/aarch64/advsimd/v_sincos_common.h @@ -1,12 +1,12 @@ /* * Core approximation for double-precision vector sincos * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_math.h" -#include "poly_advsimd_f64.h" +#include "v_poly_f64.h" static const struct v_sincos_data { diff --git a/pl/math/v_sincosf_common.h b/math/aarch64/advsimd/v_sincosf_common.h similarity index 98% rename from pl/math/v_sincosf_common.h rename to math/aarch64/advsimd/v_sincosf_common.h index 8239bd9f01763e..7c29eded14d68e 100644 --- a/pl/math/v_sincosf_common.h +++ b/math/aarch64/advsimd/v_sincosf_common.h @@ -1,7 +1,7 @@ /* * Core approximation for single-precision vector sincos * - * Copyright (c) 2023, Arm Limited. 
+ * Copyright (c) 2023-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/math/aarch64/advsimd/v_sincospi_common.h b/math/aarch64/advsimd/v_sincospi_common.h new file mode 100644 index 00000000000000..438b141b9174de --- /dev/null +++ b/math/aarch64/advsimd/v_sincospi_common.h @@ -0,0 +1,64 @@ +/* + * Helper for Double-precision vector sincospi function. + * + * Copyright (c) 2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "v_math.h" +#include "v_poly_f64.h" + +static const struct v_sincospi_data +{ + float64x2_t poly[10], range_val; +} v_sincospi_data = { + /* Polynomial coefficients generated using Remez algorithm, + see sinpi.sollya for details. */ + .poly = { V2 (0x1.921fb54442d184p1), V2 (-0x1.4abbce625be53p2), + V2 (0x1.466bc6775ab16p1), V2 (-0x1.32d2cce62dc33p-1), + V2 (0x1.507834891188ep-4), V2 (-0x1.e30750a28c88ep-8), + V2 (0x1.e8f48308acda4p-12), V2 (-0x1.6fc0032b3c29fp-16), + V2 (0x1.af86ae521260bp-21), V2 (-0x1.012a9870eeb7dp-25) }, + .range_val = V2 (0x1p63), +}; + +/* Double-precision vector function allowing calculation of both sin and cos in + one function call, using separate argument reduction and shared low-order + polynomials. + Approximation for vector double-precision sincospi(x). + Maximum Error 3.09 ULP: + _ZGVnN2v_sincospi_sin(0x1.7a41deb4b21e1p+14) got 0x1.fd54d0b327cf1p-1 + want 0x1.fd54d0b327cf4p-1 + Maximum Error 3.16 ULP: + _ZGVnN2v_sincospi_cos(-0x1.11e3c7e284adep-5) got 0x1.fd2da484ff3ffp-1 + want 0x1.fd2da484ff402p-1. */ +static inline float64x2x2_t +v_sincospi_inline (float64x2_t x, const struct v_sincospi_data *d) +{ + /* If r is odd, the sign of the result should be inverted for sinpi + and reintroduced for cospi. */ + uint64x2_t cmp = vcgeq_f64 (x, d->range_val); + uint64x2_t odd = vshlq_n_u64 ( + vbicq_u64 (vreinterpretq_u64_s64 (vcvtaq_s64_f64 (x)), cmp), 63); + + /* r = x - rint(x). 
*/ + float64x2_t sr = vsubq_f64 (x, vrndaq_f64 (x)); + /* cospi(x) = sinpi(0.5 - abs(x)) for values -1/2 .. 1/2. */ + float64x2_t cr = vsubq_f64 (v_f64 (0.5), vabsq_f64 (sr)); + + /* Pairwise Horner approximation for y = sin(r * pi). */ + float64x2_t sr2 = vmulq_f64 (sr, sr); + float64x2_t sr4 = vmulq_f64 (sr2, sr2); + float64x2_t cr2 = vmulq_f64 (cr, cr); + float64x2_t cr4 = vmulq_f64 (cr2, cr2); + + float64x2_t ss = vmulq_f64 (v_pw_horner_9_f64 (sr2, sr4, d->poly), sr); + float64x2_t cc = vmulq_f64 (v_pw_horner_9_f64 (cr2, cr4, d->poly), cr); + + float64x2_t sinpix + = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (ss), odd)); + + float64x2_t cospix + = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (cc), odd)); + + return (float64x2x2_t){ sinpix, cospix }; +} diff --git a/math/aarch64/advsimd/v_sincospif_common.h b/math/aarch64/advsimd/v_sincospif_common.h new file mode 100644 index 00000000000000..8d4177dd871eb3 --- /dev/null +++ b/math/aarch64/advsimd/v_sincospif_common.h @@ -0,0 +1,57 @@ +/* + * Helper for Single-precision vector sincospi function. + * + * Copyright (c) 2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "mathlib.h" +#include "v_math.h" +#include "v_poly_f32.h" + +const static struct v_sincospif_data +{ + float32x4_t poly[6], range_val; +} v_sincospif_data = { + /* Taylor series coefficents for sin(pi * x). */ + .poly = { V4 (0x1.921fb6p1f), V4 (-0x1.4abbcep2f), V4 (0x1.466bc6p1f), + V4 (-0x1.32d2ccp-1f), V4 (0x1.50783p-4f), V4 (-0x1.e30750p-8f) }, + .range_val = V4 (0x1p31f), +}; + +/* Single-precision vector function allowing calculation of both sinpi and + cospi in one function call, using shared argument reduction and polynomials. + Worst-case error for sin is 3.04 ULP: + _ZGVnN4v_sincospif_sin(0x1.1d341ap-1) got 0x1.f7cd56p-1 want 0x1.f7cd5p-1. + Worst-case error for cos is 3.18 ULP: + _ZGVnN4v_sincospif_cos(0x1.d341a8p-5) got 0x1.f7cd56p-1 want 0x1.f7cd5p-1. 
+ */ +static inline float32x4x2_t +v_sincospif_inline (float32x4_t x, const struct v_sincospif_data *d) +{ + /* If r is odd, the sign of the result should be inverted for sinpi and + reintroduced for cospi. */ + uint32x4_t cmp = vcgeq_f32 (x, d->range_val); + uint32x4_t odd = vshlq_n_u32 ( + vbicq_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), cmp), 31); + + /* r = x - rint(x). */ + float32x4_t sr = vsubq_f32 (x, vrndaq_f32 (x)); + /* cospi(x) = sinpi(0.5 - abs(x)) for values -1/2 .. 1/2. */ + float32x4_t cr = vsubq_f32 (v_f32 (0.5f), vabsq_f32 (sr)); + + /* Pairwise Horner approximation for y = sin(r * pi). */ + float32x4_t sr2 = vmulq_f32 (sr, sr); + float32x4_t sr4 = vmulq_f32 (sr2, sr2); + float32x4_t cr2 = vmulq_f32 (cr, cr); + float32x4_t cr4 = vmulq_f32 (cr2, cr2); + + float32x4_t ss = vmulq_f32 (v_pw_horner_5_f32 (sr2, sr4, d->poly), sr); + float32x4_t cc = vmulq_f32 (v_pw_horner_5_f32 (cr2, cr4, d->poly), cr); + + float32x4_t sinpix + = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (ss), odd)); + float32x4_t cospix + = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (cc), odd)); + + return (float32x4x2_t){ sinpix, cospix }; +} diff --git a/pl/math/cospi_3u1.c b/math/aarch64/cospi_3u5.c similarity index 82% rename from pl/math/cospi_3u1.c rename to math/aarch64/cospi_3u5.c index 4a688a07682970..4131f6c816a199 100644 --- a/pl/math/cospi_3u1.c +++ b/math/aarch64/cospi_3u5.c @@ -1,14 +1,14 @@ /* * Double-precision scalar cospi function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" #include "math_config.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" #include "poly_scalar_f64.h" /* Taylor series coefficents for sin(pi * x). @@ -29,9 +29,9 @@ static const double poly[] cospi(0x1.160b129300112p-21) got 0x1.fffffffffd16bp-1 want 0x1.fffffffffd16ep-1. 
*/ double -cospi (double x) +arm_math_cospi (double x) { - if (isinf (x)) + if (isinf (x) || isnan (x)) return __math_invalid (x); double ax = asdouble (asuint64 (x) & ~0x8000000000000000); @@ -81,9 +81,18 @@ cospi (double x) return asdouble (asuint64 (y) ^ sign); } -PL_SIG (S, D, 1, cospi, -0.9, 0.9) -PL_TEST_ULP (cospi, 2.63) -PL_TEST_SYM_INTERVAL (cospi, 0, 0x1p-63, 5000) -PL_TEST_SYM_INTERVAL (cospi, 0x1p-63, 0.5, 10000) -PL_TEST_SYM_INTERVAL (cospi, 0.5, 0x1p51f, 10000) -PL_TEST_SYM_INTERVAL (cospi, 0x1p51f, inf, 10000) +#if WANT_EXPERIMENTAL_MATH +double +cospi (double x) +{ + return arm_math_cospi (x); +} +#endif + +#if WANT_TRIGPI_TESTS +TEST_ULP (arm_math_cospi, 2.63) +TEST_SYM_INTERVAL (arm_math_cospi, 0, 0x1p-63, 5000) +TEST_SYM_INTERVAL (arm_math_cospi, 0x1p-63, 0.5, 10000) +TEST_SYM_INTERVAL (arm_math_cospi, 0.5, 0x1p51f, 10000) +TEST_SYM_INTERVAL (arm_math_cospi, 0x1p51f, inf, 10000) +#endif diff --git a/pl/math/cospif_2u6.c b/math/aarch64/cospif_2u6.c similarity index 79% rename from pl/math/cospif_2u6.c rename to math/aarch64/cospif_2u6.c index d78a98ed8b2d21..eb5b75402a63c8 100644 --- a/pl/math/cospif_2u6.c +++ b/math/aarch64/cospif_2u6.c @@ -1,14 +1,14 @@ /* * Single-precision scalar cospi function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" #include "math_config.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" /* Taylor series coefficents for sin(pi * x). */ #define C0 0x1.921fb6p1f @@ -25,9 +25,9 @@ cospif(0x1.37e844p-4) got 0x1.f16b3p-1 want 0x1.f16b2ap-1. 
*/ float -cospif (float x) +arm_math_cospif (float x) { - if (isinf (x)) + if (isinf (x) || isnan (x)) return __math_invalidf (x); float ax = asfloat (asuint (x) & ~0x80000000); @@ -76,9 +76,18 @@ cospif (float x) return asfloat (asuint (y * r) ^ sign); } -PL_SIG (S, F, 1, cospi, -0.9, 0.9) -PL_TEST_ULP (cospif, 2.15) -PL_TEST_SYM_INTERVAL (cospif, 0, 0x1p-31, 5000) -PL_TEST_SYM_INTERVAL (cospif, 0x1p-31, 0.5, 10000) -PL_TEST_SYM_INTERVAL (cospif, 0.5, 0x1p22f, 10000) -PL_TEST_SYM_INTERVAL (cospif, 0x1p22f, inf, 10000) +#if WANT_EXPERIMENTAL_MATH +float +cospif (float x) +{ + return arm_math_cospif (x); +} +#endif + +#if WANT_TRIGPI_TESTS +TEST_ULP (arm_math_cospif, 2.15) +TEST_SYM_INTERVAL (arm_math_cospif, 0, 0x1p-31, 5000) +TEST_SYM_INTERVAL (arm_math_cospif, 0x1p-31, 0.5, 10000) +TEST_SYM_INTERVAL (arm_math_cospif, 0.5, 0x1p22f, 10000) +TEST_SYM_INTERVAL (arm_math_cospif, 0x1p22f, inf, 10000) +#endif diff --git a/pl/README.contributors b/math/aarch64/experimental/README.contributors similarity index 71% rename from pl/README.contributors rename to math/aarch64/experimental/README.contributors index 3af9b1fc7741d9..abb749485ba3fe 100644 --- a/pl/README.contributors +++ b/math/aarch64/experimental/README.contributors @@ -5,7 +5,6 @@ glibc-specific conventions need not be followed. The requirements for portable code apply to non-portable code with the following differences: - 1. Worst-case ULP error should be encoded in filenames (e.g. sin_u35.c). There are no specific restrictions on acceptable ULP error, but if functions provide significantly less accuracy than portable equivalents then a clear @@ -15,9 +14,3 @@ following differences: 2. Functions are assumed to support round-to-nearest mode by default, unless stated; other rounding modes are not required to be provided. - -3. Handling of special cases may be relaxed for vector functions. 
Checking - whether each vector lane contains special values such as NaN, Inf or - denormal numbers can prove too costly for vector functions. This is often - not required since vector functions are typically used along with aggressive - compiler optimization flags. diff --git a/pl/math/acos_2u.c b/math/aarch64/experimental/acos_2u.c similarity index 76% rename from pl/math/acos_2u.c rename to math/aarch64/experimental/acos_2u.c index 9ec6894f1d8100..062215c92248ff 100644 --- a/pl/math/acos_2u.c +++ b/math/aarch64/experimental/acos_2u.c @@ -1,23 +1,23 @@ /* * Double-precision acos(x) function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" #include "poly_scalar_f64.h" -#include "pl_sig.h" -#include "pl_test.h" - -#define AbsMask (0x7fffffffffffffff) -#define Half (0x3fe0000000000000) -#define One (0x3ff0000000000000) -#define PiOver2 (0x1.921fb54442d18p+0) -#define Pi (0x1.921fb54442d18p+1) -#define Small (0x3c90000000000000) /* 2^-53. */ -#define Small16 (0x3c90) -#define QNaN (0x7ff8) +#include "test_sig.h" +#include "test_defs.h" + +#define AbsMask 0x7fffffffffffffff +#define Half 0x3fe0000000000000 +#define One 0x3ff0000000000000 +#define PiOver2 0x1.921fb54442d18p+0 +#define Pi 0x1.921fb54442d18p+1 +#define Small 0x3c90000000000000 /* 2^-53. */ +#define Small16 0x3c90 +#define QNaN 0x7ff8 /* Fast implementation of double-precision acos(x) based on polynomial approximation of double-precision asin(x). @@ -29,8 +29,8 @@ acos(x) = pi/2 - asin(x) - and use an order 11 polynomial P such that the final approximation of asin is - an odd polynomial: asin(x) ~ x + x^3 * P(x^2). + and use an order 11 polynomial P such that the final approximation of asin + is an odd polynomial: asin(x) ~ x + x^3 * P(x^2). 
The largest observed error in this region is 1.18 ulps, acos(0x1.fbab0a7c460f6p-2) got 0x1.0d54d1985c068p+0 @@ -90,11 +90,11 @@ acos (double x) return (x <= -0.5) ? fma (-2.0, p, Pi) : 2.0 * p; } -PL_SIG (S, D, 1, acos, -1.0, 1.0) -PL_TEST_ULP (acos, 1.02) -PL_TEST_INTERVAL (acos, 0, Small, 5000) -PL_TEST_INTERVAL (acos, Small, 0.5, 50000) -PL_TEST_INTERVAL (acos, 0.5, 1.0, 50000) -PL_TEST_INTERVAL (acos, 1.0, 0x1p11, 50000) -PL_TEST_INTERVAL (acos, 0x1p11, inf, 20000) -PL_TEST_INTERVAL (acos, -0, -inf, 20000) +TEST_SIG (S, D, 1, acos, -1.0, 1.0) +TEST_ULP (acos, 1.02) +TEST_INTERVAL (acos, 0, Small, 5000) +TEST_INTERVAL (acos, Small, 0.5, 50000) +TEST_INTERVAL (acos, 0.5, 1.0, 50000) +TEST_INTERVAL (acos, 1.0, 0x1p11, 50000) +TEST_INTERVAL (acos, 0x1p11, inf, 20000) +TEST_INTERVAL (acos, -0, -inf, 20000) diff --git a/pl/math/acosf_1u4.c b/math/aarch64/experimental/acosf_1u4.c similarity index 79% rename from pl/math/acosf_1u4.c rename to math/aarch64/experimental/acosf_1u4.c index 6dde422ef85a89..d207f5e89f26d4 100644 --- a/pl/math/acosf_1u4.c +++ b/math/aarch64/experimental/acosf_1u4.c @@ -1,23 +1,23 @@ /* * Single-precision acos(x) function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "poly_scalar_f32.h" #include "math_config.h" -#include "pl_sig.h" -#include "pl_test.h" - -#define AbsMask (0x7fffffff) -#define Half (0x3f000000) -#define One (0x3f800000) -#define PiOver2f (0x1.921fb6p+0f) -#define Pif (0x1.921fb6p+1f) -#define Small (0x32800000) /* 2^-26. */ -#define Small12 (0x328) -#define QNaN (0x7fc) +#include "test_sig.h" +#include "test_defs.h" + +#define AbsMask 0x7fffffff +#define Half 0x3f000000 +#define One 0x3f800000 +#define PiOver2f 0x1.921fb6p+0f +#define Pif 0x1.921fb6p+1f +#define Small 0x32800000 /* 2^-26. 
*/ +#define Small12 0x328 +#define QNaN 0x7fc /* Fast implementation of single-precision acos(x) based on polynomial approximation of single-precision asin(x). @@ -89,11 +89,11 @@ acosf (float x) return (x <= -0.5) ? fmaf (-2.0f, p, Pif) : 2.0f * p; } -PL_SIG (S, F, 1, acos, -1.0, 1.0) -PL_TEST_ULP (acosf, 0.82) -PL_TEST_INTERVAL (acosf, 0, Small, 5000) -PL_TEST_INTERVAL (acosf, Small, 0.5, 50000) -PL_TEST_INTERVAL (acosf, 0.5, 1.0, 50000) -PL_TEST_INTERVAL (acosf, 1.0, 0x1p11, 50000) -PL_TEST_INTERVAL (acosf, 0x1p11, inf, 20000) -PL_TEST_INTERVAL (acosf, -0, -inf, 20000) +TEST_SIG (S, F, 1, acos, -1.0, 1.0) +TEST_ULP (acosf, 0.82) +TEST_INTERVAL (acosf, 0, Small, 5000) +TEST_INTERVAL (acosf, Small, 0.5, 50000) +TEST_INTERVAL (acosf, 0.5, 1.0, 50000) +TEST_INTERVAL (acosf, 1.0, 0x1p11, 50000) +TEST_INTERVAL (acosf, 0x1p11, inf, 20000) +TEST_INTERVAL (acosf, -0, -inf, 20000) diff --git a/pl/math/acosh_3u.c b/math/aarch64/experimental/acosh_3u.c similarity index 69% rename from pl/math/acosh_3u.c rename to math/aarch64/experimental/acosh_3u.c index 4e2cb6737ba863..19da82f4f3e5d9 100644 --- a/pl/math/acosh_3u.c +++ b/math/aarch64/experimental/acosh_3u.c @@ -1,31 +1,26 @@ /* * Double-precision acosh(x) function. * - * Copyright (c) 2022-2023, Arm Limited. + * Copyright (c) 2022-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ +#include "mathlib.h" #include "math_config.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" #define Ln2 (0x1.62e42fefa39efp-1) #define MinusZero (0x8000000000000000) #define SquareLim (0x5fe0000000000000) /* asuint64(0x1.0p511). */ #define Two (0x4000000000000000) /* asuint64(2.0). */ -double -optr_aor_log_f64 (double); - -double -log1p (double); - /* acosh approximation using a variety of approaches on different intervals: acosh(x) = ln(x + sqrt(x * x - 1)). - x >= 2^511: We cannot square x without overflow. 
For huge x, sqrt(x*x - 1) is - close enough to x that we can calculate the result by ln(2x) == ln(x) + + x >= 2^511: We cannot square x without overflow. For huge x, sqrt(x*x - 1) + is close enough to x that we can calculate the result by ln(2x) == ln(x) + ln(2). The greatest observed error in this region is 0.98 ULP: acosh(0x1.1b9bf42923d1dp+853) got 0x1.28066a11a7c7fp+9 want 0x1.28066a11a7c8p+9. @@ -48,19 +43,19 @@ acosh (double x) return __math_invalid (x); if (unlikely (ix >= SquareLim)) - return optr_aor_log_f64 (x) + Ln2; + return log (x) + Ln2; if (ix >= Two) - return optr_aor_log_f64 (x + sqrt (x * x - 1)); + return log (x + sqrt (x * x - 1)); double xm1 = x - 1; return log1p (xm1 + sqrt (2 * xm1 + xm1 * xm1)); } -PL_SIG (S, D, 1, acosh, 1.0, 10.0) -PL_TEST_ULP (acosh, 2.19) -PL_TEST_INTERVAL (acosh, 0, 1, 10000) -PL_TEST_INTERVAL (acosh, 1, 2, 100000) -PL_TEST_INTERVAL (acosh, 2, 0x1p511, 100000) -PL_TEST_INTERVAL (acosh, 0x1p511, inf, 100000) -PL_TEST_INTERVAL (acosh, -0, -inf, 10000) +TEST_SIG (S, D, 1, acosh, 1.0, 10.0) +TEST_ULP (acosh, 2.19) +TEST_INTERVAL (acosh, 0, 1, 10000) +TEST_INTERVAL (acosh, 1, 2, 100000) +TEST_INTERVAL (acosh, 2, 0x1p511, 100000) +TEST_INTERVAL (acosh, 0x1p511, inf, 100000) +TEST_INTERVAL (acosh, -0, -inf, 10000) diff --git a/pl/math/acoshf_2u8.c b/math/aarch64/experimental/acoshf_2u8.c similarity index 68% rename from pl/math/acoshf_2u8.c rename to math/aarch64/experimental/acoshf_2u8.c index c9cded7fd2ff7f..a46b310ee312ed 100644 --- a/pl/math/acoshf_2u8.c +++ b/math/aarch64/experimental/acoshf_2u8.c @@ -1,27 +1,19 @@ /* * Single-precision acosh(x) function. * - * Copyright (c) 2022-2023, Arm Limited. + * Copyright (c) 2022-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" #define Ln2 (0x1.62e4p-1f) #define MinusZero 0x80000000 #define SquareLim 0x5f800000 /* asuint(0x1p64). 
*/ #define Two 0x40000000 -/* Single-precision log from math/. */ -float -optr_aor_log_f32 (float); - -/* Single-precision log(1+x) from pl/math. */ -float -log1pf (float); - /* acoshf approximation using a variety of approaches on different intervals: x >= 2^64: We cannot square x without overflow. For huge x, sqrt(x*x - 1) is @@ -45,19 +37,19 @@ acoshf (float x) return __math_invalidf (x); if (unlikely (ix >= SquareLim)) - return optr_aor_log_f32 (x) + Ln2; + return logf (x) + Ln2; if (ix > Two) - return optr_aor_log_f32 (x + sqrtf (x * x - 1)); + return logf (x + sqrtf (x * x - 1)); float xm1 = x - 1; return log1pf (xm1 + sqrtf (2 * xm1 + xm1 * xm1)); } -PL_SIG (S, F, 1, acosh, 1.0, 10.0) -PL_TEST_ULP (acoshf, 2.30) -PL_TEST_INTERVAL (acoshf, 0, 1, 100) -PL_TEST_INTERVAL (acoshf, 1, 2, 10000) -PL_TEST_INTERVAL (acoshf, 2, 0x1p64, 100000) -PL_TEST_INTERVAL (acoshf, 0x1p64, inf, 100000) -PL_TEST_INTERVAL (acoshf, -0, -inf, 10000) +TEST_SIG (S, F, 1, acosh, 1.0, 10.0) +TEST_ULP (acoshf, 2.30) +TEST_INTERVAL (acoshf, 0, 1, 100) +TEST_INTERVAL (acoshf, 1, 2, 10000) +TEST_INTERVAL (acoshf, 2, 0x1p64, 100000) +TEST_INTERVAL (acoshf, 0x1p64, inf, 100000) +TEST_INTERVAL (acoshf, -0, -inf, 10000) diff --git a/pl/math/v_erfinv_25u.c b/math/aarch64/experimental/advsimd/erfinv_25u.c similarity index 88% rename from pl/math/v_erfinv_25u.c rename to math/aarch64/experimental/advsimd/erfinv_25u.c index 654a7336e85bc8..2fa2f0beb8b79b 100644 --- a/pl/math/v_erfinv_25u.c +++ b/math/aarch64/experimental/advsimd/erfinv_25u.c @@ -1,15 +1,15 @@ /* * Double-precision inverse error function (AdvSIMD variant). * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_math.h" -#include "pl_test.h" +#include "test_defs.h" #include "mathlib.h" #include "math_config.h" -#include "pl_sig.h" -#include "poly_advsimd_f64.h" +#include "test_sig.h" +#include "v_poly_f64.h" #define V_LOG_INLINE_POLY_ORDER 4 #include "v_log_inline.h" @@ -22,7 +22,7 @@ const static struct data can be taken. */ double P[8][2], Q[7][2]; float64x2_t tailshift; - uint8x16_t idx; + uint8_t idx[16]; struct v_log_inline_data log_tbl; float64x2_t P_57[9], Q_57[10], P_17[7], Q_17[6]; } data = { .P = { { 0x1.007ce8f01b2e8p+4, -0x1.f3596123109edp-7 }, @@ -58,7 +58,7 @@ const static struct data V2 (0x1.a450d8e7f4cbbp+7), V2 (-0x1.bc3480485857p+7), V2 (0x1.ae6b0c504ee02p+6), V2 (-0x1.499dfec1a7f5fp+4) }, .tailshift = V2 (-0.87890625), - .idx = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + .idx = { 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 }, .log_tbl = V_LOG_CONSTANTS }; static inline float64x2_t @@ -128,7 +128,7 @@ float64x2_t VPCS_ATTR V_NAME_D1 (erfinv) (float64x2_t x) uint64x2_t extreme_tail = vcagtq_f64 (x, v_f64 (0.9375)); uint8x16_t off = vandq_u8 (vreinterpretq_u8_u64 (is_tail), vdupq_n_u8 (8)); - uint8x16_t idx = vaddq_u8 (d->idx, off); + uint8x16_t idx = vaddq_u8 (vld1q_u8 (d->idx), off); float64x2_t t = vbslq_f64 (is_tail, d->tailshift, v_f64 (-0.5625)); t = vfmaq_f64 (t, x, x); @@ -150,12 +150,17 @@ float64x2_t VPCS_ATTR V_NAME_D1 (erfinv) (float64x2_t x) return vdivq_f64 (p, q); } -PL_SIG (V, D, 1, erfinv, -0.99, 0.99) -PL_TEST_ULP (V_NAME_D1 (erfinv), 24.8) +#if USE_MPFR +# warning Not generating tests for _ZGVnN2v_erfinv, as MPFR has no suitable reference +#else +TEST_SIG (V, D, 1, erfinv, -0.99, 0.99) +TEST_ULP (V_NAME_D1 (erfinv), 24.8) +TEST_DISABLE_FENV (V_NAME_D1 (erfinv)) +TEST_SYM_INTERVAL (V_NAME_D1 (erfinv), 0, 0x1.fffffffffffffp-1, 100000) +TEST_SYM_INTERVAL (V_NAME_D1 (erfinv), 0, 0x1.fffffffffffffp-1, 100000) +TEST_SYM_INTERVAL (V_NAME_D1 
(erfinv), 0, 0x1.fffffffffffffp-1, 100000) /* Test with control lane in each interval. */ -PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (erfinv), 0, 0x1.fffffffffffffp-1, 100000, - 0.5) -PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (erfinv), 0, 0x1.fffffffffffffp-1, 100000, - 0.8) -PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (erfinv), 0, 0x1.fffffffffffffp-1, 100000, - 0.95) +TEST_CONTROL_VALUE (V_NAME_D1 (erfinv), 0.5) +TEST_CONTROL_VALUE (V_NAME_D1 (erfinv), 0.8) +TEST_CONTROL_VALUE (V_NAME_D1 (erfinv), 0.95) +#endif diff --git a/pl/math/v_erfinvf_5u.c b/math/aarch64/experimental/advsimd/erfinvf_5u.c similarity index 83% rename from pl/math/v_erfinvf_5u.c rename to math/aarch64/experimental/advsimd/erfinvf_5u.c index 5a6800b86ae9fd..254d50feb2895f 100644 --- a/pl/math/v_erfinvf_5u.c +++ b/math/aarch64/experimental/advsimd/erfinvf_5u.c @@ -1,13 +1,13 @@ /* * Single-precision inverse error function (AdvSIMD variant). * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "v_math.h" -#include "pl_sig.h" -#include "pl_test.h" -#include "poly_advsimd_f32.h" +#include "test_sig.h" +#include "test_defs.h" +#include "v_poly_f32.h" #include "v_logf_inline.h" const static struct data @@ -24,14 +24,15 @@ const static struct data P_10 and Q_10 are also stored in homogenous vectors to allow better memory access when no lanes are in a tail region. 
*/ - float32x4_t Plo, PQ, Qhi, P29_3, tailshift; + float Plo[4], PQ[4], Qhi[4]; + float32x4_t P29_3, tailshift; float32x4_t P_50[6], Q_50[2]; float32x4_t P_10[3], Q_10[3]; - uint8x16_t idxhi, idxlo; + uint8_t idxhi[16], idxlo[16]; struct v_logf_data logf_tbl; } data = { - .idxlo = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, - .idxhi = { 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23 }, + .idxlo = { 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 }, + .idxhi = { 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11 }, .P29_3 = V4 (0x1.b13626p-2), .tailshift = V4 (-0.87890625), .Plo = { -0x1.a31268p+3, -0x1.fc0252p-4, 0x1.ac9048p+4, 0x1.119d44p+0 }, @@ -86,7 +87,7 @@ lookup (float32x4_t tbl, uint8x16_t idx) tail region: _ZGVnN4v_erfinvf(0x1.f7dbeep-1) got 0x1.b4793p+0 want 0x1.b4793ap+0 . */ -float32x4_t VPCS_ATTR V_NAME_F1 (erfinv) (float32x4_t x) +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (erfinv) (float32x4_t x) { const struct data *d = ptr_barrier (&data); @@ -124,18 +125,18 @@ float32x4_t VPCS_ATTR V_NAME_F1 (erfinv) (float32x4_t x) Add 4 * i to a group of 4 lanes to copy 32-bit lane i. Each vector stores two pairs of coeffs, so we need two idx vectors - one for each pair. */ uint8x16_t off = vandq_u8 (vreinterpretq_u8_u32 (is_tail), vdupq_n_u8 (4)); - uint8x16_t idx_lo = vaddq_u8 (d->idxlo, off); - uint8x16_t idx_hi = vaddq_u8 (d->idxhi, off); + uint8x16_t idx_lo = vaddq_u8 (vld1q_u8 (d->idxlo), off); + uint8x16_t idx_hi = vaddq_u8 (vld1q_u8 (d->idxhi), off); /* Load the tables. */ - float32x4_t p_lo = d->Plo; - float32x4_t pq = d->PQ; - float32x4_t qhi = d->Qhi; + float32x4_t plo = vld1q_f32 (d->Plo); + float32x4_t pq = vld1q_f32 (d->PQ); + float32x4_t qhi = vld1q_f32 (d->Qhi); /* Do the lookup (and calculate p3 by masking non-tail lanes). 
*/ float32x4_t p3 = vreinterpretq_f32_u32 ( vandq_u32 (is_tail, vreinterpretq_u32_f32 (d->P29_3))); - float32x4_t p0 = lookup (p_lo, idx_lo), p1 = lookup (p_lo, idx_hi), + float32x4_t p0 = lookup (plo, idx_lo), p1 = lookup (plo, idx_hi), p2 = lookup (pq, idx_lo), q0 = lookup (pq, idx_hi), q1 = lookup (qhi, idx_lo), q2 = lookup (qhi, idx_hi); @@ -155,9 +156,17 @@ float32x4_t VPCS_ATTR V_NAME_F1 (erfinv) (float32x4_t x) return vdivq_f32 (p, q); } -PL_SIG (V, F, 1, erfinv, -0.99, 0.99) -PL_TEST_ULP (V_NAME_F1 (erfinv), 4.49) +HALF_WIDTH_ALIAS_F1 (erfinv) + +#if USE_MPFR +# warning Not generating tests for _ZGVnN4v_erfinvf, as MPFR has no suitable reference +#else +TEST_SIG (V, F, 1, erfinv, -0.99, 0.99) +TEST_DISABLE_FENV (V_NAME_F1 (erfinv)) +TEST_ULP (V_NAME_F1 (erfinv), 4.49) +TEST_SYM_INTERVAL (V_NAME_F1 (erfinv), 0, 0x1.fffffep-1, 40000) /* Test with control lane in each interval. */ -PL_TEST_SYM_INTERVAL_C (V_NAME_F1 (erfinv), 0, 0x1.fffffep-1, 40000, 0.5) -PL_TEST_SYM_INTERVAL_C (V_NAME_F1 (erfinv), 0, 0x1.fffffep-1, 40000, 0.8) -PL_TEST_SYM_INTERVAL_C (V_NAME_F1 (erfinv), 0, 0x1.fffffep-1, 40000, 0.95) +TEST_CONTROL_VALUE (V_NAME_F1 (erfinv), 0.5) +TEST_CONTROL_VALUE (V_NAME_F1 (erfinv), 0.8) +TEST_CONTROL_VALUE (V_NAME_F1 (erfinv), 0.95) +#endif diff --git a/pl/math/v_logf_inline.h b/math/aarch64/experimental/advsimd/v_logf_inline.h similarity index 97% rename from pl/math/v_logf_inline.h rename to math/aarch64/experimental/advsimd/v_logf_inline.h index c00fe0909afc86..3f45341732892a 100644 --- a/pl/math/v_logf_inline.h +++ b/math/aarch64/experimental/advsimd/v_logf_inline.h @@ -1,7 +1,7 @@ /* * Single-precision vector log function - inline version * - * Copyright (c) 2019-2023, Arm Limited. + * Copyright (c) 2019-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/asin_3u.c b/math/aarch64/experimental/asin_3u.c similarity index 78% rename from pl/math/asin_3u.c rename to math/aarch64/experimental/asin_3u.c index 0b50995449cef3..56e63e451ba18b 100644 --- a/pl/math/asin_3u.c +++ b/math/aarch64/experimental/asin_3u.c @@ -1,22 +1,22 @@ /* * Double-precision asin(x) function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "poly_scalar_f64.h" #include "math_config.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" -#define AbsMask (0x7fffffffffffffff) -#define Half (0x3fe0000000000000) -#define One (0x3ff0000000000000) -#define PiOver2 (0x1.921fb54442d18p+0) -#define Small (0x3e50000000000000) /* 2^-26. */ -#define Small16 (0x3e50) -#define QNaN (0x7ff8) +#define AbsMask 0x7fffffffffffffff +#define Half 0x3fe0000000000000 +#define One 0x3ff0000000000000 +#define PiOver2 0x1.921fb54442d18p+0 +#define Small 0x3e50000000000000 /* 2^-26. */ +#define Small16 0x3e50 +#define QNaN 0x7ff8 /* Fast implementation of double-precision asin(x) based on polynomial approximation. @@ -54,8 +54,8 @@ asin(x) ~ pi/2 - acos(x) ~ pi/2 - 2 * sqrt(z) (1 + z * P(z)). The largest observed error in this region is 2.69 ulps, - asin(0x1.044ac9819f573p-1) got 0x1.110d7e85fdd5p-1 - want 0x1.110d7e85fdd53p-1. */ + asin(0x1.044e8cefee301p-1) got 0x1.1111dd54ddf96p-1 + want 0x1.1111dd54ddf99p-1. 
*/ double asin (double x) { @@ -96,11 +96,11 @@ asin (double x) return asdouble (asuint64 (y) | sign); } -PL_SIG (S, D, 1, asin, -1.0, 1.0) -PL_TEST_ULP (asin, 2.19) -PL_TEST_INTERVAL (asin, 0, Small, 5000) -PL_TEST_INTERVAL (asin, Small, 0.5, 50000) -PL_TEST_INTERVAL (asin, 0.5, 1.0, 50000) -PL_TEST_INTERVAL (asin, 1.0, 0x1p11, 50000) -PL_TEST_INTERVAL (asin, 0x1p11, inf, 20000) -PL_TEST_INTERVAL (asin, -0, -inf, 20000) +TEST_SIG (S, D, 1, asin, -1.0, 1.0) +TEST_ULP (asin, 2.20) +TEST_INTERVAL (asin, 0, Small, 5000) +TEST_INTERVAL (asin, Small, 0.5, 50000) +TEST_INTERVAL (asin, 0.5, 1.0, 50000) +TEST_INTERVAL (asin, 1.0, 0x1p11, 50000) +TEST_INTERVAL (asin, 0x1p11, inf, 20000) +TEST_INTERVAL (asin, -0, -inf, 20000) diff --git a/pl/math/asin_data.c b/math/aarch64/experimental/asin_data.c similarity index 94% rename from pl/math/asin_data.c rename to math/aarch64/experimental/asin_data.c index b5517731c7f4b3..60ab476e7ec950 100644 --- a/pl/math/asin_data.c +++ b/math/aarch64/experimental/asin_data.c @@ -1,7 +1,7 @@ /* * Coefficients for single-precision asin(x) function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/asinf_2u5.c b/math/aarch64/experimental/asinf_2u5.c similarity index 80% rename from pl/math/asinf_2u5.c rename to math/aarch64/experimental/asinf_2u5.c index ec608146ff666d..1136da01550ecb 100644 --- a/pl/math/asinf_2u5.c +++ b/math/aarch64/experimental/asinf_2u5.c @@ -1,22 +1,22 @@ /* * Single-precision asin(x) function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "poly_scalar_f32.h" #include "math_config.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" -#define AbsMask (0x7fffffff) -#define Half (0x3f000000) -#define One (0x3f800000) -#define PiOver2f (0x1.921fb6p+0f) -#define Small (0x39800000) /* 2^-12. */ -#define Small12 (0x398) -#define QNaN (0x7fc) +#define AbsMask 0x7fffffff +#define Half 0x3f000000 +#define One 0x3f800000 +#define PiOver2f 0x1.921fb6p+0f +#define Small 0x39800000 /* 2^-12. */ +#define Small12 0x398 +#define QNaN 0x7fc /* Fast implementation of single-precision asin(x) based on polynomial approximation. @@ -90,11 +90,11 @@ asinf (float x) return asfloat (asuint (y) | sign); } -PL_SIG (S, F, 1, asin, -1.0, 1.0) -PL_TEST_ULP (asinf, 1.91) -PL_TEST_INTERVAL (asinf, 0, Small, 5000) -PL_TEST_INTERVAL (asinf, Small, 0.5, 50000) -PL_TEST_INTERVAL (asinf, 0.5, 1.0, 50000) -PL_TEST_INTERVAL (asinf, 1.0, 0x1p11, 50000) -PL_TEST_INTERVAL (asinf, 0x1p11, inf, 20000) -PL_TEST_INTERVAL (asinf, -0, -inf, 20000) +TEST_SIG (S, F, 1, asin, -1.0, 1.0) +TEST_ULP (asinf, 1.91) +TEST_INTERVAL (asinf, 0, Small, 5000) +TEST_INTERVAL (asinf, Small, 0.5, 50000) +TEST_INTERVAL (asinf, 0.5, 1.0, 50000) +TEST_INTERVAL (asinf, 1.0, 0x1p11, 50000) +TEST_INTERVAL (asinf, 0x1p11, inf, 20000) +TEST_INTERVAL (asinf, -0, -inf, 20000) diff --git a/pl/math/asinf_data.c b/math/aarch64/experimental/asinf_data.c similarity index 92% rename from pl/math/asinf_data.c rename to math/aarch64/experimental/asinf_data.c index 1652025e2920cd..15f331dde5a73c 100644 --- a/pl/math/asinf_data.c +++ b/math/aarch64/experimental/asinf_data.c @@ -1,7 +1,7 @@ /* * Coefficients for single-precision asin(x) function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/asinh_2u5.c b/math/aarch64/experimental/asinh_2u5.c similarity index 75% rename from pl/math/asinh_2u5.c rename to math/aarch64/experimental/asinh_2u5.c index b7fc81a2b94f24..9d2d160a1453af 100644 --- a/pl/math/asinh_2u5.c +++ b/math/aarch64/experimental/asinh_2u5.c @@ -1,13 +1,14 @@ /* * Double-precision asinh(x) function * - * Copyright (c) 2022-2023, Arm Limited. + * Copyright (c) 2022-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ +#include "mathlib.h" #include "poly_scalar_f64.h" #include "math_config.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" #define AbsMask 0x7fffffffffffffff #define ExpM26 0x3e50000000000000 /* asuint64(0x1.0p-26). */ @@ -15,9 +16,6 @@ #define Exp511 0x5fe0000000000000 /* asuint64(0x1.0p511). */ #define Ln2 0x1.62e42fefa39efp-1 -double -optr_aor_log_f64 (double); - /* Scalar double-precision asinh implementation. 
This routine uses different approaches on different intervals: @@ -67,19 +65,18 @@ asinh (double x) if (unlikely (ia >= Exp511)) { - return asdouble (asuint64 (optr_aor_log_f64 (ax) + Ln2) | sign); + return asdouble (asuint64 (log (ax) + Ln2) | sign); } - return asdouble (asuint64 (optr_aor_log_f64 (ax + sqrt (ax * ax + 1))) - | sign); + return asdouble (asuint64 (log (ax + sqrt (ax * ax + 1))) | sign); } -PL_SIG (S, D, 1, asinh, -10.0, 10.0) -PL_TEST_ULP (asinh, 1.54) -PL_TEST_INTERVAL (asinh, -0x1p-26, 0x1p-26, 50000) -PL_TEST_INTERVAL (asinh, 0x1p-26, 1.0, 40000) -PL_TEST_INTERVAL (asinh, -0x1p-26, -1.0, 10000) -PL_TEST_INTERVAL (asinh, 1.0, 100.0, 40000) -PL_TEST_INTERVAL (asinh, -1.0, -100.0, 10000) -PL_TEST_INTERVAL (asinh, 100.0, inf, 50000) -PL_TEST_INTERVAL (asinh, -100.0, -inf, 10000) +TEST_SIG (S, D, 1, asinh, -10.0, 10.0) +TEST_ULP (asinh, 1.54) +TEST_INTERVAL (asinh, -0x1p-26, 0x1p-26, 50000) +TEST_INTERVAL (asinh, 0x1p-26, 1.0, 40000) +TEST_INTERVAL (asinh, -0x1p-26, -1.0, 10000) +TEST_INTERVAL (asinh, 1.0, 100.0, 40000) +TEST_INTERVAL (asinh, -1.0, -100.0, 10000) +TEST_INTERVAL (asinh, 100.0, inf, 50000) +TEST_INTERVAL (asinh, -100.0, -inf, 10000) diff --git a/pl/math/asinh_data.c b/math/aarch64/experimental/asinh_data.c similarity index 51% rename from pl/math/asinh_data.c rename to math/aarch64/experimental/asinh_data.c index 073b19799bda7f..7afaf69601309f 100644 --- a/pl/math/asinh_data.c +++ b/math/aarch64/experimental/asinh_data.c @@ -1,7 +1,7 @@ /* * Double-precision polynomial coefficients for scalar asinh(x) * - * Copyright (c) 2022-2023, Arm Limited. + * Copyright (c) 2022-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ @@ -13,10 +13,11 @@ Note P is evaluated on even powers of x only. See tools/asinh.sollya for the algorithm used to generate these coefficients. 
*/ const struct asinh_data __asinh_data - = {.poly - = {-0x1.55555555554a7p-3, 0x1.3333333326c7p-4, -0x1.6db6db68332e6p-5, - 0x1.f1c71b26fb40dp-6, -0x1.6e8b8b654a621p-6, 0x1.1c4daa9e67871p-6, - -0x1.c9871d10885afp-7, 0x1.7a16e8d9d2ecfp-7, -0x1.3ddca533e9f54p-7, - 0x1.0becef748dafcp-7, -0x1.b90c7099dd397p-8, 0x1.541f2bb1ffe51p-8, - -0x1.d217026a669ecp-9, 0x1.0b5c7977aaf7p-9, -0x1.e0f37daef9127p-11, - 0x1.388b5fe542a6p-12, -0x1.021a48685e287p-14, 0x1.93d4ba83d34dap-18}}; + = { .poly + = { -0x1.55555555554a7p-3, 0x1.3333333326c7p-4, -0x1.6db6db68332e6p-5, + 0x1.f1c71b26fb40dp-6, -0x1.6e8b8b654a621p-6, 0x1.1c4daa9e67871p-6, + -0x1.c9871d10885afp-7, 0x1.7a16e8d9d2ecfp-7, -0x1.3ddca533e9f54p-7, + 0x1.0becef748dafcp-7, -0x1.b90c7099dd397p-8, 0x1.541f2bb1ffe51p-8, + -0x1.d217026a669ecp-9, 0x1.0b5c7977aaf7p-9, -0x1.e0f37daef9127p-11, + 0x1.388b5fe542a6p-12, -0x1.021a48685e287p-14, + 0x1.93d4ba83d34dap-18 } }; diff --git a/pl/math/asinhf_3u5.c b/math/aarch64/experimental/asinhf_3u5.c similarity index 77% rename from pl/math/asinhf_3u5.c rename to math/aarch64/experimental/asinhf_3u5.c index ec26b80ec2ec4c..92c6dfd9b43d64 100644 --- a/pl/math/asinhf_3u5.c +++ b/math/aarch64/experimental/asinhf_3u5.c @@ -1,14 +1,14 @@ /* * Single-precision asinh(x) function. * - * Copyright (c) 2022-2023, Arm Limited. + * Copyright (c) 2022-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "poly_scalar_f32.h" #include "math_config.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" #define AbsMask (0x7fffffff) #define SqrtFltMax (0x1.749e96p+10f) @@ -16,9 +16,6 @@ #define One (0x3f8) #define ExpM12 (0x398) -float -optr_aor_log_f32 (float); - /* asinhf approximation using a variety of approaches on different intervals: |x| < 2^-12: Return x. Function is exactly rounded in this region. 
@@ -62,15 +59,15 @@ asinhf (float x) if (unlikely (ax > SqrtFltMax)) { - return asfloat (asuint (optr_aor_log_f32 (ax) + Ln2) | sign); + return asfloat (asuint (logf (ax) + Ln2) | sign); } - return asfloat (asuint (optr_aor_log_f32 (ax + sqrtf (ax * ax + 1))) | sign); + return asfloat (asuint (logf (ax + sqrtf (ax * ax + 1))) | sign); } -PL_SIG (S, F, 1, asinh, -10.0, 10.0) -PL_TEST_ULP (asinhf, 2.9) -PL_TEST_INTERVAL (asinhf, 0, 0x1p-12, 5000) -PL_TEST_INTERVAL (asinhf, 0x1p-12, 1.0, 50000) -PL_TEST_INTERVAL (asinhf, 1.0, 0x1p11, 50000) -PL_TEST_INTERVAL (asinhf, 0x1p11, 0x1p127, 20000) +TEST_SIG (S, F, 1, asinh, -10.0, 10.0) +TEST_ULP (asinhf, 2.9) +TEST_INTERVAL (asinhf, 0, 0x1p-12, 5000) +TEST_INTERVAL (asinhf, 0x1p-12, 1.0, 50000) +TEST_INTERVAL (asinhf, 1.0, 0x1p11, 50000) +TEST_INTERVAL (asinhf, 0x1p11, 0x1p127, 20000) diff --git a/math/aarch64/experimental/asinhf_data.c b/math/aarch64/experimental/asinhf_data.c new file mode 100644 index 00000000000000..5ed261ba835b5e --- /dev/null +++ b/math/aarch64/experimental/asinhf_data.c @@ -0,0 +1,15 @@ +/* + * Coefficients for single-precision asinh(x) function. + * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* Approximate asinhf(x) directly in [2^-12, 1]. See tools/asinhf.sollya + for how these coeffs were generated. */ +const struct asinhf_data __asinhf_data + = { .coeffs = { -0x1.9b16fap-19f, -0x1.552baap-3f, -0x1.4e572ap-11f, + 0x1.3a81dcp-4f, 0x1.65bbaap-10f, -0x1.057f1p-4f, + 0x1.6c1d46p-5f, -0x1.4cafe8p-7f } }; diff --git a/pl/math/atan2_2u5.c b/math/aarch64/experimental/atan2_2u5.c similarity index 91% rename from pl/math/atan2_2u5.c rename to math/aarch64/experimental/atan2_2u5.c index c909ac99fa22e9..518e34589e5bb4 100644 --- a/pl/math/atan2_2u5.c +++ b/math/aarch64/experimental/atan2_2u5.c @@ -1,7 +1,7 @@ /* * Double-precision scalar atan2(x) function. * - * Copyright (c) 2021-2023, Arm Limited.
+ * Copyright (c) 2021-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ @@ -9,8 +9,8 @@ #include "atan_common.h" #include "math_config.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" #define Pi (0x1.921fb54442d18p+1) #define PiOver2 (0x1.921fb54442d18p+0) @@ -79,8 +79,8 @@ atan2 (double y, double x) if (unlikely (iax == 0 || exp_diff <= -POW8_EXP_UFLOW_BOUND)) return sign_y ? -PiOver2 : PiOver2; - /* Special case for either x is INF or (x, y) is very close to x axis and x is - negative. */ + /* Special case for either x is INF or (x, y) is very close to x axis and x + is negative. */ if (unlikely (iax == 0x7ff0000000000000 || (exp_diff >= POW8_EXP_UFLOW_BOUND && m >= 2))) { @@ -150,10 +150,10 @@ atan2 (double y, double x) } /* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. */ -PL_SIG (S, D, 2, atan2) -PL_TEST_ULP (atan2, 1.78) -PL_TEST_INTERVAL (atan2, -10.0, 10.0, 50000) -PL_TEST_INTERVAL (atan2, -1.0, 1.0, 40000) -PL_TEST_INTERVAL (atan2, 0.0, 1.0, 40000) -PL_TEST_INTERVAL (atan2, 1.0, 100.0, 40000) -PL_TEST_INTERVAL (atan2, 1e6, 1e32, 40000) +TEST_SIG (S, D, 2, atan2) +TEST_ULP (atan2, 1.78) +TEST_INTERVAL (atan2, -10.0, 10.0, 50000) +TEST_INTERVAL (atan2, -1.0, 1.0, 40000) +TEST_INTERVAL (atan2, 0.0, 1.0, 40000) +TEST_INTERVAL (atan2, 1.0, 100.0, 40000) +TEST_INTERVAL (atan2, 1e6, 1e32, 40000) diff --git a/pl/math/atan2f_3u.c b/math/aarch64/experimental/atan2f_3u.c similarity index 90% rename from pl/math/atan2f_3u.c rename to math/aarch64/experimental/atan2f_3u.c index 38e1df59c102e2..245ba551566c54 100644 --- a/pl/math/atan2f_3u.c +++ b/math/aarch64/experimental/atan2f_3u.c @@ -1,7 +1,7 @@ /* * Single-precision scalar atan2(x) function. * - * Copyright (c) 2021-2023, Arm Limited. + * Copyright (c) 2021-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ @@ -9,8 +9,8 @@ #include "atanf_common.h" #include "math_config.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" #define Pi (0x1.921fb6p+1f) #define PiOver2 (0x1.921fb6p+0f) @@ -19,8 +19,8 @@ /* We calculate atan2f by P(n/d), where n and d are similar to the input arguments, and P is a polynomial. The polynomial may underflow. - POLY_UFLOW_BOUND is the lower bound of the difference in exponents of n and d - for which P underflows, and is used to special-case such inputs. */ + POLY_UFLOW_BOUND is the lower bound of the difference in exponents of n and + d for which P underflows, and is used to special-case such inputs. */ #define POLY_UFLOW_BOUND 24 static inline int32_t @@ -158,10 +158,10 @@ atan2f (float y, float x) } /* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. */ -PL_SIG (S, F, 2, atan2) -PL_TEST_ULP (atan2f, 2.4) -PL_TEST_INTERVAL (atan2f, -10.0, 10.0, 50000) -PL_TEST_INTERVAL (atan2f, -1.0, 1.0, 40000) -PL_TEST_INTERVAL (atan2f, 0.0, 1.0, 40000) -PL_TEST_INTERVAL (atan2f, 1.0, 100.0, 40000) -PL_TEST_INTERVAL (atan2f, 1e6, 1e32, 40000) +TEST_SIG (S, F, 2, atan2) +TEST_ULP (atan2f, 2.4) +TEST_INTERVAL (atan2f, -10.0, 10.0, 50000) +TEST_INTERVAL (atan2f, -1.0, 1.0, 40000) +TEST_INTERVAL (atan2f, 0.0, 1.0, 40000) +TEST_INTERVAL (atan2f, 1.0, 100.0, 40000) +TEST_INTERVAL (atan2f, 1e6, 1e32, 40000) diff --git a/pl/math/atan_2u5.c b/math/aarch64/experimental/atan_2u5.c similarity index 79% rename from pl/math/atan_2u5.c rename to math/aarch64/experimental/atan_2u5.c index ee477010175899..9c9c77d98cd3cd 100644 --- a/pl/math/atan_2u5.c +++ b/math/aarch64/experimental/atan_2u5.c @@ -1,12 +1,12 @@ /* * Double-precision atan(x) function. * - * Copyright (c) 2022-2023, Arm Limited. + * Copyright (c) 2022-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" #include "atan_common.h" #define AbsMask 0x7fffffffffffffff @@ -63,11 +63,11 @@ atan (double x) return asdouble (asuint64 (y) ^ sign); } -PL_SIG (S, D, 1, atan, -10.0, 10.0) -PL_TEST_ULP (atan, 1.78) -PL_TEST_INTERVAL (atan, 0, 0x1p-30, 10000) -PL_TEST_INTERVAL (atan, -0, -0x1p-30, 1000) -PL_TEST_INTERVAL (atan, 0x1p-30, 0x1p53, 900000) -PL_TEST_INTERVAL (atan, -0x1p-30, -0x1p53, 90000) -PL_TEST_INTERVAL (atan, 0x1p53, inf, 10000) -PL_TEST_INTERVAL (atan, -0x1p53, -inf, 1000) +TEST_SIG (S, D, 1, atan, -10.0, 10.0) +TEST_ULP (atan, 1.78) +TEST_INTERVAL (atan, 0, 0x1p-30, 10000) +TEST_INTERVAL (atan, -0, -0x1p-30, 1000) +TEST_INTERVAL (atan, 0x1p-30, 0x1p53, 900000) +TEST_INTERVAL (atan, -0x1p-30, -0x1p53, 90000) +TEST_INTERVAL (atan, 0x1p53, inf, 10000) +TEST_INTERVAL (atan, -0x1p53, -inf, 1000) diff --git a/pl/math/atan_common.h b/math/aarch64/experimental/atan_common.h similarity index 95% rename from pl/math/atan_common.h rename to math/aarch64/experimental/atan_common.h index 798cc22cc40aa6..1fd83860219b9e 100644 --- a/pl/math/atan_common.h +++ b/math/aarch64/experimental/atan_common.h @@ -2,7 +2,7 @@ * Double-precision polynomial evaluation function for scalar * atan(x) and atan2(y,x). * - * Copyright (c) 2021-2023, Arm Limited. + * Copyright (c) 2021-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/math/aarch64/experimental/atan_data.c b/math/aarch64/experimental/atan_data.c new file mode 100644 index 00000000000000..5d24fa912d02c3 --- /dev/null +++ b/math/aarch64/experimental/atan_data.c @@ -0,0 +1,23 @@ +/* + * Double-precision polynomial coefficients for vector atan(x) and atan2(y,x). + * + * Copyright (c) 2019-2024, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +const struct atan_poly_data __atan_poly_data + = { .poly = { /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) + on [2**-1022, 1.0]. See atan.sollya for details of how + these were generated. */ + -0x1.5555555555555p-2, 0x1.99999999996c1p-3, + -0x1.2492492478f88p-3, 0x1.c71c71bc3951cp-4, + -0x1.745d160a7e368p-4, 0x1.3b139b6a88ba1p-4, + -0x1.11100ee084227p-4, 0x1.e1d0f9696f63bp-5, + -0x1.aebfe7b418581p-5, 0x1.842dbe9b0d916p-5, + -0x1.5d30140ae5e99p-5, 0x1.338e31eb2fbbcp-5, + -0x1.00e6eece7de8p-5, 0x1.860897b29e5efp-6, + -0x1.0051381722a59p-6, 0x1.14e9dc19a4a4ep-7, + -0x1.d0062b42fe3bfp-9, 0x1.17739e210171ap-10, + -0x1.ab24da7be7402p-13, 0x1.358851160a528p-16 } }; diff --git a/pl/math/atanf_2u9.c b/math/aarch64/experimental/atanf_2u9.c similarity index 82% rename from pl/math/atanf_2u9.c rename to math/aarch64/experimental/atanf_2u9.c index ba6f68089de13f..518415ded6341f 100644 --- a/pl/math/atanf_2u9.c +++ b/math/aarch64/experimental/atanf_2u9.c @@ -1,13 +1,13 @@ /* * Single-precision atan(x) function. * - * Copyright (c) 2022-2023, Arm Limited. + * Copyright (c) 2022-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "atanf_common.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" #define PiOver2 0x1.921fb6p+0f #define AbsMask 0x7fffffff @@ -64,9 +64,9 @@ atanf (float x) return asfloat (asuint (y) ^ sign); } -PL_SIG (S, F, 1, atan, -10.0, 10.0) -PL_TEST_ULP (atanf, 2.38) -PL_TEST_SYM_INTERVAL (atanf, 0, 0x1p-30, 5000) -PL_TEST_SYM_INTERVAL (atanf, 0x1p-30, 1, 40000) -PL_TEST_SYM_INTERVAL (atanf, 1, 0x1p30, 40000) -PL_TEST_SYM_INTERVAL (atanf, 0x1p30, inf, 1000) +TEST_SIG (S, F, 1, atan, -10.0, 10.0) +TEST_ULP (atanf, 2.38) +TEST_SYM_INTERVAL (atanf, 0, 0x1p-30, 5000) +TEST_SYM_INTERVAL (atanf, 0x1p-30, 1, 40000) +TEST_SYM_INTERVAL (atanf, 1, 0x1p30, 40000) +TEST_SYM_INTERVAL (atanf, 0x1p30, inf, 1000) diff --git a/pl/math/atanf_common.h b/math/aarch64/experimental/atanf_common.h similarity index 96% rename from pl/math/atanf_common.h rename to math/aarch64/experimental/atanf_common.h index 8952e7e0078be8..3e654204730963 100644 --- a/pl/math/atanf_common.h +++ b/math/aarch64/experimental/atanf_common.h @@ -2,7 +2,7 @@ * Single-precision polynomial evaluation function for scalar * atan(x) and atan2(y,x). * - * Copyright (c) 2021-2023, Arm Limited. + * Copyright (c) 2021-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/math/aarch64/experimental/atanf_data.c b/math/aarch64/experimental/atanf_data.c new file mode 100644 index 00000000000000..f4d607c2a12d04 --- /dev/null +++ b/math/aarch64/experimental/atanf_data.c @@ -0,0 +1,17 @@ +/* + * Single-precision polynomial coefficients for vector atan(x) and atan2(y,x). + * + * Copyright (c) 2019-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on [2**-128, 1.0]. 
+ */ +const struct atanf_poly_data __atanf_poly_data + = { .poly + = { /* See atanf.sollya for details of how these were generated. */ + -0x1.55555p-2f, 0x1.99935ep-3f, -0x1.24051ep-3f, 0x1.bd7368p-4f, + -0x1.491f0ep-4f, 0x1.93a2c0p-5f, -0x1.4c3c60p-6f, + 0x1.01fd88p-8f } }; diff --git a/pl/math/atanh_3u.c b/math/aarch64/experimental/atanh_3u.c similarity index 88% rename from pl/math/atanh_3u.c rename to math/aarch64/experimental/atanh_3u.c index dcfbe8192a22a9..d01b8bacd46a6c 100644 --- a/pl/math/atanh_3u.c +++ b/math/aarch64/experimental/atanh_3u.c @@ -1,21 +1,21 @@ /* * Double-precision atanh(x) function. * - * Copyright (c) 2022-2023, Arm Limited. + * Copyright (c) 2022-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" #include "poly_scalar_f64.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" #define AbsMask 0x7fffffffffffffff #define Half 0x3fe0000000000000 #define One 0x3ff0000000000000 #define Ln2Hi 0x1.62e42fefa3800p-1 #define Ln2Lo 0x1.ef35793c76730p-45 -#define OneMHfRt2Top \ +#define OneMHfRt2Top \ 0x00095f62 /* top32(asuint64(1)) - top32(asuint64(sqrt(2)/2)). */ #define OneTop12 0x3ff #define HfRt2Top 0x3fe6a09e /* top32(asuint64(sqrt(2)/2)). 
*/ @@ -76,8 +76,8 @@ atanh (double x) return halfsign * log1p_inline ((2 * ax) / (1 - ax)); } -PL_SIG (S, D, 1, atanh, -1.0, 1.0) -PL_TEST_ULP (atanh, 3.00) -PL_TEST_SYM_INTERVAL (atanh, 0, 0x1p-23, 10000) -PL_TEST_SYM_INTERVAL (atanh, 0x1p-23, 1, 90000) -PL_TEST_SYM_INTERVAL (atanh, 1, inf, 100) +TEST_SIG (S, D, 1, atanh, -1.0, 1.0) +TEST_ULP (atanh, 3.00) +TEST_SYM_INTERVAL (atanh, 0, 0x1p-23, 10000) +TEST_SYM_INTERVAL (atanh, 0x1p-23, 1, 90000) +TEST_SYM_INTERVAL (atanh, 1, inf, 100) diff --git a/pl/math/atanhf_3u1.c b/math/aarch64/experimental/atanhf_3u1.c similarity index 87% rename from pl/math/atanhf_3u1.c rename to math/aarch64/experimental/atanhf_3u1.c index e99d5a9900a9d6..c452bab91f979e 100644 --- a/pl/math/atanhf_3u1.c +++ b/math/aarch64/experimental/atanhf_3u1.c @@ -1,14 +1,14 @@ /* * Single-precision atanh(x) function. * - * Copyright (c) 2022-2023, Arm Limited. + * Copyright (c) 2022-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" #include "mathlib.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" #define AbsMask 0x7fffffff #define Half 0x3f000000 @@ -79,8 +79,8 @@ atanhf (float x) return halfsign * log1pf_inline ((2 * ax) / (1 - ax)); } -PL_SIG (S, F, 1, atanh, -1.0, 1.0) -PL_TEST_ULP (atanhf, 2.59) -PL_TEST_SYM_INTERVAL (atanhf, 0, 0x1p-12, 500) -PL_TEST_SYM_INTERVAL (atanhf, 0x1p-12, 1, 200000) -PL_TEST_SYM_INTERVAL (atanhf, 1, inf, 1000) +TEST_SIG (S, F, 1, atanh, -1.0, 1.0) +TEST_ULP (atanhf, 2.59) +TEST_SYM_INTERVAL (atanhf, 0, 0x1p-12, 500) +TEST_SYM_INTERVAL (atanhf, 0x1p-12, 1, 200000) +TEST_SYM_INTERVAL (atanhf, 1, inf, 1000) diff --git a/pl/math/cbrt_2u.c b/math/aarch64/experimental/cbrt_2u.c similarity index 89% rename from pl/math/cbrt_2u.c rename to math/aarch64/experimental/cbrt_2u.c index 80be83c4470c32..cf31627e43dcc0 100644 --- a/pl/math/cbrt_2u.c +++ b/math/aarch64/experimental/cbrt_2u.c @@ -1,15 +1,15 @@ /* * Double-precision 
cbrt(x) function. * - * Copyright (c) 2022-2023, Arm Limited. + * Copyright (c) 2022-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" -PL_SIG (S, D, 1, cbrt, -10.0, 10.0) +TEST_SIG (S, D, 1, cbrt, -10.0, 10.0) #define AbsMask 0x7fffffffffffffff #define TwoThirds 0x1.5555555555555p-1 @@ -39,8 +39,8 @@ cbrt (double x) int e; double m = frexp (asdouble (iax), &e); - /* Calculate rough approximation for cbrt(m) in [0.5, 1.0], starting point for - Newton iterations. */ + /* Calculate rough approximation for cbrt(m) in [0.5, 1.0], starting point + for Newton iterations. */ double p_01 = fma (C (1), m, C (0)); double p_23 = fma (C (3), m, C (2)); double p = fma (p_23, m * m, p_01); @@ -65,5 +65,5 @@ cbrt (double x) return asdouble (asuint64 (ldexp (a * T (2 + e % 3), e / 3)) | sign); } -PL_TEST_ULP (cbrt, 1.30) -PL_TEST_SYM_INTERVAL (cbrt, 0, inf, 1000000) +TEST_ULP (cbrt, 1.30) +TEST_SYM_INTERVAL (cbrt, 0, inf, 1000000) diff --git a/pl/math/cbrt_data.c b/math/aarch64/experimental/cbrt_data.c similarity index 93% rename from pl/math/cbrt_data.c rename to math/aarch64/experimental/cbrt_data.c index 3d484c2779e24d..dabcb6aff2d4cc 100644 --- a/pl/math/cbrt_data.c +++ b/math/aarch64/experimental/cbrt_data.c @@ -1,7 +1,7 @@ /* * Coefficients and table entries for double-precision cbrt(x). * - * Copyright (c) 2022-2023, Arm Limited. + * Copyright (c) 2022-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/cbrtf_1u5.c b/math/aarch64/experimental/cbrtf_1u5.c similarity index 88% rename from pl/math/cbrtf_1u5.c rename to math/aarch64/experimental/cbrtf_1u5.c index 88fcb7162ef6b1..5f0288e6d27ad8 100644 --- a/pl/math/cbrtf_1u5.c +++ b/math/aarch64/experimental/cbrtf_1u5.c @@ -1,14 +1,14 @@ /* * Single-precision cbrt(x) function. * - * Copyright (c) 2022-2023, Arm Limited. 
+ * Copyright (c) 2022-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "poly_scalar_f32.h" #include "math_config.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" #define AbsMask 0x7fffffff #define SignMask 0x80000000 @@ -18,8 +18,8 @@ /* Approximation for single-precision cbrt(x), using low-order polynomial and one Newton iteration on a reduced interval. Greatest error is 1.5 ULP. This - is observed for every value where the mantissa is 0x1.81410e and the exponent - is a multiple of 3, for example: + is observed for every value where the mantissa is 0x1.81410e and the + exponent is a multiple of 3, for example: cbrtf(0x1.81410ep+30) got 0x1.255d96p+10 want 0x1.255d92p+10. */ float @@ -61,6 +61,6 @@ cbrtf (float x) return asfloat (asuint (ldexpf (a * T (2 + e % 3), e / 3)) | sign); } -PL_SIG (S, F, 1, cbrt, -10.0, 10.0) -PL_TEST_ULP (cbrtf, 1.03) -PL_TEST_SYM_INTERVAL (cbrtf, 0, inf, 1000000) +TEST_SIG (S, F, 1, cbrt, -10.0, 10.0) +TEST_ULP (cbrtf, 1.03) +TEST_SYM_INTERVAL (cbrtf, 0, inf, 1000000) diff --git a/pl/math/cbrtf_data.c b/math/aarch64/experimental/cbrtf_data.c similarity index 93% rename from pl/math/cbrtf_data.c rename to math/aarch64/experimental/cbrtf_data.c index c6cdb4de0d65bf..7b5c53f4a6066d 100644 --- a/pl/math/cbrtf_data.c +++ b/math/aarch64/experimental/cbrtf_data.c @@ -1,7 +1,7 @@ /* * Coefficients and table entries for single-precision cbrt(x). * - * Copyright (c) 2022-2023, Arm Limited. + * Copyright (c) 2022-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/cosh_2u.c b/math/aarch64/experimental/cosh_2u.c similarity index 70% rename from pl/math/cosh_2u.c rename to math/aarch64/experimental/cosh_2u.c index 2240a9c56f1589..f5bc73b85df855 100644 --- a/pl/math/cosh_2u.c +++ b/math/aarch64/experimental/cosh_2u.c @@ -1,21 +1,19 @@ /* * Double-precision cosh(x) function. 
* - * Copyright (c) 2022-2023, Arm Limited. + * Copyright (c) 2022-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" +#include "exp_inline.h" #define AbsMask 0x7fffffffffffffff -#define SpecialBound \ +#define SpecialBound \ 0x40861da04cbafe44 /* 0x1.61da04cbafe44p+9, above which exp overflows. */ -double -__exp_dd (double, double); - static double specialcase (double x, uint64_t iax) { @@ -23,9 +21,9 @@ specialcase (double x, uint64_t iax) return INFINITY; if (iax > 0x7ff0000000000000) return __math_invalid (x); - /* exp overflows above SpecialBound. At this magnitude cosh(x) is dominated by - exp(x), so we can approximate cosh(x) by (exp(|x|/2)) ^ 2 / 2. */ - double t = __exp_dd (asdouble (iax) / 2, 0); + /* exp overflows above SpecialBound. At this magnitude cosh(x) is dominated + by exp(x), so we can approximate cosh(x) by (exp(|x|/2)) ^ 2 / 2. */ + double t = exp_inline (asdouble (iax) / 2, 0); return (0.5 * t) * t; } @@ -44,20 +42,20 @@ cosh (double x) uint64_t ix = asuint64 (x); uint64_t iax = ix & AbsMask; - /* exp overflows a little bit before cosh, so use special-case handler for the - gap, as well as special values. */ + /* exp overflows a little bit before cosh, so use special-case handler for + the gap, as well as special values. */ if (unlikely (iax >= SpecialBound)) return specialcase (x, iax); double ax = asdouble (iax); /* Use double-precision exp helper to calculate exp(x), then: cosh(x) = exp(|x|) / 2 + 1 / (exp(|x| * 2). 
*/ - double t = __exp_dd (ax, 0); + double t = exp_inline (ax, 0); return 0.5 * t + 0.5 / t; } -PL_SIG (S, D, 1, cosh, -10.0, 10.0) -PL_TEST_ULP (cosh, 1.43) -PL_TEST_SYM_INTERVAL (cosh, 0, 0x1.61da04cbafe44p+9, 100000) -PL_TEST_SYM_INTERVAL (cosh, 0x1.61da04cbafe44p+9, 0x1p10, 1000) -PL_TEST_SYM_INTERVAL (cosh, 0x1p10, inf, 100) +TEST_SIG (S, D, 1, cosh, -10.0, 10.0) +TEST_ULP (cosh, 1.43) +TEST_SYM_INTERVAL (cosh, 0, 0x1.61da04cbafe44p+9, 100000) +TEST_SYM_INTERVAL (cosh, 0x1.61da04cbafe44p+9, 0x1p10, 1000) +TEST_SYM_INTERVAL (cosh, 0x1p10, inf, 100) diff --git a/pl/math/coshf_1u9.c b/math/aarch64/experimental/coshf_1u9.c similarity index 71% rename from pl/math/coshf_1u9.c rename to math/aarch64/experimental/coshf_1u9.c index cf737840e0d698..b7e7720a472ec8 100644 --- a/pl/math/coshf_1u9.c +++ b/math/aarch64/experimental/coshf_1u9.c @@ -1,22 +1,19 @@ /* * Single-precision cosh(x) function. * - * Copyright (c) 2022-2023, Arm Limited. + * Copyright (c) 2022-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ +#include "mathlib.h" #include "math_config.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" #define AbsMask 0x7fffffff #define TinyBound 0x20000000 /* 0x1p-63: Round to 1 below this. */ -#define SpecialBound \ - 0x42ad496c /* 0x1.5a92d8p+6: expf overflows above this, so have to use \ - special case. */ - -float -optr_aor_exp_f32 (float); +/* 0x1.5a92d8p+6: expf overflows above this, so have to use special case. */ +#define SpecialBound 0x42ad496c static NOINLINE float specialcase (float x, uint32_t iax) @@ -32,7 +29,7 @@ specialcase (float x, uint32_t iax) without overflow, so use exp(|x|/2) instead. For large x cosh(x) is dominated by exp(x), so return: cosh(x) ~= (exp(|x|/2))^2 / 2. 
*/ - float t = optr_aor_exp_f32 (asfloat (iax) / 2); + float t = expf (asfloat (iax) / 2); return (0.5 * t) * t; } @@ -57,12 +54,12 @@ coshf (float x) /* Compute cosh using the definition: coshf(x) = exp(x) / 2 + exp(-x) / 2. */ - float t = optr_aor_exp_f32 (ax); + float t = expf (ax); return 0.5f * t + 0.5f / t; } -PL_SIG (S, F, 1, cosh, -10.0, 10.0) -PL_TEST_ULP (coshf, 1.89) -PL_TEST_SYM_INTERVAL (coshf, 0, 0x1p-63, 100) -PL_TEST_SYM_INTERVAL (coshf, 0, 0x1.5a92d8p+6, 80000) -PL_TEST_SYM_INTERVAL (coshf, 0x1.5a92d8p+6, inf, 2000) +TEST_SIG (S, F, 1, cosh, -10.0, 10.0) +TEST_ULP (coshf, 1.89) +TEST_SYM_INTERVAL (coshf, 0, 0x1p-63, 100) +TEST_SYM_INTERVAL (coshf, 0, 0x1.5a92d8p+6, 80000) +TEST_SYM_INTERVAL (coshf, 0x1.5a92d8p+6, inf, 2000) diff --git a/pl/math/erf_2u5.c b/math/aarch64/experimental/erf_2u5.c similarity index 87% rename from pl/math/erf_2u5.c rename to math/aarch64/experimental/erf_2u5.c index 3ca2a1332c1f35..0bbe3e9548f88e 100644 --- a/pl/math/erf_2u5.c +++ b/math/aarch64/experimental/erf_2u5.c @@ -1,13 +1,13 @@ /* * Double-precision erf(x) function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" #define TwoOverSqrtPiMinusOne 0x1.06eba8214db69p-3 #define Shift 0x1p45 @@ -42,7 +42,7 @@ erf(-0x1.00003c924e5d1p-8) got -0x1.20dd59132ebadp-8 want -0x1.20dd59132ebafp-8. */ double -erf (double x) +arm_math_erf (double x) { /* Get absolute value and sign. */ uint64_t ix = asuint64 (x); @@ -62,8 +62,8 @@ erf (double x) double r = z - Shift; /* Lookup erf(r) and scale(r) in table. Set erf(r) to 0 and scale to 2/sqrt(pi) for |x| <= 0x1.cp-9. 
*/ - double erfr = __erf_data.tab[i].erf; - double scale = __erf_data.tab[i].scale; + double erfr = __v_erf_data.tab[i].erf; + double scale = __v_erf_data.tab[i].scale; /* erf(x) ~ erf(r) + scale * d * poly (d, r). */ double d = a - r; @@ -95,8 +95,7 @@ erf (double x) return asdouble (sign | asuint64 (1.0)); } -PL_SIG (S, D, 1, erf, -6.0, 6.0) -PL_TEST_ULP (erf, 1.79) -PL_TEST_SYM_INTERVAL (erf, 0, 5.9921875, 40000) -PL_TEST_SYM_INTERVAL (erf, 5.9921875, inf, 40000) -PL_TEST_SYM_INTERVAL (erf, 0, inf, 40000) +TEST_ULP (arm_math_erf, 1.79) +TEST_SYM_INTERVAL (arm_math_erf, 0, 5.9921875, 40000) +TEST_SYM_INTERVAL (arm_math_erf, 5.9921875, inf, 40000) +TEST_SYM_INTERVAL (arm_math_erf, 0, inf, 40000) diff --git a/pl/math/erfc_1u8.c b/math/aarch64/experimental/erfc_1u8.c similarity index 90% rename from pl/math/erfc_1u8.c rename to math/aarch64/experimental/erfc_1u8.c index 7f2004e9335d7e..5357e932943379 100644 --- a/pl/math/erfc_1u8.c +++ b/math/aarch64/experimental/erfc_1u8.c @@ -1,13 +1,13 @@ /* * Double-precision erfc(x) function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" #define Shift 0x1p45 #define P20 0x1.5555555555555p-2 /* 1/3. */ @@ -86,11 +86,11 @@ erfc (double x) /* Lookup erfc(r) and scale(r) in tables, e.g. set erfc(r) to 1 and scale to 2/sqrt(pi), when x reduced to r = 0. */ double z = a + Shift; - uint64_t i = asuint64 (z); + uint64_t i = asuint64 (z) - asuint64 (Shift); double r = z - Shift; /* These values are scaled by 2^128. */ - double erfcr = __erfc_data.tab[i].erfc; - double scale = __erfc_data.tab[i].scale; + double erfcr = __v_erfc_data.tab[i].erfc; + double scale = __v_erfc_data.tab[i].scale; /* erfc(x) ~ erfc(r) - scale * d * poly (r, d). 
*/ double d = a - r; @@ -144,10 +144,10 @@ erfc (double x) return __math_uflow (0); } -PL_SIG (S, D, 1, erfc, -6.0, 28.0) -PL_TEST_ULP (erfc, 1.21) -PL_TEST_SYM_INTERVAL (erfc, 0, 0x1p-26, 40000) -PL_TEST_INTERVAL (erfc, 0x1p-26, 28.0, 100000) -PL_TEST_INTERVAL (erfc, -0x1p-26, -6.0, 100000) -PL_TEST_INTERVAL (erfc, 28.0, inf, 40000) -PL_TEST_INTERVAL (erfc, -6.0, -inf, 40000) +TEST_SIG (S, D, 1, erfc, -6.0, 28.0) +TEST_ULP (erfc, 1.21) +TEST_SYM_INTERVAL (erfc, 0, 0x1p-26, 40000) +TEST_INTERVAL (erfc, 0x1p-26, 28.0, 100000) +TEST_INTERVAL (erfc, -0x1p-26, -6.0, 100000) +TEST_INTERVAL (erfc, 28.0, inf, 40000) +TEST_INTERVAL (erfc, -6.0, -inf, 40000) diff --git a/pl/math/erfcf_1u7.c b/math/aarch64/experimental/erfcf_1u7.c similarity index 86% rename from pl/math/erfcf_1u7.c rename to math/aarch64/experimental/erfcf_1u7.c index c8ce95cca058c1..e56193c8a1030d 100644 --- a/pl/math/erfcf_1u7.c +++ b/math/aarch64/experimental/erfcf_1u7.c @@ -1,13 +1,13 @@ /* * Single-precision erfc(x) function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" #define Shift 0x1p17f #define OneThird 0x1.555556p-2f @@ -59,8 +59,8 @@ erfcf (float x) float r = z - Shift; /* These values are scaled by 2^-47. */ - float erfcr = __erfcf_data.tab[i].erfc; - float scale = __erfcf_data.tab[i].scale; + float erfcr = __v_erfcf_data.tab[i].erfc; + float scale = __v_erfcf_data.tab[i].scale; /* erfc(x) ~ erfc(r) - scale * d * poly (r, d). */ float d = a - r; @@ -94,10 +94,10 @@ erfcf (float x) return sign ? 
2.0f : __math_uflowf (0); } -PL_SIG (S, F, 1, erfc, -4.0, 10.0) -PL_TEST_ULP (erfcf, 1.14) -PL_TEST_SYM_INTERVAL (erfcf, 0, 0x1p-26, 40000) -PL_TEST_INTERVAL (erfcf, 0x1p-26, 10.0625, 40000) -PL_TEST_INTERVAL (erfcf, -0x1p-26, -4.0, 40000) -PL_TEST_INTERVAL (erfcf, 10.0625, inf, 40000) -PL_TEST_INTERVAL (erfcf, -4.0, -inf, 40000) +TEST_SIG (S, F, 1, erfc, -4.0, 10.0) +TEST_ULP (erfcf, 1.14) +TEST_SYM_INTERVAL (erfcf, 0, 0x1p-26, 40000) +TEST_INTERVAL (erfcf, 0x1p-26, 10.0625, 40000) +TEST_INTERVAL (erfcf, -0x1p-26, -4.0, 40000) +TEST_INTERVAL (erfcf, 10.0625, inf, 40000) +TEST_INTERVAL (erfcf, -4.0, -inf, 40000) diff --git a/pl/math/erff_2u.c b/math/aarch64/experimental/erff_2u.c similarity index 83% rename from pl/math/erff_2u.c rename to math/aarch64/experimental/erff_2u.c index f43e647072f866..9487f60dd1e3c9 100644 --- a/pl/math/erff_2u.c +++ b/math/aarch64/experimental/erff_2u.c @@ -1,13 +1,13 @@ /* * Single-precision erf(x) function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" #define TwoOverSqrtPiMinusOne 0x1.06eba8p-3f #define Shift 0x1p16f @@ -37,7 +37,7 @@ erff(0x1.c373e6p-9) got 0x1.fd686cp-9 want 0x1.fd6868p-9. */ float -erff (float x) +arm_math_erff (float x) { /* Get absolute value and sign. */ uint32_t ix = asuint (x); @@ -56,8 +56,8 @@ erff (float x) float z = a + Shift; uint32_t i = asuint (z) - asuint (Shift); float r = z - Shift; - float erfr = __erff_data.tab[i].erf; - float scale = __erff_data.tab[i].scale; + float erfr = __v_erff_data.tab[i].erf; + float scale = __v_erff_data.tab[i].scale; /* erf(x) ~ erf(r) + scale * d * (1 - r * d - 1/3 * d^2). 
*/ float d = a - r; @@ -75,8 +75,7 @@ erff (float x) return asfloat (sign | asuint (1.0f)); } -PL_SIG (S, F, 1, erf, -4.0, 4.0) -PL_TEST_ULP (erff, 1.43) -PL_TEST_SYM_INTERVAL (erff, 0, 3.9375, 40000) -PL_TEST_SYM_INTERVAL (erff, 3.9375, inf, 40000) -PL_TEST_SYM_INTERVAL (erff, 0, inf, 40000) +TEST_ULP (arm_math_erff, 1.43) +TEST_SYM_INTERVAL (arm_math_erff, 0, 3.9375, 40000) +TEST_SYM_INTERVAL (arm_math_erff, 3.9375, inf, 40000) +TEST_SYM_INTERVAL (arm_math_erff, 0, inf, 40000) diff --git a/pl/math/erfinv_24u5.c b/math/aarch64/experimental/erfinv_24u5.c similarity index 88% rename from pl/math/erfinv_24u5.c rename to math/aarch64/experimental/erfinv_24u5.c index 20e1e361befc2c..753f38a79f664d 100644 --- a/pl/math/erfinv_24u5.c +++ b/math/aarch64/experimental/erfinv_24u5.c @@ -1,14 +1,13 @@ /* * Double-precision inverse error function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" #include "poly_scalar_f64.h" -#include "pl_sig.h" -#define IGNORE_SCALAR_FENV -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" const static struct { @@ -75,7 +74,12 @@ erfinv (double x) / (copysign (t, x) * horner_9_f64 (t, data.Q_57)); } -PL_SIG (S, D, 1, erfinv, -0.99, 0.99) -PL_TEST_ULP (erfinv, 24.0) -PL_TEST_INTERVAL (erfinv, 0, 1, 40000) -PL_TEST_INTERVAL (erfinv, -0x1p-1022, -1, 40000) +#if USE_MPFR +# warning Not generating tests for erfinv, as MPFR has no suitable reference +#else +TEST_DISABLE_FENV (erfinv) +TEST_SIG (S, D, 1, erfinv, -0.99, 0.99) +TEST_ULP (erfinv, 24.0) +TEST_INTERVAL (erfinv, 0, 1, 40000) +TEST_INTERVAL (erfinv, -0x1p-1022, -1, 40000) +#endif diff --git a/pl/math/erfinvf_4u7.c b/math/aarch64/experimental/erfinvf_4u7.c similarity index 88% rename from pl/math/erfinvf_4u7.c rename to math/aarch64/experimental/erfinvf_4u7.c index 40736da08be846..152994f6336a7a 100644 --- a/pl/math/erfinvf_4u7.c +++ 
b/math/aarch64/experimental/erfinvf_4u7.c @@ -1,13 +1,13 @@ /* * Single-precision inverse error function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "poly_scalar_f32.h" #include "math_config.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" const static struct { @@ -69,6 +69,10 @@ erfinvf (float x) / (copysignf (t, x) * horner_2_f32 (t, data.Q_50)); } -PL_SIG (S, F, 1, erfinv, -0.99, 0.99) -PL_TEST_ULP (erfinvf, 4.09) -PL_TEST_SYM_INTERVAL (erfinvf, 0, 1, 40000) +#if USE_MPFR +# warning Not generating tests for erfinvf, as MPFR has no suitable reference +#else +TEST_SIG (S, F, 1, erfinv, -0.99, 0.99) +TEST_ULP (erfinvf, 4.09) +TEST_SYM_INTERVAL (erfinvf, 0, 1, 40000) +#endif diff --git a/pl/math/erfinvl.c b/math/aarch64/experimental/erfinvl.c similarity index 98% rename from pl/math/erfinvl.c rename to math/aarch64/experimental/erfinvl.c index ea4aadfccd00bc..4d91410f1a5c27 100644 --- a/pl/math/erfinvl.c +++ b/math/aarch64/experimental/erfinvl.c @@ -1,7 +1,7 @@ /* * Extended precision inverse error function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define _GNU_SOURCE diff --git a/pl/math/exp.c b/math/aarch64/experimental/exp_inline.h similarity index 93% rename from pl/math/exp.c rename to math/aarch64/experimental/exp_inline.h index 90253b68875dc3..1a327c1e67d362 100644 --- a/pl/math/exp.c +++ b/math/aarch64/experimental/exp_inline.h @@ -1,10 +1,13 @@ /* * Double-precision e^x function. * - * Copyright (c) 2018-2023, Arm Limited. + * Copyright (c) 2018-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ +#ifndef PL_MATH_EXP_INLINE_H +#define PL_MATH_EXP_INLINE_H + #include #include #include @@ -30,7 +33,7 @@ adjustment of scale, positive k here means the result may overflow and negative k means the result may underflow. */ static inline double -specialcase (double_t tmp, uint64_t sbits, uint64_t ki) +exp_inline_special_case (double_t tmp, uint64_t sbits, uint64_t ki) { double_t scale, y; @@ -77,7 +80,7 @@ top12 (double x) /* Computes exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|. If hastail is 0 then xtail is assumed to be 0 too. */ static inline double -exp_inline (double x, double xtail, int hastail) +exp_inline (double x, double xtail) { uint32_t abstop; uint64_t ki, idx, top, sbits; @@ -125,7 +128,7 @@ exp_inline (double x, double xtail, int hastail) #endif r = x + kd * NegLn2hiN + kd * NegLn2loN; /* The code assumes 2^-200 < |xtail| < 2^-8/N. */ - if (hastail) + if (!__builtin_constant_p (xtail) || xtail != 0.0) r += xtail; /* 2^(k/N) ~= scale * (1 + tail). */ idx = 2 * (ki % N); @@ -146,18 +149,11 @@ exp_inline (double x, double xtail, int hastail) tmp = tail + r + r2 * (0.5 + r * C3) + r2 * r2 * (C4 + r * C5 + r2 * C6); #endif if (unlikely (abstop == 0)) - return specialcase (tmp, sbits, ki); + return exp_inline_special_case (tmp, sbits, ki); scale = asdouble (sbits); /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there is no spurious underflow here even without fma. */ return eval_as_double (scale + scale * tmp); } -/* May be useful for implementing pow where more than double - precision input is needed. 
*/ -double -__exp_dd (double x, double xtail) -{ - return exp_inline (x, xtail, 1); -} - +#endif diff --git a/pl/math/expf_data.c b/math/aarch64/experimental/expf_data.c similarity index 93% rename from pl/math/expf_data.c rename to math/aarch64/experimental/expf_data.c index 474ad57a29a06a..958f705cc67694 100644 --- a/pl/math/expf_data.c +++ b/math/aarch64/experimental/expf_data.c @@ -2,7 +2,7 @@ * Coeffs and table entries for single-precision exp. Copied from * math/exp2f_data.c, with EXP2F_TABLE_BITS == 32. * - * Copyright (c) 2017-2023, Arm Limited. + * Copyright (c) 2017-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ @@ -13,7 +13,7 @@ const struct expf_data __expf_data = { /* tab[i] = uint(2^(i/N)) - (i << 52-BITS) used for computing 2^(k/N) for an int |k| < 150 N as - double(tab[k%N] + (k << 52-BITS)) */ + double(tab[k%N] + (k << 52-BITS)). */ .tab = { 0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f, 0x3fef9301d0125b51, 0x3fef72b83c7d517b, 0x3fef54873168b9aa, 0x3fef387a6e756238, 0x3fef1e9df51fdee1, diff --git a/pl/math/expm1_2u5.c b/math/aarch64/experimental/expm1_2u5.c similarity index 83% rename from pl/math/expm1_2u5.c rename to math/aarch64/experimental/expm1_2u5.c index f7d43119861482..a4805e832af3ee 100644 --- a/pl/math/expm1_2u5.c +++ b/math/aarch64/experimental/expm1_2u5.c @@ -1,14 +1,14 @@ /* * Double-precision e^x - 1 function. * - * Copyright (c) 2022-2023, Arm Limited. + * Copyright (c) 2022-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "poly_scalar_f64.h" #include "math_config.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" #define InvLn2 0x1.71547652b82fep0 #define Ln2hi 0x1.62e42fefa39efp-1 @@ -76,10 +76,10 @@ expm1 (double x) return 2 * fma (p, t, t - 0.5); } -PL_SIG (S, D, 1, expm1, -9.9, 9.9) -PL_TEST_ULP (expm1, 1.68) -PL_TEST_SYM_INTERVAL (expm1, 0, 0x1p-51, 1000) -PL_TEST_INTERVAL (expm1, 0x1p-51, 0x1.63108c75a1937p+9, 100000) -PL_TEST_INTERVAL (expm1, -0x1p-51, -0x1.740bf7c0d927dp+9, 100000) -PL_TEST_INTERVAL (expm1, 0x1.63108c75a1937p+9, inf, 100) -PL_TEST_INTERVAL (expm1, -0x1.740bf7c0d927dp+9, -inf, 100) +TEST_SIG (S, D, 1, expm1, -9.9, 9.9) +TEST_ULP (expm1, 1.68) +TEST_SYM_INTERVAL (expm1, 0, 0x1p-51, 1000) +TEST_INTERVAL (expm1, 0x1p-51, 0x1.63108c75a1937p+9, 100000) +TEST_INTERVAL (expm1, -0x1p-51, -0x1.740bf7c0d927dp+9, 100000) +TEST_INTERVAL (expm1, 0x1.63108c75a1937p+9, inf, 100) +TEST_INTERVAL (expm1, -0x1.740bf7c0d927dp+9, -inf, 100) diff --git a/math/aarch64/experimental/expm1_data.c b/math/aarch64/experimental/expm1_data.c new file mode 100644 index 00000000000000..95589505692438 --- /dev/null +++ b/math/aarch64/experimental/expm1_data.c @@ -0,0 +1,21 @@ +/* + * Coefficients for double-precision e^x - 1 function. + * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* Generated using fpminimax, see tools/expm1.sollya for details. 
*/ +const double __expm1_poly[] = { 0x1p-1, + 0x1.5555555555559p-3, + 0x1.555555555554bp-5, + 0x1.111111110f663p-7, + 0x1.6c16c16c1b5f3p-10, + 0x1.a01a01affa35dp-13, + 0x1.a01a018b4ecbbp-16, + 0x1.71ddf82db5bb4p-19, + 0x1.27e517fc0d54bp-22, + 0x1.af5eedae67435p-26, + 0x1.1f143d060a28ap-29 }; diff --git a/pl/math/expm1f_1u6.c b/math/aarch64/experimental/expm1f_1u6.c similarity index 82% rename from pl/math/expm1f_1u6.c rename to math/aarch64/experimental/expm1f_1u6.c index e12c9ba9a8a2d9..03d1e9dc31ef96 100644 --- a/pl/math/expm1f_1u6.c +++ b/math/aarch64/experimental/expm1f_1u6.c @@ -1,23 +1,23 @@ /* * Single-precision e^x - 1 function. * - * Copyright (c) 2022-2023, Arm Limited. + * Copyright (c) 2022-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "poly_scalar_f32.h" #include "math_config.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" #define Shift (0x1.8p23f) #define InvLn2 (0x1.715476p+0f) #define Ln2hi (0x1.62e4p-1f) #define Ln2lo (0x1.7f7d1cp-20f) #define AbsMask (0x7fffffff) -#define InfLimit \ +#define InfLimit \ (0x1.644716p6) /* Smallest value of x for which expm1(x) overflows. */ -#define NegLimit \ +#define NegLimit \ (-0x1.9bbabcp+6) /* Largest value of x for which expm1(x) rounds to 1. */ /* Approximation for exp(x) - 1 using polynomial on a reduced interval. 
@@ -70,10 +70,10 @@ expm1f (float x) return 2 * fmaf (p, t, t - 0.5f); } -PL_SIG (S, F, 1, expm1, -9.9, 9.9) -PL_TEST_ULP (expm1f, 1.02) -PL_TEST_SYM_INTERVAL (expm1f, 0, 0x1p-23, 1000) -PL_TEST_INTERVAL (expm1f, 0x1p-23, 0x1.644716p6, 100000) -PL_TEST_INTERVAL (expm1f, 0x1.644716p6, inf, 1000) -PL_TEST_INTERVAL (expm1f, -0x1p-23, -0x1.9bbabcp+6, 100000) -PL_TEST_INTERVAL (expm1f, -0x1.9bbabcp+6, -inf, 1000) +TEST_SIG (S, F, 1, expm1, -9.9, 9.9) +TEST_ULP (expm1f, 1.02) +TEST_SYM_INTERVAL (expm1f, 0, 0x1p-23, 1000) +TEST_INTERVAL (expm1f, 0x1p-23, 0x1.644716p6, 100000) +TEST_INTERVAL (expm1f, 0x1.644716p6, inf, 1000) +TEST_INTERVAL (expm1f, -0x1p-23, -0x1.9bbabcp+6, 100000) +TEST_INTERVAL (expm1f, -0x1.9bbabcp+6, -inf, 1000) diff --git a/pl/math/expm1f_data.c b/math/aarch64/experimental/expm1f_data.c similarity index 59% rename from pl/math/expm1f_data.c rename to math/aarch64/experimental/expm1f_data.c index 9d02dc448ebb1f..92d9189ff5033d 100644 --- a/pl/math/expm1f_data.c +++ b/math/aarch64/experimental/expm1f_data.c @@ -1,12 +1,12 @@ /* * Coefficients for single-precision e^x - 1 function. * - * Copyright (c) 2022-2023, Arm Limited. + * Copyright (c) 2022-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" /* Generated using fpminimax, see tools/expm1f.sollya for details. */ -const float __expm1f_poly[] = {0x1.fffffep-2, 0x1.5554aep-3, 0x1.555736p-5, - 0x1.12287cp-7, 0x1.6b55a2p-10}; +const float __expm1f_poly[] = { 0x1.fffffep-2, 0x1.5554aep-3, 0x1.555736p-5, + 0x1.12287cp-7, 0x1.6b55a2p-10 }; diff --git a/pl/math/log10_2u.c b/math/aarch64/experimental/log10_2u.c similarity index 84% rename from pl/math/log10_2u.c rename to math/aarch64/experimental/log10_2u.c index 74828ea9ef3caa..84ee1544fe1af9 100644 --- a/pl/math/log10_2u.c +++ b/math/aarch64/experimental/log10_2u.c @@ -1,13 +1,13 @@ /* * Double-precision log10(x) function. * - * Copyright (c) 2020-2023, Arm Limited. 
+ * Copyright (c) 2020-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" /* Polynomial coefficients and lookup tables. */ #define T __log10_data.tab @@ -32,11 +32,11 @@ top16 (double x) /* Fast and low accuracy implementation of log10. The implementation is similar to that of math/log, except that: - Polynomials are computed for log10(1+r) with r on same intervals as log. - - Lookup parameters are scaled (at runtime) to switch from base e to base 10. - Many errors above 1.59 ulp are observed across the whole range of doubles. - The greatest observed error is 1.61 ulp, at around 0.965: - log10(0x1.dc8710333a29bp-1) got -0x1.fee26884905a6p-6 - want -0x1.fee26884905a8p-6. */ + - Lookup parameters are scaled (at runtime) to switch from base e to + base 10. Many errors above 1.59 ulp are observed across the whole range of + doubles. The greatest observed error is 1.61 ulp, at around 0.965: + log10(0x1.dc8710333a29bp-1) got -0x1.fee26884905a6p-6 + want -0x1.fee26884905a8p-6. */ double log10 (double x) { @@ -61,8 +61,8 @@ log10 (double x) y = r3 * (B[1] + r * B[2] + r2 * B[3] + r3 - * (B[4] + r * B[5] + r2 * B[6] - + r3 * (B[7] + r * B[8] + r2 * B[9] + r3 * B[10]))); + * (B[4] + r * B[5] + r2 * B[6] + + r3 * (B[7] + r * B[8] + r2 * B[9] + r3 * B[10]))); /* Worst-case error is around 0.507 ULP. */ w = r * 0x1p27; double_t rhi = r + w - w; @@ -123,7 +123,8 @@ log10 (double x) r2 = r * r; /* rounding error: 0x1p-54/N^2. */ /* Scale by 1/ln(10). Polynomial already contains scaling. 
*/ - y = lo + r2 * A[0] + r * r2 * (A[1] + r * A[2] + r2 * (A[3] + r * A[4])) + hi; + y = lo + r2 * A[0] + r * r2 * (A[1] + r * A[2] + r2 * (A[3] + r * A[4])) + + hi; y = y * InvLn10; return eval_as_double (y); @@ -143,8 +144,8 @@ log10l (long double x) #endif // clang-format on -PL_SIG (S, D, 1, log10, 0.01, 11.1) -PL_TEST_ULP (log10, 1.11) -PL_TEST_INTERVAL (log10, 0, 0xffff000000000000, 10000) -PL_TEST_INTERVAL (log10, 0x1p-4, 0x1p4, 40000) -PL_TEST_INTERVAL (log10, 0, inf, 40000) +TEST_SIG (S, D, 1, log10, 0.01, 11.1) +TEST_ULP (log10, 1.11) +TEST_INTERVAL (log10, 0, 0xffff000000000000, 10000) +TEST_INTERVAL (log10, 0x1p-4, 0x1p4, 40000) +TEST_INTERVAL (log10, 0, inf, 40000) diff --git a/pl/math/log10_data.c b/math/aarch64/experimental/log10_data.c similarity index 99% rename from pl/math/log10_data.c rename to math/aarch64/experimental/log10_data.c index 9976f19cd6df3f..20b5ef883ed846 100644 --- a/pl/math/log10_data.c +++ b/math/aarch64/experimental/log10_data.c @@ -1,7 +1,7 @@ /* * Data for log10. * - * Copyright (c) 2020-2023, Arm Limited. + * Copyright (c) 2020-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ @@ -333,5 +333,5 @@ that logc + poly(z/c - 1) has small error, however near x == 1 when {0x1.5efffe7b87a89p+0, -0x1.47eb780ed6904p-54}, #endif }, -#endif /* !HAVE_FAST_FMA */ +#endif /* !HAVE_FAST_FMA. */ }; diff --git a/pl/math/log1p_2u.c b/math/aarch64/experimental/log1p_2u.c similarity index 91% rename from pl/math/log1p_2u.c rename to math/aarch64/experimental/log1p_2u.c index f9491ce52b4449..a1ff309ecb5fcc 100644 --- a/pl/math/log1p_2u.c +++ b/math/aarch64/experimental/log1p_2u.c @@ -1,19 +1,19 @@ /* * Double-precision log(1+x) function. * - * Copyright (c) 2022-2023, Arm Limited. + * Copyright (c) 2022-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "poly_scalar_f64.h" #include "math_config.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" #define Ln2Hi 0x1.62e42fefa3800p-1 #define Ln2Lo 0x1.ef35793c76730p-45 #define HfRt2Top 0x3fe6a09e /* top32(asuint64(sqrt(2)/2)). */ -#define OneMHfRt2Top \ +#define OneMHfRt2Top \ 0x00095f62 /* top32(asuint64(1)) - top32(asuint64(sqrt(2)/2)). */ #define OneTop12 0x3ff #define BottomMask 0xffffffff @@ -123,9 +123,9 @@ log1p (double x) return y + fma (Ln2Hi, kd, p); } -PL_SIG (S, D, 1, log1p, -0.9, 10.0) -PL_TEST_ULP (log1p, 1.26) -PL_TEST_SYM_INTERVAL (log1p, 0.0, 0x1p-23, 50000) -PL_TEST_SYM_INTERVAL (log1p, 0x1p-23, 0.001, 50000) -PL_TEST_SYM_INTERVAL (log1p, 0.001, 1.0, 50000) -PL_TEST_SYM_INTERVAL (log1p, 1.0, inf, 5000) +TEST_SIG (S, D, 1, log1p, -0.9, 10.0) +TEST_ULP (log1p, 1.26) +TEST_SYM_INTERVAL (log1p, 0.0, 0x1p-23, 50000) +TEST_SYM_INTERVAL (log1p, 0x1p-23, 0.001, 50000) +TEST_SYM_INTERVAL (log1p, 0.001, 1.0, 50000) +TEST_SYM_INTERVAL (log1p, 1.0, inf, 5000) diff --git a/math/aarch64/experimental/log1p_data.c b/math/aarch64/experimental/log1p_data.c new file mode 100644 index 00000000000000..91a7196d795f8a --- /dev/null +++ b/math/aarch64/experimental/log1p_data.c @@ -0,0 +1,20 @@ +/* + * Data used in double-precision log(1+x) function. + * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* Polynomial coefficients generated using Remez algorithm, see + log1p.sollya for details. 
*/ +const struct log1p_data __log1p_data + = { .coeffs + = { -0x1.ffffffffffffbp-2, 0x1.55555555551a9p-2, -0x1.00000000008e3p-2, + 0x1.9999999a32797p-3, -0x1.555555552fecfp-3, 0x1.249248e071e5ap-3, + -0x1.ffffff8bf8482p-4, 0x1.c71c8f07da57ap-4, -0x1.9999ca4ccb617p-4, + 0x1.7459ad2e1dfa3p-4, -0x1.554d2680a3ff2p-4, 0x1.3b4c54d487455p-4, + -0x1.2548a9ffe80e6p-4, 0x1.0f389a24b2e07p-4, -0x1.eee4db15db335p-5, + 0x1.e95b494d4a5ddp-5, -0x1.15fdf07cb7c73p-4, 0x1.0310b70800fcfp-4, + -0x1.cfa7385bdb37ep-6 } }; diff --git a/pl/math/log1pf_2u1.c b/math/aarch64/experimental/log1pf_2u1.c similarity index 93% rename from pl/math/log1pf_2u1.c rename to math/aarch64/experimental/log1pf_2u1.c index e991748537204d..fe4f9386522023 100644 --- a/pl/math/log1pf_2u1.c +++ b/math/aarch64/experimental/log1pf_2u1.c @@ -1,14 +1,14 @@ /* * Single-precision log(1+x) function. * - * Copyright (c) 2022-2023, Arm Limited. + * Copyright (c) 2022-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "poly_scalar_f32.h" #include "math_config.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" #define Ln2 (0x1.62e43p-1f) #define SignMask (0x80000000) @@ -153,9 +153,9 @@ log1pf (float x) return fmaf (scale_back, Ln2, p); } -PL_SIG (S, F, 1, log1p, -0.9, 10.0) -PL_TEST_ULP (log1pf, 1.52) -PL_TEST_SYM_INTERVAL (log1pf, 0.0, 0x1p-23, 50000) -PL_TEST_SYM_INTERVAL (log1pf, 0x1p-23, 0.001, 50000) -PL_TEST_SYM_INTERVAL (log1pf, 0.001, 1.0, 50000) -PL_TEST_SYM_INTERVAL (log1pf, 1.0, inf, 5000) +TEST_SIG (S, F, 1, log1p, -0.9, 10.0) +TEST_ULP (log1pf, 1.52) +TEST_SYM_INTERVAL (log1pf, 0.0, 0x1p-23, 50000) +TEST_SYM_INTERVAL (log1pf, 0x1p-23, 0.001, 50000) +TEST_SYM_INTERVAL (log1pf, 0.001, 1.0, 50000) +TEST_SYM_INTERVAL (log1pf, 1.0, inf, 5000) diff --git a/pl/math/log1pf_data.c b/math/aarch64/experimental/log1pf_data.c similarity index 59% rename from pl/math/log1pf_data.c rename to math/aarch64/experimental/log1pf_data.c index 
8c92d5738fe82a..e0ac269a10692c 100644 --- a/pl/math/log1pf_data.c +++ b/math/aarch64/experimental/log1pf_data.c @@ -1,7 +1,7 @@ /* * Data used in single-precision log1p(x) function. * - * Copyright (c) 2022-2023, Arm Limited. + * Copyright (c) 2022-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" @@ -9,6 +9,6 @@ /* Polynomial coefficients generated using floating-point minimax algorithm, see tools/log1pf.sollya for details. */ const struct log1pf_data __log1pf_data - = {.coeffs = {-0x1p-1f, 0x1.5555aap-2f, -0x1.000038p-2f, 0x1.99675cp-3f, - -0x1.54ef78p-3f, 0x1.28a1f4p-3f, -0x1.0da91p-3f, 0x1.abcb6p-4f, - -0x1.6f0d5ep-5f}}; + = { .coeffs = { -0x1p-1f, 0x1.5555aap-2f, -0x1.000038p-2f, 0x1.99675cp-3f, + -0x1.54ef78p-3f, 0x1.28a1f4p-3f, -0x1.0da91p-3f, + 0x1.abcb6p-4f, -0x1.6f0d5ep-5f } }; diff --git a/pl/math/sinh_3u.c b/math/aarch64/experimental/sinh_3u.c similarity index 72% rename from pl/math/sinh_3u.c rename to math/aarch64/experimental/sinh_3u.c index 1d86629ee2a352..39030d2750a92e 100644 --- a/pl/math/sinh_3u.c +++ b/math/aarch64/experimental/sinh_3u.c @@ -1,22 +1,19 @@ /* * Double-precision sinh(x) function. * - * Copyright (c) 2022-2023, Arm Limited. + * Copyright (c) 2022-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" +#include "exp_inline.h" #define AbsMask 0x7fffffffffffffff #define Half 0x3fe0000000000000 -#define OFlowBound \ - 0x40862e42fefa39f0 /* 0x1.62e42fefa39fp+9, above which using expm1 results \ - in NaN. */ - -double -__exp_dd (double, double); +/* 0x1.62e42fefa39fp+9, above which using expm1 results in NaN. */ +#define OFlowBound 0x40862e42fefa39f0 /* Approximation for double-precision sinh(x) using expm1. sinh(x) = (exp(x) - exp(-x)) / 2. @@ -44,7 +41,7 @@ sinh (double x) either. 
We use the identity: exp(a) = (exp(a / 2)) ^ 2 to compute sinh(x) ~= (exp(|x| / 2)) ^ 2 / 2 for x > 0 ~= (exp(|x| / 2)) ^ 2 / -2 for x < 0. */ - double e = __exp_dd (ax / 2, 0); + double e = exp_inline (ax / 2, 0); return (e * halfsign) * e; } @@ -56,8 +53,8 @@ sinh (double x) return (t + t / (t + 1)) * halfsign; } -PL_SIG (S, D, 1, sinh, -10.0, 10.0) -PL_TEST_ULP (sinh, 2.08) -PL_TEST_SYM_INTERVAL (sinh, 0, 0x1p-51, 100) -PL_TEST_SYM_INTERVAL (sinh, 0x1p-51, 0x1.62e42fefa39fp+9, 100000) -PL_TEST_SYM_INTERVAL (sinh, 0x1.62e42fefa39fp+9, inf, 1000) +TEST_SIG (S, D, 1, sinh, -10.0, 10.0) +TEST_ULP (sinh, 2.08) +TEST_SYM_INTERVAL (sinh, 0, 0x1p-51, 100) +TEST_SYM_INTERVAL (sinh, 0x1p-51, 0x1.62e42fefa39fp+9, 100000) +TEST_SYM_INTERVAL (sinh, 0x1.62e42fefa39fp+9, inf, 1000) diff --git a/pl/math/sinhf_2u3.c b/math/aarch64/experimental/sinhf_2u3.c similarity index 69% rename from pl/math/sinhf_2u3.c rename to math/aarch64/experimental/sinhf_2u3.c index aa7aadcf67c530..860ddc0fc83c26 100644 --- a/pl/math/sinhf_2u3.c +++ b/math/aarch64/experimental/sinhf_2u3.c @@ -1,25 +1,21 @@ /* * Single-precision sinh(x) function. * - * Copyright (c) 2022-2023, Arm Limited. + * Copyright (c) 2022-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ +#include "mathlib.h" #include "math_config.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" #define AbsMask 0x7fffffff #define Half 0x3f000000 -#define Expm1OFlowLimit \ - 0x42b17218 /* 0x1.62e43p+6, 2^7*ln2, minimum value for which expm1f \ - overflows. */ -#define OFlowLimit \ - 0x42b2d4fd /* 0x1.65a9fap+6, minimum positive value for which sinhf should \ - overflow. */ - -float -optr_aor_exp_f32 (float); +/* 0x1.62e43p+6, 2^7*ln2, minimum value for which expm1f overflows. */ +#define Expm1OFlowLimit 0x42b17218 +/* 0x1.65a9fap+6, minimum positive value for which sinhf should overflow. 
*/ +#define OFlowLimit 0x42b2d4fd /* Approximation for single-precision sinh(x) using expm1. sinh(x) = (exp(x) - exp(-x)) / 2. @@ -54,7 +50,7 @@ sinhf (float x) ~= (exp(|x| / 2)) ^ 2 / -2 for x < 0. Greatest error in this region is 1.89 ULP: sinhf(0x1.65898cp+6) got 0x1.f00aep+127 want 0x1.f00adcp+127. */ - float e = optr_aor_exp_f32 (ax / 2); + float e = expf (ax / 2); return (e * halfsign) * e; } @@ -66,8 +62,8 @@ sinhf (float x) return (t + t / (t + 1)) * halfsign; } -PL_SIG (S, F, 1, sinh, -10.0, 10.0) -PL_TEST_ULP (sinhf, 1.76) -PL_TEST_SYM_INTERVAL (sinhf, 0, 0x1.62e43p+6, 100000) -PL_TEST_SYM_INTERVAL (sinhf, 0x1.62e43p+6, 0x1.65a9fap+6, 100) -PL_TEST_SYM_INTERVAL (sinhf, 0x1.65a9fap+6, inf, 100) +TEST_SIG (S, F, 1, sinh, -10.0, 10.0) +TEST_ULP (sinhf, 1.76) +TEST_SYM_INTERVAL (sinhf, 0, 0x1.62e43p+6, 100000) +TEST_SYM_INTERVAL (sinhf, 0x1.62e43p+6, 0x1.65a9fap+6, 100) +TEST_SYM_INTERVAL (sinhf, 0x1.65a9fap+6, inf, 100) diff --git a/math/aarch64/experimental/sve/erfinv_25u.c b/math/aarch64/experimental/sve/erfinv_25u.c new file mode 100644 index 00000000000000..4de6d08ab80fc6 --- /dev/null +++ b/math/aarch64/experimental/sve/erfinv_25u.c @@ -0,0 +1,156 @@ +/* + * Double-precision inverse error function (SVE variant). + * + * Copyright (c) 2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "sv_math.h" +#include "test_defs.h" +#include "math_config.h" +#include "test_sig.h" +#include "sv_poly_f64.h" +#define SV_LOG_INLINE_POLY_ORDER 4 +#include "sv_log_inline.h" + +const static struct data +{ + /* We use P_N and Q_N to refer to arrays of coefficients, where P_N is the + coeffs of the numerator in table N of Blair et al, and Q_N is the coeffs + of the denominator. P is interleaved P_17 and P_37, similar for Q. 
*/ + double P[7][2], Q[7][2]; + double P_57[9], Q_57[9], tailshift, P37_0; + struct sv_log_inline_data log_tbl; +} data = { + .P37_0 = -0x1.f3596123109edp-7, + .tailshift = -0.87890625, + .P = { { 0x1.007ce8f01b2e8p+4, 0x1.60b8fe375999ep-2 }, + { -0x1.6b23cc5c6c6d7p+6, -0x1.779bb9bef7c0fp+1 }, + { 0x1.74e5f6ceb3548p+7, 0x1.786ea384470a2p+3 }, + { -0x1.5200bb15cc6bbp+7, -0x1.6a7c1453c85d3p+4 }, + { 0x1.05d193233a849p+6, 0x1.31f0fc5613142p+4 }, + { -0x1.148c5474ee5e1p+3, -0x1.5ea6c007d4dbbp+2 }, + { 0x1.689181bbafd0cp-3, 0x1.e66f265ce9e5p-3 } }, + .Q = { { 0x1.d8fb0f913bd7bp+3, -0x1.636b2dcf4edbep-7 }, + { -0x1.6d7f25a3f1c24p+6, 0x1.0b5411e2acf29p-2 }, + { 0x1.a450d8e7f4cbbp+7, -0x1.3413109467a0bp+1 }, + { -0x1.bc3480485857p+7, 0x1.563e8136c554ap+3 }, + { 0x1.ae6b0c504ee02p+6, -0x1.7b77aab1dcafbp+4 }, + { -0x1.499dfec1a7f5fp+4, 0x1.8a3e174e05ddcp+4 }, + { 0x1p+0, -0x1.4075c56404eecp+3 } }, + .P_57 = { 0x1.b874f9516f7f1p-14, 0x1.5921f2916c1c4p-7, 0x1.145ae7d5b8fa4p-2, + 0x1.29d6dcc3b2fb7p+1, 0x1.cabe2209a7985p+2, 0x1.11859f0745c4p+3, + 0x1.b7ec7bc6a2ce5p+2, 0x1.d0419e0bb42aep+1, 0x1.c5aa03eef7258p-1 }, + .Q_57 = { 0x1.b8747e12691f1p-14, 0x1.59240d8ed1e0ap-7, 0x1.14aef2b181e2p-2, + 0x1.2cd181bcea52p+1, 0x1.e6e63e0b7aa4cp+2, 0x1.65cf8da94aa3ap+3, + 0x1.7e5c787b10a36p+3, 0x1.0626d68b6cea3p+3, 0x1.065c5f193abf6p+2 }, + .log_tbl = SV_LOG_CONSTANTS +}; + +static inline svfloat64_t +special (svbool_t pg, svfloat64_t x, const struct data *d) +{ + /* Note erfinv(inf) should return NaN, and erfinv(1) should return Inf. + By using log here, instead of log1p, we return finite values for both + these inputs, and values outside [-1, 1]. This is non-compliant, but is an + acceptable optimisation at Ofast. To get correct behaviour for all finite + values use the log1p_inline helper on -abs(x) - note that erfinv(inf) + will still be finite. 
*/ + svfloat64_t ax = svabs_x (pg, x); + svfloat64_t t + = svneg_x (pg, sv_log_inline (pg, svsubr_x (pg, ax, 1), &d->log_tbl)); + t = svdivr_x (pg, svsqrt_x (pg, t), 1); + svuint64_t sign + = sveor_x (pg, svreinterpret_u64 (ax), svreinterpret_u64 (x)); + svfloat64_t ts + = svreinterpret_f64 (svorr_x (pg, sign, svreinterpret_u64 (t))); + + svfloat64_t q = svadd_x (pg, t, d->Q_57[8]); + for (int i = 7; i >= 0; i--) + q = svmad_x (pg, q, t, d->Q_57[i]); + + return svdiv_x (pg, sv_horner_8_f64_x (pg, t, d->P_57), svmul_x (pg, ts, q)); +} + +static inline svfloat64_t +lookup (const double *c, svuint64_t idx) +{ + svfloat64_t x = svld1rq_f64 (svptrue_b64 (), c); + return svtbl (x, idx); +} + +static inline svfloat64_t +notails (svbool_t pg, svfloat64_t x, const struct data *d) +{ + svfloat64_t t = svmad_x (pg, x, x, -0.5625); + svfloat64_t p = svmla_x (pg, sv_f64 (d->P[5][0]), t, d->P[6][0]); + svfloat64_t q = svadd_x (pg, t, d->Q[5][0]); + for (int i = 4; i >= 0; i--) + { + p = svmad_x (pg, t, p, d->P[i][0]); + q = svmad_x (pg, t, q, d->Q[i][0]); + } + p = svmul_x (pg, p, x); + return svdiv_x (pg, p, q); +} + +/* Vector implementation of Blair et al's rational approximation to inverse + error function in double precision. Largest observed error is 24.75 ULP: + _ZGVsMxv_erfinv(0x1.fc861d81c2ba8p-1) got 0x1.ea05472686625p+0 + want 0x1.ea0547268660cp+0. */ +svfloat64_t SV_NAME_D1 (erfinv) (svfloat64_t x, svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + /* Calculate inverse error using algorithm described in + J. M. Blair, C. A. Edwards, and J. H. Johnson, + "Rational Chebyshev approximations for the inverse of the error function", + Math. Comp. 30, pp. 827--830 (1976). + https://doi.org/10.1090/S0025-5718-1976-0421040-7. 
+ + Algorithm has 3 intervals: + - 'Normal' region [-0.75, 0.75] + - Tail region [0.75, 0.9375] U [-0.9375, -0.75] + - Extreme tail [-1, -0.9375] U [0.9375, 1] + Normal and tail are both rational approximation of similar order on + shifted input - these are typically performed in parallel using gather + loads to obtain correct coefficients depending on interval. */ + + svbool_t no_tail = svacle (pg, x, 0.75); + if (unlikely (!svptest_any (pg, svnot_z (pg, no_tail)))) + return notails (pg, x, d); + + svbool_t is_tail = svnot_z (pg, no_tail); + svbool_t extreme_tail = svacgt (pg, x, 0.9375); + svuint64_t idx = svdup_n_u64_z (is_tail, 1); + + svfloat64_t t = svsel_f64 (is_tail, sv_f64 (d->tailshift), sv_f64 (-0.5625)); + t = svmla_x (pg, t, x, x); + + svfloat64_t p = lookup (&d->P[6][0], idx); + svfloat64_t q + = svmla_x (pg, lookup (&d->Q[6][0], idx), svdup_n_f64_z (is_tail, 1), t); + for (int i = 5; i >= 0; i--) + { + p = svmla_x (pg, lookup (&d->P[i][0], idx), p, t); + q = svmla_x (pg, lookup (&d->Q[i][0], idx), q, t); + } + p = svmad_m (is_tail, p, t, d->P37_0); + p = svmul_x (pg, p, x); + + if (likely (svptest_any (pg, extreme_tail))) + return svsel (extreme_tail, special (pg, x, d), svdiv_x (pg, p, q)); + return svdiv_x (pg, p, q); +} + +#if USE_MPFR +# warning Not generating tests for _ZGVsMxv_erfinv, as MPFR has no suitable reference +#else +TEST_SIG (SV, D, 1, erfinv, -0.99, 0.99) +TEST_ULP (SV_NAME_D1 (erfinv), 24.5) +TEST_DISABLE_FENV (SV_NAME_D1 (erfinv)) +/* Test with control lane in each interval. 
*/ +TEST_SYM_INTERVAL (SV_NAME_D1 (erfinv), 0, 1, 100000) +TEST_CONTROL_VALUE (SV_NAME_D1 (erfinv), 0.5) +TEST_CONTROL_VALUE (SV_NAME_D1 (erfinv), 0.8) +TEST_CONTROL_VALUE (SV_NAME_D1 (erfinv), 0.95) +#endif +CLOSE_SVE_ATTR diff --git a/math/aarch64/experimental/sve/erfinvf_5u.c b/math/aarch64/experimental/sve/erfinvf_5u.c new file mode 100644 index 00000000000000..2c81c4e0b9a23d --- /dev/null +++ b/math/aarch64/experimental/sve/erfinvf_5u.c @@ -0,0 +1,156 @@ +/* + * Single-precision inverse error function (SVE variant). + * + * Copyright (c) 2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "sv_math.h" +#include "test_sig.h" +#include "test_defs.h" +#include "sv_poly_f32.h" +#include "sv_logf_inline.h" + +const static struct data +{ + /* We use P_N and Q_N to refer to arrays of coefficients, where P_N + is the coeffs of the numerator in table N of Blair et al, and + Q_N is the coeffs of the denominator. Coefficients stored in + interleaved format to support lookup scheme.
*/ + float P10_2, P29_3, Q10_2, Q29_2; + float P10_0, P29_1, P10_1, P29_2; + float Q10_0, Q29_0, Q10_1, Q29_1; + float P29_0, P_50[6], Q_50[2], tailshift; + struct sv_logf_data logf_tbl; +} data = { .P10_0 = -0x1.a31268p+3, + .P10_1 = 0x1.ac9048p+4, + .P10_2 = -0x1.293ff6p+3, + .P29_0 = -0x1.fc0252p-4, + .P29_1 = 0x1.119d44p+0, + .P29_2 = -0x1.f59ee2p+0, + .P29_3 = 0x1.b13626p-2, + .Q10_0 = -0x1.8265eep+3, + .Q10_1 = 0x1.ef5eaep+4, + .Q10_2 = -0x1.12665p+4, + .Q29_0 = -0x1.69952p-4, + .Q29_1 = 0x1.c7b7d2p-1, + .Q29_2 = -0x1.167d7p+1, + .P_50 = { 0x1.3d8948p-3, 0x1.61f9eap+0, 0x1.61c6bcp-1, + -0x1.20c9f2p+0, 0x1.5c704cp-1, -0x1.50c6bep-3 }, + .Q_50 = { 0x1.3d7dacp-3, 0x1.629e5p+0 }, + .tailshift = -0.87890625, + .logf_tbl = SV_LOGF_CONSTANTS }; + +static inline svfloat32_t +special (svbool_t pg, svfloat32_t x, const struct data *d) +{ + svfloat32_t ax = svabs_x (pg, x); + svfloat32_t t = svdivr_x ( + pg, + svsqrt_x (pg, svneg_x (pg, sv_logf_inline (pg, svsubr_x (pg, ax, 1), + &d->logf_tbl))), + 1); + svuint32_t sign + = sveor_x (pg, svreinterpret_u32 (ax), svreinterpret_u32 (x)); + svfloat32_t ts + = svreinterpret_f32 (svorr_x (pg, sign, svreinterpret_u32 (t))); + svfloat32_t q + = svmla_x (pg, sv_f32 (d->Q_50[0]), svadd_x (pg, t, d->Q_50[1]), t); + return svdiv_x (pg, sv_horner_5_f32_x (pg, t, d->P_50), svmul_x (pg, ts, q)); +} + +static inline svfloat32_t +notails (svbool_t pg, svfloat32_t x, const struct data *d) +{ + /* Shortcut when no input is in a tail region - no need to gather shift or + coefficients. */ + svfloat32_t t = svmad_x (pg, x, x, -0.5625); + svfloat32_t q = svadd_x (pg, t, d->Q10_2); + q = svmad_x (pg, t, q, d->Q10_1); + q = svmad_x (pg, t, q, d->Q10_0); + + svfloat32_t p = svmla_x (pg, sv_f32 (d->P10_1), t, d->P10_2); + p = svmad_x (pg, p, t, d->P10_0); + + return svdiv_x (pg, svmul_x (pg, x, p), q); +} + +/* Vector implementation of Blair et al's rational approximation to inverse + error function in single-precision. 
Worst-case error is 4.71 ULP, in the + tail region: + _ZGVsMxv_erfinvf(0x1.f84e9ap-1) got 0x1.b8326ap+0 + want 0x1.b83274p+0. */ +svfloat32_t SV_NAME_F1 (erfinv) (svfloat32_t x, svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + /* Calculate inverse error using algorithm described in + J. M. Blair, C. A. Edwards, and J. H. Johnson, + "Rational Chebyshev approximations for the inverse of the error function", + Math. Comp. 30, pp. 827--830 (1976). + https://doi.org/10.1090/S0025-5718-1976-0421040-7. */ + + /* Algorithm has 3 intervals: + - 'Normal' region [-0.75, 0.75] + - Tail region [0.75, 0.9375] U [-0.9375, -0.75] + - Extreme tail [-1, -0.9375] U [0.9375, 1] + Normal and tail are both rational approximation of similar order on + shifted input - these are typically performed in parallel using gather + loads to obtain correct coefficients depending on interval. */ + svbool_t is_tail = svacge (pg, x, 0.75); + svbool_t extreme_tail = svacge (pg, x, 0.9375); + + if (likely (!svptest_any (pg, is_tail))) + return notails (pg, x, d); + + /* Select requisite shift depending on interval: polynomial is evaluated on + x * x - shift. + Normal shift = 0.5625 + Tail shift = 0.87890625. */ + svfloat32_t t = svmla_x ( + pg, svsel (is_tail, sv_f32 (d->tailshift), sv_f32 (-0.5625)), x, x); + + svuint32_t idx = svdup_u32_z (is_tail, 1); + svuint32_t idxhi = svadd_x (pg, idx, 2); + + /* Load coeffs in quadwords and select them according to interval. 
*/ + svfloat32_t pqhi = svld1rq (svptrue_b32 (), &d->P10_2); + svfloat32_t plo = svld1rq (svptrue_b32 (), &d->P10_0); + svfloat32_t qlo = svld1rq (svptrue_b32 (), &d->Q10_0); + + svfloat32_t p2 = svtbl (pqhi, idx); + svfloat32_t p1 = svtbl (plo, idxhi); + svfloat32_t p0 = svtbl (plo, idx); + svfloat32_t q0 = svtbl (qlo, idx); + svfloat32_t q1 = svtbl (qlo, idxhi); + svfloat32_t q2 = svtbl (pqhi, idxhi); + + svfloat32_t p = svmla_x (pg, p1, p2, t); + p = svmla_x (pg, p0, p, t); + /* Tail polynomial has higher order - merge with normal lanes. */ + p = svmad_m (is_tail, p, t, d->P29_0); + svfloat32_t y = svmul_x (pg, x, p); + + /* Least significant term of both Q polynomials is 1, so no need to generate + it. */ + svfloat32_t q = svadd_x (pg, t, q2); + q = svmla_x (pg, q1, q, t); + q = svmla_x (pg, q0, q, t); + + if (unlikely (svptest_any (pg, extreme_tail))) + return svsel (extreme_tail, special (extreme_tail, x, d), + svdiv_x (pg, y, q)); + return svdiv_x (pg, y, q); +} + +#if USE_MPFR +# warning Not generating tests for _ZGVsMxv_erfinvf, as MPFR has no suitable reference +#else +TEST_SIG (SV, F, 1, erfinv, -0.99, 0.99) +TEST_ULP (SV_NAME_F1 (erfinv), 4.09) +TEST_DISABLE_FENV (SV_NAME_F1 (erfinv)) +TEST_SYM_INTERVAL (SV_NAME_F1 (erfinv), 0, 1, 40000) +TEST_CONTROL_VALUE (SV_NAME_F1 (erfinv), 0.5) +TEST_CONTROL_VALUE (SV_NAME_F1 (erfinv), 0.8) +TEST_CONTROL_VALUE (SV_NAME_F1 (erfinv), 0.95) +#endif +CLOSE_SVE_ATTR diff --git a/pl/math/sv_powi.c b/math/aarch64/experimental/sve/powi.c similarity index 96% rename from pl/math/sv_powi.c rename to math/aarch64/experimental/sve/powi.c index e53bf219553362..62dd1b11497073 100644 --- a/pl/math/sv_powi.c +++ b/math/aarch64/experimental/sve/powi.c @@ -1,7 +1,7 @@ /* * Double-precision SVE powi(x, n) function. * - * Copyright (c) 2020-2023, Arm Limited. + * Copyright (c) 2020-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ @@ -46,3 +46,4 @@ _ZGVsMxvv_powk (svfloat64_t as, svint64_t ns, svbool_t p) return acc; } +CLOSE_SVE_ATTR diff --git a/pl/math/sv_powif.c b/math/aarch64/experimental/sve/powif.c similarity index 96% rename from pl/math/sv_powif.c rename to math/aarch64/experimental/sve/powif.c index 7e032fd86a2047..fd74acf12df79f 100644 --- a/pl/math/sv_powif.c +++ b/math/aarch64/experimental/sve/powif.c @@ -1,7 +1,7 @@ /* * Single-precision SVE powi(x, n) function. * - * Copyright (c) 2020-2023, Arm Limited. + * Copyright (c) 2020-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ @@ -46,3 +46,4 @@ _ZGVsMxvv_powi (svfloat32_t as, svint32_t ns, svbool_t p) return acc; } +CLOSE_SVE_ATTR diff --git a/math/aarch64/experimental/sve/sv_logf_inline.h b/math/aarch64/experimental/sve/sv_logf_inline.h new file mode 100644 index 00000000000000..c317a23f6fc36e --- /dev/null +++ b/math/aarch64/experimental/sve/sv_logf_inline.h @@ -0,0 +1,51 @@ +/* + * Single-precision vector log function - inline version + * + * Copyright (c) 2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" + +struct sv_logf_data +{ + float p1, p3, p5, p6, p0, p2, p4; + float ln2; + uint32_t off, mantissa_mask; +}; + +#define SV_LOGF_CONSTANTS \ + { \ + .p0 = -0x1.ffffc8p-2f, .p1 = 0x1.555d7cp-2f, .p2 = -0x1.00187cp-2f, \ + .p3 = 0x1.961348p-3f, .p4 = -0x1.4f9934p-3f, .p5 = 0x1.5a9aa2p-3f, \ + .p6 = -0x1.3e737cp-3f, .ln2 = 0x1.62e43p-1f, .off = 0x3f2aaaab, \ + .mantissa_mask = 0x007fffff \ + } + +static inline svfloat32_t +sv_logf_inline (svbool_t pg, svfloat32_t x, const struct sv_logf_data *d) +{ + svuint32_t u = svreinterpret_u32 (x); + + /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ + u = svsub_x (pg, u, d->off); + svfloat32_t n = svcvt_f32_s32_x ( + pg, svasr_x (pg, svreinterpret_s32_u32 (u), 23)); /* signextend. 
*/ + u = svand_x (pg, u, d->mantissa_mask); + u = svadd_x (pg, u, d->off); + svfloat32_t r = svsub_x (pg, svreinterpret_f32 (u), 1.0f); + + /* y = log(1+r) + n*ln2. */ + svfloat32_t r2 = svmul_x (pg, r, r); + /* n*ln2 + r + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + r2*P7))). */ + svfloat32_t p1356 = svld1rq_f32 (svptrue_b32 (), &d->p1); + svfloat32_t p = svmla_lane (sv_f32 (d->p4), r, p1356, 2); + svfloat32_t q = svmla_lane (sv_f32 (d->p2), r, p1356, 1); + svfloat32_t y = svmla_lane (sv_f32 (d->p0), r, p1356, 0); + p = svmla_lane (p, r2, p1356, 3); + q = svmla_x (pg, q, p, r2); + y = svmla_x (pg, y, q, r2); + p = svmla_x (pg, r, n, d->ln2); + + return svmla_x (pg, p, y, r2); +} diff --git a/pl/math/tanf_3u3.c b/math/aarch64/experimental/tanf_3u3.c similarity index 80% rename from pl/math/tanf_3u3.c rename to math/aarch64/experimental/tanf_3u3.c index 30c86fa89730c3..c26e92db588fcb 100644 --- a/pl/math/tanf_3u3.c +++ b/math/aarch64/experimental/tanf_3u3.c @@ -1,12 +1,12 @@ /* * Single-precision scalar tan(x) function. * - * Copyright (c) 2021-2023, Arm Limited. + * Copyright (c) 2021-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" #include "poly_scalar_f32.h" /* Useful constants. */ @@ -52,14 +52,6 @@ reduce (float x, int32_t *in) return r; } -/* Table with 4/PI to 192 bit precision. To avoid unaligned accesses - only 8 new bits are added per entry, making the table 4 times larger. */ -static const uint32_t __inv_pio4[24] - = {0x000000a2, 0x0000a2f9, 0x00a2f983, 0xa2f9836e, 0xf9836e4e, 0x836e4e44, - 0x6e4e4415, 0x4e441529, 0x441529fc, 0x1529fc27, 0x29fc2757, 0xfc2757d1, - 0x2757d1f5, 0x57d1f534, 0xd1f534dd, 0xf534ddc0, 0x34ddc0db, 0xddc0db62, - 0xc0db6295, 0xdb629599, 0x6295993c, 0x95993c43, 0x993c4390, 0x3c439041}; - /* Reduce the range of XI to a multiple of PI/2 using fast integer arithmetic. 
XI is a reinterpreted float and must be >= 2.0f (the sign bit is ignored). Return the modulo between -PI/4 and PI/4 and store the quadrant in NP. @@ -130,11 +122,11 @@ tanf (float x) return fmaf (x2, x * y, x); } /* Similar to other trigonometric routines, fast inaccurate reduction is - performed for values of x from pi/4 up to RangeVal. In order to keep errors - below 3.5ulps, we set the value of RangeVal to 2^17. This might differ for - other trigonometric routines. Above this value more advanced but slower - reduction techniques need to be implemented to reach a similar accuracy. - */ + performed for values of x from pi/4 up to RangeVal. In order to keep + errors below 3.5ulps, we set the value of RangeVal to 2^17. This might + differ for other trigonometric routines. Above this value more advanced + but slower reduction techniques need to be implemented to reach a similar + accuracy. */ else if (ia12 < top12 (RangeVal)) { /* Fast inaccurate reduction. */ @@ -182,12 +174,12 @@ tanf (float x) return fmaf (scale, p, offset); } -PL_SIG (S, F, 1, tan, -3.1, 3.1) -PL_TEST_ULP (tanf, 2.80) -PL_TEST_INTERVAL (tanf, 0, 0xffff0000, 10000) -PL_TEST_SYM_INTERVAL (tanf, 0x1p-127, 0x1p-14, 50000) -PL_TEST_SYM_INTERVAL (tanf, 0x1p-14, 0.7, 50000) -PL_TEST_SYM_INTERVAL (tanf, 0.7, 1.5, 50000) -PL_TEST_SYM_INTERVAL (tanf, 1.5, 0x1p17, 50000) -PL_TEST_SYM_INTERVAL (tanf, 0x1p17, 0x1p54, 50000) -PL_TEST_SYM_INTERVAL (tanf, 0x1p54, inf, 50000) +TEST_SIG (S, F, 1, tan, -3.1, 3.1) +TEST_ULP (tanf, 2.80) +TEST_INTERVAL (tanf, 0, 0xffff0000, 10000) +TEST_SYM_INTERVAL (tanf, 0x1p-127, 0x1p-14, 50000) +TEST_SYM_INTERVAL (tanf, 0x1p-14, 0.7, 50000) +TEST_SYM_INTERVAL (tanf, 0.7, 1.5, 50000) +TEST_SYM_INTERVAL (tanf, 1.5, 0x1p17, 50000) +TEST_SYM_INTERVAL (tanf, 0x1p17, 0x1p54, 50000) +TEST_SYM_INTERVAL (tanf, 0x1p54, inf, 50000) diff --git a/pl/math/tanf_data.c b/math/aarch64/experimental/tanf_data.c similarity index 96% rename from pl/math/tanf_data.c rename to 
math/aarch64/experimental/tanf_data.c index a6b9d512eed2c1..f310cd77d4ecbc 100644 --- a/pl/math/tanf_data.c +++ b/math/aarch64/experimental/tanf_data.c @@ -1,7 +1,7 @@ /* * Data used in single-precision tan(x) function. * - * Copyright (c) 2022-2023, Arm Limited. + * Copyright (c) 2022-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/tanh_3u.c b/math/aarch64/experimental/tanh_3u.c similarity index 80% rename from pl/math/tanh_3u.c rename to math/aarch64/experimental/tanh_3u.c index 86f2904afc32d0..838b6c4f12c133 100644 --- a/pl/math/tanh_3u.c +++ b/math/aarch64/experimental/tanh_3u.c @@ -1,13 +1,13 @@ /* * Double-precision tanh(x) function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" #include "poly_scalar_f64.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" #define AbsMask 0x7fffffffffffffff #define InvLn2 0x1.71547652b82fep0 @@ -15,8 +15,10 @@ #define Ln2lo 0x1.abc9e3b39803fp-56 #define Shift 0x1.8p52 -#define BoringBound 0x403241bf835f9d5f /* asuint64 (0x1.241bf835f9d5fp+4). */ -#define TinyBound 0x3e40000000000000 /* asuint64 (0x1p-27). */ +/* asuint64 (0x1.241bf835f9d5fp+4). */ +#define BoringBound 0x403241bf835f9d5f +/* asuint64 (0x1p-27). 
*/ +#define TinyBound 0x3e40000000000000 #define One 0x3ff0000000000000 static inline double @@ -71,8 +73,8 @@ tanh (double x) return q / (q + 2); } -PL_SIG (S, D, 1, tanh, -10.0, 10.0) -PL_TEST_ULP (tanh, 2.27) -PL_TEST_SYM_INTERVAL (tanh, 0, TinyBound, 1000) -PL_TEST_SYM_INTERVAL (tanh, TinyBound, BoringBound, 100000) -PL_TEST_SYM_INTERVAL (tanh, BoringBound, inf, 1000) +TEST_SIG (S, D, 1, tanh, -10.0, 10.0) +TEST_ULP (tanh, 2.27) +TEST_SYM_INTERVAL (tanh, 0, TinyBound, 1000) +TEST_SYM_INTERVAL (tanh, TinyBound, BoringBound, 100000) +TEST_SYM_INTERVAL (tanh, BoringBound, inf, 1000) diff --git a/pl/math/tanhf_2u6.c b/math/aarch64/experimental/tanhf_2u6.c similarity index 79% rename from pl/math/tanhf_2u6.c rename to math/aarch64/experimental/tanhf_2u6.c index 93ea3cf5d865ae..d9adae5c3a76bb 100644 --- a/pl/math/tanhf_2u6.c +++ b/math/aarch64/experimental/tanhf_2u6.c @@ -1,16 +1,15 @@ /* * Single-precision tanh(x) function. * - * Copyright (c) 2022-2023, Arm Limited. + * Copyright (c) 2022-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" -#define BoringBound \ - 0x41102cb3 /* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for \ - negative). */ +/* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for negative). */ +#define BoringBound 0x41102cb3 #define AbsMask 0x7fffffff #define One 0x3f800000 @@ -26,8 +25,8 @@ expm1f_inline (float x) { /* Helper routine for calculating exp(x) - 1. Copied from expm1f_1u6.c, with several simplifications: - - No special-case handling for tiny or special values, instead return early - from the main routine. + - No special-case handling for tiny or special values, instead return + early from the main routine. - No special handling for large values: - No early return for infinity. - Simpler combination of p and t in final stage of algorithm. 
@@ -81,8 +80,8 @@ tanhf (float x) return q / (q + 2); } -PL_SIG (S, F, 1, tanh, -10.0, 10.0) -PL_TEST_ULP (tanhf, 2.09) -PL_TEST_SYM_INTERVAL (tanhf, 0, 0x1p-23, 1000) -PL_TEST_SYM_INTERVAL (tanhf, 0x1p-23, 0x1.205966p+3, 100000) -PL_TEST_SYM_INTERVAL (tanhf, 0x1.205966p+3, inf, 100) +TEST_SIG (S, F, 1, tanh, -10.0, 10.0) +TEST_ULP (tanhf, 2.09) +TEST_SYM_INTERVAL (tanhf, 0, 0x1p-23, 1000) +TEST_SYM_INTERVAL (tanhf, 0x1p-23, 0x1.205966p+3, 100000) +TEST_SYM_INTERVAL (tanhf, 0x1.205966p+3, inf, 100) diff --git a/math/aarch64/sincospi_4u.c b/math/aarch64/sincospi_4u.c new file mode 100644 index 00000000000000..2a944bed23e15d --- /dev/null +++ b/math/aarch64/sincospi_4u.c @@ -0,0 +1,158 @@ +/* + * Double-precision scalar sincospi function. + * + * Copyright (c) 2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "math_config.h" +#include "test_sig.h" +#include "test_defs.h" +#include "poly_scalar_f64.h" + +/* Taylor series coefficients for sin(pi * x). + C2 coefficient (originally ~=5.16771278) has been split into two parts: + C2_hi = 4, C2_lo = C2 - C2_hi (~=1.16771278) + This change in magnitude reduces floating point rounding errors. + C2_hi is then reintroduced after the polynomial approximation. */ +const static struct sincospi_data +{ + double poly[10]; +} sincospi_data = { + /* Taylor series coefficients for sin(pi * x). */ + .poly = { 0x1.921fb54442d184p1, -0x1.2aef39896f94bp0, 0x1.466bc6775ab16p1, + -0x1.32d2cce62dc33p-1, 0x1.507834891188ep-4, -0x1.e30750a28c88ep-8, + 0x1.e8f48308acda4p-12, -0x1.6fc0032b3c29fp-16, + 0x1.af86ae521260bp-21, -0x1.012a9870eeb7dp-25 }, +}; + +/* Top 12 bits of a double with the sign bit cleared (exponent bits only). */ +static inline uint64_t +abstop12 (double x) +{ + return (asuint64 (x) >> 52) & 0x7ff; +} + +/* Triages special cases into 4 categories: + -1 or +1 if iy represents half an integer + -1 if round(y) is odd. + +1 if round(y) is even.
+ -2 or +2 if iy represents an integer. + -2 if iy is odd. + +2 if iy is even. + The argument is the bit representation of a positive non-zero + finite floating-point value which is either a half or an integer. */ +static inline int +checkint (uint64_t iy) +{ + int e = iy >> 52; + if (e > 0x3ff + 52) + return 2; + if (iy & ((1ULL << (0x3ff + 52 - e)) - 1)) + { + if ((iy - 1) & 2) + return -1; + else + return 1; + } + if (iy & (1 << (0x3ff + 52 - e))) + return -2; + return 2; +} + +/* Approximation for scalar double-precision sincospi(x). + Maximum error for sin: 3.46 ULP: + sincospi_sin(0x1.3d8a067cd8961p+14) got 0x1.ffe609a279008p-1 want + 0x1.ffe609a27900cp-1. + Maximum error for cos: 3.66 ULP: + sincospi_cos(0x1.a0ec6997557eep-24) got 0x1.ffffffffffe59p-1 want + 0x1.ffffffffffe5dp-1. */ +void +arm_math_sincospi (double x, double *out_sin, double *out_cos) +{ + const struct sincospi_data *d = ptr_barrier (&sincospi_data); + uint64_t sign = asuint64 (x) & 0x8000000000000000; + + if (likely (abstop12 (x) < abstop12 (0x1p51))) + { + /* ax = |x| - n (range reduction into -1/2 .. 1/2). */ + double ar_s = x - rint (x); + + /* We know that cospi(x) = sinpi(0.5 - x) + range reduction and offset into sinpi range -1/2 .. 1/2 + ax = 0.5 - |x - rint(x)|. */ + double ar_c = 0.5 - fabs (ar_s); + + /* ss = sin(pi * ax). */ + double ar2_s = ar_s * ar_s; + double ar2_c = ar_c * ar_c; + double ar4_s = ar2_s * ar2_s; + double ar4_c = ar2_c * ar2_c; + + uint64_t cc_sign = ((uint64_t) llrint (x)) << 63; + uint64_t ss_sign = cc_sign; + if (ar_s == 0) + ss_sign = sign; + + double ss = pw_horner_9_f64 (ar2_s, ar4_s, d->poly); + double cc = pw_horner_9_f64 (ar2_c, ar4_c, d->poly); + + /* As all values are reduced to -1/2 .. 1/2, the result of cos(x) + will always be positive, therefore, the sign must be introduced + based upon if x rounds to odd or even. For sin(x) the sign is + copied from x.
*/ + *out_sin + = asdouble (asuint64 (fma (-4 * ar2_s, ar_s, ss * ar_s)) ^ ss_sign); + *out_cos + = asdouble (asuint64 (fma (-4 * ar2_c, ar_c, cc * ar_c)) ^ cc_sign); + } + else + { + /* When abs(x) > 0x1p51, the x will be either + - Half integer (relevant if abs(x) in [0x1p51, 0x1p52]) + - Odd integer (relevant if abs(x) in [0x1p52, 0x1p53]) + - Even integer (relevant if abs(x) in [0x1p53, inf]) + - Inf or NaN. */ + if (abstop12 (x) >= 0x7ff) + { + double inv_result = __math_invalid (x); + *out_sin = inv_result; + *out_cos = inv_result; + return; + } + else + { + uint64_t ax = asuint64 (x) & 0x7fffffffffffffff; + int m = checkint (ax); + /* The case where ax is half integer. */ + if (m & 1) + { + *out_sin = sign ? -m : m; + *out_cos = 0; + return; + } + /* The case where ax is integer. */ + else + { + *out_sin = asdouble (sign); + *out_cos = m >> 1; + return; + } + } + } +} + +#if WANT_TRIGPI_TESTS +TEST_DISABLE_FENV (arm_math_sincospi_sin) +TEST_DISABLE_FENV (arm_math_sincospi_cos) +TEST_ULP (arm_math_sincospi_sin, 2.96) +TEST_ULP (arm_math_sincospi_cos, 3.16) +# define SINCOS_INTERVAL(lo, hi, n) \ + TEST_SYM_INTERVAL (arm_math_sincospi_sin, lo, hi, n) \ + TEST_SYM_INTERVAL (arm_math_sincospi_cos, lo, hi, n) +SINCOS_INTERVAL (0, 0x1p-63, 10000) +SINCOS_INTERVAL (0x1p-63, 0.5, 50000) +SINCOS_INTERVAL (0.5, 0x1p51, 50000) +SINCOS_INTERVAL (0x1p51, inf, 10000) +#endif diff --git a/math/aarch64/sincospif_3u2.c b/math/aarch64/sincospif_3u2.c new file mode 100644 index 00000000000000..b79694d2ac656f --- /dev/null +++ b/math/aarch64/sincospif_3u2.c @@ -0,0 +1,145 @@ +/* + * Single-precision scalar sincospi function. + * + * Copyright (c) 2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" +#include "test_sig.h" +#include "test_defs.h" +#include "poly_scalar_f32.h" + +/* Taylor series coefficents for sin(pi * x). 
*/ +const static struct sincospif_data +{ + float poly[6]; +} sincospif_data = { + /* Taylor series coefficients for sin(pi * x). */ + .poly = { 0x1.921fb6p1f, -0x1.4abbcep2f, 0x1.466bc6p1f, -0x1.32d2ccp-1f, + 0x1.50783p-4f, -0x1.e30750p-8f }, +}; + +/* Top 12 bits of the float representation with the sign bit cleared. */ +static inline uint32_t +abstop12 (float x) +{ + return (asuint (x) >> 20) & 0x7ff; +} + +/* Triages special cases into 4 categories: + -1 or +1 if iy represents half an integer + -1 if round(y) is odd. + +1 if round(y) is even. + -2 or +2 if iy represents an integer. + -2 if iy is odd. + +2 if iy is even. + The argument is the bit representation of a positive non-zero + finite floating-point value which is either a half or an integer. */ +static inline int +checkint (uint32_t iy) +{ + int e = iy >> 23; + if (e > 0x7f + 23) + return 2; + if (iy & ((1 << (0x7f + 23 - e)) - 1)) + { + if ((iy - 1) & 2) + return -1; + else + return 1; + } + if (iy & (1 << (0x7f + 23 - e))) + return -2; + return 2; +} + +/* Approximation for scalar single-precision sincospif(x). + Maximum error for sin: 3.04 ULP: + sincospif_sin(0x1.c597ccp-2) got 0x1.f7cd56p-1 want 0x1.f7cd5p-1. + Maximum error for cos: 3.18 ULP: + sincospif_cos(0x1.d341a8p-5) got 0x1.f7cd56p-1 want 0x1.f7cd5p-1. */ +void +arm_math_sincospif (float x, float *out_sin, float *out_cos) +{ + + const struct sincospif_data *d = ptr_barrier (&sincospif_data); + uint32_t sign = asuint (x) & 0x80000000; + + /* abs(x) in [0, 0x1p22]. */ + if (likely (abstop12 (x) < abstop12 (0x1p22))) + { + /* ar_s = x - n (range reduction into -1/2 .. 1/2). */ + float ar_s = x - rintf (x); + /* We know that cospi(x) = sinpi(0.5 - x) + range reduction and offset into sinpi range -1/2 .. 1/2 + ar_c = 0.5 - |x - n|.
*/ + float ar_c = 0.5f - fabsf (ar_s); + + float ar2_s = ar_s * ar_s; + float ar2_c = ar_c * ar_c; + float ar4_s = ar2_s * ar2_s; + float ar4_c = ar2_c * ar2_c; + + uint32_t cc_sign = lrintf (x) << 31; + uint32_t ss_sign = cc_sign; + if (ar_s == 0) + ss_sign = sign; + + /* As all values are reduced to -1/2 .. 1/2, the result of cos(x) + always be positive, therefore, the sign must be introduced + based upon if x rounds to odd or even. For sin(x) the sign is + copied from x. */ + *out_sin = pw_horner_5_f32 (ar2_s, ar4_s, d->poly) + * asfloat (asuint (ar_s) ^ ss_sign); + *out_cos = pw_horner_5_f32 (ar2_c, ar4_c, d->poly) + * asfloat (asuint (ar_c) ^ cc_sign); + return; + } + else + { + /* When abs(x) > 0x1p22, the x will be either + - Half integer (relevant if abs(x) in [0x1p22, 0x1p23]) + - Odd integer (relevant if abs(x) in [0x1p22, 0x1p24]) + - Even integer (relevant if abs(x) in [0x1p22, inf]) + - Inf or NaN. */ + if (abstop12 (x) >= 0x7f8) + { + float inv_result = __math_invalidf (x); + *out_sin = inv_result; + *out_cos = inv_result; + return; + } + else + { + uint32_t ax = asuint (x) & 0x7fffffff; + int m = checkint (ax); + if (m & 1) + { + *out_sin = sign ? 
-m : m; + *out_cos = 0; + return; + } + else + { + *out_sin = asfloat (sign); + *out_cos = m >> 1; + return; + } + } + } +} + +#if WANT_TRIGPI_TESTS +TEST_DISABLE_FENV (arm_math_sincospif_sin) +TEST_DISABLE_FENV (arm_math_sincospif_cos) +TEST_ULP (arm_math_sincospif_sin, 2.54) +TEST_ULP (arm_math_sincospif_cos, 2.68) +# define SINCOSPIF_INTERVAL(lo, hi, n) \ + TEST_SYM_INTERVAL (arm_math_sincospif_sin, lo, hi, n) \ + TEST_SYM_INTERVAL (arm_math_sincospif_cos, lo, hi, n) +SINCOSPIF_INTERVAL (0, 0x1p-31, 10000) +SINCOSPIF_INTERVAL (0x1p-31, 1, 50000) +SINCOSPIF_INTERVAL (1, 0x1p22f, 50000) +SINCOSPIF_INTERVAL (0x1p22f, inf, 10000) +#endif diff --git a/pl/math/sinpi_3u.c b/math/aarch64/sinpi_3u5.c similarity index 76% rename from pl/math/sinpi_3u.c rename to math/aarch64/sinpi_3u5.c index a04a352a62e635..f96d9a312b53ca 100644 --- a/pl/math/sinpi_3u.c +++ b/math/aarch64/sinpi_3u5.c @@ -1,7 +1,7 @@ /* * Double-precision scalar sinpi function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ @@ -9,8 +9,8 @@ #include #include "mathlib.h" #include "math_config.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" #include "poly_scalar_f64.h" /* Taylor series coefficents for sin(pi * x). @@ -25,15 +25,17 @@ static const double poly[] -0x1.012a9870eeb7dp-25 }; #define Shift 0x1.8p+52 +/* TODO Store constant in structure for more efficient load. */ +#define Pi 0x1.921fb54442d18p+1 /* Approximation for scalar double-precision sinpi(x). Maximum error: 3.03 ULP: sinpi(0x1.a90da2818f8b5p+7) got 0x1.fe358f255a4b3p-1 want 0x1.fe358f255a4b6p-1. */ double -sinpi (double x) +arm_math_sinpi (double x) { - if (isinf (x)) + if (isinf (x) || isnan (x)) return __math_invalid (x); double r = asdouble (asuint64 (x) & ~0x8000000000000000); @@ -42,17 +44,17 @@ sinpi (double x) /* Edge cases for when sinpif should be exactly 0. 
(Integers) 0x1p53 is the limit for single precision to store any decimal places. */ if (r >= 0x1p53) - return 0; + return asdouble (sign); /* If x is an integer, return 0. */ uint64_t m = (uint64_t) r; if (r == m) - return 0; + return asdouble (sign); /* For very small inputs, squaring r causes underflow. Values below this threshold can be approximated via sinpi(x) ≈ pi*x. */ if (r < 0x1p-63) - return M_PI * x; + return Pi * x; /* Any non-integer values >= 0x1x51 will be int + 0.5. These values should return exactly 1 or -1. */ @@ -82,9 +84,18 @@ sinpi (double x) return asdouble (asuint64 (y) ^ sign); } -PL_SIG (S, D, 1, sinpi, -0.9, 0.9) -PL_TEST_ULP (sinpi, 2.53) -PL_TEST_SYM_INTERVAL (sinpi, 0, 0x1p-63, 5000) -PL_TEST_SYM_INTERVAL (sinpi, 0x1p-63, 0.5, 10000) -PL_TEST_SYM_INTERVAL (sinpi, 0.5, 0x1p51, 10000) -PL_TEST_SYM_INTERVAL (sinpi, 0x1p51, inf, 10000) +#if WANT_EXPERIMENTAL_MATH +double +sinpi (double x) +{ + return arm_math_sinpi (x); +} +#endif + +#if WANT_TRIGPI_TESTS +TEST_ULP (arm_math_sinpi, 2.53) +TEST_SYM_INTERVAL (arm_math_sinpi, 0, 0x1p-63, 5000) +TEST_SYM_INTERVAL (arm_math_sinpi, 0x1p-63, 0.5, 10000) +TEST_SYM_INTERVAL (arm_math_sinpi, 0.5, 0x1p51, 10000) +TEST_SYM_INTERVAL (arm_math_sinpi, 0x1p51, inf, 10000) +#endif diff --git a/pl/math/sinpif_2u5.c b/math/aarch64/sinpif_2u5.c similarity index 75% rename from pl/math/sinpif_2u5.c rename to math/aarch64/sinpif_2u5.c index af9ca0573b374f..b5d9cd9145771c 100644 --- a/pl/math/sinpif_2u5.c +++ b/math/aarch64/sinpif_2u5.c @@ -1,14 +1,14 @@ /* * Single-precision scalar sinpi function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" #include "math_config.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" /* Taylor series coefficents for sin(pi * x). 
*/ #define C0 0x1.921fb6p1f @@ -25,9 +25,9 @@ sinpif(0x1.d062b6p-2) got 0x1.fa8c06p-1 want 0x1.fa8c02p-1. */ float -sinpif (float x) +arm_math_sinpif (float x) { - if (isinf (x)) + if (isinf (x) || isnan (x)) return __math_invalidf (x); float r = asfloat (asuint (x) & ~0x80000000); @@ -36,11 +36,11 @@ sinpif (float x) /* Edge cases for when sinpif should be exactly 0. (Integers) 0x1p23 is the limit for single precision to store any decimal places. */ if (r >= 0x1p23f) - return 0; + return asfloat (sign); int32_t m = roundf (r); if (m == r) - return 0; + return asfloat (sign); /* For very small inputs, squaring r causes underflow. Values below this threshold can be approximated via sinpi(x) ~= pi*x. */ @@ -75,9 +75,18 @@ sinpif (float x) return asfloat (asuint (y * r) ^ sign); } -PL_SIG (S, F, 1, sinpi, -0.9, 0.9) -PL_TEST_ULP (sinpif, 1.99) -PL_TEST_SYM_INTERVAL (sinpif, 0, 0x1p-31, 5000) -PL_TEST_SYM_INTERVAL (sinpif, 0x1p-31, 0.5, 10000) -PL_TEST_SYM_INTERVAL (sinpif, 0.5, 0x1p22f, 10000) -PL_TEST_SYM_INTERVAL (sinpif, 0x1p22f, inf, 10000) +#if WANT_EXPERIMENTAL_MATH +float +sinpif (float x) +{ + return arm_math_sinpif (x); +} +#endif + +#if WANT_TRIGPI_TESTS +TEST_ULP (arm_math_sinpif, 1.99) +TEST_SYM_INTERVAL (arm_math_sinpif, 0, 0x1p-31, 5000) +TEST_SYM_INTERVAL (arm_math_sinpif, 0x1p-31, 0.5, 10000) +TEST_SYM_INTERVAL (arm_math_sinpif, 0.5, 0x1p22f, 10000) +TEST_SYM_INTERVAL (arm_math_sinpif, 0x1p22f, inf, 10000) +#endif diff --git a/pl/math/sv_acos_2u.c b/math/aarch64/sve/acos.c similarity index 85% rename from pl/math/sv_acos_2u.c rename to math/aarch64/sve/acos.c index e06db6cae6af9d..da633392aa3e20 100644 --- a/pl/math/sv_acos_2u.c +++ b/math/aarch64/sve/acos.c @@ -1,14 +1,14 @@ /* * Double-precision SVE acos(x) function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "sv_math.h" -#include "poly_sve_f64.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "sv_poly_f64.h" +#include "test_sig.h" +#include "test_defs.h" static const struct data { @@ -82,10 +82,12 @@ svfloat64_t SV_NAME_D1 (acos) (svfloat64_t x, const svbool_t pg) return svmla_x (pg, add, mul, y); } -PL_SIG (SV, D, 1, acos, -1.0, 1.0) -PL_TEST_ULP (SV_NAME_D1 (acos), 1.02) -PL_TEST_INTERVAL (SV_NAME_D1 (acos), 0, 0.5, 50000) -PL_TEST_INTERVAL (SV_NAME_D1 (acos), 0.5, 1.0, 50000) -PL_TEST_INTERVAL (SV_NAME_D1 (acos), 1.0, 0x1p11, 50000) -PL_TEST_INTERVAL (SV_NAME_D1 (acos), 0x1p11, inf, 20000) -PL_TEST_INTERVAL (SV_NAME_D1 (acos), -0, -inf, 20000) +TEST_SIG (SV, D, 1, acos, -1.0, 1.0) +TEST_ULP (SV_NAME_D1 (acos), 1.02) +TEST_DISABLE_FENV (SV_NAME_D1 (acos)) +TEST_INTERVAL (SV_NAME_D1 (acos), 0, 0.5, 50000) +TEST_INTERVAL (SV_NAME_D1 (acos), 0.5, 1.0, 50000) +TEST_INTERVAL (SV_NAME_D1 (acos), 1.0, 0x1p11, 50000) +TEST_INTERVAL (SV_NAME_D1 (acos), 0x1p11, inf, 20000) +TEST_INTERVAL (SV_NAME_D1 (acos), -0, -inf, 20000) +CLOSE_SVE_ATTR diff --git a/pl/math/sv_acosf_1u4.c b/math/aarch64/sve/acosf.c similarity index 83% rename from pl/math/sv_acosf_1u4.c rename to math/aarch64/sve/acosf.c index 7ac59ceedfbdb0..86b7822cefc3dc 100644 --- a/pl/math/sv_acosf_1u4.c +++ b/math/aarch64/sve/acosf.c @@ -1,14 +1,14 @@ /* * Single-precision SVE acos(x) function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "sv_math.h" -#include "poly_sve_f32.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "sv_poly_f32.h" +#include "test_sig.h" +#include "test_defs.h" static const struct data { @@ -75,10 +75,12 @@ svfloat32_t SV_NAME_F1 (acos) (svfloat32_t x, const svbool_t pg) return svmla_x (pg, add, mul, y); } -PL_SIG (SV, F, 1, acos, -1.0, 1.0) -PL_TEST_ULP (SV_NAME_F1 (acos), 0.82) -PL_TEST_INTERVAL (SV_NAME_F1 (acos), 0, 0.5, 50000) -PL_TEST_INTERVAL (SV_NAME_F1 (acos), 0.5, 1.0, 50000) -PL_TEST_INTERVAL (SV_NAME_F1 (acos), 1.0, 0x1p11, 50000) -PL_TEST_INTERVAL (SV_NAME_F1 (acos), 0x1p11, inf, 20000) -PL_TEST_INTERVAL (SV_NAME_F1 (acos), -0, -inf, 20000) +TEST_SIG (SV, F, 1, acos, -1.0, 1.0) +TEST_ULP (SV_NAME_F1 (acos), 0.82) +TEST_DISABLE_FENV (SV_NAME_F1 (acos)) +TEST_INTERVAL (SV_NAME_F1 (acos), 0, 0.5, 50000) +TEST_INTERVAL (SV_NAME_F1 (acos), 0.5, 1.0, 50000) +TEST_INTERVAL (SV_NAME_F1 (acos), 1.0, 0x1p11, 50000) +TEST_INTERVAL (SV_NAME_F1 (acos), 0x1p11, inf, 20000) +TEST_INTERVAL (SV_NAME_F1 (acos), -0, -inf, 20000) +CLOSE_SVE_ATTR diff --git a/math/aarch64/sve/acosh.c b/math/aarch64/sve/acosh.c new file mode 100644 index 00000000000000..d54c21922e1b18 --- /dev/null +++ b/math/aarch64/sve/acosh.c @@ -0,0 +1,51 @@ +/* + * Double-precision SVE acosh(x) function. + * Copyright (c) 2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "test_sig.h" +#include "test_defs.h" + +#define WANT_SV_LOG1P_K0_SHORTCUT 1 +#include "sv_log1p_inline.h" + +#define One (0x3ff0000000000000) +#define Thres (0x1ff0000000000000) /* asuint64 (0x1p511) - One. */ + +static svfloat64_t NOINLINE +special_case (svfloat64_t x, svfloat64_t y, svbool_t special) +{ + return sv_call_f64 (acosh, x, y, special); +} + +/* SVE approximation for double-precision acosh, based on log1p. 
+ The largest observed error is 3.19 ULP in the region where the + argument to log1p falls in the k=0 interval, i.e. x close to 1: + SV_NAME_D1 (acosh)(0x1.1e4388d4ca821p+0) got 0x1.ed23399f5137p-2 + want 0x1.ed23399f51373p-2. */ +svfloat64_t SV_NAME_D1 (acosh) (svfloat64_t x, const svbool_t pg) +{ + /* (ix - One) >= (BigBound - One). */ + svuint64_t ix = svreinterpret_u64 (x); + svbool_t special = svcmpge (pg, svsub_x (pg, ix, One), Thres); + + svfloat64_t xm1 = svsub_x (pg, x, 1.0); + svfloat64_t u = svmul_x (pg, xm1, svadd_x (pg, x, 1.0)); + svfloat64_t y = svadd_x (pg, xm1, svsqrt_x (pg, u)); + + /* Fall back to scalar routine for special lanes. */ + if (unlikely (svptest_any (pg, special))) + return special_case (x, sv_log1p_inline (y, pg), special); + return sv_log1p_inline (y, pg); +} + +TEST_SIG (SV, D, 1, acosh, 1.0, 10.0) +TEST_ULP (SV_NAME_D1 (acosh), 2.69) +TEST_DISABLE_FENV (SV_NAME_D1 (acosh)) +TEST_INTERVAL (SV_NAME_D1 (acosh), 1, 0x1p511, 90000) +TEST_INTERVAL (SV_NAME_D1 (acosh), 0x1p511, inf, 10000) +TEST_INTERVAL (SV_NAME_D1 (acosh), 0, 1, 1000) +TEST_INTERVAL (SV_NAME_D1 (acosh), -0, -inf, 10000) +CLOSE_SVE_ATTR diff --git a/math/aarch64/sve/acoshf.c b/math/aarch64/sve/acoshf.c new file mode 100644 index 00000000000000..f48ef724e8ebf0 --- /dev/null +++ b/math/aarch64/sve/acoshf.c @@ -0,0 +1,51 @@ +/* + * Single-precision SVE acosh(x) function. + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "test_sig.h" +#include "test_defs.h" + +#define One 0x3f800000 +#define Thres 0x20000000 /* asuint(0x1p64) - One. */ + +#include "sv_log1pf_inline.h" + +static svfloat32_t NOINLINE +special_case (svfloat32_t xm1, svfloat32_t tmp, svbool_t special) +{ + svfloat32_t x = svadd_x (svptrue_b32 (), xm1, 1.0f); + svfloat32_t y = sv_log1pf_inline (tmp, svptrue_b32 ()); + return sv_call_f32 (acoshf, x, y, special); +} + +/* Single-precision SVE acosh(x) routine. 
Implements the same algorithm as + vector acoshf and log1p. + + Maximum error is 2.47 ULPs: + SV_NAME_F1 (acosh) (0x1.01ca76p+0) got 0x1.e435a6p-4 + want 0x1.e435a2p-4. */ +svfloat32_t SV_NAME_F1 (acosh) (svfloat32_t x, const svbool_t pg) +{ + svuint32_t ix = svreinterpret_u32 (x); + svbool_t special = svcmpge (pg, svsub_x (pg, ix, One), Thres); + + svfloat32_t xm1 = svsub_x (pg, x, 1.0f); + svfloat32_t u = svmul_x (pg, xm1, svadd_x (pg, x, 1.0f)); + svfloat32_t tmp = svadd_x (pg, xm1, svsqrt_x (pg, u)); + + if (unlikely (svptest_any (pg, special))) + return special_case (xm1, tmp, special); + return sv_log1pf_inline (tmp, pg); +} + +TEST_SIG (SV, F, 1, acosh, 1.0, 10.0) +TEST_ULP (SV_NAME_F1 (acosh), 1.97) +TEST_DISABLE_FENV (SV_NAME_F1 (acosh)) +TEST_INTERVAL (SV_NAME_F1 (acosh), 0, 1, 500) +TEST_INTERVAL (SV_NAME_F1 (acosh), 1, 0x1p64, 100000) +TEST_INTERVAL (SV_NAME_F1 (acosh), 0x1p64, inf, 1000) +TEST_INTERVAL (SV_NAME_F1 (acosh), -0, -inf, 1000) +CLOSE_SVE_ATTR diff --git a/pl/math/sv_asin_3u.c b/math/aarch64/sve/asin.c similarity index 80% rename from pl/math/sv_asin_3u.c rename to math/aarch64/sve/asin.c index c3dd37b145ae76..cac629afae1555 100644 --- a/pl/math/sv_asin_3u.c +++ b/math/aarch64/sve/asin.c @@ -1,14 +1,14 @@ /* * Double-precision SVE asin(x) function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "sv_math.h" -#include "poly_sve_f64.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "sv_poly_f64.h" +#include "test_sig.h" +#include "test_defs.h" static const struct data { @@ -42,8 +42,8 @@ static const struct data asin(x) = pi/2 - (y + y * z * P(z)), with z = (1-x)/2 and y = sqrt(z). The largest observed error in this region is 2.69 ulps, - _ZGVsMxv_asin(0x1.044ac9819f573p-1) got 0x1.110d7e85fdd5p-1 - want 0x1.110d7e85fdd53p-1. */ + _ZGVsMxv_asin (0x1.044e8cefee301p-1) got 0x1.1111dd54ddf96p-1 + want 0x1.1111dd54ddf99p-1. 
*/ svfloat64_t SV_NAME_D1 (asin) (svfloat64_t x, const svbool_t pg) { const struct data *d = ptr_barrier (&data); @@ -75,10 +75,12 @@ svfloat64_t SV_NAME_D1 (asin) (svfloat64_t x, const svbool_t pg) return svreinterpret_f64 (svorr_x (pg, svreinterpret_u64 (y), sign)); } -PL_SIG (SV, D, 1, asin, -1.0, 1.0) -PL_TEST_ULP (SV_NAME_D1 (asin), 2.19) -PL_TEST_INTERVAL (SV_NAME_D1 (asin), 0, 0.5, 50000) -PL_TEST_INTERVAL (SV_NAME_D1 (asin), 0.5, 1.0, 50000) -PL_TEST_INTERVAL (SV_NAME_D1 (asin), 1.0, 0x1p11, 50000) -PL_TEST_INTERVAL (SV_NAME_D1 (asin), 0x1p11, inf, 20000) -PL_TEST_INTERVAL (SV_NAME_D1 (asin), -0, -inf, 20000) +TEST_SIG (SV, D, 1, asin, -1.0, 1.0) +TEST_ULP (SV_NAME_D1 (asin), 2.20) +TEST_DISABLE_FENV (SV_NAME_D1 (asin)) +TEST_INTERVAL (SV_NAME_D1 (asin), 0, 0.5, 50000) +TEST_INTERVAL (SV_NAME_D1 (asin), 0.5, 1.0, 50000) +TEST_INTERVAL (SV_NAME_D1 (asin), 1.0, 0x1p11, 50000) +TEST_INTERVAL (SV_NAME_D1 (asin), 0x1p11, inf, 20000) +TEST_INTERVAL (SV_NAME_D1 (asin), -0, -inf, 20000) +CLOSE_SVE_ATTR diff --git a/pl/math/sv_asinf_2u5.c b/math/aarch64/sve/asinf.c similarity index 81% rename from pl/math/sv_asinf_2u5.c rename to math/aarch64/sve/asinf.c index 8e9edc2439f5d4..fe94feba7a425b 100644 --- a/pl/math/sv_asinf_2u5.c +++ b/math/aarch64/sve/asinf.c @@ -1,14 +1,14 @@ /* * Single-precision SVE asin(x) function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "sv_math.h" -#include "poly_sve_f32.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "sv_poly_f32.h" +#include "test_sig.h" +#include "test_defs.h" static const struct data { @@ -67,10 +67,12 @@ svfloat32_t SV_NAME_F1 (asin) (svfloat32_t x, const svbool_t pg) return svreinterpret_f32 (svorr_x (pg, svreinterpret_u32 (y), sign)); } -PL_SIG (SV, F, 1, asin, -1.0, 1.0) -PL_TEST_ULP (SV_NAME_F1 (asin), 1.91) -PL_TEST_INTERVAL (SV_NAME_F1 (asin), 0, 0.5, 50000) -PL_TEST_INTERVAL (SV_NAME_F1 (asin), 0.5, 1.0, 50000) -PL_TEST_INTERVAL (SV_NAME_F1 (asin), 1.0, 0x1p11, 50000) -PL_TEST_INTERVAL (SV_NAME_F1 (asin), 0x1p11, inf, 20000) -PL_TEST_INTERVAL (SV_NAME_F1 (asin), -0, -inf, 20000) \ No newline at end of file +TEST_SIG (SV, F, 1, asin, -1.0, 1.0) +TEST_ULP (SV_NAME_F1 (asin), 1.91) +TEST_DISABLE_FENV (SV_NAME_F1 (asin)) +TEST_INTERVAL (SV_NAME_F1 (asin), 0, 0.5, 50000) +TEST_INTERVAL (SV_NAME_F1 (asin), 0.5, 1.0, 50000) +TEST_INTERVAL (SV_NAME_F1 (asin), 1.0, 0x1p11, 50000) +TEST_INTERVAL (SV_NAME_F1 (asin), 0x1p11, inf, 20000) +TEST_INTERVAL (SV_NAME_F1 (asin), -0, -inf, 20000) +CLOSE_SVE_ATTR diff --git a/math/aarch64/sve/asinh.c b/math/aarch64/sve/asinh.c new file mode 100644 index 00000000000000..5574116de1e12a --- /dev/null +++ b/math/aarch64/sve/asinh.c @@ -0,0 +1,197 @@ +/* + * Double-precision SVE asinh(x) function. + * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "test_sig.h" +#include "test_defs.h" + +#define SignMask (0x8000000000000000) +#define One (0x3ff0000000000000) +#define Thres (0x5fe0000000000000) /* asuint64 (0x1p511). 
*/ +#define IndexMask (((1 << V_LOG_TABLE_BITS) - 1) << 1) + +static const struct data +{ + double even_coeffs[9]; + double ln2, p3, p1, p4, p0, p2, c1, c3, c5, c7, c9, c11, c13, c15, c17; + uint64_t off, mask; + +} data = { + /* Polynomial generated using Remez on [2^-26, 1]. */ + .even_coeffs ={ + -0x1.55555555554a7p-3, + -0x1.6db6db68332e6p-5, + -0x1.6e8b8b654a621p-6, + -0x1.c9871d10885afp-7, + -0x1.3ddca533e9f54p-7, + -0x1.b90c7099dd397p-8, + -0x1.d217026a669ecp-9, + -0x1.e0f37daef9127p-11, + -0x1.021a48685e287p-14, }, + + .c1 = 0x1.3333333326c7p-4, + .c3 = 0x1.f1c71b26fb40dp-6, + .c5 = 0x1.1c4daa9e67871p-6, + .c7 = 0x1.7a16e8d9d2ecfp-7, + .c9 = 0x1.0becef748dafcp-7, + .c11 = 0x1.541f2bb1ffe51p-8, + .c13 = 0x1.0b5c7977aaf7p-9, + .c15 = 0x1.388b5fe542a6p-12, + .c17 = 0x1.93d4ba83d34dap-18, + + .ln2 = 0x1.62e42fefa39efp-1, + .p0 = -0x1.ffffffffffff7p-2, + .p1 = 0x1.55555555170d4p-2, + .p2 = -0x1.0000000399c27p-2, + .p3 = 0x1.999b2e90e94cap-3, + .p4 = -0x1.554e550bd501ep-3, + .off = 0x3fe6900900000000, + .mask = 0xfffULL << 52, +}; + +static svfloat64_t NOINLINE +special_case (svfloat64_t x, svfloat64_t y, svbool_t special) +{ + return sv_call_f64 (asinh, x, y, special); +} + +static inline svfloat64_t +__sv_log_inline (svfloat64_t x, const struct data *d, const svbool_t pg) +{ + /* Double-precision SVE log, copied from SVE log implementation with some + cosmetic modification and special-cases removed. See that file for details + of the algorithm used. 
*/ + + svuint64_t ix = svreinterpret_u64 (x); + svuint64_t i_off = svsub_x (pg, ix, d->off); + svuint64_t i + = svand_x (pg, svlsr_x (pg, i_off, (51 - V_LOG_TABLE_BITS)), IndexMask); + svuint64_t iz = svsub_x (pg, ix, svand_x (pg, i_off, d->mask)); + svfloat64_t z = svreinterpret_f64 (iz); + + svfloat64_t invc = svld1_gather_index (pg, &__v_log_data.table[0].invc, i); + svfloat64_t logc = svld1_gather_index (pg, &__v_log_data.table[0].logc, i); + + svfloat64_t ln2_p3 = svld1rq (svptrue_b64 (), &d->ln2); + svfloat64_t p1_p4 = svld1rq (svptrue_b64 (), &d->p1); + + svfloat64_t r = svmla_x (pg, sv_f64 (-1.0), invc, z); + svfloat64_t kd + = svcvt_f64_x (pg, svasr_x (pg, svreinterpret_s64 (i_off), 52)); + + svfloat64_t hi = svmla_lane (svadd_x (pg, logc, r), kd, ln2_p3, 0); + svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); + svfloat64_t y = svmla_lane (sv_f64 (d->p2), r, ln2_p3, 1); + svfloat64_t p = svmla_lane (sv_f64 (d->p0), r, p1_p4, 0); + + y = svmla_lane (y, r2, p1_p4, 1); + y = svmla_x (pg, p, r2, y); + y = svmla_x (pg, hi, r2, y); + return y; +} + +/* Double-precision implementation of SVE asinh(x). + asinh is very sensitive around 1, so it is impractical to devise a single + low-cost algorithm which is sufficiently accurate on a wide range of input. + Instead we use two different algorithms: + asinh(x) = sign(x) * log(|x| + sqrt(x^2 + 1) if |x| >= 1 + = sign(x) * (|x| + |x|^3 * P(x^2)) otherwise + where log(x) is an optimized log approximation, and P(x) is a polynomial + shared with the scalar routine. The greatest observed error 2.51 ULP, in + |x| >= 1: + _ZGVsMxv_asinh(0x1.170469d024505p+0) got 0x1.e3181c43b0f36p-1 + want 0x1.e3181c43b0f39p-1. 
*/ +svfloat64_t SV_NAME_D1 (asinh) (svfloat64_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + svuint64_t ix = svreinterpret_u64 (x); + svuint64_t iax = svbic_x (pg, ix, SignMask); + svuint64_t sign = svand_x (pg, ix, SignMask); + svfloat64_t ax = svreinterpret_f64 (iax); + svbool_t ge1 = svcmpge (pg, iax, One); + svbool_t special = svcmpge (pg, iax, Thres); + + /* Option 1: |x| >= 1. + Compute asinh(x) according by asinh(x) = log(x + sqrt(x^2 + 1)). */ + svfloat64_t option_1 = sv_f64 (0); + if (likely (svptest_any (pg, ge1))) + { + svfloat64_t x2 = svmul_x (svptrue_b64 (), ax, ax); + option_1 = __sv_log_inline ( + svadd_x (pg, ax, svsqrt_x (pg, svadd_x (pg, x2, 1))), d, pg); + } + + /* Option 2: |x| < 1. + Compute asinh(x) using a polynomial. + The largest observed error in this region is 1.51 ULPs: + _ZGVsMxv_asinh(0x1.fe12bf8c616a2p-1) got 0x1.c1e649ee2681bp-1 + want 0x1.c1e649ee2681dp-1. */ + + svfloat64_t option_2 = sv_f64 (0); + if (likely (svptest_any (pg, svnot_z (pg, ge1)))) + { + svfloat64_t x2 = svmul_x (svptrue_b64 (), ax, ax); + svfloat64_t x4 = svmul_x (svptrue_b64 (), x2, x2); + /* Order-17 Pairwise Horner scheme. 
*/ + svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1); + svfloat64_t c57 = svld1rq (svptrue_b64 (), &d->c5); + svfloat64_t c911 = svld1rq (svptrue_b64 (), &d->c9); + svfloat64_t c1315 = svld1rq (svptrue_b64 (), &d->c13); + + svfloat64_t p01 = svmla_lane (sv_f64 (d->even_coeffs[0]), x2, c13, 0); + svfloat64_t p23 = svmla_lane (sv_f64 (d->even_coeffs[1]), x2, c13, 1); + svfloat64_t p45 = svmla_lane (sv_f64 (d->even_coeffs[2]), x2, c57, 0); + svfloat64_t p67 = svmla_lane (sv_f64 (d->even_coeffs[3]), x2, c57, 1); + svfloat64_t p89 = svmla_lane (sv_f64 (d->even_coeffs[4]), x2, c911, 0); + svfloat64_t p1011 = svmla_lane (sv_f64 (d->even_coeffs[5]), x2, c911, 1); + svfloat64_t p1213 + = svmla_lane (sv_f64 (d->even_coeffs[6]), x2, c1315, 0); + svfloat64_t p1415 + = svmla_lane (sv_f64 (d->even_coeffs[7]), x2, c1315, 1); + svfloat64_t p1617 = svmla_x (pg, sv_f64 (d->even_coeffs[8]), x2, d->c17); + + svfloat64_t p = svmla_x (pg, p1415, x4, p1617); + p = svmla_x (pg, p1213, x4, p); + p = svmla_x (pg, p1011, x4, p); + p = svmla_x (pg, p89, x4, p); + + p = svmla_x (pg, p67, x4, p); + p = svmla_x (pg, p45, x4, p); + + p = svmla_x (pg, p23, x4, p); + + p = svmla_x (pg, p01, x4, p); + + option_2 = svmla_x (pg, ax, p, svmul_x (svptrue_b64 (), x2, ax)); + } + + if (unlikely (svptest_any (pg, special))) + return special_case ( + x, + svreinterpret_f64 (sveor_x ( + pg, svreinterpret_u64 (svsel (ge1, option_1, option_2)), sign)), + special); + + /* Choose the right option for each lane. 
*/ + svfloat64_t y = svsel (ge1, option_1, option_2); + return svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (y), sign)); +} + +TEST_SIG (SV, D, 1, asinh, -10.0, 10.0) +TEST_ULP (SV_NAME_D1 (asinh), 2.52) +TEST_DISABLE_FENV (SV_NAME_D1 (asinh)) +TEST_SYM_INTERVAL (SV_NAME_D1 (asinh), 0, 0x1p-26, 50000) +TEST_SYM_INTERVAL (SV_NAME_D1 (asinh), 0x1p-26, 1, 50000) +TEST_SYM_INTERVAL (SV_NAME_D1 (asinh), 1, 0x1p511, 50000) +TEST_SYM_INTERVAL (SV_NAME_D1 (asinh), 0x1p511, inf, 40000) +/* Test vector asinh 3 times, with control lane < 1, > 1 and special. + Ensures the v_sel is choosing the right option in all cases. */ +TEST_CONTROL_VALUE (SV_NAME_D1 (asinh), 0.5) +TEST_CONTROL_VALUE (SV_NAME_D1 (asinh), 2) +TEST_CONTROL_VALUE (SV_NAME_D1 (asinh), 0x1p600) +CLOSE_SVE_ATTR diff --git a/pl/math/sv_asinhf_2u5.c b/math/aarch64/sve/asinhf.c similarity index 53% rename from pl/math/sv_asinhf_2u5.c rename to math/aarch64/sve/asinhf.c index 1f1f6e5c846f68..32aedbfd3a6d35 100644 --- a/pl/math/sv_asinhf_2u5.c +++ b/math/aarch64/sve/asinhf.c @@ -1,31 +1,33 @@ /* * Single-precision SVE asinh(x) function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "sv_math.h" -#include "include/mathlib.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" #include "sv_log1pf_inline.h" -#define BigBound (0x5f800000) /* asuint(0x1p64). */ +#define BigBound 0x5f800000 /* asuint(0x1p64). */ static svfloat32_t NOINLINE -special_case (svfloat32_t x, svfloat32_t y, svbool_t special) +special_case (svuint32_t iax, svuint32_t sign, svfloat32_t y, svbool_t special) { + svfloat32_t x = svreinterpret_f32 (sveor_x (svptrue_b32 (), iax, sign)); + y = svreinterpret_f32 ( + svorr_x (svptrue_b32 (), sign, svreinterpret_u32 (y))); return sv_call_f32 (asinhf, x, y, special); } /* Single-precision SVE asinh(x) routine. 
Implements the same algorithm as vector asinhf and log1p. - Maximum error is 2.48 ULPs: - SV_NAME_F1 (asinh) (0x1.008864p-3) got 0x1.ffbbbcp-4 - want 0x1.ffbbb8p-4. */ + Maximum error is 1.92 ULPs: + SV_NAME_F1 (asinh) (-0x1.0922ecp-1) got -0x1.fd0bccp-2 + want -0x1.fd0bc8p-2. */ svfloat32_t SV_NAME_F1 (asinh) (svfloat32_t x, const svbool_t pg) { svfloat32_t ax = svabs_x (pg, x); @@ -41,15 +43,15 @@ svfloat32_t SV_NAME_F1 (asinh) (svfloat32_t x, const svbool_t pg) = sv_log1pf_inline (svadd_x (pg, ax, svdiv_x (pg, ax2, d)), pg); if (unlikely (svptest_any (pg, special))) - return special_case ( - x, svreinterpret_f32 (svorr_x (pg, sign, svreinterpret_u32 (y))), - special); + return special_case (iax, sign, y, special); return svreinterpret_f32 (svorr_x (pg, sign, svreinterpret_u32 (y))); } -PL_SIG (SV, F, 1, asinh, -10.0, 10.0) -PL_TEST_ULP (SV_NAME_F1 (asinh), 1.98) -PL_TEST_SYM_INTERVAL (SV_NAME_F1 (asinh), 0, 0x1p-12, 4000) -PL_TEST_SYM_INTERVAL (SV_NAME_F1 (asinh), 0x1p-12, 1.0, 20000) -PL_TEST_SYM_INTERVAL (SV_NAME_F1 (asinh), 1.0, 0x1p64, 20000) -PL_TEST_SYM_INTERVAL (SV_NAME_F1 (asinh), 0x1p64, inf, 4000) +TEST_SIG (SV, F, 1, asinh, -10.0, 10.0) +TEST_ULP (SV_NAME_F1 (asinh), 1.43) +TEST_DISABLE_FENV (SV_NAME_F1 (asinh)) +TEST_SYM_INTERVAL (SV_NAME_F1 (asinh), 0, 0x1p-12, 4000) +TEST_SYM_INTERVAL (SV_NAME_F1 (asinh), 0x1p-12, 1.0, 20000) +TEST_SYM_INTERVAL (SV_NAME_F1 (asinh), 1.0, 0x1p64, 20000) +TEST_SYM_INTERVAL (SV_NAME_F1 (asinh), 0x1p64, inf, 4000) +CLOSE_SVE_ATTR diff --git a/pl/math/sv_atan_2u5.c b/math/aarch64/sve/atan.c similarity index 86% rename from pl/math/sv_atan_2u5.c rename to math/aarch64/sve/atan.c index 7ab486a4c9d2c4..73fc29a94f23f6 100644 --- a/pl/math/sv_atan_2u5.c +++ b/math/aarch64/sve/atan.c @@ -1,14 +1,14 @@ /* * Double-precision vector atan(x) function. * - * Copyright (c) 2021-2023, Arm Limited. + * Copyright (c) 2021-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "sv_math.h" -#include "pl_sig.h" -#include "pl_test.h" -#include "poly_sve_f64.h" +#include "test_sig.h" +#include "test_defs.h" +#include "sv_poly_f64.h" static const struct data { @@ -79,9 +79,11 @@ svfloat64_t SV_NAME_D1 (atan) (svfloat64_t x, const svbool_t pg) return y; } -PL_SIG (SV, D, 1, atan, -3.1, 3.1) -PL_TEST_ULP (SV_NAME_D1 (atan), 1.78) -PL_TEST_INTERVAL (SV_NAME_D1 (atan), 0.0, 1.0, 40000) -PL_TEST_INTERVAL (SV_NAME_D1 (atan), 1.0, 100.0, 40000) -PL_TEST_INTERVAL (SV_NAME_D1 (atan), 100, inf, 40000) -PL_TEST_INTERVAL (SV_NAME_D1 (atan), -0, -inf, 40000) +TEST_SIG (SV, D, 1, atan, -3.1, 3.1) +TEST_ULP (SV_NAME_D1 (atan), 1.78) +TEST_DISABLE_FENV (SV_NAME_D1 (atan)) +TEST_INTERVAL (SV_NAME_D1 (atan), 0.0, 1.0, 40000) +TEST_INTERVAL (SV_NAME_D1 (atan), 1.0, 100.0, 40000) +TEST_INTERVAL (SV_NAME_D1 (atan), 100, inf, 40000) +TEST_INTERVAL (SV_NAME_D1 (atan), -0, -inf, 40000) +CLOSE_SVE_ATTR diff --git a/pl/math/sv_atan2_2u5.c b/math/aarch64/sve/atan2.c similarity index 74% rename from pl/math/sv_atan2_2u5.c rename to math/aarch64/sve/atan2.c index 00530a324a76fb..1e1d00678b1d91 100644 --- a/pl/math/sv_atan2_2u5.c +++ b/math/aarch64/sve/atan2.c @@ -1,14 +1,14 @@ /* * Double-precision vector atan2(x) function. * - * Copyright (c) 2021-2023, Arm Limited. + * Copyright (c) 2021-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "sv_math.h" -#include "pl_sig.h" -#include "pl_test.h" -#include "poly_sve_f64.h" +#include "test_sig.h" +#include "test_defs.h" +#include "sv_poly_f64.h" static const struct data { @@ -27,9 +27,6 @@ static const struct data .pi_over_2 = 0x1.921fb54442d18p+0, }; -/* Useful constants. */ -#define SignMask sv_u64 (0x8000000000000000) - /* Special cases i.e. 0, infinity, nan (fall back to scalar calls). 
*/ static svfloat64_t NOINLINE special_case (svfloat64_t y, svfloat64_t x, svfloat64_t ret, @@ -51,7 +48,8 @@ zeroinfnan (svuint64_t i, const svbool_t pg) x are reasonably close together. The greatest observed error is 2.28 ULP: _ZGVsMxvv_atan2 (-0x1.5915b1498e82fp+732, 0x1.54d11ef838826p+732) got -0x1.954f42f1fa841p-1 want -0x1.954f42f1fa843p-1. */ -svfloat64_t SV_NAME_D2 (atan2) (svfloat64_t y, svfloat64_t x, const svbool_t pg) +svfloat64_t SV_NAME_D2 (atan2) (svfloat64_t y, svfloat64_t x, + const svbool_t pg) { const struct data *data_ptr = ptr_barrier (&data); @@ -62,14 +60,15 @@ svfloat64_t SV_NAME_D2 (atan2) (svfloat64_t y, svfloat64_t x, const svbool_t pg) svbool_t cmp_y = zeroinfnan (iy, pg); svbool_t cmp_xy = svorr_z (pg, cmp_x, cmp_y); - svuint64_t sign_x = svand_x (pg, ix, SignMask); - svuint64_t sign_y = svand_x (pg, iy, SignMask); - svuint64_t sign_xy = sveor_x (pg, sign_x, sign_y); - svfloat64_t ax = svabs_x (pg, x); svfloat64_t ay = svabs_x (pg, y); + svuint64_t iax = svreinterpret_u64 (ax); + svuint64_t iay = svreinterpret_u64 (ay); + + svuint64_t sign_x = sveor_x (pg, ix, iax); + svuint64_t sign_y = sveor_x (pg, iy, iay); + svuint64_t sign_xy = sveor_x (pg, sign_x, sign_y); - svbool_t pred_xlt0 = svcmplt (pg, x, 0.0); svbool_t pred_aygtax = svcmpgt (pg, ay, ax); /* Set up z for call to atan. */ @@ -78,8 +77,9 @@ svfloat64_t SV_NAME_D2 (atan2) (svfloat64_t y, svfloat64_t x, const svbool_t pg) svfloat64_t z = svdiv_x (pg, n, d); /* Work out the correct shift. */ - svfloat64_t shift = svsel (pred_xlt0, sv_f64 (-2.0), sv_f64 (0.0)); - shift = svsel (pred_aygtax, svadd_x (pg, shift, 1.0), shift); + svfloat64_t shift = svreinterpret_f64 (svlsr_x (pg, sign_x, 1)); + shift = svsel (pred_aygtax, sv_f64 (1.0), shift); + shift = svreinterpret_f64 (svorr_x (pg, sign_x, svreinterpret_u64 (shift))); shift = svmul_x (pg, shift, data_ptr->pi_over_2); /* Use split Estrin scheme for P(z^2) with deg(P)=19. 
*/ @@ -99,18 +99,20 @@ svfloat64_t SV_NAME_D2 (atan2) (svfloat64_t y, svfloat64_t x, const svbool_t pg) ret = svadd_m (pg, ret, shift); /* Account for the sign of x and y. */ - ret = svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (ret), sign_xy)); - if (unlikely (svptest_any (pg, cmp_xy))) - return special_case (y, x, ret, cmp_xy); - - return ret; + return special_case ( + y, x, + svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (ret), sign_xy)), + cmp_xy); + return svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (ret), sign_xy)); } /* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. */ -PL_SIG (SV, D, 2, atan2) -PL_TEST_ULP (SV_NAME_D2 (atan2), 1.78) -PL_TEST_INTERVAL (SV_NAME_D2 (atan2), 0.0, 1.0, 40000) -PL_TEST_INTERVAL (SV_NAME_D2 (atan2), 1.0, 100.0, 40000) -PL_TEST_INTERVAL (SV_NAME_D2 (atan2), 100, inf, 40000) -PL_TEST_INTERVAL (SV_NAME_D2 (atan2), -0, -inf, 40000) +TEST_SIG (SV, D, 2, atan2) +TEST_ULP (SV_NAME_D2 (atan2), 1.78) +TEST_DISABLE_FENV (SV_NAME_D2 (atan2)) +TEST_INTERVAL (SV_NAME_D2 (atan2), 0.0, 1.0, 40000) +TEST_INTERVAL (SV_NAME_D2 (atan2), 1.0, 100.0, 40000) +TEST_INTERVAL (SV_NAME_D2 (atan2), 100, inf, 40000) +TEST_INTERVAL (SV_NAME_D2 (atan2), -0, -inf, 40000) +CLOSE_SVE_ATTR diff --git a/pl/math/sv_atan2f_3u.c b/math/aarch64/sve/atan2f.c similarity index 68% rename from pl/math/sv_atan2f_3u.c rename to math/aarch64/sve/atan2f.c index 9ff73ecb74ba20..563b708cfcbb1f 100644 --- a/pl/math/sv_atan2f_3u.c +++ b/math/aarch64/sve/atan2f.c @@ -1,14 +1,14 @@ /* * Single-precision vector atan2f(x) function. * - * Copyright (c) 2021-2023, Arm Limited. + * Copyright (c) 2021-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "sv_math.h" -#include "pl_sig.h" -#include "pl_test.h" -#include "poly_sve_f32.h" +#include "test_sig.h" +#include "test_defs.h" +#include "sv_poly_f32.h" static const struct data { @@ -22,10 +22,8 @@ static const struct data .pi_over_2 = 0x1.921fb6p+0f, }; -#define SignMask sv_u32 (0x80000000) - /* Special cases i.e. 0, infinity, nan (fall back to scalar calls). */ -static inline svfloat32_t +static svfloat32_t NOINLINE special_case (svfloat32_t y, svfloat32_t x, svfloat32_t ret, const svbool_t cmp) { @@ -46,7 +44,8 @@ zeroinfnan (svuint32_t i, const svbool_t pg) observed error is 2.95 ULP: _ZGVsMxvv_atan2f (0x1.93836cp+6, 0x1.8cae1p+6) got 0x1.967f06p-1 want 0x1.967f00p-1. */ -svfloat32_t SV_NAME_F2 (atan2) (svfloat32_t y, svfloat32_t x, const svbool_t pg) +svfloat32_t SV_NAME_F2 (atan2) (svfloat32_t y, svfloat32_t x, + const svbool_t pg) { const struct data *data_ptr = ptr_barrier (&data); @@ -57,14 +56,15 @@ svfloat32_t SV_NAME_F2 (atan2) (svfloat32_t y, svfloat32_t x, const svbool_t pg) svbool_t cmp_y = zeroinfnan (iy, pg); svbool_t cmp_xy = svorr_z (pg, cmp_x, cmp_y); - svuint32_t sign_x = svand_x (pg, ix, SignMask); - svuint32_t sign_y = svand_x (pg, iy, SignMask); - svuint32_t sign_xy = sveor_x (pg, sign_x, sign_y); - svfloat32_t ax = svabs_x (pg, x); svfloat32_t ay = svabs_x (pg, y); + svuint32_t iax = svreinterpret_u32 (ax); + svuint32_t iay = svreinterpret_u32 (ay); + + svuint32_t sign_x = sveor_x (pg, ix, iax); + svuint32_t sign_y = sveor_x (pg, iy, iay); + svuint32_t sign_xy = sveor_x (pg, sign_x, sign_y); - svbool_t pred_xlt0 = svcmplt (pg, x, 0.0); svbool_t pred_aygtax = svcmpgt (pg, ay, ax); /* Set up z for call to atan. */ @@ -73,11 +73,12 @@ svfloat32_t SV_NAME_F2 (atan2) (svfloat32_t y, svfloat32_t x, const svbool_t pg) svfloat32_t z = svdiv_x (pg, n, d); /* Work out the correct shift. 
*/ - svfloat32_t shift = svsel (pred_xlt0, sv_f32 (-2.0), sv_f32 (0.0)); - shift = svsel (pred_aygtax, svadd_x (pg, shift, 1.0), shift); + svfloat32_t shift = svreinterpret_f32 (svlsr_x (pg, sign_x, 1)); + shift = svsel (pred_aygtax, sv_f32 (1.0), shift); + shift = svreinterpret_f32 (svorr_x (pg, sign_x, svreinterpret_u32 (shift))); shift = svmul_x (pg, shift, sv_f32 (data_ptr->pi_over_2)); - /* Use split Estrin scheme for P(z^2) with deg(P)=7. */ + /* Use pure Estrin scheme for P(z^2) with deg(P)=7. */ svfloat32_t z2 = svmul_x (pg, z, z); svfloat32_t z4 = svmul_x (pg, z2, z2); svfloat32_t z8 = svmul_x (pg, z4, z4); @@ -91,18 +92,22 @@ svfloat32_t SV_NAME_F2 (atan2) (svfloat32_t y, svfloat32_t x, const svbool_t pg) ret = svadd_m (pg, ret, shift); /* Account for the sign of x and y. */ - ret = svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (ret), sign_xy)); if (unlikely (svptest_any (pg, cmp_xy))) - return special_case (y, x, ret, cmp_xy); + return special_case ( + y, x, + svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (ret), sign_xy)), + cmp_xy); - return ret; + return svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (ret), sign_xy)); } /* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. 
*/ -PL_SIG (SV, F, 2, atan2) -PL_TEST_ULP (SV_NAME_F2 (atan2), 2.45) -PL_TEST_INTERVAL (SV_NAME_F2 (atan2), 0.0, 1.0, 40000) -PL_TEST_INTERVAL (SV_NAME_F2 (atan2), 1.0, 100.0, 40000) -PL_TEST_INTERVAL (SV_NAME_F2 (atan2), 100, inf, 40000) -PL_TEST_INTERVAL (SV_NAME_F2 (atan2), -0, -inf, 40000) +TEST_SIG (SV, F, 2, atan2) +TEST_ULP (SV_NAME_F2 (atan2), 2.45) +TEST_DISABLE_FENV (SV_NAME_F2 (atan2)) +TEST_INTERVAL (SV_NAME_F2 (atan2), 0.0, 1.0, 40000) +TEST_INTERVAL (SV_NAME_F2 (atan2), 1.0, 100.0, 40000) +TEST_INTERVAL (SV_NAME_F2 (atan2), 100, inf, 40000) +TEST_INTERVAL (SV_NAME_F2 (atan2), -0, -inf, 40000) +CLOSE_SVE_ATTR diff --git a/pl/math/sv_atanf_2u9.c b/math/aarch64/sve/atanf.c similarity index 83% rename from pl/math/sv_atanf_2u9.c rename to math/aarch64/sve/atanf.c index 4defb356e7f9cd..a2cd37b1274449 100644 --- a/pl/math/sv_atanf_2u9.c +++ b/math/aarch64/sve/atanf.c @@ -1,14 +1,14 @@ /* * Single-precision vector atan(x) function. * - * Copyright (c) 2021-2023, Arm Limited. + * Copyright (c) 2021-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "sv_math.h" -#include "pl_sig.h" -#include "pl_test.h" -#include "poly_sve_f32.h" +#include "test_sig.h" +#include "test_defs.h" +#include "sv_poly_f32.h" static const struct data { @@ -68,9 +68,11 @@ svfloat32_t SV_NAME_F1 (atan) (svfloat32_t x, const svbool_t pg) return svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (y), sign)); } -PL_SIG (SV, F, 1, atan, -3.1, 3.1) -PL_TEST_ULP (SV_NAME_F1 (atan), 2.9) -PL_TEST_INTERVAL (SV_NAME_F1 (atan), 0.0, 1.0, 40000) -PL_TEST_INTERVAL (SV_NAME_F1 (atan), 1.0, 100.0, 40000) -PL_TEST_INTERVAL (SV_NAME_F1 (atan), 100, inf, 40000) -PL_TEST_INTERVAL (SV_NAME_F1 (atan), -0, -inf, 40000) +TEST_SIG (SV, F, 1, atan, -3.1, 3.1) +TEST_ULP (SV_NAME_F1 (atan), 2.9) +TEST_DISABLE_FENV (SV_NAME_F1 (atan)) +TEST_INTERVAL (SV_NAME_F1 (atan), 0.0, 1.0, 40000) +TEST_INTERVAL (SV_NAME_F1 (atan), 1.0, 100.0, 40000) +TEST_INTERVAL (SV_NAME_F1 (atan), 100, inf, 40000) +TEST_INTERVAL (SV_NAME_F1 (atan), -0, -inf, 40000) +CLOSE_SVE_ATTR diff --git a/pl/math/sv_atanh_3u3.c b/math/aarch64/sve/atanh.c similarity index 72% rename from pl/math/sv_atanh_3u3.c rename to math/aarch64/sve/atanh.c index dcc9350b4962b1..b404df56fd7541 100644 --- a/pl/math/sv_atanh_3u3.c +++ b/math/aarch64/sve/atanh.c @@ -1,13 +1,13 @@ /* * Double-precision SVE atanh(x) function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "sv_math.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" #define WANT_SV_LOG1P_K0_SHORTCUT 0 #include "sv_log1p_inline.h" @@ -34,7 +34,6 @@ svfloat64_t SV_NAME_D1 (atanh) (svfloat64_t x, const svbool_t pg) svfloat64_t halfsign = svreinterpret_f64 (svorr_x (pg, sign, Half)); /* It is special if iax >= 1. 
*/ -// svbool_t special = svcmpge (pg, iax, One); svbool_t special = svacge (pg, x, 1.0); /* Computation is performed based on the following sequence of equality: @@ -50,11 +49,14 @@ svfloat64_t SV_NAME_D1 (atanh) (svfloat64_t x, const svbool_t pg) return svmul_x (pg, halfsign, y); } -PL_SIG (SV, D, 1, atanh, -1.0, 1.0) -PL_TEST_ULP (SV_NAME_D1 (atanh), 3.32) +TEST_SIG (SV, D, 1, atanh, -1.0, 1.0) +TEST_ULP (SV_NAME_D1 (atanh), 3.32) +TEST_DISABLE_FENV (SV_NAME_D1 (atanh)) +TEST_SYM_INTERVAL (SV_NAME_D1 (atanh), 0, 0x1p-23, 10000) +TEST_SYM_INTERVAL (SV_NAME_D1 (atanh), 0x1p-23, 1, 90000) +TEST_SYM_INTERVAL (SV_NAME_D1 (atanh), 1, inf, 100) /* atanh is asymptotic at 1, which is the default control value - have to set - -c 0 specially to ensure fp exceptions are triggered correctly (choice of - control lane is irrelevant if fp exceptions are disabled). */ -PL_TEST_SYM_INTERVAL_C (SV_NAME_D1 (atanh), 0, 0x1p-23, 10000, 0) -PL_TEST_SYM_INTERVAL_C (SV_NAME_D1 (atanh), 0x1p-23, 1, 90000, 0) -PL_TEST_SYM_INTERVAL_C (SV_NAME_D1 (atanh), 1, inf, 100, 0) + -c 0 specially to ensure fp exceptions are triggered correctly (choice of + control lane is irrelevant if fp exceptions are disabled). */ +TEST_CONTROL_VALUE (SV_NAME_D1 (atanh), 0) +CLOSE_SVE_ATTR diff --git a/pl/math/sv_atanhf_2u8.c b/math/aarch64/sve/atanhf.c similarity index 61% rename from pl/math/sv_atanhf_2u8.c rename to math/aarch64/sve/atanhf.c index 413c60ce05daf8..2e10a8cd22f7f1 100644 --- a/pl/math/sv_atanhf_2u8.c +++ b/math/aarch64/sve/atanhf.c @@ -1,14 +1,13 @@ /* * Single-precision vector atanh(x) function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "sv_math.h" -#include "mathlib.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" #include "sv_log1pf_inline.h" @@ -16,15 +15,18 @@ #define Half (0x3f000000) static svfloat32_t NOINLINE -special_case (svfloat32_t x, svfloat32_t y, svbool_t special) +special_case (svuint32_t iax, svuint32_t sign, svfloat32_t halfsign, + svfloat32_t y, svbool_t special) { + svfloat32_t x = svreinterpret_f32 (sveor_x (svptrue_b32 (), iax, sign)); + y = svmul_x (svptrue_b32 (), halfsign, y); return sv_call_f32 (atanhf, x, y, special); } /* Approximation for vector single-precision atanh(x) using modified log1p. - The maximum error is 2.28 ULP: - _ZGVsMxv_atanhf(0x1.ff1194p-5) got 0x1.ffbbbcp-5 - want 0x1.ffbbb6p-5. */ + The maximum error is 1.99 ULP: + _ZGVsMxv_atanhf(0x1.f1583p-5) got 0x1.f1f4fap-5 + want 0x1.f1f4f6p-5. */ svfloat32_t SV_NAME_F1 (atanh) (svfloat32_t x, const svbool_t pg) { svfloat32_t ax = svabs_x (pg, x); @@ -41,16 +43,19 @@ svfloat32_t SV_NAME_F1 (atanh) (svfloat32_t x, const svbool_t pg) y = sv_log1pf_inline (y, pg); if (unlikely (svptest_any (pg, special))) - return special_case (x, svmul_x (pg, halfsign, y), special); + return special_case (iax, sign, halfsign, y, special); return svmul_x (pg, halfsign, y); } -PL_SIG (SV, F, 1, atanh, -1.0, 1.0) -PL_TEST_ULP (SV_NAME_F1 (atanh), 2.59) +TEST_SIG (SV, F, 1, atanh, -1.0, 1.0) +TEST_ULP (SV_NAME_F1 (atanh), 1.50) +TEST_DISABLE_FENV (SV_NAME_F1 (atanh)) +TEST_SYM_INTERVAL (SV_NAME_F1 (atanh), 0, 0x1p-12, 1000) +TEST_SYM_INTERVAL (SV_NAME_F1 (atanh), 0x1p-12, 1, 20000) +TEST_SYM_INTERVAL (SV_NAME_F1 (atanh), 1, inf, 1000) /* atanh is asymptotic at 1, which is the default control value - have to set -c 0 specially to ensure fp exceptions are triggered correctly (choice of control lane is irrelevant if fp exceptions are disabled). 
*/ -PL_TEST_SYM_INTERVAL_C (SV_NAME_F1 (atanh), 0, 0x1p-12, 1000, 0) -PL_TEST_SYM_INTERVAL_C (SV_NAME_F1 (atanh), 0x1p-12, 1, 20000, 0) -PL_TEST_SYM_INTERVAL_C (SV_NAME_F1 (atanh), 1, inf, 1000, 0) +TEST_CONTROL_VALUE (SV_NAME_F1 (atanh), 0) +CLOSE_SVE_ATTR diff --git a/pl/math/sv_cbrt_2u.c b/math/aarch64/sve/cbrt.c similarity index 77% rename from pl/math/sv_cbrt_2u.c rename to math/aarch64/sve/cbrt.c index 192f1cd80d5909..3e6a972463f0a7 100644 --- a/pl/math/sv_cbrt_2u.c +++ b/math/aarch64/sve/cbrt.c @@ -1,14 +1,14 @@ /* * Double-precision SVE cbrt(x) function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "sv_math.h" -#include "pl_sig.h" -#include "pl_test.h" -#include "poly_sve_f64.h" +#include "test_sig.h" +#include "test_defs.h" +#include "sv_poly_f64.h" const static struct data { @@ -48,10 +48,16 @@ shifted_lookup (const svbool_t pg, const float64_t *table, svint64_t i) } /* Approximation for double-precision vector cbrt(x), using low-order - polynomial and two Newton iterations. Greatest observed error is 1.79 ULP. - Errors repeat according to the exponent, for instance an error observed for - double value m * 2^e will be observed for any input m * 2^(e + 3*i), where i - is an integer. + polynomial and two Newton iterations. + + The vector version of frexp does not handle subnormals + correctly. As a result these need to be handled by the scalar + fallback, where accuracy may be worse than that of the vector code + path. + + Greatest observed error in the normal range is 1.79 ULP. Errors repeat + according to the exponent, for instance an error observed for double value m + * 2^e will be observed for any input m * 2^(e + 3*i), where i is an integer. _ZGVsMxv_cbrt (0x0.3fffb8d4413f3p-1022) got 0x1.965f53b0e5d97p-342 want 0x1.965f53b0e5d95p-342. 
*/ svfloat64_t SV_NAME_D1 (cbrt) (svfloat64_t x, const svbool_t pg) @@ -117,6 +123,13 @@ svfloat64_t SV_NAME_D1 (cbrt) (svfloat64_t x, const svbool_t pg) return svreinterpret_f64 (svorr_x (pg, svreinterpret_u64 (y), sign)); } -PL_SIG (SV, D, 1, cbrt, -10.0, 10.0) -PL_TEST_ULP (SV_NAME_D1 (cbrt), 1.30) -PL_TEST_SYM_INTERVAL (SV_NAME_D1 (cbrt), 0, inf, 1000000) +/* Worse-case ULP error assumes that scalar fallback is GLIBC 2.40 cbrt, which + has ULP error of 3.67 at 0x1.7a337e1ba1ec2p-257 [1]. Largest observed error + in the vector path is 1.79 ULP. + [1] Innocente, V., & Zimmermann, P. (2024). Accuracy of Mathematical + Functions in Single, Double, Double Extended, and Quadruple Precision. */ +TEST_SIG (SV, D, 1, cbrt, -10.0, 10.0) +TEST_ULP (SV_NAME_D1 (cbrt), 3.17) +TEST_DISABLE_FENV (SV_NAME_D1 (cbrt)) +TEST_SYM_INTERVAL (SV_NAME_D1 (cbrt), 0, inf, 1000000) +CLOSE_SVE_ATTR diff --git a/pl/math/sv_cbrtf_1u7.c b/math/aarch64/sve/cbrtf.c similarity index 92% rename from pl/math/sv_cbrtf_1u7.c rename to math/aarch64/sve/cbrtf.c index 5b625f308827ce..afdace7865f19c 100644 --- a/pl/math/sv_cbrtf_1u7.c +++ b/math/aarch64/sve/cbrtf.c @@ -1,14 +1,14 @@ /* * Single-precision SVE cbrt(x) function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "sv_math.h" -#include "pl_sig.h" -#include "pl_test.h" -#include "poly_sve_f32.h" +#include "test_sig.h" +#include "test_defs.h" +#include "sv_poly_f32.h" const static struct data { @@ -111,6 +111,8 @@ svfloat32_t SV_NAME_F1 (cbrt) (svfloat32_t x, const svbool_t pg) return svreinterpret_f32 (svorr_x (pg, svreinterpret_u32 (y), sign)); } -PL_SIG (SV, F, 1, cbrt, -10.0, 10.0) -PL_TEST_ULP (SV_NAME_F1 (cbrt), 1.15) -PL_TEST_SYM_INTERVAL (SV_NAME_F1 (cbrt), 0, inf, 1000000) +TEST_SIG (SV, F, 1, cbrt, -10.0, 10.0) +TEST_ULP (SV_NAME_F1 (cbrt), 1.15) +TEST_DISABLE_FENV (SV_NAME_F1 (cbrt)) +TEST_SYM_INTERVAL (SV_NAME_F1 (cbrt), 0, inf, 1000000) +CLOSE_SVE_ATTR diff --git a/pl/math/sv_cexpi_3u5.c b/math/aarch64/sve/cexpi.c similarity index 79% rename from pl/math/sv_cexpi_3u5.c rename to math/aarch64/sve/cexpi.c index 920acfea5da0f5..0ccd110484c88b 100644 --- a/pl/math/sv_cexpi_3u5.c +++ b/math/aarch64/sve/cexpi.c @@ -1,13 +1,13 @@ /* * Double-precision vector cexpi function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "sv_sincos_common.h" #include "sv_math.h" -#include "pl_test.h" +#include "sv_sincos_common.h" +#include "test_defs.h" static svfloat64x2_t NOINLINE special_case (svfloat64_t x, svbool_t special, svfloat64x2_t y) @@ -34,12 +34,15 @@ _ZGVsMxv_cexpi (svfloat64_t x, svbool_t pg) return sc; } -PL_TEST_ULP (_ZGVsMxv_cexpi_sin, 2.73) -PL_TEST_ULP (_ZGVsMxv_cexpi_cos, 2.73) +TEST_DISABLE_FENV (_ZGVsMxv_cexpi_sin) +TEST_DISABLE_FENV (_ZGVsMxv_cexpi_cos) +TEST_ULP (_ZGVsMxv_cexpi_sin, 2.73) +TEST_ULP (_ZGVsMxv_cexpi_cos, 2.73) #define SV_CEXPI_INTERVAL(lo, hi, n) \ - PL_TEST_INTERVAL (_ZGVsMxv_cexpi_sin, lo, hi, n) \ - PL_TEST_INTERVAL (_ZGVsMxv_cexpi_cos, lo, hi, n) + TEST_INTERVAL (_ZGVsMxv_cexpi_sin, lo, hi, n) \ + TEST_INTERVAL (_ZGVsMxv_cexpi_cos, lo, hi, n) SV_CEXPI_INTERVAL (0, 0x1p23, 500000) SV_CEXPI_INTERVAL (-0, -0x1p23, 500000) SV_CEXPI_INTERVAL (0x1p23, inf, 10000) SV_CEXPI_INTERVAL (-0x1p23, -inf, 10000) +CLOSE_SVE_ATTR diff --git a/pl/math/sv_cexpif_1u8.c b/math/aarch64/sve/cexpif.c similarity index 80% rename from pl/math/sv_cexpif_1u8.c rename to math/aarch64/sve/cexpif.c index 93f2f998cb3896..fd07ce553cd893 100644 --- a/pl/math/sv_cexpif_1u8.c +++ b/math/aarch64/sve/cexpif.c @@ -1,13 +1,13 @@ /* * Single-precision vector cexpi function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "sv_sincosf_common.h" #include "sv_math.h" -#include "pl_test.h" +#include "sv_sincosf_common.h" +#include "test_defs.h" static svfloat32x2_t NOINLINE special_case (svfloat32_t x, svbool_t special, svfloat32x2_t y) @@ -36,12 +36,15 @@ _ZGVsMxv_cexpif (svfloat32_t x, svbool_t pg) return sc; } -PL_TEST_ULP (_ZGVsMxv_cexpif_sin, 1.17) -PL_TEST_ULP (_ZGVsMxv_cexpif_cos, 1.31) +TEST_DISABLE_FENV (_ZGVsMxv_cexpif_sin) +TEST_DISABLE_FENV (_ZGVsMxv_cexpif_cos) +TEST_ULP (_ZGVsMxv_cexpif_sin, 1.17) +TEST_ULP (_ZGVsMxv_cexpif_cos, 1.31) #define SV_CEXPIF_INTERVAL(lo, hi, n) \ - PL_TEST_INTERVAL (_ZGVsMxv_cexpif_sin, lo, hi, n) \ - PL_TEST_INTERVAL (_ZGVsMxv_cexpif_cos, lo, hi, n) + TEST_INTERVAL (_ZGVsMxv_cexpif_sin, lo, hi, n) \ + TEST_INTERVAL (_ZGVsMxv_cexpif_cos, lo, hi, n) SV_CEXPIF_INTERVAL (0, 0x1p20, 500000) SV_CEXPIF_INTERVAL (-0, -0x1p20, 500000) SV_CEXPIF_INTERVAL (0x1p20, inf, 10000) SV_CEXPIF_INTERVAL (-0x1p20, -inf, 10000) +CLOSE_SVE_ATTR diff --git a/pl/math/sv_cos_2u5.c b/math/aarch64/sve/cos.c similarity index 88% rename from pl/math/sv_cos_2u5.c rename to math/aarch64/sve/cos.c index 76af3459b3f2e2..93e93674a98a1d 100644 --- a/pl/math/sv_cos_2u5.c +++ b/math/aarch64/sve/cos.c @@ -1,13 +1,13 @@ /* * Double-precision SVE cos(x) function. * - * Copyright (c) 2019-2023, Arm Limited. + * Copyright (c) 2019-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "sv_math.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" static const struct data { @@ -80,7 +80,9 @@ svfloat64_t SV_NAME_D1 (cos) (svfloat64_t x, const svbool_t pg) return svmul_x (pg, f, y); } -PL_SIG (SV, D, 1, cos, -3.1, 3.1) -PL_TEST_ULP (SV_NAME_D1 (cos), 1.61) -PL_TEST_INTERVAL (SV_NAME_D1 (cos), 0, 0xffff0000, 10000) -PL_TEST_INTERVAL (SV_NAME_D1 (cos), 0x1p-4, 0x1p4, 500000) +TEST_SIG (SV, D, 1, cos, -3.1, 3.1) +TEST_ULP (SV_NAME_D1 (cos), 1.61) +TEST_DISABLE_FENV (SV_NAME_D1 (cos)) +TEST_INTERVAL (SV_NAME_D1 (cos), 0, 0xffff0000, 10000) +TEST_INTERVAL (SV_NAME_D1 (cos), 0x1p-4, 0x1p4, 500000) +CLOSE_SVE_ATTR diff --git a/pl/math/sv_cosf_2u1.c b/math/aarch64/sve/cosf.c similarity index 87% rename from pl/math/sv_cosf_2u1.c rename to math/aarch64/sve/cosf.c index 4bdb0dd146bbfc..7d18f8c2ad21a4 100644 --- a/pl/math/sv_cosf_2u1.c +++ b/math/aarch64/sve/cosf.c @@ -1,13 +1,13 @@ /* * Single-precision SVE cos(x) function. * - * Copyright (c) 2019-2023, Arm Limited. + * Copyright (c) 2019-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "sv_math.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" static const struct data { @@ -74,7 +74,9 @@ svfloat32_t SV_NAME_F1 (cos) (svfloat32_t x, const svbool_t pg) return svmul_x (pg, f, y); } -PL_SIG (SV, F, 1, cos, -3.1, 3.1) -PL_TEST_ULP (SV_NAME_F1 (cos), 1.57) -PL_TEST_INTERVAL (SV_NAME_F1 (cos), 0, 0xffff0000, 10000) -PL_TEST_INTERVAL (SV_NAME_F1 (cos), 0x1p-4, 0x1p4, 500000) +TEST_SIG (SV, F, 1, cos, -3.1, 3.1) +TEST_ULP (SV_NAME_F1 (cos), 1.57) +TEST_DISABLE_FENV (SV_NAME_F1 (cos)) +TEST_INTERVAL (SV_NAME_F1 (cos), 0, 0xffff0000, 10000) +TEST_INTERVAL (SV_NAME_F1 (cos), 0x1p-4, 0x1p4, 500000) +CLOSE_SVE_ATTR diff --git a/pl/math/sv_cosh_2u.c b/math/aarch64/sve/cosh.c similarity index 77% rename from pl/math/sv_cosh_2u.c rename to math/aarch64/sve/cosh.c index a6d743fb9b966a..775854cfbe5a87 100644 --- a/pl/math/sv_cosh_2u.c +++ b/math/aarch64/sve/cosh.c @@ -1,19 +1,19 @@ /* * Double-precision SVE cosh(x) function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2025, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "sv_math.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" static const struct data { float64_t poly[3]; float64_t inv_ln2, ln2_hi, ln2_lo, shift, thres; - uint64_t index_mask, special_bound; + uint64_t special_bound; } data = { .poly = { 0x1.fffffffffffd4p-2, 0x1.5555571d6b68cp-3, 0x1.5555576a59599p-5, }, @@ -25,14 +25,16 @@ static const struct data .shift = 0x1.8p+52, .thres = 704.0, - .index_mask = 0xff, /* 0x1.6p9, above which exp overflows. 
*/ .special_bound = 0x4086000000000000, }; static svfloat64_t NOINLINE -special_case (svfloat64_t x, svfloat64_t y, svbool_t special) +special_case (svfloat64_t x, svbool_t pg, svfloat64_t t, svbool_t special) { + svfloat64_t half_t = svmul_x (svptrue_b64 (), t, 0.5); + svfloat64_t half_over_t = svdivr_x (pg, t, 0.5); + svfloat64_t y = svadd_x (pg, half_t, half_over_t); return sv_call_f64 (cosh, x, y, special); } @@ -50,12 +52,12 @@ exp_inline (svfloat64_t x, const svbool_t pg, const struct data *d) svuint64_t u = svreinterpret_u64 (z); svuint64_t e = svlsl_x (pg, u, 52 - V_EXP_TAIL_TABLE_BITS); - svuint64_t i = svand_x (pg, u, d->index_mask); + svuint64_t i = svand_x (svptrue_b64 (), u, 0xff); svfloat64_t y = svmla_x (pg, sv_f64 (d->poly[1]), r, d->poly[2]); y = svmla_x (pg, sv_f64 (d->poly[0]), r, y); y = svmla_x (pg, sv_f64 (1.0), r, y); - y = svmul_x (pg, r, y); + y = svmul_x (svptrue_b64 (), r, y); /* s = 2^(n/N). */ u = svld1_gather_index (pg, __v_exp_tail_data, i); @@ -84,17 +86,19 @@ svfloat64_t SV_NAME_D1 (cosh) (svfloat64_t x, const svbool_t pg) /* Up to the point that exp overflows, we can use it to calculate cosh by exp(|x|) / 2 + 1 / (2 * exp(|x|)). */ svfloat64_t t = exp_inline (ax, pg, d); - svfloat64_t half_t = svmul_x (pg, t, 0.5); - svfloat64_t half_over_t = svdivr_x (pg, t, 0.5); /* Fall back to scalar for any special cases. 
*/ if (unlikely (svptest_any (pg, special))) - return special_case (x, svadd_x (pg, half_t, half_over_t), special); + return special_case (x, pg, t, special); + svfloat64_t half_t = svmul_x (svptrue_b64 (), t, 0.5); + svfloat64_t half_over_t = svdivr_x (pg, t, 0.5); return svadd_x (pg, half_t, half_over_t); } -PL_SIG (SV, D, 1, cosh, -10.0, 10.0) -PL_TEST_ULP (SV_NAME_D1 (cosh), 1.43) -PL_TEST_SYM_INTERVAL (SV_NAME_D1 (cosh), 0, 0x1.6p9, 100000) -PL_TEST_SYM_INTERVAL (SV_NAME_D1 (cosh), 0x1.6p9, inf, 1000) +TEST_SIG (SV, D, 1, cosh, -10.0, 10.0) +TEST_ULP (SV_NAME_D1 (cosh), 1.43) +TEST_DISABLE_FENV (SV_NAME_D1 (cosh)) +TEST_SYM_INTERVAL (SV_NAME_D1 (cosh), 0, 0x1.6p9, 100000) +TEST_SYM_INTERVAL (SV_NAME_D1 (cosh), 0x1.6p9, inf, 1000) +CLOSE_SVE_ATTR \ No newline at end of file diff --git a/math/aarch64/sve/coshf.c b/math/aarch64/sve/coshf.c new file mode 100644 index 00000000000000..b79fed2374b55c --- /dev/null +++ b/math/aarch64/sve/coshf.c @@ -0,0 +1,62 @@ +/* + * Single-precision SVE cosh(x) function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "test_sig.h" +#include "test_defs.h" +#include "sv_expf_inline.h" + +static const struct data +{ + struct sv_expf_data expf_consts; + float special_bound; +} data = { + .expf_consts = SV_EXPF_DATA, + /* 0x1.5a92d8p+6: expf overflows above this, so have to use special case. */ + .special_bound = 0x1.5a92d8p+6, +}; + +static svfloat32_t NOINLINE +special_case (svfloat32_t x, svfloat32_t half_e, svfloat32_t half_over_e, + svbool_t pg) +{ + return sv_call_f32 (coshf, x, svadd_x (svptrue_b32 (), half_e, half_over_e), + pg); +} + +/* Single-precision vector cosh, using vector expf. + Maximum error is 2.77 ULP: + _ZGVsMxv_coshf(-0x1.5b38f4p+1) got 0x1.e45946p+2 + want 0x1.e4594cp+2. 
*/ +svfloat32_t SV_NAME_F1 (cosh) (svfloat32_t x, svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + svbool_t special = svacge (pg, x, d->special_bound); + + /* Calculate cosh by exp(x) / 2 + exp(-x) / 2. + Note that x is passed to exp here, rather than |x|. This is to avoid using + destructive unary ABS for better register usage. However it means the + routine is not exactly symmetrical, as the exp helper is slightly less + accurate in the negative range. */ + svfloat32_t e = expf_inline (x, pg, &d->expf_consts); + svfloat32_t half_e = svmul_x (svptrue_b32 (), e, 0.5); + svfloat32_t half_over_e = svdivr_x (pg, e, 0.5); + + if (unlikely (svptest_any (pg, special))) + return special_case (x, half_e, half_over_e, special); + + return svadd_x (svptrue_b32 (), half_e, half_over_e); +} + +TEST_SIG (SV, F, 1, cosh, -10.0, 10.0) +TEST_ULP (SV_NAME_F1 (cosh), 2.28) +TEST_DISABLE_FENV (SV_NAME_F1 (cosh)) +TEST_SYM_INTERVAL (SV_NAME_F1 (cosh), 0, 0x1p-63, 100) +TEST_SYM_INTERVAL (SV_NAME_F1 (cosh), 0, 0x1.5a92d8p+6, 80000) +TEST_SYM_INTERVAL (SV_NAME_F1 (cosh), 0x1.5a92d8p+6, inf, 2000) +CLOSE_SVE_ATTR diff --git a/pl/math/sv_cospi_3u2.c b/math/aarch64/sve/cospi.c similarity index 78% rename from pl/math/sv_cospi_3u2.c rename to math/aarch64/sve/cospi.c index d80f899c41e410..9859dbe7a44c7e 100644 --- a/pl/math/sv_cospi_3u2.c +++ b/math/aarch64/sve/cospi.c @@ -1,15 +1,15 @@ /* * Double-precision SVE cospi(x) function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "mathlib.h" #include "sv_math.h" -#include "pl_sig.h" -#include "pl_test.h" -#include "poly_sve_f64.h" +#include "mathlib.h" +#include "test_sig.h" +#include "test_defs.h" +#include "sv_poly_f64.h" static const struct data { @@ -55,9 +55,12 @@ svfloat64_t SV_NAME_D1 (cospi) (svfloat64_t x, const svbool_t pg) return svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (y), sign)); } -PL_SIG (SV, D, 1, cospi, -0.9, 0.9) -PL_TEST_ULP (SV_NAME_D1 (cospi), 2.71) -PL_TEST_SYM_INTERVAL (SV_NAME_D1 (cospi), 0, 0x1p-63, 5000) -PL_TEST_SYM_INTERVAL (SV_NAME_D1 (cospi), 0x1p-63, 0.5, 10000) -PL_TEST_SYM_INTERVAL (SV_NAME_D1 (cospi), 0.5, 0x1p51, 10000) -PL_TEST_SYM_INTERVAL (SV_NAME_D1 (cospi), 0x1p51, inf, 100000) +#if WANT_TRIGPI_TESTS +TEST_ULP (SV_NAME_D1 (cospi), 2.71) +TEST_DISABLE_FENV (SV_NAME_D1 (cospi)) +TEST_SYM_INTERVAL (SV_NAME_D1 (cospi), 0, 0x1p-63, 5000) +TEST_SYM_INTERVAL (SV_NAME_D1 (cospi), 0x1p-63, 0.5, 10000) +TEST_SYM_INTERVAL (SV_NAME_D1 (cospi), 0.5, 0x1p51, 10000) +TEST_SYM_INTERVAL (SV_NAME_D1 (cospi), 0x1p51, inf, 100000) +#endif +CLOSE_SVE_ATTR diff --git a/pl/math/sv_cospif_2u6.c b/math/aarch64/sve/cospif.c similarity index 75% rename from pl/math/sv_cospif_2u6.c rename to math/aarch64/sve/cospif.c index fb2922d0533abf..d65a2b6190231f 100644 --- a/pl/math/sv_cospif_2u6.c +++ b/math/aarch64/sve/cospif.c @@ -1,15 +1,15 @@ /* * Single-precision SVE cospi(x) function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "mathlib.h" #include "sv_math.h" -#include "pl_sig.h" -#include "pl_test.h" -#include "poly_sve_f32.h" +#include "mathlib.h" +#include "test_sig.h" +#include "test_defs.h" +#include "sv_poly_f32.h" static const struct data { @@ -51,9 +51,12 @@ svfloat32_t SV_NAME_F1 (cospi) (svfloat32_t x, const svbool_t pg) return svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (y), sign)); } -PL_SIG (SV, F, 1, cospi, -0.9, 0.9) -PL_TEST_ULP (SV_NAME_F1 (cospi), 2.08) -PL_TEST_SYM_INTERVAL (SV_NAME_F1 (cospi), 0, 0x1p-31, 5000) -PL_TEST_SYM_INTERVAL (SV_NAME_F1 (cospi), 0x1p-31, 0.5, 10000) -PL_TEST_SYM_INTERVAL (SV_NAME_F1 (cospi), 0.5, 0x1p31f, 10000) -PL_TEST_SYM_INTERVAL (SV_NAME_F1 (cospi), 0x1p31f, inf, 10000) +#if WANT_TRIGPI_TESTS +TEST_ULP (SV_NAME_F1 (cospi), 2.08) +TEST_DISABLE_FENV (SV_NAME_F1 (cospi)) +TEST_SYM_INTERVAL (SV_NAME_F1 (cospi), 0, 0x1p-31, 5000) +TEST_SYM_INTERVAL (SV_NAME_F1 (cospi), 0x1p-31, 0.5, 10000) +TEST_SYM_INTERVAL (SV_NAME_F1 (cospi), 0.5, 0x1p31f, 10000) +TEST_SYM_INTERVAL (SV_NAME_F1 (cospi), 0x1p31f, inf, 10000) +#endif +CLOSE_SVE_ATTR diff --git a/pl/math/sv_erf_2u5.c b/math/aarch64/sve/erf.c similarity index 83% rename from pl/math/sv_erf_2u5.c rename to math/aarch64/sve/erf.c index cbf9718e5bb0fd..ccade93e103397 100644 --- a/pl/math/sv_erf_2u5.c +++ b/math/aarch64/sve/erf.c @@ -1,13 +1,13 @@ /* * Double-precision vector erf(x) function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "sv_math.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" static const struct data { @@ -57,14 +57,16 @@ svfloat64_t SV_NAME_D1 (erf) (svfloat64_t x, const svbool_t pg) svfloat64_t a = svabs_x (pg, x); svfloat64_t shift = sv_f64 (dat->shift); svfloat64_t z = svadd_x (pg, a, shift); - svuint64_t i - = svsub_x (pg, svreinterpret_u64 (z), svreinterpret_u64 (shift)); + svuint64_t i = svand_x (pg, svreinterpret_u64 (z), 0xfff); + i = svadd_x (pg, i, i); /* Lookup without shortcut for small values but with predicate to avoid segfault for large values and NaNs. */ svfloat64_t r = svsub_x (pg, z, shift); - svfloat64_t erfr = svld1_gather_index (a_lt_max, __sv_erf_data.erf, i); - svfloat64_t scale = svld1_gather_index (a_lt_max, __sv_erf_data.scale, i); + svfloat64_t erfr + = svld1_gather_index (a_lt_max, &__v_erf_data.tab[0].erf, i); + svfloat64_t scale + = svld1_gather_index (a_lt_max, &__v_erf_data.tab[0].scale, i); /* erf(x) ~ erf(r) + scale * d * poly (r, d). 
*/ svfloat64_t d = svsub_x (pg, a, r); @@ -104,8 +106,10 @@ svfloat64_t SV_NAME_D1 (erf) (svfloat64_t x, const svbool_t pg) return svreinterpret_f64 (svorr_x (pg, sign, iy)); } -PL_SIG (SV, D, 1, erf, -6.0, 6.0) -PL_TEST_ULP (SV_NAME_D1 (erf), 1.79) -PL_TEST_SYM_INTERVAL (SV_NAME_D1 (erf), 0, 5.9921875, 40000) -PL_TEST_SYM_INTERVAL (SV_NAME_D1 (erf), 5.9921875, inf, 40000) -PL_TEST_SYM_INTERVAL (SV_NAME_D1 (erf), 0, inf, 4000) +TEST_SIG (SV, D, 1, erf, -6.0, 6.0) +TEST_ULP (SV_NAME_D1 (erf), 1.79) +TEST_DISABLE_FENV (SV_NAME_D1 (erf)) +TEST_SYM_INTERVAL (SV_NAME_D1 (erf), 0, 5.9921875, 40000) +TEST_SYM_INTERVAL (SV_NAME_D1 (erf), 5.9921875, inf, 40000) +TEST_SYM_INTERVAL (SV_NAME_D1 (erf), 0, inf, 4000) +CLOSE_SVE_ATTR diff --git a/pl/math/sv_erfc_1u8.c b/math/aarch64/sve/erfc.c similarity index 91% rename from pl/math/sv_erfc_1u8.c rename to math/aarch64/sve/erfc.c index a91bef96f2e73a..a85cacb1ae6226 100644 --- a/pl/math/sv_erfc_1u8.c +++ b/math/aarch64/sve/erfc.c @@ -1,13 +1,13 @@ /* * Double-precision vector erfc(x) function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "sv_math.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" static const struct data { @@ -93,7 +93,7 @@ svfloat64_t SV_NAME_D1 (erfc) (svfloat64_t x, const svbool_t pg) /* Lookup erfc(r) and 2/sqrt(pi)*exp(-r^2) in tables. 
*/ i = svadd_x (pg, i, i); - const float64_t *p = &__erfc_data.tab[0].erfc - 2 * dat->off_arr; + const float64_t *p = &__v_erfc_data.tab[0].erfc - 2 * dat->off_arr; svfloat64_t erfcr = svld1_gather_index (pg, p, i); svfloat64_t scale = svld1_gather_index (pg, p + 1, i); @@ -155,10 +155,12 @@ svfloat64_t SV_NAME_D1 (erfc) (svfloat64_t x, const svbool_t pg) return svmla_x (pg, off, fac, y); } -PL_SIG (SV, D, 1, erfc, -6.0, 28.0) -PL_TEST_ULP (SV_NAME_D1 (erfc), 1.21) -PL_TEST_SYM_INTERVAL (SV_NAME_D1 (erfc), 0.0, 0x1p-26, 40000) -PL_TEST_INTERVAL (SV_NAME_D1 (erfc), 0x1p-26, 28.0, 40000) -PL_TEST_INTERVAL (SV_NAME_D1 (erfc), -0x1p-26, -6.0, 40000) -PL_TEST_INTERVAL (SV_NAME_D1 (erfc), 28.0, inf, 40000) -PL_TEST_INTERVAL (SV_NAME_D1 (erfc), 6.0, -inf, 40000) +TEST_SIG (SV, D, 1, erfc, -6.0, 28.0) +TEST_ULP (SV_NAME_D1 (erfc), 1.21) +TEST_DISABLE_FENV (SV_NAME_D1 (erfc)) +TEST_SYM_INTERVAL (SV_NAME_D1 (erfc), 0.0, 0x1p-26, 40000) +TEST_INTERVAL (SV_NAME_D1 (erfc), 0x1p-26, 28.0, 40000) +TEST_INTERVAL (SV_NAME_D1 (erfc), -0x1p-26, -6.0, 40000) +TEST_INTERVAL (SV_NAME_D1 (erfc), 28.0, inf, 40000) +TEST_INTERVAL (SV_NAME_D1 (erfc), 6.0, -inf, 40000) +CLOSE_SVE_ATTR diff --git a/pl/math/sv_erfcf_1u7.c b/math/aarch64/sve/erfcf.c similarity index 77% rename from pl/math/sv_erfcf_1u7.c rename to math/aarch64/sve/erfcf.c index cda8f0b3752e6f..936881332291ad 100644 --- a/pl/math/sv_erfcf_1u7.c +++ b/math/aarch64/sve/erfcf.c @@ -1,13 +1,13 @@ /* * Single-precision vector erfc(x) function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "sv_math.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" static const struct data { @@ -66,23 +66,23 @@ svfloat32_t SV_NAME_F1 (erfc) (svfloat32_t x, const svbool_t pg) svuint32_t i = svqadd (svreinterpret_u32 (z), dat->off_idx); /* Lookup erfc(r) and 2/sqrt(pi)*exp(-r^2) in tables. 
*/ - i = svmul_x (pg, i, 2); - const float32_t *p = &__erfcf_data.tab[0].erfc - 2 * dat->off_arr; + i = svlsl_x (svptrue_b32 (), i, 1); + const float32_t *p = &__v_erfcf_data.tab[0].erfc - 2 * dat->off_arr; svfloat32_t erfcr = svld1_gather_index (pg, p, i); svfloat32_t scale = svld1_gather_index (pg, p + 1, i); /* erfc(x) ~ erfc(r) - scale * d * poly(r, d). */ svfloat32_t r = svsub_x (pg, z, shift); svfloat32_t d = svsub_x (pg, a, r); - svfloat32_t d2 = svmul_x (pg, d, d); - svfloat32_t r2 = svmul_x (pg, r, r); + svfloat32_t d2 = svmul_x (svptrue_b32 (), d, d); + svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r); svfloat32_t coeffs = svld1rq (svptrue_b32 (), &dat->third); - svfloat32_t third = svdup_lane (coeffs, 0); svfloat32_t p1 = r; - svfloat32_t p2 = svmls_lane (third, r2, coeffs, 1); - svfloat32_t p3 = svmul_x (pg, r, svmla_lane (sv_f32 (-0.5), r2, coeffs, 0)); + svfloat32_t p2 = svmls_lane (sv_f32 (dat->third), r2, coeffs, 1); + svfloat32_t p3 + = svmul_x (svptrue_b32 (), r, svmla_lane (sv_f32 (-0.5), r2, coeffs, 0)); svfloat32_t p4 = svmla_lane (sv_f32 (dat->two_over_five), r2, coeffs, 2); p4 = svmls_x (pg, sv_f32 (dat->tenth), r2, p4); @@ -102,10 +102,12 @@ svfloat32_t SV_NAME_F1 (erfc) (svfloat32_t x, const svbool_t pg) return svmla_x (pg, off, fac, y); } -PL_SIG (SV, F, 1, erfc, -4.0, 10.0) -PL_TEST_ULP (SV_NAME_F1 (erfc), 1.14) -PL_TEST_SYM_INTERVAL (SV_NAME_F1 (erfc), 0.0, 0x1p-26, 40000) -PL_TEST_INTERVAL (SV_NAME_F1 (erfc), 0x1p-26, 10.0625, 40000) -PL_TEST_INTERVAL (SV_NAME_F1 (erfc), -0x1p-26, -4.0, 40000) -PL_TEST_INTERVAL (SV_NAME_F1 (erfc), 10.0625, inf, 40000) -PL_TEST_INTERVAL (SV_NAME_F1 (erfc), -4.0, -inf, 40000) +TEST_SIG (SV, F, 1, erfc, -4.0, 10.0) +TEST_ULP (SV_NAME_F1 (erfc), 1.14) +TEST_DISABLE_FENV (SV_NAME_F1 (erfc)) +TEST_SYM_INTERVAL (SV_NAME_F1 (erfc), 0.0, 0x1p-26, 40000) +TEST_INTERVAL (SV_NAME_F1 (erfc), 0x1p-26, 10.0625, 40000) +TEST_INTERVAL (SV_NAME_F1 (erfc), -0x1p-26, -4.0, 40000) +TEST_INTERVAL (SV_NAME_F1 (erfc), 10.0625, 
inf, 40000) +TEST_INTERVAL (SV_NAME_F1 (erfc), -4.0, -inf, 40000) +CLOSE_SVE_ATTR diff --git a/pl/math/sv_erff_2u.c b/math/aarch64/sve/erff.c similarity index 77% rename from pl/math/sv_erff_2u.c rename to math/aarch64/sve/erff.c index adeee798ee2e08..c8c87499a63fae 100644 --- a/pl/math/sv_erff_2u.c +++ b/math/aarch64/sve/erff.c @@ -1,13 +1,13 @@ /* * Single-precision vector erf(x) function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "sv_math.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" static const struct data { @@ -52,18 +52,17 @@ svfloat32_t SV_NAME_F1 (erf) (svfloat32_t x, const svbool_t pg) svfloat32_t shift = sv_f32 (dat->shift); svfloat32_t z = svadd_x (pg, a, shift); - svuint32_t i - = svsub_x (pg, svreinterpret_u32 (z), svreinterpret_u32 (shift)); - - /* Saturate lookup index. */ - i = svsel (a_ge_max, sv_u32 (512), i); + svuint32_t i = svand_x (pg, svreinterpret_u32 (z), 0xfff); + i = svadd_x (pg, i, i); /* r and erf(r) set to 0 for |x| below min. */ svfloat32_t r = svsub_z (a_gt_min, z, shift); - svfloat32_t erfr = svld1_gather_index (a_gt_min, __sv_erff_data.erf, i); + svfloat32_t erfr + = svld1_gather_index (a_gt_min, &__v_erff_data.tab[0].erf, i); /* scale set to 2/sqrt(pi) for |x| below min. */ - svfloat32_t scale = svld1_gather_index (a_gt_min, __sv_erff_data.scale, i); + svfloat32_t scale + = svld1_gather_index (a_gt_min, &__v_erff_data.tab[0].scale, i); scale = svsel (a_gt_min, scale, sv_f32 (dat->scale)); /* erf(x) ~ erf(r) + scale * d * (1 - r * d + 1/3 * d^2). 
*/ @@ -82,9 +81,11 @@ svfloat32_t SV_NAME_F1 (erf) (svfloat32_t x, const svbool_t pg) return svreinterpret_f32 (svorr_x (pg, sign, iy)); } -PL_SIG (SV, F, 1, erf, -4.0, 4.0) -PL_TEST_ULP (SV_NAME_F1 (erf), 1.43) -PL_TEST_SYM_INTERVAL (SV_NAME_F1 (erf), 0, 0x1.cp-7, 40000) -PL_TEST_SYM_INTERVAL (SV_NAME_F1 (erf), 0x1.cp-7, 3.9375, 40000) -PL_TEST_SYM_INTERVAL (SV_NAME_F1 (erf), 3.9375, inf, 40000) -PL_TEST_SYM_INTERVAL (SV_NAME_F1 (erf), 0, inf, 4000) +TEST_SIG (SV, F, 1, erf, -4.0, 4.0) +TEST_ULP (SV_NAME_F1 (erf), 1.43) +TEST_DISABLE_FENV (SV_NAME_F1 (erf)) +TEST_SYM_INTERVAL (SV_NAME_F1 (erf), 0, 0x1.cp-7, 40000) +TEST_SYM_INTERVAL (SV_NAME_F1 (erf), 0x1.cp-7, 3.9375, 40000) +TEST_SYM_INTERVAL (SV_NAME_F1 (erf), 3.9375, inf, 40000) +TEST_SYM_INTERVAL (SV_NAME_F1 (erf), 0, inf, 4000) +CLOSE_SVE_ATTR diff --git a/pl/math/sv_exp_1u5.c b/math/aarch64/sve/exp.c similarity index 79% rename from pl/math/sv_exp_1u5.c rename to math/aarch64/sve/exp.c index c187def9e62530..b021e64ffedf21 100644 --- a/pl/math/sv_exp_1u5.c +++ b/math/aarch64/sve/exp.c @@ -1,22 +1,25 @@ /* * Double-precision vector e^x function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2025, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "sv_math.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" static const struct data { - double poly[4]; + double c0, c2; + double c1, c3; double ln2_hi, ln2_lo, inv_ln2, shift, thres; + } data = { - .poly = { /* ulp error: 0.53. */ - 0x1.fffffffffdbcdp-2, 0x1.555555555444cp-3, 0x1.555573c6a9f7dp-5, - 0x1.1111266d28935p-7 }, + .c0 = 0x1.fffffffffdbcdp-2, + .c1 = 0x1.555555555444cp-3, + .c2 = 0x1.555573c6a9f7dp-5, + .c3 = 0x1.1111266d28935p-7, .ln2_hi = 0x1.62e42fefa3800p-1, .ln2_lo = 0x1.ef35793c76730p-45, /* 1/ln2. */ @@ -26,7 +29,6 @@ static const struct data .thres = 704.0, }; -#define C(i) sv_f64 (d->poly[i]) #define SpecialOffset 0x6000000000000000 /* 0x1p513. 
*/ /* SpecialBias1 + SpecialBias1 = asuint(1.0). */ #define SpecialBias1 0x7000000000000000 /* 0x1p769. */ @@ -46,20 +48,20 @@ special_case (svbool_t pg, svfloat64_t s, svfloat64_t y, svfloat64_t n) svuint64_t b = svdup_u64_z (p_sign, SpecialOffset); /* Inactive lanes set to 0. */ - /* Set s1 to generate overflow depending on sign of exponent n. */ - svfloat64_t s1 = svreinterpret_f64 ( - svsubr_x (pg, b, SpecialBias1)); /* 0x70...0 - b. */ - /* Offset s to avoid overflow in final result if n is below threshold. */ + /* Set s1 to generate overflow depending on sign of exponent n, + ie. s1 = 0x70...0 - b. */ + svfloat64_t s1 = svreinterpret_f64 (svsubr_x (pg, b, SpecialBias1)); + /* Offset s to avoid overflow in final result if n is below threshold. + ie. s2 = as_u64 (s) - 0x3010...0 + b. */ svfloat64_t s2 = svreinterpret_f64 ( - svadd_x (pg, svsub_x (pg, svreinterpret_u64 (s), SpecialBias2), - b)); /* as_u64 (s) - 0x3010...0 + b. */ + svadd_x (pg, svsub_x (pg, svreinterpret_u64 (s), SpecialBias2), b)); /* |n| > 1280 => 2^(n) overflows. */ svbool_t p_cmp = svacgt (pg, n, 1280.0); - svfloat64_t r1 = svmul_x (pg, s1, s1); + svfloat64_t r1 = svmul_x (svptrue_b64 (), s1, s1); svfloat64_t r2 = svmla_x (pg, s2, s2, y); - svfloat64_t r0 = svmul_x (pg, r2, s1); + svfloat64_t r0 = svmul_x (svptrue_b64 (), r2, s1); return svsel (p_cmp, r1, r0); } @@ -93,16 +95,16 @@ svfloat64_t SV_NAME_D1 (exp) (svfloat64_t x, const svbool_t pg) svfloat64_t z = svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2); svuint64_t u = svreinterpret_u64 (z); svfloat64_t n = svsub_x (pg, z, d->shift); - + svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1); /* r = x - n * ln2, r is in [-ln2/(2N), ln2/(2N)]. */ svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi); svfloat64_t r = svmls_lane (x, n, ln2, 0); r = svmls_lane (r, n, ln2, 1); /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5. 
*/ - svfloat64_t r2 = svmul_x (pg, r, r); - svfloat64_t p01 = svmla_x (pg, C (0), C (1), r); - svfloat64_t p23 = svmla_x (pg, C (2), C (3), r); + svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); + svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), r, c13, 0); + svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), r, c13, 1); svfloat64_t p04 = svmla_x (pg, p01, p23, r2); svfloat64_t y = svmla_x (pg, r, p04, r2); @@ -129,9 +131,11 @@ svfloat64_t SV_NAME_D1 (exp) (svfloat64_t x, const svbool_t pg) return svmla_x (pg, s, s, y); } -PL_SIG (SV, D, 1, exp, -9.9, 9.9) -PL_TEST_ULP (SV_NAME_D1 (exp), 1.46) -PL_TEST_SYM_INTERVAL (SV_NAME_D1 (exp), 0, 0x1p-23, 40000) -PL_TEST_SYM_INTERVAL (SV_NAME_D1 (exp), 0x1p-23, 1, 50000) -PL_TEST_SYM_INTERVAL (SV_NAME_D1 (exp), 1, 0x1p23, 50000) -PL_TEST_SYM_INTERVAL (SV_NAME_D1 (exp), 0x1p23, inf, 50000) +TEST_SIG (SV, D, 1, exp, -9.9, 9.9) +TEST_ULP (SV_NAME_D1 (exp), 1.46) +TEST_DISABLE_FENV (SV_NAME_D1 (exp)) +TEST_SYM_INTERVAL (SV_NAME_D1 (exp), 0, 0x1p-23, 40000) +TEST_SYM_INTERVAL (SV_NAME_D1 (exp), 0x1p-23, 1, 50000) +TEST_SYM_INTERVAL (SV_NAME_D1 (exp), 1, 0x1p23, 50000) +TEST_SYM_INTERVAL (SV_NAME_D1 (exp), 0x1p23, inf, 50000) +CLOSE_SVE_ATTR diff --git a/pl/math/sv_exp10_1u5.c b/math/aarch64/sve/exp10.c similarity index 79% rename from pl/math/sv_exp10_1u5.c rename to math/aarch64/sve/exp10.c index 519693afcab0b3..3d6af334e155f2 100644 --- a/pl/math/sv_exp10_1u5.c +++ b/math/aarch64/sve/exp10.c @@ -1,28 +1,30 @@ /* * Double-precision SVE 10^x function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2025, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "sv_math.h" -#include "pl_sig.h" -#include "pl_test.h" -#include "poly_sve_f64.h" +#include "test_sig.h" +#include "test_defs.h" #define SpecialBound 307.0 /* floor (log10 (2^1023)). 
*/ static const struct data { - double poly[5]; + double c1, c3, c2, c4, c0; double shift, log10_2, log2_10_hi, log2_10_lo, scale_thres, special_bound; } data = { /* Coefficients generated using Remez algorithm. rel error: 0x1.9fcb9b3p-60 abs error: 0x1.a20d9598p-60 in [ -log10(2)/128, log10(2)/128 ] max ulp err 0.52 +0.5. */ - .poly = { 0x1.26bb1bbb55516p1, 0x1.53524c73cd32ap1, 0x1.0470591daeafbp1, - 0x1.2bd77b1361ef6p0, 0x1.142b5d54e9621p-1 }, + .c0 = 0x1.26bb1bbb55516p1, + .c1 = 0x1.53524c73cd32ap1, + .c2 = 0x1.0470591daeafbp1, + .c3 = 0x1.2bd77b1361ef6p0, + .c4 = 0x1.142b5d54e9621p-1, /* 1.5*2^46+1023. This value is further explained below. */ .shift = 0x1.800000000ffc0p+46, .log10_2 = 0x1.a934f0979a371p1, /* 1/log2(10). */ @@ -60,9 +62,9 @@ special_case (svbool_t pg, svfloat64_t s, svfloat64_t y, svfloat64_t n, /* |n| > 1280 => 2^(n) overflows. */ svbool_t p_cmp = svacgt (pg, n, d->scale_thres); - svfloat64_t r1 = svmul_x (pg, s1, s1); + svfloat64_t r1 = svmul_x (svptrue_b64 (), s1, s1); svfloat64_t r2 = svmla_x (pg, s2, s2, y); - svfloat64_t r0 = svmul_x (pg, r2, s1); + svfloat64_t r0 = svmul_x (svptrue_b64 (), r2, s1); return svsel (p_cmp, r1, r0); } @@ -93,11 +95,14 @@ svfloat64_t SV_NAME_D1 (exp10) (svfloat64_t x, svbool_t pg) comes at significant performance cost. */ svuint64_t u = svreinterpret_u64 (z); svfloat64_t scale = svexpa (u); - + svfloat64_t c24 = svld1rq (svptrue_b64 (), &d->c2); /* Approximate exp10(r) using polynomial. */ - svfloat64_t r2 = svmul_x (pg, r, r); - svfloat64_t y = svmla_x (pg, svmul_x (pg, r, d->poly[0]), r2, - sv_pairwise_poly_3_f64_x (pg, r, r2, d->poly + 1)); + svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); + svfloat64_t p12 = svmla_lane (sv_f64 (d->c1), r, c24, 0); + svfloat64_t p34 = svmla_lane (sv_f64 (d->c3), r, c24, 1); + svfloat64_t p14 = svmla_x (pg, p12, p34, r2); + + svfloat64_t y = svmla_x (pg, svmul_x (svptrue_b64 (), r, d->c0), r2, p14); /* Assemble result as exp10(x) = 2^n * exp10(r). 
If |x| > SpecialBound multiplication may overflow, so use special case routine. */ @@ -116,7 +121,11 @@ svfloat64_t SV_NAME_D1 (exp10) (svfloat64_t x, svbool_t pg) return svmla_x (pg, scale, scale, y); } -PL_SIG (SV, D, 1, exp10, -9.9, 9.9) -PL_TEST_ULP (SV_NAME_D1 (exp10), 0.52) -PL_TEST_SYM_INTERVAL (SV_NAME_D1 (exp10), 0, 307, 10000) -PL_TEST_SYM_INTERVAL (SV_NAME_D1 (exp10), 307, inf, 1000) +#if WANT_EXP10_TESTS +TEST_SIG (SV, D, 1, exp10, -9.9, 9.9) +TEST_ULP (SV_NAME_D1 (exp10), 0.52) +TEST_DISABLE_FENV (SV_NAME_D1 (exp10)) +TEST_SYM_INTERVAL (SV_NAME_D1 (exp10), 0, SpecialBound, 10000) +TEST_SYM_INTERVAL (SV_NAME_D1 (exp10), SpecialBound, inf, 1000) +#endif +CLOSE_SVE_ATTR diff --git a/math/aarch64/sve/exp10f.c b/math/aarch64/sve/exp10f.c new file mode 100644 index 00000000000000..8679df87702f0f --- /dev/null +++ b/math/aarch64/sve/exp10f.c @@ -0,0 +1,101 @@ +/* + * Single-precision SVE 10^x function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#define _GNU_SOURCE +#include "sv_math.h" +#include "test_sig.h" +#include "test_defs.h" +#include "sv_poly_f32.h" + +/* For x < -Thres, the result is subnormal and not handled correctly by + FEXPA. */ +#define Thres 37.9 + +static const struct data +{ + float log2_10_lo, c0, c2, c4; + float c1, c3, log10_2; + float shift, log2_10_hi, thres; +} data = { + /* Coefficients generated using Remez algorithm with minimisation of relative + error. + rel error: 0x1.89dafa3p-24 + abs error: 0x1.167d55p-23 in [-log10(2)/2, log10(2)/2] + maxerr: 0.52 +0.5 ulp. */ + .c0 = 0x1.26bb16p+1f, + .c1 = 0x1.5350d2p+1f, + .c2 = 0x1.04744ap+1f, + .c3 = 0x1.2d8176p+0f, + .c4 = 0x1.12b41ap-1f, + /* 1.5*2^17 + 127, a shift value suitable for FEXPA. 
*/ + .shift = 0x1.803f8p17f, + .log10_2 = 0x1.a934fp+1, + .log2_10_hi = 0x1.344136p-2, + .log2_10_lo = -0x1.ec10cp-27, + .thres = Thres, +}; + +static inline svfloat32_t +sv_exp10f_inline (svfloat32_t x, const svbool_t pg, const struct data *d) +{ + /* exp10(x) = 2^(n/N) * 10^r = 2^n * (1 + poly (r)), + with poly(r) in [1/sqrt(2), sqrt(2)] and + x = r + n * log10(2) / N, with r in [-log10(2)/2N, log10(2)/2N]. */ + + svfloat32_t lane_consts = svld1rq (svptrue_b32 (), &d->log2_10_lo); + + /* n = round(x/(log10(2)/N)). */ + svfloat32_t shift = sv_f32 (d->shift); + svfloat32_t z = svmad_x (pg, sv_f32 (d->log10_2), x, shift); + svfloat32_t n = svsub_x (svptrue_b32 (), z, shift); + + /* r = x - n*log10(2)/N. */ + svfloat32_t r = svmsb_x (pg, sv_f32 (d->log2_10_hi), n, x); + r = svmls_lane (r, n, lane_consts, 0); + + svfloat32_t scale = svexpa (svreinterpret_u32 (z)); + + /* Polynomial evaluation: poly(r) ~ exp10(r)-1. */ + svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, lane_consts, 2); + svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, lane_consts, 3); + svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r); + svfloat32_t p14 = svmla_x (pg, p12, p34, r2); + svfloat32_t p0 = svmul_lane (r, lane_consts, 1); + svfloat32_t poly = svmla_x (pg, p0, r2, p14); + + return svmla_x (pg, scale, scale, poly); +} + +static svfloat32_t NOINLINE +special_case (svfloat32_t x, svbool_t special, const struct data *d) +{ + return sv_call_f32 (exp10f, x, sv_exp10f_inline (x, svptrue_b32 (), d), + special); +} + +/* Single-precision SVE exp10f routine. Implements the same algorithm + as AdvSIMD exp10f. + Worst case error is 1.02 ULPs. + _ZGVsMxv_exp10f(-0x1.040488p-4) got 0x1.ba5f9ep-1 + want 0x1.ba5f9cp-1. 
*/ +svfloat32_t SV_NAME_F1 (exp10) (svfloat32_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + svbool_t special = svacgt (pg, x, d->thres); + if (unlikely (svptest_any (special, special))) + return special_case (x, special, d); + return sv_exp10f_inline (x, pg, d); +} + +#if WANT_EXP10_TESTS +TEST_SIG (SV, F, 1, exp10, -9.9, 9.9) +TEST_ULP (SV_NAME_F1 (exp10), 0.52) +TEST_DISABLE_FENV (SV_NAME_F1 (exp10)) +TEST_SYM_INTERVAL (SV_NAME_F1 (exp10), 0, Thres, 50000) +TEST_SYM_INTERVAL (SV_NAME_F1 (exp10), Thres, inf, 50000) +#endif +CLOSE_SVE_ATTR diff --git a/pl/math/sv_exp2_2u.c b/math/aarch64/sve/exp2.c similarity index 72% rename from pl/math/sv_exp2_2u.c rename to math/aarch64/sve/exp2.c index dcbca8adddd1de..adbe40c648ac9a 100644 --- a/pl/math/sv_exp2_2u.c +++ b/math/aarch64/sve/exp2.c @@ -1,14 +1,13 @@ /* * Double-precision SVE 2^x function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2025, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "sv_math.h" -#include "poly_sve_f64.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" #define N (1 << V_EXP_TABLE_BITS) @@ -17,15 +16,15 @@ static const struct data { - double poly[4]; + double c0, c2; + double c1, c3; double shift, big_bound, uoflow_bound; } data = { /* Coefficients are computed using Remez algorithm with minimisation of the absolute error. */ - .poly = { 0x1.62e42fefa3686p-1, 0x1.ebfbdff82c241p-3, 0x1.c6b09b16de99ap-5, - 0x1.3b2abf5571ad8p-7 }, - .shift = 0x1.8p52 / N, - .uoflow_bound = UOFlowBound, + .c0 = 0x1.62e42fefa3686p-1, .c1 = 0x1.ebfbdff82c241p-3, + .c2 = 0x1.c6b09b16de99ap-5, .c3 = 0x1.3b2abf5571ad8p-7, + .shift = 0x1.8p52 / N, .uoflow_bound = UOFlowBound, .big_bound = BigBound, }; @@ -57,9 +56,9 @@ special_case (svbool_t pg, svfloat64_t s, svfloat64_t y, svfloat64_t n, /* |n| > 1280 => 2^(n) overflows. 
*/ svbool_t p_cmp = svacgt (pg, n, d->uoflow_bound); - svfloat64_t r1 = svmul_x (pg, s1, s1); + svfloat64_t r1 = svmul_x (svptrue_b64 (), s1, s1); svfloat64_t r2 = svmla_x (pg, s2, s2, y); - svfloat64_t r0 = svmul_x (pg, r2, s1); + svfloat64_t r0 = svmul_x (svptrue_b64 (), r2, s1); return svsel (p_cmp, r1, r0); } @@ -89,19 +88,24 @@ svfloat64_t SV_NAME_D1 (exp2) (svfloat64_t x, svbool_t pg) svuint64_t top = svlsl_x (pg, ki, 52 - V_EXP_TABLE_BITS); svfloat64_t scale = svreinterpret_f64 (svadd_x (pg, sbits, top)); + svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1); /* Approximate exp2(r) using polynomial. */ - svfloat64_t r2 = svmul_x (pg, r, r); - svfloat64_t p = sv_pairwise_poly_3_f64_x (pg, r, r2, d->poly); - svfloat64_t y = svmul_x (pg, r, p); - + /* y = exp2(r) - 1 ~= C0 r + C1 r^2 + C2 r^3 + C3 r^4. */ + svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); + svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), r, c13, 0); + svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), r, c13, 1); + svfloat64_t p = svmla_x (pg, p01, p23, r2); + svfloat64_t y = svmul_x (svptrue_b64 (), r, p); /* Assemble exp2(x) = exp2(r) * scale. 
*/ if (unlikely (svptest_any (pg, special))) return special_case (pg, scale, y, kd, d); return svmla_x (pg, scale, scale, y); } -PL_SIG (SV, D, 1, exp2, -9.9, 9.9) -PL_TEST_ULP (SV_NAME_D1 (exp2), 1.15) -PL_TEST_SYM_INTERVAL (SV_NAME_D1 (exp2), 0, BigBound, 1000) -PL_TEST_SYM_INTERVAL (SV_NAME_D1 (exp2), BigBound, UOFlowBound, 100000) -PL_TEST_SYM_INTERVAL (SV_NAME_D1 (exp2), UOFlowBound, inf, 1000) +TEST_SIG (SV, D, 1, exp2, -9.9, 9.9) +TEST_ULP (SV_NAME_D1 (exp2), 1.15) +TEST_DISABLE_FENV (SV_NAME_D1 (exp2)) +TEST_SYM_INTERVAL (SV_NAME_D1 (exp2), 0, BigBound, 1000) +TEST_SYM_INTERVAL (SV_NAME_D1 (exp2), BigBound, UOFlowBound, 100000) +TEST_SYM_INTERVAL (SV_NAME_D1 (exp2), UOFlowBound, inf, 1000) +CLOSE_SVE_ATTR diff --git a/math/aarch64/sve/exp2f.c b/math/aarch64/sve/exp2f.c new file mode 100644 index 00000000000000..f4c1d0ae607e01 --- /dev/null +++ b/math/aarch64/sve/exp2f.c @@ -0,0 +1,83 @@ +/* + * Single-precision SVE 2^x function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "test_sig.h" +#include "test_defs.h" + +#define Thres 0x1.5d5e2ap+6f + +static const struct data +{ + float c0, c2, c4, c1, c3; + float shift, thres; +} data = { + /* Coefficients copied from the polynomial in AdvSIMD variant. */ + .c0 = 0x1.62e422p-1f, + .c1 = 0x1.ebf9bcp-3f, + .c2 = 0x1.c6bd32p-5f, + .c3 = 0x1.3ce9e4p-7f, + .c4 = 0x1.59977ap-10f, + /* 1.5*2^17 + 127. */ + .shift = 0x1.803f8p17f, + /* Roughly 87.3. For x < -Thres, the result is subnormal and not handled + correctly by FEXPA. */ + .thres = Thres, +}; + +static inline svfloat32_t +sv_exp2f_inline (svfloat32_t x, const svbool_t pg, const struct data *d) +{ + /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] + x = n + r, with r in [-1/2, 1/2]. 
*/ + svfloat32_t z = svadd_x (svptrue_b32 (), x, d->shift); + svfloat32_t n = svsub_x (svptrue_b32 (), z, d->shift); + svfloat32_t r = svsub_x (svptrue_b32 (), x, n); + + svfloat32_t scale = svexpa (svreinterpret_u32 (z)); + + /* Polynomial evaluation: poly(r) ~ exp2(r)-1. + Evaluate polynomial use hybrid scheme - offset ESTRIN by 1 for + coefficients 1 to 4, and apply most significant coefficient directly. */ + svfloat32_t even_coeffs = svld1rq (svptrue_b32 (), &d->c0); + svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r); + svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, even_coeffs, 1); + svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, even_coeffs, 2); + svfloat32_t p14 = svmla_x (pg, p12, r2, p34); + svfloat32_t p0 = svmul_lane (r, even_coeffs, 0); + svfloat32_t poly = svmla_x (pg, p0, r2, p14); + + return svmla_x (pg, scale, scale, poly); +} + +static svfloat32_t NOINLINE +special_case (svfloat32_t x, svbool_t special, const struct data *d) +{ + return sv_call_f32 (exp2f, x, sv_exp2f_inline (x, svptrue_b32 (), d), + special); +} + +/* Single-precision SVE exp2f routine. Implements the same algorithm + as AdvSIMD exp2f. + Worst case error is 1.04 ULPs. + _ZGVsMxv_exp2f(-0x1.af994ap-3) got 0x1.ba6a66p-1 + want 0x1.ba6a64p-1. 
*/ +svfloat32_t SV_NAME_F1 (exp2) (svfloat32_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + svbool_t special = svacgt (pg, x, d->thres); + if (unlikely (svptest_any (special, special))) + return special_case (x, special, d); + return sv_exp2f_inline (x, pg, d); +} + +TEST_SIG (SV, F, 1, exp2, -9.9, 9.9) +TEST_ULP (SV_NAME_F1 (exp2), 0.54) +TEST_DISABLE_FENV (SV_NAME_F1 (exp2)) +TEST_SYM_INTERVAL (SV_NAME_F1 (exp2), 0, Thres, 50000) +TEST_SYM_INTERVAL (SV_NAME_F1 (exp2), Thres, inf, 50000) +CLOSE_SVE_ATTR diff --git a/math/aarch64/sve/expf.c b/math/aarch64/sve/expf.c new file mode 100644 index 00000000000000..11528abdbbaf7d --- /dev/null +++ b/math/aarch64/sve/expf.c @@ -0,0 +1,50 @@ +/* + * Single-precision vector e^x function. + * + * Copyright (c) 2019-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "test_sig.h" +#include "test_defs.h" +#include "sv_expf_inline.h" + +/* Roughly 87.3. For x < -Thres, the result is subnormal and not handled + correctly by FEXPA. */ +#define Thres 0x1.5d5e2ap+6f + +static const struct data +{ + struct sv_expf_data d; + float thres; +} data = { + .d = SV_EXPF_DATA, + .thres = Thres, +}; + +static svfloat32_t NOINLINE +special_case (svfloat32_t x, svbool_t special, const struct sv_expf_data *d) +{ + return sv_call_f32 (expf, x, expf_inline (x, svptrue_b32 (), d), special); +} + +/* Optimised single-precision SVE exp function. + Worst-case error is 1.04 ulp: + SV_NAME_F1 (exp)(0x1.a8eda4p+1) got 0x1.ba74bcp+4 + want 0x1.ba74bap+4. 
*/ +svfloat32_t SV_NAME_F1 (exp) (svfloat32_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + svbool_t is_special_case = svacgt (pg, x, d->thres); + if (unlikely (svptest_any (pg, is_special_case))) + return special_case (x, is_special_case, &d->d); + return expf_inline (x, pg, &d->d); +} + +TEST_SIG (SV, F, 1, exp, -9.9, 9.9) +TEST_ULP (SV_NAME_F1 (exp), 0.55) +TEST_DISABLE_FENV (SV_NAME_F1 (exp)) +TEST_SYM_INTERVAL (SV_NAME_F1 (exp), 0, Thres, 50000) +TEST_SYM_INTERVAL (SV_NAME_F1 (exp), Thres, inf, 50000) +CLOSE_SVE_ATTR diff --git a/pl/math/sv_expm1_2u5.c b/math/aarch64/sve/expm1.c similarity index 86% rename from pl/math/sv_expm1_2u5.c rename to math/aarch64/sve/expm1.c index 82a31f6d9c0e92..f4fb8cb982f02b 100644 --- a/pl/math/sv_expm1_2u5.c +++ b/math/aarch64/sve/expm1.c @@ -1,14 +1,14 @@ /* * Double-precision vector exp(x) - 1 function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "sv_math.h" -#include "poly_sve_f64.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "sv_poly_f64.h" +#include "test_sig.h" +#include "test_defs.h" #define SpecialBound 0x1.62b7d369a5aa9p+9 #define ExponentBias 0x3ff0000000000000 @@ -88,8 +88,10 @@ svfloat64_t SV_NAME_D1 (expm1) (svfloat64_t x, svbool_t pg) return y; } -PL_SIG (SV, D, 1, expm1, -9.9, 9.9) -PL_TEST_ULP (SV_NAME_D1 (expm1), 1.68) -PL_TEST_SYM_INTERVAL (SV_NAME_D1 (expm1), 0, 0x1p-23, 1000) -PL_TEST_SYM_INTERVAL (SV_NAME_D1 (expm1), 0x1p-23, SpecialBound, 200000) -PL_TEST_SYM_INTERVAL (SV_NAME_D1 (expm1), SpecialBound, inf, 1000) +TEST_SIG (SV, D, 1, expm1, -9.9, 9.9) +TEST_ULP (SV_NAME_D1 (expm1), 1.68) +TEST_DISABLE_FENV (SV_NAME_D1 (expm1)) +TEST_SYM_INTERVAL (SV_NAME_D1 (expm1), 0, 0x1p-23, 1000) +TEST_SYM_INTERVAL (SV_NAME_D1 (expm1), 0x1p-23, SpecialBound, 200000) +TEST_SYM_INTERVAL (SV_NAME_D1 (expm1), SpecialBound, inf, 1000) +CLOSE_SVE_ATTR diff --git 
a/pl/math/sv_expm1f_1u6.c b/math/aarch64/sve/expm1f.c similarity index 67% rename from pl/math/sv_expm1f_1u6.c rename to math/aarch64/sve/expm1f.c index 0ec7c00f5300b2..95f7c09a403d03 100644 --- a/pl/math/sv_expm1f_1u6.c +++ b/math/aarch64/sve/expm1f.c @@ -1,13 +1,13 @@ /* * Single-precision vector exp(x) - 1 function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "sv_math.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" /* Largest value of x for which expm1(x) should round to -1. */ #define SpecialBound 0x1.5ebc4p+6f @@ -17,20 +17,17 @@ static const struct data /* These 4 are grouped together so they can be loaded as one quadword, then used with _lane forms of svmla/svmls. */ float c2, c4, ln2_hi, ln2_lo; - float c0, c1, c3, inv_ln2, special_bound, shift; + float c0, inv_ln2, c1, c3, special_bound; } data = { /* Generated using fpminimax. */ .c0 = 0x1.fffffep-2, .c1 = 0x1.5554aep-3, .c2 = 0x1.555736p-5, .c3 = 0x1.12287cp-7, - .c4 = 0x1.6b55a2p-10, + .c4 = 0x1.6b55a2p-10, .inv_ln2 = 0x1.715476p+0f, + .special_bound = SpecialBound, .ln2_lo = 0x1.7f7d1cp-20f, + .ln2_hi = 0x1.62e4p-1f, - .special_bound = SpecialBound, .shift = 0x1.8p23f, - .inv_ln2 = 0x1.715476p+0f, .ln2_hi = 0x1.62e4p-1f, - .ln2_lo = 0x1.7f7d1cp-20f, }; -#define C(i) sv_f32 (d->c##i) - static svfloat32_t NOINLINE special_case (svfloat32_t x, svbool_t pg) { @@ -60,9 +57,8 @@ svfloat32_t SV_NAME_F1 (expm1) (svfloat32_t x, svbool_t pg) and f = x - i * ln2, then f is in [-ln2/2, ln2/2]. exp(x) - 1 = 2^i * (expm1(f) + 1) - 1 where 2^i is exact because i is an integer. 
*/ - svfloat32_t j = svmla_x (pg, sv_f32 (d->shift), x, d->inv_ln2); - j = svsub_x (pg, j, d->shift); - svint32_t i = svcvt_s32_x (pg, j); + svfloat32_t j = svmul_x (svptrue_b32 (), x, d->inv_ln2); + j = svrinta_x (pg, j); svfloat32_t f = svmls_lane (x, j, lane_constants, 2); f = svmls_lane (f, j, lane_constants, 3); @@ -72,22 +68,24 @@ svfloat32_t SV_NAME_F1 (expm1) (svfloat32_t x, svbool_t pg) x + ax^2 + bx^3 + cx^4 .... So we calculate the polynomial P(f) = a + bf + cf^2 + ... and assemble the approximation expm1(f) ~= f + f^2 * P(f). */ - svfloat32_t p12 = svmla_lane (C (1), f, lane_constants, 0); - svfloat32_t p34 = svmla_lane (C (3), f, lane_constants, 1); - svfloat32_t f2 = svmul_x (pg, f, f); + svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), f, lane_constants, 0); + svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), f, lane_constants, 1); + svfloat32_t f2 = svmul_x (svptrue_b32 (), f, f); svfloat32_t p = svmla_x (pg, p12, f2, p34); - p = svmla_x (pg, C (0), f, p); + + p = svmla_x (pg, sv_f32 (d->c0), f, p); p = svmla_x (pg, f, f2, p); /* Assemble the result. expm1(x) ~= 2^i * (p + 1) - 1 Let t = 2^i. 
*/ - svfloat32_t t = svreinterpret_f32 ( - svadd_x (pg, svreinterpret_u32 (svlsl_x (pg, i, 23)), 0x3f800000)); - return svmla_x (pg, svsub_x (pg, t, 1), p, t); + svfloat32_t t = svscale_x (pg, sv_f32 (1.0f), svcvt_s32_x (pg, j)); + return svmla_x (pg, svsub_x (pg, t, 1.0f), p, t); } -PL_SIG (SV, F, 1, expm1, -9.9, 9.9) -PL_TEST_ULP (SV_NAME_F1 (expm1), 1.02) -PL_TEST_SYM_INTERVAL (SV_NAME_F1 (expm1), 0, SpecialBound, 100000) -PL_TEST_SYM_INTERVAL (SV_NAME_F1 (expm1), SpecialBound, inf, 1000) +TEST_SIG (SV, F, 1, expm1, -9.9, 9.9) +TEST_ULP (SV_NAME_F1 (expm1), 1.02) +TEST_DISABLE_FENV (SV_NAME_F1 (expm1)) +TEST_SYM_INTERVAL (SV_NAME_F1 (expm1), 0, SpecialBound, 100000) +TEST_SYM_INTERVAL (SV_NAME_F1 (expm1), SpecialBound, inf, 1000) +CLOSE_SVE_ATTR diff --git a/pl/math/sv_hypot_1u5.c b/math/aarch64/sve/hypot.c similarity index 72% rename from pl/math/sv_hypot_1u5.c rename to math/aarch64/sve/hypot.c index cf1590e4b9ab19..2ed298623accfa 100644 --- a/pl/math/sv_hypot_1u5.c +++ b/math/aarch64/sve/hypot.c @@ -1,13 +1,13 @@ /* * Double-precision SVE hypot(x) function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "sv_math.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" static const struct data { @@ -43,9 +43,11 @@ svfloat64_t SV_NAME_D2 (hypot) (svfloat64_t x, svfloat64_t y, svbool_t pg) return svsqrt_x (pg, sqsum); } -PL_SIG (SV, D, 2, hypot, -10.0, 10.0) -PL_TEST_ULP (SV_NAME_D2 (hypot), 0.71) -PL_TEST_INTERVAL2 (SV_NAME_D2 (hypot), 0, inf, 0, inf, 10000) -PL_TEST_INTERVAL2 (SV_NAME_D2 (hypot), 0, inf, -0, -inf, 10000) -PL_TEST_INTERVAL2 (SV_NAME_D2 (hypot), -0, -inf, 0, inf, 10000) -PL_TEST_INTERVAL2 (SV_NAME_D2 (hypot), -0, -inf, -0, -inf, 10000) +TEST_SIG (SV, D, 2, hypot, -10.0, 10.0) +TEST_ULP (SV_NAME_D2 (hypot), 0.71) +TEST_DISABLE_FENV (SV_NAME_D2 (hypot)) +TEST_INTERVAL2 (SV_NAME_D2 (hypot), 0, inf, 0, inf, 10000) +TEST_INTERVAL2 (SV_NAME_D2 (hypot), 0, inf, -0, -inf, 10000) +TEST_INTERVAL2 (SV_NAME_D2 (hypot), -0, -inf, 0, inf, 10000) +TEST_INTERVAL2 (SV_NAME_D2 (hypot), -0, -inf, -0, -inf, 10000) +CLOSE_SVE_ATTR diff --git a/pl/math/sv_hypotf_1u5.c b/math/aarch64/sve/hypotf.c similarity index 69% rename from pl/math/sv_hypotf_1u5.c rename to math/aarch64/sve/hypotf.c index f428832b3dbcd7..b977b998986b87 100644 --- a/pl/math/sv_hypotf_1u5.c +++ b/math/aarch64/sve/hypotf.c @@ -1,13 +1,13 @@ /* * Single-precision SVE hypot(x) function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "sv_math.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" #define TinyBound 0x0c800000 /* asuint (0x1p-102). */ #define Thres 0x73000000 /* 0x70000000 - TinyBound. 
*/ @@ -37,9 +37,11 @@ svfloat32_t SV_NAME_F2 (hypot) (svfloat32_t x, svfloat32_t y, return svsqrt_x (pg, sqsum); } -PL_SIG (SV, F, 2, hypot, -10.0, 10.0) -PL_TEST_ULP (SV_NAME_F2 (hypot), 0.71) -PL_TEST_INTERVAL2 (SV_NAME_F2 (hypot), 0, inf, 0, inf, 10000) -PL_TEST_INTERVAL2 (SV_NAME_F2 (hypot), 0, inf, -0, -inf, 10000) -PL_TEST_INTERVAL2 (SV_NAME_F2 (hypot), -0, -inf, 0, inf, 10000) -PL_TEST_INTERVAL2 (SV_NAME_F2 (hypot), -0, -inf, -0, -inf, 10000) +TEST_SIG (SV, F, 2, hypot, -10.0, 10.0) +TEST_ULP (SV_NAME_F2 (hypot), 0.71) +TEST_DISABLE_FENV (SV_NAME_F2 (hypot)) +TEST_INTERVAL2 (SV_NAME_F2 (hypot), 0, inf, 0, inf, 10000) +TEST_INTERVAL2 (SV_NAME_F2 (hypot), 0, inf, -0, -inf, 10000) +TEST_INTERVAL2 (SV_NAME_F2 (hypot), -0, -inf, 0, inf, 10000) +TEST_INTERVAL2 (SV_NAME_F2 (hypot), -0, -inf, -0, -inf, 10000) +CLOSE_SVE_ATTR diff --git a/math/aarch64/sve/log.c b/math/aarch64/sve/log.c new file mode 100644 index 00000000000000..c612df48c1fdb5 --- /dev/null +++ b/math/aarch64/sve/log.c @@ -0,0 +1,97 @@ +/* + * Double-precision SVE log(x) function. + * + * Copyright (c) 2020-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "test_sig.h" +#include "test_defs.h" + +#define N (1 << V_LOG_TABLE_BITS) +#define Max (0x7ff0000000000000) +#define Min (0x0010000000000000) +#define Thresh (0x7fe0000000000000) /* Max - Min. 
*/ + +static const struct data +{ + double c0, c2; + double c1, c3; + double ln2, c4; + uint64_t off; +} data = { + .c0 = -0x1.ffffffffffff7p-2, + .c1 = 0x1.55555555170d4p-2, + .c2 = -0x1.0000000399c27p-2, + .c3 = 0x1.999b2e90e94cap-3, + .c4 = -0x1.554e550bd501ep-3, + .ln2 = 0x1.62e42fefa39efp-1, + .off = 0x3fe6900900000000, +}; + +static svfloat64_t NOINLINE +special_case (svfloat64_t hi, svuint64_t tmp, svfloat64_t y, svfloat64_t r2, + svbool_t special, const struct data *d) +{ + svfloat64_t x = svreinterpret_f64 (svadd_x (svptrue_b64 (), tmp, d->off)); + return sv_call_f64 (log, x, svmla_x (svptrue_b64 (), hi, r2, y), special); +} + +/* Double-precision SVE log routine. + Maximum measured error is 2.64 ulp: + SV_NAME_D1 (log)(0x1.95e54bc91a5e2p+184) got 0x1.fffffffe88cacp+6 + want 0x1.fffffffe88cafp+6. */ +svfloat64_t SV_NAME_D1 (log) (svfloat64_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + svuint64_t ix = svreinterpret_u64 (x); + svbool_t special = svcmpge (pg, svsub_x (pg, ix, Min), Thresh); + + /* x = 2^k z; where z is in range [Off,2*Off) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + svuint64_t tmp = svsub_x (pg, ix, d->off); + /* Calculate table index = (tmp >> (52 - V_LOG_TABLE_BITS)) % N. + The actual value of i is double this due to table layout. */ + svuint64_t i + = svand_x (pg, svlsr_x (pg, tmp, (51 - V_LOG_TABLE_BITS)), (N - 1) << 1); + svuint64_t iz = svsub_x (pg, ix, svand_x (pg, tmp, 0xfffULL << 52)); + svfloat64_t z = svreinterpret_f64 (iz); + /* Lookup in 2 global lists (length N). */ + svfloat64_t invc = svld1_gather_index (pg, &__v_log_data.table[0].invc, i); + svfloat64_t logc = svld1_gather_index (pg, &__v_log_data.table[0].logc, i); + + /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */ + svfloat64_t kd = svcvt_f64_x (pg, svasr_x (pg, svreinterpret_s64 (tmp), 52)); + /* hi = r + log(c) + k*Ln2. 
*/ + svfloat64_t ln2_and_c4 = svld1rq_f64 (svptrue_b64 (), &d->ln2); + svfloat64_t r = svmad_x (pg, invc, z, -1); + svfloat64_t hi = svmla_lane_f64 (logc, kd, ln2_and_c4, 0); + hi = svadd_x (pg, r, hi); + + /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */ + svfloat64_t odd_coeffs = svld1rq_f64 (svptrue_b64 (), &d->c1); + svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); + svfloat64_t y = svmla_lane_f64 (sv_f64 (d->c2), r, odd_coeffs, 1); + svfloat64_t p = svmla_lane_f64 (sv_f64 (d->c0), r, odd_coeffs, 0); + y = svmla_lane_f64 (y, r2, ln2_and_c4, 1); + y = svmla_x (pg, p, r2, y); + + if (unlikely (svptest_any (pg, special))) + return special_case (hi, tmp, y, r2, special, d); + return svmla_x (pg, hi, r2, y); +} + +TEST_SIG (SV, D, 1, log, 0.01, 11.1) +TEST_ULP (SV_NAME_D1 (log), 2.15) +TEST_DISABLE_FENV (SV_NAME_D1 (log)) +TEST_INTERVAL (SV_NAME_D1 (log), -0.0, -inf, 1000) +TEST_INTERVAL (SV_NAME_D1 (log), 0, 0x1p-149, 1000) +TEST_INTERVAL (SV_NAME_D1 (log), 0x1p-149, 0x1p-126, 4000) +TEST_INTERVAL (SV_NAME_D1 (log), 0x1p-126, 0x1p-23, 50000) +TEST_INTERVAL (SV_NAME_D1 (log), 0x1p-23, 1.0, 50000) +TEST_INTERVAL (SV_NAME_D1 (log), 1.0, 100, 50000) +TEST_INTERVAL (SV_NAME_D1 (log), 100, inf, 50000) +CLOSE_SVE_ATTR diff --git a/math/aarch64/sve/log10.c b/math/aarch64/sve/log10.c new file mode 100644 index 00000000000000..5af142d79f55fb --- /dev/null +++ b/math/aarch64/sve/log10.c @@ -0,0 +1,101 @@ +/* + * Double-precision SVE log10(x) function. + * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "test_sig.h" +#include "test_defs.h" + +#define Min 0x0010000000000000 +#define Max 0x7ff0000000000000 +#define Thres 0x7fe0000000000000 /* Max - Min. 
*/ +#define N (1 << V_LOG10_TABLE_BITS) + +static const struct data +{ + double c0, c2; + double c1, c3; + double invln10, log10_2; + double c4; + uint64_t off; +} data = { + .c0 = -0x1.bcb7b1526e506p-3, + .c1 = 0x1.287a7636be1d1p-3, + .c2 = -0x1.bcb7b158af938p-4, + .c3 = 0x1.63c78734e6d07p-4, + .c4 = -0x1.287461742fee4p-4, + .invln10 = 0x1.bcb7b1526e50ep-2, + .log10_2 = 0x1.34413509f79ffp-2, + .off = 0x3fe6900900000000, +}; + +static svfloat64_t NOINLINE +special_case (svfloat64_t hi, svuint64_t tmp, svfloat64_t y, svfloat64_t r2, + svbool_t special, const struct data *d) +{ + svfloat64_t x = svreinterpret_f64 (svadd_x (svptrue_b64 (), tmp, d->off)); + return sv_call_f64 (log10, x, svmla_x (svptrue_b64 (), hi, r2, y), special); +} + +/* Double-precision SVE log10 routine. + Maximum measured error is 2.46 ulps. + SV_NAME_D1 (log10)(0x1.131956cd4b627p+0) got 0x1.fffbdf6eaa669p-6 + want 0x1.fffbdf6eaa667p-6. */ +svfloat64_t SV_NAME_D1 (log10) (svfloat64_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + svuint64_t ix = svreinterpret_u64 (x); + svbool_t special = svcmpge (pg, svsub_x (pg, ix, Min), Thres); + + /* x = 2^k z; where z is in range [Off,2*Off) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + svuint64_t tmp = svsub_x (pg, ix, d->off); + svuint64_t i = svlsr_x (pg, tmp, 51 - V_LOG10_TABLE_BITS); + i = svand_x (pg, i, (N - 1) << 1); + svfloat64_t k = svcvt_f64_x (pg, svasr_x (pg, svreinterpret_s64 (tmp), 52)); + svfloat64_t z = svreinterpret_f64 ( + svsub_x (pg, ix, svand_x (pg, tmp, 0xfffULL << 52))); + + /* log(x) = k*log(2) + log(c) + log(z/c). */ + svfloat64_t invc = svld1_gather_index (pg, &__v_log10_data.table[0].invc, i); + svfloat64_t logc + = svld1_gather_index (pg, &__v_log10_data.table[0].log10c, i); + + /* We approximate log(z/c) with a polynomial P(x) ~= log(x + 1): + r = z/c - 1 (we look up precomputed 1/c) + log(z/c) ~= P(r). 
*/ + svfloat64_t r = svmad_x (pg, invc, z, -1.0); + + /* hi = log(c) + k*log(2). */ + svfloat64_t invln10_log10_2 = svld1rq_f64 (svptrue_b64 (), &d->invln10); + svfloat64_t w = svmla_lane_f64 (logc, r, invln10_log10_2, 0); + svfloat64_t hi = svmla_lane_f64 (w, k, invln10_log10_2, 1); + + /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */ + svfloat64_t odd_coeffs = svld1rq_f64 (svptrue_b64 (), &d->c1); + svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); + svfloat64_t y = svmla_lane_f64 (sv_f64 (d->c2), r, odd_coeffs, 1); + svfloat64_t p = svmla_lane_f64 (sv_f64 (d->c0), r, odd_coeffs, 0); + y = svmla_x (pg, y, r2, d->c4); + y = svmla_x (pg, p, r2, y); + + if (unlikely (svptest_any (pg, special))) + return special_case (hi, tmp, y, r2, special, d); + return svmla_x (pg, hi, r2, y); +} + +TEST_SIG (SV, D, 1, log10, 0.01, 11.1) +TEST_ULP (SV_NAME_D1 (log10), 1.97) +TEST_DISABLE_FENV (SV_NAME_D1 (log10)) +TEST_INTERVAL (SV_NAME_D1 (log10), -0.0, -0x1p126, 100) +TEST_INTERVAL (SV_NAME_D1 (log10), 0x1p-149, 0x1p-126, 4000) +TEST_INTERVAL (SV_NAME_D1 (log10), 0x1p-126, 0x1p-23, 50000) +TEST_INTERVAL (SV_NAME_D1 (log10), 0x1p-23, 1.0, 50000) +TEST_INTERVAL (SV_NAME_D1 (log10), 1.0, 100, 50000) +TEST_INTERVAL (SV_NAME_D1 (log10), 100, inf, 50000) +CLOSE_SVE_ATTR diff --git a/pl/math/sv_log10f_3u5.c b/math/aarch64/sve/log10f.c similarity index 56% rename from pl/math/sv_log10f_3u5.c rename to math/aarch64/sve/log10f.c index a685b23e5de539..6c3add45176193 100644 --- a/pl/math/sv_log10f_3u5.c +++ b/math/aarch64/sve/log10f.c @@ -1,19 +1,20 @@ /* * Single-precision SVE log10 function. * - * Copyright (c) 2022-2023, Arm Limited. + * Copyright (c) 2022-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "sv_math.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" static const struct data { float poly_0246[4]; float poly_1357[4]; float ln2, inv_ln10; + uint32_t off, lower; } data = { .poly_1357 = { /* Coefficients copied from the AdvSIMD routine, then rearranged so that coeffs @@ -25,18 +26,23 @@ static const struct data -0x1.0fc92cp-4f }, .ln2 = 0x1.62e43p-1f, .inv_ln10 = 0x1.bcb7b2p-2f, + .off = 0x3f2aaaab, + /* Lower bound is the smallest positive normal float 0x00800000. For + optimised register use subnormals are detected after offset has been + subtracted, so lower bound is 0x0080000 - offset (which wraps around). */ + .lower = 0x00800000 - 0x3f2aaaab }; -#define Min 0x00800000 -#define Max 0x7f800000 -#define Thres 0x7f000000 /* Max - Min. */ -#define Offset 0x3f2aaaab /* 0.666667. */ +#define Thres 0x7f000000 /* asuint32(inf) - 0x00800000. */ #define MantissaMask 0x007fffff static svfloat32_t NOINLINE -special_case (svfloat32_t x, svfloat32_t y, svbool_t special) +special_case (svuint32_t u_off, svfloat32_t p, svfloat32_t r2, svfloat32_t y, + svbool_t cmp) { - return sv_call_f32 (log10f, x, y, special); + return sv_call_f32 ( + log10f, svreinterpret_f32 (svadd_x (svptrue_b32 (), u_off, data.off)), + svmla_x (svptrue_b32 (), p, r2, y), cmp); } /* Optimised implementation of SVE log10f using the same algorithm and @@ -47,23 +53,25 @@ special_case (svfloat32_t x, svfloat32_t y, svbool_t special) svfloat32_t SV_NAME_F1 (log10) (svfloat32_t x, const svbool_t pg) { const struct data *d = ptr_barrier (&data); - svuint32_t ix = svreinterpret_u32 (x); - svbool_t special = svcmpge (pg, svsub_x (pg, ix, Min), Thres); + + svuint32_t u_off = svreinterpret_u32 (x); + + u_off = svsub_x (pg, u_off, d->off); + svbool_t special = svcmpge (pg, svsub_x (pg, u_off, d->lower), Thres); /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. 
*/ - ix = svsub_x (pg, ix, Offset); svfloat32_t n = svcvt_f32_x ( - pg, svasr_x (pg, svreinterpret_s32 (ix), 23)); /* signextend. */ - ix = svand_x (pg, ix, MantissaMask); - ix = svadd_x (pg, ix, Offset); + pg, svasr_x (pg, svreinterpret_s32 (u_off), 23)); /* signextend. */ + svuint32_t ix = svand_x (pg, u_off, MantissaMask); + ix = svadd_x (pg, ix, d->off); svfloat32_t r = svsub_x (pg, svreinterpret_f32 (ix), 1.0f); /* y = log10(1+r) + n*log10(2) log10(1+r) ~ r * InvLn(10) + P(r) where P(r) is a polynomial. Use order 9 for log10(1+x), i.e. order 8 for log10(1+x)/x, with x in [-1/3, 1/3] (offset=2/3). */ - svfloat32_t r2 = svmul_x (pg, r, r); - svfloat32_t r4 = svmul_x (pg, r2, r2); + svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r); + svfloat32_t r4 = svmul_x (svptrue_b32 (), r2, r2); svfloat32_t p_1357 = svld1rq (svptrue_b32 (), &d->poly_1357[0]); svfloat32_t q_01 = svmla_lane (sv_f32 (d->poly_0246[0]), r, p_1357, 0); svfloat32_t q_23 = svmla_lane (sv_f32 (d->poly_0246[1]), r, p_1357, 1); @@ -78,16 +86,17 @@ svfloat32_t SV_NAME_F1 (log10) (svfloat32_t x, const svbool_t pg) hi = svmul_x (pg, hi, d->inv_ln10); if (unlikely (svptest_any (pg, special))) - return special_case (x, svmla_x (svnot_z (pg, special), hi, r2, y), - special); - return svmla_x (pg, hi, r2, y); + return special_case (u_off, hi, r2, y, special); + return svmla_x (svptrue_b32 (), hi, r2, y); } -PL_SIG (SV, F, 1, log10, 0.01, 11.1) -PL_TEST_ULP (SV_NAME_F1 (log10), 2.82) -PL_TEST_INTERVAL (SV_NAME_F1 (log10), -0.0, -0x1p126, 100) -PL_TEST_INTERVAL (SV_NAME_F1 (log10), 0x1p-149, 0x1p-126, 4000) -PL_TEST_INTERVAL (SV_NAME_F1 (log10), 0x1p-126, 0x1p-23, 50000) -PL_TEST_INTERVAL (SV_NAME_F1 (log10), 0x1p-23, 1.0, 50000) -PL_TEST_INTERVAL (SV_NAME_F1 (log10), 1.0, 100, 50000) -PL_TEST_INTERVAL (SV_NAME_F1 (log10), 100, inf, 50000) +TEST_SIG (SV, F, 1, log10, 0.01, 11.1) +TEST_ULP (SV_NAME_F1 (log10), 2.82) +TEST_DISABLE_FENV (SV_NAME_F1 (log10)) +TEST_INTERVAL (SV_NAME_F1 (log10), -0.0, -0x1p126, 100) 
+TEST_INTERVAL (SV_NAME_F1 (log10), 0x1p-149, 0x1p-126, 4000) +TEST_INTERVAL (SV_NAME_F1 (log10), 0x1p-126, 0x1p-23, 50000) +TEST_INTERVAL (SV_NAME_F1 (log10), 0x1p-23, 1.0, 50000) +TEST_INTERVAL (SV_NAME_F1 (log10), 1.0, 100, 50000) +TEST_INTERVAL (SV_NAME_F1 (log10), 100, inf, 50000) +CLOSE_SVE_ATTR diff --git a/pl/math/sv_log1p_2u5.c b/math/aarch64/sve/log1p.c similarity index 88% rename from pl/math/sv_log1p_2u5.c rename to math/aarch64/sve/log1p.c index f178ab16238ab0..e6b895b5290820 100644 --- a/pl/math/sv_log1p_2u5.c +++ b/math/aarch64/sve/log1p.c @@ -1,14 +1,14 @@ /* * Double-precision SVE log(1+x) function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "sv_math.h" -#include "poly_sve_f64.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "sv_poly_f64.h" +#include "test_sig.h" +#include "test_defs.h" static const struct data { @@ -107,10 +107,12 @@ svfloat64_t SV_NAME_D1 (log1p) (svfloat64_t x, svbool_t pg) return y; } -PL_SIG (SV, D, 1, log1p, -0.9, 10.0) -PL_TEST_ULP (SV_NAME_D1 (log1p), 1.97) -PL_TEST_SYM_INTERVAL (SV_NAME_D1 (log1p), 0.0, 0x1p-23, 50000) -PL_TEST_SYM_INTERVAL (SV_NAME_D1 (log1p), 0x1p-23, 0.001, 50000) -PL_TEST_SYM_INTERVAL (SV_NAME_D1 (log1p), 0.001, 1.0, 50000) -PL_TEST_INTERVAL (SV_NAME_D1 (log1p), 1, inf, 10000) -PL_TEST_INTERVAL (SV_NAME_D1 (log1p), -1, -inf, 10) +TEST_SIG (SV, D, 1, log1p, -0.9, 10.0) +TEST_ULP (SV_NAME_D1 (log1p), 1.97) +TEST_DISABLE_FENV (SV_NAME_D1 (log1p)) +TEST_SYM_INTERVAL (SV_NAME_D1 (log1p), 0.0, 0x1p-23, 50000) +TEST_SYM_INTERVAL (SV_NAME_D1 (log1p), 0x1p-23, 0.001, 50000) +TEST_SYM_INTERVAL (SV_NAME_D1 (log1p), 0.001, 1.0, 50000) +TEST_INTERVAL (SV_NAME_D1 (log1p), 1, inf, 10000) +TEST_INTERVAL (SV_NAME_D1 (log1p), -1, -inf, 10) +CLOSE_SVE_ATTR diff --git a/math/aarch64/sve/log1pf.c b/math/aarch64/sve/log1pf.c new file mode 100644 index 00000000000000..77ae6218f93198 --- /dev/null +++ 
b/math/aarch64/sve/log1pf.c @@ -0,0 +1,43 @@ +/* + * Single-precision vector log(x + 1) function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "test_sig.h" +#include "test_defs.h" +#include "sv_log1pf_inline.h" + +static svfloat32_t NOINLINE +special_case (svfloat32_t x, svbool_t special) +{ + return sv_call_f32 (log1pf, x, sv_log1pf_inline (x, svptrue_b32 ()), + special); +} + +/* Vector log1pf approximation using polynomial on reduced interval. Worst-case + error is 1.27 ULP very close to 0.5. + _ZGVsMxv_log1pf(0x1.fffffep-2) got 0x1.9f324p-2 + want 0x1.9f323ep-2. */ +svfloat32_t SV_NAME_F1 (log1p) (svfloat32_t x, svbool_t pg) +{ + /* x < -1, Inf/Nan. */ + svbool_t special = svcmpeq (pg, svreinterpret_u32 (x), 0x7f800000); + special = svorn_z (pg, special, svcmpge (pg, x, -1)); + + if (unlikely (svptest_any (pg, special))) + return special_case (x, special); + + return sv_log1pf_inline (x, pg); +} + +TEST_SIG (SV, F, 1, log1p, -0.9, 10.0) +TEST_ULP (SV_NAME_F1 (log1p), 0.77) +TEST_DISABLE_FENV (SV_NAME_F1 (log1p)) +TEST_SYM_INTERVAL (SV_NAME_F1 (log1p), 0, 0x1p-23, 5000) +TEST_SYM_INTERVAL (SV_NAME_F1 (log1p), 0x1p-23, 1, 5000) +TEST_INTERVAL (SV_NAME_F1 (log1p), 1, inf, 10000) +TEST_INTERVAL (SV_NAME_F1 (log1p), -1, -inf, 10) +CLOSE_SVE_ATTR diff --git a/math/aarch64/sve/log2.c b/math/aarch64/sve/log2.c new file mode 100644 index 00000000000000..11c65c1b296309 --- /dev/null +++ b/math/aarch64/sve/log2.c @@ -0,0 +1,96 @@ +/* + * Double-precision SVE log2 function. + * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "test_sig.h" +#include "test_defs.h" + +#define N (1 << V_LOG2_TABLE_BITS) +#define Max (0x7ff0000000000000) +#define Min (0x0010000000000000) +#define Thresh (0x7fe0000000000000) /* Max - Min. 
*/ + +static const struct data +{ + double c0, c2; + double c1, c3; + double invln2, c4; + uint64_t off; +} data = { + .c0 = -0x1.71547652b83p-1, + .c1 = 0x1.ec709dc340953p-2, + .c2 = -0x1.71547651c8f35p-2, + .c3 = 0x1.2777ebe12dda5p-2, + .c4 = -0x1.ec738d616fe26p-3, + .invln2 = 0x1.71547652b82fep0, + .off = 0x3fe6900900000000, +}; + +static svfloat64_t NOINLINE +special_case (svfloat64_t w, svuint64_t tmp, svfloat64_t y, svfloat64_t r2, + svbool_t special, const struct data *d) +{ + svfloat64_t x = svreinterpret_f64 (svadd_x (svptrue_b64 (), tmp, d->off)); + return sv_call_f64 (log2, x, svmla_x (svptrue_b64 (), w, r2, y), special); +} + +/* Double-precision SVE log2 routine. + Implements the same algorithm as AdvSIMD log10, with coefficients and table + entries scaled in extended precision. + The maximum observed error is 2.58 ULP: + SV_NAME_D1 (log2)(0x1.0b556b093869bp+0) got 0x1.fffb34198d9dap-5 + want 0x1.fffb34198d9ddp-5. */ +svfloat64_t SV_NAME_D1 (log2) (svfloat64_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + svuint64_t ix = svreinterpret_u64 (x); + svbool_t special = svcmpge (pg, svsub_x (pg, ix, Min), Thresh); + + /* x = 2^k z; where z is in range [Off,2*Off) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + svuint64_t tmp = svsub_x (pg, ix, d->off); + svuint64_t i = svlsr_x (pg, tmp, 51 - V_LOG2_TABLE_BITS); + i = svand_x (pg, i, (N - 1) << 1); + svfloat64_t k = svcvt_f64_x (pg, svasr_x (pg, svreinterpret_s64 (tmp), 52)); + svfloat64_t z = svreinterpret_f64 ( + svsub_x (pg, ix, svand_x (pg, tmp, 0xfffULL << 52))); + + svfloat64_t invc = svld1_gather_index (pg, &__v_log2_data.table[0].invc, i); + svfloat64_t log2c + = svld1_gather_index (pg, &__v_log2_data.table[0].log2c, i); + + /* log2(x) = log1p(z/c-1)/log(2) + log2(c) + k. 
*/ + + svfloat64_t invln2_and_c4 = svld1rq_f64 (svptrue_b64 (), &d->invln2); + svfloat64_t r = svmad_x (pg, invc, z, -1.0); + svfloat64_t w = svmla_lane_f64 (log2c, r, invln2_and_c4, 0); + w = svadd_x (pg, k, w); + + svfloat64_t odd_coeffs = svld1rq_f64 (svptrue_b64 (), &d->c1); + svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); + svfloat64_t y = svmla_lane_f64 (sv_f64 (d->c2), r, odd_coeffs, 1); + svfloat64_t p = svmla_lane_f64 (sv_f64 (d->c0), r, odd_coeffs, 0); + y = svmla_lane_f64 (y, r2, invln2_and_c4, 1); + y = svmla_x (pg, p, r2, y); + + if (unlikely (svptest_any (pg, special))) + return special_case (w, tmp, y, r2, special, d); + return svmla_x (pg, w, r2, y); +} + +TEST_SIG (SV, D, 1, log2, 0.01, 11.1) +TEST_ULP (SV_NAME_D1 (log2), 2.09) +TEST_DISABLE_FENV (SV_NAME_D1 (log2)) +TEST_INTERVAL (SV_NAME_D1 (log2), -0.0, -0x1p126, 1000) +TEST_INTERVAL (SV_NAME_D1 (log2), 0.0, 0x1p-126, 4000) +TEST_INTERVAL (SV_NAME_D1 (log2), 0x1p-126, 0x1p-23, 50000) +TEST_INTERVAL (SV_NAME_D1 (log2), 0x1p-23, 1.0, 50000) +TEST_INTERVAL (SV_NAME_D1 (log2), 1.0, 100, 50000) +TEST_INTERVAL (SV_NAME_D1 (log2), 100, inf, 50000) +CLOSE_SVE_ATTR diff --git a/pl/math/sv_log2f_2u5.c b/math/aarch64/sve/log2f.c similarity index 53% rename from pl/math/sv_log2f_2u5.c rename to math/aarch64/sve/log2f.c index 9e96c62bbcc6c9..312fd448226bf3 100644 --- a/pl/math/sv_log2f_2u5.c +++ b/math/aarch64/sve/log2f.c @@ -1,18 +1,19 @@ /* * Single-precision vector/SVE log2 function. * - * Copyright (c) 2022-2023, Arm Limited. + * Copyright (c) 2022-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "sv_math.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" static const struct data { float poly_02468[5]; float poly_1357[4]; + uint32_t off, lower; } data = { .poly_1357 = { /* Coefficients copied from the AdvSIMD routine, then rearranged so that coeffs @@ -22,18 +23,23 @@ static const struct data }, .poly_02468 = { 0x1.715476p0f, 0x1.ec701cp-2f, 0x1.27a0b8p-2f, 0x1.9d8ecap-3f, 0x1.9e495p-3f }, + .off = 0x3f2aaaab, + /* Lower bound is the smallest positive normal float 0x00800000. For + optimised register use subnormals are detected after offset has been + subtracted, so lower bound is 0x00800000 - offset (which wraps around). */ + .lower = 0x00800000 - 0x3f2aaaab }; -#define Min (0x00800000) -#define Max (0x7f800000) -#define Thres (0x7f000000) /* Max - Min. */ +#define Thresh (0x7f000000) /* asuint32(inf) - 0x00800000. */ #define MantissaMask (0x007fffff) -#define Off (0x3f2aaaab) /* 0.666667. */ static svfloat32_t NOINLINE -special_case (svfloat32_t x, svfloat32_t y, svbool_t cmp) +special_case (svuint32_t u_off, svfloat32_t p, svfloat32_t r2, svfloat32_t y, + svbool_t cmp) { - return sv_call_f32 (log2f, x, y, cmp); + return sv_call_f32 ( + log2f, svreinterpret_f32 (svadd_x (svptrue_b32 (), u_off, data.off)), + svmla_x (svptrue_b32 (), p, r2, y), cmp); } /* Optimised implementation of SVE log2f, using the same algorithm @@ -45,19 +51,20 @@ svfloat32_t SV_NAME_F1 (log2) (svfloat32_t x, const svbool_t pg) { const struct data *d = ptr_barrier (&data); - svuint32_t u = svreinterpret_u32 (x); - svbool_t special = svcmpge (pg, svsub_x (pg, u, Min), Thres); + svuint32_t u_off = svreinterpret_u32 (x); + + u_off = svsub_x (pg, u_off, d->off); + svbool_t special = svcmpge (pg, svsub_x (pg, u_off, d->lower), Thresh); /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. 
*/ - u = svsub_x (pg, u, Off); svfloat32_t n = svcvt_f32_x ( - pg, svasr_x (pg, svreinterpret_s32 (u), 23)); /* Sign-extend. */ - u = svand_x (pg, u, MantissaMask); - u = svadd_x (pg, u, Off); + pg, svasr_x (pg, svreinterpret_s32 (u_off), 23)); /* Sign-extend. */ + svuint32_t u = svand_x (pg, u_off, MantissaMask); + u = svadd_x (pg, u, d->off); svfloat32_t r = svsub_x (pg, svreinterpret_f32 (u), 1.0f); /* y = log2(1+r) + n. */ - svfloat32_t r2 = svmul_x (pg, r, r); + svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r); /* Evaluate polynomial using pairwise Horner scheme. */ svfloat32_t p_1357 = svld1rq (svptrue_b32 (), &d->poly_1357[0]); @@ -71,16 +78,17 @@ svfloat32_t SV_NAME_F1 (log2) (svfloat32_t x, const svbool_t pg) y = svmla_x (pg, q_01, r2, y); if (unlikely (svptest_any (pg, special))) - return special_case (x, svmla_x (svnot_z (pg, special), n, r, y), special); - return svmla_x (pg, n, r, y); + return special_case (u_off, n, r, y, special); + return svmla_x (svptrue_b32 (), n, r, y); } -PL_SIG (SV, F, 1, log2, 0.01, 11.1) -PL_TEST_ULP (SV_NAME_F1 (log2), 1.99) -PL_TEST_EXPECT_FENV_ALWAYS (SV_NAME_F1 (log2)) -PL_TEST_INTERVAL (SV_NAME_F1 (log2), -0.0, -0x1p126, 4000) -PL_TEST_INTERVAL (SV_NAME_F1 (log2), 0.0, 0x1p-126, 4000) -PL_TEST_INTERVAL (SV_NAME_F1 (log2), 0x1p-126, 0x1p-23, 50000) -PL_TEST_INTERVAL (SV_NAME_F1 (log2), 0x1p-23, 1.0, 50000) -PL_TEST_INTERVAL (SV_NAME_F1 (log2), 1.0, 100, 50000) -PL_TEST_INTERVAL (SV_NAME_F1 (log2), 100, inf, 50000) +TEST_SIG (SV, F, 1, log2, 0.01, 11.1) +TEST_ULP (SV_NAME_F1 (log2), 1.99) +TEST_DISABLE_FENV (SV_NAME_F1 (log2)) +TEST_INTERVAL (SV_NAME_F1 (log2), -0.0, -0x1p126, 4000) +TEST_INTERVAL (SV_NAME_F1 (log2), 0.0, 0x1p-126, 4000) +TEST_INTERVAL (SV_NAME_F1 (log2), 0x1p-126, 0x1p-23, 50000) +TEST_INTERVAL (SV_NAME_F1 (log2), 0x1p-23, 1.0, 50000) +TEST_INTERVAL (SV_NAME_F1 (log2), 1.0, 100, 50000) +TEST_INTERVAL (SV_NAME_F1 (log2), 100, inf, 50000) +CLOSE_SVE_ATTR diff --git a/pl/math/sv_logf_3u4.c 
b/math/aarch64/sve/logf.c similarity index 52% rename from pl/math/sv_logf_3u4.c rename to math/aarch64/sve/logf.c index 96735524703621..2898e36974d6d8 100644 --- a/pl/math/sv_logf_3u4.c +++ b/math/aarch64/sve/logf.c @@ -1,19 +1,20 @@ /* * Single-precision vector log function. * - * Copyright (c) 2019-2023, Arm Limited. + * Copyright (c) 2019-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "sv_math.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" static const struct data { float poly_0135[4]; float poly_246[3]; float ln2; + uint32_t off, lower; } data = { .poly_0135 = { /* Coefficients copied from the AdvSIMD routine in math/, then rearranged so @@ -22,21 +23,24 @@ static const struct data -0x1.3e737cp-3f, 0x1.5a9aa2p-3f, 0x1.961348p-3f, 0x1.555d7cp-2f }, .poly_246 = { -0x1.4f9934p-3f, -0x1.00187cp-2f, -0x1.ffffc8p-2f }, - .ln2 = 0x1.62e43p-1f + .ln2 = 0x1.62e43p-1f, + .off = 0x3f2aaaab, + /* Lower bound is the smallest positive normal float 0x00800000. For + optimised register use subnormals are detected after offset has been + subtracted, so lower bound is 0x00800000 - offset (which wraps around). */ + .lower = 0x00800000 - 0x3f2aaaab }; -#define Min (0x00800000) -#define Max (0x7f800000) -#define Thresh (0x7f000000) /* Max - Min. */ +#define Thresh (0x7f000000) /* asuint32(inf) - 0x00800000. */ #define Mask (0x007fffff) -#define Off (0x3f2aaaab) /* 0.666667. 
*/ - -float optr_aor_log_f32 (float); static svfloat32_t NOINLINE -special_case (svfloat32_t x, svfloat32_t y, svbool_t cmp) +special_case (svuint32_t u_off, svfloat32_t p, svfloat32_t r2, svfloat32_t y, + svbool_t cmp) { - return sv_call_f32 (optr_aor_log_f32, x, y, cmp); + return sv_call_f32 ( + logf, svreinterpret_f32 (svadd_x (svptrue_b32 (), u_off, data.off)), + svmla_x (svptrue_b32 (), p, r2, y), cmp); } /* Optimised implementation of SVE logf, using the same algorithm and @@ -47,19 +51,21 @@ svfloat32_t SV_NAME_F1 (log) (svfloat32_t x, const svbool_t pg) { const struct data *d = ptr_barrier (&data); - svuint32_t u = svreinterpret_u32 (x); - svbool_t cmp = svcmpge (pg, svsub_x (pg, u, Min), Thresh); + svuint32_t u_off = svreinterpret_u32 (x); + + u_off = svsub_x (pg, u_off, d->off); + svbool_t cmp = svcmpge (pg, svsub_x (pg, u_off, d->lower), Thresh); /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ - u = svsub_x (pg, u, Off); svfloat32_t n = svcvt_f32_x ( - pg, svasr_x (pg, svreinterpret_s32 (u), 23)); /* Sign-extend. */ - u = svand_x (pg, u, Mask); - u = svadd_x (pg, u, Off); + pg, svasr_x (pg, svreinterpret_s32 (u_off), 23)); /* Sign-extend. */ + + svuint32_t u = svand_x (pg, u_off, Mask); + u = svadd_x (pg, u, d->off); svfloat32_t r = svsub_x (pg, svreinterpret_f32 (u), 1.0f); /* y = log(1+r) + n*ln2. */ - svfloat32_t r2 = svmul_x (pg, r, r); + svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r); /* n*ln2 + r + r2*(P6 + r*P5 + r2*(P4 + r*P3 + r2*(P2 + r*P1 + r2*P0))). 
*/ svfloat32_t p_0135 = svld1rq (svptrue_b32 (), &d->poly_0135[0]); svfloat32_t p = svmla_lane (sv_f32 (d->poly_246[0]), r, p_0135, 1); @@ -72,15 +78,17 @@ svfloat32_t SV_NAME_F1 (log) (svfloat32_t x, const svbool_t pg) p = svmla_x (pg, r, n, d->ln2); if (unlikely (svptest_any (pg, cmp))) - return special_case (x, svmla_x (svnot_z (pg, cmp), p, r2, y), cmp); + return special_case (u_off, p, r2, y, cmp); return svmla_x (pg, p, r2, y); } -PL_SIG (SV, F, 1, log, 0.01, 11.1) -PL_TEST_ULP (SV_NAME_F1 (log), 2.85) -PL_TEST_INTERVAL (SV_NAME_F1 (log), -0.0, -inf, 100) -PL_TEST_INTERVAL (SV_NAME_F1 (log), 0, 0x1p-126, 100) -PL_TEST_INTERVAL (SV_NAME_F1 (log), 0x1p-126, 0x1p-23, 50000) -PL_TEST_INTERVAL (SV_NAME_F1 (log), 0x1p-23, 1.0, 50000) -PL_TEST_INTERVAL (SV_NAME_F1 (log), 1.0, 100, 50000) -PL_TEST_INTERVAL (SV_NAME_F1 (log), 100, inf, 50000) +TEST_SIG (SV, F, 1, log, 0.01, 11.1) +TEST_ULP (SV_NAME_F1 (log), 2.85) +TEST_DISABLE_FENV (SV_NAME_F1 (log)) +TEST_INTERVAL (SV_NAME_F1 (log), -0.0, -inf, 100) +TEST_INTERVAL (SV_NAME_F1 (log), 0, 0x1p-126, 100) +TEST_INTERVAL (SV_NAME_F1 (log), 0x1p-126, 0x1p-23, 50000) +TEST_INTERVAL (SV_NAME_F1 (log), 0x1p-23, 1.0, 50000) +TEST_INTERVAL (SV_NAME_F1 (log), 1.0, 100, 50000) +TEST_INTERVAL (SV_NAME_F1 (log), 100, inf, 50000) +CLOSE_SVE_ATTR diff --git a/math/aarch64/sve/modf.c b/math/aarch64/sve/modf.c new file mode 100644 index 00000000000000..5944c7d37c4c19 --- /dev/null +++ b/math/aarch64/sve/modf.c @@ -0,0 +1,36 @@ +/* + * Double-precision SVE modf(x, *y) function. + * + * Copyright (c) 2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "test_sig.h" +#include "test_defs.h" + +/* Modf algorithm. Produces exact values in all rounding modes. */ +svfloat64_t SV_NAME_D1_L1 (modf) (svfloat64_t x, double *out_int, + const svbool_t pg) +{ + /* Get integer component of x. 
*/ + svfloat64_t fint_comp = svrintz_x (pg, x); + + svst1_f64 (pg, out_int, fint_comp); + + /* Subtract integer component from input. */ + svfloat64_t remaining = svsub_f64_x (svptrue_b64 (), x, fint_comp); + + /* Return +0 for integer x. */ + svbool_t is_integer = svcmpeq (pg, x, fint_comp); + return svsel (is_integer, sv_f64 (0), remaining); +} + +TEST_ULP (_ZGVsMxvl8_modf_frac, 0.0) +TEST_SYM_INTERVAL (_ZGVsMxvl8_modf_frac, 0, 1, 20000) +TEST_SYM_INTERVAL (_ZGVsMxvl8_modf_frac, 1, inf, 20000) + +TEST_ULP (_ZGVsMxvl8_modf_int, 0.0) +TEST_SYM_INTERVAL (_ZGVsMxvl8_modf_int, 0, 1, 20000) +TEST_SYM_INTERVAL (_ZGVsMxvl8_modf_int, 1, inf, 20000) +CLOSE_SVE_ATTR diff --git a/math/aarch64/sve/modff.c b/math/aarch64/sve/modff.c new file mode 100644 index 00000000000000..ad7ce4e2c88fb6 --- /dev/null +++ b/math/aarch64/sve/modff.c @@ -0,0 +1,36 @@ +/* + * Single-precision SVE modff(x, *y) function. + * + * Copyright (c) 2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "test_sig.h" +#include "test_defs.h" + +/* Modff algorithm. Produces exact values in all rounding modes. */ +svfloat32_t SV_NAME_F1_L1 (modf) (svfloat32_t x, float *out_int, + const svbool_t pg) +{ + /* Get integer component of x. */ + svfloat32_t fint_comp = svrintz_x (pg, x); + + svst1_f32 (pg, out_int, fint_comp); + + /* Subtract integer component from input. */ + svfloat32_t remaining = svsub_f32_x (svptrue_b32 (), x, fint_comp); + + /* Return +0 for integer x. 
*/ + svbool_t is_integer = svcmpeq (pg, x, fint_comp); + return svsel (is_integer, sv_f32 (0), remaining); +} + +TEST_ULP (_ZGVsMxvl4_modff_frac, 0.0) +TEST_SYM_INTERVAL (_ZGVsMxvl4_modff_frac, 0, 1, 20000) +TEST_SYM_INTERVAL (_ZGVsMxvl4_modff_frac, 1, inf, 20000) + +TEST_ULP (_ZGVsMxvl4_modff_int, 0.0) +TEST_SYM_INTERVAL (_ZGVsMxvl4_modff_int, 0, 1, 20000) +TEST_SYM_INTERVAL (_ZGVsMxvl4_modff_int, 1, inf, 20000) +CLOSE_SVE_ATTR diff --git a/pl/math/sv_pow_1u5.c b/math/aarch64/sve/pow.c similarity index 64% rename from pl/math/sv_pow_1u5.c rename to math/aarch64/sve/pow.c index 0838810206a1a2..12b2fb42b2cb76 100644 --- a/pl/math/sv_pow_1u5.c +++ b/math/aarch64/sve/pow.c @@ -1,13 +1,13 @@ /* * Double-precision SVE pow(x, y) function. * - * Copyright (c) 2022-2023, Arm Limited. + * Copyright (c) 2022-2025, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "sv_math.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" /* This version share a similar algorithm as AOR scalar pow. @@ -23,8 +23,8 @@ The SVE algorithm drops the tail in the exp computation at the price of a lower accuracy, slightly above 1ULP. The SVE algorithm also drops the special treatement of small (< 2^-65) and - large (> 2^63) finite values of |y|, as they only affect non-round to nearest - modes. + large (> 2^63) finite values of |y|, as they only affect non-round to + nearest modes. Maximum measured error is 1.04 ULPs: SV_NAME_D2 (pow) (0x1.3d2d45bc848acp+63, -0x1.a48a38b40cd43p-12) @@ -33,19 +33,18 @@ /* Data is defined in v_pow_log_data.c. */ #define N_LOG (1 << V_POW_LOG_TABLE_BITS) -#define A __v_pow_log_data.poly #define Off 0x3fe6955500000000 /* Data is defined in v_pow_exp_data.c. */ #define N_EXP (1 << V_POW_EXP_TABLE_BITS) #define SignBias (0x800 << V_POW_EXP_TABLE_BITS) -#define C __v_pow_exp_data.poly #define SmallExp 0x3c9 /* top12(0x1p-54). */ #define BigExp 0x408 /* top12(512.). 
*/ #define ThresExp 0x03f /* BigExp - SmallExp. */ #define HugeExp 0x409 /* top12(1024.). */ /* Constants associated with pow. */ +#define SmallBoundX 0x1p-126 #define SmallPowX 0x001 /* top12(0x1p-126). */ #define BigPowX 0x7ff /* top12(INFINITY). */ #define ThresPowX 0x7fe /* BigPowX - SmallPowX. */ @@ -53,6 +52,31 @@ #define BigPowY 0x43e /* top12(0x1.749p62). */ #define ThresPowY 0x080 /* BigPowY - SmallPowY. */ +static const struct data +{ + double log_c0, log_c2, log_c4, log_c6, ln2_hi, ln2_lo; + double log_c1, log_c3, log_c5, off; + double n_over_ln2, exp_c2, ln2_over_n_hi, ln2_over_n_lo; + double exp_c0, exp_c1; +} data = { + .log_c0 = -0x1p-1, + .log_c1 = -0x1.555555555556p-1, + .log_c2 = 0x1.0000000000006p-1, + .log_c3 = 0x1.999999959554ep-1, + .log_c4 = -0x1.555555529a47ap-1, + .log_c5 = -0x1.2495b9b4845e9p0, + .log_c6 = 0x1.0002b8b263fc3p0, + .off = Off, + .exp_c0 = 0x1.fffffffffffd4p-2, + .exp_c1 = 0x1.5555571d6ef9p-3, + .exp_c2 = 0x1.5555576a5adcep-5, + .ln2_hi = 0x1.62e42fefa3800p-1, + .ln2_lo = 0x1.ef35793c76730p-45, + .n_over_ln2 = 0x1.71547652b82fep0 * N_EXP, + .ln2_over_n_hi = 0x1.62e42fefc0000p-9, + .ln2_over_n_lo = -0x1.c610ca86c3899p-45, +}; + /* Check if x is an integer. */ static inline svbool_t sv_isint (svbool_t pg, svfloat64_t x) @@ -71,7 +95,7 @@ sv_isnotint (svbool_t pg, svfloat64_t x) static inline svbool_t sv_isodd (svbool_t pg, svfloat64_t x) { - svfloat64_t y = svmul_x (pg, x, 0.5); + svfloat64_t y = svmul_x (svptrue_b64 (), x, 0.5); return sv_isnotint (pg, y); } @@ -110,7 +134,7 @@ zeroinfnan (uint64_t i) static inline svbool_t sv_zeroinfnan (svbool_t pg, svuint64_t i) { - return svcmpge (pg, svsub_x (pg, svmul_x (pg, i, 2), 1), + return svcmpge (pg, svsub_x (pg, svadd_x (pg, i, i), 1), 2 * asuint64 (INFINITY) - 1); } @@ -163,23 +187,24 @@ sv_call_specialcase (svfloat64_t x1, svuint64_t u1, svuint64_t u2, additional 15 bits precision. 
IX is the bit representation of x, but normalized in the subnormal range using the sign bit for the exponent. */ static inline svfloat64_t -sv_log_inline (svbool_t pg, svuint64_t ix, svfloat64_t *tail) +sv_log_inline (svbool_t pg, svuint64_t ix, svfloat64_t *tail, + const struct data *d) { /* x = 2^k z; where z is in range [Off,2*Off) and exact. The range is split into N subintervals. The ith subinterval contains z and c is near its center. */ - svuint64_t tmp = svsub_x (pg, ix, Off); + svuint64_t tmp = svsub_x (pg, ix, d->off); svuint64_t i = svand_x (pg, svlsr_x (pg, tmp, 52 - V_POW_LOG_TABLE_BITS), sv_u64 (N_LOG - 1)); svint64_t k = svasr_x (pg, svreinterpret_s64 (tmp), 52); - svuint64_t iz = svsub_x (pg, ix, svand_x (pg, tmp, sv_u64 (0xfffULL << 52))); + svuint64_t iz = svsub_x (pg, ix, svlsl_x (pg, svreinterpret_u64 (k), 52)); svfloat64_t z = svreinterpret_f64 (iz); svfloat64_t kd = svcvt_f64_x (pg, k); /* log(x) = k*Ln2 + log(c) + log1p(z/c-1). */ /* SVE lookup requires 3 separate lookup tables, as opposed to scalar version - that uses array of structures. We also do the lookup earlier in the code to - make sure it finishes as early as possible. */ + that uses array of structures. We also do the lookup earlier in the code + to make sure it finishes as early as possible. */ svfloat64_t invc = svld1_gather_index (pg, __v_pow_log_data.invc, i); svfloat64_t logc = svld1_gather_index (pg, __v_pow_log_data.logc, i); svfloat64_t logctail = svld1_gather_index (pg, __v_pow_log_data.logctail, i); @@ -188,40 +213,85 @@ sv_log_inline (svbool_t pg, svuint64_t ix, svfloat64_t *tail) |z/c - 1| < 1/N, so r = z/c - 1 is exactly representible. */ svfloat64_t r = svmad_x (pg, z, invc, -1.0); /* k*Ln2 + log(c) + r. 
*/ - svfloat64_t t1 = svmla_x (pg, logc, kd, __v_pow_log_data.ln2_hi); + + svfloat64_t ln2_hilo = svld1rq_f64 (svptrue_b64 (), &d->ln2_hi); + svfloat64_t t1 = svmla_lane_f64 (logc, kd, ln2_hilo, 0); svfloat64_t t2 = svadd_x (pg, t1, r); - svfloat64_t lo1 = svmla_x (pg, logctail, kd, __v_pow_log_data.ln2_lo); + svfloat64_t lo1 = svmla_lane_f64 (logctail, kd, ln2_hilo, 1); svfloat64_t lo2 = svadd_x (pg, svsub_x (pg, t1, t2), r); /* Evaluation is optimized assuming superscalar pipelined execution. */ - svfloat64_t ar = svmul_x (pg, r, -0.5); /* A[0] = -0.5. */ - svfloat64_t ar2 = svmul_x (pg, r, ar); - svfloat64_t ar3 = svmul_x (pg, r, ar2); + + svfloat64_t log_c02 = svld1rq_f64 (svptrue_b64 (), &d->log_c0); + svfloat64_t ar = svmul_lane_f64 (r, log_c02, 0); + svfloat64_t ar2 = svmul_x (svptrue_b64 (), r, ar); + svfloat64_t ar3 = svmul_x (svptrue_b64 (), r, ar2); /* k*Ln2 + log(c) + r + A[0]*r*r. */ svfloat64_t hi = svadd_x (pg, t2, ar2); - svfloat64_t lo3 = svmla_x (pg, svneg_x (pg, ar2), ar, r); + svfloat64_t lo3 = svmls_x (pg, ar2, ar, r); svfloat64_t lo4 = svadd_x (pg, svsub_x (pg, t2, hi), ar2); /* p = log1p(r) - r - A[0]*r*r. */ /* p = (ar3 * (A[1] + r * A[2] + ar2 * (A[3] + r * A[4] + ar2 * (A[5] + r * A[6])))). 
*/ - svfloat64_t a56 = svmla_x (pg, sv_f64 (A[5]), r, A[6]); - svfloat64_t a34 = svmla_x (pg, sv_f64 (A[3]), r, A[4]); - svfloat64_t a12 = svmla_x (pg, sv_f64 (A[1]), r, A[2]); + + svfloat64_t log_c46 = svld1rq_f64 (svptrue_b64 (), &d->log_c4); + svfloat64_t a56 = svmla_lane_f64 (sv_f64 (d->log_c5), r, log_c46, 1); + svfloat64_t a34 = svmla_lane_f64 (sv_f64 (d->log_c3), r, log_c46, 0); + svfloat64_t a12 = svmla_lane_f64 (sv_f64 (d->log_c1), r, log_c02, 1); svfloat64_t p = svmla_x (pg, a34, ar2, a56); p = svmla_x (pg, a12, ar2, p); - p = svmul_x (pg, ar3, p); + p = svmul_x (svptrue_b64 (), ar3, p); svfloat64_t lo = svadd_x ( - pg, svadd_x (pg, svadd_x (pg, svadd_x (pg, lo1, lo2), lo3), lo4), p); + pg, svadd_x (pg, svsub_x (pg, svadd_x (pg, lo1, lo2), lo3), lo4), p); svfloat64_t y = svadd_x (pg, hi, lo); *tail = svadd_x (pg, svsub_x (pg, hi, y), lo); return y; } +static inline svfloat64_t +sv_exp_core (svbool_t pg, svfloat64_t x, svfloat64_t xtail, + svuint64_t sign_bias, svfloat64_t *tmp, svuint64_t *sbits, + svuint64_t *ki, const struct data *d) +{ + /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */ + /* x = ln2/N*k + r, with int k and r in [-ln2/2N, ln2/2N]. */ + svfloat64_t n_over_ln2_and_c2 = svld1rq_f64 (svptrue_b64 (), &d->n_over_ln2); + svfloat64_t z = svmul_lane_f64 (x, n_over_ln2_and_c2, 0); + /* z - kd is in [-1, 1] in non-nearest rounding modes. */ + svfloat64_t kd = svrinta_x (pg, z); + *ki = svreinterpret_u64 (svcvt_s64_x (pg, kd)); + + svfloat64_t ln2_over_n_hilo + = svld1rq_f64 (svptrue_b64 (), &d->ln2_over_n_hi); + svfloat64_t r = x; + r = svmls_lane_f64 (r, kd, ln2_over_n_hilo, 0); + r = svmls_lane_f64 (r, kd, ln2_over_n_hilo, 1); + /* The code assumes 2^-200 < |xtail| < 2^-8/N. */ + r = svadd_x (pg, r, xtail); + /* 2^(k/N) ~= scale. 
*/ + svuint64_t idx = svand_x (pg, *ki, N_EXP - 1); + svuint64_t top + = svlsl_x (pg, svadd_x (pg, *ki, sign_bias), 52 - V_POW_EXP_TABLE_BITS); + /* This is only a valid scale when -1023*N < k < 1024*N. */ + *sbits = svld1_gather_index (pg, __v_pow_exp_data.sbits, idx); + *sbits = svadd_x (pg, *sbits, top); + /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (exp(r) - 1). */ + svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); + *tmp = svmla_lane_f64 (sv_f64 (d->exp_c1), r, n_over_ln2_and_c2, 1); + *tmp = svmla_x (pg, sv_f64 (d->exp_c0), r, *tmp); + *tmp = svmla_x (pg, r, r2, *tmp); + svfloat64_t scale = svreinterpret_f64 (*sbits); + /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there + is no spurious underflow here even without fma. */ + z = svmla_x (pg, scale, scale, *tmp); + return z; +} + /* Computes sign*exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|. The sign_bias argument is SignBias or 0 and sets the sign to -1 or 1. */ static inline svfloat64_t sv_exp_inline (svbool_t pg, svfloat64_t x, svfloat64_t xtail, - svuint64_t sign_bias) + svuint64_t sign_bias, const struct data *d) { /* 3 types of special cases: tiny (uflow and spurious uflow), huge (oflow) and other cases of large values of x (scale * (1 + TMP) oflow). */ @@ -229,73 +299,46 @@ sv_exp_inline (svbool_t pg, svfloat64_t x, svfloat64_t xtail, /* |x| is large (|x| >= 512) or tiny (|x| <= 0x1p-54). */ svbool_t uoflow = svcmpge (pg, svsub_x (pg, abstop, SmallExp), ThresExp); - /* Conditions special, uflow and oflow are all expressed as uoflow && - something, hence do not bother computing anything if no lane in uoflow is - true. */ - svbool_t special = svpfalse_b (); - svbool_t uflow = svpfalse_b (); - svbool_t oflow = svpfalse_b (); + svfloat64_t tmp; + svuint64_t sbits, ki; if (unlikely (svptest_any (pg, uoflow))) { + svfloat64_t z + = sv_exp_core (pg, x, xtail, sign_bias, &tmp, &sbits, &ki, d); + /* |x| is tiny (|x| <= 0x1p-54). 
*/ - uflow = svcmpge (pg, svsub_x (pg, abstop, SmallExp), 0x80000000); + svbool_t uflow + = svcmpge (pg, svsub_x (pg, abstop, SmallExp), 0x80000000); uflow = svand_z (pg, uoflow, uflow); /* |x| is huge (|x| >= 1024). */ - oflow = svcmpge (pg, abstop, HugeExp); + svbool_t oflow = svcmpge (pg, abstop, HugeExp); oflow = svand_z (pg, uoflow, svbic_z (pg, oflow, uflow)); + /* For large |x| values (512 < |x| < 1024) scale * (1 + TMP) can overflow - or underflow. */ - special = svbic_z (pg, uoflow, svorr_z (pg, uflow, oflow)); + or underflow. */ + svbool_t special = svbic_z (pg, uoflow, svorr_z (pg, uflow, oflow)); + + /* Update result with special and large cases. */ + z = sv_call_specialcase (tmp, sbits, ki, z, special); + + /* Handle underflow and overflow. */ + svbool_t x_is_neg = svcmplt (pg, x, 0); + svuint64_t sign_mask + = svlsl_x (pg, sign_bias, 52 - V_POW_EXP_TABLE_BITS); + svfloat64_t res_uoflow + = svsel (x_is_neg, sv_f64 (0.0), sv_f64 (INFINITY)); + res_uoflow = svreinterpret_f64 ( + svorr_x (pg, svreinterpret_u64 (res_uoflow), sign_mask)); + /* Avoid spurious underflow for tiny x. */ + svfloat64_t res_spurious_uflow + = svreinterpret_f64 (svorr_x (pg, sign_mask, 0x3ff0000000000000)); + + z = svsel (oflow, res_uoflow, z); + z = svsel (uflow, res_spurious_uflow, z); + return z; } - /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */ - /* x = ln2/N*k + r, with int k and r in [-ln2/2N, ln2/2N]. */ - svfloat64_t z = svmul_x (pg, x, __v_pow_exp_data.n_over_ln2); - /* z - kd is in [-1, 1] in non-nearest rounding modes. */ - svfloat64_t shift = sv_f64 (__v_pow_exp_data.shift); - svfloat64_t kd = svadd_x (pg, z, shift); - svuint64_t ki = svreinterpret_u64 (kd); - kd = svsub_x (pg, kd, shift); - svfloat64_t r = x; - r = svmls_x (pg, r, kd, __v_pow_exp_data.ln2_over_n_hi); - r = svmls_x (pg, r, kd, __v_pow_exp_data.ln2_over_n_lo); - /* The code assumes 2^-200 < |xtail| < 2^-8/N. */ - r = svadd_x (pg, r, xtail); - /* 2^(k/N) ~= scale. 
*/ - svuint64_t idx = svand_x (pg, ki, N_EXP - 1); - svuint64_t top - = svlsl_x (pg, svadd_x (pg, ki, sign_bias), 52 - V_POW_EXP_TABLE_BITS); - /* This is only a valid scale when -1023*N < k < 1024*N. */ - svuint64_t sbits = svld1_gather_index (pg, __v_pow_exp_data.sbits, idx); - sbits = svadd_x (pg, sbits, top); - /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (exp(r) - 1). */ - svfloat64_t r2 = svmul_x (pg, r, r); - svfloat64_t tmp = svmla_x (pg, sv_f64 (C[1]), r, C[2]); - tmp = svmla_x (pg, sv_f64 (C[0]), r, tmp); - tmp = svmla_x (pg, r, r2, tmp); - svfloat64_t scale = svreinterpret_f64 (sbits); - /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there - is no spurious underflow here even without fma. */ - z = svmla_x (pg, scale, scale, tmp); - - /* Update result with special and large cases. */ - if (unlikely (svptest_any (pg, special))) - z = sv_call_specialcase (tmp, sbits, ki, z, special); - - /* Handle underflow and overflow. */ - svuint64_t sign_bit = svlsr_x (pg, svreinterpret_u64 (x), 63); - svbool_t x_is_neg = svcmpne (pg, sign_bit, 0); - svuint64_t sign_mask = svlsl_x (pg, sign_bias, 52 - V_POW_EXP_TABLE_BITS); - svfloat64_t res_uoflow = svsel (x_is_neg, sv_f64 (0.0), sv_f64 (INFINITY)); - res_uoflow = svreinterpret_f64 ( - svorr_x (pg, svreinterpret_u64 (res_uoflow), sign_mask)); - z = svsel (oflow, res_uoflow, z); - /* Avoid spurious underflow for tiny x. */ - svfloat64_t res_spurious_uflow - = svreinterpret_f64 (svorr_x (pg, sign_mask, 0x3ff0000000000000)); - z = svsel (uflow, res_spurious_uflow, z); - - return z; + return sv_exp_core (pg, x, xtail, sign_bias, &tmp, &sbits, &ki, d); } static inline double @@ -323,56 +366,46 @@ pow_sc (double x, double y) double_t x2 = x * x; if (ix >> 63 && checkint (iy) == 1) x2 = -x2; - /* Without the barrier some versions of clang hoist the 1/x2 and - thus division by zero exception can be signaled spuriously. */ - return (iy >> 63) ? opt_barrier_double (1 / x2) : x2; + return (iy >> 63) ? 
1 / x2 : x2; } return x; } svfloat64_t SV_NAME_D2 (pow) (svfloat64_t x, svfloat64_t y, const svbool_t pg) { + const struct data *d = ptr_barrier (&data); + /* This preamble handles special case conditions used in the final scalar fallbacks. It also updates ix and sign_bias, that are used in the core computation too, i.e., exp( y * log (x) ). */ svuint64_t vix0 = svreinterpret_u64 (x); svuint64_t viy0 = svreinterpret_u64 (y); - svuint64_t vtopx0 = svlsr_x (svptrue_b64 (), vix0, 52); /* Negative x cases. */ - svuint64_t sign_bit = svlsr_m (pg, vix0, 63); - svbool_t xisneg = svcmpeq (pg, sign_bit, 1); + svbool_t xisneg = svcmplt (pg, x, 0); /* Set sign_bias and ix depending on sign of x and nature of y. */ - svbool_t yisnotint_xisneg = svpfalse_b (); + svbool_t yint_or_xpos = pg; svuint64_t sign_bias = sv_u64 (0); svuint64_t vix = vix0; - svuint64_t vtopx1 = vtopx0; if (unlikely (svptest_any (pg, xisneg))) { /* Determine nature of y. */ - yisnotint_xisneg = sv_isnotint (xisneg, y); - svbool_t yisint_xisneg = sv_isint (xisneg, y); + yint_or_xpos = sv_isint (xisneg, y); svbool_t yisodd_xisneg = sv_isodd (xisneg, y); /* ix set to abs(ix) if y is integer. */ - vix = svand_m (yisint_xisneg, vix0, 0x7fffffffffffffff); - vtopx1 = svand_m (yisint_xisneg, vtopx0, 0x7ff); + vix = svand_m (yint_or_xpos, vix0, 0x7fffffffffffffff); /* Set to SignBias if x is negative and y is odd. */ sign_bias = svsel (yisodd_xisneg, sv_u64 (SignBias), sv_u64 (0)); } - /* Special cases of x or y: zero, inf and nan. */ - svbool_t xspecial = sv_zeroinfnan (pg, vix0); - svbool_t yspecial = sv_zeroinfnan (pg, viy0); - svbool_t special = svorr_z (pg, xspecial, yspecial); - /* Small cases of x: |x| < 0x1p-126. 
*/ - svuint64_t vabstopx0 = svand_x (pg, vtopx0, 0x7ff); - svbool_t xsmall = svcmplt (pg, vabstopx0, SmallPowX); - if (unlikely (svptest_any (pg, xsmall))) + svbool_t xsmall = svaclt (yint_or_xpos, x, SmallBoundX); + if (unlikely (svptest_any (yint_or_xpos, xsmall))) { /* Normalize subnormal x so exponent becomes negative. */ - svbool_t topx_is_null = svcmpeq (xsmall, vtopx1, 0); + svuint64_t vtopx = svlsr_x (svptrue_b64 (), vix, 52); + svbool_t topx_is_null = svcmpeq (xsmall, vtopx, 0); svuint64_t vix_norm = svreinterpret_u64 (svmul_m (xsmall, x, 0x1p52)); vix_norm = svand_m (xsmall, vix_norm, 0x7fffffffffffffff); @@ -382,33 +415,38 @@ svfloat64_t SV_NAME_D2 (pow) (svfloat64_t x, svfloat64_t y, const svbool_t pg) /* y_hi = log(ix, &y_lo). */ svfloat64_t vlo; - svfloat64_t vhi = sv_log_inline (pg, vix, &vlo); + svfloat64_t vhi = sv_log_inline (yint_or_xpos, vix, &vlo, d); /* z = exp(y_hi, y_lo, sign_bias). */ - svfloat64_t vehi = svmul_x (pg, y, vhi); - svfloat64_t velo = svmul_x (pg, y, vlo); - svfloat64_t vemi = svmls_x (pg, vehi, y, vhi); - velo = svsub_x (pg, velo, vemi); - svfloat64_t vz = sv_exp_inline (pg, vehi, velo, sign_bias); + svfloat64_t vehi = svmul_x (svptrue_b64 (), y, vhi); + svfloat64_t vemi = svmls_x (yint_or_xpos, vehi, y, vhi); + svfloat64_t velo = svnmls_x (yint_or_xpos, vemi, y, vlo); + svfloat64_t vz = sv_exp_inline (yint_or_xpos, vehi, velo, sign_bias, d); /* Cases of finite y and finite negative x. */ - vz = svsel (yisnotint_xisneg, sv_f64 (__builtin_nan ("")), vz); + vz = svsel (yint_or_xpos, vz, sv_f64 (__builtin_nan (""))); + + /* Special cases of x or y: zero, inf and nan. */ + svbool_t xspecial = sv_zeroinfnan (svptrue_b64 (), vix0); + svbool_t yspecial = sv_zeroinfnan (svptrue_b64 (), viy0); + svbool_t special = svorr_z (svptrue_b64 (), xspecial, yspecial); /* Cases of zero/inf/nan x or y. 
*/ - if (unlikely (svptest_any (pg, special))) + if (unlikely (svptest_any (svptrue_b64 (), special))) vz = sv_call2_f64 (pow_sc, x, y, vz, special); return vz; } -PL_SIG (SV, D, 2, pow) -PL_TEST_ULP (SV_NAME_D2 (pow), 0.55) +TEST_SIG (SV, D, 2, pow) +TEST_ULP (SV_NAME_D2 (pow), 0.55) +TEST_DISABLE_FENV (SV_NAME_D2 (pow)) /* Wide intervals spanning the whole domain but shared between x and y. */ -#define SV_POW_INTERVAL2(xlo, xhi, ylo, yhi, n) \ - PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), xlo, xhi, ylo, yhi, n) \ - PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), xlo, xhi, -ylo, -yhi, n) \ - PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), -xlo, -xhi, ylo, yhi, n) \ - PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), -xlo, -xhi, -ylo, -yhi, n) +#define SV_POW_INTERVAL2(xlo, xhi, ylo, yhi, n) \ + TEST_INTERVAL2 (SV_NAME_D2 (pow), xlo, xhi, ylo, yhi, n) \ + TEST_INTERVAL2 (SV_NAME_D2 (pow), xlo, xhi, -ylo, -yhi, n) \ + TEST_INTERVAL2 (SV_NAME_D2 (pow), -xlo, -xhi, ylo, yhi, n) \ + TEST_INTERVAL2 (SV_NAME_D2 (pow), -xlo, -xhi, -ylo, -yhi, n) #define EXPAND(str) str##000000000 #define SHL52(str) EXPAND (str) SV_POW_INTERVAL2 (0, SHL52 (SmallPowX), 0, inf, 40000) @@ -426,10 +464,10 @@ SV_POW_INTERVAL2 (0x1p-500, 0x1p500, 0x1p-1, 0x1p1, 10000) SV_POW_INTERVAL2 (0x1p-300, 0x1p-200, 0x1p-20, 0x1p-10, 10000) SV_POW_INTERVAL2 (0x1p50, 0x1p100, 0x1p-20, 0x1p-10, 10000) /* x is negative, y is odd or even integer, or y is real not integer. 
*/ -PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), -0.0, -10.0, 3.0, 3.0, 10000) -PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), -0.0, -10.0, 4.0, 4.0, 10000) -PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), -0.0, -10.0, 0.0, 10.0, 10000) -PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), 0.0, 10.0, -0.0, -10.0, 10000) +TEST_INTERVAL2 (SV_NAME_D2 (pow), -0.0, -10.0, 3.0, 3.0, 10000) +TEST_INTERVAL2 (SV_NAME_D2 (pow), -0.0, -10.0, 4.0, 4.0, 10000) +TEST_INTERVAL2 (SV_NAME_D2 (pow), -0.0, -10.0, 0.0, 10.0, 10000) +TEST_INTERVAL2 (SV_NAME_D2 (pow), 0.0, 10.0, -0.0, -10.0, 10000) /* |x| is inf, y is odd or even integer, or y is real not integer. */ SV_POW_INTERVAL2 (inf, inf, 0.5, 0.5, 1) SV_POW_INTERVAL2 (inf, inf, 1.0, 1.0, 1) @@ -438,7 +476,8 @@ SV_POW_INTERVAL2 (inf, inf, 3.0, 3.0, 1) /* 0.0^y. */ SV_POW_INTERVAL2 (0.0, 0.0, 0.0, 0x1p120, 1000) /* 1.0^y. */ -PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), 1.0, 1.0, 0.0, 0x1p-50, 1000) -PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), 1.0, 1.0, 0x1p-50, 1.0, 1000) -PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), 1.0, 1.0, 1.0, 0x1p100, 1000) -PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), 1.0, 1.0, -1.0, -0x1p120, 1000) +TEST_INTERVAL2 (SV_NAME_D2 (pow), 1.0, 1.0, 0.0, 0x1p-50, 1000) +TEST_INTERVAL2 (SV_NAME_D2 (pow), 1.0, 1.0, 0x1p-50, 1.0, 1000) +TEST_INTERVAL2 (SV_NAME_D2 (pow), 1.0, 1.0, 1.0, 0x1p100, 1000) +TEST_INTERVAL2 (SV_NAME_D2 (pow), 1.0, 1.0, -1.0, -0x1p120, 1000) +CLOSE_SVE_ATTR diff --git a/pl/math/sv_powf_2u6.c b/math/aarch64/sve/powf.c similarity index 69% rename from pl/math/sv_powf_2u6.c rename to math/aarch64/sve/powf.c index 2db0636aea6211..8457e83e749510 100644 --- a/pl/math/sv_powf_2u6.c +++ b/math/aarch64/sve/powf.c @@ -1,13 +1,13 @@ /* * Single-precision SVE powf function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2025, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "sv_math.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" /* The following data is used in the SVE pow core computation and special case detection. */ @@ -15,7 +15,6 @@ #define Tlogc __v_powf_data.logc #define Texp __v_powf_data.scale #define SignBias (1 << (V_POWF_EXP2_TABLE_BITS + 11)) -#define Shift 0x1.8p52 #define Norm 0x1p23f /* 0x4b000000. */ /* Overall ULP error bound for pow is 2.6 ulp @@ -25,7 +24,7 @@ static const struct data double log_poly[4]; double exp_poly[3]; float uflow_bound, oflow_bound, small_bound; - uint32_t sign_bias, sign_mask, subnormal_bias, off; + uint32_t sign_bias, subnormal_bias, off; } data = { /* rel err: 1.5 * 2^-30. Each coefficients is multiplied the value of V_POWF_EXP2_N. */ @@ -42,7 +41,6 @@ static const struct data .small_bound = 0x1p-126f, .off = 0x3f35d000, .sign_bias = SignBias, - .sign_mask = 0x80000000, .subnormal_bias = 0x0b800000, /* 23 << 23. */ }; @@ -75,7 +73,7 @@ svisodd (svbool_t pg, svfloat32_t x) static inline svbool_t sv_zeroinfnan (svbool_t pg, svuint32_t i) { - return svcmpge (pg, svsub_x (pg, svmul_x (pg, i, 2u), 1), + return svcmpge (pg, svsub_x (pg, svadd_x (pg, i, i), 1), 2u * 0x7f800000 - 1); } @@ -104,7 +102,7 @@ zeroinfnan (uint32_t ix) } /* A scalar subroutine used to fix main power special cases. Similar to the - preamble of finite_powf except that we do not update ix and sign_bias. This + preamble of scalar powf except that we do not update ix and sign_bias. This is done in the preamble of the SVE powf. */ static inline float powf_specialcase (float x, float y, float z) @@ -139,9 +137,14 @@ powf_specialcase (float x, float y, float z) } /* Scalar fallback for special case routines with custom signature. 
*/ -static inline svfloat32_t -sv_call_powf_sc (svfloat32_t x1, svfloat32_t x2, svfloat32_t y, svbool_t cmp) +static svfloat32_t NOINLINE +sv_call_powf_sc (svfloat32_t x1, svfloat32_t x2, svfloat32_t y) { + /* Special cases of x or y: zero, inf and nan. */ + svbool_t xspecial = sv_zeroinfnan (svptrue_b32 (), svreinterpret_u32 (x1)); + svbool_t yspecial = sv_zeroinfnan (svptrue_b32 (), svreinterpret_u32 (x2)); + svbool_t cmp = svorr_z (svptrue_b32 (), xspecial, yspecial); + svbool_t p = svpfirst (cmp, svpfalse ()); while (svptest_any (cmp, p)) { @@ -171,30 +174,30 @@ sv_powf_core_ext (const svbool_t pg, svuint64_t i, svfloat64_t z, svint64_t k, /* Polynomial to approximate log1p(r)/ln2. */ svfloat64_t logx = A (0); - logx = svmla_x (pg, A (1), r, logx); - logx = svmla_x (pg, A (2), r, logx); - logx = svmla_x (pg, A (3), r, logx); - logx = svmla_x (pg, y0, r, logx); + logx = svmad_x (pg, r, logx, A (1)); + logx = svmad_x (pg, r, logx, A (2)); + logx = svmad_x (pg, r, logx, A (3)); + logx = svmad_x (pg, r, logx, y0); *pylogx = svmul_x (pg, y, logx); /* z - kd is in [-1, 1] in non-nearest rounding modes. */ - svfloat64_t kd = svadd_x (pg, *pylogx, Shift); - svuint64_t ki = svreinterpret_u64 (kd); - kd = svsub_x (pg, kd, Shift); + svfloat64_t kd = svrinta_x (svptrue_b64 (), *pylogx); + svuint64_t ki = svreinterpret_u64 (svcvt_s64_x (svptrue_b64 (), kd)); r = svsub_x (pg, *pylogx, kd); /* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1). 
*/ - svuint64_t t - = svld1_gather_index (pg, Texp, svand_x (pg, ki, V_POWF_EXP2_N - 1)); - svuint64_t ski = svadd_x (pg, ki, sign_bias); - t = svadd_x (pg, t, svlsl_x (pg, ski, 52 - V_POWF_EXP2_TABLE_BITS)); + svuint64_t t = svld1_gather_index ( + svptrue_b64 (), Texp, svand_x (svptrue_b64 (), ki, V_POWF_EXP2_N - 1)); + svuint64_t ski = svadd_x (svptrue_b64 (), ki, sign_bias); + t = svadd_x (svptrue_b64 (), t, + svlsl_x (svptrue_b64 (), ski, 52 - V_POWF_EXP2_TABLE_BITS)); svfloat64_t s = svreinterpret_f64 (t); svfloat64_t p = C (0); p = svmla_x (pg, C (1), p, r); p = svmla_x (pg, C (2), p, r); - p = svmla_x (pg, s, p, svmul_x (pg, s, r)); + p = svmla_x (pg, s, p, svmul_x (svptrue_b64 (), s, r)); return p; } @@ -208,19 +211,16 @@ sv_powf_core (const svbool_t pg, svuint32_t i, svuint32_t iz, svint32_t k, { const svbool_t ptrue = svptrue_b64 (); - /* Unpack and promote input vectors (pg, y, z, i, k and sign_bias) into two in - order to perform core computation in double precision. */ + /* Unpack and promote input vectors (pg, y, z, i, k and sign_bias) into two + in order to perform core computation in double precision. 
*/ const svbool_t pg_lo = svunpklo (pg); const svbool_t pg_hi = svunpkhi (pg); - svfloat64_t y_lo = svcvt_f64_x ( - ptrue, svreinterpret_f32 (svunpklo (svreinterpret_u32 (y)))); - svfloat64_t y_hi = svcvt_f64_x ( - ptrue, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (y)))); - svfloat32_t z = svreinterpret_f32 (iz); - svfloat64_t z_lo = svcvt_f64_x ( - ptrue, svreinterpret_f32 (svunpklo (svreinterpret_u32 (z)))); - svfloat64_t z_hi = svcvt_f64_x ( - ptrue, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (z)))); + svfloat64_t y_lo + = svcvt_f64_x (pg, svreinterpret_f32 (svunpklo (svreinterpret_u32 (y)))); + svfloat64_t y_hi + = svcvt_f64_x (pg, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (y)))); + svfloat64_t z_lo = svcvt_f64_x (pg, svreinterpret_f32 (svunpklo (iz))); + svfloat64_t z_hi = svcvt_f64_x (pg, svreinterpret_f32 (svunpkhi (iz))); svuint64_t i_lo = svunpklo (i); svuint64_t i_hi = svunpkhi (i); svint64_t k_lo = svunpklo (k); @@ -247,9 +247,9 @@ sv_powf_core (const svbool_t pg, svuint32_t i, svuint32_t iz, svint32_t k, /* Implementation of SVE powf. Provides the same accuracy as AdvSIMD powf, since it relies on the same algorithm. The theoretical maximum error is under 2.60 ULPs. - Maximum measured error is 2.56 ULPs: - SV_NAME_F2 (pow) (0x1.004118p+0, 0x1.5d14a4p+16) got 0x1.fd4bp+127 - want 0x1.fd4b06p+127. */ + Maximum measured error is 2.57 ULPs: + SV_NAME_F2 (pow) (0x1.031706p+0, 0x1.ce2ec2p+12) got 0x1.fff868p+127 + want 0x1.fff862p+127. */ svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg) { const struct data *d = ptr_barrier (&data); @@ -258,21 +258,19 @@ svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg) svuint32_t viy0 = svreinterpret_u32 (y); /* Negative x cases. 
*/ - svuint32_t sign_bit = svand_m (pg, vix0, d->sign_mask); - svbool_t xisneg = svcmpeq (pg, sign_bit, d->sign_mask); + svbool_t xisneg = svcmplt (pg, x, sv_f32 (0)); /* Set sign_bias and ix depending on sign of x and nature of y. */ - svbool_t yisnotint_xisneg = svpfalse_b (); + svbool_t yint_or_xpos = pg; svuint32_t sign_bias = sv_u32 (0); svuint32_t vix = vix0; if (unlikely (svptest_any (pg, xisneg))) { /* Determine nature of y. */ - yisnotint_xisneg = svisnotint (xisneg, y); - svbool_t yisint_xisneg = svisint (xisneg, y); + yint_or_xpos = svisint (xisneg, y); svbool_t yisodd_xisneg = svisodd (xisneg, y); /* ix set to abs(ix) if y is integer. */ - vix = svand_m (yisint_xisneg, vix0, 0x7fffffff); + vix = svand_m (yint_or_xpos, vix0, 0x7fffffff); /* Set to SignBias if x is negative and y is odd. */ sign_bias = svsel (yisodd_xisneg, sv_u32 (d->sign_bias), sv_u32 (0)); } @@ -283,8 +281,8 @@ svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg) svbool_t cmp = svorr_z (pg, xspecial, yspecial); /* Small cases of x: |x| < 0x1p-126. */ - svbool_t xsmall = svaclt (pg, x, d->small_bound); - if (unlikely (svptest_any (pg, xsmall))) + svbool_t xsmall = svaclt (yint_or_xpos, x, d->small_bound); + if (unlikely (svptest_any (yint_or_xpos, xsmall))) { /* Normalize subnormal x so exponent becomes negative. */ svuint32_t vix_norm = svreinterpret_u32 (svmul_x (xsmall, x, Norm)); @@ -293,44 +291,48 @@ svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg) vix = svsel (xsmall, vix_norm, vix); } /* Part of core computation carried in working precision. 
*/ - svuint32_t tmp = svsub_x (pg, vix, d->off); - svuint32_t i = svand_x (pg, svlsr_x (pg, tmp, (23 - V_POWF_LOG2_TABLE_BITS)), - V_POWF_LOG2_N - 1); - svuint32_t top = svand_x (pg, tmp, 0xff800000); - svuint32_t iz = svsub_x (pg, vix, top); - svint32_t k - = svasr_x (pg, svreinterpret_s32 (top), (23 - V_POWF_EXP2_TABLE_BITS)); - - /* Compute core in extended precision and return intermediate ylogx results to - handle cases of underflow and underflow in exp. */ + svuint32_t tmp = svsub_x (yint_or_xpos, vix, d->off); + svuint32_t i = svand_x ( + yint_or_xpos, svlsr_x (yint_or_xpos, tmp, (23 - V_POWF_LOG2_TABLE_BITS)), + V_POWF_LOG2_N - 1); + svuint32_t top = svand_x (yint_or_xpos, tmp, 0xff800000); + svuint32_t iz = svsub_x (yint_or_xpos, vix, top); + svint32_t k = svasr_x (yint_or_xpos, svreinterpret_s32 (top), + (23 - V_POWF_EXP2_TABLE_BITS)); + + /* Compute core in extended precision and return intermediate ylogx results + to handle cases of underflow and underflow in exp. */ svfloat32_t ylogx; - svfloat32_t ret = sv_powf_core (pg, i, iz, k, y, sign_bias, &ylogx, d); + svfloat32_t ret + = sv_powf_core (yint_or_xpos, i, iz, k, y, sign_bias, &ylogx, d); /* Handle exp special cases of underflow and overflow. */ - svuint32_t sign = svlsl_x (pg, sign_bias, 20 - V_POWF_EXP2_TABLE_BITS); + svuint32_t sign + = svlsl_x (yint_or_xpos, sign_bias, 20 - V_POWF_EXP2_TABLE_BITS); svfloat32_t ret_oflow - = svreinterpret_f32 (svorr_x (pg, sign, asuint (INFINITY))); + = svreinterpret_f32 (svorr_x (yint_or_xpos, sign, asuint (INFINITY))); svfloat32_t ret_uflow = svreinterpret_f32 (sign); - ret = svsel (svcmple (pg, ylogx, d->uflow_bound), ret_uflow, ret); - ret = svsel (svcmpgt (pg, ylogx, d->oflow_bound), ret_oflow, ret); + ret = svsel (svcmple (yint_or_xpos, ylogx, d->uflow_bound), ret_uflow, ret); + ret = svsel (svcmpgt (yint_or_xpos, ylogx, d->oflow_bound), ret_oflow, ret); /* Cases of finite y and finite negative x. 
*/ - ret = svsel (yisnotint_xisneg, sv_f32 (__builtin_nanf ("")), ret); + ret = svsel (yint_or_xpos, ret, sv_f32 (__builtin_nanf (""))); - if (unlikely (svptest_any (pg, cmp))) - return sv_call_powf_sc (x, y, ret, cmp); + if (unlikely (svptest_any (cmp, cmp))) + return sv_call_powf_sc (x, y, ret); return ret; } -PL_SIG (SV, F, 2, pow) -PL_TEST_ULP (SV_NAME_F2 (pow), 2.06) +TEST_SIG (SV, F, 2, pow) +TEST_ULP (SV_NAME_F2 (pow), 2.08) +TEST_DISABLE_FENV (SV_NAME_F2 (pow)) /* Wide intervals spanning the whole domain but shared between x and y. */ -#define SV_POWF_INTERVAL2(xlo, xhi, ylo, yhi, n) \ - PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), xlo, xhi, ylo, yhi, n) \ - PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), xlo, xhi, -ylo, -yhi, n) \ - PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), -xlo, -xhi, ylo, yhi, n) \ - PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), -xlo, -xhi, -ylo, -yhi, n) +#define SV_POWF_INTERVAL2(xlo, xhi, ylo, yhi, n) \ + TEST_INTERVAL2 (SV_NAME_F2 (pow), xlo, xhi, ylo, yhi, n) \ + TEST_INTERVAL2 (SV_NAME_F2 (pow), xlo, xhi, -ylo, -yhi, n) \ + TEST_INTERVAL2 (SV_NAME_F2 (pow), -xlo, -xhi, ylo, yhi, n) \ + TEST_INTERVAL2 (SV_NAME_F2 (pow), -xlo, -xhi, -ylo, -yhi, n) SV_POWF_INTERVAL2 (0, 0x1p-126, 0, inf, 40000) SV_POWF_INTERVAL2 (0x1p-126, 1, 0, inf, 50000) SV_POWF_INTERVAL2 (1, inf, 0, inf, 50000) @@ -342,10 +344,10 @@ SV_POWF_INTERVAL2 (0x1p-500, 0x1p500, 0x1p-1, 0x1p1, 10000) SV_POWF_INTERVAL2 (0x1p-300, 0x1p-200, 0x1p-20, 0x1p-10, 10000) SV_POWF_INTERVAL2 (0x1p50, 0x1p100, 0x1p-20, 0x1p-10, 10000) /* x is negative, y is odd or even integer, or y is real not integer. 
*/ -PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), -0.0, -10.0, 3.0, 3.0, 10000) -PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), -0.0, -10.0, 4.0, 4.0, 10000) -PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), -0.0, -10.0, 0.0, 10.0, 10000) -PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), 0.0, 10.0, -0.0, -10.0, 10000) +TEST_INTERVAL2 (SV_NAME_F2 (pow), -0.0, -10.0, 3.0, 3.0, 10000) +TEST_INTERVAL2 (SV_NAME_F2 (pow), -0.0, -10.0, 4.0, 4.0, 10000) +TEST_INTERVAL2 (SV_NAME_F2 (pow), -0.0, -10.0, 0.0, 10.0, 10000) +TEST_INTERVAL2 (SV_NAME_F2 (pow), 0.0, 10.0, -0.0, -10.0, 10000) /* |x| is inf, y is odd or even integer, or y is real not integer. */ SV_POWF_INTERVAL2 (inf, inf, 0.5, 0.5, 1) SV_POWF_INTERVAL2 (inf, inf, 1.0, 1.0, 1) @@ -354,7 +356,8 @@ SV_POWF_INTERVAL2 (inf, inf, 3.0, 3.0, 1) /* 0.0^y. */ SV_POWF_INTERVAL2 (0.0, 0.0, 0.0, 0x1p120, 1000) /* 1.0^y. */ -PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), 1.0, 1.0, 0.0, 0x1p-50, 1000) -PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), 1.0, 1.0, 0x1p-50, 1.0, 1000) -PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), 1.0, 1.0, 1.0, 0x1p100, 1000) -PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), 1.0, 1.0, -1.0, -0x1p120, 1000) +TEST_INTERVAL2 (SV_NAME_F2 (pow), 1.0, 1.0, 0.0, 0x1p-50, 1000) +TEST_INTERVAL2 (SV_NAME_F2 (pow), 1.0, 1.0, 0x1p-50, 1.0, 1000) +TEST_INTERVAL2 (SV_NAME_F2 (pow), 1.0, 1.0, 1.0, 0x1p100, 1000) +TEST_INTERVAL2 (SV_NAME_F2 (pow), 1.0, 1.0, -1.0, -0x1p120, 1000) +CLOSE_SVE_ATTR diff --git a/pl/math/sv_sin_3u5.c b/math/aarch64/sve/sin.c similarity index 89% rename from pl/math/sv_sin_3u5.c rename to math/aarch64/sve/sin.c index a81f3fc80f3d77..7e22515ceb7949 100644 --- a/pl/math/sv_sin_3u5.c +++ b/math/aarch64/sve/sin.c @@ -1,13 +1,13 @@ /* * Double-precision SVE sin(x) function. * - * Copyright (c) 2019-2023, Arm Limited. + * Copyright (c) 2019-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "sv_math.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" static const struct data { @@ -90,7 +90,9 @@ svfloat64_t SV_NAME_D1 (sin) (svfloat64_t x, const svbool_t pg) return svreinterpret_f64 (sveor_z (pg, svreinterpret_u64 (y), odd)); } -PL_SIG (SV, D, 1, sin, -3.1, 3.1) -PL_TEST_ULP (SV_NAME_D1 (sin), 2.73) -PL_TEST_SYM_INTERVAL (SV_NAME_D1 (sin), 0, 0x1p23, 1000000) -PL_TEST_SYM_INTERVAL (SV_NAME_D1 (sin), 0x1p23, inf, 10000) +TEST_SIG (SV, D, 1, sin, -3.1, 3.1) +TEST_ULP (SV_NAME_D1 (sin), 2.73) +TEST_DISABLE_FENV (SV_NAME_D1 (sin)) +TEST_SYM_INTERVAL (SV_NAME_D1 (sin), 0, 0x1p23, 1000000) +TEST_SYM_INTERVAL (SV_NAME_D1 (sin), 0x1p23, inf, 10000) +CLOSE_SVE_ATTR diff --git a/pl/math/sv_sincos_3u5.c b/math/aarch64/sve/sincos.c similarity index 72% rename from pl/math/sv_sincos_3u5.c rename to math/aarch64/sve/sincos.c index f73550082d5b82..26b8bb3c6a5a21 100644 --- a/pl/math/sv_sincos_3u5.c +++ b/math/aarch64/sve/sincos.c @@ -1,7 +1,7 @@ /* * Double-precision vector sincos function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ @@ -9,12 +9,22 @@ pre-GLIBC 2.1, or on a non-GNU conforming system, this routine will need to be linked against the scalar sincosf from math/. */ #define _GNU_SOURCE -#include -#undef _GNU_SOURCE -#include "sv_sincos_common.h" #include "sv_math.h" -#include "pl_test.h" +#include "sv_sincos_common.h" +#include "test_defs.h" + +#include + +/* sincos not available for all scalar libm implementations. 
*/ +#ifndef __GLIBC__ +static void +sincos (double x, double *out_sin, double *out_cos) +{ + *out_sin = sin (x); + *out_cos = cos (x); +} +#endif static void NOINLINE special_case (svfloat64_t x, svbool_t special, double *out_sin, @@ -50,12 +60,14 @@ _ZGVsMxvl8l8_sincos (svfloat64_t x, double *out_sin, double *out_cos, special_case (x, special, out_sin, out_cos); } -PL_TEST_ULP (_ZGVsMxv_sincos_sin, 2.73) -PL_TEST_ULP (_ZGVsMxv_sincos_cos, 2.73) +TEST_DISABLE_FENV (_ZGVsMxv_sincos_sin) +TEST_DISABLE_FENV (_ZGVsMxv_sincos_cos) +TEST_ULP (_ZGVsMxv_sincos_sin, 2.73) +TEST_ULP (_ZGVsMxv_sincos_cos, 2.73) #define SV_SINCOS_INTERVAL(lo, hi, n) \ - PL_TEST_INTERVAL (_ZGVsMxv_sincos_sin, lo, hi, n) \ - PL_TEST_INTERVAL (_ZGVsMxv_sincos_cos, lo, hi, n) -SV_SINCOS_INTERVAL (0, 0x1p23, 500000) -SV_SINCOS_INTERVAL (-0, -0x1p23, 500000) + TEST_SYM_INTERVAL (_ZGVsMxv_sincos_sin, lo, hi, n) \ + TEST_SYM_INTERVAL (_ZGVsMxv_sincos_cos, lo, hi, n) +SV_SINCOS_INTERVAL (0, 0x1p-63, 50000) +SV_SINCOS_INTERVAL (0x1p-63, 0x1p23, 500000) SV_SINCOS_INTERVAL (0x1p23, inf, 10000) -SV_SINCOS_INTERVAL (-0x1p23, -inf, 10000) +CLOSE_SVE_ATTR diff --git a/pl/math/sv_sincosf_1u8.c b/math/aarch64/sve/sincosf.c similarity index 72% rename from pl/math/sv_sincosf_1u8.c rename to math/aarch64/sve/sincosf.c index c335de8d3dbb0b..f3e956ee62e232 100644 --- a/pl/math/sv_sincosf_1u8.c +++ b/math/aarch64/sve/sincosf.c @@ -1,7 +1,7 @@ /* * Single-precision vector sincos function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ @@ -9,12 +9,22 @@ pre-GLIBC 2.1, or on a non-GNU conforming system, this routine will need to be linked against the scalar sincosf from math/. 
*/ #define _GNU_SOURCE -#include -#undef _GNU_SOURCE -#include "sv_sincosf_common.h" #include "sv_math.h" -#include "pl_test.h" +#include "sv_sincosf_common.h" +#include "test_defs.h" + +#include + +/* sincos not available for all scalar libm implementations. */ +#ifndef __GLIBC__ +static void +sincosf (float x, float *out_sin, float *out_cos) +{ + *out_sin = sinf (x); + *out_cos = cosf (x); +} +#endif static void NOINLINE special_case (svfloat32_t x, svbool_t special, float *out_sin, float *out_cos) @@ -51,12 +61,14 @@ _ZGVsMxvl4l4_sincosf (svfloat32_t x, float *out_sin, float *out_cos, special_case (x, special, out_sin, out_cos); } -PL_TEST_ULP (_ZGVsMxv_sincosf_sin, 1.17) -PL_TEST_ULP (_ZGVsMxv_sincosf_cos, 1.31) +TEST_DISABLE_FENV (_ZGVsMxv_sincosf_sin) +TEST_DISABLE_FENV (_ZGVsMxv_sincosf_cos) +TEST_ULP (_ZGVsMxv_sincosf_sin, 1.17) +TEST_ULP (_ZGVsMxv_sincosf_cos, 1.31) #define SV_SINCOSF_INTERVAL(lo, hi, n) \ - PL_TEST_INTERVAL (_ZGVsMxv_sincosf_sin, lo, hi, n) \ - PL_TEST_INTERVAL (_ZGVsMxv_sincosf_cos, lo, hi, n) -SV_SINCOSF_INTERVAL (0, 0x1p20, 500000) -SV_SINCOSF_INTERVAL (-0, -0x1p20, 500000) + TEST_SYM_INTERVAL (_ZGVsMxv_sincosf_sin, lo, hi, n) \ + TEST_SYM_INTERVAL (_ZGVsMxv_sincosf_cos, lo, hi, n) +SV_SINCOSF_INTERVAL (0, 0x1p-31, 50000) +SV_SINCOSF_INTERVAL (0x1p-31, 0x1p20, 500000) SV_SINCOSF_INTERVAL (0x1p20, inf, 10000) -SV_SINCOSF_INTERVAL (-0x1p20, -inf, 10000) +CLOSE_SVE_ATTR diff --git a/math/aarch64/sve/sincospi.c b/math/aarch64/sve/sincospi.c new file mode 100644 index 00000000000000..d06ca8cc416522 --- /dev/null +++ b/math/aarch64/sve/sincospi.c @@ -0,0 +1,47 @@ +/* + * Double-precision SVE sincospi(x, *y, *z) function. + * + * Copyright (c) 2024, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "test_defs.h" +#include "mathlib.h" +#include "sv_sincospi_common.h" + +/* Double-precision vector function allowing calculation of both sinpi and + cospi in one function call, using shared argument reduction and polynomials. + Worst-case error for sin is 3.09 ULP: + _ZGVsMxvl8l8_sincospi_sin(0x1.7a41deb4b21e1p+14) got 0x1.fd54d0b327cf1p-1 + want 0x1.fd54d0b327cf4p-1. + Worst-case error for sin is 3.16 ULP: + _ZGVsMxvl8l8_sincospi_cos(-0x1.11e3c7e284adep-5) got 0x1.fd2da484ff3ffp-1 + want 0x1.fd2da484ff402p-1. + */ +void +_ZGVsMxvl8l8_sincospi (svfloat64_t x, double *out_sin, double *out_cos, + svbool_t pg) +{ + const struct sv_sincospi_data *d = ptr_barrier (&sv_sincospi_data); + + svfloat64x2_t sc = sv_sincospi_inline (pg, x, d); + + svst1 (pg, out_sin, svget2 (sc, 0)); + svst1 (pg, out_cos, svget2 (sc, 1)); +} + +#if WANT_TRIGPI_TESTS +TEST_DISABLE_FENV (_ZGVsMxvl8l8_sincospi_sin) +TEST_DISABLE_FENV (_ZGVsMxvl8l8_sincospi_cos) +TEST_ULP (_ZGVsMxvl8l8_sincospi_sin, 2.59) +TEST_ULP (_ZGVsMxvl8l8_sincospi_cos, 2.66) +# define SV_SINCOSPI_INTERVAL(lo, hi, n) \ + TEST_SYM_INTERVAL (_ZGVsMxvl8l8_sincospi_sin, lo, hi, n) \ + TEST_SYM_INTERVAL (_ZGVsMxvl8l8_sincospi_cos, lo, hi, n) +SV_SINCOSPI_INTERVAL (0, 0x1p-63, 10000) +SV_SINCOSPI_INTERVAL (0x1p-63, 0.5, 50000) +SV_SINCOSPI_INTERVAL (0.5, 0x1p53, 50000) +SV_SINCOSPI_INTERVAL (0x1p53, inf, 10000) +#endif +CLOSE_SVE_ATTR diff --git a/math/aarch64/sve/sincospif.c b/math/aarch64/sve/sincospif.c new file mode 100644 index 00000000000000..20476f9346e916 --- /dev/null +++ b/math/aarch64/sve/sincospif.c @@ -0,0 +1,46 @@ +/* + * Single-precision SVE sincospi(x, *y, *z) function. + * + * Copyright (c) 2024, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "test_defs.h" +#include "mathlib.h" +#include "sv_sincospif_common.h" + +/* Single-precision vector function allowing calculation of both sinpi and + cospi in one function call, using shared argument reduction and polynomials. + Worst-case error for sin is 3.04 ULP: + _ZGVsMxvl4l4_sincospif_sin(0x1.b51b8p-2) got 0x1.f28b5ep-1 want + 0x1.f28b58p-1. + Worst-case error for cos is 3.18 ULP: + _ZGVsMxvl4l4_sincospif_cos(0x1.d341a8p-5) got 0x1.f7cd56p-1 want + 0x1.f7cd5p-1. */ +void +_ZGVsMxvl4l4_sincospif (svfloat32_t x, float *out_sin, float *out_cos, + svbool_t pg) +{ + const struct sv_sincospif_data *d = ptr_barrier (&sv_sincospif_data); + + svfloat32x2_t sc = sv_sincospif_inline (pg, x, d); + + svst1 (pg, out_sin, svget2 (sc, 0)); + svst1 (pg, out_cos, svget2 (sc, 1)); +} + +#if WANT_TRIGPI_TESTS +TEST_DISABLE_FENV (_ZGVsMxvl4l4_sincospif_sin) +TEST_DISABLE_FENV (_ZGVsMxvl4l4_sincospif_cos) +TEST_ULP (_ZGVsMxvl4l4_sincospif_sin, 2.54) +TEST_ULP (_ZGVsMxvl4l4_sincospif_cos, 2.68) +# define SV_SINCOSPIF_INTERVAL(lo, hi, n) \ + TEST_SYM_INTERVAL (_ZGVsMxvl4l4_sincospif_sin, lo, hi, n) \ + TEST_SYM_INTERVAL (_ZGVsMxvl4l4_sincospif_cos, lo, hi, n) +SV_SINCOSPIF_INTERVAL (0, 0x1p-31, 10000) +SV_SINCOSPIF_INTERVAL (0x1p-31, 0.5, 50000) +SV_SINCOSPIF_INTERVAL (0.5, 0x1p31, 50000) +SV_SINCOSPIF_INTERVAL (0x1p31, inf, 10000) +#endif +CLOSE_SVE_ATTR diff --git a/pl/math/sv_sinf_1u9.c b/math/aarch64/sve/sinf.c similarity index 89% rename from pl/math/sv_sinf_1u9.c rename to math/aarch64/sve/sinf.c index 675d7b2480f764..62127194d60f3f 100644 --- a/pl/math/sv_sinf_1u9.c +++ b/math/aarch64/sve/sinf.c @@ -1,13 +1,13 @@ /* * Single-precision SVE sin(x) function. * - * Copyright (c) 2019-2023, Arm Limited. + * Copyright (c) 2019-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "sv_math.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" static const struct data { @@ -87,7 +87,9 @@ svfloat32_t SV_NAME_F1 (sin) (svfloat32_t x, const svbool_t pg) return svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (y), sign)); } -PL_SIG (SV, F, 1, sin, -3.1, 3.1) -PL_TEST_ULP (SV_NAME_F1 (sin), 1.40) -PL_TEST_SYM_INTERVAL (SV_NAME_F1 (sin), 0, 0x1p23, 1000000) -PL_TEST_SYM_INTERVAL (SV_NAME_F1 (sin), 0x1p23, inf, 10000) +TEST_SIG (SV, F, 1, sin, -3.1, 3.1) +TEST_ULP (SV_NAME_F1 (sin), 1.40) +TEST_DISABLE_FENV (SV_NAME_F1 (sin)) +TEST_SYM_INTERVAL (SV_NAME_F1 (sin), 0, 0x1p23, 1000000) +TEST_SYM_INTERVAL (SV_NAME_F1 (sin), 0x1p23, inf, 10000) +CLOSE_SVE_ATTR diff --git a/pl/math/sv_sinh_3u.c b/math/aarch64/sve/sinh.c similarity index 88% rename from pl/math/sv_sinh_3u.c rename to math/aarch64/sve/sinh.c index a01e19caecdab0..8a35c1c38525ce 100644 --- a/pl/math/sv_sinh_3u.c +++ b/math/aarch64/sve/sinh.c @@ -1,14 +1,14 @@ /* * Double-precision SVE sinh(x) function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "sv_math.h" -#include "poly_sve_f64.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "sv_poly_f64.h" +#include "test_sig.h" +#include "test_defs.h" static const struct data { @@ -96,8 +96,10 @@ svfloat64_t SV_NAME_D1 (sinh) (svfloat64_t x, svbool_t pg) return svmul_x (pg, t, halfsign); } -PL_SIG (SV, D, 1, sinh, -10.0, 10.0) -PL_TEST_ULP (SV_NAME_D1 (sinh), 2.08) -PL_TEST_SYM_INTERVAL (SV_NAME_D1 (sinh), 0, 0x1p-26, 1000) -PL_TEST_SYM_INTERVAL (SV_NAME_D1 (sinh), 0x1p-26, 0x1p9, 500000) -PL_TEST_SYM_INTERVAL (SV_NAME_D1 (sinh), 0x1p9, inf, 1000) +TEST_SIG (SV, D, 1, sinh, -10.0, 10.0) +TEST_ULP (SV_NAME_D1 (sinh), 2.08) +TEST_DISABLE_FENV (SV_NAME_D1 (sinh)) +TEST_SYM_INTERVAL (SV_NAME_D1 (sinh), 0, 0x1p-26, 1000) +TEST_SYM_INTERVAL (SV_NAME_D1 (sinh), 0x1p-26, 0x1p9, 500000) +TEST_SYM_INTERVAL (SV_NAME_D1 (sinh), 0x1p9, inf, 1000) +CLOSE_SVE_ATTR diff --git a/pl/math/sv_sinhf_2u3.c b/math/aarch64/sve/sinhf.c similarity index 78% rename from pl/math/sv_sinhf_2u3.c rename to math/aarch64/sve/sinhf.c index e34ecf378ad3bc..82b7ee4427806e 100644 --- a/pl/math/sv_sinhf_2u3.c +++ b/math/aarch64/sve/sinhf.c @@ -1,14 +1,13 @@ /* * Single-precision SVE sinh(x) function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "sv_math.h" -#include "pl_sig.h" -#include "pl_test.h" - +#include "test_sig.h" +#include "test_defs.h" #include "sv_expm1f_inline.h" static const struct data @@ -54,11 +53,13 @@ svfloat32_t SV_NAME_F1 (sinh) (svfloat32_t x, const svbool_t pg) if (unlikely (svptest_any (pg, special))) return special_case (x, svmul_x (pg, t, halfsign), special); - return svmul_x (pg, t, halfsign); + return svmul_x (svptrue_b32 (), t, halfsign); } -PL_SIG (SV, F, 1, sinh, -10.0, 10.0) -PL_TEST_ULP (SV_NAME_F1 (sinh), 1.76) -PL_TEST_SYM_INTERVAL (SV_NAME_F1 (sinh), 0, 0x1.6a09e8p-32, 1000) -PL_TEST_SYM_INTERVAL (SV_NAME_F1 (sinh), 0x1.6a09e8p-32, 0x42b0c0a7, 100000) -PL_TEST_SYM_INTERVAL (SV_NAME_F1 (sinh), 0x42b0c0a7, inf, 1000) +TEST_SIG (SV, F, 1, sinh, -10.0, 10.0) +TEST_ULP (SV_NAME_F1 (sinh), 1.76) +TEST_DISABLE_FENV (SV_NAME_F1 (sinh)) +TEST_SYM_INTERVAL (SV_NAME_F1 (sinh), 0, 0x1.6a09e8p-32, 1000) +TEST_SYM_INTERVAL (SV_NAME_F1 (sinh), 0x1.6a09e8p-32, 0x42b0c0a7, 100000) +TEST_SYM_INTERVAL (SV_NAME_F1 (sinh), 0x42b0c0a7, inf, 1000) +CLOSE_SVE_ATTR diff --git a/pl/math/sv_sinpi_3u1.c b/math/aarch64/sve/sinpi.c similarity index 66% rename from pl/math/sv_sinpi_3u1.c rename to math/aarch64/sve/sinpi.c index c9f23da1b19b54..8fad3678b17294 100644 --- a/pl/math/sv_sinpi_3u1.c +++ b/math/aarch64/sve/sinpi.c @@ -1,19 +1,19 @@ /* * Double-precision SVE sinpi(x) function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "mathlib.h" #include "sv_math.h" -#include "pl_sig.h" -#include "pl_test.h" -#include "poly_sve_f64.h" +#include "mathlib.h" +#include "test_sig.h" +#include "test_defs.h" +#include "sv_poly_f64.h" static const struct data { - double poly[10]; + double poly[10], range_val; } data = { /* Polynomial coefficients generated using Remez algorithm, see sinpi.sollya for details. 
*/ @@ -21,6 +21,7 @@ static const struct data -0x1.32d2cce62dc33p-1, 0x1.507834891188ep-4, -0x1.e30750a28c88ep-8, 0x1.e8f48308acda4p-12, -0x1.6fc0032b3c29fp-16, 0x1.af86ae521260bp-21, -0x1.012a9870eeb7dp-25 }, + .range_val = 0x1p63, }; /* A fast SVE implementation of sinpi. @@ -37,8 +38,9 @@ svfloat64_t SV_NAME_D1 (sinpi) (svfloat64_t x, const svbool_t pg) svfloat64_t r = svsub_x (pg, x, n); /* Result should be negated based on if n is odd or not. */ - svuint64_t intn = svreinterpret_u64 (svcvt_s64_x (pg, n)); - svuint64_t sign = svlsl_z (pg, intn, 63); + svbool_t cmp = svaclt (pg, x, d->range_val); + svuint64_t intn = svreinterpret_u64 (svcvt_s64_z (pg, n)); + svuint64_t sign = svlsl_z (cmp, intn, 63); /* y = sin(r). */ svfloat64_t r2 = svmul_x (pg, r, r); @@ -49,9 +51,12 @@ svfloat64_t SV_NAME_D1 (sinpi) (svfloat64_t x, const svbool_t pg) return svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (y), sign)); } -PL_SIG (SV, D, 1, sinpi, -0.9, 0.9) -PL_TEST_ULP (SV_NAME_D1 (sinpi), 2.61) -PL_TEST_SYM_INTERVAL (SV_NAME_D1 (sinpi), 0, 0x1p-63, 5000) -PL_TEST_SYM_INTERVAL (SV_NAME_D1 (sinpi), 0x1p-63, 0.5, 10000) -PL_TEST_SYM_INTERVAL (SV_NAME_D1 (sinpi), 0.5, 0x1p51, 10000) -PL_TEST_SYM_INTERVAL (SV_NAME_D1 (sinpi), 0x1p51, inf, 10000) +#if WANT_TRIGPI_TESTS +TEST_ULP (SV_NAME_D1 (sinpi), 2.61) +TEST_DISABLE_FENV (SV_NAME_D1 (sinpi)) +TEST_SYM_INTERVAL (SV_NAME_D1 (sinpi), 0, 0x1p-63, 5000) +TEST_SYM_INTERVAL (SV_NAME_D1 (sinpi), 0x1p-63, 0.5, 10000) +TEST_SYM_INTERVAL (SV_NAME_D1 (sinpi), 0.5, 0x1p51, 10000) +TEST_SYM_INTERVAL (SV_NAME_D1 (sinpi), 0x1p51, inf, 10000) +#endif +CLOSE_SVE_ATTR diff --git a/pl/math/sv_sinpif_2u5.c b/math/aarch64/sve/sinpif.c similarity index 61% rename from pl/math/sv_sinpif_2u5.c rename to math/aarch64/sve/sinpif.c index ac3f924bed682c..b91768a29cb61c 100644 --- a/pl/math/sv_sinpif_2u5.c +++ b/math/aarch64/sve/sinpif.c @@ -1,23 +1,24 @@ /* * Single-precision SVE sinpi(x) function. * - * Copyright (c) 2023, Arm Limited. 
+ * Copyright (c) 2023-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "mathlib.h" #include "sv_math.h" -#include "pl_sig.h" -#include "pl_test.h" -#include "poly_sve_f32.h" +#include "mathlib.h" +#include "test_sig.h" +#include "test_defs.h" +#include "sv_poly_f32.h" static const struct data { - float poly[6]; + float poly[6], range_val; } data = { /* Taylor series coefficents for sin(pi * x). */ .poly = { 0x1.921fb6p1f, -0x1.4abbcep2f, 0x1.466bc6p1f, -0x1.32d2ccp-1f, 0x1.50783p-4f, -0x1.e30750p-8f }, + .range_val = 0x1p31, }; /* A fast SVE implementation of sinpif. @@ -34,8 +35,9 @@ svfloat32_t SV_NAME_F1 (sinpi) (svfloat32_t x, const svbool_t pg) svfloat32_t r = svsub_x (pg, x, n); /* Result should be negated based on if n is odd or not. */ - svuint32_t intn = svreinterpret_u32 (svcvt_s32_x (pg, n)); - svuint32_t sign = svlsl_z (pg, intn, 31); + svbool_t cmp = svaclt (pg, x, d->range_val); + svuint32_t intn = svreinterpret_u32 (svcvt_s32_z (pg, n)); + svuint32_t sign = svlsl_z (cmp, intn, 31); /* y = sin(r). 
*/ svfloat32_t r2 = svmul_x (pg, r, r); @@ -45,9 +47,12 @@ svfloat32_t SV_NAME_F1 (sinpi) (svfloat32_t x, const svbool_t pg) return svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (y), sign)); } -PL_SIG (SV, F, 1, sinpi, -0.9, 0.9) -PL_TEST_ULP (SV_NAME_F1 (sinpi), 1.99) -PL_TEST_SYM_INTERVAL (SV_NAME_F1 (sinpi), 0, 0x1p-31, 5000) -PL_TEST_SYM_INTERVAL (SV_NAME_F1 (sinpi), 0x1p-31, 0.5, 10000) -PL_TEST_SYM_INTERVAL (SV_NAME_F1 (sinpi), 0.5, 0x1p22f, 10000) -PL_TEST_SYM_INTERVAL (SV_NAME_F1 (sinpi), 0x1p22f, inf, 10000) +#if WANT_TRIGPI_TESTS +TEST_ULP (SV_NAME_F1 (sinpi), 1.99) +TEST_DISABLE_FENV (SV_NAME_F1 (sinpi)) +TEST_SYM_INTERVAL (SV_NAME_F1 (sinpi), 0, 0x1p-31, 5000) +TEST_SYM_INTERVAL (SV_NAME_F1 (sinpi), 0x1p-31, 0.5, 10000) +TEST_SYM_INTERVAL (SV_NAME_F1 (sinpi), 0.5, 0x1p22f, 10000) +TEST_SYM_INTERVAL (SV_NAME_F1 (sinpi), 0x1p22f, inf, 10000) +#endif +CLOSE_SVE_ATTR diff --git a/math/aarch64/sve/sv_expf_inline.h b/math/aarch64/sve/sv_expf_inline.h new file mode 100644 index 00000000000000..6054e65bb202d8 --- /dev/null +++ b/math/aarch64/sve/sv_expf_inline.h @@ -0,0 +1,66 @@ +/* + * SVE helper for single-precision routines which calculate exp(x) and do + * not need special-case handling + * + * Copyright (c) 2023-2025, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef MATH_SV_EXPF_INLINE_H +#define MATH_SV_EXPF_INLINE_H + +#include "sv_math.h" +#include "test_sig.h" +#include "test_defs.h" + +struct sv_expf_data +{ + float c1, c3, inv_ln2; + float ln2_lo, c0, c2, c4; + float ln2_hi, shift; +}; + +/* Coefficients copied from the polynomial in AdvSIMD variant, reversed for + compatibility with polynomial helpers. Shift is 1.5*2^17 + 127. */ +#define SV_EXPF_DATA \ + { \ + /* Coefficients copied from the polynomial in AdvSIMD variant. 
*/ \ + .c0 = 0x1.ffffecp-1f, .c1 = 0x1.fffdb6p-2f, .c2 = 0x1.555e66p-3f, \ + .c3 = 0x1.573e2ep-5f, .c4 = 0x1.0e4020p-7f, .inv_ln2 = 0x1.715476p+0f, \ + .ln2_hi = 0x1.62e4p-1f, .ln2_lo = 0x1.7f7d1cp-20f, \ + .shift = 0x1.803f8p17f, \ + } + +#define C(i) sv_f32 (d->poly[i]) + +static inline svfloat32_t +expf_inline (svfloat32_t x, const svbool_t pg, const struct sv_expf_data *d) +{ + /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] + x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ + + svfloat32_t lane_consts = svld1rq (svptrue_b32 (), &d->ln2_lo); + + /* n = round(x/(ln2/N)). */ + svfloat32_t z = svmad_x (pg, sv_f32 (d->inv_ln2), x, d->shift); + svfloat32_t n = svsub_x (pg, z, d->shift); + + /* r = x - n*ln2/N. */ + svfloat32_t r = svmsb_x (pg, sv_f32 (d->ln2_hi), n, x); + r = svmls_lane (r, n, lane_consts, 0); + + /* scale = 2^(n/N). */ + svfloat32_t scale = svexpa (svreinterpret_u32 (z)); + + /* poly(r) = exp(r) - 1 ~= C0 r + C1 r^2 + C2 r^3 + C3 r^4 + C4 r^5. */ + svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, lane_consts, 2); + svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, lane_consts, 3); + svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r); + svfloat32_t p14 = svmla_x (pg, p12, p34, r2); + svfloat32_t p0 = svmul_lane (r, lane_consts, 1); + svfloat32_t poly = svmla_x (pg, p0, r2, p14); + + return svmla_x (pg, scale, scale, poly); +} + +#endif // MATH_SV_EXPF_INLINE_H diff --git a/pl/math/sv_expm1f_inline.h b/math/aarch64/sve/sv_expm1f_inline.h similarity index 65% rename from pl/math/sv_expm1f_inline.h rename to math/aarch64/sve/sv_expm1f_inline.h index a6e2050ff4a640..35892f519690eb 100644 --- a/pl/math/sv_expm1f_inline.h +++ b/math/aarch64/sve/sv_expm1f_inline.h @@ -2,12 +2,12 @@ * SVE helper for single-precision routines which calculate exp(x) - 1 and do * not need special-case handling * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#ifndef PL_MATH_SV_EXPM1F_INLINE_H -#define PL_MATH_SV_EXPM1F_INLINE_H +#ifndef MATH_SV_EXPM1F_INLINE_H +#define MATH_SV_EXPM1F_INLINE_H #include "sv_math.h" @@ -16,21 +16,18 @@ struct sv_expm1f_data /* These 4 are grouped together so they can be loaded as one quadword, then used with _lane forms of svmla/svmls. */ float32_t c2, c4, ln2_hi, ln2_lo; - float32_t c0, c1, c3, inv_ln2, shift; + float c0, inv_ln2, c1, c3, special_bound; }; /* Coefficients generated using fpminimax. */ #define SV_EXPM1F_DATA \ { \ - .c0 = 0x1.fffffep-2, .c1 = 0x1.5554aep-3, .c2 = 0x1.555736p-5, \ - .c3 = 0x1.12287cp-7, .c4 = 0x1.6b55a2p-10, \ + .c0 = 0x1.fffffep-2, .c1 = 0x1.5554aep-3, .inv_ln2 = 0x1.715476p+0f, \ + .c2 = 0x1.555736p-5, .c3 = 0x1.12287cp-7, \ \ - .shift = 0x1.8p23f, .inv_ln2 = 0x1.715476p+0f, .ln2_hi = 0x1.62e4p-1f, \ - .ln2_lo = 0x1.7f7d1cp-20f, \ + .c4 = 0x1.6b55a2p-10, .ln2_lo = 0x1.7f7d1cp-20f, .ln2_hi = 0x1.62e4p-1f, \ } -#define C(i) sv_f32 (d->c##i) - static inline svfloat32_t expm1f_inline (svfloat32_t x, svbool_t pg, const struct sv_expm1f_data *d) { @@ -44,9 +41,8 @@ expm1f_inline (svfloat32_t x, svbool_t pg, const struct sv_expm1f_data *d) and f = x - i * ln2, then f is in [-ln2/2, ln2/2]. exp(x) - 1 = 2^i * (expm1(f) + 1) - 1 where 2^i is exact because i is an integer. */ - svfloat32_t j = svmla_x (pg, sv_f32 (d->shift), x, d->inv_ln2); - j = svsub_x (pg, j, d->shift); - svint32_t i = svcvt_s32_x (pg, j); + svfloat32_t j = svmul_x (svptrue_b32 (), x, d->inv_ln2); + j = svrinta_x (pg, j); svfloat32_t f = svmls_lane (x, j, lane_constants, 2); f = svmls_lane (f, j, lane_constants, 3); @@ -56,18 +52,18 @@ expm1f_inline (svfloat32_t x, svbool_t pg, const struct sv_expm1f_data *d) x + ax^2 + bx^3 + cx^4 .... So we calculate the polynomial P(f) = a + bf + cf^2 + ... and assemble the approximation expm1(f) ~= f + f^2 * P(f). 
*/ - svfloat32_t p12 = svmla_lane (C (1), f, lane_constants, 0); - svfloat32_t p34 = svmla_lane (C (3), f, lane_constants, 1); - svfloat32_t f2 = svmul_x (pg, f, f); + svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), f, lane_constants, 0); + svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), f, lane_constants, 1); + svfloat32_t f2 = svmul_x (svptrue_b32 (), f, f); svfloat32_t p = svmla_x (pg, p12, f2, p34); - p = svmla_x (pg, C (0), f, p); + p = svmla_x (pg, sv_f32 (d->c0), f, p); p = svmla_x (pg, f, f2, p); /* Assemble the result. expm1(x) ~= 2^i * (p + 1) - 1 Let t = 2^i. */ - svfloat32_t t = svscale_x (pg, sv_f32 (1), i); - return svmla_x (pg, svsub_x (pg, t, 1), p, t); + svfloat32_t t = svscale_x (pg, sv_f32 (1.0f), svcvt_s32_x (pg, j)); + return svmla_x (pg, svsub_x (pg, t, 1.0f), p, t); } -#endif // PL_MATH_SV_EXPM1F_INLINE_H \ No newline at end of file +#endif // MATH_SV_EXPM1F_INLINE_H diff --git a/pl/math/sv_log1p_inline.h b/math/aarch64/sve/sv_log1p_inline.h similarity index 90% rename from pl/math/sv_log1p_inline.h rename to math/aarch64/sve/sv_log1p_inline.h index 983f8e1b04134b..86a5bb1456f688 100644 --- a/pl/math/sv_log1p_inline.h +++ b/math/aarch64/sve/sv_log1p_inline.h @@ -2,14 +2,14 @@ * Helper for SVE double-precision routines which calculate log(1 + x) and do * not need special-case handling * - * Copyright (c) 2022-2023, Arm Limited. + * Copyright (c) 2022-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#ifndef PL_MATH_SV_LOG1P_INLINE_H -#define PL_MATH_SV_LOG1P_INLINE_H +#ifndef MATH_SV_LOG1P_INLINE_H +#define MATH_SV_LOG1P_INLINE_H #include "sv_math.h" -#include "poly_sve_f64.h" +#include "sv_poly_f64.h" static const struct sv_log1p_data { @@ -67,8 +67,8 @@ sv_log1p_inline (svfloat64_t x, const svbool_t pg) svfloat64_t cm; #ifndef WANT_SV_LOG1P_K0_SHORTCUT -#error \ - "Cannot use sv_log1p_inline.h without specifying whether you need the k0 shortcut for greater accuracy close to 0" +# error \ + "Cannot use sv_log1p_inline.h without specifying whether you need the k0 shortcut for greater accuracy close to 0" #elif WANT_SV_LOG1P_K0_SHORTCUT /* Shortcut if k is 0 - set correction term to 0 and f to x. The result is that the approximation is solely the polynomial. */ @@ -93,4 +93,4 @@ sv_log1p_inline (svfloat64_t x, const svbool_t pg) return svmla_x (pg, svadd_x (pg, ylo, yhi), f2, p); } -#endif // PL_MATH_SV_LOG1P_INLINE_H +#endif // MATH_SV_LOG1P_INLINE_H diff --git a/math/aarch64/sve/sv_log1pf_inline.h b/math/aarch64/sve/sv_log1pf_inline.h new file mode 100644 index 00000000000000..238079c61a5b03 --- /dev/null +++ b/math/aarch64/sve/sv_log1pf_inline.h @@ -0,0 +1,83 @@ +/* + * Helper for SVE routines which calculate log(1 + x) and do not + * need special-case handling + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef MATH_SV_LOG1PF_INLINE_H +#define MATH_SV_LOG1PF_INLINE_H + +#define SignExponentMask 0xff800000 + +static const struct sv_log1pf_data +{ + float c0, c2, c4, c6; + float c1, c3, c5, c7; + float ln2, exp_bias, quarter; + uint32_t four, three_quarters; +} sv_log1pf_data = { + /* Do not store first term of polynomial, which is -0.5, as + this can be fmov-ed directly instead of including it in + the main load-and-mla polynomial schedule. 
*/ + .c0 = 0x1.5555aap-2f, .c1 = -0x1.000038p-2f, .c2 = 0x1.99675cp-3f, + .c3 = -0x1.54ef78p-3f, .c4 = 0x1.28a1f4p-3f, .c5 = -0x1.0da91p-3f, + .c6 = 0x1.abcb6p-4f, .c7 = -0x1.6f0d5ep-5f, .ln2 = 0x1.62e43p-1f, + .exp_bias = 0x1p-23f, .quarter = 0x1p-2f, .four = 0x40800000, + .three_quarters = 0x3f400000, +}; + +static inline svfloat32_t +sv_log1pf_inline (svfloat32_t x, svbool_t pg) +{ + const struct sv_log1pf_data *d = ptr_barrier (&sv_log1pf_data); + + /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m + is in [-0.25, 0.5]): + log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2). + + We approximate log1p(m) with a polynomial, then scale by + k*log(2). Instead of doing this directly, we use an intermediate + scale factor s = 4*k*log(2) to ensure the scale is representable + as a normalised fp32 number. */ + svfloat32_t m = svadd_x (pg, x, 1); + + /* Choose k to scale x to the range [-1/4, 1/2]. */ + svint32_t k + = svand_x (pg, svsub_x (pg, svreinterpret_s32 (m), d->three_quarters), + sv_s32 (SignExponentMask)); + + /* Scale x by exponent manipulation. */ + svfloat32_t m_scale = svreinterpret_f32 ( + svsub_x (pg, svreinterpret_u32 (x), svreinterpret_u32 (k))); + + /* Scale up to ensure that the scale factor is representable as normalised + fp32 number, and scale m down accordingly. */ + svfloat32_t s = svreinterpret_f32 (svsubr_x (pg, k, d->four)); + svfloat32_t fconst = svld1rq_f32 (svptrue_b32 (), &d->ln2); + m_scale = svadd_x (pg, m_scale, svmla_lane_f32 (sv_f32 (-1), s, fconst, 2)); + + /* Evaluate polynomial on reduced interval. 
*/ + svfloat32_t ms2 = svmul_x (svptrue_b32 (), m_scale, m_scale); + + svfloat32_t c1357 = svld1rq_f32 (svptrue_b32 (), &d->c1); + svfloat32_t p01 = svmla_lane_f32 (sv_f32 (d->c0), m_scale, c1357, 0); + svfloat32_t p23 = svmla_lane_f32 (sv_f32 (d->c2), m_scale, c1357, 1); + svfloat32_t p45 = svmla_lane_f32 (sv_f32 (d->c4), m_scale, c1357, 2); + svfloat32_t p67 = svmla_lane_f32 (sv_f32 (d->c6), m_scale, c1357, 3); + + svfloat32_t p = svmla_x (pg, p45, p67, ms2); + p = svmla_x (pg, p23, p, ms2); + p = svmla_x (pg, p01, p, ms2); + + p = svmad_x (pg, m_scale, p, -0.5); + p = svmla_x (pg, m_scale, m_scale, svmul_x (pg, m_scale, p)); + + /* The scale factor to be applied back at the end - by multiplying float(k) + by 2^-23 we get the unbiased exponent of k. */ + svfloat32_t scale_back = svmul_lane_f32 (svcvt_f32_x (pg, k), fconst, 1); + return svmla_lane_f32 (p, scale_back, fconst, 0); +} + +#endif // MATH_SV_LOG1PF_INLINE_H diff --git a/math/aarch64/sve/sv_log_inline.h b/math/aarch64/sve/sv_log_inline.h new file mode 100644 index 00000000000000..a1b169a0b72794 --- /dev/null +++ b/math/aarch64/sve/sv_log_inline.h @@ -0,0 +1,83 @@ +/* + * Double-precision vector log(x) function - inline version + * + * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "math_config.h" + +#ifndef SV_LOG_INLINE_POLY_ORDER +# error Cannot use inline log helper without specifying poly order (options are 4 or 5) +#endif + +#if SV_LOG_INLINE_POLY_ORDER == 4 +# define POLY \ + { \ + -0x1.ffffffffcbad3p-2, 0x1.555555578ed68p-2, -0x1.0000d3a1e7055p-2, \ + 0x1.999392d02a63ep-3 \ + } +#elif SV_LOG_INLINE_POLY_ORDER == 5 +# define POLY \ + { \ + -0x1.ffffffffffff7p-2, 0x1.55555555170d4p-2, -0x1.0000000399c27p-2, \ + 0x1.999b2e90e94cap-3, -0x1.554e550bd501ep-3 \ + } +#else +# error Can only choose order 4 or 5 for log poly +#endif + +struct sv_log_inline_data +{ + double poly[SV_LOG_INLINE_POLY_ORDER]; + double ln2; + uint64_t off, sign_exp_mask; +}; + +#define SV_LOG_CONSTANTS \ + { \ + .poly = POLY, .ln2 = 0x1.62e42fefa39efp-1, \ + .sign_exp_mask = 0xfff0000000000000, .off = 0x3fe6900900000000 \ + } + +#define P(i) sv_f64 (d->poly[i]) +#define N (1 << V_LOG_TABLE_BITS) + +static inline svfloat64_t +sv_log_inline (svbool_t pg, svfloat64_t x, const struct sv_log_inline_data *d) +{ + svuint64_t ix = svreinterpret_u64 (x); + + /* x = 2^k z; where z is in range [Off,2*Off) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + svuint64_t tmp = svsub_x (pg, ix, d->off); + /* Calculate table index = (tmp >> (52 - V_LOG_TABLE_BITS)) % N. + The actual value of i is double this due to table layout. */ + svuint64_t i + = svand_x (pg, svlsr_x (pg, tmp, (51 - V_LOG_TABLE_BITS)), (N - 1) << 1); + svint64_t k + = svasr_x (pg, svreinterpret_s64 (tmp), 52); /* Arithmetic shift. */ + svuint64_t iz = svsub_x (pg, ix, svand_x (pg, tmp, 0xfffULL << 52)); + svfloat64_t z = svreinterpret_f64 (iz); + + /* Lookup in 2 global lists (length N). 
*/ + svfloat64_t invc = svld1_gather_index (pg, &__v_log_data.table[0].invc, i); + svfloat64_t logc = svld1_gather_index (pg, &__v_log_data.table[0].logc, i); + + /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */ + svfloat64_t r = svmad_x (pg, invc, z, -1); + svfloat64_t kd = svcvt_f64_x (pg, k); + /* hi = r + log(c) + k*Ln2. */ + svfloat64_t hi = svmla_x (pg, svadd_x (pg, logc, r), kd, __v_log_data.ln2); + /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */ + svfloat64_t r2 = svmul_x (pg, r, r); + svfloat64_t y = svmla_x (pg, P (2), r, P (3)); + svfloat64_t p = svmla_x (pg, P (0), r, P (1)); +#if SV_LOG_INLINE_POLY_ORDER == 5 + y = svmla_x (pg, P (4), r2); +#endif + y = svmla_x (pg, p, r2, y); + return svmla_x (pg, hi, r2, y); +} diff --git a/pl/math/sv_math.h b/math/aarch64/sve/sv_math.h similarity index 72% rename from pl/math/sv_math.h rename to math/aarch64/sve/sv_math.h index f67fe91803babf..db688a89303270 100644 --- a/pl/math/sv_math.h +++ b/math/aarch64/sve/sv_math.h @@ -1,24 +1,38 @@ /* * Wrapper functions for SVE ACLE. * - * Copyright (c) 2019-2023, Arm Limited. + * Copyright (c) 2019-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef SV_MATH_H #define SV_MATH_H -#ifndef WANT_VMATH -/* Enable the build of vector math code. */ -# define WANT_VMATH 1 +/* Enable SVE in this translation unit. Note, because this is 'pushed' in + clang, any file including sv_math.h will have to pop it back off again by + ending the source file with CLOSE_SVE_ATTR. It is important that sv_math.h + is included first so that all functions have the target attribute. 
*/ +#ifdef __clang__ +# pragma clang attribute push(__attribute__((target("sve"))), \ + apply_to = any(function)) +# define CLOSE_SVE_ATTR _Pragma("clang attribute pop") +#else +# pragma GCC target("+sve") +# define CLOSE_SVE_ATTR #endif -#if WANT_VMATH +#include +#include -# include -# include +#include "math_config.h" -# include "math_config.h" +#define SV_NAME_F1(fun) _ZGVsMxv_##fun##f +#define SV_NAME_D1(fun) _ZGVsMxv_##fun +#define SV_NAME_F2(fun) _ZGVsMxvv_##fun##f +#define SV_NAME_D2(fun) _ZGVsMxvv_##fun +#define SV_NAME_F1_L1(fun) _ZGVsMxvl4_##fun##f +#define SV_NAME_D1_L1(fun) _ZGVsMxvl8_##fun +#define SV_NAME_F1_L2(fun) _ZGVsMxvl4l4_##fun##f /* Double precision. */ static inline svint64_t @@ -129,5 +143,3 @@ sv_call2_f32 (float (*f) (float, float), svfloat32_t x1, svfloat32_t x2, return y; } #endif - -#endif diff --git a/pl/math/poly_sve_f32.h b/math/aarch64/sve/sv_poly_f32.h similarity index 78% rename from pl/math/poly_sve_f32.h rename to math/aarch64/sve/sv_poly_f32.h index a97e2ced027aeb..2d73014a4b450f 100644 --- a/pl/math/poly_sve_f32.h +++ b/math/aarch64/sve/sv_poly_f32.h @@ -2,12 +2,12 @@ * Helpers for evaluating polynomials on single-precision SVE input, using * various schemes. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#ifndef PL_MATH_POLY_SVE_F32_H -#define PL_MATH_POLY_SVE_F32_H +#ifndef MATH_POLY_SVE_F32_H +#define MATH_POLY_SVE_F32_H #include @@ -17,7 +17,7 @@ #define STYPE float #define VWRAP(f) sv_##f##_f32_x #define DUP svdup_f32 -#include "poly_sve_generic.h" +#include "sv_poly_generic.h" #undef DUP #undef VWRAP #undef STYPE diff --git a/pl/math/poly_sve_f64.h b/math/aarch64/sve/sv_poly_f64.h similarity index 78% rename from pl/math/poly_sve_f64.h rename to math/aarch64/sve/sv_poly_f64.h index 5fb14b3c1700b9..f92be9bf8e9c86 100644 --- a/pl/math/poly_sve_f64.h +++ b/math/aarch64/sve/sv_poly_f64.h @@ -2,12 +2,12 @@ * Helpers for evaluating polynomials on double-precision SVE input, using * various schemes. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#ifndef PL_MATH_POLY_SVE_F64_H -#define PL_MATH_POLY_SVE_F64_H +#ifndef MATH_POLY_SVE_F64_H +#define MATH_POLY_SVE_F64_H #include @@ -17,7 +17,7 @@ #define STYPE double #define VWRAP(f) sv_##f##_f64_x #define DUP svdup_f64 -#include "poly_sve_generic.h" +#include "sv_poly_generic.h" #undef DUP #undef VWRAP #undef STYPE diff --git a/pl/math/poly_sve_generic.h b/math/aarch64/sve/sv_poly_generic.h similarity index 91% rename from pl/math/poly_sve_generic.h rename to math/aarch64/sve/sv_poly_generic.h index b568e4cddff38a..a1fc59baa8d3ba 100644 --- a/pl/math/poly_sve_generic.h +++ b/math/aarch64/sve/sv_poly_generic.h @@ -2,7 +2,7 @@ * Helpers for evaluating polynomials with various schemes - specific to SVE * but precision-agnostic. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ @@ -299,3 +299,33 @@ static inline VTYPE VWRAP (pw_horner_18) (svbool_t pg, VTYPE x, VTYPE x2, VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); return svmla_x (pg, p01, x2, p2_18); } + +static inline VTYPE VWRAP (lw_pw_horner_5) (svbool_t pg, VTYPE x, VTYPE x2, + const STYPE *poly_even, + const STYPE *poly_odd) +{ + VTYPE c13 = svld1rq (pg, poly_odd); + + VTYPE p01 = svmla_lane (DUP (poly_even[0]), x, c13, 0); + VTYPE p23 = svmla_lane (DUP (poly_even[1]), x, c13, 1); + VTYPE p45 = svmla_x (pg, DUP (poly_even[2]), x, poly_odd[2]); + + VTYPE p; + p = svmla_x (pg, p23, x2, p45); + p = svmla_x (pg, p01, x2, p); + return p; +} +static inline VTYPE VWRAP (lw_pw_horner_9) (svbool_t pg, VTYPE x, VTYPE x2, + const STYPE *poly_even, + const STYPE *poly_odd) +{ + VTYPE c13 = svld1rq (pg, poly_odd); + + VTYPE p49 = VWRAP (lw_pw_horner_5) (pg, x, x2, poly_even + 2, poly_odd + 2); + VTYPE p23 = svmla_lane (DUP (poly_even[1]), x, c13, 1); + + VTYPE p29 = svmla_x (pg, p23, x2, p49); + VTYPE p01 = svmla_lane (DUP (poly_even[0]), x, c13, 0); + + return svmla_x (pg, p01, x2, p29); +} diff --git a/pl/math/sv_sincos_common.h b/math/aarch64/sve/sv_sincos_common.h similarity index 97% rename from pl/math/sv_sincos_common.h rename to math/aarch64/sve/sv_sincos_common.h index f7b58deb90bdfe..2a537da157b04b 100644 --- a/pl/math/sv_sincos_common.h +++ b/math/aarch64/sve/sv_sincos_common.h @@ -1,12 +1,12 @@ /* * Core approximation for double-precision vector sincos * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "sv_math.h" -#include "poly_sve_f64.h" +#include "sv_poly_f64.h" static const struct sv_sincos_data { diff --git a/pl/math/sv_sincosf_common.h b/math/aarch64/sve/sv_sincosf_common.h similarity index 98% rename from pl/math/sv_sincosf_common.h rename to math/aarch64/sve/sv_sincosf_common.h index 714e996443b3d1..bda89ed2468074 100644 --- a/pl/math/sv_sincosf_common.h +++ b/math/aarch64/sve/sv_sincosf_common.h @@ -1,7 +1,7 @@ /* * Core approximation for single-precision vector sincos * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/math/aarch64/sve/sv_sincospi_common.h b/math/aarch64/sve/sv_sincospi_common.h new file mode 100644 index 00000000000000..672ebbc8e855f4 --- /dev/null +++ b/math/aarch64/sve/sv_sincospi_common.h @@ -0,0 +1,76 @@ +/* + * Core approximation for double-precision SVE sincospi + * + * Copyright (c) 2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "sv_poly_f64.h" + +static const struct sv_sincospi_data +{ + double c0, c2, c4, c6, c8; + double c1, c3, c5, c7, c9; + double range_val; +} sv_sincospi_data = { + /* Polynomial coefficients generated using Remez algorithm, + see sinpi.sollya for details. */ + .c0 = 0x1.921fb54442d184p1, + .c1 = -0x1.4abbce625be53p2, + .c2 = 0x1.466bc6775ab16p1, + .c3 = -0x1.32d2cce62dc33p-1, + .c4 = 0x1.507834891188ep-4, + .c5 = -0x1.e30750a28c88ep-8, + .c6 = 0x1.e8f48308acda4p-12, + .c7 = -0x1.6fc0032b3c29fp-16, + .c8 = 0x1.af86ae521260bp-21, + .c9 = -0x1.012a9870eeb7dp-25, + /* Exclusive upper bound for a signed integer. */ + .range_val = 0x1p63 +}; + +/* Double-precision vector function allowing calculation of both sinpi and + cospi in one function call, using shared argument reduction and polynomials. 
+ Worst-case error for sin is 3.09 ULP: + _ZGVsMxvl8l8_sincospi_sin(0x1.7a41deb4b21e1p+14) got 0x1.fd54d0b327cf1p-1 + want 0x1.fd54d0b327cf4p-1. + Worst-case error for cos is 3.16 ULP: + _ZGVsMxvl8l8_sincospi_cos(-0x1.11e3c7e284adep-5) got 0x1.fd2da484ff3ffp-1 + want 0x1.fd2da484ff402p-1. + */ +static inline svfloat64x2_t +sv_sincospi_inline (svbool_t pg, svfloat64_t x, + const struct sv_sincospi_data *d) +{ + const svbool_t pt = svptrue_b64 (); + + /* r = x - rint(x). */ + /* pt hints unpredicated instruction. */ + svfloat64_t rx = svrinta_x (pg, x); + svfloat64_t sr = svsub_x (pt, x, rx); + + /* cospi(x) = sinpi(0.5 - abs(r)) for values -1/2 .. 1/2. */ + svfloat64_t cr = svsubr_x (pg, svabs_x (pg, sr), 0.5); + + /* Pairwise Horner approximation for y = sin(r * pi). */ + /* pt hints unpredicated instruction. */ + svfloat64_t sr2 = svmul_x (pt, sr, sr); + svfloat64_t cr2 = svmul_x (pt, cr, cr); + svfloat64_t sr4 = svmul_x (pt, sr2, sr2); + svfloat64_t cr4 = svmul_x (pt, cr2, cr2); + + /* If rint(x) is odd, the sign of the result should be inverted for sinpi and + re-introduced for cospi. cmp filters rxs that saturate to max sint. */ + svbool_t cmp = svaclt (pg, x, d->range_val); + svuint64_t odd = svlsl_x (pt, svreinterpret_u64 (svcvt_s64_z (pg, rx)), 63); + sr = svreinterpret_f64 (sveor_x (pt, svreinterpret_u64 (sr), odd)); + cr = svreinterpret_f64 (sveor_m (cmp, svreinterpret_u64 (cr), odd)); + + svfloat64_t sinpix = svmul_x ( + pt, sv_lw_pw_horner_9_f64_x (pg, sr2, sr4, &(d->c0), &(d->c1)), sr); + svfloat64_t cospix = svmul_x ( + pt, sv_lw_pw_horner_9_f64_x (pg, cr2, cr4, &(d->c0), &(d->c1)), cr); + + return svcreate2 (sinpix, cospix); +} diff --git a/math/aarch64/sve/sv_sincospif_common.h b/math/aarch64/sve/sv_sincospif_common.h new file mode 100644 index 00000000000000..4b9101de74ed9d --- /dev/null +++ b/math/aarch64/sve/sv_sincospif_common.h @@ -0,0 +1,82 @@ +/* + * Helper for single-precision SVE sincospi + * + * Copyright (c) 2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "sv_poly_f32.h" + +const static struct sv_sincospif_data +{ + float c0, c2, c4; + float c1, c3, c5; + float range_val; +} sv_sincospif_data = { + /* Taylor series coefficients for sin(pi * x). */ + .c0 = 0x1.921fb6p1f, + .c1 = -0x1.4abbcep2f, + .c2 = 0x1.466bc6p1f, + .c3 = -0x1.32d2ccp-1f, + .c4 = 0x1.50783p-4f, + .c5 = -0x1.e30750p-8f, + /* Exclusive upper bound for a signed integer. */ + .range_val = 0x1p31f, +}; + +/* Single-precision vector function allowing calculation of both sinpi and + cospi in one function call, using shared argument reduction and polynomials. + Worst-case error for sin is 3.04 ULP: + _ZGVsMxvl4l4_sincospif_sin(0x1.b51b8p-2) got 0x1.f28b5ep-1 want + 0x1.f28b58p-1. + Worst-case error for cos is 3.18 ULP: + _ZGVsMxvl4l4_sincospif_cos(0x1.d341a8p-5) got 0x1.f7cd56p-1 want + 0x1.f7cd5p-1. */ +static inline svfloat32x2_t +sv_sincospif_inline (svbool_t pg, svfloat32_t x, + const struct sv_sincospif_data *d) +{ + const svbool_t pt = svptrue_b32 (); + + /* r = x - rint(x). */ + svfloat32_t rx = svrinta_x (pg, x); + svfloat32_t sr = svsub_x (pt, x, rx); + + /* cospi(x) = sinpi(0.5 - abs(r)) for values -1/2 .. 1/2. */ + svfloat32_t cr = svsubr_x (pt, svabs_x (pg, sr), 0.5f); + + /* Pairwise Horner approximation for y = sin(r * pi). */ + svfloat32_t sr2 = svmul_x (pt, sr, sr); + svfloat32_t sr4 = svmul_x (pt, sr2, sr2); + svfloat32_t cr2 = svmul_x (pt, cr, cr); + svfloat32_t cr4 = svmul_x (pt, cr2, cr2); + + /* If rint(x) is odd, the sign of the result should be inverted for sinpi and + re-introduced for cospi. cmp filters rxs that saturate to max sint.
*/ + svbool_t cmp = svaclt (pg, x, d->range_val); + svuint32_t odd = svlsl_x (pt, svreinterpret_u32 (svcvt_s32_z (pg, rx)), 31); + sr = svreinterpret_f32 (sveor_x (pt, svreinterpret_u32 (sr), odd)); + cr = svreinterpret_f32 (sveor_m (cmp, svreinterpret_u32 (cr), odd)); + + svfloat32_t c135 = svld1rq_f32 (svptrue_b32 (), &d->c1); + + svfloat32_t sp01 = svmla_lane (sv_f32 (d->c0), sr2, c135, 0); + svfloat32_t sp23 = svmla_lane (sv_f32 (d->c2), sr2, c135, 1); + svfloat32_t sp45 = svmla_lane (sv_f32 (d->c4), sr2, c135, 2); + + svfloat32_t cp01 = svmla_lane (sv_f32 (d->c0), cr2, c135, 0); + svfloat32_t cp23 = svmla_lane (sv_f32 (d->c2), cr2, c135, 1); + svfloat32_t cp45 = svmla_lane (sv_f32 (d->c4), cr2, c135, 2); + + svfloat32_t sp = svmla_x (pg, sp23, sr4, sp45); + svfloat32_t cp = svmla_x (pg, cp23, cr4, cp45); + + sp = svmla_x (pg, sp01, sr4, sp); + cp = svmla_x (pg, cp01, cr4, cp); + + svfloat32_t sinpix = svmul_x (pt, sp, sr); + svfloat32_t cospix = svmul_x (pt, cp, cr); + + return svcreate2 (sinpix, cospix); +} diff --git a/math/aarch64/sve/tan.c b/math/aarch64/sve/tan.c new file mode 100644 index 00000000000000..1dfc5c422d5e67 --- /dev/null +++ b/math/aarch64/sve/tan.c @@ -0,0 +1,131 @@ +/* + * Double-precision SVE tan(x) function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "test_sig.h" +#include "test_defs.h" + +static const struct data +{ + double c2, c4, c6, c8; + double poly_1357[4]; + double c0, inv_half_pi; + double half_pi_hi, half_pi_lo, range_val; +} data = { + /* Polynomial generated with FPMinimax. 
*/ + .c2 = 0x1.ba1ba1bb46414p-5, + .c4 = 0x1.226e5e5ecdfa3p-7, + .c6 = 0x1.7ea75d05b583ep-10, + .c8 = 0x1.4e4fd14147622p-12, + .poly_1357 = { 0x1.1111111110a63p-3, 0x1.664f47e5b5445p-6, + 0x1.d6c7ddbf87047p-9, 0x1.289f22964a03cp-11 }, + .c0 = 0x1.5555555555556p-2, + .inv_half_pi = 0x1.45f306dc9c883p-1, + .half_pi_hi = 0x1.921fb54442d18p0, + .half_pi_lo = 0x1.1a62633145c07p-54, + .range_val = 0x1p23, +}; + +static svfloat64_t NOINLINE +special_case (svfloat64_t x, svfloat64_t p, svfloat64_t q, svbool_t pg, + svbool_t special) +{ + svbool_t use_recip = svcmpeq ( + pg, svand_x (pg, svreinterpret_u64 (svcvt_s64_x (pg, q)), 1), 0); + + svfloat64_t n = svmad_x (pg, p, p, -1); + svfloat64_t d = svmul_x (svptrue_b64 (), p, 2); + svfloat64_t swap = n; + n = svneg_m (n, use_recip, d); + d = svsel (use_recip, swap, d); + svfloat64_t y = svdiv_x (svnot_z (pg, special), n, d); + return sv_call_f64 (tan, x, y, special); +} + +/* Vector approximation for double-precision tan. + Maximum measured error is 3.48 ULP: + _ZGVsMxv_tan(0x1.4457047ef78d8p+20) got -0x1.f6ccd8ecf7dedp+37 + want -0x1.f6ccd8ecf7deap+37. */ +svfloat64_t SV_NAME_D1 (tan) (svfloat64_t x, svbool_t pg) +{ + const struct data *dat = ptr_barrier (&data); + svfloat64_t half_pi_c0 = svld1rq (svptrue_b64 (), &dat->c0); + /* q = nearest integer to 2 * x / pi. */ + svfloat64_t q = svmul_lane (x, half_pi_c0, 1); + q = svrinta_x (pg, q); + + /* Use q to reduce x to r in [-pi/4, pi/4], by: + r = x - q * pi/2, in extended precision. */ + svfloat64_t r = x; + svfloat64_t half_pi = svld1rq (svptrue_b64 (), &dat->half_pi_hi); + r = svmls_lane (r, q, half_pi, 0); + r = svmls_lane (r, q, half_pi, 1); + /* Further reduce r to [-pi/8, pi/8], to be reconstructed using double angle + formula. */ + r = svmul_x (svptrue_b64 (), r, 0.5); + + /* Approximate tan(r) using order 8 polynomial. + tan(x) is odd, so polynomial has the form: + tan(x) ~= x + C0 * x^3 + C1 * x^5 + C2 * x^7 + ...
+ Hence we first approximate P(r) = C1 + C2 * r^2 + C3 * r^4 + ... + Then compute the approximation by: + tan(r) ~= r + r^3 * (C0 + r^2 * P(r)). */ + + svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); + svfloat64_t r4 = svmul_x (svptrue_b64 (), r2, r2); + svfloat64_t r8 = svmul_x (svptrue_b64 (), r4, r4); + /* Use offset version coeff array by 1 to evaluate from C1 onwards. */ + svfloat64_t C_24 = svld1rq (svptrue_b64 (), &dat->c2); + svfloat64_t C_68 = svld1rq (svptrue_b64 (), &dat->c6); + + /* Use offset version coeff array by 1 to evaluate from C1 onwards. */ + svfloat64_t p01 = svmla_lane (sv_f64 (dat->poly_1357[0]), r2, C_24, 0); + svfloat64_t p23 = svmla_lane_f64 (sv_f64 (dat->poly_1357[1]), r2, C_24, 1); + svfloat64_t p03 = svmla_x (pg, p01, p23, r4); + + svfloat64_t p45 = svmla_lane (sv_f64 (dat->poly_1357[2]), r2, C_68, 0); + svfloat64_t p67 = svmla_lane (sv_f64 (dat->poly_1357[3]), r2, C_68, 1); + svfloat64_t p47 = svmla_x (pg, p45, p67, r4); + + svfloat64_t p = svmla_x (pg, p03, p47, r8); + + svfloat64_t z = svmul_x (svptrue_b64 (), p, r); + z = svmul_x (svptrue_b64 (), r2, z); + z = svmla_lane (z, r, half_pi_c0, 0); + p = svmla_x (pg, r, r2, z); + + /* Recombination uses double-angle formula: + tan(2x) = 2 * tan(x) / (1 - (tan(x))^2) + and reciprocity around pi/2: + tan(x) = 1 / (tan(pi/2 - x)) + to assemble result using change-of-sign and conditional selection of + numerator/denominator dependent on odd/even-ness of q (quadrant). */ + + /* Invert condition to catch NaNs and Infs as well as large values. 
*/ + svbool_t special = svnot_z (pg, svaclt (pg, x, dat->range_val)); + + if (unlikely (svptest_any (pg, special))) + { + return special_case (x, p, q, pg, special); + } + svbool_t use_recip = svcmpeq ( + pg, svand_x (pg, svreinterpret_u64 (svcvt_s64_x (pg, q)), 1), 0); + + svfloat64_t n = svmad_x (pg, p, p, -1); + svfloat64_t d = svmul_x (svptrue_b64 (), p, 2); + svfloat64_t swap = n; + n = svneg_m (n, use_recip, d); + d = svsel (use_recip, swap, d); + return svdiv_x (pg, n, d); +} + +TEST_SIG (SV, D, 1, tan, -3.1, 3.1) +TEST_ULP (SV_NAME_D1 (tan), 2.99) +TEST_DISABLE_FENV (SV_NAME_D1 (tan)) +TEST_SYM_INTERVAL (SV_NAME_D1 (tan), 0, 0x1p23, 500000) +TEST_SYM_INTERVAL (SV_NAME_D1 (tan), 0x1p23, inf, 5000) +CLOSE_SVE_ATTR diff --git a/pl/math/sv_tanf_3u5.c b/math/aarch64/sve/tanf.c similarity index 79% rename from pl/math/sv_tanf_3u5.c rename to math/aarch64/sve/tanf.c index 6b8cd1e64b446a..d34fc2fc1a4e61 100644 --- a/pl/math/sv_tanf_3u5.c +++ b/math/aarch64/sve/tanf.c @@ -1,13 +1,13 @@ /* * Single-precision vector tan(x) function. * - * Copyright (c) 2020-2023, Arm Limited. + * Copyright (c) 2020-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "sv_math.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" static const struct data { @@ -50,21 +50,16 @@ svfloat32_t SV_NAME_F1 (tan) (svfloat32_t x, const svbool_t pg) { const struct data *d = ptr_barrier (&data); - /* Determine whether input is too large to perform fast regression. */ - svbool_t cmp = svacge (pg, x, d->range_val); - svfloat32_t odd_coeffs = svld1rq (svptrue_b32 (), &d->c1); svfloat32_t pi_vals = svld1rq (svptrue_b32 (), &d->pio2_1); /* n = rint(x/(pi/2)). */ - svfloat32_t q = svmla_lane (sv_f32 (d->shift), x, pi_vals, 3); - svfloat32_t n = svsub_x (pg, q, d->shift); + svfloat32_t n = svrintn_x (pg, svmul_lane (x, pi_vals, 3)); /* n is already a signed integer, simply convert it. 
*/ svint32_t in = svcvt_s32_x (pg, n); /* Determine if x lives in an interval, where |tan(x)| grows to infinity. */ svint32_t alt = svand_x (pg, in, 1); svbool_t pred_alt = svcmpne (pg, alt, 0); - /* r = x - n * (pi/2) (range reduction into 0 .. pi/4). */ svfloat32_t r; r = svmls_lane (x, n, pi_vals, 0); @@ -83,7 +78,7 @@ svfloat32_t SV_NAME_F1 (tan) (svfloat32_t x, const svbool_t pg) /* Evaluate polynomial approximation of tangent on [-pi/4, pi/4], using Estrin on z^2. */ - svfloat32_t z2 = svmul_x (pg, z, z); + svfloat32_t z2 = svmul_x (svptrue_b32 (), r, r); svfloat32_t p01 = svmla_lane (sv_f32 (d->c0), z2, odd_coeffs, 0); svfloat32_t p23 = svmla_lane (sv_f32 (d->c2), z2, odd_coeffs, 1); svfloat32_t p45 = svmla_lane (sv_f32 (d->c4), z2, odd_coeffs, 2); @@ -96,24 +91,27 @@ svfloat32_t SV_NAME_F1 (tan) (svfloat32_t x, const svbool_t pg) svfloat32_t y = svmla_x (pg, z, p, svmul_x (pg, z, z2)); - /* Transform result back, if necessary. */ - svfloat32_t inv_y = svdivr_x (pg, y, 1.0f); - /* No need to pass pg to specialcase here since cmp is a strict subset, guaranteed by the cmpge above. */ + + /* Determine whether input is too large to perform fast regression. 
*/ + svbool_t cmp = svacge (pg, x, d->range_val); if (unlikely (svptest_any (pg, cmp))) - return special_case (x, svsel (pred_alt, inv_y, y), cmp); + return special_case (x, svdivr_x (pg, y, 1.0f), cmp); + svfloat32_t inv_y = svdivr_x (pg, y, 1.0f); return svsel (pred_alt, inv_y, y); } -PL_SIG (SV, F, 1, tan, -3.1, 3.1) -PL_TEST_ULP (SV_NAME_F1 (tan), 2.96) -PL_TEST_INTERVAL (SV_NAME_F1 (tan), -0.0, -0x1p126, 100) -PL_TEST_INTERVAL (SV_NAME_F1 (tan), 0x1p-149, 0x1p-126, 4000) -PL_TEST_INTERVAL (SV_NAME_F1 (tan), 0x1p-126, 0x1p-23, 50000) -PL_TEST_INTERVAL (SV_NAME_F1 (tan), 0x1p-23, 0.7, 50000) -PL_TEST_INTERVAL (SV_NAME_F1 (tan), 0.7, 1.5, 50000) -PL_TEST_INTERVAL (SV_NAME_F1 (tan), 1.5, 100, 50000) -PL_TEST_INTERVAL (SV_NAME_F1 (tan), 100, 0x1p17, 50000) -PL_TEST_INTERVAL (SV_NAME_F1 (tan), 0x1p17, inf, 50000) +TEST_SIG (SV, F, 1, tan, -3.1, 3.1) +TEST_ULP (SV_NAME_F1 (tan), 2.96) +TEST_DISABLE_FENV (SV_NAME_F1 (tan)) +TEST_INTERVAL (SV_NAME_F1 (tan), -0.0, -0x1p126, 100) +TEST_INTERVAL (SV_NAME_F1 (tan), 0x1p-149, 0x1p-126, 4000) +TEST_INTERVAL (SV_NAME_F1 (tan), 0x1p-126, 0x1p-23, 50000) +TEST_INTERVAL (SV_NAME_F1 (tan), 0x1p-23, 0.7, 50000) +TEST_INTERVAL (SV_NAME_F1 (tan), 0.7, 1.5, 50000) +TEST_INTERVAL (SV_NAME_F1 (tan), 1.5, 100, 50000) +TEST_INTERVAL (SV_NAME_F1 (tan), 100, 0x1p17, 50000) +TEST_INTERVAL (SV_NAME_F1 (tan), 0x1p17, inf, 50000) +CLOSE_SVE_ATTR diff --git a/pl/math/sv_tanh_3u.c b/math/aarch64/sve/tanh.c similarity index 86% rename from pl/math/sv_tanh_3u.c rename to math/aarch64/sve/tanh.c index f54139f1ddbcc5..41f64cb4b2c74e 100644 --- a/pl/math/sv_tanh_3u.c +++ b/math/aarch64/sve/tanh.c @@ -1,14 +1,14 @@ /* * Double-precision SVE tanh(x) function. - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "sv_math.h" -#include "poly_sve_f64.h" +#include "sv_poly_f64.h" #include "mathlib.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" static const struct data { @@ -89,8 +89,10 @@ svfloat64_t SV_NAME_D1 (tanh) (svfloat64_t x, svbool_t pg) return svdiv_x (pg, q, qp2); } -PL_SIG (SV, D, 1, tanh, -10.0, 10.0) -PL_TEST_ULP (SV_NAME_D1 (tanh), 2.27) -PL_TEST_SYM_INTERVAL (SV_NAME_D1 (tanh), 0, 0x1p-27, 5000) -PL_TEST_SYM_INTERVAL (SV_NAME_D1 (tanh), 0x1p-27, 0x1.241bf835f9d5fp+4, 50000) -PL_TEST_SYM_INTERVAL (SV_NAME_D1 (tanh), 0x1.241bf835f9d5fp+4, inf, 1000) +TEST_SIG (SV, D, 1, tanh, -10.0, 10.0) +TEST_ULP (SV_NAME_D1 (tanh), 2.27) +TEST_DISABLE_FENV (SV_NAME_D1 (tanh)) +TEST_SYM_INTERVAL (SV_NAME_D1 (tanh), 0, 0x1p-27, 5000) +TEST_SYM_INTERVAL (SV_NAME_D1 (tanh), 0x1p-27, 0x1.241bf835f9d5fp+4, 50000) +TEST_SYM_INTERVAL (SV_NAME_D1 (tanh), 0x1.241bf835f9d5fp+4, inf, 1000) +CLOSE_SVE_ATTR diff --git a/math/aarch64/sve/tanhf.c b/math/aarch64/sve/tanhf.c new file mode 100644 index 00000000000000..9007e7badb0df3 --- /dev/null +++ b/math/aarch64/sve/tanhf.c @@ -0,0 +1,68 @@ +/* + * Single-precision SVE tanh(x) function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "test_sig.h" +#include "test_defs.h" +#include "sv_expm1f_inline.h" + +/* Largest value of x for which tanhf(x) rounds to 1 (or -1 for negative). 
*/ +#define BoringBound 0x1.205966p+3f + +static const struct data +{ + struct sv_expm1f_data expm1f_consts; + uint32_t onef, special_bound; + float boring_bound; +} data = { + .expm1f_consts = SV_EXPM1F_DATA, + .onef = 0x3f800000, + .special_bound = 0x7f800000, + .boring_bound = BoringBound, +}; + +static svfloat32_t NOINLINE +special_case (svfloat32_t x, svbool_t pg, svbool_t is_boring, + svfloat32_t boring, svfloat32_t q, svbool_t special) +{ + svfloat32_t y + = svsel_f32 (is_boring, boring, svdiv_x (pg, q, svadd_x (pg, q, 2.0))); + return sv_call_f32 (tanhf, x, y, special); +} + +/* Approximation for single-precision SVE tanh(x), using a simplified + version of expm1f. The maximum error is 2.57 ULP: + _ZGVsMxv_tanhf (0x1.fc1832p-5) got 0x1.fb71a4p-5 + want 0x1.fb71aap-5. */ +svfloat32_t SV_NAME_F1 (tanh) (svfloat32_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + svfloat32_t ax = svabs_x (pg, x); + svuint32_t iax = svreinterpret_u32 (ax); + svuint32_t sign = sveor_x (pg, svreinterpret_u32 (x), iax); + svfloat32_t boring = svreinterpret_f32 (svorr_x (pg, sign, d->onef)); + svbool_t special = svcmpgt (pg, iax, d->special_bound); + svbool_t is_boring = svacgt (pg, x, d->boring_bound); + + /* tanh(x) = (e^2x - 1) / (e^2x + 1). 
*/ + svfloat32_t q = expm1f_inline (svmul_x (svptrue_b32 (), x, 2.0), pg, + &d->expm1f_consts); + + if (unlikely (svptest_any (pg, special))) + return special_case (x, pg, is_boring, boring, q, special); + svfloat32_t y = svdiv_x (pg, q, svadd_x (pg, q, 2.0)); + return svsel_f32 (is_boring, boring, y); +} + +TEST_SIG (SV, F, 1, tanh, -10.0, 10.0) +TEST_ULP (SV_NAME_F1 (tanh), 2.07) +TEST_DISABLE_FENV (SV_NAME_F1 (tanh)) +TEST_SYM_INTERVAL (SV_NAME_F1 (tanh), 0, 0x1p-23, 1000) +TEST_SYM_INTERVAL (SV_NAME_F1 (tanh), 0x1p-23, BoringBound, 100000) +TEST_SYM_INTERVAL (SV_NAME_F1 (tanh), BoringBound, inf, 100) +CLOSE_SVE_ATTR diff --git a/math/aarch64/sve/tanpi.c b/math/aarch64/sve/tanpi.c new file mode 100644 index 00000000000000..d9e7d2487d533c --- /dev/null +++ b/math/aarch64/sve/tanpi.c @@ -0,0 +1,89 @@ +/* + * Double-precision vector tanpi(x) function. + * + * Copyright (c) 2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "test_sig.h" +#include "test_defs.h" + +const static struct v_tanpi_data +{ + double c0, c2, c4, c6, c8, c10, c12; + double c1, c3, c5, c7, c9, c11, c13, c14; +} tanpi_data = { + /* Coefficents for tan(pi * x) computed with fpminimax + on [ 0x1p-1022 0x1p-2 ] + approx rel error: 0x1.7eap-55 + approx abs error: 0x1.7eap-55. */ + .c0 = 0x1.921fb54442d18p1, /* pi. */ + .c1 = 0x1.4abbce625be52p3, .c2 = 0x1.466bc6775b0f9p5, + .c3 = 0x1.45fff9b426f5ep7, .c4 = 0x1.45f4730dbca5cp9, + .c5 = 0x1.45f3265994f85p11, .c6 = 0x1.45f4234b330cap13, + .c7 = 0x1.45dca11be79ebp15, .c8 = 0x1.47283fc5eea69p17, + .c9 = 0x1.3a6d958cdefaep19, .c10 = 0x1.927896baee627p21, + .c11 = -0x1.89333f6acd922p19, .c12 = 0x1.5d4e912bb8456p27, + .c13 = -0x1.a854d53ab6874p29, .c14 = 0x1.1b76de7681424p32, +}; + +/* Approximation for double-precision vector tanpi(x) + The maximum error is 3.06 ULP: + _ZGVsMxv_tanpi(0x1.0a4a07dfcca3ep-1) got -0x1.fa30112702c98p+3 + want -0x1.fa30112702c95p+3. 
*/ +svfloat64_t SV_NAME_D1 (tanpi) (svfloat64_t x, const svbool_t pg) +{ + const struct v_tanpi_data *d = ptr_barrier (&tanpi_data); + + svfloat64_t n = svrintn_x (pg, x); + + /* inf produces nan that propagates. */ + svfloat64_t xr = svsub_x (pg, x, n); + svfloat64_t ar = svabd_x (pg, x, n); + svbool_t flip = svcmpgt (pg, ar, 0.25); + svfloat64_t r = svsel (flip, svsubr_x (pg, ar, 0.5), ar); + + /* Order-14 pairwise Horner. */ + svfloat64_t r2 = svmul_x (pg, r, r); + svfloat64_t r4 = svmul_x (pg, r2, r2); + + svfloat64_t c_1_3 = svld1rq (pg, &d->c1); + svfloat64_t c_5_7 = svld1rq (pg, &d->c5); + svfloat64_t c_9_11 = svld1rq (pg, &d->c9); + svfloat64_t c_13_14 = svld1rq (pg, &d->c13); + svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), r2, c_1_3, 0); + svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), r2, c_1_3, 1); + svfloat64_t p45 = svmla_lane (sv_f64 (d->c4), r2, c_5_7, 0); + svfloat64_t p67 = svmla_lane (sv_f64 (d->c6), r2, c_5_7, 1); + svfloat64_t p89 = svmla_lane (sv_f64 (d->c8), r2, c_9_11, 0); + svfloat64_t p1011 = svmla_lane (sv_f64 (d->c10), r2, c_9_11, 1); + svfloat64_t p1213 = svmla_lane (sv_f64 (d->c12), r2, c_13_14, 0); + + svfloat64_t p = svmla_lane (p1213, r4, c_13_14, 1); + p = svmad_x (pg, p, r4, p1011); + p = svmad_x (pg, p, r4, p89); + p = svmad_x (pg, p, r4, p67); + p = svmad_x (pg, p, r4, p45); + p = svmad_x (pg, p, r4, p23); + p = svmad_x (pg, p, r4, p01); + p = svmul_x (pg, r, p); + + svfloat64_t p_recip = svdivr_x (pg, p, 1.0); + svfloat64_t y = svsel (flip, p_recip, p); + + svuint64_t sign + = sveor_x (pg, svreinterpret_u64 (xr), svreinterpret_u64 (ar)); + return svreinterpret_f64 (svorr_x (pg, svreinterpret_u64 (y), sign)); +} + +#if WANT_TRIGPI_TESTS +TEST_DISABLE_FENV (SV_NAME_D1 (tanpi)) +TEST_ULP (SV_NAME_D1 (tanpi), 2.57) +TEST_SYM_INTERVAL (SV_NAME_D1 (tanpi), 0, 0x1p-31, 50000) +TEST_SYM_INTERVAL (SV_NAME_D1 (tanpi), 0x1p-31, 0.5, 50000) +TEST_SYM_INTERVAL (SV_NAME_D1 (tanpi), 0.5, 1.0, 200000) +TEST_SYM_INTERVAL (SV_NAME_D1 (tanpi), 1.0, 
0x1p23, 50000) +TEST_SYM_INTERVAL (SV_NAME_D1 (tanpi), 0x1p23, inf, 50000) +#endif +CLOSE_SVE_ATTR diff --git a/math/aarch64/sve/tanpif.c b/math/aarch64/sve/tanpif.c new file mode 100644 index 00000000000000..2ba968a799fe06 --- /dev/null +++ b/math/aarch64/sve/tanpif.c @@ -0,0 +1,68 @@ +/* + * Single-precision vector tanpif(x) function. + * + * Copyright (c) 2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "test_defs.h" +#include "test_sig.h" + +const static struct v_tanpif_data +{ + float c0, c2, c4, c6; + float c1, c3, c5, c7; +} tanpif_data = { + /* Coefficients for tan(pi * x). */ + .c0 = 0x1.921fb4p1f, .c1 = 0x1.4abbcep3f, .c2 = 0x1.466b8p5f, + .c3 = 0x1.461c72p7f, .c4 = 0x1.42e9d4p9f, .c5 = 0x1.69e2c4p11f, + .c6 = 0x1.e85558p11f, .c7 = 0x1.a52e08p16f, +}; + +/* Approximation for single-precision vector tanpif(x) + The maximum error is 3.34 ULP: + _ZGVsMxv_tanpif(0x1.d6c09ap-2) got 0x1.f70aacp+2 + want 0x1.f70aa6p+2. */ +svfloat32_t SV_NAME_F1 (tanpi) (svfloat32_t x, const svbool_t pg) +{ + const struct v_tanpif_data *d = ptr_barrier (&tanpif_data); + svfloat32_t odd_coeffs = svld1rq (pg, &d->c1); + svfloat32_t n = svrintn_x (pg, x); + + /* inf produces nan that propagates. */ + svfloat32_t xr = svsub_x (pg, x, n); + svfloat32_t ar = svabd_x (pg, x, n); + svbool_t flip = svcmpgt (pg, ar, 0.25f); + svfloat32_t r = svsel (flip, svsub_x (pg, sv_f32 (0.5f), ar), ar); + + svfloat32_t r2 = svmul_x (pg, r, r); + svfloat32_t r4 = svmul_x (pg, r2, r2); + + /* Order-7 Pairwise Horner. 
*/ + svfloat32_t p01 = svmla_lane (sv_f32 (d->c0), r2, odd_coeffs, 0); + svfloat32_t p23 = svmla_lane (sv_f32 (d->c2), r2, odd_coeffs, 1); + svfloat32_t p45 = svmla_lane (sv_f32 (d->c4), r2, odd_coeffs, 2); + svfloat32_t p67 = svmla_lane (sv_f32 (d->c6), r2, odd_coeffs, 3); + svfloat32_t p = svmad_x (pg, p67, r4, p45); + p = svmad_x (pg, p, r4, p23); + p = svmad_x (pg, p, r4, p01); + svfloat32_t poly = svmul_x (pg, r, p); + + svfloat32_t poly_recip = svdiv_x (pg, sv_f32 (1.0), poly); + svfloat32_t y = svsel (flip, poly_recip, poly); + + svuint32_t sign + = sveor_x (pg, svreinterpret_u32 (xr), svreinterpret_u32 (ar)); + return svreinterpret_f32 (svorr_x (pg, svreinterpret_u32 (y), sign)); +} + +#if WANT_TRIGPI_TESTS +TEST_DISABLE_FENV (SV_NAME_F1 (tanpi)) +TEST_ULP (SV_NAME_F1 (tanpi), 2.84) +TEST_SYM_INTERVAL (SV_NAME_F1 (tanpi), 0, 0x1p-31, 50000) +TEST_SYM_INTERVAL (SV_NAME_F1 (tanpi), 0x1p-31, 0.5, 100000) +TEST_SYM_INTERVAL (SV_NAME_F1 (tanpi), 0.5, 0x1p23f, 100000) +TEST_SYM_INTERVAL (SV_NAME_F1 (tanpi), 0x1p23f, inf, 100000) +#endif +CLOSE_SVE_ATTR diff --git a/math/aarch64/tanpi_2u5.c b/math/aarch64/tanpi_2u5.c new file mode 100644 index 00000000000000..154b9faf454d6c --- /dev/null +++ b/math/aarch64/tanpi_2u5.c @@ -0,0 +1,158 @@ +/* + * Double-precision scalar tanpi(x) function. + * + * Copyright (c) 2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "mathlib.h" +#include "math_config.h" +#include "test_sig.h" +#include "test_defs.h" +#include "poly_scalar_f64.h" + +#define SIGN_MASK 0x8000000000000000 + +const static struct tanpi_data +{ + double tan_poly[14], cot_poly[9], pi, invpi; +} tanpi_data = { + /* Coefficents for tan(pi * x). 
*/ + .tan_poly = { + 0x1.4abbce625be52p3, + 0x1.466bc6775b0f9p5, + 0x1.45fff9b426f5ep7, + 0x1.45f4730dbca5cp9, + 0x1.45f3265994f85p11, + 0x1.45f4234b330cap13, + 0x1.45dca11be79ebp15, + 0x1.47283fc5eea69p17, + 0x1.3a6d958cdefaep19, + 0x1.927896baee627p21, + -0x1.89333f6acd922p19, + 0x1.5d4e912bb8456p27, + -0x1.a854d53ab6874p29, + 0x1.1b76de7681424p32, + }, + /* Coefficients for cot(pi * x). */ + .cot_poly = { + -0x1.0c152382d7366p0, + -0x1.60c8539c1d316p-1, + -0x1.4b9a2f3516354p-1, + -0x1.47474060b6ba8p-1, + -0x1.464633ad9dcb1p-1, + -0x1.45ff229d7edd6p-1, + -0x1.46d8dbf492923p-1, + -0x1.3873892311c6bp-1, + -0x1.b2f3d0ff96d73p-1, + }, + .pi = 0x1.921fb54442d18p1, + .invpi = 0x1.45f306dc9c883p-2, +}; + +/* Double-precision scalar tanpi(x) implementation. + Maximum error 2.19 ULP: + tanpi(0x1.68847e177a855p-2) got 0x1.fe9a0ff9bb9d7p+0 + want 0x1.fe9a0ff9bb9d5p+0. */ +double +arm_math_tanpi (double x) +{ + uint64_t xabs_12 = asuint64 (x) >> 52 & 0x7ff; + + /* |x| >= 0x1p53. */ + if (unlikely (xabs_12 >= 0x434)) + { + /* tanpi(+/-inf) and tanpi(+/-nan) = nan. */ + if (unlikely (xabs_12 == 0x7ff)) + { + return __math_invalid (x); + } + + uint64_t x_sign = asuint64 (x) & SIGN_MASK; + return asdouble (x_sign); + } + + const struct tanpi_data *d = ptr_barrier (&tanpi_data); + + double rounded = round (x); + if (unlikely (rounded == x)) + { + /* If x == 0, return with sign. */ + if (x == 0) + { + return x; + } + /* Otherwise, return zero with alternating sign. */ + int64_t m = (int64_t) rounded; + if (x < 0) + { + return m & 1 ? 0.0 : -0.0; + } + else + { + return m & 1 ? -0.0 : 0.0; + } + } + + double x_reduced = x - rounded; + double abs_x_reduced = 0.5 - fabs (x_reduced); + + /* Prevent underflow exceptions. |x| < 0x1p-63. */ + if (unlikely (xabs_12 < 0x3c0)) + { + return d->pi * x; + } + + double result, offset, scale; + + /* Test 0.25 < abs_x < 0.5 independent from abs_x_reduced.
*/ + double x2 = x + x; + int64_t rounded_x2 = (int64_t) round (x2); + if (rounded_x2 & 1) + { + double r_x = abs_x_reduced; + + double r_x2 = r_x * r_x; + double r_x4 = r_x2 * r_x2; + + uint64_t sign = asuint64 (x_reduced) & SIGN_MASK; + r_x = asdouble (asuint64 (r_x) ^ sign); + + // calculate sign for half-fractional inf values + uint64_t is_finite = asuint64 (abs_x_reduced); + uint64_t is_odd = (rounded_x2 & 2) << 62; + uint64_t is_neg = rounded_x2 & SIGN_MASK; + uint64_t keep_sign = is_finite | (is_odd ^ is_neg); + offset = d->invpi / (keep_sign ? r_x : -r_x); + scale = r_x; + + result = pw_horner_8_f64 (r_x2, r_x4, d->cot_poly); + } + else + { + double r_x2 = x_reduced * x_reduced; + double r_x4 = r_x2 * r_x2; + + offset = d->pi * x_reduced; + scale = x_reduced * r_x2; + + result = pw_horner_13_f64 (r_x2, r_x4, d->tan_poly); + } + + return fma (scale, result, offset); +} + +#if WANT_EXPERIMENTAL_MATH +double +tanpi (double x) +{ + return arm_math_tanpi (x); +} +#endif + +#if WANT_TRIGPI_TESTS +TEST_ULP (arm_math_tanpi, 1.69) +TEST_SYM_INTERVAL (arm_math_tanpi, 0, 0x1p-63, 50000) +TEST_SYM_INTERVAL (arm_math_tanpi, 0x1p-63, 0.5, 100000) +TEST_SYM_INTERVAL (arm_math_tanpi, 0.5, 0x1p53, 100000) +TEST_SYM_INTERVAL (arm_math_tanpi, 0x1p53, inf, 100000) +#endif diff --git a/math/aarch64/tanpif_3u1.c b/math/aarch64/tanpif_3u1.c new file mode 100644 index 00000000000000..8cd66594c290e6 --- /dev/null +++ b/math/aarch64/tanpif_3u1.c @@ -0,0 +1,145 @@ +/* + * Single-precision scalar tanpi(x) function. + * + * Copyright (c) 2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "mathlib.h" +#include "math_config.h" +#include "test_sig.h" +#include "test_defs.h" +#include "poly_scalar_f32.h" + +const static struct tanpif_data +{ + float tan_poly[6], cot_poly[4], pi, invpi; +} tanpif_data = { + /* Coefficents for tan(pi * x). 
*/ + .tan_poly = { + 0x1.4abbc8p3, + 0x1.467284p5, + 0x1.44cf12p7, + 0x1.596b5p9, + 0x1.753858p10, + 0x1.76ff52p14, + }, + /* Coefficients for cot(pi * x). */ + .cot_poly = { + -0x1.0c1522p0, + -0x1.60ce32p-1, + -0x1.49cd42p-1, + -0x1.73f786p-1, + }, + .pi = 0x1.921fb6p1f, + .invpi = 0x1.45f308p-2f, +}; + +/* Single-precision scalar tanpi(x) implementation. + Maximum error 2.56 ULP: + tanpif(0x1.4bf948p-1) got -0x1.fcc9ep+0 + want -0x1.fcc9e6p+0. */ +float +arm_math_tanpif (float x) +{ + uint32_t xabs_12 = asuint (x) >> 20 & 0x7f8; + + /* x >= 0x1p24f. */ + if (unlikely (xabs_12 >= 0x4b1)) + { + /* tanpif(+/-inf) and tanpif(+/-nan) = nan. */ + if (unlikely (xabs_12 == 0x7f8)) + { + return __math_invalidf (x); + } + + uint32_t x_sign = asuint (x) & 0x80000000; + return asfloat (x_sign); + } + + const struct tanpif_data *d = ptr_barrier (&tanpif_data); + + /* Prevent underflow exceptions. |x| < 0x1p-31. */ + if (unlikely (xabs_12 < 0x300)) + { + return d->pi * x; + } + + float rounded = roundf (x); + if (unlikely (rounded == x)) + { + /* If x == 0, return with sign. */ + if (x == 0) + { + return x; + } + /* Otherwise, return zero with alternating sign. */ + int32_t m = (int32_t) rounded; + if (x < 0) + { + return m & 1 ? 0.0f : -0.0f; + } + else + { + return m & 1 ? -0.0f : 0.0f; + } + } + + float x_reduced = x - rounded; + float abs_x_reduced = 0.5f - asfloat (asuint (x_reduced) & 0x7fffffff); + + float result, offset, scale; + + /* Test 0.25 < abs_x < 0.5 independent from abs_x_reduced.
*/ + float x2 = x + x; + int32_t rounded_x2 = (int32_t) roundf (x2); + if (rounded_x2 & 1) + { + float r_x = abs_x_reduced; + + float r_x2 = r_x * r_x; + float r_x4 = r_x2 * r_x2; + + uint32_t sign = asuint (x_reduced) & 0x80000000; + r_x = asfloat (asuint (r_x) ^ sign); + + // calculate sign for half-fractional inf values + uint32_t is_finite = asuint (abs_x_reduced); + uint32_t is_odd = (rounded_x2 & 2) << 30; + uint32_t is_neg = rounded_x2 & 0x80000000; + uint32_t keep_sign = is_finite | (is_odd ^ is_neg); + offset = d->invpi / (keep_sign ? r_x : -r_x); + scale = r_x; + + result = pairwise_poly_3_f32 (r_x2, r_x4, d->cot_poly); + } + else + { + float r_x = x_reduced; + + float r_x2 = r_x * r_x; + float r_x4 = r_x2 * r_x2; + + offset = d->pi * r_x; + scale = r_x * r_x2; + + result = pw_horner_5_f32 (r_x2, r_x4, d->tan_poly); + } + + return fmaf (scale, result, offset); +} + +#if WANT_EXPERIMENTAL_MATH +float +tanpif (float x) +{ + return arm_math_tanpif (x); +} +#endif + +#if WANT_TRIGPI_TESTS +TEST_ULP (arm_math_tanpif, 2.57) +TEST_SYM_INTERVAL (arm_math_tanpif, 0, 0x1p-31f, 50000) +TEST_SYM_INTERVAL (arm_math_tanpif, 0x1p-31f, 0.5, 100000) +TEST_SYM_INTERVAL (arm_math_tanpif, 0.5, 0x1p23f, 100000) +TEST_SYM_INTERVAL (arm_math_tanpif, 0x1p23f, inf, 100000) +#endif diff --git a/pl/math/erf_data.c b/math/aarch64/v_erf_data.c similarity index 99% rename from pl/math/erf_data.c rename to math/aarch64/v_erf_data.c index 138e03578e77cf..5400d6b8d0e300 100644 --- a/pl/math/erf_data.c +++ b/math/aarch64/v_erf_data.c @@ -1,20 +1,20 @@ /* * Data for approximation of erf. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" -/* Lookup table used in erf. +/* Lookup table used in vector erf. 
For each possible rounded input r (multiples of 1/128), between r = 0.0 and r = 6.0 (769 values): - - the first entry __erff_data.tab.erf contains the values of erf(r), - - the second entry __erff_data.tab.scale contains the values of + - the first entry __v_erff_data.tab.erf contains the values of erf(r), + - the second entry __v_erff_data.tab.scale contains the values of 2/sqrt(pi)*exp(-r^2). Note that indices 0 and 1 are never hit by the algorithm, since lookup is performed only for x >= 1/64-1/512. */ -const struct erf_data __erf_data = { +const struct v_erf_data __v_erf_data = { .tab = { { 0x0.0000000000000p+0, 0x1.20dd750429b6dp+0 }, { 0x1.20dbf3deb1340p-7, 0x1.20d8f1975c85dp+0 }, { 0x1.20d77083f17a0p-6, 0x1.20cb67bd452c7p+0 }, diff --git a/pl/math/erfc_data.c b/math/aarch64/v_erfc_data.c similarity index 99% rename from pl/math/erfc_data.c rename to math/aarch64/v_erfc_data.c index 40f72a4d6d5b63..6acd96f74be5f0 100644 --- a/pl/math/erfc_data.c +++ b/math/aarch64/v_erfc_data.c @@ -1,20 +1,20 @@ /* * Data used in double-precision erfc(x) function. * - * Copyright (c) 2019-2023, Arm Limited. + * Copyright (c) 2019-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" -/* Lookup table used in erfc. +/* Lookup table used in vector erfc. For each possible rounded input r (multiples of 1/128), between r = 0.0 and r = ~27.0 (3488 values): - - the first entry __erfc_data.tab.erfc contains the values of erfc(r), - - the second entry __erfc_data.tab.scale contains the values of + - the first entry __v_erfc_data.tab.erfc contains the values of erfc(r), + - the second entry __v_erfc_data.tab.scale contains the values of 2/sqrt(pi)*exp(-r^2). Both values may go into subnormal range, therefore they are scaled by a large enough value 2^128 (fits in 8bit). 
*/ -const struct erfc_data __erfc_data = { +const struct v_erfc_data __v_erfc_data = { .tab = { { 0x1p128, 0x1.20dd750429b6dp128 }, { 0x1.fb7c9030853b3p127, 0x1.20d8f1975c85dp128 }, { 0x1.f6f9447be0743p127, 0x1.20cb67bd452c7p128 }, diff --git a/pl/math/erfcf_data.c b/math/aarch64/v_erfcf_data.c similarity index 98% rename from pl/math/erfcf_data.c rename to math/aarch64/v_erfcf_data.c index a54e11973819df..9f992b4887fb16 100644 --- a/pl/math/erfcf_data.c +++ b/math/aarch64/v_erfcf_data.c @@ -1,20 +1,20 @@ /* * Data used in single-precision erfc(x) function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" -/* Lookup table used in erfcf. +/* Lookup table used in vector erfcf. For each possible rounded input r (multiples of 1/64), between r = 0.0 and r = 10.0625 (645 values): - - the first entry __erfcf_data.tab.erfc contains the values of erfc(r), - - the second entry __erfcf_data.tab.scale contains the values of + - the first entry __v_erfcf_data.tab.erfc contains the values of erfc(r), + - the second entry __v_erfcf_data.tab.scale contains the values of 2/sqrt(pi)*exp(-r^2). Both values may go into subnormal range, therefore they are scaled by a large enough value 2^47 (fits in 8 bits). */ -const struct erfcf_data __erfcf_data = { +const struct v_erfcf_data __v_erfcf_data = { .tab = { { 0x1p47, 0x1.20dd76p47 }, { 0x1.f6f944p46, 0x1.20cb68p47 }, { 0x1.edf3aap46, 0x1.209546p47 }, diff --git a/pl/math/erff_data.c b/math/aarch64/v_erff_data.c similarity index 98% rename from pl/math/erff_data.c rename to math/aarch64/v_erff_data.c index 84c0d2e9546316..8d11d8b6c10bb6 100644 --- a/pl/math/erff_data.c +++ b/math/aarch64/v_erff_data.c @@ -1,20 +1,20 @@ /* * Data for approximation of erff. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" -/* Lookup table used in erff. +/* Lookup table used in vector erff. For each possible rounded input r (multiples of 1/128), between r = 0.0 and r = 4.0 (513 values): - - the first entry __erff_data.tab.erf contains the values of erf(r), - - the second entry __erff_data.tab.scale contains the values of + - the first entry __v_erff_data.tab.erf contains the values of erf(r), + - the second entry __v_erff_data.tab.scale contains the values of 2/sqrt(pi)*exp(-r^2). Note that indices 0 and 1 are never hit by the algorithm, since lookup is performed only for x >= 1/64-1/512. */ -const struct erff_data __erff_data = { +const struct v_erff_data __v_erff_data = { .tab = { { 0x0.000000p+0, 0x1.20dd76p+0 }, { 0x1.20dbf4p-7, 0x1.20d8f2p+0 }, { 0x1.20d770p-6, 0x1.20cb68p+0 }, diff --git a/math/aarch64/v_exp2f_1u.c b/math/aarch64/v_exp2f_1u.c deleted file mode 100644 index ba6b02fbb4bcbd..00000000000000 --- a/math/aarch64/v_exp2f_1u.c +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Single-precision vector 2^x function. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "mathlib.h" -#include "v_math.h" - -static const float Poly[] = { - /* maxerr: 0.878 ulp. */ - 0x1.416b5ep-13f, 0x1.5f082ep-10f, 0x1.3b2dep-7f, 0x1.c6af7cp-5f, 0x1.ebfbdcp-3f, 0x1.62e43p-1f -}; -#define C0 v_f32 (Poly[0]) -#define C1 v_f32 (Poly[1]) -#define C2 v_f32 (Poly[2]) -#define C3 v_f32 (Poly[3]) -#define C4 v_f32 (Poly[4]) -#define C5 v_f32 (Poly[5]) - -#define Shift v_f32 (0x1.8p23f) -#define InvLn2 v_f32 (0x1.715476p+0f) -#define Ln2hi v_f32 (0x1.62e4p-1f) -#define Ln2lo v_f32 (0x1.7f7d1cp-20f) - -static float32x4_t VPCS_ATTR NOINLINE -specialcase (float32x4_t poly, float32x4_t n, uint32x4_t e, float32x4_t absn) -{ - /* 2^n may overflow, break it up into s1*s2. 
*/ - uint32x4_t b = (n <= v_f32 (0.0f)) & v_u32 (0x83000000); - float32x4_t s1 = vreinterpretq_f32_u32 (v_u32 (0x7f000000) + b); - float32x4_t s2 = vreinterpretq_f32_u32 (e - b); - uint32x4_t cmp = absn > v_f32 (192.0f); - float32x4_t r1 = s1 * s1; - float32x4_t r0 = poly * s1 * s2; - return vreinterpretq_f32_u32 ((cmp & vreinterpretq_u32_f32 (r1)) - | (~cmp & vreinterpretq_u32_f32 (r0))); -} - -float32x4_t VPCS_ATTR -_ZGVnN4v_exp2f_1u (float32x4_t x) -{ - float32x4_t n, r, scale, poly, absn; - uint32x4_t cmp, e; - - /* exp2(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)] - x = n + r, with r in [-1/2, 1/2]. */ -#if 0 - float32x4_t z; - z = x + Shift; - n = z - Shift; - r = x - n; - e = vreinterpretq_u32_f32 (z) << 23; -#else - n = vrndaq_f32 (x); - r = x - n; - e = vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)) << 23; -#endif - scale = vreinterpretq_f32_u32 (e + v_u32 (0x3f800000)); - absn = vabsq_f32 (n); - cmp = absn > v_f32 (126.0f); - poly = vfmaq_f32 (C1, C0, r); - poly = vfmaq_f32 (C2, poly, r); - poly = vfmaq_f32 (C3, poly, r); - poly = vfmaq_f32 (C4, poly, r); - poly = vfmaq_f32 (C5, poly, r); - poly = vfmaq_f32 (v_f32 (1.0f), poly, r); - if (unlikely (v_any_u32 (cmp))) - return specialcase (poly, n, e, absn); - return scale * poly; -} diff --git a/math/aarch64/v_exp_data.c b/math/aarch64/v_exp_data.c index 45f0848cac5b5b..59db77ac58ccc8 100644 --- a/math/aarch64/v_exp_data.c +++ b/math/aarch64/v_exp_data.c @@ -1,17 +1,14 @@ /* - * Lookup table for double-precision e^x vector function. + * Scale values for vector exp and exp2 * - * Copyright (c) 2019-2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "v_math.h" +#include "math_config.h" -# define N (1 << V_EXP_TABLE_BITS) - -/* 2^(j/N), j=0..N. */ +/* 2^(j/N), j=0..N, N=2^7=128. 
*/ const uint64_t __v_exp_data[] = { -# if N == 128 0x3ff0000000000000, 0x3feff63da9fb3335, 0x3fefec9a3e778061, 0x3fefe315e86e7f85, 0x3fefd9b0d3158574, 0x3fefd06b29ddf6de, 0x3fefc74518759bc8, 0x3fefbe3ecac6f383, 0x3fefb5586cf9890f, @@ -55,92 +52,4 @@ const uint64_t __v_exp_data[] = { 0x3fefa4afa2a490da, 0x3fefaf482d8e67f1, 0x3fefba1bee615a27, 0x3fefc52b376bba97, 0x3fefd0765b6e4540, 0x3fefdbfdad9cbe14, 0x3fefe7c1819e90d8, 0x3feff3c22b8f71f1, -# elif N == 256 - 0x3ff0000000000000, 0x3feffb1afa5abcbf, 0x3feff63da9fb3335, - 0x3feff168143b0281, 0x3fefec9a3e778061, 0x3fefe7d42e11bbcc, - 0x3fefe315e86e7f85, 0x3fefde5f72f654b1, 0x3fefd9b0d3158574, - 0x3fefd50a0e3c1f89, 0x3fefd06b29ddf6de, 0x3fefcbd42b72a836, - 0x3fefc74518759bc8, 0x3fefc2bdf66607e0, 0x3fefbe3ecac6f383, - 0x3fefb9c79b1f3919, 0x3fefb5586cf9890f, 0x3fefb0f145e46c85, - 0x3fefac922b7247f7, 0x3fefa83b23395dec, 0x3fefa3ec32d3d1a2, - 0x3fef9fa55fdfa9c5, 0x3fef9b66affed31b, 0x3fef973028d7233e, - 0x3fef9301d0125b51, 0x3fef8edbab5e2ab6, 0x3fef8abdc06c31cc, - 0x3fef86a814f204ab, 0x3fef829aaea92de0, 0x3fef7e95934f312e, - 0x3fef7a98c8a58e51, 0x3fef76a45471c3c2, 0x3fef72b83c7d517b, - 0x3fef6ed48695bbc0, 0x3fef6af9388c8dea, 0x3fef672658375d2f, - 0x3fef635beb6fcb75, 0x3fef5f99f8138a1c, 0x3fef5be084045cd4, - 0x3fef582f95281c6b, 0x3fef54873168b9aa, 0x3fef50e75eb44027, - 0x3fef4d5022fcd91d, 0x3fef49c18438ce4d, 0x3fef463b88628cd6, - 0x3fef42be3578a819, 0x3fef3f49917ddc96, 0x3fef3bdda27912d1, - 0x3fef387a6e756238, 0x3fef351ffb82140a, 0x3fef31ce4fb2a63f, - 0x3fef2e85711ece75, 0x3fef2b4565e27cdd, 0x3fef280e341ddf29, - 0x3fef24dfe1f56381, 0x3fef21ba7591bb70, 0x3fef1e9df51fdee1, - 0x3fef1b8a66d10f13, 0x3fef187fd0dad990, 0x3fef157e39771b2f, - 0x3fef1285a6e4030b, 0x3fef0f961f641589, 0x3fef0cafa93e2f56, - 0x3fef09d24abd886b, 0x3fef06fe0a31b715, 0x3fef0432edeeb2fd, - 0x3fef0170fc4cd831, 0x3feefeb83ba8ea32, 0x3feefc08b26416ff, - 0x3feef96266e3fa2d, 0x3feef6c55f929ff1, 0x3feef431a2de883b, - 0x3feef1a7373aa9cb, 0x3feeef26231e754a, 
0x3feeecae6d05d866, - 0x3feeea401b7140ef, 0x3feee7db34e59ff7, 0x3feee57fbfec6cf4, - 0x3feee32dc313a8e5, 0x3feee0e544ede173, 0x3feedea64c123422, - 0x3feedc70df1c5175, 0x3feeda4504ac801c, 0x3feed822c367a024, - 0x3feed60a21f72e2a, 0x3feed3fb2709468a, 0x3feed1f5d950a897, - 0x3feecffa3f84b9d4, 0x3feece086061892d, 0x3feecc2042a7d232, - 0x3feeca41ed1d0057, 0x3feec86d668b3237, 0x3feec6a2b5c13cd0, - 0x3feec4e1e192aed2, 0x3feec32af0d7d3de, 0x3feec17dea6db7d7, - 0x3feebfdad5362a27, 0x3feebe41b817c114, 0x3feebcb299fddd0d, - 0x3feebb2d81d8abff, 0x3feeb9b2769d2ca7, 0x3feeb8417f4531ee, - 0x3feeb6daa2cf6642, 0x3feeb57de83f4eef, 0x3feeb42b569d4f82, - 0x3feeb2e2f4f6ad27, 0x3feeb1a4ca5d920f, 0x3feeb070dde910d2, - 0x3feeaf4736b527da, 0x3feeae27dbe2c4cf, 0x3feead12d497c7fd, - 0x3feeac0827ff07cc, 0x3feeab07dd485429, 0x3feeaa11fba87a03, - 0x3feea9268a5946b7, 0x3feea84590998b93, 0x3feea76f15ad2148, - 0x3feea6a320dceb71, 0x3feea5e1b976dc09, 0x3feea52ae6cdf6f4, - 0x3feea47eb03a5585, 0x3feea3dd1d1929fd, 0x3feea34634ccc320, - 0x3feea2b9febc8fb7, 0x3feea23882552225, 0x3feea1c1c70833f6, - 0x3feea155d44ca973, 0x3feea0f4b19e9538, 0x3feea09e667f3bcd, - 0x3feea052fa75173e, 0x3feea012750bdabf, 0x3fee9fdcddd47645, - 0x3fee9fb23c651a2f, 0x3fee9f9298593ae5, 0x3fee9f7df9519484, - 0x3fee9f7466f42e87, 0x3fee9f75e8ec5f74, 0x3fee9f8286ead08a, - 0x3fee9f9a48a58174, 0x3fee9fbd35d7cbfd, 0x3fee9feb564267c9, - 0x3feea024b1ab6e09, 0x3feea0694fde5d3f, 0x3feea0b938ac1cf6, - 0x3feea11473eb0187, 0x3feea17b0976cfdb, 0x3feea1ed0130c132, - 0x3feea26a62ff86f0, 0x3feea2f336cf4e62, 0x3feea3878491c491, - 0x3feea427543e1a12, 0x3feea4d2add106d9, 0x3feea589994cce13, - 0x3feea64c1eb941f7, 0x3feea71a4623c7ad, 0x3feea7f4179f5b21, - 0x3feea8d99b4492ed, 0x3feea9cad931a436, 0x3feeaac7d98a6699, - 0x3feeabd0a478580f, 0x3feeace5422aa0db, 0x3feeae05bad61778, - 0x3feeaf3216b5448c, 0x3feeb06a5e0866d9, 0x3feeb1ae99157736, - 0x3feeb2fed0282c8a, 0x3feeb45b0b91ffc6, 0x3feeb5c353aa2fe2, - 0x3feeb737b0cdc5e5, 0x3feeb8b82b5f98e5, 
0x3feeba44cbc8520f, - 0x3feebbdd9a7670b3, 0x3feebd829fde4e50, 0x3feebf33e47a22a2, - 0x3feec0f170ca07ba, 0x3feec2bb4d53fe0d, 0x3feec49182a3f090, - 0x3feec674194bb8d5, 0x3feec86319e32323, 0x3feeca5e8d07f29e, - 0x3feecc667b5de565, 0x3feece7aed8eb8bb, 0x3feed09bec4a2d33, - 0x3feed2c980460ad8, 0x3feed503b23e255d, 0x3feed74a8af46052, - 0x3feed99e1330b358, 0x3feedbfe53c12e59, 0x3feede6b5579fdbf, - 0x3feee0e521356eba, 0x3feee36bbfd3f37a, 0x3feee5ff3a3c2774, - 0x3feee89f995ad3ad, 0x3feeeb4ce622f2ff, 0x3feeee07298db666, - 0x3feef0ce6c9a8952, 0x3feef3a2b84f15fb, 0x3feef68415b749b1, - 0x3feef9728de5593a, 0x3feefc6e29f1c52a, 0x3feeff76f2fb5e47, - 0x3fef028cf22749e4, 0x3fef05b030a1064a, 0x3fef08e0b79a6f1f, - 0x3fef0c1e904bc1d2, 0x3fef0f69c3f3a207, 0x3fef12c25bd71e09, - 0x3fef16286141b33d, 0x3fef199bdd85529c, 0x3fef1d1cd9fa652c, - 0x3fef20ab5fffd07a, 0x3fef244778fafb22, 0x3fef27f12e57d14b, - 0x3fef2ba88988c933, 0x3fef2f6d9406e7b5, 0x3fef33405751c4db, - 0x3fef3720dcef9069, 0x3fef3b0f2e6d1675, 0x3fef3f0b555dc3fa, - 0x3fef43155b5bab74, 0x3fef472d4a07897c, 0x3fef4b532b08c968, - 0x3fef4f87080d89f2, 0x3fef53c8eacaa1d6, 0x3fef5818dcfba487, - 0x3fef5c76e862e6d3, 0x3fef60e316c98398, 0x3fef655d71ff6075, - 0x3fef69e603db3285, 0x3fef6e7cd63a8315, 0x3fef7321f301b460, - 0x3fef77d5641c0658, 0x3fef7c97337b9b5f, 0x3fef81676b197d17, - 0x3fef864614f5a129, 0x3fef8b333b16ee12, 0x3fef902ee78b3ff6, - 0x3fef953924676d76, 0x3fef9a51fbc74c83, 0x3fef9f7977cdb740, - 0x3fefa4afa2a490da, 0x3fefa9f4867cca6e, 0x3fefaf482d8e67f1, - 0x3fefb4aaa2188510, 0x3fefba1bee615a27, 0x3fefbf9c1cb6412a, - 0x3fefc52b376bba97, 0x3fefcac948dd7274, 0x3fefd0765b6e4540, - 0x3fefd632798844f8, 0x3fefdbfdad9cbe14, 0x3fefe1d802243c89, - 0x3fefe7c1819e90d8, 0x3fefedba3692d514, 0x3feff3c22b8f71f1, - 0x3feff9d96b2a23d9, -# endif }; diff --git a/pl/math/v_exp_tail_data.c b/math/aarch64/v_exp_tail_data.c similarity index 98% rename from pl/math/v_exp_tail_data.c rename to math/aarch64/v_exp_tail_data.c index 989dd41d949a59..5cc58a40b6b7d1 
100644 --- a/pl/math/v_exp_tail_data.c +++ b/math/aarch64/v_exp_tail_data.c @@ -1,13 +1,13 @@ /* * Lookup table for double-precision e^x vector function. * - * Copyright (c) 2019-2023, Arm Limited. + * Copyright (c) 2019-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" -/* 2^(j/N), j=0..N, N=2^8=256. Copied from math/v_exp_data.c. */ +/* 2^(j/N), j=0..N, N=2^8=256. */ const uint64_t __v_exp_tail_data[] = { 0x3ff0000000000000, 0x3feffb1afa5abcbf, 0x3feff63da9fb3335, 0x3feff168143b0281, 0x3fefec9a3e778061, 0x3fefe7d42e11bbcc, diff --git a/math/aarch64/v_expf_1u.c b/math/aarch64/v_expf_1u.c deleted file mode 100644 index 43d03fa34efab4..00000000000000 --- a/math/aarch64/v_expf_1u.c +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Single-precision vector e^x function. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "mathlib.h" -#include "v_math.h" - -static const float Poly[] = { - /* maxerr: 0.36565 +0.5 ulp. */ - 0x1.6a6000p-10f, - 0x1.12718ep-7f, - 0x1.555af0p-5f, - 0x1.555430p-3f, - 0x1.fffff4p-2f, -}; -#define C0 v_f32 (Poly[0]) -#define C1 v_f32 (Poly[1]) -#define C2 v_f32 (Poly[2]) -#define C3 v_f32 (Poly[3]) -#define C4 v_f32 (Poly[4]) - -#define Shift v_f32 (0x1.8p23f) -#define InvLn2 v_f32 (0x1.715476p+0f) -#define Ln2hi v_f32 (0x1.62e4p-1f) -#define Ln2lo v_f32 (0x1.7f7d1cp-20f) - -static float32x4_t VPCS_ATTR NOINLINE -specialcase (float32x4_t poly, float32x4_t n, uint32x4_t e, float32x4_t absn) -{ - /* 2^n may overflow, break it up into s1*s2. 
*/ - uint32x4_t b = (n <= v_f32 (0.0f)) & v_u32 (0x83000000); - float32x4_t s1 = vreinterpretq_f32_u32 (v_u32 (0x7f000000) + b); - float32x4_t s2 = vreinterpretq_f32_u32 (e - b); - uint32x4_t cmp = absn > v_f32 (192.0f); - float32x4_t r1 = s1 * s1; - float32x4_t r0 = poly * s1 * s2; - return vreinterpretq_f32_u32 ((cmp & vreinterpretq_u32_f32 (r1)) - | (~cmp & vreinterpretq_u32_f32 (r0))); -} - -float32x4_t VPCS_ATTR -_ZGVnN4v_expf_1u (float32x4_t x) -{ - float32x4_t n, r, scale, poly, absn, z; - uint32x4_t cmp, e; - - /* exp(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)] - x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ -#if 1 - z = vfmaq_f32 (Shift, x, InvLn2); - n = z - Shift; - r = vfmaq_f32 (x, n, -Ln2hi); - r = vfmaq_f32 (r, n, -Ln2lo); - e = vreinterpretq_u32_f32 (z) << 23; -#else - z = x * InvLn2; - n = vrndaq_f32 (z); - r = vfmaq_f32 (x, n, -Ln2hi); - r = vfmaq_f32 (r, n, -Ln2lo); - e = vreinterpretq_u32_s32 (vcvtaq_s32_f32 (z)) << 23; -#endif - scale = vreinterpretq_f32_u32 (e + v_u32 (0x3f800000)); - absn = vabsq_f32 (n); - cmp = absn > v_f32 (126.0f); - poly = vfmaq_f32 (C1, C0, r); - poly = vfmaq_f32 (C2, poly, r); - poly = vfmaq_f32 (C3, poly, r); - poly = vfmaq_f32 (C4, poly, r); - poly = vfmaq_f32 (v_f32 (1.0f), poly, r); - poly = vfmaq_f32 (v_f32 (1.0f), poly, r); - if (unlikely (v_any_u32 (cmp))) - return specialcase (poly, n, e, absn); - return scale * poly; -} diff --git a/math/aarch64/v_log.c b/math/aarch64/v_log.c deleted file mode 100644 index 1d1c1fa62c0423..00000000000000 --- a/math/aarch64/v_log.c +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Double-precision vector log(x) function. - * - * Copyright (c) 2019-2023, Arm Limited. 
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "mathlib.h" -#include "v_math.h" - -static const struct data -{ - uint64x2_t min_norm; - uint32x4_t special_bound; - float64x2_t poly[5]; - float64x2_t ln2; - uint64x2_t sign_exp_mask; -} data = { - /* Worst-case error: 1.17 + 0.5 ulp. - Rel error: 0x1.6272e588p-56 in [ -0x1.fc1p-9 0x1.009p-8 ]. */ - .poly = { V2 (-0x1.ffffffffffff7p-2), V2 (0x1.55555555170d4p-2), - V2 (-0x1.0000000399c27p-2), V2 (0x1.999b2e90e94cap-3), - V2 (-0x1.554e550bd501ep-3) }, - .ln2 = V2 (0x1.62e42fefa39efp-1), - .min_norm = V2 (0x0010000000000000), - .special_bound = V4 (0x7fe00000), /* asuint64(inf) - min_norm. */ - .sign_exp_mask = V2 (0xfff0000000000000) -}; - -#define A(i) d->poly[i] -#define N (1 << V_LOG_TABLE_BITS) -#define IndexMask (N - 1) -#define Off v_u64 (0x3fe6900900000000) - -struct entry -{ - float64x2_t invc; - float64x2_t logc; -}; - -static inline struct entry -lookup (uint64x2_t i) -{ - /* Since N is a power of 2, n % N = n & (N - 1). */ - struct entry e; - uint64_t i0 = (i[0] >> (52 - V_LOG_TABLE_BITS)) & IndexMask; - uint64_t i1 = (i[1] >> (52 - V_LOG_TABLE_BITS)) & IndexMask; - float64x2_t e0 = vld1q_f64 (&__v_log_data.table[i0].invc); - float64x2_t e1 = vld1q_f64 (&__v_log_data.table[i1].invc); - e.invc = vuzp1q_f64 (e0, e1); - e.logc = vuzp2q_f64 (e0, e1); - return e; -} - -static float64x2_t VPCS_ATTR NOINLINE -special_case (float64x2_t x, float64x2_t y, float64x2_t hi, float64x2_t r2, - uint32x2_t cmp) -{ - return v_call_f64 (log, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (cmp)); -} - -float64x2_t VPCS_ATTR V_NAME_D1 (log) (float64x2_t x) -{ - const struct data *d = ptr_barrier (&data); - float64x2_t z, r, r2, p, y, kd, hi; - uint64x2_t ix, iz, tmp; - uint32x2_t cmp; - int64x2_t k; - struct entry e; - - ix = vreinterpretq_u64_f64 (x); - cmp = vcge_u32 (vsubhn_u64 (ix, d->min_norm), - vget_low_u32 (d->special_bound)); - - /* x = 2^k z; where z is in range [Off,2*Off) and exact. 
- The range is split into N subintervals. - The ith subinterval contains z and c is near its center. */ - tmp = vsubq_u64 (ix, Off); - k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); /* arithmetic shift. */ - iz = vsubq_u64 (ix, vandq_u64 (tmp, d->sign_exp_mask)); - z = vreinterpretq_f64_u64 (iz); - e = lookup (tmp); - - /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */ - r = vfmaq_f64 (v_f64 (-1.0), z, e.invc); - kd = vcvtq_f64_s64 (k); - - /* hi = r + log(c) + k*Ln2. */ - hi = vfmaq_f64 (vaddq_f64 (e.logc, r), kd, d->ln2); - /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */ - r2 = vmulq_f64 (r, r); - y = vfmaq_f64 (A (2), A (3), r); - p = vfmaq_f64 (A (0), A (1), r); - y = vfmaq_f64 (y, A (4), r2); - y = vfmaq_f64 (p, y, r2); - - if (unlikely (v_any_u32h (cmp))) - return special_case (x, y, hi, r2, cmp); - return vfmaq_f64 (hi, y, r2); -} diff --git a/pl/math/v_log10_data.c b/math/aarch64/v_log10_data.c similarity index 99% rename from pl/math/v_log10_data.c rename to math/aarch64/v_log10_data.c index d9a624dab9ce11..bae2685822f6d0 100644 --- a/pl/math/v_log10_data.c +++ b/math/aarch64/v_log10_data.c @@ -1,7 +1,7 @@ /* * Lookup table for double-precision log10(x) vector function. * - * Copyright (c) 2022-2023, Arm Limited. + * Copyright (c) 2022-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/v_log2_data.c b/math/aarch64/v_log2_data.c similarity index 99% rename from pl/math/v_log2_data.c rename to math/aarch64/v_log2_data.c index 50697daff925ae..fad91d654da8e0 100644 --- a/pl/math/v_log2_data.c +++ b/math/aarch64/v_log2_data.c @@ -1,7 +1,7 @@ /* * Coefficients and table entries for vector log2 * - * Copyright (c) 2022-2023, Arm Limited. + * Copyright (c) 2022-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/math/aarch64/v_log_data.c b/math/aarch64/v_log_data.c index 82351bb14766f2..4f0e6e16738112 100644 --- a/math/aarch64/v_log_data.c +++ b/math/aarch64/v_log_data.c @@ -1,30 +1,35 @@ /* * Lookup table for double-precision log(x) vector function. * - * Copyright (c) 2019-2023, Arm Limited. + * Copyright (c) 2019-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "v_math.h" - -#define N (1 << V_LOG_TABLE_BITS) +#include "math_config.h" const struct v_log_data __v_log_data = { + /* Worst-case error: 1.17 + 0.5 ulp. + Rel error: 0x1.6272e588p-56 in [ -0x1.fc1p-9 0x1.009p-8 ]. */ + .poly = { -0x1.ffffffffffff7p-2, 0x1.55555555170d4p-2, -0x1.0000000399c27p-2, + 0x1.999b2e90e94cap-3, -0x1.554e550bd501ep-3 }, + .ln2 = 0x1.62e42fefa39efp-1, /* Algorithm: x = 2^k z log(x) = k ln2 + log(c) + poly(z/c - 1) - where z is in [a;2a) which is split into N subintervals (a=0x1.69009p-1, - N=128) and log(c) and 1/c for the ith subinterval comes from lookup tables: + where z is in [a;2a) which is split into N subintervals (a=0x1.69009p-1, + N=128) and log(c) and 1/c for the ith subinterval comes from two lookup + tables: table[i].invc = 1/c table[i].logc = (double)log(c) - where c is near the center of the subinterval and is chosen by trying several - floating point invc candidates around 1/center and selecting one for which - the error in (double)log(c) is minimized (< 0x1p-74), except the subinterval - that contains 1 and the previous one got tweaked to avoid cancellation. */ + where c is near the center of the subinterval and is chosen by trying + several floating point invc candidates around 1/center and selecting one + for which the error in (double)log(c) is minimized (< 0x1p-74), except the + subinterval that contains 1 and the previous one got tweaked to avoid + cancellation. 
*/ .table = { { 0x1.6a133d0dec120p+0, -0x1.62fe995eb963ap-2 }, { 0x1.6815f2f3e42edp+0, -0x1.5d5a48dad6b67p-2 }, { 0x1.661e39be1ac9ep+0, -0x1.57bde257d2769p-2 }, diff --git a/math/aarch64/v_logf.c b/math/aarch64/v_logf.c deleted file mode 100644 index 66ebbbcd2b5a84..00000000000000 --- a/math/aarch64/v_logf.c +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Single-precision vector log function. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "mathlib.h" -#include "v_math.h" - -static const struct data -{ - uint32x4_t min_norm; - uint16x8_t special_bound; - float32x4_t poly[7]; - float32x4_t ln2, tiny_bound; - uint32x4_t off, mantissa_mask; -} data = { - /* 3.34 ulp error. */ - .poly = { V4 (-0x1.3e737cp-3f), V4 (0x1.5a9aa2p-3f), V4 (-0x1.4f9934p-3f), - V4 (0x1.961348p-3f), V4 (-0x1.00187cp-2f), V4 (0x1.555d7cp-2f), - V4 (-0x1.ffffc8p-2f) }, - .ln2 = V4 (0x1.62e43p-1f), - .tiny_bound = V4 (0x1p-126), - .min_norm = V4 (0x00800000), - .special_bound = V8 (0x7f00), /* asuint32(inf) - min_norm. */ - .off = V4 (0x3f2aaaab), /* 0.666667. */ - .mantissa_mask = V4 (0x007fffff) -}; - -#define P(i) d->poly[7 - i] - -static float32x4_t VPCS_ATTR NOINLINE -special_case (float32x4_t x, float32x4_t y, float32x4_t r2, float32x4_t p, - uint16x4_t cmp) -{ - /* Fall back to scalar code. */ - return v_call_f32 (logf, x, vfmaq_f32 (p, y, r2), vmovl_u16 (cmp)); -} - -float32x4_t VPCS_ATTR V_NAME_F1 (log) (float32x4_t x) -{ - const struct data *d = ptr_barrier (&data); - float32x4_t n, p, q, r, r2, y; - uint32x4_t u; - uint16x4_t cmp; - - u = vreinterpretq_u32_f32 (x); - cmp = vcge_u16 (vsubhn_u32 (u, d->min_norm), - vget_low_u16 (d->special_bound)); - - /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ - u = vsubq_u32 (u, d->off); - n = vcvtq_f32_s32 ( - vshrq_n_s32 (vreinterpretq_s32_u32 (u), 23)); /* signextend. 
*/ - u = vandq_u32 (u, d->mantissa_mask); - u = vaddq_u32 (u, d->off); - r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f)); - - /* y = log(1+r) + n*ln2. */ - r2 = vmulq_f32 (r, r); - /* n*ln2 + r + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + r2*P7))). */ - p = vfmaq_f32 (P (5), P (6), r); - q = vfmaq_f32 (P (3), P (4), r); - y = vfmaq_f32 (P (1), P (2), r); - p = vfmaq_f32 (p, P (7), r2); - q = vfmaq_f32 (q, p, r2); - y = vfmaq_f32 (y, q, r2); - p = vfmaq_f32 (r, d->ln2, n); - - if (unlikely (v_any_u16h (cmp))) - return special_case (x, y, r2, p, cmp); - return vfmaq_f32 (p, y, r2); -} diff --git a/math/aarch64/v_math.h b/math/aarch64/v_math.h deleted file mode 100644 index 1dc9916c6fb076..00000000000000 --- a/math/aarch64/v_math.h +++ /dev/null @@ -1,135 +0,0 @@ -/* - * Vector math abstractions. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#ifndef _V_MATH_H -#define _V_MATH_H - -#if !__aarch64__ -# error "Cannot build without AArch64" -#endif - -#define VPCS_ATTR __attribute__ ((aarch64_vector_pcs)) - -#define V_NAME_F1(fun) _ZGVnN4v_##fun##f -#define V_NAME_D1(fun) _ZGVnN2v_##fun -#define V_NAME_F2(fun) _ZGVnN4vv_##fun##f -#define V_NAME_D2(fun) _ZGVnN2vv_##fun - -#include -#include "../math_config.h" -#include - -/* Shorthand helpers for declaring constants. */ -# define V2(X) { X, X } -# define V4(X) { X, X, X, X } -# define V8(X) { X, X, X, X, X, X, X, X } - -static inline int -v_any_u16h (uint16x4_t x) -{ - return vget_lane_u64 (vreinterpret_u64_u16 (x), 0) != 0; -} - -static inline int -v_lanes32 (void) -{ - return 4; -} - -static inline float32x4_t -v_f32 (float x) -{ - return (float32x4_t) V4 (x); -} -static inline uint32x4_t -v_u32 (uint32_t x) -{ - return (uint32x4_t) V4 (x); -} -/* true if any elements of a v_cond result is non-zero. */ -static inline int -v_any_u32 (uint32x4_t x) -{ - /* assume elements in x are either 0 or -1u. 
*/ - return vpaddd_u64 (vreinterpretq_u64_u32 (x)) != 0; -} -static inline int -v_any_u32h (uint32x2_t x) -{ - return vget_lane_u64 (vreinterpret_u64_u32 (x), 0) != 0; -} -static inline float32x4_t -v_lookup_f32 (const float *tab, uint32x4_t idx) -{ - return (float32x4_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]}; -} -static inline uint32x4_t -v_lookup_u32 (const uint32_t *tab, uint32x4_t idx) -{ - return (uint32x4_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]}; -} -static inline float32x4_t -v_call_f32 (float (*f) (float), float32x4_t x, float32x4_t y, uint32x4_t p) -{ - return (float32x4_t){p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1], - p[2] ? f (x[2]) : y[2], p[3] ? f (x[3]) : y[3]}; -} -static inline float32x4_t -v_call2_f32 (float (*f) (float, float), float32x4_t x1, float32x4_t x2, - float32x4_t y, uint32x4_t p) -{ - return (float32x4_t){p[0] ? f (x1[0], x2[0]) : y[0], - p[1] ? f (x1[1], x2[1]) : y[1], - p[2] ? f (x1[2], x2[2]) : y[2], - p[3] ? f (x1[3], x2[3]) : y[3]}; -} - -static inline int -v_lanes64 (void) -{ - return 2; -} -static inline float64x2_t -v_f64 (double x) -{ - return (float64x2_t) V2 (x); -} -static inline uint64x2_t -v_u64 (uint64_t x) -{ - return (uint64x2_t) V2 (x); -} -/* true if any elements of a v_cond result is non-zero. */ -static inline int -v_any_u64 (uint64x2_t x) -{ - /* assume elements in x are either 0 or -1u. 
*/ - return vpaddd_u64 (x) != 0; -} -static inline float64x2_t -v_lookup_f64 (const double *tab, uint64x2_t idx) -{ - return (float64x2_t){tab[idx[0]], tab[idx[1]]}; -} -static inline uint64x2_t -v_lookup_u64 (const uint64_t *tab, uint64x2_t idx) -{ - return (uint64x2_t){tab[idx[0]], tab[idx[1]]}; -} -static inline float64x2_t -v_call_f64 (double (*f) (double), float64x2_t x, float64x2_t y, uint64x2_t p) -{ - double p1 = p[1]; - double x1 = x[1]; - if (likely (p[0])) - y[0] = f (x[0]); - if (likely (p1)) - y[1] = f (x1); - return y; -} - -#endif diff --git a/math/aarch64/v_pow.c b/math/aarch64/v_pow.c deleted file mode 100644 index 734f1663a283d4..00000000000000 --- a/math/aarch64/v_pow.c +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Double-precision vector pow function. - * - * Copyright (c) 2020-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "mathlib.h" -#include "v_math.h" - -float64x2_t VPCS_ATTR V_NAME_D2 (pow) (float64x2_t x, float64x2_t y) -{ - float64x2_t z; - for (int lane = 0; lane < v_lanes64 (); lane++) - { - double sx = x[lane]; - double sy = y[lane]; - double sz = pow (sx, sy); - z[lane] = sz; - } - return z; -} diff --git a/pl/math/v_pow_exp_data.c b/math/aarch64/v_pow_exp_data.c similarity index 99% rename from pl/math/v_pow_exp_data.c rename to math/aarch64/v_pow_exp_data.c index 5d921ef648a48a..db615ce94bd7c8 100644 --- a/pl/math/v_pow_exp_data.c +++ b/math/aarch64/v_pow_exp_data.c @@ -1,7 +1,7 @@ /* * Shared data between exp, exp2 and pow. * - * Copyright (c) 2018-2023, Arm Limited. + * Copyright (c) 2018-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/v_pow_log_data.c b/math/aarch64/v_pow_log_data.c similarity index 99% rename from pl/math/v_pow_log_data.c rename to math/aarch64/v_pow_log_data.c index 036faa5c97c1d2..7df277f74e4f4c 100644 --- a/pl/math/v_pow_log_data.c +++ b/math/aarch64/v_pow_log_data.c @@ -1,7 +1,7 @@ /* * Data for the log part of pow. * - * Copyright (c) 2018-2023, Arm Limited. + * Copyright (c) 2018-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/math/aarch64/v_powf.c b/math/aarch64/v_powf.c deleted file mode 100644 index 3a4163ab05582b..00000000000000 --- a/math/aarch64/v_powf.c +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Single-precision vector powf function. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "v_math.h" - -#define Min v_u32 (0x00800000) -#define Max v_u32 (0x7f800000) -#define Thresh v_u32 (0x7f000000) /* Max - Min. */ -#define MantissaMask v_u32 (0x007fffff) - -#define A data.log2_poly -#define C data.exp2f_poly - -/* 2.6 ulp ~ 0.5 + 2^24 (128*Ln2*relerr_log2 + relerr_exp2). 
*/ -#define Off v_u32 (0x3f35d000) - -#define V_POWF_LOG2_TABLE_BITS 5 -#define V_EXP2F_TABLE_BITS 5 -#define Log2IdxMask v_u32 ((1 << V_POWF_LOG2_TABLE_BITS) - 1) -#define Scale ((double) (1 << V_EXP2F_TABLE_BITS)) - -static const struct -{ - struct - { - double invc, logc; - } log2_tab[1 << V_POWF_LOG2_TABLE_BITS]; - double log2_poly[4]; - uint64_t exp2f_tab[1 << V_EXP2F_TABLE_BITS]; - double exp2f_poly[3]; -} data = { - .log2_tab = {{0x1.6489890582816p+0, -0x1.e960f97b22702p-2 * Scale}, - {0x1.5cf19b35e3472p+0, -0x1.c993406cd4db6p-2 * Scale}, - {0x1.55aac0e956d65p+0, -0x1.aa711d9a7d0f3p-2 * Scale}, - {0x1.4eb0022977e01p+0, -0x1.8bf37bacdce9bp-2 * Scale}, - {0x1.47fcccda1dd1fp+0, -0x1.6e13b3519946ep-2 * Scale}, - {0x1.418ceabab68c1p+0, -0x1.50cb8281e4089p-2 * Scale}, - {0x1.3b5c788f1edb3p+0, -0x1.341504a237e2bp-2 * Scale}, - {0x1.3567de48e9c9ap+0, -0x1.17eaab624ffbbp-2 * Scale}, - {0x1.2fabc80fd19bap+0, -0x1.f88e708f8c853p-3 * Scale}, - {0x1.2a25200ce536bp+0, -0x1.c24b6da113914p-3 * Scale}, - {0x1.24d108e0152e3p+0, -0x1.8d02ee397cb1dp-3 * Scale}, - {0x1.1facd8ab2fbe1p+0, -0x1.58ac1223408b3p-3 * Scale}, - {0x1.1ab614a03efdfp+0, -0x1.253e6fd190e89p-3 * Scale}, - {0x1.15ea6d03af9ffp+0, -0x1.e5641882c12ffp-4 * Scale}, - {0x1.1147b994bb776p+0, -0x1.81fea712926f7p-4 * Scale}, - {0x1.0ccbf650593aap+0, -0x1.203e240de64a3p-4 * Scale}, - {0x1.0875408477302p+0, -0x1.8029b86a78281p-5 * Scale}, - {0x1.0441d42a93328p+0, -0x1.85d713190fb9p-6 * Scale}, - {0x1p+0, 0x0p+0 * Scale}, - {0x1.f1d006c855e86p-1, 0x1.4c1cc07312997p-5 * Scale}, - {0x1.e28c3341aa301p-1, 0x1.5e1848ccec948p-4 * Scale}, - {0x1.d4bdf9aa64747p-1, 0x1.04cfcb7f1196fp-3 * Scale}, - {0x1.c7b45a24e5803p-1, 0x1.582813d463c21p-3 * Scale}, - {0x1.bb5f5eb2ed60ap-1, 0x1.a936fa68760ccp-3 * Scale}, - {0x1.afb0bff8fe6b4p-1, 0x1.f81bc31d6cc4ep-3 * Scale}, - {0x1.a49badf7ab1f5p-1, 0x1.2279a09fae6b1p-2 * Scale}, - {0x1.9a14a111fc4c9p-1, 0x1.47ec0b6df5526p-2 * Scale}, - {0x1.901131f5b2fdcp-1, 0x1.6c71762280f1p-2 * Scale}, - 
{0x1.8687f73f6d865p-1, 0x1.90155070798dap-2 * Scale}, - {0x1.7d7067eb77986p-1, 0x1.b2e23b1d3068cp-2 * Scale}, - {0x1.74c2c1cf97b65p-1, 0x1.d4e21b0daa86ap-2 * Scale}, - {0x1.6c77f37cff2a1p-1, 0x1.f61e2a2f67f3fp-2 * Scale},}, - .log2_poly = { /* rel err: 1.5 * 2^-30. */ - -0x1.6ff5daa3b3d7cp-2 * Scale, 0x1.ec81d03c01aebp-2 * Scale, - -0x1.71547bb43f101p-1 * Scale, 0x1.7154764a815cbp0 * Scale,}, - .exp2f_tab = {0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f, - 0x3fef9301d0125b51, 0x3fef72b83c7d517b, 0x3fef54873168b9aa, - 0x3fef387a6e756238, 0x3fef1e9df51fdee1, 0x3fef06fe0a31b715, - 0x3feef1a7373aa9cb, 0x3feedea64c123422, 0x3feece086061892d, - 0x3feebfdad5362a27, 0x3feeb42b569d4f82, 0x3feeab07dd485429, - 0x3feea47eb03a5585, 0x3feea09e667f3bcd, 0x3fee9f75e8ec5f74, - 0x3feea11473eb0187, 0x3feea589994cce13, 0x3feeace5422aa0db, - 0x3feeb737b0cdc5e5, 0x3feec49182a3f090, 0x3feed503b23e255d, - 0x3feee89f995ad3ad, 0x3feeff76f2fb5e47, 0x3fef199bdd85529c, - 0x3fef3720dcef9069, 0x3fef5818dcfba487, 0x3fef7c97337b9b5f, - 0x3fefa4afa2a490da, 0x3fefd0765b6e4540,}, - .exp2f_poly = { /* rel err: 1.69 * 2^-34. */ - 0x1.c6af84b912394p-5 / Scale / Scale / Scale, - 0x1.ebfce50fac4f3p-3 / Scale / Scale, - 0x1.62e42ff0c52d6p-1 / Scale}}; - -static float32x4_t VPCS_ATTR NOINLINE -special_case (float32x4_t x, float32x4_t y, float32x4_t ret, uint32x4_t cmp) -{ - return v_call2_f32 (powf, x, y, ret, cmp); -} - -float32x4_t VPCS_ATTR V_NAME_F2 (pow) (float32x4_t x, float32x4_t y) -{ - uint32x4_t u = vreinterpretq_u32_f32 (x); - uint32x4_t cmp = vcgeq_u32 (vsubq_u32 (u, Min), Thresh); - uint32x4_t tmp = vsubq_u32 (u, Off); - uint32x4_t i = vandq_u32 (vshrq_n_u32 (tmp, (23 - V_POWF_LOG2_TABLE_BITS)), - Log2IdxMask); - uint32x4_t top = vbicq_u32 (tmp, MantissaMask); - uint32x4_t iz = vsubq_u32 (u, top); - int32x4_t k = vshrq_n_s32 (vreinterpretq_s32_u32 (top), - 23 - V_EXP2F_TABLE_BITS); /* arithmetic shift. 
*/ - - float32x4_t ret; - for (int lane = 0; lane < 4; lane++) - { - /* Use double precision for each lane. */ - double invc = data.log2_tab[i[lane]].invc; - double logc = data.log2_tab[i[lane]].logc; - double z = (double) asfloat (iz[lane]); - - /* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k. */ - double r = __builtin_fma (z, invc, -1.0); - double y0 = logc + (double) k[lane]; - - /* Polynomial to approximate log1p(r)/ln2. */ - double logx = A[0]; - logx = r * logx + A[1]; - logx = r * logx + A[2]; - logx = r * logx + A[3]; - logx = r * logx + y0; - double ylogx = y[lane] * logx; - cmp[lane] = (asuint64 (ylogx) >> 47 & 0xffff) - >= asuint64 (126.0 * (1 << V_EXP2F_TABLE_BITS)) >> 47 - ? 1 - : cmp[lane]; - - /* N*x = k + r with r in [-1/2, 1/2]. */ - double kd = round (ylogx); - uint64_t ki = lround (ylogx); - r = ylogx - kd; - - /* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1). */ - uint64_t t = data.exp2f_tab[ki % (1 << V_EXP2F_TABLE_BITS)]; - t += ki << (52 - V_EXP2F_TABLE_BITS); - double s = asdouble (t); - double p = C[0]; - p = __builtin_fma (p, r, C[1]); - p = __builtin_fma (p, r, C[2]); - p = __builtin_fma (p, s * r, s); - - ret[lane] = p; - } - if (unlikely (v_any_u32 (cmp))) - return special_case (x, y, ret, cmp); - return ret; -} diff --git a/pl/math/v_powf_data.c b/math/aarch64/v_powf_data.c similarity index 98% rename from pl/math/v_powf_data.c rename to math/aarch64/v_powf_data.c index ded211924b8047..5cf1b876941450 100644 --- a/pl/math/v_powf_data.c +++ b/math/aarch64/v_powf_data.c @@ -1,7 +1,7 @@ /* * Coefficients for single-precision SVE pow(x) function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/math/cosf.c b/math/cosf.c index 6293ce8f1b7d6b..a9b1f9da16ed55 100644 --- a/math/cosf.c +++ b/math/cosf.c @@ -1,7 +1,7 @@ /* * Single-precision cos function. * - * Copyright (c) 2018-2021, Arm Limited. 
+ * Copyright (c) 2018-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ @@ -9,6 +9,8 @@ #include #include "math_config.h" #include "sincosf.h" +#include "test_defs.h" +#include "test_sig.h" /* Fast cosf implementation. Worst-case ULP is 0.5607, maximum relative error is 0.5303 * 2^-23. A single-step range reduction is used for @@ -61,3 +63,9 @@ cosf (float y) else return __math_invalidf (y); } + +TEST_SIG (S, F, 1, cos, -3.1, 3.1) +TEST_ULP (cosf, 0.06) +TEST_ULP_NONNEAREST (cosf, 0.5) +TEST_INTERVAL (cosf, 0, 0xffff0000, 10000) +TEST_SYM_INTERVAL (cosf, 0x1p-14, 0x1p54, 50000) diff --git a/math/erf.c b/math/erf.c index 5f9f40dda26434..2c93a304346a1f 100644 --- a/math/erf.c +++ b/math/erf.c @@ -1,13 +1,15 @@ /* * Double-precision erf(x) function. * - * Copyright (c) 2020, Arm Limited. + * Copyright (c) 2020-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" #include #include +#include "test_defs.h" +#include "test_sig.h" #define TwoOverSqrtPiMinusOne 0x1.06eba8214db69p-3 #define C 0x1.b0ac16p-1 @@ -242,3 +244,11 @@ erf (double x) return 1.0; } } + +TEST_SIG (S, D, 1, erf, -6.0, 6.0) +TEST_ULP (erf, 0.51) +TEST_ULP_NONNEAREST (erf, 0.9) +TEST_INTERVAL (erf, 0, 0xffff000000000000, 10000) +TEST_SYM_INTERVAL (erf, 0x1p-1022, 0x1p-26, 40000) +TEST_SYM_INTERVAL (erf, 0x1p-26, 0x1p3, 40000) +TEST_INTERVAL (erf, 0, inf, 40000) diff --git a/math/erff.c b/math/erff.c index 9fa476dbbab2d7..fd64f40a2d22c9 100644 --- a/math/erff.c +++ b/math/erff.c @@ -1,13 +1,15 @@ /* * Single-precision erf(x) function. * - * Copyright (c) 2020, Arm Limited. + * Copyright (c) 2020-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include #include #include "math_config.h" +#include "test_defs.h" +#include "test_sig.h" #define TwoOverSqrtPiMinusOne 0x1.06eba8p-3f #define A __erff_data.erff_poly_A @@ -102,3 +104,11 @@ erff (float x) } return r; } + +TEST_SIG (S, F, 1, erf, -6.0, 6.0) +TEST_ULP (erff, 0.6) +TEST_ULP_NONNEAREST (erff, 0.9) +TEST_INTERVAL (erff, 0, 0xffff0000, 10000) +TEST_SYM_INTERVAL (erff, 0x1p-127, 0x1p-26, 40000) +TEST_SYM_INTERVAL (erff, 0x1p-26, 0x1p3, 40000) +TEST_INTERVAL (erff, 0, inf, 40000) diff --git a/math/exp.c b/math/exp.c index 1de500c31f3ed0..3b08d44688a803 100644 --- a/math/exp.c +++ b/math/exp.c @@ -1,7 +1,7 @@ /* * Double-precision e^x function. * - * Copyright (c) 2018-2019, Arm Limited. + * Copyright (c) 2018-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ @@ -9,6 +9,8 @@ #include #include #include "math_config.h" +#include "test_defs.h" +#include "test_sig.h" #define N (1 << EXP_TABLE_BITS) #define InvLn2N __exp_data.invln2N @@ -77,7 +79,7 @@ top12 (double x) /* Computes exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|. If hastail is 0 then xtail is assumed to be 0 too. */ static inline double -exp_inline (double x, double xtail, int hastail) +exp_inline (double x, double xtail) { uint32_t abstop; uint64_t ki, idx, top, sbits; @@ -125,7 +127,7 @@ exp_inline (double x, double xtail, int hastail) #endif r = x + kd * NegLn2hiN + kd * NegLn2loN; /* The code assumes 2^-200 < |xtail| < 2^-8/N. */ - if (hastail) + if (!__builtin_constant_p (xtail) || xtail != 0.0) r += xtail; /* 2^(k/N) ~= scale * (1 + tail). */ idx = 2 * (ki % N); @@ -156,21 +158,20 @@ exp_inline (double x, double xtail, int hastail) double exp (double x) { - return exp_inline (x, 0, 0); + return exp_inline (x, 0); } -/* May be useful for implementing pow where more than double - precision input is needed. 
*/ -double -__exp_dd (double x, double xtail) -{ - return exp_inline (x, xtail, 1); -} #if USE_GLIBC_ABI strong_alias (exp, __exp_finite) hidden_alias (exp, __ieee754_exp) -hidden_alias (__exp_dd, __exp1) # if LDBL_MANT_DIG == 53 long double expl (long double x) { return exp (x); } # endif #endif + +TEST_SIG (S, D, 1, exp, -9.9, 9.9) +TEST_ULP (exp, 0.01) +TEST_ULP_NONNEAREST (exp, 0.5) +TEST_INTERVAL (exp, 0, 0xffff000000000000, 10000) +TEST_SYM_INTERVAL (exp, 0x1p-6, 0x1p6, 400000) +TEST_SYM_INTERVAL (exp, 633.3, 733.3, 10000) diff --git a/math/exp10.c b/math/exp10.c index 0fbec4c694ca83..de8ece42e09e63 100644 --- a/math/exp10.c +++ b/math/exp10.c @@ -1,11 +1,13 @@ /* * Double-precision 10^x function. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" +#include "test_defs.h" +#include "test_sig.h" #define N (1 << EXP_TABLE_BITS) #define IndexMask (N - 1) @@ -22,7 +24,7 @@ special_case (uint64_t sbits, double_t tmp, uint64_t ki) { double_t scale, y; - if (ki - (1ull << 16) < 0x80000000) + if ((ki & 0x80000000) == 0) { /* The exponent of scale might have overflowed by 1. */ sbits -= 1ull << 52; @@ -84,14 +86,14 @@ exp10 (double x) /* Reduce x: z = x * N / log10(2), k = round(z). */ double_t z = __exp_data.invlog10_2N * x; double_t kd; - int64_t ki; + uint64_t ki; #if TOINT_INTRINSICS kd = roundtoint (z); ki = converttoint (z); #else kd = eval_as_double (z + Shift); + ki = asuint64 (kd); kd -= Shift; - ki = kd; #endif /* r = x - k * log10(2), r in [-0.5, 0.5]. 
*/ @@ -127,3 +129,15 @@ exp10 (double x) double_t s = asdouble (sbits); return eval_as_double (s * y + s); } + +#if WANT_EXP10_TESTS +TEST_SIG (S, D, 1, exp10, -9.9, 9.9) +TEST_ULP (exp10, 0.02) +TEST_ULP_NONNEAREST (exp10, 0.5) +TEST_SYM_INTERVAL (exp10, 0, 0x1p-47, 5000) +TEST_SYM_INTERVAL (exp10, 0x1p47, 1, 50000) +TEST_INTERVAL (exp10, 1, OFlowBound, 50000) +TEST_INTERVAL (exp10, -1, UFlowBound, 50000) +TEST_INTERVAL (exp10, OFlowBound, inf, 5000) +TEST_INTERVAL (exp10, UFlowBound, -inf, 5000) +#endif diff --git a/math/exp2.c b/math/exp2.c index a1eee44f1f4828..f26ac3cda2ccbe 100644 --- a/math/exp2.c +++ b/math/exp2.c @@ -1,7 +1,7 @@ /* * Double-precision 2^x function. * - * Copyright (c) 2018-2019, Arm Limited. + * Copyright (c) 2018-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ @@ -9,6 +9,8 @@ #include #include #include "math_config.h" +#include "test_defs.h" +#include "test_sig.h" #define N (1 << EXP_TABLE_BITS) #define Shift __exp_data.exp2_shift @@ -141,3 +143,10 @@ hidden_alias (exp2, __ieee754_exp2) long double exp2l (long double x) { return exp2 (x); } # endif #endif + +TEST_SIG (S, D, 1, exp2, -9.9, 9.9) +TEST_ULP (exp2, 0.01) +TEST_ULP_NONNEAREST (exp2, 0.5) +TEST_INTERVAL (exp2, 0, 0xffff000000000000, 10000) +TEST_SYM_INTERVAL (exp2, 0x1p-6, 0x1p6, 40000) +TEST_SYM_INTERVAL (exp2, 633.3, 733.3, 10000) diff --git a/math/exp2f.c b/math/exp2f.c index 776c3ddf76636a..3202f41377adce 100644 --- a/math/exp2f.c +++ b/math/exp2f.c @@ -1,13 +1,15 @@ /* * Single-precision 2^x function. * - * Copyright (c) 2017-2018, Arm Limited. + * Copyright (c) 2017-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include #include #include "math_config.h" +#include "test_defs.h" +#include "test_sig.h" /* EXP2F_TABLE_BITS = 5 @@ -78,3 +80,9 @@ exp2f (float x) strong_alias (exp2f, __exp2f_finite) hidden_alias (exp2f, __ieee754_exp2f) #endif + +TEST_SIG (S, F, 1, exp2, -9.9, 9.9) +TEST_ULP (exp2f, 0.01) +TEST_ULP_NONNEAREST (exp2f, 0.5) +TEST_INTERVAL (exp2f, 0, 0xffff0000, 10000) +TEST_SYM_INTERVAL (exp2f, 0x1p-14, 0x1p8, 50000) diff --git a/math/expf.c b/math/expf.c index 08a20d59e49145..6572b99a1e68a7 100644 --- a/math/expf.c +++ b/math/expf.c @@ -1,13 +1,15 @@ /* * Single-precision e^x function. * - * Copyright (c) 2017-2019, Arm Limited. + * Copyright (c) 2017-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include #include #include "math_config.h" +#include "test_defs.h" +#include "test_sig.h" /* EXP2F_TABLE_BITS = 5 @@ -89,3 +91,9 @@ expf (float x) strong_alias (expf, __expf_finite) hidden_alias (expf, __ieee754_expf) #endif + +TEST_SIG (S, F, 1, exp, -9.9, 9.9) +TEST_ULP (expf, 0.01) +TEST_ULP_NONNEAREST (expf, 0.5) +TEST_INTERVAL (expf, 0, 0xffff0000, 10000) +TEST_SYM_INTERVAL (expf, 0x1p-14, 0x1p8, 500000) diff --git a/math/include/mathlib.h b/math/include/mathlib.h index 64cbb9c1f8506e..23d04da99d93f2 100644 --- a/math/include/mathlib.h +++ b/math/include/mathlib.h @@ -1,58 +1,268 @@ /* * Public API. * - * Copyright (c) 2015-2023, Arm Limited. + * Copyright (c) 2015-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef _MATHLIB_H #define _MATHLIB_H -float expf (float); -float exp2f (float); -float logf (float); -float log2f (float); -float powf (float, float); -float sinf (float); -float cosf (float); -void sincosf (float, float*, float*); - -double exp (double); -double exp10 (double); -double exp2 (double); -double log (double); -double log2 (double); -double pow (double, double); - #if __aarch64__ -# if __GNUC__ >= 5 -typedef __Float32x4_t __f32x4_t; -typedef __Float64x2_t __f64x2_t; -# elif __clang_major__*100+__clang_minor__ >= 305 -typedef __attribute__((__neon_vector_type__(4))) float __f32x4_t; -typedef __attribute__((__neon_vector_type__(2))) double __f64x2_t; -# else -# error Unsupported compiler -# endif +/* Low-accuracy scalar implementations of C23 routines. */ +float arm_math_cospif (float); +double arm_math_cospi (double); +float arm_math_sinpif (float); +double arm_math_sinpi (double); +float arm_math_tanpif (float); +double arm_math_tanpi (double); +void arm_math_sincospif (float, float *, float *); +void arm_math_sincospi (double, double *, double *); +#endif + +/* SIMD declaration for autovectorisation with fast-math enabled. Only GCC is + supported, and vector routines are only supported on Linux on AArch64. 
*/ +#if defined __aarch64__ && __linux__ && defined(__GNUC__) \ + && !defined(__clang__) && defined(__FAST_MATH__) +# define DECL_SIMD_aarch64 __attribute__ ((__simd__ ("notinbranch"), const)) +#else +# define DECL_SIMD_aarch64 +#endif + +#if WANT_EXPERIMENTAL_MATH + +float arm_math_erff (float); +DECL_SIMD_aarch64 float cospif (float); +DECL_SIMD_aarch64 float erfinvf (float); +DECL_SIMD_aarch64 float sinpif (float); +DECL_SIMD_aarch64 float tanpif (float); + +double arm_math_erf (double); +DECL_SIMD_aarch64 double cospi (double); +DECL_SIMD_aarch64 double erfinv (double); +DECL_SIMD_aarch64 double sinpi (double); +DECL_SIMD_aarch64 double tanpi (double); + +long double erfinvl (long double); + +#endif -# if __GNUC__ >= 9 || __clang_major__ >= 8 -# undef __vpcs -# define __vpcs __attribute__((__aarch64_vector_pcs__)) +/* Note these routines may not be provided by AOR (some are only available with + WANT_EXPERIMENTAL_MATH, some are not provided at all. Redeclare them here to + add vector annotations. 
*/ +DECL_SIMD_aarch64 float acosf (float); +DECL_SIMD_aarch64 float acoshf (float); +DECL_SIMD_aarch64 float asinf (float); +DECL_SIMD_aarch64 float asinhf (float); +DECL_SIMD_aarch64 float atan2f (float, float); +DECL_SIMD_aarch64 float atanf (float); +DECL_SIMD_aarch64 float atanhf (float); +DECL_SIMD_aarch64 float cbrtf (float); +DECL_SIMD_aarch64 float cosf (float); +DECL_SIMD_aarch64 float coshf (float); +DECL_SIMD_aarch64 float erfcf (float); +DECL_SIMD_aarch64 float erff (float); +DECL_SIMD_aarch64 float exp10f (float); +DECL_SIMD_aarch64 float exp2f (float); +DECL_SIMD_aarch64 float expf (float); +DECL_SIMD_aarch64 float expm1f (float); +DECL_SIMD_aarch64 float hypotf (float, float); +DECL_SIMD_aarch64 float log10f (float); +DECL_SIMD_aarch64 float log1pf (float); +DECL_SIMD_aarch64 float log2f (float); +DECL_SIMD_aarch64 float logf (float); +DECL_SIMD_aarch64 float powf (float, float); +DECL_SIMD_aarch64 float sinf (float); +void sincosf (float, float *, float *); +DECL_SIMD_aarch64 float sinhf (float); +DECL_SIMD_aarch64 float tanf (float); +DECL_SIMD_aarch64 float tanhf (float); + +DECL_SIMD_aarch64 double acos (double); +DECL_SIMD_aarch64 double acosh (double); +DECL_SIMD_aarch64 double asin (double); +DECL_SIMD_aarch64 double asinh (double); +DECL_SIMD_aarch64 double atan2 (double, double); +DECL_SIMD_aarch64 double atan (double); +DECL_SIMD_aarch64 double atanh (double); +DECL_SIMD_aarch64 double cbrt (double); +DECL_SIMD_aarch64 double cos (double); +DECL_SIMD_aarch64 double cosh (double); +DECL_SIMD_aarch64 double erfc (double); +DECL_SIMD_aarch64 double erf (double); +DECL_SIMD_aarch64 double exp10 (double); +DECL_SIMD_aarch64 double exp2 (double); +DECL_SIMD_aarch64 double exp (double); +DECL_SIMD_aarch64 double expm1 (double); +DECL_SIMD_aarch64 double hypot (double, double); +DECL_SIMD_aarch64 double log10 (double); +DECL_SIMD_aarch64 double log1p (double); +DECL_SIMD_aarch64 double log2 (double); +DECL_SIMD_aarch64 double log (double); 
+DECL_SIMD_aarch64 double pow (double, double); +DECL_SIMD_aarch64 double sin (double); +DECL_SIMD_aarch64 double sinh (double); +DECL_SIMD_aarch64 double tan (double); +DECL_SIMD_aarch64 double tanh (double); + +#if __aarch64__ && __linux__ +# include <arm_neon.h> +# undef __vpcs +# define __vpcs __attribute__((__aarch64_vector_pcs__)) /* Vector functions following the vector PCS using ABI names. */ -__vpcs __f32x4_t _ZGVnN4v_sinf (__f32x4_t); -__vpcs __f32x4_t _ZGVnN4v_cosf (__f32x4_t); -__vpcs __f32x4_t _ZGVnN4v_expf_1u (__f32x4_t); -__vpcs __f32x4_t _ZGVnN4v_expf (__f32x4_t); -__vpcs __f32x4_t _ZGVnN4v_exp2f_1u (__f32x4_t); -__vpcs __f32x4_t _ZGVnN4v_exp2f (__f32x4_t); -__vpcs __f32x4_t _ZGVnN4v_logf (__f32x4_t); -__vpcs __f32x4_t _ZGVnN4vv_powf (__f32x4_t, __f32x4_t); -__vpcs __f64x2_t _ZGVnN2v_sin (__f64x2_t); -__vpcs __f64x2_t _ZGVnN2v_cos (__f64x2_t); -__vpcs __f64x2_t _ZGVnN2v_exp (__f64x2_t); -__vpcs __f64x2_t _ZGVnN2v_log (__f64x2_t); -__vpcs __f64x2_t _ZGVnN2vv_pow (__f64x2_t, __f64x2_t); +__vpcs float32x4_t _ZGVnN4v_acosf (float32x4_t); +__vpcs float32x4_t _ZGVnN4v_acoshf (float32x4_t); +__vpcs float32x4_t _ZGVnN4v_asinf (float32x4_t); +__vpcs float32x4_t _ZGVnN4v_asinhf (float32x4_t); +__vpcs float32x4_t _ZGVnN4v_atanf (float32x4_t); +__vpcs float32x4_t _ZGVnN4v_atanhf (float32x4_t); +__vpcs float32x4_t _ZGVnN4v_cbrtf (float32x4_t); +__vpcs float32x4_t _ZGVnN4v_cosf (float32x4_t); +__vpcs float32x4_t _ZGVnN4v_coshf (float32x4_t); +__vpcs float32x4_t _ZGVnN4v_cospif (float32x4_t); +__vpcs float32x4_t _ZGVnN4v_erfcf (float32x4_t); +__vpcs float32x4_t _ZGVnN4v_erff (float32x4_t); +__vpcs float32x4_t _ZGVnN4v_exp10f (float32x4_t); +__vpcs float32x4_t _ZGVnN4v_exp2f (float32x4_t); +__vpcs float32x4_t _ZGVnN4v_exp2f_1u (float32x4_t); +__vpcs float32x4_t _ZGVnN4v_expf (float32x4_t); +__vpcs float32x4_t _ZGVnN4v_expf_1u (float32x4_t); +__vpcs float32x4_t _ZGVnN4v_expm1f (float32x4_t); +__vpcs float32x4_t _ZGVnN4v_log10f (float32x4_t); +__vpcs float32x4_t _ZGVnN4v_log1pf
(float32x4_t); +__vpcs float32x4_t _ZGVnN4v_log2f (float32x4_t); +__vpcs float32x4_t _ZGVnN4v_logf (float32x4_t); +__vpcs float32x4_t _ZGVnN4v_sinf (float32x4_t); +__vpcs float32x4_t _ZGVnN4v_sinhf (float32x4_t); +__vpcs float32x4_t _ZGVnN4v_sinpif (float32x4_t); +__vpcs float32x4_t _ZGVnN4v_tanf (float32x4_t); +__vpcs float32x4_t _ZGVnN4v_tanhf (float32x4_t); +__vpcs float32x4_t _ZGVnN4v_tanpif (float32x4_t); +__vpcs float32x4_t _ZGVnN4vl4_modff (float32x4_t, float *); +__vpcs float32x4_t _ZGVnN4vv_atan2f (float32x4_t, float32x4_t); +__vpcs float32x4_t _ZGVnN4vv_hypotf (float32x4_t, float32x4_t); +__vpcs float32x4_t _ZGVnN4vv_powf (float32x4_t, float32x4_t); +__vpcs float32x4x2_t _ZGVnN4v_cexpif (float32x4_t); +__vpcs void _ZGVnN4vl4l4_sincosf (float32x4_t, float *, float *); +__vpcs void _ZGVnN4vl4l4_sincospif (float32x4_t, float *, float *); + +__vpcs float64x2_t _ZGVnN2v_acos (float64x2_t); +__vpcs float64x2_t _ZGVnN2v_acosh (float64x2_t); +__vpcs float64x2_t _ZGVnN2v_asin (float64x2_t); +__vpcs float64x2_t _ZGVnN2v_asinh (float64x2_t); +__vpcs float64x2_t _ZGVnN2v_atan (float64x2_t); +__vpcs float64x2_t _ZGVnN2v_atanh (float64x2_t); +__vpcs float64x2_t _ZGVnN2v_cbrt (float64x2_t); +__vpcs float64x2_t _ZGVnN2v_cos (float64x2_t); +__vpcs float64x2_t _ZGVnN2v_cosh (float64x2_t); +__vpcs float64x2_t _ZGVnN2v_cospi (float64x2_t); +__vpcs float64x2_t _ZGVnN2v_erf (float64x2_t); +__vpcs float64x2_t _ZGVnN2v_erfc (float64x2_t); +__vpcs float64x2_t _ZGVnN2v_exp (float64x2_t); +__vpcs float64x2_t _ZGVnN2v_exp10 (float64x2_t); +__vpcs float64x2_t _ZGVnN2v_exp2 (float64x2_t); +__vpcs float64x2_t _ZGVnN2v_expm1 (float64x2_t); +__vpcs float64x2_t _ZGVnN2v_log (float64x2_t); +__vpcs float64x2_t _ZGVnN2v_log10 (float64x2_t); +__vpcs float64x2_t _ZGVnN2v_log1p (float64x2_t); +__vpcs float64x2_t _ZGVnN2v_log2 (float64x2_t); +__vpcs float64x2_t _ZGVnN2v_sin (float64x2_t); +__vpcs float64x2_t _ZGVnN2v_sinh (float64x2_t); +__vpcs float64x2_t _ZGVnN2v_sinpi (float64x2_t); +__vpcs 
float64x2_t _ZGVnN2v_tan (float64x2_t); +__vpcs float64x2_t _ZGVnN2v_tanh (float64x2_t); +__vpcs float64x2_t _ZGVnN2v_tanpi (float64x2_t); +__vpcs float64x2_t _ZGVnN2vl8_modf (float64x2_t, double *); +__vpcs float64x2_t _ZGVnN2vv_atan2 (float64x2_t, float64x2_t); +__vpcs float64x2_t _ZGVnN2vv_hypot (float64x2_t, float64x2_t); +__vpcs float64x2_t _ZGVnN2vv_pow (float64x2_t, float64x2_t); +__vpcs float64x2x2_t _ZGVnN2v_cexpi (float64x2_t); +__vpcs void _ZGVnN2vl8l8_sincos (float64x2_t, double *, double *); +__vpcs void _ZGVnN2vl8l8_sincospi (float64x2_t, double *, double *); + +# if WANT_EXPERIMENTAL_MATH +__vpcs float32x4_t _ZGVnN4v_erfinvf (float32x4_t); +__vpcs float64x2_t _ZGVnN2v_erfinv (float64x2_t); +# endif + +# include <arm_sve.h> +svfloat32_t _ZGVsMxv_acosf (svfloat32_t, svbool_t); +svfloat32_t _ZGVsMxv_acoshf (svfloat32_t, svbool_t); +svfloat32_t _ZGVsMxv_asinf (svfloat32_t, svbool_t); +svfloat32_t _ZGVsMxv_asinhf (svfloat32_t, svbool_t); +svfloat32_t _ZGVsMxv_atanf (svfloat32_t, svbool_t); +svfloat32_t _ZGVsMxv_atanhf (svfloat32_t, svbool_t); +svfloat32_t _ZGVsMxv_cbrtf (svfloat32_t, svbool_t); +svfloat32_t _ZGVsMxv_cosf (svfloat32_t, svbool_t); +svfloat32_t _ZGVsMxv_coshf (svfloat32_t, svbool_t); +svfloat32_t _ZGVsMxv_cospif (svfloat32_t, svbool_t); +svfloat32_t _ZGVsMxv_erfcf (svfloat32_t, svbool_t); +svfloat32_t _ZGVsMxv_erff (svfloat32_t, svbool_t); +svfloat32_t _ZGVsMxv_exp10f (svfloat32_t, svbool_t); +svfloat32_t _ZGVsMxv_exp2f (svfloat32_t, svbool_t); +svfloat32_t _ZGVsMxv_expf (svfloat32_t, svbool_t); +svfloat32_t _ZGVsMxv_expm1f (svfloat32_t, svbool_t); +svfloat32_t _ZGVsMxv_log10f (svfloat32_t, svbool_t); +svfloat32_t _ZGVsMxv_log1pf (svfloat32_t, svbool_t); +svfloat32_t _ZGVsMxv_log2f (svfloat32_t, svbool_t); +svfloat32_t _ZGVsMxv_logf (svfloat32_t, svbool_t); +svfloat32_t _ZGVsMxv_sinf (svfloat32_t, svbool_t); +svfloat32_t _ZGVsMxv_sinhf (svfloat32_t, svbool_t); +svfloat32_t _ZGVsMxv_sinpif (svfloat32_t, svbool_t); +svfloat32_t _ZGVsMxv_tanf (svfloat32_t,
svbool_t); +svfloat32_t _ZGVsMxv_tanhf (svfloat32_t, svbool_t); +svfloat32_t _ZGVsMxv_tanpif (svfloat32_t, svbool_t); +svfloat32_t _ZGVsMxvl4_modff (svfloat32_t, float *, svbool_t); +svfloat32_t _ZGVsMxvv_atan2f (svfloat32_t, svfloat32_t, svbool_t); +svfloat32_t _ZGVsMxvv_hypotf (svfloat32_t, svfloat32_t, svbool_t); +svfloat32_t _ZGVsMxvv_powf (svfloat32_t, svfloat32_t, svbool_t); +svfloat32x2_t _ZGVsMxv_cexpif (svfloat32_t, svbool_t); +void _ZGVsMxvl4l4_sincosf (svfloat32_t, float *, float *, svbool_t); +void _ZGVsMxvl4l4_sincospif (svfloat32_t, float *, float *, svbool_t); + +svfloat64_t _ZGVsMxv_acos (svfloat64_t, svbool_t); +svfloat64_t _ZGVsMxv_acosh (svfloat64_t, svbool_t); +svfloat64_t _ZGVsMxv_asin (svfloat64_t, svbool_t); +svfloat64_t _ZGVsMxv_asinh (svfloat64_t, svbool_t); +svfloat64_t _ZGVsMxv_atan (svfloat64_t, svbool_t); +svfloat64_t _ZGVsMxv_atanh (svfloat64_t, svbool_t); +svfloat64_t _ZGVsMxv_cbrt (svfloat64_t, svbool_t); +svfloat64_t _ZGVsMxv_cos (svfloat64_t, svbool_t); +svfloat64_t _ZGVsMxv_cosh (svfloat64_t, svbool_t); +svfloat64_t _ZGVsMxv_cospi (svfloat64_t, svbool_t); +svfloat64_t _ZGVsMxv_erf (svfloat64_t, svbool_t); +svfloat64_t _ZGVsMxv_erfc (svfloat64_t, svbool_t); +svfloat64_t _ZGVsMxv_exp (svfloat64_t, svbool_t); +svfloat64_t _ZGVsMxv_exp10 (svfloat64_t, svbool_t); +svfloat64_t _ZGVsMxv_exp2 (svfloat64_t, svbool_t); +svfloat64_t _ZGVsMxv_expm1 (svfloat64_t, svbool_t); +svfloat64_t _ZGVsMxv_log (svfloat64_t, svbool_t); +svfloat64_t _ZGVsMxv_log10 (svfloat64_t, svbool_t); +svfloat64_t _ZGVsMxv_log1p (svfloat64_t, svbool_t); +svfloat64_t _ZGVsMxv_log2 (svfloat64_t, svbool_t); +svfloat64_t _ZGVsMxv_sin (svfloat64_t, svbool_t); +svfloat64_t _ZGVsMxv_sinh (svfloat64_t, svbool_t); +svfloat64_t _ZGVsMxv_sinpi (svfloat64_t, svbool_t); +svfloat64_t _ZGVsMxv_tan (svfloat64_t, svbool_t); +svfloat64_t _ZGVsMxv_tanh (svfloat64_t, svbool_t); +svfloat64_t _ZGVsMxv_tanpi (svfloat64_t, svbool_t); +svfloat64_t _ZGVsMxvl8_modf (svfloat64_t, double *, 
svbool_t); +svfloat64_t _ZGVsMxvv_atan2 (svfloat64_t, svfloat64_t, svbool_t); +svfloat64_t _ZGVsMxvv_hypot (svfloat64_t, svfloat64_t, svbool_t); +svfloat64_t _ZGVsMxvv_pow (svfloat64_t, svfloat64_t, svbool_t); +svfloat64x2_t _ZGVsMxv_cexpi (svfloat64_t, svbool_t); +void _ZGVsMxvl8l8_sincos (svfloat64_t, double *, double *, svbool_t); +void _ZGVsMxvl8l8_sincospi (svfloat64_t, double *, double *, svbool_t); + +# if WANT_EXPERIMENTAL_MATH + +svfloat32_t _ZGVsMxv_erfinvf (svfloat32_t, svbool_t); +svfloat32_t _ZGVsMxvv_powi (svfloat32_t, svint32_t, svbool_t); + +svfloat64_t _ZGVsMxvv_powk (svfloat64_t, svint64_t, svbool_t); +svfloat64_t _ZGVsMxv_erfinv (svfloat64_t, svbool_t); + # endif #endif diff --git a/math/include/test_defs.h b/math/include/test_defs.h new file mode 100644 index 00000000000000..2fe66fa6f14c17 --- /dev/null +++ b/math/include/test_defs.h @@ -0,0 +1,21 @@ +/* + * Helper macros for emitting various details about routines for consumption by + * runulp.sh. This version of the file is for inclusion when building routines, + * so expansions are empty - see math/test/test_defs for versions used by the + * build system. + * + * Copyright (c) 2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception. + */ + +#define TEST_ULP(f, l) +#define TEST_ULP_NONNEAREST(f, l) + +#define TEST_DISABLE_FENV(f) +#define TEST_DISABLE_FENV_IF_NOT(f, e) + +#define TEST_INTERVAL(f, lo, hi, n) +#define TEST_SYM_INTERVAL(f, lo, hi, n) +#define TEST_INTERVAL2(f, xlo, xhi, ylo, yhi, n) + +#define TEST_CONTROL_VALUE(f, c) diff --git a/math/include/test_sig.h b/math/include/test_sig.h new file mode 100644 index 00000000000000..a967829098d6c7 --- /dev/null +++ b/math/include/test_sig.h @@ -0,0 +1,47 @@ +/* + * Macros for emitting various ulp/bench entries based on function signature + * + * Copyright (c) 2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception. 
+ */ + +#define TEST_DECL_SF1(fun) float fun##f (float); +#define TEST_DECL_SF2(fun) float fun##f (float, float); +#define TEST_DECL_SD1(fun) double fun (double); +#define TEST_DECL_SD2(fun) double fun (double, double); + +#define TEST_DECL_VF1(fun) \ + float32x4_t VPCS_ATTR V_NAME_F1 (fun##f) (float32x4_t); +#define TEST_DECL_VF2(fun) \ + float32x4_t VPCS_ATTR V_NAME_F2 (fun##f) (float32x4_t, float32x4_t); +#define TEST_DECL_VD1(fun) VPCS_ATTR float64x2_t V_NAME_D1 (fun) (float64x2_t); +#define TEST_DECL_VD2(fun) \ + VPCS_ATTR float64x2_t V_NAME_D2 (fun) (float64x2_t, float64x2_t); + +#define TEST_DECL_SVF1(fun) \ + svfloat32_t SV_NAME_F1 (fun) (svfloat32_t, svbool_t); +#define TEST_DECL_SVF2(fun) \ + svfloat32_t SV_NAME_F2 (fun) (svfloat32_t, svfloat32_t, svbool_t); +#define TEST_DECL_SVD1(fun) \ + svfloat64_t SV_NAME_D1 (fun) (svfloat64_t, svbool_t); +#define TEST_DECL_SVD2(fun) \ + svfloat64_t SV_NAME_D2 (fun) (svfloat64_t, svfloat64_t, svbool_t); + +/* For building the routines, emit function prototype from TEST_SIG. This + ensures that the correct signature has been chosen (wrong one will be a + compile error). TEST_SIG is defined differently by various components of the + build system to emit entries in the wrappers and entries for mathbench and + ulp. */ +#ifndef _TEST_SIG +# if defined(EMIT_ULP_FUNCS) +# define _TEST_SIG(v, t, a, f, ...) TEST_SIG _Z##v##t##a (f) +# elif defined(EMIT_ULP_WRAPPERS) +# define _TEST_SIG(v, t, a, f, ...) TEST_SIG Z##v##N##t##a##_WRAP (f) +# elif defined(EMIT_MATHBENCH_FUNCS) +# define _TEST_SIG(v, t, a, f, ...) TEST_SIG _Z##v##t##a (f, ##__VA_ARGS__) +# else +# define _TEST_SIG(v, t, a, f, ...) TEST_DECL_##v##t##a (f) +# endif +#endif + +#define TEST_SIG(...) _TEST_SIG (__VA_ARGS__) diff --git a/math/log.c b/math/log.c index 43dfc2a744f060..1d6244c30b79eb 100644 --- a/math/log.c +++ b/math/log.c @@ -1,7 +1,7 @@ /* * Double-precision log(x) function. * - * Copyright (c) 2018-2019, Arm Limited. 
+ * Copyright (c) 2018-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ @@ -9,6 +9,8 @@ #include #include #include "math_config.h" +#include "test_defs.h" +#include "test_sig.h" #define T __log_data.tab #define T2 __log_data.tab2 @@ -160,3 +162,10 @@ hidden_alias (log, __ieee754_log) long double logl (long double x) { return log (x); } # endif #endif + +TEST_SIG (S, D, 1, log, 0.01, 11.1) +TEST_ULP (log, 0.02) +TEST_ULP_NONNEAREST (log, 0.5) +TEST_INTERVAL (log, 0, 0xffff000000000000, 10000) +TEST_INTERVAL (log, 0x1p-4, 0x1p4, 400000) +TEST_INTERVAL (log, 0, inf, 400000) diff --git a/pl/math/log10f.c b/math/log10f.c similarity index 84% rename from pl/math/log10f.c rename to math/log10f.c index 5c80008e4e57be..f8561d063107d3 100644 --- a/pl/math/log10f.c +++ b/math/log10f.c @@ -1,7 +1,7 @@ /* * Single-precision log10 function. * - * Copyright (c) 2022-2023, Arm Limited. + * Copyright (c) 2022-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ @@ -9,8 +9,8 @@ #include #include "math_config.h" -#include "pl_sig.h" -#include "pl_test.h" +#include "test_sig.h" +#include "test_defs.h" /* Data associated to logf: @@ -30,7 +30,8 @@ /* This naive implementation of log10f mimics that of log then simply scales the result by 1/log(10) to switch from base e to base 10. Hence, most computations are carried out in double precision. - Scaling before rounding to single precision is both faster and more accurate. + Scaling before rounding to single precision is both faster and more + accurate. ULP error: 0.797 ulp (nearest rounding.). 
*/ float @@ -88,10 +89,11 @@ log10f (float x) return eval_as_float (y); } -PL_SIG (S, F, 1, log10, 0.01, 11.1) -PL_TEST_ULP (log10f, 0.30) -PL_TEST_INTERVAL (log10f, 0, 0xffff0000, 10000) -PL_TEST_INTERVAL (log10f, 0x1p-127, 0x1p-26, 50000) -PL_TEST_INTERVAL (log10f, 0x1p-26, 0x1p3, 50000) -PL_TEST_INTERVAL (log10f, 0x1p-4, 0x1p4, 50000) -PL_TEST_INTERVAL (log10f, 0, inf, 50000) +TEST_SIG (S, F, 1, log10, 0.01, 11.1) +TEST_ULP (log10f, 0.30) +TEST_ULP_NONNEAREST (log10f, 0.5) +TEST_INTERVAL (log10f, 0, 0xffff0000, 10000) +TEST_INTERVAL (log10f, 0x1p-127, 0x1p-26, 50000) +TEST_INTERVAL (log10f, 0x1p-26, 0x1p3, 50000) +TEST_INTERVAL (log10f, 0x1p-4, 0x1p4, 50000) +TEST_INTERVAL (log10f, 0, inf, 50000) diff --git a/math/log2.c b/math/log2.c index 3f9c21b0396263..6462915a24f0c4 100644 --- a/math/log2.c +++ b/math/log2.c @@ -1,7 +1,7 @@ /* * Double-precision log2(x) function. * - * Copyright (c) 2018-2019, Arm Limited. + * Copyright (c) 2018-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ @@ -9,6 +9,8 @@ #include #include #include "math_config.h" +#include "test_defs.h" +#include "test_sig.h" #define T __log2_data.tab #define T2 __log2_data.tab2 @@ -139,3 +141,10 @@ hidden_alias (log2, __ieee754_log2) long double log2l (long double x) { return log2 (x); } # endif #endif + +TEST_SIG (S, D, 1, log2, 0.01, 11.1) +TEST_ULP (log2, 0.05) +TEST_ULP_NONNEAREST (log2, 0.5) +TEST_INTERVAL (log2, 0, 0xffff000000000000, 10000) +TEST_INTERVAL (log2, 0x1p-4, 0x1p4, 40000) +TEST_INTERVAL (log2, 0, inf, 40000) diff --git a/math/log2f.c b/math/log2f.c index 0a44fa2024f606..7d47379b41cbbb 100644 --- a/math/log2f.c +++ b/math/log2f.c @@ -1,13 +1,15 @@ /* * Single-precision log2 function. * - * Copyright (c) 2017-2018, Arm Limited. + * Copyright (c) 2017-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include #include #include "math_config.h" +#include "test_defs.h" +#include "test_sig.h" /* LOG2F_TABLE_BITS = 4 @@ -78,3 +80,10 @@ log2f (float x) strong_alias (log2f, __log2f_finite) hidden_alias (log2f, __ieee754_log2f) #endif + +TEST_SIG (S, F, 1, log2, 0.01, 11.1) +TEST_ULP (log2f, 0.26) +TEST_ULP_NONNEAREST (log2f, 0.5) +TEST_INTERVAL (log2f, 0, 0xffff0000, 10000) +TEST_INTERVAL (log2f, 0x1p-4, 0x1p4, 50000) +TEST_INTERVAL (log2f, 0, inf, 50000) diff --git a/math/logf.c b/math/logf.c index 820f74c3e66a70..f2c26deaff19b9 100644 --- a/math/logf.c +++ b/math/logf.c @@ -1,13 +1,15 @@ /* * Single-precision log function. * - * Copyright (c) 2017-2023, Arm Limited. + * Copyright (c) 2017-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include #include #include "math_config.h" +#include "test_defs.h" +#include "test_sig.h" /* LOGF_TABLE_BITS = 4 @@ -77,3 +79,10 @@ logf (float x) strong_alias (logf, __logf_finite) hidden_alias (logf, __ieee754_logf) #endif + +TEST_SIG (S, F, 1, log, 0.01, 11.1) +TEST_ULP (logf, 0.32) +TEST_ULP_NONNEAREST (logf, 0.5) +TEST_INTERVAL (logf, 0, 0xffff0000, 10000) +TEST_INTERVAL (logf, 0x1p-4, 0x1p4, 500000) +TEST_INTERVAL (logf, 0, inf, 50000) diff --git a/math/logf_data.c b/math/logf_data.c index 04247684755fdf..5c301a90af8e2d 100644 --- a/math/logf_data.c +++ b/math/logf_data.c @@ -1,7 +1,7 @@ /* * Data definition for logf. * - * Copyright (c) 2017-2019, Arm Limited. + * Copyright (c) 2017-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ @@ -27,6 +27,7 @@ const struct logf_data __logf_data = { { 0x1.767dcf5534862p-1, 0x1.4043057b6ee09p-2 }, }, .ln2 = 0x1.62e42fefa39efp-1, + .invln10 = 0x1.bcb7b1526e50ep-2, .poly = { -0x1.00ea348b88334p-2, 0x1.5575b0be00b6ap-2, -0x1.ffffef20a4123p-2, } diff --git a/math/math_config.h b/math/math_config.h index faf77b31fc99bd..0fc653f937617e 100644 --- a/math/math_config.h +++ b/math/math_config.h @@ -1,7 +1,7 @@ /* * Configuration for math routines. * - * Copyright (c) 2017-2023, Arm Limited. + * Copyright (c) 2017-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ @@ -13,9 +13,9 @@ #ifndef WANT_ROUNDING /* If defined to 1, return correct results for special cases in non-nearest - rounding modes (logf (1.0f) returns 0.0f with FE_DOWNWARD rather than -0.0f). - This may be set to 0 if there is no fenv support or if math functions only - get called in round to nearest mode. */ + rounding modes (logf (1.0f) returns 0.0f with FE_DOWNWARD rather than + -0.0f). This may be set to 0 if there is no fenv support or if math + functions only get called in round to nearest mode. 
*/ # define WANT_ROUNDING 1 #endif #ifndef WANT_ERRNO @@ -117,6 +117,25 @@ #define __math_check_oflowf arm_math_check_oflowf #define __math_check_uflowf arm_math_check_uflowf +#define __exp_data arm_math_exp_data +#define __asin_poly arm_math_asin_poly +#define __asinf_poly arm_math_asinf_poly +#define __asinh_data arm_math_asinh_data +#define __asinhf_data arm_math_asinhf_data +#define __atan_poly_data arm_math_atan_poly_data +#define __atanf_poly_data arm_math_atanf_poly_data +#define __cbrt_data arm_math_cbrt_data +#define __cbrtf_data arm_math_cbrtf_data +#define __erf_data arm_math_erf_data +#define __expf_data arm_math_expf_data +#define __expm1_poly arm_math_expm1_poly +#define __expm1f_poly arm_math_expm1f_poly +#define __log10_data arm_math_log10_data +#define __log1p_data arm_math_log1p_data +#define __log1pf_data arm_math_log1pf_data +#define __log_data arm_math_log_data +#define __tanf_poly_data arm_math_tanf_poly_data +#define __v_log_data arm_math_v_log_data #define __sincosf_table arm_math_sincosf_table #define __inv_pio4 arm_math_inv_pio4 #define __exp2f_data arm_math_exp2f_data @@ -131,6 +150,25 @@ #define __erf_data arm_math_erf_data #define __v_exp_data arm_math_v_exp_data #define __v_log_data arm_math_v_log_data +#define __v_erf_data arm_math_v_erf_data +#define __v_erfc_data arm_math_v_erfc_data +#define __v_erfcf_data arm_math_v_erfcf_data +#define __v_erff_data arm_math_v_erff_data +#define __v_exp_tail_data arm_math_v_exp_tail_data +#define __v_log10_data arm_math_v_log10_data +#define __v_log2_data arm_math_v_log2_data +#define __v_pow_exp_data arm_math_v_pow_exp_data +#define __v_pow_log_data arm_math_v_pow_log_data +#define __v_powf_data arm_math_v_powf_data + +/* On some platforms (in particular Windows) INFINITY and HUGE_VAL might + be defined in such a way that might not produce the expected bit pattern, + therefore we enforce the glibc math.h definition using a builtin that is + supported in both gcc and clang. 
*/ +#if defined (_WIN32) && (defined (__GNUC__) || defined (__clang__)) +# undef INFINITY +# define INFINITY __builtin_inff() +#endif #if HAVE_FAST_ROUND /* When set, the roundtoint and converttoint functions are provided with @@ -365,11 +403,12 @@ extern const struct exp2f_data uint64_t tab[1 << EXP2F_TABLE_BITS]; double shift_scaled; double poly[EXP2F_POLY_ORDER]; - double shift; double invln2_scaled; double poly_scaled[EXP2F_POLY_ORDER]; + double shift; } __exp2f_data HIDDEN; +/* Data for logf and log10f. */ #define LOGF_TABLE_BITS 4 #define LOGF_POLY_ORDER 4 extern const struct logf_data @@ -379,6 +418,7 @@ extern const struct logf_data double invc, logc; } tab[1 << LOGF_TABLE_BITS]; double ln2; + double invln10; double poly[LOGF_POLY_ORDER - 1]; /* First order coefficient is 1. */ } __logf_data HIDDEN; @@ -427,17 +467,19 @@ extern const struct powf_log2_data extern const struct exp_data { double invln2N; - double invlog10_2N; - double shift; double negln2hiN; double negln2loN; - double neglog10_2hiN; - double neglog10_2loN; double poly[4]; /* Last four coefficients. */ + double shift; + double exp2_shift; double exp2_poly[EXP2_POLY_ORDER]; + + double neglog10_2hiN; + double neglog10_2loN; double exp10_poly[5]; uint64_t tab[2*(1 << EXP_TABLE_BITS)]; + double invlog10_2N; } __exp_data HIDDEN; #define LOG_TABLE_BITS 7 @@ -509,13 +551,214 @@ extern const struct erf_data #define V_EXP_TABLE_BITS 7 extern const uint64_t __v_exp_data[1 << V_EXP_TABLE_BITS] HIDDEN; +#define V_LOG_POLY_ORDER 6 #define V_LOG_TABLE_BITS 7 extern const struct v_log_data { + /* Shared data for vector log and log-derived routines (e.g. asinh). */ + double poly[V_LOG_POLY_ORDER - 1]; + double ln2; struct { double invc, logc; } table[1 << V_LOG_TABLE_BITS]; } __v_log_data HIDDEN; +/* Some data for SVE powf's internal exp and log. 
*/ +#define V_POWF_EXP2_TABLE_BITS 5 +#define V_POWF_EXP2_N (1 << V_POWF_EXP2_TABLE_BITS) +#define V_POWF_LOG2_TABLE_BITS 5 +#define V_POWF_LOG2_N (1 << V_POWF_LOG2_TABLE_BITS) +extern const struct v_powf_data +{ + double invc[V_POWF_LOG2_N]; + double logc[V_POWF_LOG2_N]; + uint64_t scale[V_POWF_EXP2_N]; +} __v_powf_data HIDDEN; + +/* Some data for AdvSIMD and SVE pow's internal exp and log. */ +#define V_POW_EXP_TABLE_BITS 8 +extern const struct v_pow_exp_data +{ + double poly[3]; + double n_over_ln2, ln2_over_n_hi, ln2_over_n_lo, shift; + uint64_t sbits[1 << V_POW_EXP_TABLE_BITS]; +} __v_pow_exp_data HIDDEN; + +#define V_POW_LOG_TABLE_BITS 7 +extern const struct v_pow_log_data +{ + double poly[7]; /* First coefficient is 1. */ + double ln2_hi, ln2_lo; + double invc[1 << V_POW_LOG_TABLE_BITS]; + double logc[1 << V_POW_LOG_TABLE_BITS]; + double logctail[1 << V_POW_LOG_TABLE_BITS]; +} __v_pow_log_data HIDDEN; + +#define V_LOG2_TABLE_BITS 7 +extern const struct v_log2_data +{ + double poly[5]; + double invln2; + struct + { + double invc, log2c; + } table[1 << V_LOG2_TABLE_BITS]; +} __v_log2_data HIDDEN; + +#define V_LOG10_TABLE_BITS 7 +extern const struct v_log10_data +{ + double poly[5]; + double invln10, log10_2; + struct + { + double invc, log10c; + } table[1 << V_LOG10_TABLE_BITS]; +} __v_log10_data HIDDEN; + +#define V_EXP_TAIL_TABLE_BITS 8 +extern const uint64_t __v_exp_tail_data[1 << V_EXP_TAIL_TABLE_BITS] HIDDEN; + +extern const struct v_erff_data +{ + struct + { + float erf, scale; + } tab[513]; +} __v_erff_data HIDDEN; + +extern const struct v_erfcf_data +{ + struct + { + float erfc, scale; + } tab[645]; +} __v_erfcf_data HIDDEN; + +extern const struct v_erf_data +{ + struct + { + double erf, scale; + } tab[769]; +} __v_erf_data HIDDEN; + +extern const struct v_erfc_data +{ + struct + { + double erfc, scale; + } tab[3488]; +} __v_erfc_data HIDDEN; + +/* Table with 4/PI to 192 bit precision. 
*/ +extern const uint32_t __inv_pio4[] HIDDEN; + +#if WANT_EXPERIMENTAL_MATH + +# define LOG1P_NCOEFFS 19 +extern const struct log1p_data +{ + double coeffs[LOG1P_NCOEFFS]; +} __log1p_data HIDDEN; + +# define LOG1PF_2U5 +# define LOG1PF_NCOEFFS 9 +extern const struct log1pf_data +{ + float coeffs[LOG1PF_NCOEFFS]; +} __log1pf_data HIDDEN; + +# define ASINF_POLY_ORDER 4 +extern const float __asinf_poly[ASINF_POLY_ORDER + 1] HIDDEN; + +# define ASIN_POLY_ORDER 11 +extern const double __asin_poly[ASIN_POLY_ORDER + 1] HIDDEN; + +# define ASINHF_NCOEFFS 8 +extern const struct asinhf_data +{ + float coeffs[ASINHF_NCOEFFS]; +} __asinhf_data HIDDEN; + +# define ASINH_NCOEFFS 18 +extern const struct asinh_data +{ + double poly[ASINH_NCOEFFS]; +} __asinh_data HIDDEN; + +# define ATAN_POLY_NCOEFFS 20 +extern const struct atan_poly_data +{ + double poly[ATAN_POLY_NCOEFFS]; +} __atan_poly_data HIDDEN; + +# define ATANF_POLY_NCOEFFS 8 +extern const struct atanf_poly_data +{ + float poly[ATANF_POLY_NCOEFFS]; +} __atanf_poly_data HIDDEN; + +extern const struct cbrtf_data +{ + float poly[4]; + float table[5]; +} __cbrtf_data HIDDEN; + +extern const struct cbrt_data +{ + double poly[4]; + double table[5]; +} __cbrt_data HIDDEN; + +# define EXPF_TABLE_BITS 5 +# define EXPF_POLY_ORDER 3 +extern const struct expf_data +{ + uint64_t tab[1 << EXPF_TABLE_BITS]; + double invln2_scaled; + double poly_scaled[EXPF_POLY_ORDER]; +} __expf_data HIDDEN; + +# define EXPM1F_POLY_ORDER 5 +extern const float __expm1f_poly[EXPM1F_POLY_ORDER] HIDDEN; + +# define EXPM1_POLY_ORDER 11 +extern const double __expm1_poly[EXPM1_POLY_ORDER] HIDDEN; + +/* Data for low accuracy log10 (with 1/ln(10) included in coefficients). */ +# define LOG10_TABLE_BITS 7 +# define LOG10_POLY_ORDER 6 +# define LOG10_POLY1_ORDER 12 +extern const struct log10_data +{ + double ln2hi; + double ln2lo; + double invln10; + double poly[LOG10_POLY_ORDER - 1]; /* First coefficient is 1/log(10). 
*/ + double poly1[LOG10_POLY1_ORDER - 1]; + struct + { + double invc, logc; + } tab[1 << LOG10_TABLE_BITS]; +# if !HAVE_FAST_FMA + struct + { + double chi, clo; + } tab2[1 << LOG10_TABLE_BITS]; +# endif +} __log10_data HIDDEN; + +# define TANF_P_POLY_NCOEFFS 6 +/* cotan approach needs order 3 on [0, pi/4] to reach <3.5ulps. */ +# define TANF_Q_POLY_NCOEFFS 4 +extern const struct tanf_poly_data +{ + float poly_tan[TANF_P_POLY_NCOEFFS]; + float poly_cotan[TANF_Q_POLY_NCOEFFS]; +} __tanf_poly_data HIDDEN; + +#endif /* WANT_EXPERIMENTAL_MATH. */ + #endif diff --git a/pl/math/poly_generic.h b/math/poly_generic.h similarity index 99% rename from pl/math/poly_generic.h rename to math/poly_generic.h index 3fc25f8762f256..c21b61aad4c3d3 100644 --- a/pl/math/poly_generic.h +++ b/math/poly_generic.h @@ -1,7 +1,7 @@ /* * Generic helpers for evaluating polynomials with various schemes. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ diff --git a/pl/math/poly_scalar_f32.h b/math/poly_scalar_f32.h similarity index 80% rename from pl/math/poly_scalar_f32.h rename to math/poly_scalar_f32.h index a9b1c5544494c8..198e5801938a06 100644 --- a/pl/math/poly_scalar_f32.h +++ b/math/poly_scalar_f32.h @@ -2,12 +2,12 @@ * Helpers for evaluating polynomials on siongle-precision scalar input, using * various schemes. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#ifndef PL_MATH_POLY_SCALAR_F32_H -#define PL_MATH_POLY_SCALAR_F32_H +#ifndef MATH_POLY_SCALAR_F32_H +#define MATH_POLY_SCALAR_F32_H #include diff --git a/pl/math/poly_scalar_f64.h b/math/poly_scalar_f64.h similarity index 80% rename from pl/math/poly_scalar_f64.h rename to math/poly_scalar_f64.h index 207dccee30ad07..6fbebe05d1df0d 100644 --- a/pl/math/poly_scalar_f64.h +++ b/math/poly_scalar_f64.h @@ -2,12 +2,12 @@ * Helpers for evaluating polynomials on double-precision scalar input, using * various schemes. * - * Copyright (c) 2023, Arm Limited. + * Copyright (c) 2023-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#ifndef PL_MATH_POLY_SCALAR_F64_H -#define PL_MATH_POLY_SCALAR_F64_H +#ifndef MATH_POLY_SCALAR_F64_H +#define MATH_POLY_SCALAR_F64_H #include diff --git a/math/pow.c b/math/pow.c index af719fe5ab1058..1983bb2bbeba86 100644 --- a/math/pow.c +++ b/math/pow.c @@ -1,7 +1,7 @@ /* * Double-precision x^y function. * - * Copyright (c) 2018-2020, Arm Limited. + * Copyright (c) 2018-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ @@ -9,6 +9,7 @@ #include #include #include "math_config.h" +#include "test_defs.h" /* Worst-case error: 0.54 ULP (~= ulperr_exp + 1024*Ln2*relerr_log*2^53) @@ -378,3 +379,22 @@ hidden_alias (pow, __ieee754_pow) long double powl (long double x, long double y) { return pow (x, y); } # endif #endif + +TEST_ULP (pow, 0.05) +TEST_ULP_NONNEAREST (pow, 0.5) +TEST_INTERVAL2 (pow, 0.5, 2.0, 0, inf, 20000) +TEST_INTERVAL2 (pow, -0.5, -2.0, 0, inf, 20000) +TEST_INTERVAL2 (pow, 0.5, 2.0, -0, -inf, 20000) +TEST_INTERVAL2 (pow, -0.5, -2.0, -0, -inf, 20000) +TEST_INTERVAL2 (pow, 0.5, 2.0, 0x1p-10, 0x1p10, 40000) +TEST_INTERVAL2 (pow, 0.5, 2.0, -0x1p-10, -0x1p10, 40000) +TEST_INTERVAL2 (pow, 0, inf, 0.5, 2.0, 80000) +TEST_INTERVAL2 (pow, 0, inf, -0.5, -2.0, 80000) +TEST_INTERVAL2 (pow, 0x1.fp-1, 0x1.08p0, 0x1p8, 0x1p17, 80000) +TEST_INTERVAL2 (pow, 0x1.fp-1, 0x1.08p0, -0x1p8, -0x1p17, 80000) +TEST_INTERVAL2 (pow, 0, 0x1p-1000, 0, 1.0, 50000) +TEST_INTERVAL2 (pow, 0x1p1000, inf, 0, 1.0, 50000) +TEST_INTERVAL2 (pow, 0x1.ffffffffffff0p-1, 0x1.0000000000008p0, 0x1p60, 0x1p68, + 50000) +TEST_INTERVAL2 (pow, 0x1.ffffffffff000p-1, 0x1p0, 0x1p50, 0x1p52, 50000) +TEST_INTERVAL2 (pow, -0x1.ffffffffff000p-1, -0x1p0, 0x1p50, 0x1p52, 50000) diff --git a/math/powf.c b/math/powf.c index 05c80bb2eb670e..3f3f41ca276aaa 100644 --- a/math/powf.c +++ b/math/powf.c @@ -1,13 +1,14 @@ /* * Single-precision pow function. * - * Copyright (c) 2017-2019, Arm Limited. + * Copyright (c) 2017-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include #include #include "math_config.h" +#include "test_defs.h" /* POWF_LOG2_POLY_ORDER = 5 @@ -219,3 +220,12 @@ powf (float x, float y) strong_alias (powf, __powf_finite) hidden_alias (powf, __ieee754_powf) #endif + +TEST_ULP (powf, 0.4) +TEST_ULP_NONNEAREST (powf, 0.5) +TEST_INTERVAL2 (powf, 0x1p-1, 0x1p1, 0x1p-7, 0x1p7, 50000) +TEST_INTERVAL2 (powf, 0x1p-1, 0x1p1, -0x1p-7, -0x1p7, 50000) +TEST_INTERVAL2 (powf, 0x1p-70, 0x1p70, 0x1p-1, 0x1p1, 50000) +TEST_INTERVAL2 (powf, 0x1p-70, 0x1p70, -0x1p-1, -0x1p1, 50000) +TEST_INTERVAL2 (powf, 0x1.ep-1, 0x1.1p0, 0x1p8, 0x1p14, 50000) +TEST_INTERVAL2 (powf, 0x1.ep-1, 0x1.1p0, -0x1p8, -0x1p14, 50000) diff --git a/math/sincosf.c b/math/sincosf.c index 446f21d60faf3a..05a71d78bb1efd 100644 --- a/math/sincosf.c +++ b/math/sincosf.c @@ -1,7 +1,7 @@ /* * Single-precision sin/cos function. * - * Copyright (c) 2018-2021, Arm Limited. + * Copyright (c) 2018-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ @@ -9,6 +9,7 @@ #include #include "math_config.h" #include "sincosf.h" +#include "test_defs.h" /* Fast sincosf implementation. Worst-case ULP is 0.5607, maximum relative error is 0.5303 * 2^-23. A single-step range reduction is used for @@ -77,3 +78,12 @@ sincosf (float y, float *sinp, float *cosp) #endif } } + +TEST_ULP (sincosf_sinf, 0.06) +TEST_ULP (sincosf_cosf, 0.06) +TEST_ULP_NONNEAREST (sincosf_sinf, 0.5) +TEST_ULP_NONNEAREST (sincosf_cosf, 0.5) +TEST_INTERVAL (sincosf_sinf, 0, 0xffff0000, 10000) +TEST_SYM_INTERVAL (sincosf_sinf, 0x1p-14, 0x1p54, 50000) +TEST_INTERVAL (sincosf_cosf, 0, 0xffff0000, 10000) +TEST_SYM_INTERVAL (sincosf_cosf, 0x1p-14, 0x1p54, 50000) diff --git a/math/sincosf.h b/math/sincosf.h index ec23ed7aeb2615..912def33d29581 100644 --- a/math/sincosf.h +++ b/math/sincosf.h @@ -1,7 +1,7 @@ /* * Header for sinf, cosf and sincosf. * - * Copyright (c) 2018-2021, Arm Limited. 
+ * Copyright (c) 2018-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ @@ -27,9 +27,6 @@ typedef struct /* Polynomial data (the cosine polynomial is negated in the 2nd entry). */ extern const sincos_t __sincosf_table[2] HIDDEN; -/* Table with 4/PI to 192 bit precision. */ -extern const uint32_t __inv_pio4[] HIDDEN; - /* Top 12 bits of the float representation with the sign bit cleared. */ static inline uint32_t abstop12 (float x) diff --git a/math/sinf.c b/math/sinf.c index 8dd8ae458794c5..e244e115d32b21 100644 --- a/math/sinf.c +++ b/math/sinf.c @@ -1,13 +1,15 @@ /* * Single-precision sin function. * - * Copyright (c) 2018-2021, Arm Limited. + * Copyright (c) 2018-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include #include "math_config.h" #include "sincosf.h" +#include "test_defs.h" +#include "test_sig.h" /* Fast sinf implementation. Worst-case ULP is 0.5607, maximum relative error is 0.5303 * 2^-23. A single-step range reduction is used for @@ -65,3 +67,9 @@ sinf (float y) else return __math_invalidf (y); } + +TEST_SIG (S, F, 1, sin, -3.1, 3.1) +TEST_ULP (sinf, 0.06) +TEST_ULP_NONNEAREST (sinf, 0.5) +TEST_INTERVAL (sinf, 0, 0xffff0000, 10000) +TEST_SYM_INTERVAL (sinf, 0x1p-14, 0x1p54, 50000) diff --git a/math/test/mathbench.c b/math/test/mathbench.c index ed7e89bb7710a0..653c58fbc48477 100644 --- a/math/test/mathbench.c +++ b/math/test/mathbench.c @@ -1,10 +1,23 @@ /* * Microbenchmark for math functions. * - * Copyright (c) 2018-2023, Arm Limited. + * Copyright (c) 2018-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ +#if WANT_SVE_TESTS +# if __aarch64__ && __linux__ +# ifdef __clang__ +# pragma clang attribute push(__attribute__((target("sve"))), \ + apply_to = any(function)) +# else +# pragma GCC target("+sve") +# endif +# else +# error "SVE not supported - please disable WANT_SVE_TESTS" +# endif +#endif + #undef _GNU_SOURCE #define _GNU_SOURCE 1 #include @@ -29,94 +42,6 @@ static float Af[N]; static long measurecount = MEASURE; static long itercount = ITER; -#ifdef __vpcs -#include -typedef float64x2_t v_double; - -#define v_double_len() 2 - -static inline v_double -v_double_load (const double *p) -{ - return (v_double){p[0], p[1]}; -} - -static inline v_double -v_double_dup (double x) -{ - return (v_double){x, x}; -} - -typedef float32x4_t v_float; - -#define v_float_len() 4 - -static inline v_float -v_float_load (const float *p) -{ - return (v_float){p[0], p[1], p[2], p[3]}; -} - -static inline v_float -v_float_dup (float x) -{ - return (v_float){x, x, x, x}; -} -#else -/* dummy definitions to make things compile. */ -typedef double v_double; -typedef float v_float; -#define v_double_len(x) 1 -#define v_double_load(x) (x)[0] -#define v_double_dup(x) (x) -#define v_float_len(x) 1 -#define v_float_load(x) (x)[0] -#define v_float_dup(x) (x) - -#endif - -#if WANT_SVE_MATH -#include -typedef svbool_t sv_bool; -typedef svfloat64_t sv_double; - -#define sv_double_len() svcntd() - -static inline sv_double -sv_double_load (const double *p) -{ - svbool_t pg = svptrue_b64(); - return svld1(pg, p); -} - -static inline sv_double -sv_double_dup (double x) -{ - return svdup_n_f64(x); -} - -typedef svfloat32_t sv_float; - -#define sv_float_len() svcntw() - -static inline sv_float -sv_float_load (const float *p) -{ - svbool_t pg = svptrue_b32(); - return svld1(pg, p); -} - -static inline sv_float -sv_float_dup (float x) -{ - return svdup_n_f32(x); -} -#else -/* dummy definitions to make things compile. 
*/ -#define sv_double_len(x) 1 -#define sv_float_len(x) 1 -#endif - static double dummy (double x) { @@ -128,28 +53,28 @@ dummyf (float x) { return x; } -#ifdef __vpcs -__vpcs static v_double -__vn_dummy (v_double x) +#if __aarch64__ && __linux__ +__vpcs static float64x2_t +__vn_dummy (float64x2_t x) { return x; } -__vpcs static v_float -__vn_dummyf (v_float x) +__vpcs static float32x4_t +__vn_dummyf (float32x4_t x) { return x; } #endif -#if WANT_SVE_MATH -static sv_double -__sv_dummy (sv_double x, sv_bool pg) +#if WANT_SVE_TESTS +static svfloat64_t +__sv_dummy (svfloat64_t x, svbool_t pg) { return x; } -static sv_float -__sv_dummyf (sv_float x, sv_bool pg) +static svfloat32_t +__sv_dummyf (svfloat32_t x, svbool_t pg) { return x; } @@ -169,16 +94,17 @@ static const struct fun { double (*d) (double); float (*f) (float); -#ifdef __vpcs - __vpcs v_double (*vnd) (v_double); - __vpcs v_float (*vnf) (v_float); +#if __aarch64__ && __linux__ + __vpcs float64x2_t (*vnd) (float64x2_t); + __vpcs float32x4_t (*vnf) (float32x4_t); #endif -#if WANT_SVE_MATH - sv_double (*svd) (sv_double, sv_bool); - sv_float (*svf) (sv_float, sv_bool); +#if WANT_SVE_TESTS + svfloat64_t (*svd) (svfloat64_t, svbool_t); + svfloat32_t (*svf) (svfloat32_t, svbool_t); #endif } fun; } funtab[] = { +// clang-format off #define D(func, lo, hi) {#func, 'd', 0, lo, hi, {.d = func}}, #define F(func, lo, hi) {#func, 'f', 0, lo, hi, {.f = func}}, #define VND(func, lo, hi) {#func, 'd', 'n', lo, hi, {.vnd = func}}, @@ -187,11 +113,11 @@ static const struct fun #define SVF(func, lo, hi) {#func, 'f', 's', lo, hi, {.svf = func}}, D (dummy, 1.0, 2.0) F (dummyf, 1.0, 2.0) -#ifdef __vpcs +#if __aarch64__ && __linux__ VND (__vn_dummy, 1.0, 2.0) VNF (__vn_dummyf, 1.0, 2.0) #endif -#if WANT_SVE_MATH +#if WANT_SVE_TESTS SVD (__sv_dummy, 1.0, 2.0) SVF (__sv_dummyf, 1.0, 2.0) #endif @@ -203,6 +129,7 @@ SVF (__sv_dummyf, 1.0, 2.0) #undef VND #undef SVF #undef SVD + // clang-format on }; static void @@ -301,75 +228,77 @@ 
runf_latency (float f (float)) prev = f (Af[i] + prev * z); } -#ifdef __vpcs +#if __aarch64__ && __linux__ static void -run_vn_thruput (__vpcs v_double f (v_double)) +run_vn_thruput (__vpcs float64x2_t f (float64x2_t)) { - for (int i = 0; i < N; i += v_double_len ()) - f (v_double_load (A+i)); + for (int i = 0; i < N; i += 2) + f (vld1q_f64 (A + i)); } static void -runf_vn_thruput (__vpcs v_float f (v_float)) +runf_vn_thruput (__vpcs float32x4_t f (float32x4_t)) { - for (int i = 0; i < N; i += v_float_len ()) - f (v_float_load (Af+i)); + for (int i = 0; i < N; i += 4) + f (vld1q_f32 (Af + i)); } static void -run_vn_latency (__vpcs v_double f (v_double)) +run_vn_latency (__vpcs float64x2_t f (float64x2_t)) { volatile uint64x2_t vsel = (uint64x2_t) { 0, 0 }; uint64x2_t sel = vsel; - v_double prev = v_double_dup (0); - for (int i = 0; i < N; i += v_double_len ()) - prev = f (vbslq_f64 (sel, prev, v_double_load (A+i))); + float64x2_t prev = vdupq_n_f64 (0); + for (int i = 0; i < N; i += 2) + prev = f (vbslq_f64 (sel, prev, vld1q_f64 (A + i))); } static void -runf_vn_latency (__vpcs v_float f (v_float)) +runf_vn_latency (__vpcs float32x4_t f (float32x4_t)) { volatile uint32x4_t vsel = (uint32x4_t) { 0, 0, 0, 0 }; uint32x4_t sel = vsel; - v_float prev = v_float_dup (0); - for (int i = 0; i < N; i += v_float_len ()) - prev = f (vbslq_f32 (sel, prev, v_float_load (Af+i))); + float32x4_t prev = vdupq_n_f32 (0); + for (int i = 0; i < N; i += 4) + prev = f (vbslq_f32 (sel, prev, vld1q_f32 (Af + i))); } #endif -#if WANT_SVE_MATH +#if WANT_SVE_TESTS static void -run_sv_thruput (sv_double f (sv_double, sv_bool)) +run_sv_thruput (svfloat64_t f (svfloat64_t, svbool_t)) { - for (int i = 0; i < N; i += sv_double_len ()) - f (sv_double_load (A+i), svptrue_b64 ()); + for (int i = 0; i < N; i += svcntd ()) + f (svld1_f64 (svptrue_b64 (), A + i), svptrue_b64 ()); } static void -runf_sv_thruput (sv_float f (sv_float, sv_bool)) +runf_sv_thruput (svfloat32_t f (svfloat32_t, svbool_t)) { - 
for (int i = 0; i < N; i += sv_float_len ()) - f (sv_float_load (Af+i), svptrue_b32 ()); + for (int i = 0; i < N; i += svcntw ()) + f (svld1_f32 (svptrue_b32 (), Af + i), svptrue_b32 ()); } static void -run_sv_latency (sv_double f (sv_double, sv_bool)) +run_sv_latency (svfloat64_t f (svfloat64_t, svbool_t)) { - volatile sv_bool vsel = svptrue_b64 (); - sv_bool sel = vsel; - sv_double prev = sv_double_dup (0); - for (int i = 0; i < N; i += sv_double_len ()) - prev = f (svsel_f64 (sel, sv_double_load (A+i), prev), svptrue_b64 ()); + volatile svbool_t vsel = svptrue_b64 (); + svbool_t sel = vsel; + svfloat64_t prev = svdup_f64 (0); + for (int i = 0; i < N; i += svcntd ()) + prev = f (svsel_f64 (sel, svld1_f64 (svptrue_b64 (), A + i), prev), + svptrue_b64 ()); } static void -runf_sv_latency (sv_float f (sv_float, sv_bool)) +runf_sv_latency (svfloat32_t f (svfloat32_t, svbool_t)) { - volatile sv_bool vsel = svptrue_b32 (); - sv_bool sel = vsel; - sv_float prev = sv_float_dup (0); - for (int i = 0; i < N; i += sv_float_len ()) - prev = f (svsel_f32 (sel, sv_float_load (Af+i), prev), svptrue_b32 ()); + volatile svbool_t vsel = svptrue_b32 (); + svbool_t sel = vsel; + svfloat32_t prev = svdup_f32 (0); + for (int i = 0; i < N; i += svcntw ()) + prev = f (svsel_f32 (sel, svld1_f32 (svptrue_b32 (), Af + i), prev), + svptrue_b32 ()); } #endif @@ -377,7 +306,11 @@ static uint64_t tic (void) { struct timespec ts; +#if defined(_MSC_VER) + if (!timespec_get (&ts, TIME_UTC)) +#else if (clock_gettime (CLOCK_REALTIME, &ts)) +#endif abort (); return ts.tv_sec * 1000000000ULL + ts.tv_nsec; } @@ -405,9 +338,11 @@ bench1 (const struct fun *f, int type, double lo, double hi) int vlen = 1; if (f->vec == 'n') - vlen = f->prec == 'd' ? v_double_len() : v_float_len(); + vlen = f->prec == 'd' ? 2 : 4; +#if WANT_SVE_TESTS else if (f->vec == 's') - vlen = f->prec == 'd' ? sv_double_len() : sv_float_len(); + vlen = f->prec == 'd' ? 
svcntd () : svcntw (); +#endif if (f->prec == 'd' && type == 't' && f->vec == 0) TIMEIT (run_thruput, f->fun.d); @@ -417,7 +352,7 @@ bench1 (const struct fun *f, int type, double lo, double hi) TIMEIT (runf_thruput, f->fun.f); else if (f->prec == 'f' && type == 'l' && f->vec == 0) TIMEIT (runf_latency, f->fun.f); -#ifdef __vpcs +#if __aarch64__ && __linux__ else if (f->prec == 'd' && type == 't' && f->vec == 'n') TIMEIT (run_vn_thruput, f->fun.vnd); else if (f->prec == 'd' && type == 'l' && f->vec == 'n') @@ -427,7 +362,7 @@ bench1 (const struct fun *f, int type, double lo, double hi) else if (f->prec == 'f' && type == 'l' && f->vec == 'n') TIMEIT (runf_vn_latency, f->fun.vnf); #endif -#if WANT_SVE_MATH +#if WANT_SVE_TESTS else if (f->prec == 'd' && type == 't' && f->vec == 's') TIMEIT (run_sv_thruput, f->fun.svd); else if (f->prec == 'd' && type == 'l' && f->vec == 's') @@ -640,3 +575,7 @@ main (int argc, char *argv[]) } return 0; } + +#if __aarch64__ && __linux__ && WANT_SVE_TESTS && defined(__clang__) +# pragma clang attribute pop +#endif diff --git a/math/test/mathbench_funcs.h b/math/test/mathbench_funcs.h index 84c4e68650acbb..261ab02f55c3fd 100644 --- a/math/test/mathbench_funcs.h +++ b/math/test/mathbench_funcs.h @@ -1,27 +1,13 @@ /* * Function entries for mathbench. * - * Copyright (c) 2022-2023, Arm Limited. + * Copyright (c) 2022-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* clang-format off */ -D (exp, -9.9, 9.9) -D (exp, 0.5, 1.0) -D (exp10, -9.9, 9.9) -D (exp2, -9.9, 9.9) -D (log, 0.01, 11.1) -D (log, 0.999, 1.001) -D (log2, 0.01, 11.1) -D (log2, 0.999, 1.001) {"pow", 'd', 0, 0.01, 11.1, {.d = xypow}}, D (xpow, 0.01, 11.1) D (ypow, -9.9, 9.9) -D (erf, -6.0, 6.0) - -F (expf, -9.9, 9.9) -F (exp2f, -9.9, 9.9) -F (logf, 0.01, 11.1) -F (log2f, 0.01, 11.1) {"powf", 'f', 0, 0.01, 11.1, {.f = xypowf}}, F (xpowf, 0.01, 11.1) F (ypowf, -9.9, 9.9) @@ -31,32 +17,105 @@ F (ypowf, -9.9, 9.9) {"sincosf", 'f', 0, 3.3, 33.3, {.f = sincosf_wrap}}, {"sincosf", 'f', 0, 100, 1000, {.f = sincosf_wrap}}, {"sincosf", 'f', 0, 1e6, 1e32, {.f = sincosf_wrap}}, -F (sinf, 0.1, 0.7) -F (sinf, 0.8, 3.1) -F (sinf, -3.1, 3.1) -F (sinf, 3.3, 33.3) -F (sinf, 100, 1000) -F (sinf, 1e6, 1e32) -F (cosf, 0.1, 0.7) -F (cosf, 0.8, 3.1) -F (cosf, -3.1, 3.1) -F (cosf, 3.3, 33.3) -F (cosf, 100, 1000) -F (cosf, 1e6, 1e32) -F (erff, -4.0, 4.0) -#ifdef __vpcs -VND (_ZGVnN2v_exp, -9.9, 9.9) -VND (_ZGVnN2v_log, 0.01, 11.1) -{"_ZGVnN2vv_pow", 'd', 'n', 0.01, 11.1, {.vnd = xy_Z_pow}}, -VND (_ZGVnN2v_sin, -3.1, 3.1) -VND (_ZGVnN2v_cos, -3.1, 3.1) -VNF (_ZGVnN4v_expf, -9.9, 9.9) +#if WANT_TRIGPI_TESTS +F (arm_math_cospif, -0.9, 0.9) +D (arm_math_cospi, -0.9, 0.9) +F (arm_math_sinpif, -0.9, 0.9) +D (arm_math_sinpi, -0.9, 0.9) +F (arm_math_tanpif, -0.9, 0.9) +D (arm_math_tanpi, -0.9, 0.9) +{"sincospif", 'f', 0, -0.9, 0.9, {.f = sincospif_wrap}}, +{"sincospi", 'd', 0, -0.9, 0.9, {.d = sincospi_wrap}}, +#endif +#if WANT_EXPERIMENTAL_MATH +D (arm_math_erf, -6.0, 6.0) +F (arm_math_erff, -4.0, 4.0) +{"atan2f", 'f', 0, -10.0, 10.0, {.f = atan2f_wrap}}, +{"atan2", 'd', 0, -10.0, 10.0, {.d = atan2_wrap}}, +{"powi", 'd', 0, 0.01, 11.1, {.d = powi_wrap}}, +#endif +#if __aarch64__ && __linux__ +{"_ZGVnN4vv_atan2f", 'f', 'n', -10.0, 10.0, {.vnf = _Z_atan2f_wrap}}, +{"_ZGVnN2vv_atan2", 'd', 'n', -10.0, 10.0, {.vnd = 
_Z_atan2_wrap}}, +{"_ZGVnN4vv_hypotf", 'f', 'n', -10.0, 10.0, {.vnf = _Z_hypotf_wrap}}, +{"_ZGVnN2vv_hypot", 'd', 'n', -10.0, 10.0, {.vnd = _Z_hypot_wrap}}, +{"_ZGVnN2vv_pow", 'd', 'n', -10.0, 10.0, {.vnd = xy_Z_pow}}, +{"x_ZGVnN2vv_pow", 'd', 'n', -10.0, 10.0, {.vnd = x_Z_pow}}, +{"y_ZGVnN2vv_pow", 'd', 'n', -10.0, 10.0, {.vnd = y_Z_pow}}, +{"_ZGVnN4vv_powf", 'f', 'n', 0.01, 11.1, {.vnf = xy_Z_powf}}, +{"x_ZGVnN4vv_powf", 'f', 'n', 0.01, 11.1, {.vnf = x_Z_powf}}, +{"y_ZGVnN4vv_powf", 'f', 'n', -10.0, 10.0, {.vnf = y_Z_powf}}, +{"_ZGVnN4vl4_modff", 'f', 'n', -10.0, 10.0, {.vnf = _Z_modff_wrap}}, +{"_ZGVnN2vl8_modf", 'd', 'n', -10.0, 10.0, {.vnd = _Z_modf_wrap}}, +{"_ZGVnN4vl4l4_sincosf", 'f', 'n', -3.1, 3.1, {.vnf = _Z_sincosf_wrap}}, +{"_ZGVnN2vl8l8_sincos", 'd', 'n', -3.1, 3.1, {.vnd = _Z_sincos_wrap}}, +{"_ZGVnN4v_cexpif", 'f', 'n', -3.1, 3.1, {.vnf = _Z_cexpif_wrap}}, +{"_ZGVnN2v_cexpi", 'd', 'n', -3.1, 3.1, {.vnd = _Z_cexpi_wrap}}, VNF (_ZGVnN4v_expf_1u, -9.9, 9.9) -VNF (_ZGVnN4v_exp2f, -9.9, 9.9) VNF (_ZGVnN4v_exp2f_1u, -9.9, 9.9) -VNF (_ZGVnN4v_logf, 0.01, 11.1) -{"_ZGVnN4vv_powf", 'f', 'n', 0.01, 11.1, {.vnf = xy_Z_powf}}, -VNF (_ZGVnN4v_sinf, -3.1, 3.1) -VNF (_ZGVnN4v_cosf, -3.1, 3.1) +# if WANT_TRIGPI_TESTS +VNF (_ZGVnN4v_cospif, -0.9, 0.9) +VND (_ZGVnN2v_cospi, -0.9, 0.9) +VNF (_ZGVnN4v_sinpif, -0.9, 0.9) +VND (_ZGVnN2v_sinpi, -0.9, 0.9) +VNF (_ZGVnN4v_tanpif, -0.9, 0.9) +VND (_ZGVnN2v_tanpi, -0.9, 0.9) +{"_ZGVnN4vl4l4_sincospif", 'f', 'n', -0.9, 0.9, {.vnf = _Z_sincospif_wrap}}, +{"_ZGVnN2vl8l8_sincospi", 'd', 'n', -0.9, 0.9, {.vnd = _Z_sincospi_wrap}}, +# endif +#endif + +#if WANT_SVE_TESTS +{ "_ZGVsMxvv_atan2f", 'f', 's', -10.0, 10.0, { .svf = _Z_sv_atan2f_wrap } }, +{ "_ZGVsMxvv_atan2", 'd', 's', -10.0, 10.0, { .svd = _Z_sv_atan2_wrap } }, +{ "_ZGVsMxvv_hypotf", 'f', 's', -10.0, 10.0, { .svf = _Z_sv_hypotf_wrap } }, +{ "_ZGVsMxvv_hypot", 'd', 's', -10.0, 10.0, { .svd = _Z_sv_hypot_wrap } }, +{"_ZGVsMxvv_powf", 'f', 's', -10.0, 10.0, {.svf = 
xy_Z_sv_powf}}, +{"x_ZGVsMxvv_powf", 'f', 's', -10.0, 10.0, {.svf = x_Z_sv_powf}}, +{"y_ZGVsMxvv_powf", 'f', 's', -10.0, 10.0, {.svf = y_Z_sv_powf}}, +{"_ZGVsMxvv_pow", 'd', 's', -10.0, 10.0, {.svd = xy_Z_sv_pow}}, +{"x_ZGVsMxvv_pow", 'd', 's', -10.0, 10.0, {.svd = x_Z_sv_pow}}, +{"y_ZGVsMxvv_pow", 'd', 's', -10.0, 10.0, {.svd = y_Z_sv_pow}}, +{"_ZGVsMxvl4_modff", 'f', 's', -10.0, 10.0, {.svf = _Z_sv_modff_wrap}}, +{"_ZGVsMxvl8_modf", 'd', 's', -10.0, 10.0, {.svd = _Z_sv_modf_wrap}}, +{"_ZGVsMxvl4l4_sincosf", 'f', 's', -3.1, 3.1, {.svf = _Z_sv_sincosf_wrap}}, +{"_ZGVsMxvl8l8_sincos", 'd', 's', -3.1, 3.1, {.svd = _Z_sv_sincos_wrap}}, +{"_ZGVsMxv_cexpif", 'f', 's', -3.1, 3.1, {.svf = _Z_sv_cexpif_wrap}}, +{"_ZGVsMxv_cexpi", 'd', 's', -3.1, 3.1, {.svd = _Z_sv_cexpi_wrap}}, +# if WANT_TRIGPI_TESTS +SVF (_ZGVsMxv_cospif, -0.9, 0.9) +SVD (_ZGVsMxv_cospi, -0.9, 0.9) +SVF (_ZGVsMxv_sinpif, -0.9, 0.9) +SVD (_ZGVsMxv_sinpi, -0.9, 0.9) +SVF (_ZGVsMxv_tanpif, -0.9, 0.9) +SVD (_ZGVsMxv_tanpi, -0.9, 0.9) +{"_ZGVsMxvl4l4_sincospif", 'f', 's', -0.9, 0.9, {.svf = _Z_sv_sincospif_wrap}}, +{"_ZGVsMxvl8l8_sincospi", 'd', 's', -0.9, 0.9, {.svd = _Z_sv_sincospi_wrap}}, +# endif +# if WANT_EXPERIMENTAL_MATH +{"_ZGVsMxvv_powi", 'f', 's', -10.0, 10.0, {.svf = _Z_sv_powi_wrap}}, +{"_ZGVsMxvv_powk", 'd', 's', -10.0, 10.0, {.svd = _Z_sv_powk_wrap}}, +# endif #endif - /* clang-format on */ + /* clang-format on */ + +#define _ZSF1(fun, a, b) F (fun##f, a, b) +#define _ZSD1(f, a, b) D (f, a, b) + +#define _ZVF1(fun, a, b) VNF (_ZGVnN4v_##fun##f, a, b) +#define _ZVD1(f, a, b) VND (_ZGVnN2v_##f, a, b) + +#define _ZSVF1(fun, a, b) SVF (_ZGVsMxv_##fun##f, a, b) +#define _ZSVD1(f, a, b) SVD (_ZGVsMxv_##f, a, b) + +/* No auto-generated wrappers for binary functions - they have be + manually defined in mathbench_wrappers.h. We have to define silent + macros for them anyway as they will be emitted by TEST_SIG. */ +#define _ZSF2(...) +#define _ZSD2(...) +#define _ZVF2(...) +#define _ZVD2(...) 
+#define _ZSVF2(...) +#define _ZSVD2(...) + +#include "test/mathbench_funcs_gen.h" diff --git a/math/test/mathbench_wrappers.h b/math/test/mathbench_wrappers.h index 062b9db56de51a..32dcee36530a44 100644 --- a/math/test/mathbench_wrappers.h +++ b/math/test/mathbench_wrappers.h @@ -1,24 +1,314 @@ /* * Function wrappers for mathbench. * - * Copyright (c) 2022-2023, Arm Limited. + * Copyright (c) 2022-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#ifdef __vpcs +#if WANT_EXPERIMENTAL_MATH +static double +atan2_wrap (double x) +{ + return atan2 (5.0, x); +} + +static float +atan2f_wrap (float x) +{ + return atan2f (5.0f, x); +} + +static double +powi_wrap (double x) +{ + return __builtin_powi (x, (int) round (x)); +} +#endif /* WANT_EXPERIMENTAL_MATH. */ + +#if __aarch64__ && __linux__ + +__vpcs static float32x4_t +_Z_sincospif_wrap (float32x4_t x) +{ + float s[4], c[4]; + _ZGVnN4vl4l4_sincospif (x, s, c); + return vld1q_f32 (s) + vld1q_f32 (c); +} + +__vpcs static float64x2_t +_Z_sincospi_wrap (float64x2_t x) +{ + double s[2], c[2]; + _ZGVnN2vl8l8_sincospi (x, s, c); + return vld1q_f64 (s) + vld1q_f64 (c); +} -__vpcs static v_float -xy_Z_powf (v_float x) +__vpcs static float64x2_t +_Z_atan2_wrap (float64x2_t x) +{ + return _ZGVnN2vv_atan2 (vdupq_n_f64 (5.0), x); +} + +__vpcs static float32x4_t +_Z_atan2f_wrap (float32x4_t x) +{ + return _ZGVnN4vv_atan2f (vdupq_n_f32 (5.0f), x); +} + +__vpcs static float32x4_t +_Z_hypotf_wrap (float32x4_t x) +{ + return _ZGVnN4vv_hypotf (vdupq_n_f32 (5.0f), x); +} + +__vpcs static float64x2_t +_Z_hypot_wrap (float64x2_t x) +{ + return _ZGVnN2vv_hypot (vdupq_n_f64 (5.0), x); +} + +__vpcs static float32x4_t +xy_Z_powf (float32x4_t x) { return _ZGVnN4vv_powf (x, x); } -__vpcs static v_double -xy_Z_pow (v_double x) +__vpcs static float32x4_t +x_Z_powf (float32x4_t x) +{ + return _ZGVnN4vv_powf (x, vdupq_n_f32 (23.4)); +} + +__vpcs static float32x4_t +y_Z_powf (float32x4_t x) +{ + return 
_ZGVnN4vv_powf (vdupq_n_f32 (2.34), x); +} + +__vpcs static float64x2_t +xy_Z_pow (float64x2_t x) { return _ZGVnN2vv_pow (x, x); } +__vpcs static float64x2_t +x_Z_pow (float64x2_t x) +{ + return _ZGVnN2vv_pow (x, vdupq_n_f64 (23.4)); +} + +__vpcs static float64x2_t +y_Z_pow (float64x2_t x) +{ + return _ZGVnN2vv_pow (vdupq_n_f64 (2.34), x); +} + +__vpcs static float32x4_t +_Z_modff_wrap (float32x4_t x) +{ + float y[4]; + float32x4_t ret = _ZGVnN4vl4_modff (x, y); + return ret + vld1q_f32 (y); +} + +__vpcs static float64x2_t +_Z_modf_wrap (float64x2_t x) +{ + double y[2]; + float64x2_t ret = _ZGVnN2vl8_modf (x, y); + return ret + vld1q_f64 (y); +} + +__vpcs static float32x4_t +_Z_sincosf_wrap (float32x4_t x) +{ + float s[4], c[4]; + _ZGVnN4vl4l4_sincosf (x, s, c); + return vld1q_f32 (s) + vld1q_f32 (c); +} + +__vpcs static float32x4_t +_Z_cexpif_wrap (float32x4_t x) +{ + float32x4x2_t sc = _ZGVnN4v_cexpif (x); + return sc.val[0] + sc.val[1]; +} + +__vpcs static float64x2_t +_Z_sincos_wrap (float64x2_t x) +{ + double s[2], c[2]; + _ZGVnN2vl8l8_sincos (x, s, c); + return vld1q_f64 (s) + vld1q_f64 (c); +} + +__vpcs static float64x2_t +_Z_cexpi_wrap (float64x2_t x) +{ + float64x2x2_t sc = _ZGVnN2v_cexpi (x); + return sc.val[0] + sc.val[1]; +} + +#endif + +#if WANT_SVE_TESTS + +static svfloat32_t +_Z_sv_atan2f_wrap (svfloat32_t x, svbool_t pg) +{ + return _ZGVsMxvv_atan2f (x, svdup_f32 (5.0f), pg); +} + +static svfloat64_t +_Z_sv_atan2_wrap (svfloat64_t x, svbool_t pg) +{ + return _ZGVsMxvv_atan2 (x, svdup_f64 (5.0), pg); +} + +static svfloat32_t +_Z_sv_hypotf_wrap (svfloat32_t x, svbool_t pg) +{ + return _ZGVsMxvv_hypotf (x, svdup_f32 (5.0), pg); +} + +static svfloat64_t +_Z_sv_hypot_wrap (svfloat64_t x, svbool_t pg) +{ + return _ZGVsMxvv_hypot (x, svdup_f64 (5.0), pg); +} + +static svfloat32_t +xy_Z_sv_powf (svfloat32_t x, svbool_t pg) +{ + return _ZGVsMxvv_powf (x, x, pg); +} + +static svfloat32_t +x_Z_sv_powf (svfloat32_t x, svbool_t pg) +{ + return _ZGVsMxvv_powf (x, 
svdup_f32 (23.4f), pg); +} + +static svfloat32_t +y_Z_sv_powf (svfloat32_t x, svbool_t pg) +{ + return _ZGVsMxvv_powf (svdup_f32 (2.34f), x, pg); +} + +static svfloat64_t +xy_Z_sv_pow (svfloat64_t x, svbool_t pg) +{ + return _ZGVsMxvv_pow (x, x, pg); +} + +static svfloat64_t +x_Z_sv_pow (svfloat64_t x, svbool_t pg) +{ + return _ZGVsMxvv_pow (x, svdup_f64 (23.4), pg); +} + +static svfloat64_t +y_Z_sv_pow (svfloat64_t x, svbool_t pg) +{ + return _ZGVsMxvv_pow (svdup_f64 (2.34), x, pg); +} + +static svfloat32_t +_Z_sv_sincospif_wrap (svfloat32_t x, svbool_t pg) +{ + float s[svcntw ()], c[svcntw ()]; + _ZGVsMxvl4l4_sincospif (x, s, c, pg); + return svadd_x (pg, svld1 (pg, s), svld1 (pg, c)); +} + +static svfloat64_t +_Z_sv_sincospi_wrap (svfloat64_t x, svbool_t pg) +{ + double s[svcntd ()], c[svcntd ()]; + _ZGVsMxvl8l8_sincospi (x, s, c, pg); + return svadd_x (pg, svld1 (pg, s), svld1 (pg, c)); +} + +static svfloat32_t +_Z_sv_modff_wrap (svfloat32_t x, svbool_t pg) +{ + float i[svcntw ()]; + svfloat32_t r = _ZGVsMxvl4_modff (x, i, pg); + return svadd_x (pg, r, svld1 (pg, i)); +} + +static svfloat64_t +_Z_sv_modf_wrap (svfloat64_t x, svbool_t pg) +{ + double i[svcntd ()]; + svfloat64_t r = _ZGVsMxvl8_modf (x, i, pg); + return svadd_x (pg, r, svld1 (pg, i)); +} + +static svfloat32_t +_Z_sv_sincosf_wrap (svfloat32_t x, svbool_t pg) +{ + float s[svcntw ()], c[svcntw ()]; + _ZGVsMxvl4l4_sincosf (x, s, c, pg); + return svadd_x (pg, svld1 (pg, s), svld1 (pg, s)); +} + +static svfloat32_t +_Z_sv_cexpif_wrap (svfloat32_t x, svbool_t pg) +{ + svfloat32x2_t sc = _ZGVsMxv_cexpif (x, pg); + return svadd_x (pg, svget2 (sc, 0), svget2 (sc, 1)); +} + +static svfloat64_t +_Z_sv_sincos_wrap (svfloat64_t x, svbool_t pg) +{ + double s[svcntd ()], c[svcntd ()]; + _ZGVsMxvl8l8_sincos (x, s, c, pg); + return svadd_x (pg, svld1 (pg, s), svld1 (pg, s)); +} + +static svfloat64_t +_Z_sv_cexpi_wrap (svfloat64_t x, svbool_t pg) +{ + svfloat64x2_t sc = _ZGVsMxv_cexpi (x, pg); + return svadd_x (pg, 
svget2 (sc, 0), svget2 (sc, 1)); +} + +# if WANT_EXPERIMENTAL_MATH + +static svfloat32_t +_Z_sv_powi_wrap (svfloat32_t x, svbool_t pg) +{ + return _ZGVsMxvv_powi (x, svcvt_s32_f32_x (pg, x), pg); +} + +static svfloat64_t +_Z_sv_powk_wrap (svfloat64_t x, svbool_t pg) +{ + return _ZGVsMxvv_powk (x, svcvt_s64_f64_x (pg, x), pg); +} + +# endif + +#endif + +#if __aarch64__ +static float +sincospif_wrap (float x) +{ + float s, c; + arm_math_sincospif (x, &s, &c); + return s + c; +} + +static double +sincospi_wrap (double x) +{ + double s, c; + arm_math_sincospi (x, &s, &c); + return s + c; +} #endif static double diff --git a/math/test/mathtest.c b/math/test/mathtest.c index 834233fdde9da7..6e81f0d7b6340d 100644 --- a/math/test/mathtest.c +++ b/math/test/mathtest.c @@ -1,10 +1,12 @@ /* * mathtest.c - test rig for mathlib * - * Copyright (c) 1998-2023, Arm Limited. + * Copyright (c) 1998-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ +/* clang-format off */ +#define _GNU_SOURCE #include #include #include @@ -196,11 +198,9 @@ int is_complex_rettype(int rettype) { #define TFUNCARM(arg,ret,name,tolerance) { t_func, arg, ret, (void*)& ARM_PREFIX(name), m_none, tolerance, #name } #define MFUNC(arg,ret,name,tolerance) { t_macro, arg, ret, NULL, m_##name, tolerance, #name } -#ifndef PL /* sincosf wrappers for easier testing. 
*/ static float sincosf_sinf(float x) { float s,c; sincosf(x, &s, &c); return s; } static float sincosf_cosf(float x) { float s,c; sincosf(x, &s, &c); return c; } -#endif test_func tfuncs[] = { /* trigonometric */ @@ -220,10 +220,9 @@ test_func tfuncs[] = { TFUNCARM(at_s,rt_s, tanf, 4*ULPUNIT), TFUNCARM(at_s,rt_s, sinf, 3*ULPUNIT/4), TFUNCARM(at_s,rt_s, cosf, 3*ULPUNIT/4), -#ifndef PL TFUNCARM(at_s,rt_s, sincosf_sinf, 3*ULPUNIT/4), TFUNCARM(at_s,rt_s, sincosf_cosf, 3*ULPUNIT/4), -#endif + /* hyperbolic */ TFUNC(at_d, rt_d, atanh, 4*ULPUNIT), TFUNC(at_d, rt_d, asinh, 4*ULPUNIT), @@ -254,7 +253,9 @@ test_func tfuncs[] = { TFUNCARM(at_s,rt_s, expf, 3*ULPUNIT/4), TFUNCARM(at_s,rt_s, exp2f, 3*ULPUNIT/4), TFUNC(at_s,rt_s, expm1f, ULPUNIT), +#if WANT_EXP10_TESTS TFUNC(at_d,rt_d, exp10, ULPUNIT), +#endif /* power */ TFUNC(at_d2,rt_d, pow, 3*ULPUNIT/4), @@ -1707,3 +1708,4 @@ void undef_func() { failed++; puts("ERROR: undefined function called"); } +/* clang-format on */ diff --git a/math/test/rtest/dotest.c b/math/test/rtest/dotest.c index 5b3e9b4f18e467..dd8ceb068141b0 100644 --- a/math/test/rtest/dotest.c +++ b/math/test/rtest/dotest.c @@ -1,7 +1,7 @@ /* * dotest.c - actually generate mathlib test cases * - * Copyright (c) 1999-2019, Arm Limited. + * Copyright (c) 1999-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ @@ -18,6 +18,35 @@ #define MPFR_PREC 96 /* good enough for float or double + a few extra bits */ +#if MPFR_VERSION < MPFR_VERSION_NUM(4, 2, 0) +int +mpfr_tanpi (mpfr_t ret, const mpfr_t arg, mpfr_rnd_t rnd) +{ + MPFR_DECL_INIT (frd, MPFR_PREC); + mpfr_const_pi (frd, GMP_RNDN); + mpfr_mul (frd, frd, arg, GMP_RNDN); + return mpfr_tan (ret, frd, GMP_RNDN); +} + +int +mpfr_sinpi (mpfr_t ret, const mpfr_t arg, mpfr_rnd_t rnd) +{ + MPFR_DECL_INIT (frd, MPFR_PREC); + mpfr_const_pi (frd, GMP_RNDN); + mpfr_mul (frd, frd, arg, GMP_RNDN); + return mpfr_sin (ret, frd, GMP_RNDN); +} + +int +mpfr_cospi (mpfr_t ret, const mpfr_t arg, mpfr_rnd_t rnd) +{ + MPFR_DECL_INIT (frd, MPFR_PREC); + mpfr_const_pi (frd, GMP_RNDN); + mpfr_mul (frd, frd, arg, GMP_RNDN); + return mpfr_cos (ret, frd, GMP_RNDN); +} +#endif + extern int lib_fo, lib_no_arith, ntests; /* @@ -454,6 +483,7 @@ void universal_wrapper(wrapperctx *ctx) } } +/* clang-format off */ Testable functions[] = { /* * Trig functions: sin, cos, tan. We test the core function @@ -479,6 +509,18 @@ Testable functions[] = { cases_uniform_float, 0x39800000, 0x41800000}, {"sincosf_cosf", (funcptr)mpfr_cos, args1f, {NULL}, cases_uniform_float, 0x39800000, 0x41800000}, + {"sinpi", (funcptr)mpfr_sinpi, args1, {NULL}, + cases_uniform, 0x3e400000, 0x40300000}, + {"sinpif", (funcptr)mpfr_sinpi, args1f, {NULL}, + cases_uniform_float, 0x39800000, 0x41800000}, + {"cospi", (funcptr)mpfr_cospi, args1, {NULL}, + cases_uniform, 0x3e400000, 0x40300000}, + {"cospif", (funcptr)mpfr_cospi, args1f, {NULL}, + cases_uniform_float, 0x39800000, 0x41800000}, + {"tanpi", (funcptr)mpfr_tanpi, args1, {NULL}, + cases_uniform, 0x3e400000, 0x40300000}, + {"tanpif", (funcptr)mpfr_tanpi, args1f, {NULL}, + cases_uniform_float, 0x39800000, 0x41800000}, /* * Inverse trig: asin, acos. Between 1 and -1, of course. acos * goes down to 2^-54, asin to 2^-27. 
@@ -708,6 +750,7 @@ Testable functions[] = { {"tgammaf", (funcptr)mpfr_gamma, args1f, {NULL}, cases_uniform_float, 0x2f800000, 0x43000000}, {"tgamma", (funcptr)mpfr_gamma, args1, {NULL}, cases_uniform, 0x3c000000, 0x40800000}, }; +/* clang-format on */ const int nfunctions = ( sizeof(functions)/sizeof(*functions) ); diff --git a/math/test/runulp.sh b/math/test/runulp.sh index e2e03e3ae76196..672908f355c409 100755 --- a/math/test/runulp.sh +++ b/math/test/runulp.sh @@ -2,7 +2,7 @@ # ULP error check script. # -# Copyright (c) 2019-2023, Arm Limited. +# Copyright (c) 2019-2024, Arm Limited. # SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception #set -x @@ -20,260 +20,83 @@ FAIL=0 PASS=0 t() { - [ $r = "n" ] && Lt=$L || Lt=$Ldir - $emu ./ulp -r $r -e $Lt $flags "$@" && PASS=$((PASS+1)) || FAIL=$((FAIL+1)) + # First argument: routine name + routine=$1; shift + # Second and third argument: lo and hi bounds + # Extra processing needed for bivariate routines + IFS=',' read -ra LO <<< "$1"; shift + IFS=',' read -ra HI <<< "$1"; shift + ITV="${LO[0]} ${HI[0]}" + for i in "${!LO[@]}"; do + [[ "$i" -eq "0" ]] || ITV="$ITV x ${LO[$i]} ${HI[$i]}" + done + # Fourth argument: number of test points + n=$1; shift + # Any remaining arguments forwards directly to ulp tool + extra_flags="$@" + + # Read ULP limits, fenv expectation and control values from autogenerated files + limits_file=$LIMITS + [ $r == "n" ] || limits_file=${limits_file}_nn + L=$(grep "^$routine " $limits_file | awk '{print $2}') + [ -n "$L" ] || { echo ERROR: Could not determine ULP limit for $routine in $limits_file && false; } + cvals=($(grep "^$routine " $CVALS | awk '{print $2}')) + + if grep -q "^$routine$" $DISABLE_FENV; then extra_flags="$extra_flags -f"; fi + # Emulate a do-while loop to loop over cvals, but still execute once if it is empty + while : ; do + # Empty string if we are at the end of cvals array + c_arg="" + [ -z "${cvals[0]:-}" ] || c_arg="-c ${cvals[0]}" + $emu ./ulp -e $L $flags 
$extra_flags -r $r $c_arg $routine $ITV $n && PASS=$((PASS+1)) || FAIL=$((FAIL+1)) + # Shift cvals by 1, and break if it is now empty + cvals=("${cvals[@]:1}") + [ -n "${cvals[0]:-}" ] || break + done + + # Run ULP tool + } check() { - $emu ./ulp -f -q "$@" >/dev/null + $emu ./ulp -f -q "$@" } -Ldir=0.5 +if [[ $WANT_EXPERIMENTAL_MATH -eq 1 ]] && [[ $WANT_SVE_TESTS -eq 1 ]] && [[ $USE_MPFR -eq 0 ]]; then + # No guarantees about powi accuracy, so regression-test for exactness + # w.r.t. the custom reference impl in ulp_wrappers.h + if [ -z "$FUNC" ] || [ "$FUNC" == "_ZGVsMxvv_powi" ]; then + check -q -f -e 0 _ZGVsMxvv_powi 0 inf x 0 1000 100000 + check -q -f -e 0 _ZGVsMxvv_powi -0 -inf x 0 1000 100000 + check -q -f -e 0 _ZGVsMxvv_powi 0 inf x -0 -1000 100000 + check -q -f -e 0 _ZGVsMxvv_powi -0 -inf x -0 -1000 100000 + fi + if [ -z "$FUNC" ] || [ "$FUNC" == "_ZGVsMxvv_powk" ]; then + check -q -f -e 0 _ZGVsMxvv_powk 0 inf x 0 1000 100000 + check -q -f -e 0 _ZGVsMxvv_powk -0 -inf x 0 1000 100000 + check -q -f -e 0 _ZGVsMxvv_powk 0 inf x -0 -1000 100000 + check -q -f -e 0 _ZGVsMxvv_powk -0 -inf x -0 -1000 100000 + fi +fi + +# Test generic routines in all rounding modes for r in $rmodes do -L=0.01 -t exp 0 0xffff000000000000 10000 -t exp 0x1p-6 0x1p6 40000 -t exp -0x1p-6 -0x1p6 40000 -t exp 633.3 733.3 10000 -t exp -633.3 -777.3 10000 - -L=0.01 -t exp2 0 0xffff000000000000 10000 -t exp2 0x1p-6 0x1p6 40000 -t exp2 -0x1p-6 -0x1p6 40000 -t exp2 633.3 733.3 10000 -t exp2 -633.3 -777.3 10000 - -L=0.02 -t log 0 0xffff000000000000 10000 -t log 0x1p-4 0x1p4 40000 -t log 0 inf 40000 - -L=0.05 -t log2 0 0xffff000000000000 10000 -t log2 0x1p-4 0x1p4 40000 -t log2 0 inf 40000 - -L=0.05 -t pow 0.5 2.0 x 0 inf 20000 -t pow -0.5 -2.0 x 0 inf 20000 -t pow 0.5 2.0 x -0 -inf 20000 -t pow -0.5 -2.0 x -0 -inf 20000 -t pow 0.5 2.0 x 0x1p-10 0x1p10 40000 -t pow 0.5 2.0 x -0x1p-10 -0x1p10 40000 -t pow 0 inf x 0.5 2.0 80000 -t pow 0 inf x -0.5 -2.0 80000 -t pow 0x1.fp-1 0x1.08p0 x 0x1p8 0x1p17 
80000 -t pow 0x1.fp-1 0x1.08p0 x -0x1p8 -0x1p17 80000 -t pow 0 0x1p-1000 x 0 1.0 50000 -t pow 0x1p1000 inf x 0 1.0 50000 -t pow 0x1.ffffffffffff0p-1 0x1.0000000000008p0 x 0x1p60 0x1p68 50000 -t pow 0x1.ffffffffff000p-1 0x1p0 x 0x1p50 0x1p52 50000 -t pow -0x1.ffffffffff000p-1 -0x1p0 x 0x1p50 0x1p52 50000 - -L=0.02 -t exp10 0 0x1p-47 5000 -t exp10 -0 -0x1p-47 5000 -t exp10 0x1p-47 1 50000 -t exp10 -0x1p-47 -1 50000 -t exp10 1 0x1.34413509f79ffp8 50000 -t exp10 -1 -0x1.434e6420f4374p8 50000 -t exp10 0x1.34413509f79ffp8 inf 5000 -t exp10 -0x1.434e6420f4374p8 -inf 5000 - -L=1.0 -Ldir=0.9 -t erf 0 0xffff000000000000 10000 -t erf 0x1p-1022 0x1p-26 40000 -t erf -0x1p-1022 -0x1p-26 40000 -t erf 0x1p-26 0x1p3 40000 -t erf -0x1p-26 -0x1p3 40000 -t erf 0 inf 40000 -Ldir=0.5 - -L=0.01 -t expf 0 0xffff0000 10000 -t expf 0x1p-14 0x1p8 50000 -t expf -0x1p-14 -0x1p8 50000 - -L=0.01 -t exp2f 0 0xffff0000 10000 -t exp2f 0x1p-14 0x1p8 50000 -t exp2f -0x1p-14 -0x1p8 50000 - -L=0.32 -t logf 0 0xffff0000 10000 -t logf 0x1p-4 0x1p4 50000 -t logf 0 inf 50000 - -L=0.26 -t log2f 0 0xffff0000 10000 -t log2f 0x1p-4 0x1p4 50000 -t log2f 0 inf 50000 - -L=0.06 -t sinf 0 0xffff0000 10000 -t sinf 0x1p-14 0x1p54 50000 -t sinf -0x1p-14 -0x1p54 50000 - -L=0.06 -t cosf 0 0xffff0000 10000 -t cosf 0x1p-14 0x1p54 50000 -t cosf -0x1p-14 -0x1p54 50000 - -L=0.06 -t sincosf_sinf 0 0xffff0000 10000 -t sincosf_sinf 0x1p-14 0x1p54 50000 -t sincosf_sinf -0x1p-14 -0x1p54 50000 - -L=0.06 -t sincosf_cosf 0 0xffff0000 10000 -t sincosf_cosf 0x1p-14 0x1p54 50000 -t sincosf_cosf -0x1p-14 -0x1p54 50000 - -L=0.4 -t powf 0x1p-1 0x1p1 x 0x1p-7 0x1p7 50000 -t powf 0x1p-1 0x1p1 x -0x1p-7 -0x1p7 50000 -t powf 0x1p-70 0x1p70 x 0x1p-1 0x1p1 50000 -t powf 0x1p-70 0x1p70 x -0x1p-1 -0x1p1 50000 -t powf 0x1.ep-1 0x1.1p0 x 0x1p8 0x1p14 50000 -t powf 0x1.ep-1 0x1.1p0 x -0x1p8 -0x1p14 50000 - -L=0.6 -Ldir=0.9 -t erff 0 0xffff0000 10000 -t erff 0x1p-127 0x1p-26 40000 -t erff -0x1p-127 -0x1p-26 40000 -t erff 0x1p-26 0x1p3 40000 -t erff 
-0x1p-26 -0x1p3 40000 -t erff 0 inf 40000 -Ldir=0.5 - + while read F LO HI N + do + [[ -z $F ]] || t $F $LO $HI $N + done << EOF +$(grep "\b$FUNC\b" $GEN_ITVS) +EOF done -# vector functions - -Ldir=0.5 -r='n' -flags="${ULPFLAGS:--q}" - -range_exp=' - 0 0xffff000000000000 10000 - 0x1p-6 0x1p6 400000 - -0x1p-6 -0x1p6 400000 - 633.3 733.3 10000 - -633.3 -777.3 10000 -' - -range_log=' - 0 0xffff000000000000 10000 - 0x1p-4 0x1p4 400000 - 0 inf 400000 -' - -range_pow=' - 0x1p-1 0x1p1 x 0x1p-10 0x1p10 50000 - 0x1p-1 0x1p1 x -0x1p-10 -0x1p10 50000 - 0x1p-500 0x1p500 x 0x1p-1 0x1p1 50000 - 0x1p-500 0x1p500 x -0x1p-1 -0x1p1 50000 - 0x1.ep-1 0x1.1p0 x 0x1p8 0x1p16 50000 - 0x1.ep-1 0x1.1p0 x -0x1p8 -0x1p16 50000 -' - -range_sin=' - 0 0x1p23 500000 - -0 -0x1p23 500000 - 0x1p23 inf 10000 - -0x1p23 -inf 10000 -' -range_cos="$range_sin" - -range_expf=' - 0 0xffff0000 10000 - 0x1p-14 0x1p8 500000 - -0x1p-14 -0x1p8 500000 -' - -range_expf_1u="$range_expf" -range_exp2f="$range_expf" -range_exp2f_1u="$range_expf" - -range_logf=' - 0 0xffff0000 10000 - 0x1p-4 0x1p4 500000 -' - -range_sinf=' - 0 0x1p20 500000 - -0 -0x1p20 500000 - 0x1p20 inf 10000 - -0x1p20 -inf 10000 -' -range_cosf="$range_sinf" - -range_powf=' - 0x1p-1 0x1p1 x 0x1p-7 0x1p7 50000 - 0x1p-1 0x1p1 x -0x1p-7 -0x1p7 50000 - 0x1p-70 0x1p70 x 0x1p-1 0x1p1 50000 - 0x1p-70 0x1p70 x -0x1p-1 -0x1p1 50000 - 0x1.ep-1 0x1.1p0 x 0x1p8 0x1p14 50000 - 0x1.ep-1 0x1.1p0 x -0x1p8 -0x1p14 50000 -' - -# error limits -L_exp=1.9 -L_log=1.2 -L_pow=0.05 -L_sin=3.0 -L_cos=3.0 -L_expf=1.49 -L_expf_1u=0.4 -L_exp2f=1.49 -L_exp2f_1u=0.4 -L_logf=2.9 -L_sinf=1.4 -L_cosf=1.4 -L_powf=2.1 - -while read G F D +# Only test arch-specific routines in round-to-nearest, with sign of zero ignored (-z flag) +r=n +while read F LO HI N do - case "$G" in \#*) continue ;; esac - eval range="\${range_$G}" - eval L="\${L_$G}" - while read X - do - [ -n "$X" ] || continue - case "$X" in \#*) continue ;; esac - disable_fenv="" - if [ -z "$WANT_SIMD_EXCEPT" ] || [ 
$WANT_SIMD_EXCEPT -eq 0 ]; then - # If library was built with SIMD exceptions - # disabled, disable fenv checking in ulp - # tool. Otherwise, fenv checking may still be - # disabled by adding -f to the end of the run - # line. - disable_fenv="-f" - fi - t $D $disable_fenv $F $X - done << EOF -$range - -EOF + [[ -z $F ]] || t $F $LO $HI $N -z done << EOF -# group symbol run -exp _ZGVnN2v_exp -log _ZGVnN2v_log -pow _ZGVnN2vv_pow -f -sin _ZGVnN2v_sin -z -cos _ZGVnN2v_cos -expf _ZGVnN4v_expf -expf_1u _ZGVnN4v_expf_1u -f -exp2f _ZGVnN4v_exp2f -exp2f_1u _ZGVnN4v_exp2f_1u -f -logf _ZGVnN4v_logf -sinf _ZGVnN4v_sinf -z -cosf _ZGVnN4v_cosf -powf _ZGVnN4vv_powf -f +$(grep "\b$FUNC\b" $ARCH_ITVS) EOF [ 0 -eq $FAIL ] || { diff --git a/math/test/test_defs.h b/math/test/test_defs.h new file mode 100644 index 00000000000000..d0656c9e1d84d2 --- /dev/null +++ b/math/test/test_defs.h @@ -0,0 +1,31 @@ +/* + * Helper macros for emitting various details about routines for consumption by + * runulp.sh. + * + * Copyright (c) 2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception. + */ + +#define TEST_ULP(f, l) TEST_ULP f l +#define TEST_ULP_NONNEAREST(f, l) TEST_ULP_NONNEAREST f l + +/* Emit routine name if e == 0 and f is expected to correctly trigger fenv + exceptions. e allows declaration to be emitted conditionally on + WANT_SIMD_EXCEPT - defer expansion by one pass to allow those flags to be + expanded properly. 
*/ +#define TEST_DISABLE_FENV(f) TEST_DISABLE_FENV f +#define TEST_DISABLE_FENV_IF_NOT(f, e) TEST_DISABLE_FENV_IF_NOT_ (f, e) +#define TEST_DISABLE_FENV_IF_NOT_(f, e) TEST_DISABLE_FENV_IF_NOT_##e (f) +#define TEST_DISABLE_FENV_IF_NOT_0(f) TEST_DISABLE_FENV (f) +#define TEST_DISABLE_FENV_IF_NOT_1(f) + +#define TEST_INTERVAL(f, lo, hi, n) TEST_INTERVAL f lo hi n +#define TEST_SYM_INTERVAL(f, lo, hi, n) \ + TEST_INTERVAL (f, lo, hi, n) \ + TEST_INTERVAL (f, -lo, -hi, n) +// clang-format off +#define TEST_INTERVAL2(f, xlo, xhi, ylo, yhi, n) \ + TEST_INTERVAL f xlo,ylo xhi,yhi n +// clang-format on + +#define TEST_CONTROL_VALUE(f, c) TEST_CONTROL_VALUE f c diff --git a/pl/math/test/testcases/directed/acos.tst b/math/test/testcases/directed/acos.tst similarity index 95% rename from pl/math/test/testcases/directed/acos.tst rename to math/test/testcases/directed/acos.tst index a73dcd25965bb4..7889e62f4459fa 100644 --- a/pl/math/test/testcases/directed/acos.tst +++ b/math/test/testcases/directed/acos.tst @@ -1,6 +1,6 @@ ; acos.tst ; -; Copyright (c) 2009-2023, Arm Limited. +; Copyright (c) 2009-2024, Arm Limited. ; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=acos op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 diff --git a/pl/math/test/testcases/directed/acosf.tst b/math/test/testcases/directed/acosf.tst similarity index 95% rename from pl/math/test/testcases/directed/acosf.tst rename to math/test/testcases/directed/acosf.tst index 9e453e3bff5e80..0c2165967abbfc 100644 --- a/pl/math/test/testcases/directed/acosf.tst +++ b/math/test/testcases/directed/acosf.tst @@ -1,6 +1,6 @@ ; acosf.tst ; -; Copyright (c) 2009-2023, Arm Limited. +; Copyright (c) 2009-2024, Arm Limited. 
; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=acosf op1=7fc00001 result=7fc00001 errno=0 diff --git a/pl/math/test/testcases/directed/acosh.tst b/math/test/testcases/directed/acosh.tst similarity index 96% rename from pl/math/test/testcases/directed/acosh.tst rename to math/test/testcases/directed/acosh.tst index dd962bd391daa1..b78d64bb8ea71a 100644 --- a/pl/math/test/testcases/directed/acosh.tst +++ b/math/test/testcases/directed/acosh.tst @@ -1,6 +1,6 @@ ; acosh.tst ; -; Copyright (c) 2009-2023, Arm Limited. +; Copyright (c) 2009-2024, Arm Limited. ; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=acosh op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 diff --git a/pl/math/test/testcases/directed/acoshf.tst b/math/test/testcases/directed/acoshf.tst similarity index 95% rename from pl/math/test/testcases/directed/acoshf.tst rename to math/test/testcases/directed/acoshf.tst index 606c615f9b74a7..9eec2caf014d17 100644 --- a/pl/math/test/testcases/directed/acoshf.tst +++ b/math/test/testcases/directed/acoshf.tst @@ -1,6 +1,6 @@ ; acoshf.tst ; -; Copyright (c) 2009-2023, Arm Limited. +; Copyright (c) 2009-2024, Arm Limited. ; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=acoshf op1=7fc00001 result=7fc00001 errno=0 diff --git a/pl/math/test/testcases/directed/asin.tst b/math/test/testcases/directed/asin.tst similarity index 97% rename from pl/math/test/testcases/directed/asin.tst rename to math/test/testcases/directed/asin.tst index 6180d7849d9038..7b916f3624c03c 100644 --- a/pl/math/test/testcases/directed/asin.tst +++ b/math/test/testcases/directed/asin.tst @@ -1,6 +1,6 @@ ; asin.tst ; -; Copyright (c) 2009-2023, Arm Limited. +; Copyright (c) 2009-2024, Arm Limited. 
; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=asin op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 diff --git a/pl/math/test/testcases/directed/asinf.tst b/math/test/testcases/directed/asinf.tst similarity index 96% rename from pl/math/test/testcases/directed/asinf.tst rename to math/test/testcases/directed/asinf.tst index a85b2593768d33..d5830b99b62081 100644 --- a/pl/math/test/testcases/directed/asinf.tst +++ b/math/test/testcases/directed/asinf.tst @@ -1,6 +1,6 @@ ; asinf.tst ; -; Copyright (c) 2009-2023, Arm Limited. +; Copyright (c) 2009-2024, Arm Limited. ; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=asinf op1=7fc00001 result=7fc00001 errno=0 diff --git a/pl/math/test/testcases/directed/asinh.tst b/math/test/testcases/directed/asinh.tst similarity index 95% rename from pl/math/test/testcases/directed/asinh.tst rename to math/test/testcases/directed/asinh.tst index 1485dfeffecf2e..9b250a14f50c8e 100644 --- a/pl/math/test/testcases/directed/asinh.tst +++ b/math/test/testcases/directed/asinh.tst @@ -1,6 +1,6 @@ ; asinh.tst ; -; Copyright (c) 2022-2023, Arm Limited. +; Copyright (c) 2022-2024, Arm Limited. ; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=asinh op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 diff --git a/pl/math/test/testcases/directed/asinhf.tst b/math/test/testcases/directed/asinhf.tst similarity index 95% rename from pl/math/test/testcases/directed/asinhf.tst rename to math/test/testcases/directed/asinhf.tst index eb76a5892a7062..f2410e09b03e7e 100644 --- a/pl/math/test/testcases/directed/asinhf.tst +++ b/math/test/testcases/directed/asinhf.tst @@ -1,6 +1,6 @@ ; asinhf.tst ; -; Copyright (c) 2007-2023, Arm Limited. +; Copyright (c) 2007-2024, Arm Limited. 
; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=asinhf op1=7fc00001 result=7fc00001 errno=0 diff --git a/pl/math/test/testcases/directed/atan.tst b/math/test/testcases/directed/atan.tst similarity index 96% rename from pl/math/test/testcases/directed/atan.tst rename to math/test/testcases/directed/atan.tst index 4c670553d58fb0..d29b13245cd548 100644 --- a/pl/math/test/testcases/directed/atan.tst +++ b/math/test/testcases/directed/atan.tst @@ -1,6 +1,6 @@ ; atan.tst ; -; Copyright (c) 1999-2023, Arm Limited. +; Copyright (c) 1999-2024, Arm Limited. ; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=atan op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 diff --git a/pl/math/test/testcases/directed/atan2.tst b/math/test/testcases/directed/atan2.tst similarity index 99% rename from pl/math/test/testcases/directed/atan2.tst rename to math/test/testcases/directed/atan2.tst index 647b3764072cc1..3e34e7641f284c 100644 --- a/pl/math/test/testcases/directed/atan2.tst +++ b/math/test/testcases/directed/atan2.tst @@ -1,6 +1,6 @@ ; atan2.tst ; -; Copyright (c) 1999-2023, Arm Limited. +; Copyright (c) 1999-2024, Arm Limited. ; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=atan2 op1=7ff00000.00000001 op2=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i diff --git a/pl/math/test/testcases/directed/atan2f.tst b/math/test/testcases/directed/atan2f.tst similarity index 99% rename from pl/math/test/testcases/directed/atan2f.tst rename to math/test/testcases/directed/atan2f.tst index 85c5c5d47e10b3..e637fe0eba24d8 100644 --- a/pl/math/test/testcases/directed/atan2f.tst +++ b/math/test/testcases/directed/atan2f.tst @@ -1,6 +1,6 @@ ; atan2f.tst ; -; Copyright (c) 1999-2023, Arm Limited. +; Copyright (c) 1999-2024, Arm Limited. 
; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=atan2f op1=7f800001 op2=7f800001 result=7fc00001 errno=0 status=i diff --git a/pl/math/test/testcases/directed/atanf.tst b/math/test/testcases/directed/atanf.tst similarity index 95% rename from pl/math/test/testcases/directed/atanf.tst rename to math/test/testcases/directed/atanf.tst index 0a0bfc24c6050f..8739ea89c3a28a 100644 --- a/pl/math/test/testcases/directed/atanf.tst +++ b/math/test/testcases/directed/atanf.tst @@ -1,6 +1,6 @@ ; atanf.tst ; -; Copyright (c) 2007-2023, Arm Limited. +; Copyright (c) 2007-2024, Arm Limited. ; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=atanf op1=7fc00001 result=7fc00001 errno=0 diff --git a/pl/math/test/testcases/directed/atanh.tst b/math/test/testcases/directed/atanh.tst similarity index 97% rename from pl/math/test/testcases/directed/atanh.tst rename to math/test/testcases/directed/atanh.tst index d96ff327fcd9a8..7ba297e5046c7e 100644 --- a/pl/math/test/testcases/directed/atanh.tst +++ b/math/test/testcases/directed/atanh.tst @@ -1,6 +1,6 @@ ; atanh.tst ; -; Copyright (c) 2009-2023, Arm Limited. +; Copyright (c) 2009-2024, Arm Limited. ; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=atanh op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 diff --git a/pl/math/test/testcases/directed/atanhf.tst b/math/test/testcases/directed/atanhf.tst similarity index 96% rename from pl/math/test/testcases/directed/atanhf.tst rename to math/test/testcases/directed/atanhf.tst index 21a68a661a1134..010012831b3cba 100644 --- a/pl/math/test/testcases/directed/atanhf.tst +++ b/math/test/testcases/directed/atanhf.tst @@ -1,6 +1,6 @@ ; atanhf.tst ; -; Copyright (c) 2009-2023, Arm Limited. +; Copyright (c) 2009-2024, Arm Limited. 
; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=atanhf op1=7fc00001 result=7fc00001 errno=0 diff --git a/pl/math/test/testcases/directed/cbrtf.tst b/math/test/testcases/directed/cbrtf.tst similarity index 97% rename from pl/math/test/testcases/directed/cbrtf.tst rename to math/test/testcases/directed/cbrtf.tst index 0dd8d09f1d4fb5..98942580c7a790 100644 --- a/pl/math/test/testcases/directed/cbrtf.tst +++ b/math/test/testcases/directed/cbrtf.tst @@ -1,6 +1,6 @@ ; cbrtf.tst ; -; Copyright (c) 2009-2023, Arm Limited. +; Copyright (c) 2009-2024, Arm Limited. ; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=cbrtf op1=7f800000 result=7f800000 errno=0 diff --git a/pl/math/test/testcases/directed/cosh.tst b/math/test/testcases/directed/cosh.tst similarity index 95% rename from pl/math/test/testcases/directed/cosh.tst rename to math/test/testcases/directed/cosh.tst index c4efacb7272d47..4dc6fe4846dcf6 100644 --- a/pl/math/test/testcases/directed/cosh.tst +++ b/math/test/testcases/directed/cosh.tst @@ -1,6 +1,6 @@ ; cosh.tst ; -; Copyright (c) 1999-2023, Arm Limited. +; Copyright (c) 1999-2024, Arm Limited. ; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=cosh op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 diff --git a/pl/math/test/testcases/directed/coshf.tst b/math/test/testcases/directed/coshf.tst similarity index 93% rename from pl/math/test/testcases/directed/coshf.tst rename to math/test/testcases/directed/coshf.tst index 2b967e78f4b425..d224baf486a519 100644 --- a/pl/math/test/testcases/directed/coshf.tst +++ b/math/test/testcases/directed/coshf.tst @@ -1,6 +1,6 @@ ; coshf.tst ; -; Copyright (c) 2007-2023, Arm Limited. +; Copyright (c) 2007-2024, Arm Limited. 
; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=coshf op1=7fc00001 result=7fc00001 errno=0 diff --git a/pl/math/test/testcases/directed/erfc.tst b/math/test/testcases/directed/erfc.tst similarity index 96% rename from pl/math/test/testcases/directed/erfc.tst rename to math/test/testcases/directed/erfc.tst index c03fc591da47ae..249e7343eac21c 100644 --- a/pl/math/test/testcases/directed/erfc.tst +++ b/math/test/testcases/directed/erfc.tst @@ -1,6 +1,6 @@ ; erfc.tst - Directed test cases for erfc ; -; Copyright (c) 2022-2023, Arm Limited. +; Copyright (c) 2022-2024, Arm Limited. ; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=erfc op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 diff --git a/pl/math/test/testcases/directed/erfcf.tst b/math/test/testcases/directed/erfcf.tst similarity index 93% rename from pl/math/test/testcases/directed/erfcf.tst rename to math/test/testcases/directed/erfcf.tst index 719baccb2e452b..22a1a8f236d81a 100644 --- a/pl/math/test/testcases/directed/erfcf.tst +++ b/math/test/testcases/directed/erfcf.tst @@ -1,6 +1,6 @@ ; erfcf.tst - Directed test cases for erfcf ; -; Copyright (c) 2007-2023, Arm Limited. +; Copyright (c) 2007-2024, Arm Limited. ; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=erfcf op1=7fc00001 result=7fc00001 errno=0 diff --git a/pl/math/test/testcases/directed/expm1.tst b/math/test/testcases/directed/expm1.tst similarity index 96% rename from pl/math/test/testcases/directed/expm1.tst rename to math/test/testcases/directed/expm1.tst index 609d6f47972135..3d58c6b3f16131 100644 --- a/pl/math/test/testcases/directed/expm1.tst +++ b/math/test/testcases/directed/expm1.tst @@ -1,6 +1,6 @@ ; expm1.tst ; -; Copyright (c) 2009-2023, Arm Limited. +; Copyright (c) 2009-2024, Arm Limited. 
; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=expm1 op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 diff --git a/pl/math/test/testcases/directed/expm1f.tst b/math/test/testcases/directed/expm1f.tst similarity index 98% rename from pl/math/test/testcases/directed/expm1f.tst rename to math/test/testcases/directed/expm1f.tst index 44c38420a617eb..44a15d6798700b 100644 --- a/pl/math/test/testcases/directed/expm1f.tst +++ b/math/test/testcases/directed/expm1f.tst @@ -1,6 +1,6 @@ ; expm1f.tst ; -; Copyright (c) 2009-2023, Arm Limited. +; Copyright (c) 2009-2024, Arm Limited. ; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=expm1f op1=7fc00001 result=7fc00001 errno=0 diff --git a/pl/math/test/testcases/directed/log10.tst b/math/test/testcases/directed/log10.tst similarity index 95% rename from pl/math/test/testcases/directed/log10.tst rename to math/test/testcases/directed/log10.tst index 34831436234a8c..3ff2520134980a 100644 --- a/pl/math/test/testcases/directed/log10.tst +++ b/math/test/testcases/directed/log10.tst @@ -1,6 +1,6 @@ ; log10.tst ; -; Copyright (c) 2007-2023, Arm Limited. +; Copyright (c) 2007-2024, Arm Limited. ; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=log10 op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 diff --git a/pl/math/test/testcases/directed/log10f.tst b/math/test/testcases/directed/log10f.tst similarity index 98% rename from pl/math/test/testcases/directed/log10f.tst rename to math/test/testcases/directed/log10f.tst index d5744a66f092f9..5c83e3f5e9b4ad 100644 --- a/pl/math/test/testcases/directed/log10f.tst +++ b/math/test/testcases/directed/log10f.tst @@ -1,6 +1,6 @@ ; log10f.tst ; -; Copyright (c) 2007-2023, Arm Limited. +; Copyright (c) 2007-2024, Arm Limited. 
; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=log10f op1=7fc00001 result=7fc00001 errno=0 diff --git a/pl/math/test/testcases/directed/log1p.tst b/math/test/testcases/directed/log1p.tst similarity index 96% rename from pl/math/test/testcases/directed/log1p.tst rename to math/test/testcases/directed/log1p.tst index 9ee8c62fc9c0bf..109413a79e96a3 100644 --- a/pl/math/test/testcases/directed/log1p.tst +++ b/math/test/testcases/directed/log1p.tst @@ -1,6 +1,6 @@ ; log1p.tst ; -; Copyright (c) 2009-2023, Arm Limited. +; Copyright (c) 2009-2024, Arm Limited. ; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=log1p op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 diff --git a/pl/math/test/testcases/directed/log1pf.tst b/math/test/testcases/directed/log1pf.tst similarity index 99% rename from pl/math/test/testcases/directed/log1pf.tst rename to math/test/testcases/directed/log1pf.tst index aaa01d67c2b39d..9655b9473612c7 100644 --- a/pl/math/test/testcases/directed/log1pf.tst +++ b/math/test/testcases/directed/log1pf.tst @@ -1,6 +1,6 @@ ; log1pf.tst ; -; Copyright (c) 2009-2023, Arm Limited. +; Copyright (c) 2009-2024, Arm Limited. ; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=log1pf op1=7fc00001 result=7fc00001 errno=0 diff --git a/pl/math/test/testcases/directed/sinh.tst b/math/test/testcases/directed/sinh.tst similarity index 96% rename from pl/math/test/testcases/directed/sinh.tst rename to math/test/testcases/directed/sinh.tst index d6a3da8966933f..ab0d84b84d9ec2 100644 --- a/pl/math/test/testcases/directed/sinh.tst +++ b/math/test/testcases/directed/sinh.tst @@ -1,6 +1,6 @@ ; sinh.tst ; -; Copyright (c) 1999-2023, Arm Limited. +; Copyright (c) 1999-2024, Arm Limited. 
; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=sinh op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 diff --git a/pl/math/test/testcases/directed/sinhf.tst b/math/test/testcases/directed/sinhf.tst similarity index 95% rename from pl/math/test/testcases/directed/sinhf.tst rename to math/test/testcases/directed/sinhf.tst index 5f7bd1b04137d8..d9269c0fa405cb 100644 --- a/pl/math/test/testcases/directed/sinhf.tst +++ b/math/test/testcases/directed/sinhf.tst @@ -1,6 +1,6 @@ ; sinhf.tst ; -; Copyright (c) 2009-2023, Arm Limited. +; Copyright (c) 2009-2024, Arm Limited. ; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=sinhf op1=7fc00001 result=7fc00001 errno=0 diff --git a/pl/math/test/testcases/directed/tanf.tst b/math/test/testcases/directed/tanf.tst similarity index 96% rename from pl/math/test/testcases/directed/tanf.tst rename to math/test/testcases/directed/tanf.tst index 3161f70f43613d..e38142df6e3cea 100644 --- a/pl/math/test/testcases/directed/tanf.tst +++ b/math/test/testcases/directed/tanf.tst @@ -1,6 +1,6 @@ ; tanf.tst ; -; Copyright (c) 2022-2023, Arm Limited. +; Copyright (c) 2022-2024, Arm Limited. ; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=tanf op1=7fc00001 result=7fc00001 errno=0 diff --git a/pl/math/test/testcases/directed/tanh.tst b/math/test/testcases/directed/tanh.tst similarity index 95% rename from pl/math/test/testcases/directed/tanh.tst rename to math/test/testcases/directed/tanh.tst index 78776e6f39249c..e842063c0ef7f3 100644 --- a/pl/math/test/testcases/directed/tanh.tst +++ b/math/test/testcases/directed/tanh.tst @@ -1,6 +1,6 @@ ; tanh.tst ; -; Copyright (c) 1999-2023, Arm Limited. +; Copyright (c) 1999-2024, Arm Limited. 
; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=tanh op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 diff --git a/pl/math/test/testcases/directed/tanhf.tst b/math/test/testcases/directed/tanhf.tst similarity index 95% rename from pl/math/test/testcases/directed/tanhf.tst rename to math/test/testcases/directed/tanhf.tst index 603e3107e44fc0..412aa12b362167 100644 --- a/pl/math/test/testcases/directed/tanhf.tst +++ b/math/test/testcases/directed/tanhf.tst @@ -1,6 +1,6 @@ ; tanhf.tst ; -; Copyright (c) 2007-2023, Arm Limited. +; Copyright (c) 2007-2024, Arm Limited. ; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=tanhf op1=7fc00001 result=7fc00001 errno=0 diff --git a/math/test/trigpi_references.h b/math/test/trigpi_references.h new file mode 100644 index 00000000000000..3dc5a317343622 --- /dev/null +++ b/math/test/trigpi_references.h @@ -0,0 +1,106 @@ +/* + * Extended precision scalar reference functions for trigpi. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +#ifndef M_PIl +# define M_PIl 3.141592653589793238462643383279502884l +#endif + +long double +arm_math_sinpil (long double x) +{ + /* sin(inf) should return nan, as defined by C23. */ + if (isinf (x)) + return __math_invalid (x); + + long double ax = fabsl (x); + + /* Return 0 for all values above 2^64 to prevent + overflow when casting to uint64_t. */ + if (ax >= 0x1p64) + return x < 0 ? -0.0l : 0.0l; + + /* All integer cases should return 0, with unchanged sign for zero. */ + if (x == 0.0l) + return x; + if (ax == (uint64_t) ax) + return x < 0 ? -0.0l : 0.0l; + + return sinl (x * M_PIl); +} + +long double +arm_math_cospil (long double x) +{ + /* cos(inf) should return nan, as defined by C23. 
*/ + if (isinf (x)) + return __math_invalid (x); + + long double ax = fabsl (x); + + if (ax >= 0x1p64) + return 1; + + uint64_t m = (uint64_t) ax; + + /* Integer values of cospi(x) should return +/-1. + The sign depends on if x is odd or even. */ + if (m == ax) + return (m & 1) ? -1 : 1; + + /* Values of Integer + 0.5 should always return 0. */ + if (ax - 0.5 == m || ax + 0.5 == m) + return 0; + + return cosl (ax * M_PIl); +} + +long double +arm_math_tanpil (long double x) +{ + /* inf and x = n + 0.5 for any integral n should return nan. */ + if (fabsl (x) >= 0x1p54l) + { + if (isinf (x)) + return __math_invalid (x); + return x < 0 ? -0.0l : 0.0l; + } + + long double i = roundl (x); + long double f = x - i; + int64_t m = (int64_t) i; + + if (x == 0) + { + return x; + } + else if (x == i) + { + if (x < 0) + { + return m & 1 ? 0.0l : -0.0l; + } + else + { + return m & 1 ? -0.0l : 0.0l; + } + } + else if (fabsl (f) == 0.5l) + { + if (x < 0) + { + return m & 1 ? -1.0l / 0.0l : 1.0l / 0.0l; + } + else + { + return m & 1 ? 1.0l / 0.0l : -1.0l / 0.0l; + } + } + + return tanl (f * M_PIl); +} diff --git a/math/test/ulp.c b/math/test/ulp.c index 5ff29972e50ee0..0a75fe26463063 100644 --- a/math/test/ulp.c +++ b/math/test/ulp.c @@ -1,10 +1,23 @@ /* * ULP error checking tool for math functions. * - * Copyright (c) 2019-2023, Arm Limited. + * Copyright (c) 2019-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ +#if WANT_SVE_TESTS +# if __aarch64__ && __linux__ +# ifdef __clang__ +# pragma clang attribute push(__attribute__((target("sve"))), \ + apply_to = any(function)) +# else +# pragma GCC target("+sve") +# endif +# else +# error "SVE not supported - please disable WANT_SVE_TESTS" +# endif +#endif + #define _GNU_SOURCE #include #include @@ -16,6 +29,8 @@ #include #include "mathlib.h" +#include "trigpi_references.h" + /* Don't depend on mpfr by default. 
*/ #ifndef USE_MPFR # define USE_MPFR 0 @@ -24,50 +39,6 @@ # include #endif -static inline uint64_t -asuint64 (double f) -{ - union - { - double f; - uint64_t i; - } u = {f}; - return u.i; -} - -static inline double -asdouble (uint64_t i) -{ - union - { - uint64_t i; - double f; - } u = {i}; - return u.f; -} - -static inline uint32_t -asuint (float f) -{ - union - { - float f; - uint32_t i; - } u = {f}; - return u.i; -} - -static inline float -asfloat (uint32_t i) -{ - union - { - uint32_t i; - float f; - } u = {i}; - return u.f; -} - static uint64_t seed = 0x0123456789abcdef; static uint64_t rand64 (void) @@ -198,68 +169,96 @@ next_d2 (void *g) return (struct args_d2){asdouble (x), asdouble (x2)}; } -struct conf -{ - int r; - int rc; - int quiet; - int mpfr; - int fenv; - unsigned long long n; - double softlim; - double errlim; - int ignore_zero_sign; -}; - /* A bit of a hack: call vector functions twice with the same input in lane 0 but a different value in other lanes: once with an in-range value and then with a special case value. */ static int secondcall; /* Wrappers for vector functions. */ -#ifdef __vpcs -typedef __f32x4_t v_float; -typedef __f64x2_t v_double; +#if __aarch64__ && __linux__ /* First element of fv and dv may be changed by -c argument. 
*/ static float fv[2] = {1.0f, -INFINITY}; static double dv[2] = {1.0, -INFINITY}; -static inline v_float argf(float x) { return (v_float){x,x,x,fv[secondcall]}; } -static inline v_double argd(double x) { return (v_double){x,dv[secondcall]}; } -#if WANT_SVE_MATH +static inline float32x4_t +argf (float x) +{ + return (float32x4_t){ x, x, x, fv[secondcall] }; +} +static inline float64x2_t +argd (double x) +{ + return (float64x2_t){ x, dv[secondcall] }; +} +#if WANT_SVE_TESTS #include -typedef __SVFloat32_t sv_float; -typedef __SVFloat64_t sv_double; - -static inline sv_float svargf(float x) { - int n = svcntw(); - float base[n]; - for (int i=0; i> i) & 1; + return svcmpne (svptrue_b32 (), svld1 (svptrue_b32 (), tmp), 0); + } + else + { + uint64_t tmp[svcntd ()]; + for (unsigned i = 0; i < svcntd (); i++) + tmp[i] = (p >> i) & 1; + return svcmpne (svptrue_b64 (), svld1 (svptrue_b64 (), tmp), 0); + } +} +# endif #endif + +struct conf +{ + int r; + int rc; + int quiet; + int mpfr; + int fenv; + unsigned long long n; + double softlim; + double errlim; + int ignore_zero_sign; +#if WANT_SVE_TESTS + svbool_t *pg; #endif +}; #include "test/ulp_wrappers.h" @@ -269,12 +268,19 @@ struct fun int arity; int singleprec; int twice; + int is_predicated; union { float (*f1) (float); float (*f2) (float, float); double (*d1) (double); double (*d2) (double, double); +#if WANT_SVE_TESTS + float (*f1_pred) (svbool_t, float); + float (*f2_pred) (svbool_t, float, float); + double (*d1_pred) (svbool_t, double); + double (*d2_pred) (svbool_t, double, double); +#endif } fun; union { @@ -294,44 +300,33 @@ struct fun #endif }; +// clang-format off static const struct fun fun[] = { #if USE_MPFR -# define F(x, x_wrap, x_long, x_mpfr, a, s, t, twice) \ - {#x, a, s, twice, {.t = x_wrap}, {.t = x_long}, {.t = x_mpfr}}, +# define F(x, x_wrap, x_long, x_mpfr, a, s, t, twice) \ + { #x, a, s, twice, 0, { .t = x_wrap }, { .t = x_long }, { .t = x_mpfr } }, +# define SVF(x, x_wrap, x_long, x_mpfr, a, s, t, 
twice) \ + { #x, a, s, twice, 1, { .t##_pred = x_wrap }, { .t = x_long }, { .t = x_mpfr } }, #else -# define F(x, x_wrap, x_long, x_mpfr, a, s, t, twice) \ - {#x, a, s, twice, {.t = x_wrap}, {.t = x_long}}, +# define F(x, x_wrap, x_long, x_mpfr, a, s, t, twice) \ + { #x, a, s, twice, 0, { .t = x_wrap }, { .t = x_long } }, +# define SVF(x, x_wrap, x_long, x_mpfr, a, s, t, twice) \ + { #x, a, s, twice, 1, { .t##_pred = x_wrap }, { .t = x_long } }, #endif #define F1(x) F (x##f, x##f, x, mpfr_##x, 1, 1, f1, 0) #define F2(x) F (x##f, x##f, x, mpfr_##x, 2, 1, f2, 0) #define D1(x) F (x, x, x##l, mpfr_##x, 1, 0, d1, 0) #define D2(x) F (x, x, x##l, mpfr_##x, 2, 0, d2, 0) /* Neon routines. */ -#define VF1(x) F (__v_##x##f, v_##x##f, x, mpfr_##x, 1, 1, f1, 0) -#define VF2(x) F (__v_##x##f, v_##x##f, x, mpfr_##x, 2, 1, f2, 0) -#define VD1(x) F (__v_##x, v_##x, x##l, mpfr_##x, 1, 0, d1, 0) -#define VD2(x) F (__v_##x, v_##x, x##l, mpfr_##x, 2, 0, d2, 0) -#define VNF1(x) F (__vn_##x##f, vn_##x##f, x, mpfr_##x, 1, 1, f1, 0) -#define VNF2(x) F (__vn_##x##f, vn_##x##f, x, mpfr_##x, 2, 1, f2, 0) -#define VND1(x) F (__vn_##x, vn_##x, x##l, mpfr_##x, 1, 0, d1, 0) -#define VND2(x) F (__vn_##x, vn_##x, x##l, mpfr_##x, 2, 0, d2, 0) -#define ZVF1(x) F (_ZGVnN4v_##x##f, Z_##x##f, x, mpfr_##x, 1, 1, f1, 0) -#define ZVF2(x) F (_ZGVnN4vv_##x##f, Z_##x##f, x, mpfr_##x, 2, 1, f2, 0) -#define ZVD1(x) F (_ZGVnN2v_##x, Z_##x, x##l, mpfr_##x, 1, 0, d1, 0) -#define ZVD2(x) F (_ZGVnN2vv_##x, Z_##x, x##l, mpfr_##x, 2, 0, d2, 0) -#define ZVNF1(x) VNF1 (x) ZVF1 (x) -#define ZVNF2(x) VNF2 (x) ZVF2 (x) -#define ZVND1(x) VND1 (x) ZVD1 (x) -#define ZVND2(x) VND2 (x) ZVD2 (x) +#define ZVNF1(x) F (_ZGVnN4v_##x##f, Z_##x##f, x, mpfr_##x, 1, 1, f1, 0) +#define ZVNF2(x) F (_ZGVnN4vv_##x##f, Z_##x##f, x, mpfr_##x, 2, 1, f2, 0) +#define ZVND1(x) F (_ZGVnN2v_##x, Z_##x, x##l, mpfr_##x, 1, 0, d1, 0) +#define ZVND2(x) F (_ZGVnN2vv_##x, Z_##x, x##l, mpfr_##x, 2, 0, d2, 0) /* SVE routines. 
*/ -#define SVF1(x) F (__sv_##x##f, sv_##x##f, x, mpfr_##x, 1, 1, f1, 0) -#define SVF2(x) F (__sv_##x##f, sv_##x##f, x, mpfr_##x, 2, 1, f2, 0) -#define SVD1(x) F (__sv_##x, sv_##x, x##l, mpfr_##x, 1, 0, d1, 0) -#define SVD2(x) F (__sv_##x, sv_##x, x##l, mpfr_##x, 2, 0, d2, 0) -#define ZSVF1(x) F (_ZGVsMxv_##x##f, Z_sv_##x##f, x, mpfr_##x, 1, 1, f1, 0) -#define ZSVF2(x) F (_ZGVsMxvv_##x##f, Z_sv_##x##f, x, mpfr_##x, 2, 1, f2, 0) -#define ZSVD1(x) F (_ZGVsMxv_##x, Z_sv_##x, x##l, mpfr_##x, 1, 0, d1, 0) -#define ZSVD2(x) F (_ZGVsMxvv_##x, Z_sv_##x, x##l, mpfr_##x, 2, 0, d2, 0) +#define ZSVF1(x) SVF (_ZGVsMxv_##x##f, Z_sv_##x##f, x, mpfr_##x, 1, 1, f1, 0) +#define ZSVF2(x) SVF (_ZGVsMxvv_##x##f, Z_sv_##x##f, x, mpfr_##x, 2, 1, f2, 0) +#define ZSVD1(x) SVF (_ZGVsMxv_##x, Z_sv_##x, x##l, mpfr_##x, 1, 0, d1, 0) +#define ZSVD2(x) SVF (_ZGVsMxvv_##x, Z_sv_##x, x##l, mpfr_##x, 2, 0, d2, 0) #include "test/ulp_funcs.h" @@ -340,11 +335,13 @@ static const struct fun fun[] = { #undef F2 #undef D1 #undef D2 -#undef SVF1 -#undef SVF2 -#undef SVD1 -#undef SVD2 - {0}}; +#undef ZSVF1 +#undef ZSVF2 +#undef ZSVD1 +#undef ZSVD2 + { 0 } +}; +// clang-format on /* Boilerplate for generic calls. 
*/ @@ -365,24 +362,40 @@ ulpscale_d (double x) return e - 0x3ff - 52; } static inline float -call_f1 (const struct fun *f, struct args_f1 a) +call_f1 (const struct fun *f, struct args_f1 a, const struct conf *conf) { +#if WANT_SVE_TESTS + if (f->is_predicated) + return f->fun.f1_pred (*conf->pg, a.x); +#endif return f->fun.f1 (a.x); } static inline float -call_f2 (const struct fun *f, struct args_f2 a) +call_f2 (const struct fun *f, struct args_f2 a, const struct conf *conf) { +#if WANT_SVE_TESTS + if (f->is_predicated) + return f->fun.f2_pred (*conf->pg, a.x, a.x2); +#endif return f->fun.f2 (a.x, a.x2); } static inline double -call_d1 (const struct fun *f, struct args_d1 a) +call_d1 (const struct fun *f, struct args_d1 a, const struct conf *conf) { +#if WANT_SVE_TESTS + if (f->is_predicated) + return f->fun.d1_pred (*conf->pg, a.x); +#endif return f->fun.d1 (a.x); } static inline double -call_d2 (const struct fun *f, struct args_d2 a) +call_d2 (const struct fun *f, struct args_d2 a, const struct conf *conf) { +#if WANT_SVE_TESTS + if (f->is_predicated) + return f->fun.d2_pred (*conf->pg, a.x, a.x2); +#endif return f->fun.d2 (a.x, a.x2); } static inline double @@ -594,6 +607,11 @@ usage (void) puts ("-c: neutral 'control value' to test behaviour when one lane can affect another. \n" " This should be different from tested input in other lanes, and non-special \n" " (i.e. should not trigger fenv exceptions). Default is 1."); +#endif +#if WANT_SVE_TESTS + puts ("-p: integer input for controlling predicate passed to SVE function. " + "If bit N is set, lane N is activated (bits past the vector length " + "are ignored). Default is UINT64_MAX (ptrue)."); #endif puts ("-z: ignore sign of 0."); puts ("Supported func:"); @@ -633,9 +651,21 @@ getnum (const char *s, int singleprec) sign = singleprec ? 1ULL << 31 : 1ULL << 63; s++; } + + /* Sentinel value for failed parse. */ + char *should_not_be_s = NULL; + /* 0xXXXX is treated as bit representation, '-' flips the sign bit. 
*/ if (s[0] == '0' && tolower (s[1]) == 'x' && strchr (s, 'p') == 0) - return sign ^ strtoull (s, 0, 0); + { + uint64_t out = sign ^ strtoull (s, &should_not_be_s, 0); + if (should_not_be_s == s) + { + printf ("ERROR: Could not parse '%s'\n", s); + exit (1); + } + return out; + } // /* SNaN, QNaN, NaN, Inf. */ // for (i=0; s[i] && i < sizeof buf; i++) // buf[i] = tolower(s[i]); @@ -647,8 +677,16 @@ getnum (const char *s, int singleprec) // if (strcmp(buf, "inf") == 0 || strcmp(buf, "infinity") == 0) // return sign | (singleprec ? 0x7f800000 : 0x7ff0000000000000); /* Otherwise assume it's a floating-point literal. */ - return sign - | (singleprec ? asuint (strtof (s, 0)) : asuint64 (strtod (s, 0))); + uint64_t out = sign + | (singleprec ? asuint (strtof (s, &should_not_be_s)) + : asuint64 (strtod (s, &should_not_be_s))); + if (should_not_be_s == s) + { + printf ("ERROR: Could not parse '%s'\n", s); + exit (1); + } + + return out; } static void @@ -720,6 +758,9 @@ main (int argc, char *argv[]) conf.softlim = 0; conf.errlim = INFINITY; conf.ignore_zero_sign = 0; +#if WANT_SVE_TESTS + uint64_t pg_int = UINT64_MAX; +#endif for (;;) { argc--; @@ -767,13 +808,20 @@ main (int argc, char *argv[]) case 'z': conf.ignore_zero_sign = 1; break; -#ifdef __vpcs +#if __aarch64__ && __linux__ case 'c': argc--; argv++; fv[0] = strtof(argv[0], 0); dv[0] = strtod(argv[0], 0); break; +#endif +#if WANT_SVE_TESTS + case 'p': + argc--; + argv++; + pg_int = strtoull (argv[0], 0, 0); + break; #endif default: usage (); @@ -806,7 +854,7 @@ main (int argc, char *argv[]) if (strncmp (argv[0], "_ZGVnN", 6) == 0) exit (0); #endif -#if !WANT_SVE_MATH +#if !WANT_SVE_TESTS if (strncmp (argv[0], "_ZGVsMxv", 8) == 0) exit (0); #endif @@ -824,5 +872,13 @@ main (int argc, char *argv[]) argv++; parsegen (&gen, argc, argv, f); conf.n = gen.cnt; +#if WANT_SVE_TESTS + svbool_t pg = parse_pg (pg_int, f->singleprec); + conf.pg = &pg; +#endif return cmp (f, &gen, &conf); } + +#if __aarch64__ && __linux__ && 
WANT_SVE_TESTS && defined(__clang__) +# pragma clang attribute pop +#endif diff --git a/math/test/ulp.h b/math/test/ulp.h index b0bc59aeef8ddb..de122257d3b155 100644 --- a/math/test/ulp.h +++ b/math/test/ulp.h @@ -1,13 +1,13 @@ /* * Generic functions for ULP error estimation. * - * Copyright (c) 2019-2023, Arm Limited. + * Copyright (c) 2019-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* For each different math function type, T(x) should add a different suffix to x. - RT(x) should add a return type specific suffix to x. */ + RT(x) should add a return type specific suffix to x. */ #ifdef NEW_RT #undef NEW_RT @@ -47,8 +47,12 @@ static double RT (ulperr) (RT (float) got, const struct RT (ret) * p, int r, if (RT(asuint) (got) == RT(asuint) (want)) return 0.0; if (isnan (got) && isnan (want)) - /* Ignore sign of NaN. */ + /* Ignore sign of NaN, and signalling-ness for MPFR. */ +# if USE_MPFR + return 0; +# else return RT (issignaling) (got) == RT (issignaling) (want) ? 0 : INFINITY; +# endif if (signbit (got) != signbit (want)) { /* Fall through to ULP calculation if ignoring sign of zero and at @@ -80,7 +84,7 @@ static double RT (ulperr) (RT (float) got, const struct RT (ret) * p, int r, // TODO: incorrect when got vs want cross a powof2 boundary /* error = got > want ? got - want - tail ulp - 0.5 ulp - : got - want - tail ulp + 0.5 ulp; */ + : got - want - tail ulp + 0.5 ulp. */ d = got - want; e = d > 0 ? 
-p->tail - 0.5 : -p->tail + 0.5; } @@ -108,32 +112,34 @@ static int RT(isok_nofenv) (RT(float) ygot, RT(float) ywant) } #endif -static inline void T(call_fenv) (const struct fun *f, struct T(args) a, int r, - RT(float) * y, int *ex) +static inline void T (call_fenv) (const struct fun *f, struct T (args) a, + int r, RT (float) * y, int *ex, + const struct conf *conf) { if (r != FE_TONEAREST) fesetround (r); feclearexcept (FE_ALL_EXCEPT); - *y = T(call) (f, a); + *y = T (call) (f, a, conf); *ex = fetestexcept (FE_ALL_EXCEPT); if (r != FE_TONEAREST) fesetround (FE_TONEAREST); } -static inline void T(call_nofenv) (const struct fun *f, struct T(args) a, - int r, RT(float) * y, int *ex) +static inline void T (call_nofenv) (const struct fun *f, struct T (args) a, + int r, RT (float) * y, int *ex, + const struct conf *conf) { if (r != FE_TONEAREST) fesetround (r); - *y = T(call) (f, a); + *y = T (call) (f, a, conf); *ex = 0; if (r != FE_TONEAREST) fesetround (FE_TONEAREST); } -static inline int T(call_long_fenv) (const struct fun *f, struct T(args) a, - int r, struct RT(ret) * p, - RT(float) ygot, int exgot) +static inline int T (call_long_fenv) (const struct fun *f, struct T (args) a, + int r, struct RT (ret) * p, + RT (float) ygot, int exgot) { if (r != FE_TONEAREST) fesetround (r); @@ -269,6 +275,7 @@ static int T(cmp) (const struct fun *f, struct gen *gen, int r = conf->r; int use_mpfr = conf->mpfr; int fenv = conf->fenv; + for (;;) { struct RT(ret) want; @@ -279,15 +286,15 @@ static int T(cmp) (const struct fun *f, struct gen *gen, RT(float) ygot2; int fail = 0; if (fenv) - T(call_fenv) (f, a, r, &ygot, &exgot); + T (call_fenv) (f, a, r, &ygot, &exgot, conf); else - T(call_nofenv) (f, a, r, &ygot, &exgot); + T (call_nofenv) (f, a, r, &ygot, &exgot, conf); if (f->twice) { secondcall = 1; if (fenv) - T(call_fenv) (f, a, r, &ygot2, &exgot2); + T (call_fenv) (f, a, r, &ygot2, &exgot2, conf); else - T(call_nofenv) (f, a, r, &ygot2, &exgot2); + T (call_nofenv) (f, a, r, 
&ygot2, &exgot2, conf); secondcall = 0; if (RT(asuint) (ygot) != RT(asuint) (ygot2)) { diff --git a/math/test/ulp_funcs.h b/math/test/ulp_funcs.h index 84f7927d393548..b58a68ff275bf8 100644 --- a/math/test/ulp_funcs.h +++ b/math/test/ulp_funcs.h @@ -1,40 +1,109 @@ /* * Function entries for ulp. * - * Copyright (c) 2022-2023, Arm Limited. + * Copyright (c) 2022-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* clang-format off */ - F1 (sin) - F1 (cos) F (sincosf_sinf, sincosf_sinf, sincos_sin, sincos_mpfr_sin, 1, 1, f1, 0) F (sincosf_cosf, sincosf_cosf, sincos_cos, sincos_mpfr_cos, 1, 1, f1, 0) - F1 (exp) - F1 (exp2) - F1 (log) - F1 (log2) F2 (pow) - F1 (erf) - D1 (exp) - D1 (exp10) - D1 (exp2) - D1 (log) - D1 (log2) D2 (pow) - D1 (erf) -#ifdef __vpcs - F (_ZGVnN4v_sinf, Z_sinf, sin, mpfr_sin, 1, 1, f1, 1) - F (_ZGVnN4v_cosf, Z_cosf, cos, mpfr_cos, 1, 1, f1, 1) +#if __aarch64__ && __linux__ F (_ZGVnN4v_expf_1u, Z_expf_1u, exp, mpfr_exp, 1, 1, f1, 1) - F (_ZGVnN4v_expf, Z_expf, exp, mpfr_exp, 1, 1, f1, 1) F (_ZGVnN4v_exp2f_1u, Z_exp2f_1u, exp2, mpfr_exp2, 1, 1, f1, 1) - F (_ZGVnN4v_exp2f, Z_exp2f, exp2, mpfr_exp2, 1, 1, f1, 1) - F (_ZGVnN4v_logf, Z_logf, log, mpfr_log, 1, 1, f1, 1) F (_ZGVnN4vv_powf, Z_powf, pow, mpfr_pow, 2, 1, f2, 1) - F (_ZGVnN2v_sin, Z_sin, sinl, mpfr_sin, 1, 0, d1, 1) - F (_ZGVnN2v_cos, Z_cos, cosl, mpfr_cos, 1, 0, d1, 1) - F (_ZGVnN2v_exp, Z_exp, expl, mpfr_exp, 1, 0, d1, 1) - F (_ZGVnN2v_log, Z_log, logl, mpfr_log, 1, 0, d1, 1) F (_ZGVnN2vv_pow, Z_pow, powl, mpfr_pow, 2, 0, d2, 1) + F (_ZGVnN4v_sincosf_sin, v_sincosf_sin, sin, mpfr_sin, 1, 1, f1, 0) + F (_ZGVnN4v_sincosf_cos, v_sincosf_cos, cos, mpfr_cos, 1, 1, f1, 0) + F (_ZGVnN4v_cexpif_sin, v_cexpif_sin, sin, mpfr_sin, 1, 1, f1, 0) + F (_ZGVnN4v_cexpif_cos, v_cexpif_cos, cos, mpfr_cos, 1, 1, f1, 0) + F (_ZGVnN4vl4_modff_frac, v_modff_frac, modf_frac, modf_mpfr_frac, 1, 1, f1, 0) + F (_ZGVnN4vl4_modff_int, v_modff_int, modf_int, modf_mpfr_int, 1, 1, 
f1, 0) + F (_ZGVnN2v_sincos_sin, v_sincos_sin, sinl, mpfr_sin, 1, 0, d1, 0) + F (_ZGVnN2v_sincos_cos, v_sincos_cos, cosl, mpfr_cos, 1, 0, d1, 0) + F (_ZGVnN2v_cexpi_sin, v_cexpi_sin, sinl, mpfr_sin, 1, 0, d1, 0) + F (_ZGVnN2v_cexpi_cos, v_cexpi_cos, cosl, mpfr_cos, 1, 0, d1, 0) + F (_ZGVnN2vl8_modf_frac, v_modf_frac, modfl_frac, modf_mpfr_frac, 1, 0, d1, 0) + F (_ZGVnN2vl8_modf_int, v_modf_int, modfl_int, modf_mpfr_int, 1, 0, d1, 0) #endif -/* clang-format on */ + +#if WANT_SVE_TESTS +SVF (_ZGVsMxv_sincosf_sin, sv_sincosf_sin, sin, mpfr_sin, 1, 1, f1, 0) +SVF (_ZGVsMxv_sincosf_cos, sv_sincosf_cos, cos, mpfr_cos, 1, 1, f1, 0) +SVF (_ZGVsMxv_cexpif_sin, sv_cexpif_sin, sin, mpfr_sin, 1, 1, f1, 0) +SVF (_ZGVsMxv_cexpif_cos, sv_cexpif_cos, cos, mpfr_cos, 1, 1, f1, 0) +SVF (_ZGVsMxvl4_modff_frac, sv_modff_frac, modf_frac, modf_mpfr_frac, 1, 1, f1, 0) +SVF (_ZGVsMxvl4_modff_int, sv_modff_int, modf_int, modf_mpfr_int, 1, 1, f1, 0) +SVF (_ZGVsMxv_sincos_sin, sv_sincos_sin, sinl, mpfr_sin, 1, 0, d1, 0) +SVF (_ZGVsMxv_sincos_cos, sv_sincos_cos, cosl, mpfr_cos, 1, 0, d1, 0) +SVF (_ZGVsMxv_cexpi_sin, sv_cexpi_sin, sinl, mpfr_sin, 1, 0, d1, 0) +SVF (_ZGVsMxv_cexpi_cos, sv_cexpi_cos, cosl, mpfr_cos, 1, 0, d1, 0) +SVF (_ZGVsMxvl8_modf_frac, sv_modf_frac, modfl_frac, modf_mpfr_frac, 1, 0, d1, 0) +SVF (_ZGVsMxvl8_modf_int, sv_modf_int, modfl_int, modf_mpfr_int, 1, 0, d1, 0) +#endif + +#if WANT_EXPERIMENTAL_MATH + F (arm_math_erff, arm_math_erff, erf, mpfr_erf, 1, 1, f1, 0) + F (arm_math_erf, arm_math_erf, erfl, mpfr_erf, 1, 0, d1, 0) +#endif + +#if WANT_TRIGPI_TESTS + F (arm_math_cospif, arm_math_cospif, arm_math_cospi, mpfr_cospi, 1, 1, f1, 0) + F (arm_math_cospi, arm_math_cospi, arm_math_cospil, mpfr_cospi, 1, 0, d1, 0) + F (arm_math_sinpif, arm_math_sinpif, arm_math_sinpi, mpfr_sinpi, 1, 1, f1, 0) + F (arm_math_sinpi, arm_math_sinpi, arm_math_sinpil, mpfr_sinpi, 1, 0, d1, 0) + F (arm_math_tanpif, arm_math_tanpif, arm_math_tanpi, mpfr_tanpi, 1, 1, f1, 0) + F (arm_math_tanpi, 
arm_math_tanpi, arm_math_tanpil, mpfr_tanpi, 1, 0, d1, 0) + F (arm_math_sincospif_sin, arm_math_sincospif_sin, arm_math_sinpi, mpfr_sinpi, 1, 1, f1, 0) + F (arm_math_sincospif_cos, arm_math_sincospif_cos, arm_math_cospi, mpfr_cospi, 1, 1, f1, 0) + F (arm_math_sincospi_sin, arm_math_sincospi_sin, arm_math_sinpil, mpfr_sinpi, 1, 0, d1, 0) + F (arm_math_sincospi_cos, arm_math_sincospi_cos, arm_math_cospil, mpfr_cospi, 1, 0, d1, 0) +# if __aarch64__ && __linux__ + F (_ZGVnN4v_cospif, Z_cospif, arm_math_cospi, mpfr_cospi, 1, 1, f1, 0) + F (_ZGVnN2v_cospi, Z_cospi, arm_math_cospil, mpfr_cospi, 1, 0, d1, 0) + F (_ZGVnN4v_sinpif, Z_sinpif, arm_math_sinpi, mpfr_sinpi, 1, 1, f1, 0) + F (_ZGVnN2v_sinpi, Z_sinpi, arm_math_sinpil, mpfr_sinpi, 1, 0, d1, 0) + F (_ZGVnN4v_tanpif, Z_tanpif, arm_math_tanpi, mpfr_tanpi, 1, 1, f1, 0) + F (_ZGVnN2v_tanpi, Z_tanpi, arm_math_tanpil, mpfr_tanpi, 1, 0, d1, 0) + F (_ZGVnN4v_sincospif_sin, v_sincospif_sin, arm_math_sinpi, mpfr_sinpi, 1, 1, f1, 0) + F (_ZGVnN4v_sincospif_cos, v_sincospif_cos, arm_math_cospi, mpfr_cospi, 1, 1, f1, 0) + F (_ZGVnN2v_sincospi_sin, v_sincospi_sin, arm_math_sinpil, mpfr_sinpi, 1, 0, d1, 0) + F (_ZGVnN2v_sincospi_cos, v_sincospi_cos, arm_math_cospil, mpfr_cospi, 1, 0, d1, 0) +# endif +# if WANT_SVE_TESTS + SVF (_ZGVsMxv_cospif, Z_sv_cospif, arm_math_cospi, mpfr_cospi, 1, 1, f1, 0) + SVF (_ZGVsMxv_cospi, Z_sv_cospi, arm_math_cospil, mpfr_cospi, 1, 0, d1, 0) + SVF (_ZGVsMxv_sinpif, Z_sv_sinpif, arm_math_sinpi, mpfr_sinpi, 1, 1, f1, 0) + SVF (_ZGVsMxv_sinpi, Z_sv_sinpi, arm_math_sinpil, mpfr_sinpi, 1, 0, d1, 0) + SVF (_ZGVsMxv_tanpif, Z_sv_tanpif, arm_math_tanpi, mpfr_tanpi, 1, 1, f1, 0) + SVF (_ZGVsMxv_tanpi, Z_sv_tanpi, arm_math_tanpil, mpfr_tanpi, 1, 0, d1, 0) + SVF (_ZGVsMxvl4l4_sincospif_sin, sv_sincospif_sin, arm_math_sinpi, mpfr_sinpi, 1, 1, f1, 0) + SVF (_ZGVsMxvl4l4_sincospif_cos, sv_sincospif_cos, arm_math_cospi, mpfr_cospi, 1, 1, f1, 0) + SVF (_ZGVsMxvl8l8_sincospi_sin, sv_sincospi_sin, arm_math_sinpil, 
mpfr_sinpi, 1, 0, d1, 0) + SVF (_ZGVsMxvl8l8_sincospi_cos, sv_sincospi_cos, arm_math_cospil, mpfr_cospi, 1, 0, d1, 0) +# if WANT_EXPERIMENTAL_MATH +SVF (_ZGVsMxvv_powk, Z_sv_powk, ref_powi, mpfr_powi, 2, 0, d2, 0) +SVF (_ZGVsMxvv_powi, Z_sv_powi, ref_powif, mpfr_powi, 2, 1, f2, 0) +# endif +# endif +#endif + + /* clang-format on */ + +#define _ZSF1(f) F1 (f) +#define _ZSF2(f) F2 (f) +#define _ZSD1(f) D1 (f) +#define _ZSD2(f) D2 (f) + +#define _ZVF1(f) ZVNF1 (f) +#define _ZVD1(f) ZVND1 (f) +#define _ZVF2(f) ZVNF2 (f) +#define _ZVD2(f) ZVND2 (f) + +#define _ZSVF1(f) ZSVF1 (f) +#define _ZSVF2(f) ZSVF2 (f) +#define _ZSVD1(f) ZSVD1 (f) +#define _ZSVD2(f) ZSVD2 (f) + +#include "test/ulp_funcs_gen.h" diff --git a/math/test/ulp_wrappers.h b/math/test/ulp_wrappers.h index 60dc3d6dd65287..33e1e75f23ab34 100644 --- a/math/test/ulp_wrappers.h +++ b/math/test/ulp_wrappers.h @@ -1,12 +1,18 @@ /* * Function wrappers for ulp. * - * Copyright (c) 2022-2023, Arm Limited. + * Copyright (c) 2022-2024, Arm Limited. * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* clang-format off */ +#if __aarch64__ && __linux__ +#include +#endif + +#include + /* Wrappers for sincos. 
*/ static float sincosf_sinf(float x) {(void)cosf(x); return sinf(x);} static float sincosf_cosf(float x) {(void)sinf(x); return cosf(x);} @@ -15,23 +21,409 @@ static double sincos_cos(double x) {(void)sin(x); return cos(x);} #if USE_MPFR static int sincos_mpfr_sin(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_cos(y,x,r); return mpfr_sin(y,x,r); } static int sincos_mpfr_cos(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_sin(y,x,r); return mpfr_cos(y,x,r); } +static int modf_mpfr_frac(mpfr_t f, const mpfr_t x, mpfr_rnd_t r) { MPFR_DECL_INIT(i, 80); return mpfr_modf(i,f,x,r); } +static int modf_mpfr_int(mpfr_t i, const mpfr_t x, mpfr_rnd_t r) { MPFR_DECL_INIT(f, 80); return mpfr_modf(i,f,x,r); } +# if MPFR_VERSION < MPFR_VERSION_NUM(4, 2, 0) +static int mpfr_tanpi (mpfr_t ret, const mpfr_t arg, mpfr_rnd_t rnd) { + MPFR_DECL_INIT (frd, 1080); + mpfr_const_pi (frd, GMP_RNDN); + mpfr_mul (frd, frd, arg, GMP_RNDN); + return mpfr_tan (ret, frd, GMP_RNDN); +} +static int mpfr_sinpi (mpfr_t ret, const mpfr_t arg, mpfr_rnd_t rnd) { + MPFR_DECL_INIT (frd, 1080); + mpfr_const_pi (frd, GMP_RNDN); + mpfr_mul (frd, frd, arg, GMP_RNDN); + return mpfr_sin (ret, frd, GMP_RNDN); +} + +static int mpfr_cospi (mpfr_t ret, const mpfr_t arg, mpfr_rnd_t rnd) { + MPFR_DECL_INIT (frd, 1080); + mpfr_const_pi (frd, GMP_RNDN); + mpfr_mul (frd, frd, arg, GMP_RNDN); + return mpfr_cos (ret, frd, GMP_RNDN); +} +# endif +# if WANT_EXPERIMENTAL_MATH +static int wrap_mpfr_powi(mpfr_t ret, const mpfr_t x, const mpfr_t y, mpfr_rnd_t rnd) { + mpfr_t y2; + mpfr_init(y2); + mpfr_trunc(y2, y); + return mpfr_pow(ret, x, y2, rnd); +} +# endif #endif +float modff_frac(float x) { float i; return modff(x, &i); } +float modff_int(float x) { float i; modff(x, &i); return i; } +double modf_frac(double x) { double i; return modf(x, &i); } +double modf_int(double x) { double i; modf(x, &i); return i; } +long double modfl_frac(long double x) { long double i; return modfl(x, &i); } +long double modfl_int(long double 
x) { long double i; modfl(x, &i); return i; } + /* Wrappers for vector functions. */ -#ifdef __vpcs -static float Z_sinf(float x) { return _ZGVnN4v_sinf(argf(x))[0]; } -static float Z_cosf(float x) { return _ZGVnN4v_cosf(argf(x))[0]; } +#if __aarch64__ && __linux__ static float Z_expf_1u(float x) { return _ZGVnN4v_expf_1u(argf(x))[0]; } -static float Z_expf(float x) { return _ZGVnN4v_expf(argf(x))[0]; } static float Z_exp2f_1u(float x) { return _ZGVnN4v_exp2f_1u(argf(x))[0]; } -static float Z_exp2f(float x) { return _ZGVnN4v_exp2f(argf(x))[0]; } -static float Z_logf(float x) { return _ZGVnN4v_logf(argf(x))[0]; } -static float Z_powf(float x, float y) { return _ZGVnN4vv_powf(argf(x),argf(y))[0]; } -static double Z_sin(double x) { return _ZGVnN2v_sin(argd(x))[0]; } -static double Z_cos(double x) { return _ZGVnN2v_cos(argd(x))[0]; } -static double Z_exp(double x) { return _ZGVnN2v_exp(argd(x))[0]; } -static double Z_log(double x) { return _ZGVnN2v_log(argd(x))[0]; } -static double Z_pow(double x, double y) { return _ZGVnN2vv_pow(argd(x),argd(y))[0]; } #endif /* clang-format on */ + +/* No wrappers for scalar routines, but TEST_SIG will emit them. 
*/ +#define ZSNF1_WRAP(func) +#define ZSNF2_WRAP(func) +#define ZSND1_WRAP(func) +#define ZSND2_WRAP(func) + +#define ZVNF1_WRAP(func) \ + static float Z_##func##f (float x) \ + { \ + return _ZGVnN4v_##func##f (argf (x))[0]; \ + } +#define ZVNF2_WRAP(func) \ + static float Z_##func##f (float x, float y) \ + { \ + return _ZGVnN4vv_##func##f (argf (x), argf (y))[0]; \ + } +#define ZVND1_WRAP(func) \ + static double Z_##func (double x) { return _ZGVnN2v_##func (argd (x))[0]; } +#define ZVND2_WRAP(func) \ + static double Z_##func (double x, double y) \ + { \ + return _ZGVnN2vv_##func (argd (x), argd (y))[0]; \ + } + +#if WANT_TRIGPI_TESTS +float +arm_math_sincospif_sin (float x) +{ + float s, c; + arm_math_sincospif (x, &s, &c); + return s; +} +float +arm_math_sincospif_cos (float x) +{ + float s, c; + arm_math_sincospif (x, &s, &c); + return c; +} +double +arm_math_sincospi_sin (double x) +{ + double s, c; + arm_math_sincospi (x, &s, &c); + return s; +} +double +arm_math_sincospi_cos (double x) +{ + double s, c; + arm_math_sincospi (x, &s, &c); + return c; +} +#endif + +#if __aarch64__ && __linux__ + +# if WANT_TRIGPI_TESTS +ZVNF1_WRAP (cospi) +ZVND1_WRAP (cospi) +ZVNF1_WRAP (sinpi) +ZVND1_WRAP (sinpi) +ZVNF1_WRAP (tanpi) +ZVND1_WRAP (tanpi) + +double +v_sincospi_sin (double x) +{ + double s[2], c[2]; + _ZGVnN2vl8l8_sincospi (vdupq_n_f64 (x), s, c); + return s[0]; +} +double +v_sincospi_cos (double x) +{ + double s[2], c[2]; + _ZGVnN2vl8l8_sincospi (vdupq_n_f64 (x), s, c); + return c[0]; +} +float +v_sincospif_sin (float x) +{ + float s[4], c[4]; + _ZGVnN4vl4l4_sincospif (vdupq_n_f32 (x), s, c); + return s[0]; +} +float +v_sincospif_cos (float x) +{ + float s[4], c[4]; + _ZGVnN4vl4l4_sincospif (vdupq_n_f32 (x), s, c); + return c[0]; +} +# endif // WANT_TRIGPI_TESTS + +float +v_sincosf_sin (float x) +{ + float s[4], c[4]; + _ZGVnN4vl4l4_sincosf (vdupq_n_f32 (x), s, c); + return s[0]; +} +float +v_sincosf_cos (float x) +{ + float s[4], c[4]; + _ZGVnN4vl4l4_sincosf 
(vdupq_n_f32 (x), s, c); + return c[0]; +} +float +v_cexpif_sin (float x) +{ + return _ZGVnN4v_cexpif (vdupq_n_f32 (x)).val[0][0]; +} +float +v_cexpif_cos (float x) +{ + return _ZGVnN4v_cexpif (vdupq_n_f32 (x)).val[1][0]; +} +float +v_modff_frac (float x) +{ + float y[4]; + return _ZGVnN4vl4_modff (vdupq_n_f32 (x), y)[0]; +} +float +v_modff_int (float x) +{ + float y[4]; + _ZGVnN4vl4_modff (vdupq_n_f32 (x), y); + return y[0]; +} +double +v_sincos_sin (double x) +{ + double s[2], c[2]; + _ZGVnN2vl8l8_sincos (vdupq_n_f64 (x), s, c); + return s[0]; +} +double +v_sincos_cos (double x) +{ + double s[2], c[2]; + _ZGVnN2vl8l8_sincos (vdupq_n_f64 (x), s, c); + return c[0]; +} +double +v_cexpi_sin (double x) +{ + return _ZGVnN2v_cexpi (vdupq_n_f64 (x)).val[0][0]; +} +double +v_cexpi_cos (double x) +{ + return _ZGVnN2v_cexpi (vdupq_n_f64 (x)).val[1][0]; +} +double +v_modf_frac (double x) +{ + double y[2]; + return _ZGVnN2vl8_modf (vdupq_n_f64 (x), y)[0]; +} +double +v_modf_int (double x) +{ + double y[2]; + _ZGVnN2vl8_modf (vdupq_n_f64 (x), y); + return y[0]; +} +#endif // __aarch64__ && __linux__ + +#if WANT_SVE_TESTS +# define ZSVNF1_WRAP(func) \ + static float Z_sv_##func##f (svbool_t pg, float x) \ + { \ + return svretf (_ZGVsMxv_##func##f (svargf (x), pg), pg); \ + } +# define ZSVNF2_WRAP(func) \ + static float Z_sv_##func##f (svbool_t pg, float x, float y) \ + { \ + return svretf (_ZGVsMxvv_##func##f (svargf (x), svargf (y), pg), pg); \ + } +# define ZSVND1_WRAP(func) \ + static double Z_sv_##func (svbool_t pg, double x) \ + { \ + return svretd (_ZGVsMxv_##func (svargd (x), pg), pg); \ + } +# define ZSVND2_WRAP(func) \ + static double Z_sv_##func (svbool_t pg, double x, double y) \ + { \ + return svretd (_ZGVsMxvv_##func (svargd (x), svargd (y), pg), pg); \ + } + +# if WANT_TRIGPI_TESTS +ZSVNF1_WRAP (cospi) +ZSVND1_WRAP (cospi) +ZSVNF1_WRAP (sinpi) +ZSVND1_WRAP (sinpi) +ZSVNF1_WRAP (tanpi) +ZSVND1_WRAP (tanpi) +double +sv_sincospi_sin (svbool_t pg, double x) +{ + 
double s[svcntd ()], c[svcntd ()]; + _ZGVsMxvl8l8_sincospi (svdup_f64 (x), s, c, pg); + return svretd (svld1 (pg, s), pg); +} +double +sv_sincospi_cos (svbool_t pg, double x) +{ + double s[svcntd ()], c[svcntd ()]; + _ZGVsMxvl8l8_sincospi (svdup_f64 (x), s, c, pg); + return svretd (svld1 (pg, c), pg); +} +float +sv_sincospif_sin (svbool_t pg, float x) +{ + float s[svcntw ()], c[svcntw ()]; + _ZGVsMxvl4l4_sincospif (svdup_f32 (x), s, c, pg); + return svretf (svld1 (pg, s), pg); +} +float +sv_sincospif_cos (svbool_t pg, float x) +{ + float s[svcntw ()], c[svcntw ()]; + _ZGVsMxvl4l4_sincospif (svdup_f32 (x), s, c, pg); + return svretf (svld1 (pg, c), pg); +} +# endif // WANT_TRIGPI_TESTS + +float +sv_sincosf_sin (svbool_t pg, float x) +{ + float s[svcntw ()], c[svcntw ()]; + _ZGVsMxvl4l4_sincosf (svdup_f32 (x), s, c, pg); + return svretf (svld1 (pg, s), pg); +} +float +sv_sincosf_cos (svbool_t pg, float x) +{ + float s[svcntw ()], c[svcntw ()]; + _ZGVsMxvl4l4_sincosf (svdup_f32 (x), s, c, pg); + return svretf (svld1 (pg, c), pg); +} +float +sv_cexpif_sin (svbool_t pg, float x) +{ + return svretf (svget2 (_ZGVsMxv_cexpif (svdup_f32 (x), pg), 0), pg); +} +float +sv_cexpif_cos (svbool_t pg, float x) +{ + return svretf (svget2 (_ZGVsMxv_cexpif (svdup_f32 (x), pg), 1), pg); +} +float +sv_modff_frac (svbool_t pg, float x) +{ + float i[svcntw ()]; + return svretf (_ZGVsMxvl4_modff (svdup_f32 (x), i, pg), pg); +} +float +sv_modff_int (svbool_t pg, float x) +{ + float i[svcntw ()]; + _ZGVsMxvl4_modff (svdup_f32 (x), i, pg); + return svretf (svld1 (pg, i), pg); +} +double +sv_sincos_sin (svbool_t pg, double x) +{ + double s[svcntd ()], c[svcntd ()]; + _ZGVsMxvl8l8_sincos (svdup_f64 (x), s, c, pg); + return svretd (svld1 (pg, s), pg); +} +double +sv_sincos_cos (svbool_t pg, double x) +{ + double s[svcntd ()], c[svcntd ()]; + _ZGVsMxvl8l8_sincos (svdup_f64 (x), s, c, pg); + return svretd (svld1 (pg, c), pg); +} +double +sv_cexpi_sin (svbool_t pg, double x) +{ + return svretd 
(svget2 (_ZGVsMxv_cexpi (svdup_f64 (x), pg), 0), pg); +} +double +sv_cexpi_cos (svbool_t pg, double x) +{ + return svretd (svget2 (_ZGVsMxv_cexpi (svdup_f64 (x), pg), 1), pg); +} +double +sv_modf_frac (svbool_t pg, double x) +{ + double i[svcntd ()]; + return svretd (_ZGVsMxvl8_modf (svdup_f64 (x), i, pg), pg); +} +double +sv_modf_int (svbool_t pg, double x) +{ + double i[svcntd ()]; + _ZGVsMxvl8_modf (svdup_f64 (x), i, pg); + return svretd (svld1 (pg, i), pg); +} + +# if WANT_EXPERIMENTAL_MATH + +/* Our implementations of powi/powk are too imprecise to verify + against any established pow implementation. Instead we have the + following simple implementation, against which it is enough to + maintain bitwise reproducibility. Note the test framework expects + the reference impl to be of higher precision than the function + under test. For instance this means that the reference for + double-precision powi will be passed a long double, so to check + bitwise reproducibility we have to cast it back down to + double. This is fine since a round-trip to higher precision and + back down is correctly rounded. */ +# define DECL_POW_INT_REF(NAME, DBL_T, FLT_T, INT_T) \ + static DBL_T __attribute__ ((unused)) NAME (DBL_T in_val, DBL_T y) \ + { \ + INT_T n = (INT_T) round (y); \ + FLT_T acc = 1.0; \ + bool want_recip = n < 0; \ + n = n < 0 ? 
-n : n; \ + \ + for (FLT_T c = in_val; n; c *= c, n >>= 1) \ + { \ + if (n & 0x1) \ + { \ + acc *= c; \ + } \ + } \ + if (want_recip) \ + { \ + acc = 1.0 / acc; \ + } \ + return acc; \ + } + +DECL_POW_INT_REF (ref_powif, double, float, int) +DECL_POW_INT_REF (ref_powi, long double, double, int) +static float +Z_sv_powi (svbool_t pg, float x, float y) +{ + return svretf (_ZGVsMxvv_powi (svargf (x), svdup_s32 ((int) round (y)), pg), + pg); +} +static double +Z_sv_powk (svbool_t pg, double x, double y) +{ + return svretd (_ZGVsMxvv_powk (svargd (x), svdup_s64 ((long) round (y)), pg), + pg); +} + +# endif // WANT_EXPERIMENTAL_MATH +#endif // WANT_SVE_TESTS + +#include "test/ulp_wrappers_gen.h" diff --git a/math/tgamma128.c b/math/tgamma128.c index 65deacc49d99f9..d6049207b91f3a 100644 --- a/math/tgamma128.c +++ b/math/tgamma128.c @@ -338,6 +338,8 @@ long double tgamma128(long double x) mult = 2111.484375L+t*(4033.5L+t*(3016.1875L+t*( 1140.0L+t*(231.25L+t*(24.0L+t))))); break; + default: + __builtin_unreachable(); } } diff --git a/pl/math/tools/asin.sollya b/math/tools/asin.sollya similarity index 93% rename from pl/math/tools/asin.sollya rename to math/tools/asin.sollya index 8ef861d0898bd5..02c4a93356c3d2 100644 --- a/pl/math/tools/asin.sollya +++ b/math/tools/asin.sollya @@ -1,6 +1,6 @@ // polynomial for approximating asin(x) // -// Copyright (c) 2023, Arm Limited. +// Copyright (c) 2023-2024, Arm Limited. // SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception f = asin(x); diff --git a/pl/math/tools/asinf.sollya b/math/tools/asinf.sollya similarity index 94% rename from pl/math/tools/asinf.sollya rename to math/tools/asinf.sollya index 5b627e546c73b3..69d1803875d160 100644 --- a/pl/math/tools/asinf.sollya +++ b/math/tools/asinf.sollya @@ -1,6 +1,6 @@ // polynomial for approximating asinf(x) // -// Copyright (c) 2023, Arm Limited. +// Copyright (c) 2023-2024, Arm Limited. 
// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception f = asin(x); diff --git a/pl/math/tools/asinh.sollya b/math/tools/asinh.sollya similarity index 94% rename from pl/math/tools/asinh.sollya rename to math/tools/asinh.sollya index 663ee92f3f3471..eea9b808116859 100644 --- a/pl/math/tools/asinh.sollya +++ b/math/tools/asinh.sollya @@ -1,6 +1,6 @@ // polynomial for approximating asinh(x) // -// Copyright (c) 2022-2023, Arm Limited. +// Copyright (c) 2022-2024, Arm Limited. // SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception // Polynomial is used in [2^-26, 1]. However it is least accurate close to 1, so diff --git a/pl/math/tools/asinhf.sollya b/math/tools/asinhf.sollya similarity index 93% rename from pl/math/tools/asinhf.sollya rename to math/tools/asinhf.sollya index ab115b53b8dc79..5f1580fce88328 100644 --- a/pl/math/tools/asinhf.sollya +++ b/math/tools/asinhf.sollya @@ -1,6 +1,6 @@ // polynomial for approximating asinh(x) // -// Copyright (c) 2022-2023, Arm Limited. +// Copyright (c) 2022-2024, Arm Limited. // SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 9; diff --git a/pl/math/tools/atan.sollya b/math/tools/atan.sollya similarity index 93% rename from pl/math/tools/atan.sollya rename to math/tools/atan.sollya index ad4f33b8516a94..048017d8d269ae 100644 --- a/pl/math/tools/atan.sollya +++ b/math/tools/atan.sollya @@ -1,6 +1,6 @@ // polynomial for approximating atan(x) and atan2(y, x) // -// Copyright (c) 2022-2023, Arm Limited. +// Copyright (c) 2022-2024, Arm Limited. 
// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception // atan is odd, so approximate with an odd polynomial: diff --git a/pl/math/tools/atanf.sollya b/math/tools/atanf.sollya similarity index 92% rename from pl/math/tools/atanf.sollya rename to math/tools/atanf.sollya index ed88d0ba90f937..21c3ba2bfa1d8e 100644 --- a/pl/math/tools/atanf.sollya +++ b/math/tools/atanf.sollya @@ -1,6 +1,6 @@ // polynomial for approximating atanf(x) // -// Copyright (c) 2022-2023, Arm Limited. +// Copyright (c) 2022-2024, Arm Limited. // SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception // Generate list of monomials: diff --git a/pl/math/tools/cbrt.sollya b/math/tools/cbrt.sollya similarity index 90% rename from pl/math/tools/cbrt.sollya rename to math/tools/cbrt.sollya index 1d43dc73d8cdfa..2490a69ac029d0 100644 --- a/pl/math/tools/cbrt.sollya +++ b/math/tools/cbrt.sollya @@ -1,6 +1,6 @@ // polynomial for approximating cbrt(x) in double precision // -// Copyright (c) 2022-2023, Arm Limited. +// Copyright (c) 2022-2024, Arm Limited. // SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 3; diff --git a/pl/math/tools/cbrtf.sollya b/math/tools/cbrtf.sollya similarity index 90% rename from pl/math/tools/cbrtf.sollya rename to math/tools/cbrtf.sollya index 4e0cc69b46a58e..1debf930e7226e 100644 --- a/pl/math/tools/cbrtf.sollya +++ b/math/tools/cbrtf.sollya @@ -1,6 +1,6 @@ // polynomial for approximating cbrt(x) in single precision // -// Copyright (c) 2022-2023, Arm Limited. +// Copyright (c) 2022-2024, Arm Limited. // SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 3; diff --git a/pl/math/tools/erf.sollya b/math/tools/erf.sollya similarity index 92% rename from pl/math/tools/erf.sollya rename to math/tools/erf.sollya index b2fc559b511ef1..060e1686c835ae 100644 --- a/pl/math/tools/erf.sollya +++ b/math/tools/erf.sollya @@ -1,6 +1,6 @@ // tables and constants for approximating erf(x). // -// Copyright (c) 2023, Arm Limited. 
+// Copyright (c) 2023-2024, Arm Limited. // SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception display = hexadecimal; diff --git a/pl/math/tools/erfc.sollya b/math/tools/erfc.sollya similarity index 95% rename from pl/math/tools/erfc.sollya rename to math/tools/erfc.sollya index 1e2791291ebbb8..1b4b0006609392 100644 --- a/pl/math/tools/erfc.sollya +++ b/math/tools/erfc.sollya @@ -1,6 +1,6 @@ // tables and constants for approximating erfc(x). // -// Copyright (c) 2023, Arm Limited. +// Copyright (c) 2023-2024, Arm Limited. // SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception display = hexadecimal; diff --git a/pl/math/tools/erfcf.sollya b/math/tools/erfcf.sollya similarity index 91% rename from pl/math/tools/erfcf.sollya rename to math/tools/erfcf.sollya index 1d7fc264d99d2b..a8e0409f5db56f 100644 --- a/pl/math/tools/erfcf.sollya +++ b/math/tools/erfcf.sollya @@ -1,6 +1,6 @@ // tables and constants for approximating erfcf(x). // -// Copyright (c) 2023, Arm Limited. +// Copyright (c) 2023-2024, Arm Limited. // SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception display = hexadecimal; diff --git a/pl/math/tools/erff.sollya b/math/tools/erff.sollya similarity index 91% rename from pl/math/tools/erff.sollya rename to math/tools/erff.sollya index 59b23ef021f0fb..c0178a2b24adfe 100644 --- a/pl/math/tools/erff.sollya +++ b/math/tools/erff.sollya @@ -1,6 +1,6 @@ // tables and constants for approximating erff(x). // -// Copyright (c) 2023, Arm Limited. +// Copyright (c) 2023-2024, Arm Limited. // SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception display = hexadecimal; diff --git a/pl/math/tools/exp10.sollya b/math/tools/exp10.sollya similarity index 97% rename from pl/math/tools/exp10.sollya rename to math/tools/exp10.sollya index 9f30b401820958..91f92595b96dab 100644 --- a/pl/math/tools/exp10.sollya +++ b/math/tools/exp10.sollya @@ -1,6 +1,6 @@ // polynomial for approximating 10^x // -// Copyright (c) 2023, Arm Limited. 
+// Copyright (c) 2023-2024, Arm Limited. // SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception // exp10f parameters diff --git a/pl/math/tools/expm1.sollya b/math/tools/expm1.sollya similarity index 91% rename from pl/math/tools/expm1.sollya rename to math/tools/expm1.sollya index 7b6f324eb247b8..d87466a066af65 100644 --- a/pl/math/tools/expm1.sollya +++ b/math/tools/expm1.sollya @@ -1,6 +1,6 @@ // polynomial for approximating exp(x)-1 in double precision // -// Copyright (c) 2022-2023, Arm Limited. +// Copyright (c) 2022-2024, Arm Limited. // SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 12; diff --git a/pl/math/tools/expm1f.sollya b/math/tools/expm1f.sollya similarity index 91% rename from pl/math/tools/expm1f.sollya rename to math/tools/expm1f.sollya index efdf1bd301e0ed..bb9496f3f2c426 100644 --- a/pl/math/tools/expm1f.sollya +++ b/math/tools/expm1f.sollya @@ -1,6 +1,6 @@ // polynomial for approximating exp(x)-1 in single precision // -// Copyright (c) 2022-2023, Arm Limited. +// Copyright (c) 2022-2024, Arm Limited. // SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 5; diff --git a/pl/math/tools/log10.sollya b/math/tools/log10.sollya similarity index 96% rename from pl/math/tools/log10.sollya rename to math/tools/log10.sollya index 85d1d15c16985b..78f956b14b9599 100644 --- a/pl/math/tools/log10.sollya +++ b/math/tools/log10.sollya @@ -1,6 +1,6 @@ // polynomial for approximating log10(1+x) // -// Copyright (c) 2019-2023, Arm Limited. +// Copyright (c) 2019-2024, Arm Limited. 
// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 6; // poly degree diff --git a/pl/math/tools/log10f.sollya b/math/tools/log10f.sollya similarity index 96% rename from pl/math/tools/log10f.sollya rename to math/tools/log10f.sollya index 94bf32f2c449b3..c64a30aa8e1841 100644 --- a/pl/math/tools/log10f.sollya +++ b/math/tools/log10f.sollya @@ -1,6 +1,6 @@ // polynomial for approximating log10f(1+x) // -// Copyright (c) 2019-2023, Arm Limited. +// Copyright (c) 2019-2024, Arm Limited. // SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception // Computation of log10f(1+x) will be carried out in double precision diff --git a/pl/math/tools/log1p.sollya b/math/tools/log1p.sollya similarity index 93% rename from pl/math/tools/log1p.sollya rename to math/tools/log1p.sollya index 598a36af03394f..0cf72081fabb5e 100644 --- a/pl/math/tools/log1p.sollya +++ b/math/tools/log1p.sollya @@ -1,6 +1,6 @@ // polynomial for approximating log(1+x) in double precision // -// Copyright (c) 2022-2023, Arm Limited. +// Copyright (c) 2022-2024, Arm Limited. // SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 20; diff --git a/pl/math/tools/log1pf.sollya b/math/tools/log1pf.sollya similarity index 91% rename from pl/math/tools/log1pf.sollya rename to math/tools/log1pf.sollya index cc1db10e4c0c8d..fc542c93711151 100644 --- a/pl/math/tools/log1pf.sollya +++ b/math/tools/log1pf.sollya @@ -1,6 +1,6 @@ // polynomial for approximating log(1+x) in single precision // -// Copyright (c) 2022-2023, Arm Limited. +// Copyright (c) 2022-2024, Arm Limited. 
// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 10; diff --git a/pl/math/tools/sincos.sollya b/math/tools/sincos.sollya similarity index 92% rename from pl/math/tools/sincos.sollya rename to math/tools/sincos.sollya index 7d36266b446b73..600368507f4ee6 100644 --- a/pl/math/tools/sincos.sollya +++ b/math/tools/sincos.sollya @@ -1,9 +1,9 @@ // polynomial for approximating cos(x) // -// Copyright (c) 2023, Arm Limited. +// Copyright (c) 2023-2024, Arm Limited. // SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception -// This script only finds the coeffs for cos - see math/aarch64/v_sin.c for sin coeffs +// This script only finds the coeffs for cos - see math/aarch64/advsimd/sin.c for sin coeffs deg = 14; // polynomial degree a = -pi/4; // interval diff --git a/pl/math/tools/sincosf.sollya b/math/tools/sincosf.sollya similarity index 95% rename from pl/math/tools/sincosf.sollya rename to math/tools/sincosf.sollya index 178ee83ac19607..add874e87a9a18 100644 --- a/pl/math/tools/sincosf.sollya +++ b/math/tools/sincosf.sollya @@ -1,6 +1,6 @@ // polynomial for approximating cos(x) // -// Copyright (c) 2023, Arm Limited. +// Copyright (c) 2023-2024, Arm Limited. // SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception // This script only finds the coeffs for cos - see math/tools/sin.sollya for sin coeffs. diff --git a/pl/math/tools/sinpi.sollya b/math/tools/sinpi.sollya similarity index 95% rename from pl/math/tools/sinpi.sollya rename to math/tools/sinpi.sollya index 62cc87e7697d47..9bc5b1c7fc2a60 100644 --- a/pl/math/tools/sinpi.sollya +++ b/math/tools/sinpi.sollya @@ -1,6 +1,6 @@ // polynomial for approximating sinpi(x) // -// Copyright (c) 2023, Arm Limited. +// Copyright (c) 2023-2024, Arm Limited. 
// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 19; // polynomial degree diff --git a/pl/math/tools/tan.sollya b/math/tools/tan.sollya similarity index 91% rename from pl/math/tools/tan.sollya rename to math/tools/tan.sollya index bb0bb28270e33a..ca8a170bedaa9d 100644 --- a/pl/math/tools/tan.sollya +++ b/math/tools/tan.sollya @@ -1,6 +1,6 @@ // polynomial for approximating double precision tan(x) // -// Copyright (c) 2023, Arm Limited. +// Copyright (c) 2023-2024, Arm Limited. // SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 8; diff --git a/pl/math/tools/tanf.sollya b/math/tools/tanf.sollya similarity index 98% rename from pl/math/tools/tanf.sollya rename to math/tools/tanf.sollya index f4b49b40ae64ea..054d3db4404696 100644 --- a/pl/math/tools/tanf.sollya +++ b/math/tools/tanf.sollya @@ -1,6 +1,6 @@ // polynomial for approximating single precision tan(x) // -// Copyright (c) 2022-2023, Arm Limited. +// Copyright (c) 2022-2024, Arm Limited. // SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception dtype = single; diff --git a/math/tools/tanpi.sollya b/math/tools/tanpi.sollya new file mode 100644 index 00000000000000..8edbc359ab8e45 --- /dev/null +++ b/math/tools/tanpi.sollya @@ -0,0 +1,48 @@ +// polynomial for approximating tanpi/f(x) +// +// Copyright (c) 2024, Arm Limited. 
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +// 0 for tanpi/f [0,0.25], 1 for tanpi/f [0.25,1] +method = 0; +dtype = double; + +if (dtype == single) then { + if (method == 0) then { deg = 5; } + else if (method == 1) then { deg = 3; }; +} else if (dtype == double) then { + if (method == 0) then { deg = 13; } + else if (method == 1) then { deg = 8; }; +}; + +a = 0x1.0p-126; +b = 1/4; + +if (method == 0) then { + g = tan(pi * x); + F = proc(P) { return pi * x + x^3 * P(x^2); }; + f = (g(sqrt(x)) - pi * sqrt(x))/(x^(3/2)); +} else if (method == 1) then { + g = 1/tan(pi * x); + F = proc(P) { return 1/(pi * x) + x * P(x^2); }; + f = (g(sqrt(x)) / sqrt(x)) - 1/(pi * x); +}; + +poly = fpminimax(f, deg, [|dtype ...|], [a*a;b*b]); + +// +// Display coefficients in Sollya +// +display = hexadecimal!; +if (dtype==double) then { prec = 53!; } +else if (dtype==single) then { prec = 23!; }; +print("_coeffs :_ hex"); +for i from 0 to deg do coeff(poly, i); + +// Compute errors +//display = hexadecimal!; +d_rel_err = dirtyinfnorm(1-F(poly)/g(x), [a;b]); +d_abs_err = dirtyinfnorm(g(x)-F(poly), [a;b]); +print("dirty rel error:", d_rel_err); +print("dirty abs error:", d_abs_err); +print("in [",a,b,"]"); diff --git a/pl/math/tools/v_erf.sollya b/math/tools/v_erf.sollya similarity index 91% rename from pl/math/tools/v_erf.sollya rename to math/tools/v_erf.sollya index 394ba377df12b5..5d7795842bcd89 100644 --- a/pl/math/tools/v_erf.sollya +++ b/math/tools/v_erf.sollya @@ -2,7 +2,7 @@ // To generate coefficients for interval i (0 to 47) do: // $ sollya v_erf.sollya $i // -// Copyright (c) 2022-2023, Arm Limited. +// Copyright (c) 2022-2024, Arm Limited. 
// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception scale = 1/8; diff --git a/pl/math/tools/v_erfc.sollya b/math/tools/v_erfc.sollya similarity index 96% rename from pl/math/tools/v_erfc.sollya rename to math/tools/v_erfc.sollya index 3b03ba07863dd4..764b333d6d258d 100644 --- a/pl/math/tools/v_erfc.sollya +++ b/math/tools/v_erfc.sollya @@ -1,6 +1,6 @@ // polynomial for approximating erfc(x)*exp(x*x) // -// Copyright (c) 2022-2023, Arm Limited. +// Copyright (c) 2022-2024, Arm Limited. // SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 12; // poly degree diff --git a/pl/math/tools/v_log10.sollya b/math/tools/v_log10.sollya similarity index 96% rename from pl/math/tools/v_log10.sollya rename to math/tools/v_log10.sollya index e2df4364ada016..5181074f676263 100644 --- a/pl/math/tools/v_log10.sollya +++ b/math/tools/v_log10.sollya @@ -1,6 +1,6 @@ // polynomial used for __v_log10(x) // -// Copyright (c) 2019-2023, Arm Limited. +// Copyright (c) 2019-2024, Arm Limited. // SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 6; // poly degree diff --git a/pl/math/tools/v_log10f.sollya b/math/tools/v_log10f.sollya similarity index 96% rename from pl/math/tools/v_log10f.sollya rename to math/tools/v_log10f.sollya index 396d5a92302bd3..4906cb1d2137aa 100644 --- a/pl/math/tools/v_log10f.sollya +++ b/math/tools/v_log10f.sollya @@ -1,6 +1,6 @@ // polynomial for approximating v_log10f(1+x) // -// Copyright (c) 2019-2023, Arm Limited. +// Copyright (c) 2019-2024, Arm Limited. 
// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 9; // poly degree diff --git a/pl/math/tools/v_log2f.sollya b/math/tools/v_log2f.sollya similarity index 96% rename from pl/math/tools/v_log2f.sollya rename to math/tools/v_log2f.sollya index 99e050c91b0310..337d4830a2aede 100644 --- a/pl/math/tools/v_log2f.sollya +++ b/math/tools/v_log2f.sollya @@ -1,6 +1,6 @@ // polynomial used for __v_log2f(x) // -// Copyright (c) 2022-2023, Arm Limited. +// Copyright (c) 2022-2024, Arm Limited. // SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 9; // poly degree diff --git a/networking/Dir.mk b/networking/Dir.mk index 2589e0a1f91c47..b3ca2ff335e454 100644 --- a/networking/Dir.mk +++ b/networking/Dir.mk @@ -1,6 +1,6 @@ # Makefile fragment - requires GNU make # -# Copyright (c) 2019-2020, Arm Limited. +# Copyright (c) 2019-2025, Arm Limited. # SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception S := $(srcdir)/networking @@ -46,12 +46,12 @@ $(networking-objs): CFLAGS_ALL += $(networking-cflags) build/lib/libnetworking.so: $(networking-lib-objs:%.o=%.os) $(CC) $(CFLAGS_ALL) $(LDFLAGS) -shared -o $@ $^ -build/lib/libnetworkinglib.a: $(networking-lib-objs) +build/lib/libnetworking.a: $(networking-lib-objs) rm -f $@ $(AR) rc $@ $^ $(RANLIB) $@ -build/bin/test/%: $(B)/test/%.o build/lib/libnetworkinglib.a +build/bin/test/%: $(B)/test/%.o build/lib/libnetworking.a $(CC) $(CFLAGS_ALL) $(LDFLAGS) -static -o $@ $^ $(LDLIBS) build/include/%.h: $(S)/include/%.h diff --git a/pl/Dir.mk b/pl/Dir.mk deleted file mode 100644 index 2d007790d24145..00000000000000 --- a/pl/Dir.mk +++ /dev/null @@ -1,21 +0,0 @@ -# Makefile fragment - requires GNU make -# -# Copyright (c) 2022, Arm Limited. -# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - -# These targets are defined if we prescribe pl in SUBS. -# It requires PLSUBS to be set. 
- -$(foreach sub,$(PLSUBS),$(eval include $(srcdir)/pl/$(sub)/Dir.mk)) - -pl-files := $($(PLSUBS:%=pl/%-files)) - -all-pl: $(PLSUBS:%=all-pl/%) - -check-pl: $(PLSUBS:%=check-pl/%) - -install-pl: $(PLSUBS:%=install-pl/%) - -clean-pl: $(PLSUBS:%=clean-pl/%) - -.PHONY: all-pl check-pl install-pl clean-pl diff --git a/pl/math/Dir.mk b/pl/math/Dir.mk deleted file mode 100644 index 94b26cf3309c5d..00000000000000 --- a/pl/math/Dir.mk +++ /dev/null @@ -1,216 +0,0 @@ -# Makefile fragment - requires GNU make -# -# Copyright (c) 2019-2024, Arm Limited. -# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - -PLM := $(srcdir)/pl/math -AOR := $(srcdir)/math -B := build/pl/math - -pl-lib-srcs := $(wildcard $(PLM)/*.[cS]) - -ifeq ($(WANT_SVE_MATH), 0) -pl-lib-srcs := $(filter-out $(PLM)/sv_%, $(pl-lib-srcs)) -endif - -math-test-srcs := \ - $(AOR)/test/mathtest.c \ - $(AOR)/test/mathbench.c \ - $(AOR)/test/ulp.c \ - -math-test-host-srcs := $(wildcard $(AOR)/test/rtest/*.[cS]) - -pl-includes := $(patsubst $(PLM)/%,build/pl/%,$(wildcard $(PLM)/include/*.h)) -pl-test-includes := $(patsubst $(PLM)/%,build/pl/include/%,$(wildcard $(PLM)/test/*.h)) - -pl-libs := \ - build/pl/lib/libmathlib.so \ - build/pl/lib/libmathlib.a \ - -math-tools := \ - build/pl/bin/mathtest \ - build/pl/bin/mathbench \ - build/pl/bin/mathbench_libc \ - build/pl/bin/runulp.sh \ - build/pl/bin/ulp \ - -math-host-tools := \ - build/pl/bin/rtest \ - -pl-lib-objs := $(patsubst $(PLM)/%,$(B)/%.o,$(basename $(pl-lib-srcs))) -math-test-objs := $(patsubst $(AOR)/%,$(B)/%.o,$(basename $(math-test-srcs))) -math-host-objs := $(patsubst $(AOR)/%,$(B)/%.o,$(basename $(math-test-host-srcs))) -pl-target-objs := $(pl-lib-objs) $(math-test-objs) -pl-objs := $(pl-target-objs) $(pl-target-objs:%.o=%.os) $(math-host-objs) - -pl/math-files := \ - $(pl-objs) \ - $(pl-libs) \ - $(math-tools) \ - $(math-host-tools) \ - $(pl-includes) \ - $(pl-test-includes) \ - -all-pl/math: $(pl-libs) $(math-tools) $(pl-includes) 
$(pl-test-includes) - -$(pl-objs): $(pl-includes) $(pl-test-includes) -$(pl-objs): CFLAGS_PL += $(math-cflags) -$(B)/test/mathtest.o: CFLAGS_PL += -fmath-errno -$(math-host-objs): CC = $(HOST_CC) -$(math-host-objs): CFLAGS_PL = $(HOST_CFLAGS) - -$(B)/sv_%: CFLAGS_PL += $(math-sve-cflags) - -build/pl/include/test/ulp_funcs_gen.h: $(pl-lib-srcs) - # Replace PL_SIG - cat $^ | grep PL_SIG | $(CC) -xc - -o - -E "-DPL_SIG(v, t, a, f, ...)=_Z##v##t##a(f)" -P > $@ - -build/pl/include/test/mathbench_funcs_gen.h: $(pl-lib-srcs) - # Replace PL_SIG macros with mathbench func entries - cat $^ | grep PL_SIG | $(CC) -xc - -o - -E "-DPL_SIG(v, t, a, f, ...)=_Z##v##t##a(f, ##__VA_ARGS__)" -P > $@ - -build/pl/include/test/ulp_wrappers_gen.h: $(pl-lib-srcs) - # Replace PL_SIG macros with ULP wrapper declarations - cat $^ | grep PL_SIG | $(CC) -xc - -o - -E "-DPL_SIG(v, t, a, f, ...)=Z##v##N##t##a##_WRAP(f)" -P > $@ - -$(B)/test/ulp.o: $(AOR)/test/ulp.h build/pl/include/test/ulp_funcs_gen.h build/pl/include/test/ulp_wrappers_gen.h -$(B)/test/ulp.o: CFLAGS_PL += -I build/pl/include/test - -$(B)/test/mathbench.o: build/pl/include/test/mathbench_funcs_gen.h -$(B)/test/mathbench.o: CFLAGS_PL += -I build/pl/include/test - -build/pl/lib/libmathlib.so: $(pl-lib-objs:%.o=%.os) - $(CC) $(CFLAGS_PL) $(LDFLAGS) -shared -o $@ $^ - -build/pl/lib/libmathlib.a: $(pl-lib-objs) - rm -f $@ - $(AR) rc $@ $^ - $(RANLIB) $@ - -$(math-host-tools): HOST_LDLIBS += -lm -lmpfr -lmpc -$(math-tools): LDLIBS += $(math-ldlibs) -lm -# math-sve-cflags should be empty if WANT_SVE_MATH is not enabled -$(math-tools): CFLAGS_PL += $(math-sve-cflags) - -# Some targets to build pl/math/test from math/test sources -build/pl/math/test/%.o: $(srcdir)/math/test/%.S - $(CC) $(CFLAGS_PL) -c -o $@ $< - -build/pl/math/test/%.o: $(srcdir)/math/test/%.c - $(CC) $(CFLAGS_PL) -c -o $@ $< - -build/pl/math/test/%.os: $(srcdir)/math/test/%.S - $(CC) $(CFLAGS_PL) -c -o $@ $< - -build/pl/math/test/%.os: $(srcdir)/math/test/%.c - $(CC) 
$(CFLAGS_PL) -c -o $@ $< - -# Some targets to build pl/ sources using appropriate flags -build/pl/%.o: $(srcdir)/pl/%.S - $(CC) $(CFLAGS_PL) -c -o $@ $< - -build/pl/%.o: $(srcdir)/pl/%.c - $(CC) $(CFLAGS_PL) -c -o $@ $< - -build/pl/%.os: $(srcdir)/pl/%.S - $(CC) $(CFLAGS_PL) -c -o $@ $< - -build/pl/%.os: $(srcdir)/pl/%.c - $(CC) $(CFLAGS_PL) -c -o $@ $< - -build/pl/bin/rtest: $(math-host-objs) - $(HOST_CC) $(HOST_CFLAGS) $(HOST_LDFLAGS) -o $@ $^ $(HOST_LDLIBS) - -build/pl/bin/mathtest: $(B)/test/mathtest.o build/pl/lib/libmathlib.a - $(CC) $(CFLAGS_PL) $(LDFLAGS) -static -o $@ $^ $(LDLIBS) - -build/pl/bin/mathbench: $(B)/test/mathbench.o build/pl/lib/libmathlib.a - $(CC) $(CFLAGS_PL) $(LDFLAGS) -static -o $@ $^ $(LDLIBS) - -# This is not ideal, but allows custom symbols in mathbench to get resolved. -build/pl/bin/mathbench_libc: $(B)/test/mathbench.o build/pl/lib/libmathlib.a - $(CC) $(CFLAGS_PL) $(LDFLAGS) -static -o $@ $< $(LDLIBS) -lc build/pl/lib/libmathlib.a -lm - -build/pl/bin/ulp: $(B)/test/ulp.o build/pl/lib/libmathlib.a - $(CC) $(CFLAGS_PL) $(LDFLAGS) -static -o $@ $^ $(LDLIBS) - -build/pl/include/%.h: $(PLM)/include/%.h - cp $< $@ - -build/pl/include/test/%.h: $(PLM)/test/%.h - cp $< $@ - -build/pl/bin/%.sh: $(PLM)/test/%.sh - cp $< $@ - -pl-math-tests := $(wildcard $(PLM)/test/testcases/directed/*.tst) -pl-math-rtests := $(wildcard $(PLM)/test/testcases/random/*.tst) - -check-pl/math-test: $(math-tools) - cat $(pl-math-tests) | $(EMULATOR) build/pl/bin/mathtest $(math-testflags) - -check-pl/math-rtest: $(math-host-tools) $(math-tools) - cat $(pl-math-rtests) | build/pl/bin/rtest | $(EMULATOR) build/pl/bin/mathtest $(math-testflags) - -ulp-input-dir=$(B)/test/inputs - -math-lib-lims = $(patsubst $(PLM)/%,$(ulp-input-dir)/%.ulp,$(basename $(pl-lib-srcs))) -math-lib-fenvs = $(patsubst $(PLM)/%,$(ulp-input-dir)/%.fenv,$(basename $(pl-lib-srcs))) -math-lib-itvs = $(patsubst $(PLM)/%,$(ulp-input-dir)/%.itv,$(basename $(pl-lib-srcs))) - -ulp-inputs = 
$(math-lib-lims) $(math-lib-fenvs) $(math-lib-itvs) - -$(ulp-inputs): CFLAGS_PL += -I$(PLM) -I$(PLM)/include $(math-cflags) - -$(ulp-input-dir)/%.ulp: $(PLM)/%.c - mkdir -p $(@D) - $(CC) -I$(PLM)/test $(CFLAGS_PL) $< -o - -E | { grep -o "PL_TEST_ULP [^ ]* [^ ]*" || true; } > $@ - -$(ulp-input-dir)/%.fenv: $(PLM)/%.c - mkdir -p $(@D) - $(CC) -I$(PLM)/test $(CFLAGS_PL) $< -o - -E | { grep -o "PL_TEST_EXPECT_FENV_ENABLED [^ ]*" || true; } > $@ - -$(ulp-input-dir)/%.itv: $(PLM)/%.c - mkdir -p $(dir $@) - $(CC) -I$(PLM)/test $(CFLAGS_PL) $< -o - -E | { grep "PL_TEST_INTERVAL " || true; } | sed "s/ PL_TEST_INTERVAL/\nPL_TEST_INTERVAL/g" > $@ - -ulp-lims := $(ulp-input-dir)/limits -$(ulp-lims): $(math-lib-lims) - cat $^ | sed "s/PL_TEST_ULP //g;s/^ *//g" > $@ - -fenv-exps := $(ulp-input-dir)/fenv -$(fenv-exps): $(math-lib-fenvs) - cat $^ | sed "s/PL_TEST_EXPECT_FENV_ENABLED //g;s/^ *//g" > $@ - -ulp-itvs := $(ulp-input-dir)/intervals -$(ulp-itvs): $(math-lib-itvs) - cat $^ | sort -u | sed "s/PL_TEST_INTERVAL //g" > $@ - -check-pl/math-ulp: $(math-tools) $(ulp-lims) $(fenv-exps) $(ulp-itvs) - WANT_SVE_MATH=$(WANT_SVE_MATH) \ - ULPFLAGS="$(math-ulpflags)" \ - LIMITS=../../../$(ulp-lims) \ - INTERVALS=../../../$(ulp-itvs) \ - FENV=../../../$(fenv-exps) \ - FUNC=$(func) \ - build/pl/bin/runulp.sh $(EMULATOR) - -check-pl/math: check-pl/math-test check-pl/math-rtest check-pl/math-ulp - -$(DESTDIR)$(libdir)/pl/%.so: build/pl/lib/%.so - $(INSTALL) -D $< $@ - -$(DESTDIR)$(libdir)/pl/%: build/pl/lib/% - $(INSTALL) -m 644 -D $< $@ - -$(DESTDIR)$(includedir)/pl/%: build/pl/include/% - $(INSTALL) -m 644 -D $< $@ - -install-pl/math: \ - $(pl-libs:build/pl/lib/%=$(DESTDIR)$(libdir)/pl/%) \ - $(pl-includes:build/pl/include/%=$(DESTDIR)$(includedir)/pl/%) - -clean-pl/math: - rm -f $(pl/math-files) - -.PHONY: all-pl/math check-pl/math-test check-pl/math-rtest check-pl/math-ulp check-pl/math install-pl/math clean-pl/math diff --git a/pl/math/asinhf_data.c b/pl/math/asinhf_data.c deleted 
file mode 100644 index cd1ef16b3b6a61..00000000000000 --- a/pl/math/asinhf_data.c +++ /dev/null @@ -1,15 +0,0 @@ -/* - * Coefficients for single-precision asinh(x) function. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "math_config.h" - -/* Approximate asinhf(x) directly in [2^-12, 1]. See for tools/asinhf.sollya for - these coeffs were generated. */ -const struct asinhf_data __asinhf_data - = {.coeffs - = {-0x1.9b16fap-19f, -0x1.552baap-3f, -0x1.4e572ap-11f, 0x1.3a81dcp-4f, - 0x1.65bbaap-10f, -0x1.057f1p-4f, 0x1.6c1d46p-5f, -0x1.4cafe8p-7f}}; diff --git a/pl/math/atan_data.c b/pl/math/atan_data.c deleted file mode 100644 index 91d0f61d2eaf43..00000000000000 --- a/pl/math/atan_data.c +++ /dev/null @@ -1,20 +0,0 @@ -/* - * Double-precision polynomial coefficients for vector atan(x) and atan2(y,x). - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "math_config.h" - -const struct atan_poly_data __atan_poly_data = { - .poly = {/* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on - [2**-1022, 1.0]. See atan.sollya for details of how these were - generated. */ - -0x1.5555555555555p-2, 0x1.99999999996c1p-3, -0x1.2492492478f88p-3, - 0x1.c71c71bc3951cp-4, -0x1.745d160a7e368p-4, 0x1.3b139b6a88ba1p-4, - -0x1.11100ee084227p-4, 0x1.e1d0f9696f63bp-5, -0x1.aebfe7b418581p-5, - 0x1.842dbe9b0d916p-5, -0x1.5d30140ae5e99p-5, 0x1.338e31eb2fbbcp-5, - -0x1.00e6eece7de8p-5, 0x1.860897b29e5efp-6, -0x1.0051381722a59p-6, - 0x1.14e9dc19a4a4ep-7, -0x1.d0062b42fe3bfp-9, 0x1.17739e210171ap-10, - -0x1.ab24da7be7402p-13, 0x1.358851160a528p-16}}; diff --git a/pl/math/atanf_data.c b/pl/math/atanf_data.c deleted file mode 100644 index c4cba2378ceaee..00000000000000 --- a/pl/math/atanf_data.c +++ /dev/null @@ -1,15 +0,0 @@ -/* - * Single-precision polynomial coefficients for vector atan(x) and atan2(y,x). 
- * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "math_config.h" - -/* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on [2**-128, 1.0]. - */ -const struct atanf_poly_data __atanf_poly_data = { - .poly = {/* See atanf.sollya for details of how these were generated. */ - -0x1.55555p-2f, 0x1.99935ep-3f, -0x1.24051ep-3f, 0x1.bd7368p-4f, - -0x1.491f0ep-4f, 0x1.93a2c0p-5f, -0x1.4c3c60p-6f, 0x1.01fd88p-8f}}; diff --git a/pl/math/exp_data.c b/pl/math/exp_data.c deleted file mode 100644 index 2354be76cfab6d..00000000000000 --- a/pl/math/exp_data.c +++ /dev/null @@ -1,1120 +0,0 @@ -/* - * Shared data between exp, exp2 and pow. - * - * Copyright (c) 2018-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "math_config.h" - -#define N (1 << EXP_TABLE_BITS) - -const struct exp_data __exp_data = { -// N/ln2 -.invln2N = 0x1.71547652b82fep0 * N, -// -ln2/N -#if N == 64 -.negln2hiN = -0x1.62e42fefa0000p-7, -.negln2loN = -0x1.cf79abc9e3b3ap-46, -#elif N == 128 -.negln2hiN = -0x1.62e42fefa0000p-8, -.negln2loN = -0x1.cf79abc9e3b3ap-47, -#elif N == 256 -.negln2hiN = -0x1.62e42fefc0000p-9, -.negln2loN = 0x1.c610ca86c3899p-45, -#elif N == 512 -.negln2hiN = -0x1.62e42fef80000p-10, -.negln2loN = -0x1.1cf79abc9e3b4p-45, -#endif -// Used for rounding when !TOINT_INTRINSICS -#if EXP_USE_TOINT_NARROW -.shift = 0x1800000000.8p0, -#else -.shift = 0x1.8p52, -#endif -// exp polynomial coefficients. 
-.poly = { -#if N == 64 && EXP_POLY_ORDER == 5 && !EXP_POLY_WIDE -// abs error: 1.5543*2^-60 -// ulp error: 0.529 (0.533 without fma) -// if |x| < ln2/128+eps -// abs error if |x| < ln2/64: 1.7157*2^-50 -0x1.fffffffffdbcdp-2, -0x1.555555555444cp-3, -0x1.555573c6a9f7dp-5, -0x1.1111266d28935p-7, -#elif N == 64 && EXP_POLY_ORDER == 6 && EXP_POLY_WIDE -// abs error: 1.6735*2^-64 -// ulp error: 0.518 (0.522 without fma) -// if |x| < ln2/64 -0x1.5555555548f9ap-3, -0x1.555555554bf5dp-5, -0x1.11115b75f0f4dp-7, -0x1.6c171a6b6303ep-10, -#elif N == 128 && EXP_POLY_ORDER == 5 && !EXP_POLY_WIDE -// abs error: 1.555*2^-66 -// ulp error: 0.509 (0.511 without fma) -// if |x| < ln2/256+eps -// abs error if |x| < ln2/256+0x1p-15: 1.09*2^-65 -// abs error if |x| < ln2/128: 1.7145*2^-56 -0x1.ffffffffffdbdp-2, -0x1.555555555543cp-3, -0x1.55555cf172b91p-5, -0x1.1111167a4d017p-7, -#elif N == 128 && EXP_POLY_ORDER == 5 && EXP_POLY_WIDE -// abs error: 1.5542*2^-60 -// ulp error: 0.521 (0.523 without fma) -// if |x| < ln2/128 -0x1.fffffffffdbcep-2, -0x1.55555555543c2p-3, -0x1.555573c64f2e3p-5, -0x1.111126b4eff73p-7, -#elif N == 128 && EXP_POLY_ORDER == 6 && EXP_POLY_WIDE -// abs error: 1.6861*2^-71 -// ulp error: 0.509 (0.511 without fma) -// if |x| < ln2/128 -0x1.55555555548fdp-3, -0x1.555555555658fp-5, -0x1.111123a859bb6p-7, -0x1.6c16ba6920cabp-10, -#elif N == 256 && EXP_POLY_ORDER == 4 && !EXP_POLY_WIDE -// abs error: 1.43*2^-58 -// ulp error: 0.549 (0.550 without fma) -// if |x| < ln2/512 -0x1p0, // unused -0x1.fffffffffffd4p-2, -0x1.5555571d6ef9p-3, -0x1.5555576a5adcep-5, -#elif N == 256 && EXP_POLY_ORDER == 5 && EXP_POLY_WIDE -// abs error: 1.5547*2^-66 -// ulp error: 0.505 (0.506 without fma) -// if |x| < ln2/256 -0x1.ffffffffffdbdp-2, -0x1.555555555543cp-3, -0x1.55555cf16e1edp-5, -0x1.1111167a4b553p-7, -#elif N == 512 && EXP_POLY_ORDER == 4 && !EXP_POLY_WIDE -// abs error: 1.4300*2^-63 -// ulp error: 0.504 -// if |x| < ln2/1024 -// abs error if |x| < ln2/512: 1.0689*2^-55 -0x1p0, // 
unused -0x1.ffffffffffffdp-2, -0x1.555555c75bb6p-3, -0x1.555555dec04a8p-5, -#endif -}, -.exp2_shift = 0x1.8p52 / N, -// exp2 polynomial coefficients. -.exp2_poly = { -#if N == 64 && EXP2_POLY_ORDER == 6 && EXP2_POLY_WIDE -// abs error: 1.3054*2^-63 -// ulp error: 0.515 -// if |x| < 1/64 -0x1.62e42fefa39efp-1, -0x1.ebfbdff82c58fp-3, -0x1.c6b08d7045cf1p-5, -0x1.3b2ab6fb8fd0ep-7, -0x1.5d884afec48d7p-10, -0x1.43097dc684ae1p-13, -#elif N == 128 && EXP2_POLY_ORDER == 5 && !EXP2_POLY_WIDE -// abs error: 1.2195*2^-65 -// ulp error: 0.507 (0.511 without fma) -// if |x| < 1/256 -// abs error if |x| < 1/128: 1.9941*2^-56 -0x1.62e42fefa39efp-1, -0x1.ebfbdff82c424p-3, -0x1.c6b08d70cf4b5p-5, -0x1.3b2abd24650ccp-7, -0x1.5d7e09b4e3a84p-10, -#elif N == 256 && EXP2_POLY_ORDER == 5 && EXP2_POLY_WIDE -// abs error: 1.2195*2^-65 -// ulp error: 0.504 (0.508 without fma) -// if |x| < 1/256 -0x1.62e42fefa39efp-1, -0x1.ebfbdff82c424p-3, -0x1.c6b08d70cf4b5p-5, -0x1.3b2abd24650ccp-7, -0x1.5d7e09b4e3a84p-10, -#elif N == 512 && EXP2_POLY_ORDER == 4 && !EXP2_POLY_WIDE -// abs error: 1.4411*2^-64 -// ulp error: 0.5024 (0.5063 without fma) -// if |x| < 1/1024 -// abs error if |x| < 1/512: 1.9430*2^-56 -0x1.62e42fefa39ecp-1, -0x1.ebfbdff82c58bp-3, -0x1.c6b08e46de41fp-5, -0x1.3b2ab786ee1dap-7, -#endif -}, -// 2^(k/N) ~= H[k]*(1 + T[k]) for int k in [0,N) -// tab[2*k] = asuint64(T[k]) -// tab[2*k+1] = asuint64(H[k]) - (k << 52)/N -.tab = { -#if N == 64 -0x0, 0x3ff0000000000000, -0xbc7160139cd8dc5d, 0x3fefec9a3e778061, -0x3c8cd2523567f613, 0x3fefd9b0d3158574, -0x3c60f74e61e6c861, 0x3fefc74518759bc8, -0x3c979aa65d837b6d, 0x3fefb5586cf9890f, -0x3c3ebe3d702f9cd1, 0x3fefa3ec32d3d1a2, -0xbc9556522a2fbd0e, 0x3fef9301d0125b51, -0xbc91c923b9d5f416, 0x3fef829aaea92de0, -0xbc801b15eaa59348, 0x3fef72b83c7d517b, -0x3c8b898c3f1353bf, 0x3fef635beb6fcb75, -0x3c9aecf73e3a2f60, 0x3fef54873168b9aa, -0x3c8a6f4144a6c38d, 0x3fef463b88628cd6, -0x3c968efde3a8a894, 0x3fef387a6e756238, -0x3c80472b981fe7f2, 
0x3fef2b4565e27cdd, -0x3c82f7e16d09ab31, 0x3fef1e9df51fdee1, -0x3c8b3782720c0ab4, 0x3fef1285a6e4030b, -0x3c834d754db0abb6, 0x3fef06fe0a31b715, -0x3c8fdd395dd3f84a, 0x3feefc08b26416ff, -0xbc924aedcc4b5068, 0x3feef1a7373aa9cb, -0xbc71d1e83e9436d2, 0x3feee7db34e59ff7, -0x3c859f48a72a4c6d, 0x3feedea64c123422, -0xbc58a78f4817895b, 0x3feed60a21f72e2a, -0x3c4363ed60c2ac11, 0x3feece086061892d, -0x3c6ecce1daa10379, 0x3feec6a2b5c13cd0, -0x3c7690cebb7aafb0, 0x3feebfdad5362a27, -0xbc8f94340071a38e, 0x3feeb9b2769d2ca7, -0xbc78dec6bd0f385f, 0x3feeb42b569d4f82, -0x3c93350518fdd78e, 0x3feeaf4736b527da, -0x3c9063e1e21c5409, 0x3feeab07dd485429, -0x3c9432e62b64c035, 0x3feea76f15ad2148, -0xbc8c33c53bef4da8, 0x3feea47eb03a5585, -0xbc93cedd78565858, 0x3feea23882552225, -0xbc93b3efbf5e2228, 0x3feea09e667f3bcd, -0xbc6367efb86da9ee, 0x3fee9fb23c651a2f, -0xbc781f647e5a3ecf, 0x3fee9f75e8ec5f74, -0xbc8619321e55e68a, 0x3fee9feb564267c9, -0xbc7b32dcb94da51d, 0x3feea11473eb0187, -0x3c65ebe1abd66c55, 0x3feea2f336cf4e62, -0xbc9369b6f13b3734, 0x3feea589994cce13, -0xbc94d450d872576e, 0x3feea8d99b4492ed, -0x3c8db72fc1f0eab4, 0x3feeace5422aa0db, -0x3c7bf68359f35f44, 0x3feeb1ae99157736, -0xbc5da9b88b6c1e29, 0x3feeb737b0cdc5e5, -0xbc92434322f4f9aa, 0x3feebd829fde4e50, -0x3c71affc2b91ce27, 0x3feec49182a3f090, -0xbc87c50422622263, 0x3feecc667b5de565, -0xbc91bbd1d3bcbb15, 0x3feed503b23e255d, -0x3c8469846e735ab3, 0x3feede6b5579fdbf, -0x3c8c1a7792cb3387, 0x3feee89f995ad3ad, -0xbc55c3d956dcaeba, 0x3feef3a2b84f15fb, -0xbc68d6f438ad9334, 0x3feeff76f2fb5e47, -0x3c74ffd70a5fddcd, 0x3fef0c1e904bc1d2, -0x3c736eae30af0cb3, 0x3fef199bdd85529c, -0x3c84e08fd10959ac, 0x3fef27f12e57d14b, -0x3c676b2c6c921968, 0x3fef3720dcef9069, -0xbc8fad5d3ffffa6f, 0x3fef472d4a07897c, -0x3c74a385a63d07a7, 0x3fef5818dcfba487, -0x3c8e5a50d5c192ac, 0x3fef69e603db3285, -0xbc82d52107b43e1f, 0x3fef7c97337b9b5f, -0x3c74b604603a88d3, 0x3fef902ee78b3ff6, -0xbc8ff7128fd391f0, 0x3fefa4afa2a490da, -0x3c8ec3bc41aa2008, 0x3fefba1bee615a27, 
-0x3c8a64a931d185ee, 0x3fefd0765b6e4540, -0x3c77893b4d91cd9d, 0x3fefe7c1819e90d8, -#elif N == 128 -0x0, 0x3ff0000000000000, -0x3c9b3b4f1a88bf6e, 0x3feff63da9fb3335, -0xbc7160139cd8dc5d, 0x3fefec9a3e778061, -0xbc905e7a108766d1, 0x3fefe315e86e7f85, -0x3c8cd2523567f613, 0x3fefd9b0d3158574, -0xbc8bce8023f98efa, 0x3fefd06b29ddf6de, -0x3c60f74e61e6c861, 0x3fefc74518759bc8, -0x3c90a3e45b33d399, 0x3fefbe3ecac6f383, -0x3c979aa65d837b6d, 0x3fefb5586cf9890f, -0x3c8eb51a92fdeffc, 0x3fefac922b7247f7, -0x3c3ebe3d702f9cd1, 0x3fefa3ec32d3d1a2, -0xbc6a033489906e0b, 0x3fef9b66affed31b, -0xbc9556522a2fbd0e, 0x3fef9301d0125b51, -0xbc5080ef8c4eea55, 0x3fef8abdc06c31cc, -0xbc91c923b9d5f416, 0x3fef829aaea92de0, -0x3c80d3e3e95c55af, 0x3fef7a98c8a58e51, -0xbc801b15eaa59348, 0x3fef72b83c7d517b, -0xbc8f1ff055de323d, 0x3fef6af9388c8dea, -0x3c8b898c3f1353bf, 0x3fef635beb6fcb75, -0xbc96d99c7611eb26, 0x3fef5be084045cd4, -0x3c9aecf73e3a2f60, 0x3fef54873168b9aa, -0xbc8fe782cb86389d, 0x3fef4d5022fcd91d, -0x3c8a6f4144a6c38d, 0x3fef463b88628cd6, -0x3c807a05b0e4047d, 0x3fef3f49917ddc96, -0x3c968efde3a8a894, 0x3fef387a6e756238, -0x3c875e18f274487d, 0x3fef31ce4fb2a63f, -0x3c80472b981fe7f2, 0x3fef2b4565e27cdd, -0xbc96b87b3f71085e, 0x3fef24dfe1f56381, -0x3c82f7e16d09ab31, 0x3fef1e9df51fdee1, -0xbc3d219b1a6fbffa, 0x3fef187fd0dad990, -0x3c8b3782720c0ab4, 0x3fef1285a6e4030b, -0x3c6e149289cecb8f, 0x3fef0cafa93e2f56, -0x3c834d754db0abb6, 0x3fef06fe0a31b715, -0x3c864201e2ac744c, 0x3fef0170fc4cd831, -0x3c8fdd395dd3f84a, 0x3feefc08b26416ff, -0xbc86a3803b8e5b04, 0x3feef6c55f929ff1, -0xbc924aedcc4b5068, 0x3feef1a7373aa9cb, -0xbc9907f81b512d8e, 0x3feeecae6d05d866, -0xbc71d1e83e9436d2, 0x3feee7db34e59ff7, -0xbc991919b3ce1b15, 0x3feee32dc313a8e5, -0x3c859f48a72a4c6d, 0x3feedea64c123422, -0xbc9312607a28698a, 0x3feeda4504ac801c, -0xbc58a78f4817895b, 0x3feed60a21f72e2a, -0xbc7c2c9b67499a1b, 0x3feed1f5d950a897, -0x3c4363ed60c2ac11, 0x3feece086061892d, -0x3c9666093b0664ef, 0x3feeca41ed1d0057, -0x3c6ecce1daa10379, 
0x3feec6a2b5c13cd0, -0x3c93ff8e3f0f1230, 0x3feec32af0d7d3de, -0x3c7690cebb7aafb0, 0x3feebfdad5362a27, -0x3c931dbdeb54e077, 0x3feebcb299fddd0d, -0xbc8f94340071a38e, 0x3feeb9b2769d2ca7, -0xbc87deccdc93a349, 0x3feeb6daa2cf6642, -0xbc78dec6bd0f385f, 0x3feeb42b569d4f82, -0xbc861246ec7b5cf6, 0x3feeb1a4ca5d920f, -0x3c93350518fdd78e, 0x3feeaf4736b527da, -0x3c7b98b72f8a9b05, 0x3feead12d497c7fd, -0x3c9063e1e21c5409, 0x3feeab07dd485429, -0x3c34c7855019c6ea, 0x3feea9268a5946b7, -0x3c9432e62b64c035, 0x3feea76f15ad2148, -0xbc8ce44a6199769f, 0x3feea5e1b976dc09, -0xbc8c33c53bef4da8, 0x3feea47eb03a5585, -0xbc845378892be9ae, 0x3feea34634ccc320, -0xbc93cedd78565858, 0x3feea23882552225, -0x3c5710aa807e1964, 0x3feea155d44ca973, -0xbc93b3efbf5e2228, 0x3feea09e667f3bcd, -0xbc6a12ad8734b982, 0x3feea012750bdabf, -0xbc6367efb86da9ee, 0x3fee9fb23c651a2f, -0xbc80dc3d54e08851, 0x3fee9f7df9519484, -0xbc781f647e5a3ecf, 0x3fee9f75e8ec5f74, -0xbc86ee4ac08b7db0, 0x3fee9f9a48a58174, -0xbc8619321e55e68a, 0x3fee9feb564267c9, -0x3c909ccb5e09d4d3, 0x3feea0694fde5d3f, -0xbc7b32dcb94da51d, 0x3feea11473eb0187, -0x3c94ecfd5467c06b, 0x3feea1ed0130c132, -0x3c65ebe1abd66c55, 0x3feea2f336cf4e62, -0xbc88a1c52fb3cf42, 0x3feea427543e1a12, -0xbc9369b6f13b3734, 0x3feea589994cce13, -0xbc805e843a19ff1e, 0x3feea71a4623c7ad, -0xbc94d450d872576e, 0x3feea8d99b4492ed, -0x3c90ad675b0e8a00, 0x3feeaac7d98a6699, -0x3c8db72fc1f0eab4, 0x3feeace5422aa0db, -0xbc65b6609cc5e7ff, 0x3feeaf3216b5448c, -0x3c7bf68359f35f44, 0x3feeb1ae99157736, -0xbc93091fa71e3d83, 0x3feeb45b0b91ffc6, -0xbc5da9b88b6c1e29, 0x3feeb737b0cdc5e5, -0xbc6c23f97c90b959, 0x3feeba44cbc8520f, -0xbc92434322f4f9aa, 0x3feebd829fde4e50, -0xbc85ca6cd7668e4b, 0x3feec0f170ca07ba, -0x3c71affc2b91ce27, 0x3feec49182a3f090, -0x3c6dd235e10a73bb, 0x3feec86319e32323, -0xbc87c50422622263, 0x3feecc667b5de565, -0x3c8b1c86e3e231d5, 0x3feed09bec4a2d33, -0xbc91bbd1d3bcbb15, 0x3feed503b23e255d, -0x3c90cc319cee31d2, 0x3feed99e1330b358, -0x3c8469846e735ab3, 0x3feede6b5579fdbf, 
-0xbc82dfcd978e9db4, 0x3feee36bbfd3f37a, -0x3c8c1a7792cb3387, 0x3feee89f995ad3ad, -0xbc907b8f4ad1d9fa, 0x3feeee07298db666, -0xbc55c3d956dcaeba, 0x3feef3a2b84f15fb, -0xbc90a40e3da6f640, 0x3feef9728de5593a, -0xbc68d6f438ad9334, 0x3feeff76f2fb5e47, -0xbc91eee26b588a35, 0x3fef05b030a1064a, -0x3c74ffd70a5fddcd, 0x3fef0c1e904bc1d2, -0xbc91bdfbfa9298ac, 0x3fef12c25bd71e09, -0x3c736eae30af0cb3, 0x3fef199bdd85529c, -0x3c8ee3325c9ffd94, 0x3fef20ab5fffd07a, -0x3c84e08fd10959ac, 0x3fef27f12e57d14b, -0x3c63cdaf384e1a67, 0x3fef2f6d9406e7b5, -0x3c676b2c6c921968, 0x3fef3720dcef9069, -0xbc808a1883ccb5d2, 0x3fef3f0b555dc3fa, -0xbc8fad5d3ffffa6f, 0x3fef472d4a07897c, -0xbc900dae3875a949, 0x3fef4f87080d89f2, -0x3c74a385a63d07a7, 0x3fef5818dcfba487, -0xbc82919e2040220f, 0x3fef60e316c98398, -0x3c8e5a50d5c192ac, 0x3fef69e603db3285, -0x3c843a59ac016b4b, 0x3fef7321f301b460, -0xbc82d52107b43e1f, 0x3fef7c97337b9b5f, -0xbc892ab93b470dc9, 0x3fef864614f5a129, -0x3c74b604603a88d3, 0x3fef902ee78b3ff6, -0x3c83c5ec519d7271, 0x3fef9a51fbc74c83, -0xbc8ff7128fd391f0, 0x3fefa4afa2a490da, -0xbc8dae98e223747d, 0x3fefaf482d8e67f1, -0x3c8ec3bc41aa2008, 0x3fefba1bee615a27, -0x3c842b94c3a9eb32, 0x3fefc52b376bba97, -0x3c8a64a931d185ee, 0x3fefd0765b6e4540, -0xbc8e37bae43be3ed, 0x3fefdbfdad9cbe14, -0x3c77893b4d91cd9d, 0x3fefe7c1819e90d8, -0x3c5305c14160cc89, 0x3feff3c22b8f71f1, -#elif N == 256 -0x0, 0x3ff0000000000000, -0xbc84e82fc61851ac, 0x3feffb1afa5abcbf, -0x3c9b3b4f1a88bf6e, 0x3feff63da9fb3335, -0xbc82985dd8521d32, 0x3feff168143b0281, -0xbc7160139cd8dc5d, 0x3fefec9a3e778061, -0x3c651e617061bfbd, 0x3fefe7d42e11bbcc, -0xbc905e7a108766d1, 0x3fefe315e86e7f85, -0x3c845fad437fa426, 0x3fefde5f72f654b1, -0x3c8cd2523567f613, 0x3fefd9b0d3158574, -0xbc954529642b232f, 0x3fefd50a0e3c1f89, -0xbc8bce8023f98efa, 0x3fefd06b29ddf6de, -0x3c8293708ef5c32e, 0x3fefcbd42b72a836, -0x3c60f74e61e6c861, 0x3fefc74518759bc8, -0xbc95b9280905b2a4, 0x3fefc2bdf66607e0, -0x3c90a3e45b33d399, 0x3fefbe3ecac6f383, -0x3c84f31f32c4b7e7, 
0x3fefb9c79b1f3919, -0x3c979aa65d837b6d, 0x3fefb5586cf9890f, -0x3c9407fb30d06420, 0x3fefb0f145e46c85, -0x3c8eb51a92fdeffc, 0x3fefac922b7247f7, -0xbc9a5d04b3b9911b, 0x3fefa83b23395dec, -0x3c3ebe3d702f9cd1, 0x3fefa3ec32d3d1a2, -0xbc937a01f0739546, 0x3fef9fa55fdfa9c5, -0xbc6a033489906e0b, 0x3fef9b66affed31b, -0x3c8b8268b04ef0a5, 0x3fef973028d7233e, -0xbc9556522a2fbd0e, 0x3fef9301d0125b51, -0xbc9ac46e44a2ebcc, 0x3fef8edbab5e2ab6, -0xbc5080ef8c4eea55, 0x3fef8abdc06c31cc, -0xbc65704e90c9f860, 0x3fef86a814f204ab, -0xbc91c923b9d5f416, 0x3fef829aaea92de0, -0xbc897cea57e46280, 0x3fef7e95934f312e, -0x3c80d3e3e95c55af, 0x3fef7a98c8a58e51, -0x3c56f01429e2b9d2, 0x3fef76a45471c3c2, -0xbc801b15eaa59348, 0x3fef72b83c7d517b, -0x3c6e653b2459034b, 0x3fef6ed48695bbc0, -0xbc8f1ff055de323d, 0x3fef6af9388c8dea, -0x3c92cc7ea345b7dc, 0x3fef672658375d2f, -0x3c8b898c3f1353bf, 0x3fef635beb6fcb75, -0x3c957bfb2876ea9e, 0x3fef5f99f8138a1c, -0xbc96d99c7611eb26, 0x3fef5be084045cd4, -0x3c8cdc1873af2155, 0x3fef582f95281c6b, -0x3c9aecf73e3a2f60, 0x3fef54873168b9aa, -0xbc9493684653a131, 0x3fef50e75eb44027, -0xbc8fe782cb86389d, 0x3fef4d5022fcd91d, -0xbc98e2899077520a, 0x3fef49c18438ce4d, -0x3c8a6f4144a6c38d, 0x3fef463b88628cd6, -0x3c9120fcd4f59273, 0x3fef42be3578a819, -0x3c807a05b0e4047d, 0x3fef3f49917ddc96, -0x3c89b788c188c9b8, 0x3fef3bdda27912d1, -0x3c968efde3a8a894, 0x3fef387a6e756238, -0x3c877afbca90ef84, 0x3fef351ffb82140a, -0x3c875e18f274487d, 0x3fef31ce4fb2a63f, -0x3c91512f082876ee, 0x3fef2e85711ece75, -0x3c80472b981fe7f2, 0x3fef2b4565e27cdd, -0x3c9a02f0c7d75ec6, 0x3fef280e341ddf29, -0xbc96b87b3f71085e, 0x3fef24dfe1f56381, -0xbc803297e78260bf, 0x3fef21ba7591bb70, -0x3c82f7e16d09ab31, 0x3fef1e9df51fdee1, -0xbc95b77e5ccd9fbf, 0x3fef1b8a66d10f13, -0xbc3d219b1a6fbffa, 0x3fef187fd0dad990, -0xbc91e75c40b4251e, 0x3fef157e39771b2f, -0x3c8b3782720c0ab4, 0x3fef1285a6e4030b, -0x3c98a911f1f7785a, 0x3fef0f961f641589, -0x3c6e149289cecb8f, 0x3fef0cafa93e2f56, -0xbc61e7c998db7dbb, 0x3fef09d24abd886b, 
-0x3c834d754db0abb6, 0x3fef06fe0a31b715, -0x3c85425c11faadf4, 0x3fef0432edeeb2fd, -0x3c864201e2ac744c, 0x3fef0170fc4cd831, -0xbc979517a03e2847, 0x3feefeb83ba8ea32, -0x3c8fdd395dd3f84a, 0x3feefc08b26416ff, -0xbc800e2a46da4bee, 0x3feef96266e3fa2d, -0xbc86a3803b8e5b04, 0x3feef6c55f929ff1, -0xbc87430803972b34, 0x3feef431a2de883b, -0xbc924aedcc4b5068, 0x3feef1a7373aa9cb, -0xbc954de30ae02d94, 0x3feeef26231e754a, -0xbc9907f81b512d8e, 0x3feeecae6d05d866, -0xbc94f2487e1c03ec, 0x3feeea401b7140ef, -0xbc71d1e83e9436d2, 0x3feee7db34e59ff7, -0x3c914a5432fcb2f4, 0x3feee57fbfec6cf4, -0xbc991919b3ce1b15, 0x3feee32dc313a8e5, -0x3c79c3bba5562a2f, 0x3feee0e544ede173, -0x3c859f48a72a4c6d, 0x3feedea64c123422, -0xbc85a71612e21658, 0x3feedc70df1c5175, -0xbc9312607a28698a, 0x3feeda4504ac801c, -0x3c86421f6f1d24d6, 0x3feed822c367a024, -0xbc58a78f4817895b, 0x3feed60a21f72e2a, -0xbc9348a6815fce65, 0x3feed3fb2709468a, -0xbc7c2c9b67499a1b, 0x3feed1f5d950a897, -0x3c835c43984d9871, 0x3feecffa3f84b9d4, -0x3c4363ed60c2ac11, 0x3feece086061892d, -0xbc632afc8d9473a0, 0x3feecc2042a7d232, -0x3c9666093b0664ef, 0x3feeca41ed1d0057, -0xbc95fc5e44de020e, 0x3feec86d668b3237, -0x3c6ecce1daa10379, 0x3feec6a2b5c13cd0, -0xbc7ea0148327c42f, 0x3feec4e1e192aed2, -0x3c93ff8e3f0f1230, 0x3feec32af0d7d3de, -0xbc7a843ad1a88022, 0x3feec17dea6db7d7, -0x3c7690cebb7aafb0, 0x3feebfdad5362a27, -0x3c892ca3bf144e63, 0x3feebe41b817c114, -0x3c931dbdeb54e077, 0x3feebcb299fddd0d, -0xbc902c99b04aa8b0, 0x3feebb2d81d8abff, -0xbc8f94340071a38e, 0x3feeb9b2769d2ca7, -0x3c73e34f67e67118, 0x3feeb8417f4531ee, -0xbc87deccdc93a349, 0x3feeb6daa2cf6642, -0xbc75a3b1197ba0f0, 0x3feeb57de83f4eef, -0xbc78dec6bd0f385f, 0x3feeb42b569d4f82, -0x3c81bd2888075068, 0x3feeb2e2f4f6ad27, -0xbc861246ec7b5cf6, 0x3feeb1a4ca5d920f, -0xbc896be8ae89ef8f, 0x3feeb070dde910d2, -0x3c93350518fdd78e, 0x3feeaf4736b527da, -0xbc88e6ac90348602, 0x3feeae27dbe2c4cf, -0x3c7b98b72f8a9b05, 0x3feead12d497c7fd, -0xbc91af7f1365c3ac, 0x3feeac0827ff07cc, -0x3c9063e1e21c5409, 
0x3feeab07dd485429, -0xbc943a3540d1898a, 0x3feeaa11fba87a03, -0x3c34c7855019c6ea, 0x3feea9268a5946b7, -0xbc951f58ddaa8090, 0x3feea84590998b93, -0x3c9432e62b64c035, 0x3feea76f15ad2148, -0xbc82e1648e50a17c, 0x3feea6a320dceb71, -0xbc8ce44a6199769f, 0x3feea5e1b976dc09, -0x3c95f30eda98a575, 0x3feea52ae6cdf6f4, -0xbc8c33c53bef4da8, 0x3feea47eb03a5585, -0x3c917ecda8a72159, 0x3feea3dd1d1929fd, -0xbc845378892be9ae, 0x3feea34634ccc320, -0xbc9345f3cee1ae6e, 0x3feea2b9febc8fb7, -0xbc93cedd78565858, 0x3feea23882552225, -0xbc85c33fdf910406, 0x3feea1c1c70833f6, -0x3c5710aa807e1964, 0x3feea155d44ca973, -0x3c81079ab5789604, 0x3feea0f4b19e9538, -0xbc93b3efbf5e2228, 0x3feea09e667f3bcd, -0x3c727df161cd7778, 0x3feea052fa75173e, -0xbc6a12ad8734b982, 0x3feea012750bdabf, -0x3c93f9924a05b767, 0x3fee9fdcddd47645, -0xbc6367efb86da9ee, 0x3fee9fb23c651a2f, -0xbc87557939a8b5ef, 0x3fee9f9298593ae5, -0xbc80dc3d54e08851, 0x3fee9f7df9519484, -0x3c51ed2f56fa9d1a, 0x3fee9f7466f42e87, -0xbc781f647e5a3ecf, 0x3fee9f75e8ec5f74, -0xbc88e67a9006c909, 0x3fee9f8286ead08a, -0xbc86ee4ac08b7db0, 0x3fee9f9a48a58174, -0x3c86597566977ac8, 0x3fee9fbd35d7cbfd, -0xbc8619321e55e68a, 0x3fee9feb564267c9, -0x3c92c0b7028a5c3a, 0x3feea024b1ab6e09, -0x3c909ccb5e09d4d3, 0x3feea0694fde5d3f, -0x3c8a30faf49cc78c, 0x3feea0b938ac1cf6, -0xbc7b32dcb94da51d, 0x3feea11473eb0187, -0xbc92dad3519d7b5b, 0x3feea17b0976cfdb, -0x3c94ecfd5467c06b, 0x3feea1ed0130c132, -0x3c87d51410fd15c2, 0x3feea26a62ff86f0, -0x3c65ebe1abd66c55, 0x3feea2f336cf4e62, -0xbc760a3629969871, 0x3feea3878491c491, -0xbc88a1c52fb3cf42, 0x3feea427543e1a12, -0x3c8b18c6e3fdef5d, 0x3feea4d2add106d9, -0xbc9369b6f13b3734, 0x3feea589994cce13, -0x3c90ec1ddcb1390a, 0x3feea64c1eb941f7, -0xbc805e843a19ff1e, 0x3feea71a4623c7ad, -0xbc522cea4f3afa1e, 0x3feea7f4179f5b21, -0xbc94d450d872576e, 0x3feea8d99b4492ed, -0x3c7c88549b958471, 0x3feea9cad931a436, -0x3c90ad675b0e8a00, 0x3feeaac7d98a6699, -0x3c931143962f7877, 0x3feeabd0a478580f, -0x3c8db72fc1f0eab4, 0x3feeace5422aa0db, 
-0x3c93e9e96f112479, 0x3feeae05bad61778, -0xbc65b6609cc5e7ff, 0x3feeaf3216b5448c, -0xbc8dac42a4a38df0, 0x3feeb06a5e0866d9, -0x3c7bf68359f35f44, 0x3feeb1ae99157736, -0x3c8b99dd98b1ed84, 0x3feeb2fed0282c8a, -0xbc93091fa71e3d83, 0x3feeb45b0b91ffc6, -0xbc7885ad50cbb750, 0x3feeb5c353aa2fe2, -0xbc5da9b88b6c1e29, 0x3feeb737b0cdc5e5, -0xbc82d5e85f3e0301, 0x3feeb8b82b5f98e5, -0xbc6c23f97c90b959, 0x3feeba44cbc8520f, -0xbc51669428996971, 0x3feebbdd9a7670b3, -0xbc92434322f4f9aa, 0x3feebd829fde4e50, -0x3c71f2b2c1c4c014, 0x3feebf33e47a22a2, -0xbc85ca6cd7668e4b, 0x3feec0f170ca07ba, -0xbc9294f304f166b6, 0x3feec2bb4d53fe0d, -0x3c71affc2b91ce27, 0x3feec49182a3f090, -0xbc8a1e58414c07d3, 0x3feec674194bb8d5, -0x3c6dd235e10a73bb, 0x3feec86319e32323, -0xbc79740b58a20091, 0x3feeca5e8d07f29e, -0xbc87c50422622263, 0x3feecc667b5de565, -0x3c9165830a2b96c2, 0x3feece7aed8eb8bb, -0x3c8b1c86e3e231d5, 0x3feed09bec4a2d33, -0xbc903d5cbe27874b, 0x3feed2c980460ad8, -0xbc91bbd1d3bcbb15, 0x3feed503b23e255d, -0x3c5986178980fce0, 0x3feed74a8af46052, -0x3c90cc319cee31d2, 0x3feed99e1330b358, -0xbc89472975b1f2a5, 0x3feedbfe53c12e59, -0x3c8469846e735ab3, 0x3feede6b5579fdbf, -0x3c7d8157a34b7e7f, 0x3feee0e521356eba, -0xbc82dfcd978e9db4, 0x3feee36bbfd3f37a, -0x3c8c8a4e231ebb7d, 0x3feee5ff3a3c2774, -0x3c8c1a7792cb3387, 0x3feee89f995ad3ad, -0xbc888c8d11a142e5, 0x3feeeb4ce622f2ff, -0xbc907b8f4ad1d9fa, 0x3feeee07298db666, -0x3c889c2ea41433c7, 0x3feef0ce6c9a8952, -0xbc55c3d956dcaeba, 0x3feef3a2b84f15fb, -0xbc7274aedac8ff80, 0x3feef68415b749b1, -0xbc90a40e3da6f640, 0x3feef9728de5593a, -0x3c85c620ce76df06, 0x3feefc6e29f1c52a, -0xbc68d6f438ad9334, 0x3feeff76f2fb5e47, -0xbc8fda52e1b51e41, 0x3fef028cf22749e4, -0xbc91eee26b588a35, 0x3fef05b030a1064a, -0xbc32141a7b3e2cd8, 0x3fef08e0b79a6f1f, -0x3c74ffd70a5fddcd, 0x3fef0c1e904bc1d2, -0xbc302899507554e5, 0x3fef0f69c3f3a207, -0xbc91bdfbfa9298ac, 0x3fef12c25bd71e09, -0xbc80dda2d4c0010c, 0x3fef16286141b33d, -0x3c736eae30af0cb3, 0x3fef199bdd85529c, -0xbc8a007daadf8d68, 
0x3fef1d1cd9fa652c, -0x3c8ee3325c9ffd94, 0x3fef20ab5fffd07a, -0x3c836909391181d3, 0x3fef244778fafb22, -0x3c84e08fd10959ac, 0x3fef27f12e57d14b, -0xbc811cd7dbdf9547, 0x3fef2ba88988c933, -0x3c63cdaf384e1a67, 0x3fef2f6d9406e7b5, -0xbc7ac28b7bef6621, 0x3fef33405751c4db, -0x3c676b2c6c921968, 0x3fef3720dcef9069, -0xbc7030587207b9e1, 0x3fef3b0f2e6d1675, -0xbc808a1883ccb5d2, 0x3fef3f0b555dc3fa, -0xbc8cc734592af7fc, 0x3fef43155b5bab74, -0xbc8fad5d3ffffa6f, 0x3fef472d4a07897c, -0x3c87752a44f587e8, 0x3fef4b532b08c968, -0xbc900dae3875a949, 0x3fef4f87080d89f2, -0x3c85b66fefeef52e, 0x3fef53c8eacaa1d6, -0x3c74a385a63d07a7, 0x3fef5818dcfba487, -0x3c5159d9d908a96e, 0x3fef5c76e862e6d3, -0xbc82919e2040220f, 0x3fef60e316c98398, -0x3c8c254d16117a68, 0x3fef655d71ff6075, -0x3c8e5a50d5c192ac, 0x3fef69e603db3285, -0xbc8d8c329fbd0e03, 0x3fef6e7cd63a8315, -0x3c843a59ac016b4b, 0x3fef7321f301b460, -0xbc8ea6e6fbd5f2a6, 0x3fef77d5641c0658, -0xbc82d52107b43e1f, 0x3fef7c97337b9b5f, -0xbc63e8e3eab2cbb4, 0x3fef81676b197d17, -0xbc892ab93b470dc9, 0x3fef864614f5a129, -0xbc8b7966cd0d2cd9, 0x3fef8b333b16ee12, -0x3c74b604603a88d3, 0x3fef902ee78b3ff6, -0xbc776caa4c2ff1cf, 0x3fef953924676d76, -0x3c83c5ec519d7271, 0x3fef9a51fbc74c83, -0xbc81d5fc525d9940, 0x3fef9f7977cdb740, -0xbc8ff7128fd391f0, 0x3fefa4afa2a490da, -0x3c855cd8aaea3d21, 0x3fefa9f4867cca6e, -0xbc8dae98e223747d, 0x3fefaf482d8e67f1, -0x3c8269947c2bed4a, 0x3fefb4aaa2188510, -0x3c8ec3bc41aa2008, 0x3fefba1bee615a27, -0xbc83b6137e9afe9e, 0x3fefbf9c1cb6412a, -0x3c842b94c3a9eb32, 0x3fefc52b376bba97, -0xbc69fa74878ba7c7, 0x3fefcac948dd7274, -0x3c8a64a931d185ee, 0x3fefd0765b6e4540, -0x3c901f3a75ee0efe, 0x3fefd632798844f8, -0xbc8e37bae43be3ed, 0x3fefdbfdad9cbe14, -0xbc516a9ce6ed84fa, 0x3fefe1d802243c89, -0x3c77893b4d91cd9d, 0x3fefe7c1819e90d8, -0xbc699c7db2effc76, 0x3fefedba3692d514, -0x3c5305c14160cc89, 0x3feff3c22b8f71f1, -0x3c64b458677f9840, 0x3feff9d96b2a23d9, -#elif N == 512 -0x0, 0x3ff0000000000000, -0xbc75d87ade1f60d5, 0x3feffd8c86da1c0a, 
-0xbc84e82fc61851ac, 0x3feffb1afa5abcbf, -0x3c9bffdaa7ac4bac, 0x3feff8ab5b2cbd11, -0x3c9b3b4f1a88bf6e, 0x3feff63da9fb3335, -0x3c75c18e5ae0563a, 0x3feff3d1e77170b4, -0xbc82985dd8521d32, 0x3feff168143b0281, -0xbc705b1125cf49a5, 0x3fefef003103b10e, -0xbc7160139cd8dc5d, 0x3fefec9a3e778061, -0x3c9f879abbff3f87, 0x3fefea363d42b027, -0x3c651e617061bfbd, 0x3fefe7d42e11bbcc, -0x3c9b14003824712a, 0x3fefe57411915a8a, -0xbc905e7a108766d1, 0x3fefe315e86e7f85, -0x3c61cbf0f38af658, 0x3fefe0b9b35659d8, -0x3c845fad437fa426, 0x3fefde5f72f654b1, -0xbc9a3316383dcbc5, 0x3fefdc0727fc1762, -0x3c8cd2523567f613, 0x3fefd9b0d3158574, -0x3c9901c9e0e797fd, 0x3fefd75c74f0bec2, -0xbc954529642b232f, 0x3fefd50a0e3c1f89, -0xbc89b3236d111646, 0x3fefd2b99fa6407c, -0xbc8bce8023f98efa, 0x3fefd06b29ddf6de, -0xbc8cb191be99b1b0, 0x3fefce1ead925493, -0x3c8293708ef5c32e, 0x3fefcbd42b72a836, -0xbc9acb71e83765b7, 0x3fefc98ba42e7d30, -0x3c60f74e61e6c861, 0x3fefc74518759bc8, -0x3c5cd3e58b03697e, 0x3fefc50088f8093f, -0xbc95b9280905b2a4, 0x3fefc2bdf66607e0, -0xbc8bfb07d4755452, 0x3fefc07d61701716, -0x3c90a3e45b33d399, 0x3fefbe3ecac6f383, -0x3c8aedeb3e7b14cd, 0x3fefbc02331b9715, -0x3c84f31f32c4b7e7, 0x3fefb9c79b1f3919, -0x3c9a8eb1f3d914b4, 0x3fefb78f03834e52, -0x3c979aa65d837b6d, 0x3fefb5586cf9890f, -0xbc85b9eb0402507b, 0x3fefb323d833d93f, -0x3c9407fb30d06420, 0x3fefb0f145e46c85, -0xbc93f0f225bbf3ee, 0x3fefaec0b6bdae53, -0x3c8eb51a92fdeffc, 0x3fefac922b7247f7, -0xbc9c3fe7282d1784, 0x3fefaa65a4b520ba, -0xbc9a5d04b3b9911b, 0x3fefa83b23395dec, -0x3c9c8be44bf4cde8, 0x3fefa612a7b26300, -0x3c3ebe3d702f9cd1, 0x3fefa3ec32d3d1a2, -0x3c820c5444c93c44, 0x3fefa1c7c55189c6, -0xbc937a01f0739546, 0x3fef9fa55fdfa9c5, -0xbc84c6baeb580d7a, 0x3fef9d8503328e6d, -0xbc6a033489906e0b, 0x3fef9b66affed31b, -0x3c8657aa1b0d9f83, 0x3fef994a66f951ce, -0x3c8b8268b04ef0a5, 0x3fef973028d7233e, -0x3c62f2c7fd6ee145, 0x3fef9517f64d9ef1, -0xbc9556522a2fbd0e, 0x3fef9301d0125b51, -0xbc6b0b2789925e90, 0x3fef90edb6db2dc1, -0xbc9ac46e44a2ebcc, 
0x3fef8edbab5e2ab6, -0xbc93aad17d197fae, 0x3fef8ccbae51a5c8, -0xbc5080ef8c4eea55, 0x3fef8abdc06c31cc, -0xbc989c464a07ad70, 0x3fef88b1e264a0e9, -0xbc65704e90c9f860, 0x3fef86a814f204ab, -0xbc72c338fce197f4, 0x3fef84a058cbae1e, -0xbc91c923b9d5f416, 0x3fef829aaea92de0, -0xbc6dca724cea0eb6, 0x3fef809717425438, -0xbc897cea57e46280, 0x3fef7e95934f312e, -0x3c464770b955d34d, 0x3fef7c962388149e, -0x3c80d3e3e95c55af, 0x3fef7a98c8a58e51, -0xbc962811c114424f, 0x3fef789d83606e12, -0x3c56f01429e2b9d2, 0x3fef76a45471c3c2, -0x3c8ec58e74904dd4, 0x3fef74ad3c92df73, -0xbc801b15eaa59348, 0x3fef72b83c7d517b, -0x3c8d63b0ab2d5bbf, 0x3fef70c554eaea89, -0x3c6e653b2459034b, 0x3fef6ed48695bbc0, -0xbc9ca9effbeeac92, 0x3fef6ce5d23816c9, -0xbc8f1ff055de323d, 0x3fef6af9388c8dea, -0x3c8bda920de0f6e2, 0x3fef690eba4df41f, -0x3c92cc7ea345b7dc, 0x3fef672658375d2f, -0xbc9a597f9a5ff71c, 0x3fef654013041dc2, -0x3c8b898c3f1353bf, 0x3fef635beb6fcb75, -0x3c50835b125aa573, 0x3fef6179e2363cf8, -0x3c957bfb2876ea9e, 0x3fef5f99f8138a1c, -0x3c8aaa13d61aec1f, 0x3fef5dbc2dc40bf0, -0xbc96d99c7611eb26, 0x3fef5be084045cd4, -0x3c8a4f81aa7110bd, 0x3fef5a06fb91588f, -0x3c8cdc1873af2155, 0x3fef582f95281c6b, -0xbc6817fd6a313e3e, 0x3fef565a51860746, -0x3c9aecf73e3a2f60, 0x3fef54873168b9aa, -0xbc96236af85fd26a, 0x3fef52b6358e15e8, -0xbc9493684653a131, 0x3fef50e75eb44027, -0x3c7795eb4523abe7, 0x3fef4f1aad999e82, -0xbc8fe782cb86389d, 0x3fef4d5022fcd91d, -0x3c8fe58b91b40095, 0x3fef4b87bf9cda38, -0xbc98e2899077520a, 0x3fef49c18438ce4d, -0x3c91ecaa860c614a, 0x3fef47fd7190241e, -0x3c8a6f4144a6c38d, 0x3fef463b88628cd6, -0xbc3e45c83ba0bbcb, 0x3fef447bc96ffc18, -0x3c9120fcd4f59273, 0x3fef42be3578a819, -0xbc29fd3bea07b4ee, 0x3fef4102cd3d09b9, -0x3c807a05b0e4047d, 0x3fef3f49917ddc96, -0x3c87f1c7350e256d, 0x3fef3d9282fc1f27, -0x3c89b788c188c9b8, 0x3fef3bdda27912d1, -0x3c420dac6c124f4f, 0x3fef3a2af0b63bff, -0x3c968efde3a8a894, 0x3fef387a6e756238, -0xbc99501d09bc09fd, 0x3fef36cc1c78903a, -0x3c877afbca90ef84, 0x3fef351ffb82140a, 
-0x3c73baf864dc8675, 0x3fef33760c547f15, -0x3c875e18f274487d, 0x3fef31ce4fb2a63f, -0x3c91b0575c1eaf54, 0x3fef3028c65fa1ff, -0x3c91512f082876ee, 0x3fef2e85711ece75, -0xbc90364bc9ce33ab, 0x3fef2ce450b3cb82, -0x3c80472b981fe7f2, 0x3fef2b4565e27cdd, -0xbc7548165d85ed32, 0x3fef29a8b16f0a30, -0x3c9a02f0c7d75ec6, 0x3fef280e341ddf29, -0x3c7c3b977a68e32c, 0x3fef2675eeb3ab98, -0xbc96b87b3f71085e, 0x3fef24dfe1f56381, -0xbc93a255f697ecfe, 0x3fef234c0ea83f36, -0xbc803297e78260bf, 0x3fef21ba7591bb70, -0x3c8d2d19edc1e550, 0x3fef202b17779965, -0x3c82f7e16d09ab31, 0x3fef1e9df51fdee1, -0xbc76b2173113dd8c, 0x3fef1d130f50d65c, -0xbc95b77e5ccd9fbf, 0x3fef1b8a66d10f13, -0x3c811aa5f853590b, 0x3fef1a03fc675d1f, -0xbc3d219b1a6fbffa, 0x3fef187fd0dad990, -0x3c61d61a34c8aa02, 0x3fef16fde4f2e280, -0xbc91e75c40b4251e, 0x3fef157e39771b2f, -0xbc91f892bf6b286d, 0x3fef1400cf2f6c18, -0x3c8b3782720c0ab4, 0x3fef1285a6e4030b, -0x3c7590c65c20e680, 0x3fef110cc15d5346, -0x3c98a911f1f7785a, 0x3fef0f961f641589, -0x3c86fe320b5c1e9d, 0x3fef0e21c1c14833, -0x3c6e149289cecb8f, 0x3fef0cafa93e2f56, -0xbc903cd8b2f25790, 0x3fef0b3fd6a454d2, -0xbc61e7c998db7dbb, 0x3fef09d24abd886b, -0x3c7b3bf786a54a87, 0x3fef08670653dfe4, -0x3c834d754db0abb6, 0x3fef06fe0a31b715, -0x3c74bb6c41732885, 0x3fef05975721b004, -0x3c85425c11faadf4, 0x3fef0432edeeb2fd, -0xbc99d7399abb9a8b, 0x3fef02d0cf63eeac, -0x3c864201e2ac744c, 0x3fef0170fc4cd831, -0xbc5451d60c6ac9eb, 0x3fef001375752b40, -0xbc979517a03e2847, 0x3feefeb83ba8ea32, -0x3c8787a210ceafd9, 0x3feefd5f4fb45e20, -0x3c8fdd395dd3f84a, 0x3feefc08b26416ff, -0xbc888d1e4629943d, 0x3feefab46484ebb4, -0xbc800e2a46da4bee, 0x3feef96266e3fa2d, -0xbc93369c544088b6, 0x3feef812ba4ea77d, -0xbc86a3803b8e5b04, 0x3feef6c55f929ff1, -0x3c85373ce4eb6dfb, 0x3feef57a577dd72b, -0xbc87430803972b34, 0x3feef431a2de883b, -0x3c83adec8265a67f, 0x3feef2eb428335b4, -0xbc924aedcc4b5068, 0x3feef1a7373aa9cb, -0xbc835388bcac6bc5, 0x3feef06581d3f669, -0xbc954de30ae02d94, 0x3feeef26231e754a, -0x3c727cdb4e4b6640, 
0x3feeede91be9c811, -0xbc9907f81b512d8e, 0x3feeecae6d05d866, -0x3c86c2696a26af35, 0x3feeeb761742d808, -0xbc94f2487e1c03ec, 0x3feeea401b7140ef, -0x3c888f6ff06b979a, 0x3feee90c7a61d55b, -0xbc71d1e83e9436d2, 0x3feee7db34e59ff7, -0xbc89d5efaabc2030, 0x3feee6ac4bcdf3ea, -0x3c914a5432fcb2f4, 0x3feee57fbfec6cf4, -0xbc76b8867f91c9d6, 0x3feee4559212ef89, -0xbc991919b3ce1b15, 0x3feee32dc313a8e5, -0x3c94c9c0b5157fe6, 0x3feee20853c10f28, -0x3c79c3bba5562a2f, 0x3feee0e544ede173, -0xbc62455345b51c8e, 0x3feedfc4976d27fa, -0x3c859f48a72a4c6d, 0x3feedea64c123422, -0xbc93331de45477d0, 0x3feedd8a63b0a09b, -0xbc85a71612e21658, 0x3feedc70df1c5175, -0xbc95f84d39b39b16, 0x3feedb59bf29743f, -0xbc9312607a28698a, 0x3feeda4504ac801c, -0xbc72ba4dc7c4d562, 0x3feed932b07a35df, -0x3c86421f6f1d24d6, 0x3feed822c367a024, -0xbc844f25dc02691f, 0x3feed7153e4a136a, -0xbc58a78f4817895b, 0x3feed60a21f72e2a, -0xbc888d328eb9b501, 0x3feed5016f44d8f5, -0xbc9348a6815fce65, 0x3feed3fb2709468a, -0x3c7f0bec42ddb15a, 0x3feed2f74a1af3f1, -0xbc7c2c9b67499a1b, 0x3feed1f5d950a897, -0xbc615f0a2b9cd452, 0x3feed0f6d5817663, -0x3c835c43984d9871, 0x3feecffa3f84b9d4, -0xbc8c2e465a919e1d, 0x3feecf0018321a1a, -0x3c4363ed60c2ac11, 0x3feece086061892d, -0xbc865dfd02bd08f1, 0x3feecd1318eb43ec, -0xbc632afc8d9473a0, 0x3feecc2042a7d232, -0xbc8e68cec89b1762, 0x3feecb2fde7006f4, -0x3c9666093b0664ef, 0x3feeca41ed1d0057, -0xbc48ae858eb682ca, 0x3feec9566f8827d0, -0xbc95fc5e44de020e, 0x3feec86d668b3237, -0x3c5dd71277c0915f, 0x3feec786d3001fe5, -0x3c6ecce1daa10379, 0x3feec6a2b5c13cd0, -0x3c92001325ecd7fb, 0x3feec5c10fa920a1, -0xbc7ea0148327c42f, 0x3feec4e1e192aed2, -0x3c65ace6e2870332, 0x3feec4052c5916c4, -0x3c93ff8e3f0f1230, 0x3feec32af0d7d3de, -0xbc9595c55690ffaf, 0x3feec2532feaada6, -0xbc7a843ad1a88022, 0x3feec17dea6db7d7, -0xbc8b401ba9fb5199, 0x3feec0ab213d5283, -0x3c7690cebb7aafb0, 0x3feebfdad5362a27, -0x3c6df82bf324cc57, 0x3feebf0d073537ca, -0x3c892ca3bf144e63, 0x3feebe41b817c114, -0x3c97cae38641c7bb, 0x3feebd78e8bb586b, 
-0x3c931dbdeb54e077, 0x3feebcb299fddd0d, -0x3c62d80c5c4a2b67, 0x3feebbeeccbd7b2a, -0xbc902c99b04aa8b0, 0x3feebb2d81d8abff, -0x3c8f39c10d12eaf0, 0x3feeba6eba2e35f0, -0xbc8f94340071a38e, 0x3feeb9b2769d2ca7, -0xbc80b582d74a55d9, 0x3feeb8f8b804f127, -0x3c73e34f67e67118, 0x3feeb8417f4531ee, -0xbc6b4e327ff434ca, 0x3feeb78ccd3deb0d, -0xbc87deccdc93a349, 0x3feeb6daa2cf6642, -0xbc592dca38593e20, 0x3feeb62b00da3b14, -0xbc75a3b1197ba0f0, 0x3feeb57de83f4eef, -0xbc85daca9994833e, 0x3feeb4d359dfd53d, -0xbc78dec6bd0f385f, 0x3feeb42b569d4f82, -0xbc980b4321bc6dae, 0x3feeb385df598d78, -0x3c81bd2888075068, 0x3feeb2e2f4f6ad27, -0xbc8390afec5241c5, 0x3feeb24298571b06, -0xbc861246ec7b5cf6, 0x3feeb1a4ca5d920f, -0x3c8f15cdafe7d586, 0x3feeb1098bed1bdf, -0xbc896be8ae89ef8f, 0x3feeb070dde910d2, -0xbc910aa91ae9b67f, 0x3feeafdac1351819, -0x3c93350518fdd78e, 0x3feeaf4736b527da, -0x3c957e1b67462375, 0x3feeaeb63f4d854c, -0xbc88e6ac90348602, 0x3feeae27dbe2c4cf, -0x3c8124d5051552a7, 0x3feead9c0d59ca07, -0x3c7b98b72f8a9b05, 0x3feead12d497c7fd, -0xbc3ca103952ecf1f, 0x3feeac8c32824135, -0xbc91af7f1365c3ac, 0x3feeac0827ff07cc, -0x3c773345c02a4fd6, 0x3feeab86b5f43d92, -0x3c9063e1e21c5409, 0x3feeab07dd485429, -0xbc909d2a0fce20f2, 0x3feeaa8b9ee20d1e, -0xbc943a3540d1898a, 0x3feeaa11fba87a03, -0xbc924f2cb4f81746, 0x3feea99af482fc8f, -0x3c34c7855019c6ea, 0x3feea9268a5946b7, -0xbc943592a0a9846b, 0x3feea8b4be135acc, -0xbc951f58ddaa8090, 0x3feea84590998b93, -0xbc956bc85d444f4f, 0x3feea7d902d47c65, -0x3c9432e62b64c035, 0x3feea76f15ad2148, -0x3c914d1e4218319f, 0x3feea707ca0cbf0f, -0xbc82e1648e50a17c, 0x3feea6a320dceb71, -0x3c971c93709313f4, 0x3feea6411b078d26, -0xbc8ce44a6199769f, 0x3feea5e1b976dc09, -0x3c7f88303b60d222, 0x3feea584fd15612a, -0x3c95f30eda98a575, 0x3feea52ae6cdf6f4, -0x3c70125ca18d4b5b, 0x3feea4d3778bc944, -0xbc8c33c53bef4da8, 0x3feea47eb03a5585, -0x3c9592ea73798b11, 0x3feea42c91c56acd, -0x3c917ecda8a72159, 0x3feea3dd1d1929fd, -0xbc9371d6d7d75739, 0x3feea390532205d8, -0xbc845378892be9ae, 
0x3feea34634ccc320, -0xbc8ac05fd996f807, 0x3feea2fec30678b7, -0xbc9345f3cee1ae6e, 0x3feea2b9febc8fb7, -0xbc91f5067d03653a, 0x3feea277e8dcc390, -0xbc93cedd78565858, 0x3feea23882552225, -0x3c917339c86ce3ad, 0x3feea1fbcc140be7, -0xbc85c33fdf910406, 0x3feea1c1c70833f6, -0xbc77e66065ba2500, 0x3feea18a7420a036, -0x3c5710aa807e1964, 0x3feea155d44ca973, -0x3c964c827ee6b49a, 0x3feea123e87bfb7a, -0x3c81079ab5789604, 0x3feea0f4b19e9538, -0xbc928311a3c73480, 0x3feea0c830a4c8d4, -0xbc93b3efbf5e2228, 0x3feea09e667f3bcd, -0x3c882c79e185e981, 0x3feea077541ee718, -0x3c727df161cd7778, 0x3feea052fa75173e, -0xbc8b48cea80b043b, 0x3feea0315a736c75, -0xbc6a12ad8734b982, 0x3feea012750bdabf, -0xbc4f4863bc8e5180, 0x3fee9ff64b30aa09, -0x3c93f9924a05b767, 0x3fee9fdcddd47645, -0x3c954835dd4b7548, 0x3fee9fc62dea2f8a, -0xbc6367efb86da9ee, 0x3fee9fb23c651a2f, -0xbc8bf41f59b59f8a, 0x3fee9fa10a38cee8, -0xbc87557939a8b5ef, 0x3fee9f9298593ae5, -0xbc8f652fde52775c, 0x3fee9f86e7ba9fef, -0xbc80dc3d54e08851, 0x3fee9f7df9519484, -0xbc7b0300defbcf98, 0x3fee9f77ce1303f6, -0x3c51ed2f56fa9d1a, 0x3fee9f7466f42e87, -0xbc89dab646035dc0, 0x3fee9f73c4eaa988, -0xbc781f647e5a3ecf, 0x3fee9f75e8ec5f74, -0xbc91f0c230588dde, 0x3fee9f7ad3ef9011, -0xbc88e67a9006c909, 0x3fee9f8286ead08a, -0x3c9106450507a28c, 0x3fee9f8d02d50b8f, -0xbc86ee4ac08b7db0, 0x3fee9f9a48a58174, -0xbc9129729a10f3a0, 0x3fee9faa5953c849, -0x3c86597566977ac8, 0x3fee9fbd35d7cbfd, -0x3c781a70a5124f67, 0x3fee9fd2df29ce7c, -0xbc8619321e55e68a, 0x3fee9feb564267c9, -0x3c941626ea62646d, 0x3feea0069c1a861d, -0x3c92c0b7028a5c3a, 0x3feea024b1ab6e09, -0xbc940b9f54365b7c, 0x3feea04597eeba8f, -0x3c909ccb5e09d4d3, 0x3feea0694fde5d3f, -0x3c873455e0e826c1, 0x3feea08fda749e5d, -0x3c8a30faf49cc78c, 0x3feea0b938ac1cf6, -0x3c94f006ad874e3e, 0x3feea0e56b7fcf03, -0xbc7b32dcb94da51d, 0x3feea11473eb0187, -0xbc8f6d693d0973bb, 0x3feea14652e958aa, -0xbc92dad3519d7b5b, 0x3feea17b0976cfdb, -0x3c58c5ee2b7e7848, 0x3feea1b2988fb9ec, -0x3c94ecfd5467c06b, 0x3feea1ed0130c132, 
-0xbc88b25e045d207b, 0x3feea22a4456e7a3, -0x3c87d51410fd15c2, 0x3feea26a62ff86f0, -0xbc69cb3314060ca7, 0x3feea2ad5e2850ac, -0x3c65ebe1abd66c55, 0x3feea2f336cf4e62, -0x3c87a0b15d19e0bb, 0x3feea33bedf2e1b9, -0xbc760a3629969871, 0x3feea3878491c491, -0x3c94aa7212bfa73c, 0x3feea3d5fbab091f, -0xbc88a1c52fb3cf42, 0x3feea427543e1a12, -0xbc81e688272a8a12, 0x3feea47b8f4abaa9, -0x3c8b18c6e3fdef5d, 0x3feea4d2add106d9, -0x3c4ab7b7112ec9d5, 0x3feea52cb0d1736a, -0xbc9369b6f13b3734, 0x3feea589994cce13, -0x3c8a1e274eed4476, 0x3feea5e968443d9a, -0x3c90ec1ddcb1390a, 0x3feea64c1eb941f7, -0x3c94a533a59324da, 0x3feea6b1bdadb46d, -0xbc805e843a19ff1e, 0x3feea71a4623c7ad, -0x3c7a56d2760d087d, 0x3feea785b91e07f1, -0xbc522cea4f3afa1e, 0x3feea7f4179f5b21, -0x3c91682c1c6e8b05, 0x3feea86562ab00ec, -0xbc94d450d872576e, 0x3feea8d99b4492ed, -0x3c89ea99cf7a9591, 0x3feea950c27004c2, -0x3c7c88549b958471, 0x3feea9cad931a436, -0xbc59e57d8f92ff8e, 0x3feeaa47e08e1957, -0x3c90ad675b0e8a00, 0x3feeaac7d98a6699, -0x3c909b176e05a9cd, 0x3feeab4ac52be8f7, -0x3c931143962f7877, 0x3feeabd0a478580f, -0x3c711607f1952c95, 0x3feeac597875c644, -0x3c8db72fc1f0eab4, 0x3feeace5422aa0db, -0x3c869608f0f86431, 0x3feead74029db01e, -0x3c93e9e96f112479, 0x3feeae05bad61778, -0xbc7f1ced15c5c5c0, 0x3feeae9a6bdb5598, -0xbc65b6609cc5e7ff, 0x3feeaf3216b5448c, -0x3c614b97be3f7b4e, 0x3feeafccbc6c19e6, -0xbc8dac42a4a38df0, 0x3feeb06a5e0866d9, -0x3c81c1701c359530, 0x3feeb10afc931857, -0x3c7bf68359f35f44, 0x3feeb1ae99157736, -0xbc8edb1bf6809287, 0x3feeb2553499284b, -0x3c8b99dd98b1ed84, 0x3feeb2fed0282c8a, -0xbc8ba58ce7a736d3, 0x3feeb3ab6ccce12c, -0xbc93091fa71e3d83, 0x3feeb45b0b91ffc6, -0xbc93fc025e1db9ce, 0x3feeb50dad829e70, -0xbc7885ad50cbb750, 0x3feeb5c353aa2fe2, -0xbc8d737c7d71382e, 0x3feeb67bff148396, -0xbc5da9b88b6c1e29, 0x3feeb737b0cdc5e5, -0x3c6ae88c43905293, 0x3feeb7f669e2802b, -0xbc82d5e85f3e0301, 0x3feeb8b82b5f98e5, -0xbc93d1f7661fe51b, 0x3feeb97cf65253d1, -0xbc6c23f97c90b959, 0x3feeba44cbc8520f, -0x3c651b68797ffc1c, 
0x3feebb0faccf9243, -0xbc51669428996971, 0x3feebbdd9a7670b3, -0x3c54579c5ceed70b, 0x3feebcae95cba768, -0xbc92434322f4f9aa, 0x3feebd829fde4e50, -0x3c87298413381667, 0x3feebe59b9bddb5b, -0x3c71f2b2c1c4c014, 0x3feebf33e47a22a2, -0xbc905000be64e965, 0x3feec01121235681, -0xbc85ca6cd7668e4b, 0x3feec0f170ca07ba, -0xbc89fb12e3454b73, 0x3feec1d4d47f2598, -0xbc9294f304f166b6, 0x3feec2bb4d53fe0d, -0x3c7be2a03697693b, 0x3feec3a4dc5a3dd3, -0x3c71affc2b91ce27, 0x3feec49182a3f090, -0x3c90622b15810eea, 0x3feec581414380f2, -0xbc8a1e58414c07d3, 0x3feec674194bb8d5, -0x3be9a5ecc875d327, 0x3feec76a0bcfc15e, -0x3c6dd235e10a73bb, 0x3feec86319e32323, -0x3c88ea486a3350ef, 0x3feec95f4499c647, -0xbc79740b58a20091, 0x3feeca5e8d07f29e, -0xbc7a2ee551d4c40f, 0x3feecb60f4424fcb, -0xbc87c50422622263, 0x3feecc667b5de565, -0x3c89c31f7e38028b, 0x3feecd6f23701b15, -0x3c9165830a2b96c2, 0x3feece7aed8eb8bb, -0xbc5fac13f4e005a3, 0x3feecf89dacfe68c, -0x3c8b1c86e3e231d5, 0x3feed09bec4a2d33, -0x3c7d8aced7162e89, 0x3feed1b1231475f7, -0xbc903d5cbe27874b, 0x3feed2c980460ad8, -0xbc848f50cea7269f, 0x3feed3e504f696b1, -0xbc91bbd1d3bcbb15, 0x3feed503b23e255d, -0x3c821eb9a08a0542, 0x3feed625893523d4, -0x3c5986178980fce0, 0x3feed74a8af46052, -0xbc6133a953131cfd, 0x3feed872b8950a73, -0x3c90cc319cee31d2, 0x3feed99e1330b358, -0x3c89e95e6f4a0ae4, 0x3feedacc9be14dca, -0xbc89472975b1f2a5, 0x3feedbfe53c12e59, -0xbc90260cf07cb311, 0x3feedd333beb0b7e, -0x3c8469846e735ab3, 0x3feede6b5579fdbf, -0x3c1bca400a7b939d, 0x3feedfa6a1897fd2, -0x3c7d8157a34b7e7f, 0x3feee0e521356eba, -0x3c9140bc34dfc19f, 0x3feee226d59a09ee, -0xbc82dfcd978e9db4, 0x3feee36bbfd3f37a, -0xbc8c9b1da461ab87, 0x3feee4b3e100301e, -0x3c8c8a4e231ebb7d, 0x3feee5ff3a3c2774, -0x3c8c115f23ebea8e, 0x3feee74dcca5a413, -0x3c8c1a7792cb3387, 0x3feee89f995ad3ad, -0xbc6dcab99f23f84e, 0x3feee9f4a17a4735, -0xbc888c8d11a142e5, 0x3feeeb4ce622f2ff, -0x3c60a43e8b7e4bfe, 0x3feeeca868742ee4, -0xbc907b8f4ad1d9fa, 0x3feeee07298db666, -0x3c915b1397075f04, 0x3feeef692a8fa8cd, 
-0x3c889c2ea41433c7, 0x3feef0ce6c9a8952, -0xbc839f7a1f04d2b0, 0x3feef236f0cf3f3a, -0xbc55c3d956dcaeba, 0x3feef3a2b84f15fb, -0xbc86a510f31e13e6, 0x3feef511c43bbd62, -0xbc7274aedac8ff80, 0x3feef68415b749b1, -0xbc92887ea88e7340, 0x3feef7f9ade433c6, -0xbc90a40e3da6f640, 0x3feef9728de5593a, -0xbc6e57ac604759ba, 0x3feefaeeb6ddfc87, -0x3c85c620ce76df06, 0x3feefc6e29f1c52a, -0x3c8e6c6db4f83226, 0x3feefdf0e844bfc6, -0xbc68d6f438ad9334, 0x3feeff76f2fb5e47, -0xbc8d1bf10460dba0, 0x3fef01004b3a7804, -0xbc8fda52e1b51e41, 0x3fef028cf22749e4, -0x3c8e5d80813dddfc, 0x3fef041ce8e77680, -0xbc91eee26b588a35, 0x3fef05b030a1064a, -0x3c8caff9640f2dcb, 0x3fef0746ca7a67a7, -0xbc32141a7b3e2cd8, 0x3fef08e0b79a6f1f, -0x3c7a77557fd62db3, 0x3fef0a7df9285775, -0x3c74ffd70a5fddcd, 0x3fef0c1e904bc1d2, -0xbc651ba6128db749, 0x3fef0dc27e2cb5e5, -0xbc302899507554e5, 0x3fef0f69c3f3a207, -0xbc7c0ffefdc5e251, 0x3fef111462c95b60, -0xbc91bdfbfa9298ac, 0x3fef12c25bd71e09, -0xbc8b6cd058bfd6fa, 0x3fef1473b0468d30, -0xbc80dda2d4c0010c, 0x3fef16286141b33d, -0x3c923759b8aca76d, 0x3fef17e06ff301f4, -0x3c736eae30af0cb3, 0x3fef199bdd85529c, -0xbc895498a73dac7d, 0x3fef1b5aab23e61e, -0xbc8a007daadf8d68, 0x3fef1d1cd9fa652c, -0x3c851de924583108, 0x3fef1ee26b34e065, -0x3c8ee3325c9ffd94, 0x3fef20ab5fffd07a, -0xbc8c5fe4051ba06c, 0x3fef2277b9881650, -0x3c836909391181d3, 0x3fef244778fafb22, -0xbc6d1816c0a9ac07, 0x3fef261a9f8630ad, -0x3c84e08fd10959ac, 0x3fef27f12e57d14b, -0xbc7af5c67c4e8235, 0x3fef29cb269e601f, -0xbc811cd7dbdf9547, 0x3fef2ba88988c933, -0xbc8304ef0045d575, 0x3fef2d89584661a1, -0x3c63cdaf384e1a67, 0x3fef2f6d9406e7b5, -0x3c8725f94f910375, 0x3fef31553dfa8313, -0xbc7ac28b7bef6621, 0x3fef33405751c4db, -0x3c7b53e99f9191e8, 0x3fef352ee13da7cb, -0x3c676b2c6c921968, 0x3fef3720dcef9069, -0xbc810a79e6d7e2b8, 0x3fef39164b994d23, -0xbc7030587207b9e1, 0x3fef3b0f2e6d1675, -0x3c840635f6d2a9c0, 0x3fef3d0b869d8f0f, -0xbc808a1883ccb5d2, 0x3fef3f0b555dc3fa, -0x3c549eeef9ec910c, 0x3fef410e9be12cb9, -0xbc8cc734592af7fc, 
0x3fef43155b5bab74, -0xbc8335827ffb9dce, 0x3fef451f95018d17, -0xbc8fad5d3ffffa6f, 0x3fef472d4a07897c, -0x3c645563980ef762, 0x3fef493e7ba2c38c, -0x3c87752a44f587e8, 0x3fef4b532b08c968, -0xbc8cd0205eb2aab2, 0x3fef4d6b596f948c, -0xbc900dae3875a949, 0x3fef4f87080d89f2, -0xbc8aab80ceab2b4a, 0x3fef51a638197a3c, -0x3c85b66fefeef52e, 0x3fef53c8eacaa1d6, -0xbc8f870f40a8ba1b, 0x3fef55ef2158a91f, -0x3c74a385a63d07a7, 0x3fef5818dcfba487, -0x3c83c119f18464c5, 0x3fef5a461eec14be, -0x3c5159d9d908a96e, 0x3fef5c76e862e6d3, -0xbc5a628c2be4e7c7, 0x3fef5eab3a99745b, -0xbc82919e2040220f, 0x3fef60e316c98398, -0xbc72550d76be719a, 0x3fef631e7e2d479d, -0x3c8c254d16117a68, 0x3fef655d71ff6075, -0xbc82090274667d12, 0x3fef679ff37adb4a, -0x3c8e5a50d5c192ac, 0x3fef69e603db3285, -0x3c75f7d28150cac4, 0x3fef6c2fa45c4dfd, -0xbc8d8c329fbd0e03, 0x3fef6e7cd63a8315, -0x3c890de9296f4cd1, 0x3fef70cd9ab294e4, -0x3c843a59ac016b4b, 0x3fef7321f301b460, -0x3c832ff9978b34bc, 0x3fef7579e065807d, -0xbc8ea6e6fbd5f2a6, 0x3fef77d5641c0658, -0xbc7303b63dda1980, 0x3fef7a347f63c159, -0xbc82d52107b43e1f, 0x3fef7c97337b9b5f, -0xbc81f2ba385f2f95, 0x3fef7efd81a2ece1, -0xbc63e8e3eab2cbb4, 0x3fef81676b197d17, -0x3c768d9144ae12fc, 0x3fef83d4f11f8220, -0xbc892ab93b470dc9, 0x3fef864614f5a129, -0x3c853687f542403b, 0x3fef88bad7dcee90, -0xbc8b7966cd0d2cd9, 0x3fef8b333b16ee12, -0xbc736ed2de40b407, 0x3fef8daf3fe592e8, -0x3c74b604603a88d3, 0x3fef902ee78b3ff6, -0xbc614ef56c770f3b, 0x3fef92b2334ac7ee, -0xbc776caa4c2ff1cf, 0x3fef953924676d76, -0x3c8df7d1353d8e88, 0x3fef97c3bc24e350, -0x3c83c5ec519d7271, 0x3fef9a51fbc74c83, -0xbc850bed64091b8a, 0x3fef9ce3e4933c7e, -0xbc81d5fc525d9940, 0x3fef9f7977cdb740, -0x3c89d852381c317f, 0x3fefa212b6bc3181, -0xbc8ff7128fd391f0, 0x3fefa4afa2a490da, -0x3c68a00e3cca04c4, 0x3fefa7503ccd2be5, -0x3c855cd8aaea3d21, 0x3fefa9f4867cca6e, -0xbc5a1f25ce94cae7, 0x3fefac9c80faa594, -0xbc8dae98e223747d, 0x3fefaf482d8e67f1, -0xbc6fb5f3ee307976, 0x3fefb1f78d802dc2, -0x3c8269947c2bed4a, 0x3fefb4aaa2188510, 
-0x3c737e8ae802b851, 0x3fefb7616ca06dd6, -0x3c8ec3bc41aa2008, 0x3fefba1bee615a27, -0x3c875119560e34af, 0x3fefbcda28a52e59, -0xbc83b6137e9afe9e, 0x3fefbf9c1cb6412a, -0xbc7431c3840929c6, 0x3fefc261cbdf5be7, -0x3c842b94c3a9eb32, 0x3fefc52b376bba97, -0xbc8cb472d2e86b99, 0x3fefc7f860a70c22, -0xbc69fa74878ba7c7, 0x3fefcac948dd7274, -0x3c83f5df2fde16a8, 0x3fefcd9df15b82ac, -0x3c8a64a931d185ee, 0x3fefd0765b6e4540, -0x3c8eef18336b62e3, 0x3fefd35288633625, -0x3c901f3a75ee0efe, 0x3fefd632798844f8, -0x3c80d23f87b50a2a, 0x3fefd916302bd526, -0xbc8e37bae43be3ed, 0x3fefdbfdad9cbe14, -0x3c8302dee657c8e6, 0x3fefdee8f32a4b45, -0xbc516a9ce6ed84fa, 0x3fefe1d802243c89, -0xbc7b0caa080df170, 0x3fefe4cadbdac61d, -0x3c77893b4d91cd9d, 0x3fefe7c1819e90d8, -0x3c7617a9f2fd24e5, 0x3fefeabbf4c0ba54, -0xbc699c7db2effc76, 0x3fefedba3692d514, -0x3c75f103b8fd5ca7, 0x3feff0bc4866e8ad, -0x3c5305c14160cc89, 0x3feff3c22b8f71f1, -0x3c8e70b094fa075a, 0x3feff6cbe15f6314, -0x3c64b458677f9840, 0x3feff9d96b2a23d9, -0xbc72ec9a3e5d680a, 0x3feffceaca4391b6, -#endif -}, -}; diff --git a/pl/math/expf.c b/pl/math/expf.c deleted file mode 100644 index cd3cfa925c644d..00000000000000 --- a/pl/math/expf.c +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Single-precision e^x function. - * - * Copyright (c) 2017-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include -#include -#include "math_config.h" - -/* -EXPF_TABLE_BITS = 5 -EXPF_POLY_ORDER = 3 - -ULP error: 0.502 (nearest rounding.) -Relative error: 1.69 * 2^-34 in [-ln2/64, ln2/64] (before rounding.) -Wrong count: 170635 (all nearest rounding wrong results with fma.) 
-Non-nearest ULP error: 1 (rounded ULP error) -*/ - -#define N (1 << EXPF_TABLE_BITS) -#define InvLn2N __expf_data.invln2_scaled -#define T __expf_data.tab -#define C __expf_data.poly_scaled - -static inline uint32_t -top12 (float x) -{ - return asuint (x) >> 20; -} - -float -optr_aor_exp_f32 (float x) -{ - uint32_t abstop; - uint64_t ki, t; - /* double_t for better performance on targets with FLT_EVAL_METHOD==2. */ - double_t kd, xd, z, r, r2, y, s; - - xd = (double_t) x; - abstop = top12 (x) & 0x7ff; - if (unlikely (abstop >= top12 (88.0f))) - { - /* |x| >= 88 or x is nan. */ - if (asuint (x) == asuint (-INFINITY)) - return 0.0f; - if (abstop >= top12 (INFINITY)) - return x + x; - if (x > 0x1.62e42ep6f) /* x > log(0x1p128) ~= 88.72 */ - return __math_oflowf (0); - if (x < -0x1.9fe368p6f) /* x < log(0x1p-150) ~= -103.97 */ - return __math_uflowf (0); - } - - /* x*N/Ln2 = k + r with r in [-1/2, 1/2] and int k. */ - z = InvLn2N * xd; - - /* Round and convert z to int, the result is in [-150*N, 128*N] and - ideally nearest int is used, otherwise the magnitude of r can be - bigger which gives larger approximation error. */ - kd = round (z); - ki = lround (z); - r = z - kd; - - /* exp(x) = 2^(k/N) * 2^(r/N) ~= s * (C0*r^3 + C1*r^2 + C2*r + 1) */ - t = T[ki % N]; - t += ki << (52 - EXPF_TABLE_BITS); - s = asdouble (t); - z = C[0] * r + C[1]; - r2 = r * r; - y = C[2] * r + 1; - y = z * r2 + y; - y = y * s; - return eval_as_float (y); -} diff --git a/pl/math/expm1_data.c b/pl/math/expm1_data.c deleted file mode 100644 index ff7426b9013579..00000000000000 --- a/pl/math/expm1_data.c +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Coefficients for double-precision e^x - 1 function. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "math_config.h" - -/* Generated using fpminimax, see tools/expm1.sollya for details. 
*/ -const double __expm1_poly[] = {0x1p-1, - 0x1.5555555555559p-3, - 0x1.555555555554bp-5, - 0x1.111111110f663p-7, - 0x1.6c16c16c1b5f3p-10, - 0x1.a01a01affa35dp-13, - 0x1.a01a018b4ecbbp-16, - 0x1.71ddf82db5bb4p-19, - 0x1.27e517fc0d54bp-22, - 0x1.af5eedae67435p-26, - 0x1.1f143d060a28ap-29}; diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h deleted file mode 100644 index f886e7f8c07a02..00000000000000 --- a/pl/math/include/mathlib.h +++ /dev/null @@ -1,206 +0,0 @@ -/* - * Public API. - * - * Copyright (c) 2015-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#ifndef _MATHLIB_H -#define _MATHLIB_H - -float acosf (float); -float acoshf (float); -float asinf (float); -float asinhf (float); -float atan2f (float, float); -float atanf (float); -float atanhf (float); -float cbrtf (float); -float coshf (float); -float cospif (float); -float erfcf (float); -float erff (float); -float erfinvf (float); -float exp10f (float); -float expm1f (float); -float log10f (float); -float log1pf (float); -float sinhf (float); -float sinpif (float); -float tanf (float); -float tanhf (float); - -double acos (double); -double acosh (double); -double asin (double); -double asinh (double); -double atan (double); -double atan2 (double, double); -double atanh (double); -double cbrt (double); -double cosh (double); -double cospi (double); -double erfc (double); -double erfinv (double); -double exp10 (double); -double expm1 (double); -double log10 (double); -double log1p (double); -double sinh (double); -double sinpi (double); -double tanh (double); - -long double cospil (long double); -long double erfinvl (long double); -long double exp10l (long double); -long double sinpil (long double); - -#if __aarch64__ -# if __GNUC__ >= 5 -typedef __Float32x4_t __f32x4_t; -typedef __Float64x2_t __f64x2_t; -# elif __clang_major__ * 100 + __clang_minor__ >= 305 -typedef __attribute__ ((__neon_vector_type__ (4))) float __f32x4_t; -typedef 
__attribute__ ((__neon_vector_type__ (2))) double __f64x2_t; -# else -# error Unsupported compiler -# endif - -# if __GNUC__ >= 9 || __clang_major__ >= 8 -# define __vpcs __attribute__ ((__aarch64_vector_pcs__)) - -typedef struct __f32x4x2_t -{ - __f32x4_t val[2]; -} __f32x4x2_t; - -typedef struct __f64x2x2_t -{ - __f64x2_t val[2]; -} __f64x2x2_t; - -/* Vector functions following the vector PCS using ABI names. */ -__vpcs __f32x4_t _ZGVnN4v_acoshf (__f32x4_t); -__vpcs __f64x2_t _ZGVnN2v_acosh (__f64x2_t); -__vpcs __f32x4_t _ZGVnN4v_acosf (__f32x4_t); -__vpcs __f64x2_t _ZGVnN2v_acos (__f64x2_t); -__vpcs __f32x4_t _ZGVnN4v_asinf (__f32x4_t); -__vpcs __f64x2_t _ZGVnN2v_asin (__f64x2_t); -__vpcs __f32x4_t _ZGVnN4v_asinhf (__f32x4_t); -__vpcs __f64x2_t _ZGVnN2v_asinh (__f64x2_t); -__vpcs __f32x4_t _ZGVnN4v_atanf (__f32x4_t); -__vpcs __f64x2_t _ZGVnN2v_atan (__f64x2_t); -__vpcs __f32x4_t _ZGVnN4vv_atan2f (__f32x4_t, __f32x4_t); -__vpcs __f64x2_t _ZGVnN2vv_atan2 (__f64x2_t, __f64x2_t); -__vpcs __f32x4_t _ZGVnN4v_atanhf (__f32x4_t); -__vpcs __f64x2_t _ZGVnN2v_atanh (__f64x2_t); -__vpcs __f32x4_t _ZGVnN4v_cbrtf (__f32x4_t); -__vpcs __f64x2_t _ZGVnN2v_cbrt (__f64x2_t); -__vpcs __f32x4x2_t _ZGVnN4v_cexpif (__f32x4_t); -__vpcs __f64x2x2_t _ZGVnN2v_cexpi (__f64x2_t); -__vpcs __f32x4_t _ZGVnN4v_coshf (__f32x4_t); -__vpcs __f64x2_t _ZGVnN2v_cosh (__f64x2_t); -__vpcs __f32x4_t _ZGVnN4v_cospif (__f32x4_t); -__vpcs __f64x2_t _ZGVnN2v_cospi (__f64x2_t); -__vpcs __f32x4_t _ZGVnN4v_erff (__f32x4_t); -__vpcs __f64x2_t _ZGVnN2v_erf (__f64x2_t); -__vpcs __f32x4_t _ZGVnN4v_erfcf (__f32x4_t); -__vpcs __f64x2_t _ZGVnN2v_erfc (__f64x2_t); -__vpcs __f32x4_t _ZGVnN4v_erfinvf (__f32x4_t); -__vpcs __f64x2_t _ZGVnN2v_erfinv (__f64x2_t); -__vpcs __f32x4_t _ZGVnN4v_exp10f (__f32x4_t); -__vpcs __f64x2_t _ZGVnN2v_exp10 (__f64x2_t); -__vpcs __f64x2_t _ZGVnN2v_exp2 (__f64x2_t); -__vpcs __f32x4_t _ZGVnN4v_expm1f (__f32x4_t); -__vpcs __f64x2_t _ZGVnN2v_expm1 (__f64x2_t); -__vpcs __f32x4_t _ZGVnN4vv_hypotf 
(__f32x4_t, __f32x4_t); -__vpcs __f64x2_t _ZGVnN2vv_hypot (__f64x2_t, __f64x2_t); -__vpcs __f32x4_t _ZGVnN4v_log10f (__f32x4_t); -__vpcs __f64x2_t _ZGVnN2v_log10 (__f64x2_t); -__vpcs __f32x4_t _ZGVnN4v_log1pf (__f32x4_t); -__vpcs __f64x2_t _ZGVnN2v_log1p (__f64x2_t); -__vpcs __f32x4_t _ZGVnN4v_log2f (__f32x4_t); -__vpcs __f64x2_t _ZGVnN2v_log2 (__f64x2_t); -__vpcs __f64x2_t _ZGVnN2vv_pow (__f64x2_t, __f64x2_t); -__vpcs __f32x4_t _ZGVnN4v_sinhf (__f32x4_t); -__vpcs __f64x2_t _ZGVnN2v_sinh (__f64x2_t); -__vpcs __f32x4_t _ZGVnN4v_sinpif (__f32x4_t); -__vpcs __f64x2_t _ZGVnN2v_sinpi (__f64x2_t); -__vpcs __f32x4_t _ZGVnN4v_tanf (__f32x4_t); -__vpcs __f64x2_t _ZGVnN2v_tan (__f64x2_t); -__vpcs __f32x4_t _ZGVnN4v_tanhf (__f32x4_t); -__vpcs __f64x2_t _ZGVnN2v_tanh (__f64x2_t); -__vpcs void _ZGVnN4vl4l4_sincosf (__f32x4_t, __f32x4_t *, __f32x4_t *); -__vpcs void _ZGVnN2vl8l8_sincos (__f64x2_t, __f64x2_t *, __f64x2_t *); - -# endif - -# if WANT_SVE_MATH -# include -svfloat32_t _ZGVsMxv_acoshf (svfloat32_t, svbool_t); -svfloat64_t _ZGVsMxv_acosh (svfloat64_t, svbool_t); -svfloat32_t _ZGVsMxv_acosf (svfloat32_t, svbool_t); -svfloat64_t _ZGVsMxv_acos (svfloat64_t, svbool_t); -svfloat32_t _ZGVsMxv_asinhf (svfloat32_t, svbool_t); -svfloat64_t _ZGVsMxv_asinh (svfloat64_t, svbool_t); -svfloat32_t _ZGVsMxv_asinf (svfloat32_t, svbool_t); -svfloat64_t _ZGVsMxv_asin (svfloat64_t, svbool_t); -svfloat32_t _ZGVsMxv_atanhf (svfloat32_t, svbool_t); -svfloat64_t _ZGVsMxv_atanh (svfloat64_t, svbool_t); -svfloat32_t _ZGVsMxvv_atan2f (svfloat32_t, svfloat32_t, svbool_t); -svfloat32_t _ZGVsMxv_atanf (svfloat32_t, svbool_t); -svfloat64_t _ZGVsMxv_atan (svfloat64_t, svbool_t); -svfloat64_t _ZGVsMxvv_atan2 (svfloat64_t, svfloat64_t, svbool_t); -svfloat32_t _ZGVsMxv_cbrtf (svfloat32_t, svbool_t); -svfloat64_t _ZGVsMxv_cbrt (svfloat64_t, svbool_t); -svfloat32x2_t _ZGVsMxv_cexpif (svfloat32_t, svbool_t); -svfloat64x2_t _ZGVsMxv_cexpi (svfloat64_t, svbool_t); -svfloat32_t _ZGVsMxv_coshf (svfloat32_t, 
svbool_t); -svfloat64_t _ZGVsMxv_cosh (svfloat64_t, svbool_t); -svfloat32_t _ZGVsMxv_cosf (svfloat32_t, svbool_t); -svfloat32_t _ZGVsMxv_cospif (svfloat32_t, svbool_t); -svfloat64_t _ZGVsMxv_cos (svfloat64_t, svbool_t); -svfloat64_t _ZGVsMxv_cospi (svfloat64_t, svbool_t); -svfloat32_t _ZGVsMxv_erff (svfloat32_t, svbool_t); -svfloat64_t _ZGVsMxv_erf (svfloat64_t, svbool_t); -svfloat64_t _ZGVsMxv_erfc (svfloat64_t, svbool_t); -svfloat32_t _ZGVsMxv_erfcf (svfloat32_t, svbool_t); -svfloat32_t _ZGVsMxv_expf (svfloat32_t, svbool_t); -svfloat64_t _ZGVsMxv_exp (svfloat64_t, svbool_t); -svfloat32_t _ZGVsMxv_exp10f (svfloat32_t, svbool_t); -svfloat64_t _ZGVsMxv_exp10 (svfloat64_t, svbool_t); -svfloat32_t _ZGVsMxv_exp2f (svfloat32_t, svbool_t); -svfloat64_t _ZGVsMxv_exp2 (svfloat64_t, svbool_t); -svfloat32_t _ZGVsMxv_expm1f (svfloat32_t, svbool_t); -svfloat64_t _ZGVsMxv_expm1 (svfloat64_t, svbool_t); -svfloat32_t _ZGVsMxvv_hypotf (svfloat32_t, svfloat32_t, svbool_t); -svfloat64_t _ZGVsMxvv_hypot (svfloat64_t, svfloat64_t, svbool_t); -svfloat32_t _ZGVsMxv_logf (svfloat32_t, svbool_t); -svfloat64_t _ZGVsMxv_log (svfloat64_t, svbool_t); -svfloat32_t _ZGVsMxv_log10f (svfloat32_t, svbool_t); -svfloat64_t _ZGVsMxv_log10 (svfloat64_t, svbool_t); -svfloat32_t _ZGVsMxv_log1pf (svfloat32_t, svbool_t); -svfloat64_t _ZGVsMxv_log1p (svfloat64_t, svbool_t); -svfloat32_t _ZGVsMxv_log2f (svfloat32_t, svbool_t); -svfloat64_t _ZGVsMxv_log2 (svfloat64_t, svbool_t); -svfloat32_t _ZGVsMxvv_powi (svfloat32_t, svint32_t, svbool_t); -svfloat64_t _ZGVsMxvv_powk (svfloat64_t, svint64_t, svbool_t); -svfloat32_t _ZGVsMxvv_powf (svfloat32_t, svfloat32_t, svbool_t); -svfloat64_t _ZGVsMxvv_pow (svfloat64_t, svfloat64_t, svbool_t); -svfloat32_t _ZGVsMxv_sinhf (svfloat32_t, svbool_t); -svfloat64_t _ZGVsMxv_sinh (svfloat64_t, svbool_t); -svfloat32_t _ZGVsMxv_sinf (svfloat32_t, svbool_t); -svfloat32_t _ZGVsMxv_sinpif (svfloat32_t, svbool_t); -svfloat64_t _ZGVsMxv_sin (svfloat64_t, svbool_t); -svfloat64_t 
_ZGVsMxv_sinpi (svfloat64_t, svbool_t); -svfloat32_t _ZGVsMxv_tanhf (svfloat32_t, svbool_t); -svfloat64_t _ZGVsMxv_tanh (svfloat64_t, svbool_t); -svfloat32_t _ZGVsMxv_tanf (svfloat32_t, svbool_t); -svfloat64_t _ZGVsMxv_tan (svfloat64_t, svbool_t); -void _ZGVsMxvl4l4_sincosf (svfloat32_t, float *, float *, svbool_t); -void _ZGVsMxvl8l8_sincos (svfloat64_t, double *, double *, svbool_t); -# endif - -#endif - -#endif diff --git a/pl/math/include/pl_test.h b/pl/math/include/pl_test.h deleted file mode 100644 index 3a3407e337b872..00000000000000 --- a/pl/math/include/pl_test.h +++ /dev/null @@ -1,24 +0,0 @@ -/* - * PL macros to aid testing. This version of this file is used for building the - * routine, not the tests. Separate definitions are found in test/pl_test.h - * which emit test parameters. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception. - */ - -/* Emit max ULP threshold - silenced for building the routine. */ -#define PL_TEST_ULP(f, l) - -/* Emit routine name if e == 1 and f is expected to correctly trigger fenv - exceptions. e allows declaration to be emitted conditionally upon certain - build flags - defer expansion by one pass to allow those flags to be expanded - properly. */ -#define PL_TEST_EXPECT_FENV(f, e) -#define PL_TEST_EXPECT_FENV_ALWAYS(f) - -#define PL_TEST_INTERVAL(f, lo, hi, n) -#define PL_TEST_SYM_INTERVAL(f, lo, hi, n) -#define PL_TEST_INTERVAL_C(f, lo, hi, n, c) -#define PL_TEST_SYM_INTERVAL_C(f, lo, hi, n, c) -#define PL_TEST_INTERVAL2(f, xlo, xhi, ylo, yhi, n) diff --git a/pl/math/log.c b/pl/math/log.c deleted file mode 100644 index 40b0441d981de3..00000000000000 --- a/pl/math/log.c +++ /dev/null @@ -1,161 +0,0 @@ -/* - * Double-precision log(x) function. - * - * Copyright (c) 2018-2023, Arm Limited. 
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include -#include -#include -#include "math_config.h" - -#define T __log_data.tab -#define T2 __log_data.tab2 -#define B __log_data.poly1 -#define A __log_data.poly -#define Ln2hi __log_data.ln2hi -#define Ln2lo __log_data.ln2lo -#define N (1 << LOG_TABLE_BITS) -#define OFF 0x3fe6000000000000 - -/* Top 16 bits of a double. */ -static inline uint32_t -top16 (double x) -{ - return asuint64 (x) >> 48; -} - -double -optr_aor_log_f64 (double x) -{ - /* double_t for better performance on targets with FLT_EVAL_METHOD==2. */ - double_t w, z, r, r2, r3, y, invc, logc, kd, hi, lo; - uint64_t ix, iz, tmp; - uint32_t top; - int k, i; - - ix = asuint64 (x); - top = top16 (x); - -#if LOG_POLY1_ORDER == 10 || LOG_POLY1_ORDER == 11 -#define LO asuint64 (1.0 - 0x1p-5) -#define HI asuint64 (1.0 + 0x1.1p-5) -#elif LOG_POLY1_ORDER == 12 -#define LO asuint64 (1.0 - 0x1p-4) -#define HI asuint64 (1.0 + 0x1.09p-4) -#endif - if (unlikely (ix - LO < HI - LO)) - { - /* Handle close to 1.0 inputs separately. */ - /* Fix sign of zero with downward rounding when x==1. */ - if (WANT_ROUNDING && unlikely (ix == asuint64 (1.0))) - return 0; - r = x - 1.0; - r2 = r * r; - r3 = r * r2; -#if LOG_POLY1_ORDER == 10 - /* Worst-case error is around 0.516 ULP. */ - y = r3 - * (B[1] + r * B[2] + r2 * B[3] - + r3 * (B[4] + r * B[5] + r2 * B[6] + r3 * (B[7] + r * B[8]))); - w = B[0] * r2; /* B[0] == -0.5. */ - hi = r + w; - y += r - hi + w; - y += hi; -#elif LOG_POLY1_ORDER == 11 - /* Worst-case error is around 0.516 ULP. */ - y = r3 - * (B[1] + r * B[2] - + r2 - * (B[3] + r * B[4] + r2 * B[5] - + r3 * (B[6] + r * B[7] + r2 * B[8] + r3 * B[9]))); - w = B[0] * r2; /* B[0] == -0.5. 
*/ - hi = r + w; - y += r - hi + w; - y += hi; -#elif LOG_POLY1_ORDER == 12 - y = r3 - * (B[1] + r * B[2] + r2 * B[3] - + r3 - * (B[4] + r * B[5] + r2 * B[6] - + r3 * (B[7] + r * B[8] + r2 * B[9] + r3 * B[10]))); -#if N <= 64 - /* Worst-case error is around 0.532 ULP. */ - w = B[0] * r2; /* B[0] == -0.5. */ - hi = r + w; - y += r - hi + w; - y += hi; -#else - /* Worst-case error is around 0.507 ULP. */ - w = r * 0x1p27; - double_t rhi = r + w - w; - double_t rlo = r - rhi; - w = rhi * rhi * B[0]; /* B[0] == -0.5. */ - hi = r + w; - lo = r - hi + w; - lo += B[0] * rlo * (rhi + r); - y += lo; - y += hi; -#endif -#endif - return eval_as_double (y); - } - if (unlikely (top - 0x0010 >= 0x7ff0 - 0x0010)) - { - /* x < 0x1p-1022 or inf or nan. */ - if (ix * 2 == 0) - return __math_divzero (1); - if (ix == asuint64 (INFINITY)) /* log(inf) == inf. */ - return x; - if ((top & 0x8000) || (top & 0x7ff0) == 0x7ff0) - return __math_invalid (x); - /* x is subnormal, normalize it. */ - ix = asuint64 (x * 0x1p52); - ix -= 52ULL << 52; - } - - /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. - The range is split into N subintervals. - The ith subinterval contains z and c is near its center. */ - tmp = ix - OFF; - i = (tmp >> (52 - LOG_TABLE_BITS)) % N; - k = (int64_t) tmp >> 52; /* arithmetic shift */ - iz = ix - (tmp & 0xfffULL << 52); - invc = T[i].invc; - logc = T[i].logc; - z = asdouble (iz); - - /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */ - /* r ~= z/c - 1, |r| < 1/(2*N). */ -#if HAVE_FAST_FMA - /* rounding error: 0x1p-55/N. */ - r = fma (z, invc, -1.0); -#else - /* rounding error: 0x1p-55/N + 0x1p-66. */ - r = (z - T2[i].chi - T2[i].clo) * invc; -#endif - kd = (double_t) k; - - /* hi + lo = r + log(c) + k*Ln2. */ - w = kd * Ln2hi + logc; - hi = w + r; - lo = w - hi + r + kd * Ln2lo; - - /* log(x) = lo + (log1p(r) - r) + hi. */ - r2 = r * r; /* rounding error: 0x1p-54/N^2. 
*/ - /* Worst case error if |y| > 0x1p-5: - 0.5 + 4.13/N + abs-poly-error*2^57 ULP (+ 0.002 ULP without fma) - Worst case error if |y| > 0x1p-4: - 0.5 + 2.06/N + abs-poly-error*2^56 ULP (+ 0.001 ULP without fma). */ -#if LOG_POLY_ORDER == 6 - y = lo + r2 * A[0] + r * r2 * (A[1] + r * A[2] + r2 * (A[3] + r * A[4])) + hi; -#elif LOG_POLY_ORDER == 7 - y = lo - + r2 - * (A[0] + r * A[1] + r2 * (A[2] + r * A[3]) - + r2 * r2 * (A[4] + r * A[5])) - + hi; -#endif - return eval_as_double (y); -} diff --git a/pl/math/log1p_data.c b/pl/math/log1p_data.c deleted file mode 100644 index 6168a0c9a21472..00000000000000 --- a/pl/math/log1p_data.c +++ /dev/null @@ -1,19 +0,0 @@ -/* - * Data used in double-precision log(1+x) function. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "math_config.h" - -/* Polynomial coefficients generated using Remez algorithm, see - log1p.sollya for details. */ -const struct log1p_data __log1p_data = { - .coeffs = {-0x1.ffffffffffffbp-2, 0x1.55555555551a9p-2, -0x1.00000000008e3p-2, - 0x1.9999999a32797p-3, -0x1.555555552fecfp-3, 0x1.249248e071e5ap-3, - -0x1.ffffff8bf8482p-4, 0x1.c71c8f07da57ap-4, -0x1.9999ca4ccb617p-4, - 0x1.7459ad2e1dfa3p-4, -0x1.554d2680a3ff2p-4, 0x1.3b4c54d487455p-4, - -0x1.2548a9ffe80e6p-4, 0x1.0f389a24b2e07p-4, -0x1.eee4db15db335p-5, - 0x1.e95b494d4a5ddp-5, -0x1.15fdf07cb7c73p-4, 0x1.0310b70800fcfp-4, - -0x1.cfa7385bdb37ep-6}}; diff --git a/pl/math/log_data.c b/pl/math/log_data.c deleted file mode 100644 index 34715e5036a39d..00000000000000 --- a/pl/math/log_data.c +++ /dev/null @@ -1,511 +0,0 @@ -/* - * Data for log. - * - * Copyright (c) 2018-2023, Arm Limited. 
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "math_config.h" - -#define N (1 << LOG_TABLE_BITS) - -const struct log_data __log_data = { -.ln2hi = 0x1.62e42fefa3800p-1, -.ln2lo = 0x1.ef35793c76730p-45, -.poly1 = { -#if LOG_POLY1_ORDER == 10 -// relative error: 0x1.32eccc6p-62 -// in -0x1p-5 0x1.1p-5 (|log(1+x)| > 0x1p-5 outside this interval) --0x1p-1, -0x1.55555555554e5p-2, --0x1.0000000000af2p-2, -0x1.9999999bbe436p-3, --0x1.55555537f9cdep-3, -0x1.24922fc8127cfp-3, --0x1.0000b7d6bb612p-3, -0x1.c806ee1ddbcafp-4, --0x1.972335a9c2d6ep-4, -#elif LOG_POLY1_ORDER == 11 -// relative error: 0x1.52c8b708p-68 -// in -0x1p-5 0x1.1p-5 (|log(1+x)| > 0x1p-5 outside this interval) --0x1p-1, -0x1.5555555555555p-2, --0x1.ffffffffffea9p-3, -0x1.999999999c4d4p-3, --0x1.55555557f5541p-3, -0x1.249248fbe33e4p-3, --0x1.ffffc9a3c825bp-4, -0x1.c71e1f204435dp-4, --0x1.9a7f26377d06ep-4, -0x1.71c30cf8f7364p-4, -#elif LOG_POLY1_ORDER == 12 -// relative error: 0x1.c04d76cp-63 -// in -0x1p-4 0x1.09p-4 (|log(1+x)| > 0x1p-4 outside the interval) --0x1p-1, -0x1.5555555555577p-2, --0x1.ffffffffffdcbp-3, -0x1.999999995dd0cp-3, --0x1.55555556745a7p-3, -0x1.24924a344de3p-3, --0x1.fffffa4423d65p-4, -0x1.c7184282ad6cap-4, --0x1.999eb43b068ffp-4, -0x1.78182f7afd085p-4, --0x1.5521375d145cdp-4, -#endif -}, -.poly = { -#if N == 64 && LOG_POLY_ORDER == 7 -// relative error: 0x1.906eb8ap-58 -// abs error: 0x1.d2cad5a8p-67 -// in -0x1.fp-8 0x1.fp-8 --0x1.0000000000027p-1, -0x1.555555555556ap-2, --0x1.fffffff0440bap-3, -0x1.99999991906c3p-3, --0x1.555c8d7e8201ep-3, -0x1.24978c59151fap-3, -#elif N == 128 && LOG_POLY_ORDER == 6 -// relative error: 0x1.926199e8p-56 -// abs error: 0x1.882ff33p-65 -// in -0x1.fp-9 0x1.fp-9 --0x1.0000000000001p-1, -0x1.555555551305bp-2, --0x1.fffffffeb459p-3, -0x1.999b324f10111p-3, --0x1.55575e506c89fp-3, -#elif N == 128 && LOG_POLY_ORDER == 7 -// relative error: 0x1.649fc4bp-64 -// abs error: 0x1.c3b5769p-74 -// in -0x1.fp-9 0x1.fp-9 
--0x1.0000000000001p-1, -0x1.5555555555556p-2, --0x1.fffffffea1a8p-3, -0x1.99999998e9139p-3, --0x1.555776801b968p-3, -0x1.2493c29331a5cp-3, -#endif -}, -/* Algorithm: - - x = 2^k z - log(x) = k ln2 + log(c) + log(z/c) - log(z/c) = poly(z/c - 1) - -where z is in [1.6p-1; 1.6p0] which is split into N subintervals and z falls -into the ith one, then table entries are computed as - - tab[i].invc = 1/c - tab[i].logc = (double)log(c) - tab2[i].chi = (double)c - tab2[i].clo = (double)(c - (double)c) - -where c is near the center of the subinterval and is chosen by trying +-2^29 -floating point invc candidates around 1/center and selecting one for which - - 1) the rounding error in 0x1.8p9 + logc is 0, - 2) the rounding error in z - chi - clo is < 0x1p-66 and - 3) the rounding error in (double)log(c) is minimized (< 0x1p-66). - -Note: 1) ensures that k*ln2hi + logc can be computed without rounding error, -2) ensures that z/c - 1 can be computed as (z - chi - clo)*invc with close to -a single rounding error when there is no fast fma for z*invc - 1, 3) ensures -that logc + poly(z/c - 1) has small error, however near x == 1 when -|log(x)| < 0x1p-4, this is not enough so that is special cased. 
*/ -.tab = { -#if N == 64 -{0x1.7242886495cd8p+0, -0x1.79e267bdfe000p-2}, -{0x1.6e1f769340dc9p+0, -0x1.6e60ee0ecb000p-2}, -{0x1.6a13ccc8f195cp+0, -0x1.63002fdbf6000p-2}, -{0x1.661ec72e86f3ap+0, -0x1.57bf76c597000p-2}, -{0x1.623fa6c447b16p+0, -0x1.4c9e07f0d2000p-2}, -{0x1.5e75bbca31702p+0, -0x1.419b42f027000p-2}, -{0x1.5ac05655adb10p+0, -0x1.36b67660e6000p-2}, -{0x1.571ed3e940191p+0, -0x1.2bef0839e4800p-2}, -{0x1.539094ac0fbbfp+0, -0x1.21445727cb000p-2}, -{0x1.5015007e7fc42p+0, -0x1.16b5ca3c3d000p-2}, -{0x1.4cab877c31cf9p+0, -0x1.0c42d3805f800p-2}, -{0x1.49539e76a88d3p+0, -0x1.01eae61b60800p-2}, -{0x1.460cbc12211dap+0, -0x1.ef5adb9fb0000p-3}, -{0x1.42d6624debe3ap+0, -0x1.db13daab99000p-3}, -{0x1.3fb0144f0d462p+0, -0x1.c6ffbe896e000p-3}, -{0x1.3c995a1f9a9b4p+0, -0x1.b31d84722d000p-3}, -{0x1.3991c23952500p+0, -0x1.9f6c3cf6eb000p-3}, -{0x1.3698df35eaa14p+0, -0x1.8beafe7f13000p-3}, -{0x1.33ae463091760p+0, -0x1.7898db878d000p-3}, -{0x1.30d190aae3d72p+0, -0x1.6574efe4ec000p-3}, -{0x1.2e025c9203c89p+0, -0x1.527e620845000p-3}, -{0x1.2b404a7244988p+0, -0x1.3fb457d798000p-3}, -{0x1.288b01dc19544p+0, -0x1.2d1615a077000p-3}, -{0x1.25e2268085f69p+0, -0x1.1aa2b431e5000p-3}, -{0x1.23456812abb74p+0, -0x1.08598f1d2b000p-3}, -{0x1.20b4703174157p+0, -0x1.ec738fee40000p-4}, -{0x1.1e2ef308b4e9bp+0, -0x1.c885768862000p-4}, -{0x1.1bb4a36b70a3fp+0, -0x1.a4e75b6a46000p-4}, -{0x1.194538e960658p+0, -0x1.8197efba9a000p-4}, -{0x1.16e0692a10ac8p+0, -0x1.5e95ad734e000p-4}, -{0x1.1485f1ba1568bp+0, -0x1.3bdf67117c000p-4}, -{0x1.12358e123ed6fp+0, -0x1.1973b744f0000p-4}, -{0x1.0fef01de37c8dp+0, -0x1.eea33446bc000p-5}, -{0x1.0db20b82be414p+0, -0x1.aaef4ab304000p-5}, -{0x1.0b7e6f67f69b3p+0, -0x1.67c962fd2c000p-5}, -{0x1.0953f342fc108p+0, -0x1.252f29acf8000p-5}, -{0x1.0732604ec956bp+0, -0x1.c63d19e9c0000p-6}, -{0x1.051980117f9b0p+0, -0x1.432ab6a388000p-6}, -{0x1.03091aa6810f1p+0, -0x1.8244357f50000p-7}, -{0x1.01010152cf066p+0, -0x1.0080a711c0000p-8}, -{0x1.fc07ef6b6e30bp-1, 0x1.fe03018e80000p-8}, 
-{0x1.f4465aa1024afp-1, 0x1.7b91986450000p-6}, -{0x1.ecc07a8fd3f5ep-1, 0x1.39e88608c8000p-5}, -{0x1.e573ad856b537p-1, 0x1.b42dc6e624000p-5}, -{0x1.de5d6dc7b8057p-1, 0x1.165372ec20000p-4}, -{0x1.d77b6498bddf7p-1, 0x1.51b07a0170000p-4}, -{0x1.d0cb580315c0fp-1, 0x1.8c3465c7ea000p-4}, -{0x1.ca4b30d1cf449p-1, 0x1.c5e544a290000p-4}, -{0x1.c3f8ef4810d8ep-1, 0x1.fec91aa0a6000p-4}, -{0x1.bdd2b8b311f44p-1, 0x1.1b72acdc5c000p-3}, -{0x1.b7d6c2eeac054p-1, 0x1.371fc65a98000p-3}, -{0x1.b20363474c8f5p-1, 0x1.526e61c1aa000p-3}, -{0x1.ac570165eeab1p-1, 0x1.6d60ffc240000p-3}, -{0x1.a6d019f331df4p-1, 0x1.87fa08a013000p-3}, -{0x1.a16d3ebc9e3c3p-1, 0x1.a23bc630c3000p-3}, -{0x1.9c2d14567ef45p-1, 0x1.bc286a3512000p-3}, -{0x1.970e4efae9169p-1, 0x1.d5c2195697000p-3}, -{0x1.920fb3bd0b802p-1, 0x1.ef0ae132d3000p-3}, -{0x1.8d3018b58699ap-1, 0x1.040259974e000p-2}, -{0x1.886e5ff170ee6p-1, 0x1.1058bd40e2000p-2}, -{0x1.83c977ad35d27p-1, 0x1.1c898c1137800p-2}, -{0x1.7f405ed16c520p-1, 0x1.2895a3e65b000p-2}, -{0x1.7ad220d0335c4p-1, 0x1.347dd8f6bd000p-2}, -{0x1.767dce53474fdp-1, 0x1.4043083cb3800p-2}, -#elif N == 128 -{0x1.734f0c3e0de9fp+0, -0x1.7cc7f79e69000p-2}, -{0x1.713786a2ce91fp+0, -0x1.76feec20d0000p-2}, -{0x1.6f26008fab5a0p+0, -0x1.713e31351e000p-2}, -{0x1.6d1a61f138c7dp+0, -0x1.6b85b38287800p-2}, -{0x1.6b1490bc5b4d1p+0, -0x1.65d5590807800p-2}, -{0x1.69147332f0cbap+0, -0x1.602d076180000p-2}, -{0x1.6719f18224223p+0, -0x1.5a8ca86909000p-2}, -{0x1.6524f99a51ed9p+0, -0x1.54f4356035000p-2}, -{0x1.63356aa8f24c4p+0, -0x1.4f637c36b4000p-2}, -{0x1.614b36b9ddc14p+0, -0x1.49da7fda85000p-2}, -{0x1.5f66452c65c4cp+0, -0x1.445923989a800p-2}, -{0x1.5d867b5912c4fp+0, -0x1.3edf439b0b800p-2}, -{0x1.5babccb5b90dep+0, -0x1.396ce448f7000p-2}, -{0x1.59d61f2d91a78p+0, -0x1.3401e17bda000p-2}, -{0x1.5805612465687p+0, -0x1.2e9e2ef468000p-2}, -{0x1.56397cee76bd3p+0, -0x1.2941b3830e000p-2}, -{0x1.54725e2a77f93p+0, -0x1.23ec58cda8800p-2}, -{0x1.52aff42064583p+0, -0x1.1e9e129279000p-2}, -{0x1.50f22dbb2bddfp+0, 
-0x1.1956d2b48f800p-2}, -{0x1.4f38f4734ded7p+0, -0x1.141679ab9f800p-2}, -{0x1.4d843cfde2840p+0, -0x1.0edd094ef9800p-2}, -{0x1.4bd3ec078a3c8p+0, -0x1.09aa518db1000p-2}, -{0x1.4a27fc3e0258ap+0, -0x1.047e65263b800p-2}, -{0x1.4880524d48434p+0, -0x1.feb224586f000p-3}, -{0x1.46dce1b192d0bp+0, -0x1.f474a7517b000p-3}, -{0x1.453d9d3391854p+0, -0x1.ea4443d103000p-3}, -{0x1.43a2744b4845ap+0, -0x1.e020d44e9b000p-3}, -{0x1.420b54115f8fbp+0, -0x1.d60a22977f000p-3}, -{0x1.40782da3ef4b1p+0, -0x1.cc00104959000p-3}, -{0x1.3ee8f5d57fe8fp+0, -0x1.c202956891000p-3}, -{0x1.3d5d9a00b4ce9p+0, -0x1.b81178d811000p-3}, -{0x1.3bd60c010c12bp+0, -0x1.ae2c9ccd3d000p-3}, -{0x1.3a5242b75dab8p+0, -0x1.a45402e129000p-3}, -{0x1.38d22cd9fd002p+0, -0x1.9a877681df000p-3}, -{0x1.3755bc5847a1cp+0, -0x1.90c6d69483000p-3}, -{0x1.35dce49ad36e2p+0, -0x1.87120a645c000p-3}, -{0x1.34679984dd440p+0, -0x1.7d68fb4143000p-3}, -{0x1.32f5cceffcb24p+0, -0x1.73cb83c627000p-3}, -{0x1.3187775a10d49p+0, -0x1.6a39a9b376000p-3}, -{0x1.301c8373e3990p+0, -0x1.60b3154b7a000p-3}, -{0x1.2eb4ebb95f841p+0, -0x1.5737d76243000p-3}, -{0x1.2d50a0219a9d1p+0, -0x1.4dc7b8fc23000p-3}, -{0x1.2bef9a8b7fd2ap+0, -0x1.4462c51d20000p-3}, -{0x1.2a91c7a0c1babp+0, -0x1.3b08abc830000p-3}, -{0x1.293726014b530p+0, -0x1.31b996b490000p-3}, -{0x1.27dfa5757a1f5p+0, -0x1.2875490a44000p-3}, -{0x1.268b39b1d3bbfp+0, -0x1.1f3b9f879a000p-3}, -{0x1.2539d838ff5bdp+0, -0x1.160c8252ca000p-3}, -{0x1.23eb7aac9083bp+0, -0x1.0ce7f57f72000p-3}, -{0x1.22a012ba940b6p+0, -0x1.03cdc49fea000p-3}, -{0x1.2157996cc4132p+0, -0x1.f57bdbc4b8000p-4}, -{0x1.201201dd2fc9bp+0, -0x1.e370896404000p-4}, -{0x1.1ecf4494d480bp+0, -0x1.d17983ef94000p-4}, -{0x1.1d8f5528f6569p+0, -0x1.bf9674ed8a000p-4}, -{0x1.1c52311577e7cp+0, -0x1.adc79202f6000p-4}, -{0x1.1b17c74cb26e9p+0, -0x1.9c0c3e7288000p-4}, -{0x1.19e010c2c1ab6p+0, -0x1.8a646b372c000p-4}, -{0x1.18ab07bb670bdp+0, -0x1.78d01b3ac0000p-4}, -{0x1.1778a25efbcb6p+0, -0x1.674f145380000p-4}, -{0x1.1648d354c31dap+0, -0x1.55e0e6d878000p-4}, 
-{0x1.151b990275fddp+0, -0x1.4485cdea1e000p-4}, -{0x1.13f0ea432d24cp+0, -0x1.333d94d6aa000p-4}, -{0x1.12c8b7210f9dap+0, -0x1.22079f8c56000p-4}, -{0x1.11a3028ecb531p+0, -0x1.10e4698622000p-4}, -{0x1.107fbda8434afp+0, -0x1.ffa6c6ad20000p-5}, -{0x1.0f5ee0f4e6bb3p+0, -0x1.dda8d4a774000p-5}, -{0x1.0e4065d2a9fcep+0, -0x1.bbcece4850000p-5}, -{0x1.0d244632ca521p+0, -0x1.9a1894012c000p-5}, -{0x1.0c0a77ce2981ap+0, -0x1.788583302c000p-5}, -{0x1.0af2f83c636d1p+0, -0x1.5715e67d68000p-5}, -{0x1.09ddb98a01339p+0, -0x1.35c8a49658000p-5}, -{0x1.08cabaf52e7dfp+0, -0x1.149e364154000p-5}, -{0x1.07b9f2f4e28fbp+0, -0x1.e72c082eb8000p-6}, -{0x1.06ab58c358f19p+0, -0x1.a55f152528000p-6}, -{0x1.059eea5ecf92cp+0, -0x1.63d62cf818000p-6}, -{0x1.04949cdd12c90p+0, -0x1.228fb8caa0000p-6}, -{0x1.038c6c6f0ada9p+0, -0x1.c317b20f90000p-7}, -{0x1.02865137932a9p+0, -0x1.419355daa0000p-7}, -{0x1.0182427ea7348p+0, -0x1.81203c2ec0000p-8}, -{0x1.008040614b195p+0, -0x1.0040979240000p-9}, -{0x1.fe01ff726fa1ap-1, 0x1.feff384900000p-9}, -{0x1.fa11cc261ea74p-1, 0x1.7dc41353d0000p-7}, -{0x1.f6310b081992ep-1, 0x1.3cea3c4c28000p-6}, -{0x1.f25f63ceeadcdp-1, 0x1.b9fc114890000p-6}, -{0x1.ee9c8039113e7p-1, 0x1.1b0d8ce110000p-5}, -{0x1.eae8078cbb1abp-1, 0x1.58a5bd001c000p-5}, -{0x1.e741aa29d0c9bp-1, 0x1.95c8340d88000p-5}, -{0x1.e3a91830a99b5p-1, 0x1.d276aef578000p-5}, -{0x1.e01e009609a56p-1, 0x1.07598e598c000p-4}, -{0x1.dca01e577bb98p-1, 0x1.253f5e30d2000p-4}, -{0x1.d92f20b7c9103p-1, 0x1.42edd8b380000p-4}, -{0x1.d5cac66fb5ccep-1, 0x1.606598757c000p-4}, -{0x1.d272caa5ede9dp-1, 0x1.7da76356a0000p-4}, -{0x1.cf26e3e6b2ccdp-1, 0x1.9ab434e1c6000p-4}, -{0x1.cbe6da2a77902p-1, 0x1.b78c7bb0d6000p-4}, -{0x1.c8b266d37086dp-1, 0x1.d431332e72000p-4}, -{0x1.c5894bd5d5804p-1, 0x1.f0a3171de6000p-4}, -{0x1.c26b533bb9f8cp-1, 0x1.067152b914000p-3}, -{0x1.bf583eeece73fp-1, 0x1.147858292b000p-3}, -{0x1.bc4fd75db96c1p-1, 0x1.2266ecdca3000p-3}, -{0x1.b951e0c864a28p-1, 0x1.303d7a6c55000p-3}, -{0x1.b65e2c5ef3e2cp-1, 0x1.3dfc33c331000p-3}, 
-{0x1.b374867c9888bp-1, 0x1.4ba366b7a8000p-3}, -{0x1.b094b211d304ap-1, 0x1.5933928d1f000p-3}, -{0x1.adbe885f2ef7ep-1, 0x1.66acd2418f000p-3}, -{0x1.aaf1d31603da2p-1, 0x1.740f8ec669000p-3}, -{0x1.a82e63fd358a7p-1, 0x1.815c0f51af000p-3}, -{0x1.a5740ef09738bp-1, 0x1.8e92954f68000p-3}, -{0x1.a2c2a90ab4b27p-1, 0x1.9bb3602f84000p-3}, -{0x1.a01a01393f2d1p-1, 0x1.a8bed1c2c0000p-3}, -{0x1.9d79f24db3c1bp-1, 0x1.b5b515c01d000p-3}, -{0x1.9ae2505c7b190p-1, 0x1.c2967ccbcc000p-3}, -{0x1.9852ef297ce2fp-1, 0x1.cf635d5486000p-3}, -{0x1.95cbaeea44b75p-1, 0x1.dc1bd3446c000p-3}, -{0x1.934c69de74838p-1, 0x1.e8c01b8cfe000p-3}, -{0x1.90d4f2f6752e6p-1, 0x1.f5509c0179000p-3}, -{0x1.8e6528effd79dp-1, 0x1.00e6c121fb800p-2}, -{0x1.8bfce9fcc007cp-1, 0x1.071b80e93d000p-2}, -{0x1.899c0dabec30ep-1, 0x1.0d46b9e867000p-2}, -{0x1.87427aa2317fbp-1, 0x1.13687334bd000p-2}, -{0x1.84f00acb39a08p-1, 0x1.1980d67234800p-2}, -{0x1.82a49e8653e55p-1, 0x1.1f8ffe0cc8000p-2}, -{0x1.8060195f40260p-1, 0x1.2595fd7636800p-2}, -{0x1.7e22563e0a329p-1, 0x1.2b9300914a800p-2}, -{0x1.7beb377dcb5adp-1, 0x1.3187210436000p-2}, -{0x1.79baa679725c2p-1, 0x1.377266dec1800p-2}, -{0x1.77907f2170657p-1, 0x1.3d54ffbaf3000p-2}, -{0x1.756cadbd6130cp-1, 0x1.432eee32fe000p-2}, -#endif -}, -#if !HAVE_FAST_FMA -.tab2 = { -#if N == 64 -{0x1.61ffff94c4fecp-1, -0x1.9fe4fc998f325p-56}, -{0x1.66000020377ddp-1, 0x1.e804c7a9519f2p-55}, -{0x1.6a00004c41678p-1, 0x1.902c675d9ecfep-55}, -{0x1.6dffff7384f87p-1, -0x1.2fd6b95e55043p-56}, -{0x1.720000b37216ep-1, 0x1.802bc8d437043p-55}, -{0x1.75ffffbeb3c9dp-1, 0x1.6047ad0a0d4e4p-57}, -{0x1.7a0000628daep-1, -0x1.e00434b49313dp-56}, -{0x1.7dffffd7abd1ap-1, -0x1.6015f8a083576p-56}, -{0x1.81ffffdf40c54p-1, 0x1.7f54bf76a42c9p-57}, -{0x1.860000f334e11p-1, 0x1.60054cb5344d7p-56}, -{0x1.8a0001238aca7p-1, 0x1.c03c9bd132f55p-57}, -{0x1.8dffffb81d212p-1, -0x1.001e519f2764fp-55}, -{0x1.92000086adc7cp-1, 0x1.1fe40f88f49c6p-55}, -{0x1.960000135d8eap-1, -0x1.f832268dc3095p-55}, -{0x1.99ffff9435acp-1, 
0x1.7031d8b835edcp-56}, -{0x1.9e00003478565p-1, -0x1.0030b221ce3eep-58}, -{0x1.a20000b592948p-1, 0x1.8fd2f1dbd4639p-55}, -{0x1.a600000ad0bcfp-1, 0x1.901d6a974e6bep-55}, -{0x1.a9ffff55953a5p-1, 0x1.a07556192db98p-57}, -{0x1.adffff29ce03dp-1, -0x1.fff0717ec71c2p-56}, -{0x1.b1ffff34f3ac8p-1, 0x1.8005573de89d1p-57}, -{0x1.b60000894c55bp-1, -0x1.ff2fb51b044c7p-57}, -{0x1.b9fffef45ec7dp-1, -0x1.9ff7c4e8730fp-56}, -{0x1.be0000cda7b2ap-1, 0x1.57d058dbf3c1dp-55}, -{0x1.c1ffff2c57917p-1, 0x1.7e66d7e48dbc9p-58}, -{0x1.c60000ea5b82ap-1, -0x1.47f5e132ed4bep-55}, -{0x1.ca0001121ae98p-1, -0x1.40958c8d5e00ap-58}, -{0x1.ce0000f9241cbp-1, -0x1.7da063caa81c8p-59}, -{0x1.d1fffe8be95a4p-1, -0x1.82e3a411afcd9p-59}, -{0x1.d5ffff035932bp-1, -0x1.00f901b3fe87dp-58}, -{0x1.d9fffe8b54ba7p-1, 0x1.ffef55d6e3a4p-55}, -{0x1.de0000ad95d19p-1, 0x1.5feb2efd4c7c7p-55}, -{0x1.e1fffe925ce47p-1, 0x1.c8085484eaf08p-55}, -{0x1.e5fffe3ddf853p-1, -0x1.fd5ed02c5cadp-60}, -{0x1.e9fffed0a0e5fp-1, -0x1.a80aaef411586p-55}, -{0x1.ee00008f82eep-1, -0x1.b000aeaf97276p-55}, -{0x1.f20000a22d2f4p-1, -0x1.8f8906e13eba3p-56}, -{0x1.f5fffee35b57dp-1, 0x1.1fdd33b2d3714p-57}, -{0x1.fa00014eec3a6p-1, -0x1.3ee0b7a18c1a5p-58}, -{0x1.fdffff5daa89fp-1, -0x1.c1e24c8e3b503p-58}, -{0x1.0200005b93349p+0, -0x1.50197fe6bedcap-54}, -{0x1.05ffff9d597acp+0, 0x1.20160d062d0dcp-55}, -{0x1.0a00005687a63p+0, -0x1.27f3f9307696ep-54}, -{0x1.0dffff779164ep+0, 0x1.b7eb40bb9c4f4p-54}, -{0x1.12000044a0aa8p+0, 0x1.efbc914d512c4p-55}, -{0x1.16000069685bcp+0, -0x1.c0bea3eb2d82cp-57}, -{0x1.1a000093f0d78p+0, 0x1.1fecbf1e8c52p-54}, -{0x1.1dffffb2b1457p+0, -0x1.3fc91365637d6p-55}, -{0x1.2200008824a1p+0, -0x1.dff7e9feb578ap-54}, -{0x1.25ffffeef953p+0, -0x1.b00a61ec912f7p-55}, -{0x1.2a0000a1e7783p+0, 0x1.60048318b0483p-56}, -{0x1.2e0000853d4c7p+0, -0x1.77fbedf2c8cf3p-54}, -{0x1.320000324c55bp+0, 0x1.f81983997354fp-54}, -{0x1.360000594f796p+0, -0x1.cfe4beff900a9p-54}, -{0x1.3a0000a4c1c0fp+0, 0x1.07dbb2e268d0ep-54}, -{0x1.3e0000751c61bp+0, 
0x1.80583ed1c566ep-56}, -{0x1.42000069e8a9fp+0, 0x1.f01f1edf82045p-54}, -{0x1.460000b5a1e34p+0, -0x1.dfdf0cf45c14ap-55}, -{0x1.4a0000187e513p+0, 0x1.401306b83a98dp-55}, -{0x1.4dffff3ba420bp+0, 0x1.9fc6539a6454ep-56}, -{0x1.51fffffe391c9p+0, -0x1.601ef3353ac83p-54}, -{0x1.560000e342455p+0, 0x1.3fb7fac8ac151p-55}, -{0x1.59ffffc39676fp+0, 0x1.4fe7dd6659cc2p-55}, -{0x1.5dfffff10ef42p+0, -0x1.48154cb592bcbp-54}, -#elif N == 128 -{0x1.61000014fb66bp-1, 0x1.e026c91425b3cp-56}, -{0x1.63000034db495p-1, 0x1.dbfea48005d41p-55}, -{0x1.650000d94d478p-1, 0x1.e7fa786d6a5b7p-55}, -{0x1.67000074e6fadp-1, 0x1.1fcea6b54254cp-57}, -{0x1.68ffffedf0faep-1, -0x1.c7e274c590efdp-56}, -{0x1.6b0000763c5bcp-1, -0x1.ac16848dcda01p-55}, -{0x1.6d0001e5cc1f6p-1, 0x1.33f1c9d499311p-55}, -{0x1.6efffeb05f63ep-1, -0x1.e80041ae22d53p-56}, -{0x1.710000e86978p-1, 0x1.bff6671097952p-56}, -{0x1.72ffffc67e912p-1, 0x1.c00e226bd8724p-55}, -{0x1.74fffdf81116ap-1, -0x1.e02916ef101d2p-57}, -{0x1.770000f679c9p-1, -0x1.7fc71cd549c74p-57}, -{0x1.78ffffa7ec835p-1, 0x1.1bec19ef50483p-55}, -{0x1.7affffe20c2e6p-1, -0x1.07e1729cc6465p-56}, -{0x1.7cfffed3fc9p-1, -0x1.08072087b8b1cp-55}, -{0x1.7efffe9261a76p-1, 0x1.dc0286d9df9aep-55}, -{0x1.81000049ca3e8p-1, 0x1.97fd251e54c33p-55}, -{0x1.8300017932c8fp-1, -0x1.afee9b630f381p-55}, -{0x1.850000633739cp-1, 0x1.9bfbf6b6535bcp-55}, -{0x1.87000204289c6p-1, -0x1.bbf65f3117b75p-55}, -{0x1.88fffebf57904p-1, -0x1.9006ea23dcb57p-55}, -{0x1.8b00022bc04dfp-1, -0x1.d00df38e04b0ap-56}, -{0x1.8cfffe50c1b8ap-1, -0x1.8007146ff9f05p-55}, -{0x1.8effffc918e43p-1, 0x1.3817bd07a7038p-55}, -{0x1.910001efa5fc7p-1, 0x1.93e9176dfb403p-55}, -{0x1.9300013467bb9p-1, 0x1.f804e4b980276p-56}, -{0x1.94fffe6ee076fp-1, -0x1.f7ef0d9ff622ep-55}, -{0x1.96fffde3c12d1p-1, -0x1.082aa962638bap-56}, -{0x1.98ffff4458a0dp-1, -0x1.7801b9164a8efp-55}, -{0x1.9afffdd982e3ep-1, -0x1.740e08a5a9337p-55}, -{0x1.9cfffed49fb66p-1, 0x1.fce08c19bep-60}, -{0x1.9f00020f19c51p-1, -0x1.a3faa27885b0ap-55}, -{0x1.a10001145b006p-1, 
0x1.4ff489958da56p-56}, -{0x1.a300007bbf6fap-1, 0x1.cbeab8a2b6d18p-55}, -{0x1.a500010971d79p-1, 0x1.8fecadd78793p-55}, -{0x1.a70001df52e48p-1, -0x1.f41763dd8abdbp-55}, -{0x1.a90001c593352p-1, -0x1.ebf0284c27612p-55}, -{0x1.ab0002a4f3e4bp-1, -0x1.9fd043cff3f5fp-57}, -{0x1.acfffd7ae1ed1p-1, -0x1.23ee7129070b4p-55}, -{0x1.aefffee510478p-1, 0x1.a063ee00edea3p-57}, -{0x1.b0fffdb650d5bp-1, 0x1.a06c8381f0ab9p-58}, -{0x1.b2ffffeaaca57p-1, -0x1.9011e74233c1dp-56}, -{0x1.b4fffd995badcp-1, -0x1.9ff1068862a9fp-56}, -{0x1.b7000249e659cp-1, 0x1.aff45d0864f3ep-55}, -{0x1.b8ffff987164p-1, 0x1.cfe7796c2c3f9p-56}, -{0x1.bafffd204cb4fp-1, -0x1.3ff27eef22bc4p-57}, -{0x1.bcfffd2415c45p-1, -0x1.cffb7ee3bea21p-57}, -{0x1.beffff86309dfp-1, -0x1.14103972e0b5cp-55}, -{0x1.c0fffe1b57653p-1, 0x1.bc16494b76a19p-55}, -{0x1.c2ffff1fa57e3p-1, -0x1.4feef8d30c6edp-57}, -{0x1.c4fffdcbfe424p-1, -0x1.43f68bcec4775p-55}, -{0x1.c6fffed54b9f7p-1, 0x1.47ea3f053e0ecp-55}, -{0x1.c8fffeb998fd5p-1, 0x1.383068df992f1p-56}, -{0x1.cb0002125219ap-1, -0x1.8fd8e64180e04p-57}, -{0x1.ccfffdd94469cp-1, 0x1.e7ebe1cc7ea72p-55}, -{0x1.cefffeafdc476p-1, 0x1.ebe39ad9f88fep-55}, -{0x1.d1000169af82bp-1, 0x1.57d91a8b95a71p-56}, -{0x1.d30000d0ff71dp-1, 0x1.9c1906970c7dap-55}, -{0x1.d4fffea790fc4p-1, -0x1.80e37c558fe0cp-58}, -{0x1.d70002edc87e5p-1, -0x1.f80d64dc10f44p-56}, -{0x1.d900021dc82aap-1, -0x1.47c8f94fd5c5cp-56}, -{0x1.dafffd86b0283p-1, 0x1.c7f1dc521617ep-55}, -{0x1.dd000296c4739p-1, 0x1.8019eb2ffb153p-55}, -{0x1.defffe54490f5p-1, 0x1.e00d2c652cc89p-57}, -{0x1.e0fffcdabf694p-1, -0x1.f8340202d69d2p-56}, -{0x1.e2fffdb52c8ddp-1, 0x1.b00c1ca1b0864p-56}, -{0x1.e4ffff24216efp-1, 0x1.2ffa8b094ab51p-56}, -{0x1.e6fffe88a5e11p-1, -0x1.7f673b1efbe59p-58}, -{0x1.e9000119eff0dp-1, -0x1.4808d5e0bc801p-55}, -{0x1.eafffdfa51744p-1, 0x1.80006d54320b5p-56}, -{0x1.ed0001a127fa1p-1, -0x1.002f860565c92p-58}, -{0x1.ef00007babcc4p-1, -0x1.540445d35e611p-55}, -{0x1.f0ffff57a8d02p-1, -0x1.ffb3139ef9105p-59}, -{0x1.f30001ee58ac7p-1, 
0x1.a81acf2731155p-55}, -{0x1.f4ffff5823494p-1, 0x1.a3f41d4d7c743p-55}, -{0x1.f6ffffca94c6bp-1, -0x1.202f41c987875p-57}, -{0x1.f8fffe1f9c441p-1, 0x1.77dd1f477e74bp-56}, -{0x1.fafffd2e0e37ep-1, -0x1.f01199a7ca331p-57}, -{0x1.fd0001c77e49ep-1, 0x1.181ee4bceacb1p-56}, -{0x1.feffff7e0c331p-1, -0x1.e05370170875ap-57}, -{0x1.00ffff465606ep+0, -0x1.a7ead491c0adap-55}, -{0x1.02ffff3867a58p+0, -0x1.77f69c3fcb2ep-54}, -{0x1.04ffffdfc0d17p+0, 0x1.7bffe34cb945bp-54}, -{0x1.0700003cd4d82p+0, 0x1.20083c0e456cbp-55}, -{0x1.08ffff9f2cbe8p+0, -0x1.dffdfbe37751ap-57}, -{0x1.0b000010cda65p+0, -0x1.13f7faee626ebp-54}, -{0x1.0d00001a4d338p+0, 0x1.07dfa79489ff7p-55}, -{0x1.0effffadafdfdp+0, -0x1.7040570d66bcp-56}, -{0x1.110000bbafd96p+0, 0x1.e80d4846d0b62p-55}, -{0x1.12ffffae5f45dp+0, 0x1.dbffa64fd36efp-54}, -{0x1.150000dd59ad9p+0, 0x1.a0077701250aep-54}, -{0x1.170000f21559ap+0, 0x1.dfdf9e2e3deeep-55}, -{0x1.18ffffc275426p+0, 0x1.10030dc3b7273p-54}, -{0x1.1b000123d3c59p+0, 0x1.97f7980030188p-54}, -{0x1.1cffff8299eb7p+0, -0x1.5f932ab9f8c67p-57}, -{0x1.1effff48ad4p+0, 0x1.37fbf9da75bebp-54}, -{0x1.210000c8b86a4p+0, 0x1.f806b91fd5b22p-54}, -{0x1.2300003854303p+0, 0x1.3ffc2eb9fbf33p-54}, -{0x1.24fffffbcf684p+0, 0x1.601e77e2e2e72p-56}, -{0x1.26ffff52921d9p+0, 0x1.ffcbb767f0c61p-56}, -{0x1.2900014933a3cp+0, -0x1.202ca3c02412bp-56}, -{0x1.2b00014556313p+0, -0x1.2808233f21f02p-54}, -{0x1.2cfffebfe523bp+0, -0x1.8ff7e384fdcf2p-55}, -{0x1.2f0000bb8ad96p+0, -0x1.5ff51503041c5p-55}, -{0x1.30ffffb7ae2afp+0, -0x1.10071885e289dp-55}, -{0x1.32ffffeac5f7fp+0, -0x1.1ff5d3fb7b715p-54}, -{0x1.350000ca66756p+0, 0x1.57f82228b82bdp-54}, -{0x1.3700011fbf721p+0, 0x1.000bac40dd5ccp-55}, -{0x1.38ffff9592fb9p+0, -0x1.43f9d2db2a751p-54}, -{0x1.3b00004ddd242p+0, 0x1.57f6b707638e1p-55}, -{0x1.3cffff5b2c957p+0, 0x1.a023a10bf1231p-56}, -{0x1.3efffeab0b418p+0, 0x1.87f6d66b152bp-54}, -{0x1.410001532aff4p+0, 0x1.7f8375f198524p-57}, -{0x1.4300017478b29p+0, 0x1.301e672dc5143p-55}, -{0x1.44fffe795b463p+0, 
0x1.9ff69b8b2895ap-55}, -{0x1.46fffe80475ep+0, -0x1.5c0b19bc2f254p-54}, -{0x1.48fffef6fc1e7p+0, 0x1.b4009f23a2a72p-54}, -{0x1.4afffe5bea704p+0, -0x1.4ffb7bf0d7d45p-54}, -{0x1.4d000171027dep+0, -0x1.9c06471dc6a3dp-54}, -{0x1.4f0000ff03ee2p+0, 0x1.77f890b85531cp-54}, -{0x1.5100012dc4bd1p+0, 0x1.004657166a436p-57}, -{0x1.530001605277ap+0, -0x1.6bfcece233209p-54}, -{0x1.54fffecdb704cp+0, -0x1.902720505a1d7p-55}, -{0x1.56fffef5f54a9p+0, 0x1.bbfe60ec96412p-54}, -{0x1.5900017e61012p+0, 0x1.87ec581afef9p-55}, -{0x1.5b00003c93e92p+0, -0x1.f41080abf0ccp-54}, -{0x1.5d0001d4919bcp+0, -0x1.8812afb254729p-54}, -{0x1.5efffe7b87a89p+0, -0x1.47eb780ed6904p-54}, -#endif -}, -#endif /* !HAVE_FAST_FMA */ -}; diff --git a/pl/math/logf.c b/pl/math/logf.c deleted file mode 100644 index 17a74ed6d28f10..00000000000000 --- a/pl/math/logf.c +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Single-precision log function. - * - * Copyright (c) 2017-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include -#include -#include "math_config.h" - -/* -LOGF_TABLE_BITS = 4 -LOGF_POLY_ORDER = 4 - -ULP error: 0.818 (nearest rounding.) -Relative error: 1.957 * 2^-26 (before rounding.) -*/ - -#define T __logf_data.tab -#define A __logf_data.poly -#define Ln2 __logf_data.ln2 -#define N (1 << LOGF_TABLE_BITS) -#define OFF 0x3f330000 - -float -optr_aor_log_f32 (float x) -{ - /* double_t for better performance on targets with FLT_EVAL_METHOD==2. */ - double_t z, r, r2, y, y0, invc, logc; - uint32_t ix, iz, tmp; - int k, i; - - ix = asuint (x); -#if WANT_ROUNDING - /* Fix sign of zero with downward rounding when x==1. */ - if (unlikely (ix == 0x3f800000)) - return 0; -#endif - if (unlikely (ix - 0x00800000 >= 0x7f800000 - 0x00800000)) - { - /* x < 0x1p-126 or inf or nan. */ - if (ix * 2 == 0) - return __math_divzerof (1); - if (ix == 0x7f800000) /* log(inf) == inf. 
*/ - return x; - if ((ix & 0x80000000) || ix * 2 >= 0xff000000) - return __math_invalidf (x); - /* x is subnormal, normalize it. */ - ix = asuint (x * 0x1p23f); - ix -= 23 << 23; - } - - /* x = 2^k z; where z is in range [OFF,2*OFF] and exact. - The range is split into N subintervals. - The ith subinterval contains z and c is near its center. */ - tmp = ix - OFF; - i = (tmp >> (23 - LOGF_TABLE_BITS)) % N; - k = (int32_t) tmp >> 23; /* arithmetic shift */ - iz = ix - (tmp & 0x1ff << 23); - invc = T[i].invc; - logc = T[i].logc; - z = (double_t) asfloat (iz); - - /* log(x) = log1p(z/c-1) + log(c) + k*Ln2 */ - r = z * invc - 1; - y0 = logc + (double_t) k * Ln2; - - /* Pipelined polynomial evaluation to approximate log1p(r). */ - r2 = r * r; - y = A[1] * r + A[2]; - y = A[0] * r2 + y; - y = y * r2 + (y0 + r); - return eval_as_float (y); -} diff --git a/pl/math/logf_data.c b/pl/math/logf_data.c deleted file mode 100644 index 97d9eb8d009779..00000000000000 --- a/pl/math/logf_data.c +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Data definition for logf and log10f. - * - * Copyright (c) 2017-2023, Arm Limited. 
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "math_config.h" - -const struct logf_data __logf_data = { - .tab = - { - {0x1.661ec79f8f3bep+0, -0x1.57bf7808caadep-2}, - {0x1.571ed4aaf883dp+0, -0x1.2bef0a7c06ddbp-2}, - {0x1.49539f0f010bp+0, -0x1.01eae7f513a67p-2}, - {0x1.3c995b0b80385p+0, -0x1.b31d8a68224e9p-3}, - {0x1.30d190c8864a5p+0, -0x1.6574f0ac07758p-3}, - {0x1.25e227b0b8eap+0, -0x1.1aa2bc79c81p-3}, - {0x1.1bb4a4a1a343fp+0, -0x1.a4e76ce8c0e5ep-4}, - {0x1.12358f08ae5bap+0, -0x1.1973c5a611cccp-4}, - {0x1.0953f419900a7p+0, -0x1.252f438e10c1ep-5}, - {0x1p+0, 0x0p+0}, - {0x1.e608cfd9a47acp-1, 0x1.aa5aa5df25984p-5}, - {0x1.ca4b31f026aap-1, 0x1.c5e53aa362eb4p-4}, - {0x1.b2036576afce6p-1, 0x1.526e57720db08p-3}, - {0x1.9c2d163a1aa2dp-1, 0x1.bc2860d22477p-3}, - {0x1.886e6037841edp-1, 0x1.1058bc8a07ee1p-2}, - {0x1.767dcf5534862p-1, 0x1.4043057b6ee09p-2}, - }, - .ln2 = 0x1.62e42fefa39efp-1, - .invln10 = 0x1.bcb7b1526e50ep-2, - .poly = { - -0x1.00ea348b88334p-2, - 0x1.5575b0be00b6ap-2, - -0x1.ffffef20a4123p-2, - }}; diff --git a/pl/math/math_config.h b/pl/math/math_config.h deleted file mode 100644 index c3dd8f2db8c7b0..00000000000000 --- a/pl/math/math_config.h +++ /dev/null @@ -1,624 +0,0 @@ -/* - * Configuration for math routines. - * - * Copyright (c) 2017-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#ifndef _MATH_CONFIG_H -#define _MATH_CONFIG_H - -#include -#include - -#ifndef WANT_ROUNDING -/* If defined to 1, return correct results for special cases in non-nearest - rounding modes (logf (1.0f) returns 0.0f with FE_DOWNWARD rather than - -0.0f). This may be set to 0 if there is no fenv support or if math - functions only get called in round to nearest mode. */ -# define WANT_ROUNDING 1 -#endif -#ifndef WANT_ERRNO -/* If defined to 1, set errno in math functions according to ISO C. Many math - libraries do not set errno, so this is 0 by default. 
It may need to be - set to 1 if math.h has (math_errhandling & MATH_ERRNO) != 0. */ -# define WANT_ERRNO 0 -#endif -#ifndef WANT_SIMD_EXCEPT -/* If defined to 1, trigger fp exceptions in vector routines, consistently with - behaviour expected from the corresponding scalar routine. */ -# define WANT_SIMD_EXCEPT 0 -#endif - -/* Compiler can inline round as a single instruction. */ -#ifndef HAVE_FAST_ROUND -# if __aarch64__ -# define HAVE_FAST_ROUND 1 -# else -# define HAVE_FAST_ROUND 0 -# endif -#endif - -/* Compiler can inline lround, but not (long)round(x). */ -#ifndef HAVE_FAST_LROUND -# if __aarch64__ && (100 * __GNUC__ + __GNUC_MINOR__) >= 408 \ - && __NO_MATH_ERRNO__ -# define HAVE_FAST_LROUND 1 -# else -# define HAVE_FAST_LROUND 0 -# endif -#endif - -/* Compiler can inline fma as a single instruction. */ -#ifndef HAVE_FAST_FMA -# if defined FP_FAST_FMA || __aarch64__ -# define HAVE_FAST_FMA 1 -# else -# define HAVE_FAST_FMA 0 -# endif -#endif - -/* Provide *_finite symbols and some of the glibc hidden symbols - so libmathlib can be used with binaries compiled against glibc - to interpose math functions with both static and dynamic linking. */ -#ifndef USE_GLIBC_ABI -# if __GNUC__ -# define USE_GLIBC_ABI 1 -# else -# define USE_GLIBC_ABI 0 -# endif -#endif - -/* Optionally used extensions. 
*/ -#ifdef __GNUC__ -# define HIDDEN __attribute__ ((__visibility__ ("hidden"))) -# define NOINLINE __attribute__ ((noinline)) -# define UNUSED __attribute__ ((unused)) -# define likely(x) __builtin_expect (!!(x), 1) -# define unlikely(x) __builtin_expect (x, 0) -# if __GNUC__ >= 9 -# define attribute_copy(f) __attribute__ ((copy (f))) -# else -# define attribute_copy(f) -# endif -# define strong_alias(f, a) \ - extern __typeof (f) a __attribute__ ((alias (#f))) attribute_copy (f); -# define hidden_alias(f, a) \ - extern __typeof (f) a __attribute__ ((alias (#f), visibility ("hidden"))) \ - attribute_copy (f); -#else -# define HIDDEN -# define NOINLINE -# define UNUSED -# define likely(x) (x) -# define unlikely(x) (x) -#endif - -/* Return ptr but hide its value from the compiler so accesses through it - cannot be optimized based on the contents. */ -#define ptr_barrier(ptr) \ - ({ \ - __typeof (ptr) __ptr = (ptr); \ - __asm("" : "+r"(__ptr)); \ - __ptr; \ - }) - -/* Symbol renames to avoid libc conflicts. */ -#define __math_oflowf arm_math_oflowf -#define __math_uflowf arm_math_uflowf -#define __math_may_uflowf arm_math_may_uflowf -#define __math_divzerof arm_math_divzerof -#define __math_oflow arm_math_oflow -#define __math_uflow arm_math_uflow -#define __math_may_uflow arm_math_may_uflow -#define __math_divzero arm_math_divzero -#define __math_invalidf arm_math_invalidf -#define __math_invalid arm_math_invalid -#define __math_check_oflow arm_math_check_oflow -#define __math_check_uflow arm_math_check_uflow -#define __math_check_oflowf arm_math_check_oflowf -#define __math_check_uflowf arm_math_check_uflowf - -#if HAVE_FAST_ROUND -/* When set, the roundtoint and converttoint functions are provided with - the semantics documented below. */ -# define TOINT_INTRINSICS 1 - -/* Round x to nearest int in all rounding modes, ties have to be rounded - consistently with converttoint so the results match. 
If the result - would be outside of [-2^31, 2^31-1] then the semantics is unspecified. */ -static inline double_t -roundtoint (double_t x) -{ - return round (x); -} - -/* Convert x to nearest int in all rounding modes, ties have to be rounded - consistently with roundtoint. If the result is not representible in an - int32_t then the semantics is unspecified. */ -static inline int32_t -converttoint (double_t x) -{ -# if HAVE_FAST_LROUND - return lround (x); -# else - return (long) round (x); -# endif -} -#endif - -static inline uint32_t -asuint (float f) -{ - union - { - float f; - uint32_t i; - } u = { f }; - return u.i; -} - -static inline float -asfloat (uint32_t i) -{ - union - { - uint32_t i; - float f; - } u = { i }; - return u.f; -} - -static inline uint64_t -asuint64 (double f) -{ - union - { - double f; - uint64_t i; - } u = { f }; - return u.i; -} - -static inline double -asdouble (uint64_t i) -{ - union - { - uint64_t i; - double f; - } u = { i }; - return u.f; -} - -#ifndef IEEE_754_2008_SNAN -# define IEEE_754_2008_SNAN 1 -#endif -static inline int -issignalingf_inline (float x) -{ - uint32_t ix = asuint (x); - if (!IEEE_754_2008_SNAN) - return (ix & 0x7fc00000) == 0x7fc00000; - return 2 * (ix ^ 0x00400000) > 2u * 0x7fc00000; -} - -static inline int -issignaling_inline (double x) -{ - uint64_t ix = asuint64 (x); - if (!IEEE_754_2008_SNAN) - return (ix & 0x7ff8000000000000) == 0x7ff8000000000000; - return 2 * (ix ^ 0x0008000000000000) > 2 * 0x7ff8000000000000ULL; -} - -#if __aarch64__ && __GNUC__ -/* Prevent the optimization of a floating-point expression. */ -static inline float -opt_barrier_float (float x) -{ - __asm__ __volatile__ ("" : "+w" (x)); - return x; -} -static inline double -opt_barrier_double (double x) -{ - __asm__ __volatile__ ("" : "+w" (x)); - return x; -} -/* Force the evaluation of a floating-point expression for its side-effect. 
*/ -static inline void -force_eval_float (float x) -{ - __asm__ __volatile__ ("" : "+w" (x)); -} -static inline void -force_eval_double (double x) -{ - __asm__ __volatile__ ("" : "+w" (x)); -} -#else -static inline float -opt_barrier_float (float x) -{ - volatile float y = x; - return y; -} -static inline double -opt_barrier_double (double x) -{ - volatile double y = x; - return y; -} -static inline void -force_eval_float (float x) -{ - volatile float y UNUSED = x; -} -static inline void -force_eval_double (double x) -{ - volatile double y UNUSED = x; -} -#endif - -/* Evaluate an expression as the specified type, normally a type - cast should be enough, but compilers implement non-standard - excess-precision handling, so when FLT_EVAL_METHOD != 0 then - these functions may need to be customized. */ -static inline float -eval_as_float (float x) -{ - return x; -} -static inline double -eval_as_double (double x) -{ - return x; -} - -/* Error handling tail calls for special cases, with a sign argument. - The sign of the return value is set if the argument is non-zero. */ - -/* The result overflows. */ -HIDDEN float __math_oflowf (uint32_t); -/* The result underflows to 0 in nearest rounding mode. */ -HIDDEN float __math_uflowf (uint32_t); -/* The result underflows to 0 in some directed rounding mode only. */ -HIDDEN float __math_may_uflowf (uint32_t); -/* Division by zero. */ -HIDDEN float __math_divzerof (uint32_t); -/* The result overflows. */ -HIDDEN double __math_oflow (uint32_t); -/* The result underflows to 0 in nearest rounding mode. */ -HIDDEN double __math_uflow (uint32_t); -/* The result underflows to 0 in some directed rounding mode only. */ -HIDDEN double __math_may_uflow (uint32_t); -/* Division by zero. */ -HIDDEN double __math_divzero (uint32_t); - -/* Error handling using input checking. */ - -/* Invalid input unless it is a quiet NaN. */ -HIDDEN float __math_invalidf (float); -/* Invalid input unless it is a quiet NaN. 
*/ -HIDDEN double __math_invalid (double); - -/* Error handling using output checking, only for errno setting. */ - -/* Check if the result overflowed to infinity. */ -HIDDEN double __math_check_oflow (double); -/* Check if the result underflowed to 0. */ -HIDDEN double __math_check_uflow (double); - -/* Check if the result overflowed to infinity. */ -static inline double -check_oflow (double x) -{ - return WANT_ERRNO ? __math_check_oflow (x) : x; -} - -/* Check if the result underflowed to 0. */ -static inline double -check_uflow (double x) -{ - return WANT_ERRNO ? __math_check_uflow (x) : x; -} - -/* Check if the result overflowed to infinity. */ -HIDDEN float __math_check_oflowf (float); -/* Check if the result underflowed to 0. */ -HIDDEN float __math_check_uflowf (float); - -/* Check if the result overflowed to infinity. */ -static inline float -check_oflowf (float x) -{ - return WANT_ERRNO ? __math_check_oflowf (x) : x; -} - -/* Check if the result underflowed to 0. */ -static inline float -check_uflowf (float x) -{ - return WANT_ERRNO ? __math_check_uflowf (x) : x; -} - -extern const struct erff_data -{ - struct - { - float erf, scale; - } tab[513]; -} __erff_data HIDDEN; - -extern const struct sv_erff_data -{ - float erf[513]; - float scale[513]; -} __sv_erff_data HIDDEN; - -extern const struct erfcf_data -{ - struct - { - float erfc, scale; - } tab[645]; -} __erfcf_data HIDDEN; - -/* Data for logf and log10f. */ -#define LOGF_TABLE_BITS 4 -#define LOGF_POLY_ORDER 4 -extern const struct logf_data -{ - struct - { - double invc, logc; - } tab[1 << LOGF_TABLE_BITS]; - double ln2; - double invln10; - double poly[LOGF_POLY_ORDER - 1]; /* First order coefficient is 1. */ -} __logf_data HIDDEN; - -/* Data for low accuracy log10 (with 1/ln(10) included in coefficients). 
*/ -#define LOG10_TABLE_BITS 7 -#define LOG10_POLY_ORDER 6 -#define LOG10_POLY1_ORDER 12 -extern const struct log10_data -{ - double ln2hi; - double ln2lo; - double invln10; - double poly[LOG10_POLY_ORDER - 1]; /* First coefficient is 1/log(10). */ - double poly1[LOG10_POLY1_ORDER - 1]; - struct - { - double invc, logc; - } tab[1 << LOG10_TABLE_BITS]; -#if !HAVE_FAST_FMA - struct - { - double chi, clo; - } tab2[1 << LOG10_TABLE_BITS]; -#endif -} __log10_data HIDDEN; - -#define EXP_TABLE_BITS 7 -#define EXP_POLY_ORDER 5 -/* Use polynomial that is optimized for a wider input range. This may be - needed for good precision in non-nearest rounding and !TOINT_INTRINSICS. */ -#define EXP_POLY_WIDE 0 -/* Use close to nearest rounding toint when !TOINT_INTRINSICS. This may be - needed for good precision in non-nearest rouning and !EXP_POLY_WIDE. */ -#define EXP_USE_TOINT_NARROW 0 -#define EXP2_POLY_ORDER 5 -#define EXP2_POLY_WIDE 0 -extern const struct exp_data -{ - double invln2N; - double shift; - double negln2hiN; - double negln2loN; - double poly[4]; /* Last four coefficients. */ - double exp2_shift; - double exp2_poly[EXP2_POLY_ORDER]; - uint64_t tab[2 * (1 << EXP_TABLE_BITS)]; -} __exp_data HIDDEN; - -/* Copied from math/v_exp.h for use in vector exp_tail. */ -#define V_EXP_TAIL_TABLE_BITS 8 -extern const uint64_t __v_exp_tail_data[1 << V_EXP_TAIL_TABLE_BITS] HIDDEN; - -/* Copied from math/v_exp.h for use in vector exp2. 
*/ -#define V_EXP_TABLE_BITS 7 -extern const uint64_t __v_exp_data[1 << V_EXP_TABLE_BITS] HIDDEN; - -extern const struct erf_data -{ - struct - { - double erf, scale; - } tab[769]; -} __erf_data HIDDEN; - -extern const struct sv_erf_data -{ - double erf[769]; - double scale[769]; -} __sv_erf_data HIDDEN; - -extern const struct erfc_data -{ - struct - { - double erfc, scale; - } tab[3488]; -} __erfc_data HIDDEN; - -#define ATAN_POLY_NCOEFFS 20 -extern const struct atan_poly_data -{ - double poly[ATAN_POLY_NCOEFFS]; -} __atan_poly_data HIDDEN; - -#define ATANF_POLY_NCOEFFS 8 -extern const struct atanf_poly_data -{ - float poly[ATANF_POLY_NCOEFFS]; -} __atanf_poly_data HIDDEN; - -#define ASINHF_NCOEFFS 8 -extern const struct asinhf_data -{ - float coeffs[ASINHF_NCOEFFS]; -} __asinhf_data HIDDEN; - -#define LOG_TABLE_BITS 7 -#define LOG_POLY_ORDER 6 -#define LOG_POLY1_ORDER 12 -extern const struct log_data -{ - double ln2hi; - double ln2lo; - double poly[LOG_POLY_ORDER - 1]; /* First coefficient is 1. */ - double poly1[LOG_POLY1_ORDER - 1]; - struct - { - double invc, logc; - } tab[1 << LOG_TABLE_BITS]; -#if !HAVE_FAST_FMA - struct - { - double chi, clo; - } tab2[1 << LOG_TABLE_BITS]; -#endif -} __log_data HIDDEN; - -#define ASINH_NCOEFFS 18 -extern const struct asinh_data -{ - double poly[ASINH_NCOEFFS]; -} __asinh_data HIDDEN; - -#define LOG1P_NCOEFFS 19 -extern const struct log1p_data -{ - double coeffs[LOG1P_NCOEFFS]; -} __log1p_data HIDDEN; - -#define LOG1PF_2U5 -#define LOG1PF_NCOEFFS 9 -extern const struct log1pf_data -{ - float coeffs[LOG1PF_NCOEFFS]; -} __log1pf_data HIDDEN; - -#define TANF_P_POLY_NCOEFFS 6 -/* cotan approach needs order 3 on [0, pi/4] to reach <3.5ulps. 
*/ -#define TANF_Q_POLY_NCOEFFS 4 -extern const struct tanf_poly_data -{ - float poly_tan[TANF_P_POLY_NCOEFFS]; - float poly_cotan[TANF_Q_POLY_NCOEFFS]; -} __tanf_poly_data HIDDEN; - -#define V_LOG2_TABLE_BITS 7 -extern const struct v_log2_data -{ - double poly[5]; - double invln2; - struct - { - double invc, log2c; - } table[1 << V_LOG2_TABLE_BITS]; -} __v_log2_data HIDDEN; - -#define V_LOG10_TABLE_BITS 7 -extern const struct v_log10_data -{ - double poly[5]; - double invln10, log10_2; - struct - { - double invc, log10c; - } table[1 << V_LOG10_TABLE_BITS]; -} __v_log10_data HIDDEN; - -/* Some data for SVE powf's internal exp and log. */ -#define V_POWF_EXP2_TABLE_BITS 5 -#define V_POWF_EXP2_N (1 << V_POWF_EXP2_TABLE_BITS) -#define V_POWF_LOG2_TABLE_BITS 5 -#define V_POWF_LOG2_N (1 << V_POWF_LOG2_TABLE_BITS) -extern const struct v_powf_data -{ - double invc[V_POWF_LOG2_N]; - double logc[V_POWF_LOG2_N]; - uint64_t scale[V_POWF_EXP2_N]; -} __v_powf_data HIDDEN; - -#define V_LOG_POLY_ORDER 6 -#define V_LOG_TABLE_BITS 7 -extern const struct v_log_data -{ - /* Shared data for vector log and log-derived routines (e.g. asinh). 
*/ - double poly[V_LOG_POLY_ORDER - 1]; - double ln2; - struct - { - double invc, logc; - } table[1 << V_LOG_TABLE_BITS]; -} __v_log_data HIDDEN; - -#define EXPM1F_POLY_ORDER 5 -extern const float __expm1f_poly[EXPM1F_POLY_ORDER] HIDDEN; - -#define EXPF_TABLE_BITS 5 -#define EXPF_POLY_ORDER 3 -extern const struct expf_data -{ - uint64_t tab[1 << EXPF_TABLE_BITS]; - double invln2_scaled; - double poly_scaled[EXPF_POLY_ORDER]; -} __expf_data HIDDEN; - -#define EXPM1_POLY_ORDER 11 -extern const double __expm1_poly[EXPM1_POLY_ORDER] HIDDEN; - -extern const struct cbrtf_data -{ - float poly[4]; - float table[5]; -} __cbrtf_data HIDDEN; - -extern const struct cbrt_data -{ - double poly[4]; - double table[5]; -} __cbrt_data HIDDEN; - -#define ASINF_POLY_ORDER 4 -extern const float __asinf_poly[ASINF_POLY_ORDER + 1] HIDDEN; - -#define ASIN_POLY_ORDER 11 -extern const double __asin_poly[ASIN_POLY_ORDER + 1] HIDDEN; - -/* Some data for AdvSIMD and SVE pow's internal exp and log. */ -#define V_POW_EXP_TABLE_BITS 8 -extern const struct v_pow_exp_data -{ - double poly[3]; - double n_over_ln2, ln2_over_n_hi, ln2_over_n_lo, shift; - uint64_t sbits[1 << V_POW_EXP_TABLE_BITS]; -} __v_pow_exp_data HIDDEN; - -#define V_POW_LOG_TABLE_BITS 7 -extern const struct v_pow_log_data -{ - double poly[7]; /* First coefficient is 1. */ - double ln2_hi, ln2_lo; - double invc[1 << V_POW_LOG_TABLE_BITS]; - double logc[1 << V_POW_LOG_TABLE_BITS]; - double logctail[1 << V_POW_LOG_TABLE_BITS]; -} __v_pow_log_data HIDDEN; - -#endif diff --git a/pl/math/math_err.c b/pl/math/math_err.c deleted file mode 100644 index 74db54a5b2cd16..00000000000000 --- a/pl/math/math_err.c +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Double-precision math error handling. - * - * Copyright (c) 2018-2023, Arm Limited. 
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "math_config.h" - -#if WANT_ERRNO -# include -/* NOINLINE reduces code size and avoids making math functions non-leaf - when the error handling is inlined. */ -NOINLINE static double -with_errno (double y, int e) -{ - errno = e; - return y; -} -#else -# define with_errno(x, e) (x) -#endif - -/* NOINLINE reduces code size. */ -NOINLINE static double -xflow (uint32_t sign, double y) -{ - y = eval_as_double (opt_barrier_double (sign ? -y : y) * y); - return with_errno (y, ERANGE); -} - -HIDDEN double -__math_uflow (uint32_t sign) -{ - return xflow (sign, 0x1p-767); -} - -/* Underflows to zero in some non-nearest rounding mode, setting errno - is valid even if the result is non-zero, but in the subnormal range. */ -HIDDEN double -__math_may_uflow (uint32_t sign) -{ - return xflow (sign, 0x1.8p-538); -} - -HIDDEN double -__math_oflow (uint32_t sign) -{ - return xflow (sign, 0x1p769); -} - -HIDDEN double -__math_divzero (uint32_t sign) -{ - double y = opt_barrier_double (sign ? -1.0 : 1.0) / 0.0; - return with_errno (y, ERANGE); -} - -HIDDEN double -__math_invalid (double x) -{ - double y = (x - x) / (x - x); - return isnan (x) ? y : with_errno (y, EDOM); -} - -/* Check result and set errno if necessary. */ - -HIDDEN double -__math_check_uflow (double y) -{ - return y == 0.0 ? with_errno (y, ERANGE) : y; -} - -HIDDEN double -__math_check_oflow (double y) -{ - return isinf (y) ? with_errno (y, ERANGE) : y; -} diff --git a/pl/math/math_errf.c b/pl/math/math_errf.c deleted file mode 100644 index 2b8c6bd25753b6..00000000000000 --- a/pl/math/math_errf.c +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Single-precision math error handling. - * - * Copyright (c) 2017-2023, Arm Limited. 
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "math_config.h" - -#if WANT_ERRNO -# include -/* NOINLINE reduces code size and avoids making math functions non-leaf - when the error handling is inlined. */ -NOINLINE static float -with_errnof (float y, int e) -{ - errno = e; - return y; -} -#else -# define with_errnof(x, e) (x) -#endif - -/* NOINLINE reduces code size. */ -NOINLINE static float -xflowf (uint32_t sign, float y) -{ - y = eval_as_float (opt_barrier_float (sign ? -y : y) * y); - return with_errnof (y, ERANGE); -} - -HIDDEN float -__math_uflowf (uint32_t sign) -{ - return xflowf (sign, 0x1p-95f); -} - -/* Underflows to zero in some non-nearest rounding mode, setting errno - is valid even if the result is non-zero, but in the subnormal range. */ -HIDDEN float -__math_may_uflowf (uint32_t sign) -{ - return xflowf (sign, 0x1.4p-75f); -} - -HIDDEN float -__math_oflowf (uint32_t sign) -{ - return xflowf (sign, 0x1p97f); -} - -HIDDEN float -__math_divzerof (uint32_t sign) -{ - float y = opt_barrier_float (sign ? -1.0f : 1.0f) / 0.0f; - return with_errnof (y, ERANGE); -} - -HIDDEN float -__math_invalidf (float x) -{ - float y = (x - x) / (x - x); - return isnan (x) ? y : with_errnof (y, EDOM); -} - -/* Check result and set errno if necessary. */ - -HIDDEN float -__math_check_uflowf (float y) -{ - return y == 0.0f ? with_errnof (y, ERANGE) : y; -} - -HIDDEN float -__math_check_oflowf (float y) -{ - return isinf (y) ? with_errnof (y, ERANGE) : y; -} diff --git a/pl/math/pl_sig.h b/pl/math/pl_sig.h deleted file mode 100644 index 52d988f0e1ce6e..00000000000000 --- a/pl/math/pl_sig.h +++ /dev/null @@ -1,59 +0,0 @@ -/* - * PL macros for emitting various ulp/bench entries based on function signature - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception. 
- */ - -#define V_NAME_F1(fun) _ZGVnN4v_##fun##f -#define V_NAME_D1(fun) _ZGVnN2v_##fun -#define V_NAME_F2(fun) _ZGVnN4vv_##fun##f -#define V_NAME_D2(fun) _ZGVnN2vv_##fun - -#define SV_NAME_F1(fun) _ZGVsMxv_##fun##f -#define SV_NAME_D1(fun) _ZGVsMxv_##fun -#define SV_NAME_F2(fun) _ZGVsMxvv_##fun##f -#define SV_NAME_D2(fun) _ZGVsMxvv_##fun - -#define PL_DECL_SF1(fun) float fun##f (float); -#define PL_DECL_SF2(fun) float fun##f (float, float); -#define PL_DECL_SD1(fun) double fun (double); -#define PL_DECL_SD2(fun) double fun (double, double); - -#if WANT_VMATH -# define PL_DECL_VF1(fun) \ - VPCS_ATTR float32x4_t V_NAME_F1 (fun##f) (float32x4_t); -# define PL_DECL_VF2(fun) \ - VPCS_ATTR float32x4_t V_NAME_F2 (fun##f) (float32x4_t, float32x4_t); -# define PL_DECL_VD1(fun) VPCS_ATTR float64x2_t V_NAME_D1 (fun) (float64x2_t); -# define PL_DECL_VD2(fun) \ - VPCS_ATTR float64x2_t V_NAME_D2 (fun) (float64x2_t, float64x2_t); -#else -# define PL_DECL_VF1(fun) -# define PL_DECL_VF2(fun) -# define PL_DECL_VD1(fun) -# define PL_DECL_VD2(fun) -#endif - -#if WANT_SVE_MATH -# define PL_DECL_SVF1(fun) \ - svfloat32_t SV_NAME_F1 (fun) (svfloat32_t, svbool_t); -# define PL_DECL_SVF2(fun) \ - svfloat32_t SV_NAME_F2 (fun) (svfloat32_t, svfloat32_t, svbool_t); -# define PL_DECL_SVD1(fun) \ - svfloat64_t SV_NAME_D1 (fun) (svfloat64_t, svbool_t); -# define PL_DECL_SVD2(fun) \ - svfloat64_t SV_NAME_D2 (fun) (svfloat64_t, svfloat64_t, svbool_t); -#else -# define PL_DECL_SVF1(fun) -# define PL_DECL_SVF2(fun) -# define PL_DECL_SVD1(fun) -# define PL_DECL_SVD2(fun) -#endif - -/* For building the routines, emit function prototype from PL_SIG. This - ensures that the correct signature has been chosen (wrong one will be a - compile error). PL_SIG is defined differently by various components of the - build system to emit entries in the wrappers and entries for mathbench and - ulp. */ -#define PL_SIG(v, t, a, f, ...) 
PL_DECL_##v##t##a (f) diff --git a/pl/math/sv_acosh_3u5.c b/pl/math/sv_acosh_3u5.c deleted file mode 100644 index faf3513314641a..00000000000000 --- a/pl/math/sv_acosh_3u5.c +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Double-precision SVE acosh(x) function. - * Copyright (c) 2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "sv_math.h" -#include "pl_sig.h" -#include "pl_test.h" - -#define WANT_SV_LOG1P_K0_SHORTCUT 1 -#include "sv_log1p_inline.h" - -#define BigBoundTop 0x5fe /* top12 (asuint64 (0x1p511)). */ -#define OneTop 0x3ff - -static NOINLINE svfloat64_t -special_case (svfloat64_t x, svfloat64_t y, svbool_t special) -{ - return sv_call_f64 (acosh, x, y, special); -} - -/* SVE approximation for double-precision acosh, based on log1p. - The largest observed error is 3.19 ULP in the region where the - argument to log1p falls in the k=0 interval, i.e. x close to 1: - SV_NAME_D1 (acosh)(0x1.1e4388d4ca821p+0) got 0x1.ed23399f5137p-2 - want 0x1.ed23399f51373p-2. */ -svfloat64_t SV_NAME_D1 (acosh) (svfloat64_t x, const svbool_t pg) -{ - svuint64_t itop = svlsr_x (pg, svreinterpret_u64 (x), 52); - /* (itop - OneTop) >= (BigBoundTop - OneTop). */ - svbool_t special = svcmpge (pg, svsub_x (pg, itop, OneTop), sv_u64 (0x1ff)); - - svfloat64_t xm1 = svsub_x (pg, x, 1); - svfloat64_t u = svmul_x (pg, xm1, svadd_x (pg, x, 1)); - svfloat64_t y = sv_log1p_inline (svadd_x (pg, xm1, svsqrt_x (pg, u)), pg); - - /* Fall back to scalar routine for special lanes. 
*/ - if (unlikely (svptest_any (pg, special))) - return special_case (x, y, special); - - return y; -} - -PL_SIG (SV, D, 1, acosh, 1.0, 10.0) -PL_TEST_ULP (SV_NAME_D1 (acosh), 2.69) -PL_TEST_INTERVAL (SV_NAME_D1 (acosh), 1, 0x1p511, 90000) -PL_TEST_INTERVAL (SV_NAME_D1 (acosh), 0x1p511, inf, 10000) -PL_TEST_INTERVAL (SV_NAME_D1 (acosh), 0, 1, 1000) -PL_TEST_INTERVAL (SV_NAME_D1 (acosh), -0, -inf, 10000) diff --git a/pl/math/sv_acoshf_2u8.c b/pl/math/sv_acoshf_2u8.c deleted file mode 100644 index f527083af40a22..00000000000000 --- a/pl/math/sv_acoshf_2u8.c +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Single-precision SVE acosh(x) function. - * Copyright (c) 2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "sv_math.h" -#include "pl_sig.h" -#include "pl_test.h" - -#define One 0x3f800000 -#define Thres 0x20000000 /* asuint(0x1p64) - One. */ - -#include "sv_log1pf_inline.h" - -static svfloat32_t NOINLINE -special_case (svfloat32_t x, svfloat32_t y, svbool_t special) -{ - return sv_call_f32 (acoshf, x, y, special); -} - -/* Single-precision SVE acosh(x) routine. Implements the same algorithm as - vector acoshf and log1p. - - Maximum error is 2.78 ULPs: - SV_NAME_F1 (acosh) (0x1.01e996p+0) got 0x1.f45b42p-4 - want 0x1.f45b3cp-4. 
*/ -svfloat32_t SV_NAME_F1 (acosh) (svfloat32_t x, const svbool_t pg) -{ - svuint32_t ix = svreinterpret_u32 (x); - svbool_t special = svcmpge (pg, svsub_x (pg, ix, One), Thres); - - svfloat32_t xm1 = svsub_x (pg, x, 1.0f); - svfloat32_t u = svmul_x (pg, xm1, svadd_x (pg, x, 1.0f)); - svfloat32_t y = sv_log1pf_inline (svadd_x (pg, xm1, svsqrt_x (pg, u)), pg); - - if (unlikely (svptest_any (pg, special))) - return special_case (x, y, special); - return y; -} - -PL_SIG (SV, F, 1, acosh, 1.0, 10.0) -PL_TEST_ULP (SV_NAME_F1 (acosh), 2.29) -PL_TEST_INTERVAL (SV_NAME_F1 (acosh), 0, 1, 500) -PL_TEST_INTERVAL (SV_NAME_F1 (acosh), 1, 0x1p64, 100000) -PL_TEST_INTERVAL (SV_NAME_F1 (acosh), 0x1p64, inf, 1000) -PL_TEST_INTERVAL (SV_NAME_F1 (acosh), -0, -inf, 1000) diff --git a/pl/math/sv_asinh_3u0.c b/pl/math/sv_asinh_3u0.c deleted file mode 100644 index 711f0dfdbedc66..00000000000000 --- a/pl/math/sv_asinh_3u0.c +++ /dev/null @@ -1,129 +0,0 @@ -/* - * Double-precision SVE asinh(x) function. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "sv_math.h" -#include "poly_sve_f64.h" -#include "pl_sig.h" -#include "pl_test.h" - -#define OneTop sv_u64 (0x3ff) /* top12(asuint64(1.0f)). */ -#define HugeBound sv_u64 (0x5fe) /* top12(asuint64(0x1p511)). */ -#define TinyBound (0x3e5) /* top12(asuint64(0x1p-26)). */ -#define SignMask (0x8000000000000000) - -/* Constants & data for log. */ -#define A(i) __v_log_data.poly[i] -#define Ln2 (0x1.62e42fefa39efp-1) -#define N (1 << V_LOG_TABLE_BITS) -#define OFF (0x3fe6900900000000) - -static svfloat64_t NOINLINE -special_case (svfloat64_t x, svfloat64_t y, svbool_t special) -{ - return sv_call_f64 (asinh, x, y, special); -} - -static inline svfloat64_t -__sv_log_inline (svfloat64_t x, const svbool_t pg) -{ - /* Double-precision SVE log, copied from pl/math/sv_log_2u5.c with some - cosmetic modification and special-cases removed. 
See that file for details - of the algorithm used. */ - svuint64_t ix = svreinterpret_u64 (x); - svuint64_t tmp = svsub_x (pg, ix, OFF); - svuint64_t i - = svand_x (pg, svlsr_x (pg, tmp, (51 - V_LOG_TABLE_BITS)), (N - 1) << 1); - svint64_t k = svasr_x (pg, svreinterpret_s64 (tmp), 52); - svuint64_t iz = svsub_x (pg, ix, svand_x (pg, tmp, 0xfffULL << 52)); - svfloat64_t z = svreinterpret_f64 (iz); - svfloat64_t invc = svld1_gather_index (pg, &__v_log_data.table[0].invc, i); - svfloat64_t logc = svld1_gather_index (pg, &__v_log_data.table[0].logc, i); - svfloat64_t r = svmla_x (pg, sv_f64 (-1.0), invc, z); - svfloat64_t kd = svcvt_f64_x (pg, k); - svfloat64_t hi = svmla_x (pg, svadd_x (pg, logc, r), kd, Ln2); - svfloat64_t r2 = svmul_x (pg, r, r); - svfloat64_t y = svmla_x (pg, sv_f64 (A (2)), r, A (3)); - svfloat64_t p = svmla_x (pg, sv_f64 (A (0)), r, A (1)); - y = svmla_x (pg, y, r2, A (4)); - y = svmla_x (pg, p, r2, y); - y = svmla_x (pg, hi, r2, y); - return y; -} - -/* Double-precision implementation of SVE asinh(x). - asinh is very sensitive around 1, so it is impractical to devise a single - low-cost algorithm which is sufficiently accurate on a wide range of input. - Instead we use two different algorithms: - asinh(x) = sign(x) * log(|x| + sqrt(x^2 + 1) if |x| >= 1 - = sign(x) * (|x| + |x|^3 * P(x^2)) otherwise - where log(x) is an optimized log approximation, and P(x) is a polynomial - shared with the scalar routine. The greatest observed error 2.51 ULP, in - |x| >= 1: - _ZGVsMxv_asinh(0x1.170469d024505p+0) got 0x1.e3181c43b0f36p-1 - want 0x1.e3181c43b0f39p-1. 
*/ -svfloat64_t SV_NAME_D1 (asinh) (svfloat64_t x, const svbool_t pg) -{ - svuint64_t ix = svreinterpret_u64 (x); - svuint64_t iax = svbic_x (pg, ix, SignMask); - svuint64_t sign = svand_x (pg, ix, SignMask); - svfloat64_t ax = svreinterpret_f64 (iax); - svuint64_t top12 = svlsr_x (pg, iax, 52); - - svbool_t ge1 = svcmpge (pg, top12, OneTop); - svbool_t special = svcmpge (pg, top12, HugeBound); - - /* Option 1: |x| >= 1. - Compute asinh(x) according by asinh(x) = log(x + sqrt(x^2 + 1)). */ - svfloat64_t option_1 = sv_f64 (0); - if (likely (svptest_any (pg, ge1))) - { - svfloat64_t axax = svmul_x (pg, ax, ax); - option_1 = __sv_log_inline ( - svadd_x (pg, ax, svsqrt_x (pg, svadd_x (pg, axax, 1))), pg); - } - - /* Option 2: |x| < 1. - Compute asinh(x) using a polynomial. - The largest observed error in this region is 1.51 ULPs: - _ZGVsMxv_asinh(0x1.fe12bf8c616a2p-1) got 0x1.c1e649ee2681bp-1 - want 0x1.c1e649ee2681dp-1. */ - svfloat64_t option_2 = sv_f64 (0); - if (likely (svptest_any (pg, svnot_z (pg, ge1)))) - { - svfloat64_t x2 = svmul_x (pg, ax, ax); - svfloat64_t z2 = svmul_x (pg, x2, x2); - svfloat64_t z4 = svmul_x (pg, z2, z2); - svfloat64_t z8 = svmul_x (pg, z4, z4); - svfloat64_t z16 = svmul_x (pg, z8, z8); - svfloat64_t p - = sv_estrin_17_f64_x (pg, x2, z2, z4, z8, z16, __asinh_data.poly); - option_2 = svmla_x (pg, ax, p, svmul_x (pg, x2, ax)); - } - - /* Choose the right option for each lane. */ - svfloat64_t y = svsel (ge1, option_1, option_2); - - /* Apply sign of x to y. */ - y = svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (y), sign)); - - if (unlikely (svptest_any (pg, special))) - return special_case (x, y, special); - return y; -} - -PL_SIG (SV, D, 1, asinh, -10.0, 10.0) -PL_TEST_ULP (SV_NAME_D1 (asinh), 2.52) -/* Test vector asinh 3 times, with control lane < 1, > 1 and special. - Ensures the svsel is choosing the right option in all cases. 
*/ -#define SV_ASINH_INTERVAL(lo, hi, n) \ - PL_TEST_SYM_INTERVAL_C (SV_NAME_D1 (asinh), lo, hi, n, 0.5) \ - PL_TEST_SYM_INTERVAL_C (SV_NAME_D1 (asinh), lo, hi, n, 2) \ - PL_TEST_SYM_INTERVAL_C (SV_NAME_D1 (asinh), lo, hi, n, 0x1p600) -SV_ASINH_INTERVAL (0, 0x1p-26, 50000) -SV_ASINH_INTERVAL (0x1p-26, 1, 50000) -SV_ASINH_INTERVAL (1, 0x1p511, 50000) -SV_ASINH_INTERVAL (0x1p511, inf, 40000) diff --git a/pl/math/sv_coshf_2u.c b/pl/math/sv_coshf_2u.c deleted file mode 100644 index 81680fef318e84..00000000000000 --- a/pl/math/sv_coshf_2u.c +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Single-precision SVE cosh(x) function. - * - * Copyright (c) 2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "sv_math.h" -#include "pl_sig.h" -#include "pl_test.h" - -#include "sv_expf_inline.h" - -static const struct data -{ - struct sv_expf_data expf_consts; - uint32_t special_bound; -} data = { - .expf_consts = SV_EXPF_DATA, - /* 0x1.5a92d8p+6: expf overflows above this, so have to use special case. */ - .special_bound = 0x42ad496c, -}; - -static svfloat32_t NOINLINE -special_case (svfloat32_t x, svfloat32_t y, svbool_t pg) -{ - return sv_call_f32 (coshf, x, y, pg); -} - -/* Single-precision vector cosh, using vector expf. - Maximum error is 1.89 ULP: - _ZGVsMxv_coshf (-0x1.65898cp+6) got 0x1.f00aep+127 - want 0x1.f00adcp+127. */ -svfloat32_t SV_NAME_F1 (cosh) (svfloat32_t x, svbool_t pg) -{ - const struct data *d = ptr_barrier (&data); - - svfloat32_t ax = svabs_x (pg, x); - svbool_t special = svcmpge (pg, svreinterpret_u32 (ax), d->special_bound); - - /* Calculate cosh by exp(x) / 2 + exp(-x) / 2. 
*/ - svfloat32_t t = expf_inline (ax, pg, &d->expf_consts); - svfloat32_t half_t = svmul_x (pg, t, 0.5); - svfloat32_t half_over_t = svdivr_x (pg, t, 0.5); - - if (unlikely (svptest_any (pg, special))) - return special_case (x, svadd_x (pg, half_t, half_over_t), special); - - return svadd_x (pg, half_t, half_over_t); -} - -PL_SIG (SV, F, 1, cosh, -10.0, 10.0) -PL_TEST_ULP (SV_NAME_F1 (cosh), 1.39) -PL_TEST_SYM_INTERVAL (SV_NAME_F1 (cosh), 0, 0x1p-63, 100) -PL_TEST_SYM_INTERVAL (SV_NAME_F1 (cosh), 0, 0x1.5a92d8p+6, 80000) -PL_TEST_SYM_INTERVAL (SV_NAME_F1 (cosh), 0x1.5a92d8p+6, inf, 2000) diff --git a/pl/math/sv_erf_data.c b/pl/math/sv_erf_data.c deleted file mode 100644 index 7244aceda5a5be..00000000000000 --- a/pl/math/sv_erf_data.c +++ /dev/null @@ -1,1558 +0,0 @@ -/* - * Data for approximation of erf. - * - * Copyright (c) 2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "math_config.h" - -/* Lookup table used in vector erf. - For each possible rounded input r (multiples of 1/128), between - r = 0.0 and r = 6.0 (769 values): - - the first entry __erf_data.tab.erf contains the values of erf(r), - - the second entry __erf_data.tab.scale contains the values of - 2/sqrt(pi)*exp(-r^2). Note that indices 0 and 1 are never hit by the - algorithm, since lookup is performed only for x >= 1/64-1/512. 
*/ -const struct sv_erf_data __sv_erf_data = { - .erf = { 0x0.0000000000000p+0, - 0x1.20dbf3deb1340p-7, - 0x1.20d77083f17a0p-6, - 0x1.b137e0cf584dcp-6, - 0x1.20c5645dd2538p-5, - 0x1.68e5d3bbc9526p-5, - 0x1.b0fafef135745p-5, - 0x1.f902a77bd3821p-5, - 0x1.207d480e90658p-4, - 0x1.44703e87e8593p-4, - 0x1.68591a1e83b5dp-4, - 0x1.8c36beb8a8d23p-4, - 0x1.b0081148a873ap-4, - 0x1.d3cbf7e70a4b3p-4, - 0x1.f78159ec8bb50p-4, - 0x1.0d939005f65e5p-3, - 0x1.1f5e1a35c3b89p-3, - 0x1.311fc15f56d14p-3, - 0x1.42d7fc2f64959p-3, - 0x1.548642321d7c6p-3, - 0x1.662a0bdf7a89fp-3, - 0x1.77c2d2a765f9ep-3, - 0x1.895010fdbdbfdp-3, - 0x1.9ad142662e14dp-3, - 0x1.ac45e37fe2526p-3, - 0x1.bdad72110a648p-3, - 0x1.cf076d1233237p-3, - 0x1.e05354b96ff36p-3, - 0x1.f190aa85540e2p-3, - 0x1.015f78a3dcf3dp-2, - 0x1.09eed6982b948p-2, - 0x1.127631eb8de32p-2, - 0x1.1af54e232d609p-2, - 0x1.236bef825d9a2p-2, - 0x1.2bd9db0f7827fp-2, - 0x1.343ed6989b7d9p-2, - 0x1.3c9aa8b84bedap-2, - 0x1.44ed18d9f6462p-2, - 0x1.4d35ef3e5372ep-2, - 0x1.5574f4ffac98ep-2, - 0x1.5da9f415ff23fp-2, - 0x1.65d4b75b00471p-2, - 0x1.6df50a8dff772p-2, - 0x1.760aba57a76bfp-2, - 0x1.7e15944d9d3e4p-2, - 0x1.861566f5fd3c0p-2, - 0x1.8e0a01cab516bp-2, - 0x1.95f3353cbb146p-2, - 0x1.9dd0d2b721f39p-2, - 0x1.a5a2aca209394p-2, - 0x1.ad68966569a87p-2, - 0x1.b522646bbda68p-2, - 0x1.bccfec24855b8p-2, - 0x1.c4710406a65fcp-2, - 0x1.cc058392a6d2dp-2, - 0x1.d38d4354c3bd0p-2, - 0x1.db081ce6e2a48p-2, - 0x1.e275eaf25e458p-2, - 0x1.e9d68931ae650p-2, - 0x1.f129d471eabb1p-2, - 0x1.f86faa9428f9dp-2, - 0x1.ffa7ea8eb5fd0p-2, - 0x1.03693a371519cp-1, - 0x1.06f794ab2cae7p-1, - 0x1.0a7ef5c18edd2p-1, - 0x1.0dff4f247f6c6p-1, - 0x1.1178930ada115p-1, - 0x1.14eab43841b55p-1, - 0x1.1855a5fd3dd50p-1, - 0x1.1bb95c3746199p-1, - 0x1.1f15cb50bc4dep-1, - 0x1.226ae840d4d70p-1, - 0x1.25b8a88b6dd7fp-1, - 0x1.28ff0240d52cdp-1, - 0x1.2c3debfd7d6c1p-1, - 0x1.2f755ce9a21f4p-1, - 0x1.32a54cb8db67bp-1, - 0x1.35cdb3a9a144dp-1, - 0x1.38ee8a84beb71p-1, - 0x1.3c07ca9cb4f9ep-1, - 0x1.3f196dcd0f135p-1, 
- 0x1.42236e79a5fa6p-1, - 0x1.4525c78dd5966p-1, - 0x1.4820747ba2dc2p-1, - 0x1.4b13713ad3513p-1, - 0x1.4dfeba47f63ccp-1, - 0x1.50e24ca35fd2cp-1, - 0x1.53be25d016a4fp-1, - 0x1.569243d2b3a9bp-1, - 0x1.595ea53035283p-1, - 0x1.5c2348ecc4dc3p-1, - 0x1.5ee02e8a71a53p-1, - 0x1.61955607dd15dp-1, - 0x1.6442bfdedd397p-1, - 0x1.66e86d0312e82p-1, - 0x1.69865ee075011p-1, - 0x1.6c1c9759d0e5fp-1, - 0x1.6eab18c74091bp-1, - 0x1.7131e5f496a5ap-1, - 0x1.73b1021fc0cb8p-1, - 0x1.762870f720c6fp-1, - 0x1.78983697dc96fp-1, - 0x1.7b00578c26037p-1, - 0x1.7d60d8c979f7bp-1, - 0x1.7fb9bfaed8078p-1, - 0x1.820b1202f27fbp-1, - 0x1.8454d5f25760dp-1, - 0x1.8697120d92a4ap-1, - 0x1.88d1cd474a2e0p-1, - 0x1.8b050ef253c37p-1, - 0x1.8d30debfc572ep-1, - 0x1.8f5544bd00c04p-1, - 0x1.91724951b8fc6p-1, - 0x1.9387f53df5238p-1, - 0x1.959651980da31p-1, - 0x1.979d67caa6631p-1, - 0x1.999d4192a5715p-1, - 0x1.9b95e8fd26abap-1, - 0x1.9d8768656cc42p-1, - 0x1.9f71ca72cffb6p-1, - 0x1.a1551a16aaeafp-1, - 0x1.a331628a45b92p-1, - 0x1.a506af4cc00f4p-1, - 0x1.a6d50c20fa293p-1, - 0x1.a89c850b7d54dp-1, - 0x1.aa5d265064366p-1, - 0x1.ac16fc7143263p-1, - 0x1.adca142b10f98p-1, - 0x1.af767a741088bp-1, - 0x1.b11c3c79bb424p-1, - 0x1.b2bb679ead19cp-1, - 0x1.b4540978921eep-1, - 0x1.b5e62fce16095p-1, - 0x1.b771e894d602ep-1, - 0x1.b8f741ef54f83p-1, - 0x1.ba764a2af2b78p-1, - 0x1.bbef0fbde6221p-1, - 0x1.bd61a1453ab44p-1, - 0x1.bece0d82d1a5cp-1, - 0x1.c034635b66e23p-1, - 0x1.c194b1d49a184p-1, - 0x1.c2ef0812fc1bdp-1, - 0x1.c443755820d64p-1, - 0x1.c5920900b5fd1p-1, - 0x1.c6dad2829ec62p-1, - 0x1.c81de16b14cefp-1, - 0x1.c95b455cce69dp-1, - 0x1.ca930e0e2a825p-1, - 0x1.cbc54b476248dp-1, - 0x1.ccf20ce0c0d27p-1, - 0x1.ce1962c0e0d8bp-1, - 0x1.cf3b5cdaf0c39p-1, - 0x1.d0580b2cfd249p-1, - 0x1.d16f7dbe41ca0p-1, - 0x1.d281c49d818d0p-1, - 0x1.d38eefdf64fddp-1, - 0x1.d4970f9ce00d9p-1, - 0x1.d59a33f19ed42p-1, - 0x1.d6986cfa798e7p-1, - 0x1.d791cad3eff01p-1, - 0x1.d8865d98abe01p-1, - 0x1.d97635600bb89p-1, - 0x1.da61623cb41e0p-1, - 0x1.db47f43b2980dp-1, - 
0x1.dc29fb60715afp-1, - 0x1.dd0787a8bb39dp-1, - 0x1.dde0a90611a0dp-1, - 0x1.deb56f5f12d28p-1, - 0x1.df85ea8db188ep-1, - 0x1.e0522a5dfda73p-1, - 0x1.e11a3e8cf4eb8p-1, - 0x1.e1de36c75ba58p-1, - 0x1.e29e22a89d766p-1, - 0x1.e35a11b9b61cep-1, - 0x1.e4121370224ccp-1, - 0x1.e4c6372cd8927p-1, - 0x1.e5768c3b4a3fcp-1, - 0x1.e62321d06c5e0p-1, - 0x1.e6cc0709c8a0dp-1, - 0x1.e7714aec96534p-1, - 0x1.e812fc64db369p-1, - 0x1.e8b12a44944a8p-1, - 0x1.e94be342e6743p-1, - 0x1.e9e335fb56f87p-1, - 0x1.ea7730ed0bbb9p-1, - 0x1.eb07e27a133aap-1, - 0x1.eb9558e6b42cep-1, - 0x1.ec1fa258c4beap-1, - 0x1.eca6ccd709544p-1, - 0x1.ed2ae6489ac1ep-1, - 0x1.edabfc7453e63p-1, - 0x1.ee2a1d004692cp-1, - 0x1.eea5557137ae0p-1, - 0x1.ef1db32a2277cp-1, - 0x1.ef93436bc2daap-1, - 0x1.f006135426b26p-1, - 0x1.f0762fde45ee6p-1, - 0x1.f0e3a5e1a1788p-1, - 0x1.f14e8211e8c55p-1, - 0x1.f1b6d0fea5f4dp-1, - 0x1.f21c9f12f0677p-1, - 0x1.f27ff89525acfp-1, - 0x1.f2e0e9a6a8b09p-1, - 0x1.f33f7e43a706bp-1, - 0x1.f39bc242e43e6p-1, - 0x1.f3f5c1558b19ep-1, - 0x1.f44d870704911p-1, - 0x1.f4a31ebcd47dfp-1, - 0x1.f4f693b67bd77p-1, - 0x1.f547f10d60597p-1, - 0x1.f59741b4b97cfp-1, - 0x1.f5e4907982a07p-1, - 0x1.f62fe80272419p-1, - 0x1.f67952cff6282p-1, - 0x1.f6c0db3c34641p-1, - 0x1.f7068b7b10fd9p-1, - 0x1.f74a6d9a38383p-1, - 0x1.f78c8b812d498p-1, - 0x1.f7cceef15d631p-1, - 0x1.f80ba18636f07p-1, - 0x1.f848acb544e95p-1, - 0x1.f88419ce4e184p-1, - 0x1.f8bdf1fb78370p-1, - 0x1.f8f63e416ebffp-1, - 0x1.f92d077f8d56dp-1, - 0x1.f96256700da8ep-1, - 0x1.f99633a838a57p-1, - 0x1.f9c8a7989af0dp-1, - 0x1.f9f9ba8d3c733p-1, - 0x1.fa2974addae45p-1, - 0x1.fa57ddfe27376p-1, - 0x1.fa84fe5e05c8dp-1, - 0x1.fab0dd89d1309p-1, - 0x1.fadb831a9f9c3p-1, - 0x1.fb04f6868a944p-1, - 0x1.fb2d3f20f9101p-1, - 0x1.fb54641aebbc9p-1, - 0x1.fb7a6c834b5a2p-1, - 0x1.fb9f5f4739170p-1, - 0x1.fbc3433260ca5p-1, - 0x1.fbe61eef4cf6ap-1, - 0x1.fc07f907bc794p-1, - 0x1.fc28d7e4f9cd0p-1, - 0x1.fc48c1d033c7ap-1, - 0x1.fc67bcf2d7b8fp-1, - 0x1.fc85cf56ecd38p-1, - 0x1.fca2fee770c79p-1, - 
0x1.fcbf5170b578bp-1, - 0x1.fcdacca0bfb73p-1, - 0x1.fcf57607a6e7cp-1, - 0x1.fd0f5317f582fp-1, - 0x1.fd2869270a56fp-1, - 0x1.fd40bd6d7a785p-1, - 0x1.fd58550773cb5p-1, - 0x1.fd6f34f52013ap-1, - 0x1.fd85621b0876dp-1, - 0x1.fd9ae142795e3p-1, - 0x1.fdafb719e6a69p-1, - 0x1.fdc3e835500b3p-1, - 0x1.fdd7790ea5bc0p-1, - 0x1.fdea6e062d0c9p-1, - 0x1.fdfccb62e52d3p-1, - 0x1.fe0e9552ebdd6p-1, - 0x1.fe1fcfebe2083p-1, - 0x1.fe307f2b503d0p-1, - 0x1.fe40a6f70af4bp-1, - 0x1.fe504b1d9696cp-1, - 0x1.fe5f6f568b301p-1, - 0x1.fe6e1742f7cf6p-1, - 0x1.fe7c466dc57a1p-1, - 0x1.fe8a004c19ae6p-1, - 0x1.fe97483db8670p-1, - 0x1.fea4218d6594ap-1, - 0x1.feb08f7146046p-1, - 0x1.febc950b3fa75p-1, - 0x1.fec835695932ep-1, - 0x1.fed37386190fbp-1, - 0x1.fede5248e38f4p-1, - 0x1.fee8d486585eep-1, - 0x1.fef2fd00af31ap-1, - 0x1.fefcce6813974p-1, - 0x1.ff064b5afffbep-1, - 0x1.ff0f766697c76p-1, - 0x1.ff18520700971p-1, - 0x1.ff20e0a7ba8c2p-1, - 0x1.ff2924a3f7a83p-1, - 0x1.ff312046f2339p-1, - 0x1.ff38d5cc4227fp-1, - 0x1.ff404760319b4p-1, - 0x1.ff47772010262p-1, - 0x1.ff4e671a85425p-1, - 0x1.ff55194fe19dfp-1, - 0x1.ff5b8fb26f5f6p-1, - 0x1.ff61cc26c1578p-1, - 0x1.ff67d08401202p-1, - 0x1.ff6d9e943c231p-1, - 0x1.ff733814af88cp-1, - 0x1.ff789eb6130c9p-1, - 0x1.ff7dd41ce2b4dp-1, - 0x1.ff82d9e1a76d8p-1, - 0x1.ff87b1913e853p-1, - 0x1.ff8c5cad200a5p-1, - 0x1.ff90dcaba4096p-1, - 0x1.ff9532f846ab0p-1, - 0x1.ff9960f3eb327p-1, - 0x1.ff9d67f51ddbap-1, - 0x1.ffa14948549a7p-1, - 0x1.ffa506302ebaep-1, - 0x1.ffa89fe5b3625p-1, - 0x1.ffac17988ef4bp-1, - 0x1.ffaf6e6f4f5c0p-1, - 0x1.ffb2a5879f35ep-1, - 0x1.ffb5bdf67fe6fp-1, - 0x1.ffb8b8c88295fp-1, - 0x1.ffbb970200110p-1, - 0x1.ffbe599f4f9d9p-1, - 0x1.ffc10194fcb64p-1, - 0x1.ffc38fcffbb7cp-1, - 0x1.ffc60535dd7f5p-1, - 0x1.ffc862a501fd7p-1, - 0x1.ffcaa8f4c9beap-1, - 0x1.ffccd8f5c66d1p-1, - 0x1.ffcef371ea4d7p-1, - 0x1.ffd0f92cb6ba7p-1, - 0x1.ffd2eae369a07p-1, - 0x1.ffd4c94d29fdbp-1, - 0x1.ffd6951b33686p-1, - 0x1.ffd84ef9009eep-1, - 0x1.ffd9f78c7524ap-1, - 0x1.ffdb8f7605ee7p-1, - 
0x1.ffdd1750e1220p-1, - 0x1.ffde8fb314ebfp-1, - 0x1.ffdff92db56e5p-1, - 0x1.ffe1544d01ccbp-1, - 0x1.ffe2a1988857cp-1, - 0x1.ffe3e19349dc7p-1, - 0x1.ffe514bbdc197p-1, - 0x1.ffe63b8c8b5f7p-1, - 0x1.ffe7567b7b5e1p-1, - 0x1.ffe865fac722bp-1, - 0x1.ffe96a78a04a9p-1, - 0x1.ffea645f6d6dap-1, - 0x1.ffeb5415e7c44p-1, - 0x1.ffec39ff380b9p-1, - 0x1.ffed167b12ac2p-1, - 0x1.ffede9e5d3262p-1, - 0x1.ffeeb49896c6dp-1, - 0x1.ffef76e956a9fp-1, - 0x1.fff0312b010b5p-1, - 0x1.fff0e3ad91ec2p-1, - 0x1.fff18ebe2b0e1p-1, - 0x1.fff232a72b48ep-1, - 0x1.fff2cfb0453d9p-1, - 0x1.fff3661e9569dp-1, - 0x1.fff3f634b79f9p-1, - 0x1.fff48032dbe40p-1, - 0x1.fff50456dab8cp-1, - 0x1.fff582dc48d30p-1, - 0x1.fff5fbfc8a439p-1, - 0x1.fff66feee5129p-1, - 0x1.fff6dee89352ep-1, - 0x1.fff7491cd4af6p-1, - 0x1.fff7aebcff755p-1, - 0x1.fff80ff8911fdp-1, - 0x1.fff86cfd3e657p-1, - 0x1.fff8c5f702ccfp-1, - 0x1.fff91b102fca8p-1, - 0x1.fff96c717b695p-1, - 0x1.fff9ba420e834p-1, - 0x1.fffa04a7928b1p-1, - 0x1.fffa4bc63ee9ap-1, - 0x1.fffa8fc0e5f33p-1, - 0x1.fffad0b901755p-1, - 0x1.fffb0ecebee1bp-1, - 0x1.fffb4a210b172p-1, - 0x1.fffb82cd9dcbfp-1, - 0x1.fffbb8f1049c6p-1, - 0x1.fffbeca6adbe9p-1, - 0x1.fffc1e08f25f5p-1, - 0x1.fffc4d3120aa1p-1, - 0x1.fffc7a37857d2p-1, - 0x1.fffca53375ce3p-1, - 0x1.fffcce3b57bffp-1, - 0x1.fffcf564ab6b7p-1, - 0x1.fffd1ac4135f9p-1, - 0x1.fffd3e6d5cd87p-1, - 0x1.fffd607387b07p-1, - 0x1.fffd80e8ce0dap-1, - 0x1.fffd9fdeabccep-1, - 0x1.fffdbd65e5ad0p-1, - 0x1.fffdd98e903b2p-1, - 0x1.fffdf46816833p-1, - 0x1.fffe0e0140857p-1, - 0x1.fffe26683972ap-1, - 0x1.fffe3daa95b18p-1, - 0x1.fffe53d558ae9p-1, - 0x1.fffe68f4fa777p-1, - 0x1.fffe7d156d244p-1, - 0x1.fffe904222101p-1, - 0x1.fffea2860ee1ep-1, - 0x1.fffeb3ebb267bp-1, - 0x1.fffec47d19457p-1, - 0x1.fffed443e2787p-1, - 0x1.fffee34943b15p-1, - 0x1.fffef1960d85dp-1, - 0x1.fffeff32af7afp-1, - 0x1.ffff0c273bea2p-1, - 0x1.ffff187b6bc0ep-1, - 0x1.ffff2436a21dcp-1, - 0x1.ffff2f5fefcaap-1, - 0x1.ffff39fe16963p-1, - 0x1.ffff44178c8d2p-1, - 0x1.ffff4db27f146p-1, - 
0x1.ffff56d4d5e5ep-1, - 0x1.ffff5f8435efcp-1, - 0x1.ffff67c604180p-1, - 0x1.ffff6f9f67e55p-1, - 0x1.ffff77154e0d6p-1, - 0x1.ffff7e2c6aea2p-1, - 0x1.ffff84e93cd75p-1, - 0x1.ffff8b500e77cp-1, - 0x1.ffff9164f8e46p-1, - 0x1.ffff972be5c59p-1, - 0x1.ffff9ca891572p-1, - 0x1.ffffa1de8c582p-1, - 0x1.ffffa6d13de73p-1, - 0x1.ffffab83e54b8p-1, - 0x1.ffffaff99bac4p-1, - 0x1.ffffb43555b5fp-1, - 0x1.ffffb839e52f3p-1, - 0x1.ffffbc09fa7cdp-1, - 0x1.ffffbfa82616bp-1, - 0x1.ffffc316d9ed0p-1, - 0x1.ffffc6586abf6p-1, - 0x1.ffffc96f1165ep-1, - 0x1.ffffcc5cec0c1p-1, - 0x1.ffffcf23ff5fcp-1, - 0x1.ffffd1c637b2bp-1, - 0x1.ffffd4456a10dp-1, - 0x1.ffffd6a3554a1p-1, - 0x1.ffffd8e1a2f22p-1, - 0x1.ffffdb01e8546p-1, - 0x1.ffffdd05a75eap-1, - 0x1.ffffdeee4f810p-1, - 0x1.ffffe0bd3e852p-1, - 0x1.ffffe273c15b7p-1, - 0x1.ffffe41314e06p-1, - 0x1.ffffe59c6698bp-1, - 0x1.ffffe710d565ep-1, - 0x1.ffffe8717232dp-1, - 0x1.ffffe9bf4098cp-1, - 0x1.ffffeafb377d5p-1, - 0x1.ffffec2641a9ep-1, - 0x1.ffffed413e5b7p-1, - 0x1.ffffee4d01cd6p-1, - 0x1.ffffef4a55bd4p-1, - 0x1.fffff039f9e8fp-1, - 0x1.fffff11ca4876p-1, - 0x1.fffff1f302bc1p-1, - 0x1.fffff2bdb904dp-1, - 0x1.fffff37d63a36p-1, - 0x1.fffff43297019p-1, - 0x1.fffff4dde0118p-1, - 0x1.fffff57fc4a95p-1, - 0x1.fffff618c3da6p-1, - 0x1.fffff6a956450p-1, - 0x1.fffff731ee681p-1, - 0x1.fffff7b2f8ed6p-1, - 0x1.fffff82cdcf1bp-1, - 0x1.fffff89ffc4aap-1, - 0x1.fffff90cb3c81p-1, - 0x1.fffff9735b73bp-1, - 0x1.fffff9d446cccp-1, - 0x1.fffffa2fc5015p-1, - 0x1.fffffa8621251p-1, - 0x1.fffffad7a2652p-1, - 0x1.fffffb248c39dp-1, - 0x1.fffffb6d1e95dp-1, - 0x1.fffffbb196132p-1, - 0x1.fffffbf22c1e2p-1, - 0x1.fffffc2f171e3p-1, - 0x1.fffffc688a9cfp-1, - 0x1.fffffc9eb76acp-1, - 0x1.fffffcd1cbc28p-1, - 0x1.fffffd01f36afp-1, - 0x1.fffffd2f57d68p-1, - 0x1.fffffd5a2041fp-1, - 0x1.fffffd8271d12p-1, - 0x1.fffffda86faa9p-1, - 0x1.fffffdcc3b117p-1, - 0x1.fffffdedf37edp-1, - 0x1.fffffe0db6b91p-1, - 0x1.fffffe2ba0ea5p-1, - 0x1.fffffe47ccb60p-1, - 0x1.fffffe62534d4p-1, - 0x1.fffffe7b4c81ep-1, - 
0x1.fffffe92ced93p-1, - 0x1.fffffea8ef9cfp-1, - 0x1.fffffebdc2ec6p-1, - 0x1.fffffed15bcbap-1, - 0x1.fffffee3cc32cp-1, - 0x1.fffffef5251c2p-1, - 0x1.ffffff0576917p-1, - 0x1.ffffff14cfb92p-1, - 0x1.ffffff233ee1dp-1, - 0x1.ffffff30d18e8p-1, - 0x1.ffffff3d9480fp-1, - 0x1.ffffff4993c46p-1, - 0x1.ffffff54dab72p-1, - 0x1.ffffff5f74141p-1, - 0x1.ffffff6969fb8p-1, - 0x1.ffffff72c5fb6p-1, - 0x1.ffffff7b91176p-1, - 0x1.ffffff83d3d07p-1, - 0x1.ffffff8b962bep-1, - 0x1.ffffff92dfba2p-1, - 0x1.ffffff99b79d2p-1, - 0x1.ffffffa0248e8p-1, - 0x1.ffffffa62ce54p-1, - 0x1.ffffffabd69b4p-1, - 0x1.ffffffb127525p-1, - 0x1.ffffffb624592p-1, - 0x1.ffffffbad2affp-1, - 0x1.ffffffbf370cdp-1, - 0x1.ffffffc355dfdp-1, - 0x1.ffffffc733572p-1, - 0x1.ffffffcad3626p-1, - 0x1.ffffffce39b67p-1, - 0x1.ffffffd169d0cp-1, - 0x1.ffffffd466fa5p-1, - 0x1.ffffffd7344aap-1, - 0x1.ffffffd9d4aabp-1, - 0x1.ffffffdc4ad7ap-1, - 0x1.ffffffde9964ep-1, - 0x1.ffffffe0c2bf0p-1, - 0x1.ffffffe2c92dbp-1, - 0x1.ffffffe4aed5ep-1, - 0x1.ffffffe675bbdp-1, - 0x1.ffffffe81fc4ep-1, - 0x1.ffffffe9aeb97p-1, - 0x1.ffffffeb24467p-1, - 0x1.ffffffec81ff2p-1, - 0x1.ffffffedc95e7p-1, - 0x1.ffffffeefbc85p-1, - 0x1.fffffff01a8b6p-1, - 0x1.fffffff126e1ep-1, - 0x1.fffffff221f30p-1, - 0x1.fffffff30cd3fp-1, - 0x1.fffffff3e8892p-1, - 0x1.fffffff4b606fp-1, - 0x1.fffffff57632dp-1, - 0x1.fffffff629e44p-1, - 0x1.fffffff6d1e56p-1, - 0x1.fffffff76ef3fp-1, - 0x1.fffffff801c1fp-1, - 0x1.fffffff88af67p-1, - 0x1.fffffff90b2e3p-1, - 0x1.fffffff982fc1p-1, - 0x1.fffffff9f2e9fp-1, - 0x1.fffffffa5b790p-1, - 0x1.fffffffabd229p-1, - 0x1.fffffffb18582p-1, - 0x1.fffffffb6d844p-1, - 0x1.fffffffbbd0aap-1, - 0x1.fffffffc0748fp-1, - 0x1.fffffffc4c96cp-1, - 0x1.fffffffc8d462p-1, - 0x1.fffffffcc9a41p-1, - 0x1.fffffffd01f89p-1, - 0x1.fffffffd36871p-1, - 0x1.fffffffd678edp-1, - 0x1.fffffffd954aep-1, - 0x1.fffffffdbff2ap-1, - 0x1.fffffffde7ba0p-1, - 0x1.fffffffe0cd16p-1, - 0x1.fffffffe2f664p-1, - 0x1.fffffffe4fa30p-1, - 0x1.fffffffe6daf7p-1, - 0x1.fffffffe89b0cp-1, - 
0x1.fffffffea3c9ap-1, - 0x1.fffffffebc1a9p-1, - 0x1.fffffffed2c21p-1, - 0x1.fffffffee7dc8p-1, - 0x1.fffffffefb847p-1, - 0x1.ffffffff0dd2bp-1, - 0x1.ffffffff1ede9p-1, - 0x1.ffffffff2ebdap-1, - 0x1.ffffffff3d843p-1, - 0x1.ffffffff4b453p-1, - 0x1.ffffffff58126p-1, - 0x1.ffffffff63fc3p-1, - 0x1.ffffffff6f121p-1, - 0x1.ffffffff79626p-1, - 0x1.ffffffff82fabp-1, - 0x1.ffffffff8be77p-1, - 0x1.ffffffff94346p-1, - 0x1.ffffffff9bec8p-1, - 0x1.ffffffffa319fp-1, - 0x1.ffffffffa9c63p-1, - 0x1.ffffffffaffa4p-1, - 0x1.ffffffffb5be5p-1, - 0x1.ffffffffbb1a2p-1, - 0x1.ffffffffc014ep-1, - 0x1.ffffffffc4b56p-1, - 0x1.ffffffffc901cp-1, - 0x1.ffffffffccfffp-1, - 0x1.ffffffffd0b56p-1, - 0x1.ffffffffd4271p-1, - 0x1.ffffffffd759dp-1, - 0x1.ffffffffda520p-1, - 0x1.ffffffffdd13cp-1, - 0x1.ffffffffdfa2dp-1, - 0x1.ffffffffe202dp-1, - 0x1.ffffffffe4371p-1, - 0x1.ffffffffe642ap-1, - 0x1.ffffffffe8286p-1, - 0x1.ffffffffe9eb0p-1, - 0x1.ffffffffeb8d0p-1, - 0x1.ffffffffed10ap-1, - 0x1.ffffffffee782p-1, - 0x1.ffffffffefc57p-1, - 0x1.fffffffff0fa7p-1, - 0x1.fffffffff218fp-1, - 0x1.fffffffff3227p-1, - 0x1.fffffffff4188p-1, - 0x1.fffffffff4fc9p-1, - 0x1.fffffffff5cfdp-1, - 0x1.fffffffff6939p-1, - 0x1.fffffffff748ep-1, - 0x1.fffffffff7f0dp-1, - 0x1.fffffffff88c5p-1, - 0x1.fffffffff91c6p-1, - 0x1.fffffffff9a1bp-1, - 0x1.fffffffffa1d2p-1, - 0x1.fffffffffa8f6p-1, - 0x1.fffffffffaf92p-1, - 0x1.fffffffffb5b0p-1, - 0x1.fffffffffbb58p-1, - 0x1.fffffffffc095p-1, - 0x1.fffffffffc56dp-1, - 0x1.fffffffffc9e8p-1, - 0x1.fffffffffce0dp-1, - 0x1.fffffffffd1e1p-1, - 0x1.fffffffffd56cp-1, - 0x1.fffffffffd8b3p-1, - 0x1.fffffffffdbbap-1, - 0x1.fffffffffde86p-1, - 0x1.fffffffffe11dp-1, - 0x1.fffffffffe380p-1, - 0x1.fffffffffe5b6p-1, - 0x1.fffffffffe7c0p-1, - 0x1.fffffffffe9a2p-1, - 0x1.fffffffffeb60p-1, - 0x1.fffffffffecfbp-1, - 0x1.fffffffffee77p-1, - 0x1.fffffffffefd6p-1, - 0x1.ffffffffff11ap-1, - 0x1.ffffffffff245p-1, - 0x1.ffffffffff359p-1, - 0x1.ffffffffff457p-1, - 0x1.ffffffffff542p-1, - 0x1.ffffffffff61bp-1, - 
0x1.ffffffffff6e3p-1, - 0x1.ffffffffff79bp-1, - 0x1.ffffffffff845p-1, - 0x1.ffffffffff8e2p-1, - 0x1.ffffffffff973p-1, - 0x1.ffffffffff9f8p-1, - 0x1.ffffffffffa73p-1, - 0x1.ffffffffffae4p-1, - 0x1.ffffffffffb4cp-1, - 0x1.ffffffffffbadp-1, - 0x1.ffffffffffc05p-1, - 0x1.ffffffffffc57p-1, - 0x1.ffffffffffca2p-1, - 0x1.ffffffffffce7p-1, - 0x1.ffffffffffd27p-1, - 0x1.ffffffffffd62p-1, - 0x1.ffffffffffd98p-1, - 0x1.ffffffffffdcap-1, - 0x1.ffffffffffdf8p-1, - 0x1.ffffffffffe22p-1, - 0x1.ffffffffffe49p-1, - 0x1.ffffffffffe6cp-1, - 0x1.ffffffffffe8dp-1, - 0x1.ffffffffffeabp-1, - 0x1.ffffffffffec7p-1, - 0x1.ffffffffffee1p-1, - 0x1.ffffffffffef8p-1, - 0x1.fffffffffff0ep-1, - 0x1.fffffffffff22p-1, - 0x1.fffffffffff34p-1, - 0x1.fffffffffff45p-1, - 0x1.fffffffffff54p-1, - 0x1.fffffffffff62p-1, - 0x1.fffffffffff6fp-1, - 0x1.fffffffffff7bp-1, - 0x1.fffffffffff86p-1, - 0x1.fffffffffff90p-1, - 0x1.fffffffffff9ap-1, - 0x1.fffffffffffa2p-1, - 0x1.fffffffffffaap-1, - 0x1.fffffffffffb1p-1, - 0x1.fffffffffffb8p-1, - 0x1.fffffffffffbep-1, - 0x1.fffffffffffc3p-1, - 0x1.fffffffffffc8p-1, - 0x1.fffffffffffcdp-1, - 0x1.fffffffffffd1p-1, - 0x1.fffffffffffd5p-1, - 0x1.fffffffffffd9p-1, - 0x1.fffffffffffdcp-1, - 0x1.fffffffffffdfp-1, - 0x1.fffffffffffe2p-1, - 0x1.fffffffffffe4p-1, - 0x1.fffffffffffe7p-1, - 0x1.fffffffffffe9p-1, - 0x1.fffffffffffebp-1, - 0x1.fffffffffffedp-1, - 0x1.fffffffffffeep-1, - 0x1.ffffffffffff0p-1, - 0x1.ffffffffffff1p-1, - 0x1.ffffffffffff3p-1, - 0x1.ffffffffffff4p-1, - 0x1.ffffffffffff5p-1, - 0x1.ffffffffffff6p-1, - 0x1.ffffffffffff7p-1, - 0x1.ffffffffffff7p-1, - 0x1.ffffffffffff8p-1, - 0x1.ffffffffffff9p-1, - 0x1.ffffffffffff9p-1, - 0x1.ffffffffffffap-1, - 0x1.ffffffffffffbp-1, - 0x1.ffffffffffffbp-1, - 0x1.ffffffffffffbp-1, - 0x1.ffffffffffffcp-1, - 0x1.ffffffffffffcp-1, - 0x1.ffffffffffffdp-1, - 0x1.ffffffffffffdp-1, - 0x1.ffffffffffffdp-1, - 0x1.ffffffffffffdp-1, - 0x1.ffffffffffffep-1, - 0x1.ffffffffffffep-1, - 0x1.ffffffffffffep-1, - 0x1.ffffffffffffep-1, - 
0x1.ffffffffffffep-1, - 0x1.ffffffffffffep-1, - 0x1.fffffffffffffp-1, - 0x1.fffffffffffffp-1, - 0x1.fffffffffffffp-1, - 0x1.fffffffffffffp-1, - 0x1.fffffffffffffp-1, - 0x1.fffffffffffffp-1, - 0x1.fffffffffffffp-1, - 0x1.fffffffffffffp-1, - 0x1.fffffffffffffp-1, - 0x1.fffffffffffffp-1, - 0x1.fffffffffffffp-1, - 0x1.0000000000000p+0, - 0x1.0000000000000p+0, - 0x1.0000000000000p+0, - 0x1.0000000000000p+0, - 0x1.0000000000000p+0, - 0x1.0000000000000p+0, - 0x1.0000000000000p+0, - 0x1.0000000000000p+0, - 0x1.0000000000000p+0, - 0x1.0000000000000p+0, - 0x1.0000000000000p+0, - }, - .scale = { 0x1.20dd750429b6dp+0, - 0x1.20d8f1975c85dp+0, - 0x1.20cb67bd452c7p+0, - 0x1.20b4d8bac36c1p+0, - 0x1.209546ad13ccfp+0, - 0x1.206cb4897b148p+0, - 0x1.203b261cd0052p+0, - 0x1.2000a00ae3804p+0, - 0x1.1fbd27cdc72d3p+0, - 0x1.1f70c3b4f2cc7p+0, - 0x1.1f1b7ae44867fp+0, - 0x1.1ebd5552f795bp+0, - 0x1.1e565bca400d4p+0, - 0x1.1de697e413d28p+0, - 0x1.1d6e14099944ap+0, - 0x1.1cecdb718d61cp+0, - 0x1.1c62fa1e869b6p+0, - 0x1.1bd07cdd189acp+0, - 0x1.1b357141d95d5p+0, - 0x1.1a91e5a748165p+0, - 0x1.19e5e92b964abp+0, - 0x1.19318bae53a04p+0, - 0x1.1874ddcdfce24p+0, - 0x1.17aff0e56ec10p+0, - 0x1.16e2d7093cd8cp+0, - 0x1.160da304ed92fp+0, - 0x1.153068581b781p+0, - 0x1.144b3b337c90cp+0, - 0x1.135e3075d076bp+0, - 0x1.12695da8b5bdep+0, - 0x1.116cd8fd67618p+0, - 0x1.1068b94962e5ep+0, - 0x1.0f5d1602f7e41p+0, - 0x1.0e4a073dc1b91p+0, - 0x1.0d2fa5a70c168p+0, - 0x1.0c0e0a8223359p+0, - 0x1.0ae54fa490722p+0, - 0x1.09b58f724416bp+0, - 0x1.087ee4d9ad247p+0, - 0x1.07416b4fbfe7cp+0, - 0x1.05fd3ecbec297p+0, - 0x1.04b27bc403d30p+0, - 0x1.03613f2812dafp+0, - 0x1.0209a65e29545p+0, - 0x1.00abcf3e187a9p+0, - 0x1.fe8fb01a47307p-1, - 0x1.fbbbbef34b4b2p-1, - 0x1.f8dc092d58ff8p-1, - 0x1.f5f0cdaf15313p-1, - 0x1.f2fa4c16c0019p-1, - 0x1.eff8c4b1375dbp-1, - 0x1.ecec7870ebca7p-1, - 0x1.e9d5a8e4c934ep-1, - 0x1.e6b4982f158b9p-1, - 0x1.e38988fc46e72p-1, - 0x1.e054be79d3042p-1, - 0x1.dd167c4cf9d2ap-1, - 0x1.d9cf06898cdafp-1, - 
0x1.d67ea1a8b5368p-1, - 0x1.d325927fb9d89p-1, - 0x1.cfc41e36c7df9p-1, - 0x1.cc5a8a3fbea40p-1, - 0x1.c8e91c4d01368p-1, - 0x1.c5701a484ef9dp-1, - 0x1.c1efca49a5011p-1, - 0x1.be68728e29d5dp-1, - 0x1.bada596f25436p-1, - 0x1.b745c55905bf8p-1, - 0x1.b3aafcc27502ep-1, - 0x1.b00a46237d5bep-1, - 0x1.ac63e7ecc1411p-1, - 0x1.a8b8287ec6a09p-1, - 0x1.a5074e2157620p-1, - 0x1.a1519efaf889ep-1, - 0x1.9d97610879642p-1, - 0x1.99d8da149c13fp-1, - 0x1.96164fafd8de3p-1, - 0x1.925007283d7aap-1, - 0x1.8e86458169af8p-1, - 0x1.8ab94f6caa71dp-1, - 0x1.86e9694134b9ep-1, - 0x1.8316d6f48133dp-1, - 0x1.7f41dc12c9e89p-1, - 0x1.7b6abbb7aaf19p-1, - 0x1.7791b886e7403p-1, - 0x1.73b714a552763p-1, - 0x1.6fdb11b1e0c34p-1, - 0x1.6bfdf0beddaf5p-1, - 0x1.681ff24b4ab04p-1, - 0x1.6441563c665d4p-1, - 0x1.60625bd75d07bp-1, - 0x1.5c8341bb23767p-1, - 0x1.58a445da7c74cp-1, - 0x1.54c5a57629db0p-1, - 0x1.50e79d1749ac9p-1, - 0x1.4d0a6889dfd9fp-1, - 0x1.492e42d78d2c5p-1, - 0x1.4553664273d24p-1, - 0x1.417a0c4049fd0p-1, - 0x1.3da26d759aef5p-1, - 0x1.39ccc1b136d5ap-1, - 0x1.35f93fe7d1b3dp-1, - 0x1.32281e2fd1a92p-1, - 0x1.2e5991bd4cbfcp-1, - 0x1.2a8dcede3673bp-1, - 0x1.26c508f6bd0ffp-1, - 0x1.22ff727dd6f7bp-1, - 0x1.1f3d3cf9ffe5ap-1, - 0x1.1b7e98fe26217p-1, - 0x1.17c3b626c7a11p-1, - 0x1.140cc3173f007p-1, - 0x1.1059ed7740313p-1, - 0x1.0cab61f084b93p-1, - 0x1.09014c2ca74dap-1, - 0x1.055bd6d32e8d7p-1, - 0x1.01bb2b87c6968p-1, - 0x1.fc3ee5d1524b0p-2, - 0x1.f511a91a67d2ap-2, - 0x1.edeeee0959518p-2, - 0x1.e6d6ffaa65a25p-2, - 0x1.dfca26f5bbf88p-2, - 0x1.d8c8aace11e63p-2, - 0x1.d1d2cfff91594p-2, - 0x1.cae8d93f1d7b6p-2, - 0x1.c40b0729ed547p-2, - 0x1.bd3998457afdap-2, - 0x1.b674c8ffc6283p-2, - 0x1.afbcd3afe8ab6p-2, - 0x1.a911f096fbc26p-2, - 0x1.a27455e14c93cp-2, - 0x1.9be437a7de946p-2, - 0x1.9561c7f23a47bp-2, - 0x1.8eed36b886d93p-2, - 0x1.8886b1e5ecfd1p-2, - 0x1.822e655b417e6p-2, - 0x1.7be47af1f5d89p-2, - 0x1.75a91a7f4d2edp-2, - 0x1.6f7c69d7d3ef8p-2, - 0x1.695e8cd31867ep-2, - 0x1.634fa54fa285fp-2, - 0x1.5d4fd33729015p-2, - 
0x1.575f3483021c3p-2, - 0x1.517de540ce2a3p-2, - 0x1.4babff975a04cp-2, - 0x1.45e99bcbb7915p-2, - 0x1.4036d0468a7a2p-2, - 0x1.3a93b1998736cp-2, - 0x1.35005285227f1p-2, - 0x1.2f7cc3fe6f423p-2, - 0x1.2a09153529381p-2, - 0x1.24a55399ea239p-2, - 0x1.1f518ae487dc8p-2, - 0x1.1a0dc51a9934dp-2, - 0x1.14da0a961fd14p-2, - 0x1.0fb6620c550afp-2, - 0x1.0aa2d09497f2bp-2, - 0x1.059f59af7a906p-2, - 0x1.00abff4dec7a3p-2, - 0x1.f79183b101c5bp-3, - 0x1.edeb406d9c824p-3, - 0x1.e4652fadcb6b2p-3, - 0x1.daff4969c0b04p-3, - 0x1.d1b982c501370p-3, - 0x1.c893ce1dcbef7p-3, - 0x1.bf8e1b1ca2279p-3, - 0x1.b6a856c3ed54fp-3, - 0x1.ade26b7fbed95p-3, - 0x1.a53c4135a6526p-3, - 0x1.9cb5bd549b111p-3, - 0x1.944ec2e4f5630p-3, - 0x1.8c07329874652p-3, - 0x1.83deeada4d25ap-3, - 0x1.7bd5c7df3fe9cp-3, - 0x1.73eba3b5b07b7p-3, - 0x1.6c205655be71fp-3, - 0x1.6473b5b15a7a1p-3, - 0x1.5ce595c455b0ap-3, - 0x1.5575c8a468361p-3, - 0x1.4e241e912c305p-3, - 0x1.46f066040a832p-3, - 0x1.3fda6bc016994p-3, - 0x1.38e1fae1d6a9dp-3, - 0x1.3206dceef5f87p-3, - 0x1.2b48d9e5dea1cp-3, - 0x1.24a7b84d38971p-3, - 0x1.1e233d434b813p-3, - 0x1.17bb2c8d41535p-3, - 0x1.116f48a6476ccp-3, - 0x1.0b3f52ce8c383p-3, - 0x1.052b0b1a174eap-3, - 0x1.fe6460fef4680p-4, - 0x1.f2a901ccafb37p-4, - 0x1.e723726b824a9p-4, - 0x1.dbd32ac4c99b0p-4, - 0x1.d0b7a0f921e7cp-4, - 0x1.c5d0497c09e74p-4, - 0x1.bb1c972f23e50p-4, - 0x1.b09bfb7d11a83p-4, - 0x1.a64de673e8837p-4, - 0x1.9c31c6df3b1b8p-4, - 0x1.92470a61b6965p-4, - 0x1.888d1d8e510a3p-4, - 0x1.7f036c0107294p-4, - 0x1.75a96077274bap-4, - 0x1.6c7e64e7281cbp-4, - 0x1.6381e2980956bp-4, - 0x1.5ab342383d177p-4, - 0x1.5211ebf41880bp-4, - 0x1.499d478bca735p-4, - 0x1.4154bc68d75c3p-4, - 0x1.3937b1b319259p-4, - 0x1.31458e6542847p-4, - 0x1.297db960e4f63p-4, - 0x1.21df9981f8e53p-4, - 0x1.1a6a95b1e786fp-4, - 0x1.131e14fa1625dp-4, - 0x1.0bf97e95f2a64p-4, - 0x1.04fc3a0481321p-4, - 0x1.fc4b5e32d6259p-5, - 0x1.eeea8c1b1db93p-5, - 0x1.e1d4cf1e2450ap-5, - 0x1.d508f9a1ea64ep-5, - 0x1.c885df3451a07p-5, - 0x1.bc4a54a84e834p-5, - 
0x1.b055303221015p-5, - 0x1.a4a549829587ep-5, - 0x1.993979e14fffdp-5, - 0x1.8e109c4622913p-5, - 0x1.83298d717210ep-5, - 0x1.78832c03aa2b1p-5, - 0x1.6e1c5893c380bp-5, - 0x1.63f3f5c4de13bp-5, - 0x1.5a08e85af27e0p-5, - 0x1.505a174e9c929p-5, - 0x1.46e66be002240p-5, - 0x1.3dacd1a8d8ccdp-5, - 0x1.34ac36ad8dafep-5, - 0x1.2be38b6d92415p-5, - 0x1.2351c2f2d1449p-5, - 0x1.1af5d2e04f3f6p-5, - 0x1.12ceb37ff9bc3p-5, - 0x1.0adb5fcfa8c75p-5, - 0x1.031ad58d56279p-5, - 0x1.f7182a851bca2p-6, - 0x1.e85c449e377f2p-6, - 0x1.da0005e5f28dfp-6, - 0x1.cc0180af00a8bp-6, - 0x1.be5ecd2fcb5f9p-6, - 0x1.b1160991ff737p-6, - 0x1.a4255a00b9f03p-6, - 0x1.978ae8b55ce1bp-6, - 0x1.8b44e6031383ep-6, - 0x1.7f5188610ddc8p-6, - 0x1.73af0c737bb45p-6, - 0x1.685bb5134ef13p-6, - 0x1.5d55cb54cd53ap-6, - 0x1.529b9e8cf9a1ep-6, - 0x1.482b8455dc491p-6, - 0x1.3e03d891b37dep-6, - 0x1.3422fd6d12e2bp-6, - 0x1.2a875b5ffab56p-6, - 0x1.212f612dee7fbp-6, - 0x1.181983e5133ddp-6, - 0x1.0f443edc5ce49p-6, - 0x1.06ae13b0d3255p-6, - 0x1.fcab1483ea7fcp-7, - 0x1.ec72615a894c4p-7, - 0x1.dcaf3691fc448p-7, - 0x1.cd5ec93c12431p-7, - 0x1.be7e5ac24963bp-7, - 0x1.b00b38d6b3575p-7, - 0x1.a202bd6372dcep-7, - 0x1.94624e78e0fafp-7, - 0x1.87275e3a6869dp-7, - 0x1.7a4f6aca256cbp-7, - 0x1.6dd7fe3358230p-7, - 0x1.61beae53b72b7p-7, - 0x1.56011cc3b036dp-7, - 0x1.4a9cf6bda3f4cp-7, - 0x1.3f8ff5042a88ep-7, - 0x1.34d7dbc76d7e5p-7, - 0x1.2a727a89a3f14p-7, - 0x1.205dac02bd6b9p-7, - 0x1.1697560347b25p-7, - 0x1.0d1d69569b82dp-7, - 0x1.03ede1a45bfeep-7, - 0x1.f60d8aa2a88f2p-8, - 0x1.e4cc4abf7d065p-8, - 0x1.d4143a9dfe965p-8, - 0x1.c3e1a5f5c077cp-8, - 0x1.b430ecf4a83a8p-8, - 0x1.a4fe83fb9db25p-8, - 0x1.9646f35a76623p-8, - 0x1.8806d70b2fc36p-8, - 0x1.7a3ade6c8b3e4p-8, - 0x1.6cdfcbfc1e263p-8, - 0x1.5ff2750fe7820p-8, - 0x1.536fc18f7ce5cp-8, - 0x1.4754abacdf1dcp-8, - 0x1.3b9e3f9d06e3fp-8, - 0x1.30499b503957fp-8, - 0x1.2553ee2a336bfp-8, - 0x1.1aba78ba3af89p-8, - 0x1.107a8c7323a6ep-8, - 0x1.06918b6355624p-8, - 0x1.f9f9cfd9c3035p-9, - 0x1.e77448fb66bb9p-9, - 
0x1.d58da68fd1170p-9, - 0x1.c4412bf4b8f0bp-9, - 0x1.b38a3af2e55b4p-9, - 0x1.a3645330550ffp-9, - 0x1.93cb11a30d765p-9, - 0x1.84ba3004a50d0p-9, - 0x1.762d84469c18fp-9, - 0x1.6821000795a03p-9, - 0x1.5a90b00981d93p-9, - 0x1.4d78bba8ca5fdp-9, - 0x1.40d564548fad7p-9, - 0x1.34a305080681fp-9, - 0x1.28de11c5031ebp-9, - 0x1.1d83170fbf6fbp-9, - 0x1.128eb96be8798p-9, - 0x1.07fdb4dafea5fp-9, - 0x1.fb99b8b8279e1p-10, - 0x1.e7f232d9e2630p-10, - 0x1.d4fed7195d7e8p-10, - 0x1.c2b9cf7f893bfp-10, - 0x1.b11d702b3deb1p-10, - 0x1.a024365f771bdp-10, - 0x1.8fc8c794b03b5p-10, - 0x1.8005f08d6f1efp-10, - 0x1.70d6a46e07ddap-10, - 0x1.6235fbd7a4345p-10, - 0x1.541f340697987p-10, - 0x1.468dadf4080abp-10, - 0x1.397ced7af2b15p-10, - 0x1.2ce898809244ep-10, - 0x1.20cc76202c5fap-10, - 0x1.15246dda49d47p-10, - 0x1.09ec86c75d497p-10, - 0x1.fe41cd9bb4eeep-11, - 0x1.e97ba3b77f306p-11, - 0x1.d57f524723822p-11, - 0x1.c245d4b998479p-11, - 0x1.afc85e0f82e12p-11, - 0x1.9e005769dbc1dp-11, - 0x1.8ce75e9f6f8a0p-11, - 0x1.7c7744d9378f7p-11, - 0x1.6caa0d3582fe9p-11, - 0x1.5d79eb71e893bp-11, - 0x1.4ee1429bf7cc0p-11, - 0x1.40daa3c89f5b6p-11, - 0x1.3360ccd23db3ap-11, - 0x1.266ea71d4f71ap-11, - 0x1.19ff4663ae9dfp-11, - 0x1.0e0de78654d1ep-11, - 0x1.0295ef6591848p-11, - 0x1.ef25d37f49fe1p-12, - 0x1.da01102b5f851p-12, - 0x1.c5b5412dcafadp-12, - 0x1.b23a5a23e4210p-12, - 0x1.9f8893d8fd1c1p-12, - 0x1.8d986a4187285p-12, - 0x1.7c629a822bc9ep-12, - 0x1.6be02102b3520p-12, - 0x1.5c0a378c90bcap-12, - 0x1.4cda5374ea275p-12, - 0x1.3e4a23d1f4702p-12, - 0x1.30538fbb77ecdp-12, - 0x1.22f0b496539bdp-12, - 0x1.161be46ad3b50p-12, - 0x1.09cfa445b00ffp-12, - 0x1.fc0d55470cf51p-13, - 0x1.e577bbcd49935p-13, - 0x1.cfd4a5adec5bfp-13, - 0x1.bb1a9657ce465p-13, - 0x1.a740684026555p-13, - 0x1.943d4a1d1ed39p-13, - 0x1.8208bc334a6a5p-13, - 0x1.709a8db59f25cp-13, - 0x1.5feada379d8b7p-13, - 0x1.4ff207314a102p-13, - 0x1.40a8c1949f75ep-13, - 0x1.3207fb7420eb9p-13, - 0x1.2408e9ba3327fp-13, - 0x1.16a501f0e42cap-13, - 0x1.09d5f819c9e29p-13, - 
0x1.fb2b792b40a22p-14, - 0x1.e3bcf436a1a95p-14, - 0x1.cd55277c18d05p-14, - 0x1.b7e94604479dcp-14, - 0x1.a36eec00926ddp-14, - 0x1.8fdc1b2dcf7b9p-14, - 0x1.7d2737527c3f9p-14, - 0x1.6b4702d7d5849p-14, - 0x1.5a329b7d30748p-14, - 0x1.49e17724f4d41p-14, - 0x1.3a4b60ba9aa4dp-14, - 0x1.2b6875310f785p-14, - 0x1.1d312098e9dbap-14, - 0x1.0f9e1b4dd36dfp-14, - 0x1.02a8673a94691p-14, - 0x1.ec929a665b449p-15, - 0x1.d4f4b4c8e09edp-15, - 0x1.be6abbb10a5aap-15, - 0x1.a8e8cc1fadef6p-15, - 0x1.94637d5bacfdbp-15, - 0x1.80cfdc72220cfp-15, - 0x1.6e2367dc27f95p-15, - 0x1.5c540b4936fd2p-15, - 0x1.4b581b8d170fcp-15, - 0x1.3b2652b06c2b2p-15, - 0x1.2bb5cc22e5db6p-15, - 0x1.1cfe010e2052dp-15, - 0x1.0ef6c4c84a0fep-15, - 0x1.01984165a5f36p-15, - 0x1.e9b5e8d00ce76p-16, - 0x1.d16f5716c6c1ap-16, - 0x1.ba4f035d60e02p-16, - 0x1.a447b7b03f045p-16, - 0x1.8f4ccca7fc90dp-16, - 0x1.7b5223dac7336p-16, - 0x1.684c227fcacefp-16, - 0x1.562fac4329b48p-16, - 0x1.44f21e49054f2p-16, - 0x1.34894a5e24657p-16, - 0x1.24eb7254ccf83p-16, - 0x1.160f438c70913p-16, - 0x1.07ebd2a2d2844p-16, - 0x1.f4f12e9ab070ap-17, - 0x1.db5ad0b27805cp-17, - 0x1.c304efa2c6f4ep-17, - 0x1.abe09e9144b5ep-17, - 0x1.95df988e76644p-17, - 0x1.80f439b4ee04bp-17, - 0x1.6d11788a69c64p-17, - 0x1.5a2adfa0b4bc4p-17, - 0x1.4834877429b8fp-17, - 0x1.37231085c7d9ap-17, - 0x1.26eb9daed6f7ep-17, - 0x1.1783ceac28910p-17, - 0x1.08e1badf0fcedp-17, - 0x1.f5f7d88472604p-18, - 0x1.db92b5212fb8dp-18, - 0x1.c282cd3957edap-18, - 0x1.aab7abace48dcp-18, - 0x1.94219bfcb4928p-18, - 0x1.7eb1a2075864dp-18, - 0x1.6a597219a93d9p-18, - 0x1.570b69502f313p-18, - 0x1.44ba864670882p-18, - 0x1.335a62115bce2p-18, - 0x1.22df298214423p-18, - 0x1.133d96ae7e0ddp-18, - 0x1.046aeabcfcdecp-18, - 0x1.ecb9cfe1d8642p-19, - 0x1.d21397ead99cbp-19, - 0x1.b8d094c86d374p-19, - 0x1.a0df0f0c626dcp-19, - 0x1.8a2e269750a39p-19, - 0x1.74adc8f4064d3p-19, - 0x1.604ea819f007cp-19, - 0x1.4d0231928c6f9p-19, - 0x1.3aba85fe22e1fp-19, - 0x1.296a70f414053p-19, - 0x1.1905613b3abf2p-19, - 0x1.097f6156f32c5p-19, - 
0x1.f59a20caf6695p-20, - 0x1.d9c73698fb1dcp-20, - 0x1.bf716c6168baep-20, - 0x1.a6852c6b58392p-20, - 0x1.8eefd70594a88p-20, - 0x1.789fb715aae95p-20, - 0x1.6383f726a8e04p-20, - 0x1.4f8c96f26a26ap-20, - 0x1.3caa61607f920p-20, - 0x1.2acee2f5ecdb8p-20, - 0x1.19ec60b1242edp-20, - 0x1.09f5cf4dd2877p-20, - 0x1.f5bd95d8730d8p-21, - 0x1.d9371e2ff7c35p-21, - 0x1.be41de54d155ap-21, - 0x1.a4c89e08ef4f3p-21, - 0x1.8cb738399b12cp-21, - 0x1.75fa8dbc84becp-21, - 0x1.608078a70dcbcp-21, - 0x1.4c37c0394d094p-21, - 0x1.39100d5687bfep-21, - 0x1.26f9df8519bd6p-21, - 0x1.15e6827001f18p-21, - 0x1.05c803e4831c1p-21, - 0x1.ed22548cffd35p-22, - 0x1.d06ad6ecdf971p-22, - 0x1.b551c847fbc96p-22, - 0x1.9bc09f112b494p-22, - 0x1.83a1ff0aa239dp-22, - 0x1.6ce1aa3fd7bddp-22, - 0x1.576c72b514859p-22, - 0x1.43302cc4a0da8p-22, - 0x1.301ba221dc9bbp-22, - 0x1.1e1e857adc568p-22, - 0x1.0d2966b1746f7p-22, - 0x1.fa5b4f49cc6b2p-23, - 0x1.dc3ae30b55c16p-23, - 0x1.bfd7555a3bd68p-23, - 0x1.a517d9e61628ap-23, - 0x1.8be4f8f6c951fp-23, - 0x1.74287ded49339p-23, - 0x1.5dcd669f2cd34p-23, - 0x1.48bfd38302870p-23, - 0x1.34ecf8a3c124ap-23, - 0x1.22430f521cbcfp-23, - 0x1.10b1488aeb235p-23, - 0x1.0027c00a263a6p-23, - 0x1.e12ee004efc37p-24, - 0x1.c3e44ae32b16bp-24, - 0x1.a854ea14102a8p-24, - 0x1.8e6761569f45dp-24, - 0x1.7603bac345f65p-24, - 0x1.5f1353cdad001p-24, - 0x1.4980cb3c80949p-24, - 0x1.3537f00b6ad4dp-24, - 0x1.2225b12bffc68p-24, - 0x1.10380e1adb7e9p-24, - 0x1.febc107d5efaap-25, - 0x1.df0f2a0ee6946p-25, - 0x1.c14b2188bcee4p-25, - 0x1.a553644f7f07dp-25, - 0x1.8b0cfce0579dfp-25, - 0x1.725e7c5dd20f7p-25, - 0x1.5b2fe547a1340p-25, - 0x1.456a974e92e93p-25, - 0x1.30f93c3699078p-25, - 0x1.1dc7b5b978cf8p-25, - 0x1.0bc30c5d52f15p-25, - 0x1.f5b2be65a0c7fp-26, - 0x1.d5f3a8dea7357p-26, - 0x1.b82915b03515bp-26, - 0x1.9c3517e789488p-26, - 0x1.81fb7df06136ep-26, - 0x1.6961b8d641d06p-26, - 0x1.524ec4d916caep-26, - 0x1.3cab1343d18d1p-26, - 0x1.2860757487a01p-26, - 0x1.155a09065d4f7p-26, - 0x1.0384250e4c9fcp-26, - 0x1.e59890b926c78p-27, - 
0x1.c642116a8a9e3p-27, - 0x1.a8e405e651ab6p-27, - 0x1.8d5f98114f872p-27, - 0x1.7397c5a66e307p-27, - 0x1.5b71456c5a4c4p-27, - 0x1.44d26de513197p-27, - 0x1.2fa31d6371537p-27, - 0x1.1bcca373b7b43p-27, - 0x1.0939ab853339fp-27, - 0x1.efac5187b2863p-28, - 0x1.cf1e86235d0e6p-28, - 0x1.b0a68a2128babp-28, - 0x1.9423165bc4444p-28, - 0x1.7974e743dea3cp-28, - 0x1.607e9eacd1050p-28, - 0x1.4924a74dec728p-28, - 0x1.334d19e0c2160p-28, - 0x1.1edfa3c5f5ccap-28, - 0x1.0bc56f1b54701p-28, - 0x1.f3d2185e047d9p-29, - 0x1.d26cb87945e87p-29, - 0x1.b334fac4b9f99p-29, - 0x1.96076f7918d1cp-29, - 0x1.7ac2d72fc2c63p-29, - 0x1.614801550319ep-29, - 0x1.4979ac8b28926p-29, - 0x1.333c68e2d0548p-29, - 0x1.1e767bce37dd7p-29, - 0x1.0b0fc5b6d05a0p-29, - 0x1.f1e3523b41d7dp-30, - 0x1.d00de6608effep-30, - 0x1.b0778b7b3301ap-30, - 0x1.92fb04ec0f6cfp-30, - 0x1.77756ec9f78fap-30, - 0x1.5dc61922d5a06p-30, - 0x1.45ce65699ff6dp-30, - 0x1.2f71a5f159970p-30, - 0x1.1a94ff571654fp-30, - 0x1.071f4bbea09ecp-30, - 0x1.e9f1ff8ddd774p-31, - 0x1.c818223a202c7p-31, - 0x1.a887bd2b4404dp-31, - 0x1.8b1a336c5eb6bp-31, - 0x1.6fab63324088ap-31, - 0x1.56197e30205bap-31, - 0x1.3e44e45301b92p-31, - 0x1.281000bfe4c3fp-31, - 0x1.135f28f2d50b4p-31, - 0x1.00187dded5975p-31, - 0x1.dc479de0ef001p-32, - 0x1.bad4fdad3caa1p-32, - 0x1.9baed3ed27ab8p-32, - 0x1.7ead9ce4285bbp-32, - 0x1.63ac6b4edc88ep-32, - 0x1.4a88be2a6390cp-32, - 0x1.332259185f1a0p-32, - 0x1.1d5b1f3793044p-32, - 0x1.0916f04b6e18bp-32, - 0x1.ec77101de6926p-33, - 0x1.c960bf23153e0p-33, - 0x1.a8bd20fc65ef7p-33, - 0x1.8a61745ec7d1dp-33, - 0x1.6e25d0e756261p-33, - 0x1.53e4f7d1666cbp-33, - 0x1.3b7c27a7ddb0ep-33, - 0x1.24caf2c32af14p-33, - 0x1.0fb3186804d0fp-33, - 0x1.f830c0bb41fd7p-34, - 0x1.d3c0f1a91c846p-34, - 0x1.b1e5acf351d87p-34, - 0x1.92712d259ce66p-34, - 0x1.7538c60a04476p-34, - 0x1.5a14b04b47879p-34, - 0x1.40dfd87456f4cp-34, - 0x1.2977b1172b9d5p-34, - 0x1.13bc07e891491p-34, - 0x1.ff1dbb4300811p-35, - 0x1.d9a880f306bd8p-35, - 0x1.b6e45220b55e0p-35, - 0x1.96a0b33f2c4dap-35, - 
0x1.78b07e9e924acp-35, - 0x1.5ce9ab1670dd2p-35, - 0x1.4325167006bb0p-35, - 0x1.2b3e53538ff3fp-35, - 0x1.15137a7f44864p-35, - 0x1.0084ff125639dp-35, - 0x1.daeb0b7311ec7p-36, - 0x1.b7937d1c40c52p-36, - 0x1.96d082f59ab06p-36, - 0x1.7872d9fa10aadp-36, - 0x1.5c4e8e37bc7d0p-36, - 0x1.423ac0df49a40p-36, - 0x1.2a117230ad284p-36, - 0x1.13af4f04f9998p-36, - 0x1.fde703724e560p-37, - 0x1.d77f0c82e7641p-37, - 0x1.b3ee02611d7ddp-37, - 0x1.92ff33023d5bdp-37, - 0x1.7481a9e69f53fp-37, - 0x1.5847eda620959p-37, - 0x1.3e27c1fcc74bdp-37, - 0x1.25f9ee0b923dcp-37, - 0x1.0f9a0686531ffp-37, - 0x1.f5cc7718082afp-38, - 0x1.cf7e53d6a2ca5p-38, - 0x1.ac0f5f3229372p-38, - 0x1.8b498644847eap-38, - 0x1.6cfa9bcca59dcp-38, - 0x1.50f411d4fd2cdp-38, - 0x1.370ab8327af5ep-38, - 0x1.1f167f88c6b6ep-38, - 0x1.08f24085d4597p-38, - 0x1.e8f70e181d619p-39, - 0x1.c324c20e337dcp-39, - 0x1.a03261574b54ep-39, - 0x1.7fe903cdf5855p-39, - 0x1.6215c58da3450p-39, - 0x1.46897d4b69fc6p-39, - 0x1.2d1877d731b7bp-39, - 0x1.159a386b11517p-39, - 0x1.ffd27ae9393cep-40, - 0x1.d7c593130dd0bp-40, - 0x1.b2cd607c79bcfp-40, - 0x1.90ae4d3405651p-40, - 0x1.71312dd1759e2p-40, - 0x1.5422ef5d8949dp-40, - 0x1.39544b0ecc957p-40, - 0x1.20997f73e73ddp-40, - 0x1.09ca0eaacd277p-40, - 0x1.e9810295890ecp-41, - 0x1.c2b45b5aa4a1dp-41, - 0x1.9eee068fa7596p-41, - 0x1.7df2b399c10a8p-41, - 0x1.5f8b87a31bd85p-41, - 0x1.4385c96e9a2d9p-41, - 0x1.29b2933ef4cbcp-41, - 0x1.11e68a6378f8ap-41, - 0x1.f7f338086a86bp-42, - 0x1.cf8d7d9ce040ap-42, - 0x1.aa577251ae484p-42, - 0x1.8811d739efb5ep-42, - 0x1.68823e52970bep-42, - 0x1.4b72ae68e8b4cp-42, - 0x1.30b14dbe876bcp-42, - 0x1.181012ef86610p-42, - 0x1.01647ba798744p-42, - 0x1.d90e917701675p-43, - 0x1.b2a87e86d0c8ap-43, - 0x1.8f53dcb377293p-43, - 0x1.6ed2f2515e933p-43, - 0x1.50ecc9ed47f19p-43, - 0x1.356cd5ce7799ep-43, - 0x1.1c229a587ab78p-43, - 0x1.04e15ecc7f3f6p-43, - 0x1.deffc7e6a6017p-44, - 0x1.b7b040832f310p-44, - 0x1.938e021f36d76p-44, - 0x1.7258610b3b233p-44, - 0x1.53d3bfc82a909p-44, - 0x1.37c92babdc2fdp-44, - 
0x1.1e06010120f6ap-44, - 0x1.065b9616170d4p-44, - 0x1.e13dd96b3753ap-45, - 0x1.b950d32467392p-45, - 0x1.94a72263259a5p-45, - 0x1.72fd93e036cdcp-45, - 0x1.54164576929abp-45, - 0x1.37b83c521fe96p-45, - 0x1.1daf033182e96p-45, - 0x1.05ca50205d26ap-45, - 0x1.dfbb6235639fap-46, - 0x1.b7807e294781fp-46, - 0x1.9298add70a734p-46, - 0x1.70beaf9c7ffb6p-46, - 0x1.51b2cd6709222p-46, - 0x1.353a6cf7f7fffp-46, - 0x1.1b1fa8cbe84a7p-46, - 0x1.0330f0fd69921p-46, - 0x1.da81670f96f9bp-47, - 0x1.b24a16b4d09aap-47, - 0x1.8d6eeb6efdbd6p-47, - 0x1.6ba91ac734785p-47, - 0x1.4cb7966770ab5p-47, - 0x1.305e9721d0981p-47, - 0x1.1667311fff70ap-47, - 0x1.fd3de10d62855p-48, - 0x1.d1aefbcd48d0cp-48, - 0x1.a9cc93c25aca9p-48, - 0x1.85487ee3ea735p-48, - 0x1.63daf8b4b1e0cp-48, - 0x1.45421e69a6ca1p-48, - 0x1.294175802d99ap-48, - 0x1.0fa17bf41068fp-48, - 0x1.f05e82aae2bb9p-49, - 0x1.c578101b29058p-49, - 0x1.9e39dc5dd2f7cp-49, - 0x1.7a553a728bbf2p-49, - 0x1.5982008db1304p-49, - 0x1.3b7e00422e51bp-49, - 0x1.200c898d9ee3ep-49, - 0x1.06f5f7eb65a56p-49, - 0x1.e00e9148a1d25p-50, - 0x1.b623734024e92p-50, - 0x1.8fd4e01891bf8p-50, - 0x1.6cd44c7470d89p-50, - 0x1.4cd9c04158cd7p-50, - 0x1.2fa34bf5c8344p-50, - 0x1.14f4890ff2461p-50, - 0x1.f92c49dfa4df5p-51, - 0x1.ccaaea71ab0dfp-51, - 0x1.a40829f001197p-51, - 0x1.7eef13b59e96cp-51, - 0x1.5d11e1a252bf5p-51, - 0x1.3e296303b2297p-51, - 0x1.21f47009f43cep-51, - 0x1.083768c5e4541p-51, - 0x1.e1777d831265ep-52, - 0x1.b69f10b0191b5p-52, - 0x1.8f8a3a05b5b52p-52, - 0x1.6be573c40c8e7p-52, - 0x1.4b645ba991fdbp-52, - 0x1.2dc119095729fp-52, - }, -}; diff --git a/pl/math/sv_erff_data.c b/pl/math/sv_erff_data.c deleted file mode 100644 index 154d3c1888748d..00000000000000 --- a/pl/math/sv_erff_data.c +++ /dev/null @@ -1,1046 +0,0 @@ -/* - * Data for approximation of vector erff. - * - * Copyright (c) 2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "math_config.h" - -/* Lookup table used in SVE erff. 
- For each possible rounded input r (multiples of 1/128), between - r = 0.0 and r = 4.0 (513 values): - - __erff_data.erf contains the values of erf(r), - - __erff_data.scale contains the values of 2/sqrt(pi)*exp(-r^2). - Note that indices 0 and 1 are never hit by the algorithm, since lookup is - performed only for x >= 1/64-1/512. */ -const struct sv_erff_data __sv_erff_data = { - .erf = { 0x0.000000p+0, - 0x1.20dbf4p-7, - 0x1.20d770p-6, - 0x1.b137e0p-6, - 0x1.20c564p-5, - 0x1.68e5d4p-5, - 0x1.b0fafep-5, - 0x1.f902a8p-5, - 0x1.207d48p-4, - 0x1.44703ep-4, - 0x1.68591ap-4, - 0x1.8c36bep-4, - 0x1.b00812p-4, - 0x1.d3cbf8p-4, - 0x1.f7815ap-4, - 0x1.0d9390p-3, - 0x1.1f5e1ap-3, - 0x1.311fc2p-3, - 0x1.42d7fcp-3, - 0x1.548642p-3, - 0x1.662a0cp-3, - 0x1.77c2d2p-3, - 0x1.895010p-3, - 0x1.9ad142p-3, - 0x1.ac45e4p-3, - 0x1.bdad72p-3, - 0x1.cf076ep-3, - 0x1.e05354p-3, - 0x1.f190aap-3, - 0x1.015f78p-2, - 0x1.09eed6p-2, - 0x1.127632p-2, - 0x1.1af54ep-2, - 0x1.236bf0p-2, - 0x1.2bd9dcp-2, - 0x1.343ed6p-2, - 0x1.3c9aa8p-2, - 0x1.44ed18p-2, - 0x1.4d35f0p-2, - 0x1.5574f4p-2, - 0x1.5da9f4p-2, - 0x1.65d4b8p-2, - 0x1.6df50ap-2, - 0x1.760abap-2, - 0x1.7e1594p-2, - 0x1.861566p-2, - 0x1.8e0a02p-2, - 0x1.95f336p-2, - 0x1.9dd0d2p-2, - 0x1.a5a2acp-2, - 0x1.ad6896p-2, - 0x1.b52264p-2, - 0x1.bccfecp-2, - 0x1.c47104p-2, - 0x1.cc0584p-2, - 0x1.d38d44p-2, - 0x1.db081cp-2, - 0x1.e275eap-2, - 0x1.e9d68ap-2, - 0x1.f129d4p-2, - 0x1.f86faap-2, - 0x1.ffa7eap-2, - 0x1.03693ap-1, - 0x1.06f794p-1, - 0x1.0a7ef6p-1, - 0x1.0dff50p-1, - 0x1.117894p-1, - 0x1.14eab4p-1, - 0x1.1855a6p-1, - 0x1.1bb95cp-1, - 0x1.1f15ccp-1, - 0x1.226ae8p-1, - 0x1.25b8a8p-1, - 0x1.28ff02p-1, - 0x1.2c3decp-1, - 0x1.2f755cp-1, - 0x1.32a54cp-1, - 0x1.35cdb4p-1, - 0x1.38ee8ap-1, - 0x1.3c07cap-1, - 0x1.3f196ep-1, - 0x1.42236ep-1, - 0x1.4525c8p-1, - 0x1.482074p-1, - 0x1.4b1372p-1, - 0x1.4dfebap-1, - 0x1.50e24cp-1, - 0x1.53be26p-1, - 0x1.569244p-1, - 0x1.595ea6p-1, - 0x1.5c2348p-1, - 0x1.5ee02ep-1, - 0x1.619556p-1, - 0x1.6442c0p-1, - 
0x1.66e86ep-1, - 0x1.69865ep-1, - 0x1.6c1c98p-1, - 0x1.6eab18p-1, - 0x1.7131e6p-1, - 0x1.73b102p-1, - 0x1.762870p-1, - 0x1.789836p-1, - 0x1.7b0058p-1, - 0x1.7d60d8p-1, - 0x1.7fb9c0p-1, - 0x1.820b12p-1, - 0x1.8454d6p-1, - 0x1.869712p-1, - 0x1.88d1cep-1, - 0x1.8b050ep-1, - 0x1.8d30dep-1, - 0x1.8f5544p-1, - 0x1.91724ap-1, - 0x1.9387f6p-1, - 0x1.959652p-1, - 0x1.979d68p-1, - 0x1.999d42p-1, - 0x1.9b95e8p-1, - 0x1.9d8768p-1, - 0x1.9f71cap-1, - 0x1.a1551ap-1, - 0x1.a33162p-1, - 0x1.a506b0p-1, - 0x1.a6d50cp-1, - 0x1.a89c86p-1, - 0x1.aa5d26p-1, - 0x1.ac16fcp-1, - 0x1.adca14p-1, - 0x1.af767ap-1, - 0x1.b11c3cp-1, - 0x1.b2bb68p-1, - 0x1.b4540ap-1, - 0x1.b5e630p-1, - 0x1.b771e8p-1, - 0x1.b8f742p-1, - 0x1.ba764ap-1, - 0x1.bbef10p-1, - 0x1.bd61a2p-1, - 0x1.bece0ep-1, - 0x1.c03464p-1, - 0x1.c194b2p-1, - 0x1.c2ef08p-1, - 0x1.c44376p-1, - 0x1.c5920ap-1, - 0x1.c6dad2p-1, - 0x1.c81de2p-1, - 0x1.c95b46p-1, - 0x1.ca930ep-1, - 0x1.cbc54cp-1, - 0x1.ccf20cp-1, - 0x1.ce1962p-1, - 0x1.cf3b5cp-1, - 0x1.d0580cp-1, - 0x1.d16f7ep-1, - 0x1.d281c4p-1, - 0x1.d38ef0p-1, - 0x1.d49710p-1, - 0x1.d59a34p-1, - 0x1.d6986cp-1, - 0x1.d791cap-1, - 0x1.d8865ep-1, - 0x1.d97636p-1, - 0x1.da6162p-1, - 0x1.db47f4p-1, - 0x1.dc29fcp-1, - 0x1.dd0788p-1, - 0x1.dde0aap-1, - 0x1.deb570p-1, - 0x1.df85eap-1, - 0x1.e0522ap-1, - 0x1.e11a3ep-1, - 0x1.e1de36p-1, - 0x1.e29e22p-1, - 0x1.e35a12p-1, - 0x1.e41214p-1, - 0x1.e4c638p-1, - 0x1.e5768cp-1, - 0x1.e62322p-1, - 0x1.e6cc08p-1, - 0x1.e7714ap-1, - 0x1.e812fcp-1, - 0x1.e8b12ap-1, - 0x1.e94be4p-1, - 0x1.e9e336p-1, - 0x1.ea7730p-1, - 0x1.eb07e2p-1, - 0x1.eb9558p-1, - 0x1.ec1fa2p-1, - 0x1.eca6ccp-1, - 0x1.ed2ae6p-1, - 0x1.edabfcp-1, - 0x1.ee2a1ep-1, - 0x1.eea556p-1, - 0x1.ef1db4p-1, - 0x1.ef9344p-1, - 0x1.f00614p-1, - 0x1.f07630p-1, - 0x1.f0e3a6p-1, - 0x1.f14e82p-1, - 0x1.f1b6d0p-1, - 0x1.f21ca0p-1, - 0x1.f27ff8p-1, - 0x1.f2e0eap-1, - 0x1.f33f7ep-1, - 0x1.f39bc2p-1, - 0x1.f3f5c2p-1, - 0x1.f44d88p-1, - 0x1.f4a31ep-1, - 0x1.f4f694p-1, - 0x1.f547f2p-1, - 0x1.f59742p-1, - 
0x1.f5e490p-1, - 0x1.f62fe8p-1, - 0x1.f67952p-1, - 0x1.f6c0dcp-1, - 0x1.f7068cp-1, - 0x1.f74a6ep-1, - 0x1.f78c8cp-1, - 0x1.f7cceep-1, - 0x1.f80ba2p-1, - 0x1.f848acp-1, - 0x1.f8841ap-1, - 0x1.f8bdf2p-1, - 0x1.f8f63ep-1, - 0x1.f92d08p-1, - 0x1.f96256p-1, - 0x1.f99634p-1, - 0x1.f9c8a8p-1, - 0x1.f9f9bap-1, - 0x1.fa2974p-1, - 0x1.fa57dep-1, - 0x1.fa84fep-1, - 0x1.fab0dep-1, - 0x1.fadb84p-1, - 0x1.fb04f6p-1, - 0x1.fb2d40p-1, - 0x1.fb5464p-1, - 0x1.fb7a6cp-1, - 0x1.fb9f60p-1, - 0x1.fbc344p-1, - 0x1.fbe61ep-1, - 0x1.fc07fap-1, - 0x1.fc28d8p-1, - 0x1.fc48c2p-1, - 0x1.fc67bcp-1, - 0x1.fc85d0p-1, - 0x1.fca2fep-1, - 0x1.fcbf52p-1, - 0x1.fcdaccp-1, - 0x1.fcf576p-1, - 0x1.fd0f54p-1, - 0x1.fd286ap-1, - 0x1.fd40bep-1, - 0x1.fd5856p-1, - 0x1.fd6f34p-1, - 0x1.fd8562p-1, - 0x1.fd9ae2p-1, - 0x1.fdafb8p-1, - 0x1.fdc3e8p-1, - 0x1.fdd77ap-1, - 0x1.fdea6ep-1, - 0x1.fdfcccp-1, - 0x1.fe0e96p-1, - 0x1.fe1fd0p-1, - 0x1.fe3080p-1, - 0x1.fe40a6p-1, - 0x1.fe504cp-1, - 0x1.fe5f70p-1, - 0x1.fe6e18p-1, - 0x1.fe7c46p-1, - 0x1.fe8a00p-1, - 0x1.fe9748p-1, - 0x1.fea422p-1, - 0x1.feb090p-1, - 0x1.febc96p-1, - 0x1.fec836p-1, - 0x1.fed374p-1, - 0x1.fede52p-1, - 0x1.fee8d4p-1, - 0x1.fef2fep-1, - 0x1.fefccep-1, - 0x1.ff064cp-1, - 0x1.ff0f76p-1, - 0x1.ff1852p-1, - 0x1.ff20e0p-1, - 0x1.ff2924p-1, - 0x1.ff3120p-1, - 0x1.ff38d6p-1, - 0x1.ff4048p-1, - 0x1.ff4778p-1, - 0x1.ff4e68p-1, - 0x1.ff551ap-1, - 0x1.ff5b90p-1, - 0x1.ff61ccp-1, - 0x1.ff67d0p-1, - 0x1.ff6d9ep-1, - 0x1.ff7338p-1, - 0x1.ff789ep-1, - 0x1.ff7dd4p-1, - 0x1.ff82dap-1, - 0x1.ff87b2p-1, - 0x1.ff8c5cp-1, - 0x1.ff90dcp-1, - 0x1.ff9532p-1, - 0x1.ff9960p-1, - 0x1.ff9d68p-1, - 0x1.ffa14ap-1, - 0x1.ffa506p-1, - 0x1.ffa8a0p-1, - 0x1.ffac18p-1, - 0x1.ffaf6ep-1, - 0x1.ffb2a6p-1, - 0x1.ffb5bep-1, - 0x1.ffb8b8p-1, - 0x1.ffbb98p-1, - 0x1.ffbe5ap-1, - 0x1.ffc102p-1, - 0x1.ffc390p-1, - 0x1.ffc606p-1, - 0x1.ffc862p-1, - 0x1.ffcaa8p-1, - 0x1.ffccd8p-1, - 0x1.ffcef4p-1, - 0x1.ffd0fap-1, - 0x1.ffd2eap-1, - 0x1.ffd4cap-1, - 0x1.ffd696p-1, - 0x1.ffd84ep-1, - 
0x1.ffd9f8p-1, - 0x1.ffdb90p-1, - 0x1.ffdd18p-1, - 0x1.ffde90p-1, - 0x1.ffdffap-1, - 0x1.ffe154p-1, - 0x1.ffe2a2p-1, - 0x1.ffe3e2p-1, - 0x1.ffe514p-1, - 0x1.ffe63cp-1, - 0x1.ffe756p-1, - 0x1.ffe866p-1, - 0x1.ffe96ap-1, - 0x1.ffea64p-1, - 0x1.ffeb54p-1, - 0x1.ffec3ap-1, - 0x1.ffed16p-1, - 0x1.ffedeap-1, - 0x1.ffeeb4p-1, - 0x1.ffef76p-1, - 0x1.fff032p-1, - 0x1.fff0e4p-1, - 0x1.fff18ep-1, - 0x1.fff232p-1, - 0x1.fff2d0p-1, - 0x1.fff366p-1, - 0x1.fff3f6p-1, - 0x1.fff480p-1, - 0x1.fff504p-1, - 0x1.fff582p-1, - 0x1.fff5fcp-1, - 0x1.fff670p-1, - 0x1.fff6dep-1, - 0x1.fff74ap-1, - 0x1.fff7aep-1, - 0x1.fff810p-1, - 0x1.fff86cp-1, - 0x1.fff8c6p-1, - 0x1.fff91cp-1, - 0x1.fff96cp-1, - 0x1.fff9bap-1, - 0x1.fffa04p-1, - 0x1.fffa4cp-1, - 0x1.fffa90p-1, - 0x1.fffad0p-1, - 0x1.fffb0ep-1, - 0x1.fffb4ap-1, - 0x1.fffb82p-1, - 0x1.fffbb8p-1, - 0x1.fffbecp-1, - 0x1.fffc1ep-1, - 0x1.fffc4ep-1, - 0x1.fffc7ap-1, - 0x1.fffca6p-1, - 0x1.fffccep-1, - 0x1.fffcf6p-1, - 0x1.fffd1ap-1, - 0x1.fffd3ep-1, - 0x1.fffd60p-1, - 0x1.fffd80p-1, - 0x1.fffda0p-1, - 0x1.fffdbep-1, - 0x1.fffddap-1, - 0x1.fffdf4p-1, - 0x1.fffe0ep-1, - 0x1.fffe26p-1, - 0x1.fffe3ep-1, - 0x1.fffe54p-1, - 0x1.fffe68p-1, - 0x1.fffe7ep-1, - 0x1.fffe90p-1, - 0x1.fffea2p-1, - 0x1.fffeb4p-1, - 0x1.fffec4p-1, - 0x1.fffed4p-1, - 0x1.fffee4p-1, - 0x1.fffef2p-1, - 0x1.ffff00p-1, - 0x1.ffff0cp-1, - 0x1.ffff18p-1, - 0x1.ffff24p-1, - 0x1.ffff30p-1, - 0x1.ffff3ap-1, - 0x1.ffff44p-1, - 0x1.ffff4ep-1, - 0x1.ffff56p-1, - 0x1.ffff60p-1, - 0x1.ffff68p-1, - 0x1.ffff70p-1, - 0x1.ffff78p-1, - 0x1.ffff7ep-1, - 0x1.ffff84p-1, - 0x1.ffff8cp-1, - 0x1.ffff92p-1, - 0x1.ffff98p-1, - 0x1.ffff9cp-1, - 0x1.ffffa2p-1, - 0x1.ffffa6p-1, - 0x1.ffffacp-1, - 0x1.ffffb0p-1, - 0x1.ffffb4p-1, - 0x1.ffffb8p-1, - 0x1.ffffbcp-1, - 0x1.ffffc0p-1, - 0x1.ffffc4p-1, - 0x1.ffffc6p-1, - 0x1.ffffcap-1, - 0x1.ffffccp-1, - 0x1.ffffd0p-1, - 0x1.ffffd2p-1, - 0x1.ffffd4p-1, - 0x1.ffffd6p-1, - 0x1.ffffd8p-1, - 0x1.ffffdcp-1, - 0x1.ffffdep-1, - 0x1.ffffdep-1, - 0x1.ffffe0p-1, - 
0x1.ffffe2p-1, - 0x1.ffffe4p-1, - 0x1.ffffe6p-1, - 0x1.ffffe8p-1, - 0x1.ffffe8p-1, - 0x1.ffffeap-1, - 0x1.ffffeap-1, - 0x1.ffffecp-1, - 0x1.ffffeep-1, - 0x1.ffffeep-1, - 0x1.fffff0p-1, - 0x1.fffff0p-1, - 0x1.fffff2p-1, - 0x1.fffff2p-1, - 0x1.fffff2p-1, - 0x1.fffff4p-1, - 0x1.fffff4p-1, - 0x1.fffff4p-1, - 0x1.fffff6p-1, - 0x1.fffff6p-1, - 0x1.fffff6p-1, - 0x1.fffff8p-1, - 0x1.fffff8p-1, - 0x1.fffff8p-1, - 0x1.fffff8p-1, - 0x1.fffffap-1, - 0x1.fffffap-1, - 0x1.fffffap-1, - 0x1.fffffap-1, - 0x1.fffffap-1, - 0x1.fffffap-1, - 0x1.fffffcp-1, - 0x1.fffffcp-1, - 0x1.fffffcp-1, - 0x1.fffffcp-1, - 0x1.fffffcp-1, - 0x1.fffffcp-1, - 0x1.fffffcp-1, - 0x1.fffffcp-1, - 0x1.fffffep-1, - 0x1.fffffep-1, - 0x1.fffffep-1, - 0x1.fffffep-1, - 0x1.fffffep-1, - 0x1.fffffep-1, - 0x1.fffffep-1, - 0x1.fffffep-1, - 0x1.fffffep-1, - 0x1.fffffep-1, - 0x1.fffffep-1, - 0x1.fffffep-1, - 0x1.fffffep-1, - 0x1.fffffep-1, - 0x1.fffffep-1, - 0x1.fffffep-1, - 0x1.fffffep-1, - 0x1.fffffep-1, - 0x1.000000p+0, - 0x1.000000p+0, - 0x1.000000p+0, - 0x1.000000p+0, - 0x1.000000p+0, - 0x1.000000p+0, - 0x1.000000p+0, - 0x1.000000p+0, - 0x1.000000p+0, - 0x1.000000p+0, - 0x1.000000p+0, - }, - .scale = { 0x1.20dd76p+0, - 0x1.20d8f2p+0, - 0x1.20cb68p+0, - 0x1.20b4d8p+0, - 0x1.209546p+0, - 0x1.206cb4p+0, - 0x1.203b26p+0, - 0x1.2000a0p+0, - 0x1.1fbd28p+0, - 0x1.1f70c4p+0, - 0x1.1f1b7ap+0, - 0x1.1ebd56p+0, - 0x1.1e565cp+0, - 0x1.1de698p+0, - 0x1.1d6e14p+0, - 0x1.1cecdcp+0, - 0x1.1c62fap+0, - 0x1.1bd07cp+0, - 0x1.1b3572p+0, - 0x1.1a91e6p+0, - 0x1.19e5eap+0, - 0x1.19318cp+0, - 0x1.1874dep+0, - 0x1.17aff0p+0, - 0x1.16e2d8p+0, - 0x1.160da4p+0, - 0x1.153068p+0, - 0x1.144b3cp+0, - 0x1.135e30p+0, - 0x1.12695ep+0, - 0x1.116cd8p+0, - 0x1.1068bap+0, - 0x1.0f5d16p+0, - 0x1.0e4a08p+0, - 0x1.0d2fa6p+0, - 0x1.0c0e0ap+0, - 0x1.0ae550p+0, - 0x1.09b590p+0, - 0x1.087ee4p+0, - 0x1.07416cp+0, - 0x1.05fd3ep+0, - 0x1.04b27cp+0, - 0x1.036140p+0, - 0x1.0209a6p+0, - 0x1.00abd0p+0, - 0x1.fe8fb0p-1, - 0x1.fbbbbep-1, - 0x1.f8dc0ap-1, - 
0x1.f5f0cep-1, - 0x1.f2fa4cp-1, - 0x1.eff8c4p-1, - 0x1.ecec78p-1, - 0x1.e9d5a8p-1, - 0x1.e6b498p-1, - 0x1.e38988p-1, - 0x1.e054bep-1, - 0x1.dd167cp-1, - 0x1.d9cf06p-1, - 0x1.d67ea2p-1, - 0x1.d32592p-1, - 0x1.cfc41ep-1, - 0x1.cc5a8ap-1, - 0x1.c8e91cp-1, - 0x1.c5701ap-1, - 0x1.c1efcap-1, - 0x1.be6872p-1, - 0x1.bada5ap-1, - 0x1.b745c6p-1, - 0x1.b3aafcp-1, - 0x1.b00a46p-1, - 0x1.ac63e8p-1, - 0x1.a8b828p-1, - 0x1.a5074ep-1, - 0x1.a1519ep-1, - 0x1.9d9762p-1, - 0x1.99d8dap-1, - 0x1.961650p-1, - 0x1.925008p-1, - 0x1.8e8646p-1, - 0x1.8ab950p-1, - 0x1.86e96ap-1, - 0x1.8316d6p-1, - 0x1.7f41dcp-1, - 0x1.7b6abcp-1, - 0x1.7791b8p-1, - 0x1.73b714p-1, - 0x1.6fdb12p-1, - 0x1.6bfdf0p-1, - 0x1.681ff2p-1, - 0x1.644156p-1, - 0x1.60625cp-1, - 0x1.5c8342p-1, - 0x1.58a446p-1, - 0x1.54c5a6p-1, - 0x1.50e79ep-1, - 0x1.4d0a68p-1, - 0x1.492e42p-1, - 0x1.455366p-1, - 0x1.417a0cp-1, - 0x1.3da26ep-1, - 0x1.39ccc2p-1, - 0x1.35f940p-1, - 0x1.32281ep-1, - 0x1.2e5992p-1, - 0x1.2a8dcep-1, - 0x1.26c508p-1, - 0x1.22ff72p-1, - 0x1.1f3d3cp-1, - 0x1.1b7e98p-1, - 0x1.17c3b6p-1, - 0x1.140cc4p-1, - 0x1.1059eep-1, - 0x1.0cab62p-1, - 0x1.09014cp-1, - 0x1.055bd6p-1, - 0x1.01bb2cp-1, - 0x1.fc3ee6p-2, - 0x1.f511aap-2, - 0x1.edeeeep-2, - 0x1.e6d700p-2, - 0x1.dfca26p-2, - 0x1.d8c8aap-2, - 0x1.d1d2d0p-2, - 0x1.cae8dap-2, - 0x1.c40b08p-2, - 0x1.bd3998p-2, - 0x1.b674c8p-2, - 0x1.afbcd4p-2, - 0x1.a911f0p-2, - 0x1.a27456p-2, - 0x1.9be438p-2, - 0x1.9561c8p-2, - 0x1.8eed36p-2, - 0x1.8886b2p-2, - 0x1.822e66p-2, - 0x1.7be47ap-2, - 0x1.75a91ap-2, - 0x1.6f7c6ap-2, - 0x1.695e8cp-2, - 0x1.634fa6p-2, - 0x1.5d4fd4p-2, - 0x1.575f34p-2, - 0x1.517de6p-2, - 0x1.4bac00p-2, - 0x1.45e99cp-2, - 0x1.4036d0p-2, - 0x1.3a93b2p-2, - 0x1.350052p-2, - 0x1.2f7cc4p-2, - 0x1.2a0916p-2, - 0x1.24a554p-2, - 0x1.1f518ap-2, - 0x1.1a0dc6p-2, - 0x1.14da0ap-2, - 0x1.0fb662p-2, - 0x1.0aa2d0p-2, - 0x1.059f5ap-2, - 0x1.00ac00p-2, - 0x1.f79184p-3, - 0x1.edeb40p-3, - 0x1.e46530p-3, - 0x1.daff4ap-3, - 0x1.d1b982p-3, - 0x1.c893cep-3, - 0x1.bf8e1cp-3, - 
0x1.b6a856p-3, - 0x1.ade26cp-3, - 0x1.a53c42p-3, - 0x1.9cb5bep-3, - 0x1.944ec2p-3, - 0x1.8c0732p-3, - 0x1.83deeap-3, - 0x1.7bd5c8p-3, - 0x1.73eba4p-3, - 0x1.6c2056p-3, - 0x1.6473b6p-3, - 0x1.5ce596p-3, - 0x1.5575c8p-3, - 0x1.4e241ep-3, - 0x1.46f066p-3, - 0x1.3fda6cp-3, - 0x1.38e1fap-3, - 0x1.3206dcp-3, - 0x1.2b48dap-3, - 0x1.24a7b8p-3, - 0x1.1e233ep-3, - 0x1.17bb2cp-3, - 0x1.116f48p-3, - 0x1.0b3f52p-3, - 0x1.052b0cp-3, - 0x1.fe6460p-4, - 0x1.f2a902p-4, - 0x1.e72372p-4, - 0x1.dbd32ap-4, - 0x1.d0b7a0p-4, - 0x1.c5d04ap-4, - 0x1.bb1c98p-4, - 0x1.b09bfcp-4, - 0x1.a64de6p-4, - 0x1.9c31c6p-4, - 0x1.92470ap-4, - 0x1.888d1ep-4, - 0x1.7f036cp-4, - 0x1.75a960p-4, - 0x1.6c7e64p-4, - 0x1.6381e2p-4, - 0x1.5ab342p-4, - 0x1.5211ecp-4, - 0x1.499d48p-4, - 0x1.4154bcp-4, - 0x1.3937b2p-4, - 0x1.31458ep-4, - 0x1.297dbap-4, - 0x1.21df9ap-4, - 0x1.1a6a96p-4, - 0x1.131e14p-4, - 0x1.0bf97ep-4, - 0x1.04fc3ap-4, - 0x1.fc4b5ep-5, - 0x1.eeea8cp-5, - 0x1.e1d4d0p-5, - 0x1.d508fap-5, - 0x1.c885e0p-5, - 0x1.bc4a54p-5, - 0x1.b05530p-5, - 0x1.a4a54ap-5, - 0x1.99397ap-5, - 0x1.8e109cp-5, - 0x1.83298ep-5, - 0x1.78832cp-5, - 0x1.6e1c58p-5, - 0x1.63f3f6p-5, - 0x1.5a08e8p-5, - 0x1.505a18p-5, - 0x1.46e66cp-5, - 0x1.3dacd2p-5, - 0x1.34ac36p-5, - 0x1.2be38cp-5, - 0x1.2351c2p-5, - 0x1.1af5d2p-5, - 0x1.12ceb4p-5, - 0x1.0adb60p-5, - 0x1.031ad6p-5, - 0x1.f7182ap-6, - 0x1.e85c44p-6, - 0x1.da0006p-6, - 0x1.cc0180p-6, - 0x1.be5ecep-6, - 0x1.b1160ap-6, - 0x1.a4255ap-6, - 0x1.978ae8p-6, - 0x1.8b44e6p-6, - 0x1.7f5188p-6, - 0x1.73af0cp-6, - 0x1.685bb6p-6, - 0x1.5d55ccp-6, - 0x1.529b9ep-6, - 0x1.482b84p-6, - 0x1.3e03d8p-6, - 0x1.3422fep-6, - 0x1.2a875cp-6, - 0x1.212f62p-6, - 0x1.181984p-6, - 0x1.0f443ep-6, - 0x1.06ae14p-6, - 0x1.fcab14p-7, - 0x1.ec7262p-7, - 0x1.dcaf36p-7, - 0x1.cd5ecap-7, - 0x1.be7e5ap-7, - 0x1.b00b38p-7, - 0x1.a202bep-7, - 0x1.94624ep-7, - 0x1.87275ep-7, - 0x1.7a4f6ap-7, - 0x1.6dd7fep-7, - 0x1.61beaep-7, - 0x1.56011cp-7, - 0x1.4a9cf6p-7, - 0x1.3f8ff6p-7, - 0x1.34d7dcp-7, - 0x1.2a727ap-7, - 
0x1.205dacp-7, - 0x1.169756p-7, - 0x1.0d1d6ap-7, - 0x1.03ede2p-7, - 0x1.f60d8ap-8, - 0x1.e4cc4ap-8, - 0x1.d4143ap-8, - 0x1.c3e1a6p-8, - 0x1.b430ecp-8, - 0x1.a4fe84p-8, - 0x1.9646f4p-8, - 0x1.8806d8p-8, - 0x1.7a3adep-8, - 0x1.6cdfccp-8, - 0x1.5ff276p-8, - 0x1.536fc2p-8, - 0x1.4754acp-8, - 0x1.3b9e40p-8, - 0x1.30499cp-8, - 0x1.2553eep-8, - 0x1.1aba78p-8, - 0x1.107a8cp-8, - 0x1.06918cp-8, - 0x1.f9f9d0p-9, - 0x1.e77448p-9, - 0x1.d58da6p-9, - 0x1.c4412cp-9, - 0x1.b38a3ap-9, - 0x1.a36454p-9, - 0x1.93cb12p-9, - 0x1.84ba30p-9, - 0x1.762d84p-9, - 0x1.682100p-9, - 0x1.5a90b0p-9, - 0x1.4d78bcp-9, - 0x1.40d564p-9, - 0x1.34a306p-9, - 0x1.28de12p-9, - 0x1.1d8318p-9, - 0x1.128ebap-9, - 0x1.07fdb4p-9, - 0x1.fb99b8p-10, - 0x1.e7f232p-10, - 0x1.d4fed8p-10, - 0x1.c2b9d0p-10, - 0x1.b11d70p-10, - 0x1.a02436p-10, - 0x1.8fc8c8p-10, - 0x1.8005f0p-10, - 0x1.70d6a4p-10, - 0x1.6235fcp-10, - 0x1.541f34p-10, - 0x1.468daep-10, - 0x1.397ceep-10, - 0x1.2ce898p-10, - 0x1.20cc76p-10, - 0x1.15246ep-10, - 0x1.09ec86p-10, - 0x1.fe41cep-11, - 0x1.e97ba4p-11, - 0x1.d57f52p-11, - 0x1.c245d4p-11, - 0x1.afc85ep-11, - 0x1.9e0058p-11, - 0x1.8ce75ep-11, - 0x1.7c7744p-11, - 0x1.6caa0ep-11, - 0x1.5d79ecp-11, - 0x1.4ee142p-11, - 0x1.40daa4p-11, - 0x1.3360ccp-11, - 0x1.266ea8p-11, - 0x1.19ff46p-11, - 0x1.0e0de8p-11, - 0x1.0295f0p-11, - 0x1.ef25d4p-12, - 0x1.da0110p-12, - 0x1.c5b542p-12, - 0x1.b23a5ap-12, - 0x1.9f8894p-12, - 0x1.8d986ap-12, - 0x1.7c629ap-12, - 0x1.6be022p-12, - 0x1.5c0a38p-12, - 0x1.4cda54p-12, - 0x1.3e4a24p-12, - 0x1.305390p-12, - 0x1.22f0b4p-12, - 0x1.161be4p-12, - 0x1.09cfa4p-12, - 0x1.fc0d56p-13, - 0x1.e577bcp-13, - 0x1.cfd4a6p-13, - 0x1.bb1a96p-13, - 0x1.a74068p-13, - 0x1.943d4ap-13, - 0x1.8208bcp-13, - 0x1.709a8ep-13, - 0x1.5feadap-13, - 0x1.4ff208p-13, - 0x1.40a8c2p-13, - 0x1.3207fcp-13, - 0x1.2408eap-13, - 0x1.16a502p-13, - 0x1.09d5f8p-13, - 0x1.fb2b7ap-14, - 0x1.e3bcf4p-14, - 0x1.cd5528p-14, - 0x1.b7e946p-14, - 0x1.a36eecp-14, - 0x1.8fdc1cp-14, - 0x1.7d2738p-14, - 0x1.6b4702p-14, - 
0x1.5a329cp-14, - 0x1.49e178p-14, - 0x1.3a4b60p-14, - 0x1.2b6876p-14, - 0x1.1d3120p-14, - 0x1.0f9e1cp-14, - 0x1.02a868p-14, - 0x1.ec929ap-15, - 0x1.d4f4b4p-15, - 0x1.be6abcp-15, - 0x1.a8e8ccp-15, - 0x1.94637ep-15, - 0x1.80cfdcp-15, - 0x1.6e2368p-15, - 0x1.5c540cp-15, - 0x1.4b581cp-15, - 0x1.3b2652p-15, - 0x1.2bb5ccp-15, - 0x1.1cfe02p-15, - 0x1.0ef6c4p-15, - 0x1.019842p-15, - 0x1.e9b5e8p-16, - 0x1.d16f58p-16, - 0x1.ba4f04p-16, - 0x1.a447b8p-16, - 0x1.8f4cccp-16, - 0x1.7b5224p-16, - 0x1.684c22p-16, - 0x1.562facp-16, - 0x1.44f21ep-16, - 0x1.34894ap-16, - 0x1.24eb72p-16, - 0x1.160f44p-16, - 0x1.07ebd2p-16, - 0x1.f4f12ep-17, - 0x1.db5ad0p-17, - 0x1.c304f0p-17, - 0x1.abe09ep-17, - 0x1.95df98p-17, - 0x1.80f43ap-17, - 0x1.6d1178p-17, - 0x1.5a2ae0p-17, - 0x1.483488p-17, - 0x1.372310p-17, - 0x1.26eb9ep-17, - 0x1.1783cep-17, - 0x1.08e1bap-17, - 0x1.f5f7d8p-18, - 0x1.db92b6p-18, - 0x1.c282cep-18, - 0x1.aab7acp-18, - 0x1.94219cp-18, - 0x1.7eb1a2p-18, - 0x1.6a5972p-18, - 0x1.570b6ap-18, - 0x1.44ba86p-18, - 0x1.335a62p-18, - 0x1.22df2ap-18, - 0x1.133d96p-18, - 0x1.046aeap-18, - 0x1.ecb9d0p-19, - 0x1.d21398p-19, - 0x1.b8d094p-19, - 0x1.a0df10p-19, - 0x1.8a2e26p-19, - 0x1.74adc8p-19, - 0x1.604ea8p-19, - 0x1.4d0232p-19, - 0x1.3aba86p-19, - 0x1.296a70p-19, - 0x1.190562p-19, - 0x1.097f62p-19, - 0x1.f59a20p-20, - 0x1.d9c736p-20, - 0x1.bf716cp-20, - 0x1.a6852cp-20, - 0x1.8eefd8p-20, - 0x1.789fb8p-20, - 0x1.6383f8p-20, - 0x1.4f8c96p-20, - 0x1.3caa62p-20, - 0x1.2acee2p-20, - 0x1.19ec60p-20, - 0x1.09f5d0p-20, - 0x1.f5bd96p-21, - 0x1.d9371ep-21, - 0x1.be41dep-21, - 0x1.a4c89ep-21, - 0x1.8cb738p-21, - 0x1.75fa8ep-21, - 0x1.608078p-21, - 0x1.4c37c0p-21, - 0x1.39100ep-21, - 0x1.26f9e0p-21, - 0x1.15e682p-21, - 0x1.05c804p-21, - 0x1.ed2254p-22, - 0x1.d06ad6p-22, - 0x1.b551c8p-22, - 0x1.9bc0a0p-22, - 0x1.83a200p-22, - 0x1.6ce1aap-22, - 0x1.576c72p-22, - 0x1.43302cp-22, - 0x1.301ba2p-22, - 0x1.1e1e86p-22, - 0x1.0d2966p-22, - 0x1.fa5b50p-23, - 0x1.dc3ae4p-23, - 0x1.bfd756p-23, - 0x1.a517dap-23, - 
0x1.8be4f8p-23, - 0x1.74287ep-23, - 0x1.5dcd66p-23, - 0x1.48bfd4p-23, - 0x1.34ecf8p-23, - 0x1.224310p-23, - 0x1.10b148p-23, - }, -}; diff --git a/pl/math/sv_exp10f_1u5.c b/pl/math/sv_exp10f_1u5.c deleted file mode 100644 index 9ecde8f1aa528b..00000000000000 --- a/pl/math/sv_exp10f_1u5.c +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Single-precision SVE 2^x function. - * - * Copyright (c) 2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "sv_math.h" -#include "include/mathlib.h" -#include "pl_sig.h" -#include "pl_test.h" -#include "poly_sve_f32.h" - -/* For x < -SpecialBound, the result is subnormal and not handled correctly by - FEXPA. */ -#define SpecialBound 37.9 - -static const struct data -{ - float poly[5]; - float shift, log10_2, log2_10_hi, log2_10_lo, special_bound; -} data = { - /* Coefficients generated using Remez algorithm with minimisation of relative - error. - rel error: 0x1.89dafa3p-24 - abs error: 0x1.167d55p-23 in [-log10(2)/2, log10(2)/2] - maxerr: 0.52 +0.5 ulp. */ - .poly = { 0x1.26bb16p+1f, 0x1.5350d2p+1f, 0x1.04744ap+1f, 0x1.2d8176p+0f, - 0x1.12b41ap-1f }, - /* 1.5*2^17 + 127, a shift value suitable for FEXPA. */ - .shift = 0x1.903f8p17f, - .log10_2 = 0x1.a934fp+1, - .log2_10_hi = 0x1.344136p-2, - .log2_10_lo = -0x1.ec10cp-27, - .special_bound = SpecialBound, -}; - -static svfloat32_t NOINLINE -special_case (svfloat32_t x, svfloat32_t y, svbool_t special) -{ - return sv_call_f32 (exp10f, x, y, special); -} - -/* Single-precision SVE exp10f routine. Implements the same algorithm - as AdvSIMD exp10f. - Worst case error is 1.02 ULPs. - _ZGVsMxv_exp10f(-0x1.040488p-4) got 0x1.ba5f9ep-1 - want 0x1.ba5f9cp-1. */ -svfloat32_t SV_NAME_F1 (exp10) (svfloat32_t x, const svbool_t pg) -{ - const struct data *d = ptr_barrier (&data); - /* exp10(x) = 2^(n/N) * 10^r = 2^n * (1 + poly (r)), - with poly(r) in [1/sqrt(2), sqrt(2)] and - x = r + n * log10(2) / N, with r in [-log10(2)/2N, log10(2)/2N]. 
*/ - - /* Load some constants in quad-word chunks to minimise memory access (last - lane is wasted). */ - svfloat32_t log10_2_and_inv = svld1rq (svptrue_b32 (), &d->log10_2); - - /* n = round(x/(log10(2)/N)). */ - svfloat32_t shift = sv_f32 (d->shift); - svfloat32_t z = svmla_lane (shift, x, log10_2_and_inv, 0); - svfloat32_t n = svsub_x (pg, z, shift); - - /* r = x - n*log10(2)/N. */ - svfloat32_t r = svmls_lane (x, n, log10_2_and_inv, 1); - r = svmls_lane (r, n, log10_2_and_inv, 2); - - svbool_t special = svacgt (pg, x, d->special_bound); - svfloat32_t scale = svexpa (svreinterpret_u32 (z)); - - /* Polynomial evaluation: poly(r) ~ exp10(r)-1. */ - svfloat32_t r2 = svmul_x (pg, r, r); - svfloat32_t poly - = svmla_x (pg, svmul_x (pg, r, d->poly[0]), - sv_pairwise_poly_3_f32_x (pg, r, r2, d->poly + 1), r2); - - if (unlikely (svptest_any (pg, special))) - return special_case (x, svmla_x (pg, scale, scale, poly), special); - - return svmla_x (pg, scale, scale, poly); -} - -PL_SIG (SV, F, 1, exp10, -9.9, 9.9) -PL_TEST_ULP (SV_NAME_F1 (exp10), 0.52) -PL_TEST_SYM_INTERVAL (SV_NAME_F1 (exp10), 0, SpecialBound, 50000) -PL_TEST_SYM_INTERVAL (SV_NAME_F1 (exp10), SpecialBound, inf, 50000) diff --git a/pl/math/sv_exp2f_1u6.c b/pl/math/sv_exp2f_1u6.c deleted file mode 100644 index 9698ff6f068294..00000000000000 --- a/pl/math/sv_exp2f_1u6.c +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Single-precision SVE 2^x function. - * - * Copyright (c) 2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "sv_math.h" -#include "poly_sve_f32.h" -#include "pl_sig.h" -#include "pl_test.h" - -static const struct data -{ - float poly[5]; - float shift, thres; -} data = { - /* Coefficients copied from the polynomial in AdvSIMD variant, reversed for - compatibility with polynomial helpers. */ - .poly = { 0x1.62e422p-1f, 0x1.ebf9bcp-3f, 0x1.c6bd32p-5f, 0x1.3ce9e4p-7f, - 0x1.59977ap-10f }, - /* 1.5*2^17 + 127. */ - .shift = 0x1.903f8p17f, - /* Roughly 87.3. 
For x < -Thres, the result is subnormal and not handled - correctly by FEXPA. */ - .thres = 0x1.5d5e2ap+6f, -}; - -static svfloat32_t NOINLINE -special_case (svfloat32_t x, svfloat32_t y, svbool_t special) -{ - return sv_call_f32 (exp2f, x, y, special); -} - -/* Single-precision SVE exp2f routine. Implements the same algorithm - as AdvSIMD exp2f. - Worst case error is 1.04 ULPs. - SV_NAME_F1 (exp2)(0x1.943b9p-1) got 0x1.ba7eb2p+0 - want 0x1.ba7ebp+0. */ -svfloat32_t SV_NAME_F1 (exp2) (svfloat32_t x, const svbool_t pg) -{ - const struct data *d = ptr_barrier (&data); - /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] - x = n + r, with r in [-1/2, 1/2]. */ - svfloat32_t shift = sv_f32 (d->shift); - svfloat32_t z = svadd_x (pg, x, shift); - svfloat32_t n = svsub_x (pg, z, shift); - svfloat32_t r = svsub_x (pg, x, n); - - svbool_t special = svacgt (pg, x, d->thres); - svfloat32_t scale = svexpa (svreinterpret_u32 (z)); - - /* Polynomial evaluation: poly(r) ~ exp2(r)-1. - Evaluate polynomial use hybrid scheme - offset ESTRIN by 1 for - coefficients 1 to 4, and apply most significant coefficient directly. 
*/ - svfloat32_t r2 = svmul_x (pg, r, r); - svfloat32_t p14 = sv_pairwise_poly_3_f32_x (pg, r, r2, d->poly + 1); - svfloat32_t p0 = svmul_x (pg, r, d->poly[0]); - svfloat32_t poly = svmla_x (pg, p0, r2, p14); - - if (unlikely (svptest_any (pg, special))) - return special_case (x, svmla_x (pg, scale, scale, poly), special); - - return svmla_x (pg, scale, scale, poly); -} - -PL_SIG (SV, F, 1, exp2, -9.9, 9.9) -PL_TEST_ULP (SV_NAME_F1 (exp2), 0.55) -PL_TEST_INTERVAL (SV_NAME_F1 (exp2), 0, Thres, 40000) -PL_TEST_INTERVAL (SV_NAME_F1 (exp2), Thres, 1, 50000) -PL_TEST_INTERVAL (SV_NAME_F1 (exp2), 1, Thres, 50000) -PL_TEST_INTERVAL (SV_NAME_F1 (exp2), Thres, inf, 50000) -PL_TEST_INTERVAL (SV_NAME_F1 (exp2), -0, -0x1p-23, 40000) -PL_TEST_INTERVAL (SV_NAME_F1 (exp2), -0x1p-23, -1, 50000) -PL_TEST_INTERVAL (SV_NAME_F1 (exp2), -1, -0x1p23, 50000) -PL_TEST_INTERVAL (SV_NAME_F1 (exp2), -0x1p23, -inf, 50000) -PL_TEST_INTERVAL (SV_NAME_F1 (exp2), -0, ScaleThres, 40000) -PL_TEST_INTERVAL (SV_NAME_F1 (exp2), ScaleThres, -1, 50000) -PL_TEST_INTERVAL (SV_NAME_F1 (exp2), -1, ScaleThres, 50000) -PL_TEST_INTERVAL (SV_NAME_F1 (exp2), ScaleThres, -inf, 50000) diff --git a/pl/math/sv_expf_2u.c b/pl/math/sv_expf_2u.c deleted file mode 100644 index 93d705ce420a0b..00000000000000 --- a/pl/math/sv_expf_2u.c +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Single-precision vector e^x function. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "sv_math.h" -#include "pl_sig.h" -#include "pl_test.h" - -static const struct data -{ - float poly[5]; - float inv_ln2, ln2_hi, ln2_lo, shift, thres; -} data = { - /* Coefficients copied from the polynomial in AdvSIMD variant, reversed for - compatibility with polynomial helpers. */ - .poly = { 0x1.ffffecp-1f, 0x1.fffdb6p-2f, 0x1.555e66p-3f, 0x1.573e2ep-5f, - 0x1.0e4020p-7f }, - .inv_ln2 = 0x1.715476p+0f, - .ln2_hi = 0x1.62e4p-1f, - .ln2_lo = 0x1.7f7d1cp-20f, - /* 1.5*2^17 + 127. 
*/ - .shift = 0x1.903f8p17f, - /* Roughly 87.3. For x < -Thres, the result is subnormal and not handled - correctly by FEXPA. */ - .thres = 0x1.5d5e2ap+6f, -}; - -#define C(i) sv_f32 (d->poly[i]) -#define ExponentBias 0x3f800000 - -static svfloat32_t NOINLINE -special_case (svfloat32_t x, svfloat32_t y, svbool_t special) -{ - return sv_call_f32 (expf, x, y, special); -} - -/* Optimised single-precision SVE exp function. - Worst-case error is 1.04 ulp: - SV_NAME_F1 (exp)(0x1.a8eda4p+1) got 0x1.ba74bcp+4 - want 0x1.ba74bap+4. */ -svfloat32_t SV_NAME_F1 (exp) (svfloat32_t x, const svbool_t pg) -{ - const struct data *d = ptr_barrier (&data); - - /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] - x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ - - /* Load some constants in quad-word chunks to minimise memory access (last - lane is wasted). */ - svfloat32_t invln2_and_ln2 = svld1rq (svptrue_b32 (), &d->inv_ln2); - - /* n = round(x/(ln2/N)). */ - svfloat32_t z = svmla_lane (sv_f32 (d->shift), x, invln2_and_ln2, 0); - svfloat32_t n = svsub_x (pg, z, d->shift); - - /* r = x - n*ln2/N. */ - svfloat32_t r = svmls_lane (x, n, invln2_and_ln2, 1); - r = svmls_lane (r, n, invln2_and_ln2, 2); - - /* scale = 2^(n/N). */ - svbool_t is_special_case = svacgt (pg, x, d->thres); - svfloat32_t scale = svexpa (svreinterpret_u32 (z)); - - /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5 + C4 r^6. 
*/ - svfloat32_t p12 = svmla_x (pg, C (1), C (2), r); - svfloat32_t p34 = svmla_x (pg, C (3), C (4), r); - svfloat32_t r2 = svmul_x (pg, r, r); - svfloat32_t p14 = svmla_x (pg, p12, p34, r2); - svfloat32_t p0 = svmul_x (pg, r, C (0)); - svfloat32_t poly = svmla_x (pg, p0, r2, p14); - - if (unlikely (svptest_any (pg, is_special_case))) - return special_case (x, svmla_x (pg, scale, scale, poly), is_special_case); - - return svmla_x (pg, scale, scale, poly); -} - -PL_SIG (SV, F, 1, exp, -9.9, 9.9) -PL_TEST_ULP (SV_NAME_F1 (exp), 0.55) -PL_TEST_SYM_INTERVAL (SV_NAME_F1 (exp), 0, 0x1p-23, 40000) -PL_TEST_SYM_INTERVAL (SV_NAME_F1 (exp), 0x1p-23, 1, 50000) -PL_TEST_SYM_INTERVAL (SV_NAME_F1 (exp), 1, 0x1p23, 50000) -PL_TEST_SYM_INTERVAL (SV_NAME_F1 (exp), 0x1p23, inf, 50000) diff --git a/pl/math/sv_expf_inline.h b/pl/math/sv_expf_inline.h deleted file mode 100644 index 0ef4e0fda946e6..00000000000000 --- a/pl/math/sv_expf_inline.h +++ /dev/null @@ -1,66 +0,0 @@ -/* - * SVE helper for single-precision routines which calculate exp(x) and do - * not need special-case handling - * - * Copyright (c) 2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#ifndef PL_MATH_SV_EXPF_INLINE_H -#define PL_MATH_SV_EXPF_INLINE_H - -#include "sv_math.h" -#include "pl_sig.h" -#include "pl_test.h" - -struct sv_expf_data -{ - float poly[5]; - float inv_ln2, ln2_hi, ln2_lo, shift; -}; - -/* Coefficients copied from the polynomial in AdvSIMD variant, reversed for - compatibility with polynomial helpers. Shift is 1.5*2^17 + 127. 
*/ -#define SV_EXPF_DATA \ - { \ - .poly = { 0x1.ffffecp-1f, 0x1.fffdb6p-2f, 0x1.555e66p-3f, 0x1.573e2ep-5f, \ - 0x1.0e4020p-7f }, \ - \ - .inv_ln2 = 0x1.715476p+0f, .ln2_hi = 0x1.62e4p-1f, \ - .ln2_lo = 0x1.7f7d1cp-20f, .shift = 0x1.803f8p17f, \ - } - -#define C(i) sv_f32 (d->poly[i]) - -static inline svfloat32_t -expf_inline (svfloat32_t x, const svbool_t pg, const struct sv_expf_data *d) -{ - /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] - x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ - - /* Load some constants in quad-word chunks to minimise memory access. */ - svfloat32_t c4_invln2_and_ln2 = svld1rq (svptrue_b32 (), &d->poly[4]); - - /* n = round(x/(ln2/N)). */ - svfloat32_t z = svmla_lane (sv_f32 (d->shift), x, c4_invln2_and_ln2, 1); - svfloat32_t n = svsub_x (pg, z, d->shift); - - /* r = x - n*ln2/N. */ - svfloat32_t r = svmls_lane (x, n, c4_invln2_and_ln2, 2); - r = svmls_lane (r, n, c4_invln2_and_ln2, 3); - - /* scale = 2^(n/N). */ - svfloat32_t scale = svexpa (svreinterpret_u32_f32 (z)); - - /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5 + C4 r^6. */ - svfloat32_t p12 = svmla_x (pg, C (1), C (2), r); - svfloat32_t p34 = svmla_lane (C (3), r, c4_invln2_and_ln2, 0); - svfloat32_t r2 = svmul_f32_x (pg, r, r); - svfloat32_t p14 = svmla_x (pg, p12, p34, r2); - svfloat32_t p0 = svmul_f32_x (pg, r, C (0)); - svfloat32_t poly = svmla_x (pg, p0, r2, p14); - - return svmla_x (pg, scale, scale, poly); -} - -#endif // PL_MATH_SV_EXPF_INLINE_H \ No newline at end of file diff --git a/pl/math/sv_log10_2u5.c b/pl/math/sv_log10_2u5.c deleted file mode 100644 index f55e068fd442ca..00000000000000 --- a/pl/math/sv_log10_2u5.c +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Double-precision SVE log10(x) function. - * - * Copyright (c) 2022-2023, Arm Limited. 
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "sv_math.h" -#include "pl_sig.h" -#include "pl_test.h" -#include "poly_sve_f64.h" - -#define Min 0x0010000000000000 -#define Max 0x7ff0000000000000 -#define Thres 0x7fe0000000000000 /* Max - Min. */ -#define Off 0x3fe6900900000000 -#define N (1 << V_LOG10_TABLE_BITS) - -static svfloat64_t NOINLINE -special_case (svfloat64_t x, svfloat64_t y, svbool_t special) -{ - return sv_call_f64 (log10, x, y, special); -} - -/* SVE log10 algorithm. - Maximum measured error is 2.46 ulps. - SV_NAME_D1 (log10)(0x1.131956cd4b627p+0) got 0x1.fffbdf6eaa669p-6 - want 0x1.fffbdf6eaa667p-6. */ -svfloat64_t SV_NAME_D1 (log10) (svfloat64_t x, const svbool_t pg) -{ - svuint64_t ix = svreinterpret_u64 (x); - svbool_t special = svcmpge (pg, svsub_x (pg, ix, Min), Thres); - - /* x = 2^k z; where z is in range [Off,2*Off) and exact. - The range is split into N subintervals. - The ith subinterval contains z and c is near its center. */ - svuint64_t tmp = svsub_x (pg, ix, Off); - svuint64_t i = svlsr_x (pg, tmp, 51 - V_LOG10_TABLE_BITS); - i = svand_x (pg, i, (N - 1) << 1); - svfloat64_t k = svcvt_f64_x (pg, svasr_x (pg, svreinterpret_s64 (tmp), 52)); - svfloat64_t z = svreinterpret_f64 ( - svsub_x (pg, ix, svand_x (pg, tmp, 0xfffULL << 52))); - - /* log(x) = k*log(2) + log(c) + log(z/c). */ - svfloat64_t invc = svld1_gather_index (pg, &__v_log10_data.table[0].invc, i); - svfloat64_t logc - = svld1_gather_index (pg, &__v_log10_data.table[0].log10c, i); - - /* We approximate log(z/c) with a polynomial P(x) ~= log(x + 1): - r = z/c - 1 (we look up precomputed 1/c) - log(z/c) ~= P(r). */ - svfloat64_t r = svmad_x (pg, invc, z, -1.0); - - /* hi = log(c) + k*log(2). */ - svfloat64_t w = svmla_x (pg, logc, r, __v_log10_data.invln10); - svfloat64_t hi = svmla_x (pg, w, k, __v_log10_data.log10_2); - - /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. 
*/ - svfloat64_t r2 = svmul_x (pg, r, r); - svfloat64_t y = sv_pw_horner_4_f64_x (pg, r, r2, __v_log10_data.poly); - - if (unlikely (svptest_any (pg, special))) - return special_case (x, svmla_x (svnot_z (pg, special), hi, r2, y), - special); - return svmla_x (pg, hi, r2, y); -} - -PL_SIG (SV, D, 1, log10, 0.01, 11.1) -PL_TEST_ULP (SV_NAME_D1 (log10), 1.97) -PL_TEST_INTERVAL (SV_NAME_D1 (log10), -0.0, -0x1p126, 100) -PL_TEST_INTERVAL (SV_NAME_D1 (log10), 0x1p-149, 0x1p-126, 4000) -PL_TEST_INTERVAL (SV_NAME_D1 (log10), 0x1p-126, 0x1p-23, 50000) -PL_TEST_INTERVAL (SV_NAME_D1 (log10), 0x1p-23, 1.0, 50000) -PL_TEST_INTERVAL (SV_NAME_D1 (log10), 1.0, 100, 50000) -PL_TEST_INTERVAL (SV_NAME_D1 (log10), 100, inf, 50000) diff --git a/pl/math/sv_log1pf_1u3.c b/pl/math/sv_log1pf_1u3.c deleted file mode 100644 index ea1a3dbf723a8e..00000000000000 --- a/pl/math/sv_log1pf_1u3.c +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Single-precision vector log(x + 1) function. - * - * Copyright (c) 2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "sv_math.h" -#include "pl_sig.h" -#include "pl_test.h" -#include "poly_sve_f32.h" - -static const struct data -{ - float poly[8]; - float ln2, exp_bias; - uint32_t four, three_quarters; -} data = {.poly = {/* Do not store first term of polynomial, which is -0.5, as - this can be fmov-ed directly instead of including it in - the main load-and-mla polynomial schedule. */ - 0x1.5555aap-2f, -0x1.000038p-2f, 0x1.99675cp-3f, - -0x1.54ef78p-3f, 0x1.28a1f4p-3f, -0x1.0da91p-3f, - 0x1.abcb6p-4f, -0x1.6f0d5ep-5f}, - .ln2 = 0x1.62e43p-1f, - .exp_bias = 0x1p-23f, - .four = 0x40800000, - .three_quarters = 0x3f400000}; - -#define SignExponentMask 0xff800000 - -static svfloat32_t NOINLINE -special_case (svfloat32_t x, svfloat32_t y, svbool_t special) -{ - return sv_call_f32 (log1pf, x, y, special); -} - -/* Vector log1pf approximation using polynomial on reduced interval. 
Worst-case - error is 1.27 ULP very close to 0.5. - _ZGVsMxv_log1pf(0x1.fffffep-2) got 0x1.9f324p-2 - want 0x1.9f323ep-2. */ -svfloat32_t SV_NAME_F1 (log1p) (svfloat32_t x, svbool_t pg) -{ - const struct data *d = ptr_barrier (&data); - /* x < -1, Inf/Nan. */ - svbool_t special = svcmpeq (pg, svreinterpret_u32 (x), 0x7f800000); - special = svorn_z (pg, special, svcmpge (pg, x, -1)); - - /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m - is in [-0.25, 0.5]): - log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2). - - We approximate log1p(m) with a polynomial, then scale by - k*log(2). Instead of doing this directly, we use an intermediate - scale factor s = 4*k*log(2) to ensure the scale is representable - as a normalised fp32 number. */ - svfloat32_t m = svadd_x (pg, x, 1); - - /* Choose k to scale x to the range [-1/4, 1/2]. */ - svint32_t k - = svand_x (pg, svsub_x (pg, svreinterpret_s32 (m), d->three_quarters), - sv_s32 (SignExponentMask)); - - /* Scale x by exponent manipulation. */ - svfloat32_t m_scale = svreinterpret_f32 ( - svsub_x (pg, svreinterpret_u32 (x), svreinterpret_u32 (k))); - - /* Scale up to ensure that the scale factor is representable as normalised - fp32 number, and scale m down accordingly. */ - svfloat32_t s = svreinterpret_f32 (svsubr_x (pg, k, d->four)); - m_scale = svadd_x (pg, m_scale, svmla_x (pg, sv_f32 (-1), s, 0.25)); - - /* Evaluate polynomial on reduced interval. */ - svfloat32_t ms2 = svmul_x (pg, m_scale, m_scale), - ms4 = svmul_x (pg, ms2, ms2); - svfloat32_t p = sv_estrin_7_f32_x (pg, m_scale, ms2, ms4, d->poly); - p = svmad_x (pg, m_scale, p, -0.5); - p = svmla_x (pg, m_scale, m_scale, svmul_x (pg, m_scale, p)); - - /* The scale factor to be applied back at the end - by multiplying float(k) - by 2^-23 we get the unbiased exponent of k. */ - svfloat32_t scale_back = svmul_x (pg, svcvt_f32_x (pg, k), d->exp_bias); - - /* Apply the scaling back. 
*/ - svfloat32_t y = svmla_x (pg, p, scale_back, d->ln2); - - if (unlikely (svptest_any (pg, special))) - return special_case (x, y, special); - - return y; -} - -PL_SIG (SV, F, 1, log1p, -0.9, 10.0) -PL_TEST_ULP (SV_NAME_F1 (log1p), 0.77) -PL_TEST_SYM_INTERVAL (SV_NAME_F1 (log1p), 0, 0x1p-23, 5000) -PL_TEST_SYM_INTERVAL (SV_NAME_F1 (log1p), 0x1p-23, 1, 5000) -PL_TEST_INTERVAL (SV_NAME_F1 (log1p), 1, inf, 10000) -PL_TEST_INTERVAL (SV_NAME_F1 (log1p), -1, -inf, 10) diff --git a/pl/math/sv_log1pf_inline.h b/pl/math/sv_log1pf_inline.h deleted file mode 100644 index d13b094f6b5d28..00000000000000 --- a/pl/math/sv_log1pf_inline.h +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Helper for SVE routines which calculate log(1 + x) and do not - * need special-case handling - * - * Copyright (c) 2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#ifndef PL_MATH_SV_LOG1PF_INLINE_H -#define PL_MATH_SV_LOG1PF_INLINE_H - -#include "v_math.h" -#include "math_config.h" -#include "poly_sve_f32.h" - -static const struct sv_log1pf_data -{ - float32_t poly[9]; - float32_t ln2; - float32_t scale_back; -} sv_log1pf_data = { - /* Polynomial generated using FPMinimax in [-0.25, 0.5]. 
*/ - .poly = { -0x1p-1f, 0x1.5555aap-2f, -0x1.000038p-2f, 0x1.99675cp-3f, - -0x1.54ef78p-3f, 0x1.28a1f4p-3f, -0x1.0da91p-3f, 0x1.abcb6p-4f, - -0x1.6f0d5ep-5f }, - .scale_back = 0x1.0p-23f, - .ln2 = 0x1.62e43p-1f, -}; - -static inline svfloat32_t -eval_poly (svfloat32_t m, const float32_t *c, svbool_t pg) -{ - svfloat32_t p_12 = svmla_x (pg, sv_f32 (c[0]), m, sv_f32 (c[1])); - svfloat32_t m2 = svmul_x (pg, m, m); - svfloat32_t q = svmla_x (pg, m, m2, p_12); - svfloat32_t p = sv_pw_horner_6_f32_x (pg, m, m2, c + 2); - p = svmul_x (pg, m2, p); - - return svmla_x (pg, q, m2, p); -} - -static inline svfloat32_t -sv_log1pf_inline (svfloat32_t x, svbool_t pg) -{ - const struct sv_log1pf_data *d = ptr_barrier (&sv_log1pf_data); - - svfloat32_t m = svadd_x (pg, x, 1.0f); - - svint32_t ks = svsub_x (pg, svreinterpret_s32 (m), - svreinterpret_s32 (svdup_f32 (0.75f))); - ks = svand_x (pg, ks, 0xff800000); - svuint32_t k = svreinterpret_u32 (ks); - svfloat32_t s = svreinterpret_f32 ( - svsub_x (pg, svreinterpret_u32 (svdup_f32 (4.0f)), k)); - - svfloat32_t m_scale - = svreinterpret_f32 (svsub_x (pg, svreinterpret_u32 (x), k)); - m_scale - = svadd_x (pg, m_scale, svmla_x (pg, sv_f32 (-1.0f), sv_f32 (0.25f), s)); - svfloat32_t p = eval_poly (m_scale, d->poly, pg); - svfloat32_t scale_back = svmul_x (pg, svcvt_f32_x (pg, k), d->scale_back); - return svmla_x (pg, p, scale_back, d->ln2); -} - -#endif // PL_MATH_SV_LOG1PF_INLINE_H \ No newline at end of file diff --git a/pl/math/sv_log2_3u.c b/pl/math/sv_log2_3u.c deleted file mode 100644 index 0775a39cc85d60..00000000000000 --- a/pl/math/sv_log2_3u.c +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Double-precision SVE log2 function. - * - * Copyright (c) 2022-2023, Arm Limited. 
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "sv_math.h" -#include "pl_sig.h" -#include "pl_test.h" -#include "poly_sve_f64.h" - -#define N (1 << V_LOG2_TABLE_BITS) -#define Off 0x3fe6900900000000 -#define Max (0x7ff0000000000000) -#define Min (0x0010000000000000) -#define Thresh (0x7fe0000000000000) /* Max - Min. */ - -static svfloat64_t NOINLINE -special_case (svfloat64_t x, svfloat64_t y, svbool_t cmp) -{ - return sv_call_f64 (log2, x, y, cmp); -} - -/* Double-precision SVE log2 routine. - Implements the same algorithm as AdvSIMD log10, with coefficients and table - entries scaled in extended precision. - The maximum observed error is 2.58 ULP: - SV_NAME_D1 (log2)(0x1.0b556b093869bp+0) got 0x1.fffb34198d9dap-5 - want 0x1.fffb34198d9ddp-5. */ -svfloat64_t SV_NAME_D1 (log2) (svfloat64_t x, const svbool_t pg) -{ - svuint64_t ix = svreinterpret_u64 (x); - svbool_t special = svcmpge (pg, svsub_x (pg, ix, Min), Thresh); - - /* x = 2^k z; where z is in range [Off,2*Off) and exact. - The range is split into N subintervals. - The ith subinterval contains z and c is near its center. */ - svuint64_t tmp = svsub_x (pg, ix, Off); - svuint64_t i = svlsr_x (pg, tmp, 51 - V_LOG2_TABLE_BITS); - i = svand_x (pg, i, (N - 1) << 1); - svfloat64_t k = svcvt_f64_x (pg, svasr_x (pg, svreinterpret_s64 (tmp), 52)); - svfloat64_t z = svreinterpret_f64 ( - svsub_x (pg, ix, svand_x (pg, tmp, 0xfffULL << 52))); - - svfloat64_t invc = svld1_gather_index (pg, &__v_log2_data.table[0].invc, i); - svfloat64_t log2c - = svld1_gather_index (pg, &__v_log2_data.table[0].log2c, i); - - /* log2(x) = log1p(z/c-1)/log(2) + log2(c) + k. 
*/ - - svfloat64_t r = svmad_x (pg, invc, z, -1.0); - svfloat64_t w = svmla_x (pg, log2c, r, __v_log2_data.invln2); - - svfloat64_t r2 = svmul_x (pg, r, r); - svfloat64_t y = sv_pw_horner_4_f64_x (pg, r, r2, __v_log2_data.poly); - w = svadd_x (pg, k, w); - - if (unlikely (svptest_any (pg, special))) - return special_case (x, svmla_x (svnot_z (pg, special), w, r2, y), - special); - return svmla_x (pg, w, r2, y); -} - -PL_SIG (SV, D, 1, log2, 0.01, 11.1) -PL_TEST_ULP (SV_NAME_D1 (log2), 2.09) -PL_TEST_EXPECT_FENV_ALWAYS (SV_NAME_D1 (log2)) -PL_TEST_INTERVAL (SV_NAME_D1 (log2), -0.0, -0x1p126, 1000) -PL_TEST_INTERVAL (SV_NAME_D1 (log2), 0.0, 0x1p-126, 4000) -PL_TEST_INTERVAL (SV_NAME_D1 (log2), 0x1p-126, 0x1p-23, 50000) -PL_TEST_INTERVAL (SV_NAME_D1 (log2), 0x1p-23, 1.0, 50000) -PL_TEST_INTERVAL (SV_NAME_D1 (log2), 1.0, 100, 50000) -PL_TEST_INTERVAL (SV_NAME_D1 (log2), 100, inf, 50000) diff --git a/pl/math/sv_log_2u5.c b/pl/math/sv_log_2u5.c deleted file mode 100644 index 2530c9e3f62cef..00000000000000 --- a/pl/math/sv_log_2u5.c +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Double-precision SVE log(x) function. - * - * Copyright (c) 2020-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "sv_math.h" -#include "pl_sig.h" -#include "pl_test.h" - -#define P(i) sv_f64 (__v_log_data.poly[i]) -#define N (1 << V_LOG_TABLE_BITS) -#define Off (0x3fe6900900000000) -#define MaxTop (0x7ff) -#define MinTop (0x001) -#define ThreshTop (0x7fe) /* MaxTop - MinTop. */ - -static svfloat64_t NOINLINE -special_case (svfloat64_t x, svfloat64_t y, svbool_t cmp) -{ - return sv_call_f64 (log, x, y, cmp); -} - -/* SVE port of AdvSIMD log algorithm. - Maximum measured error is 2.17 ulp: - SV_NAME_D1 (log)(0x1.a6129884398a3p+0) got 0x1.ffffff1cca043p-2 - want 0x1.ffffff1cca045p-2. 
*/ -svfloat64_t SV_NAME_D1 (log) (svfloat64_t x, const svbool_t pg) -{ - svuint64_t ix = svreinterpret_u64 (x); - svuint64_t top = svlsr_x (pg, ix, 52); - svbool_t cmp = svcmpge (pg, svsub_x (pg, top, MinTop), sv_u64 (ThreshTop)); - - /* x = 2^k z; where z is in range [Off,2*Off) and exact. - The range is split into N subintervals. - The ith subinterval contains z and c is near its center. */ - svuint64_t tmp = svsub_x (pg, ix, Off); - /* Calculate table index = (tmp >> (52 - V_LOG_TABLE_BITS)) % N. - The actual value of i is double this due to table layout. */ - svuint64_t i - = svand_x (pg, svlsr_x (pg, tmp, (51 - V_LOG_TABLE_BITS)), (N - 1) << 1); - svint64_t k - = svasr_x (pg, svreinterpret_s64 (tmp), 52); /* Arithmetic shift. */ - svuint64_t iz = svsub_x (pg, ix, svand_x (pg, tmp, 0xfffULL << 52)); - svfloat64_t z = svreinterpret_f64 (iz); - /* Lookup in 2 global lists (length N). */ - svfloat64_t invc = svld1_gather_index (pg, &__v_log_data.table[0].invc, i); - svfloat64_t logc = svld1_gather_index (pg, &__v_log_data.table[0].logc, i); - - /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */ - svfloat64_t r = svmad_x (pg, invc, z, -1); - svfloat64_t kd = svcvt_f64_x (pg, k); - /* hi = r + log(c) + k*Ln2. */ - svfloat64_t hi = svmla_x (pg, svadd_x (pg, logc, r), kd, __v_log_data.ln2); - /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. 
*/ - svfloat64_t r2 = svmul_x (pg, r, r); - svfloat64_t y = svmla_x (pg, P (2), r, P (3)); - svfloat64_t p = svmla_x (pg, P (0), r, P (1)); - y = svmla_x (pg, y, r2, P (4)); - y = svmla_x (pg, p, r2, y); - - if (unlikely (svptest_any (pg, cmp))) - return special_case (x, svmla_x (svnot_z (pg, cmp), hi, r2, y), cmp); - return svmla_x (pg, hi, r2, y); -} - -PL_SIG (SV, D, 1, log, 0.01, 11.1) -PL_TEST_ULP (SV_NAME_D1 (log), 1.68) -PL_TEST_INTERVAL (SV_NAME_D1 (log), -0.0, -inf, 1000) -PL_TEST_INTERVAL (SV_NAME_D1 (log), 0, 0x1p-149, 1000) -PL_TEST_INTERVAL (SV_NAME_D1 (log), 0x1p-149, 0x1p-126, 4000) -PL_TEST_INTERVAL (SV_NAME_D1 (log), 0x1p-126, 0x1p-23, 50000) -PL_TEST_INTERVAL (SV_NAME_D1 (log), 0x1p-23, 1.0, 50000) -PL_TEST_INTERVAL (SV_NAME_D1 (log), 1.0, 100, 50000) -PL_TEST_INTERVAL (SV_NAME_D1 (log), 100, inf, 50000) diff --git a/pl/math/sv_tan_3u5.c b/pl/math/sv_tan_3u5.c deleted file mode 100644 index 746396e98a1024..00000000000000 --- a/pl/math/sv_tan_3u5.c +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Double-precision SVE tan(x) function. - * - * Copyright (c) 2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "sv_math.h" -#include "poly_sve_f64.h" -#include "pl_sig.h" -#include "pl_test.h" - -static const struct data -{ - double poly[9]; - double half_pi_hi, half_pi_lo, inv_half_pi, range_val, shift; -} data = { - /* Polynomial generated with FPMinimax. 
*/ - .poly = { 0x1.5555555555556p-2, 0x1.1111111110a63p-3, 0x1.ba1ba1bb46414p-5, - 0x1.664f47e5b5445p-6, 0x1.226e5e5ecdfa3p-7, 0x1.d6c7ddbf87047p-9, - 0x1.7ea75d05b583ep-10, 0x1.289f22964a03cp-11, - 0x1.4e4fd14147622p-12, }, - .half_pi_hi = 0x1.921fb54442d18p0, - .half_pi_lo = 0x1.1a62633145c07p-54, - .inv_half_pi = 0x1.45f306dc9c883p-1, - .range_val = 0x1p23, - .shift = 0x1.8p52, -}; - -static svfloat64_t NOINLINE -special_case (svfloat64_t x, svfloat64_t y, svbool_t special) -{ - return sv_call_f64 (tan, x, y, special); -} - -/* Vector approximation for double-precision tan. - Maximum measured error is 3.48 ULP: - _ZGVsMxv_tan(0x1.4457047ef78d8p+20) got -0x1.f6ccd8ecf7dedp+37 - want -0x1.f6ccd8ecf7deap+37. */ -svfloat64_t SV_NAME_D1 (tan) (svfloat64_t x, svbool_t pg) -{ - const struct data *dat = ptr_barrier (&data); - - /* Invert condition to catch NaNs and Infs as well as large values. */ - svbool_t special = svnot_z (pg, svaclt (pg, x, dat->range_val)); - - /* q = nearest integer to 2 * x / pi. */ - svfloat64_t shift = sv_f64 (dat->shift); - svfloat64_t q = svmla_x (pg, shift, x, dat->inv_half_pi); - q = svsub_x (pg, q, shift); - svint64_t qi = svcvt_s64_x (pg, q); - - /* Use q to reduce x to r in [-pi/4, pi/4], by: - r = x - q * pi/2, in extended precision. */ - svfloat64_t r = x; - svfloat64_t half_pi = svld1rq (svptrue_b64 (), &dat->half_pi_hi); - r = svmls_lane (r, q, half_pi, 0); - r = svmls_lane (r, q, half_pi, 1); - /* Further reduce r to [-pi/8, pi/8], to be reconstructed using double angle - formula. */ - r = svmul_x (pg, r, 0.5); - - /* Approximate tan(r) using order 8 polynomial. - tan(x) is odd, so polynomial has the form: - tan(x) ~= x + C0 * x^3 + C1 * x^5 + C3 * x^7 + ... - Hence we first approximate P(r) = C1 + C2 * r^2 + C3 * r^4 + ... - Then compute the approximation by: - tan(r) ~= r + r^3 * (C0 + r^2 * P(r)). 
*/ - svfloat64_t r2 = svmul_x (pg, r, r); - svfloat64_t r4 = svmul_x (pg, r2, r2); - svfloat64_t r8 = svmul_x (pg, r4, r4); - /* Use offset version coeff array by 1 to evaluate from C1 onwards. */ - svfloat64_t p = sv_estrin_7_f64_x (pg, r2, r4, r8, dat->poly + 1); - p = svmad_x (pg, p, r2, dat->poly[0]); - p = svmla_x (pg, r, r2, svmul_x (pg, p, r)); - - /* Recombination uses double-angle formula: - tan(2x) = 2 * tan(x) / (1 - (tan(x))^2) - and reciprocity around pi/2: - tan(x) = 1 / (tan(pi/2 - x)) - to assemble result using change-of-sign and conditional selection of - numerator/denominator dependent on odd/even-ness of q (hence quadrant). */ - svbool_t use_recip - = svcmpeq (pg, svand_x (pg, svreinterpret_u64 (qi), 1), 0); - - svfloat64_t n = svmad_x (pg, p, p, -1); - svfloat64_t d = svmul_x (pg, p, 2); - svfloat64_t swap = n; - n = svneg_m (n, use_recip, d); - d = svsel (use_recip, swap, d); - if (unlikely (svptest_any (pg, special))) - return special_case (x, svdiv_x (svnot_z (pg, special), n, d), special); - return svdiv_x (pg, n, d); -} - -PL_SIG (SV, D, 1, tan, -3.1, 3.1) -PL_TEST_ULP (SV_NAME_D1 (tan), 2.99) -PL_TEST_SYM_INTERVAL (SV_NAME_D1 (tan), 0, 0x1p23, 500000) -PL_TEST_SYM_INTERVAL (SV_NAME_D1 (tan), 0x1p23, inf, 5000) diff --git a/pl/math/sv_tanhf_2u6.c b/pl/math/sv_tanhf_2u6.c deleted file mode 100644 index 988a56de0b2e59..00000000000000 --- a/pl/math/sv_tanhf_2u6.c +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Single-precision SVE tanh(x) function. - * - * Copyright (c) 2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "sv_math.h" -#include "pl_sig.h" -#include "pl_test.h" - -#include "sv_expm1f_inline.h" - -static const struct data -{ - struct sv_expm1f_data expm1f_consts; - uint32_t boring_bound, onef; -} data = { - .expm1f_consts = SV_EXPM1F_DATA, - /* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for negative). 
*/ - .boring_bound = 0x41102cb3, - .onef = 0x3f800000, -}; - -static svfloat32_t NOINLINE -special_case (svfloat32_t x, svfloat32_t y, svbool_t special) -{ - return sv_call_f32 (tanhf, x, y, special); -} - -/* Approximation for single-precision SVE tanh(x), using a simplified - version of expm1f. The maximum error is 2.57 ULP: - _ZGVsMxv_tanhf (0x1.fc1832p-5) got 0x1.fb71a4p-5 - want 0x1.fb71aap-5. */ -svfloat32_t SV_NAME_F1 (tanh) (svfloat32_t x, const svbool_t pg) -{ - const struct data *d = ptr_barrier (&data); - - svfloat32_t ax = svabs_x (pg, x); - svuint32_t iax = svreinterpret_u32 (ax); - svuint32_t sign = sveor_x (pg, svreinterpret_u32 (x), iax); - svbool_t is_boring = svcmpgt (pg, iax, d->boring_bound); - svfloat32_t boring = svreinterpret_f32 (svorr_x (pg, sign, d->onef)); - - svbool_t special = svcmpgt (pg, iax, 0x7f800000); - - /* tanh(x) = (e^2x - 1) / (e^2x + 1). */ - svfloat32_t q = expm1f_inline (svmul_x (pg, x, 2.0), pg, &d->expm1f_consts); - svfloat32_t y = svdiv_x (pg, q, svadd_x (pg, q, 2.0)); - if (unlikely (svptest_any (pg, special))) - return special_case (x, svsel_f32 (is_boring, boring, y), special); - return svsel_f32 (is_boring, boring, y); -} - -PL_SIG (SV, F, 1, tanh, -10.0, 10.0) -PL_TEST_ULP (SV_NAME_F1 (tanh), 2.07) -PL_TEST_SYM_INTERVAL (SV_NAME_F1 (tanh), 0, 0x1p-23, 1000) -PL_TEST_SYM_INTERVAL (SV_NAME_F1 (tanh), 0x1p-23, 0x1.205966p+3, 100000) -PL_TEST_SYM_INTERVAL (SV_NAME_F1 (tanh), 0x1.205966p+3, inf, 100) diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h deleted file mode 100644 index f2710a979d4097..00000000000000 --- a/pl/math/test/mathbench_funcs.h +++ /dev/null @@ -1,87 +0,0 @@ -// clang-format off -/* - * Function entries for mathbench. - * - * Copyright (c) 2022-2023, Arm Limited. 
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#define _ZSF1(fun, a, b) F(fun##f, a, b) -#define _ZSD1(f, a, b) D(f, a, b) - -#if defined(__vpcs) && __aarch64__ - -#define _ZVF1(fun, a, b) VNF(_ZGVnN4v_##fun##f, a, b) -#define _ZVD1(f, a, b) VND(_ZGVnN2v_##f, a, b) - -#else - -#define _ZVF1(f, a, b) -#define _ZVD1(f, a, b) - -#endif - -#if WANT_SVE_MATH - -#define _ZSVF1(fun, a, b) SVF(_ZGVsMxv_##fun##f, a, b) -#define _ZSVD1(f, a, b) SVD(_ZGVsMxv_##f, a, b) - -#else - -#define _ZSVF1(f, a, b) -#define _ZSVD1(f, a, b) - -#endif - -/* No auto-generated wrappers for binary functions - they have be - manually defined in mathbench_wrappers.h. We have to define silent - macros for them anyway as they will be emitted by PL_SIG. */ -#define _ZSF2(...) -#define _ZSD2(...) -#define _ZVF2(...) -#define _ZVD2(...) -#define _ZSVF2(...) -#define _ZSVD2(...) - -#include "mathbench_funcs_gen.h" - -/* PL_SIG only emits entries for unary functions, since if a function - needs to be wrapped in mathbench there is no way for it to know the - same of the wrapper. Add entries for binary functions, or any other - exotic signatures that need wrapping, below. 
*/ - -{"atan2f", 'f', 0, -10.0, 10.0, {.f = atan2f_wrap}}, -{"atan2", 'd', 0, -10.0, 10.0, {.d = atan2_wrap}}, -{"powi", 'd', 0, 0.01, 11.1, {.d = powi_wrap}}, - -{"_ZGVnN4vv_atan2f", 'f', 'n', -10.0, 10.0, {.vnf = _Z_atan2f_wrap}}, -{"_ZGVnN2vv_atan2", 'd', 'n', -10.0, 10.0, {.vnd = _Z_atan2_wrap}}, -{"_ZGVnN4vv_hypotf", 'f', 'n', -10.0, 10.0, {.vnf = _Z_hypotf_wrap}}, -{"_ZGVnN2vv_hypot", 'd', 'n', -10.0, 10.0, {.vnd = _Z_hypot_wrap}}, -{"_ZGVnN2vv_pow", 'd', 'n', -10.0, 10.0, {.vnd = xy_Z_pow}}, -{"x_ZGVnN2vv_pow", 'd', 'n', -10.0, 10.0, {.vnd = x_Z_pow}}, -{"y_ZGVnN2vv_pow", 'd', 'n', -10.0, 10.0, {.vnd = y_Z_pow}}, -{"_ZGVnN4vl4l4_sincosf", 'f', 'n', -3.1, 3.1, {.vnf = _Z_sincosf_wrap}}, -{"_ZGVnN2vl8l8_sincos", 'd', 'n', -3.1, 3.1, {.vnd = _Z_sincos_wrap}}, -{"_ZGVnN4v_cexpif", 'f', 'n', -3.1, 3.1, {.vnf = _Z_cexpif_wrap}}, -{"_ZGVnN2v_cexpi", 'd', 'n', -3.1, 3.1, {.vnd = _Z_cexpi_wrap}}, - -#if WANT_SVE_MATH -{"_ZGVsMxvv_atan2f", 'f', 's', -10.0, 10.0, {.svf = _Z_sv_atan2f_wrap}}, -{"_ZGVsMxvv_atan2", 'd', 's', -10.0, 10.0, {.svd = _Z_sv_atan2_wrap}}, -{"_ZGVsMxvv_hypotf", 'f', 's', -10.0, 10.0, {.svf = _Z_sv_hypotf_wrap}}, -{"_ZGVsMxvv_hypot", 'd', 's', -10.0, 10.0, {.svd = _Z_sv_hypot_wrap}}, -{"_ZGVsMxvv_powi", 'f', 's', -10.0, 10.0, {.svf = _Z_sv_powi_wrap}}, -{"_ZGVsMxvv_powk", 'd', 's', -10.0, 10.0, {.svd = _Z_sv_powk_wrap}}, -{"_ZGVsMxvv_powf", 'f', 's', -10.0, 10.0, {.svf = xy_Z_sv_powf}}, -{"x_ZGVsMxvv_powf", 'f', 's', -10.0, 10.0, {.svf = x_Z_sv_powf}}, -{"y_ZGVsMxvv_powf", 'f', 's', -10.0, 10.0, {.svf = y_Z_sv_powf}}, -{"_ZGVsMxvv_pow", 'd', 's', -10.0, 10.0, {.svd = xy_Z_sv_pow}}, -{"x_ZGVsMxvv_pow", 'd', 's', -10.0, 10.0, {.svd = x_Z_sv_pow}}, -{"y_ZGVsMxvv_pow", 'd', 's', -10.0, 10.0, {.svd = y_Z_sv_pow}}, -{"_ZGVsMxvl4l4_sincosf", 'f', 's', -3.1, 3.1, {.svf = _Z_sv_sincosf_wrap}}, -{"_ZGVsMxvl8l8_sincos", 'd', 's', -3.1, 3.1, {.svd = _Z_sv_sincos_wrap}}, -{"_ZGVsMxv_cexpif", 'f', 's', -3.1, 3.1, {.svf = _Z_sv_cexpif_wrap}}, -{"_ZGVsMxv_cexpi", 
'd', 's', -3.1, 3.1, {.svd = _Z_sv_cexpi_wrap}}, -#endif - // clang-format on diff --git a/pl/math/test/mathbench_wrappers.h b/pl/math/test/mathbench_wrappers.h deleted file mode 100644 index fe7f8963cdeee5..00000000000000 --- a/pl/math/test/mathbench_wrappers.h +++ /dev/null @@ -1,206 +0,0 @@ -/* - * Function wrappers for mathbench. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -static double -atan2_wrap (double x) -{ - return atan2 (5.0, x); -} - -static float -atan2f_wrap (float x) -{ - return atan2f (5.0f, x); -} - -static double -powi_wrap (double x) -{ - return __builtin_powi (x, (int) round (x)); -} - -#if __aarch64__ && defined(__vpcs) - -__vpcs static v_double -_Z_atan2_wrap (v_double x) -{ - return _ZGVnN2vv_atan2 (v_double_dup (5.0), x); -} - -__vpcs static v_float -_Z_atan2f_wrap (v_float x) -{ - return _ZGVnN4vv_atan2f (v_float_dup (5.0f), x); -} - -__vpcs static v_float -_Z_hypotf_wrap (v_float x) -{ - return _ZGVnN4vv_hypotf (v_float_dup (5.0f), x); -} - -__vpcs static v_double -_Z_hypot_wrap (v_double x) -{ - return _ZGVnN2vv_hypot (v_double_dup (5.0), x); -} - -__vpcs static v_double -xy_Z_pow (v_double x) -{ - return _ZGVnN2vv_pow (x, x); -} - -__vpcs static v_double -x_Z_pow (v_double x) -{ - return _ZGVnN2vv_pow (x, v_double_dup (23.4)); -} - -__vpcs static v_double -y_Z_pow (v_double x) -{ - return _ZGVnN2vv_pow (v_double_dup (2.34), x); -} - -__vpcs static v_float -_Z_sincosf_wrap (v_float x) -{ - v_float s, c; - _ZGVnN4vl4l4_sincosf (x, &s, &c); - return s + c; -} - -__vpcs static v_float -_Z_cexpif_wrap (v_float x) -{ - __f32x4x2_t sc = _ZGVnN4v_cexpif (x); - return sc.val[0] + sc.val[1]; -} - -__vpcs static v_double -_Z_sincos_wrap (v_double x) -{ - v_double s, c; - _ZGVnN2vl8l8_sincos (x, &s, &c); - return s + c; -} - -__vpcs static v_double -_Z_cexpi_wrap (v_double x) -{ - __f64x2x2_t sc = _ZGVnN2v_cexpi (x); - return sc.val[0] + sc.val[1]; -} - -#endif // 
__arch64__ && __vpcs - -#if WANT_SVE_MATH - -static sv_float -_Z_sv_atan2f_wrap (sv_float x, sv_bool pg) -{ - return _ZGVsMxvv_atan2f (x, svdup_f32 (5.0f), pg); -} - -static sv_double -_Z_sv_atan2_wrap (sv_double x, sv_bool pg) -{ - return _ZGVsMxvv_atan2 (x, svdup_f64 (5.0), pg); -} - -static sv_float -_Z_sv_hypotf_wrap (sv_float x, sv_bool pg) -{ - return _ZGVsMxvv_hypotf (x, svdup_f32 (5.0), pg); -} - -static sv_double -_Z_sv_hypot_wrap (sv_double x, sv_bool pg) -{ - return _ZGVsMxvv_hypot (x, svdup_f64 (5.0), pg); -} - -static sv_float -_Z_sv_powi_wrap (sv_float x, sv_bool pg) -{ - return _ZGVsMxvv_powi (x, svcvt_s32_f32_x (pg, x), pg); -} - -static sv_double -_Z_sv_powk_wrap (sv_double x, sv_bool pg) -{ - return _ZGVsMxvv_powk (x, svcvt_s64_f64_x (pg, x), pg); -} - -static sv_float -xy_Z_sv_powf (sv_float x, sv_bool pg) -{ - return _ZGVsMxvv_powf (x, x, pg); -} - -static sv_float -x_Z_sv_powf (sv_float x, sv_bool pg) -{ - return _ZGVsMxvv_powf (x, svdup_f32 (23.4f), pg); -} - -static sv_float -y_Z_sv_powf (sv_float x, sv_bool pg) -{ - return _ZGVsMxvv_powf (svdup_f32 (2.34f), x, pg); -} - -static sv_double -xy_Z_sv_pow (sv_double x, sv_bool pg) -{ - return _ZGVsMxvv_pow (x, x, pg); -} - -static sv_double -x_Z_sv_pow (sv_double x, sv_bool pg) -{ - return _ZGVsMxvv_pow (x, svdup_f64 (23.4), pg); -} - -static sv_double -y_Z_sv_pow (sv_double x, sv_bool pg) -{ - return _ZGVsMxvv_pow (svdup_f64 (2.34), x, pg); -} - -static sv_float -_Z_sv_sincosf_wrap (sv_float x, sv_bool pg) -{ - float s[svcntw ()], c[svcntw ()]; - _ZGVsMxvl4l4_sincosf (x, s, c, pg); - return svadd_x (pg, svld1 (pg, s), svld1 (pg, s)); -} - -static sv_float -_Z_sv_cexpif_wrap (sv_float x, sv_bool pg) -{ - svfloat32x2_t sc = _ZGVsMxv_cexpif (x, pg); - return svadd_x (pg, svget2 (sc, 0), svget2 (sc, 1)); -} - -static sv_double -_Z_sv_sincos_wrap (sv_double x, sv_bool pg) -{ - double s[svcntd ()], c[svcntd ()]; - _ZGVsMxvl8l8_sincos (x, s, c, pg); - return svadd_x (pg, svld1 (pg, s), svld1 (pg, s)); 
-} - -static sv_double -_Z_sv_cexpi_wrap (sv_double x, sv_bool pg) -{ - svfloat64x2_t sc = _ZGVsMxv_cexpi (x, pg); - return svadd_x (pg, svget2 (sc, 0), svget2 (sc, 1)); -} - -#endif // WANT_SVE_MATH diff --git a/pl/math/test/pl_test.h b/pl/math/test/pl_test.h deleted file mode 100644 index e7ed4eed634e87..00000000000000 --- a/pl/math/test/pl_test.h +++ /dev/null @@ -1,39 +0,0 @@ -/* - * PL macros for emitting various details about routines for consumption by - * runulp.sh. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception. - */ - -/* Emit the max ULP threshold, l, for routine f. Piggy-back PL_TEST_EXPECT_FENV - on PL_TEST_ULP to add EXPECT_FENV to all scalar routines. */ -#if WANT_VMATH || defined(IGNORE_SCALAR_FENV) -# define PL_TEST_ULP(f, l) PL_TEST_ULP f l -#else -# define PL_TEST_ULP(f, l) \ - PL_TEST_EXPECT_FENV_ALWAYS (f) \ - PL_TEST_ULP f l -#endif - -/* Emit routine name if e == 1 and f is expected to correctly trigger fenv - exceptions. e allows declaration to be emitted conditionally upon certain - build flags - defer expansion by one pass to allow those flags to be expanded - properly. 
*/ -#define PL_TEST_EXPECT_FENV(f, e) PL_TEST_EXPECT_FENV_ (f, e) -#define PL_TEST_EXPECT_FENV_(f, e) PL_TEST_EXPECT_FENV_##e (f) -#define PL_TEST_EXPECT_FENV_1(f) PL_TEST_EXPECT_FENV_ENABLED f -#define PL_TEST_EXPECT_FENV_ALWAYS(f) PL_TEST_EXPECT_FENV (f, 1) - -#define PL_TEST_INTERVAL(f, lo, hi, n) PL_TEST_INTERVAL f lo hi n -#define PL_TEST_SYM_INTERVAL(f, lo, hi, n) \ - PL_TEST_INTERVAL (f, lo, hi, n) \ - PL_TEST_INTERVAL (f, -lo, -hi, n) -#define PL_TEST_INTERVAL_C(f, lo, hi, n, c) PL_TEST_INTERVAL f lo hi n c -#define PL_TEST_SYM_INTERVAL_C(f, lo, hi, n, c) \ - PL_TEST_INTERVAL_C (f, lo, hi, n, c) \ - PL_TEST_INTERVAL_C (f, -lo, -hi, n, c) -// clang-format off -#define PL_TEST_INTERVAL2(f, xlo, xhi, ylo, yhi, n) \ - PL_TEST_INTERVAL f xlo,ylo xhi,yhi n -// clang-format on diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh deleted file mode 100755 index 0f5a41f76b25c7..00000000000000 --- a/pl/math/test/runulp.sh +++ /dev/null @@ -1,78 +0,0 @@ -#!/bin/bash - -# ULP error check script. -# -# Copyright (c) 2019-2023, Arm Limited. -# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - -#set -x -set -eu - -# cd to bin directory. 
-cd "${0%/*}" - -flags="${ULPFLAGS:--q}" -emu="$@" - -# Enable SVE testing -WANT_SVE_MATH=${WANT_SVE_MATH:-0} - -FAIL=0 -PASS=0 - -t() { - routine=$1 - L=$(cat $LIMITS | grep "^$routine " | awk '{print $2}') - [[ $L =~ ^[0-9]+\.[0-9]+$ ]] - extra_flags= - [[ -z "${5:-}" ]] || extra_flags="$extra_flags -c $5" - grep -q "^$routine$" $FENV || extra_flags="$extra_flags -f" - IFS=',' read -ra LO <<< "$2" - IFS=',' read -ra HI <<< "$3" - ITV="${LO[0]} ${HI[0]}" - for i in "${!LO[@]}"; do - [[ "$i" -eq "0" ]] || ITV="$ITV x ${LO[$i]} ${HI[$i]}" - done - # Add -z flag to ignore zero sign for vector routines - { echo $routine | grep -q "ZGV"; } && extra_flags="$extra_flags -z" - $emu ./ulp -e $L $flags ${extra_flags} $routine $ITV $4 && PASS=$((PASS+1)) || FAIL=$((FAIL+1)) -} - -check() { - $emu ./ulp -f -q "$@" #>/dev/null -} - -if [ "$FUNC" == "atan2" ] || [ -z "$FUNC" ]; then - # Regression-test for correct NaN handling in atan2 - check atan2 0x1p-1022 0x1p-1000 x 0 0x1p-1022 40000 - check atan2 0x1.7887a0a717aefp+1017 0x1.7887a0a717aefp+1017 x -nan -nan - check atan2 nan nan x -nan -nan -fi - -# vector functions -flags="${ULPFLAGS:--q}" -runsv= -if [ $WANT_SVE_MATH -eq 1 ]; then -# No guarantees about powi accuracy, so regression-test for exactness -# w.r.t. 
the custom reference impl in ulp_wrappers.h -check -q -f -e 0 _ZGVsMxvv_powi 0 inf x 0 1000 100000 && runsv=1 -check -q -f -e 0 _ZGVsMxvv_powi -0 -inf x 0 1000 100000 && runsv=1 -check -q -f -e 0 _ZGVsMxvv_powi 0 inf x -0 -1000 100000 && runsv=1 -check -q -f -e 0 _ZGVsMxvv_powi -0 -inf x -0 -1000 100000 && runsv=1 -check -q -f -e 0 _ZGVsMxvv_powk 0 inf x 0 1000 100000 && runsv=1 -check -q -f -e 0 _ZGVsMxvv_powk -0 -inf x 0 1000 100000 && runsv=1 -check -q -f -e 0 _ZGVsMxvv_powk 0 inf x -0 -1000 100000 && runsv=1 -check -q -f -e 0 _ZGVsMxvv_powk -0 -inf x -0 -1000 100000 && runsv=1 -fi - -while read F LO HI N C -do - t $F $LO $HI $N $C -done << EOF -$(cat $INTERVALS | grep "\b$FUNC\b") -EOF - -[ 0 -eq $FAIL ] || { - echo "FAILED $FAIL PASSED $PASS" - exit 1 -} diff --git a/pl/math/test/testcases/directed/erff.tst b/pl/math/test/testcases/directed/erff.tst deleted file mode 100644 index 9b1d3d5114ae31..00000000000000 --- a/pl/math/test/testcases/directed/erff.tst +++ /dev/null @@ -1,17 +0,0 @@ -; erff.tst -; -; Copyright (c) 2007-2023, Arm Limited. 
-; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - -func=erff op1=7fc00001 result=7fc00001 errno=0 -func=erff op1=ffc00001 result=7fc00001 errno=0 -func=erff op1=7f800001 result=7fc00001 errno=0 status=i -func=erff op1=ff800001 result=7fc00001 errno=0 status=i -func=erff op1=7f800000 result=3f800000 errno=0 -func=erff op1=ff800000 result=bf800000 errno=0 -func=erff op1=00000000 result=00000000 errno=ERANGE -func=erff op1=80000000 result=80000000 errno=ERANGE -func=erff op1=00000001 result=00000001 errno=0 status=ux -func=erff op1=80000001 result=80000001 errno=0 status=ux -func=erff op1=3f800000 result=3f57bb3d.3a0 errno=0 -func=erff op1=bf800000 result=bf57bb3d.3a0 errno=0 diff --git a/pl/math/test/testcases/directed/log2.tst b/pl/math/test/testcases/directed/log2.tst deleted file mode 100644 index 5d1eb9b877e804..00000000000000 --- a/pl/math/test/testcases/directed/log2.tst +++ /dev/null @@ -1,21 +0,0 @@ -; Directed test cases for log2 -; -; Copyright (c) 2018-2023, Arm Limited. 
-; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - -func=log2 op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 -func=log2 op1=fff80000.00000001 result=7ff80000.00000001 errno=0 -func=log2 op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i -func=log2 op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i -func=log2 op1=7ff00000.00000000 result=7ff00000.00000000 errno=0 -func=log2 op1=fff00000.00000000 result=7ff80000.00000001 errno=EDOM status=i -func=log2 op1=7fefffff.ffffffff result=408fffff.ffffffff.ffa errno=0 -func=log2 op1=ffefffff.ffffffff result=7ff80000.00000001 errno=EDOM status=i -func=log2 op1=3ff00000.00000000 result=00000000.00000000 errno=0 -func=log2 op1=bff00000.00000000 result=7ff80000.00000001 errno=EDOM status=i -func=log2 op1=00000000.00000000 result=fff00000.00000000 errno=ERANGE status=z -func=log2 op1=80000000.00000000 result=fff00000.00000000 errno=ERANGE status=z -func=log2 op1=00000000.00000001 result=c090c800.00000000 errno=0 -func=log2 op1=80000000.00000001 result=7ff80000.00000001 errno=EDOM status=i -func=log2 op1=40000000.00000000 result=3ff00000.00000000 errno=0 -func=log2 op1=3fe00000.00000000 result=bff00000.00000000 errno=0 diff --git a/pl/math/test/testcases/directed/log2f.tst b/pl/math/test/testcases/directed/log2f.tst deleted file mode 100644 index 4e08110878d69f..00000000000000 --- a/pl/math/test/testcases/directed/log2f.tst +++ /dev/null @@ -1,27 +0,0 @@ -; log2f.tst - Directed test cases for log2f -; -; Copyright (c) 2017-2023, Arm Limited. 
-; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - -func=log2f op1=7fc00001 result=7fc00001 errno=0 -func=log2f op1=ffc00001 result=7fc00001 errno=0 -func=log2f op1=7f800001 result=7fc00001 errno=0 status=i -func=log2f op1=ff800001 result=7fc00001 errno=0 status=i -func=log2f op1=ff810000 result=7fc00001 errno=0 status=i -func=log2f op1=7f800000 result=7f800000 errno=0 -func=log2f op1=ff800000 result=7fc00001 errno=EDOM status=i -func=log2f op1=3f800000 result=00000000 errno=0 -func=log2f op1=00000000 result=ff800000 errno=ERANGE status=z -func=log2f op1=80000000 result=ff800000 errno=ERANGE status=z -func=log2f op1=80000001 result=7fc00001 errno=EDOM status=i - -func=log2f op1=3f7d70a4 result=bc6d8f8b.7d4 error=0 -func=log2f op1=3f604189 result=be4394c8.395 error=0 -func=log2f op1=3f278034 result=bf1caa73.88e error=0 -func=log2f op1=3edd3c36 result=bf9af3b9.619 error=0 -func=log2f op1=3e61259a result=c00bdb95.650 error=0 -func=log2f op1=3f8147ae result=3c6b3267.d6a error=0 -func=log2f op1=3f8fbe77 result=3e2b5fe2.a1c error=0 -func=log2f op1=3fac3eea result=3edb4d5e.1fc error=0 -func=log2f op1=3fd6e632 result=3f3f5d3a.827 error=0 -func=log2f op1=40070838 result=3f89e055.a0a error=0 diff --git a/pl/math/test/testcases/random/double.tst b/pl/math/test/testcases/random/double.tst deleted file mode 100644 index d83283ef78649b..00000000000000 --- a/pl/math/test/testcases/random/double.tst +++ /dev/null @@ -1,6 +0,0 @@ -!! double.tst - Random test case specification for DP functions -!! -!! Copyright (c) 1999-2023, Arm Limited. -!! SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - -test log10 10000 diff --git a/pl/math/test/testcases/random/float.tst b/pl/math/test/testcases/random/float.tst deleted file mode 100644 index fa77efecfabb7a..00000000000000 --- a/pl/math/test/testcases/random/float.tst +++ /dev/null @@ -1,8 +0,0 @@ -!! float.tst - Random test case specification for SP functions -!! -!! Copyright (c) 2022-2023, Arm Limited. -!! 
SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - -test erff 10000 -test log10f 10000 -test tanf 10000 diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h deleted file mode 100644 index 4929b481ffe1a2..00000000000000 --- a/pl/math/test/ulp_funcs.h +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Function entries for ulp. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#if defined(__vpcs) && __aarch64__ - -#define _ZVF1(f) ZVF1 (f) -#define _ZVD1(f) ZVD1 (f) -#define _ZVF2(f) ZVF2 (f) -#define _ZVD2(f) ZVD2 (f) - -#else - -#define _ZVF1(f) -#define _ZVD1(f) -#define _ZVF2(f) -#define _ZVD2(f) - -#endif - -#if WANT_SVE_MATH - -#define _ZSVF1(f) ZSVF1 (f) -#define _ZSVF2(f) ZSVF2 (f) -#define _ZSVD1(f) ZSVD1 (f) -#define _ZSVD2(f) ZSVD2 (f) - -#else - -#define _ZSVF1(f) -#define _ZSVF2(f) -#define _ZSVD1(f) -#define _ZSVD2(f) - -#endif - -#define _ZSF1(f) F1 (f) -#define _ZSF2(f) F2 (f) -#define _ZSD1(f) D1 (f) -#define _ZSD2(f) D2 (f) - -#include "ulp_funcs_gen.h" - -F (_ZGVnN4v_sincosf_sin, v_sincosf_sin, sin, mpfr_sin, 1, 1, f1, 0) -F (_ZGVnN4v_sincosf_cos, v_sincosf_cos, cos, mpfr_cos, 1, 1, f1, 0) -F (_ZGVnN4v_cexpif_sin, v_cexpif_sin, sin, mpfr_sin, 1, 1, f1, 0) -F (_ZGVnN4v_cexpif_cos, v_cexpif_cos, cos, mpfr_cos, 1, 1, f1, 0) - -F (_ZGVnN2v_sincos_sin, v_sincos_sin, sinl, mpfr_sin, 1, 0, d1, 0) -F (_ZGVnN2v_sincos_cos, v_sincos_cos, cosl, mpfr_cos, 1, 0, d1, 0) -F (_ZGVnN2v_cexpi_sin, v_cexpi_sin, sinl, mpfr_sin, 1, 0, d1, 0) -F (_ZGVnN2v_cexpi_cos, v_cexpi_cos, cosl, mpfr_cos, 1, 0, d1, 0) - -#if WANT_SVE_MATH -F (_ZGVsMxvv_powk, Z_sv_powk, ref_powi, mpfr_powi, 2, 0, d2, 0) -F (_ZGVsMxvv_powi, Z_sv_powi, ref_powif, mpfr_powi, 2, 1, f2, 0) - -F (_ZGVsMxv_sincosf_sin, sv_sincosf_sin, sin, mpfr_sin, 1, 1, f1, 0) -F (_ZGVsMxv_sincosf_cos, sv_sincosf_cos, cos, mpfr_cos, 1, 1, f1, 0) -F (_ZGVsMxv_cexpif_sin, sv_cexpif_sin, sin, mpfr_sin, 1, 1, f1, 0) -F (_ZGVsMxv_cexpif_cos, 
sv_cexpif_cos, cos, mpfr_cos, 1, 1, f1, 0) - -F (_ZGVsMxv_sincos_sin, sv_sincos_sin, sinl, mpfr_sin, 1, 0, d1, 0) -F (_ZGVsMxv_sincos_cos, sv_sincos_cos, cosl, mpfr_cos, 1, 0, d1, 0) -F (_ZGVsMxv_cexpi_sin, sv_cexpi_sin, sinl, mpfr_sin, 1, 0, d1, 0) -F (_ZGVsMxv_cexpi_cos, sv_cexpi_cos, cosl, mpfr_cos, 1, 0, d1, 0) -#endif diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h deleted file mode 100644 index 0f7b68949c7bfb..00000000000000 --- a/pl/math/test/ulp_wrappers.h +++ /dev/null @@ -1,140 +0,0 @@ -// clang-format off -/* - * Function wrappers for ulp. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#define _GNU_SOURCE -#include -#include - -#if USE_MPFR -static int sincos_mpfr_sin(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { - mpfr_cos(y, x, r); - return mpfr_sin(y, x, r); -} -static int sincos_mpfr_cos(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { - mpfr_sin(y, x, r); - return mpfr_cos(y, x, r); -} -static int wrap_mpfr_powi(mpfr_t ret, const mpfr_t x, const mpfr_t y, mpfr_rnd_t rnd) { - mpfr_t y2; - mpfr_init(y2); - mpfr_trunc(y2, y); - return mpfr_pow(ret, x, y2, rnd); -} -#endif - -/* Our implementations of powi/powk are too imprecise to verify - against any established pow implementation. Instead we have the - following simple implementation, against which it is enough to - maintain bitwise reproducibility. Note the test framework expects - the reference impl to be of higher precision than the function - under test. For instance this means that the reference for - double-precision powi will be passed a long double, so to check - bitwise reproducibility we have to cast it back down to - double. This is fine since a round-trip to higher precision and - back down is correctly rounded. 
*/ -#define DECL_POW_INT_REF(NAME, DBL_T, FLT_T, INT_T) \ - static DBL_T __attribute__((unused)) NAME (DBL_T in_val, DBL_T y) \ - { \ - INT_T n = (INT_T) round (y); \ - FLT_T acc = 1.0; \ - bool want_recip = n < 0; \ - n = n < 0 ? -n : n; \ - \ - for (FLT_T c = in_val; n; c *= c, n >>= 1) \ - { \ - if (n & 0x1) \ - { \ - acc *= c; \ - } \ - } \ - if (want_recip) \ - { \ - acc = 1.0 / acc; \ - } \ - return acc; \ - } - -DECL_POW_INT_REF(ref_powif, double, float, int) -DECL_POW_INT_REF(ref_powi, long double, double, int) - -#define ZVF1_WRAP(func) static float Z_##func##f(float x) { return _ZGVnN4v_##func##f(argf(x))[0]; } -#define ZVF2_WRAP(func) static float Z_##func##f(float x, float y) { return _ZGVnN4vv_##func##f(argf(x), argf(y))[0]; } -#define ZVD1_WRAP(func) static double Z_##func(double x) { return _ZGVnN2v_##func(argd(x))[0]; } -#define ZVD2_WRAP(func) static double Z_##func(double x, double y) { return _ZGVnN2vv_##func(argd(x), argd(y))[0]; } - -#if defined(__vpcs) && __aarch64__ - -#define ZVNF1_WRAP(func) ZVF1_WRAP(func) -#define ZVNF2_WRAP(func) ZVF2_WRAP(func) -#define ZVND1_WRAP(func) ZVD1_WRAP(func) -#define ZVND2_WRAP(func) ZVD2_WRAP(func) - -#else - -#define ZVNF1_WRAP(func) -#define ZVNF2_WRAP(func) -#define ZVND1_WRAP(func) -#define ZVND2_WRAP(func) - -#endif - -#define ZSVF1_WRAP(func) static float Z_sv_##func##f(float x) { return svretf(_ZGVsMxv_##func##f(svargf(x), svptrue_b32())); } -#define ZSVF2_WRAP(func) static float Z_sv_##func##f(float x, float y) { return svretf(_ZGVsMxvv_##func##f(svargf(x), svargf(y), svptrue_b32())); } -#define ZSVD1_WRAP(func) static double Z_sv_##func(double x) { return svretd(_ZGVsMxv_##func(svargd(x), svptrue_b64())); } -#define ZSVD2_WRAP(func) static double Z_sv_##func(double x, double y) { return svretd(_ZGVsMxvv_##func(svargd(x), svargd(y), svptrue_b64())); } - -#if WANT_SVE_MATH - -#define ZSVNF1_WRAP(func) ZSVF1_WRAP(func) -#define ZSVNF2_WRAP(func) ZSVF2_WRAP(func) -#define ZSVND1_WRAP(func) 
ZSVD1_WRAP(func) -#define ZSVND2_WRAP(func) ZSVD2_WRAP(func) - -#else - -#define ZSVNF1_WRAP(func) -#define ZSVNF2_WRAP(func) -#define ZSVND1_WRAP(func) -#define ZSVND2_WRAP(func) - -#endif - -/* No wrappers for scalar routines, but PL_SIG will emit them. */ -#define ZSNF1_WRAP(func) -#define ZSNF2_WRAP(func) -#define ZSND1_WRAP(func) -#define ZSND2_WRAP(func) - -#include "ulp_wrappers_gen.h" - -float v_sincosf_sin(float x) { float32x4_t s, c; _ZGVnN4vl4l4_sincosf(vdupq_n_f32(x), &s, &c); return s[0]; } -float v_sincosf_cos(float x) { float32x4_t s, c; _ZGVnN4vl4l4_sincosf(vdupq_n_f32(x), &s, &c); return c[0]; } -float v_cexpif_sin(float x) { return _ZGVnN4v_cexpif(vdupq_n_f32(x)).val[0][0]; } -float v_cexpif_cos(float x) { return _ZGVnN4v_cexpif(vdupq_n_f32(x)).val[1][0]; } - -double v_sincos_sin(double x) { float64x2_t s, c; _ZGVnN2vl8l8_sincos(vdupq_n_f64(x), &s, &c); return s[0]; } -double v_sincos_cos(double x) { float64x2_t s, c; _ZGVnN2vl8l8_sincos(vdupq_n_f64(x), &s, &c); return c[0]; } -double v_cexpi_sin(double x) { return _ZGVnN2v_cexpi(vdupq_n_f64(x)).val[0][0]; } -double v_cexpi_cos(double x) { return _ZGVnN2v_cexpi(vdupq_n_f64(x)).val[1][0]; } - -#if WANT_SVE_MATH -static float Z_sv_powi(float x, float y) { return svretf(_ZGVsMxvv_powi(svargf(x), svdup_s32((int)round(y)), svptrue_b32())); } -static double Z_sv_powk(double x, double y) { return svretd(_ZGVsMxvv_powk(svargd(x), svdup_s64((long)round(y)), svptrue_b64())); } - -float sv_sincosf_sin(float x) { float s[svcntw()], c[svcntw()]; _ZGVsMxvl4l4_sincosf(svdup_f32(x), s, c, svptrue_b32()); return s[0]; } -float sv_sincosf_cos(float x) { float s[svcntw()], c[svcntw()]; _ZGVsMxvl4l4_sincosf(svdup_f32(x), s, c, svptrue_b32()); return c[0]; } -float sv_cexpif_sin(float x) { return svretf(svget2(_ZGVsMxv_cexpif(svdup_f32(x), svptrue_b32()), 0)); } -float sv_cexpif_cos(float x) { return svretf(svget2(_ZGVsMxv_cexpif(svdup_f32(x), svptrue_b32()), 1)); } - -double sv_sincos_sin(double x) { double 
s[svcntd()], c[svcntd()]; _ZGVsMxvl8l8_sincos(svdup_f64(x), s, c, svptrue_b64()); return s[0]; } -double sv_sincos_cos(double x) { double s[svcntd()], c[svcntd()]; _ZGVsMxvl8l8_sincos(svdup_f64(x), s, c, svptrue_b64()); return c[0]; } -double sv_cexpi_sin(double x) { return svretd(svget2(_ZGVsMxv_cexpi(svdup_f64(x), svptrue_b64()), 0)); } -double sv_cexpi_cos(double x) { return svretd(svget2(_ZGVsMxv_cexpi(svdup_f64(x), svptrue_b64()), 1)); } - -#endif -// clang-format on diff --git a/pl/math/trigpi_references.c b/pl/math/trigpi_references.c deleted file mode 100644 index 4b0514b6766a72..00000000000000 --- a/pl/math/trigpi_references.c +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Extended precision scalar reference functions for trigpi. - * - * Copyright (c) 2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#define _GNU_SOURCE -#include "math_config.h" -#include "mathlib.h" - -long double -sinpil (long double x) -{ - /* sin(inf) should return nan, as defined by C23. */ - if (isinf (x)) - return __math_invalid (x); - - long double ax = fabsl (x); - - /* Return 0 for all values above 2^64 to prevent - overflow when casting to uint64_t. */ - if (ax >= 0x1p64) - return 0; - - /* All integer cases should return 0. */ - if (ax == (uint64_t) ax) - return 0; - - return sinl (x * M_PIl); -} - -long double -cospil (long double x) -{ - /* cos(inf) should return nan, as defined by C23. */ - if (isinf (x)) - return __math_invalid (x); - - long double ax = fabsl (x); - - if (ax >= 0x1p64) - return 1; - - uint64_t m = (uint64_t) ax; - - /* Integer values of cospi(x) should return +/-1. - The sign depends on if x is odd or even. */ - if (m == ax) - return (m & 1) ? -1 : 1; - - /* Values of Integer + 0.5 should always return 0. 
*/ - if (ax - 0.5 == m || ax + 0.5 == m) - return 0; - - return cosl (ax * M_PIl); -} \ No newline at end of file diff --git a/pl/math/v_asinh_3u5.c b/pl/math/v_asinh_3u5.c deleted file mode 100644 index 4862bef948617d..00000000000000 --- a/pl/math/v_asinh_3u5.c +++ /dev/null @@ -1,175 +0,0 @@ -/* - * Double-precision vector asinh(x) function. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "v_math.h" -#include "poly_advsimd_f64.h" -#include "pl_sig.h" -#include "pl_test.h" - -#define A(i) v_f64 (__v_log_data.poly[i]) -#define N (1 << V_LOG_TABLE_BITS) - -const static struct data -{ - float64x2_t poly[18]; - uint64x2_t off, huge_bound, abs_mask; - float64x2_t ln2, tiny_bound; -} data = { - .off = V2 (0x3fe6900900000000), - .ln2 = V2 (0x1.62e42fefa39efp-1), - .huge_bound = V2 (0x5fe0000000000000), - .tiny_bound = V2 (0x1p-26), - .abs_mask = V2 (0x7fffffffffffffff), - /* Even terms of polynomial s.t. asinh(x) is approximated by - asinh(x) ~= x + x^3 * (C0 + C1 * x + C2 * x^2 + C3 * x^3 + ...). - Generated using Remez, f = (asinh(sqrt(x)) - sqrt(x))/x^(3/2). 
*/ - .poly = { V2 (-0x1.55555555554a7p-3), V2 (0x1.3333333326c7p-4), - V2 (-0x1.6db6db68332e6p-5), V2 (0x1.f1c71b26fb40dp-6), - V2 (-0x1.6e8b8b654a621p-6), V2 (0x1.1c4daa9e67871p-6), - V2 (-0x1.c9871d10885afp-7), V2 (0x1.7a16e8d9d2ecfp-7), - V2 (-0x1.3ddca533e9f54p-7), V2 (0x1.0becef748dafcp-7), - V2 (-0x1.b90c7099dd397p-8), V2 (0x1.541f2bb1ffe51p-8), - V2 (-0x1.d217026a669ecp-9), V2 (0x1.0b5c7977aaf7p-9), - V2 (-0x1.e0f37daef9127p-11), V2 (0x1.388b5fe542a6p-12), - V2 (-0x1.021a48685e287p-14), V2 (0x1.93d4ba83d34dap-18) }, -}; - -static float64x2_t NOINLINE VPCS_ATTR -special_case (float64x2_t x, float64x2_t y, uint64x2_t special) -{ - return v_call_f64 (asinh, x, y, special); -} - -struct entry -{ - float64x2_t invc; - float64x2_t logc; -}; - -static inline struct entry -lookup (uint64x2_t i) -{ - float64x2_t e0 = vld1q_f64 ( - &__v_log_data.table[(i[0] >> (52 - V_LOG_TABLE_BITS)) & (N - 1)].invc); - float64x2_t e1 = vld1q_f64 ( - &__v_log_data.table[(i[1] >> (52 - V_LOG_TABLE_BITS)) & (N - 1)].invc); - return (struct entry){ vuzp1q_f64 (e0, e1), vuzp2q_f64 (e0, e1) }; -} - -static inline float64x2_t -log_inline (float64x2_t x, const struct data *d) -{ - /* Double-precision vector log, copied from ordinary vector log with some - cosmetic modification and special-cases removed. 
*/ - uint64x2_t ix = vreinterpretq_u64_f64 (x); - uint64x2_t tmp = vsubq_u64 (ix, d->off); - int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); - uint64x2_t iz - = vsubq_u64 (ix, vandq_u64 (tmp, vdupq_n_u64 (0xfffULL << 52))); - float64x2_t z = vreinterpretq_f64_u64 (iz); - struct entry e = lookup (tmp); - float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc); - float64x2_t kd = vcvtq_f64_s64 (k); - float64x2_t hi = vfmaq_f64 (vaddq_f64 (e.logc, r), kd, d->ln2); - float64x2_t r2 = vmulq_f64 (r, r); - float64x2_t y = vfmaq_f64 (A (2), A (3), r); - float64x2_t p = vfmaq_f64 (A (0), A (1), r); - y = vfmaq_f64 (y, A (4), r2); - y = vfmaq_f64 (p, y, r2); - y = vfmaq_f64 (hi, y, r2); - return y; -} - -/* Double-precision implementation of vector asinh(x). - asinh is very sensitive around 1, so it is impractical to devise a single - low-cost algorithm which is sufficiently accurate on a wide range of input. - Instead we use two different algorithms: - asinh(x) = sign(x) * log(|x| + sqrt(x^2 + 1) if |x| >= 1 - = sign(x) * (|x| + |x|^3 * P(x^2)) otherwise - where log(x) is an optimized log approximation, and P(x) is a polynomial - shared with the scalar routine. The greatest observed error 3.29 ULP, in - |x| >= 1: - __v_asinh(0x1.2cd9d717e2c9bp+0) got 0x1.ffffcfd0e234fp-1 - want 0x1.ffffcfd0e2352p-1. */ -VPCS_ATTR float64x2_t V_NAME_D1 (asinh) (float64x2_t x) -{ - const struct data *d = ptr_barrier (&data); - - float64x2_t ax = vabsq_f64 (x); - uint64x2_t iax = vreinterpretq_u64_f64 (ax); - - uint64x2_t gt1 = vcgeq_f64 (ax, v_f64 (1)); - uint64x2_t special = vcgeq_u64 (iax, d->huge_bound); - -#if WANT_SIMD_EXCEPT - uint64x2_t tiny = vcltq_f64 (ax, d->tiny_bound); - special = vorrq_u64 (special, tiny); -#endif - - /* Option 1: |x| >= 1. - Compute asinh(x) according by asinh(x) = log(x + sqrt(x^2 + 1)). - If WANT_SIMD_EXCEPT is enabled, sidestep special values, which will - overflow, by setting special lanes to 1. These will be fixed later. 
*/ - float64x2_t option_1 = v_f64 (0); - if (likely (v_any_u64 (gt1))) - { -#if WANT_SIMD_EXCEPT - float64x2_t xm = v_zerofy_f64 (ax, special); -#else - float64x2_t xm = ax; -#endif - option_1 = log_inline ( - vaddq_f64 (xm, vsqrtq_f64 (vfmaq_f64 (v_f64 (1), xm, xm))), d); - } - - /* Option 2: |x| < 1. - Compute asinh(x) using a polynomial. - If WANT_SIMD_EXCEPT is enabled, sidestep special lanes, which will - overflow, and tiny lanes, which will underflow, by setting them to 0. They - will be fixed later, either by selecting x or falling back to the scalar - special-case. The largest observed error in this region is 1.47 ULPs: - __v_asinh(0x1.fdfcd00cc1e6ap-1) got 0x1.c1d6bf874019bp-1 - want 0x1.c1d6bf874019cp-1. */ - float64x2_t option_2 = v_f64 (0); - if (likely (v_any_u64 (vceqzq_u64 (gt1)))) - { -#if WANT_SIMD_EXCEPT - ax = v_zerofy_f64 (ax, vorrq_u64 (tiny, gt1)); -#endif - float64x2_t x2 = vmulq_f64 (ax, ax), x3 = vmulq_f64 (ax, x2), - z2 = vmulq_f64 (x2, x2), z4 = vmulq_f64 (z2, z2), - z8 = vmulq_f64 (z4, z4), z16 = vmulq_f64 (z8, z8); - float64x2_t p = v_estrin_17_f64 (x2, z2, z4, z8, z16, d->poly); - option_2 = vfmaq_f64 (ax, p, x3); -#if WANT_SIMD_EXCEPT - option_2 = vbslq_f64 (tiny, x, option_2); -#endif - } - - /* Choose the right option for each lane. */ - float64x2_t y = vbslq_f64 (gt1, option_1, option_2); - /* Copy sign. */ - y = vbslq_f64 (d->abs_mask, y, x); - - if (unlikely (v_any_u64 (special))) - return special_case (x, y, special); - return y; -} - -PL_SIG (V, D, 1, asinh, -10.0, 10.0) -PL_TEST_ULP (V_NAME_D1 (asinh), 2.80) -PL_TEST_EXPECT_FENV (V_NAME_D1 (asinh), WANT_SIMD_EXCEPT) -/* Test vector asinh 3 times, with control lane < 1, > 1 and special. - Ensures the v_sel is choosing the right option in all cases. 
*/ -#define V_ASINH_INTERVAL(lo, hi, n) \ - PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (asinh), lo, hi, n, 0.5) \ - PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (asinh), lo, hi, n, 2) \ - PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (asinh), lo, hi, n, 0x1p600) -V_ASINH_INTERVAL (0, 0x1p-26, 50000) -V_ASINH_INTERVAL (0x1p-26, 1, 50000) -V_ASINH_INTERVAL (1, 0x1p511, 50000) -V_ASINH_INTERVAL (0x1p511, inf, 40000) diff --git a/pl/math/v_asinhf_2u7.c b/pl/math/v_asinhf_2u7.c deleted file mode 100644 index 1723ba90d2f301..00000000000000 --- a/pl/math/v_asinhf_2u7.c +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Single-precision vector asinh(x) function. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "v_math.h" -#include "pl_sig.h" -#include "pl_test.h" -#include "v_log1pf_inline.h" - -#define SignMask v_u32 (0x80000000) - -const static struct data -{ - struct v_log1pf_data log1pf_consts; - uint32x4_t big_bound; -#if WANT_SIMD_EXCEPT - uint32x4_t tiny_bound; -#endif -} data = { - .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE, - .big_bound = V4 (0x5f800000), /* asuint(0x1p64). */ -#if WANT_SIMD_EXCEPT - .tiny_bound = V4 (0x30800000) /* asuint(0x1p-30). */ -#endif -}; - -static float32x4_t NOINLINE VPCS_ATTR -special_case (float32x4_t x, float32x4_t y, uint32x4_t special) -{ - return v_call_f32 (asinhf, x, y, special); -} - -/* Single-precision implementation of vector asinh(x), using vector log1p. - Worst-case error is 2.66 ULP, at roughly +/-0.25: - __v_asinhf(0x1.01b04p-2) got 0x1.fe163ep-3 want 0x1.fe1638p-3. */ -VPCS_ATTR float32x4_t V_NAME_F1 (asinh) (float32x4_t x) -{ - const struct data *dat = ptr_barrier (&data); - uint32x4_t iax = vbicq_u32 (vreinterpretq_u32_f32 (x), SignMask); - float32x4_t ax = vreinterpretq_f32_u32 (iax); - uint32x4_t special = vcgeq_u32 (iax, dat->big_bound); - float32x4_t special_arg = x; - -#if WANT_SIMD_EXCEPT - /* Sidestep tiny and large values to avoid inadvertently triggering - under/overflow. 
*/ - special = vorrq_u32 (special, vcltq_u32 (iax, dat->tiny_bound)); - if (unlikely (v_any_u32 (special))) - { - ax = v_zerofy_f32 (ax, special); - x = v_zerofy_f32 (x, special); - } -#endif - - /* asinh(x) = log(x + sqrt(x * x + 1)). - For positive x, asinh(x) = log1p(x + x * x / (1 + sqrt(x * x + 1))). */ - float32x4_t d - = vaddq_f32 (v_f32 (1), vsqrtq_f32 (vfmaq_f32 (v_f32 (1), x, x))); - float32x4_t y = log1pf_inline ( - vaddq_f32 (ax, vdivq_f32 (vmulq_f32 (ax, ax), d)), dat->log1pf_consts); - - if (unlikely (v_any_u32 (special))) - return special_case (special_arg, vbslq_f32 (SignMask, x, y), special); - return vbslq_f32 (SignMask, x, y); -} - -PL_SIG (V, F, 1, asinh, -10.0, 10.0) -PL_TEST_ULP (V_NAME_F1 (asinh), 2.17) -PL_TEST_EXPECT_FENV (V_NAME_F1 (asinh), WANT_SIMD_EXCEPT) -PL_TEST_INTERVAL (V_NAME_F1 (asinh), 0, 0x1p-12, 40000) -PL_TEST_INTERVAL (V_NAME_F1 (asinh), 0x1p-12, 1.0, 40000) -PL_TEST_INTERVAL (V_NAME_F1 (asinh), 1.0, 0x1p11, 40000) -PL_TEST_INTERVAL (V_NAME_F1 (asinh), 0x1p11, inf, 40000) -PL_TEST_INTERVAL (V_NAME_F1 (asinh), -0, -0x1p-12, 20000) -PL_TEST_INTERVAL (V_NAME_F1 (asinh), -0x1p-12, -1.0, 20000) -PL_TEST_INTERVAL (V_NAME_F1 (asinh), -1.0, -0x1p11, 20000) -PL_TEST_INTERVAL (V_NAME_F1 (asinh), -0x1p11, -inf, 20000) diff --git a/pl/math/v_atan2_3u.c b/pl/math/v_atan2_3u.c deleted file mode 100644 index f24667682dec00..00000000000000 --- a/pl/math/v_atan2_3u.c +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Double-precision vector atan2(x) function. - * - * Copyright (c) 2021-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "v_math.h" -#include "pl_sig.h" -#include "pl_test.h" -#include "poly_advsimd_f64.h" - -static const struct data -{ - float64x2_t pi_over_2; - float64x2_t poly[20]; -} data = { - /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on - the interval [2**-1022, 1.0]. 
*/ - .poly = { V2 (-0x1.5555555555555p-2), V2 (0x1.99999999996c1p-3), - V2 (-0x1.2492492478f88p-3), V2 (0x1.c71c71bc3951cp-4), - V2 (-0x1.745d160a7e368p-4), V2 (0x1.3b139b6a88ba1p-4), - V2 (-0x1.11100ee084227p-4), V2 (0x1.e1d0f9696f63bp-5), - V2 (-0x1.aebfe7b418581p-5), V2 (0x1.842dbe9b0d916p-5), - V2 (-0x1.5d30140ae5e99p-5), V2 (0x1.338e31eb2fbbcp-5), - V2 (-0x1.00e6eece7de8p-5), V2 (0x1.860897b29e5efp-6), - V2 (-0x1.0051381722a59p-6), V2 (0x1.14e9dc19a4a4ep-7), - V2 (-0x1.d0062b42fe3bfp-9), V2 (0x1.17739e210171ap-10), - V2 (-0x1.ab24da7be7402p-13), V2 (0x1.358851160a528p-16), }, - .pi_over_2 = V2 (0x1.921fb54442d18p+0), -}; - -#define SignMask v_u64 (0x8000000000000000) - -/* Special cases i.e. 0, infinity, NaN (fall back to scalar calls). */ -static float64x2_t VPCS_ATTR NOINLINE -special_case (float64x2_t y, float64x2_t x, float64x2_t ret, uint64x2_t cmp) -{ - return v_call2_f64 (atan2, y, x, ret, cmp); -} - -/* Returns 1 if input is the bit representation of 0, infinity or nan. */ -static inline uint64x2_t -zeroinfnan (uint64x2_t i) -{ - /* (2 * i - 1) >= (2 * asuint64 (INFINITY) - 1). */ - return vcgeq_u64 (vsubq_u64 (vaddq_u64 (i, i), v_u64 (1)), - v_u64 (2 * asuint64 (INFINITY) - 1)); -} - -/* Fast implementation of vector atan2. - Maximum observed error is 2.8 ulps: - _ZGVnN2vv_atan2 (0x1.9651a429a859ap+5, 0x1.953075f4ee26p+5) - got 0x1.92d628ab678ccp-1 - want 0x1.92d628ab678cfp-1. 
*/ -float64x2_t VPCS_ATTR V_NAME_D2 (atan2) (float64x2_t y, float64x2_t x) -{ - const struct data *data_ptr = ptr_barrier (&data); - - uint64x2_t ix = vreinterpretq_u64_f64 (x); - uint64x2_t iy = vreinterpretq_u64_f64 (y); - - uint64x2_t special_cases = vorrq_u64 (zeroinfnan (ix), zeroinfnan (iy)); - - uint64x2_t sign_x = vandq_u64 (ix, SignMask); - uint64x2_t sign_y = vandq_u64 (iy, SignMask); - uint64x2_t sign_xy = veorq_u64 (sign_x, sign_y); - - float64x2_t ax = vabsq_f64 (x); - float64x2_t ay = vabsq_f64 (y); - - uint64x2_t pred_xlt0 = vcltzq_f64 (x); - uint64x2_t pred_aygtax = vcgtq_f64 (ay, ax); - - /* Set up z for call to atan. */ - float64x2_t n = vbslq_f64 (pred_aygtax, vnegq_f64 (ax), ay); - float64x2_t d = vbslq_f64 (pred_aygtax, ay, ax); - float64x2_t z = vdivq_f64 (n, d); - - /* Work out the correct shift. */ - float64x2_t shift = vreinterpretq_f64_u64 ( - vandq_u64 (pred_xlt0, vreinterpretq_u64_f64 (v_f64 (-2.0)))); - shift = vbslq_f64 (pred_aygtax, vaddq_f64 (shift, v_f64 (1.0)), shift); - shift = vmulq_f64 (shift, data_ptr->pi_over_2); - - /* Calculate the polynomial approximation. - Use split Estrin scheme for P(z^2) with deg(P)=19. Use split instead of - full scheme to avoid underflow in x^16. - The order 19 polynomial P approximates - (atan(sqrt(x))-sqrt(x))/x^(3/2). */ - float64x2_t z2 = vmulq_f64 (z, z); - float64x2_t x2 = vmulq_f64 (z2, z2); - float64x2_t x4 = vmulq_f64 (x2, x2); - float64x2_t x8 = vmulq_f64 (x4, x4); - float64x2_t ret - = vfmaq_f64 (v_estrin_7_f64 (z2, x2, x4, data_ptr->poly), - v_estrin_11_f64 (z2, x2, x4, x8, data_ptr->poly + 8), x8); - - /* Finalize. y = shift + z + z^3 * P(z^2). */ - ret = vfmaq_f64 (z, ret, vmulq_f64 (z2, z)); - ret = vaddq_f64 (ret, shift); - - /* Account for the sign of x and y. 
*/ - ret = vreinterpretq_f64_u64 ( - veorq_u64 (vreinterpretq_u64_f64 (ret), sign_xy)); - - if (unlikely (v_any_u64 (special_cases))) - return special_case (y, x, ret, special_cases); - - return ret; -} - -/* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. */ -PL_SIG (V, D, 2, atan2) -// TODO tighten this once __v_atan2 is fixed -PL_TEST_ULP (V_NAME_D2 (atan2), 2.9) -PL_TEST_INTERVAL (V_NAME_D2 (atan2), -10.0, 10.0, 50000) -PL_TEST_INTERVAL (V_NAME_D2 (atan2), -1.0, 1.0, 40000) -PL_TEST_INTERVAL (V_NAME_D2 (atan2), 0.0, 1.0, 40000) -PL_TEST_INTERVAL (V_NAME_D2 (atan2), 1.0, 100.0, 40000) -PL_TEST_INTERVAL (V_NAME_D2 (atan2), 1e6, 1e32, 40000) diff --git a/pl/math/v_exp_data.c b/pl/math/v_exp_data.c deleted file mode 100644 index fd01cf27606fa0..00000000000000 --- a/pl/math/v_exp_data.c +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Scale values for vector exp and exp2 - * - * Copyright (c) 2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "math_config.h" - -/* 2^(j/N), j=0..N, N=2^7=128. Copied from math/v_exp_data.c. 
*/ -const uint64_t __v_exp_data[] = { - 0x3ff0000000000000, 0x3feff63da9fb3335, 0x3fefec9a3e778061, - 0x3fefe315e86e7f85, 0x3fefd9b0d3158574, 0x3fefd06b29ddf6de, - 0x3fefc74518759bc8, 0x3fefbe3ecac6f383, 0x3fefb5586cf9890f, - 0x3fefac922b7247f7, 0x3fefa3ec32d3d1a2, 0x3fef9b66affed31b, - 0x3fef9301d0125b51, 0x3fef8abdc06c31cc, 0x3fef829aaea92de0, - 0x3fef7a98c8a58e51, 0x3fef72b83c7d517b, 0x3fef6af9388c8dea, - 0x3fef635beb6fcb75, 0x3fef5be084045cd4, 0x3fef54873168b9aa, - 0x3fef4d5022fcd91d, 0x3fef463b88628cd6, 0x3fef3f49917ddc96, - 0x3fef387a6e756238, 0x3fef31ce4fb2a63f, 0x3fef2b4565e27cdd, - 0x3fef24dfe1f56381, 0x3fef1e9df51fdee1, 0x3fef187fd0dad990, - 0x3fef1285a6e4030b, 0x3fef0cafa93e2f56, 0x3fef06fe0a31b715, - 0x3fef0170fc4cd831, 0x3feefc08b26416ff, 0x3feef6c55f929ff1, - 0x3feef1a7373aa9cb, 0x3feeecae6d05d866, 0x3feee7db34e59ff7, - 0x3feee32dc313a8e5, 0x3feedea64c123422, 0x3feeda4504ac801c, - 0x3feed60a21f72e2a, 0x3feed1f5d950a897, 0x3feece086061892d, - 0x3feeca41ed1d0057, 0x3feec6a2b5c13cd0, 0x3feec32af0d7d3de, - 0x3feebfdad5362a27, 0x3feebcb299fddd0d, 0x3feeb9b2769d2ca7, - 0x3feeb6daa2cf6642, 0x3feeb42b569d4f82, 0x3feeb1a4ca5d920f, - 0x3feeaf4736b527da, 0x3feead12d497c7fd, 0x3feeab07dd485429, - 0x3feea9268a5946b7, 0x3feea76f15ad2148, 0x3feea5e1b976dc09, - 0x3feea47eb03a5585, 0x3feea34634ccc320, 0x3feea23882552225, - 0x3feea155d44ca973, 0x3feea09e667f3bcd, 0x3feea012750bdabf, - 0x3fee9fb23c651a2f, 0x3fee9f7df9519484, 0x3fee9f75e8ec5f74, - 0x3fee9f9a48a58174, 0x3fee9feb564267c9, 0x3feea0694fde5d3f, - 0x3feea11473eb0187, 0x3feea1ed0130c132, 0x3feea2f336cf4e62, - 0x3feea427543e1a12, 0x3feea589994cce13, 0x3feea71a4623c7ad, - 0x3feea8d99b4492ed, 0x3feeaac7d98a6699, 0x3feeace5422aa0db, - 0x3feeaf3216b5448c, 0x3feeb1ae99157736, 0x3feeb45b0b91ffc6, - 0x3feeb737b0cdc5e5, 0x3feeba44cbc8520f, 0x3feebd829fde4e50, - 0x3feec0f170ca07ba, 0x3feec49182a3f090, 0x3feec86319e32323, - 0x3feecc667b5de565, 0x3feed09bec4a2d33, 0x3feed503b23e255d, - 0x3feed99e1330b358, 
0x3feede6b5579fdbf, 0x3feee36bbfd3f37a, - 0x3feee89f995ad3ad, 0x3feeee07298db666, 0x3feef3a2b84f15fb, - 0x3feef9728de5593a, 0x3feeff76f2fb5e47, 0x3fef05b030a1064a, - 0x3fef0c1e904bc1d2, 0x3fef12c25bd71e09, 0x3fef199bdd85529c, - 0x3fef20ab5fffd07a, 0x3fef27f12e57d14b, 0x3fef2f6d9406e7b5, - 0x3fef3720dcef9069, 0x3fef3f0b555dc3fa, 0x3fef472d4a07897c, - 0x3fef4f87080d89f2, 0x3fef5818dcfba487, 0x3fef60e316c98398, - 0x3fef69e603db3285, 0x3fef7321f301b460, 0x3fef7c97337b9b5f, - 0x3fef864614f5a129, 0x3fef902ee78b3ff6, 0x3fef9a51fbc74c83, - 0x3fefa4afa2a490da, 0x3fefaf482d8e67f1, 0x3fefba1bee615a27, - 0x3fefc52b376bba97, 0x3fefd0765b6e4540, 0x3fefdbfdad9cbe14, - 0x3fefe7c1819e90d8, 0x3feff3c22b8f71f1, -}; diff --git a/pl/math/v_exp_tail.h b/pl/math/v_exp_tail.h deleted file mode 100644 index 903f1fd9571780..00000000000000 --- a/pl/math/v_exp_tail.h +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Constants for double-precision e^(x+tail) vector function. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "math_config.h" - -#define C1_scal 0x1.fffffffffffd4p-2 -#define C2_scal 0x1.5555571d6b68cp-3 -#define C3_scal 0x1.5555576a59599p-5 -#define InvLn2_scal 0x1.71547652b82fep8 /* N/ln2. */ -#define Ln2hi_scal 0x1.62e42fefa39efp-9 /* ln2/N. */ -#define Ln2lo_scal 0x1.abc9e3b39803f3p-64 - -#define N (1 << V_EXP_TAIL_TABLE_BITS) -#define Tab __v_exp_tail_data -#define IndexMask_scal (N - 1) -#define Shift_scal 0x1.8p+52 -#define Thres_scal 704.0 diff --git a/pl/math/v_exp_tail_inline.h b/pl/math/v_exp_tail_inline.h deleted file mode 100644 index 76ecc6b0a33a28..00000000000000 --- a/pl/math/v_exp_tail_inline.h +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Double-precision vector e^(x+tail) function. - * - * Copyright (c) 2019-2023, Arm Limited. 
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#ifndef PL_MATH_V_EXP_TAIL_INLINE_H -#define PL_MATH_V_EXP_TAIL_INLINE_H - -#include "v_math.h" -#include "poly_advsimd_f64.h" - -#ifndef WANT_V_EXP_TAIL_SPECIALCASE -#error \ - "Cannot use v_exp_tail_inline.h without specifying whether you need the special case computation." -#endif - -#define N (1 << V_EXP_TAIL_TABLE_BITS) - -static const struct data -{ - float64x2_t poly[4]; -#if WANT_V_EXP_TAIL_SPECIALCASE - float64x2_t big_bound, huge_bound; -#endif - float64x2_t shift, invln2, ln2_hi, ln2_lo; -} data = { -#if WANT_V_EXP_TAIL_SPECIALCASE - .big_bound = V2 (704.0), - .huge_bound = V2 (1280.0 * N), -#endif - .shift = V2 (0x1.8p52), - .invln2 = V2 (0x1.71547652b82fep8), /* N/ln2. */ - .ln2_hi = V2 (0x1.62e42fefa39efp-9), /* ln2/N. */ - .ln2_lo = V2 (0x1.abc9e3b39803f3p-64), - .poly = { V2 (1.0), V2 (0x1.fffffffffffd4p-2), V2 (0x1.5555571d6b68cp-3), - V2 (0x1.5555576a59599p-5) }, -}; - -static inline uint64x2_t -lookup_sbits (uint64x2_t i) -{ - return (uint64x2_t){__v_exp_tail_data[i[0]], __v_exp_tail_data[i[1]]}; -} - -#if WANT_V_EXP_TAIL_SPECIALCASE -#define SpecialOffset v_u64 (0x6000000000000000) /* 0x1p513. */ -/* The following 2 bias when combined form the exponent bias: - SpecialBias1 - SpecialBias2 = asuint64(1.0). */ -#define SpecialBias1 v_u64 (0x7000000000000000) /* 0x1p769. */ -#define SpecialBias2 v_u64 (0x3010000000000000) /* 0x1p-254. */ -static float64x2_t VPCS_ATTR -v_exp_tail_special_case (float64x2_t s, float64x2_t y, float64x2_t n, - const struct data *d) -{ - /* 2^(n/N) may overflow, break it up into s1*s2. 
*/ - uint64x2_t b = vandq_u64 (vclezq_f64 (n), SpecialOffset); - float64x2_t s1 = vreinterpretq_f64_u64 (vsubq_u64 (SpecialBias1, b)); - float64x2_t s2 = vreinterpretq_f64_u64 ( - vaddq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (s), SpecialBias2), b)); - uint64x2_t oflow = vcagtq_f64 (n, d->huge_bound); - float64x2_t r0 = vmulq_f64 (vfmaq_f64 (s2, y, s2), s1); - float64x2_t r1 = vmulq_f64 (s1, s1); - return vbslq_f64 (oflow, r1, r0); -} -#endif - -static inline float64x2_t VPCS_ATTR -v_exp_tail_inline (float64x2_t x, float64x2_t xtail) -{ - const struct data *d = ptr_barrier (&data); -#if WANT_V_EXP_TAIL_SPECIALCASE - uint64x2_t special = vcgtq_f64 (vabsq_f64 (x), d->big_bound); -#endif - /* n = round(x/(ln2/N)). */ - float64x2_t z = vfmaq_f64 (d->shift, x, d->invln2); - uint64x2_t u = vreinterpretq_u64_f64 (z); - float64x2_t n = vsubq_f64 (z, d->shift); - - /* r = x - n*ln2/N. */ - float64x2_t r = x; - r = vfmsq_f64 (r, d->ln2_hi, n); - r = vfmsq_f64 (r, d->ln2_lo, n); - - uint64x2_t e = vshlq_n_u64 (u, 52 - V_EXP_TAIL_TABLE_BITS); - uint64x2_t i = vandq_u64 (u, v_u64 (N - 1)); - - /* y = tail + exp(r) - 1 ~= r + C1 r^2 + C2 r^3 + C3 r^4, using Horner. */ - float64x2_t y = v_horner_3_f64 (r, d->poly); - y = vfmaq_f64 (xtail, y, r); - - /* s = 2^(n/N). */ - u = lookup_sbits (i); - float64x2_t s = vreinterpretq_f64_u64 (vaddq_u64 (u, e)); - -#if WANT_V_EXP_TAIL_SPECIALCASE - if (unlikely (v_any_u64 (special))) - return v_exp_tail_special_case (s, y, n, d); -#endif - return vfmaq_f64 (s, y, s); -} -#endif // PL_MATH_V_EXP_TAIL_INLINE_H diff --git a/pl/math/v_expf_inline.h b/pl/math/v_expf_inline.h deleted file mode 100644 index 166683726b4db3..00000000000000 --- a/pl/math/v_expf_inline.h +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Helper for single-precision routines which calculate exp(x) and do not - * need special-case handling - * - * Copyright (c) 2019-2023, Arm Limited. 
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#ifndef PL_MATH_V_EXPF_INLINE_H -#define PL_MATH_V_EXPF_INLINE_H - -#include "v_math.h" - -struct v_expf_data -{ - float32x4_t poly[5]; - float32x4_t shift, invln2_and_ln2; -}; - -/* maxerr: 1.45358 +0.5 ulp. */ -#define V_EXPF_DATA \ - { \ - .poly = { V4 (0x1.0e4020p-7f), V4 (0x1.573e2ep-5f), V4 (0x1.555e66p-3f), \ - V4 (0x1.fffdb6p-2f), V4 (0x1.ffffecp-1f) }, \ - .shift = V4 (0x1.8p23f), \ - .invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 }, \ - } - -#define ExponentBias v_u32 (0x3f800000) /* asuint(1.0f). */ -#define C(i) d->poly[i] - -static inline float32x4_t -v_expf_inline (float32x4_t x, const struct v_expf_data *d) -{ - /* Helper routine for calculating exp(x). - Copied from v_expf.c, with all special-case handling removed - the - calling routine should handle special values if required. */ - - /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] - x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ - float32x4_t n, r, z; - z = vfmaq_laneq_f32 (d->shift, x, d->invln2_and_ln2, 0); - n = vsubq_f32 (z, d->shift); - r = vfmsq_laneq_f32 (x, n, d->invln2_and_ln2, 1); - r = vfmsq_laneq_f32 (r, n, d->invln2_and_ln2, 2); - uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23); - float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, ExponentBias)); - - /* Custom order-4 Estrin avoids building high order monomial. */ - float32x4_t r2 = vmulq_f32 (r, r); - float32x4_t p, q, poly; - p = vfmaq_f32 (C (1), C (0), r); - q = vfmaq_f32 (C (3), C (2), r); - q = vfmaq_f32 (q, p, r2); - p = vmulq_f32 (C (4), r); - poly = vfmaq_f32 (p, q, r2); - return vfmaq_f32 (scale, poly, scale); -} - -#endif // PL_MATH_V_EXPF_INLINE_H diff --git a/pl/math/v_expm1_2u5.c b/pl/math/v_expm1_2u5.c deleted file mode 100644 index dd255472cec0f9..00000000000000 --- a/pl/math/v_expm1_2u5.c +++ /dev/null @@ -1,118 +0,0 @@ -/* - * Double-precision vector exp(x) - 1 function. 
- * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "v_math.h" -#include "poly_advsimd_f64.h" -#include "pl_sig.h" -#include "pl_test.h" - -static const struct data -{ - float64x2_t poly[11]; - float64x2_t invln2, ln2, shift; - int64x2_t exponent_bias; -#if WANT_SIMD_EXCEPT - uint64x2_t thresh, tiny_bound; -#else - float64x2_t oflow_bound; -#endif -} data = { - /* Generated using fpminimax, with degree=12 in [log(2)/2, log(2)/2]. */ - .poly = { V2 (0x1p-1), V2 (0x1.5555555555559p-3), V2 (0x1.555555555554bp-5), - V2 (0x1.111111110f663p-7), V2 (0x1.6c16c16c1b5f3p-10), - V2 (0x1.a01a01affa35dp-13), V2 (0x1.a01a018b4ecbbp-16), - V2 (0x1.71ddf82db5bb4p-19), V2 (0x1.27e517fc0d54bp-22), - V2 (0x1.af5eedae67435p-26), V2 (0x1.1f143d060a28ap-29) }, - .invln2 = V2 (0x1.71547652b82fep0), - .ln2 = { 0x1.62e42fefa39efp-1, 0x1.abc9e3b39803fp-56 }, - .shift = V2 (0x1.8p52), - .exponent_bias = V2 (0x3ff0000000000000), -#if WANT_SIMD_EXCEPT - /* asuint64(oflow_bound) - asuint64(0x1p-51), shifted left by 1 for abs - compare. */ - .thresh = V2 (0x78c56fa6d34b552), - /* asuint64(0x1p-51) << 1. */ - .tiny_bound = V2 (0x3cc0000000000000 << 1), -#else - /* Value above which expm1(x) should overflow. Absolute value of the - underflow bound is greater than this, so it catches both cases - there is - a small window where fallbacks are triggered unnecessarily. */ - .oflow_bound = V2 (0x1.62b7d369a5aa9p+9), -#endif -}; - -static float64x2_t VPCS_ATTR NOINLINE -special_case (float64x2_t x, float64x2_t y, uint64x2_t special) -{ - return v_call_f64 (expm1, x, y, special); -} - -/* Double-precision vector exp(x) - 1 function. - The maximum error observed error is 2.18 ULP: - _ZGVnN2v_expm1 (0x1.634ba0c237d7bp-2) got 0x1.a8b9ea8d66e22p-2 - want 0x1.a8b9ea8d66e2p-2. 
*/ -float64x2_t VPCS_ATTR V_NAME_D1 (expm1) (float64x2_t x) -{ - const struct data *d = ptr_barrier (&data); - - uint64x2_t ix = vreinterpretq_u64_f64 (x); - -#if WANT_SIMD_EXCEPT - /* If fp exceptions are to be triggered correctly, fall back to scalar for - |x| < 2^-51, |x| > oflow_bound, Inf & NaN. Add ix to itself for - shift-left by 1, and compare with thresh which was left-shifted offline - - this is effectively an absolute compare. */ - uint64x2_t special - = vcgeq_u64 (vsubq_u64 (vaddq_u64 (ix, ix), d->tiny_bound), d->thresh); - if (unlikely (v_any_u64 (special))) - x = v_zerofy_f64 (x, special); -#else - /* Large input, NaNs and Infs. */ - uint64x2_t special = vcageq_f64 (x, d->oflow_bound); -#endif - - /* Reduce argument to smaller range: - Let i = round(x / ln2) - and f = x - i * ln2, then f is in [-ln2/2, ln2/2]. - exp(x) - 1 = 2^i * (expm1(f) + 1) - 1 - where 2^i is exact because i is an integer. */ - float64x2_t n = vsubq_f64 (vfmaq_f64 (d->shift, d->invln2, x), d->shift); - int64x2_t i = vcvtq_s64_f64 (n); - float64x2_t f = vfmsq_laneq_f64 (x, n, d->ln2, 0); - f = vfmsq_laneq_f64 (f, n, d->ln2, 1); - - /* Approximate expm1(f) using polynomial. - Taylor expansion for expm1(x) has the form: - x + ax^2 + bx^3 + cx^4 .... - So we calculate the polynomial P(f) = a + bf + cf^2 + ... - and assemble the approximation expm1(f) ~= f + f^2 * P(f). */ - float64x2_t f2 = vmulq_f64 (f, f); - float64x2_t f4 = vmulq_f64 (f2, f2); - float64x2_t f8 = vmulq_f64 (f4, f4); - float64x2_t p = vfmaq_f64 (f, f2, v_estrin_10_f64 (f, f2, f4, f8, d->poly)); - - /* Assemble the result. - expm1(x) ~= 2^i * (p + 1) - 1 - Let t = 2^i. */ - int64x2_t u = vaddq_s64 (vshlq_n_s64 (i, 52), d->exponent_bias); - float64x2_t t = vreinterpretq_f64_s64 (u); - - if (unlikely (v_any_u64 (special))) - return special_case (vreinterpretq_f64_u64 (ix), - vfmaq_f64 (vsubq_f64 (t, v_f64 (1.0)), p, t), - special); - - /* expm1(x) ~= p * t + (t - 1). 
*/ - return vfmaq_f64 (vsubq_f64 (t, v_f64 (1.0)), p, t); -} - -PL_SIG (V, D, 1, expm1, -9.9, 9.9) -PL_TEST_ULP (V_NAME_D1 (expm1), 1.68) -PL_TEST_EXPECT_FENV (V_NAME_D1 (expm1), WANT_SIMD_EXCEPT) -PL_TEST_SYM_INTERVAL (V_NAME_D1 (expm1), 0, 0x1p-51, 1000) -PL_TEST_SYM_INTERVAL (V_NAME_D1 (expm1), 0x1p-51, 0x1.62b7d369a5aa9p+9, 100000) -PL_TEST_SYM_INTERVAL (V_NAME_D1 (expm1), 0x1.62b7d369a5aa9p+9, inf, 100) diff --git a/pl/math/v_expm1f_1u6.c b/pl/math/v_expm1f_1u6.c deleted file mode 100644 index 6b282d0cc00f3b..00000000000000 --- a/pl/math/v_expm1f_1u6.c +++ /dev/null @@ -1,117 +0,0 @@ -/* - * Single-precision vector exp(x) - 1 function. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "v_math.h" -#include "poly_advsimd_f32.h" -#include "pl_sig.h" -#include "pl_test.h" - -static const struct data -{ - float32x4_t poly[5]; - float32x4_t invln2_and_ln2; - float32x4_t shift; - int32x4_t exponent_bias; -#if WANT_SIMD_EXCEPT - uint32x4_t thresh; -#else - float32x4_t oflow_bound; -#endif -} data = { - /* Generated using fpminimax with degree=5 in [-log(2)/2, log(2)/2]. */ - .poly = { V4 (0x1.fffffep-2), V4 (0x1.5554aep-3), V4 (0x1.555736p-5), - V4 (0x1.12287cp-7), V4 (0x1.6b55a2p-10) }, - /* Stores constants: invln2, ln2_hi, ln2_lo, 0. */ - .invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 }, - .shift = V4 (0x1.8p23f), - .exponent_bias = V4 (0x3f800000), -#if !WANT_SIMD_EXCEPT - /* Value above which expm1f(x) should overflow. Absolute value of the - underflow bound is greater than this, so it catches both cases - there is - a small window where fallbacks are triggered unnecessarily. */ - .oflow_bound = V4 (0x1.5ebc4p+6), -#else - /* asuint(oflow_bound) - asuint(0x1p-23), shifted left by 1 for absolute - compare. */ - .thresh = V4 (0x1d5ebc40), -#endif -}; - -/* asuint(0x1p-23), shifted by 1 for abs compare. 
*/ -#define TinyBound v_u32 (0x34000000 << 1) - -static float32x4_t VPCS_ATTR NOINLINE -special_case (float32x4_t x, float32x4_t y, uint32x4_t special) -{ - return v_call_f32 (expm1f, x, y, special); -} - -/* Single-precision vector exp(x) - 1 function. - The maximum error is 1.51 ULP: - _ZGVnN4v_expm1f (0x1.8baa96p-2) got 0x1.e2fb9p-2 - want 0x1.e2fb94p-2. */ -float32x4_t VPCS_ATTR V_NAME_F1 (expm1) (float32x4_t x) -{ - const struct data *d = ptr_barrier (&data); - uint32x4_t ix = vreinterpretq_u32_f32 (x); - -#if WANT_SIMD_EXCEPT - /* If fp exceptions are to be triggered correctly, fall back to scalar for - |x| < 2^-23, |x| > oflow_bound, Inf & NaN. Add ix to itself for - shift-left by 1, and compare with thresh which was left-shifted offline - - this is effectively an absolute compare. */ - uint32x4_t special - = vcgeq_u32 (vsubq_u32 (vaddq_u32 (ix, ix), TinyBound), d->thresh); - if (unlikely (v_any_u32 (special))) - x = v_zerofy_f32 (x, special); -#else - /* Handles very large values (+ve and -ve), +/-NaN, +/-Inf. */ - uint32x4_t special = vcagtq_f32 (x, d->oflow_bound); -#endif - - /* Reduce argument to smaller range: - Let i = round(x / ln2) - and f = x - i * ln2, then f is in [-ln2/2, ln2/2]. - exp(x) - 1 = 2^i * (expm1(f) + 1) - 1 - where 2^i is exact because i is an integer. */ - float32x4_t j = vsubq_f32 ( - vfmaq_laneq_f32 (d->shift, x, d->invln2_and_ln2, 0), d->shift); - int32x4_t i = vcvtq_s32_f32 (j); - float32x4_t f = vfmsq_laneq_f32 (x, j, d->invln2_and_ln2, 1); - f = vfmsq_laneq_f32 (f, j, d->invln2_and_ln2, 2); - - /* Approximate expm1(f) using polynomial. - Taylor expansion for expm1(x) has the form: - x + ax^2 + bx^3 + cx^4 .... - So we calculate the polynomial P(f) = a + bf + cf^2 + ... - and assemble the approximation expm1(f) ~= f + f^2 * P(f). */ - float32x4_t p = v_horner_4_f32 (f, d->poly); - p = vfmaq_f32 (f, vmulq_f32 (f, f), p); - - /* Assemble the result. - expm1(x) ~= 2^i * (p + 1) - 1 - Let t = 2^i. 
*/ - int32x4_t u = vaddq_s32 (vshlq_n_s32 (i, 23), d->exponent_bias); - float32x4_t t = vreinterpretq_f32_s32 (u); - - if (unlikely (v_any_u32 (special))) - return special_case (vreinterpretq_f32_u32 (ix), - vfmaq_f32 (vsubq_f32 (t, v_f32 (1.0f)), p, t), - special); - - /* expm1(x) ~= p * t + (t - 1). */ - return vfmaq_f32 (vsubq_f32 (t, v_f32 (1.0f)), p, t); -} - -PL_SIG (V, F, 1, expm1, -9.9, 9.9) -PL_TEST_ULP (V_NAME_F1 (expm1), 1.02) -PL_TEST_EXPECT_FENV (V_NAME_F1 (expm1), WANT_SIMD_EXCEPT) -PL_TEST_SYM_INTERVAL (V_NAME_F1 (expm1), 0, 0x1p-23, 1000) -PL_TEST_INTERVAL (V_NAME_F1 (expm1), -0x1p-23, 0x1.5ebc4p+6, 1000000) -PL_TEST_INTERVAL (V_NAME_F1 (expm1), -0x1p-23, -0x1.9bbabcp+6, 1000000) -PL_TEST_INTERVAL (V_NAME_F1 (expm1), 0x1.5ebc4p+6, inf, 1000) -PL_TEST_INTERVAL (V_NAME_F1 (expm1), -0x1.9bbabcp+6, -inf, 1000) diff --git a/pl/math/v_expm1f_inline.h b/pl/math/v_expm1f_inline.h deleted file mode 100644 index 6ae94c452de2f1..00000000000000 --- a/pl/math/v_expm1f_inline.h +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Helper for single-precision routines which calculate exp(x) - 1 and do not - * need special-case handling - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#ifndef PL_MATH_V_EXPM1F_INLINE_H -#define PL_MATH_V_EXPM1F_INLINE_H - -#include "v_math.h" -#include "math_config.h" -#include "poly_advsimd_f32.h" - -struct v_expm1f_data -{ - float32x4_t poly[5]; - float32x4_t invln2_and_ln2, shift; - int32x4_t exponent_bias; -}; - -/* Coefficients generated using fpminimax with degree=5 in [-log(2)/2, - log(2)/2]. Exponent bias is asuint(1.0f). - invln2_and_ln2 Stores constants: invln2, ln2_lo, ln2_hi, 0. 
*/ -#define V_EXPM1F_DATA \ - { \ - .poly = { V4 (0x1.fffffep-2), V4 (0x1.5554aep-3), V4 (0x1.555736p-5), \ - V4 (0x1.12287cp-7), V4 (0x1.6b55a2p-10) }, \ - .shift = V4 (0x1.8p23f), .exponent_bias = V4 (0x3f800000), \ - .invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 }, \ - } - -static inline float32x4_t -expm1f_inline (float32x4_t x, const struct v_expm1f_data *d) -{ - /* Helper routine for calculating exp(x) - 1. - Copied from v_expm1f_1u6.c, with all special-case handling removed - the - calling routine should handle special values if required. */ - - /* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */ - float32x4_t j = vsubq_f32 ( - vfmaq_laneq_f32 (d->shift, x, d->invln2_and_ln2, 0), d->shift); - int32x4_t i = vcvtq_s32_f32 (j); - float32x4_t f = vfmsq_laneq_f32 (x, j, d->invln2_and_ln2, 1); - f = vfmsq_laneq_f32 (f, j, d->invln2_and_ln2, 2); - - /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f). - Uses Estrin scheme, where the main _ZGVnN4v_expm1f routine uses - Horner. */ - float32x4_t f2 = vmulq_f32 (f, f); - float32x4_t f4 = vmulq_f32 (f2, f2); - float32x4_t p = v_estrin_4_f32 (f, f2, f4, d->poly); - p = vfmaq_f32 (f, f2, p); - - /* t = 2^i. */ - int32x4_t u = vaddq_s32 (vshlq_n_s32 (i, 23), d->exponent_bias); - float32x4_t t = vreinterpretq_f32_s32 (u); - /* expm1(x) ~= p * t + (t - 1). */ - return vfmaq_f32 (vsubq_f32 (t, v_f32 (1.0f)), p, t); -} - -#endif // PL_MATH_V_EXPM1F_INLINE_H diff --git a/pl/math/v_log10_2u5.c b/pl/math/v_log10_2u5.c deleted file mode 100644 index 35dd62fe5e3ef8..00000000000000 --- a/pl/math/v_log10_2u5.c +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Double-precision vector log10(x) function. - * - * Copyright (c) 2022-2023, Arm Limited. 
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "v_math.h" -#include "pl_sig.h" -#include "pl_test.h" -#include "poly_advsimd_f64.h" - -#define N (1 << V_LOG10_TABLE_BITS) - -static const struct data -{ - uint64x2_t min_norm; - uint32x4_t special_bound; - float64x2_t poly[5]; - float64x2_t invln10, log10_2, ln2; - uint64x2_t sign_exp_mask; -} data = { - /* Computed from log coefficients divided by log(10) then rounded to double - precision. */ - .poly = { V2 (-0x1.bcb7b1526e506p-3), V2 (0x1.287a7636be1d1p-3), - V2 (-0x1.bcb7b158af938p-4), V2 (0x1.63c78734e6d07p-4), - V2 (-0x1.287461742fee4p-4) }, - .ln2 = V2 (0x1.62e42fefa39efp-1), - .invln10 = V2 (0x1.bcb7b1526e50ep-2), - .log10_2 = V2 (0x1.34413509f79ffp-2), - .min_norm = V2 (0x0010000000000000), /* asuint64(0x1p-1022). */ - .special_bound = V4 (0x7fe00000), /* asuint64(inf) - min_norm. */ - .sign_exp_mask = V2 (0xfff0000000000000), -}; - -#define Off v_u64 (0x3fe6900900000000) -#define IndexMask (N - 1) - -#define T(s, i) __v_log10_data.s[i] - -struct entry -{ - float64x2_t invc; - float64x2_t log10c; -}; - -static inline struct entry -lookup (uint64x2_t i) -{ - struct entry e; - uint64_t i0 = (i[0] >> (52 - V_LOG10_TABLE_BITS)) & IndexMask; - uint64_t i1 = (i[1] >> (52 - V_LOG10_TABLE_BITS)) & IndexMask; - float64x2_t e0 = vld1q_f64 (&__v_log10_data.table[i0].invc); - float64x2_t e1 = vld1q_f64 (&__v_log10_data.table[i1].invc); - e.invc = vuzp1q_f64 (e0, e1); - e.log10c = vuzp2q_f64 (e0, e1); - return e; -} - -static float64x2_t VPCS_ATTR NOINLINE -special_case (float64x2_t x, float64x2_t y, float64x2_t hi, float64x2_t r2, - uint32x2_t special) -{ - return v_call_f64 (log10, x, vfmaq_f64 (hi, r2, y), vmovl_u32 (special)); -} - -/* Fast implementation of double-precision vector log10 - is a slight modification of double-precision vector log. - Max ULP error: < 2.5 ulp (nearest rounding.) 
- Maximum measured at 2.46 ulp for x in [0.96, 0.97] - _ZGVnN2v_log10(0x1.13192407fcb46p+0) got 0x1.fff6be3cae4bbp-6 - want 0x1.fff6be3cae4b9p-6. */ -float64x2_t VPCS_ATTR V_NAME_D1 (log10) (float64x2_t x) -{ - const struct data *d = ptr_barrier (&data); - uint64x2_t ix = vreinterpretq_u64_f64 (x); - uint32x2_t special = vcge_u32 (vsubhn_u64 (ix, d->min_norm), - vget_low_u32 (d->special_bound)); - - /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. - The range is split into N subintervals. - The ith subinterval contains z and c is near its center. */ - uint64x2_t tmp = vsubq_u64 (ix, Off); - int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); - uint64x2_t iz = vsubq_u64 (ix, vandq_u64 (tmp, d->sign_exp_mask)); - float64x2_t z = vreinterpretq_f64_u64 (iz); - - struct entry e = lookup (tmp); - - /* log10(x) = log1p(z/c-1)/log(10) + log10(c) + k*log10(2). */ - float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc); - float64x2_t kd = vcvtq_f64_s64 (k); - - /* hi = r / log(10) + log10(c) + k*log10(2). - Constants in v_log10_data.c are computed (in extended precision) as - e.log10c := e.logc * ivln10. */ - float64x2_t w = vfmaq_f64 (e.log10c, r, d->invln10); - - /* y = log10(1+r) + n * log10(2). */ - float64x2_t hi = vfmaq_f64 (w, kd, d->log10_2); - - /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. 
*/ - float64x2_t r2 = vmulq_f64 (r, r); - float64x2_t y = v_pw_horner_4_f64 (r, r2, d->poly); - - if (unlikely (v_any_u32h (special))) - return special_case (x, y, hi, r2, special); - return vfmaq_f64 (hi, r2, y); -} - -PL_SIG (V, D, 1, log10, 0.01, 11.1) -PL_TEST_ULP (V_NAME_D1 (log10), 1.97) -PL_TEST_EXPECT_FENV_ALWAYS (V_NAME_D1 (log10)) -PL_TEST_INTERVAL (V_NAME_D1 (log10), -0.0, -inf, 1000) -PL_TEST_INTERVAL (V_NAME_D1 (log10), 0, 0x1p-149, 1000) -PL_TEST_INTERVAL (V_NAME_D1 (log10), 0x1p-149, 0x1p-126, 4000) -PL_TEST_INTERVAL (V_NAME_D1 (log10), 0x1p-126, 0x1p-23, 50000) -PL_TEST_INTERVAL (V_NAME_D1 (log10), 0x1p-23, 1.0, 50000) -PL_TEST_INTERVAL (V_NAME_D1 (log10), 1.0, 100, 50000) -PL_TEST_INTERVAL (V_NAME_D1 (log10), 100, inf, 50000) diff --git a/pl/math/v_log10f_3u5.c b/pl/math/v_log10f_3u5.c deleted file mode 100644 index 92bc50ba5bd93a..00000000000000 --- a/pl/math/v_log10f_3u5.c +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Single-precision vector log10 function. - * - * Copyright (c) 2020-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "v_math.h" -#include "poly_advsimd_f32.h" -#include "pl_sig.h" -#include "pl_test.h" - -static const struct data -{ - uint32x4_t min_norm; - uint16x8_t special_bound; - float32x4_t poly[8]; - float32x4_t inv_ln10, ln2; - uint32x4_t off, mantissa_mask; -} data = { - /* Use order 9 for log10(1+x), i.e. order 8 for log10(1+x)/x, with x in - [-1/3, 1/3] (offset=2/3). Max. relative error: 0x1.068ee468p-25. */ - .poly = { V4 (-0x1.bcb79cp-3f), V4 (0x1.2879c8p-3f), V4 (-0x1.bcd472p-4f), - V4 (0x1.6408f8p-4f), V4 (-0x1.246f8p-4f), V4 (0x1.f0e514p-5f), - V4 (-0x1.0fc92cp-4f), V4 (0x1.f5f76ap-5f) }, - .ln2 = V4 (0x1.62e43p-1f), - .inv_ln10 = V4 (0x1.bcb7b2p-2f), - .min_norm = V4 (0x00800000), - .special_bound = V8 (0x7f00), /* asuint32(inf) - min_norm. */ - .off = V4 (0x3f2aaaab), /* 0.666667. 
*/ - .mantissa_mask = V4 (0x007fffff), -}; - -static float32x4_t VPCS_ATTR NOINLINE -special_case (float32x4_t x, float32x4_t y, float32x4_t p, float32x4_t r2, - uint16x4_t cmp) -{ - /* Fall back to scalar code. */ - return v_call_f32 (log10f, x, vfmaq_f32 (y, p, r2), vmovl_u16 (cmp)); -} - -/* Fast implementation of AdvSIMD log10f, - uses a similar approach as AdvSIMD logf with the same offset (i.e., 2/3) and - an order 9 polynomial. - Maximum error: 3.305ulps (nearest rounding.) - _ZGVnN4v_log10f(0x1.555c16p+0) got 0x1.ffe2fap-4 - want 0x1.ffe2f4p-4. */ -float32x4_t VPCS_ATTR V_NAME_F1 (log10) (float32x4_t x) -{ - const struct data *d = ptr_barrier (&data); - uint32x4_t u = vreinterpretq_u32_f32 (x); - uint16x4_t special = vcge_u16 (vsubhn_u32 (u, d->min_norm), - vget_low_u16 (d->special_bound)); - - /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ - u = vsubq_u32 (u, d->off); - float32x4_t n = vcvtq_f32_s32 ( - vshrq_n_s32 (vreinterpretq_s32_u32 (u), 23)); /* signextend. */ - u = vaddq_u32 (vandq_u32 (u, d->mantissa_mask), d->off); - float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f)); - - /* y = log10(1+r) + n * log10(2). */ - float32x4_t r2 = vmulq_f32 (r, r); - float32x4_t poly = v_pw_horner_7_f32 (r, r2, d->poly); - /* y = Log10(2) * n + poly * InvLn(10). 
*/ - float32x4_t y = vfmaq_f32 (r, d->ln2, n); - y = vmulq_f32 (y, d->inv_ln10); - - if (unlikely (v_any_u16h (special))) - return special_case (x, y, poly, r2, special); - return vfmaq_f32 (y, poly, r2); -} - -PL_SIG (V, F, 1, log10, 0.01, 11.1) -PL_TEST_ULP (V_NAME_F1 (log10), 2.81) -PL_TEST_EXPECT_FENV_ALWAYS (V_NAME_F1 (log10)) -PL_TEST_INTERVAL (V_NAME_F1 (log10), -0.0, -inf, 100) -PL_TEST_INTERVAL (V_NAME_F1 (log10), 0, 0x1p-126, 100) -PL_TEST_INTERVAL (V_NAME_F1 (log10), 0x1p-126, 0x1p-23, 50000) -PL_TEST_INTERVAL (V_NAME_F1 (log10), 0x1p-23, 1.0, 50000) -PL_TEST_INTERVAL (V_NAME_F1 (log10), 1.0, 100, 50000) -PL_TEST_INTERVAL (V_NAME_F1 (log10), 100, inf, 50000) diff --git a/pl/math/v_log1p_2u5.c b/pl/math/v_log1p_2u5.c deleted file mode 100644 index face02ddc6c388..00000000000000 --- a/pl/math/v_log1p_2u5.c +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Double-precision vector log(1+x) function. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "v_math.h" -#include "poly_advsimd_f64.h" -#include "pl_sig.h" -#include "pl_test.h" - -const static struct data -{ - float64x2_t poly[19], ln2[2]; - uint64x2_t hf_rt2_top, one_m_hf_rt2_top, umask, inf, minus_one; - int64x2_t one_top; -} data = { - /* Generated using Remez, deg=20, in [sqrt(2)/2-1, sqrt(2)-1]. 
*/ - .poly = { V2 (-0x1.ffffffffffffbp-2), V2 (0x1.55555555551a9p-2), - V2 (-0x1.00000000008e3p-2), V2 (0x1.9999999a32797p-3), - V2 (-0x1.555555552fecfp-3), V2 (0x1.249248e071e5ap-3), - V2 (-0x1.ffffff8bf8482p-4), V2 (0x1.c71c8f07da57ap-4), - V2 (-0x1.9999ca4ccb617p-4), V2 (0x1.7459ad2e1dfa3p-4), - V2 (-0x1.554d2680a3ff2p-4), V2 (0x1.3b4c54d487455p-4), - V2 (-0x1.2548a9ffe80e6p-4), V2 (0x1.0f389a24b2e07p-4), - V2 (-0x1.eee4db15db335p-5), V2 (0x1.e95b494d4a5ddp-5), - V2 (-0x1.15fdf07cb7c73p-4), V2 (0x1.0310b70800fcfp-4), - V2 (-0x1.cfa7385bdb37ep-6) }, - .ln2 = { V2 (0x1.62e42fefa3800p-1), V2 (0x1.ef35793c76730p-45) }, - /* top32(asuint64(sqrt(2)/2)) << 32. */ - .hf_rt2_top = V2 (0x3fe6a09e00000000), - /* (top32(asuint64(1)) - top32(asuint64(sqrt(2)/2))) << 32. */ - .one_m_hf_rt2_top = V2 (0x00095f6200000000), - .umask = V2 (0x000fffff00000000), - .one_top = V2 (0x3ff), - .inf = V2 (0x7ff0000000000000), - .minus_one = V2 (0xbff0000000000000) -}; - -#define BottomMask v_u64 (0xffffffff) - -static float64x2_t VPCS_ATTR NOINLINE -special_case (float64x2_t x, float64x2_t y, uint64x2_t special) -{ - return v_call_f64 (log1p, x, y, special); -} - -/* Vector log1p approximation using polynomial on reduced interval. Routine is - a modification of the algorithm used in scalar log1p, with no shortcut for - k=0 and no narrowing for f and k. Maximum observed error is 2.45 ULP: - _ZGVnN2v_log1p(0x1.658f7035c4014p+11) got 0x1.fd61d0727429dp+2 - want 0x1.fd61d0727429fp+2 . 
*/ -VPCS_ATTR float64x2_t V_NAME_D1 (log1p) (float64x2_t x) -{ - const struct data *d = ptr_barrier (&data); - uint64x2_t ix = vreinterpretq_u64_f64 (x); - uint64x2_t ia = vreinterpretq_u64_f64 (vabsq_f64 (x)); - uint64x2_t special = vcgeq_u64 (ia, d->inf); - -#if WANT_SIMD_EXCEPT - special = vorrq_u64 (special, - vcgeq_u64 (ix, vreinterpretq_u64_f64 (v_f64 (-1)))); - if (unlikely (v_any_u64 (special))) - x = v_zerofy_f64 (x, special); -#else - special = vorrq_u64 (special, vcleq_f64 (x, v_f64 (-1))); -#endif - - /* With x + 1 = t * 2^k (where t = f + 1 and k is chosen such that f - is in [sqrt(2)/2, sqrt(2)]): - log1p(x) = k*log(2) + log1p(f). - - f may not be representable exactly, so we need a correction term: - let m = round(1 + x), c = (1 + x) - m. - c << m: at very small x, log1p(x) ~ x, hence: - log(1+x) - log(m) ~ c/m. - - We therefore calculate log1p(x) by k*log2 + log1p(f) + c/m. */ - - /* Obtain correctly scaled k by manipulation in the exponent. - The scalar algorithm casts down to 32-bit at this point to calculate k and - u_red. We stay in double-width to obtain f and k, using the same constants - as the scalar algorithm but shifted left by 32. */ - float64x2_t m = vaddq_f64 (x, v_f64 (1)); - uint64x2_t mi = vreinterpretq_u64_f64 (m); - uint64x2_t u = vaddq_u64 (mi, d->one_m_hf_rt2_top); - - int64x2_t ki - = vsubq_s64 (vreinterpretq_s64_u64 (vshrq_n_u64 (u, 52)), d->one_top); - float64x2_t k = vcvtq_f64_s64 (ki); - - /* Reduce x to f in [sqrt(2)/2, sqrt(2)]. */ - uint64x2_t utop = vaddq_u64 (vandq_u64 (u, d->umask), d->hf_rt2_top); - uint64x2_t u_red = vorrq_u64 (utop, vandq_u64 (mi, BottomMask)); - float64x2_t f = vsubq_f64 (vreinterpretq_f64_u64 (u_red), v_f64 (1)); - - /* Correction term c/m. */ - float64x2_t cm = vdivq_f64 (vsubq_f64 (x, vsubq_f64 (m, v_f64 (1))), m); - - /* Approximate log1p(x) on the reduced input using a polynomial. Because - log1p(0)=0 we choose an approximation of the form: - x + C0*x^2 + C1*x^3 + C2x^4 + ... 
- Hence approximation has the form f + f^2 * P(f) - where P(x) = C0 + C1*x + C2x^2 + ... - Assembling this all correctly is dealt with at the final step. */ - float64x2_t f2 = vmulq_f64 (f, f); - float64x2_t p = v_pw_horner_18_f64 (f, f2, d->poly); - - float64x2_t ylo = vfmaq_f64 (cm, k, d->ln2[1]); - float64x2_t yhi = vfmaq_f64 (f, k, d->ln2[0]); - float64x2_t y = vaddq_f64 (ylo, yhi); - - if (unlikely (v_any_u64 (special))) - return special_case (vreinterpretq_f64_u64 (ix), vfmaq_f64 (y, f2, p), - special); - - return vfmaq_f64 (y, f2, p); -} - -PL_SIG (V, D, 1, log1p, -0.9, 10.0) -PL_TEST_ULP (V_NAME_D1 (log1p), 1.97) -PL_TEST_EXPECT_FENV (V_NAME_D1 (log1p), WANT_SIMD_EXCEPT) -PL_TEST_SYM_INTERVAL (V_NAME_D1 (log1p), 0.0, 0x1p-23, 50000) -PL_TEST_SYM_INTERVAL (V_NAME_D1 (log1p), 0x1p-23, 0.001, 50000) -PL_TEST_SYM_INTERVAL (V_NAME_D1 (log1p), 0.001, 1.0, 50000) -PL_TEST_INTERVAL (V_NAME_D1 (log1p), 1, inf, 40000) -PL_TEST_INTERVAL (V_NAME_D1 (log1p), -1.0, -inf, 500) diff --git a/pl/math/v_log1p_inline.h b/pl/math/v_log1p_inline.h deleted file mode 100644 index bd57bfc6fe6e84..00000000000000 --- a/pl/math/v_log1p_inline.h +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Helper for vector double-precision routines which calculate log(1 + x) and do - * not need special-case handling - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -#ifndef PL_MATH_V_LOG1P_INLINE_H -#define PL_MATH_V_LOG1P_INLINE_H - -#include "v_math.h" -#include "poly_advsimd_f64.h" - -struct v_log1p_data -{ - float64x2_t poly[19], ln2[2]; - uint64x2_t hf_rt2_top, one_m_hf_rt2_top, umask; - int64x2_t one_top; -}; - -/* Coefficients generated using Remez, deg=20, in [sqrt(2)/2-1, sqrt(2)-1]. 
*/ -#define V_LOG1P_CONSTANTS_TABLE \ - { \ - .poly = { V2 (-0x1.ffffffffffffbp-2), V2 (0x1.55555555551a9p-2), \ - V2 (-0x1.00000000008e3p-2), V2 (0x1.9999999a32797p-3), \ - V2 (-0x1.555555552fecfp-3), V2 (0x1.249248e071e5ap-3), \ - V2 (-0x1.ffffff8bf8482p-4), V2 (0x1.c71c8f07da57ap-4), \ - V2 (-0x1.9999ca4ccb617p-4), V2 (0x1.7459ad2e1dfa3p-4), \ - V2 (-0x1.554d2680a3ff2p-4), V2 (0x1.3b4c54d487455p-4), \ - V2 (-0x1.2548a9ffe80e6p-4), V2 (0x1.0f389a24b2e07p-4), \ - V2 (-0x1.eee4db15db335p-5), V2 (0x1.e95b494d4a5ddp-5), \ - V2 (-0x1.15fdf07cb7c73p-4), V2 (0x1.0310b70800fcfp-4), \ - V2 (-0x1.cfa7385bdb37ep-6) }, \ - .ln2 = { V2 (0x1.62e42fefa3800p-1), V2 (0x1.ef35793c76730p-45) }, \ - .hf_rt2_top = V2 (0x3fe6a09e00000000), \ - .one_m_hf_rt2_top = V2 (0x00095f6200000000), \ - .umask = V2 (0x000fffff00000000), .one_top = V2 (0x3ff) \ - } - -#define BottomMask v_u64 (0xffffffff) - -static inline float64x2_t -log1p_inline (float64x2_t x, const struct v_log1p_data *d) -{ - /* Helper for calculating log(x + 1). Copied from v_log1p_2u5.c, with several - modifications: - - No special-case handling - this should be dealt with by the caller. - - Pairwise Horner polynomial evaluation for improved accuracy. - - Optionally simulate the shortcut for k=0, used in the scalar routine, - using v_sel, for improved accuracy when the argument to log1p is close to - 0. This feature is enabled by defining WANT_V_LOG1P_K0_SHORTCUT as 1 in - the source of the caller before including this file. - See v_log1pf_2u1.c for details of the algorithm. */ - float64x2_t m = vaddq_f64 (x, v_f64 (1)); - uint64x2_t mi = vreinterpretq_u64_f64 (m); - uint64x2_t u = vaddq_u64 (mi, d->one_m_hf_rt2_top); - - int64x2_t ki - = vsubq_s64 (vreinterpretq_s64_u64 (vshrq_n_u64 (u, 52)), d->one_top); - float64x2_t k = vcvtq_f64_s64 (ki); - - /* Reduce x to f in [sqrt(2)/2, sqrt(2)]. 
*/ - uint64x2_t utop = vaddq_u64 (vandq_u64 (u, d->umask), d->hf_rt2_top); - uint64x2_t u_red = vorrq_u64 (utop, vandq_u64 (mi, BottomMask)); - float64x2_t f = vsubq_f64 (vreinterpretq_f64_u64 (u_red), v_f64 (1)); - - /* Correction term c/m. */ - float64x2_t cm = vdivq_f64 (vsubq_f64 (x, vsubq_f64 (m, v_f64 (1))), m); - -#ifndef WANT_V_LOG1P_K0_SHORTCUT -#error \ - "Cannot use v_log1p_inline.h without specifying whether you need the k0 shortcut for greater accuracy close to 0" -#elif WANT_V_LOG1P_K0_SHORTCUT - /* Shortcut if k is 0 - set correction term to 0 and f to x. The result is - that the approximation is solely the polynomial. */ - uint64x2_t k0 = vceqzq_f64 (k); - cm = v_zerofy_f64 (cm, k0); - f = vbslq_f64 (k0, x, f); -#endif - - /* Approximate log1p(f) on the reduced input using a polynomial. */ - float64x2_t f2 = vmulq_f64 (f, f); - float64x2_t p = v_pw_horner_18_f64 (f, f2, d->poly); - - /* Assemble log1p(x) = k * log2 + log1p(f) + c/m. */ - float64x2_t ylo = vfmaq_f64 (cm, k, d->ln2[1]); - float64x2_t yhi = vfmaq_f64 (f, k, d->ln2[0]); - return vfmaq_f64 (vaddq_f64 (ylo, yhi), f2, p); -} - -#endif // PL_MATH_V_LOG1P_INLINE_H diff --git a/pl/math/v_log1pf_2u1.c b/pl/math/v_log1pf_2u1.c deleted file mode 100644 index 153c88da9c888d..00000000000000 --- a/pl/math/v_log1pf_2u1.c +++ /dev/null @@ -1,126 +0,0 @@ -/* - * Single-precision vector log(1+x) function. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "v_math.h" -#include "pl_sig.h" -#include "pl_test.h" -#include "poly_advsimd_f32.h" - -const static struct data -{ - float32x4_t poly[8], ln2; - uint32x4_t tiny_bound, minus_one, four, thresh; - int32x4_t three_quarters; -} data = { - .poly = { /* Generated using FPMinimax in [-0.25, 0.5]. First two coefficients - (1, -0.5) are not stored as they can be generated more - efficiently. 
*/ - V4 (0x1.5555aap-2f), V4 (-0x1.000038p-2f), V4 (0x1.99675cp-3f), - V4 (-0x1.54ef78p-3f), V4 (0x1.28a1f4p-3f), V4 (-0x1.0da91p-3f), - V4 (0x1.abcb6p-4f), V4 (-0x1.6f0d5ep-5f) }, - .ln2 = V4 (0x1.62e43p-1f), - .tiny_bound = V4 (0x34000000), /* asuint32(0x1p-23). ulp=0.5 at 0x1p-23. */ - .thresh = V4 (0x4b800000), /* asuint32(INFINITY) - tiny_bound. */ - .minus_one = V4 (0xbf800000), - .four = V4 (0x40800000), - .three_quarters = V4 (0x3f400000) -}; - -static inline float32x4_t -eval_poly (float32x4_t m, const float32x4_t *p) -{ - /* Approximate log(1+m) on [-0.25, 0.5] using split Estrin scheme. */ - float32x4_t p_12 = vfmaq_f32 (v_f32 (-0.5), m, p[0]); - float32x4_t p_34 = vfmaq_f32 (p[1], m, p[2]); - float32x4_t p_56 = vfmaq_f32 (p[3], m, p[4]); - float32x4_t p_78 = vfmaq_f32 (p[5], m, p[6]); - - float32x4_t m2 = vmulq_f32 (m, m); - float32x4_t p_02 = vfmaq_f32 (m, m2, p_12); - float32x4_t p_36 = vfmaq_f32 (p_34, m2, p_56); - float32x4_t p_79 = vfmaq_f32 (p_78, m2, p[7]); - - float32x4_t m4 = vmulq_f32 (m2, m2); - float32x4_t p_06 = vfmaq_f32 (p_02, m4, p_36); - return vfmaq_f32 (p_06, m4, vmulq_f32 (m4, p_79)); -} - -static float32x4_t NOINLINE VPCS_ATTR -special_case (float32x4_t x, float32x4_t y, uint32x4_t special) -{ - return v_call_f32 (log1pf, x, y, special); -} - -/* Vector log1pf approximation using polynomial on reduced interval. Accuracy - is roughly 2.02 ULP: - log1pf(0x1.21e13ap-2) got 0x1.fe8028p-3 want 0x1.fe802cp-3. */ -VPCS_ATTR float32x4_t V_NAME_F1 (log1p) (float32x4_t x) -{ - const struct data *d = ptr_barrier (&data); - - uint32x4_t ix = vreinterpretq_u32_f32 (x); - uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x)); - uint32x4_t special_cases - = vorrq_u32 (vcgeq_u32 (vsubq_u32 (ia, d->tiny_bound), d->thresh), - vcgeq_u32 (ix, d->minus_one)); - float32x4_t special_arg = x; - -#if WANT_SIMD_EXCEPT - if (unlikely (v_any_u32 (special_cases))) - /* Side-step special lanes so fenv exceptions are not triggered - inadvertently. 
*/ - x = v_zerofy_f32 (x, special_cases); -#endif - - /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m - is in [-0.25, 0.5]): - log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2). - - We approximate log1p(m) with a polynomial, then scale by - k*log(2). Instead of doing this directly, we use an intermediate - scale factor s = 4*k*log(2) to ensure the scale is representable - as a normalised fp32 number. */ - - float32x4_t m = vaddq_f32 (x, v_f32 (1.0f)); - - /* Choose k to scale x to the range [-1/4, 1/2]. */ - int32x4_t k - = vandq_s32 (vsubq_s32 (vreinterpretq_s32_f32 (m), d->three_quarters), - v_s32 (0xff800000)); - uint32x4_t ku = vreinterpretq_u32_s32 (k); - - /* Scale x by exponent manipulation. */ - float32x4_t m_scale - = vreinterpretq_f32_u32 (vsubq_u32 (vreinterpretq_u32_f32 (x), ku)); - - /* Scale up to ensure that the scale factor is representable as normalised - fp32 number, and scale m down accordingly. */ - float32x4_t s = vreinterpretq_f32_u32 (vsubq_u32 (d->four, ku)); - m_scale = vaddq_f32 (m_scale, vfmaq_f32 (v_f32 (-1.0f), v_f32 (0.25f), s)); - - /* Evaluate polynomial on the reduced interval. */ - float32x4_t p = eval_poly (m_scale, d->poly); - - /* The scale factor to be applied back at the end - by multiplying float(k) - by 2^-23 we get the unbiased exponent of k. */ - float32x4_t scale_back = vcvtq_f32_s32 (vshrq_n_s32 (k, 23)); - - /* Apply the scaling back. 
*/ - float32x4_t y = vfmaq_f32 (p, scale_back, d->ln2); - - if (unlikely (v_any_u32 (special_cases))) - return special_case (special_arg, y, special_cases); - return y; -} - -PL_SIG (V, F, 1, log1p, -0.9, 10.0) -PL_TEST_ULP (V_NAME_F1 (log1p), 1.53) -PL_TEST_EXPECT_FENV (V_NAME_F1 (log1p), WANT_SIMD_EXCEPT) -PL_TEST_SYM_INTERVAL (V_NAME_F1 (log1p), 0.0, 0x1p-23, 30000) -PL_TEST_SYM_INTERVAL (V_NAME_F1 (log1p), 0x1p-23, 1, 50000) -PL_TEST_INTERVAL (V_NAME_F1 (log1p), 1, inf, 50000) -PL_TEST_INTERVAL (V_NAME_F1 (log1p), -1.0, -inf, 1000) diff --git a/pl/math/v_log1pf_inline.h b/pl/math/v_log1pf_inline.h deleted file mode 100644 index c654c6bad08fd7..00000000000000 --- a/pl/math/v_log1pf_inline.h +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Helper for single-precision routines which calculate log(1 + x) and do not - * need special-case handling - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#ifndef PL_MATH_V_LOG1PF_INLINE_H -#define PL_MATH_V_LOG1PF_INLINE_H - -#include "v_math.h" -#include "poly_advsimd_f32.h" - -struct v_log1pf_data -{ - float32x4_t poly[8], ln2; - uint32x4_t four; - int32x4_t three_quarters; -}; - -/* Polynomial generated using FPMinimax in [-0.25, 0.5]. First two coefficients - (1, -0.5) are not stored as they can be generated more efficiently. */ -#define V_LOG1PF_CONSTANTS_TABLE \ - { \ - .poly \ - = { V4 (0x1.5555aap-2f), V4 (-0x1.000038p-2f), V4 (0x1.99675cp-3f), \ - V4 (-0x1.54ef78p-3f), V4 (0x1.28a1f4p-3f), V4 (-0x1.0da91p-3f), \ - V4 (0x1.abcb6p-4f), V4 (-0x1.6f0d5ep-5f) }, \ - .ln2 = V4 (0x1.62e43p-1f), .four = V4 (0x40800000), \ - .three_quarters = V4 (0x3f400000) \ - } - -static inline float32x4_t -eval_poly (float32x4_t m, const float32x4_t *c) -{ - /* Approximate log(1+m) on [-0.25, 0.5] using pairwise Horner (main routine - uses split Estrin, but this way reduces register pressure in the calling - routine). 
*/ - float32x4_t q = vfmaq_f32 (v_f32 (-0.5), m, c[0]); - float32x4_t m2 = vmulq_f32 (m, m); - q = vfmaq_f32 (m, m2, q); - float32x4_t p = v_pw_horner_6_f32 (m, m2, c + 1); - p = vmulq_f32 (m2, p); - return vfmaq_f32 (q, m2, p); -} - -static inline float32x4_t -log1pf_inline (float32x4_t x, const struct v_log1pf_data d) -{ - /* Helper for calculating log(x + 1). Copied from log1pf_2u1.c, with no - special-case handling. See that file for details of the algorithm. */ - float32x4_t m = vaddq_f32 (x, v_f32 (1.0f)); - int32x4_t k - = vandq_s32 (vsubq_s32 (vreinterpretq_s32_f32 (m), d.three_quarters), - v_s32 (0xff800000)); - uint32x4_t ku = vreinterpretq_u32_s32 (k); - float32x4_t s = vreinterpretq_f32_u32 (vsubq_u32 (d.four, ku)); - float32x4_t m_scale - = vreinterpretq_f32_u32 (vsubq_u32 (vreinterpretq_u32_f32 (x), ku)); - m_scale = vaddq_f32 (m_scale, vfmaq_f32 (v_f32 (-1.0f), v_f32 (0.25f), s)); - float32x4_t p = eval_poly (m_scale, d.poly); - float32x4_t scale_back = vmulq_f32 (vcvtq_f32_s32 (k), v_f32 (0x1.0p-23f)); - return vfmaq_f32 (p, scale_back, d.ln2); -} - -#endif // PL_MATH_V_LOG1PF_INLINE_H diff --git a/pl/math/v_log2_3u.c b/pl/math/v_log2_3u.c deleted file mode 100644 index 2dd2c34b7c97f0..00000000000000 --- a/pl/math/v_log2_3u.c +++ /dev/null @@ -1,109 +0,0 @@ -/* - * Double-precision vector log2 function. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "v_math.h" -#include "pl_sig.h" -#include "pl_test.h" -#include "poly_advsimd_f64.h" - -#define N (1 << V_LOG2_TABLE_BITS) - -static const struct data -{ - uint64x2_t min_norm; - uint32x4_t special_bound; - float64x2_t poly[5]; - float64x2_t invln2; - uint64x2_t sign_exp_mask; -} data = { - /* Each coefficient was generated to approximate log(r) for |r| < 0x1.fp-9 - and N = 128, then scaled by log2(e) in extended precision and rounded back - to double precision. 
*/ - .poly = { V2 (-0x1.71547652b83p-1), V2 (0x1.ec709dc340953p-2), - V2 (-0x1.71547651c8f35p-2), V2 (0x1.2777ebe12dda5p-2), - V2 (-0x1.ec738d616fe26p-3) }, - .invln2 = V2 (0x1.71547652b82fep0), - .min_norm = V2 (0x0010000000000000), /* asuint64(0x1p-1022). */ - .special_bound = V4 (0x7fe00000), /* asuint64(inf) - min_norm. */ - .sign_exp_mask = V2 (0xfff0000000000000), -}; - -#define Off v_u64 (0x3fe6900900000000) -#define IndexMask (N - 1) - -struct entry -{ - float64x2_t invc; - float64x2_t log2c; -}; - -static inline struct entry -lookup (uint64x2_t i) -{ - struct entry e; - uint64_t i0 = (i[0] >> (52 - V_LOG2_TABLE_BITS)) & IndexMask; - uint64_t i1 = (i[1] >> (52 - V_LOG2_TABLE_BITS)) & IndexMask; - float64x2_t e0 = vld1q_f64 (&__v_log2_data.table[i0].invc); - float64x2_t e1 = vld1q_f64 (&__v_log2_data.table[i1].invc); - e.invc = vuzp1q_f64 (e0, e1); - e.log2c = vuzp2q_f64 (e0, e1); - return e; -} - -static float64x2_t VPCS_ATTR NOINLINE -special_case (float64x2_t x, float64x2_t y, float64x2_t w, float64x2_t r2, - uint32x2_t special) -{ - return v_call_f64 (log2, x, vfmaq_f64 (w, r2, y), vmovl_u32 (special)); -} - -/* Double-precision vector log2 routine. Implements the same algorithm as - vector log10, with coefficients and table entries scaled in extended - precision. The maximum observed error is 2.58 ULP: - _ZGVnN2v_log2(0x1.0b556b093869bp+0) got 0x1.fffb34198d9dap-5 - want 0x1.fffb34198d9ddp-5. */ -float64x2_t VPCS_ATTR V_NAME_D1 (log2) (float64x2_t x) -{ - const struct data *d = ptr_barrier (&data); - uint64x2_t ix = vreinterpretq_u64_f64 (x); - uint32x2_t special = vcge_u32 (vsubhn_u64 (ix, d->min_norm), - vget_low_u32 (d->special_bound)); - - /* x = 2^k z; where z is in range [Off,2*Off) and exact. - The range is split into N subintervals. - The ith subinterval contains z and c is near its center. 
*/ - uint64x2_t tmp = vsubq_u64 (ix, Off); - int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); - uint64x2_t iz = vsubq_u64 (ix, vandq_u64 (tmp, d->sign_exp_mask)); - float64x2_t z = vreinterpretq_f64_u64 (iz); - - struct entry e = lookup (tmp); - - /* log2(x) = log1p(z/c-1)/log(2) + log2(c) + k. */ - - float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc); - float64x2_t kd = vcvtq_f64_s64 (k); - float64x2_t w = vfmaq_f64 (e.log2c, r, d->invln2); - - float64x2_t r2 = vmulq_f64 (r, r); - float64x2_t y = v_pw_horner_4_f64 (r, r2, d->poly); - w = vaddq_f64 (kd, w); - - if (unlikely (v_any_u32h (special))) - return special_case (x, y, w, r2, special); - return vfmaq_f64 (w, r2, y); -} - -PL_SIG (V, D, 1, log2, 0.01, 11.1) -PL_TEST_ULP (V_NAME_D1 (log2), 2.09) -PL_TEST_EXPECT_FENV_ALWAYS (V_NAME_D1 (log2)) -PL_TEST_INTERVAL (V_NAME_D1 (log2), -0.0, -0x1p126, 100) -PL_TEST_INTERVAL (V_NAME_D1 (log2), 0x1p-149, 0x1p-126, 4000) -PL_TEST_INTERVAL (V_NAME_D1 (log2), 0x1p-126, 0x1p-23, 50000) -PL_TEST_INTERVAL (V_NAME_D1 (log2), 0x1p-23, 1.0, 50000) -PL_TEST_INTERVAL (V_NAME_D1 (log2), 1.0, 100, 50000) -PL_TEST_INTERVAL (V_NAME_D1 (log2), 100, inf, 50000) diff --git a/pl/math/v_log2f_2u5.c b/pl/math/v_log2f_2u5.c deleted file mode 100644 index c64d88742136e1..00000000000000 --- a/pl/math/v_log2f_2u5.c +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Single-precision vector log2 function. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "v_math.h" -#include "poly_advsimd_f32.h" -#include "pl_sig.h" -#include "pl_test.h" - -static const struct data -{ - uint32x4_t min_norm; - uint16x8_t special_bound; - uint32x4_t off, mantissa_mask; - float32x4_t poly[9]; -} data = { - /* Coefficients generated using Remez algorithm approximate - log2(1+r)/r for r in [ -1/3, 1/3 ]. - rel error: 0x1.c4c4b0cp-26. */ - .poly = { V4 (0x1.715476p0f), /* (float)(1 / ln(2)). 
*/ - V4 (-0x1.715458p-1f), V4 (0x1.ec701cp-2f), V4 (-0x1.7171a4p-2f), - V4 (0x1.27a0b8p-2f), V4 (-0x1.e5143ep-3f), V4 (0x1.9d8ecap-3f), - V4 (-0x1.c675bp-3f), V4 (0x1.9e495p-3f) }, - .min_norm = V4 (0x00800000), - .special_bound = V8 (0x7f00), /* asuint32(inf) - min_norm. */ - .off = V4 (0x3f2aaaab), /* 0.666667. */ - .mantissa_mask = V4 (0x007fffff), -}; - -static float32x4_t VPCS_ATTR NOINLINE -special_case (float32x4_t x, float32x4_t n, float32x4_t p, float32x4_t r, - uint16x4_t cmp) -{ - /* Fall back to scalar code. */ - return v_call_f32 (log2f, x, vfmaq_f32 (n, p, r), vmovl_u16 (cmp)); -} - -/* Fast implementation for single precision AdvSIMD log2, - relies on same argument reduction as AdvSIMD logf. - Maximum error: 2.48 ULPs - _ZGVnN4v_log2f(0x1.558174p+0) got 0x1.a9be84p-2 - want 0x1.a9be8p-2. */ -float32x4_t VPCS_ATTR V_NAME_F1 (log2) (float32x4_t x) -{ - const struct data *d = ptr_barrier (&data); - uint32x4_t u = vreinterpretq_u32_f32 (x); - uint16x4_t special = vcge_u16 (vsubhn_u32 (u, d->min_norm), - vget_low_u16 (d->special_bound)); - - /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ - u = vsubq_u32 (u, d->off); - float32x4_t n = vcvtq_f32_s32 ( - vshrq_n_s32 (vreinterpretq_s32_u32 (u), 23)); /* signextend. */ - u = vaddq_u32 (vandq_u32 (u, d->mantissa_mask), d->off); - float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f)); - - /* y = log2(1+r) + n. 
*/ - float32x4_t r2 = vmulq_f32 (r, r); - float32x4_t p = v_pw_horner_8_f32 (r, r2, d->poly); - - if (unlikely (v_any_u16h (special))) - return special_case (x, n, p, r, special); - return vfmaq_f32 (n, p, r); -} - -PL_SIG (V, F, 1, log2, 0.01, 11.1) -PL_TEST_ULP (V_NAME_F1 (log2), 1.99) -PL_TEST_EXPECT_FENV_ALWAYS (V_NAME_F1 (log2)) -PL_TEST_INTERVAL (V_NAME_F1 (log2), -0.0, -0x1p126, 100) -PL_TEST_INTERVAL (V_NAME_F1 (log2), 0x1p-149, 0x1p-126, 4000) -PL_TEST_INTERVAL (V_NAME_F1 (log2), 0x1p-126, 0x1p-23, 50000) -PL_TEST_INTERVAL (V_NAME_F1 (log2), 0x1p-23, 1.0, 50000) -PL_TEST_INTERVAL (V_NAME_F1 (log2), 1.0, 100, 50000) -PL_TEST_INTERVAL (V_NAME_F1 (log2), 100, inf, 50000) diff --git a/pl/math/v_log_data.c b/pl/math/v_log_data.c deleted file mode 100644 index a26e8a051d973b..00000000000000 --- a/pl/math/v_log_data.c +++ /dev/null @@ -1,161 +0,0 @@ -/* - * Lookup table for double-precision log(x) vector function. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "math_config.h" - -const struct v_log_data __v_log_data = { - /* Worst-case error: 1.17 + 0.5 ulp. - Rel error: 0x1.6272e588p-56 in [ -0x1.fc1p-9 0x1.009p-8 ]. */ - .poly = { -0x1.ffffffffffff7p-2, 0x1.55555555170d4p-2, -0x1.0000000399c27p-2, - 0x1.999b2e90e94cap-3, -0x1.554e550bd501ep-3 }, - .ln2 = 0x1.62e42fefa39efp-1, - /* Algorithm: - - x = 2^k z - log(x) = k ln2 + log(c) + poly(z/c - 1) - - where z is in [a;2a) which is split into N subintervals (a=0x1.69009p-1, - N=128) and log(c) and 1/c for the ith subinterval comes from two lookup - tables: - - table[i].invc = 1/c - table[i].logc = (double)log(c) - - where c is near the center of the subinterval and is chosen by trying - several floating point invc candidates around 1/center and selecting one - for which the error in (double)log(c) is minimized (< 0x1p-74), except the - subinterval that contains 1 and the previous one got tweaked to avoid - cancellation. 
*/ - .table = { { 0x1.6a133d0dec120p+0, -0x1.62fe995eb963ap-2 }, - { 0x1.6815f2f3e42edp+0, -0x1.5d5a48dad6b67p-2 }, - { 0x1.661e39be1ac9ep+0, -0x1.57bde257d2769p-2 }, - { 0x1.642bfa30ac371p+0, -0x1.52294fbf2af55p-2 }, - { 0x1.623f1d916f323p+0, -0x1.4c9c7b598aa38p-2 }, - { 0x1.60578da220f65p+0, -0x1.47174fc5ff560p-2 }, - { 0x1.5e75349dea571p+0, -0x1.4199b7fa7b5cap-2 }, - { 0x1.5c97fd387a75ap+0, -0x1.3c239f48cfb99p-2 }, - { 0x1.5abfd2981f200p+0, -0x1.36b4f154d2aebp-2 }, - { 0x1.58eca051dc99cp+0, -0x1.314d9a0ff32fbp-2 }, - { 0x1.571e526d9df12p+0, -0x1.2bed85cca3cffp-2 }, - { 0x1.5554d555b3fcbp+0, -0x1.2694a11421af9p-2 }, - { 0x1.539015e2a20cdp+0, -0x1.2142d8d014fb2p-2 }, - { 0x1.51d0014ee0164p+0, -0x1.1bf81a2c77776p-2 }, - { 0x1.50148538cd9eep+0, -0x1.16b452a39c6a4p-2 }, - { 0x1.4e5d8f9f698a1p+0, -0x1.11776ffa6c67ep-2 }, - { 0x1.4cab0edca66bep+0, -0x1.0c416035020e0p-2 }, - { 0x1.4afcf1a9db874p+0, -0x1.071211aa10fdap-2 }, - { 0x1.495327136e16fp+0, -0x1.01e972e293b1bp-2 }, - { 0x1.47ad9e84af28fp+0, -0x1.f98ee587fd434p-3 }, - { 0x1.460c47b39ae15p+0, -0x1.ef5800ad716fbp-3 }, - { 0x1.446f12b278001p+0, -0x1.e52e160484698p-3 }, - { 0x1.42d5efdd720ecp+0, -0x1.db1104b19352ep-3 }, - { 0x1.4140cfe001a0fp+0, -0x1.d100ac59e0bd6p-3 }, - { 0x1.3fafa3b421f69p+0, -0x1.c6fced287c3bdp-3 }, - { 0x1.3e225c9c8ece5p+0, -0x1.bd05a7b317c29p-3 }, - { 0x1.3c98ec29a211ap+0, -0x1.b31abd229164fp-3 }, - { 0x1.3b13442a413fep+0, -0x1.a93c0edadb0a3p-3 }, - { 0x1.399156baa3c54p+0, -0x1.9f697ee30d7ddp-3 }, - { 0x1.38131639b4cdbp+0, -0x1.95a2efa9aa40ap-3 }, - { 0x1.36987540fbf53p+0, -0x1.8be843d796044p-3 }, - { 0x1.352166b648f61p+0, -0x1.82395ecc477edp-3 }, - { 0x1.33adddb3eb575p+0, -0x1.7896240966422p-3 }, - { 0x1.323dcd99fc1d3p+0, -0x1.6efe77aca8c55p-3 }, - { 0x1.30d129fefc7d2p+0, -0x1.65723e117ec5cp-3 }, - { 0x1.2f67e6b72fe7dp+0, -0x1.5bf15c0955706p-3 }, - { 0x1.2e01f7cf8b187p+0, -0x1.527bb6c111da1p-3 }, - { 0x1.2c9f518ddc86ep+0, -0x1.491133c939f8fp-3 }, - { 0x1.2b3fe86e5f413p+0, -0x1.3fb1b90c7fc58p-3 
}, - { 0x1.29e3b1211b25cp+0, -0x1.365d2cc485f8dp-3 }, - { 0x1.288aa08b373cfp+0, -0x1.2d13758970de7p-3 }, - { 0x1.2734abcaa8467p+0, -0x1.23d47a721fd47p-3 }, - { 0x1.25e1c82459b81p+0, -0x1.1aa0229f25ec2p-3 }, - { 0x1.2491eb1ad59c5p+0, -0x1.117655ddebc3bp-3 }, - { 0x1.23450a54048b5p+0, -0x1.0856fbf83ab6bp-3 }, - { 0x1.21fb1bb09e578p+0, -0x1.fe83fabbaa106p-4 }, - { 0x1.20b415346d8f7p+0, -0x1.ec6e8507a56cdp-4 }, - { 0x1.1f6fed179a1acp+0, -0x1.da6d68c7cc2eap-4 }, - { 0x1.1e2e99b93c7b3p+0, -0x1.c88078462be0cp-4 }, - { 0x1.1cf011a7a882ap+0, -0x1.b6a786a423565p-4 }, - { 0x1.1bb44b97dba5ap+0, -0x1.a4e2676ac7f85p-4 }, - { 0x1.1a7b3e66cdd4fp+0, -0x1.9330eea777e76p-4 }, - { 0x1.1944e11dc56cdp+0, -0x1.8192f134d5ad9p-4 }, - { 0x1.18112aebb1a6ep+0, -0x1.70084464f0538p-4 }, - { 0x1.16e013231b7e9p+0, -0x1.5e90bdec5cb1fp-4 }, - { 0x1.15b1913f156cfp+0, -0x1.4d2c3433c5536p-4 }, - { 0x1.14859cdedde13p+0, -0x1.3bda7e219879ap-4 }, - { 0x1.135c2dc68cfa4p+0, -0x1.2a9b732d27194p-4 }, - { 0x1.12353bdb01684p+0, -0x1.196eeb2b10807p-4 }, - { 0x1.1110bf25b85b4p+0, -0x1.0854be8ef8a7ep-4 }, - { 0x1.0feeafd2f8577p+0, -0x1.ee998cb277432p-5 }, - { 0x1.0ecf062c51c3bp+0, -0x1.ccadb79919fb9p-5 }, - { 0x1.0db1baa076c8bp+0, -0x1.aae5b1d8618b0p-5 }, - { 0x1.0c96c5bb3048ep+0, -0x1.89413015d7442p-5 }, - { 0x1.0b7e20263e070p+0, -0x1.67bfe7bf158dep-5 }, - { 0x1.0a67c2acd0ce3p+0, -0x1.46618f83941bep-5 }, - { 0x1.0953a6391e982p+0, -0x1.2525df1b0618ap-5 }, - { 0x1.0841c3caea380p+0, -0x1.040c8e2f77c6ap-5 }, - { 0x1.07321489b13eap+0, -0x1.c62aad39f738ap-6 }, - { 0x1.062491aee9904p+0, -0x1.847fe3bdead9cp-6 }, - { 0x1.05193497a7cc5p+0, -0x1.43183683400acp-6 }, - { 0x1.040ff6b5f5e9fp+0, -0x1.01f31c4e1d544p-6 }, - { 0x1.0308d19aa6127p+0, -0x1.82201d1e6b69ap-7 }, - { 0x1.0203beedb0c67p+0, -0x1.00dd0f3e1bfd6p-7 }, - { 0x1.010037d38bcc2p+0, -0x1.ff6fe1feb4e53p-9 }, - { 1.0, 0.0 }, - { 0x1.fc06d493cca10p-1, 0x1.fe91885ec8e20p-8 }, - { 0x1.f81e6ac3b918fp-1, 0x1.fc516f716296dp-7 }, - { 0x1.f44546ef18996p-1, 
0x1.7bb4dd70a015bp-6 }, - { 0x1.f07b10382c84bp-1, 0x1.f84c99b34b674p-6 }, - { 0x1.ecbf7070e59d4p-1, 0x1.39f9ce4fb2d71p-5 }, - { 0x1.e91213f715939p-1, 0x1.7756c0fd22e78p-5 }, - { 0x1.e572a9a75f7b7p-1, 0x1.b43ee82db8f3ap-5 }, - { 0x1.e1e0e2c530207p-1, 0x1.f0b3fced60034p-5 }, - { 0x1.de5c72d8a8be3p-1, 0x1.165bd78d4878ep-4 }, - { 0x1.dae50fa5658ccp-1, 0x1.3425d2715ebe6p-4 }, - { 0x1.d77a71145a2dap-1, 0x1.51b8bd91b7915p-4 }, - { 0x1.d41c51166623ep-1, 0x1.6f15632c76a47p-4 }, - { 0x1.d0ca6ba0bb29fp-1, 0x1.8c3c88ecbe503p-4 }, - { 0x1.cd847e8e59681p-1, 0x1.a92ef077625dap-4 }, - { 0x1.ca4a499693e00p-1, 0x1.c5ed5745fa006p-4 }, - { 0x1.c71b8e399e821p-1, 0x1.e27876de1c993p-4 }, - { 0x1.c3f80faf19077p-1, 0x1.fed104fce4cdcp-4 }, - { 0x1.c0df92dc2b0ecp-1, 0x1.0d7bd9c17d78bp-3 }, - { 0x1.bdd1de3cbb542p-1, 0x1.1b76986cef97bp-3 }, - { 0x1.baceb9e1007a3p-1, 0x1.295913d24f750p-3 }, - { 0x1.b7d5ef543e55ep-1, 0x1.37239fa295d17p-3 }, - { 0x1.b4e749977d953p-1, 0x1.44d68dd78714bp-3 }, - { 0x1.b20295155478ep-1, 0x1.52722ebe5d780p-3 }, - { 0x1.af279f8e82be2p-1, 0x1.5ff6d12671f98p-3 }, - { 0x1.ac5638197fdf3p-1, 0x1.6d64c2389484bp-3 }, - { 0x1.a98e2f102e087p-1, 0x1.7abc4da40fddap-3 }, - { 0x1.a6cf5606d05c1p-1, 0x1.87fdbda1e8452p-3 }, - { 0x1.a4197fc04d746p-1, 0x1.95295b06a5f37p-3 }, - { 0x1.a16c80293dc01p-1, 0x1.a23f6d34abbc5p-3 }, - { 0x1.9ec82c4dc5bc9p-1, 0x1.af403a28e04f2p-3 }, - { 0x1.9c2c5a491f534p-1, 0x1.bc2c06a85721ap-3 }, - { 0x1.9998e1480b618p-1, 0x1.c903161240163p-3 }, - { 0x1.970d9977c6c2dp-1, 0x1.d5c5aa93287ebp-3 }, - { 0x1.948a5c023d212p-1, 0x1.e274051823fa9p-3 }, - { 0x1.920f0303d6809p-1, 0x1.ef0e656300c16p-3 }, - { 0x1.8f9b698a98b45p-1, 0x1.fb9509f05aa2ap-3 }, - { 0x1.8d2f6b81726f6p-1, 0x1.04041821f37afp-2 }, - { 0x1.8acae5bb55badp-1, 0x1.0a340a49b3029p-2 }, - { 0x1.886db5d9275b8p-1, 0x1.105a7918a126dp-2 }, - { 0x1.8617ba567c13cp-1, 0x1.1677819812b84p-2 }, - { 0x1.83c8d27487800p-1, 0x1.1c8b405b40c0ep-2 }, - { 0x1.8180de3c5dbe7p-1, 0x1.2295d16cfa6b1p-2 }, - { 0x1.7f3fbe71cdb71p-1, 
0x1.28975066318a2p-2 }, - { 0x1.7d055498071c1p-1, 0x1.2e8fd855d86fcp-2 }, - { 0x1.7ad182e54f65ap-1, 0x1.347f83d605e59p-2 }, - { 0x1.78a42c3c90125p-1, 0x1.3a666d1244588p-2 }, - { 0x1.767d342f76944p-1, 0x1.4044adb6f8ec4p-2 }, - { 0x1.745c7ef26b00ap-1, 0x1.461a5f077558cp-2 }, - { 0x1.7241f15769d0fp-1, 0x1.4be799e20b9c8p-2 }, - { 0x1.702d70d396e41p-1, 0x1.51ac76a6b79dfp-2 }, - { 0x1.6e1ee3700cd11p-1, 0x1.57690d5744a45p-2 }, - { 0x1.6c162fc9cbe02p-1, 0x1.5d1d758e45217p-2 } } -}; diff --git a/pl/math/v_sinh_3u.c b/pl/math/v_sinh_3u.c deleted file mode 100644 index a644f54b4a0f75..00000000000000 --- a/pl/math/v_sinh_3u.c +++ /dev/null @@ -1,118 +0,0 @@ -/* - * Double-precision vector sinh(x) function. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "v_math.h" -#include "poly_advsimd_f64.h" -#include "pl_sig.h" -#include "pl_test.h" - -static const struct data -{ - float64x2_t poly[11]; - float64x2_t inv_ln2, m_ln2, shift; - uint64x2_t halff; - int64x2_t onef; -#if WANT_SIMD_EXCEPT - uint64x2_t tiny_bound, thresh; -#else - uint64x2_t large_bound; -#endif -} data = { - /* Generated using Remez, deg=12 in [-log(2)/2, log(2)/2]. */ - .poly = { V2 (0x1p-1), V2 (0x1.5555555555559p-3), V2 (0x1.555555555554bp-5), - V2 (0x1.111111110f663p-7), V2 (0x1.6c16c16c1b5f3p-10), - V2 (0x1.a01a01affa35dp-13), V2 (0x1.a01a018b4ecbbp-16), - V2 (0x1.71ddf82db5bb4p-19), V2 (0x1.27e517fc0d54bp-22), - V2 (0x1.af5eedae67435p-26), V2 (0x1.1f143d060a28ap-29), }, - - .inv_ln2 = V2 (0x1.71547652b82fep0), - .m_ln2 = (float64x2_t) {-0x1.62e42fefa39efp-1, -0x1.abc9e3b39803fp-56}, - .shift = V2 (0x1.8p52), - - .halff = V2 (0x3fe0000000000000), - .onef = V2 (0x3ff0000000000000), -#if WANT_SIMD_EXCEPT - /* 2^-26, below which sinh(x) rounds to x. */ - .tiny_bound = V2 (0x3e50000000000000), - /* asuint(large_bound) - asuint(tiny_bound). */ - .thresh = V2 (0x0230000000000000), -#else -/* 2^9. 
expm1 helper overflows for large input. */ - .large_bound = V2 (0x4080000000000000), -#endif -}; - -static inline float64x2_t -expm1_inline (float64x2_t x) -{ - const struct data *d = ptr_barrier (&data); - - /* Reduce argument: - exp(x) - 1 = 2^i * (expm1(f) + 1) - 1 - where i = round(x / ln2) - and f = x - i * ln2 (f in [-ln2/2, ln2/2]). */ - float64x2_t j = vsubq_f64 (vfmaq_f64 (d->shift, d->inv_ln2, x), d->shift); - int64x2_t i = vcvtq_s64_f64 (j); - float64x2_t f = vfmaq_laneq_f64 (x, j, d->m_ln2, 0); - f = vfmaq_laneq_f64 (f, j, d->m_ln2, 1); - /* Approximate expm1(f) using polynomial. */ - float64x2_t f2 = vmulq_f64 (f, f); - float64x2_t f4 = vmulq_f64 (f2, f2); - float64x2_t f8 = vmulq_f64 (f4, f4); - float64x2_t p = vfmaq_f64 (f, f2, v_estrin_10_f64 (f, f2, f4, f8, d->poly)); - /* t = 2^i. */ - float64x2_t t = vreinterpretq_f64_u64 ( - vreinterpretq_u64_s64 (vaddq_s64 (vshlq_n_s64 (i, 52), d->onef))); - /* expm1(x) ~= p * t + (t - 1). */ - return vfmaq_f64 (vsubq_f64 (t, v_f64 (1.0)), p, t); -} - -static float64x2_t NOINLINE VPCS_ATTR -special_case (float64x2_t x) -{ - return v_call_f64 (sinh, x, x, v_u64 (-1)); -} - -/* Approximation for vector double-precision sinh(x) using expm1. - sinh(x) = (exp(x) - exp(-x)) / 2. - The greatest observed error is 2.57 ULP: - _ZGVnN2v_sinh (0x1.9fb1d49d1d58bp-2) got 0x1.ab34e59d678dcp-2 - want 0x1.ab34e59d678d9p-2. */ -float64x2_t VPCS_ATTR V_NAME_D1 (sinh) (float64x2_t x) -{ - const struct data *d = ptr_barrier (&data); - - float64x2_t ax = vabsq_f64 (x); - uint64x2_t sign - = veorq_u64 (vreinterpretq_u64_f64 (x), vreinterpretq_u64_f64 (ax)); - float64x2_t halfsign = vreinterpretq_f64_u64 (vorrq_u64 (sign, d->halff)); - -#if WANT_SIMD_EXCEPT - uint64x2_t special = vcgeq_u64 ( - vsubq_u64 (vreinterpretq_u64_f64 (ax), d->tiny_bound), d->thresh); -#else - uint64x2_t special = vcgeq_u64 (vreinterpretq_u64_f64 (ax), d->large_bound); -#endif - - /* Fall back to scalar variant for all lanes if any of them are special. 
*/ - if (unlikely (v_any_u64 (special))) - return special_case (x); - - /* Up to the point that expm1 overflows, we can use it to calculate sinh - using a slight rearrangement of the definition of sinh. This allows us to - retain acceptable accuracy for very small inputs. */ - float64x2_t t = expm1_inline (ax); - t = vaddq_f64 (t, vdivq_f64 (t, vaddq_f64 (t, v_f64 (1.0)))); - return vmulq_f64 (t, halfsign); -} - -PL_SIG (V, D, 1, sinh, -10.0, 10.0) -PL_TEST_ULP (V_NAME_D1 (sinh), 2.08) -PL_TEST_EXPECT_FENV (V_NAME_D1 (sinh), WANT_SIMD_EXCEPT) -PL_TEST_SYM_INTERVAL (V_NAME_D1 (sinh), 0, 0x1p-26, 1000) -PL_TEST_SYM_INTERVAL (V_NAME_D1 (sinh), 0x1p-26, 0x1p9, 500000) -PL_TEST_SYM_INTERVAL (V_NAME_D1 (sinh), 0x1p9, inf, 1000) diff --git a/pl/math/v_tanh_3u.c b/pl/math/v_tanh_3u.c deleted file mode 100644 index 5de85c68da2cd3..00000000000000 --- a/pl/math/v_tanh_3u.c +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Double-precision vector tanh(x) function. - * Copyright (c) 2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "v_math.h" -#include "poly_advsimd_f64.h" -#include "mathlib.h" -#include "pl_sig.h" -#include "pl_test.h" - -static const struct data -{ - float64x2_t poly[11]; - float64x2_t inv_ln2, ln2_hi, ln2_lo, shift; - uint64x2_t onef; - uint64x2_t thresh, tiny_bound; -} data = { - /* Generated using Remez, deg=12 in [-log(2)/2, log(2)/2]. */ - .poly = { V2 (0x1p-1), V2 (0x1.5555555555559p-3), V2 (0x1.555555555554bp-5), - V2 (0x1.111111110f663p-7), V2 (0x1.6c16c16c1b5f3p-10), - V2 (0x1.a01a01affa35dp-13), V2 (0x1.a01a018b4ecbbp-16), - V2 (0x1.71ddf82db5bb4p-19), V2 (0x1.27e517fc0d54bp-22), - V2 (0x1.af5eedae67435p-26), V2 (0x1.1f143d060a28ap-29), }, - - .inv_ln2 = V2 (0x1.71547652b82fep0), - .ln2_hi = V2 (-0x1.62e42fefa39efp-1), - .ln2_lo = V2 (-0x1.abc9e3b39803fp-56), - .shift = V2 (0x1.8p52), - - .onef = V2 (0x3ff0000000000000), - .tiny_bound = V2 (0x3e40000000000000), /* asuint64 (0x1p-27). 
*/ - /* asuint64(0x1.241bf835f9d5fp+4) - asuint64(tiny_bound). */ - .thresh = V2 (0x01f241bf835f9d5f), -}; - -static inline float64x2_t -expm1_inline (float64x2_t x, const struct data *d) -{ - /* Helper routine for calculating exp(x) - 1. Vector port of the helper from - the scalar variant of tanh. */ - - /* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */ - float64x2_t j = vsubq_f64 (vfmaq_f64 (d->shift, d->inv_ln2, x), d->shift); - int64x2_t i = vcvtq_s64_f64 (j); - float64x2_t f = vfmaq_f64 (x, j, d->ln2_hi); - f = vfmaq_f64 (f, j, d->ln2_lo); - - /* Approximate expm1(f) using polynomial. */ - float64x2_t f2 = vmulq_f64 (f, f); - float64x2_t f4 = vmulq_f64 (f2, f2); - float64x2_t p = vfmaq_f64 ( - f, f2, v_estrin_10_f64 (f, f2, f4, vmulq_f64 (f4, f4), d->poly)); - - /* t = 2 ^ i. */ - float64x2_t t = vreinterpretq_f64_u64 ( - vaddq_u64 (vreinterpretq_u64_s64 (i << 52), d->onef)); - /* expm1(x) = p * t + (t - 1). */ - return vfmaq_f64 (vsubq_f64 (t, v_f64 (1)), p, t); -} - -static float64x2_t NOINLINE VPCS_ATTR -special_case (float64x2_t x, float64x2_t y, uint64x2_t special) -{ - return v_call_f64 (tanh, x, y, special); -} - -/* Vector approximation for double-precision tanh(x), using a simplified - version of expm1. The greatest observed error is 2.77 ULP: - _ZGVnN2v_tanh(-0x1.c4a4ca0f9f3b7p-3) got -0x1.bd6a21a163627p-3 - want -0x1.bd6a21a163624p-3. */ -float64x2_t VPCS_ATTR V_NAME_D1 (tanh) (float64x2_t x) -{ - const struct data *d = ptr_barrier (&data); - - uint64x2_t ia = vreinterpretq_u64_f64 (vabsq_f64 (x)); - - float64x2_t u = x; - - /* Trigger special-cases for tiny, boring and infinity/NaN. */ - uint64x2_t special = vcgtq_u64 (vsubq_u64 (ia, d->tiny_bound), d->thresh); -#if WANT_SIMD_EXCEPT - /* To trigger fp exceptions correctly, set special lanes to a neutral value. - They will be fixed up later by the special-case handler. 
*/ - if (unlikely (v_any_u64 (special))) - u = v_zerofy_f64 (u, special); -#endif - - u = vaddq_f64 (u, u); - - /* tanh(x) = (e^2x - 1) / (e^2x + 1). */ - float64x2_t q = expm1_inline (u, d); - float64x2_t qp2 = vaddq_f64 (q, v_f64 (2)); - - if (unlikely (v_any_u64 (special))) - return special_case (x, vdivq_f64 (q, qp2), special); - return vdivq_f64 (q, qp2); -} - -PL_SIG (V, D, 1, tanh, -10.0, 10.0) -PL_TEST_ULP (V_NAME_D1 (tanh), 2.27) -PL_TEST_EXPECT_FENV (V_NAME_D1 (tanh), WANT_SIMD_EXCEPT) -PL_TEST_SYM_INTERVAL (V_NAME_D1 (tanh), 0, 0x1p-27, 5000) -PL_TEST_SYM_INTERVAL (V_NAME_D1 (tanh), 0x1p-27, 0x1.241bf835f9d5fp+4, 50000) -PL_TEST_SYM_INTERVAL (V_NAME_D1 (tanh), 0x1.241bf835f9d5fp+4, inf, 1000) diff --git a/string/Dir.mk b/string/Dir.mk index 40ff5acc093e9d..dd8283ec4977a4 100644 --- a/string/Dir.mk +++ b/string/Dir.mk @@ -13,9 +13,12 @@ all-string bench-string check-string install-string clean-string: else string-lib-srcs := $(wildcard $(S)/$(ARCH)/*.[cS]) +string-lib-srcs += $(wildcard $(S)/$(ARCH)/experimental/*.[cS]) string-test-srcs := $(wildcard $(S)/test/*.c) string-bench-srcs := $(wildcard $(S)/bench/*.c) +string-arch-include-dir := $(wildcard $(S)/$(ARCH)) +string-arch-includes := $(wildcard $(S)/$(ARCH)/*.h) string-includes := $(patsubst $(S)/%,build/%,$(wildcard $(S)/include/*.h)) string-libs := \ @@ -43,6 +46,7 @@ string-tests := \ string-benches := \ build/bin/bench/memcpy \ + build/bin/bench/memset \ build/bin/bench/strlen string-lib-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(string-lib-srcs))) @@ -64,8 +68,8 @@ string-files := \ all-string: $(string-libs) $(string-tests) $(string-benches) $(string-includes) -$(string-objs): $(string-includes) -$(string-objs): CFLAGS_ALL += $(string-cflags) +$(string-objs): $(string-includes) $(string-arch-includes) +$(string-objs): CFLAGS_ALL += $(string-cflags) -I$(string-arch-include-dir) $(string-test-objs): CFLAGS_ALL += -D_GNU_SOURCE @@ -101,6 +105,7 @@ check-string: $(string-tests-out) 
bench-string: $(string-benches) $(EMULATOR) build/bin/bench/strlen $(EMULATOR) build/bin/bench/memcpy + $(EMULATOR) build/bin/bench/memset install-string: \ $(string-libs:build/lib/%=$(DESTDIR)$(libdir)/%) \ diff --git a/string/aarch64/__mtag_tag_region.S b/string/aarch64/__mtag_tag_region.S index 207e22950c6d3c..34b5789240dacb 100644 --- a/string/aarch64/__mtag_tag_region.S +++ b/string/aarch64/__mtag_tag_region.S @@ -27,9 +27,6 @@ #define zva_val x4 ENTRY (__mtag_tag_region) - PTR_ARG (0) - SIZE_ARG (1) - add dstend, dstin, count cmp count, 96 diff --git a/string/aarch64/__mtag_tag_zero_region.S b/string/aarch64/__mtag_tag_zero_region.S index 44b8e0114f4265..2fa248e25621eb 100644 --- a/string/aarch64/__mtag_tag_zero_region.S +++ b/string/aarch64/__mtag_tag_zero_region.S @@ -27,9 +27,6 @@ #define zva_val x4 ENTRY (__mtag_tag_zero_region) - PTR_ARG (0) - SIZE_ARG (1) - add dstend, dstin, count cmp count, 96 diff --git a/string/aarch64/asmdefs.h b/string/aarch64/asmdefs.h index 131b95e1fea98f..90166676977a95 100644 --- a/string/aarch64/asmdefs.h +++ b/string/aarch64/asmdefs.h @@ -21,19 +21,6 @@ #define FEATURE_1_PAC 2 /* Add a NT_GNU_PROPERTY_TYPE_0 note. */ -#ifdef __ILP32__ -#define GNU_PROPERTY(type, value) \ - .section .note.gnu.property, "a"; \ - .p2align 2; \ - .word 4; \ - .word 12; \ - .word 5; \ - .asciz "GNU"; \ - .word type; \ - .word 4; \ - .word value; \ - .text -#else #define GNU_PROPERTY(type, value) \ .section .note.gnu.property, "a"; \ .p2align 3; \ @@ -46,7 +33,6 @@ .word value; \ .word 0; \ .text -#endif /* If set then the GNU Property Note section will be added to mark objects to support BTI and PAC-RET. 
*/ @@ -80,27 +66,4 @@ GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC) #define L(l) .L ## l -#ifdef __ILP32__ - /* Sanitize padding bits of pointer arguments as per aapcs64 */ -#define PTR_ARG(n) mov w##n, w##n -#else -#define PTR_ARG(n) -#endif - -#ifdef __ILP32__ - /* Sanitize padding bits of size arguments as per aapcs64 */ -#define SIZE_ARG(n) mov w##n, w##n -#else -#define SIZE_ARG(n) -#endif - -/* Compiler supports SVE instructions */ -#ifndef HAVE_SVE -# if __aarch64__ && (__GNUC__ >= 8 || __clang_major__ >= 5) -# define HAVE_SVE 1 -# else -# define HAVE_SVE 0 -# endif -#endif - #endif diff --git a/string/aarch64/memchr-sve.S b/string/aarch64/experimental/memchr-sve.S similarity index 96% rename from string/aarch64/memchr-sve.S rename to string/aarch64/experimental/memchr-sve.S index b851cf31f2383e..b314551f3e0fea 100644 --- a/string/aarch64/memchr-sve.S +++ b/string/aarch64/experimental/memchr-sve.S @@ -7,7 +7,8 @@ #include "asmdefs.h" -#if __ARM_FEATURE_SVE +.arch armv8-a+sve + /* Assumptions: * * ARMv8-a, AArch64 @@ -15,8 +16,6 @@ */ ENTRY (__memchr_aarch64_sve) - PTR_ARG (0) - SIZE_ARG (2) dup z1.b, w1 /* duplicate c to a vector */ setffr /* initialize FFR */ mov x3, 0 /* initialize off */ @@ -59,6 +58,3 @@ ENTRY (__memchr_aarch64_sve) ret END (__memchr_aarch64_sve) - -#endif - diff --git a/string/aarch64/memcmp-sve.S b/string/aarch64/experimental/memcmp-sve.S similarity index 93% rename from string/aarch64/memcmp-sve.S rename to string/aarch64/experimental/memcmp-sve.S index d52ce4555344e5..ad3534836d046f 100644 --- a/string/aarch64/memcmp-sve.S +++ b/string/aarch64/experimental/memcmp-sve.S @@ -7,7 +7,8 @@ #include "asmdefs.h" -#if __ARM_FEATURE_SVE +.arch armv8-a+sve + /* Assumptions: * * ARMv8-a, AArch64 @@ -15,9 +16,6 @@ */ ENTRY (__memcmp_aarch64_sve) - PTR_ARG (0) - PTR_ARG (1) - SIZE_ARG (2) mov x3, 0 /* initialize off */ 0: whilelo p0.b, x3, x2 /* while off < max */ @@ -46,6 +44,3 @@ ENTRY (__memcmp_aarch64_sve) ret END 
(__memcmp_aarch64_sve) - -#endif - diff --git a/string/aarch64/stpcpy-sve.S b/string/aarch64/experimental/stpcpy-sve.S similarity index 100% rename from string/aarch64/stpcpy-sve.S rename to string/aarch64/experimental/stpcpy-sve.S diff --git a/string/aarch64/strchr-sve.S b/string/aarch64/experimental/strchr-sve.S similarity index 97% rename from string/aarch64/strchr-sve.S rename to string/aarch64/experimental/strchr-sve.S index ff075167bfefb7..7d74ae9ff232cd 100644 --- a/string/aarch64/strchr-sve.S +++ b/string/aarch64/experimental/strchr-sve.S @@ -7,7 +7,8 @@ #include "asmdefs.h" -#if __ARM_FEATURE_SVE +.arch armv8-a+sve + /* Assumptions: * * ARMv8-a, AArch64 @@ -22,7 +23,6 @@ #endif ENTRY (FUNC) - PTR_ARG (0) dup z1.b, w1 /* replicate byte across vector */ setffr /* initialize FFR */ ptrue p1.b /* all ones; loop invariant */ @@ -65,6 +65,3 @@ ENTRY (FUNC) b 0b END (FUNC) - -#endif - diff --git a/string/aarch64/strchrnul-sve.S b/string/aarch64/experimental/strchrnul-sve.S similarity index 100% rename from string/aarch64/strchrnul-sve.S rename to string/aarch64/experimental/strchrnul-sve.S diff --git a/string/aarch64/strcmp-sve.S b/string/aarch64/experimental/strcmp-sve.S similarity index 96% rename from string/aarch64/strcmp-sve.S rename to string/aarch64/experimental/strcmp-sve.S index eaf909a378f1f5..b6c24958853457 100644 --- a/string/aarch64/strcmp-sve.S +++ b/string/aarch64/experimental/strcmp-sve.S @@ -7,7 +7,8 @@ #include "asmdefs.h" -#if __ARM_FEATURE_SVE +.arch armv8-a+sve + /* Assumptions: * * ARMv8-a, AArch64 @@ -15,8 +16,6 @@ */ ENTRY (__strcmp_aarch64_sve) - PTR_ARG (0) - PTR_ARG (1) setffr /* initialize FFR */ ptrue p1.b, all /* all ones; loop invariant */ mov x2, 0 /* initialize offset */ @@ -54,6 +53,3 @@ ENTRY (__strcmp_aarch64_sve) b 1b END (__strcmp_aarch64_sve) - -#endif - diff --git a/string/aarch64/strcpy-sve.S b/string/aarch64/experimental/strcpy-sve.S similarity index 96% rename from string/aarch64/strcpy-sve.S rename to 
string/aarch64/experimental/strcpy-sve.S index 00e72dce4451b3..57b77c8a00e7aa 100644 --- a/string/aarch64/strcpy-sve.S +++ b/string/aarch64/experimental/strcpy-sve.S @@ -7,7 +7,8 @@ #include "asmdefs.h" -#if __ARM_FEATURE_SVE +.arch armv8-a+sve + /* Assumptions: * * ARMv8-a, AArch64 @@ -22,8 +23,6 @@ #endif ENTRY (FUNC) - PTR_ARG (0) - PTR_ARG (1) setffr /* initialize FFR */ ptrue p2.b, all /* all ones; loop invariant */ mov x2, 0 /* initialize offset */ @@ -66,6 +65,3 @@ ENTRY (FUNC) ret END (FUNC) - -#endif - diff --git a/string/aarch64/strlen-sve.S b/string/aarch64/experimental/strlen-sve.S similarity index 96% rename from string/aarch64/strlen-sve.S rename to string/aarch64/experimental/strlen-sve.S index 12ebbdba5c93ae..c83155052c07d2 100644 --- a/string/aarch64/strlen-sve.S +++ b/string/aarch64/experimental/strlen-sve.S @@ -7,7 +7,8 @@ #include "asmdefs.h" -#if __ARM_FEATURE_SVE +.arch armv8-a+sve + /* Assumptions: * * ARMv8-a, AArch64 @@ -15,7 +16,6 @@ */ ENTRY (__strlen_aarch64_sve) - PTR_ARG (0) setffr /* initialize FFR */ ptrue p2.b /* all ones; loop invariant */ mov x1, 0 /* initialize length */ @@ -50,6 +50,3 @@ ENTRY (__strlen_aarch64_sve) b 0b END (__strlen_aarch64_sve) - -#endif - diff --git a/string/aarch64/strncmp-sve.S b/string/aarch64/experimental/strncmp-sve.S similarity index 95% rename from string/aarch64/strncmp-sve.S rename to string/aarch64/experimental/strncmp-sve.S index 6a9e9f7b6437fd..a281e642d8aaba 100644 --- a/string/aarch64/strncmp-sve.S +++ b/string/aarch64/experimental/strncmp-sve.S @@ -7,7 +7,8 @@ #include "asmdefs.h" -#if __ARM_FEATURE_SVE +.arch armv8-a+sve + /* Assumptions: * * ARMv8-a, AArch64 @@ -15,9 +16,6 @@ */ ENTRY (__strncmp_aarch64_sve) - PTR_ARG (0) - PTR_ARG (1) - SIZE_ARG (2) setffr /* initialize FFR */ mov x3, 0 /* initialize off */ @@ -64,6 +62,3 @@ ENTRY (__strncmp_aarch64_sve) ret END (__strncmp_aarch64_sve) - -#endif - diff --git a/string/aarch64/strnlen-sve.S b/string/aarch64/experimental/strnlen-sve.S 
similarity index 96% rename from string/aarch64/strnlen-sve.S rename to string/aarch64/experimental/strnlen-sve.S index 6c43dc427da7a9..11d835a1b13ce9 100644 --- a/string/aarch64/strnlen-sve.S +++ b/string/aarch64/experimental/strnlen-sve.S @@ -7,7 +7,8 @@ #include "asmdefs.h" -#if __ARM_FEATURE_SVE +.arch armv8-a+sve + /* Assumptions: * * ARMv8-a, AArch64 @@ -15,8 +16,6 @@ */ ENTRY (__strnlen_aarch64_sve) - PTR_ARG (0) - SIZE_ARG (1) setffr /* initialize FFR */ mov x2, 0 /* initialize len */ b 1f @@ -69,6 +68,3 @@ ENTRY (__strnlen_aarch64_sve) ret END (__strnlen_aarch64_sve) - -#endif - diff --git a/string/aarch64/strrchr-sve.S b/string/aarch64/experimental/strrchr-sve.S similarity index 98% rename from string/aarch64/strrchr-sve.S rename to string/aarch64/experimental/strrchr-sve.S index 825a7384cfc118..731edaddf15650 100644 --- a/string/aarch64/strrchr-sve.S +++ b/string/aarch64/experimental/strrchr-sve.S @@ -7,7 +7,8 @@ #include "asmdefs.h" -#if __ARM_FEATURE_SVE +.arch armv8-a+sve + /* Assumptions: * * ARMv8-a, AArch64 @@ -15,7 +16,6 @@ */ ENTRY (__strrchr_aarch64_sve) - PTR_ARG (0) dup z1.b, w1 /* replicate byte across vector */ setffr /* initialize FFR */ ptrue p1.b /* all ones; loop invariant */ @@ -79,6 +79,3 @@ ENTRY (__strrchr_aarch64_sve) ret END (__strrchr_aarch64_sve) - -#endif - diff --git a/string/aarch64/memchr-mte.S b/string/aarch64/memchr-mte.S index 948c3cbc7dd43a..68bd0af9a8c5fa 100644 --- a/string/aarch64/memchr-mte.S +++ b/string/aarch64/memchr-mte.S @@ -40,8 +40,6 @@ exactly which byte matched. */ ENTRY (__memchr_aarch64_mte) - PTR_ARG (0) - SIZE_ARG (2) bic src, srcin, 15 cbz cntin, L(nomatch) ld1 {vdata.16b}, [src] diff --git a/string/aarch64/memchr.S b/string/aarch64/memchr.S index fe6cfe2bc0e28d..d12a38abbc3009 100644 --- a/string/aarch64/memchr.S +++ b/string/aarch64/memchr.S @@ -47,8 +47,6 @@ */ ENTRY (__memchr_aarch64) - PTR_ARG (0) - SIZE_ARG (2) /* Do not dereference srcin if no bytes to compare. 
*/ cbz cntin, L(zero_length) /* diff --git a/string/aarch64/memcmp.S b/string/aarch64/memcmp.S index 35135e72cc8e53..43439de4db69d5 100644 --- a/string/aarch64/memcmp.S +++ b/string/aarch64/memcmp.S @@ -30,10 +30,6 @@ ENTRY (__memcmp_aarch64) - PTR_ARG (0) - PTR_ARG (1) - SIZE_ARG (2) - cmp limit, 16 b.lo L(less16) ldp data1, data3, [src1] diff --git a/string/aarch64/memcpy-advsimd.S b/string/aarch64/memcpy-advsimd.S index 9d3027d4d3cdee..cbf4c581500e40 100644 --- a/string/aarch64/memcpy-advsimd.S +++ b/string/aarch64/memcpy-advsimd.S @@ -52,9 +52,6 @@ ENTRY_ALIAS (__memmove_aarch64_simd) ENTRY (__memcpy_aarch64_simd) - PTR_ARG (0) - PTR_ARG (1) - SIZE_ARG (2) add srcend, src, count cmp count, 128 b.hi L(copy_long) diff --git a/string/aarch64/memcpy-mops.S b/string/aarch64/memcpy-mops.S index b45c31418717cd..03ae95570c0404 100644 --- a/string/aarch64/memcpy-mops.S +++ b/string/aarch64/memcpy-mops.S @@ -8,10 +8,6 @@ #include "asmdefs.h" ENTRY (__memcpy_aarch64_mops) - PTR_ARG (0) - PTR_ARG (1) - SIZE_ARG (2) - mov x3, x0 .inst 0x19010443 /* cpyfp [x3]!, [x1]!, x2! */ .inst 0x19410443 /* cpyfm [x3]!, [x1]!, x2! 
*/ diff --git a/string/aarch64/memcpy-sve.S b/string/aarch64/memcpy-sve.S index e8a946d7db37f4..9b05cb2a58eebe 100644 --- a/string/aarch64/memcpy-sve.S +++ b/string/aarch64/memcpy-sve.S @@ -13,8 +13,6 @@ #include "asmdefs.h" -#ifdef HAVE_SVE - .arch armv8-a+sve #define dstin x0 @@ -51,10 +49,6 @@ ENTRY_ALIAS (__memmove_aarch64_sve) ENTRY (__memcpy_aarch64_sve) - PTR_ARG (0) - PTR_ARG (1) - SIZE_ARG (2) - cmp count, 128 b.hi L(copy_long) cntb vlen @@ -173,5 +167,3 @@ L(return): ret END (__memcpy_aarch64_sve) - -#endif diff --git a/string/aarch64/memcpy.S b/string/aarch64/memcpy.S index 7c0606e2104a04..351f1a11f09728 100644 --- a/string/aarch64/memcpy.S +++ b/string/aarch64/memcpy.S @@ -55,9 +55,6 @@ ENTRY_ALIAS (__memmove_aarch64) ENTRY (__memcpy_aarch64) - PTR_ARG (0) - PTR_ARG (1) - SIZE_ARG (2) add srcend, src, count add dstend, dstin, count cmp count, 128 diff --git a/string/aarch64/memmove-mops.S b/string/aarch64/memmove-mops.S index 6c73017bb16f00..d9839f86e9b4f6 100644 --- a/string/aarch64/memmove-mops.S +++ b/string/aarch64/memmove-mops.S @@ -8,10 +8,6 @@ #include "asmdefs.h" ENTRY (__memmove_aarch64_mops) - PTR_ARG (0) - PTR_ARG (1) - SIZE_ARG (2) - mov x3, x0 .inst 0x1d010443 /* cpyp [x3]!, [x1]!, x2! */ .inst 0x1d410443 /* cpym [x3]!, [x1]!, x2! */ diff --git a/string/aarch64/memrchr.S b/string/aarch64/memrchr.S index 6418bdf56f4148..ed38478a6faad8 100644 --- a/string/aarch64/memrchr.S +++ b/string/aarch64/memrchr.S @@ -42,7 +42,6 @@ exactly which byte matched. 
*/ ENTRY (__memrchr_aarch64) - PTR_ARG (0) add end, srcin, cntin sub endm1, end, 1 bic src, endm1, 15 diff --git a/string/aarch64/memset-mops.S b/string/aarch64/memset-mops.S index ec791493bae9c0..00d8e7d2c05f00 100644 --- a/string/aarch64/memset-mops.S +++ b/string/aarch64/memset-mops.S @@ -8,9 +8,6 @@ #include "asmdefs.h" ENTRY (__memset_aarch64_mops) - PTR_ARG (0) - SIZE_ARG (2) - mov x3, x0 .inst 0x19c10443 /* setp [x3]!, x2!, x1 */ .inst 0x19c14443 /* setm [x3]!, x2!, x1 */ diff --git a/string/aarch64/memset-sve.S b/string/aarch64/memset-sve.S new file mode 100644 index 00000000000000..efaeaece284e85 --- /dev/null +++ b/string/aarch64/memset-sve.S @@ -0,0 +1,114 @@ +/* + * memset - fill memory with a constant byte + * + * Copyright (c) 2024-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +/* Assumptions: + * + * ARMv8-a, AArch64, Advanced SIMD, SVE, unaligned accesses. + * + */ + +#include "asmdefs.h" + +.arch armv8-a+sve + +#define dstin x0 +#define val x1 +#define valw w1 +#define count x2 +#define dst x3 +#define dstend x4 +#define zva_val x5 +#define vlen x5 +#define off x3 +#define dstend2 x5 + +ENTRY (__memset_aarch64_sve) + dup v0.16B, valw + cmp count, 16 + b.lo L(set_16) + + add dstend, dstin, count + cmp count, 64 + b.hs L(set_128) + + /* Set 16..63 bytes. 
*/ + mov off, 16 + and off, off, count, lsr 1 + sub dstend2, dstend, off + str q0, [dstin] + str q0, [dstin, off] + str q0, [dstend2, -16] + str q0, [dstend, -16] + ret + + .p2align 4 +L(set_16): + whilelo p0.b, xzr, count + st1b z0.b, p0, [dstin] + ret + + .p2align 4 +L(set_128): + bic dst, dstin, 15 + cmp count, 128 + b.hi L(set_long) + stp q0, q0, [dstin] + stp q0, q0, [dstin, 32] + stp q0, q0, [dstend, -64] + stp q0, q0, [dstend, -32] + ret + + .p2align 4 +L(set_long): + cmp count, 256 + b.lo L(no_zva) + tst valw, 255 + b.ne L(no_zva) + +#ifndef SKIP_ZVA_CHECK + mrs zva_val, dczid_el0 + and zva_val, zva_val, 31 + cmp zva_val, 4 /* ZVA size is 64 bytes. */ + b.ne L(no_zva) +#endif + str q0, [dstin] + str q0, [dst, 16] + bic dst, dstin, 31 + stp q0, q0, [dst, 32] + bic dst, dstin, 63 + sub count, dstend, dst /* Count is now 64 too large. */ + sub count, count, 128 /* Adjust count and bias for loop. */ + + sub x8, dstend, 1 /* Write last bytes before ZVA loop. */ + bic x8, x8, 15 + stp q0, q0, [x8, -48] + str q0, [x8, -16] + str q0, [dstend, -16] + + .p2align 4 +L(zva64_loop): + add dst, dst, 64 + dc zva, dst + subs count, count, 64 + b.hi L(zva64_loop) + ret + +L(no_zva): + str q0, [dstin] + sub count, dstend, dst /* Count is 16 too large. */ + sub count, count, 64 + 16 /* Adjust count and bias for loop. */ +L(no_zva_loop): + stp q0, q0, [dst, 16] + stp q0, q0, [dst, 48] + add dst, dst, 64 + subs count, count, 64 + b.hi L(no_zva_loop) + stp q0, q0, [dstend, -64] + stp q0, q0, [dstend, -32] + ret + +END (__memset_aarch64_sve) diff --git a/string/aarch64/memset.S b/string/aarch64/memset.S index 553b0fcaefea5e..906a4dcf46c643 100644 --- a/string/aarch64/memset.S +++ b/string/aarch64/memset.S @@ -1,7 +1,7 @@ /* * memset - fill memory with a constant byte * - * Copyright (c) 2012-2022, Arm Limited. + * Copyright (c) 2012-2024, Arm Limited. 
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ @@ -20,93 +20,98 @@ #define dst x3 #define dstend x4 #define zva_val x5 +#define off x3 +#define dstend2 x5 ENTRY (__memset_aarch64) - PTR_ARG (0) - SIZE_ARG (2) - dup v0.16B, valw - add dstend, dstin, count - - cmp count, 96 - b.hi L(set_long) cmp count, 16 - b.hs L(set_medium) - mov val, v0.D[0] + b.lo L(set_small) - /* Set 0..15 bytes. */ - tbz count, 3, 1f - str val, [dstin] - str val, [dstend, -8] + add dstend, dstin, count + cmp count, 64 + b.hs L(set_128) + + /* Set 16..63 bytes. */ + mov off, 16 + and off, off, count, lsr 1 + sub dstend2, dstend, off + str q0, [dstin] + str q0, [dstin, off] + str q0, [dstend2, -16] + str q0, [dstend, -16] ret + .p2align 4 -1: tbz count, 2, 2f - str valw, [dstin] - str valw, [dstend, -4] + /* Set 0..15 bytes. */ +L(set_small): + add dstend, dstin, count + cmp count, 4 + b.lo 2f + lsr off, count, 3 + sub dstend2, dstend, off, lsl 2 + str s0, [dstin] + str s0, [dstin, off, lsl 2] + str s0, [dstend2, -4] + str s0, [dstend, -4] ret + + /* Set 0..3 bytes. */ 2: cbz count, 3f + lsr off, count, 1 strb valw, [dstin] - tbz count, 1, 3f - strh valw, [dstend, -2] + strb valw, [dstin, off] + strb valw, [dstend, -1] 3: ret - /* Set 17..96 bytes. */ -L(set_medium): - str q0, [dstin] - tbnz count, 6, L(set96) - str q0, [dstend, -16] - tbz count, 5, 1f - str q0, [dstin, 16] - str q0, [dstend, -32] -1: ret - .p2align 4 - /* Set 64..96 bytes. Write 64 bytes from the start and - 32 bytes from the end. 
*/ -L(set96): - str q0, [dstin, 16] +L(set_128): + bic dst, dstin, 15 + cmp count, 128 + b.hi L(set_long) + stp q0, q0, [dstin] stp q0, q0, [dstin, 32] + stp q0, q0, [dstend, -64] stp q0, q0, [dstend, -32] ret .p2align 4 L(set_long): - and valw, valw, 255 - bic dst, dstin, 15 str q0, [dstin] - cmp count, 160 - ccmp valw, 0, 0, hs + str q0, [dst, 16] + tst valw, 255 b.ne L(no_zva) - #ifndef SKIP_ZVA_CHECK mrs zva_val, dczid_el0 and zva_val, zva_val, 31 cmp zva_val, 4 /* ZVA size is 64 bytes. */ b.ne L(no_zva) #endif - str q0, [dst, 16] stp q0, q0, [dst, 32] - bic dst, dst, 63 + bic dst, dstin, 63 sub count, dstend, dst /* Count is now 64 too large. */ - sub count, count, 128 /* Adjust count and bias for loop. */ + sub count, count, 64 + 64 /* Adjust count and bias for loop. */ + + /* Write last bytes before ZVA loop. */ + stp q0, q0, [dstend, -64] + stp q0, q0, [dstend, -32] .p2align 4 -L(zva_loop): +L(zva64_loop): add dst, dst, 64 dc zva, dst subs count, count, 64 - b.hi L(zva_loop) - stp q0, q0, [dstend, -64] - stp q0, q0, [dstend, -32] + b.hi L(zva64_loop) ret + .p2align 3 L(no_zva): - sub count, dstend, dst /* Count is 16 too large. */ - sub dst, dst, 16 /* Dst is biased by -32. */ - sub count, count, 64 + 16 /* Adjust count and bias for loop. */ + sub count, dstend, dst /* Count is 32 too large. */ + sub count, count, 64 + 32 /* Adjust count and bias for loop. */ L(no_zva_loop): stp q0, q0, [dst, 32] - stp q0, q0, [dst, 64]! + stp q0, q0, [dst, 64] + add dst, dst, 64 subs count, count, 64 b.hi L(no_zva_loop) stp q0, q0, [dstend, -64] @@ -114,4 +119,3 @@ L(no_zva_loop): ret END (__memset_aarch64) - diff --git a/string/aarch64/strchr-mte.S b/string/aarch64/strchr-mte.S index 6ec08f7acc766b..42b747311bc6f5 100644 --- a/string/aarch64/strchr-mte.S +++ b/string/aarch64/strchr-mte.S @@ -39,7 +39,6 @@ If it is not a multiple of 4, there was no match. 
*/ ENTRY (__strchr_aarch64_mte) - PTR_ARG (0) bic src, srcin, 15 dup vrepchr.16b, chrin ld1 {vdata.16b}, [src] diff --git a/string/aarch64/strchr.S b/string/aarch64/strchr.S index 37193bd947a73d..c1d01e9635b6c1 100644 --- a/string/aarch64/strchr.S +++ b/string/aarch64/strchr.S @@ -51,7 +51,6 @@ /* Locals and temporaries. */ ENTRY (__strchr_aarch64) - PTR_ARG (0) /* Magic constant 0xc0300c03 to allow us to identify which lane matches the requested byte. Even bits are set if the character matches, odd bits if either the char is NUL or matches. */ diff --git a/string/aarch64/strchrnul-mte.S b/string/aarch64/strchrnul-mte.S index 543ee88bb28585..b3180cdf9e2cec 100644 --- a/string/aarch64/strchrnul-mte.S +++ b/string/aarch64/strchrnul-mte.S @@ -38,7 +38,6 @@ exactly which byte matched. */ ENTRY (__strchrnul_aarch64_mte) - PTR_ARG (0) bic src, srcin, 15 dup vrepchr.16b, chrin ld1 {vdata.16b}, [src] diff --git a/string/aarch64/strchrnul.S b/string/aarch64/strchrnul.S index 666e8d0304c16d..0a32c46c30c558 100644 --- a/string/aarch64/strchrnul.S +++ b/string/aarch64/strchrnul.S @@ -47,7 +47,6 @@ /* Locals and temporaries. */ ENTRY (__strchrnul_aarch64) - PTR_ARG (0) /* Magic constant 0x40100401 to allow us to identify which lane matches the termination condition. */ mov wtmp2, #0x0401 diff --git a/string/aarch64/strcmp.S b/string/aarch64/strcmp.S index 137a9aa06681a3..7c0d0485a89ba1 100644 --- a/string/aarch64/strcmp.S +++ b/string/aarch64/strcmp.S @@ -51,8 +51,6 @@ ENTRY (__strcmp_aarch64) - PTR_ARG (0) - PTR_ARG (1) sub off2, src2, src1 mov zeroones, REP8_01 and tmp, src1, 7 diff --git a/string/aarch64/strcpy.S b/string/aarch64/strcpy.S index 97ae37ea422973..5852616e602494 100644 --- a/string/aarch64/strcpy.S +++ b/string/aarch64/strcpy.S @@ -52,8 +52,6 @@ exactly which byte matched. 
*/ ENTRY (STRCPY) - PTR_ARG (0) - PTR_ARG (1) bic src, srcin, 15 ld1 {vdata.16b}, [src] cmeq vhas_nul.16b, vdata.16b, 0 diff --git a/string/aarch64/strlen-mte.S b/string/aarch64/strlen-mte.S index 77235797f7c54f..afa72eed9a43f7 100644 --- a/string/aarch64/strlen-mte.S +++ b/string/aarch64/strlen-mte.S @@ -33,7 +33,6 @@ identifies the first zero byte. */ ENTRY (__strlen_aarch64_mte) - PTR_ARG (0) bic src, srcin, 15 ld1 {vdata.16b}, [src] cmeq vhas_nul.16b, vdata.16b, 0 @@ -41,37 +40,50 @@ ENTRY (__strlen_aarch64_mte) shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ fmov synd, dend lsr synd, synd, shift - cbz synd, L(loop) + cbz synd, L(next16) rbit synd, synd clz result, synd lsr result, result, 2 ret +L(next16): + ldr data, [src, 16] + cmeq vhas_nul.16b, vdata.16b, 0 + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ + fmov synd, dend + cbz synd, L(loop) + add src, src, 16 +#ifndef __AARCH64EB__ + rbit synd, synd +#endif + sub result, src, srcin + clz tmp, synd + add result, result, tmp, lsr 2 + ret + .p2align 5 L(loop): - ldr data, [src, 16] + ldr data, [src, 32]! cmeq vhas_nul.16b, vdata.16b, 0 - umaxp vend.16b, vhas_nul.16b, vhas_nul.16b + addhn vend.8b, vhas_nul.8h, vhas_nul.8h fmov synd, dend cbnz synd, L(loop_end) - ldr data, [src, 32]! + ldr data, [src, 16] cmeq vhas_nul.16b, vdata.16b, 0 - umaxp vend.16b, vhas_nul.16b, vhas_nul.16b + addhn vend.8b, vhas_nul.8h, vhas_nul.8h fmov synd, dend cbz synd, L(loop) - sub src, src, 16 + add src, src, 16 L(loop_end): - shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ - sub result, src, srcin - fmov synd, dend + sub result, shift, src, lsl 2 /* (srcin - src) << 2. 
*/ #ifndef __AARCH64EB__ rbit synd, synd + sub result, result, 3 #endif - add result, result, 16 clz tmp, synd - add result, result, tmp, lsr 2 + sub result, tmp, result + lsr result, result, 2 ret END (__strlen_aarch64_mte) - diff --git a/string/aarch64/strlen.S b/string/aarch64/strlen.S index 6f6f08f636b248..0ebb26be844c1a 100644 --- a/string/aarch64/strlen.S +++ b/string/aarch64/strlen.S @@ -75,7 +75,6 @@ character, return the length, if not, continue in the main loop. */ ENTRY (__strlen_aarch64) - PTR_ARG (0) and tmp1, srcin, MIN_PAGE_SIZE - 1 cmp tmp1, MIN_PAGE_SIZE - 32 b.hi L(page_cross) diff --git a/string/aarch64/strncmp.S b/string/aarch64/strncmp.S index 128a10c52bb175..493a0f06ed1d00 100644 --- a/string/aarch64/strncmp.S +++ b/string/aarch64/strncmp.S @@ -55,9 +55,6 @@ #endif ENTRY (__strncmp_aarch64) - PTR_ARG (0) - PTR_ARG (1) - SIZE_ARG (2) cbz limit, L(ret0) eor tmp1, src1, src2 mov zeroones, #REP8_01 diff --git a/string/aarch64/strnlen.S b/string/aarch64/strnlen.S index f2090a7485a564..6a96ec268f1a6d 100644 --- a/string/aarch64/strnlen.S +++ b/string/aarch64/strnlen.S @@ -36,8 +36,6 @@ identifies the first zero byte. */ ENTRY (__strnlen_aarch64) - PTR_ARG (0) - SIZE_ARG (1) bic src, srcin, 15 cbz cntin, L(nomatch) ld1 {vdata.16b}, [src] diff --git a/string/aarch64/strrchr-mte.S b/string/aarch64/strrchr-mte.S index bb61ab9ad4e7c5..8668ce6d291620 100644 --- a/string/aarch64/strrchr-mte.S +++ b/string/aarch64/strrchr-mte.S @@ -42,7 +42,6 @@ if the relevant byte matched the NUL end of string. */ ENTRY (__strrchr_aarch64_mte) - PTR_ARG (0) bic src, srcin, 15 dup vrepchr.16b, chrin movi vrepmask.16b, 0x33 diff --git a/string/aarch64/strrchr.S b/string/aarch64/strrchr.S index bf9cb297b6cb3f..f5713f4260fbca 100644 --- a/string/aarch64/strrchr.S +++ b/string/aarch64/strrchr.S @@ -55,7 +55,6 @@ identify exactly which byte is causing the termination, and why. 
*/ ENTRY (__strrchr_aarch64) - PTR_ARG (0) /* Magic constant 0x40100401 to allow us to identify which lane matches the requested byte. Magic constant 0x80200802 used similarly for NUL termination. */ diff --git a/string/bench/memcpy.c b/string/bench/memcpy.c index b628f9b60d96e0..583fa505db754c 100644 --- a/string/bench/memcpy.c +++ b/string/bench/memcpy.c @@ -20,35 +20,18 @@ #define MIN_SIZE 32768 #define MAX_SIZE (1024 * 1024) -static uint8_t a[MAX_SIZE + 4096 + 64] __attribute__((__aligned__(64))); -static uint8_t b[MAX_SIZE + 4096 + 64] __attribute__((__aligned__(64))); - -#define F(x) {#x, x}, - -static const struct fun -{ - const char *name; - void *(*fun)(void *, const void *, size_t); -} funtab[] = -{ -#if __aarch64__ - F(__memcpy_aarch64) -# if __ARM_NEON - F(__memcpy_aarch64_simd) -# endif -# if __ARM_FEATURE_SVE - F(__memcpy_aarch64_sve) -# endif -# if WANT_MOPS - F(__memcpy_aarch64_mops) -# endif -#elif __arm__ - F(__memcpy_arm) -#endif - F(memcpy) -#undef F - {0, 0} -}; +static uint8_t a[MAX_SIZE + 4096 + 64] __attribute__((__aligned__(4096))); +static uint8_t b[MAX_SIZE + 4096 + 64] __attribute__((__aligned__(4096))); + +#define DOTEST(STR,TESTFN) \ + printf (STR); \ + RUN (TESTFN, memcpy); \ + RUNA64 (TESTFN, __memcpy_aarch64); \ + RUNA64 (TESTFN, __memcpy_aarch64_simd); \ + RUNSVE (TESTFN, __memcpy_aarch64_sve); \ + RUNMOPS (TESTFN, __memcpy_aarch64_mops); \ + RUNA32 (TESTFN, __memcpy_arm); \ + printf ("\n"); typedef struct { uint16_t size; uint16_t freq; } freq_data_t; typedef struct { uint8_t align; uint16_t freq; } align_data_t; @@ -160,183 +143,125 @@ init_copies (size_t max_size) return total; } -int main (void) +static void inline __attribute ((always_inline)) +memcpy_random (const char *name, void *(*fn)(void *, const void *, size_t)) { - init_copy_distribution (); - - memset (a, 1, sizeof (a)); - memset (b, 2, sizeof (b)); - - printf("Random memcpy (bytes/ns):\n"); - for (int f = 0; funtab[f].name != 0; f++) - { - size_t total = 0; - 
uint64_t tsum = 0; - printf ("%22s ", funtab[f].name); - rand32 (0x12345678); - - for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2) - { - size_t copy_size = init_copies (size) * ITERS; - - for (int c = 0; c < NUM_TESTS; c++) - funtab[f].fun (b + test_arr[c].dst, a + test_arr[c].src, - test_arr[c].len); - - uint64_t t = clock_get_ns (); - for (int i = 0; i < ITERS; i++) - for (int c = 0; c < NUM_TESTS; c++) - funtab[f].fun (b + test_arr[c].dst, a + test_arr[c].src, - test_arr[c].len); - t = clock_get_ns () - t; - total += copy_size; - tsum += t; - printf ("%dK: %.2f ", size / 1024, (double)copy_size / t); - } - printf( "avg %.2f\n", (double)total / tsum); - } - - size_t total = 0; - uint64_t tsum = 0; - printf ("%22s ", "memcpy_call"); - rand32 (0x12345678); - + printf ("%22s ", name); + uint64_t total = 0, tsum = 0; for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2) { - size_t copy_size = init_copies (size) * ITERS; + uint64_t copy_size = init_copies (size) * ITERS; for (int c = 0; c < NUM_TESTS; c++) - memcpy (b + test_arr[c].dst, a + test_arr[c].src, test_arr[c].len); + fn (b + test_arr[c].dst, a + test_arr[c].src, test_arr[c].len); uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS; i++) for (int c = 0; c < NUM_TESTS; c++) - memcpy (b + test_arr[c].dst, a + test_arr[c].src, test_arr[c].len); + fn (b + test_arr[c].dst, a + test_arr[c].src, test_arr[c].len); t = clock_get_ns () - t; total += copy_size; tsum += t; - printf ("%dK: %.2f ", size / 1024, (double)copy_size / t); + printf ("%dK: %5.2f ", size / 1024, (double)copy_size / t); } - printf( "avg %.2f\n", (double)total / tsum); - + printf( "avg %5.2f\n", (double)total / tsum); +} - printf ("\nAligned medium memcpy (bytes/ns):\n"); - for (int f = 0; funtab[f].name != 0; f++) - { - printf ("%22s ", funtab[f].name); - - for (int size = 8; size <= 512; size *= 2) - { - uint64_t t = clock_get_ns (); - for (int i = 0; i < ITERS2; i++) - funtab[f].fun (b, a, size); - t = clock_get_ns () - t; - printf 
("%dB: %.2f ", size, (double)size * ITERS2 / t); - } - printf ("\n"); - } +static void inline __attribute ((always_inline)) +memcpy_medium_aligned (const char *name, void *(*fn)(void *, const void *, size_t)) +{ + printf ("%22s ", name); - printf ("%22s ", "memcpy_call"); for (int size = 8; size <= 512; size *= 2) { uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS2; i++) - memcpy (b, a, size); + fn (b, a, size); t = clock_get_ns () - t; - printf ("%dB: %.2f ", size, (double)size * ITERS2 / t); + printf ("%dB: %5.2f ", size, (double)size * ITERS2 / t); } printf ("\n"); +} +static void inline __attribute ((always_inline)) +memcpy_medium_unaligned (const char *name, void *(*fn)(void *, const void *, size_t)) +{ + printf ("%22s ", name); - printf ("\nUnaligned medium memcpy (bytes/ns):\n"); - for (int f = 0; funtab[f].name != 0; f++) - { - printf ("%22s ", funtab[f].name); - - for (int size = 8; size <= 512; size *= 2) - { - uint64_t t = clock_get_ns (); - for (int i = 0; i < ITERS2; i++) - funtab[f].fun (b + 3, a + 1, size); - t = clock_get_ns () - t; - printf ("%dB: %.2f ", size, (double)size * ITERS2 / t); - } - printf ("\n"); - } - - printf ("%22s ", "memcpy_call"); for (int size = 8; size <= 512; size *= 2) { uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS2; i++) - memcpy (b + 3, a + 1, size); + fn (b + 3, a + 1, size); t = clock_get_ns () - t; - printf ("%dB: %.2f ", size, (double)size * ITERS2 / t); + printf ("%dB: %5.2f ", size, (double)size * ITERS2 / t); } printf ("\n"); +} +static void inline __attribute ((always_inline)) +memcpy_large (const char *name, void *(*fn)(void *, const void *, size_t)) +{ + printf ("%22s ", name); - printf ("\nLarge memcpy (bytes/ns):\n"); - for (int f = 0; funtab[f].name != 0; f++) - { - printf ("%22s ", funtab[f].name); - - for (int size = 1024; size <= 65536; size *= 2) - { - uint64_t t = clock_get_ns (); - for (int i = 0; i < ITERS3; i++) - funtab[f].fun (b, a, size); - t = clock_get_ns () - t; - printf ("%dK: 
%.2f ", size / 1024, (double)size * ITERS3 / t); - } - printf ("\n"); - } - - printf ("%22s ", "memcpy_call"); for (int size = 1024; size <= 65536; size *= 2) { uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS3; i++) - memcpy (b, a, size); + fn (b, a, size); t = clock_get_ns () - t; - printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t); + printf ("%dK: %5.2f ", size / 1024, (double)size * ITERS3 / t); } printf ("\n"); +} +static void inline __attribute ((always_inline)) +memmove_forward_unaligned (const char *name, void *(*fn)(void *, const void *, size_t)) +{ + printf ("%22s ", name); - printf ("\nUnaligned forwards memmove (bytes/ns):\n"); - for (int f = 0; funtab[f].name != 0; f++) + for (int size = 1024; size <= 65536; size *= 2) { - printf ("%22s ", funtab[f].name); - - for (int size = 1024; size <= 65536; size *= 2) - { - uint64_t t = clock_get_ns (); - for (int i = 0; i < ITERS3; i++) - funtab[f].fun (a, a + 256 + (i & 31), size); - t = clock_get_ns () - t; - printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t); - } - printf ("\n"); + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS3; i++) + fn (a, a + 256 + (i & 31), size); + t = clock_get_ns () - t; + printf ("%dK: %5.2f ", size / 1024, (double)size * ITERS3 / t); } + printf ("\n"); +} + +static void inline __attribute ((always_inline)) +memmove_backward_unaligned (const char *name, void *(*fn)(void *, const void *, size_t)) +{ + printf ("%22s ", name); - printf ("\nUnaligned backwards memmove (bytes/ns):\n"); - for (int f = 0; funtab[f].name != 0; f++) + for (int size = 1024; size <= 65536; size *= 2) { - printf ("%22s ", funtab[f].name); - - for (int size = 1024; size <= 65536; size *= 2) - { - uint64_t t = clock_get_ns (); - for (int i = 0; i < ITERS3; i++) - funtab[f].fun (a + 256 + (i & 31), a, size); - t = clock_get_ns () - t; - printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t); - } - printf ("\n"); + uint64_t t = clock_get_ns (); + for (int i = 0; i < 
ITERS3; i++) + fn (a + 256 + (i & 31), a, size); + t = clock_get_ns () - t; + printf ("%dK: %5.2f ", size / 1024, (double)size * ITERS3 / t); } + printf ("\n"); +} + +int main (void) +{ + init_copy_distribution (); + + memset (a, 1, sizeof (a)); + memset (b, 2, sizeof (b)); + + DOTEST ("Random memcpy (bytes/ns):\n", memcpy_random); + DOTEST ("Medium memcpy aligned (bytes/ns):\n", memcpy_medium_aligned); + DOTEST ("Medium memcpy unaligned (bytes/ns):\n", memcpy_medium_unaligned); + DOTEST ("Large memcpy (bytes/ns):\n", memcpy_large); + DOTEST ("Forwards memmove unaligned (bytes/ns):\n", memmove_forward_unaligned); + DOTEST ("Backwards memmove unaligned (bytes/ns):\n", memmove_backward_unaligned); return 0; } diff --git a/string/bench/memset.c b/string/bench/memset.c index 990e23ba9a368b..07474e46914625 100644 --- a/string/bench/memset.c +++ b/string/bench/memset.c @@ -20,25 +20,16 @@ #define MIN_SIZE 32768 #define MAX_SIZE (1024 * 1024) -static uint8_t a[MAX_SIZE + 4096] __attribute__((__aligned__(64))); +static uint8_t a[MAX_SIZE + 4096] __attribute__((__aligned__(4096))); -#define F(x) {#x, x}, - -static const struct fun -{ - const char *name; - void *(*fun)(void *, int, size_t); -} funtab[] = -{ -#if __aarch64__ - F(__memset_aarch64) -#elif __arm__ - F(__memset_arm) -#endif - F(memset) -#undef F - {0, 0} -}; +#define DOTEST(STR,TESTFN) \ + printf (STR); \ + RUN (TESTFN, memset); \ + RUNA64 (TESTFN, __memset_aarch64); \ + RUNSVE (TESTFN, __memset_aarch64_sve); \ + RUNMOPS (TESTFN, __memset_mops); \ + RUNA32 (TESTFN, __memset_arm); \ + printf ("\n"); typedef struct { uint32_t offset : 20, len : 12; } memset_test_t; static memset_test_t test_arr[NUM_TESTS]; @@ -127,117 +118,73 @@ init_memset (size_t max_size) return total; } - -int main (void) +static void inline __attribute ((always_inline)) +memset_random (const char *name, void *(*set)(void *, int, size_t)) { - init_memset_distribution (); - - memset (a, 1, sizeof (a)); - - printf("Random memset (bytes/ns):\n"); 
- for (int f = 0; funtab[f].name != 0; f++) - { - size_t total_size = 0; - uint64_t tsum = 0; - printf ("%22s ", funtab[f].name); - rand32 (0x12345678); - - for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2) - { - size_t memset_size = init_memset (size) * ITERS; - - for (int c = 0; c < NUM_TESTS; c++) - funtab[f].fun (a + test_arr[c].offset, 0, test_arr[c].len); - - uint64_t t = clock_get_ns (); - for (int i = 0; i < ITERS; i++) - for (int c = 0; c < NUM_TESTS; c++) - funtab[f].fun (a + test_arr[c].offset, 0, test_arr[c].len); - t = clock_get_ns () - t; - total_size += memset_size; - tsum += t; - printf ("%dK: %.2f ", size / 1024, (double)memset_size / t); - } - printf( "avg %.2f\n", (double)total_size / tsum); - } - - size_t total_size = 0; + uint64_t total_size = 0; uint64_t tsum = 0; - printf ("%22s ", "memset_call"); + printf ("%22s ", name); rand32 (0x12345678); for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2) { - size_t memset_size = init_memset (size) * ITERS; + uint64_t memset_size = init_memset (size) * ITERS; for (int c = 0; c < NUM_TESTS; c++) - memset (a + test_arr[c].offset, 0, test_arr[c].len); + set (a + test_arr[c].offset, 0, test_arr[c].len); uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS; i++) for (int c = 0; c < NUM_TESTS; c++) - memset (a + test_arr[c].offset, 0, test_arr[c].len); + set (a + test_arr[c].offset, 0, test_arr[c].len); t = clock_get_ns () - t; total_size += memset_size; tsum += t; - printf ("%dK: %.2f ", size / 1024, (double)memset_size / t); + printf ("%dK: %5.2f ", size / 1024, (double)memset_size / t); } - printf( "avg %.2f\n", (double)total_size / tsum); - + printf( "avg %5.2f\n", (double)total_size / tsum); +} - printf ("\nMedium memset (bytes/ns):\n"); - for (int f = 0; funtab[f].name != 0; f++) - { - printf ("%22s ", funtab[f].name); - - for (int size = 8; size <= 512; size *= 2) - { - uint64_t t = clock_get_ns (); - for (int i = 0; i < ITERS2; i++) - funtab[f].fun (a, 0, size); - t = clock_get_ns () - t; - 
printf ("%dB: %.2f ", size, (double)size * ITERS2 / t); - } - printf ("\n"); - } +static void inline __attribute ((always_inline)) +memset_medium (const char *name, void *(*set)(void *, int, size_t)) +{ + printf ("%22s ", name); - printf ("%22s ", "memset_call"); for (int size = 8; size <= 512; size *= 2) { uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS2; i++) - memset (a, 0, size); + set (a, 0, size); t = clock_get_ns () - t; - printf ("%dB: %.2f ", size, (double)size * ITERS2 / t); + printf ("%dB: %5.2f ", size, (double)size * ITERS2 / t); } + printf ("\n"); +} +static void inline __attribute ((always_inline)) +memset_large (const char *name, void *(*set)(void *, int, size_t)) +{ + printf ("%22s ", name); - printf ("\nLarge memset (bytes/ns):\n"); - for (int f = 0; funtab[f].name != 0; f++) - { - printf ("%22s ", funtab[f].name); - - for (int size = 1024; size <= 65536; size *= 2) - { - uint64_t t = clock_get_ns (); - for (int i = 0; i < ITERS3; i++) - funtab[f].fun (a, 0, size); - t = clock_get_ns () - t; - printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t); - } - printf ("\n"); - } - - printf ("%22s ", "memset_call"); for (int size = 1024; size <= 65536; size *= 2) { uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS3; i++) - memset (a, 0, size); + set (a, 0, size); t = clock_get_ns () - t; - printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t); + printf ("%dKB: %6.2f ", size / 1024, (double)size * ITERS3 / t); } - printf ("\n\n"); + printf ("\n"); +} + +int main (void) +{ + init_memset_distribution (); + + memset (a, 1, sizeof (a)); + DOTEST ("Random memset (bytes/ns):\n", memset_random); + DOTEST ("Medium memset (bytes/ns):\n", memset_medium); + DOTEST ("Large memset (bytes/ns):\n", memset_large); return 0; } diff --git a/string/bench/strlen.c b/string/bench/strlen.c index f05d0d5b89e6f1..a8dd55cf5fc4f2 100644 --- a/string/bench/strlen.c +++ b/string/bench/strlen.c @@ -14,40 +14,23 @@ #include "benchlib.h" #define ITERS 5000 
-#define ITERS2 20000000 -#define ITERS3 2000000 -#define NUM_TESTS 16384 +#define ITERS2 40000000 +#define ITERS3 4000000 +#define NUM_TESTS 65536 #define MAX_ALIGN 32 -#define MAX_STRLEN 256 +#define MAX_STRLEN 128 static char a[(MAX_STRLEN + 1) * MAX_ALIGN] __attribute__((__aligned__(4096))); -#define F(x, mte) {#x, x, mte}, - -static const struct fun -{ - const char *name; - size_t (*fun) (const char *s); - int test_mte; -} funtab[] = { - // clang-format off - F(strlen, 0) -#if __aarch64__ - F(__strlen_aarch64, 0) - F(__strlen_aarch64_mte, 1) -# if __ARM_FEATURE_SVE - F(__strlen_aarch64_sve, 1) -# endif -#elif __arm__ -# if __ARM_ARCH >= 6 && __ARM_ARCH_ISA_THUMB == 2 - F(__strlen_armv6t2, 0) -# endif -#endif - {0, 0, 0} - // clang-format on -}; -#undef F +#define DOTEST(STR,TESTFN) \ + printf (STR); \ + RUN (TESTFN, strlen); \ + RUNA64 (TESTFN, __strlen_aarch64); \ + RUNA64 (TESTFN, __strlen_aarch64_mte); \ + RUNSVE (TESTFN, __strlen_aarch64_sve); \ + RUNT32 (TESTFN, __strlen_armv6t2); \ + printf ("\n"); static uint16_t strlen_tests[NUM_TESTS]; @@ -124,98 +107,119 @@ init_strlen_tests (void) strlen_tests[n] = index[(align + exp_len) & (MAX_ALIGN - 1)] + MAX_STRLEN - exp_len; + assert ((strlen_tests[n] & (align - 1)) == 0); + assert (strlen (a + strlen_tests[n]) == exp_len); } } static volatile size_t maskv = 0; -int main (void) +static void inline __attribute ((always_inline)) +strlen_random (const char *name, size_t (*fn)(const char *)) { - rand32 (0x12345678); - init_strlen_distribution (); - init_strlen_tests (); + size_t res = 0, mask = maskv; + uint64_t strlen_size = 0; + printf ("%22s ", name); + + for (int c = 0; c < NUM_TESTS; c++) + strlen_size += fn (a + strlen_tests[c]) + 1; + strlen_size *= ITERS; + + /* Measure throughput of strlen. 
*/ + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS; i++) + for (int c = 0; c < NUM_TESTS; c++) + res += fn (a + strlen_tests[c]); + t = clock_get_ns () - t; + printf ("tp: %.3f ", (double)strlen_size / t); + + /* Measure latency of strlen result with (res & mask). */ + t = clock_get_ns (); + for (int i = 0; i < ITERS; i++) + for (int c = 0; c < NUM_TESTS; c++) + res += fn (a + strlen_tests[c] + (res & mask)); + t = clock_get_ns () - t; + printf ("lat: %.3f\n", (double)strlen_size / t); + maskv = res & mask; +} - printf ("\nRandom strlen (bytes/ns):\n"); - for (int f = 0; funtab[f].name != 0; f++) - { - size_t res = 0, strlen_size = 0, mask = maskv; - printf ("%22s ", funtab[f].name); +static void inline __attribute ((always_inline)) +strlen_small_aligned (const char *name, size_t (*fn)(const char *)) +{ + printf ("%22s ", name); - for (int c = 0; c < NUM_TESTS; c++) - strlen_size += funtab[f].fun (a + strlen_tests[c]); - strlen_size *= ITERS; + size_t res = 0, mask = maskv; + for (int size = 1; size <= 64; size *= 2) + { + memset (a, 'x', size); + a[size - 1] = 0; - /* Measure latency of strlen result with (res & mask). */ uint64_t t = clock_get_ns (); - for (int i = 0; i < ITERS; i++) - for (int c = 0; c < NUM_TESTS; c++) - res = funtab[f].fun (a + strlen_tests[c] + (res & mask)); + for (int i = 0; i < ITERS2; i++) + res += fn (a + (i & mask)); t = clock_get_ns () - t; - printf ("%.2f\n", (double)strlen_size / t); + printf ("%d%c: %5.2f ", size < 1024 ? size : size / 1024, + size < 1024 ? 'B' : 'K', (double)size * ITERS2 / t); } + maskv &= res; + printf ("\n"); +} - printf ("\nSmall aligned strlen (bytes/ns):\n"); - for (int f = 0; funtab[f].name != 0; f++) - { - printf ("%22s ", funtab[f].name); - - for (int size = 1; size <= 64; size *= 2) - { - memset (a, 'x', size); - a[size - 1] = 0; - - uint64_t t = clock_get_ns (); - for (int i = 0; i < ITERS2; i++) - funtab[f].fun (a); - t = clock_get_ns () - t; - printf ("%d%c: %.2f ", size < 1024 ? 
size : size / 1024, - size < 1024 ? 'B' : 'K', (double)size * ITERS2 / t); - } - printf ("\n"); - } +static void inline __attribute ((always_inline)) +strlen_small_unaligned (const char *name, size_t (*fn)(const char *)) +{ + printf ("%22s ", name); - printf ("\nSmall unaligned strlen (bytes/ns):\n"); - for (int f = 0; funtab[f].name != 0; f++) + size_t res = 0, mask = maskv; + int align = 9; + for (int size = 1; size <= 64; size *= 2) { - printf ("%22s ", funtab[f].name); - - int align = 9; - for (int size = 1; size <= 64; size *= 2) - { - memset (a + align, 'x', size); - a[align + size - 1] = 0; - - uint64_t t = clock_get_ns (); - for (int i = 0; i < ITERS2; i++) - funtab[f].fun (a + align); - t = clock_get_ns () - t; - printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024, - size < 1024 ? 'B' : 'K', (double)size * ITERS2 / t); - } - printf ("\n"); + memset (a + align, 'x', size); + a[align + size - 1] = 0; + + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS2; i++) + res += fn (a + align + (i & mask)); + t = clock_get_ns () - t; + printf ("%d%c: %5.2f ", size < 1024 ? size : size / 1024, + size < 1024 ? 'B' : 'K', (double)size * ITERS2 / t); } + maskv &= res; + printf ("\n"); +} - printf ("\nMedium strlen (bytes/ns):\n"); - for (int f = 0; funtab[f].name != 0; f++) +static void inline __attribute ((always_inline)) +strlen_medium (const char *name, size_t (*fn)(const char *)) +{ + printf ("%22s ", name); + + size_t res = 0, mask = maskv; + for (int size = 128; size <= 4096; size *= 2) { - printf ("%22s ", funtab[f].name); - - for (int size = 128; size <= 4096; size *= 2) - { - memset (a, 'x', size); - a[size - 1] = 0; - - uint64_t t = clock_get_ns (); - for (int i = 0; i < ITERS3; i++) - funtab[f].fun (a); - t = clock_get_ns () - t; - printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024, - size < 1024 ? 
'B' : 'K', (double)size * ITERS3 / t); - } - printf ("\n"); - } + memset (a, 'x', size); + a[size - 1] = 0; + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS3; i++) + res += fn (a + (i & mask)); + t = clock_get_ns () - t; + printf ("%d%c: %5.2f ", size < 1024 ? size : size / 1024, + size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t); + } + maskv &= res; printf ("\n"); +} + +int main (void) +{ + rand32 (0x12345678); + init_strlen_distribution (); + init_strlen_tests (); + + DOTEST ("Random strlen (bytes/ns):\n", strlen_random); + DOTEST ("Small aligned strlen (bytes/ns):\n", strlen_small_aligned); + DOTEST ("Small unaligned strlen (bytes/ns):\n", strlen_small_unaligned); + DOTEST ("Medium strlen (bytes/ns):\n", strlen_medium); return 0; } diff --git a/string/include/benchlib.h b/string/include/benchlib.h index f1bbea388cd217..486504e99ddf0d 100644 --- a/string/include/benchlib.h +++ b/string/include/benchlib.h @@ -30,4 +30,35 @@ rand32 (uint32_t seed) return res; } +/* Macros to run a benchmark BENCH using string function FN. 
*/ +#define RUN(BENCH, FN) BENCH(#FN, FN) +#if __aarch64__ +# define RUNA64(BENCH, FN) BENCH(#FN, FN) +#else +# define RUNA64(BENCH, FN) +#endif + +#if __ARM_FEATURE_SVE +# define RUNSVE(BENCH, FN) BENCH(#FN, FN) +#else +# define RUNSVE(BENCH, FN) +#endif + +#if WANT_MOPS +# define RUNMOPS(BENCH, FN) BENCH(#FN, FN) +#else +# define RUNMOPS(BENCH, FN) +#endif + +#if __arm__ +# define RUNA32(BENCH, FN) BENCH(#FN, FN) +#else +# define RUNA32(BENCH, FN) +#endif + +#if __arm__ && __ARM_ARCH >= 6 && __ARM_ARCH_ISA_THUMB == 2 +# define RUNT32(BENCH, FN) BENCH(#FN, FN) +#else +# define RUNT32(BENCH, FN) +#endif diff --git a/string/include/stringlib.h b/string/include/stringlib.h index 01da7ebfc18db9..bb9db930f132ed 100644 --- a/string/include/stringlib.h +++ b/string/include/stringlib.h @@ -33,13 +33,12 @@ char *__strchr_aarch64_mte (const char *, int); char * __strchrnul_aarch64_mte (const char *, int ); size_t __strlen_aarch64_mte (const char *); char *__strrchr_aarch64_mte (const char *, int); -#if __ARM_NEON void *__memcpy_aarch64_simd (void *__restrict, const void *__restrict, size_t); void *__memmove_aarch64_simd (void *, const void *, size_t); -#endif # if __ARM_FEATURE_SVE void *__memcpy_aarch64_sve (void *__restrict, const void *__restrict, size_t); void *__memmove_aarch64_sve (void *__restrict, const void *__restrict, size_t); +void *__memset_aarch64_sve (void *, int, size_t); void *__memchr_aarch64_sve (const void *, int, size_t); int __memcmp_aarch64_sve (const void *, const void *, size_t); char *__strchr_aarch64_sve (const char *, int); diff --git a/string/test/memcpy.c b/string/test/memcpy.c index dc95844bd45a8c..98255e06f31c55 100644 --- a/string/test/memcpy.c +++ b/string/test/memcpy.c @@ -25,9 +25,7 @@ static const struct fun F(memcpy, 0) #if __aarch64__ F(__memcpy_aarch64, 1) -# if __ARM_NEON F(__memcpy_aarch64_simd, 1) -# endif # if __ARM_FEATURE_SVE F(__memcpy_aarch64_sve, 1) # endif diff --git a/string/test/memmove.c b/string/test/memmove.c index 
b85dd1e864effe..ff3f7652f76329 100644 --- a/string/test/memmove.c +++ b/string/test/memmove.c @@ -25,9 +25,7 @@ static const struct fun F(memmove, 0) #if __aarch64__ F(__memmove_aarch64, 1) -# if __ARM_NEON F(__memmove_aarch64_simd, 1) -# endif # if __ARM_FEATURE_SVE F(__memmove_aarch64_sve, 1) # endif diff --git a/string/test/memset.c b/string/test/memset.c index 7d09c267ffecfc..a9639f9b28b0a5 100644 --- a/string/test/memset.c +++ b/string/test/memset.c @@ -25,6 +25,9 @@ static const struct fun F(memset, 0) #if __aarch64__ F(__memset_aarch64, 1) +# if __ARM_FEATURE_SVE + F(__memset_aarch64_sve, 1) +# endif # if WANT_MOPS F(__memset_aarch64_mops, 1) # endif From bc77aa7df7339b166c0d6394526fe59dea89f4b1 Mon Sep 17 00:00:00 2001 From: Doug Rabson Date: Mon, 23 Dec 2024 10:19:27 +0000 Subject: [PATCH 100/143] release: install etc files from the source tree, not the host Reviewed by: cperciva MFC after: 3 days Differential Revision: https://reviews.freebsd.org/D48180 --- release/Makefile.oci | 2 ++ release/tools/oci-image-static.conf | 21 ++++++++++++--------- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/release/Makefile.oci b/release/Makefile.oci index 461c95f49636b6..da35156c5a9592 100644 --- a/release/Makefile.oci +++ b/release/Makefile.oci @@ -24,6 +24,8 @@ OCI_DEPS_minimal= container-image-dynamic.txz .for _IMG in ${OCI_IMAGES} OCI_TARGETS+= container-image-${_IMG}.txz container-image-${_IMG}.txz: ${OCI_DEPS_${_IMG}} + # Adjust PATH so that we run pwd_mkdb from the bootstrap tools + env PATH=${OBJTOP}/tmp/legacy/bin:${PATH:Q} \ sh ${.CURDIR}/scripts/make-oci-image.sh ${.CURDIR} ${REVISION} ${BRANCH} ${TARGET_ARCH} ${_IMG} skopeo copy \ containers-storage:localhost/freebsd${REVISION:R}-${_IMG}:latest \ diff --git a/release/tools/oci-image-static.conf b/release/tools/oci-image-static.conf index 27cfb3c6778cc4..552328e66f3cab 100644 --- a/release/tools/oci-image-static.conf +++ b/release/tools/oci-image-static.conf @@ -7,17 +7,20 @@ OCI_BASE_IMAGE= 
oci_image_build() { - mtree -deU -p $m/ -f /etc/mtree/BSD.root.dist > /dev/null - mtree -deU -p $m/var -f /etc/mtree/BSD.var.dist > /dev/null - mtree -deU -p $m/usr -f /etc/mtree/BSD.usr.dist > /dev/null - mtree -deU -p $m/usr/include -f /etc/mtree/BSD.include.dist > /dev/null - mtree -deU -p $m/usr/lib -f /etc/mtree/BSD.debug.dist > /dev/null + local srcdir=${curdir}/.. + mtree -deU -p $m/ -f ${srcdir}/etc/mtree/BSD.root.dist > /dev/null + mtree -deU -p $m/var -f ${srcdir}/etc/mtree/BSD.var.dist > /dev/null + mtree -deU -p $m/usr -f ${srcdir}/etc/mtree/BSD.usr.dist > /dev/null + mtree -deU -p $m/usr/include -f ${srcdir}/etc/mtree/BSD.include.dist > /dev/null + mtree -deU -p $m/usr/lib -f ${srcdir}/etc/mtree/BSD.debug.dist > /dev/null install_packages ${abi} ${workdir} $m FreeBSD-caroot FreeBSD-zoneinfo - cp /etc/master.passwd $m/etc + cp ${srcdir}/etc/master.passwd $m/etc pwd_mkdb -p -d $m/etc $m/etc/master.passwd || return $? - cp /etc/group $m/etc || return $? - cp /etc/termcap.small $m/etc/termcap.small || return $? - cp /etc/termcap.small $m/usr/share/misc/termcap || return $? + cp ${srcdir}/etc/group $m/etc || return $? + # termcap.small is generated so we get it from OBJDIR - make sets our + # working directory to OBJDIR/release + cp ../etc/termcap/termcap.small $m/etc/termcap.small || return $? + cp ../etc/termcap/termcap.small $m/usr/share/misc/termcap || return $? env DESTDIR=$m /usr/sbin/certctl rehash # Generate a suitable repo config for pkgbase case ${branch} in From 84de8c51d1a0fff1c65cd1ec44dd3c3a0e7904eb Mon Sep 17 00:00:00 2001 From: Rick Macklem Date: Fri, 10 Jan 2025 06:49:45 -0800 Subject: [PATCH 101/143] nfsd: Add vfs.nfsd.testing_disable_grace sysctl The grace time of 2 minutes plus when the nfsd is started is needed for normal operation. It allows client(s) to recover open/lock state. However, for testing situations where there are no client(s) to recover state, it introduces an unacceptable delay.
The new per-vnet jail sysctl can be set non-zero to disable the grace period. It should only be used for testing and can be applied on a per-jail basis. It must be set before the nfsd is started up. Requested by: asomers Tested by: asomers --- sys/fs/nfsserver/nfs_nfsdstate.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/sys/fs/nfsserver/nfs_nfsdstate.c b/sys/fs/nfsserver/nfs_nfsdstate.c index 6cd8c1c861ec77..d1639f48451c62 100644 --- a/sys/fs/nfsserver/nfs_nfsdstate.c +++ b/sys/fs/nfsserver/nfs_nfsdstate.c @@ -115,6 +115,11 @@ SYSCTL_INT(_vfs_nfsd, OID_AUTO, flexlinuxhack, CTLFLAG_RW, &nfsrv_flexlinuxhack, 0, "For Linux clients, hack around Flex File Layout bug"); +NFSD_VNET_DEFINE_STATIC(bool, nfsd_disable_grace) = false; +SYSCTL_BOOL(_vfs_nfsd, OID_AUTO, testing_disable_grace, + CTLFLAG_NFSD_VNET | CTLFLAG_RW, &NFSD_VNET_NAME(nfsd_disable_grace), + 0, "Disable grace for testing"); + /* * Hash lists for nfs V4. */ @@ -4381,11 +4386,13 @@ nfsrv_checkgrace(struct nfsrv_descript *nd, struct nfsclient *clp, * ReclaimComplete. If so, grace can end now. */ notreclaimed = 0; - LIST_FOREACH(sp, &NFSD_VNET(nfsrv_stablefirst).nsf_head, - nst_list) { - if ((sp->nst_flag & NFSNST_RECLAIMED) == 0) { - notreclaimed = 1; - break; + if (!NFSD_VNET(nfsd_disable_grace)) { + LIST_FOREACH(sp, &NFSD_VNET(nfsrv_stablefirst).nsf_head, + nst_list) { + if ((sp->nst_flag & NFSNST_RECLAIMED) == 0) { + notreclaimed = 1; + break; + } } } if (notreclaimed == 0) From 5e7d93a604400ca3c9db3be1df82ce963527740c Mon Sep 17 00:00:00 2001 From: Getz Mikalsen Date: Mon, 26 Aug 2024 20:13:31 +0200 Subject: [PATCH 102/143] lib/libc/aarch64/string: add strcmp SIMD implementation This changeset includes a port of the SIMD implementation of strcmp for amd64 to Aarch64. Below is a description of its method as described in D41971. The basic idea is to process the bulk of the string in aligned blocks of 16 bytes such that one string runs ahead and the other runs behind. 
The string that runs ahead is checked for NUL bytes, the one that runs behind is compared with the corresponding chunk of the string that runs ahead. This trades an extra load per iteration for the very complicated block-reassembly needed in the other implementations (bionic, glibc). On the flip side, we need two code paths depending on the relative alignment of the two buffers. The initial part of the string is compared directly if it is known not to cross a page boundary. Otherwise, a complex slow path to avoid crossing into unmapped memory commences. Performance is better in most cases than the existing implementation from the Arm Optimized Routines repository. See the DR for benchmark results. Tested by: fuz (exprun) Reviewed by: fuz, emaste Sponsored by: Google LLC (GSoC 2024) PR: 281175 Differential Revision: https://reviews.freebsd.org/D45839 --- lib/libc/aarch64/string/Makefile.inc | 4 +- lib/libc/aarch64/string/strcmp.S | 350 +++++++++++++++++++++++++++ 2 files changed, 353 insertions(+), 1 deletion(-) create mode 100644 lib/libc/aarch64/string/strcmp.S diff --git a/lib/libc/aarch64/string/Makefile.inc b/lib/libc/aarch64/string/Makefile.inc index cabc79e4f35140..ba0947511872cf 100644 --- a/lib/libc/aarch64/string/Makefile.inc +++ b/lib/libc/aarch64/string/Makefile.inc @@ -13,13 +13,15 @@ AARCH64_STRING_FUNCS= \ stpcpy \ strchr \ strchrnul \ - strcmp \ strcpy \ strlen \ strncmp \ strnlen \ strrchr +# SIMD-enhanced routines not derived from Arm's code +MDSRCS+= \ + strcmp.S # # Add the above functions. Generate an asm file that includes the needed # Arm Optimized Routines file defining the function name to the libc name. 
diff --git a/lib/libc/aarch64/string/strcmp.S b/lib/libc/aarch64/string/strcmp.S new file mode 100644 index 00000000000000..e8418dfc6763a0 --- /dev/null +++ b/lib/libc/aarch64/string/strcmp.S @@ -0,0 +1,350 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2024 Getz Mikalsen +*/ + +#include +#include + + .weak strcmp + .set strcmp, __strcmp + .text + +ENTRY(__strcmp) + + bic x8, x0, #0xf // x0 aligned to the boundary + and x9, x0, #0xf // x9 is the offset + bic x10, x1, #0xf // x1 aligned to the boundary + and x11, x1, #0xf // x11 is the offset + + mov x13, #-1 + + /* + * Check if either string is located at end of page to avoid crossing + * into unmapped page. If so, we load 16 bytes from the nearest + * alignment boundary and shift based on the offset. + */ + + add x3, x0, #16 // end of head + add x4, x1, #16 + eor x3, x3, x0 + eor x4, x4, x1 // bits that changed + orr x3, x3, x4 // in either str1 or str2 + tbz w3, #PAGE_SHIFT, .Lbegin + + ldr q0, [x8] // load aligned head + ldr q2, [x10] + + lsl x14, x9, #2 + lsl x15, x11, #2 + lsl x3, x13, x14 // string head + lsl x4, x13, x15 + + cmeq v5.16b, v0.16b, #0 + cmeq v6.16b, v2.16b, #0 + + shrn v5.8b, v5.8h, #4 + shrn v6.8b, v6.8h, #4 + fmov x5, d5 + fmov x6, d6 + + adrp x2, shift_data + add x2, x2, :lo12:shift_data + + /* heads may cross page boundary, avoid unmapped loads */ + tst x5, x3 + b.eq 0f + + ldr q4, [x2, x9] // load permutation table + tbl v0.16b, {v0.16b}, v4.16b + + b 1f + .p2align 4 +0: + ldr q0, [x0] // load true head +1: + tst x6, x4 + b.eq 0f + + ldr q4, [x2, x11] + tbl v4.16b, {v2.16b}, v4.16b + + b 1f + + .p2align 4 +.Lbegin: + ldr q0, [x0] // load true heads +0: + ldr q4, [x1] +1: + + cmeq v2.16b, v0.16b, #0 // NUL byte present? + cmeq v4.16b, v0.16b, v4.16b // which bytes match? + + orn v2.16b, v2.16b, v4.16b // mismatch or NUL byte? 
+ + shrn v2.8b, v2.8h, #4 + fmov x5, d2 + + cbnz x5, .Lhead_mismatch + + ldr q2, [x8, #16] // load second chunk + ldr q3, [x10, #16] + subs x9, x9, x11 // is a&0xf >= b&0xf + b.lo .Lswapped // if not swap operands + sub x12, x10, x9 + ldr q0, [x12, #16]! + sub x10, x10, x8 + sub x11, x10, x9 + + cmeq v1.16b, v3.16b, #0 + cmeq v0.16b, v0.16b, v2.16b + add x8, x8, #16 + shrn v1.8b, v1.8h, #4 + fmov x6, d1 + shrn v0.8b, v0.8h, #4 + fmov x5, d0 + cbnz x6, .Lnulfound + mvn x5, x5 + cbnz x5, .Lmismatch + add x8, x8, #16 // advance aligned pointers + + /* + * During the main loop, the layout of the two strings is something like: + * + * v ------1------ v ------2------ v + * X0: AAAAAAAAAAAAABBBBBBBBBBBBBBBB... + * X1: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC... + * + * where v indicates the alignment boundaries and corresponding chunks + * of the strings have the same letters. Chunk A has been checked in + * the previous iteration. This iteration, we first check that string + * X1 doesn't end within region 2, then we compare chunk B between the + * two strings. As X1 is known not to hold a NUL byte in regions 1 + * and 2 at this point, this also ensures that x0 has not ended yet. + */ + .p2align 4 +0: + ldr q0, [x8, x11] + ldr q1, [x8, x10] + ldr q2, [x8] + + cmeq v1.16b, v1.16b, #0 // end of string? + cmeq v0.16b, v0.16b, v2.16b // do the chunks match? + + shrn v1.8b, v1.8h, #4 + fmov x6, d1 + shrn v0.8b, v0.8h, #4 + fmov x5, d0 + cbnz x6, .Lnulfound + mvn x5, x5 // any mismatches? 
+ cbnz x5, .Lmismatch + + add x8, x8, #16 + + ldr q0, [x8, x11] + ldr q1, [x8, x10] + ldr q2, [x8] + + add x8, x8, #16 + cmeq v1.16b, v1.16b, #0 + cmeq v0.16b, v0.16b, v2.16b + + shrn v1.8b, v1.8h, #4 + fmov x6, d1 + shrn v0.8b, v0.8h, #4 + fmov x5, d0 + cbnz x6, .Lnulfound2 + mvn x5, x5 + cbz x5, 0b + + sub x8, x8, #16 // roll back second increment +.Lmismatch: + rbit x2, x5 + clz x2, x2 // index of mismatch + lsr x2, x2, #2 + add x11, x8, x11 + + ldrb w4, [x8, x2] + ldrb w5, [x11, x2] + sub w0, w4, w5 // byte difference + ret + + .p2align 4 +.Lnulfound2: + sub x8, x8, #16 + +.Lnulfound: + mov x7, x9 + mov x4, x6 + + ubfiz x7, x7, #2, #4 // x7 = (x7 & 0xf) << 2 + lsl x6, x6, x7 // adjust NUL mask to indices + orn x5, x6, x5 + cbnz x5, .Lmismatch + + /* + * (x0) == (x1) and NUL is past the string. + * Compare (x1) with the corresponding part + * of the other string until the NUL byte. + */ + ldr q0, [x8, x9] + ldr q1, [x8, x10] + + cmeq v1.16b, v0.16b, v1.16b + shrn v1.8b, v1.8h, #4 + fmov x5, d1 + + orn x5, x4, x5 + + rbit x2, x5 + clz x2, x2 + lsr x5, x2, #2 + + add x10, x10, x8 // restore x10 pointer + add x8, x8, x9 // point to corresponding chunk + + ldrb w4, [x8, x5] + ldrb w5, [x10, x5] + sub w0, w4, w5 + ret + + .p2align 4 +.Lhead_mismatch: + rbit x2, x5 + clz x2, x2 // index of mismatch + lsr x2, x2, #2 + ldrb w4, [x0, x2] + ldrb w5, [x1, x2] + sub w0, w4, w5 + ret + + /* + * If (a&0xf) < (b&0xf), we do the same thing but with swapped + * operands. I found that this performs slightly better than + * using conditional moves to do the swap branchless. + */ + .p2align 4 +.Lswapped: + add x12, x8, x9 + ldr q0, [x12, #16]! 
+ sub x8, x8, x10 + add x11, x8, x9 + neg x9, x9 + + cmeq v1.16b, v2.16b, #0 + cmeq v0.16b, v0.16b, v3.16b + add x10, x10, #16 + shrn v1.8b, v1.8h, #4 + fmov x6, d1 + shrn v0.8b, v0.8h, #4 + fmov x5, d0 + cbnz x6, .Lnulfounds + mvn x5, x5 + cbnz x5, .Lmismatchs + add x10, x10, #16 + + /* + * During the main loop, the layout of the two strings is something like: + * + * v ------1------ v ------2------ v + * X1: AAAAAAAAAAAAABBBBBBBBBBBBBBBB... + * X0: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC... + * + * where v indicates the alignment boundaries and corresponding chunks + * of the strings have the same letters. Chunk A has been checked in + * the previous iteration. This iteration, we first check that string + * X0 doesn't end within region 2, then we compare chunk B between the + * two strings. As X0 is known not to hold a NUL byte in regions 1 + * and 2 at this point, this also ensures that X1 has not ended yet. + */ + .p2align 4 +0: + ldr q0, [x10, x11] + ldr q1, [x10, x8] + ldr q2, [x10] + + cmeq v1.16b, v1.16b, #0 + cmeq v0.16b, v0.16b, v2.16b + + shrn v1.8b, v1.8h, #4 + fmov x6, d1 + shrn v0.8b, v0.8h, #4 + fmov x5, d0 + cbnz x6, .Lnulfounds + mvn x5, x5 + cbnz x5, .Lmismatchs + + add x10, x10, #16 + + ldr q0, [x10, x11] + ldr q1, [x10, x8] + ldr q2, [x10] + + add x10, x10, #16 + cmeq v1.16b, v1.16b, #0 + cmeq v0.16b, v0.16b, v2.16b + + shrn v1.8b, v1.8h, #4 + fmov x6, d1 + shrn v0.8b, v0.8h, #4 + fmov x5, d0 + cbnz x6, .Lnulfound2s + mvn x5, x5 + cbz x5, 0b + + sub x10, x10, #16 + +.Lmismatchs: + rbit x2, x5 + clz x2, x2 + lsr x2, x2, #2 + add x11, x10, x11 + + ldrb w4, [x10, x2] + ldrb w5, [x11, x2] + sub w0, w5, w4 + ret + + .p2align 4 +.Lnulfound2s: + sub x10, x10, #16 +.Lnulfounds: + mov x7, x9 + mov x4, x6 + + ubfiz x7, x7, #2, #4 + lsl x6, x6, x7 + orn x5, x6, x5 + cbnz x5, .Lmismatchs + + ldr q0, [x10, x9] + ldr q1, [x10, x8] + + cmeq v1.16b, v0.16b, v1.16b + shrn v1.8b, v1.8h, #4 + fmov x5, d1 + + orn x5, x4, x5 + + rbit x2, x5 + clz x2, x2 + lsr x5, x2, #2 + 
+ add x11, x10, x8 + add x10, x10, x9 + + ldrb w4, [x10, x5] + ldrb w5, [x11, x5] + sub w0, w5, w4 + ret + +END(__strcmp) + + .section .rodata + .p2align 4 +shift_data: + .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + .fill 16, 1, -1 + .size shift_data, .-shift_data From b91003acffe7b50dd6506be15116c6b42fc512c6 Mon Sep 17 00:00:00 2001 From: Getz Mikalsen Date: Mon, 26 Aug 2024 20:13:54 +0200 Subject: [PATCH 103/143] lib/libc/aarch64/string: add strspn optimized implementation This is a port of the Scalar optimized variant of strspn for amd64 to aarch64. It utilizes a LUT to speed up the function, a SIMD variant is still under development. See the DR for benchmark results. Tested by: fuz (exprun) Reviewed by: fuz, emaste Sponsored by: Google LLC (GSoC 2024) PR: 281175 Differential Revision: https://reviews.freebsd.org/D46396 --- lib/libc/aarch64/string/Makefile.inc | 4 +- lib/libc/aarch64/string/strspn.S | 111 +++++++++++++++++++++++++++ 2 files changed, 114 insertions(+), 1 deletion(-) create mode 100644 lib/libc/aarch64/string/strspn.S diff --git a/lib/libc/aarch64/string/Makefile.inc b/lib/libc/aarch64/string/Makefile.inc index ba0947511872cf..09bfaef963eb5f 100644 --- a/lib/libc/aarch64/string/Makefile.inc +++ b/lib/libc/aarch64/string/Makefile.inc @@ -21,7 +21,9 @@ AARCH64_STRING_FUNCS= \ # SIMD-enhanced routines not derived from Arm's code MDSRCS+= \ - strcmp.S + strcmp.S \ + strspn.S + # # Add the above functions. Generate an asm file that includes the needed # Arm Optimized Routines file defining the function name to the libc name. 
diff --git a/lib/libc/aarch64/string/strspn.S b/lib/libc/aarch64/string/strspn.S new file mode 100644 index 00000000000000..0ef42c2b737e90 --- /dev/null +++ b/lib/libc/aarch64/string/strspn.S @@ -0,0 +1,111 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2024 Getz Mikalsen +*/ + +#include <machine/asm.h> + + .weak strspn + .set strspn, __strspn + .text + +ENTRY(__strspn) + + /* check for special cases */ + ldrb w4, [x1] // first character in set + cbz w4, .Lzero // empty set always returns 0 + + mov x15, #1 // preload register with 1 for stores + + // set is only one character + ldrb w5, [x1, #1] // second character in the set + cbz w5, .Lsingle + + stp x29, x30, [sp, #-16]! + mov x29, sp + sub sp, sp, #256 // allocate 256 bytes on the stack + + /* no special case matches -- prepare lookup table */ + mov w3, #28 +0: add x9, sp, x3, lsl #3 + stp xzr, xzr, [x9] + stp xzr, xzr, [x9, #16] + subs w3, w3, #4 + b.cs 0b + + strb w15, [sp, x4] // register first character in set + add x1, x1, #2 + + /* process remaining chars in set */ + .p2align 4 + + +0: ldrb w4, [x1] // next char in set + strb w15, [sp, x5] // register previous char + cbz w4, 1f // NUL encountered?
+ + ldrb w5, [x1, #1] + add x1, x1, #2 + strb w15, [sp, x4] + cbnz w5, 0b + +1: mov x5, x0 // stash a copy of src + + /* find mismatch */ + .p2align 4 +0: ldrb w8, [x0] + ldrb w9, [sp, x8] + cbz w9, 2f + + ldrb w8, [x0, #1] + ldrb w9, [sp, x8] + cbz w9, 3f + + ldrb w8, [x0, #2] + ldrb w9, [sp, x8] + cbz w9, 4f + + ldrb w8, [x0, #3] + add x0, x0, #4 + ldrb w9, [sp, x8] + cbnz w9, 0b + + sub x0, x0, #3 +4: sub x5, x5, #1 +3: add x0, x0, #1 +2: sub x0, x0, x5 + mov sp, x29 + ldp x29, x30, [sp], #16 + ret + +.Lzero: + mov x0, #0 + ret + +.Lsingle: + ldrb w8, [x0, x5] + cmp w4, w8 + b.ne 1f + + add x5, x5, #1 + ldrb w8, [x0, x5] + cmp w4, w8 + b.ne 1f + + add x5, x5, #1 + ldrb w8, [x0, x5] + cmp w4, w8 + b.ne 1f + + add x5, x5, #1 + ldrb w8, [x0, x5] + add x5, x5, #1 + cmp w4, w8 + b.eq .Lsingle + + sub x5, x5, #1 +1: mov x0, x5 + ret + +END(__strspn) From f2bd390a54f183f85dd7faab815740fb3bea9591 Mon Sep 17 00:00:00 2001 From: Getz Mikalsen Date: Mon, 26 Aug 2024 20:14:01 +0200 Subject: [PATCH 104/143] lib/libc/aarch64/string: add strcspn optimized implementation This is a port of the Scalar optimized variant of strcspn for amd64 to aarch64 It utilizes a LUT to speed up the function, a SIMD variant is still under development. Performance benchmarks are as usual generated by strperf. See the DR for benchmark results. 
Tested by: fuz (exprun) Reviewed by: fuz, emaste Sponsored by: Google LLC (GSoC 2024) PR: 281175 Differential Revision: https://reviews.freebsd.org/D46398 --- lib/libc/aarch64/string/Makefile.inc | 3 +- lib/libc/aarch64/string/strcspn.S | 109 +++++++++++++++++++++++++++ 2 files changed, 111 insertions(+), 1 deletion(-) create mode 100644 lib/libc/aarch64/string/strcspn.S diff --git a/lib/libc/aarch64/string/Makefile.inc b/lib/libc/aarch64/string/Makefile.inc index 09bfaef963eb5f..34483532a3dd3c 100644 --- a/lib/libc/aarch64/string/Makefile.inc +++ b/lib/libc/aarch64/string/Makefile.inc @@ -22,7 +22,8 @@ AARCH64_STRING_FUNCS= \ # SIMD-enhanced routines not derived from Arm's code MDSRCS+= \ strcmp.S \ - strspn.S + strspn.S \ + strcspn.S # # Add the above functions. Generate an asm file that includes the needed diff --git a/lib/libc/aarch64/string/strcspn.S b/lib/libc/aarch64/string/strcspn.S new file mode 100644 index 00000000000000..8f2d6d20f0f66b --- /dev/null +++ b/lib/libc/aarch64/string/strcspn.S @@ -0,0 +1,109 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2024 Getz Mikalsen +*/ + +#include <machine/asm.h> + + .weak strcspn + .set strcspn, __strcspn + .text + +ENTRY(__strcspn) + stp x29, x30, [sp, #-16]!
+ mov x29, sp + mov x15, #1 // preload register with 1 for stores + + /* check for special cases */ + ldrb w4, [x1] // first character in the set + cbz w4, .Lstrlen + + movi v0.16b, #0 + + ldrb w5, [x1, #1] // second character in the set + cbz w5, .Lstrchr + + sub sp, sp, #256 // allocate 256 bytes on the stack + + /* no special case matches -- prepare lookup table */ + mov w3, #20 + .p2align 4 +0: add x9, sp, x3, lsl #3 + stp xzr, xzr, [x9] + stp xzr, xzr, [x9, #16] + subs w3, w3, #4 + b.cs 0b + + /* utilize SIMD stores to speed up zeroing the table */ + stp q0, q0, [sp, #6*32] + stp q0, q0, [sp, #7*32] + + add x1, x1, #2 + strb w15, [sp, x4] // register first chars in the set + strb w15, [sp, x5] + + mov x4, x0 // stash a copy of src + + /* process remaining chars in set */ + .p2align 4 +0: ldrb w5, [x1] + strb w15, [sp, x5] + cbz w5, 1f // end of set? + + ldrb w5, [x1, #1] + strb w15, [sp, x5] + cbz w5, 1f + + add x1, x1, #2 + b 0b + + /* find match */ + .p2align 4 +1: ldrb w8, [x0] + ldrb w9, [sp, x8] + cbnz w9, 2f + + ldrb w8, [x0, #1] + ldrb w9, [sp, x8] + cbnz w9, 3f + + ldrb w8, [x0, #2] + ldrb w9, [sp, x8] + cbnz w9, 4f + + ldrb w8, [x0, #3] + ldrb w9, [sp, x8] + add x0, x0, #4 + cbz w9, 1b + + sub x0, x0, #3 // fix up return value +4: sub x4, x4, #1 +3: add x0, x0, #1 +2: sub x0, x0, x4 + mov sp, x29 + ldp x29, x30, [sp], #16 // restore sp and lr + ret + + /* set is empty, degrades to strlen */ + .p2align 4 +.Lstrlen: + mov sp, x29 + ldp x29, x30, [sp], #16 // restore sp and lr + b strlen + + /* just one character in set, degrades to strchrnul */ + .p2align 4 +.Lstrchr: + stp x0, x1, [sp, #-16]! 
+ mov x1, x4 + + bl strchrnul + + ldp x16, x17, [sp], #16 // restore stashed src + sub x0, x0, x16 // span length = match - src + + ldp x29, x30, [sp], #16 // Restore sp and lr + ret + +END(__strcspn) From 89b3872376cbb6e8ab53cb50fa8c4c6d14e2d405 Mon Sep 17 00:00:00 2001 From: Getz Mikalsen Date: Mon, 26 Aug 2024 20:14:08 +0200 Subject: [PATCH 105/143] lib/libc/aarch64/string: add optimized strpbrk & strsep implementations These are direct copies from the amd64 string functions using the optimized strcspn from D46398 Tested by: fuz (exprun) Reviewed by: fuz, emaste Sponsored by: Google LLC (GSoC 2024) PR: 281175 Differential Revision: https://reviews.freebsd.org/D46399 --- lib/libc/aarch64/string/Makefile.inc | 4 +- lib/libc/aarch64/string/strpbrk.c | 43 +++++++++++++++++++++ lib/libc/aarch64/string/strsep.c | 57 ++++++++++++++++++++++++++++ 3 files changed, 103 insertions(+), 1 deletion(-) create mode 100644 lib/libc/aarch64/string/strpbrk.c create mode 100644 lib/libc/aarch64/string/strsep.c diff --git a/lib/libc/aarch64/string/Makefile.inc b/lib/libc/aarch64/string/Makefile.inc index 34483532a3dd3c..996a2fd45bc034 100644 --- a/lib/libc/aarch64/string/Makefile.inc +++ b/lib/libc/aarch64/string/Makefile.inc @@ -23,7 +23,9 @@ AARCH64_STRING_FUNCS= \ MDSRCS+= \ strcmp.S \ strspn.S \ - strcspn.S + strcspn.S \ + strpbrk.c \ + strsep.c # # Add the above functions. Generate an asm file that includes the needed diff --git a/lib/libc/aarch64/string/strpbrk.c b/lib/libc/aarch64/string/strpbrk.c new file mode 100644 index 00000000000000..87f5877899918f --- /dev/null +++ b/lib/libc/aarch64/string/strpbrk.c @@ -0,0 +1,43 @@ +/*- + * Copyright (c) 2023 The FreeBSD Foundation + * + * This software was developed by Robert Clausecker + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1.
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE + */ + +#include + +#include + +size_t __strcspn(const char *, const char *); + +char * +strpbrk(const char *s, const char *charset) +{ + size_t loc; + + loc = __strcspn(s, charset); + + return (s[loc] == '\0' ? NULL : (char *)&s[loc]); +} diff --git a/lib/libc/aarch64/string/strsep.c b/lib/libc/aarch64/string/strsep.c new file mode 100644 index 00000000000000..7afd47957aa9af --- /dev/null +++ b/lib/libc/aarch64/string/strsep.c @@ -0,0 +1,57 @@ +/*- + * Copyright (c) 2023 The FreeBSD Foundation + * + * This software was developed by Robert Clausecker + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE + */ + +#include +#include + +size_t __strcspn(const char *, const char *); + +/* + * We have a fast strcspn() on aarch64. Use it over a direct + * implementation of strsep for better performance. + */ +char * +strsep(char **stringp, const char *delim) +{ + size_t n; + char *s; + + s = *stringp; + if (s == NULL) + return (NULL); + + n = __strcspn(s, delim); + if (s[n] == '\0') + *stringp = NULL; + else { + s[n] = '\0'; + *stringp = s + n + 1; + } + + return (s); +} From 79287d783c72f95eb47c26dbfdfca279086e16a9 Mon Sep 17 00:00:00 2001 From: Getz Mikalsen Date: Mon, 26 Aug 2024 20:14:15 +0200 Subject: [PATCH 106/143] lib/libc/aarch64/string: strcat enable use of SIMD Call into SIMD strlen and stpcpy for an optimized strcat. Port of D42600 for amd64. 
Tested by: fuz (exprun) Reviewed by: fuz, emaste Sponsored by: Google LLC (GSoC 2024) PR: 281175 Differential Revision: https://reviews.freebsd.org/D46417 --- lib/libc/aarch64/string/Makefile.inc | 3 ++- lib/libc/aarch64/string/strcat.c | 20 ++++++++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) create mode 100644 lib/libc/aarch64/string/strcat.c diff --git a/lib/libc/aarch64/string/Makefile.inc b/lib/libc/aarch64/string/Makefile.inc index 996a2fd45bc034..0b2974947389e1 100644 --- a/lib/libc/aarch64/string/Makefile.inc +++ b/lib/libc/aarch64/string/Makefile.inc @@ -25,7 +25,8 @@ MDSRCS+= \ strspn.S \ strcspn.S \ strpbrk.c \ - strsep.c + strsep.c \ + strcat.c # # Add the above functions. Generate an asm file that includes the needed diff --git a/lib/libc/aarch64/string/strcat.c b/lib/libc/aarch64/string/strcat.c new file mode 100644 index 00000000000000..c70875be1c1a1e --- /dev/null +++ b/lib/libc/aarch64/string/strcat.c @@ -0,0 +1,20 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2024 Getz Mikalsen +*/ + +#include <string.h> + +#undef strcat /* _FORTIFY_SOURCE */ + +char * +strcat(char * __restrict s, const char * __restrict append) +{ + char *save = s; + + /* call into SIMD optimized functions */ + stpcpy(s + strlen(s), append); + + return(save); +} From 756b7fc80837567d114a3c93e9bb987e219a1b23 Mon Sep 17 00:00:00 2001 From: Getz Mikalsen Date: Mon, 26 Aug 2024 20:14:31 +0200 Subject: [PATCH 107/143] lib/libc/aarch64/string: add strlcpy SIMD implementation This changeset includes a port of the SIMD implementation of strlcpy for amd64 to Aarch64. It is based on memccpy (D46170) with some minor differences. Performance is significantly better than the scalar implementation. Benchmark results are as usual generated by the strperf utility written by fuz. See the DR for benchmark results.
Tested by: fuz (exprun) Reviewed by: fuz, emaste Sponsored by: Google LLC (GSoC 2024) PR: 281175 Differential Revision: https://reviews.freebsd.org/D46243 --- lib/libc/aarch64/string/Makefile.inc | 3 +- lib/libc/aarch64/string/strlcpy.S | 316 +++++++++++++++++++++++++++ 2 files changed, 318 insertions(+), 1 deletion(-) create mode 100644 lib/libc/aarch64/string/strlcpy.S diff --git a/lib/libc/aarch64/string/Makefile.inc b/lib/libc/aarch64/string/Makefile.inc index 0b2974947389e1..34a84bcfe1331b 100644 --- a/lib/libc/aarch64/string/Makefile.inc +++ b/lib/libc/aarch64/string/Makefile.inc @@ -26,7 +26,8 @@ MDSRCS+= \ strcspn.S \ strpbrk.c \ strsep.c \ - strcat.c + strcat.c \ + strlcpy.S # # Add the above functions. Generate an asm file that includes the needed diff --git a/lib/libc/aarch64/string/strlcpy.S b/lib/libc/aarch64/string/strlcpy.S new file mode 100644 index 00000000000000..3859aaca447bfd --- /dev/null +++ b/lib/libc/aarch64/string/strlcpy.S @@ -0,0 +1,316 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2024 Getz Mikalsen +*/ + +#include <machine/asm.h> + + .weak strlcpy + .set strlcpy, __strlcpy + .text + +ENTRY(__strlcpy) + subs x2, x2, #1 + b.lo .L0 + + mov x9, x0 // stash copy of dst pointer + bic x10, x1, #0xf // src aligned + and x11, x1, #0xf // src offset + + ldr q1, [x10] + cmeq v1.16b, v1.16b, #0 // NUL found in head? + + mov x8, #-1 // fill register with 0xfff..fff + lsl x12, x11, #2 + lsl x8, x8, x12 // mask of bytes in the string + + shrn v1.8b, v1.8h, #4 + fmov x5, d1 + + ands x5, x5, x8 + b.ne .Lhead_nul + + ldr q3, [x10, #16] // load second string chunk + ldr q2, [x1] // load true head + mov x8, #32 + sub x8, x8, x11 + + cmeq v1.16b, v3.16b, #0 // NUL found in second chunk?
+ + subs x2, x2, x8 + b.ls .Lhead_buf_end + + /* process second chunk */ + shrn v1.8b, v1.8h, #4 + fmov x5, d1 + cbnz x5, .Lsecond_nul + + /* string didn't end in second chunk and neither did buffer */ + ldr q1, [x10, #32] // load next string chunk + str q2, [x0] // deposit head into buffer + sub x0, x0, x11 // adjust x0 + str q3, [x0, #16] // deposit second chunk + add x10, x10, #32 // advance src + add x0, x0, #32 // advance dst + subs x2, x2, #16 // enough left for another round? + b.ls 1f + + /* main loop unrolled twice */ + .p2align 4 +0: + cmeq v2.16b, v1.16b, #0 // NUL found in second chunk? + shrn v2.8b, v2.8h, #4 + fmov x5, d2 + + cbnz x5, 3f + + str q1, [x0] + ldr q1, [x10, #16] // load next chunk + + cmp x2, #16 // more than a full chunk left? + b.ls 2f + + add x10, x10, #32 // advance pointers + add x0, x0, #32 + + cmeq v2.16b, v1.16b, #0 // NUL found in second chunk? + shrn v2.8b, v2.8h, #4 + fmov x5, d2 + cbnz x5, 4f // process chunk if match + + str q1, [x0, #-16] + ldr q1, [x10] // load next chunk + + subs x2, x2, #32 + b.hi 0b + +1: + sub x10, x10, #16 // undo second advancement + add x2, x2, #16 + sub x0, x0, #16 + + /* 1--16 bytes left in the buffer but string has not ended yet */ +2: + cmeq v2.16b, v1.16b, #0 // NUL found in second chunk? + shrn v2.8b, v2.8h, #4 + fmov x4, d2 + + mov x6, #0xf + mov x7, x4 + + lsl x5, x2, #2 // shift 0xf to the limits position + lsl x5, x6, x5 + cmp x2, #16 // dont induce match if limit >=16 + csel x5, x5, xzr, lo + orr x8, x4, x5 // treat limit as if terminator present + + rbit x8, x8 // simulate x86 tzcnt + clz x8, x8 // index of mismatch + lsr x8, x8, #2 + + add x0, x0, x8 + + ldr q1, [x10, x8] // load tail + str q1, [x0] // store tail + strb wzr, [x0, #16] + + /* continue to find the end of the string */ + cbnz x7, 1f + + /* we opt for a simpler strlen than the one in libc as the + * cmeq, shrn approach is faster for shorter strings. 
+ */ + .p2align 4 +0: + ldr q1, [x10, #32] + cmeq v1.16b, v1.16b, #0 // bytewise compare against NUL + shrn v1.8b, v1.8h, #4 + fmov x7, d1 + cbnz x7, 2f + + ldr q1, [x10, #48] + cmeq v1.16b, v1.16b, #0 // bytewise compare against NUL + shrn v1.8b, v1.8h, #4 + fmov x7, d1 + add x10, x10, #32 + cbz x7, 0b + +1: sub x10, x10, #16 +2: rbit x8, x7 + clz x8, x8 // index of mismatch + lsr x8, x8, #2 + + sub x10, x10, x1 + add x0, x10, #32 + add x0, x0, x8 + + ret + +4: + sub x10, x10, #16 // undo second advancement + sub x0, x0, #16 // undo second advancement + + /* string has ended but buffer has not */ +3: + rbit x8, x5 + clz x8, x8 // index of mismatch + lsr x8, x8, #2 + + add x0, x0, x8 // restore dst pointer + add x10, x10, x8 + + ldr q1, [x10, #-15] + str q1, [x0, #-15] + add x0, x0, #1 + sub x0, x10, x1 + + ret + +.Lhead_buf_end: + shrn v1.8b, v1.8h, #4 + fmov x8, d1 + + add x2, x2, #32 // restore limit + + mov x7, x8 + mov x6, #0xf + + cmp x2, #16 // should we induce a match or not + b.lo 0f + + rbit x8, x8 + clz x8, x8 // index of mismatch + lsr x8, x8, #2 + add x8, x8, #16 + + cmp x8, x2 + csel x8, x8, x2, lo // copy min(buflen, srclen) bytes + b 1f +0: + + rbit x8, x8 + clz x8, x8 // index of mismatch + lsr x8, x8, #2 + + mov x8, x2 +1: + + sub x8, x8, x11 + strb wzr, [x9, x8] + + /* continue to find the end of the string */ + cbnz x7, 1f + + /* we opt for a simpler strlen than the one in libc as the + * cmeq, shrn approach is faster for shorter strings. 
+ */ + .p2align 4 +0: + ldr q1, [x10, #32] + cmeq v1.16b, v1.16b, #0 // bytewise compare against NUL + shrn v1.8b, v1.8h, #4 + fmov x7, d1 + cbnz x7, 2f + + ldr q1, [x10, #48] + cmeq v1.16b, v1.16b, #0 // bytewise compare against NUL + shrn v1.8b, v1.8h, #4 + fmov x7, d1 + add x10, x10, #32 + cbz x7, 0b + +1: sub x10, x10, #16 +2: rbit x6, x7 + clz x6, x6 // index of mismatch + lsr x6, x6, #2 + + sub x10, x10, x1 + add x0, x10, #32 + add x0, x0, x6 + + add x4, x9, x8 // dst + cnt + add x5, x1, x8 // src + cnt + + b .L1732 + +.Lsecond_nul: + add x2, x2, x8 + + rbit x8, x5 + clz x8, x8 // index of mismatch + lsr x5, x8, #2 + + sub x8, x11, #16 + sub x0, x5, x8 // string length + + cmp x0, x2 // did we match or hit limit first? + csel x8, x2, x0, hi + + add x4, x9, x8 // dst + cnt + add x5, x1, x8 // src + cnt + + strb wzr, [x4] + + /* copy 17-32 bytes */ +.L1732: + cmp x8, #16 + b.lo .L0816 + ldp x16, x17, [x1] + ldp x12, x1, [x5, #-16] + stp x16, x17, [x9] + stp x12, x1, [x4, #-16] + ret + +.Lhead_nul: + rbit x8, x5 + clz x8, x8 // index of mismatch + lsr x8, x8, #2 + + sub x0, x8, x11 + cmp x0, x2 + csel x8, x2, x0, hi + + add x4, x9, x8 // dst + cnt + add x5, x1, x8 // src + cnt + strb wzr, [x4] + + /* Copy 8-16 bytes */ +.L0816: + tbz x8, #3, .L0407 + ldr x16, [x1] + ldr x17, [x5, #-8] + str x16, [x9] + str x17, [x4, #-8] + ret + + /* Copy 4-7 bytes */ + .p2align 4 +.L0407: + cmp x8, #3 + b.ls .L0203 + ldr w16, [x1] + ldr w17, [x5, #-4] + str w16, [x9] + str w17, [x4, #-4] + ret + +.L0203: + tbz x8, 1, .L0001 + ldrh w16, [x1] + ldrh w17, [x5, #-2] + strh w16, [x9] + strh w17, [x4, #-2] + ret + +.L0001: + ldrb w16, [x1] + strb w16, [x9] + strb wzr, [x4] + ret + +.L0: + mov x0, x1 + b strlen + ret +END(__strlcpy) From 25c485e147691f3929b0b5029bab58bf56d3606b Mon Sep 17 00:00:00 2001 From: Getz Mikalsen Date: Mon, 26 Aug 2024 20:14:37 +0200 Subject: [PATCH 108/143] lib/libc/aarch64/string: add strncmp SIMD implementation This changeset includes a port of the SIMD
implementation of strncmp for amd64 to Aarch64. It is based on D45839 with added handling for the limit. An extended unit test for strncmp is currently being written to make sure the bounds checks for page crossings work as expected. Performance is significantly better than the existing implementation from the Arm Optimized Routines repository. Benchmark results are generated by the strperf utility by fuz. See the DR for benchmark results. Tested by: fuz (exprun) Reviewed by: fuz, emaste Sponsored by: Google LLC (GSoC 2024) PR: 281175 Differential Revision: https://reviews.freebsd.org/D45943 --- lib/libc/aarch64/string/Makefile.inc | 4 +- lib/libc/aarch64/string/strncmp.S | 569 +++++++++++++++++++++++++++ 2 files changed, 571 insertions(+), 2 deletions(-) create mode 100644 lib/libc/aarch64/string/strncmp.S diff --git a/lib/libc/aarch64/string/Makefile.inc b/lib/libc/aarch64/string/Makefile.inc index 34a84bcfe1331b..351f3424b6d0a6 100644 --- a/lib/libc/aarch64/string/Makefile.inc +++ b/lib/libc/aarch64/string/Makefile.inc @@ -15,7 +15,6 @@ AARCH64_STRING_FUNCS= \ strchrnul \ strcpy \ strlen \ - strncmp \ strnlen \ strrchr @@ -27,7 +26,8 @@ MDSRCS+= \ strpbrk.c \ strsep.c \ strcat.c \ - strlcpy.S + strlcpy.S \ + strncmp.S # # Add the above functions. 
Generate an asm file that includes the needed diff --git a/lib/libc/aarch64/string/strncmp.S b/lib/libc/aarch64/string/strncmp.S new file mode 100644 index 00000000000000..a7f4156da9e8fe --- /dev/null +++ b/lib/libc/aarch64/string/strncmp.S @@ -0,0 +1,569 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2024 Getz Mikalsen +*/ + +#include <machine/asm.h> +#include <machine/param.h> + + .weak strncmp + .set strncmp, __strncmp + .text + +ENTRY(__strncmp) + + bic x8, x0, #0xf // x0 aligned to the boundary + and x9, x0, #0xf // x9 is the offset + bic x10, x1, #0xf // x1 aligned to the boundary + and x11, x1, #0xf // x11 is the offset + + subs x2, x2, #1 + b.lo .Lempty + + mov x13, #-1 // save constants for later + mov x16, #0xf + + /* + * Check if either string is located at end of page to avoid crossing + * into unmapped page. If so, we load 16 bytes from the nearest + * alignment boundary and shift based on the offset. + */ + + add x3, x0, #16 // end of head + add x4, x1, #16 + eor x3, x3, x0 + eor x4, x4, x1 // bits that changed + orr x3, x3, x4 // in either str1 or str2 + cmp x2,#16 + b.lo .Llt16 + tbz w3, #PAGE_SHIFT, .Lbegin + + ldr q0, [x8] // load aligned head + ldr q1, [x10] + + lsl x14, x9, #2 + lsl x15, x11, #2 + lsl x3, x13, x14 // string head + lsl x4, x13, x15 + + cmeq v5.16b, v0.16b, #0 + cmeq v6.16b, v1.16b, #0 + + shrn v5.8b, v5.8h, #4 + shrn v6.8b, v6.8h, #4 + fmov x5, d5 + fmov x6, d6 + + adrp x14, shift_data + add x14, x14, :lo12:shift_data + + /* heads may cross page boundary, avoid unmapped loads */ + tst x5, x3 + b.eq 0f + + ldr q4, [x14, x9] // load permutation table + tbl v0.16b, {v0.16b}, v4.16b + + b 1f + .p2align 4 +0: + ldr q0, [x0] // load true head +1: + tst x6, x4 + b.eq 0f + + ldr q4, [x14, x11] + tbl v4.16b, {v1.16b}, v4.16b + + b 1f + + .p2align 4 +.Lbegin: + ldr q0, [x0] // load true heads +0: + ldr q4, [x1] +1: + cmeq v2.16b, v0.16b, #0 // NUL byte present? + cmeq v4.16b, v0.16b, v4.16b // which bytes match?
+ + orn v2.16b, v2.16b, v4.16b // mismatch or NUL byte? + + shrn v2.8b, v2.8h, #4 + fmov x5, d2 + + cbnz x5, .Lhead_mismatch + /* load head and second chunk */ + ldr q2, [x8, #16] // load second chunk + ldr q3, [x10, #16] + + add x2, x2, x11 + sub x2, x2, #16 + + subs x9, x9, x11 // is a&0xf >= b&0xf + b.lo .Lswapped // if not swap operands + b .Lnormal + + .p2align 4 +.Llt16: + /* + * Check if either string is located at end of page to avoid crossing + * into unmapped page. If so, we load 16 bytes from the nearest + * alignment boundary and shift based on the offset. + */ + tbz w3, #PAGE_SHIFT, 2f + + ldr q0, [x8] // load aligned head + ldr q1, [x10] + + lsl x14, x9, #2 + lsl x15, x11, #2 + lsl x3, x13, x14 // string head + lsl x4, x13, x15 + + /* Introduce a null byte match if the limit is within the aligned chunk */ + add x14, x2, x9 + add x15, x2, x11 + lsl x14, x14, #2 + lsl x15, x15, #2 + lsl x14, x16, x14 + lsl x15, x16, x15 + + cmeq v5.16b, v0.16b, #0 + cmeq v6.16b, v1.16b, #0 + + shrn v5.8b, v5.8h, #4 + shrn v6.8b, v6.8h, #4 + fmov x5, d5 + fmov x6, d6 + + orr x5, x5, x14 // insert match at limit + orr x6, x6, x15 + + adrp x14, shift_data + add x14, x14, :lo12:shift_data + + /* heads may cross page boundary, avoid unmapped loads */ + tst x5, x3 + b.eq 0f + + ldr q4, [x14, x9] // load permutation table + tbl v0.16b, {v0.16b}, v4.16b + + b 1f + .p2align 4 +0: + ldr q0, [x0] // load true head +1: + tst x6, x4 + b.eq 0f + + ldr q4, [x14, x11] + tbl v4.16b, {v1.16b}, v4.16b + + b 1f + + .p2align 4 +2: + ldr q0, [x0] // load true heads +0: + ldr q4, [x1] +1: + + cmeq v2.16b, v0.16b, #0 // NUL byte present? + cmeq v4.16b, v0.16b, v4.16b // which bytes match? + + bic v2.16b, v4.16b, v2.16b // match and not NUL byte + + shrn v2.8b, v2.8h, #4 + fmov x5, d2 + lsl x4, x2, #2 + lsl x4, x13, x4 + orn x5, x4, x5 // mismatch or NUL byte? 
+ +.Lhead_mismatch: + rbit x3, x5 + clz x3, x3 // index of mismatch + lsr x3, x3, #2 + ldrb w4, [x0, x3] + ldrb w5, [x1, x3] + sub w0, w4, w5 + ret + + .p2align 4 +.Lnormal: + sub x12, x10, x9 + ldr q0, [x12, #16]! + sub x10, x10, x8 + sub x11, x10, x9 + + cmeq v1.16b, v3.16b, #0 // NUL present? + cmeq v0.16b, v0.16b, v2.16b // Mismatch between chunks? + shrn v1.8b, v1.8h, #4 + shrn v0.8b, v0.8h, #4 + fmov x6, d1 + fmov x5, d0 + + add x8, x8, #32 // advance to next iteration + + lsl x4, x2, #2 + lsl x4, x13, x4 + orr x3, x6, x4 // introduce a null byte match + cmp x2, #16 // does the buffer end within x2 + csel x6, x3, x6, lo + cbnz x6, .Lnulfound2 // NUL or end of buffer found? + mvn x5, x5 + cbnz x5, .Lmismatch2 + sub x2, x2, #16 + cmp x2, #32 // end of buffer? + b.lo .Ltail + /* + * During the main loop, the layout of the two strings is something like: + * + * v ------1------ v ------2------ v + * X0: AAAAAAAAAAAAABBBBBBBBBBBBBBBB... + * X1: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC... + * + * where v indicates the alignment boundaries and corresponding chunks + * of the strings have the same letters. Chunk A has been checked in + * the previous iteration. This iteration, we first check that string + * X1 doesn't end within region 2, then we compare chunk B between the + * two strings. As X1 is known not to hold a NUL byte in regions 1 + * and 2 at this point, this also ensures that x0 has not ended yet. + */ + .p2align 4 +0: + ldr q0, [x8, x11] + ldr q1, [x8, x10] + ldr q2, [x8] + + cmeq v1.16b, v1.16b, #0 // end of string? + cmeq v0.16b, v0.16b, v2.16b // do the chunks match? + + shrn v1.8b, v1.8h, #4 + shrn v0.8b, v0.8h, #4 + fmov x6, d1 + fmov x5, d0 + cbnz x6, .Lnulfound + mvn x5, x5 // any mismatches? 
+ cbnz x5, .Lmismatch + + add x8, x8, #16 + + /* main loop unrolled twice */ + ldr q0, [x8, x11] + ldr q1, [x8, x10] + ldr q2, [x8] + + add x8, x8, #16 + cmeq v1.16b, v1.16b, #0 + cmeq v0.16b, v0.16b, v2.16b + + shrn v1.8b, v1.8h, #4 + shrn v0.8b, v0.8h, #4 + fmov x6, d1 + fmov x5, d0 + cbnz x6, .Lnulfound2 + mvn x5, x5 + cbnz x5, .Lmismatch2 + sub x2, x2, #32 + cmp x2, #32 // end of buffer? + b.hs 0b // if yes, process tail + + /* end of buffer will occur in next 32 bytes */ +.Ltail: + ldr q0, [x8, x11] + ldr q1, [x8, x10] + ldr q2, [x8] + + cmeq v1.16b, v1.16b, #0 // end of string? + cmeq v0.16b, v0.16b, v2.16b // do the chunks match? + + shrn v1.8b, v1.8h, #4 + shrn v0.8b, v0.8h, #4 + fmov x6, d1 + fmov x5, d0 + + /* + * If x2 <= 16 then we introduce a NUL byte in the + * result from CMEQ to avoid comparing further! + */ + + lsl x4, x2, #2 + lsl x4, x13, x4 + orr x3, x6, x4 // introduce a null byte match + cmp x2, #16 // does the buffer end within x2 + csel x6, x3, x6, lo + + cbnz x6, .Lnulfound // NUL or end of string found + mvn x5, x5 + cbnz x5, .Lmismatch + + add x8, x8, #16 + + /* main loop unrolled twice */ + ldr q0, [x8, x11] + ldr q1, [x8, x10] + ldr q2, [x8] + + add x8, x8, #16 + cmeq v1.16b, v1.16b, #0 + cmeq v0.16b, v0.16b, v2.16b + + shrn v1.8b, v1.8h, #4 + shrn v0.8b, v0.8h, #4 + fmov x6, d1 + fmov x5, d0 + + ubfiz x4, x2, #2, #4 // (x2 - 16) << 2 + lsl x4, x13, x4 // take first half into account + orr x6, x6, x4 // introduce a null byte match + +.Lnulfound2: + sub x8, x8, #16 + +.Lnulfound: + mov x4, x6 + + ubfiz x7, x9, #2, #4 + lsl x6, x6, x7 // adjust NUL mask to indices + + orn x5, x6, x5 + cbnz x5, .Lmismatch + + /* + * (x0) == (x1) and NUL is past the string. + * Compare (x1) with the corresponding part + * of the other string until the NUL byte. 
+ */ + ldr q0, [x8, x9] + ldr q1, [x8, x10] + + cmeq v1.16b, v0.16b, v1.16b + shrn v1.8b, v1.8h, #4 + fmov x5, d1 + + orn x5, x4, x5 + + rbit x3, x5 + clz x3, x3 + lsr x5, x3, #2 + + add x10, x10, x8 // restore x10 pointer + add x8, x8, x9 // point to corresponding chunk + + ldrb w4, [x8, x5] + ldrb w5, [x10, x5] + sub w0, w4, w5 + ret + + .p2align 4 +.Lmismatch2: + sub x8, x8, #16 // roll back second increment +.Lmismatch: + rbit x3, x5 + clz x3, x3 // index of mismatch + lsr x3, x3, #2 + add x11, x8, x11 + + ldrb w4, [x8, x3] + ldrb w5, [x11, x3] + sub w0, w4, w5 // byte difference + ret + + /* + * If (a&0xf) < (b&0xf), we do the same thing but with swapped + * operands. I found that this performs slightly better than + * using conditional moves to do the swap branchless. + */ + .p2align 4 +.Lswapped: + add x12, x8, x9 + ldr q0, [x12, #16]! + sub x8, x8, x10 + add x11, x8, x9 + add x2,x2,x9 + neg x9, x9 + + cmeq v1.16b, v2.16b, #0 + cmeq v0.16b, v0.16b, v3.16b + shrn v1.8b, v1.8h, #4 + shrn v0.8b, v0.8h, #4 + fmov x6, d1 + fmov x5, d0 + + add x10, x10, #32 + + lsl x4, x2, #2 + lsl x4, x13, x4 + orr x3,x6,x4 // introduce a null byte match + cmp x2,#16 + csel x6, x3, x6, lo + cbnz x6, .Lnulfound2s + mvn x5, x5 + cbnz x5, .Lmismatch2s + + sub x2, x2, #16 + cmp x2, #32 + b.lo .Ltails + + /* + * During the main loop, the layout of the two strings is something like: + * + * v ------1------ v ------2------ v + * X1: AAAAAAAAAAAAABBBBBBBBBBBBBBBB... + * X0: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC... + * + * where v indicates the alignment boundaries and corresponding chunks + * of the strings have the same letters. Chunk A has been checked in + * the previous iteration. This iteration, we first check that string + * X0 doesn't end within region 2, then we compare chunk B between the + * two strings. As X0 is known not to hold a NUL byte in regions 1 + * and 2 at this point, this also ensures that X1 has not ended yet. 
+ */ + .p2align 4 +0: + ldr q0, [x10, x11] + ldr q1, [x10, x8] + ldr q2, [x10] + + cmeq v1.16b, v1.16b, #0 + cmeq v0.16b, v0.16b, v2.16b + + shrn v1.8b, v1.8h, #4 + shrn v0.8b, v0.8h, #4 + fmov x6, d1 + fmov x5, d0 + cbnz x6, .Lnulfounds + mvn x5, x5 + cbnz x5, .Lmismatchs + + add x10, x10, #16 + + /* main loop unrolled twice */ + ldr q0, [x10, x11] + ldr q1, [x10, x8] + ldr q2, [x10] + + add x10, x10, #16 + cmeq v1.16b, v1.16b, #0 + cmeq v0.16b, v0.16b, v2.16b + + shrn v1.8b, v1.8h, #4 + shrn v0.8b, v0.8h, #4 + fmov x6, d1 + fmov x5, d0 + cbnz x6, .Lnulfound2s + mvn x5, x5 + cbnz x5, .Lmismatch2s + sub x2, x2, #32 + cmp x2, #32 + b.hs 0b + +.Ltails: + ldr q0, [x10, x11] + ldr q1, [x10, x8] + ldr q2, [x10] + + cmeq v1.16b, v1.16b, #0 + cmeq v0.16b, v0.16b, v2.16b + + shrn v1.8b, v1.8h, #4 + shrn v0.8b, v0.8h, #4 + fmov x6, d1 + fmov x5, d0 + + /* + * If x2 <= 16 then we introduce a NUL byte in the + * result from CMEQ to avoid comparing further! + */ + + lsl x4, x2, #2 + lsl x4, x13, x4 + orr x3, x6, x4 // introduce a null byte match + cmp x2, #16 + csel x6, x3, x6, lo + + cbnz x6, .Lnulfounds + mvn x5, x5 + cbnz x5, .Lmismatchs + + add x10, x10, #16 + + ldr q0, [x10, x11] + ldr q1, [x10, x8] + ldr q2, [x10] + + add x10, x10, #16 + cmeq v1.16b, v1.16b, #0 + cmeq v0.16b, v0.16b, v2.16b + + shrn v1.8b, v1.8h, #4 + shrn v0.8b, v0.8h, #4 + fmov x6, d1 + fmov x5, d0 + + ubfiz x4, x2, #2, #4 + lsl x4, x13, x4 + orr x6, x6, x4 // introduce a null byte match + +.Lnulfound2s: + sub x10, x10, #16 +.Lnulfounds: + mov x4, x6 + + ubfiz x7, x9, #2, #4 + lsl x6, x6, x7 + + orn x5, x6, x5 + + cbnz x5, .Lmismatchs + + ldr q0, [x10, x9] + ldr q1, [x10, x8] + + cmeq v1.16b, v0.16b, v1.16b + shrn v1.8b, v1.8h, #4 + fmov x5, d1 + + orn x5, x4, x5 + + rbit x3, x5 + clz x3, x3 + lsr x5, x3, #2 + + add x11, x10, x8 + add x10, x10, x9 + + ldrb w4, [x10, x5] + ldrb w5, [x11, x5] + sub w0, w5, w4 + ret + + .p2align 4 +.Lmismatch2s: + sub x10, x10, #16 +.Lmismatchs: + rbit x3, x5 + clz x3, x3 
+ lsr x3, x3, #2 + add x11, x10, x11 + + ldrb w4, [x10, x3] + ldrb w5, [x11, x3] + sub w0, w5, w4 + ret + + .p2align 4 +.Lempty: + eor x0, x0, x0 + ret + +END(__strncmp) + + .section .rodata + .p2align 4 +shift_data: + .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + .fill 16, 1, -1 + .size shift_data, .-shift_data From bad17991c06d684e9053938d00a07b962e2fd31c Mon Sep 17 00:00:00 2001 From: Getz Mikalsen Date: Mon, 26 Aug 2024 20:15:13 +0200 Subject: [PATCH 109/143] lib/libc/aarch64/string: add memccpy SIMD implementation This changeset includes a port of the SIMD implementation of memccpy for amd64 to Aarch64. Performance is significantly better than the scalar implementation except for short strings. Benchmark results are as usual generated by the strperf utility written by fuz. See the DR for benchmark results. Tested by: fuz (exprun) Reviewed by: fuz, emaste Sponsored by: Google LLC (GSoC 2024) PR: 281175 Differential Revision: https://reviews.freebsd.org/D46170 --- lib/libc/aarch64/string/Makefile.inc | 3 +- lib/libc/aarch64/string/memccpy.S | 271 +++++++++++++++++++++++++++ 2 files changed, 273 insertions(+), 1 deletion(-) create mode 100644 lib/libc/aarch64/string/memccpy.S diff --git a/lib/libc/aarch64/string/Makefile.inc b/lib/libc/aarch64/string/Makefile.inc index 351f3424b6d0a6..78145a17ab8585 100644 --- a/lib/libc/aarch64/string/Makefile.inc +++ b/lib/libc/aarch64/string/Makefile.inc @@ -27,7 +27,8 @@ MDSRCS+= \ strsep.c \ strcat.c \ strlcpy.S \ - strncmp.S + strncmp.S \ + memccpy.S # # Add the above functions. 
Generate an asm file that includes the needed diff --git a/lib/libc/aarch64/string/memccpy.S b/lib/libc/aarch64/string/memccpy.S new file mode 100644 index 00000000000000..7d9fdb14b84b9d --- /dev/null +++ b/lib/libc/aarch64/string/memccpy.S @@ -0,0 +1,271 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2024 Getz Mikalsen +*/ + +#include + + .weak memccpy + .set memccpy, __memccpy + .text + +ENTRY(__memccpy) + subs x3, x3, #1 + b.lo .L0 + + dup v0.16b, w2 + + mov x9, x0 // stash copy of dst pointer + bic x10, x1, #0xf // src aligned + and x11, x1, #0xf // src offset + + ldr q1, [x10] + cmeq v1.16b, v1.16b, v0.16b // bytewise compare against src char + + mov x8, #-1 // prepare a 0xfff..fff register + mov x6, #0xf + + lsl x12, x11, #2 + lsl x8, x8, x12 // mask of bytes in the string + + shrn v1.8b, v1.8h, #4 + fmov x5, d1 + + sub x12, x11, #32 + adds x12, x12, x3 // distance from alignment boundary - 32 + b.cc .Lrunt // branch if buffer length is 32 or less + + ands x8, x8, x5 + b.eq 0f + + /* match in first chunk */ + rbit x8, x8 + clz x8, x8 // index of match + lsr x8, x8, #2 + + sub x8, x8, x11 // ... from beginning of the string + + add x0, x0, x8 + add x4, x9, x8 // dst + cnt + add x5, x1, x8 // src + cnt + add x0, x0, #1 + + b .L0816 + +0: + ldr q3, [x10, #16] // load second string chunk + ldr q2, [x1] // load true head + cmeq v1.16b, v3.16b, v0.16b // char found in second chunk?
+ + /* process second chunk */ + shrn v1.8b, v1.8h, #4 + fmov x5, d1 + + cbz x5, 0f + + /* match in second chunk */ + rbit x8, x5 + clz x8, x8 // index of match + lsr x8, x8, #2 + + sub x11, x11, #16 + sub x8, x8, x11 // adjust for alignment offset + add x0, x0, x8 // return value + add x0, x0, #1 + + add x4, x9, x8 + add x5, x1, x8 + b .L1732 + +0: + /* string didn't end in second chunk and neither did buffer */ + ldr q1, [x10, #32] // load next string chunk + str q2, [x0] // deposit head into buffer + sub x0, x0, x11 // adjust x0 + mov x3, x12 + str q3, [x0, #16] // deposit second chunk + + add x10, x10, #32 // advance src + add x0, x0, #32 // advance dst + subs x3, x3, #16 // enough left for another round? + b.lo 1f + + /* main loop unrolled twice */ + .p2align 4 +0: + cmeq v2.16b, v1.16b, v0.16b // char found in this chunk? + shrn v2.8b, v2.8h, #4 + fmov x5, d2 + + cbnz x5, 3f + + str q1, [x0] + ldr q1, [x10, #16] // load next chunk + + cmp x3, #16 // more than a full chunk left? + b.lo 2f + + add x10, x10, #32 // advance pointers + add x0, x0, #32 + + cmeq v2.16b, v1.16b, v0.16b // char found in this chunk? + shrn v2.8b, v2.8h, #4 + fmov x5, d2 + cbnz x5, 4f // process chunk if match + + str q1, [x0, #-16] + ldr q1, [x10] // load next chunk + + subs x3, x3, #32 + b.hs 0b + +1: + sub x10, x10, #16 // undo second advancement + add x3, x3, #16 + sub x0, x0, #16 + + /* 1--16 bytes left in the buffer but string has not ended yet */ +2: + cmeq v2.16b, v1.16b, v0.16b // char found in this chunk?
+ shrn v2.8b, v2.8h, #4 + fmov x4, d2 + + lsl x5, x3, #2 // shift 0xf to the limits position + lsl x5, x6, x5 + orr x8, x4, x5 // insert match in mask at limit + + rbit x8, x8 // simulate x86 tzcnt + clz x7, x8 // index of match or limit + lsr x8, x7, #2 + + lsl x5, x6, x7 // simulate x86 bt with shifted 0xf + + add x8, x8, #1 + add x0, x0, x8 + + ldr q1, [x10, x8] // load tail + str q1, [x0] // store tail + + add x0, x0, #16 + + tst x4, x5 // terminator encountered inside buffer? + csel x0, x0, xzr, ne // if yes, return pointer, else NULL + ret + +4: + sub x10, x10, #16 // undo second advancement + sub x0, x0, #16 // undo second advancement + +3: + rbit x8, x5 + clz x8, x8 // index of match + lsr x3, x8, #2 + + add x0, x0, x3 // restore dst pointer + add x10, x10, x3 + ldr q1, [x10, #-15] + str q1, [x0, #-15] + add x0, x0, #1 + ret + +.Lrunt: + add x13, x11, x3 + + mov x7, x5 // keep a copy of original match mask + + lsl x4, x12, #2 // shift 0xf to the limits position + lsl x4, x6, x4 + + cmp x13, #16 // don't induce match if limit >=16 + csel x4, x4, xzr, lo + orr x5, x5, x4 // insert match in mask at limit + + ands x8, x8, x5 // if match always fall through + b.ne 0f + + ldr q4, [x10, #16] // load second string chunk + cmeq v1.16b, v4.16b, v0.16b // char found in second chunk? + + /* process second chunk */ + shrn v1.8b, v1.8h, #4 + fmov x8, d1 + mov x7, x8 + + lsl x4, x12, #2 + lsl x4, x6, x4 + orr x8, x8, x4 // induce match in upper bytes of mask + + rbit x8, x8 + clz x4, x8 // index of match or limit + lsr x8, x4, #2 + add x8, x8, #16 // no match in first chunk + b 1f + +0: + rbit x8, x8 + clz x4, x8 // index of match or limit + lsr x8, x4, #2 +1: + add x0, x0, x8 // return value if terminator found + sub x0, x0, x11 + add x0, x0, #1 + + /* check if we encountered a match or the limit first */ + lsl x5, x6, x4 + ands x7, x7, x5 // was the terminator present?
+ csel x0, xzr, x0, eq // return value based on what we matched + + sub x8, x8, x11 + add x4, x9, x8 // dst + cnt + add x5, x1, x8 // src + cnt + + /* copy 17-32 bytes */ +.L1732: + cmp x8, #16 + b.lo .L0816 + add x5, x5, #1 // ldp offsets are powers of 2 + add x4, x4, #1 + ldp x16, x17, [x1] + ldp x12, x13, [x5, #-16] + stp x16, x17, [x9] + stp x12, x13, [x4, #-16] + ret + + /* Copy 8-16 bytes */ +.L0816: + tbz x8, #3, .L0407 + ldr x16, [x1] + ldr x17, [x5, #-7] + str x16, [x9] + str x17, [x4, #-7] + ret + + /* Copy 4-7 bytes */ + .p2align 4 +.L0407: + cmp x8, #3 + b.lo .L0103 + ldr w16, [x1] + ldr w18, [x5, #-3] + str w16, [x9] + str w18, [x4, #-3] + ret + + /* Copy 1-3 bytes */ + .p2align 4 +.L0103: + lsr x14, x8, #1 + ldrb w16, [x1] + ldrb w15, [x5] + ldrb w18, [x1, x14] + strb w16, [x9] + strb w18, [x9, x14] + strb w15, [x4] + ret + +.L0: + eor x0, x0, x0 + ret + +END(__memccpy) From 3dc5429158cf221374cdbd0bbb728962bff4fb76 Mon Sep 17 00:00:00 2001 From: Getz Mikalsen Date: Mon, 26 Aug 2024 20:15:34 +0200 Subject: [PATCH 110/143] lib/libc/aarch64/string: add strncat SIMD implementation This patch requires D46170 as it depends on memccpy being labeled __memccpy. It's a direct copy from the amd64 string functions. Tested by: fuz (exprun) Reviewed by: fuz, emaste Sponsored by: Google LLC (GSoC 2024) PR: 281175 Differential Revision: https://reviews.freebsd.org/D46292 --- lib/libc/aarch64/string/Makefile.inc | 3 ++- lib/libc/aarch64/string/strncat.c | 29 ++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) create mode 100644 lib/libc/aarch64/string/strncat.c diff --git a/lib/libc/aarch64/string/Makefile.inc b/lib/libc/aarch64/string/Makefile.inc index 78145a17ab8585..876ef4257b4c06 100644 --- a/lib/libc/aarch64/string/Makefile.inc +++ b/lib/libc/aarch64/string/Makefile.inc @@ -28,7 +28,8 @@ MDSRCS+= \ strcat.c \ strlcpy.S \ strncmp.S \ - memccpy.S + memccpy.S \ + strncat.c # # Add the above functions.
Generate an asm file that includes the needed diff --git a/lib/libc/aarch64/string/strncat.c b/lib/libc/aarch64/string/strncat.c new file mode 100644 index 00000000000000..33b278ac5e04cf --- /dev/null +++ b/lib/libc/aarch64/string/strncat.c @@ -0,0 +1,29 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023 Robert Clausecker + */ + +#include + +#include + +void *__memccpy(void *restrict, const void *restrict, int, size_t); + +char * +strncat(char *dest, const char *src, size_t n) +{ + size_t len; + char *endptr; + + len = strlen(dest); + endptr = __memccpy(dest + len, src, '\0', n); + + /* avoid an extra branch */ + if (endptr == NULL) + endptr = dest + len + n + 1; + + endptr[-1] = '\0'; + + return (dest); +} From bea89d038ac54048bb7dcb149cabd99067e5a3a9 Mon Sep 17 00:00:00 2001 From: Getz Mikalsen Date: Mon, 26 Aug 2024 23:10:16 +0200 Subject: [PATCH 111/143] lib/libc/aarch64/string: add strlcat SIMD implementation This patch requires D46243 as it depends on strlcpy being labeled __strlcpy. It's a direct copy from the amd64 string functions using memchr and strlcpy to implement strlcat. Tested by: fuz (exprun) Reviewed by: fuz, emaste Sponsored by: Google LLC (GSoC 2024) PR: 281175 Differential Revision: https://reviews.freebsd.org/D46272 --- lib/libc/aarch64/string/Makefile.inc | 3 ++- lib/libc/aarch64/string/memchr.S | 4 ++++ lib/libc/aarch64/string/strlcat.c | 25 +++++++++++++++++++++++++ 3 files changed, 31 insertions(+), 1 deletion(-) create mode 100644 lib/libc/aarch64/string/memchr.S create mode 100644 lib/libc/aarch64/string/strlcat.c diff --git a/lib/libc/aarch64/string/Makefile.inc b/lib/libc/aarch64/string/Makefile.inc index 876ef4257b4c06..f8c67319fe12ac 100644 --- a/lib/libc/aarch64/string/Makefile.inc +++ b/lib/libc/aarch64/string/Makefile.inc @@ -29,7 +29,8 @@ MDSRCS+= \ strlcpy.S \ strncmp.S \ memccpy.S \ - strncat.c + strncat.c \ + strlcat.c # # Add the above functions. 
Generate an asm file that includes the needed diff --git a/lib/libc/aarch64/string/memchr.S b/lib/libc/aarch64/string/memchr.S new file mode 100644 index 00000000000000..6d4330d9115e9c --- /dev/null +++ b/lib/libc/aarch64/string/memchr.S @@ -0,0 +1,4 @@ + .weak memchr + .set memchr, __memchr_aarch64 + +#include "aarch64/memchr.S" diff --git a/lib/libc/aarch64/string/strlcat.c b/lib/libc/aarch64/string/strlcat.c new file mode 100644 index 00000000000000..c3c996163ade00 --- /dev/null +++ b/lib/libc/aarch64/string/strlcat.c @@ -0,0 +1,25 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023 Robert Clausecker + */ + +#include + +#include + +void *__memchr_aarch64(const void *, int, size_t); +size_t __strlcpy(char *restrict, const char *restrict, size_t); + +size_t +strlcat(char *restrict dst, const char *restrict src, size_t dstsize) +{ + char *loc = __memchr_aarch64(dst, '\0', dstsize); + + if (loc != NULL) { + size_t dstlen = (size_t)(loc - dst); + + return (dstlen + __strlcpy(loc, src, dstsize - dstlen)); + } else + return (dstsize + strlen(src)); +} From 5ebd4d0dd2f45040aa5e5b028a4b93163aea6899 Mon Sep 17 00:00:00 2001 From: Getz Mikalsen Date: Mon, 26 Aug 2024 20:13:44 +0200 Subject: [PATCH 112/143] lib/libc/aarch64/string: add memcpy SIMD implementation I noticed that we have a SIMD optimized memcpy in the arm-optimized-routines in /contrib. This patch ensures we use the SIMD variant as opposed to the Scalar optimized variant. Benchmarks are generated by fuz' strperf utility. See the DR for benchmark results. 
Tested by: fuz (exprun) Reviewed by: fuz, emaste Sponsored by: Google LLC (GSoC 2024) PR: 281175 Differential Revision: https://reviews.freebsd.org/D46251 --- lib/libc/aarch64/string/memcpy.S | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/libc/aarch64/string/memcpy.S b/lib/libc/aarch64/string/memcpy.S index f403dd2e42a8ad..53e860750eb225 100644 --- a/lib/libc/aarch64/string/memcpy.S +++ b/lib/libc/aarch64/string/memcpy.S @@ -1,3 +1,3 @@ -#define __memcpy_aarch64 memcpy -#define __memmove_aarch64 memmove -#include "aarch64/memcpy.S" +#define __memcpy_aarch64_simd memcpy +#define __memmove_aarch64_simd memmove +#include "aarch64/memcpy-advsimd.S" From 3863fec1ce2dc6033f094a085118605ea89db9e2 Mon Sep 17 00:00:00 2001 From: Getz Mikalsen Date: Mon, 26 Aug 2024 21:54:32 +0200 Subject: [PATCH 113/143] lib/libc/aarch64/string: add strlen SIMD implementation Adds a SIMD enhanced strlen for Aarch64. It takes inspiration from the amd64 implementation but I struggled getting the performance I had hoped for on cores like the Graviton3 when compared to the existing implementation from Arm Optimized Routines. See the DR for benchmark results. Tested by: fuz (exprun) Reviewed by: fuz, emaste Sponsored by: Google LLC (GSoC 2024) PR: 281175 Differential Revision: https://reviews.freebsd.org/D45623 --- lib/libc/aarch64/string/Makefile.inc | 4 +-- lib/libc/aarch64/string/strlen.S | 46 ++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 2 deletions(-) create mode 100644 lib/libc/aarch64/string/strlen.S diff --git a/lib/libc/aarch64/string/Makefile.inc b/lib/libc/aarch64/string/Makefile.inc index f8c67319fe12ac..7325b54d9716fc 100644 --- a/lib/libc/aarch64/string/Makefile.inc +++ b/lib/libc/aarch64/string/Makefile.inc @@ -14,7 +14,6 @@ AARCH64_STRING_FUNCS= \ strchr \ strchrnul \ strcpy \ - strlen \ strnlen \ strrchr @@ -30,7 +29,8 @@ MDSRCS+= \ strncmp.S \ memccpy.S \ strncat.c \ - strlcat.c + strlcat.c \ + strlen.S # # Add the above functions.
Generate an asm file that includes the needed diff --git a/lib/libc/aarch64/string/strlen.S b/lib/libc/aarch64/string/strlen.S new file mode 100644 index 00000000000000..7bfac7f4b1e191 --- /dev/null +++ b/lib/libc/aarch64/string/strlen.S @@ -0,0 +1,46 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2024 Getz Mikalsen +*/ + +#include + + .weak strlen + .set strlen, __strlen + .text + +ENTRY(__strlen) + bic x10, x0, #0xf // aligned src + and x9, x0, #0xf + ldr q0, [x10] + cmeq v0.16b, v0.16b, #0 + shrn v0.8b, v0.8h, #4 + fmov x1, d0 + cbz x9, .Laligned + lsl x2, x0, #2 // get the byte offset + lsr x1, x1, x2 // shift by offset index + cbz x1, .Lloop + rbit x1, x1 + clz x0, x1 + lsr x0, x0, #2 + ret + +.Laligned: + cbnz x1, .Ldone + +.Lloop: + ldr q0, [x10, #16]! + cmeq v0.16b, v0.16b, #0 + shrn v0.8b, v0.8h, #4 // reduce to fit mask in GPR + fcmp d0, #0.0 + b.eq .Lloop + fmov x1, d0 +.Ldone: + sub x0, x10, x0 + rbit x1, x1 // reverse bits as NEON has no ctz + clz x3, x1 + lsr x3, x3, #2 + add x0, x0, x3 + ret +END(__strlen) From 79e01e7e643c9337d8d6046b6db7df674475a099 Mon Sep 17 00:00:00 2001 From: Getz Mikalsen Date: Wed, 28 Aug 2024 15:13:45 +0200 Subject: [PATCH 114/143] lib/libc/aarch64/string: add bcopy & bzero wrapper This patch enabled usage of SIMD enhanced functions to implement bcopy and bzero. 
Tested by: fuz (exprun) Reviewed by: fuz, emaste Sponsored by: Google LLC (GSoC 2024) PR: 281175 Differential Revision: https://reviews.freebsd.org/D46459 --- lib/libc/aarch64/string/Makefile.inc | 4 +++- lib/libc/aarch64/string/bcopy.c | 14 ++++++++++++++ lib/libc/aarch64/string/bzero.c | 14 ++++++++++++++ 3 files changed, 31 insertions(+), 1 deletion(-) create mode 100644 lib/libc/aarch64/string/bcopy.c create mode 100644 lib/libc/aarch64/string/bzero.c diff --git a/lib/libc/aarch64/string/Makefile.inc b/lib/libc/aarch64/string/Makefile.inc index 7325b54d9716fc..752cc6d9900b2c 100644 --- a/lib/libc/aarch64/string/Makefile.inc +++ b/lib/libc/aarch64/string/Makefile.inc @@ -30,7 +30,9 @@ MDSRCS+= \ memccpy.S \ strncat.c \ strlcat.c \ - strlen.S + strlen.S \ + bcopy.c \ + bzero.c # # Add the above functions. Generate an asm file that includes the needed diff --git a/lib/libc/aarch64/string/bcopy.c b/lib/libc/aarch64/string/bcopy.c new file mode 100644 index 00000000000000..0dee529fb9dff8 --- /dev/null +++ b/lib/libc/aarch64/string/bcopy.c @@ -0,0 +1,14 @@ +/*- + * Public domain. + */ + +#include + +#undef bcopy /* _FORTIFY_SOURCE */ + +void +bcopy(const void *src, void *dst, size_t len) +{ + + memmove(dst, src, len); +} diff --git a/lib/libc/aarch64/string/bzero.c b/lib/libc/aarch64/string/bzero.c new file mode 100644 index 00000000000000..d82f3061865b9d --- /dev/null +++ b/lib/libc/aarch64/string/bzero.c @@ -0,0 +1,14 @@ +/*- + * Public domain. + */ + +#include + +#undef bzero /* _FORTIFY_SOURCE */ + +void +bzero(void *b, size_t len) +{ + + memset(b, 0, len); +} From ce6af7a49ec7949c70f144f1b461b587ca7efd32 Mon Sep 17 00:00:00 2001 From: Getz Mikalsen Date: Wed, 28 Aug 2024 15:13:55 +0200 Subject: [PATCH 115/143] share/man/man7/simd.7: document SIMD-enhanced aarch64 functions This documents all the newly ported SIMD-enhanced string functions for the aarch64 platform. 
Reviewed by: fuz, emaste Sponsored by: Google LLC (GSoC 2024) Relnotes: yes PR: 281175 Differential Revision: https://reviews.freebsd.org/D46452 --- share/man/man7/simd.7 | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/share/man/man7/simd.7 b/share/man/man7/simd.7 index 877bc77adf4be5..f60aa8ee794d18 100644 --- a/share/man/man7/simd.7 +++ b/share/man/man7/simd.7 @@ -24,7 +24,7 @@ .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE . -.Dd June 7, 2024 +.Dd August 26, 2024 .Dt SIMD 7 .Os .Sh NAME @@ -51,40 +51,40 @@ can be used to override this mechanism. Enhanced functions are present for the following architectures: .Bl -column FUNCTION_________ aarch64_ arm_ amd64_ i386_ ppc64_ -offset indent .It Em FUNCTION Ta Em AARCH64 Ta Em ARM Ta Em AMD64 Ta Em I386 Ta Em PPC64 -.It bcmp Ta Ta Ta S1 Ta S -.It bcopy Ta Ta S Ta S Ta S Ta SV -.It bzero Ta Ta S Ta S Ta S +.It bcmp Ta A Ta Ta S1 Ta S +.It bcopy Ta A Ta S Ta S Ta S Ta SV +.It bzero Ta A Ta S Ta S Ta S .It div Ta Ta Ta S Ta S .It index Ta A Ta Ta S1 .It ldiv Ta Ta Ta S Ta S .It lldiv Ta Ta Ta S .It memchr Ta A Ta Ta S1 .It memcmp Ta A Ta S Ta S1 Ta S -.It memccpy Ta Ta Ta S1 -.It memcpy Ta S Ta S Ta S Ta S Ta SV -.It memmove Ta S Ta S Ta S Ta S Ta SV +.It memccpy Ta A Ta Ta S1 +.It memcpy Ta A Ta S Ta S Ta S Ta SV +.It memmove Ta A Ta S Ta S Ta S Ta SV .It memrchr Ta A Ta Ta S1 .It memset Ta A Ta S Ta S Ta S .It rindex Ta A Ta Ta S1 Ta S .It stpcpy Ta A Ta Ta S1 .It stpncpy Ta Ta Ta S1 -.It strcat Ta Ta Ta S1 Ta S +.It strcat Ta A Ta Ta S1 Ta S .It strchr Ta A Ta Ta S1 Ta S .It strchrnul Ta A Ta Ta S1 -.It strcmp Ta S Ta S Ta S1 Ta S +.It strcmp Ta A Ta S Ta S1 Ta S .It strcpy Ta A Ta Ta S1 Ta S Ta S2 -.It strcspn Ta Ta Ta S2 -.It strlcat Ta Ta Ta S1 -.It strlcpy Ta Ta Ta S1 +.It strcspn Ta S Ta Ta S2 +.It strlcat Ta A Ta Ta S1 +.It strlcpy Ta A Ta Ta S1 .It strlen Ta A Ta S Ta S1 -.It strncat Ta Ta Ta S1 -.It strncmp Ta S 
Ta S Ta S1 Ta S +.It strncat Ta A Ta Ta S1 +.It strncmp Ta A Ta S Ta S1 Ta S .It strncpy Ta Ta Ta S1 Ta Ta S2 .It strnlen Ta A Ta Ta S1 .It strrchr Ta A Ta Ta S1 Ta S -.It strpbrk Ta Ta Ta S2 -.It strsep Ta Ta Ta S2 -.It strspn Ta Ta Ta S2 +.It strpbrk Ta S Ta Ta S2 +.It strsep Ta S Ta Ta S2 +.It strspn Ta S Ta Ta S2 .It swab Ta Ta Ta Ta S .It timingsafe_bcmp Ta Ta Ta S1 .It timingsafe_memcmp Ta Ta Ta S From f2c98669fc1b3fd2dbc7a7e3eedd098970a10dec Mon Sep 17 00:00:00 2001 From: Robert Clausecker Date: Mon, 9 Dec 2024 10:49:49 +0100 Subject: [PATCH 116/143] lib/libc/aarch64/string: add ASIMD-enhanced timingsafe_bcmp implementation A straightforward port of the amd64 implementation. Approved by: security (cperciva) Reviewed by: getz, cperciva Event: EuroBSDcon 2024 Differential Revision: https://reviews.freebsd.org/D46757 --- lib/libc/aarch64/string/Makefile.inc | 1 + lib/libc/aarch64/string/timingsafe_bcmp.S | 113 ++++++++++++++++++++++ 2 files changed, 114 insertions(+) create mode 100644 lib/libc/aarch64/string/timingsafe_bcmp.S diff --git a/lib/libc/aarch64/string/Makefile.inc b/lib/libc/aarch64/string/Makefile.inc index 752cc6d9900b2c..8019ab4adafc34 100644 --- a/lib/libc/aarch64/string/Makefile.inc +++ b/lib/libc/aarch64/string/Makefile.inc @@ -31,6 +31,7 @@ MDSRCS+= \ strncat.c \ strlcat.c \ strlen.S \ + timingsafe_bcmp.S \ bcopy.c \ bzero.c diff --git a/lib/libc/aarch64/string/timingsafe_bcmp.S b/lib/libc/aarch64/string/timingsafe_bcmp.S new file mode 100644 index 00000000000000..baa5c6f0940cb0 --- /dev/null +++ b/lib/libc/aarch64/string/timingsafe_bcmp.S @@ -0,0 +1,113 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2024 Robert Clausecker + */ + +#include + +ENTRY(timingsafe_bcmp) + cmp x2, #32 // at least 33 bytes to process? + bhi .Lgt32 + + cmp x2, #16 // at least 17 bytes to process? + bhi .L1732 + + cmp x2, #8 // at least 9 bytes to process? + bhi .L0916 + + cmp x2, #4 // at least 5 bytes to process? 
+ bhi .L0508 + + cmp x2, #2 // at least 3 bytes to process? + bhi .L0304 + + cbnz x2, .L0102 // buffer empty? + + mov w0, #0 // empty buffer always matches + ret + +.L0102: ldrb w3, [x0] // load first bytes + ldrb w4, [x1] + sub x2, x2, #1 + ldrb w5, [x0, x2] // load last bytes + ldrb w6, [x1, x2] + eor w3, w3, w4 + eor w5, w5, w6 + orr w0, w3, w5 + ret + +.L0304: ldrh w3, [x0] // load first halfwords + ldrh w4, [x1] + sub x2, x2, #2 + ldrh w5, [x0, x2] // load last halfwords + ldrh w6, [x1, x2] + eor w3, w3, w4 + eor w5, w5, w6 + orr w0, w3, w5 + ret + +.L0508: ldr w3, [x0] // load first words + ldr w4, [x1] + sub x2, x2, #4 + ldr w5, [x0, x2] // load last words + ldr w6, [x1, x2] + eor w3, w3, w4 + eor w5, w5, w6 + orr w0, w3, w5 + ret + +.L0916: ldr x3, [x0] + ldr x4, [x1] + sub x2, x2, #8 + ldr x5, [x0, x2] + ldr x6, [x1, x2] + eor x3, x3, x4 + eor x5, x5, x6 + orr x0, x3, x5 + orr x0, x0, x0, lsr #32 // ensure low 32 bits are nonzero iff mismatch + ret + +.L1732: ldr q0, [x0] + ldr q1, [x1] + sub x2, x2, #16 + ldr q2, [x0, x2] + ldr q3, [x1, x2] + eor v0.16b, v0.16b, v1.16b + eor v2.16b, v2.16b, v3.16b + orr v0.16b, v0.16b, v2.16b + umaxv s0, v0.4s // get a nonzero word if any + mov w0, v0.s[0] + ret + + /* more than 32 bytes: process buffer in a loop */ +.Lgt32: ldp q0, q1, [x0], #32 + ldp q2, q3, [x1], #32 + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v3.16b + orr v4.16b, v0.16b, v1.16b + subs x2, x2, #64 // enough left for another iteration? 
+ bls .Ltail + +0: ldp q0, q1, [x0], #32 + ldp q2, q3, [x1], #32 + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v3.16b + orr v0.16b, v0.16b, v1.16b + orr v4.16b, v4.16b, v0.16b + subs x2, x2, #32 + bhi 0b + + /* process last 32 bytes */ +.Ltail: add x0, x0, x2 // point to the last 32 bytes in the buffer + add x1, x1, x2 + ldp q0, q1, [x0] + ldp q2, q3, [x1] + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v3.16b + orr v0.16b, v0.16b, v1.16b + orr v4.16b, v4.16b, v0.16b + umaxv s0, v4.4s // get a nonzero word if any + mov w0, v0.s[0] + ret +END(timingsafe_bcmp) From 3f224333af163d5fcd7547a20993dcf18f19076c Mon Sep 17 00:00:00 2001 From: Robert Clausecker Date: Mon, 9 Dec 2024 10:50:00 +0100 Subject: [PATCH 117/143] lib/libc/aarch64/string: add timingsafe_memcmp() assembly implementation A port of the amd64 implementation with some slight changes due to differences in instructions provided by aarch64. No ASIMD for the same reason as the amd64 code: it's just not particularly suitable for this application. 
Event: EuroBSDcon 2024 Approved by: security (cperciva) Reviewed by: getz, cperciva Differential Revision: https://reviews.freebsd.org/D46758 --- lib/libc/aarch64/string/Makefile.inc | 1 + lib/libc/aarch64/string/timingsafe_memcmp.S | 117 ++++++++++++++++++++ 2 files changed, 118 insertions(+) create mode 100644 lib/libc/aarch64/string/timingsafe_memcmp.S diff --git a/lib/libc/aarch64/string/Makefile.inc b/lib/libc/aarch64/string/Makefile.inc index 8019ab4adafc34..9574aad9593323 100644 --- a/lib/libc/aarch64/string/Makefile.inc +++ b/lib/libc/aarch64/string/Makefile.inc @@ -32,6 +32,7 @@ MDSRCS+= \ strlcat.c \ strlen.S \ timingsafe_bcmp.S \ + timingsafe_memcmp.S \ bcopy.c \ bzero.c diff --git a/lib/libc/aarch64/string/timingsafe_memcmp.S b/lib/libc/aarch64/string/timingsafe_memcmp.S new file mode 100644 index 00000000000000..28fdd911a3875b --- /dev/null +++ b/lib/libc/aarch64/string/timingsafe_memcmp.S @@ -0,0 +1,117 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2024 Robert Clausecker + */ + +#include + +ENTRY(timingsafe_memcmp) + cmp x2, #16 // at least 17 bytes to process? + bhi .Lgt16 + + cmp x2, #8 // at least 9 bytes to process? + bhi .L0916 + + cmp x2, #4 // at least 5 bytes to process? + bhi .L0508 + + cmp x2, #2 // at least 3 bytes to process? + bhi .L0304 + + cbnz x2, .L0102 // buffer empty? + + mov w0, #0 // empty buffer always matches + ret + +.L0102: ldrb w3, [x0] // load first bytes + ldrb w4, [x1] + sub x2, x2, #1 + ldrb w5, [x0, x2] // load last bytes + ldrb w6, [x1, x2] + bfi w5, w3, #8, #8 // join bytes in big endian + bfi w6, w4, #8, #8 + sub w0, w5, w6 + ret + + +.L0304: ldrh w3, [x0] // load first halfwords + ldrh w4, [x1] + sub x2, x2, #2 + ldrh w5, [x0, x2] // load last halfwords + ldrh w6, [x1, x2] + bfi w3, w5, #16, #16 // join halfwords in little endian + bfi w4, w6, #16, #16 + rev w3, w3 // swap byte order + rev w4, w4 + cmp w3, w4 + csetm w0, lo // w0 = w3 >= w4 ? 0 : -1 + csinc w0, w0, wzr, ls // w0 = w3 <=> w4 ?
1 : 0 : -1 + ret + +.L0508: ldr w3, [x0] // load first words + ldr w4, [x1] + sub x2, x2, #4 + ldr w5, [x0, x2] // load last words + ldr w6, [x1, x2] + bfi x3, x5, #32, #32 // join words in little endian + bfi x4, x6, #32, #32 + rev x3, x3 // swap byte order + rev x4, x4 + cmp x3, x4 + csetm w0, lo // w0 = x3 >= x4 ? 0 : -1 + csinc w0, w0, wzr, ls // w0 = x3 <=> x4 ? 1 : 0 : -1 + ret + +.L0916: ldr x3, [x0] + ldr x4, [x1] + sub x2, x2, #8 + ldr x5, [x0, x2] + ldr x6, [x1, x2] + cmp x3, x4 // mismatch in first pair? + csel x3, x3, x5, ne // use second pair if first pair equal + csel x4, x4, x6, ne + rev x3, x3 + rev x4, x4 + cmp x3, x4 + csetm w0, lo + csinc w0, w0, wzr, ls + ret + + /* more than 16 bytes: process buffer in a loop */ +.Lgt16: ldp x3, x4, [x0], #16 + ldp x5, x6, [x1], #16 + cmp x3, x5 // mismatch in first pair? + csel x3, x3, x4, ne // use second pair if first pair equal + csel x5, x5, x6, ne + subs x2, x2, #32 + bls .Ltail + +0: ldp x4, x7, [x0], #16 + ldp x6, x8, [x1], #16 + cmp x4, x6 // mismatch in first pair? + csel x4, x4, x7, ne // if not, try second pair + csel x6, x6, x8, ne + cmp x3, x5 // was there a mismatch previously? + csel x3, x3, x4, ne // apply new pair if there was not + csel x5, x5, x6, ne + subs x2, x2, #16 + bhi 0b + +.Ltail: add x0, x0, x2 // point to the last 16 bytes in the buffer + add x1, x1, x2 + ldp x4, x7, [x0] + ldp x6, x8, [x1] + cmp x4, x6 // mismatch in first pair? + csel x4, x4, x7, ne // if not, try second pair + csel x6, x6, x8, ne + cmp x3, x5 // was there a mismatch previously?
+ csel x3, x3, x4, ne // apply new pair if there was not + csel x5, x5, x6, ne + rev x3, x3 + rev x5, x5 + cmp x3, x5 + csetm w0, lo + csinc w0, w0, wzr, ls + ret +END(timingsafe_memcmp) From c15b847b183bf836148caa1a1dc10d5d86507d09 Mon Sep 17 00:00:00 2001 From: Robert Clausecker Date: Mon, 18 Nov 2024 14:44:47 +0100 Subject: [PATCH 118/143] share/man/man7/simd.7: document SIMD-enhanced timingsafe_{b,mem}cmp See also: D46758, D46757 Event: EuroBSDcon 2024 Relnotes: yes --- share/man/man7/simd.7 | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/share/man/man7/simd.7 b/share/man/man7/simd.7 index f60aa8ee794d18..d5092348d9b396 100644 --- a/share/man/man7/simd.7 +++ b/share/man/man7/simd.7 @@ -24,7 +24,7 @@ .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE . -.Dd August 26, 2024 +.Dd November 18, 2024 .Dt SIMD 7 .Os .Sh NAME @@ -86,8 +86,8 @@ Enhanced functions are present for the following architectures: .It strsep Ta S Ta Ta S2 .It strspn Ta S Ta Ta S2 .It swab Ta Ta Ta Ta S -.It timingsafe_bcmp Ta Ta Ta S1 -.It timingsafe_memcmp Ta Ta Ta S +.It timingsafe_bcmp Ta A Ta Ta S1 +.It timingsafe_memcmp Ta S Ta Ta S .It wcschr Ta Ta Ta Ta S .It wcscmp Ta Ta Ta Ta S .It wcslen Ta Ta Ta Ta S From 6b82130e6c9add4a8892ca897df5a0ec04663ea2 Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Fri, 10 Jan 2025 15:37:07 +0000 Subject: [PATCH 119/143] clock: Add a long ticks variable, ticksl For compatibility with Linux, it's useful to have a tick counter of width sizeof(long), but our tick counter is an int. Currently the linuxkpi tries to paper over this difference, but this cannot really be done reliably, so it's desirable to have a wider tick counter. This change introduces ticksl, keeping the existing ticks variable.
Follow a suggestion from kib to avoid having to maintain two separate counters and to avoid converting existing code to use ticksl: change hardclock() to update ticksl instead of ticks, and then use assembler directives to make ticks and ticksl overlap such that loading ticks gives the bottom 32 bits. This makes it possible to use ticksl in the linuxkpi without having to convert any native code, and without making hardclock() more complicated or expensive. Then, the linuxkpi can be modified to use ticksl instead of ticks. Reviewed by: olce, kib, emaste MFC after: 1 month Differential Revision: https://reviews.freebsd.org/D48383 --- sys/conf/files | 1 + sys/kern/kern_clock.c | 26 +++++++++++++------------ sys/kern/kern_tc.c | 4 ++-- sys/kern/subr_param.c | 2 +- sys/kern/subr_ticks.s | 44 +++++++++++++++++++++++++++++++++++++++++++ sys/sys/kernel.h | 9 +++++++++ sys/sys/timetc.h | 2 +- 7 files changed, 72 insertions(+), 16 deletions(-) create mode 100644 sys/kern/subr_ticks.s diff --git a/sys/conf/files b/sys/conf/files index d358737c561320..a630d9dd72bc57 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -3932,6 +3932,7 @@ kern/subr_stack.c optional ddb | stack | ktr kern/subr_stats.c optional stats kern/subr_taskqueue.c standard kern/subr_terminal.c optional vt +kern/subr_ticks.s standard kern/subr_trap.c standard kern/subr_turnstile.c standard kern/subr_uio.c standard diff --git a/sys/kern/kern_clock.c b/sys/kern/kern_clock.c index 6fa2272ed54a93..b11c0d235139e4 100644 --- a/sys/kern/kern_clock.c +++ b/sys/kern/kern_clock.c @@ -323,7 +323,7 @@ read_cpu_time(long *cp_time) #include -static int watchdog_ticks; +static long watchdog_ticks; static int watchdog_enabled; static void watchdog_fire(void); static void watchdog_config(void *, u_int, int *); @@ -369,10 +369,9 @@ watchdog_attach(void) int stathz; int profhz; int profprocs; -volatile int ticks; int psratio; -DPCPU_DEFINE_STATIC(int, pcputicks); /* Per-CPU version of ticks. 
*/ +DPCPU_DEFINE_STATIC(long, pcputicks); /* Per-CPU version of ticks. */ #ifdef DEVICE_POLLING static int devpoll_run = 0; #endif @@ -480,14 +479,14 @@ hardclock(int cnt, int usermode) struct pstats *pstats; struct thread *td = curthread; struct proc *p = td->td_proc; - int *t = DPCPU_PTR(pcputicks); - int global, i, newticks; + long global, newticks, *t; /* * Update per-CPU and possibly global ticks values. */ + t = DPCPU_PTR(pcputicks); *t += cnt; - global = ticks; + global = atomic_load_long(&ticksl); do { newticks = *t - global; if (newticks <= 0) { @@ -496,7 +495,7 @@ hardclock(int cnt, int usermode) newticks = 0; break; } - } while (!atomic_fcmpset_int(&ticks, &global, *t)); + } while (!atomic_fcmpset_long(&ticksl, &global, *t)); /* * Run current process's virtual and profile time, as needed. @@ -525,8 +524,10 @@ hardclock(int cnt, int usermode) } #endif /* DEVICE_POLLING */ if (watchdog_enabled > 0) { - i = atomic_fetchadd_int(&watchdog_ticks, -newticks); - if (i > 0 && i <= newticks) + long left; + + left = atomic_fetchadd_long(&watchdog_ticks, -newticks); + if (left > 0 && left <= newticks) watchdog_fire(); } intr_event_handle(clk_intr_event, NULL); @@ -540,11 +541,12 @@ hardclock(int cnt, int usermode) void hardclock_sync(int cpu) { - int *t; + long *t; + KASSERT(!CPU_ABSENT(cpu), ("Absent CPU %d", cpu)); - t = DPCPU_ID_PTR(cpu, pcputicks); - *t = ticks; + t = DPCPU_ID_PTR(cpu, pcputicks); + *t = ticksl; } /* diff --git a/sys/kern/kern_tc.c b/sys/kern/kern_tc.c index 26f09cb602603d..a797a101bf6f3a 100644 --- a/sys/kern/kern_tc.c +++ b/sys/kern/kern_tc.c @@ -1916,9 +1916,9 @@ SYSCTL_INT(_kern_timecounter, OID_AUTO, tick, CTLFLAG_RD, &tc_tick, 0, "Approximate number of hardclock ticks in a millisecond"); void -tc_ticktock(int cnt) +tc_ticktock(long cnt) { - static int count; + static long count; if (mtx_trylock_spin(&tc_setclock_mtx)) { count += cnt; diff --git a/sys/kern/subr_param.c b/sys/kern/subr_param.c index 19169ba63061a3..f4359efec46687 100644 --- 
a/sys/kern/subr_param.c +++ b/sys/kern/subr_param.c @@ -197,7 +197,7 @@ init_param1(void) * Arrange for ticks to wrap 10 minutes after boot to help catch * sign problems sooner. */ - ticks = INT_MAX - (hz * 10 * 60); + ticksl = INT_MAX - (hz * 10 * 60); vn_lock_pair_pause_max = hz / 100; if (vn_lock_pair_pause_max == 0) diff --git a/sys/kern/subr_ticks.s b/sys/kern/subr_ticks.s new file mode 100644 index 00000000000000..6565ba42413783 --- /dev/null +++ b/sys/kern/subr_ticks.s @@ -0,0 +1,44 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2025 Mark Johnston + */ + +/* + * Define the "ticks" and "ticksl" variables. The former is overlaid onto the + * low bits of the latter. + */ + +#if defined(__aarch64__) +#include +#include + +GNU_PROPERTY_AARCH64_FEATURE_1_NOTE(GNU_PROPERTY_AARCH64_FEATURE_1_VAL) +#endif + +#ifdef _ILP32 +#define SIZEOF_TICKSL 4 +#define TICKSL_INIT .long 0 +#else +#define SIZEOF_TICKSL 8 +#define TICKSL_INIT .quad 0 +#endif + +#if defined(_ILP32) || __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define TICKS_OFFSET 0 +#else +#define TICKS_OFFSET 4 +#endif + + .data + + .global ticksl + .type ticksl, %object + .align SIZEOF_TICKSL +ticksl: TICKSL_INIT + .size ticksl, SIZEOF_TICKSL + + .global ticks + .type ticks, %object +ticks =ticksl + TICKS_OFFSET + .size ticks, 4 diff --git a/sys/sys/kernel.h b/sys/sys/kernel.h index 3144b5a50c9857..3800990921076f 100644 --- a/sys/sys/kernel.h +++ b/sys/sys/kernel.h @@ -65,7 +65,16 @@ extern int psratio; /* ratio: prof / stat */ extern int stathz; /* statistics clock's frequency */ extern int profhz; /* profiling clock's frequency */ extern int profprocs; /* number of process's profiling */ + +/* + * The ticks and ticksl symbols overlap, giving a 64-bit tick counter on 64-bit + * platforms while still maintaining compatibility with the legacy 32-bit + * counter. 
Either value can be used, but rollover must be handled; at 1000Hz, + * ticks (and ticksl on 32-bit platforms) roll over roughly every 25 days. On + * 64-bit platforms, ticksl will not roll over in the foreseeable future. + */ extern volatile int ticks; +extern volatile long ticksl; #endif /* _KERNEL */ diff --git a/sys/sys/timetc.h b/sys/sys/timetc.h index 1d9b18620e96c5..52277086842576 100644 --- a/sys/sys/timetc.h +++ b/sys/sys/timetc.h @@ -87,7 +87,7 @@ extern int tc_min_ticktock_freq; /* u_int64_t tc_getfrequency(void); void tc_init(struct timecounter *tc); void tc_setclock(struct timespec *ts); -void tc_ticktock(int cnt); +void tc_ticktock(long cnt); void cpu_tick_calibration(void); #ifdef SYSCTL_DECL From 9eb30ef4b7a0ca1ef7bcc871b6391d98b00c259f Mon Sep 17 00:00:00 2001 From: Mitchell Horne Date: Fri, 10 Jan 2025 13:57:36 -0400 Subject: [PATCH 120/143] riscv: enable Allwinner D1 USB drivers Add the generic USB drivers and FDT glue to the build. Make small tweaks to the aw_usbphy and aw_musb drivers for the Allwinner D1. 
Reviewed by: manu Sponsored by: The FreeBSD Foundation Differential Revision: https://reviews.freebsd.org/D48126 --- sys/arm/allwinner/aw_usbphy.c | 9 +++++++++ sys/conf/files.riscv | 6 ++++++ sys/dev/usb/controller/musb_otg_allwinner.c | 5 +++-- sys/riscv/allwinner/files.allwinner | 3 +++ sys/riscv/conf/std.allwinner | 3 +++ 5 files changed, 24 insertions(+), 2 deletions(-) diff --git a/sys/arm/allwinner/aw_usbphy.c b/sys/arm/allwinner/aw_usbphy.c index b0ef7d9da0a98c..97c3d220777320 100644 --- a/sys/arm/allwinner/aw_usbphy.c +++ b/sys/arm/allwinner/aw_usbphy.c @@ -56,6 +56,7 @@ enum awusbphy_type { AWUSBPHY_TYPE_A64, AWUSBPHY_TYPE_A83T, AWUSBPHY_TYPE_H6, + AWUSBPHY_TYPE_D1, }; struct aw_usbphy_conf { @@ -121,6 +122,13 @@ static const struct aw_usbphy_conf h6_usbphy_conf = { .phy0_route = true, }; +static const struct aw_usbphy_conf d1_usbphy_conf = { + .num_phys = 2, + .phy_type = AWUSBPHY_TYPE_D1, + .pmu_unk1 = true, + .phy0_route = true, +}; + static struct ofw_compat_data compat_data[] = { { "allwinner,sun4i-a10-usb-phy", (uintptr_t)&a10_usbphy_conf }, { "allwinner,sun5i-a13-usb-phy", (uintptr_t)&a13_usbphy_conf }, @@ -130,6 +138,7 @@ static struct ofw_compat_data compat_data[] = { { "allwinner,sun50i-a64-usb-phy", (uintptr_t)&a64_usbphy_conf }, { "allwinner,sun8i-a83t-usb-phy", (uintptr_t)&a83t_usbphy_conf }, { "allwinner,sun50i-h6-usb-phy", (uintptr_t)&h6_usbphy_conf }, + { "allwinner,sun20i-d1-usb-phy", (uintptr_t)&d1_usbphy_conf }, { NULL, 0 } }; diff --git a/sys/conf/files.riscv b/sys/conf/files.riscv index 534fe5013c568c..514c955181c38e 100644 --- a/sys/conf/files.riscv +++ b/sys/conf/files.riscv @@ -12,6 +12,12 @@ dev/pci/pci_host_generic.c optional pci dev/pci/pci_host_generic_fdt.c optional pci fdt dev/uart/uart_cpu_fdt.c optional uart fdt dev/uart/uart_dev_lowrisc.c optional uart_lowrisc +dev/usb/controller/generic_ehci.c optional ehci +dev/usb/controller/generic_ehci_fdt.c optional ehci fdt +dev/usb/controller/generic_ohci.c optional ohci fdt 
+dev/usb/controller/generic_usb_if.m optional ohci fdt +dev/usb/controller/generic_xhci.c optional xhci +dev/usb/controller/generic_xhci_fdt.c optional xhci fdt dev/vmm/vmm_dev.c optional vmm dev/vmm/vmm_stat.c optional vmm dev/xilinx/axi_quad_spi.c optional xilinx_spi diff --git a/sys/dev/usb/controller/musb_otg_allwinner.c b/sys/dev/usb/controller/musb_otg_allwinner.c index 574e8e712713ca..781b4d7e33fae9 100644 --- a/sys/dev/usb/controller/musb_otg_allwinner.c +++ b/sys/dev/usb/controller/musb_otg_allwinner.c @@ -77,7 +77,7 @@ #if defined(__arm__) #define bs_parent_space(bs) ((bs)->bs_parent) typedef bus_space_tag_t awusb_bs_tag; -#elif defined(__aarch64__) +#elif defined(__aarch64__) || defined(__riscv) #define bs_parent_space(bs) (bs) typedef void * awusb_bs_tag; #endif @@ -89,6 +89,7 @@ static struct ofw_compat_data compat_data[] = { { "allwinner,sun6i-a31-musb", AWUSB_OKAY }, { "allwinner,sun8i-a33-musb", AWUSB_OKAY | AWUSB_NO_CONFDATA }, { "allwinner,sun8i-h3-musb", AWUSB_OKAY | AWUSB_NO_CONFDATA }, + { "allwinner,sun20i-d1-musb", AWUSB_OKAY | AWUSB_NO_CONFDATA }, { NULL, 0 } }; @@ -474,7 +475,7 @@ awusbdrd_attach(device_t dev) #if defined(__arm__) sc->bs.bs_parent = rman_get_bustag(sc->res[0]); -#elif defined(__aarch64__) +#elif defined(__aarch64__) || defined(__riscv) sc->bs.bs_cookie = rman_get_bustag(sc->res[0]); #endif diff --git a/sys/riscv/allwinner/files.allwinner b/sys/riscv/allwinner/files.allwinner index f55d883abf57b9..a87d79dfda2a10 100644 --- a/sys/riscv/allwinner/files.allwinner +++ b/sys/riscv/allwinner/files.allwinner @@ -3,6 +3,7 @@ arm/allwinner/aw_gpio.c optional gpio aw_gpio fdt arm/allwinner/aw_rtc.c optional aw_rtc fdt arm/allwinner/aw_syscon.c optional syscon arm/allwinner/aw_sid.c optional aw_sid nvmem +arm/allwinner/aw_usbphy.c optional ehci aw_usbphy fdt arm/allwinner/aw_wdog.c optional aw_wdog arm/allwinner/if_awg.c optional awg syscon @@ -18,4 +19,6 @@ dev/clk/allwinner/aw_clk_np.c optional aw_ccu fdt 
dev/clk/allwinner/aw_clk_prediv_mux.c optional aw_ccu fdt dev/clk/allwinner/ccu_d1.c optional soc_allwinner_d1 aw_ccu fdt +dev/usb/controller/musb_otg_allwinner.c optional musb fdt + riscv/allwinner/d1_padconf.c optional soc_allwinner_d1 aw_gpio fdt diff --git a/sys/riscv/conf/std.allwinner b/sys/riscv/conf/std.allwinner index a888e4d5d370c0..5e7a6c0e0a52d3 100644 --- a/sys/riscv/conf/std.allwinner +++ b/sys/riscv/conf/std.allwinner @@ -9,7 +9,10 @@ device aw_ccu # Allwinner clock controller device aw_gpio # Allwinner GPIO controller device aw_rtc # Allwinner Real-time Clock device aw_sid # Allwinner Secure ID EFUSE +device aw_usbphy # Allwinner USB PHY device aw_wdog # Allwinner Watchdog device awg # Allwinner EMAC Gigabit Ethernet +device musb # Mentor Graphics USB OTG controller + files "../allwinner/files.allwinner" From aa766e2a03f0eb2fb6272828865c83a807b81cf1 Mon Sep 17 00:00:00 2001 From: Mitchell Horne Date: Fri, 10 Jan 2025 14:46:01 -0400 Subject: [PATCH 121/143] ofw_cpu: fix __riscv preprocessor check The canonical name is __riscv, not __riscv__. Newer compilers no longer emit the latter. This re-enables finding the nominal frequency from the CPU's clock. I checked, and there are no remaining mistakes like this in the tree. 
Reviewed by: jrtc27, imp, jhb Sponsored by: The FreeBSD Foundation Differential Revision: https://reviews.freebsd.org/D48122 --- sys/dev/ofw/ofw_cpu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sys/dev/ofw/ofw_cpu.c b/sys/dev/ofw/ofw_cpu.c index ad0fd670421adc..cbca8caee1869b 100644 --- a/sys/dev/ofw/ofw_cpu.c +++ b/sys/dev/ofw/ofw_cpu.c @@ -42,7 +42,7 @@ #include #include -#if defined(__arm__) || defined(__arm64__) || defined(__riscv__) +#if defined(__arm__) || defined(__arm64__) || defined(__riscv) #include #endif @@ -206,7 +206,7 @@ ofw_cpu_attach(device_t dev) phandle_t node; pcell_t cell; int rv; -#if defined(__arm__) || defined(__arm64__) || defined(__riscv__) +#if defined(__arm__) || defined(__arm64__) || defined(__riscv) clk_t cpuclk; uint64_t freq; #endif @@ -276,7 +276,7 @@ ofw_cpu_attach(device_t dev) sc->sc_cpu_pcpu = pcpu_find(device_get_unit(dev)); if (OF_getencprop(node, "clock-frequency", &cell, sizeof(cell)) < 0) { -#if defined(__arm__) || defined(__arm64__) || defined(__riscv__) +#if defined(__arm__) || defined(__arm64__) || defined(__riscv) rv = clk_get_by_ofw_index(dev, 0, 0, &cpuclk); if (rv == 0) { rv = clk_get_freq(cpuclk, &freq); From 4ea0c3f04f42119dff92317c0e4cef52350ed9db Mon Sep 17 00:00:00 2001 From: Mitchell Horne Date: Fri, 10 Jan 2025 14:46:43 -0400 Subject: [PATCH 122/143] ofw_cpu: collapse some #ifdef code Mainly, to avoid repeating the list of architectures, #define HAS_CLK. Further, split the clk code into a helper function, which is a stub in the !HAS_CLK case. This aids in overall legibility. While here, add one separating whitespace, again for legibility. 
Reviewed by: jhb Sponsored by: The FreeBSD Foundation Differential Revision: https://reviews.freebsd.org/D48149 --- sys/dev/ofw/ofw_cpu.c | 43 +++++++++++++++++++++++++++---------------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/sys/dev/ofw/ofw_cpu.c b/sys/dev/ofw/ofw_cpu.c index cbca8caee1869b..339716a946ff45 100644 --- a/sys/dev/ofw/ofw_cpu.c +++ b/sys/dev/ofw/ofw_cpu.c @@ -44,6 +44,7 @@ #if defined(__arm__) || defined(__arm64__) || defined(__riscv) #include +#define HAS_CLK #endif static int ofw_cpulist_probe(device_t); @@ -198,6 +199,30 @@ ofw_cpu_probe(device_t dev) return (0); } +static int +get_freq_from_clk(device_t dev, struct ofw_cpu_softc *sc) +{ +#ifdef HAS_CLK + clk_t cpuclk; + uint64_t freq; + int rv; + + rv = clk_get_by_ofw_index(dev, 0, 0, &cpuclk); + if (rv == 0) { + rv = clk_get_freq(cpuclk, &freq); + if (rv != 0 && bootverbose) + device_printf(dev, + "Cannot get freq of property clocks\n"); + else + sc->sc_nominal_mhz = freq / 1000000; + } + + return (rv); +#else + return (ENODEV); +#endif +} + static int ofw_cpu_attach(device_t dev) { @@ -206,10 +231,6 @@ ofw_cpu_attach(device_t dev) phandle_t node; pcell_t cell; int rv; -#if defined(__arm__) || defined(__arm64__) || defined(__riscv) - clk_t cpuclk; - uint64_t freq; -#endif sc = device_get_softc(dev); psc = device_get_softc(device_get_parent(dev)); @@ -276,18 +297,7 @@ ofw_cpu_attach(device_t dev) sc->sc_cpu_pcpu = pcpu_find(device_get_unit(dev)); if (OF_getencprop(node, "clock-frequency", &cell, sizeof(cell)) < 0) { -#if defined(__arm__) || defined(__arm64__) || defined(__riscv) - rv = clk_get_by_ofw_index(dev, 0, 0, &cpuclk); - if (rv == 0) { - rv = clk_get_freq(cpuclk, &freq); - if (rv != 0 && bootverbose) - device_printf(dev, - "Cannot get freq of property clocks\n"); - else - sc->sc_nominal_mhz = freq / 1000000; - } else -#endif - { + if (get_freq_from_clk(dev, sc) != 0) { if (bootverbose) device_printf(dev, "missing 'clock-frequency' property\n"); @@ -298,6 +308,7 @@ 
ofw_cpu_attach(device_t dev) if (sc->sc_nominal_mhz != 0 && bootverbose) device_printf(dev, "Nominal frequency %dMhz\n", sc->sc_nominal_mhz); + bus_identify_children(dev); bus_attach_children(dev); return (0); From ac9de183f37006fc2089757779d6d5065a530d5b Mon Sep 17 00:00:00 2001 From: Mitchell Horne Date: Fri, 10 Jan 2025 14:46:56 -0400 Subject: [PATCH 123/143] ofw_cpu: check for "disabled" status during probe Some RISC-V CPUs contain a "monitor core" with limited functionality (no MMU). These cores appear in some device trees, but we don't run the kernel on them; in early CPU start-up code we skip them, and they have no impact on mp_ncpu. It seems the new trend is to mark these monitor cores with a 'status' property of 'disabled'. However, we still instantiate an ofw_cpu pseudo device for the disabled core. This is generally harmless, but there is an impact when attempting to attach the cpufreq_dt driver. It counts more OFW CPU devices (unit number) than logical CPUs (mp_ncpus), and therefore fails to attach for the last logical CPU. The solution is to check the status property in ofw_cpu_probe(), and fail if the core is marked "disabled". This is subject to the same exception already in ofw_cpu_early_foreach(); that is, if a disabled CPU has an 'enable-method' property, it can be used by the kernel. 
Reviewed by: andrew, jrtc27 MFC after: 1 month Sponsored by: The FreeBSD Foundation Differential Revision: https://reviews.freebsd.org/D48123 --- sys/dev/ofw/ofw_cpu.c | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/sys/dev/ofw/ofw_cpu.c b/sys/dev/ofw/ofw_cpu.c index 339716a946ff45..888af0440746e0 100644 --- a/sys/dev/ofw/ofw_cpu.c +++ b/sys/dev/ofw/ofw_cpu.c @@ -182,6 +182,24 @@ static driver_t ofw_cpu_driver = { DRIVER_MODULE(ofw_cpu, cpulist, ofw_cpu_driver, 0, 0); +static bool +ofw_cpu_is_runnable(phandle_t node) +{ + /* + * Per the DeviceTree Specification, a cpu node (under /cpus) that + * has 'status = disabled' indicates that "the CPU is in a quiescent + * state." + * + * A quiescent CPU that specifies an "enable-method", such as + * "spin-table", can still be used by the kernel. + * + * Lacking this, any CPU marked "disabled" or other non-okay status + * should be excluded from the kernel's view. + */ + return (ofw_bus_node_status_okay(node) || + OF_hasprop(node, "enable-method")); +} + static int ofw_cpu_probe(device_t dev) { @@ -190,6 +208,9 @@ ofw_cpu_probe(device_t dev) if (type == NULL || strcmp(type, "cpu") != 0) return (ENXIO); + if (!ofw_cpu_is_runnable(ofw_bus_get_node(dev))) + return (ENXIO); + device_set_desc(dev, "Open Firmware CPU"); if (!bootverbose && device_get_unit(dev) != 0) { device_quiet(dev); @@ -352,7 +373,6 @@ ofw_cpu_early_foreach(ofw_cpu_foreach_cb callback, bool only_runnable) { phandle_t node, child; pcell_t addr_cells, reg[2]; - char status[16]; char device_type[16]; u_int id, next_id; int count, rv; @@ -389,14 +409,8 @@ ofw_cpu_early_foreach(ofw_cpu_foreach_cb callback, bool only_runnable) * those that have been enabled, or do provide a method * to enable them. 
*/ - if (only_runnable) { - status[0] = '\0'; - OF_getprop(child, "status", status, sizeof(status)); - if (status[0] != '\0' && strcmp(status, "okay") != 0 && - strcmp(status, "ok") != 0 && - !OF_hasprop(child, "enable-method")) - continue; - } + if (only_runnable && !ofw_cpu_is_runnable(child)) + continue; /* * Check we have a register to identify the cpu From def7999c2ccddc9a303a65c0bea22976e79d8613 Mon Sep 17 00:00:00 2001 From: Mitchell Horne Date: Tue, 8 Oct 2024 15:49:11 -0300 Subject: [PATCH 124/143] riscv: enable cpufreq_dt driver Implement the small amount of MD code required; copied from arm/arm64. One tweak is made to cpufreq_dt itself: if the opp-shared property is missing, but there is only one CPU, then we can still attach. This is relevant for the single-core Allwinner D1. Sponsored by: The FreeBSD Foundation Differential Revision: https://reviews.freebsd.org/D48124 --- sys/conf/files.riscv | 1 + sys/dev/cpufreq/cpufreq_dt.c | 2 +- sys/riscv/conf/GENERIC | 3 +++ sys/riscv/include/pcpu.h | 3 ++- sys/riscv/riscv/machdep.c | 12 +++++++++++- 5 files changed, 18 insertions(+), 3 deletions(-) diff --git a/sys/conf/files.riscv b/sys/conf/files.riscv index 514c955181c38e..36eea03f29a1be 100644 --- a/sys/conf/files.riscv +++ b/sys/conf/files.riscv @@ -4,6 +4,7 @@ cddl/dev/dtrace/riscv/dtrace_subr.c optional dtrace compile-with "${DTRACE_C}" cddl/dev/dtrace/riscv/instr_size.c optional dtrace compile-with "${DTRACE_C}" cddl/dev/fbt/riscv/fbt_isa.c optional dtrace_fbt | dtraceall compile-with "${FBT_C}" crypto/des/des_enc.c optional netsmb +dev/cpufreq/cpufreq_dt.c optional cpufreq fdt dev/ofw/ofw_cpu.c optional fdt dev/ofw/ofw_pcib.c optional pci fdt dev/pci/pci_dw.c optional pci fdt diff --git a/sys/dev/cpufreq/cpufreq_dt.c b/sys/dev/cpufreq/cpufreq_dt.c index 929eebfe7dc548..e35a8ec73ef487 100644 --- a/sys/dev/cpufreq/cpufreq_dt.c +++ b/sys/dev/cpufreq/cpufreq_dt.c @@ -401,7 +401,7 @@ cpufreq_dt_oppv2_parse(struct cpufreq_dt_softc *sc, phandle_t node) if 
(opp_table == opp_xref) return (ENXIO); - if (!OF_hasprop(opp_table, "opp-shared")) { + if (!OF_hasprop(opp_table, "opp-shared") && mp_ncpus > 1) { device_printf(sc->dev, "Only opp-shared is supported\n"); return (ENXIO); } diff --git a/sys/riscv/conf/GENERIC b/sys/riscv/conf/GENERIC index 23d8a4e47eee22..34426f16796343 100644 --- a/sys/riscv/conf/GENERIC +++ b/sys/riscv/conf/GENERIC @@ -90,6 +90,9 @@ device syscon device syscon_power device riscv_syscon +# CPU frequency control +device cpufreq + # Bus drivers device pci diff --git a/sys/riscv/include/pcpu.h b/sys/riscv/include/pcpu.h index d00226defc2f17..f11060496963be 100644 --- a/sys/riscv/include/pcpu.h +++ b/sys/riscv/include/pcpu.h @@ -46,7 +46,8 @@ struct pmap *pc_curpmap; /* Currently active pmap */ \ uint32_t pc_pending_ipis; /* IPIs pending to this CPU */ \ uint32_t pc_hart; /* Hart ID */ \ - char __pad[56] /* Pad to factor of PAGE_SIZE */ + uint64_t pc_clock; \ + char __pad[48] /* Pad to factor of PAGE_SIZE */ #ifdef _KERNEL diff --git a/sys/riscv/riscv/machdep.c b/sys/riscv/riscv/machdep.c index c5da4832dd367b..c0d4b7cc27262d 100644 --- a/sys/riscv/riscv/machdep.c +++ b/sys/riscv/riscv/machdep.c @@ -219,8 +219,18 @@ cpu_flush_dcache(void *ptr, size_t len) int cpu_est_clockrate(int cpu_id, uint64_t *rate) { + struct pcpu *pc; - panic("cpu_est_clockrate"); + pc = pcpu_find(cpu_id); + if (pc == NULL || rate == NULL) + return (EINVAL); + + if (pc->pc_clock == 0) + return (EOPNOTSUPP); + + *rate = pc->pc_clock; + + return (0); } void From 9234a50752cd47887849d4665af0f9f4abdefb5d Mon Sep 17 00:00:00 2001 From: Seyed Pouria Mousavizadeh Tehrani Date: Fri, 10 Jan 2025 12:46:18 +0330 Subject: [PATCH 125/143] ng_ksocket: add IPv6 support for address parsing and unparsing Differential Revision: https://reviews.freebsd.org/D48204 --- share/man/man4/ng_ksocket.4 | 13 +++-- sys/netgraph/ng_ksocket.c | 94 ++++++++++++++++++++++++++++++++++--- 2 files changed, 98 insertions(+), 9 deletions(-) diff --git 
a/share/man/man4/ng_ksocket.4 b/share/man/man4/ng_ksocket.4 index 1f32d39dc7d000..bb653c3688ad4d 100644 --- a/share/man/man4/ng_ksocket.4 +++ b/share/man/man4/ng_ksocket.4 @@ -32,7 +32,7 @@ .\" .\" Author: Archie Cobbs .\" -.Dd January 9, 2012 +.Dd January 9, 2025 .Dt NG_KSOCKET 4 .Os .Sh NAME @@ -183,7 +183,8 @@ in the argument field, the normal equivalent of the C structure is an acceptable form. For the -.Dv PF_INET +.Dv PF_INET , +.Dv PF_INET6 and .Dv PF_LOCAL address families, a more convenient form is also used, which is @@ -191,7 +192,11 @@ the protocol family name, followed by a slash, followed by the actual address. For .Dv PF_INET , -the address is an IP address followed by an optional colon and port number. +the address is an IPv4 address followed by an optional colon and port number. +For +.Dv PF_INET6 , +the address is an IPv6 address enclosed in square brackets followed +by an optional colon and port number. For .Dv PF_LOCAL , the address is the pathname as a doubly quoted string. 
@@ -202,6 +207,8 @@ Examples: local/"/tmp/foo.socket" .It Dv PF_INET inet/192.168.1.1:1234 +.It Dv PF_INET6 +inet6/[2001::1]:1234 .It Other .Dv "\&{ family=16 len=16 data=[0x70 0x00 0x01 0x23] \&}" .El diff --git a/sys/netgraph/ng_ksocket.c b/sys/netgraph/ng_ksocket.c index 3e4427f9e387be..43a2747224f309 100644 --- a/sys/netgraph/ng_ksocket.c +++ b/sys/netgraph/ng_ksocket.c @@ -58,6 +58,9 @@ #include #include +#include +#include + #include #include #include @@ -66,6 +69,8 @@ #include #include +#include + #ifdef NG_SEPARATE_MALLOC static MALLOC_DEFINE(M_NETGRAPH_KSOCKET, "netgraph_ksock", "netgraph ksock node"); @@ -147,6 +152,19 @@ static const struct ng_ksocket_alias ng_ksocket_protos[] = { { "swipe", IPPROTO_SWIPE, PF_INET }, { "encap", IPPROTO_ENCAP, PF_INET }, { "pim", IPPROTO_PIM, PF_INET }, + { "ip6", IPPROTO_IPV6, PF_INET6 }, + { "raw6", IPPROTO_RAW, PF_INET6 }, + { "icmp6", IPPROTO_ICMPV6, PF_INET6 }, + { "igmp6", IPPROTO_IGMP, PF_INET6 }, + { "tcp6", IPPROTO_TCP, PF_INET6 }, + { "udp6", IPPROTO_UDP, PF_INET6 }, + { "gre6", IPPROTO_GRE, PF_INET6 }, + { "esp6", IPPROTO_ESP, PF_INET6 }, + { "ah6", IPPROTO_AH, PF_INET6 }, + { "swipe6", IPPROTO_SWIPE, PF_INET6 }, + { "encap6", IPPROTO_ENCAP, PF_INET6 }, + { "divert6", IPPROTO_DIVERT, PF_INET6 }, + { "pim6", IPPROTO_PIM, PF_INET6 }, { NULL, -1 }, }; @@ -296,9 +314,58 @@ ng_ksocket_sockaddr_parse(const struct ng_parse_type *type, break; } -#if 0 - case PF_INET6: /* XXX implement this someday */ -#endif + case PF_INET6: + { + struct sockaddr_in6 *const sin6 = (struct sockaddr_in6 *)sa; + char *eptr; + char addr[INET6_ADDRSTRLEN]; + char ifname[16]; + u_long port; + bool hasifname = true; + + /* RFC 3986 Section 3.2.2, Validate IP literal within square brackets. 
*/ + if (s[*off] == '[' && (strstr(&s[*off], "]"))) + (*off)++; + else + return (EINVAL); + if ((eptr = strstr(&s[*off], "%")) == NULL) { + hasifname = false; + eptr = strstr(&s[*off], "]"); + } + snprintf(addr, eptr - (s + *off) + 1, "%s", &s[*off]); + *off += (eptr - (s + *off)); + if (!inet_pton(AF_INET6, addr, &sin6->sin6_addr)) + return (EINVAL); + + if (hasifname) { + uint16_t scope; + + eptr = strstr(&s[*off], "]"); + (*off)++; + snprintf(ifname, eptr - (s + *off) + 1, "%s", &s[*off]); + *off += (eptr - (s + *off)); + + if (sin6->sin6_addr.s6_addr16[0] != IPV6_ADDR_INT16_ULL) + return (EINVAL); + scope = in6_getscope(&sin6->sin6_addr); + sin6->sin6_scope_id = + in6_getscopezone(ifunit(ifname), scope); + } + + (*off)++; + if (s[*off] == ':') { + (*off)++; + port = strtoul(s + *off, &eptr, 10); + if (port > 0xffff || eptr == s + *off) + return (EINVAL); + *off += (eptr - (s + *off)); + sin6->sin6_port = htons(port); + } else + sin6->sin6_port = 0; + + sin6->sin6_len = sizeof(*sin6); + break; + } default: return (EINVAL); @@ -358,9 +425,24 @@ ng_ksocket_sockaddr_unparse(const struct ng_parse_type *type, return(0); } -#if 0 - case PF_INET6: /* XXX implement this someday */ -#endif + case PF_INET6: + { + const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)sa; + char addr[INET6_ADDRSTRLEN]; + + inet_ntop(AF_INET6, &sin6->sin6_addr, addr, INET6_ADDRSTRLEN); + slen += snprintf(cbuf, cbuflen, "inet6/[%s]", addr); + + if (sin6->sin6_port != 0) { + slen += snprintf(cbuf + strlen(cbuf), + cbuflen - strlen(cbuf), ":%d", + (u_int)ntohs(sin6->sin6_port)); + } + if (slen >= cbuflen) + return (ERANGE); + *off += sizeof(*sin6); + return(0); + } default: return (*ng_ksocket_generic_sockaddr_type.supertype->unparse) From f021e3573519ff192fc708cda9ca4bba264c96f7 Mon Sep 17 00:00:00 2001 From: Shteryana Shopova Date: Fri, 10 Jan 2025 15:30:21 -0500 Subject: [PATCH 126/143] bsnmpwalk: Fix crash on invalid data PR: 258570 Reported by: Robert Morris Reviewed by: emaste, 
markj Differential Revision: https://reviews.freebsd.org/D48422 --- usr.sbin/bsnmpd/tools/bsnmptools/bsnmpget.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/usr.sbin/bsnmpd/tools/bsnmptools/bsnmpget.c b/usr.sbin/bsnmpd/tools/bsnmptools/bsnmpget.c index 81108387d3a4ad..9d5a693c7c687d 100644 --- a/usr.sbin/bsnmpd/tools/bsnmptools/bsnmpget.c +++ b/usr.sbin/bsnmpd/tools/bsnmptools/bsnmpget.c @@ -502,7 +502,7 @@ snmptool_walk(struct snmp_toolinfo *snmptoolctx) outputs += rc; - if ((u_int)rc < resp.nbindings) { + if ((u_int)rc < resp.nbindings || resp.nbindings == 0) { snmp_pdu_free(&resp); break; } From b2b974f7ef4cddff251d0de69d8da13232b25e4d Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Fri, 10 Jan 2025 23:59:47 +0000 Subject: [PATCH 127/143] clock: Simplify subr_ticks and rename - We can use builtin constants for the size of int and long to simplify definitions. - The file should have a .S suffix since we want to run it through the preprocessor, though apparently this happens anyway with .s... - Move ticks and ticksl from .data to .bss.
Reported by: jrtc27 Reviewed by: jrtc27, kib, emaste Fixes: 6b82130e6c9a ("clock: Add a long ticks variable, ticksl") Differential Revision: https://reviews.freebsd.org/D48420 --- sys/conf/files | 2 +- sys/kern/{subr_ticks.s => subr_ticks.S} | 22 +++++++--------------- 2 files changed, 8 insertions(+), 16 deletions(-) rename sys/kern/{subr_ticks.s => subr_ticks.S} (62%) diff --git a/sys/conf/files b/sys/conf/files index a630d9dd72bc57..fc9108b5e10f3b 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -3932,7 +3932,7 @@ kern/subr_stack.c optional ddb | stack | ktr kern/subr_stats.c optional stats kern/subr_taskqueue.c standard kern/subr_terminal.c optional vt -kern/subr_ticks.s standard +kern/subr_ticks.S standard kern/subr_trap.c standard kern/subr_turnstile.c standard kern/subr_uio.c standard diff --git a/sys/kern/subr_ticks.s b/sys/kern/subr_ticks.S similarity index 62% rename from sys/kern/subr_ticks.s rename to sys/kern/subr_ticks.S index 6565ba42413783..52435b194c4f07 100644 --- a/sys/kern/subr_ticks.s +++ b/sys/kern/subr_ticks.S @@ -16,29 +16,21 @@ GNU_PROPERTY_AARCH64_FEATURE_1_NOTE(GNU_PROPERTY_AARCH64_FEATURE_1_VAL) #endif -#ifdef _ILP32 -#define SIZEOF_TICKSL 4 -#define TICKSL_INIT .long 0 -#else -#define SIZEOF_TICKSL 8 -#define TICKSL_INIT .quad 0 -#endif - -#if defined(_ILP32) || __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ #define TICKS_OFFSET 0 #else -#define TICKS_OFFSET 4 +#define TICKS_OFFSET (__SIZEOF_LONG__ - __SIZEOF_INT__) #endif - .data + .bss .global ticksl .type ticksl, %object - .align SIZEOF_TICKSL -ticksl: TICKSL_INIT - .size ticksl, SIZEOF_TICKSL + .align __SIZEOF_LONG__ +ticksl: .zero __SIZEOF_LONG__ + .size ticksl, __SIZEOF_LONG__ .global ticks .type ticks, %object ticks =ticksl + TICKS_OFFSET - .size ticks, 4 + .size ticks, __SIZEOF_INT__ From 550137eab24ab6bdf05bfbb986927004b9f2f34e Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Fri, 10 Jan 2025 22:49:59 -0500 Subject: [PATCH 128/143] 
universe: Permit requesting a specific version of GCC If USE_GCC_TOOLCHAINS is set to a value matching the pattern 'gcc*', use that as the GCC version. For example, USE_GCC_TOOLCHAINS=gcc16 would use amd64-gcc16 for amd64, etc. If the variable is set to a value that doesn't match that pattern, use the default version. Reviewed by: imp, emaste Differential Revision: https://reviews.freebsd.org/D48418 --- Makefile | 16 +++++++++------- share/man/man7/build.7 | 7 ++++++- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/Makefile b/Makefile index 5c113d5b83cced..1970cc7368296d 100644 --- a/Makefile +++ b/Makefile @@ -531,13 +531,15 @@ TARGET_ARCHES_${target}= ${MACHINE_ARCH_LIST_${target}} .endfor .if defined(USE_GCC_TOOLCHAINS) -TOOLCHAINS_amd64= amd64-gcc12 -TOOLCHAINS_arm= armv7-gcc12 -TOOLCHAINS_arm64= aarch64-gcc12 -TOOLCHAINS_i386= i386-gcc12 -TOOLCHAINS_powerpc= powerpc-gcc12 powerpc64-gcc12 -TOOLCHAIN_powerpc64= powerpc64-gcc12 -TOOLCHAINS_riscv= riscv64-gcc12 +_DEFAULT_GCC_VERSION= gcc12 +_GCC_VERSION= ${"${USE_GCC_TOOLCHAINS:Mgcc*}" != "":?${USE_GCC_TOOLCHAINS}:${_DEFAULT_GCC_VERSION}} +TOOLCHAINS_amd64= amd64-${_GCC_VERSION} +TOOLCHAINS_arm= armv7-${_GCC_VERSION} +TOOLCHAINS_arm64= aarch64-${_GCC_VERSION} +TOOLCHAINS_i386= i386-${_GCC_VERSION} +TOOLCHAINS_powerpc= powerpc-${_GCC_VERSION} powerpc64-${_GCC_VERSION} +TOOLCHAIN_powerpc64= powerpc64-${_GCC_VERSION} +TOOLCHAINS_riscv= riscv64-${_GCC_VERSION} .endif # If a target is using an external toolchain, set MAKE_PARAMS to enable use diff --git a/share/man/man7/build.7 b/share/man/man7/build.7 index 32c33aa3d7305d..5ca44c51cf56cf 100644 --- a/share/man/man7/build.7 +++ b/share/man/man7/build.7 @@ -25,7 +25,7 @@ .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" -.Dd November 13, 2024 +.Dd January 10, 2025 .Dt BUILD 7 .Os .Sh NAME @@ -879,6 +879,11 @@ This variable implies Use external GCC toolchains to build the requested targets. 
If the required toolchain package for a supported architecture is not installed, the build for that architecture is skipped. +.Pp +A specific version of GCC can be used by setting the value of this variable +to the desired version +.Pq for example, Dq gcc14 ; +otherwise a default version of GCC is used. .It Va TARGETS Only build the listed targets instead of each supported architecture. .It Va EXTRA_TARGETS From 84a62d974e744fbe00ee542fc5f95e99c528bd0a Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Fri, 10 Jan 2025 22:58:09 -0500 Subject: [PATCH 129/143] mlx.4: Remove stale diagnostic message This message would never have been omitted before since bus_generic_attach never fails. Reviewed by: ziaee, emaste Fixes: 18250ec6c089 Replace calls to bus_generic_attach with bus_attach_children Differential Revision: https://reviews.freebsd.org/D48402 --- share/man/man4/mlx.4 | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/share/man/man4/mlx.4 b/share/man/man4/mlx.4 index f7a0d64e9e7392..7cb3e9a98c1de3 100644 --- a/share/man/man4/mlx.4 +++ b/share/man/man4/mlx.4 @@ -23,7 +23,7 @@ .\" (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF .\" THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .\" -.Dd February 15, 2017 +.Dd January 10, 2025 .Dt MLX 4 .Os .Sh NAME @@ -146,7 +146,6 @@ controller. The current status of all system drives could not be fetched; attachment of system drives will be aborted. .It mlx%d: device_add_child failed -.It mlx%d: bus_generic_attach returned %d .Pp Creation of the system drive instances failed; attachment of one or more system drives may have been aborted. 
From 4378bd382ea0f5707099273b1fa2393979a22628 Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Fri, 10 Jan 2025 23:01:21 -0500 Subject: [PATCH 130/143] stand/kshim: Replace bus_generic_attach with bus_attach_children Fixes: 18250ec6c089 Replace calls to bus_generic_attach with bus_attach_children Differential Revision: https://reviews.freebsd.org/D48404 --- stand/kshim/bsd_kernel.c | 6 ++---- stand/kshim/bsd_kernel.h | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/stand/kshim/bsd_kernel.c b/stand/kshim/bsd_kernel.c index ac552d86603fc9..fb1b4239fd0398 100644 --- a/stand/kshim/bsd_kernel.c +++ b/stand/kshim/bsd_kernel.c @@ -205,16 +205,14 @@ bus_release_resource(device_t dev, int type, int rid, struct resource *r) return (EINVAL); } -int -bus_generic_attach(device_t dev) +void +bus_attach_children(device_t dev) { device_t child; TAILQ_FOREACH(child, &dev->dev_children, dev_link) { device_probe_and_attach(child); } - - return (0); } bus_space_tag_t diff --git a/stand/kshim/bsd_kernel.h b/stand/kshim/bsd_kernel.h index 0608d32f21ec38..0b5d659951a284 100644 --- a/stand/kshim/bsd_kernel.h +++ b/stand/kshim/bsd_kernel.h @@ -646,7 +646,7 @@ int bus_release_resource(device_t, int, int, struct resource *); void bus_release_resources(device_t, const struct resource_spec *, struct resource **); struct resource *bus_alloc_resource_any(device_t, int, int *, unsigned int); -int bus_generic_attach(device_t); +void bus_attach_children(device_t); bus_space_tag_t rman_get_bustag(struct resource *); bus_space_handle_t rman_get_bushandle(struct resource *); u_long rman_get_size(struct resource *); From ee15875c01593b287e55147c482b914e3ab01152 Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Fri, 10 Jan 2025 23:01:48 -0500 Subject: [PATCH 131/143] stand/kshim: Update for devclass being removed from DRIVER_MODULE The kshim code abused the devclass argument to DRIVER_MODULE in some odd ways. 
Instead, refactor the devclass handling to more closely mirror what new-bus does in the kernel by having a linked list of devclasses looked up by name and associate devices with a devclass. Devices are now only associated with a module while probing and attaching. Reviewed by: imp, markj Differential Revision: https://reviews.freebsd.org/D48409 --- stand/kshim/bsd_kernel.c | 140 ++++++++++++++----------------- stand/kshim/bsd_kernel.h | 11 ++- stand/usb/storage/umass_common.c | 4 +- 3 files changed, 71 insertions(+), 84 deletions(-) diff --git a/stand/kshim/bsd_kernel.c b/stand/kshim/bsd_kernel.c index fb1b4239fd0398..78b6f9e0b4d419 100644 --- a/stand/kshim/bsd_kernel.c +++ b/stand/kshim/bsd_kernel.c @@ -554,6 +554,8 @@ static const char unknown_string[] = { "unknown" }; static TAILQ_HEAD(, module_data) module_head = TAILQ_HEAD_INITIALIZER(module_head); +static TAILQ_HEAD(, devclass) devclasses = + TAILQ_HEAD_INITIALIZER(devclasses); static uint8_t devclass_equal(const char *a, const char *b) @@ -686,58 +688,50 @@ device_get_nameunit(device_t dev) return (unknown_string); } -static uint8_t -devclass_create(devclass_t *dc_pp) +static devclass_t +devclass_create(const char *classname) { - if (dc_pp == NULL) { - return (1); - } - if (dc_pp[0] == NULL) { - dc_pp[0] = malloc(sizeof(**(dc_pp)), - M_DEVBUF, M_WAITOK | M_ZERO); + devclass_t dc; - if (dc_pp[0] == NULL) { - return (1); - } + dc = malloc(sizeof(*dc), M_DEVBUF, M_WAITOK | M_ZERO); + if (dc == NULL) { + return (NULL); } - return (0); + dc->name = classname; + TAILQ_INSERT_TAIL(&devclasses, dc, link); + return (dc); } -static const struct module_data * +static devclass_t devclass_find_create(const char *classname) { - const struct module_data *mod; + devclass_t dc; - TAILQ_FOREACH(mod, &module_head, entry) { - if (devclass_equal(mod->mod_name, classname)) { - if (devclass_create(mod->devclass_pp)) { - continue; - } - return (mod); - } - } - return (NULL); + dc = devclass_find(classname); + if (dc == NULL) + dc = 
devclass_create(classname); + return (dc); } static uint8_t -devclass_add_device(const struct module_data *mod, device_t dev) +devclass_add_device(devclass_t dc, device_t dev) { device_t *pp_dev; device_t *end; uint8_t unit; - pp_dev = mod->devclass_pp[0]->dev_list; + pp_dev = dc->dev_list; end = pp_dev + DEVCLASS_MAXUNIT; unit = 0; while (pp_dev != end) { if (*pp_dev == NULL) { *pp_dev = dev; + dev->dev_class = dc; dev->dev_unit = unit; - dev->dev_module = mod; snprintf(dev->dev_nameunit, sizeof(dev->dev_nameunit), - "%s%d", device_get_name(dev), unit); + "%s%d", dc->name, unit); return (0); } pp_dev++; @@ -748,26 +742,26 @@ devclass_add_device(const struct module_data *mod, device_t dev) } static void -devclass_delete_device(const struct module_data *mod, device_t dev) +devclass_delete_device(devclass_t dc, device_t dev) { - if (mod == NULL) { + if (dc == NULL) { return; } - mod->devclass_pp[0]->dev_list[dev->dev_unit] = NULL; - dev->dev_module = NULL; + dc->dev_list[dev->dev_unit] = NULL; + dev->dev_class = NULL; } static device_t make_device(device_t parent, const char *name) { device_t dev = NULL; - const struct module_data *mod = NULL; + devclass_t dc = NULL; if (name) { - mod = devclass_find_create(name); + dc = devclass_find_create(name); - if (!mod) { + if (!dc) { DPRINTF("%s:%d:%s: can't find device " "class %s\n", __FILE__, __LINE__, @@ -787,7 +781,7 @@ make_device(device_t parent, const char *name) if (name) { dev->dev_fixed_class = 1; - if (devclass_add_device(mod, dev)) { + if (devclass_add_device(dc, dev)) { goto error; } } @@ -843,7 +837,8 @@ device_delete_child(device_t dev, device_t child) } } - devclass_delete_device(child->dev_module, child); + if (child->dev_class != NULL) + devclass_delete_device(child->dev_class, child); if (dev != NULL) { /* remove child from parent */ @@ -911,7 +906,7 @@ device_get_method(device_t dev, const char *what) const char * device_get_name(device_t dev) { - if (dev == NULL) + if (dev == NULL || dev->dev_module == 
NULL) return (unknown_string); return (dev->dev_module->driver->name); @@ -942,16 +937,34 @@ device_probe_and_attach(device_t dev) { const struct module_data *mod; const char *bus_name_parent; - - bus_name_parent = device_get_name(device_get_parent(dev)); + devclass_t dc; if (dev->dev_attached) return (0); /* fail-safe */ - if (dev->dev_fixed_class) { + /* + * Find a module for our device, if any + */ + bus_name_parent = device_get_name(device_get_parent(dev)); + + TAILQ_FOREACH(mod, &module_head, entry) { + if (!devclass_equal(mod->bus_name, bus_name_parent)) + continue; + + dc = devclass_find(mod->mod_name); - mod = dev->dev_module; + /* Does this device need assigning to the new devclass? */ + if (dev->dev_class != dc) { + if (dev->dev_fixed_class) + continue; + if (dev->dev_class != NULL) + devclass_delete_device(dev->dev_class, dev); + if (devclass_add_device(dc, dev)) { + continue; + } + } + dev->dev_module = mod; if (DEVICE_PROBE(dev) <= 0) { if (device_allocate_softc(dev) == 0) { @@ -963,40 +976,11 @@ device_probe_and_attach(device_t dev) } } } - device_detach(dev); + /* else try next driver */ - goto error; - } - /* - * Else find a module for our device, if any - */ - - TAILQ_FOREACH(mod, &module_head, entry) { - if (devclass_equal(mod->bus_name, bus_name_parent)) { - if (devclass_create(mod->devclass_pp)) { - continue; - } - if (devclass_add_device(mod, dev)) { - continue; - } - if (DEVICE_PROBE(dev) <= 0) { - - if (device_allocate_softc(dev) == 0) { - - if (DEVICE_ATTACH(dev) == 0) { - /* success */ - dev->dev_attached = 1; - return (0); - } - } - } - /* else try next driver */ - - device_detach(dev); - } + device_detach(dev); } -error: return (ENODEV); } @@ -1015,9 +999,10 @@ device_detach(device_t dev) dev->dev_attached = 0; } device_set_softc(dev, NULL); + dev->dev_module = NULL; if (dev->dev_fixed_class == 0) - devclass_delete_device(mod, dev); + devclass_delete_device(dev->dev_class, dev); return (0); } @@ -1093,11 +1078,11 @@ 
devclass_get_device(devclass_t dc, int unit) devclass_t devclass_find(const char *classname) { - const struct module_data *mod; + devclass_t dc; - TAILQ_FOREACH(mod, &module_head, entry) { - if (devclass_equal(mod->driver->name, classname)) - return (mod->devclass_pp[0]); + TAILQ_FOREACH(dc, &devclasses, link) { + if (devclass_equal(dc->name, classname)) + return (dc); } return (NULL); } @@ -1108,6 +1093,7 @@ module_register(void *data) struct module_data *mdata = data; TAILQ_INSERT_TAIL(&module_head, mdata, entry); + (void)devclass_find_create(mdata->mod_name); } /*------------------------------------------------------------------------* diff --git a/stand/kshim/bsd_kernel.h b/stand/kshim/bsd_kernel.h index 0b5d659951a284..8600bd1f31dc3b 100644 --- a/stand/kshim/bsd_kernel.h +++ b/stand/kshim/bsd_kernel.h @@ -87,11 +87,11 @@ struct sysctl_req { #define MOD_UNLOAD 2 #define DEVMETHOD(what,func) { #what, (void *)&func } #define DEVMETHOD_END {0,0} -#define EARLY_DRIVER_MODULE(a, b, c, d, e, f, g) DRIVER_MODULE(a, b, c, d, e, f) -#define DRIVER_MODULE(name, busname, driver, devclass, evh, arg) \ +#define EARLY_DRIVER_MODULE(a, b, c, d, e, f) DRIVER_MODULE(a, b, c, d, e) +#define DRIVER_MODULE(name, busname, driver, evh, arg) \ static struct module_data bsd_##name##_##busname##_driver_mod = { \ evh, arg, #busname, #name, #busname "/" #name, \ - &driver, &devclass, { 0, 0 } }; \ + &driver, { 0, 0 } }; \ SYSINIT(bsd_##name##_##busname##_driver_mod, SI_SUB_DRIVERS, \ SI_ORDER_MIDDLE, module_register, \ &bsd_##name##_##busname##_driver_mod) @@ -135,6 +135,7 @@ SYSINIT_ENTRY(uniq##_entry, "sysuninit", (subs), \ #define cold 0 #define BUS_PROBE_GENERIC 0 #define BUS_PROBE_DEFAULT (-20) +#define DEVICE_UNIT_ANY -1 #define CALLOUT_RETURNUNLOCKED 0x1 #undef ffs #define ffs(x) __builtin_ffs(x) @@ -406,6 +407,7 @@ struct device { TAILQ_HEAD(device_list, device) dev_children; TAILQ_ENTRY(device) dev_link; + devclass_t dev_class; struct device *dev_parent; const struct module_data 
*dev_module; void *dev_sc; @@ -429,6 +431,8 @@ struct device { }; struct devclass { + TAILQ_ENTRY(devclass) link; + const char *name; device_t dev_list[DEVCLASS_MAXUNIT]; }; @@ -445,7 +449,6 @@ struct module_data { const char *mod_name; const char *long_name; const struct driver *driver; - struct devclass **devclass_pp; TAILQ_ENTRY(module_data) entry; }; diff --git a/stand/usb/storage/umass_common.c b/stand/usb/storage/umass_common.c index 639d70a7f7e603..b62412b136373a 100644 --- a/stand/usb/storage/umass_common.c +++ b/stand/usb/storage/umass_common.c @@ -38,8 +38,6 @@ static device_probe_t umass_probe; static device_attach_t umass_attach; static device_detach_t umass_detach; -static devclass_t umass_devclass; - static device_method_t umass_methods[] = { /* Device interface */ DEVMETHOD(device_probe, umass_probe), @@ -54,7 +52,7 @@ static driver_t umass_driver = { .methods = umass_methods, }; -DRIVER_MODULE(umass, uhub, umass_driver, umass_devclass, NULL, 0); +DRIVER_MODULE(umass, uhub, umass_driver, NULL, 0); static int umass_probe(device_t dev) From 8e4535ee5870b76f97f6d33a05e9979dcb75c185 Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Fri, 10 Jan 2025 23:02:27 -0500 Subject: [PATCH 132/143] stand/kshim: Implement bus_detach_children While here, update bus_generic_detach to delete devices as in the kernel. 
Reviewed by: imp, markj Differential Revision: https://reviews.freebsd.org/D48410 --- stand/kshim/bsd_kernel.c | 13 ++++++++++++- stand/kshim/bsd_kernel.h | 1 + 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/stand/kshim/bsd_kernel.c b/stand/kshim/bsd_kernel.c index 78b6f9e0b4d419..371aaa6e96bf9d 100644 --- a/stand/kshim/bsd_kernel.c +++ b/stand/kshim/bsd_kernel.c @@ -664,7 +664,7 @@ device_get_unit(device_t dev) } int -bus_generic_detach(device_t dev) +bus_detach_children(device_t dev) { device_t child; int error; @@ -679,6 +679,17 @@ bus_generic_detach(device_t dev) return (0); } +int +bus_generic_detach(device_t dev) +{ + int error; + + error = bus_detach_children(dev); + if (error == 0) + error = device_delete_children(dev); + return (error); +} + const char * device_get_nameunit(device_t dev) { diff --git a/stand/kshim/bsd_kernel.h b/stand/kshim/bsd_kernel.h index 8600bd1f31dc3b..25a779d5ea0c5f 100644 --- a/stand/kshim/bsd_kernel.h +++ b/stand/kshim/bsd_kernel.h @@ -650,6 +650,7 @@ void bus_release_resources(device_t, const struct resource_spec *, struct resource **); struct resource *bus_alloc_resource_any(device_t, int, int *, unsigned int); void bus_attach_children(device_t); +int bus_detach_children(device_t); bus_space_tag_t rman_get_bustag(struct resource *); bus_space_handle_t rman_get_bushandle(struct resource *); u_long rman_get_size(struct resource *); From f6f5aa8a2df5910b0769ab7cd0e6199c9b2ab624 Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Fri, 10 Jan 2025 23:02:46 -0500 Subject: [PATCH 133/143] stand/usb: Quiet warnings so this builds again Reviewed by: markj, emaste Differential Revision: https://reviews.freebsd.org/D48411 --- stand/usb/Makefile | 2 +- stand/usb/test/Makefile | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/stand/usb/Makefile b/stand/usb/Makefile index 0a1c9e0022ee10..a9c20fc340ad9f 100644 --- a/stand/usb/Makefile +++ b/stand/usb/Makefile @@ -35,7 +35,7 @@ INTERNALLIB= CFLAGS+= 
-DBOOTPROG=\"usbloader\" CFLAGS+= -ffunction-sections -fdata-sections -CFLAGS+= -Wformat -Wall +CFLAGS+= -Wformat -Wall -Wno-unused CFLAGS+= -g CFLAGS+= -fno-pic diff --git a/stand/usb/test/Makefile b/stand/usb/test/Makefile index 74e501a39ae7da..d92ef6dddecba4 100644 --- a/stand/usb/test/Makefile +++ b/stand/usb/test/Makefile @@ -35,6 +35,7 @@ PROG= usbloader SRCS= CFLAGS+= -Wall +CFLAGS+= -Wno-error=missing-prototypes -Wno-error=unused-parameter CFLAGS+= -g .if ${MACHINE_CPUARCH} == "amd64" From 40d7ba08773751ff7d0df1a3f112b32d1d04e5ec Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Fri, 10 Jan 2025 23:03:02 -0500 Subject: [PATCH 134/143] stand/kshim: Replace devclass_equal with calls to strcmp Reviewed by: imp, markj, emaste Differential Revision: https://reviews.freebsd.org/D48412 --- stand/kshim/bsd_kernel.c | 27 +++------------------------ 1 file changed, 3 insertions(+), 24 deletions(-) diff --git a/stand/kshim/bsd_kernel.c b/stand/kshim/bsd_kernel.c index 371aaa6e96bf9d..91ca46e18d7490 100644 --- a/stand/kshim/bsd_kernel.c +++ b/stand/kshim/bsd_kernel.c @@ -557,27 +557,6 @@ static TAILQ_HEAD(, module_data) module_head = static TAILQ_HEAD(, devclass) devclasses = TAILQ_HEAD_INITIALIZER(devclasses); -static uint8_t -devclass_equal(const char *a, const char *b) -{ - char ta, tb; - - if (a == b) - return (1); - - while (1) { - ta = *a; - tb = *b; - if (ta != tb) - return (0); - if (ta == 0) - break; - a++; - b++; - } - return (1); -} - int bus_generic_resume(device_t dev) { @@ -906,7 +885,7 @@ device_get_method(device_t dev, const char *what) mtod = dev->dev_module->driver->methods; while (mtod->func != NULL) { - if (devclass_equal(mtod->desc, what)) { + if (strcmp(mtod->desc, what) == 0) { return (mtod->func); } mtod++; @@ -959,7 +938,7 @@ device_probe_and_attach(device_t dev) bus_name_parent = device_get_name(device_get_parent(dev)); TAILQ_FOREACH(mod, &module_head, entry) { - if (!devclass_equal(mod->bus_name, bus_name_parent)) + if (strcmp(mod->bus_name, 
bus_name_parent) != 0) continue; dc = devclass_find(mod->mod_name); @@ -1092,7 +1071,7 @@ devclass_find(const char *classname) devclass_t dc; TAILQ_FOREACH(dc, &devclasses, link) { - if (devclass_equal(dc->name, classname)) + if (strcmp(dc->name, classname) == 0) return (dc); } return (NULL); From 0fda4ffd69054217096dd1a40355d97be9a8ab94 Mon Sep 17 00:00:00 2001 From: Gleb Smirnoff Date: Fri, 10 Jan 2025 20:55:39 -0800 Subject: [PATCH 135/143] netlink: augment group writer with priv(9) argument This will allow broadcasting messages visible only to privileged subscribers. Reviewed by: melifaro Differential Revision: https://reviews.freebsd.org/D48307 --- sys/netlink/netlink_domain.c | 8 +++++--- sys/netlink/netlink_generic.c | 2 +- sys/netlink/netlink_glue.c | 7 ++++--- sys/netlink/netlink_message_writer.c | 3 ++- sys/netlink/netlink_message_writer.h | 10 ++++++---- sys/netlink/netlink_sysevent.c | 2 +- sys/netlink/netlink_var.h | 2 +- sys/netlink/route/iface.c | 5 +++-- sys/netlink/route/neigh.c | 2 +- sys/netlink/route/nexthop.c | 4 ++-- sys/netlink/route/rt.c | 5 +++-- 11 files changed, 29 insertions(+), 21 deletions(-) diff --git a/sys/netlink/netlink_domain.c b/sys/netlink/netlink_domain.c index 922da32bfb6d6c..45d427f4316633 100644 --- a/sys/netlink/netlink_domain.c +++ b/sys/netlink/netlink_domain.c @@ -47,7 +47,7 @@ #include #include #include -#include /* priv_check */ +#include #include #include @@ -225,8 +225,10 @@ nl_send_group(struct nl_writer *nw) NLCTL_RLOCK(ctl); CK_LIST_FOREACH(nlp, &ctl->ctl_pcb_head, nl_next) { - if (nl_isset_group_locked(nlp, nw->group.id) && - nlp->nl_proto == nw->group.proto) { + if ((nw->group.priv == 0 || priv_check_cred( + nlp->nl_socket->so_cred, nw->group.priv) == 0) && + nlp->nl_proto == nw->group.proto && + nl_isset_group_locked(nlp, nw->group.id)) { if (nlp_last != NULL) { struct nl_buf *copy; diff --git a/sys/netlink/netlink_generic.c b/sys/netlink/netlink_generic.c index 0f960d79f47741..62353149217382 100644 ---
a/sys/netlink/netlink_generic.c +++ b/sys/netlink/netlink_generic.c @@ -259,7 +259,7 @@ nlctrl_notify(void *arg __unused, const struct genl_family *gf, int cmd) struct nl_writer nw; if (!nl_writer_group(&nw, NLMSG_SMALL, NETLINK_GENERIC, ctrl_group_id, - false)) { + 0, false)) { NL_LOG(LOG_DEBUG, "error allocating group writer"); return; } diff --git a/sys/netlink/netlink_glue.c b/sys/netlink/netlink_glue.c index e550a6013654ba..0e8fdc0b054c76 100644 --- a/sys/netlink/netlink_glue.c +++ b/sys/netlink/netlink_glue.c @@ -118,7 +118,7 @@ nl_writer_unicast_stub(struct nl_writer *nw, size_t size, struct nlpcb *nlp, static bool nl_writer_group_stub(struct nl_writer *nw, size_t size, uint16_t protocol, - uint16_t group_id, bool waitok) + uint16_t group_id, int priv, bool waitok) { return (get_stub_writer(nw)); } @@ -221,9 +221,10 @@ nl_writer_unicast(struct nl_writer *nw, size_t size, struct nlpcb *nlp, bool nl_writer_group(struct nl_writer *nw, size_t size, uint16_t protocol, - uint16_t group_id, bool waitok) + uint16_t group_id, int priv, bool waitok) { - return (_nl->nl_writer_group(nw, size, protocol, group_id, waitok)); + return (_nl->nl_writer_group(nw, size, protocol, group_id, priv, + waitok)); } bool diff --git a/sys/netlink/netlink_message_writer.c b/sys/netlink/netlink_message_writer.c index 1aebc4690c2d17..8c5b3ec140584c 100644 --- a/sys/netlink/netlink_message_writer.c +++ b/sys/netlink/netlink_message_writer.c @@ -86,11 +86,12 @@ _nl_writer_unicast(struct nl_writer *nw, size_t size, struct nlpcb *nlp, bool _nl_writer_group(struct nl_writer *nw, size_t size, uint16_t protocol, - uint16_t group_id, bool waitok) + uint16_t group_id, int priv, bool waitok) { *nw = (struct nl_writer){ .group.proto = protocol, .group.id = group_id, + .group.priv = priv, .cb = nl_send_group, }; diff --git a/sys/netlink/netlink_message_writer.h b/sys/netlink/netlink_message_writer.h index 1655acb53fef22..83f925e8d93d8e 100644 --- a/sys/netlink/netlink_message_writer.h +++ 
b/sys/netlink/netlink_message_writer.h @@ -50,6 +50,7 @@ struct nl_writer { struct { uint16_t proto; uint16_t id; + int priv; } group; }; u_int num_messages; /* Number of messages in the buffer */ @@ -67,7 +68,8 @@ struct nl_writer { /* Provide optimized calls to the functions inside the same linking unit */ bool _nl_writer_unicast(struct nl_writer *, size_t, struct nlpcb *nlp, bool); -bool _nl_writer_group(struct nl_writer *, size_t, uint16_t, uint16_t, bool); +bool _nl_writer_group(struct nl_writer *, size_t, uint16_t, uint16_t, int, + bool); bool _nlmsg_flush(struct nl_writer *nw); void _nlmsg_ignore_limit(struct nl_writer *nw); @@ -89,9 +91,9 @@ nl_writer_unicast(struct nl_writer *nw, size_t size, struct nlpcb *nlp, static inline bool nl_writer_group(struct nl_writer *nw, size_t size, uint16_t proto, - uint16_t group_id, bool waitok) + uint16_t group_id, int priv, bool waitok) { - return (_nl_writer_group(nw, size, proto, group_id, waitok)); + return (_nl_writer_group(nw, size, proto, group_id, priv, waitok)); } static inline bool @@ -141,7 +143,7 @@ nlmsg_end_dump(struct nl_writer *nw, int error, struct nlmsghdr *hdr) /* Provide access to the functions via netlink_glue.c */ bool nl_writer_unicast(struct nl_writer *, size_t, struct nlpcb *, bool waitok); -bool nl_writer_group(struct nl_writer *, size_t, uint16_t, uint16_t, +bool nl_writer_group(struct nl_writer *, size_t, uint16_t, uint16_t, int, bool waitok); bool nlmsg_flush(struct nl_writer *nw); void nlmsg_ignore_limit(struct nl_writer *nw); diff --git a/sys/netlink/netlink_sysevent.c b/sys/netlink/netlink_sysevent.c index 3359c77fa303b1..c955ce2e8b4528 100644 --- a/sys/netlink/netlink_sysevent.c +++ b/sys/netlink/netlink_sysevent.c @@ -82,7 +82,7 @@ sysevent_write(struct sysevent_group *se, const char *subsystem, const char *typ { struct nl_writer nw; - if (!nl_writer_group(&nw, NLMSG_LARGE, NETLINK_GENERIC, se->id, + if (!nl_writer_group(&nw, NLMSG_LARGE, NETLINK_GENERIC, se->id, 0, false)) { 
NL_LOG(LOG_DEBUG, "error allocating group writer"); return; diff --git a/sys/netlink/netlink_var.h b/sys/netlink/netlink_var.h index 8efe84e935c3e5..34cba0b28d2712 100644 --- a/sys/netlink/netlink_var.h +++ b/sys/netlink/netlink_var.h @@ -187,7 +187,7 @@ struct nl_function_wrapper { bool (*nl_writer_unicast)(struct nl_writer *nw, size_t size, struct nlpcb *nlp, bool waitok); bool (*nl_writer_group)(struct nl_writer *nw, size_t size, - uint16_t protocol, uint16_t group_id, bool waitok); + uint16_t protocol, uint16_t group_id, int priv, bool waitok); bool (*nlmsg_end_dump)(struct nl_writer *nw, int error, struct nlmsghdr *hdr); int (*nl_modify_ifp_generic)(struct ifnet *ifp, struct nl_parsed_link *lattrs, const struct nlattr_bmask *bm, struct nl_pstate *npt); diff --git a/sys/netlink/route/iface.c b/sys/netlink/route/iface.c index 86b1f8f1b1bc9c..d856498b975f34 100644 --- a/sys/netlink/route/iface.c +++ b/sys/netlink/route/iface.c @@ -1386,7 +1386,8 @@ rtnl_handle_ifaddr(void *arg __unused, struct ifaddr *ifa, int cmd) if (!nl_has_listeners(NETLINK_ROUTE, group)) return; - if (!nl_writer_group(&nw, NLMSG_LARGE, NETLINK_ROUTE, group, false)) { + if (!nl_writer_group(&nw, NLMSG_LARGE, NETLINK_ROUTE, group, 0, + false)) { NL_LOG(LOG_DEBUG, "error allocating group writer"); return; } @@ -1406,7 +1407,7 @@ rtnl_handle_ifevent(if_t ifp, int nlmsg_type, int if_flags_mask) if (!nl_has_listeners(NETLINK_ROUTE, RTNLGRP_LINK)) return; - if (!nl_writer_group(&nw, NLMSG_LARGE, NETLINK_ROUTE, RTNLGRP_LINK, + if (!nl_writer_group(&nw, NLMSG_LARGE, NETLINK_ROUTE, RTNLGRP_LINK, 0, false)) { NL_LOG(LOG_DEBUG, "error allocating group writer"); return; diff --git a/sys/netlink/route/neigh.c b/sys/netlink/route/neigh.c index ec58c6140db884..9eaaae26325420 100644 --- a/sys/netlink/route/neigh.c +++ b/sys/netlink/route/neigh.c @@ -566,7 +566,7 @@ rtnl_lle_event(void *arg __unused, struct llentry *lle, int evt) int nlmsgs_type = evt == LLENTRY_RESOLVED ? 
NL_RTM_NEWNEIGH : NL_RTM_DELNEIGH; - if (!nl_writer_group(&nw, NLMSG_SMALL, NETLINK_ROUTE, RTNLGRP_NEIGH, + if (!nl_writer_group(&nw, NLMSG_SMALL, NETLINK_ROUTE, RTNLGRP_NEIGH, 0, false)) { NL_LOG(LOG_DEBUG, "error allocating group writer"); return; diff --git a/sys/netlink/route/nexthop.c b/sys/netlink/route/nexthop.c index 03f1a57fd1e4a4..30aa3dd7253474 100644 --- a/sys/netlink/route/nexthop.c +++ b/sys/netlink/route/nexthop.c @@ -554,7 +554,7 @@ delete_unhop(struct unhop_ctl *ctl, struct nlmsghdr *hdr, uint32_t uidx) }; if (!nl_writer_group(&nw, NLMSG_SMALL, NETLINK_ROUTE, RTNLGRP_NEXTHOP, - false)) { + 0, false)) { NL_LOG(LOG_DEBUG, "error allocating message writer"); return (ENOMEM); } @@ -949,7 +949,7 @@ rtnl_handle_newnhop(struct nlmsghdr *hdr, struct nlpcb *nlp, }; if (!nl_writer_group(&nw, NLMSG_SMALL, NETLINK_ROUTE, RTNLGRP_NEXTHOP, - false)) { + 0, false)) { NL_LOG(LOG_DEBUG, "error allocating message writer"); return (ENOMEM); } diff --git a/sys/netlink/route/rt.c b/sys/netlink/route/rt.c index 14bd73d3341166..e90debee46da6e 100644 --- a/sys/netlink/route/rt.c +++ b/sys/netlink/route/rt.c @@ -353,7 +353,8 @@ report_operation(uint32_t fibnum, struct rib_cmd_info *rc, struct nl_writer nw; uint32_t group_id = family_to_group(rt_get_family(rc->rc_rt)); - if (nl_writer_group(&nw, NLMSG_SMALL, NETLINK_ROUTE, group_id, false)) { + if (nl_writer_group(&nw, NLMSG_SMALL, NETLINK_ROUTE, group_id, 0, + false)) { struct route_nhop_data rnd = { .rnd_nhop = rc_get_nhop(rc), .rnd_weight = rc->rc_nh_weight, @@ -1082,7 +1083,7 @@ rtnl_handle_route_event(uint32_t fibnum, const struct rib_cmd_info *rc) }; uint32_t group_id = family_to_group(family); - if (!nl_writer_group(&nw, NLMSG_SMALL, NETLINK_ROUTE, group_id, + if (!nl_writer_group(&nw, NLMSG_SMALL, NETLINK_ROUTE, group_id, 0, false)) { NL_LOG(LOG_DEBUG, "error allocating event buffer"); return; From 8a8d095718cb4e3ce84bef1cd61c20b518b8d047 Mon Sep 17 00:00:00 2001 From: Gleb Smirnoff Date: Fri, 10 Jan 2025 20:55:50 
-0800 Subject: [PATCH 136/143] netlink: add snl(3) primitive to obtain group ID using the family name and the group name as lookup arguments. Reviewed by: melifaro Differential Revision: https://reviews.freebsd.org/D48308 --- sys/netlink/netlink_snl_generic.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/sys/netlink/netlink_snl_generic.h b/sys/netlink/netlink_snl_generic.h index 0a2913c9155ee1..32b460c612bd17 100644 --- a/sys/netlink/netlink_snl_generic.h +++ b/sys/netlink/netlink_snl_generic.h @@ -127,6 +127,24 @@ snl_get_genl_family(struct snl_state *ss, const char *family_name) return (attrs.family_id); } +static inline uint16_t +snl_get_genl_mcast_group(struct snl_state *ss, const char *family_name, + const char *group_name, uint16_t *family_id) +{ + struct _getfamily_attrs attrs = {}; + + snl_get_genl_family_info(ss, family_name, &attrs); + if (attrs.family_id == 0) + return (0); + if (family_id != NULL) + *family_id = attrs.family_id; + for (u_int i = 0; i < attrs.mcast_groups.num_groups; i++) + if (strcmp(attrs.mcast_groups.groups[i]->mcast_grp_name, + group_name) == 0) + return (attrs.mcast_groups.groups[i]->mcast_grp_id); + return (0); +} + static const struct snl_hdr_parser *snl_all_genl_parsers[] = { &_genl_ctrl_getfam_parser, &_genl_ctrl_mc_parser, }; From bbe6559cf958a9016cb18ff1833ebd3a884f349f Mon Sep 17 00:00:00 2001 From: Gleb Smirnoff Date: Fri, 10 Jan 2025 20:57:55 -0800 Subject: [PATCH 137/143] netlink: fix size comparison We want to check the size of the header, not a pointer to it. 
Reviewed by: melifaro, markj Differential Revision: https://reviews.freebsd.org/D48309 --- sys/netlink/netlink_generic.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sys/netlink/netlink_generic.c b/sys/netlink/netlink_generic.c index 62353149217382..b78ab80ab3c27c 100644 --- a/sys/netlink/netlink_generic.c +++ b/sys/netlink/netlink_generic.c @@ -67,7 +67,8 @@ genl_handle_message(struct nlmsghdr *hdr, struct nl_pstate *npt) return (ENOTSUP); } - if (__predict_false(hdr->nlmsg_len < sizeof(hdr) + GENL_HDRLEN)) { + if (__predict_false(hdr->nlmsg_len < sizeof(struct nlmsghdr) + + GENL_HDRLEN)) { NLP_LOG(LOG_DEBUG, nlp, "invalid message size: %d", hdr->nlmsg_len); return (EINVAL); } From 26d1ad5a44e17d2d8c48d36755567043349d8b63 Mon Sep 17 00:00:00 2001 From: Gleb Smirnoff Date: Fri, 10 Jan 2025 20:58:08 -0800 Subject: [PATCH 138/143] netlink: snl_create_genl_msg_request() may fail due to ENOMEM Reviewed by: melifaro Differential Revision: https://reviews.freebsd.org/D48310 --- sys/netlink/netlink_snl_generic.h | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/sys/netlink/netlink_snl_generic.h b/sys/netlink/netlink_snl_generic.h index 32b460c612bd17..e2dc4d1bfffe9b 100644 --- a/sys/netlink/netlink_snl_generic.h +++ b/sys/netlink/netlink_snl_generic.h @@ -33,16 +33,24 @@ /* Genetlink helpers */ static inline struct nlmsghdr * -snl_create_genl_msg_request(struct snl_writer *nw, int genl_family, uint8_t genl_cmd) +snl_create_genl_msg_request(struct snl_writer *nw, uint16_t genl_family, + uint8_t genl_cmd) { + struct nlmsghdr *hdr; + struct genlmsghdr *ghdr; + assert(nw->hdr == NULL); - struct nlmsghdr *hdr = snl_reserve_msg_object(nw, struct nlmsghdr); + hdr = snl_reserve_msg_object(nw, struct nlmsghdr); + if (__predict_false(hdr == NULL)) + return (NULL); hdr->nlmsg_type = genl_family; hdr->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; - nw->hdr = hdr; - struct genlmsghdr *ghdr = snl_reserve_msg_object(nw, struct 
genlmsghdr); + ghdr = snl_reserve_msg_object(nw, struct genlmsghdr); + if (__predict_false(ghdr == NULL)) + return (NULL); ghdr->cmd = genl_cmd; + nw->hdr = hdr; return (hdr); } From 926d2eadcb671dd26431a1082d4c49c3d5ad7f22 Mon Sep 17 00:00:00 2001 From: Gleb Smirnoff Date: Fri, 10 Jan 2025 20:59:29 -0800 Subject: [PATCH 139/143] netlink: some refactoring of NETLINK_GENERIC layer - Statically initialize control family/group. This removes extra startup code and provides a strong guarantee that they reside at the 0 index of the respective arrays. Before a genl_register_family() with a higher SYSINIT order could try to hijack index 0. - Remove the family_id field completely. Now the family ID as well as group ID are array indices and there is basically no place for a mistake. Previous code had a bug where a KPI user could induce an ID mismatch. - Merge netlink_generic_kpi.c to netlink_generic.c. Both files are small and now there is more dependency between the control family and the family allocator. Ok'ed by melifaro@. 
Reviewed by: melifaro Differential Revision: https://reviews.freebsd.org/D48316 --- sys/conf/files | 1 - sys/netlink/netlink_ctl.h | 2 +- sys/netlink/netlink_generic.c | 291 +++++++++++++++++++++++++++--- sys/netlink/netlink_generic_kpi.c | 276 ---------------------------- sys/netlink/netlink_var.h | 4 +- 5 files changed, 273 insertions(+), 301 deletions(-) delete mode 100644 sys/netlink/netlink_generic_kpi.c diff --git a/sys/conf/files b/sys/conf/files index fc9108b5e10f3b..a02174f3d9548b 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -4481,7 +4481,6 @@ netipsec/xform_ipcomp.c optional ipsec inet | ipsec inet6 netipsec/xform_tcp.c optional ipsec inet tcp_signature | \ ipsec inet6 tcp_signature | ipsec_support inet tcp_signature | \ ipsec_support inet6 tcp_signature -netlink/netlink_generic_kpi.c standard netlink/netlink_glue.c standard netlink/netlink_message_parser.c standard netlink/netlink_domain.c optional netlink diff --git a/sys/netlink/netlink_ctl.h b/sys/netlink/netlink_ctl.h index 95b79c763ccdc3..a23e9a3a948f0d 100644 --- a/sys/netlink/netlink_ctl.h +++ b/sys/netlink/netlink_ctl.h @@ -92,7 +92,7 @@ struct genl_cmd { uint32_t cmd_num; }; -uint32_t genl_register_family(const char *family_name, size_t hdrsize, +uint16_t genl_register_family(const char *family_name, size_t hdrsize, uint16_t family_version, uint16_t max_attr_idx); bool genl_unregister_family(const char *family_name); bool genl_register_cmds(const char *family_name, const struct genl_cmd *cmds, diff --git a/sys/netlink/netlink_generic.c b/sys/netlink/netlink_generic.c index b78ab80ab3c27c..d4c84a34b850fa 100644 --- a/sys/netlink/netlink_generic.c +++ b/sys/netlink/netlink_generic.c @@ -119,7 +119,7 @@ dump_family(struct nlmsghdr *hdr, struct genlmsghdr *ghdr, ghdr_new->reserved = 0; nlattr_add_string(nw, CTRL_ATTR_FAMILY_NAME, gf->family_name); - nlattr_add_u16(nw, CTRL_ATTR_FAMILY_ID, gf->family_id); + nlattr_add_u16(nw, CTRL_ATTR_FAMILY_ID, genl_get_family_id(gf)); nlattr_add_u32(nw, 
CTRL_ATTR_VERSION, gf->family_version); nlattr_add_u32(nw, CTRL_ATTR_HDRSIZE, gf->family_hdrsize); nlattr_add_u32(nw, CTRL_ATTR_MAXATTR, gf->family_attr_max); @@ -173,9 +173,6 @@ dump_family(struct nlmsghdr *hdr, struct genlmsghdr *ghdr, static void nlctrl_notify(void *arg, const struct genl_family *gf, int action); static eventhandler_tag family_event_tag; -static uint32_t ctrl_family_id; -static uint32_t ctrl_group_id; - struct nl_parsed_family { uint32_t family_id; char *family_name; @@ -201,7 +198,7 @@ match_family(const struct genl_family *gf, const struct nl_parsed_family *attrs) { if (gf->family_name == NULL) return (false); - if (attrs->family_id != 0 && attrs->family_id != gf->family_id) + if (attrs->family_id != 0 && attrs->family_id != genl_get_family_id(gf)) return (false); if (attrs->family_name != NULL && strcmp(attrs->family_name, gf->family_name)) return (false); @@ -259,7 +256,7 @@ nlctrl_notify(void *arg __unused, const struct genl_family *gf, int cmd) struct genlmsghdr ghdr = { .cmd = cmd }; struct nl_writer nw; - if (!nl_writer_group(&nw, NLMSG_SMALL, NETLINK_GENERIC, ctrl_group_id, + if (!nl_writer_group(&nw, NLMSG_SMALL, NETLINK_GENERIC, CTRL_GROUP_ID, 0, false)) { NL_LOG(LOG_DEBUG, "error allocating group writer"); return; @@ -269,27 +266,16 @@ nlctrl_notify(void *arg __unused, const struct genl_family *gf, int cmd) nlmsg_flush(&nw); } -static const struct genl_cmd nlctrl_cmds[] = { - { - .cmd_num = CTRL_CMD_GETFAMILY, - .cmd_name = "GETFAMILY", - .cmd_cb = nlctrl_handle_getfamily, - .cmd_flags = GENL_CMD_CAP_DO | GENL_CMD_CAP_DUMP | GENL_CMD_CAP_HASPOL, - }, -}; - static const struct nlhdr_parser *all_parsers[] = { &genl_parser }; static void genl_load_all(void *u __unused) { NL_VERIFY_PARSERS(all_parsers); - ctrl_family_id = genl_register_family(CTRL_FAMILY_NAME, 0, 2, CTRL_ATTR_MAX); - genl_register_cmds(CTRL_FAMILY_NAME, nlctrl_cmds, nitems(nlctrl_cmds)); - ctrl_group_id = genl_register_group(CTRL_FAMILY_NAME, "notify"); - 
family_event_tag = EVENTHANDLER_REGISTER(genl_family_event, nlctrl_notify, NULL, - EVENTHANDLER_PRI_ANY); - netlink_register_proto(NETLINK_GENERIC, "NETLINK_GENERIC", genl_handle_message); + family_event_tag = EVENTHANDLER_REGISTER(genl_family_event, + nlctrl_notify, NULL, EVENTHANDLER_PRI_ANY); + netlink_register_proto(NETLINK_GENERIC, "NETLINK_GENERIC", + genl_handle_message); } SYSINIT(genl_load_all, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, genl_load_all, NULL); @@ -298,7 +284,268 @@ genl_unload(void *u __unused) { netlink_unregister_proto(NETLINK_GENERIC); EVENTHANDLER_DEREGISTER(genl_family_event, family_event_tag); - genl_unregister_family(CTRL_FAMILY_NAME); NET_EPOCH_WAIT(); } SYSUNINIT(genl_unload, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, genl_unload, NULL); + +/* + * Public KPI for NETLINK_GENERIC families/groups registration logic below. + */ + +static struct sx sx_lock; +SX_SYSINIT(genl_lock, &sx_lock, "genetlink lock"); +#define GENL_LOCK() sx_xlock(&sx_lock) +#define GENL_UNLOCK() sx_xunlock(&sx_lock) +#define GENL_ASSERT_LOCKED() sx_assert(&sx_lock, SA_LOCKED) +#define GENL_ASSERT_XLOCKED() sx_assert(&sx_lock, SA_XLOCKED) + +static struct genl_cmd nlctrl_cmds[] = { + [CTRL_CMD_GETFAMILY] = { + .cmd_num = CTRL_CMD_GETFAMILY, + .cmd_name = "GETFAMILY", + .cmd_cb = nlctrl_handle_getfamily, + .cmd_flags = GENL_CMD_CAP_DO | GENL_CMD_CAP_DUMP | + GENL_CMD_CAP_HASPOL, + }, +}; + +static struct genl_family families[MAX_FAMILIES] = { + [CTRL_FAMILY_ID] = { + .family_name = CTRL_FAMILY_NAME, + .family_hdrsize = 0, + .family_version = 2, + .family_attr_max = CTRL_ATTR_MAX, + .family_cmd_size = CTRL_CMD_GETFAMILY + 1, + .family_cmds = nlctrl_cmds, + .family_num_groups = 1, + }, +} +; +static struct genl_group groups[MAX_GROUPS] = { + [CTRL_GROUP_ID] = { + .group_family = &families[CTRL_FAMILY_ID], + .group_name = CTRL_GROUP_NAME, + }, +}; + +static struct genl_family * +find_family(const char *family_name) +{ + GENL_ASSERT_LOCKED(); + for (u_int i = 0; i < MAX_FAMILIES; 
i++) + if (families[i].family_name != NULL && + strcmp(families[i].family_name, family_name) == 0) + return (&families[i]); + + return (NULL); +} + +static struct genl_family * +find_empty_family_id(const char *family_name) +{ + GENL_ASSERT_LOCKED(); + /* Microoptimization: index 0 is reserved for the control family */ + for (u_int i = 1; i < MAX_FAMILIES; i++) + if (families[i].family_name == NULL) + return (&families[i]); + + return (NULL); +} + +uint16_t +genl_register_family(const char *family_name, size_t hdrsize, + uint16_t family_version, uint16_t max_attr_idx) +{ + struct genl_family *gf; + uint16_t family_id; + + GENL_LOCK(); + if (find_family(family_name) != NULL) { + GENL_UNLOCK(); + return (0); + } + + gf = find_empty_family_id(family_name); + KASSERT(gf, ("%s: maximum of %u generic netlink families allocated", + __func__, MAX_FAMILIES)); + + *gf = (struct genl_family) { + .family_name = family_name, + .family_version = family_version, + .family_hdrsize = hdrsize, + .family_attr_max = max_attr_idx, + }; + family_id = genl_get_family_id(gf); + GENL_UNLOCK(); + + NL_LOG(LOG_DEBUG2, "Registered family %s id %d", gf->family_name, + family_id); + EVENTHANDLER_INVOKE(genl_family_event, gf, CTRL_CMD_NEWFAMILY); + + return (family_id); +} + +static void +free_family(struct genl_family *gf) +{ + if (gf->family_cmds != NULL) + free(gf->family_cmds, M_NETLINK); +} + +/* + * unregister groups of a given family + */ +static void +unregister_groups(const struct genl_family *gf) +{ + + for (u_int i = 0; i < MAX_GROUPS; i++) { + struct genl_group *gg = &groups[i]; + if (gg->group_family == gf && gg->group_name != NULL) { + gg->group_family = NULL; + gg->group_name = NULL; + } + } +} + +/* + * Can sleep, I guess + */ +bool +genl_unregister_family(const char *family_name) +{ + bool found = false; + + GENL_LOCK(); + struct genl_family *gf = find_family(family_name); + + if (gf != NULL) { + EVENTHANDLER_INVOKE(genl_family_event, gf, CTRL_CMD_DELFAMILY); + found = true; + 
unregister_groups(gf); + /* TODO: zero pointer first */ + free_family(gf); + bzero(gf, sizeof(*gf)); + } + GENL_UNLOCK(); + + return (found); +} + +bool +genl_register_cmds(const char *family_name, const struct genl_cmd *cmds, + int count) +{ + struct genl_family *gf; + uint16_t cmd_size; + + GENL_LOCK(); + if ((gf = find_family(family_name)) == NULL) { + GENL_UNLOCK(); + return (false); + } + + cmd_size = gf->family_cmd_size; + + for (u_int i = 0; i < count; i++) { + MPASS(cmds[i].cmd_cb != NULL); + if (cmds[i].cmd_num >= cmd_size) + cmd_size = cmds[i].cmd_num + 1; + } + + if (cmd_size > gf->family_cmd_size) { + void *old_data; + + /* need to realloc */ + size_t sz = cmd_size * sizeof(struct genl_cmd); + void *data = malloc(sz, M_NETLINK, M_WAITOK | M_ZERO); + + memcpy(data, gf->family_cmds, + gf->family_cmd_size * sizeof(struct genl_cmd)); + old_data = gf->family_cmds; + gf->family_cmds = data; + gf->family_cmd_size = cmd_size; + free(old_data, M_NETLINK); + } + + for (u_int i = 0; i < count; i++) { + const struct genl_cmd *cmd = &cmds[i]; + + MPASS(gf->family_cmds[cmd->cmd_num].cmd_cb == NULL); + gf->family_cmds[cmd->cmd_num] = cmds[i]; + NL_LOG(LOG_DEBUG2, "Adding cmd %s(%d) to family %s", + cmd->cmd_name, cmd->cmd_num, gf->family_name); + } + GENL_UNLOCK(); + return (true); +} + +static struct genl_group * +find_group(const struct genl_family *gf, const char *group_name) +{ + for (u_int i = 0; i < MAX_GROUPS; i++) { + struct genl_group *gg = &groups[i]; + if (gg->group_family == gf && + !strcmp(gg->group_name, group_name)) + return (gg); + } + return (NULL); +} + +uint32_t +genl_register_group(const char *family_name, const char *group_name) +{ + struct genl_family *gf; + uint32_t group_id = 0; + + MPASS(family_name != NULL); + MPASS(group_name != NULL); + + GENL_LOCK(); + if ((gf = find_family(family_name)) == NULL || + find_group(gf, group_name) != NULL) { + GENL_UNLOCK(); + return (0); + } + + /* Microoptimization: index 0 is reserved for the control family 
*/ + for (u_int i = 1; i < MAX_GROUPS; i++) { + struct genl_group *gg = &groups[i]; + if (gg->group_family == NULL) { + gf->family_num_groups++; + gg->group_family = gf; + gg->group_name = group_name; + group_id = i + MIN_GROUP_NUM; + break; + } + } + GENL_UNLOCK(); + + return (group_id); +} + +/* accessors */ +struct genl_family * +genl_get_family(uint16_t family_id) +{ + return ((family_id < MAX_FAMILIES) ? &families[family_id] : NULL); +} + +const char * +genl_get_family_name(const struct genl_family *gf) +{ + return (gf->family_name); +} + +uint16_t +genl_get_family_id(const struct genl_family *gf) +{ + MPASS(gf >= &families[0] && gf < &families[MAX_FAMILIES]); + return ((uint16_t)(gf - &families[0]) + GENL_MIN_ID); +} + +struct genl_group * +genl_get_group(uint32_t group_id) +{ + return ((group_id < MAX_GROUPS) ? &groups[group_id] : NULL); +} diff --git a/sys/netlink/netlink_generic_kpi.c b/sys/netlink/netlink_generic_kpi.c deleted file mode 100644 index e6125ab893d868..00000000000000 --- a/sys/netlink/netlink_generic_kpi.c +++ /dev/null @@ -1,276 +0,0 @@ -/*- - * SPDX-License-Identifier: BSD-2-Clause - * - * Copyright (c) 2022 Alexander V. Chernikov - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#define DEBUG_MOD_NAME nl_generic_kpi -#define DEBUG_MAX_LEVEL LOG_DEBUG3 -#include -_DECLARE_DEBUG(LOG_INFO); - - -/* - * NETLINK_GENERIC families/groups registration logic - */ - -#define GENL_LOCK() sx_xlock(&sx_lock) -#define GENL_UNLOCK() sx_xunlock(&sx_lock) -static struct sx sx_lock; -SX_SYSINIT(genl_lock, &sx_lock, "genetlink lock"); - -static struct genl_family families[MAX_FAMILIES]; -static struct genl_group groups[MAX_GROUPS]; - -static struct genl_family * -find_family(const char *family_name) -{ - for (int i = 0; i < MAX_FAMILIES; i++) { - struct genl_family *gf = &families[i]; - if (gf->family_name != NULL && !strcmp(gf->family_name, family_name)) - return (gf); - } - - return (NULL); -} - -static struct genl_family * -find_empty_family_id(const char *family_name) -{ - struct genl_family *gf = NULL; - - if (!strcmp(family_name, CTRL_FAMILY_NAME)) { - gf = &families[0]; - gf->family_id = GENL_MIN_ID; - } else { - /* Index 0 is reserved for the control family */ - for (int i = 1; i < MAX_FAMILIES; i++) { - gf = &families[i]; - if (gf->family_name == NULL) { - gf->family_id = GENL_MIN_ID + i; - break; - } - } - } - - return (gf); -} - -uint32_t -genl_register_family(const char *family_name, size_t hdrsize, - uint16_t family_version, uint16_t max_attr_idx) -{ - - MPASS(family_name != NULL); - 
if (find_family(family_name) != NULL) - return (0); - - GENL_LOCK(); - - struct genl_family *gf = find_empty_family_id(family_name); - MPASS(gf != NULL); - - gf->family_name = family_name; - gf->family_version = family_version; - gf->family_hdrsize = hdrsize; - gf->family_attr_max = max_attr_idx; - NL_LOG(LOG_DEBUG2, "Registered family %s id %d", gf->family_name, - gf->family_id); - EVENTHANDLER_INVOKE(genl_family_event, gf, CTRL_CMD_NEWFAMILY); - - GENL_UNLOCK(); - - return (gf->family_id); -} - -static void -free_family(struct genl_family *gf) -{ - if (gf->family_cmds != NULL) - free(gf->family_cmds, M_NETLINK); -} - -/* - * unregister groups of a given family - */ -static void -unregister_groups(const struct genl_family *gf) -{ - - for (int i = 0; i < MAX_GROUPS; i++) { - struct genl_group *gg = &groups[i]; - if (gg->group_family == gf && gg->group_name != NULL) { - gg->group_family = NULL; - gg->group_name = NULL; - } - } -} - -/* - * Can sleep, I guess - */ -bool -genl_unregister_family(const char *family_name) -{ - bool found = false; - - GENL_LOCK(); - struct genl_family *gf = find_family(family_name); - - if (gf != NULL) { - EVENTHANDLER_INVOKE(genl_family_event, gf, CTRL_CMD_DELFAMILY); - found = true; - unregister_groups(gf); - /* TODO: zero pointer first */ - free_family(gf); - bzero(gf, sizeof(*gf)); - } - GENL_UNLOCK(); - - return (found); -} - -bool -genl_register_cmds(const char *family_name, const struct genl_cmd *cmds, int count) -{ - GENL_LOCK(); - struct genl_family *gf = find_family(family_name); - if (gf == NULL) { - GENL_UNLOCK(); - return (false); - } - - int cmd_size = gf->family_cmd_size; - - for (int i = 0; i < count; i++) { - MPASS(cmds[i].cmd_cb != NULL); - if (cmds[i].cmd_num >= cmd_size) - cmd_size = cmds[i].cmd_num + 1; - } - - if (cmd_size > gf->family_cmd_size) { - /* need to realloc */ - size_t sz = cmd_size * sizeof(struct genl_cmd); - void *data = malloc(sz, M_NETLINK, M_WAITOK | M_ZERO); - - memcpy(data, gf->family_cmds, 
gf->family_cmd_size * sizeof(struct genl_cmd)); - void *old_data = gf->family_cmds; - gf->family_cmds = data; - gf->family_cmd_size = cmd_size; - free(old_data, M_NETLINK); - } - - for (int i = 0; i < count; i++) { - const struct genl_cmd *cmd = &cmds[i]; - MPASS(gf->family_cmds[cmd->cmd_num].cmd_cb == NULL); - gf->family_cmds[cmd->cmd_num] = cmds[i]; - NL_LOG(LOG_DEBUG2, "Adding cmd %s(%d) to family %s", - cmd->cmd_name, cmd->cmd_num, gf->family_name); - } - GENL_UNLOCK(); - return (true); -} - -static struct genl_group * -find_group(const struct genl_family *gf, const char *group_name) -{ - for (int i = 0; i < MAX_GROUPS; i++) { - struct genl_group *gg = &groups[i]; - if (gg->group_family == gf && !strcmp(gg->group_name, group_name)) - return (gg); - } - return (NULL); -} - -uint32_t -genl_register_group(const char *family_name, const char *group_name) -{ - uint32_t group_id = 0; - - MPASS(family_name != NULL); - MPASS(group_name != NULL); - - GENL_LOCK(); - struct genl_family *gf = find_family(family_name); - - if (gf == NULL || find_group(gf, group_name) != NULL) { - GENL_UNLOCK(); - return (0); - } - - for (int i = 0; i < MAX_GROUPS; i++) { - struct genl_group *gg = &groups[i]; - if (gg->group_family == NULL) { - gf->family_num_groups++; - gg->group_family = gf; - gg->group_name = group_name; - group_id = i + MIN_GROUP_NUM; - break; - } - } - GENL_UNLOCK(); - - return (group_id); -} - -/* accessors */ -struct genl_family * -genl_get_family(uint16_t family_id) -{ - return ((family_id < MAX_FAMILIES) ? &families[family_id] : NULL); -} - -const char * -genl_get_family_name(const struct genl_family *gf) -{ - return (gf->family_name); -} - -uint16_t -genl_get_family_id(const struct genl_family *gf) -{ - return (gf->family_id); -} - -struct genl_group * -genl_get_group(uint32_t group_id) -{ - return ((group_id < MAX_GROUPS) ? 
&groups[group_id] : NULL); -} - diff --git a/sys/netlink/netlink_var.h b/sys/netlink/netlink_var.h index 34cba0b28d2712..87b9f5aaaecdef 100644 --- a/sys/netlink/netlink_var.h +++ b/sys/netlink/netlink_var.h @@ -147,7 +147,6 @@ void nl_buf_free(struct nl_buf *nb); struct genl_family { const char *family_name; uint16_t family_hdrsize; - uint16_t family_id; uint16_t family_version; uint16_t family_attr_max; uint16_t family_cmd_size; @@ -168,7 +167,10 @@ struct genl_group *genl_get_group(uint32_t group_id); #define MIN_GROUP_NUM 48 +#define CTRL_FAMILY_ID 0 #define CTRL_FAMILY_NAME "nlctrl" +#define CTRL_GROUP_ID 0 +#define CTRL_GROUP_NAME "notify" struct ifnet; struct nl_parsed_link; From 4dc1820a16b9b6108e0ff8a0265c08c67fa34146 Mon Sep 17 00:00:00 2001 From: Gleb Smirnoff Date: Fri, 10 Jan 2025 21:08:02 -0800 Subject: [PATCH 140/143] libbsnmp: avoid division by zero with empty password PR: 283909 --- contrib/bsnmp/lib/snmpclient.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/bsnmp/lib/snmpclient.c b/contrib/bsnmp/lib/snmpclient.c index ab3ac2d18e6464..b312a37ed3edcd 100644 --- a/contrib/bsnmp/lib/snmpclient.c +++ b/contrib/bsnmp/lib/snmpclient.c @@ -1792,7 +1792,7 @@ snmp_discover_engine(char *passwd) if (snmp_client.user.auth_proto == SNMP_AUTH_NOAUTH) return (0); - if (passwd == NULL || + if (passwd == NULL || strlen(passwd) == 0 || snmp_passwd_to_keys(&snmp_client.user, passwd) != SNMP_CODE_OK || snmp_get_local_keys(&snmp_client.user, snmp_client.engine.engine_id, snmp_client.engine.engine_len) != SNMP_CODE_OK) From 3999a860d6e899de98b1025317d2d0ef1f83255f Mon Sep 17 00:00:00 2001 From: Gleb Smirnoff Date: Fri, 10 Jan 2025 21:08:02 -0800 Subject: [PATCH 141/143] libbsnmptools: avoid uninitialized snmptoolctx->passwd with empty password The removed check left snmptoolctx->passwd pointer to uninitialized memory. Always calling strlcpy(3) would guarantee that with empty password it will point to empty string. 
Submitted by: markj PR: 283909 --- usr.sbin/bsnmpd/tools/libbsnmptools/bsnmptools.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/usr.sbin/bsnmpd/tools/libbsnmptools/bsnmptools.c b/usr.sbin/bsnmpd/tools/libbsnmptools/bsnmptools.c index a9d44cee43648d..fb09e1ac785eb4 100644 --- a/usr.sbin/bsnmpd/tools/libbsnmptools/bsnmptools.c +++ b/usr.sbin/bsnmpd/tools/libbsnmptools/bsnmptools.c @@ -178,8 +178,7 @@ snmptool_init(struct snmp_toolinfo *snmptoolctx) warn("malloc() failed"); return (-1); } - if (slen > 0) - strlcpy(snmptoolctx->passwd, str, slen + 1); + strlcpy(snmptoolctx->passwd, str, slen + 1); } return (0); From 76658cd70add383dba14d1f71df3d41dedb77015 Mon Sep 17 00:00:00 2001 From: Ahmad Khalifa Date: Sat, 11 Jan 2025 15:31:23 +0200 Subject: [PATCH 142/143] install-boot.sh: Expose configuration variables Expose configuration variables if we're being sourced. This provides a convenient way to check them in the release scripts. Signed-off-by: Ahmad Khalifa --- tools/boot/install-boot.sh | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tools/boot/install-boot.sh b/tools/boot/install-boot.sh index 217bf0ff14571a..dd369dd3201056 100755 --- a/tools/boot/install-boot.sh +++ b/tools/boot/install-boot.sh @@ -450,4 +450,18 @@ mbr2=${srcroot}/boot/boot # sourced, so we shouldn't run anything. if [ -n "${dev}" ]; then eval boot_${geli}_${scheme}_${fs}_${bios} $dev $srcroot $opts || echo "Unsupported boot env: ${geli}-${scheme}-${fs}-${bios}" +elif [ $(basename "$0") != "install-boot.sh" ]; then + # If we're being sourced, give the sourcer + # the configuration variables. + + srctop=$(dirname $(realpath $0))/../.. 
+ _=$IFS + + IFS=$'\n' + for i in $(make -C $srctop showconfig) + do + setvar ${i%%[[:space:]]*=*} ${i##*=[[:space:]]} + done + + IFS=$_ fi From da3e113138e32bff6e322706ae49187e8100ab39 Mon Sep 17 00:00:00 2001 From: Ahmad Khalifa Date: Sat, 11 Jan 2025 15:33:20 +0200 Subject: [PATCH 143/143] release/amd64: Check config variables instead of checking files Check the MK_LOADER_IA32 variable instead of manually checking if the file exists. Signed-off-by: Ahmad Khalifa --- release/amd64/make-memstick.sh | 8 +++----- release/amd64/mkisoimages.sh | 8 +++----- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/release/amd64/make-memstick.sh b/release/amd64/make-memstick.sh index cbb80e97134350..e140316825bf35 100755 --- a/release/amd64/make-memstick.sh +++ b/release/amd64/make-memstick.sh @@ -60,12 +60,10 @@ fi # Make an ESP in a file. espfilename=$(mktemp /tmp/efiboot.XXXXXX) -if [ -f "${BASEBITSDIR}/boot/loader_ia32.efi" ]; then - make_esp_file ${espfilename} ${fat32min} ${BASEBITSDIR}/boot/loader.efi bootx64 \ - ${BASEBITSDIR}/boot/loader_ia32.efi bootia32 -else - make_esp_file ${espfilename} ${fat32min} ${BASEBITSDIR}/boot/loader.efi +if [ ${MK_LOADER_IA32} = "yes" ]; then + extra_args="${BASEBITSDIR}/boot/loader_ia32.efi bootia32" fi +make_esp_file ${espfilename} ${fat32min} ${BASEBITSDIR}/boot/loader.efi bootx64 ${extra_args} mkimg -s mbr \ -b ${BASEBITSDIR}/boot/mbr \ diff --git a/release/amd64/mkisoimages.sh b/release/amd64/mkisoimages.sh index 245beb660c3fee..51681edf360a05 100644 --- a/release/amd64/mkisoimages.sh +++ b/release/amd64/mkisoimages.sh @@ -64,12 +64,10 @@ if [ "$1" = "-b" ]; then espfilename=$(mktemp /tmp/efiboot.XXXXXX) # ESP file size in KB. 
espsize="2048" - if [ -f "${BASEBITSDIR}/boot/loader_ia32.efi" ]; then - make_esp_file ${espfilename} ${espsize} ${BASEBITSDIR}/boot/loader.efi bootx64 \ - ${BASEBITSDIR}/boot/loader_ia32.efi bootia32 - else - make_esp_file ${espfilename} ${espsize} ${BASEBITSDIR}/boot/loader.efi + if [ ${MK_LOADER_IA32} = "yes" ]; then + extra_args="${BASEBITSDIR}/boot/loader_ia32.efi bootia32" fi + make_esp_file ${espfilename} ${espsize} ${BASEBITSDIR}/boot/loader.efi bootx64 ${extra_args} bootable="$bootable -o bootimage=i386;${espfilename} -o no-emul-boot -o platformid=efi" shift