From aaca777268028413f833307476e27fc22d588267 Mon Sep 17 00:00:00 2001
From: Alwin Garside
Date: Sat, 20 Dec 2025 20:36:56 +0100
Subject: [PATCH] feat: new `preg_escape()` function

---
 README.md               |   7 +++
 composer.json           |   1 +
 src/pcre.php            | 111 +++++++++++++++++++++++++++++++++++
 tests/Unit/pcreTest.php | 125 ++++++++++++++++++++++++++++++++++++++++
 4 files changed, 244 insertions(+)
 create mode 100644 src/pcre.php
 create mode 100644 tests/Unit/pcreTest.php

diff --git a/README.md b/README.md
index 25ee9d5..c36db35 100644
--- a/README.md
+++ b/README.md
@@ -112,6 +112,13 @@ _Functions_:
     — Replaces a suffix in a path with a new value.
 
 
+### PCRE Functions
+
+  - [preg_escape()](https://usephul.empaphy.org/packages/PCRE.html#function_preg_escape)
+    — Escapes all instances of the given PCRE delimiter character in a raw
+      regular expression pattern.
+
+
 ### SPL Functions
 
   - [class_parents_uses()](https://usephul.empaphy.org/packages/Other-SPL.html#function_class_parents_uses)
diff --git a/composer.json b/composer.json
index b112ce2..90377c2 100644
--- a/composer.json
+++ b/composer.json
@@ -32,6 +32,7 @@
             "src/generators.php",
             "src/Math/functions.php",
             "src/other.php",
+            "src/pcre.php",
             "src/Path/functions.php",
             "src/Type/functions.php",
             "src/Var/functions.php"
diff --git a/src/pcre.php b/src/pcre.php
new file mode 100644
index 0000000..8c56b41
--- /dev/null
+++ b/src/pcre.php
@@ -0,0 +1,111 @@
+<?php
+
+/**
+ * @author Alwin Garside
+ * @copyright 2025 The Empaphy Project
+ * @license MIT
+ * @package PCRE
+ */
+
+declare(strict_types=1);
+
+namespace empaphy\usephul;
+
+use ValueError;
+
+use function assert;
+use function preg_last_error_msg;
+use function preg_match;
+use function preg_quote;
+use function preg_replace;
+use function sprintf;
+use function str_contains;
+use function strlen;
+
+/**
+ * Escapes all instances of the given PCRE delimiter character in a raw regular
+ * expression pattern.
+ *
+ * {@see preg_escape()} takes __pattern__ and puts a backslash in front
+ * of every unescaped __delimiter__. This is useful to prepare raw regular
+ * expression patterns for use with PHP's PCRE functions.
+ *
+ * For example:
+ *
+ *     preg_escape('foo_bar', '_'); // returns `foo\_bar`
+ *     preg_escape('foo\\_bar', '_'); // returns `foo\_bar`
+ *
+ * @param string $pattern
+ *   The input pattern.
+ *
+ * @param non-empty-string $delimiter
+ *   The delimiter to be escaped. Must be a single non-alphanumeric,
+ *   non-backslash, non-whitespace character.
+ *
+ *   This function doesn't support bracket style delimiters (`(`, `)`,
+ *   `{`, `}`, `[`, `]`, `<`, and `>`).
+ *
+ * @return string
+ *   The __pattern__ with all instances of __delimiter__ escaped where needed.
+ *
+ * @throws ValueError
+ *   Thrown if __delimiter__ is not a single non-alphanumeric, non-backslash,
+ *   non-whitespace character, or if it is a bracket style delimiter.
+ */
+function preg_escape(string $pattern, string $delimiter): string
+{
+    // A delimiter can be any non-alphanumeric, non-backslash, non-whitespace
+    // character. The `D` modifier stops `$` from also matching before a
+    // trailing newline, so a value such as "/\n" is rejected as well.
+    $matched = preg_match('/^[^[:alnum:]\\\\[:space:](){}\\[\\]<>]$/D', $delimiter);
+    if (! $matched) {
+        assert($matched !== false, preg_last_error_msg());
+
+        if (strlen($delimiter) !== 1) {
+            throw new ValueError(
+                sprintf(
+                    '%s(): Argument #2 ($delimiter) must be a single character',
+                    __FUNCTION__,
+                ),
+            );
+        }
+
+        if (str_contains('(){}[]<>', $delimiter)) {
+            throw new ValueError(
+                sprintf(
+                    '%s(): Argument #2 ($delimiter) cannot be a bracket style '
+                    . 'delimiter',
+                    __FUNCTION__,
+                ),
+            );
+        }
+
+        throw new ValueError(
+            sprintf(
+                '%s(): Argument #2 ($delimiter) must be a non-alphanumeric, '
+                . 'non-backslash, non-whitespace character',
+                __FUNCTION__,
+            ),
+        );
+    }
+
+    // Nothing to escape in an empty pattern.
+    if ($pattern === '') {
+        return $pattern;
+    }
+
+    // Only escape a delimiter that is preceded by an even number of
+    // backslashes (zero included), i.e. one that isn't escaped already.
+    // `(?<!\\)` starts the match at the head of a backslash run, the atomic
+    // group consumes whole `\\` pairs without backtracking, and `\K` drops
+    // those pairs from the match so just the delimiter gets replaced.
+    $pattern = preg_replace(
+        '/(?<!\\\\)(?>\\\\\\\\)*\K' . preg_quote($delimiter, '/') . '/',
+        '\\\\' . $delimiter,
+        $pattern,
+    );
+
+    assert($pattern !== null, preg_last_error_msg());
+
+    return $pattern;
+}
diff --git a/tests/Unit/pcreTest.php b/tests/Unit/pcreTest.php
new file mode 100644
index 0000000..0c1ec53
--- /dev/null
+++ b/tests/Unit/pcreTest.php
@@ -0,0 +1,125 @@
+<?php
+
+/**
+ * @author Alwin Garside
+ * @copyright 2025 The Empaphy Project
+ * @license MIT
+ *
+ * @noinspection StaticClosureCanBeUsedInspection
+ */
+
+declare(strict_types=1);
+
+namespace Pest\Unit\pcre;
+
+use Generator;
+use ValueError;
+
+use function empaphy\usephul\generators\seq;
+use function empaphy\usephul\preg_escape;
+
+function alphanumeric_characters(): Generator
+{
+    for ($i = 0x30; $i < 0x3a; $i++) {
+        yield [chr($i)];
+    }
+
+    for ($i = 0x41; $i < 0x5b; $i++) {
+        yield [chr($i)];
+    }
+
+    for ($i = 0x61; $i < 0x7b; $i++) {
+        yield [chr($i)];
+    }
+}
+
+/**
+ * Returns all whitespace characters.
+ *
+ * > The space characters are HT (9), LF (10), VT (11), FF (12),
+ * > CR (13), and space (32). Notice that this list includes the VT
+ * > character (code 11).
+ * See: https://www.php.net/regexp.reference.character-classes
+ */
+function whitespace_characters(): Generator
+{
+    for ($i = 9; $i < 14; $i++) {
+        yield [chr($i)];
+    }
+
+    yield [chr(32)];
+}
+
+function delimiters(): Generator
+{
+    $delimiters = '!"#$%&\'*+,-./:;=?@^_`|~';
+
+    yield from seq($delimiters);
+}
+
+describe('preg_escape()', function () {
+    test('escapes delimiters when appropriate', function ($pattern, $delimiter, $expected) {
+        $value = preg_escape($pattern, $delimiter);
+        expect($value)->toBe($expected);
+    })->with([
+        ['pattern' => 'foo/bar', 'delimiter' => '/', 'expected' => 'foo\\/bar'],
+        ['pattern' => 'foo\\/bar', 'delimiter' => '/', 'expected' => 'foo\\/bar'],
+        ['pattern' => 'foo\\\\/bar', 'delimiter' => '/', 'expected' => 'foo\\\\\\/bar'],
+        ['pattern' => '', 'delimiter' => '/', 'expected' => ''],
+    ]);
+
+    test('returns valid patterns for all delimiters', function ($expression, $delimiter, $subject) {
+        $pattern = $delimiter . preg_escape($expression, $delimiter) . $delimiter;
+        $matched = preg_match($pattern, $subject, $matches);
+        expect($matched)->toBe(1)->and($matches[0])->toBe($expression);
+    })->with(function () {
+        foreach (delimiters() as $delimiter) {
+            yield ["foo{$delimiter}bar", $delimiter, "quxfoo{$delimiter}barbaz"];
+        }
+    });
+
+    test('throws ValueError when $delimiter is empty', function () {
+        preg_escape('foo', ''); // @phpstan-ignore argument.type
+    })->throws(ValueError::class, 'preg_escape(): Argument #2 ($delimiter) must be a single character');
+
+    test('throws ValueError when $delimiter is longer than a single character', function () {
+        preg_escape('foo', '//');
+    })->throws(ValueError::class, 'preg_escape(): Argument #2 ($delimiter) must be a single character');
+
+    test('throws ValueError when $delimiter has a trailing newline', function () {
+        preg_escape('foo', "/\n");
+    })->throws(ValueError::class, 'preg_escape(): Argument #2 ($delimiter) must be a single character');
+
+    test('throws `ValueError` when the delimiter is alphanumeric', function ($delimiter) {
+        preg_escape('foo', $delimiter);
+    })->throws(
+        ValueError::class,
+        'preg_escape(): Argument #2 ($delimiter) must be a '
+        . 'non-alphanumeric, non-backslash, non-whitespace character',
+    )->with(alphanumeric_characters(...));
+
+    test('throws `ValueError` when the delimiter is a backslash', function () {
+        preg_escape('foo', '\\');
+    })->throws(
+        ValueError::class,
+        'preg_escape(): Argument #2 ($delimiter) must be a '
+        . 'non-alphanumeric, non-backslash, non-whitespace character',
+    );
+
+    test('throws `ValueError` when the delimiter is whitespace', function ($delimiter) {
+        preg_escape('foo', $delimiter);
+    })->throws(
+        ValueError::class,
+        'preg_escape(): Argument #2 ($delimiter) must be a '
+        . 'non-alphanumeric, non-backslash, non-whitespace character',
+    )->with(whitespace_characters(...));
+
+    test('throws `ValueError` when given a bracket style `$delimiter`', function ($delimiter) {
+        preg_escape('foo', $delimiter);
+    })->throws(
+        ValueError::class,
+        'preg_escape(): Argument #2 ($delimiter) cannot be a bracket style delimiter',
+    )->with(function () {
+        yield from seq('(){}[]<>');
+    });
+});