From 935d4f0e42e81e1e7a4f95b949e9e72be9e4e7b8 Mon Sep 17 00:00:00 2001 From: Inada Naoki Date: Fri, 2 Jan 2026 13:35:04 +0900 Subject: [PATCH 1/5] first implementation of d-string --- Lib/test/test_dstring.py | 47 ++++++++++ Lib/test/test_tokenize.py | 4 +- Lib/tokenize.py | 3 +- Parser/action_helpers.c | 180 ++++++++++++++++++++++++++++++++++++-- Parser/lexer/lexer.c | 59 ++++++++----- Parser/string_parser.c | 119 ++++++++++++++++++++++++- 6 files changed, 376 insertions(+), 36 deletions(-) create mode 100644 Lib/test/test_dstring.py diff --git a/Lib/test/test_dstring.py b/Lib/test/test_dstring.py new file mode 100644 index 00000000000000..09592980dee8c5 --- /dev/null +++ b/Lib/test/test_dstring.py @@ -0,0 +1,47 @@ +import unittest + + +class DStringTestCase(unittest.TestCase): + def assertAllRaise(self, exception_type, regex, error_strings): + for str in error_strings: + with self.subTest(str=str): + with self.assertRaisesRegex(exception_type, regex) as cm: + eval(str) + # print("Testing expression:", repr(str)) + # print(repr(cm.exception)) + # print(repr(cm.exception.text)) + + def test_single_quote(self): + exprs = [ + "d'hello'", + 'D"hello"', + "d'hello\\nworld'", + ] + self.assertAllRaise(SyntaxError, "d-string must be triple-quoted", exprs) + + def test_empty_dstring(self): + exprs = [ + "d''''''", + 'D""""""', + ] + self.assertAllRaise(SyntaxError, "d-string must start with a newline", exprs) + + def test_no_last_newline(self): + exprs = [ + "d'''\nhello world'''", + 'D"""\nhello world"""', + "df'''\nhello {42}'''", + ] + self.assertAllRaise(SyntaxError, "d-string must end with an indent line", exprs) + + def test_simple_dstring(self): + self.assertEqual(eval('d"""\n hello world\n """'), "hello world\n") + self.assertEqual(eval('d"""\n hello world\n """'), " hello world\n") + self.assertEqual(eval('d"""\n hello world\n"""'), " hello world\n") + self.assertEqual(eval('d"""\n hello world\\\n """'), " hello world") + self.assertEqual(eval('dr"""\n hello 
world\\\n """'), " hello world\\\n") + + + +if __name__ == '__main__': + unittest.main() diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index ca67e381958757..2110f4c6a48320 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -3420,7 +3420,7 @@ def determine_valid_prefixes(): # some uppercase-only prefix is added. for letter in itertools.chain(string.ascii_lowercase, string.ascii_uppercase): try: - eval(f'{letter}""') + eval(f'{letter}"""\n"""') # d-string needs multiline single_char_valid_prefixes.add(letter.lower()) except SyntaxError: pass @@ -3444,7 +3444,7 @@ def determine_valid_prefixes(): # because it's a valid expression: not "" continue try: - eval(f'{p}""') + eval(f'{p}"""\n"""') # d-string needs multiline # No syntax error, so p is a valid string # prefix. diff --git a/Lib/tokenize.py b/Lib/tokenize.py index 11c134482db024..74a709024174b6 100644 --- a/Lib/tokenize.py +++ b/Lib/tokenize.py @@ -86,7 +86,8 @@ def _all_string_prefixes(): # The valid string prefixes. Only contain the lower case versions, # and don't contain any permutations (include 'fr', but not # 'rf'). The various permutations will be generated. 
- _valid_string_prefixes = ['b', 'r', 'u', 'f', 't', 'br', 'fr', 'tr'] + _valid_string_prefixes = ['b', 'r', 'u', 'f', 't', 'd', 'br', 'fr', 'tr', + 'bd', 'rd', 'fd', 'td', 'brd', 'frd', 'trd'] # if we add binary f-strings, add: ['fb', 'fbr'] result = {''} for prefix in _valid_string_prefixes: diff --git a/Parser/action_helpers.c b/Parser/action_helpers.c index 50856686335a14..2f143788ff53ab 100644 --- a/Parser/action_helpers.c +++ b/Parser/action_helpers.c @@ -1292,24 +1292,124 @@ _PyPegen_nonparen_genexp_in_call(Parser *p, expr_ty args, asdl_comprehension_seq // Fstring stuff +static int +unicodewriter_write_line(Parser *p, PyUnicodeWriter *w, const char *line_start, const char *line_end, + int is_raw, Token* token) +{ + if (is_raw || memchr(line_start, '\\', line_end - line_start) == NULL) { + return PyUnicodeWriter_WriteUTF8(w, line_start, line_end - line_start); + } + else { + PyObject *line = _PyPegen_decode_string(p, 1, line_start, line_end - line_start, token); + if (line == NULL || PyUnicodeWriter_WriteStr(w, line) < 0) { + Py_XDECREF(line); + return -1; + } + Py_DECREF(line); + } + return 0; +} + +static PyObject* +_PyPegen_dedent_string_part(Parser *p, const char *s, size_t len, int indent_char, Py_ssize_t dedent_count, + int is_raw, int is_first, expr_ty constant, Token* token) +{ + Py_ssize_t lineno = constant->lineno; + const char *line_start = s; + const char *s_end = s + len; + + PyUnicodeWriter *w = PyUnicodeWriter_Create(len); + if (w == NULL) { + return NULL; + } + if (is_first) { + assert (line_start[0] == '\n'); + line_start++; // skip the first newline + } + else { + // Example: df""" + // first part {param} second part + // next line + // """" + // We don't need to dedent the first line in the non-first parts. 
+ const char *line_end = memchr(line_start, '\n', s_end - line_start); + if (line_end) { + line_end++; // include the newline + } + else { + line_end = s_end; + } + if (unicodewriter_write_line(p, w, line_start, line_end, is_raw, token) < 0) { + PyUnicodeWriter_Discard(w); + return NULL; + } + line_start = line_end; + } + + while (line_start < s + len) { + lineno++; + + Py_ssize_t i = 0; + while (line_start + i < s_end && i < dedent_count && line_start[i] == indent_char) { + i++; + } + + if (line_start[i] == '\0') { // found an empty line without newline. + break; + } + if (line_start[i] == '\n') { // found an empty line with newline. + if (PyUnicodeWriter_WriteChar(w, '\n') < 0) { + PyUnicodeWriter_Discard(w); + return NULL; + } + line_start += i+1; + continue; + } + if (i < dedent_count) { // found an invalid indent. + assert(line_start[i] != indent_char); + PyUnicodeWriter_Discard(w); + RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError, lineno, i, lineno, i+1, + "d-string line missing valid indentation"); + return NULL; + } + + // found a indented line. let's dedent it. 
+ line_start += i; + const char *line_end = memchr(line_start, '\n', s_end - line_start); + if (line_end) { + line_end++; // include the newline + } + else { + line_end = s_end; + } + if (unicodewriter_write_line(p, w, line_start, line_end, is_raw, token) < 0) { + PyUnicodeWriter_Discard(w); + return NULL; + } + line_start = line_end; + } + return PyUnicodeWriter_Finish(w); +} + static expr_ty -_PyPegen_decode_fstring_part(Parser* p, int is_raw, expr_ty constant, Token* token) { +_PyPegen_decode_fstring_part(Parser* p, int is_first, int is_raw, int indent_char, Py_ssize_t dedent_count, expr_ty constant, Token* token) { assert(PyUnicode_CheckExact(constant->v.Constant.value)); const char* bstr = PyUnicode_AsUTF8(constant->v.Constant.value); if (bstr == NULL) { return NULL; } + is_raw = is_raw || strchr(bstr, '\\') == NULL; - size_t len; - if (strcmp(bstr, "{{") == 0 || strcmp(bstr, "}}") == 0) { - len = 1; - } else { - len = strlen(bstr); + PyObject *str = NULL; + if (dedent_count > 0) { + str = _PyPegen_dedent_string_part(p, bstr, strlen(bstr), indent_char, dedent_count, + is_raw, is_first, constant, token); + } + else { + str = _PyPegen_decode_string(p, is_raw, bstr, strlen(bstr), token); } - is_raw = is_raw || strchr(bstr, '\\') == NULL; - PyObject *str = _PyPegen_decode_string(p, is_raw, bstr, len, token); if (str == NULL) { _Pypegen_raise_decode_error(p); return NULL; @@ -1340,12 +1440,74 @@ _get_resized_exprs(Parser *p, Token *a, asdl_expr_seq *raw_expressions, Token *b return NULL; } int is_raw = strpbrk(quote_str, "rR") != NULL; + int is_dedent = strpbrk(quote_str, "dD") != NULL; + int indent_char = 0; + Py_ssize_t indent_count = 0; asdl_expr_seq *seq = _Py_asdl_expr_seq_new(total_items, p->arena); if (seq == NULL) { return NULL; } + if (is_dedent) { + expr_ty first_item = asdl_seq_GET(raw_expressions, 0); + if (first_item->kind != Constant_kind + || PyUnicode_ReadChar(first_item->v.Constant.value, 0) != '\n') { + RAISE_SYNTAX_ERROR_KNOWN_LOCATION( + 
first_item, + "d-string must start with a newline" + ); + return NULL; + } + + expr_ty last_item = asdl_seq_GET(raw_expressions, n_items - 1); + if (last_item->kind != Constant_kind) { + RAISE_SYNTAX_ERROR_KNOWN_LOCATION( + last_item, + "d-string must end with an indent line" + ); + return NULL; + } + + Py_ssize_t blen; + const char *bstr = PyUnicode_AsUTF8AndSize(last_item->v.Constant.value, &blen); + if (bstr == NULL) { + return NULL; + } + + // memrchr is GNU extension; use manual loop for portability. + const char *lastline = bstr + blen; + while (bstr < lastline) { + if (lastline[-1] == '\n') { + break; + } + lastline--; + if (*lastline != ' ' && *lastline != '\t') { + RAISE_SYNTAX_ERROR_KNOWN_LOCATION( + last_item, + "d-string must end with an indent line" + ); + return NULL; + } + } + + // checks indent of the last line. + indent_count = bstr + blen - lastline; + if (indent_count > 0) { + indent_char = lastline[0]; + + for (Py_ssize_t i = 1; i < indent_count; i++) { + if (lastline[i] != indent_char) { + RAISE_ERROR_KNOWN_LOCATION( + p, PyExc_TabError, last_item->end_lineno, i, last_item->end_lineno, i+1, + "inconsistent use of tabs and spaces in indentation" + ); + return NULL; + } + } + } + } + Py_ssize_t index = 0; for (Py_ssize_t i = 0; i < n_items; i++) { expr_ty item = asdl_seq_GET(raw_expressions, i); @@ -1377,7 +1539,7 @@ _get_resized_exprs(Parser *p, Token *a, asdl_expr_seq *raw_expressions, Token *b } if (item->kind == Constant_kind) { - item = _PyPegen_decode_fstring_part(p, is_raw, item, b); + item = _PyPegen_decode_fstring_part(p, i == 0, is_raw, indent_char, indent_count, item, b); if (item == NULL) { return NULL; } diff --git a/Parser/lexer/lexer.c b/Parser/lexer/lexer.c index 7f25afec302c22..dc3d369c023119 100644 --- a/Parser/lexer/lexer.c +++ b/Parser/lexer/lexer.c @@ -455,7 +455,7 @@ tok_continuation_line(struct tok_state *tok) { static int maybe_raise_syntax_error_for_string_prefixes(struct tok_state *tok, int saw_b, int saw_r, int saw_u, - 
int saw_f, int saw_t) { + int saw_f, int saw_t, int saw_d) { // Supported: rb, rf, rt (in any order) // Unsupported: ub, ur, uf, ut, bf, bt, ft (in any order) @@ -480,6 +480,9 @@ maybe_raise_syntax_error_for_string_prefixes(struct tok_state *tok, if (saw_u && saw_t) { RETURN_SYNTAX_ERROR("u", "t"); } + if (saw_u && saw_d) { + RETURN_SYNTAX_ERROR("u", "d"); + } if (saw_b && saw_f) { RETURN_SYNTAX_ERROR("b", "f"); @@ -487,6 +490,9 @@ maybe_raise_syntax_error_for_string_prefixes(struct tok_state *tok, if (saw_b && saw_t) { RETURN_SYNTAX_ERROR("b", "t"); } + if (saw_b && saw_d) { + RETURN_SYNTAX_ERROR("b", "d"); + } if (saw_f && saw_t) { RETURN_SYNTAX_ERROR("f", "t"); @@ -741,8 +747,8 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t /* Identifier (most frequent token!) */ nonascii = 0; if (is_potential_identifier_start(c)) { - /* Process the various legal combinations of b"", r"", u"", and f"". */ - int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0, saw_t = 0; + /* Process the various legal combinations of b"", r"", u"", f"", and d"". 
*/ + int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0, saw_t = 0, saw_d = 0; while (1) { if (!saw_b && (c == 'b' || c == 'B')) { saw_b = 1; @@ -762,6 +768,9 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t else if (!saw_t && (c == 't' || c == 'T')) { saw_t = 1; } + else if (!saw_d && (c == 'd' || c == 'D')) { + saw_d = 1; + } else { break; } @@ -769,7 +778,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t if (c == '"' || c == '\'') { // Raise error on incompatible string prefixes: int status = maybe_raise_syntax_error_for_string_prefixes( - tok, saw_b, saw_r, saw_u, saw_f, saw_t); + tok, saw_b, saw_r, saw_u, saw_f, saw_t, saw_d); if (status < 0) { return MAKE_TOKEN(ERRORTOKEN); } @@ -1049,7 +1058,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t } f_string_quote: - if (((Py_TOLOWER(*tok->start) == 'f' || Py_TOLOWER(*tok->start) == 'r' || Py_TOLOWER(*tok->start) == 't') + if (((Py_TOLOWER(*tok->start) == 'f' || Py_TOLOWER(*tok->start) == 'r' || Py_TOLOWER(*tok->start) == 't' || Py_TOLOWER(*tok->start) == 'd') && (c == '\'' || c == '"'))) { int quote = c; @@ -1089,6 +1098,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t the_current_tok->kind = TOK_FSTRING_MODE; the_current_tok->quote = quote; the_current_tok->quote_size = quote_size; + the_current_tok->raw = 0; the_current_tok->start = tok->start; the_current_tok->multi_line_start = tok->line_start; the_current_tok->first_line = tok->lineno; @@ -1101,25 +1111,28 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t the_current_tok->in_debug = 0; enum string_kind_t string_kind = FSTRING; - switch (*tok->start) { - case 'T': - case 't': - the_current_tok->raw = Py_TOLOWER(*(tok->start + 1)) == 'r'; - string_kind = TSTRING; - break; - case 'F': - case 'f': - the_current_tok->raw = Py_TOLOWER(*(tok->start + 1)) == 'r'; - break; - case 'R': - case 'r': 
- the_current_tok->raw = 1; - if (Py_TOLOWER(*(tok->start + 1)) == 't') { + for (const char *p = tok->start; *p != c; p++) { + switch (*p) { + case 'f': + case 'F': + break; + case 't': + case 'T': string_kind = TSTRING; - } - break; - default: - Py_UNREACHABLE(); + break; + case 'r': + case 'R': + the_current_tok->raw = 1; + break; + case 'd': + case 'D': + if (quote_size != 3) { + return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "d-string must be a multiline string")); + } + break; + default: + Py_UNREACHABLE(); + } } the_current_tok->string_kind = string_kind; diff --git a/Parser/string_parser.c b/Parser/string_parser.c index b164dfbc81a933..99f73c099ff7a2 100644 --- a/Parser/string_parser.c +++ b/Parser/string_parser.c @@ -247,6 +247,107 @@ _PyPegen_decode_string(Parser *p, int raw, const char *s, size_t len, Token *t) return decode_unicode_with_escapes(p, s, len, t); } +static PyObject* +_PyPegen_dedent_string(Parser *p, int is_raw, const char *s, Py_ssize_t len, Token* token) +{ + // this function is for d-string without t/f-string. + // dt/df-string are processed in action_helper.c:_get_resized_exprs + Py_ssize_t lineno = token->lineno; + + if (len == 0 || s[0] != '\n') { + RAISE_SYNTAX_ERROR_KNOWN_LOCATION( + token, + "d-string must start with a newline at line %d", + lineno + ); + return NULL; + } + + // find the last newline and check all chars after it are spaces or tabs. + const char *endline = s + len; + while (endline[-1] != '\n') { + assert(endline > s); // we know at least the first char is a newline. + endline--; + if (*endline != ' ' && *endline != '\t') { + RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError, + // specify the location of just before closing triple quotes. + token->end_lineno, token->end_col_offset - 3, token->end_lineno, token->end_col_offset-2, + "d-string must end with an indent line"); + return NULL; + } + } + + // Now, prefix is both the dedent indentation and the end of the d-string body. 
+ Py_ssize_t indent_len = s + len - endline; + int indent_char = endline[0]; // ' ', '\t', or '\0'. + + // checks the prefix is consistant. + for (Py_ssize_t i = 1; i < indent_len; i++) { + if (endline[i] != indent_char) { + RAISE_ERROR_KNOWN_LOCATION( + p, PyExc_TabError, token->end_lineno, token->end_col_offset - 3, token->end_lineno, token->end_col_offset -2, + "inconsistent use of tabs and spaces in indentation"); + return NULL; + } + } + + PyUnicodeWriter *w = PyUnicodeWriter_Create(endline - s); + if (w == NULL) { + return NULL; + } + const char *line_start = s + 1; // skip the first newline + + while (line_start < endline) { + lineno++; + + Py_ssize_t i; + for (i = 0; i < indent_len && line_start + i < endline; i++) { + if (line_start[i] != indent_char) { + if (line_start[i] == '\n') { + break; // empty line + } + PyUnicodeWriter_Discard(w); + RAISE_ERROR_KNOWN_LOCATION(p, PyExc_IndentationError, lineno, i, lineno, i+1, + "d-string missing valid indentation"); + return NULL; + } + } + + if (line_start[i] == '\n') { // found an empty line with newline. + if (PyUnicodeWriter_WriteChar(w, '\n') < 0) { + PyUnicodeWriter_Discard(w); + return NULL; + } + line_start += i+1; + continue; + } + + // found a indented line. let's dedent it. + line_start += i; + const char *line_end = memchr(line_start, '\n', endline - line_start); + assert(line_end != NULL); // we know there is at least one newline before endline. 
+ line_end++; // include the newline in the line + + if (is_raw || memchr(line_start, '\\', line_end - line_start) == NULL) { + if (PyUnicodeWriter_WriteUTF8(w, line_start, line_end - line_start) < 0) { + PyUnicodeWriter_Discard(w); + return NULL; + } + } + else { + PyObject *line = _PyPegen_decode_string(p, 1, line_start, line_end - line_start, token); + if (line == NULL || PyUnicodeWriter_WriteStr(w, line) < 0) { + Py_XDECREF(line); + return NULL; + } + Py_DECREF(line); + } + + line_start = line_end; + } + return PyUnicodeWriter_Finish(w); +} + /* s must include the bracketing quote characters, and r, b &/or f prefixes (if any), and embedded escape sequences (if any). (f-strings are handled by the parser) _PyPegen_parse_string parses it, and returns the decoded Python string object. */ @@ -262,9 +363,10 @@ _PyPegen_parse_string(Parser *p, Token *t) int quote = Py_CHARMASK(*s); int bytesmode = 0; int rawmode = 0; + int dedentmode = 0; if (Py_ISALPHA(quote)) { - while (!bytesmode || !rawmode) { + while (!bytesmode || !rawmode || !dedentmode) { if (quote == 'b' || quote == 'B') { quote =(unsigned char)*++s; bytesmode = 1; @@ -276,6 +378,10 @@ _PyPegen_parse_string(Parser *p, Token *t) quote = (unsigned char)*++s; rawmode = 1; } + else if (quote == 'd' || quote == 'D') { + quote =(unsigned char)*++s; + dedentmode = 1; + } else { break; } @@ -315,10 +421,17 @@ _PyPegen_parse_string(Parser *p, Token *t) return NULL; } } + else if (dedentmode) { + RAISE_SYNTAX_ERROR_KNOWN_LOCATION( + t, + "d-string must be triple-quoted"); + return NULL; + } /* Avoid invoking escape decoding routines if possible. */ rawmode = rawmode || strchr(s, '\\') == NULL; if (bytesmode) { + assert(!dedentmode); /* Disallow non-ASCII characters. 
*/ const char *ch; for (ch = s; *ch; ch++) { @@ -335,5 +448,9 @@ _PyPegen_parse_string(Parser *p, Token *t) } return decode_bytes_with_escapes(p, s, (Py_ssize_t)len, t); } + if (dedentmode) { + return _PyPegen_dedent_string(p, rawmode, s, len, t); + } return _PyPegen_decode_string(p, rawmode, s, len, t); } + From 3187540ea669eb80ad12722fb53df585ef393ea5 Mon Sep 17 00:00:00 2001 From: Inada Naoki Date: Wed, 21 Jan 2026 01:29:30 +0900 Subject: [PATCH 2/5] use least indent instead of closing quote indent --- Lib/test/test_dstring.py | 9 --- Objects/unicodeobject.c | 6 +- Parser/action_helpers.c | 110 ++++++++++++++++++++----------------- Parser/lexer/lexer.c | 3 - Parser/string_parser.c | 116 +++++++++++++++++++-------------------- 5 files changed, 120 insertions(+), 124 deletions(-) diff --git a/Lib/test/test_dstring.py b/Lib/test/test_dstring.py index 09592980dee8c5..7927877e8bb088 100644 --- a/Lib/test/test_dstring.py +++ b/Lib/test/test_dstring.py @@ -26,14 +26,6 @@ def test_empty_dstring(self): ] self.assertAllRaise(SyntaxError, "d-string must start with a newline", exprs) - def test_no_last_newline(self): - exprs = [ - "d'''\nhello world'''", - 'D"""\nhello world"""', - "df'''\nhello {42}'''", - ] - self.assertAllRaise(SyntaxError, "d-string must end with an indent line", exprs) - def test_simple_dstring(self): self.assertEqual(eval('d"""\n hello world\n """'), "hello world\n") self.assertEqual(eval('d"""\n hello world\n """'), " hello world\n") @@ -42,6 +34,5 @@ def test_simple_dstring(self): self.assertEqual(eval('dr"""\n hello world\\\n """'), " hello world\\\n") - if __name__ == '__main__': unittest.main() diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index fdcbcf51cb62c2..412dbfc53b6353 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -13480,8 +13480,8 @@ of all lines in the [src, end). 
It returns the length of the common leading whitespace and sets `output` to point to the beginning of the common leading whitespace if length > 0. */ -static Py_ssize_t -search_longest_common_leading_whitespace( +Py_ssize_t +_Py_search_longest_common_leading_whitespace( const char *const src, const char *const end, const char **output) @@ -13576,7 +13576,7 @@ _PyUnicode_Dedent(PyObject *unicode) // [whitespace_start, whitespace_start + whitespace_len) // describes the current longest common leading whitespace const char *whitespace_start = NULL; - Py_ssize_t whitespace_len = search_longest_common_leading_whitespace( + Py_ssize_t whitespace_len = _Py_search_longest_common_leading_whitespace( src, end, &whitespace_start); if (whitespace_len == 0) { diff --git a/Parser/action_helpers.c b/Parser/action_helpers.c index 2f143788ff53ab..50337f48b3a303 100644 --- a/Parser/action_helpers.c +++ b/Parser/action_helpers.c @@ -1311,8 +1311,8 @@ unicodewriter_write_line(Parser *p, PyUnicodeWriter *w, const char *line_start, } static PyObject* -_PyPegen_dedent_string_part(Parser *p, const char *s, size_t len, int indent_char, Py_ssize_t dedent_count, - int is_raw, int is_first, expr_ty constant, Token* token) +_PyPegen_dedent_string_part(Parser *p, const char *s, size_t len, const char *indent, Py_ssize_t indent_len, + int is_first, int is_raw, expr_ty constant, Token* token) { Py_ssize_t lineno = constant->lineno; const char *line_start = s; @@ -1350,7 +1350,7 @@ _PyPegen_dedent_string_part(Parser *p, const char *s, size_t len, int indent_cha lineno++; Py_ssize_t i = 0; - while (line_start + i < s_end && i < dedent_count && line_start[i] == indent_char) { + while (line_start + i < s_end && i < indent_len && line_start[i] == indent[i]) { i++; } @@ -1365,8 +1365,8 @@ _PyPegen_dedent_string_part(Parser *p, const char *s, size_t len, int indent_cha line_start += i+1; continue; } - if (i < dedent_count) { // found an invalid indent. 
- assert(line_start[i] != indent_char); + if (i < indent_len) { // found an invalid indent. + assert(line_start[i] != indent[i]); PyUnicodeWriter_Discard(w); RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError, lineno, i, lineno, i+1, "d-string line missing valid indentation"); @@ -1392,7 +1392,10 @@ _PyPegen_dedent_string_part(Parser *p, const char *s, size_t len, int indent_cha } static expr_ty -_PyPegen_decode_fstring_part(Parser* p, int is_first, int is_raw, int indent_char, Py_ssize_t dedent_count, expr_ty constant, Token* token) { +_PyPegen_decode_fstring_part(Parser* p, int is_first, int is_raw, + const char *indent, Py_ssize_t indent_len, + expr_ty constant, Token* token) +{ assert(PyUnicode_CheckExact(constant->v.Constant.value)); const char* bstr = PyUnicode_AsUTF8(constant->v.Constant.value); @@ -1402,9 +1405,9 @@ _PyPegen_decode_fstring_part(Parser* p, int is_first, int is_raw, int indent_cha is_raw = is_raw || strchr(bstr, '\\') == NULL; PyObject *str = NULL; - if (dedent_count > 0) { - str = _PyPegen_dedent_string_part(p, bstr, strlen(bstr), indent_char, dedent_count, - is_raw, is_first, constant, token); + if (indent_len > 0) { + str = _PyPegen_dedent_string_part(p, bstr, strlen(bstr), indent, indent_len, + is_first, is_raw, constant, token); } else { str = _PyPegen_decode_string(p, is_raw, bstr, strlen(bstr), token); @@ -1423,6 +1426,14 @@ _PyPegen_decode_fstring_part(Parser* p, int is_first, int is_raw, int indent_cha p->arena); } +/* defined in unicodeobject.c */ +extern Py_ssize_t +_Py_search_longest_common_leading_whitespace( + const char *const src, + const char *const end, + const char **output + ); + static asdl_expr_seq * _get_resized_exprs(Parser *p, Token *a, asdl_expr_seq *raw_expressions, Token *b, enum string_kind_t string_kind) { @@ -1441,14 +1452,15 @@ _get_resized_exprs(Parser *p, Token *a, asdl_expr_seq *raw_expressions, Token *b } int is_raw = strpbrk(quote_str, "rR") != NULL; int is_dedent = strpbrk(quote_str, "dD") != NULL; - int 
indent_char = 0; - Py_ssize_t indent_count = 0; asdl_expr_seq *seq = _Py_asdl_expr_seq_new(total_items, p->arena); if (seq == NULL) { return NULL; } + const char *common_indent_start = NULL; + Py_ssize_t common_indent_len = 0; + if (is_dedent) { expr_ty first_item = asdl_seq_GET(raw_expressions, 0); if (first_item->kind != Constant_kind @@ -1460,52 +1472,52 @@ _get_resized_exprs(Parser *p, Token *a, asdl_expr_seq *raw_expressions, Token *b return NULL; } - expr_ty last_item = asdl_seq_GET(raw_expressions, n_items - 1); - if (last_item->kind != Constant_kind) { - RAISE_SYNTAX_ERROR_KNOWN_LOCATION( - last_item, - "d-string must end with an indent line" - ); + // Instead of calculating common indent from all parts, + // build temporary string and calculate common indent from it. + PyBytesWriter *w = PyBytesWriter_Create(0); + if (w == NULL) { return NULL; } - Py_ssize_t blen; - const char *bstr = PyUnicode_AsUTF8AndSize(last_item->v.Constant.value, &blen); - if (bstr == NULL) { - return NULL; - } + for (Py_ssize_t i = 0; i < n_items; i++) { + expr_ty item = asdl_seq_GET(raw_expressions, i); - // memrchr is GNU extension; use manual loop for portability. - const char *lastline = bstr + blen; - while (bstr < lastline) { - if (lastline[-1] == '\n') { - break; - } - lastline--; - if (*lastline != ' ' && *lastline != '\t') { - RAISE_SYNTAX_ERROR_KNOWN_LOCATION( - last_item, - "d-string must end with an indent line" - ); - return NULL; + if (item->kind == JoinedStr_kind) { + // Write a placeholder. + if (PyBytesWriter_WriteBytes(w, "X", 1) < 0) { + PyBytesWriter_Discard(w); + return NULL; + } + continue; } - } - - // checks indent of the last line. 
- indent_count = bstr + blen - lastline; - if (indent_count > 0) { - indent_char = lastline[0]; - - for (Py_ssize_t i = 1; i < indent_count; i++) { - if (lastline[i] != indent_char) { - RAISE_ERROR_KNOWN_LOCATION( - p, PyExc_TabError, last_item->end_lineno, i, last_item->end_lineno, i+1, - "inconsistent use of tabs and spaces in indentation" - ); + if (item->kind == Constant_kind) { + Py_ssize_t blen; + const char *bstr = PyUnicode_AsUTF8AndSize(item->v.Constant.value, &blen); + if (bstr == NULL || PyBytesWriter_WriteBytes(w, bstr, blen) < 0) { + PyBytesWriter_Discard(w); return NULL; } + continue; } } + // Add a terminator to include the last line before the ending quote + if (PyBytesWriter_WriteBytes(w, "X", 1) < 0) { + PyBytesWriter_Discard(w); + return NULL; + } + + // TODO: instead of creating temp_bytes, we could search + // common index from each part directly. But this need reimplementation + // of _Py_search_longest_common_leading_whitespace. + PyObject *temp_bytes = PyBytesWriter_Finish(w); + if (temp_bytes == NULL) { + return NULL; + } + _PyArena_AddPyObject(p->arena, temp_bytes); + const char *temp_str = PyBytes_AsString(temp_bytes); + const char *temp_end = temp_str + PyBytes_GET_SIZE(temp_bytes); + common_indent_len = _Py_search_longest_common_leading_whitespace( + temp_str, temp_end, &common_indent_start); } Py_ssize_t index = 0; @@ -1539,7 +1551,7 @@ _get_resized_exprs(Parser *p, Token *a, asdl_expr_seq *raw_expressions, Token *b } if (item->kind == Constant_kind) { - item = _PyPegen_decode_fstring_part(p, i == 0, is_raw, indent_char, indent_count, item, b); + item = _PyPegen_decode_fstring_part(p, i == 0, is_raw, common_indent_start, common_indent_len, item, b); if (item == NULL) { return NULL; } diff --git a/Parser/lexer/lexer.c b/Parser/lexer/lexer.c index dc3d369c023119..cfccdd55a6ae11 100644 --- a/Parser/lexer/lexer.c +++ b/Parser/lexer/lexer.c @@ -490,9 +490,6 @@ maybe_raise_syntax_error_for_string_prefixes(struct tok_state *tok, if (saw_b && 
saw_t) { RETURN_SYNTAX_ERROR("b", "t"); } - if (saw_b && saw_d) { - RETURN_SYNTAX_ERROR("b", "d"); - } if (saw_f && saw_t) { RETURN_SYNTAX_ERROR("f", "t"); diff --git a/Parser/string_parser.c b/Parser/string_parser.c index 99f73c099ff7a2..0fbc20ee4ce6fb 100644 --- a/Parser/string_parser.c +++ b/Parser/string_parser.c @@ -247,66 +247,54 @@ _PyPegen_decode_string(Parser *p, int raw, const char *s, size_t len, Token *t) return decode_unicode_with_escapes(p, s, len, t); } +/* defined in unicodeobject.c */ +extern Py_ssize_t +_Py_search_longest_common_leading_whitespace( + const char *const src, + const char *const end, + const char **output + ); + +// Dedent d-string and return result as a bytes. static PyObject* -_PyPegen_dedent_string(Parser *p, int is_raw, const char *s, Py_ssize_t len, Token* token) +_PyPegen_dedent_string(Parser *p, const char *s, Py_ssize_t len, Token* token) { // this function is for d-string without t/f-string. // dt/df-string are processed in action_helper.c:_get_resized_exprs Py_ssize_t lineno = token->lineno; + const char *end = s + len; + // skips the first newline. if (len == 0 || s[0] != '\n') { RAISE_SYNTAX_ERROR_KNOWN_LOCATION( token, - "d-string must start with a newline at line %d", - lineno + "d-string must start with a newline" ); return NULL; } - // find the last newline and check all chars after it are spaces or tabs. - const char *endline = s + len; - while (endline[-1] != '\n') { - assert(endline > s); // we know at least the first char is a newline. - endline--; - if (*endline != ' ' && *endline != '\t') { - RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError, - // specify the location of just before closing triple quotes. - token->end_lineno, token->end_col_offset - 3, token->end_lineno, token->end_col_offset-2, - "d-string must end with an indent line"); - return NULL; - } - } - - // Now, prefix is both the dedent indentation and the end of the d-string body. 
- Py_ssize_t indent_len = s + len - endline; - int indent_char = endline[0]; // ' ', '\t', or '\0'. + // We find common indent from [s, end+1) because we want to include the last line + // for indent calculation. + assert(*end == '"' || *end == '\''); // end[0:3] is the trailing quotes + const char *indent; + Py_ssize_t indent_len = _Py_search_longest_common_leading_whitespace(s, end+1, &indent); - // checks the prefix is consistant. - for (Py_ssize_t i = 1; i < indent_len; i++) { - if (endline[i] != indent_char) { - RAISE_ERROR_KNOWN_LOCATION( - p, PyExc_TabError, token->end_lineno, token->end_col_offset - 3, token->end_lineno, token->end_col_offset -2, - "inconsistent use of tabs and spaces in indentation"); - return NULL; - } - } - - PyUnicodeWriter *w = PyUnicodeWriter_Create(endline - s); + PyBytesWriter *w = PyBytesWriter_Create(0); if (w == NULL) { return NULL; } - const char *line_start = s + 1; // skip the first newline + const char *line_start = s + 1; - while (line_start < endline) { + while (line_start < end) { lineno++; Py_ssize_t i; - for (i = 0; i < indent_len && line_start + i < endline; i++) { - if (line_start[i] != indent_char) { + for (i = 0; i < indent_len; i++) { + if (line_start[i] != indent[i]) { if (line_start[i] == '\n') { break; // empty line } - PyUnicodeWriter_Discard(w); + PyBytesWriter_Discard(w); RAISE_ERROR_KNOWN_LOCATION(p, PyExc_IndentationError, lineno, i, lineno, i+1, "d-string missing valid indentation"); return NULL; @@ -314,8 +302,8 @@ _PyPegen_dedent_string(Parser *p, int is_raw, const char *s, Py_ssize_t len, Tok } if (line_start[i] == '\n') { // found an empty line with newline. - if (PyUnicodeWriter_WriteChar(w, '\n') < 0) { - PyUnicodeWriter_Discard(w); + if (PyBytesWriter_WriteBytes(w, "\n", 1) < 0) { + PyBytesWriter_Discard(w); return NULL; } line_start += i+1; @@ -324,28 +312,21 @@ _PyPegen_dedent_string(Parser *p, int is_raw, const char *s, Py_ssize_t len, Tok // found a indented line. let's dedent it. 
line_start += i; - const char *line_end = memchr(line_start, '\n', endline - line_start); - assert(line_end != NULL); // we know there is at least one newline before endline. - line_end++; // include the newline in the line - - if (is_raw || memchr(line_start, '\\', line_end - line_start) == NULL) { - if (PyUnicodeWriter_WriteUTF8(w, line_start, line_end - line_start) < 0) { - PyUnicodeWriter_Discard(w); - return NULL; - } + const char *line_end = memchr(line_start, '\n', end - line_start); + if (line_end == NULL) { + line_end = end; // last line without newline } else { - PyObject *line = _PyPegen_decode_string(p, 1, line_start, line_end - line_start, token); - if (line == NULL || PyUnicodeWriter_WriteStr(w, line) < 0) { - Py_XDECREF(line); - return NULL; - } - Py_DECREF(line); + line_end++; // include the newline in the line } + if (PyBytesWriter_WriteBytes(w, line_start, line_end - line_start) < 0) { + PyBytesWriter_Discard(w); + return NULL; + } line_start = line_end; } - return PyUnicodeWriter_Finish(w); + return PyBytesWriter_Finish(w); } /* s must include the bracketing quote characters, and r, b &/or f prefixes @@ -427,11 +408,22 @@ _PyPegen_parse_string(Parser *p, Token *t) "d-string must be triple-quoted"); return NULL; } + PyObject *dedent_bytes = NULL; + if (dedentmode) { + dedent_bytes = _PyPegen_dedent_string(p, s, len, t); + if (dedent_bytes == NULL) { + return NULL; + } + if (PyBytes_AsStringAndSize(dedent_bytes, &s, (Py_ssize_t*)&len) < 0) { + Py_DECREF(dedent_bytes); + return NULL; + } + } /* Avoid invoking escape decoding routines if possible. */ rawmode = rawmode || strchr(s, '\\') == NULL; + PyObject *result; if (bytesmode) { - assert(!dedentmode); /* Disallow non-ASCII characters. 
*/ const char *ch; for (ch = s; *ch; ch++) { @@ -440,17 +432,21 @@ _PyPegen_parse_string(Parser *p, Token *t) t, "bytes can only contain ASCII " "literal characters"); + Py_XDECREF(dedent_bytes); return NULL; } } if (rawmode) { - return PyBytes_FromStringAndSize(s, (Py_ssize_t)len); + result = PyBytes_FromStringAndSize(s, (Py_ssize_t)len); + } + else { + result = decode_bytes_with_escapes(p, s, (Py_ssize_t)len, t); } - return decode_bytes_with_escapes(p, s, (Py_ssize_t)len, t); } - if (dedentmode) { - return _PyPegen_dedent_string(p, rawmode, s, len, t); + else { + result = _PyPegen_decode_string(p, rawmode, s, len, t); } - return _PyPegen_decode_string(p, rawmode, s, len, t); + Py_XDECREF(dedent_bytes); + return result; } From e1320af7496be58f53f6cd646c0a1bce3a2300a2 Mon Sep 17 00:00:00 2001 From: Inada Naoki Date: Fri, 23 Jan 2026 23:03:26 +0900 Subject: [PATCH 3/5] fix invalid escape sequences position --- Parser/action_helpers.c | 60 ++++++++++++++++------- Parser/string_parser.c | 104 +++++++++++++++++++++++----------------- 2 files changed, 103 insertions(+), 61 deletions(-) diff --git a/Parser/action_helpers.c b/Parser/action_helpers.c index 50337f48b3a303..042b3ea97628ca 100644 --- a/Parser/action_helpers.c +++ b/Parser/action_helpers.c @@ -1311,17 +1311,34 @@ unicodewriter_write_line(Parser *p, PyUnicodeWriter *w, const char *line_start, } static PyObject* -_PyPegen_dedent_string_part(Parser *p, const char *s, size_t len, const char *indent, Py_ssize_t indent_len, - int is_first, int is_raw, expr_ty constant, Token* token) +_PyPegen_dedent_string_part( + Parser *p, const char *s, size_t len, const char *indent, Py_ssize_t indent_len, + int is_first, int is_raw, expr_ty constant, Token* token) { Py_ssize_t lineno = constant->lineno; const char *line_start = s; - const char *s_end = s + len; + const char *end = s + len; + + int _prev_call_invalid = p->call_invalid_rules; + if (!_prev_call_invalid && !is_raw) { + // _PyPegen_decode_string() and 
decode_bytes_with_escapes() may call + // warn_invalid_escape_sequence(). It may emit issue or raise SyntaxError + // for invalid escape sequences. + // We need to call it before dedenting since SyntaxError needs exact lineno + // and col_offset of invalid escape sequences. + PyObject *t = _PyPegen_decode_string(p, 0, s, len, token); + if (t == NULL) { + return NULL; + } + Py_DECREF(t); + p->call_invalid_rules = 1; + } PyUnicodeWriter *w = PyUnicodeWriter_Create(len); if (w == NULL) { return NULL; } + if (is_first) { assert (line_start[0] == '\n'); line_start++; // skip the first newline @@ -1332,25 +1349,24 @@ _PyPegen_dedent_string_part(Parser *p, const char *s, size_t len, const char *in // next line // """" // We don't need to dedent the first line in the non-first parts. - const char *line_end = memchr(line_start, '\n', s_end - line_start); + const char *line_end = memchr(line_start, '\n', end - line_start); if (line_end) { line_end++; // include the newline } else { - line_end = s_end; + line_end = end; } if (unicodewriter_write_line(p, w, line_start, line_end, is_raw, token) < 0) { - PyUnicodeWriter_Discard(w); - return NULL; + goto error; } line_start = line_end; } - while (line_start < s + len) { + while (line_start < end) { lineno++; Py_ssize_t i = 0; - while (line_start + i < s_end && i < indent_len && line_start[i] == indent[i]) { + while (line_start + i < end && i < indent_len && line_start[i] == indent[i]) { i++; } @@ -1359,36 +1375,39 @@ _PyPegen_dedent_string_part(Parser *p, const char *s, size_t len, const char *in } if (line_start[i] == '\n') { // found an empty line with newline. if (PyUnicodeWriter_WriteChar(w, '\n') < 0) { - PyUnicodeWriter_Discard(w); - return NULL; + goto error; } line_start += i+1; continue; } if (i < indent_len) { // found an invalid indent. 
assert(line_start[i] != indent[i]); - PyUnicodeWriter_Discard(w); RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError, lineno, i, lineno, i+1, "d-string line missing valid indentation"); - return NULL; + goto error; } // found a indented line. let's dedent it. line_start += i; - const char *line_end = memchr(line_start, '\n', s_end - line_start); + const char *line_end = memchr(line_start, '\n', end - line_start); if (line_end) { line_end++; // include the newline } else { - line_end = s_end; + line_end = end; } if (unicodewriter_write_line(p, w, line_start, line_end, is_raw, token) < 0) { - PyUnicodeWriter_Discard(w); - return NULL; + goto error; } line_start = line_end; } + p->call_invalid_rules = _prev_call_invalid; return PyUnicodeWriter_Finish(w); + +error: + p->call_invalid_rules = _prev_call_invalid; + PyUnicodeWriter_Discard(w); + return NULL; } static expr_ty @@ -1405,7 +1424,7 @@ _PyPegen_decode_fstring_part(Parser* p, int is_first, int is_raw, is_raw = is_raw || strchr(bstr, '\\') == NULL; PyObject *str = NULL; - if (indent_len > 0) { + if (indent != NULL) { str = _PyPegen_dedent_string_part(p, bstr, strlen(bstr), indent, indent_len, is_first, is_raw, constant, token); } @@ -1518,6 +1537,11 @@ _get_resized_exprs(Parser *p, Token *a, asdl_expr_seq *raw_expressions, Token *b const char *temp_end = temp_str + PyBytes_GET_SIZE(temp_bytes); common_indent_len = _Py_search_longest_common_leading_whitespace( temp_str, temp_end, &common_indent_start); + // _py_serach_longest_common_leading_whitespace() may return NULL when + // indent_len is 0. + if (common_indent_len == 0) { + common_indent_start = ""; + } } Py_ssize_t index = 0; diff --git a/Parser/string_parser.c b/Parser/string_parser.c index 0fbc20ee4ce6fb..3425b856796fe5 100644 --- a/Parser/string_parser.c +++ b/Parser/string_parser.c @@ -257,41 +257,20 @@ _Py_search_longest_common_leading_whitespace( // Dedent d-string and return result as a bytes. 
static PyObject* -_PyPegen_dedent_string(Parser *p, const char *s, Py_ssize_t len, Token* token) +_PyPegen_dedent_string(Parser *p, const char *s, Py_ssize_t len, + const char *indent, Py_ssize_t indent_len, int lineno) { - // this function is for d-string without t/f-string. - // dt/df-string are processed in action_helper.c:_get_resized_exprs - Py_ssize_t lineno = token->lineno; - const char *end = s + len; - - // skips the first newline. - if (len == 0 || s[0] != '\n') { - RAISE_SYNTAX_ERROR_KNOWN_LOCATION( - token, - "d-string must start with a newline" - ); - return NULL; - } - - // We find common indent from [s, end+1) because we want to include the last line - // for indent calculation. - assert(*end == '"' || *end == '\''); // end[0:3] is the trailing quotes - const char *indent; - Py_ssize_t indent_len = _Py_search_longest_common_leading_whitespace(s, end+1, &indent); - PyBytesWriter *w = PyBytesWriter_Create(0); if (w == NULL) { return NULL; } - const char *line_start = s + 1; - - while (line_start < end) { - lineno++; + const char *end = s + len; + for (; s < end; lineno++) { Py_ssize_t i; for (i = 0; i < indent_len; i++) { - if (line_start[i] != indent[i]) { - if (line_start[i] == '\n') { + if (s[i] != indent[i]) { + if (s[i] == '\n') { break; // empty line } PyBytesWriter_Discard(w); @@ -301,18 +280,18 @@ _PyPegen_dedent_string(Parser *p, const char *s, Py_ssize_t len, Token* token) } } - if (line_start[i] == '\n') { // found an empty line with newline. + if (s[i] == '\n') { // found an empty line with newline. if (PyBytesWriter_WriteBytes(w, "\n", 1) < 0) { PyBytesWriter_Discard(w); return NULL; } - line_start += i+1; + s += i+1; continue; } // found a indented line. let's dedent it. 
- line_start += i; - const char *line_end = memchr(line_start, '\n', end - line_start); + s += i; + const char *line_end = memchr(s, '\n', end - s); if (line_end == NULL) { line_end = end; // last line without newline } @@ -320,11 +299,11 @@ _PyPegen_dedent_string(Parser *p, const char *s, Py_ssize_t len, Token* token) line_end++; // include the newline in the line } - if (PyBytesWriter_WriteBytes(w, line_start, line_end - line_start) < 0) { + if (PyBytesWriter_WriteBytes(w, s, line_end - s) < 0) { PyBytesWriter_Discard(w); return NULL; } - line_start = line_end; + s = line_end; } return PyBytesWriter_Finish(w); } @@ -403,25 +382,62 @@ _PyPegen_parse_string(Parser *p, Token *t) } } else if (dedentmode) { - RAISE_SYNTAX_ERROR_KNOWN_LOCATION( - t, - "d-string must be triple-quoted"); + RAISE_SYNTAX_ERROR_KNOWN_LOCATION(t, "d-string must be triple-quoted"); return NULL; } + + /* Avoid invoking escape decoding routines if possible. */ + rawmode = rawmode || strchr(s, '\\') == NULL; + + int _prev_call_invald = p->call_invalid_rules; + PyObject *dedent_bytes = NULL; if (dedentmode) { - dedent_bytes = _PyPegen_dedent_string(p, s, len, t); - if (dedent_bytes == NULL) { + if (len == 0 || s[0] != '\n') { + RAISE_SYNTAX_ERROR_KNOWN_LOCATION(t, "d-string must start with a newline"); return NULL; } - if (PyBytes_AsStringAndSize(dedent_bytes, &s, (Py_ssize_t*)&len) < 0) { - Py_DECREF(dedent_bytes); - return NULL; + + // _PyPegen_decode_string() and decode_bytes_with_escapes() emit + // a warning for invalid escape sequences. + // We need to call it before dedenting since it shifts the positions. + if (!_prev_call_invald && !rawmode) { + PyObject *temp; + if (bytesmode) { + temp = decode_bytes_with_escapes(p, s, len, t); + } + else { + temp = _PyPegen_decode_string(p, 0, s, len, t); + } + if (temp == NULL) { + return NULL; + } + Py_DECREF(temp); + } + + // We find common indent from [s, end+1) because we want to include the last line + // for indent calculation. 
+ const char *end = s + len; + assert(*end == '"' || *end == '\''); // end[0:3] is the trailing quotes + const char *indent; + Py_ssize_t indent_len = _Py_search_longest_common_leading_whitespace(s+1, end+1, &indent); + + s++; len--; // skip the first newline + if (indent_len > 0) { + // dedent the string + dedent_bytes = _PyPegen_dedent_string(p, s, len, indent, indent_len, t->lineno + 1); + if (dedent_bytes == NULL) { + return NULL; + } + if (PyBytes_AsStringAndSize(dedent_bytes, (char**)&s, (Py_ssize_t*)&len) < 0) { + Py_DECREF(dedent_bytes); + return NULL; + } } + + p->call_invalid_rules = 1; } - /* Avoid invoking escape decoding routines if possible. */ - rawmode = rawmode || strchr(s, '\\') == NULL; PyObject *result; if (bytesmode) { /* Disallow non-ASCII characters. */ @@ -433,6 +449,7 @@ _PyPegen_parse_string(Parser *p, Token *t) "bytes can only contain ASCII " "literal characters"); Py_XDECREF(dedent_bytes); + p->call_invalid_rules = _prev_call_invald; return NULL; } } @@ -447,6 +464,7 @@ _PyPegen_parse_string(Parser *p, Token *t) result = _PyPegen_decode_string(p, rawmode, s, len, t); } Py_XDECREF(dedent_bytes); + p->call_invalid_rules = _prev_call_invald; return result; } From 733c2d07edf8dd30563add2b8753a64177f6ebd4 Mon Sep 17 00:00:00 2001 From: Inada Naoki Date: Sun, 25 Jan 2026 17:27:22 +0900 Subject: [PATCH 4/5] improve tests --- Lib/test/test_dstring.py | 94 ++++++++++++++++++++++++++++++++++------ Parser/action_helpers.c | 7 +++ Parser/lexer/lexer.c | 2 +- 3 files changed, 88 insertions(+), 15 deletions(-) diff --git a/Lib/test/test_dstring.py b/Lib/test/test_dstring.py index 7927877e8bb088..5161c21b2a6c26 100644 --- a/Lib/test/test_dstring.py +++ b/Lib/test/test_dstring.py @@ -1,37 +1,103 @@ import unittest +_dstring_prefixes = "d db df dt dr drb drf drt".split() +_dstring_prefixes += [p.upper() for p in _dstring_prefixes] + + +def d(s): + # Helper function to evaluate d-strings. 
+ if '"""' in s: + return eval(f"d'''{s}'''") + else: + return eval(f'd"""{s}"""') + + class DStringTestCase(unittest.TestCase): def assertAllRaise(self, exception_type, regex, error_strings): for str in error_strings: with self.subTest(str=str): with self.assertRaisesRegex(exception_type, regex) as cm: eval(str) - # print("Testing expression:", repr(str)) - # print(repr(cm.exception)) - # print(repr(cm.exception.text)) def test_single_quote(self): exprs = [ - "d'hello'", - 'D"hello"', - "d'hello\\nworld'", + f"{p}'hello, world'" for p in _dstring_prefixes + ] + [ + f'{p}"hello, world"' for p in _dstring_prefixes ] self.assertAllRaise(SyntaxError, "d-string must be triple-quoted", exprs) def test_empty_dstring(self): exprs = [ - "d''''''", - 'D""""""', + f"{p}''''''" for p in _dstring_prefixes + ] + [ + f'{p}""""""' for p in _dstring_prefixes ] self.assertAllRaise(SyntaxError, "d-string must start with a newline", exprs) - def test_simple_dstring(self): - self.assertEqual(eval('d"""\n hello world\n """'), "hello world\n") - self.assertEqual(eval('d"""\n hello world\n """'), " hello world\n") - self.assertEqual(eval('d"""\n hello world\n"""'), " hello world\n") - self.assertEqual(eval('d"""\n hello world\\\n """'), " hello world") - self.assertEqual(eval('dr"""\n hello world\\\n """'), " hello world\\\n") + for prefix in _dstring_prefixes: + expr = f"{prefix}'''\n'''" + expr2 = f'{prefix}"""\n"""' + with self.subTest(expr=expr): + v = eval(expr) + v2 = eval(expr2) + if 't' in prefix.lower(): + self.assertEqual(v.strings, ("",)) + self.assertEqual(v2.strings, ("",)) + elif 'b' in prefix.lower(): + self.assertEqual(v, b"") + self.assertEqual(v2, b"") + else: + self.assertEqual(v, "") + self.assertEqual(v2, "") + + def test_dedent(self): + # Basic dedent - remove common leading whitespace + result = d(""" + hello + world + """) + self.assertEqual(result, "hello\nworld\n") + + # Dedent with varying indentation + result = d(""" + line1 + line2 + line3 + """) + 
self.assertEqual(result, " line1\n line2\nline3\n ") + + # Dedent with tabs + result = d(""" +\thello +\tworld +\t""") + self.assertEqual(result, "hello\nworld\n") + + # Mixed spaces and tabs (using common leading whitespace) + result = d(""" +\t\t hello +\t\t world +\t\t """) + self.assertEqual(result, " hello\n world\n") + + # Empty lines do not affect the calculation of common leading whitespace + result = d(""" + hello + + world + """) + self.assertEqual(result, "hello\n\nworld\n") + + # Lines with only whitespace also have their indentation removed. + result = d(""" + hello + \n\ + \n\ + world + """) + self.assertEqual(result, "hello\n\n \nworld\n") if __name__ == '__main__': diff --git a/Parser/action_helpers.c b/Parser/action_helpers.c index 042b3ea97628ca..b2a778e13e2280 100644 --- a/Parser/action_helpers.c +++ b/Parser/action_helpers.c @@ -1481,6 +1481,13 @@ _get_resized_exprs(Parser *p, Token *a, asdl_expr_seq *raw_expressions, Token *b Py_ssize_t common_indent_len = 0; if (is_dedent) { + if (total_items == 0) { + RAISE_SYNTAX_ERROR_KNOWN_LOCATION( + a, + "d-string must start with a newline" + ); + return NULL; + } expr_ty first_item = asdl_seq_GET(raw_expressions, 0); if (first_item->kind != Constant_kind || PyUnicode_ReadChar(first_item->v.Constant.value, 0) != '\n') { diff --git a/Parser/lexer/lexer.c b/Parser/lexer/lexer.c index cfccdd55a6ae11..07c61f1cb4386b 100644 --- a/Parser/lexer/lexer.c +++ b/Parser/lexer/lexer.c @@ -1124,7 +1124,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t case 'd': case 'D': if (quote_size != 3) { - return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "d-string must be a multiline string")); + return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "d-string must be triple-quoted")); } break; default: From 5faa196b6cba1811b7afa80ebe58562d3f83578c Mon Sep 17 00:00:00 2001 From: Inada Naoki Date: Wed, 28 Jan 2026 19:30:21 +0900 Subject: [PATCH 5/5] t/f-string: calculate common indent without temp 
string --- Parser/action_helpers.c | 151 +++++++++++++++++++++++++++------------- 1 file changed, 103 insertions(+), 48 deletions(-) diff --git a/Parser/action_helpers.c b/Parser/action_helpers.c index b2a778e13e2280..567da3475eb098 100644 --- a/Parser/action_helpers.c +++ b/Parser/action_helpers.c @@ -1445,13 +1445,102 @@ _PyPegen_decode_fstring_part(Parser* p, int is_first, int is_raw, p->arena); } -/* defined in unicodeobject.c */ -extern Py_ssize_t -_Py_search_longest_common_leading_whitespace( +/* +This function is a customized version of _Py_search_longest_common_leading_whitespace() +in unicodeobject.c +*/ +static void +search_longest_common_leading_whitespace( const char *const src, const char *const end, - const char **output - ); + const char **indent, + Py_ssize_t *indent_len) +{ + // [_start, _start + _len) + // describes the current longest common leading whitespace + const char *_start = *indent; + Py_ssize_t _len = *indent_len; + + // skip the first line. for example: + // s = df""" + // first part + // first part{x}second part + // second part + // """ + // we don't need newline after opening quote. + // we don't need first line in the second part too. + const char *iter = memchr(src, '\n', end - src); + if (iter == NULL) { + // single line string + return; + } + + for (iter++; iter <= end; iter++) { + const char *line_start = iter; + const char *leading_whitespace_end = NULL; + + // scan the whole line + while (iter < end && *iter != '\n') { + if (!leading_whitespace_end && *iter != ' ' && *iter != '\t') { + /* `iter` points to the first non-whitespace character + in this line */ + if (iter == line_start) { + // some line has no indent, fast exit! 
+ *indent = iter; + *indent_len = 0; + return; + } + leading_whitespace_end = iter; + } + ++iter; + } + + if (!leading_whitespace_end) { + // if this line has all white space, skip it + if (iter < end) { + continue; + } + leading_whitespace_end = iter; // last line may not end with '\n' + } + + if (!_start) { + // update the first leading whitespace + _start = line_start; + _len = leading_whitespace_end - line_start; + } + else { + /* We then compare with the current longest leading whitespace. + + [line_start, leading_whitespace_end) is the leading + whitespace of this line, + + [_start, _start + _len) is the leading whitespace of the + current longest leading whitespace. */ + Py_ssize_t new_len = 0; + const char *_iter = _start, *line_iter = line_start; + + while (_iter < _start + _len && line_iter < leading_whitespace_end + && *_iter == *line_iter) + { + ++_iter; + ++line_iter; + ++new_len; + } + + _len = new_len; + if (_len == 0) { + // No common things now, fast exit! + *indent = _start; + *indent_len = 0; + return; + } + } + } + + *indent = _start; + *indent_len = _len; +} + static asdl_expr_seq * _get_resized_exprs(Parser *p, Token *a, asdl_expr_seq *raw_expressions, Token *b, enum string_kind_t string_kind) @@ -1477,8 +1566,8 @@ _get_resized_exprs(Parser *p, Token *a, asdl_expr_seq *raw_expressions, Token *b return NULL; } - const char *common_indent_start = NULL; - Py_ssize_t common_indent_len = 0; + const char *indent_start = NULL; + Py_ssize_t indent_len = 0; if (is_dedent) { if (total_items == 0) { @@ -1498,56 +1587,22 @@ _get_resized_exprs(Parser *p, Token *a, asdl_expr_seq *raw_expressions, Token *b return NULL; } - // Instead of calculating common indent from all parts, - // build temporary string and calculate common indent from it. 
- PyBytesWriter *w = PyBytesWriter_Create(0); - if (w == NULL) { - return NULL; - } - for (Py_ssize_t i = 0; i < n_items; i++) { expr_ty item = asdl_seq_GET(raw_expressions, i); - - if (item->kind == JoinedStr_kind) { - // Write a placeholder. - if (PyBytesWriter_WriteBytes(w, "X", 1) < 0) { - PyBytesWriter_Discard(w); - return NULL; - } - continue; - } if (item->kind == Constant_kind) { Py_ssize_t blen; const char *bstr = PyUnicode_AsUTF8AndSize(item->v.Constant.value, &blen); - if (bstr == NULL || PyBytesWriter_WriteBytes(w, bstr, blen) < 0) { - PyBytesWriter_Discard(w); + if (bstr == NULL) { return NULL; } - continue; + search_longest_common_leading_whitespace(bstr, bstr + blen, &indent_start, &indent_len); } } - // Add a terminator to include the last line before the ending quote - if (PyBytesWriter_WriteBytes(w, "X", 1) < 0) { - PyBytesWriter_Discard(w); - return NULL; - } - // TODO: instead of creating temp_bytes, we could search - // common index from each part directly. But this need reimplementation - // of _Py_search_longest_common_leading_whitespace. - PyObject *temp_bytes = PyBytesWriter_Finish(w); - if (temp_bytes == NULL) { - return NULL; - } - _PyArena_AddPyObject(p->arena, temp_bytes); - const char *temp_str = PyBytes_AsString(temp_bytes); - const char *temp_end = temp_str + PyBytes_GET_SIZE(temp_bytes); - common_indent_len = _Py_search_longest_common_leading_whitespace( - temp_str, temp_end, &common_indent_start); - // _py_serach_longest_common_leading_whitespace() may return NULL when - // indent_len is 0. - if (common_indent_len == 0) { - common_indent_start = ""; + assert(indent_start != NULL); // TODO: is this assert true? + // search_longest_common_leading_whitespace() may not set indent_start when string is empty. 
+ if (indent_len == 0) { + indent_start = ""; } } @@ -1582,7 +1637,7 @@ _get_resized_exprs(Parser *p, Token *a, asdl_expr_seq *raw_expressions, Token *b } if (item->kind == Constant_kind) { - item = _PyPegen_decode_fstring_part(p, i == 0, is_raw, common_indent_start, common_indent_len, item, b); + item = _PyPegen_decode_fstring_part(p, i == 0, is_raw, indent_start, indent_len, item, b); if (item == NULL) { return NULL; }