From 935d4f0e42e81e1e7a4f95b949e9e72be9e4e7b8 Mon Sep 17 00:00:00 2001
From: Inada Naoki <songofacandy@gmail.com>
Date: Fri, 2 Jan 2026 13:35:04 +0900
Subject: [PATCH 1/5] first implementation of d-string

---
 Lib/test/test_dstring.py  |  47 ++++++++++
 Lib/test/test_tokenize.py |   4 +-
 Lib/tokenize.py           |   3 +-
 Parser/action_helpers.c   | 180 ++++++++++++++++++++++++++++++++++++--
 Parser/lexer/lexer.c      |  59 ++++++++-----
 Parser/string_parser.c    | 119 ++++++++++++++++++++++++-
 6 files changed, 376 insertions(+), 36 deletions(-)
 create mode 100644 Lib/test/test_dstring.py

diff --git a/Lib/test/test_dstring.py b/Lib/test/test_dstring.py
new file mode 100644
index 00000000000000..09592980dee8c5
--- /dev/null
+++ b/Lib/test/test_dstring.py
@@ -0,0 +1,47 @@
+import unittest
+
+
+class DStringTestCase(unittest.TestCase):
+    def assertAllRaise(self, exception_type, regex, error_strings):
+        """Check that eval() of each source string raises a matching error."""
+        for source in error_strings:
+            # One subtest per string, so a single failure does not hide
+            # the outcome for the remaining strings.
+            with self.subTest(str=source):
+                with self.assertRaisesRegex(exception_type, regex):
+                    eval(source)
+
+    def test_single_quote(self):
+        exprs = [
+            "d'hello'",
+            'D"hello"',
+            "d'hello\\nworld'",
+        ]
+        self.assertAllRaise(SyntaxError, "d-string must be triple-quoted", exprs)
+
+    def test_empty_dstring(self):
+        exprs = [
+            "d''''''",
+            'D""""""',
+        ]
+        self.assertAllRaise(SyntaxError, "d-string must start with a newline", exprs)
+
+    def test_no_last_newline(self):
+        exprs = [
+            "d'''\nhello world'''",
+            'D"""\nhello world"""',
+            "df'''\nhello {42}'''",
+        ]
+        self.assertAllRaise(SyntaxError, "d-string must end with an indent line", exprs)
+
+    def test_simple_dstring(self):
+        self.assertEqual(eval('d"""\n  hello world\n  """'), "hello world\n")
+        self.assertEqual(eval('d"""\n  hello world\n """'), " hello world\n")
+        self.assertEqual(eval('d"""\n  hello world\n"""'), "  hello world\n")
+        self.assertEqual(eval('d"""\n  hello world\\\n """'), " hello world")
+        self.assertEqual(eval('dr"""\n  hello world\\\n """'), " hello world\\\n")
+
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index ca67e381958757..2110f4c6a48320 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -3420,7 +3420,7 @@ def determine_valid_prefixes():
         # some uppercase-only prefix is added.
         for letter in itertools.chain(string.ascii_lowercase, string.ascii_uppercase):
             try:
-                eval(f'{letter}""')
+                eval(f'{letter}"""\n"""')  # d-string needs multiline
                 single_char_valid_prefixes.add(letter.lower())
             except SyntaxError:
                 pass
@@ -3444,7 +3444,7 @@ def determine_valid_prefixes():
                             # because it's a valid expression: not ""
                             continue
                         try:
-                            eval(f'{p}""')
+                            eval(f'{p}"""\n"""')  # d-string needs multiline
 
                             # No syntax error, so p is a valid string
                             # prefix.
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index 11c134482db024..74a709024174b6 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -86,7 +86,8 @@ def _all_string_prefixes():
     # The valid string prefixes. Only contain the lower case versions,
     #  and don't contain any permutations (include 'fr', but not
     #  'rf'). The various permutations will be generated.
-    _valid_string_prefixes = ['b', 'r', 'u', 'f', 't', 'br', 'fr', 'tr']
+    _valid_string_prefixes = ['b', 'r', 'u', 'f', 't', 'd', 'br', 'fr', 'tr',
+                              'bd', 'rd', 'fd', 'td', 'brd', 'frd', 'trd']
     # if we add binary f-strings, add: ['fb', 'fbr']
     result = {''}
     for prefix in _valid_string_prefixes:
diff --git a/Parser/action_helpers.c b/Parser/action_helpers.c
index 50856686335a14..2f143788ff53ab 100644
--- a/Parser/action_helpers.c
+++ b/Parser/action_helpers.c
@@ -1292,24 +1292,124 @@ _PyPegen_nonparen_genexp_in_call(Parser *p, expr_ty args, asdl_comprehension_seq
 
 // Fstring stuff
 
+// Append [line_start, line_end) to `w`, decoding escapes unless `is_raw`.
+// Returns 0 on success, -1 with an exception set on failure.
+static int
+unicodewriter_write_line(Parser *p, PyUnicodeWriter *w, const char *line_start, const char *line_end,
+                         int is_raw, Token* token)
+{
+    Py_ssize_t line_len = line_end - line_start;
+    if (is_raw || memchr(line_start, '\\', line_len) == NULL) {
+        // Fast path: no escape sequence to process.
+        return PyUnicodeWriter_WriteUTF8(w, line_start, line_len);
+    }
+    // raw=0: the line contains a backslash, so escapes must be decoded.
+    PyObject *line = _PyPegen_decode_string(p, 0, line_start, line_len, token);
+    int res = (line == NULL) ? -1 : PyUnicodeWriter_WriteStr(w, line);
+    Py_XDECREF(line);
+    return res < 0 ? -1 : 0;
+}
+
+static PyObject*
+_PyPegen_dedent_string_part(Parser *p, const char *s, size_t len, int indent_char, Py_ssize_t dedent_count,
+                            int is_raw, int is_first, expr_ty constant, Token* token)
+{
+    Py_ssize_t lineno = constant->lineno;
+    const char *line_start = s;
+    const char *s_end = s + len;
+
+    PyUnicodeWriter *w = PyUnicodeWriter_Create(len);
+    if (w == NULL) {
+        return NULL;
+    }
+    if (is_first) {
+        assert (line_start[0] == '\n');
+        line_start++;  // skip the first newline
+    }
+    else {
+        // Example: df"""
+        //      first part {param} second part
+        //      next line
+        //    """
+        // We don't need to dedent the first line in the non-first parts.
+        const char *line_end = memchr(line_start, '\n', s_end - line_start);
+        if (line_end) {
+            line_end++; // include the newline
+        }
+        else {
+            line_end = s_end;
+        }
+        if (unicodewriter_write_line(p, w, line_start, line_end, is_raw, token) < 0) {
+            PyUnicodeWriter_Discard(w);
+            return NULL;
+        }
+        line_start = line_end;
+    }
+
+    while (line_start < s + len) {
+        lineno++;
+
+        Py_ssize_t i = 0;
+        while (line_start + i < s_end && i < dedent_count && line_start[i] == indent_char) {
+            i++;
+        }
+
+        if (line_start[i] == '\0') {  // found an empty line without newline.
+            break;
+        }
+        if (line_start[i] == '\n') {  // found an empty line with newline.
+            if (PyUnicodeWriter_WriteChar(w, '\n') < 0) {
+                PyUnicodeWriter_Discard(w);
+                return NULL;
+            }
+            line_start += i+1;
+            continue;
+        }
+        if (i < dedent_count) {  // found an invalid indent.
+            assert(line_start[i] != indent_char);
+            PyUnicodeWriter_Discard(w);
+            RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError, lineno, i, lineno, i+1,
+                "d-string line missing valid indentation");
+            return NULL;
+        }
+
+        // found an indented line. let's dedent it.
+        line_start += i;
+        const char *line_end = memchr(line_start, '\n', s_end - line_start);
+        if (line_end) {
+            line_end++; // include the newline
+        }
+        else {
+            line_end = s_end;
+        }
+        if (unicodewriter_write_line(p, w, line_start, line_end, is_raw, token) < 0) {
+            PyUnicodeWriter_Discard(w);
+            return NULL;
+        }
+        line_start = line_end;
+    }
+    return  PyUnicodeWriter_Finish(w);
+}
+
 static expr_ty
-_PyPegen_decode_fstring_part(Parser* p, int is_raw, expr_ty constant, Token* token) {
+_PyPegen_decode_fstring_part(Parser* p, int is_first, int is_raw, int indent_char, Py_ssize_t dedent_count, expr_ty constant, Token* token) {
     assert(PyUnicode_CheckExact(constant->v.Constant.value));
 
     const char* bstr = PyUnicode_AsUTF8(constant->v.Constant.value);
     if (bstr == NULL) {
         return NULL;
     }
+    is_raw = is_raw || strchr(bstr, '\\') == NULL;
 
-    size_t len;
-    if (strcmp(bstr, "{{") == 0 || strcmp(bstr, "}}") == 0) {
-        len = 1;
-    } else {
-        len = strlen(bstr);
+    PyObject *str = NULL;
+    if (dedent_count > 0) {
+        str = _PyPegen_dedent_string_part(p, bstr, strlen(bstr), indent_char, dedent_count,
+                                        is_raw, is_first, constant, token);
+    }
+    else {
+        str = _PyPegen_decode_string(p, is_raw, bstr, strlen(bstr), token);
     }
 
-    is_raw = is_raw || strchr(bstr, '\\') == NULL;
-    PyObject *str = _PyPegen_decode_string(p, is_raw, bstr, len, token);
     if (str == NULL) {
         _Pypegen_raise_decode_error(p);
         return NULL;
@@ -1340,12 +1440,74 @@ _get_resized_exprs(Parser *p, Token *a, asdl_expr_seq *raw_expressions, Token *b
         return NULL;
     }
     int is_raw = strpbrk(quote_str, "rR") != NULL;
+    int is_dedent = strpbrk(quote_str, "dD") != NULL;
+    int indent_char = 0;
+    Py_ssize_t indent_count = 0;
 
     asdl_expr_seq *seq = _Py_asdl_expr_seq_new(total_items, p->arena);
     if (seq == NULL) {
         return NULL;
     }
 
+    if (is_dedent) {
+        expr_ty first_item = asdl_seq_GET(raw_expressions, 0);
+        if (first_item->kind != Constant_kind
+                || PyUnicode_ReadChar(first_item->v.Constant.value, 0) != '\n') {
+            RAISE_SYNTAX_ERROR_KNOWN_LOCATION(
+                first_item,
+                "d-string must start with a newline"
+            );
+            return NULL;
+        }
+
+        expr_ty last_item = asdl_seq_GET(raw_expressions, n_items - 1);
+        if (last_item->kind != Constant_kind) {
+            RAISE_SYNTAX_ERROR_KNOWN_LOCATION(
+                last_item,
+                "d-string must end with an indent line"
+            );
+            return NULL;
+        }
+
+        Py_ssize_t blen;
+        const char *bstr = PyUnicode_AsUTF8AndSize(last_item->v.Constant.value, &blen);
+        if (bstr == NULL) {
+            return NULL;
+        }
+
+        // memrchr is GNU extension; use manual loop for portability.
+        const char *lastline = bstr + blen;
+        while (bstr < lastline) {
+            if (lastline[-1] == '\n') {
+                break;
+            }
+            lastline--;
+            if (*lastline != ' ' && *lastline != '\t') {
+                RAISE_SYNTAX_ERROR_KNOWN_LOCATION(
+                    last_item,
+                    "d-string must end with an indent line"
+                );
+                return NULL;
+            }
+        }
+
+        // checks indent of the last line.
+        indent_count = bstr + blen - lastline;
+        if (indent_count > 0) {
+            indent_char = lastline[0];
+
+            for (Py_ssize_t i = 1; i < indent_count; i++) {
+                if (lastline[i] != indent_char) {
+                    RAISE_ERROR_KNOWN_LOCATION(
+                        p, PyExc_TabError, last_item->end_lineno, i, last_item->end_lineno, i+1,
+                        "inconsistent use of tabs and spaces in indentation"
+                    );
+                    return NULL;
+                }
+            }
+        }
+    }
+
     Py_ssize_t index = 0;
     for (Py_ssize_t i = 0; i < n_items; i++) {
         expr_ty item = asdl_seq_GET(raw_expressions, i);
@@ -1377,7 +1539,7 @@ _get_resized_exprs(Parser *p, Token *a, asdl_expr_seq *raw_expressions, Token *b
         }
 
         if (item->kind == Constant_kind) {
-            item = _PyPegen_decode_fstring_part(p, is_raw, item, b);
+            item = _PyPegen_decode_fstring_part(p, i == 0, is_raw, indent_char, indent_count, item, b);
             if (item == NULL) {
                 return NULL;
             }
diff --git a/Parser/lexer/lexer.c b/Parser/lexer/lexer.c
index 7f25afec302c22..dc3d369c023119 100644
--- a/Parser/lexer/lexer.c
+++ b/Parser/lexer/lexer.c
@@ -455,7 +455,7 @@ tok_continuation_line(struct tok_state *tok) {
 static int
 maybe_raise_syntax_error_for_string_prefixes(struct tok_state *tok,
                                              int saw_b, int saw_r, int saw_u,
-                                             int saw_f, int saw_t) {
+                                             int saw_f, int saw_t, int saw_d) {
     // Supported: rb, rf, rt (in any order)
     // Unsupported: ub, ur, uf, ut, bf, bt, ft (in any order)
 
@@ -480,6 +480,9 @@ maybe_raise_syntax_error_for_string_prefixes(struct tok_state *tok,
     if (saw_u && saw_t) {
         RETURN_SYNTAX_ERROR("u", "t");
     }
+    if (saw_u && saw_d) {
+        RETURN_SYNTAX_ERROR("u", "d");
+    }
 
     if (saw_b && saw_f) {
         RETURN_SYNTAX_ERROR("b", "f");
@@ -487,6 +490,9 @@ maybe_raise_syntax_error_for_string_prefixes(struct tok_state *tok,
     if (saw_b && saw_t) {
         RETURN_SYNTAX_ERROR("b", "t");
     }
+    if (saw_b && saw_d) {
+        RETURN_SYNTAX_ERROR("b", "d");
+    }
 
     if (saw_f && saw_t) {
         RETURN_SYNTAX_ERROR("f", "t");
@@ -741,8 +747,8 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
     /* Identifier (most frequent token!) */
     nonascii = 0;
     if (is_potential_identifier_start(c)) {
-        /* Process the various legal combinations of b"", r"", u"", and f"". */
-        int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0, saw_t = 0;
+        /* Process the various legal combinations of b"", r"", u"", f"", and d"". */
+        int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0, saw_t = 0, saw_d = 0;
         while (1) {
             if (!saw_b && (c == 'b' || c == 'B')) {
                 saw_b = 1;
@@ -762,6 +768,9 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
             else if (!saw_t && (c == 't' || c == 'T')) {
                 saw_t = 1;
             }
+            else if (!saw_d && (c == 'd' || c == 'D')) {
+                saw_d = 1;
+            }
             else {
                 break;
             }
@@ -769,7 +778,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
             if (c == '"' || c == '\'') {
                 // Raise error on incompatible string prefixes:
                 int status = maybe_raise_syntax_error_for_string_prefixes(
-                    tok, saw_b, saw_r, saw_u, saw_f, saw_t);
+                    tok, saw_b, saw_r, saw_u, saw_f, saw_t, saw_d);
                 if (status < 0) {
                     return MAKE_TOKEN(ERRORTOKEN);
                 }
@@ -1049,7 +1058,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
     }
 
   f_string_quote:
-    if (((Py_TOLOWER(*tok->start) == 'f' || Py_TOLOWER(*tok->start) == 'r' || Py_TOLOWER(*tok->start) == 't')
+    if (((Py_TOLOWER(*tok->start) == 'f' || Py_TOLOWER(*tok->start) == 'r' || Py_TOLOWER(*tok->start) == 't' || Py_TOLOWER(*tok->start) == 'd')
         && (c == '\'' || c == '"'))) {
 
         int quote = c;
@@ -1089,6 +1098,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
         the_current_tok->kind = TOK_FSTRING_MODE;
         the_current_tok->quote = quote;
         the_current_tok->quote_size = quote_size;
+        the_current_tok->raw = 0;
         the_current_tok->start = tok->start;
         the_current_tok->multi_line_start = tok->line_start;
         the_current_tok->first_line = tok->lineno;
@@ -1101,25 +1111,28 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
         the_current_tok->in_debug = 0;
 
         enum string_kind_t string_kind = FSTRING;
-        switch (*tok->start) {
-            case 'T':
-            case 't':
-                the_current_tok->raw = Py_TOLOWER(*(tok->start + 1)) == 'r';
-                string_kind = TSTRING;
-                break;
-            case 'F':
-            case 'f':
-                the_current_tok->raw = Py_TOLOWER(*(tok->start + 1)) == 'r';
-                break;
-            case 'R':
-            case 'r':
-                the_current_tok->raw = 1;
-                if (Py_TOLOWER(*(tok->start + 1)) == 't') {
+        for (const char *p = tok->start; *p != c; p++) {
+            switch (*p) {
+                case 'f':
+                case 'F':
+                    break;
+                case 't':
+                case 'T':
                     string_kind = TSTRING;
-                }
-                break;
-            default:
-                Py_UNREACHABLE();
+                    break;
+                case 'r':
+                case 'R':
+                    the_current_tok->raw = 1;
+                    break;
+                case 'd':
+                case 'D':
+                    if (quote_size != 3) {
+                        return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "d-string must be a multiline string"));
+                    }
+                    break;
+                default:
+                    Py_UNREACHABLE();
+            }
         }
 
         the_current_tok->string_kind = string_kind;
diff --git a/Parser/string_parser.c b/Parser/string_parser.c
index b164dfbc81a933..99f73c099ff7a2 100644
--- a/Parser/string_parser.c
+++ b/Parser/string_parser.c
@@ -247,6 +247,107 @@ _PyPegen_decode_string(Parser *p, int raw, const char *s, size_t len, Token *t)
     return decode_unicode_with_escapes(p, s, len, t);
 }
 
+static PyObject*
+_PyPegen_dedent_string(Parser *p, int is_raw, const char *s, Py_ssize_t len, Token* token)
+{
+    // this function is for d-string without t/f-string.
+    // dt/df-string are processed in action_helper.c:_get_resized_exprs
+    Py_ssize_t lineno = token->lineno;
+
+    if (len == 0 || s[0] != '\n') {
+        RAISE_SYNTAX_ERROR_KNOWN_LOCATION(
+            token,
+            "d-string must start with a newline at line %d",
+            lineno
+        );
+        return NULL;
+    }
+
+    // find the last newline and check all chars after it are spaces or tabs.
+    const char *endline = s + len;
+    while (endline[-1] != '\n') {
+        assert(endline > s); // we know at least the first char is a newline.
+        endline--;
+        if (*endline != ' ' && *endline != '\t') {
+            RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError,
+                // specify the location of just before closing triple quotes.
+                token->end_lineno, token->end_col_offset - 3, token->end_lineno, token->end_col_offset-2,
+                "d-string must end with an indent line");
+            return NULL;
+        }
+    }
+
+    // Now, prefix is both the dedent indentation and the end of the d-string body.
+    Py_ssize_t indent_len = s + len - endline;
+    int indent_char = endline[0];  // ' ', '\t', or '\0'.
+
+    // checks the prefix is consistant.
+    for (Py_ssize_t i = 1; i < indent_len; i++) {
+        if (endline[i] != indent_char) {
+            RAISE_ERROR_KNOWN_LOCATION(
+                p, PyExc_TabError, token->end_lineno, token->end_col_offset - 3, token->end_lineno, token->end_col_offset -2,
+                "inconsistent use of tabs and spaces in indentation");
+            return NULL;
+        }
+    }
+
+    PyUnicodeWriter *w = PyUnicodeWriter_Create(endline - s);
+    if (w == NULL) {
+        return NULL;
+    }
+    const char *line_start = s + 1;  // skip the first newline
+
+    while (line_start < endline) {
+        lineno++;
+
+        Py_ssize_t i;
+        for (i = 0; i < indent_len && line_start + i < endline; i++) {
+            if (line_start[i] != indent_char) {
+                if (line_start[i] == '\n') {
+                    break; // empty line
+                }
+                PyUnicodeWriter_Discard(w);
+                RAISE_ERROR_KNOWN_LOCATION(p, PyExc_IndentationError, lineno, i, lineno, i+1,
+                    "d-string missing valid indentation");
+                return NULL;
+            }
+        }
+
+        if (line_start[i] == '\n') {  // found an empty line with newline.
+            if (PyUnicodeWriter_WriteChar(w, '\n') < 0) {
+                PyUnicodeWriter_Discard(w);
+                return NULL;
+            }
+            line_start += i+1;
+            continue;
+        }
+
+        // found a indented line. let's dedent it.
+        line_start += i;
+        const char *line_end = memchr(line_start, '\n', endline - line_start);
+        assert(line_end != NULL);  // we know there is at least one newline before endline.
+        line_end++; // include the newline in the line
+
+        if (is_raw || memchr(line_start, '\\', line_end - line_start) == NULL) {
+            if (PyUnicodeWriter_WriteUTF8(w, line_start, line_end - line_start) < 0) {
+                PyUnicodeWriter_Discard(w);
+                return NULL;
+            }
+        }
+        else {
+            PyObject *line = _PyPegen_decode_string(p, 1, line_start, line_end - line_start, token);
+            if (line == NULL || PyUnicodeWriter_WriteStr(w, line) < 0) {
+                Py_XDECREF(line);
+                return NULL;
+            }
+            Py_DECREF(line);
+        }
+
+        line_start = line_end;
+    }
+    return PyUnicodeWriter_Finish(w);
+}
+
 /* s must include the bracketing quote characters, and r, b &/or f prefixes
     (if any), and embedded escape sequences (if any). (f-strings are handled by the parser)
    _PyPegen_parse_string parses it, and returns the decoded Python string object. */
@@ -262,9 +363,10 @@ _PyPegen_parse_string(Parser *p, Token *t)
     int quote = Py_CHARMASK(*s);
     int bytesmode = 0;
     int rawmode = 0;
+    int dedentmode = 0;
 
     if (Py_ISALPHA(quote)) {
-        while (!bytesmode || !rawmode) {
+        while (!bytesmode || !rawmode || !dedentmode) {
             if (quote == 'b' || quote == 'B') {
                 quote =(unsigned char)*++s;
                 bytesmode = 1;
@@ -276,6 +378,10 @@ _PyPegen_parse_string(Parser *p, Token *t)
                 quote = (unsigned char)*++s;
                 rawmode = 1;
             }
+            else if (quote == 'd' || quote == 'D') {
+                quote =(unsigned char)*++s;
+                dedentmode = 1;
+            }
             else {
                 break;
             }
@@ -315,10 +421,17 @@ _PyPegen_parse_string(Parser *p, Token *t)
             return NULL;
         }
     }
+    else if (dedentmode) {
+        RAISE_SYNTAX_ERROR_KNOWN_LOCATION(
+            t,
+            "d-string must be triple-quoted");
+        return NULL;
+    }
 
     /* Avoid invoking escape decoding routines if possible. */
     rawmode = rawmode || strchr(s, '\\') == NULL;
     if (bytesmode) {
+        assert(!dedentmode);
         /* Disallow non-ASCII characters. */
         const char *ch;
         for (ch = s; *ch; ch++) {
@@ -335,5 +448,9 @@ _PyPegen_parse_string(Parser *p, Token *t)
         }
         return decode_bytes_with_escapes(p, s, (Py_ssize_t)len, t);
     }
+    if (dedentmode) {
+        return _PyPegen_dedent_string(p, rawmode, s, len, t);
+    }
     return _PyPegen_decode_string(p, rawmode, s, len, t);
 }
+

From 3187540ea669eb80ad12722fb53df585ef393ea5 Mon Sep 17 00:00:00 2001
From: Inada Naoki <songofacandy@gmail.com>
Date: Wed, 21 Jan 2026 01:29:30 +0900
Subject: [PATCH 2/5] use least indent instead of closing quote indent

---
 Lib/test/test_dstring.py |   9 ---
 Objects/unicodeobject.c  |   6 +-
 Parser/action_helpers.c  | 110 ++++++++++++++++++++-----------------
 Parser/lexer/lexer.c     |   3 -
 Parser/string_parser.c   | 116 +++++++++++++++++++--------------------
 5 files changed, 120 insertions(+), 124 deletions(-)

diff --git a/Lib/test/test_dstring.py b/Lib/test/test_dstring.py
index 09592980dee8c5..7927877e8bb088 100644
--- a/Lib/test/test_dstring.py
+++ b/Lib/test/test_dstring.py
@@ -26,14 +26,6 @@ def test_empty_dstring(self):
         ]
         self.assertAllRaise(SyntaxError, "d-string must start with a newline", exprs)
 
-    def test_no_last_newline(self):
-        exprs = [
-            "d'''\nhello world'''",
-            'D"""\nhello world"""',
-            "df'''\nhello {42}'''",
-        ]
-        self.assertAllRaise(SyntaxError, "d-string must end with an indent line", exprs)
-
     def test_simple_dstring(self):
         self.assertEqual(eval('d"""\n  hello world\n  """'), "hello world\n")
         self.assertEqual(eval('d"""\n  hello world\n """'), " hello world\n")
@@ -42,6 +34,5 @@ def test_simple_dstring(self):
         self.assertEqual(eval('dr"""\n  hello world\\\n """'), " hello world\\\n")
 
 
-
 if __name__ == '__main__':
     unittest.main()
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index fdcbcf51cb62c2..412dbfc53b6353 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -13480,8 +13480,8 @@ of all lines in the [src, end).
 It returns the length of the common leading whitespace and sets `output` to
 point to the beginning of the common leading whitespace if length > 0.
 */
-static Py_ssize_t
-search_longest_common_leading_whitespace(
+Py_ssize_t
+_Py_search_longest_common_leading_whitespace(
     const char *const src,
     const char *const end,
     const char **output)
@@ -13576,7 +13576,7 @@ _PyUnicode_Dedent(PyObject *unicode)
     // [whitespace_start, whitespace_start + whitespace_len)
     // describes the current longest common leading whitespace
     const char *whitespace_start = NULL;
-    Py_ssize_t whitespace_len = search_longest_common_leading_whitespace(
+    Py_ssize_t whitespace_len = _Py_search_longest_common_leading_whitespace(
         src, end, &whitespace_start);
 
     if (whitespace_len == 0) {
diff --git a/Parser/action_helpers.c b/Parser/action_helpers.c
index 2f143788ff53ab..50337f48b3a303 100644
--- a/Parser/action_helpers.c
+++ b/Parser/action_helpers.c
@@ -1311,8 +1311,8 @@ unicodewriter_write_line(Parser *p, PyUnicodeWriter *w, const char *line_start,
 }
 
 static PyObject*
-_PyPegen_dedent_string_part(Parser *p, const char *s, size_t len, int indent_char, Py_ssize_t dedent_count,
-                            int is_raw, int is_first, expr_ty constant, Token* token)
+_PyPegen_dedent_string_part(Parser *p, const char *s, size_t len, const char *indent, Py_ssize_t indent_len,
+                            int is_first, int is_raw, expr_ty constant, Token* token)
 {
     Py_ssize_t lineno = constant->lineno;
     const char *line_start = s;
@@ -1350,7 +1350,7 @@ _PyPegen_dedent_string_part(Parser *p, const char *s, size_t len, int indent_cha
         lineno++;
 
         Py_ssize_t i = 0;
-        while (line_start + i < s_end && i < dedent_count && line_start[i] == indent_char) {
+        while (line_start + i < s_end && i < indent_len && line_start[i] == indent[i]) {
             i++;
         }
 
@@ -1365,8 +1365,8 @@ _PyPegen_dedent_string_part(Parser *p, const char *s, size_t len, int indent_cha
             line_start += i+1;
             continue;
         }
-        if (i < dedent_count) {  // found an invalid indent.
-            assert(line_start[i] != indent_char);
+        if (i < indent_len) {  // found an invalid indent.
+            assert(line_start[i] != indent[i]);
             PyUnicodeWriter_Discard(w);
             RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError, lineno, i, lineno, i+1,
                 "d-string line missing valid indentation");
@@ -1392,7 +1392,10 @@ _PyPegen_dedent_string_part(Parser *p, const char *s, size_t len, int indent_cha
 }
 
 static expr_ty
-_PyPegen_decode_fstring_part(Parser* p, int is_first, int is_raw, int indent_char, Py_ssize_t dedent_count, expr_ty constant, Token* token) {
+_PyPegen_decode_fstring_part(Parser* p, int is_first, int is_raw,
+                             const char *indent, Py_ssize_t indent_len,
+                             expr_ty constant, Token* token)
+{
     assert(PyUnicode_CheckExact(constant->v.Constant.value));
 
     const char* bstr = PyUnicode_AsUTF8(constant->v.Constant.value);
@@ -1402,9 +1405,9 @@ _PyPegen_decode_fstring_part(Parser* p, int is_first, int is_raw, int indent_cha
     is_raw = is_raw || strchr(bstr, '\\') == NULL;
 
     PyObject *str = NULL;
-    if (dedent_count > 0) {
-        str = _PyPegen_dedent_string_part(p, bstr, strlen(bstr), indent_char, dedent_count,
-                                        is_raw, is_first, constant, token);
+    if (indent_len > 0) {
+        str = _PyPegen_dedent_string_part(p, bstr, strlen(bstr), indent, indent_len,
+                                          is_first, is_raw, constant, token);
     }
     else {
         str = _PyPegen_decode_string(p, is_raw, bstr, strlen(bstr), token);
@@ -1423,6 +1426,14 @@ _PyPegen_decode_fstring_part(Parser* p, int is_first, int is_raw, int indent_cha
                            p->arena);
 }
 
+/* defined in unicodeobject.c */
+extern Py_ssize_t
+_Py_search_longest_common_leading_whitespace(
+    const char *const src,
+    const char *const end,
+    const char **output
+    );
+
 static asdl_expr_seq *
 _get_resized_exprs(Parser *p, Token *a, asdl_expr_seq *raw_expressions, Token *b, enum string_kind_t string_kind)
 {
@@ -1441,14 +1452,15 @@ _get_resized_exprs(Parser *p, Token *a, asdl_expr_seq *raw_expressions, Token *b
     }
     int is_raw = strpbrk(quote_str, "rR") != NULL;
     int is_dedent = strpbrk(quote_str, "dD") != NULL;
-    int indent_char = 0;
-    Py_ssize_t indent_count = 0;
 
     asdl_expr_seq *seq = _Py_asdl_expr_seq_new(total_items, p->arena);
     if (seq == NULL) {
         return NULL;
     }
 
+    const char *common_indent_start = NULL;
+    Py_ssize_t common_indent_len = 0;
+
     if (is_dedent) {
         expr_ty first_item = asdl_seq_GET(raw_expressions, 0);
         if (first_item->kind != Constant_kind
@@ -1460,52 +1472,52 @@ _get_resized_exprs(Parser *p, Token *a, asdl_expr_seq *raw_expressions, Token *b
             return NULL;
         }
 
-        expr_ty last_item = asdl_seq_GET(raw_expressions, n_items - 1);
-        if (last_item->kind != Constant_kind) {
-            RAISE_SYNTAX_ERROR_KNOWN_LOCATION(
-                last_item,
-                "d-string must end with an indent line"
-            );
+        // Instead of calculating common indent from all parts,
+        // build temporary string and calculate common indent from it.
+        PyBytesWriter *w = PyBytesWriter_Create(0);
+        if (w == NULL) {
             return NULL;
         }
 
-        Py_ssize_t blen;
-        const char *bstr = PyUnicode_AsUTF8AndSize(last_item->v.Constant.value, &blen);
-        if (bstr == NULL) {
-            return NULL;
-        }
+        for (Py_ssize_t i = 0; i < n_items; i++) {
+            expr_ty item = asdl_seq_GET(raw_expressions, i);
 
-        // memrchr is GNU extension; use manual loop for portability.
-        const char *lastline = bstr + blen;
-        while (bstr < lastline) {
-            if (lastline[-1] == '\n') {
-                break;
-            }
-            lastline--;
-            if (*lastline != ' ' && *lastline != '\t') {
-                RAISE_SYNTAX_ERROR_KNOWN_LOCATION(
-                    last_item,
-                    "d-string must end with an indent line"
-                );
-                return NULL;
+            if (item->kind == JoinedStr_kind) {
+                // Write a placeholder.
+                if (PyBytesWriter_WriteBytes(w, "X", 1) < 0) {
+                    PyBytesWriter_Discard(w);
+                    return NULL;
+                }
+                continue;
             }
-        }
-
-        // checks indent of the last line.
-        indent_count = bstr + blen - lastline;
-        if (indent_count > 0) {
-            indent_char = lastline[0];
-
-            for (Py_ssize_t i = 1; i < indent_count; i++) {
-                if (lastline[i] != indent_char) {
-                    RAISE_ERROR_KNOWN_LOCATION(
-                        p, PyExc_TabError, last_item->end_lineno, i, last_item->end_lineno, i+1,
-                        "inconsistent use of tabs and spaces in indentation"
-                    );
+            if (item->kind == Constant_kind) {
+                Py_ssize_t blen;
+                const char *bstr = PyUnicode_AsUTF8AndSize(item->v.Constant.value, &blen);
+                if (bstr == NULL || PyBytesWriter_WriteBytes(w, bstr, blen) < 0) {
+                    PyBytesWriter_Discard(w);
                     return NULL;
                 }
+                continue;
             }
         }
+        // Add a terminator to include the last line before the ending quote
+        if (PyBytesWriter_WriteBytes(w, "X", 1) < 0) {
+            PyBytesWriter_Discard(w);
+            return NULL;
+        }
+
+        // TODO: instead of creating temp_bytes, we could search for the
+        // common indent in each part directly. But this needs a reimplementation
+        // of _Py_search_longest_common_leading_whitespace.
+        PyObject *temp_bytes = PyBytesWriter_Finish(w);
+        if (temp_bytes == NULL) {
+            return NULL;
+        }
+        _PyArena_AddPyObject(p->arena, temp_bytes);
+        const char *temp_str = PyBytes_AsString(temp_bytes);
+        const char *temp_end = temp_str + PyBytes_GET_SIZE(temp_bytes);
+        common_indent_len = _Py_search_longest_common_leading_whitespace(
+            temp_str, temp_end, &common_indent_start);
     }
 
     Py_ssize_t index = 0;
@@ -1539,7 +1551,7 @@ _get_resized_exprs(Parser *p, Token *a, asdl_expr_seq *raw_expressions, Token *b
         }
 
         if (item->kind == Constant_kind) {
-            item = _PyPegen_decode_fstring_part(p, i == 0, is_raw, indent_char, indent_count, item, b);
+            item = _PyPegen_decode_fstring_part(p, i == 0, is_raw, common_indent_start, common_indent_len, item, b);
             if (item == NULL) {
                 return NULL;
             }
diff --git a/Parser/lexer/lexer.c b/Parser/lexer/lexer.c
index dc3d369c023119..cfccdd55a6ae11 100644
--- a/Parser/lexer/lexer.c
+++ b/Parser/lexer/lexer.c
@@ -490,9 +490,6 @@ maybe_raise_syntax_error_for_string_prefixes(struct tok_state *tok,
     if (saw_b && saw_t) {
         RETURN_SYNTAX_ERROR("b", "t");
     }
-    if (saw_b && saw_d) {
-        RETURN_SYNTAX_ERROR("b", "d");
-    }
 
     if (saw_f && saw_t) {
         RETURN_SYNTAX_ERROR("f", "t");
diff --git a/Parser/string_parser.c b/Parser/string_parser.c
index 99f73c099ff7a2..0fbc20ee4ce6fb 100644
--- a/Parser/string_parser.c
+++ b/Parser/string_parser.c
@@ -247,66 +247,54 @@ _PyPegen_decode_string(Parser *p, int raw, const char *s, size_t len, Token *t)
     return decode_unicode_with_escapes(p, s, len, t);
 }
 
+/* defined in unicodeobject.c */
+extern Py_ssize_t
+_Py_search_longest_common_leading_whitespace(
+    const char *const src,
+    const char *const end,
+    const char **output
+    );
+
+// Dedent d-string and return result as a bytes.
 static PyObject*
-_PyPegen_dedent_string(Parser *p, int is_raw, const char *s, Py_ssize_t len, Token* token)
+_PyPegen_dedent_string(Parser *p, const char *s, Py_ssize_t len, Token* token)
 {
     // this function is for d-string without t/f-string.
     // dt/df-string are processed in action_helper.c:_get_resized_exprs
     Py_ssize_t lineno = token->lineno;
+    const char *end = s + len;
 
+    // skips the first newline.
     if (len == 0 || s[0] != '\n') {
         RAISE_SYNTAX_ERROR_KNOWN_LOCATION(
             token,
-            "d-string must start with a newline at line %d",
-            lineno
+            "d-string must start with a newline"
         );
         return NULL;
     }
 
-    // find the last newline and check all chars after it are spaces or tabs.
-    const char *endline = s + len;
-    while (endline[-1] != '\n') {
-        assert(endline > s); // we know at least the first char is a newline.
-        endline--;
-        if (*endline != ' ' && *endline != '\t') {
-            RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError,
-                // specify the location of just before closing triple quotes.
-                token->end_lineno, token->end_col_offset - 3, token->end_lineno, token->end_col_offset-2,
-                "d-string must end with an indent line");
-            return NULL;
-        }
-    }
-
-    // Now, prefix is both the dedent indentation and the end of the d-string body.
-    Py_ssize_t indent_len = s + len - endline;
-    int indent_char = endline[0];  // ' ', '\t', or '\0'.
+    // We find common indent from [s, end+1) because we want to include the last line
+    // for indent calculation.
+    assert(*end == '"' || *end == '\''); // end[0:3] is the trailing quotes
+    const char *indent;
+    Py_ssize_t indent_len = _Py_search_longest_common_leading_whitespace(s, end+1, &indent);
 
-    // checks the prefix is consistant.
-    for (Py_ssize_t i = 1; i < indent_len; i++) {
-        if (endline[i] != indent_char) {
-            RAISE_ERROR_KNOWN_LOCATION(
-                p, PyExc_TabError, token->end_lineno, token->end_col_offset - 3, token->end_lineno, token->end_col_offset -2,
-                "inconsistent use of tabs and spaces in indentation");
-            return NULL;
-        }
-    }
-
-    PyUnicodeWriter *w = PyUnicodeWriter_Create(endline - s);
+    PyBytesWriter *w = PyBytesWriter_Create(0);
     if (w == NULL) {
         return NULL;
     }
-    const char *line_start = s + 1;  // skip the first newline
+    const char *line_start = s + 1;
 
-    while (line_start < endline) {
+    while (line_start < end) {
         lineno++;
 
         Py_ssize_t i;
-        for (i = 0; i < indent_len && line_start + i < endline; i++) {
-            if (line_start[i] != indent_char) {
+        for (i = 0; i < indent_len; i++) {
+            if (line_start[i] != indent[i]) {
                 if (line_start[i] == '\n') {
                     break; // empty line
                 }
-                PyUnicodeWriter_Discard(w);
+                PyBytesWriter_Discard(w);
                 RAISE_ERROR_KNOWN_LOCATION(p, PyExc_IndentationError, lineno, i, lineno, i+1,
                     "d-string missing valid indentation");
                 return NULL;
@@ -314,8 +302,8 @@ _PyPegen_dedent_string(Parser *p, int is_raw, const char *s, Py_ssize_t len, Tok
         }
 
         if (line_start[i] == '\n') {  // found an empty line with newline.
-            if (PyUnicodeWriter_WriteChar(w, '\n') < 0) {
-                PyUnicodeWriter_Discard(w);
+            if (PyBytesWriter_WriteBytes(w, "\n", 1) < 0) {
+                PyBytesWriter_Discard(w);
                 return NULL;
             }
             line_start += i+1;
@@ -324,28 +312,21 @@ _PyPegen_dedent_string(Parser *p, int is_raw, const char *s, Py_ssize_t len, Tok
 
         // found a indented line. let's dedent it.
         line_start += i;
-        const char *line_end = memchr(line_start, '\n', endline - line_start);
-        assert(line_end != NULL);  // we know there is at least one newline before endline.
-        line_end++; // include the newline in the line
-
-        if (is_raw || memchr(line_start, '\\', line_end - line_start) == NULL) {
-            if (PyUnicodeWriter_WriteUTF8(w, line_start, line_end - line_start) < 0) {
-                PyUnicodeWriter_Discard(w);
-                return NULL;
-            }
+        const char *line_end = memchr(line_start, '\n', end - line_start);
+        if (line_end == NULL) {
+            line_end = end; // last line without newline
         }
         else {
-            PyObject *line = _PyPegen_decode_string(p, 1, line_start, line_end - line_start, token);
-            if (line == NULL || PyUnicodeWriter_WriteStr(w, line) < 0) {
-                Py_XDECREF(line);
-                return NULL;
-            }
-            Py_DECREF(line);
+            line_end++; // include the newline in the line
         }
 
+        if (PyBytesWriter_WriteBytes(w, line_start, line_end - line_start) < 0) {
+            PyBytesWriter_Discard(w);
+            return NULL;
+        }
         line_start = line_end;
     }
-    return PyUnicodeWriter_Finish(w);
+    return PyBytesWriter_Finish(w);
 }
 
 /* s must include the bracketing quote characters, and r, b &/or f prefixes
@@ -427,11 +408,22 @@ _PyPegen_parse_string(Parser *p, Token *t)
             "d-string must be triple-quoted");
         return NULL;
     }
+    PyObject *dedent_bytes = NULL;
+    if (dedentmode) {
+        dedent_bytes = _PyPegen_dedent_string(p, s, len, t);
+        if (dedent_bytes == NULL) {
+            return NULL;
+        }
+        if (PyBytes_AsStringAndSize(dedent_bytes, &s, (Py_ssize_t*)&len) < 0) {
+            Py_DECREF(dedent_bytes);
+            return NULL;
+        }
+    }
 
     /* Avoid invoking escape decoding routines if possible. */
     rawmode = rawmode || strchr(s, '\\') == NULL;
+    PyObject *result;
     if (bytesmode) {
-        assert(!dedentmode);
         /* Disallow non-ASCII characters. */
         const char *ch;
         for (ch = s; *ch; ch++) {
@@ -440,17 +432,21 @@ _PyPegen_parse_string(Parser *p, Token *t)
                                    t,
                                    "bytes can only contain ASCII "
                                    "literal characters");
+                Py_XDECREF(dedent_bytes);
                 return NULL;
             }
         }
         if (rawmode) {
-            return PyBytes_FromStringAndSize(s, (Py_ssize_t)len);
+            result = PyBytes_FromStringAndSize(s, (Py_ssize_t)len);
+        }
+        else {
+            result = decode_bytes_with_escapes(p, s, (Py_ssize_t)len, t);
         }
-        return decode_bytes_with_escapes(p, s, (Py_ssize_t)len, t);
     }
-    if (dedentmode) {
-        return _PyPegen_dedent_string(p, rawmode, s, len, t);
+    else {
+        result = _PyPegen_decode_string(p, rawmode, s, len, t);
     }
-    return _PyPegen_decode_string(p, rawmode, s, len, t);
+    Py_XDECREF(dedent_bytes);
+    return result;
 }
 

From e1320af7496be58f53f6cd646c0a1bce3a2300a2 Mon Sep 17 00:00:00 2001
From: Inada Naoki <songofacandy@gmail.com>
Date: Fri, 23 Jan 2026 23:03:26 +0900
Subject: [PATCH 3/5] fix invalid escape sequences position

---
 Parser/action_helpers.c |  60 ++++++++++++++++-------
 Parser/string_parser.c  | 104 +++++++++++++++++++++++-----------------
 2 files changed, 103 insertions(+), 61 deletions(-)

diff --git a/Parser/action_helpers.c b/Parser/action_helpers.c
index 50337f48b3a303..042b3ea97628ca 100644
--- a/Parser/action_helpers.c
+++ b/Parser/action_helpers.c
@@ -1311,17 +1311,34 @@ unicodewriter_write_line(Parser *p, PyUnicodeWriter *w, const char *line_start,
 }
 
 static PyObject*
-_PyPegen_dedent_string_part(Parser *p, const char *s, size_t len, const char *indent, Py_ssize_t indent_len,
-                            int is_first, int is_raw, expr_ty constant, Token* token)
+_PyPegen_dedent_string_part(
+        Parser *p, const char *s, size_t len, const char *indent, Py_ssize_t indent_len,
+        int is_first, int is_raw, expr_ty constant, Token* token)
 {
     Py_ssize_t lineno = constant->lineno;
     const char *line_start = s;
-    const char *s_end = s + len;
+    const char *end = s + len;
+
+    int _prev_call_invalid = p->call_invalid_rules;
+    if (!_prev_call_invalid && !is_raw) {
+        // _PyPegen_decode_string() and decode_bytes_with_escapes() may call
+        // warn_invalid_escape_sequence(). It may emit a warning or raise SyntaxError
+        // for invalid escape sequences.
+        // We need to call it before dedenting since SyntaxError needs exact lineno
+        // and col_offset of invalid escape sequences.
+        PyObject *t = _PyPegen_decode_string(p, 0, s, len, token);
+        if (t == NULL) {
+            return NULL;
+        }
+        Py_DECREF(t);
+        p->call_invalid_rules = 1;
+    }
 
     PyUnicodeWriter *w = PyUnicodeWriter_Create(len);
     if (w == NULL) {
         return NULL;
     }
+
     if (is_first) {
         assert (line_start[0] == '\n');
         line_start++;  // skip the first newline
@@ -1332,25 +1349,24 @@ _PyPegen_dedent_string_part(Parser *p, const char *s, size_t len, const char *in
         //      next line
         //    """"
         // We don't need to dedent the first line in the non-first parts.
-        const char *line_end = memchr(line_start, '\n', s_end - line_start);
+        const char *line_end = memchr(line_start, '\n', end - line_start);
         if (line_end) {
             line_end++; // include the newline
         }
         else {
-            line_end = s_end;
+            line_end = end;
         }
         if (unicodewriter_write_line(p, w, line_start, line_end, is_raw, token) < 0) {
-            PyUnicodeWriter_Discard(w);
-            return NULL;
+            goto error;
         }
         line_start = line_end;
     }
 
-    while (line_start < s + len) {
+    while (line_start < end) {
         lineno++;
 
         Py_ssize_t i = 0;
-        while (line_start + i < s_end && i < indent_len && line_start[i] == indent[i]) {
+        while (line_start + i < end && i < indent_len && line_start[i] == indent[i]) {
             i++;
         }
 
@@ -1359,36 +1375,39 @@ _PyPegen_dedent_string_part(Parser *p, const char *s, size_t len, const char *in
         }
         if (line_start[i] == '\n') {  // found an empty line with newline.
             if (PyUnicodeWriter_WriteChar(w, '\n') < 0) {
-                PyUnicodeWriter_Discard(w);
-                return NULL;
+                goto error;
             }
             line_start += i+1;
             continue;
         }
         if (i < indent_len) {  // found an invalid indent.
             assert(line_start[i] != indent[i]);
-            PyUnicodeWriter_Discard(w);
             RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError, lineno, i, lineno, i+1,
                 "d-string line missing valid indentation");
-            return NULL;
+            goto error;
         }
 
         // found a indented line. let's dedent it.
         line_start += i;
-        const char *line_end = memchr(line_start, '\n', s_end - line_start);
+        const char *line_end = memchr(line_start, '\n', end - line_start);
         if (line_end) {
             line_end++; // include the newline
         }
         else {
-            line_end = s_end;
+            line_end = end;
         }
         if (unicodewriter_write_line(p, w, line_start, line_end, is_raw, token) < 0) {
-            PyUnicodeWriter_Discard(w);
-            return NULL;
+            goto error;
         }
         line_start = line_end;
     }
+    p->call_invalid_rules = _prev_call_invalid;
     return  PyUnicodeWriter_Finish(w);
+
+error:
+    p->call_invalid_rules = _prev_call_invalid;
+    PyUnicodeWriter_Discard(w);
+    return NULL;
 }
 
 static expr_ty
@@ -1405,7 +1424,7 @@ _PyPegen_decode_fstring_part(Parser* p, int is_first, int is_raw,
     is_raw = is_raw || strchr(bstr, '\\') == NULL;
 
     PyObject *str = NULL;
-    if (indent_len > 0) {
+    if (indent != NULL) {
         str = _PyPegen_dedent_string_part(p, bstr, strlen(bstr), indent, indent_len,
                                           is_first, is_raw, constant, token);
     }
@@ -1518,6 +1537,11 @@ _get_resized_exprs(Parser *p, Token *a, asdl_expr_seq *raw_expressions, Token *b
         const char *temp_end = temp_str + PyBytes_GET_SIZE(temp_bytes);
         common_indent_len = _Py_search_longest_common_leading_whitespace(
             temp_str, temp_end, &common_indent_start);
+        // _py_serach_longest_common_leading_whitespace() may return NULL when
+        // indent_len is 0.
+        if (common_indent_len == 0) {
+            common_indent_start = "";
+        }
     }
 
     Py_ssize_t index = 0;
diff --git a/Parser/string_parser.c b/Parser/string_parser.c
index 0fbc20ee4ce6fb..3425b856796fe5 100644
--- a/Parser/string_parser.c
+++ b/Parser/string_parser.c
@@ -257,41 +257,20 @@ _Py_search_longest_common_leading_whitespace(
 
 // Dedent d-string and return result as a bytes.
 static PyObject*
-_PyPegen_dedent_string(Parser *p, const char *s, Py_ssize_t len, Token* token)
+_PyPegen_dedent_string(Parser *p, const char *s, Py_ssize_t len,
+                       const char *indent, Py_ssize_t indent_len, int lineno)
 {
-    // this function is for d-string without t/f-string.
-    // dt/df-string are processed in action_helper.c:_get_resized_exprs
-    Py_ssize_t lineno = token->lineno;
-    const char *end = s + len;
-
-    // skips the first newline.
-    if (len == 0 || s[0] != '\n') {
-        RAISE_SYNTAX_ERROR_KNOWN_LOCATION(
-            token,
-            "d-string must start with a newline"
-        );
-        return NULL;
-    }
-
-    // We find common indent from [s, end+1) because we want to include the last line
-    // for indent calculation.
-    assert(*end == '"' || *end == '\''); // end[0:3] is the trailing quotes
-    const char *indent;
-    Py_ssize_t indent_len = _Py_search_longest_common_leading_whitespace(s, end+1, &indent);
-
     PyBytesWriter *w = PyBytesWriter_Create(0);
     if (w == NULL) {
         return NULL;
     }
-    const char *line_start = s + 1;
-
-    while (line_start < end) {
-        lineno++;
 
+    const char *end = s + len;
+    for (; s < end; lineno++) {
         Py_ssize_t i;
         for (i = 0; i < indent_len; i++) {
-            if (line_start[i] != indent[i]) {
-                if (line_start[i] == '\n') {
+            if (s[i] != indent[i]) {
+                if (s[i] == '\n') {
                     break; // empty line
                 }
                 PyBytesWriter_Discard(w);
@@ -301,18 +280,18 @@ _PyPegen_dedent_string(Parser *p, const char *s, Py_ssize_t len, Token* token)
             }
         }
 
-        if (line_start[i] == '\n') {  // found an empty line with newline.
+        if (s[i] == '\n') {  // found an empty line with newline.
             if (PyBytesWriter_WriteBytes(w, "\n", 1) < 0) {
                 PyBytesWriter_Discard(w);
                 return NULL;
             }
-            line_start += i+1;
+            s += i+1;
             continue;
         }
 
         // found a indented line. let's dedent it.
-        line_start += i;
-        const char *line_end = memchr(line_start, '\n', end - line_start);
+        s += i;
+        const char *line_end = memchr(s, '\n', end - s);
         if (line_end == NULL) {
             line_end = end; // last line without newline
         }
@@ -320,11 +299,11 @@ _PyPegen_dedent_string(Parser *p, const char *s, Py_ssize_t len, Token* token)
             line_end++; // include the newline in the line
         }
 
-        if (PyBytesWriter_WriteBytes(w, line_start, line_end - line_start) < 0) {
+        if (PyBytesWriter_WriteBytes(w, s, line_end - s) < 0) {
             PyBytesWriter_Discard(w);
             return NULL;
         }
-        line_start = line_end;
+        s = line_end;
     }
     return PyBytesWriter_Finish(w);
 }
@@ -403,25 +382,62 @@ _PyPegen_parse_string(Parser *p, Token *t)
         }
     }
     else if (dedentmode) {
-        RAISE_SYNTAX_ERROR_KNOWN_LOCATION(
-            t,
-            "d-string must be triple-quoted");
+        RAISE_SYNTAX_ERROR_KNOWN_LOCATION(t, "d-string must be triple-quoted");
         return NULL;
     }
+
+    /* Avoid invoking escape decoding routines if possible. */
+    rawmode = rawmode || strchr(s, '\\') == NULL;
+
+    int _prev_call_invald = p->call_invalid_rules;
+
     PyObject *dedent_bytes = NULL;
     if (dedentmode) {
-        dedent_bytes = _PyPegen_dedent_string(p, s, len, t);
-        if (dedent_bytes == NULL) {
+        if (len == 0 || s[0] != '\n') {
+            RAISE_SYNTAX_ERROR_KNOWN_LOCATION(t, "d-string must start with a newline");
             return NULL;
         }
-        if (PyBytes_AsStringAndSize(dedent_bytes, &s, (Py_ssize_t*)&len) < 0) {
-            Py_DECREF(dedent_bytes);
-            return NULL;
+
+        // _PyPegen_decode_string() and decode_bytes_with_escapes() emit
+        // a warning for invalid escape sequences.
+        // We need to call it before dedenting since it shifts the positions.
+        if (!_prev_call_invald && !rawmode) {
+            PyObject *temp;
+            if (bytesmode) {
+                temp = decode_bytes_with_escapes(p, s, len, t);
+            }
+            else {
+                temp = _PyPegen_decode_string(p, 0, s, len, t);
+            }
+            if (temp == NULL) {
+                return NULL;
+            }
+            Py_DECREF(temp);
+        }
+
+        // We find common indent from [s, end+1) because we want to include the last line
+        // for indent calculation.
+        const char *end = s + len;
+        assert(*end == '"' || *end == '\''); // end[0:3] is the trailing quotes
+        const char *indent;
+        Py_ssize_t indent_len = _Py_search_longest_common_leading_whitespace(s+1, end+1, &indent);
+
+        s++; len--; // skip the first newline
+        if (indent_len > 0) {
+            // dedent the string
+            dedent_bytes = _PyPegen_dedent_string(p, s, len, indent, indent_len, t->lineno + 1);
+            if (dedent_bytes == NULL) {
+                return NULL;
+            }
+            if (PyBytes_AsStringAndSize(dedent_bytes, (char**)&s, (Py_ssize_t*)&len) < 0) {
+                Py_DECREF(dedent_bytes);
+                return NULL;
+            }
         }
+
+        p->call_invalid_rules = 1;
     }
 
-    /* Avoid invoking escape decoding routines if possible. */
-    rawmode = rawmode || strchr(s, '\\') == NULL;
     PyObject *result;
     if (bytesmode) {
         /* Disallow non-ASCII characters. */
@@ -433,6 +449,7 @@ _PyPegen_parse_string(Parser *p, Token *t)
                                    "bytes can only contain ASCII "
                                    "literal characters");
                 Py_XDECREF(dedent_bytes);
+                p->call_invalid_rules = _prev_call_invald;
                 return NULL;
             }
         }
@@ -447,6 +464,7 @@ _PyPegen_parse_string(Parser *p, Token *t)
         result = _PyPegen_decode_string(p, rawmode, s, len, t);
     }
     Py_XDECREF(dedent_bytes);
+    p->call_invalid_rules = _prev_call_invald;
     return result;
 }
 

From 733c2d07edf8dd30563add2b8753a64177f6ebd4 Mon Sep 17 00:00:00 2001
From: Inada Naoki <songofacandy@gmail.com>
Date: Sun, 25 Jan 2026 17:27:22 +0900
Subject: [PATCH 4/5] improve tests

---
 Lib/test/test_dstring.py | 94 ++++++++++++++++++++++++++++++++++------
 Parser/action_helpers.c  |  7 +++
 Parser/lexer/lexer.c     |  2 +-
 3 files changed, 88 insertions(+), 15 deletions(-)

diff --git a/Lib/test/test_dstring.py b/Lib/test/test_dstring.py
index 7927877e8bb088..5161c21b2a6c26 100644
--- a/Lib/test/test_dstring.py
+++ b/Lib/test/test_dstring.py
@@ -1,37 +1,103 @@
 import unittest
 
 
+_dstring_prefixes = "d db df dt dr drb drf drt".split()
+_dstring_prefixes += [p.upper() for p in _dstring_prefixes]
+
+
+def d(s):
+    # Helper function to evaluate d-strings.
+    if '"""' in s:
+        return eval(f"d'''{s}'''")
+    else:
+        return eval(f'd"""{s}"""')
+
+
 class DStringTestCase(unittest.TestCase):
     def assertAllRaise(self, exception_type, regex, error_strings):
         for str in error_strings:
             with self.subTest(str=str):
                 with self.assertRaisesRegex(exception_type, regex) as cm:
                     eval(str)
-                # print("Testing expression:", repr(str))
-                # print(repr(cm.exception))
-                # print(repr(cm.exception.text))
 
     def test_single_quote(self):
         exprs = [
-            "d'hello'",
-            'D"hello"',
-            "d'hello\\nworld'",
+            f"{p}'hello, world'" for p in _dstring_prefixes
+        ] + [
+            f'{p}"hello, world"' for p in _dstring_prefixes
         ]
         self.assertAllRaise(SyntaxError, "d-string must be triple-quoted", exprs)
 
     def test_empty_dstring(self):
         exprs = [
-            "d''''''",
-            'D""""""',
+            f"{p}''''''" for p in _dstring_prefixes
+        ] + [
+            f'{p}""""""' for p in _dstring_prefixes
         ]
         self.assertAllRaise(SyntaxError, "d-string must start with a newline", exprs)
 
-    def test_simple_dstring(self):
-        self.assertEqual(eval('d"""\n  hello world\n  """'), "hello world\n")
-        self.assertEqual(eval('d"""\n  hello world\n """'), " hello world\n")
-        self.assertEqual(eval('d"""\n  hello world\n"""'), "  hello world\n")
-        self.assertEqual(eval('d"""\n  hello world\\\n """'), " hello world")
-        self.assertEqual(eval('dr"""\n  hello world\\\n """'), " hello world\\\n")
+        for prefix in _dstring_prefixes:
+            expr = f"{prefix}'''\n'''"
+            expr2 = f'{prefix}"""\n"""'
+            with self.subTest(expr=expr):
+                v = eval(expr)
+                v2 = eval(expr2)
+                if 't' in prefix.lower():
+                    self.assertEqual(v.strings, ("",))
+                    self.assertEqual(v2.strings, ("",))
+                elif 'b' in prefix.lower():
+                    self.assertEqual(v, b"")
+                    self.assertEqual(v2, b"")
+                else:
+                    self.assertEqual(v, "")
+                    self.assertEqual(v2, "")
+
+    def test_dedent(self):
+        # Basic dedent - remove common leading whitespace
+        result = d("""
+    hello
+    world
+    """)
+        self.assertEqual(result, "hello\nworld\n")
+
+        # Dedent with varying indentation
+        result = d("""
+     line1
+       line2
+    line3
+      """)
+        self.assertEqual(result, " line1\n   line2\nline3\n  ")
+
+        # Dedent with tabs
+        result = d("""
+\thello
+\tworld
+\t""")
+        self.assertEqual(result, "hello\nworld\n")
+
+        # Mixed spaces and tabs (using common leading whitespace)
+        result = d("""
+\t\t    hello
+\t\t    world
+\t\t  """)
+        self.assertEqual(result, "  hello\n  world\n")
+
+        # Empty lines do not affect the calculation of common leading whitespace
+        result = d("""
+    hello
+
+    world
+    """)
+        self.assertEqual(result, "hello\n\nworld\n")
+
+        # Lines with only whitespace also have their indentation removed.
+        result = d("""
+    hello
+  \n\
+      \n\
+    world
+    """)
+        self.assertEqual(result, "hello\n\n  \nworld\n")
 
 
 if __name__ == '__main__':
diff --git a/Parser/action_helpers.c b/Parser/action_helpers.c
index 042b3ea97628ca..b2a778e13e2280 100644
--- a/Parser/action_helpers.c
+++ b/Parser/action_helpers.c
@@ -1481,6 +1481,13 @@ _get_resized_exprs(Parser *p, Token *a, asdl_expr_seq *raw_expressions, Token *b
     Py_ssize_t common_indent_len = 0;
 
     if (is_dedent) {
+        if (total_items == 0) {
+            RAISE_SYNTAX_ERROR_KNOWN_LOCATION(
+                a,
+                "d-string must start with a newline"
+            );
+            return NULL;
+        }
         expr_ty first_item = asdl_seq_GET(raw_expressions, 0);
         if (first_item->kind != Constant_kind
                 || PyUnicode_ReadChar(first_item->v.Constant.value, 0) != '\n') {
diff --git a/Parser/lexer/lexer.c b/Parser/lexer/lexer.c
index cfccdd55a6ae11..07c61f1cb4386b 100644
--- a/Parser/lexer/lexer.c
+++ b/Parser/lexer/lexer.c
@@ -1124,7 +1124,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
                 case 'd':
                 case 'D':
                     if (quote_size != 3) {
-                        return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "d-string must be a multiline string"));
+                        return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "d-string must be triple-quoted"));
                     }
                     break;
                 default:

From 5faa196b6cba1811b7afa80ebe58562d3f83578c Mon Sep 17 00:00:00 2001
From: Inada Naoki <songofacandy@gmail.com>
Date: Wed, 28 Jan 2026 19:30:21 +0900
Subject: [PATCH 5/5] t/f-string: calculate common indent without temp string

---
 Parser/action_helpers.c | 151 +++++++++++++++++++++++++++-------------
 1 file changed, 103 insertions(+), 48 deletions(-)

diff --git a/Parser/action_helpers.c b/Parser/action_helpers.c
index b2a778e13e2280..567da3475eb098 100644
--- a/Parser/action_helpers.c
+++ b/Parser/action_helpers.c
@@ -1445,13 +1445,102 @@ _PyPegen_decode_fstring_part(Parser* p, int is_first, int is_raw,
                            p->arena);
 }
 
-/* defined in unicodeobject.c */
-extern Py_ssize_t
-_Py_search_longest_common_leading_whitespace(
+/*
+This function is a customized version of _Py_search_longest_common_leading_whitespace()
+in unicodeobject.c
+*/
+static void
+search_longest_common_leading_whitespace(
     const char *const src,
     const char *const end,
-    const char **output
-    );
+    const char **indent,
+    Py_ssize_t *indent_len)
+{
+    // [_start, _start + _len)
+    // describes the current longest common leading whitespace
+    const char *_start = *indent;
+    Py_ssize_t _len = *indent_len;
+
+    // skip the first line. for example:
+    // s = df"""
+    //    first part
+    //    first part{x}second part
+    //    second part
+    //    """
+    // we don't need the newline after the opening quote.
+    // we don't need the first line of the second part either.
+    const char *iter = memchr(src, '\n', end - src);
+    if (iter == NULL) {
+        // single line string
+        return;
+    }
+
+    for (iter++; iter <= end; iter++) {
+        const char *line_start = iter;
+        const char *leading_whitespace_end = NULL;
+
+        // scan the whole line
+        while (iter < end && *iter != '\n') {
+            if (!leading_whitespace_end && *iter != ' ' && *iter != '\t') {
+                /* `iter` points to the first non-whitespace character
+                   in this line */
+                if (iter == line_start) {
+                    // some line has no indent, fast exit!
+                    *indent = iter;
+                    *indent_len = 0;
+                    return;
+                }
+                leading_whitespace_end = iter;
+            }
+            ++iter;
+        }
+
+        if (!leading_whitespace_end) {
+            // if this line has all white space, skip it
+            if (iter < end) {
+                continue;
+            }
+            leading_whitespace_end = iter;  // last line may not end with '\n'
+        }
+
+        if (!_start) {
+            // update the first leading whitespace
+            _start = line_start;
+            _len = leading_whitespace_end - line_start;
+        }
+        else {
+            /* We then compare with the current longest leading whitespace.
+
+               [line_start, leading_whitespace_end) is the leading
+               whitespace of this line,
+
+               [_start, _start + _len) is the leading whitespace of the
+               current longest leading whitespace. */
+            Py_ssize_t new_len = 0;
+            const char *_iter = _start, *line_iter = line_start;
+
+            while (_iter < _start + _len && line_iter < leading_whitespace_end
+                   && *_iter == *line_iter)
+            {
+                ++_iter;
+                ++line_iter;
+                ++new_len;
+            }
+
+            _len = new_len;
+            if (_len == 0) {
+                // No common things now, fast exit!
+                *indent = _start;
+                *indent_len = 0;
+                return;
+            }
+        }
+    }
+
+    *indent = _start;
+    *indent_len = _len;
+}
+
 
 static asdl_expr_seq *
 _get_resized_exprs(Parser *p, Token *a, asdl_expr_seq *raw_expressions, Token *b, enum string_kind_t string_kind)
@@ -1477,8 +1566,8 @@ _get_resized_exprs(Parser *p, Token *a, asdl_expr_seq *raw_expressions, Token *b
         return NULL;
     }
 
-    const char *common_indent_start = NULL;
-    Py_ssize_t common_indent_len = 0;
+    const char *indent_start = NULL;
+    Py_ssize_t indent_len = 0;
 
     if (is_dedent) {
         if (total_items == 0) {
@@ -1498,56 +1587,22 @@ _get_resized_exprs(Parser *p, Token *a, asdl_expr_seq *raw_expressions, Token *b
             return NULL;
         }
 
-        // Instead of calculating common indent from all parts,
-        // build temporary string and calculate common indent from it.
-        PyBytesWriter *w = PyBytesWriter_Create(0);
-        if (w == NULL) {
-            return NULL;
-        }
-
         for (Py_ssize_t i = 0; i < n_items; i++) {
             expr_ty item = asdl_seq_GET(raw_expressions, i);
-
-            if (item->kind == JoinedStr_kind) {
-                // Write a placeholder.
-                if (PyBytesWriter_WriteBytes(w, "X", 1) < 0) {
-                    PyBytesWriter_Discard(w);
-                    return NULL;
-                }
-                continue;
-            }
             if (item->kind == Constant_kind) {
                 Py_ssize_t blen;
                 const char *bstr = PyUnicode_AsUTF8AndSize(item->v.Constant.value, &blen);
-                if (bstr == NULL || PyBytesWriter_WriteBytes(w, bstr, blen) < 0) {
-                    PyBytesWriter_Discard(w);
+                if (bstr == NULL) {
                     return NULL;
                 }
-                continue;
+                search_longest_common_leading_whitespace(bstr, bstr + blen, &indent_start, &indent_len);
             }
         }
-        // Add a terminator to include the last line before the ending quote
-        if (PyBytesWriter_WriteBytes(w, "X", 1) < 0) {
-            PyBytesWriter_Discard(w);
-            return NULL;
-        }
 
-        // TODO: instead of creating temp_bytes, we could search
-        // common index from each part directly. But this need reimplementation
-        // of _Py_search_longest_common_leading_whitespace.
-        PyObject *temp_bytes = PyBytesWriter_Finish(w);
-        if (temp_bytes == NULL) {
-            return NULL;
-        }
-        _PyArena_AddPyObject(p->arena, temp_bytes);
-        const char *temp_str = PyBytes_AsString(temp_bytes);
-        const char *temp_end = temp_str + PyBytes_GET_SIZE(temp_bytes);
-        common_indent_len = _Py_search_longest_common_leading_whitespace(
-            temp_str, temp_end, &common_indent_start);
-        // _py_serach_longest_common_leading_whitespace() may return NULL when
-        // indent_len is 0.
-        if (common_indent_len == 0) {
-            common_indent_start = "";
+        assert(indent_start != NULL); // TODO: is this assert true?
+        // search_longest_common_leading_whitespace() may not set indent_start when the string is empty.
+        if (indent_len == 0) {
+            indent_start = "";
         }
     }
 
@@ -1582,7 +1637,7 @@ _get_resized_exprs(Parser *p, Token *a, asdl_expr_seq *raw_expressions, Token *b
         }
 
         if (item->kind == Constant_kind) {
-            item = _PyPegen_decode_fstring_part(p, i == 0, is_raw, common_indent_start, common_indent_len, item, b);
+            item = _PyPegen_decode_fstring_part(p, i == 0, is_raw, indent_start, indent_len, item, b);
             if (item == NULL) {
                 return NULL;
             }