SNOW-2084165 Add dataframe operation lineage on SnowparkSQLException #3339
Changes from all commits
@@ -0,0 +1,200 @@
#
# Copyright (c) 2012-2025 Snowflake Computing Inc. All rights reserved.
#

from functools import cached_property
import os
import sys
from typing import Dict, List, Optional
import itertools

from snowflake.snowpark._internal.ast.batch import get_dependent_bind_ids
from snowflake.snowpark._internal.ast.utils import __STRING_INTERNING_MAP__
import snowflake.snowpark._internal.proto.generated.ast_pb2 as proto

UNKNOWN_FILE = "__UNKNOWN_FILE__"
SNOWPARK_PYTHON_DATAFRAME_TRANSFORM_TRACE_LENGTH = (
    "SNOWPARK_PYTHON_DATAFRAME_TRANSFORM_TRACE_LENGTH"
)


class DataFrameTraceNode:
    """A node representing a single dataframe operation in the DAG that models
    the lineage of a DataFrame."""

    def __init__(self, batch_id: int, stmt_cache: Dict[int, proto.Stmt]) -> None:
        self.batch_id = batch_id
Collaborator
Nit: I would argue that this isn't meant to be a batch ID anymore. Within each Python session that imports the Snowpark module, each AST ID for a Table or DataFrame will be a UID.
        self.stmt_cache = stmt_cache

    @cached_property
    def children(self) -> set[int]:
        """Returns the batch_ids of the children of this node."""
        return get_dependent_bind_ids(self.stmt_cache[self.batch_id])

    def get_src(self) -> Optional[proto.SrcPosition]:
Contributor
In the hybrid client prototype we use a slightly different method to get the source location: we simply use inspect to walk the stack to the appropriate frame. We have to do this because modin does not use any of the AST machinery, but it is also relatively straightforward. I would like to use your debugging tool for Snowpark pandas as well, so we may want to refactor this so it does not require any of the protobuf work.

Contributor
What about using this function?
Essentially we have three approaches to this problem. I'm less of a fan of the AST approach because it doesn't help pandas with this type of debugging, but it seems like we might be able to consolidate with the OpenTelemetry approach.
        """The source Stmt of the DataFrame described by the batch_id."""
        stmt = self.stmt_cache[self.batch_id]
        api_call = stmt.bind.expr.WhichOneof("variant")
        return (
            getattr(stmt.bind.expr, api_call).src
            if api_call and getattr(stmt.bind.expr, api_call).HasField("src")
            else None
        )

    def _read_file(
        self, filename, start_line, end_line, start_column, end_column
    ) -> str:
        """Read the relevant code snippet showing where the DataFrame was
        created. The executing user must have read permission on the given
        filename."""
        with open(filename) as f:
            code_lines = []
            if sys.version_info >= (3, 11):
                # Skip to start_line and read only the required lines.
                lines = itertools.islice(f, start_line - 1, end_line)
                code_lines = list(lines)
                if start_line == end_line:
                    code_lines[0] = code_lines[0][start_column:end_column]
                else:
                    code_lines[0] = code_lines[0][start_column:]
                    code_lines[-1] = code_lines[-1][:end_column]
            else:
                # For Python 3.9/3.10 we do not extract the end line from the
                # source code, so we just read the start line and return.
                for line in itertools.islice(f, start_line - 1, start_line):
                    code_lines.append(line)

        code_lines = [line.rstrip() for line in code_lines]
        return "\n".join(code_lines)
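The line/column slicing used here can be exercised in isolation. Below is a minimal standalone sketch of the same `itertools.islice` technique; the `read_snippet` helper and the sample file contents are hypothetical, not part of the PR.

```python
import itertools
import os
import tempfile

def read_snippet(filename, start_line, end_line, start_column, end_column):
    # Hypothetical standalone version of the slicing in _read_file:
    # skip to the span lazily with islice instead of reading the whole file.
    with open(filename) as f:
        lines = list(itertools.islice(f, start_line - 1, end_line))
    if start_line == end_line:
        lines[0] = lines[0][start_column:end_column]
    else:
        lines[0] = lines[0][start_column:]
        lines[-1] = lines[-1][:end_column]
    return "\n".join(line.rstrip() for line in lines)

with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False) as tmp:
    tmp.write("a = 1\ndf = session.table('T').filter(col > 1)\nb = 2\n")
    path = tmp.name

# Columns 5-39 of line 2 cover the expression after "df = ".
one_line = read_snippet(path, 2, 2, 5, 39)
multi_line = read_snippet(path, 1, 2, 0, 39)
os.unlink(path)
print(one_line)
```

Note that the Python 3.9/3.10 fallback in the PR skips the end-of-span handling entirely, since older interpreters do not expose end positions in code objects.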
    @cached_property
    def source_id(self) -> str:
        """Unique identifier of the location of the DataFrame creation in the source code."""
        src = self.get_src()
        if src is None:  # pragma: no cover
            return ""

        fileno = src.file
        start_line = src.start_line
        start_column = src.start_column
        end_line = src.end_line
        end_column = src.end_column
        return f"{fileno}:{start_line}:{start_column}-{end_line}:{end_column}"
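For illustration, `source_id` is just a compact span string in the form `fileno:start_line:start_column-end_line:end_column`. The values below are hypothetical stand-ins for the fields of a `proto.SrcPosition`:

```python
# Hypothetical position values standing in for fields of a proto.SrcPosition.
fileno, start_line, start_column, end_line, end_column = 1, 12, 4, 14, 20

# Same format string as DataFrameTraceNode.source_id.
source_id = f"{fileno}:{start_line}:{start_column}-{end_line}:{end_column}"
print(source_id)  # 1:12:4-14:20
```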
    def get_source_snippet(self) -> str:
        """Read the source file and extract the snippet where the dataframe is created."""
        src = self.get_src()
        if src is None:  # pragma: no cover
            return "No source"

        # Get the latest mapping of fileno to filename.
        _fileno_to_filename_map = {v: k for k, v in __STRING_INTERNING_MAP__.items()}
        fileno = src.file
        filename = _fileno_to_filename_map.get(fileno, UNKNOWN_FILE)

        start_line = src.start_line
        end_line = src.end_line
        start_column = src.start_column
        end_column = src.end_column

        # Build the code identifier locating the operation where the DataFrame
        # was created.
        if sys.version_info >= (3, 11):
            code_identifier = (
                f"(unknown)|{start_line}:{start_column}-{end_line}:{end_column}"
            )
        else:
            code_identifier = f"(unknown)|{start_line}"

        if filename != UNKNOWN_FILE and os.access(filename, os.R_OK):
            # If the file is readable, read the code snippet.
            code = self._read_file(
                filename, start_line, end_line, start_column, end_column
            )
            return f"{code_identifier}: {code}"
        return code_identifier  # pragma: no cover
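The fileno-to-filename lookup above is a plain dict inversion with a sentinel fallback. A toy stand-in for `__STRING_INTERNING_MAP__` (the filenames here are made up) behaves like this:

```python
UNKNOWN_FILE = "__UNKNOWN_FILE__"

# Toy stand-in for __STRING_INTERNING_MAP__, which maps filename -> interned id.
interning_map = {"/app/pipeline.py": 1, "/app/queries.py": 2}

# Inverted on each call so files interned after this node was built are found.
fileno_to_filename = {v: k for k, v in interning_map.items()}

print(fileno_to_filename.get(1, UNKNOWN_FILE))   # known fileno -> filename
print(fileno_to_filename.get(99, UNKNOWN_FILE))  # unknown fileno -> sentinel
```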
def _get_df_transform_trace(
    batch_id: int,
    stmt_cache: Dict[int, proto.Stmt],
) -> List[DataFrameTraceNode]:
    """Helper function to get the transform trace of the dataframe involved in
    the exception. It gathers the lineage as follows:

    1. Create a DataFrameTraceNode for the given batch_id.
    2. Traverse the lineage with BFS, using the node created in step 1 as the
       first layer.
    3. In each iteration, check whether the node's source_id has been visited.
       If not, add it to the visited set and append the node to the trace.
       This step avoids adding the same source_id to the lineage multiple
       times when the dataframe is built in a loop.
    4. Explore the next layer by adding the children of the current node. For
       each child ID that has not been visited, add it to the visited set and
       append a DataFrameTraceNode for it to the next layer.
    5. Repeat until there are no more nodes to explore.

    Args:
        batch_id: The batch ID of the dataframe involved in the exception.
        stmt_cache: The statement cache of the session.

    Returns:
        A list of DataFrameTraceNode objects representing the transform trace of the dataframe.
    """
    visited_batch_id = set()
    visited_source_id = set()

    visited_batch_id.add(batch_id)
    curr = [DataFrameTraceNode(batch_id, stmt_cache)]
    lineage = []

    while curr:
        next: List[DataFrameTraceNode] = []
        for node in curr:
            # Tracing updates.
            source_id = node.source_id
            if source_id not in visited_source_id:
                visited_source_id.add(source_id)
                lineage.append(node)

            # Explore the next layer.
            for child_id in node.children:
                if child_id in visited_batch_id:
                    continue
                visited_batch_id.add(child_id)
                next.append(DataFrameTraceNode(child_id, stmt_cache))

        curr = next

    return lineage
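The dedup-by-source BFS can be seen on a toy DAG. All node ids, source strings, and the `trace` helper below are made up for illustration; the graph has a diamond where two dataframes were built on the same source line, so only one of them is reported.

```python
from collections import namedtuple

# Toy model of _get_df_transform_trace: ids form a DAG and several ids may
# share one source location (e.g. a dataframe rebuilt inside a loop).
Node = namedtuple("Node", ["node_id", "source_id", "children"])

graph = {
    4: Node(4, "app.py:10", [2, 3]),  # df4 = df2.join(df3)
    3: Node(3, "app.py:7", [1]),      # created on the same line as node 2
    2: Node(2, "app.py:7", [1]),
    1: Node(1, "app.py:3", []),       # root dataframe
}

def trace(root_id):
    visited_ids = {root_id}
    visited_sources = set()
    lineage, curr = [], [graph[root_id]]
    while curr:
        nxt = []
        for node in curr:
            # Report each distinct source location only once.
            if node.source_id not in visited_sources:
                visited_sources.add(node.source_id)
                lineage.append(node.node_id)
            # Queue unvisited children for the next BFS layer.
            for child in node.children:
                if child not in visited_ids:
                    visited_ids.add(child)
                    nxt.append(graph[child])
        curr = nxt
    return lineage

print(trace(4))  # [4, 2, 1] -- node 3 is deduplicated by source location
```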
def get_df_transform_trace_message(
    df_ast_id: int, stmt_cache: Dict[int, proto.Stmt]
) -> Optional[str]:
    """Get the transform trace message for the dataframe involved in the exception.

    Args:
        df_ast_id: The AST ID of the dataframe involved in the exception.
        stmt_cache: The statement cache of the session.

    Returns:
        A string representing the transform trace message.
    """
    df_transform_trace_nodes = _get_df_transform_trace(df_ast_id, stmt_cache)
    if len(df_transform_trace_nodes) == 0:  # pragma: no cover
        return None

    df_transform_trace_length = len(df_transform_trace_nodes)
    show_trace_length = int(
        os.environ.get(SNOWPARK_PYTHON_DATAFRAME_TRANSFORM_TRACE_LENGTH, 5)
    )

    debug_info_lines = [
        "\n\n--- Additional Debug Information ---\n",
        f"Trace of the most recent dataframe operations associated with the error (total {df_transform_trace_length}):\n",
    ]
    for node in df_transform_trace_nodes[:show_trace_length]:
        debug_info_lines.append(node.get_source_snippet())
    if df_transform_trace_length > show_trace_length:
        debug_info_lines.append(
            f"... and {df_transform_trace_length - show_trace_length} more.\nYou can increase "
            f"the lineage length by setting the {SNOWPARK_PYTHON_DATAFRAME_TRANSFORM_TRACE_LENGTH} "
            "environment variable."
        )
    return "\n".join(debug_info_lines)
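The environment-variable-driven truncation can be sketched standalone; the `format_trace` helper and the snippet strings below are hypothetical stand-ins for the real trace nodes.

```python
import os

SNOWPARK_PYTHON_DATAFRAME_TRANSFORM_TRACE_LENGTH = (
    "SNOWPARK_PYTHON_DATAFRAME_TRANSFORM_TRACE_LENGTH"
)

def format_trace(snippets):
    # Mirrors the truncation logic: show at most N entries, where N comes from
    # the environment variable and defaults to 5.
    limit = int(os.environ.get(SNOWPARK_PYTHON_DATAFRAME_TRANSFORM_TRACE_LENGTH, 5))
    lines = snippets[:limit]
    if len(snippets) > limit:
        lines.append(f"... and {len(snippets) - limit} more.")
    return "\n".join(lines)

os.environ[SNOWPARK_PYTHON_DATAFRAME_TRANSFORM_TRACE_LENGTH] = "2"
print(format_trace([f"op{i}" for i in range(4)]))  # op0, op1, then a summary
```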
Reviewer
Is it possible that there are multiple plans in the args? And if so, is the left-most plan guaranteed to be the most recent plan?

Reviewer
We have some code below, which means there will be multiple plans?

Author
It is possible for args to have multiple SnowflakePlans, but only the first arg is relevant to the failure. This is not obvious, and we have to look at the usages of this decorator to be sure. The decorator is used for the following functions:

- SnowflakePlan._analyze_attributes
- Selectable._analyze_attributes
- SnowflakePlanBuilder.build
- SnowflakePlanBuilder.build_binary
- ServerConnection.get_result_set

Of these, only build and build_binary can have multiple Snowflake plans, and for each of them the source of failure can only come from describe queries. But we have already wrapped SnowflakePlan._analyze_attributes and Selectable._analyze_attributes, and they can have at most one SnowflakePlan. Outside of describe queries, the wrapper is triggered to wrap a programming exception when fetching the full result, for example for .show() or .collect(). For those cases we have wrapped get_result_set, which has only one SnowflakePlan arg. IMO the wrappers of build and build_binary are redundant. @sfc-gh-jdu did you notice any other case where parsing plans in these functions actually gives you more information?

Reviewer
Nice, thanks for the context. I think we can put a brief version of this into a code comment?

Reviewer
#3437